Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > Vthread > Vthread__Blocked_Matrix_Mult__Bench
changeset 1:133633d1c10f
First version modified from SSR copy -- not working yet
| author | Me |
|---|---|
| date | Tue, 16 Nov 2010 16:02:51 +0100 |
| parents | 8d14fe28a782 |
| children | 46ceb3dd0f0a |
| files | src/Application/Matrix_Mult.h src/Application/VPThread__Matrix_Mult/Divide_Pr.c src/Application/VPThread__Matrix_Mult/EntryPoint.c src/Application/VPThread__Matrix_Mult/Result_Pr.c src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c |
| diffstat | 6 files changed, 182 insertions(+), 91 deletions(-) [+] |
line diff
1.1 --- a/src/Application/Matrix_Mult.h Wed Nov 10 22:26:57 2010 -0800 1.2 +++ b/src/Application/Matrix_Mult.h Tue Nov 16 16:02:51 2010 +0100 1.3 @@ -10,7 +10,7 @@ 1.4 #include <unistd.h> 1.5 #include <malloc.h> 1.6 1.7 -#include "../SSR_lib/VMS/VMS_primitive_data_types.h" 1.8 +#include "../VPThread_lib/VMS/VMS_primitive_data_types.h" 1.9 #include "ParamHelper/Param.h" 1.10 1.11 //============================== Structures ==============================
2.1 --- a/src/Application/VPThread__Matrix_Mult/Divide_Pr.c Wed Nov 10 22:26:57 2010 -0800 2.2 +++ b/src/Application/VPThread__Matrix_Mult/Divide_Pr.c Tue Nov 16 16:02:51 2010 +0100 2.3 @@ -7,7 +7,7 @@ 2.4 */ 2.5 2.6 2.7 -#include "SSR_Matrix_Mult.h" 2.8 +#include "VPThread__Matrix_Mult.h" 2.9 #include <math.h> 2.10 #include <string.h> 2.11 2.12 @@ -65,11 +65,11 @@ 2.13 * processors, 2.14 * then does a receive of a message from the result processor that gives 2.15 * the divider ownership of the result matrix. 2.16 - * Finally, the divider returns the result matrix out of the SSR system. 2.17 + * Finally, the divider returns the result matrix out of the VPThread system. 2.18 * 2.19 * Divider chooses the size of sub-matrices via an algorithm that tries to 2.20 * keep the minimum work above a threshold. The threshold is machine- 2.21 - * dependent, so ask SSR for min work-unit time to get a 2.22 + * dependent, so ask VPThread for min work-unit time to get a 2.23 * given overhead 2.24 * 2.25 * Divide min work-unit cycles by measured-cycles for one matrix-cell 2.26 @@ -114,21 +114,22 @@ 2.27 */ 2.28 2.29 void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, 2.30 - VirtProcr *animPr ) 2.31 + VirtProcr *animatingThd ) 2.32 { VirtProcr *resultPr; 2.33 DividerParams *dividerParams; 2.34 ResultsParams *resultsParams; 2.35 Matrix *leftMatrix, *rightMatrix, *resultMatrix; 2.36 void *msg; 2.37 SlicingStrucCarrier *slicingStrucCarrier; 2.38 - float32 *resultArray; //points to array inside result matrix 2.39 - 2.40 + float32 *resultArray; //points to array inside result matrix 2.41 + MatrixMultGlobals *globals; 2.42 + 2.43 DEBUG( dbgAppFlow, "start divide\n") 2.44 2.45 int32 2.46 divideProbe = VMS__create_single_interval_probe( "divideProbe", 2.47 - animPr ); 2.48 - VMS__record_sched_choice_into_probe( divideProbe, animPr ); 2.49 + animatingThd ); 2.50 + VMS__record_sched_choice_into_probe( divideProbe, animatingThd ); 2.51 VMS__record_interval_start_in_probe( divideProbe 
); 2.52 2.53 //=========== Setup -- make local copies of ptd-to-things, malloc, aso 2.54 @@ -160,9 +161,9 @@ 2.55 2.56 //transpose the right matrix 2.57 float32 * 2.58 - transRightArray = SSR__malloc_to( rightMatrix->numRows * 2.59 - rightMatrix->numCols * sizeof(float32), 2.60 - animPr ); 2.61 + transRightArray = 2.62 + VPThread__malloc( rightMatrix->numRows * rightMatrix->numCols * 2.63 + sizeof(float32), animatingThd ); 2.64 2.65 //copy values from orig matrix to local 2.66 copyTranspose( rightMatrix->numRows, rightMatrix->numCols, 2.67 @@ -182,34 +183,60 @@ 2.68 //The ideal size is the one takes the number of cycles to calculate 2.69 // such that calc time is equal or greater than min work-unit size 2.70 slicingStrucCarrier = 2.71 - calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr ); 2.72 + calcIdealSizeAndSliceDimensions(leftMatrix,rightMatrix,animatingThd); 2.73 2.74 //Make the results processor, now that know how many to wait for 2.75 - resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr ); 2.76 + resultsParams = VPThread__malloc( sizeof(ResultsParams), animatingThd ); 2.77 resultsParams->numSubMatrixPairs = 2.78 slicingStrucCarrier->leftRowSlices->numVals * 2.79 slicingStrucCarrier->rightColSlices->numVals * 2.80 slicingStrucCarrier->vecSlices->numVals; 2.81 - resultsParams->dividerPr = animPr; 2.82 + resultsParams->dividerPr = animatingThd; 2.83 resultsParams->numCols = rightMatrix->numCols; 2.84 resultsParams->numRows = leftMatrix->numRows; 2.85 resultsParams->resultArray = resultArray; 2.86 2.87 + //========== Set up global vars, including conds and mutexes ========== 2.88 + globals = VMS__malloc( sizeof(MatrixMultGlobals) ); 2.89 + VPThread__set_globals_to( globals ); 2.90 2.91 - resultPr = 2.92 - SSR__create_procr_with( &gatherResults, resultsParams, animPr); 2.93 + globals->results_mutex = VPThread__make_mutex( animatingThd ); 2.94 + globals->results_cond = VPThread__make_cond( globals->results_mutex, 2.95 + animatingThd ); 
2.96 2.97 + globals->vector_mutex = VPThread__make_mutex( animatingThd ); 2.98 + globals->vector_cond = VPThread__make_cond( globals->vector_mutex, 2.99 + animatingThd ); 2.100 + 2.101 + globals->start_mutex = VPThread__make_mutex( animatingThd ); 2.102 + globals->start_cond = VPThread__make_cond( globals->start_mutex, 2.103 + animatingThd ); 2.104 + //====================================================================== 2.105 + 2.106 + //get results-comm lock before create results-thd, to ensure it can't 2.107 + // signal that results are available before this thd is waiting on cond 2.108 + VPThread__mutex_lock( globals->results_mutex, animatingThd ); 2.109 + 2.110 + //also get the start lock & use to ensure no vector threads send a 2.111 + // signal before the results thread is waiting on vector cond 2.112 + VPThread__mutex_lock( globals->start_mutex, animatingThd ); 2.113 + 2.114 + 2.115 + VPThread__create_thread( &gatherResults, resultsParams, animatingThd ); 2.116 + 2.117 + //Now wait for results thd to signal that it has vector lock 2.118 + VPThread__cond_wait( globals->start_cond, animatingThd ); 2.119 + VPThread__mutex_unlock( globals->start_mutex, animatingThd );//done w/lock 2.120 + 2.121 //Make the sub-matrices, and pair them up, and make processor to 2.122 // calc product of each pair. 
2.123 makeSubMatricesAndProcrs( leftMatrix, rightMatrix, 2.124 slicingStrucCarrier, 2.125 - resultPr, animPr); 2.126 + resultPr, animatingThd); 2.127 2.128 - //result array is allocated externally, so no message from resultPr 2.129 - // however, do have to wait before printing out stats, so wait 2.130 - // for an empty handshake message 2.131 - msg = SSR__receive_from_to( resultPr, animPr ); 2.132 - } 2.133 + //Wait for results thread to say results are good 2.134 + VPThread__cond_wait( globals->results_cond, animatingThd ); 2.135 + } 2.136 2.137 2.138 //=============== Work done -- send results back ================= 2.139 @@ -220,11 +247,13 @@ 2.140 VMS__record_interval_end_in_probe( divideProbe ); 2.141 VMS__print_stats_of_all_probes(); 2.142 2.143 - //nothing left to do so dissipate, SSR will wait to shutdown and hence 2.144 - // make results available to outside until all the processors have 2.145 - // dissipated -- so no need to wait for results processor 2.146 + //nothing left to do so dissipate, VPThread will wait to shutdown, 2.147 + // making results available to outside, until all the processors have 2.148 + // dissipated -- so actually no need to wait for results processor 2.149 + //However, following the pattern, so done with comm, release lock 2.150 + VPThread__mutex_unlock( globals->results_mutex, animatingThd ); 2.151 2.152 - SSR__dissipate_procr( animPr ); //all procrs dissipate self at end 2.153 + VPThread__dissipate_thread( animatingThd ); //all procrs dissipate self at end 2.154 //when all of the processors have dissipated, the "create seed and do 2.155 // work" call in the entry point function returns 2.156 } 2.157 @@ -237,7 +266,7 @@ 2.158 float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; 2.159 SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 2.160 SlicingStrucCarrier *slicingStrucCarrier = 2.161 - SSR__malloc_to(sizeof(SlicingStrucCarrier), animPr); 2.162 + VPThread__malloc(sizeof(SlicingStrucCarrier), animPr); 2.163 2.164 
int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; 2.165 float64 numPrimitiveOpsInMinWorkUnit; 2.166 @@ -245,11 +274,11 @@ 2.167 2.168 //======= Calc ideal size of min-sized sub-matrix ======== 2.169 2.170 - //ask SSR for the number of cycles of the minimum work unit, at given 2.171 + //ask VPThread for the number of cycles of the minimum work unit, at given 2.172 // percent overhead then add a guess at overhead from this divider 2.173 - minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 ); 2.174 + minWorkUnitCycles = VPThread__giveMinWorkUnitCycles( .05 ); 2.175 2.176 - //ask SSR for number of cycles of the "primitive" op of matrix mult 2.177 + //ask VPThread for number of cycles of the "primitive" op of matrix mult 2.178 primitiveCycles = measureMatrixMultPrimitive( animPr ); 2.179 2.180 numPrimitiveOpsInMinWorkUnit = 2.181 @@ -259,7 +288,7 @@ 2.182 // then multiply by 5 because the primitive is 5x5 2.183 idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); 2.184 2.185 - idealNumWorkUnits = SSR__giveIdealNumWorkUnits(); 2.186 + idealNumWorkUnits = VPThread__giveIdealNumWorkUnits(); 2.187 2.188 idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); 2.189 idealSizeOfSide2 *= 0.6; //finer granularity to help load balance 2.190 @@ -312,7 +341,7 @@ 2.191 leftRowSlices = slicingStrucCarrier->leftRowSlices; 2.192 vecSlices = slicingStrucCarrier->vecSlices; 2.193 rightColSlices = slicingStrucCarrier->rightColSlices; 2.194 - SSR__free( slicingStrucCarrier, animPr ); 2.195 + VPThread__free( slicingStrucCarrier, animPr ); 2.196 2.197 //================ Make sub-matrices, given the slicing ================ 2.198 SubMatrix **leftSubMatrices, **rightSubMatrices; 2.199 @@ -363,7 +392,7 @@ 2.200 numLeftColIdxs = numColIdxs; 2.201 numRightColIdxs = numVecIdxs; 2.202 2.203 - numCores = SSR__give_number_of_cores_to_schedule_onto(); 2.204 + numCores = VPThread__give_number_of_cores_to_schedule_onto(); 2.205 2.206 numToPutOntoEachCore = 
numRowIdxs*numColIdxs/numCores; 2.207 leftOverFraction = 0; 2.208 @@ -380,7 +409,7 @@ 2.209 for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) 2.210 { 2.211 //Make the processor for the pair of sub-matrices 2.212 - subMatrixPairParams = SSR__malloc_to( sizeof(SMPairParams), 2.213 + subMatrixPairParams = VPThread__malloc( sizeof(SMPairParams), 2.214 animatingPr); 2.215 subMatrixPairParams->leftSubMatrix = 2.216 leftSubMatrices[ leftRowIdxOffset + vecIdx ]; 2.217 2.218 subMatrixPairParams->rightSubMatrix = 2.219 rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; 2.220 2.221 - subMatrixPairParams->resultPr = resultPr; 2.222 + subMatrixPairParams->resultPr = resultPr; 2.223 2.224 //put all pairs from the same vector onto same core 2.225 - SSR__create_procr_with_affinity( &calcSubMatrixProduct, 2.226 + VPThread__create_thread_with_affinity( &calcSubMatrixProduct, 2.227 subMatrixPairParams, 2.228 animatingPr, 2.229 coreToScheduleOnto ); 2.230 @@ -424,7 +453,6 @@ 2.231 2.232 } 2.233 } 2.234 - 2.235 } 2.236 2.237 2.238 @@ -447,7 +475,7 @@ 2.239 rowStartVals = rowSlices->startVals; 2.240 colStartVals = colSlices->startVals; 2.241 2.242 - subMatrices = SSR__malloc_to(numRowIdxs * numColIdxs * sizeof(SubMatrix*), 2.243 + subMatrices = VPThread__malloc(numRowIdxs * numColIdxs * sizeof(SubMatrix*), 2.244 animPr ); 2.245 2.246 for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 2.247 @@ -463,7 +491,7 @@ 2.248 startCol = colStartVals[colIdx]; 2.249 endCol = colStartVals[colIdx + 1] -1; 2.250 2.251 - newSubMatrix = SSR__malloc_to( sizeof(SubMatrix), animPr ); 2.252 + newSubMatrix = VPThread__malloc( sizeof(SubMatrix), animPr ); 2.253 newSubMatrix->numRows = endRow - startRow +1; 2.254 newSubMatrix->numCols = endCol - startCol +1; 2.255 newSubMatrix->origMatrix = origMatrix; 2.256 @@ -496,11 +524,11 @@ 2.257 { 2.258 subMatrix = subMatrices[ rowOffset + colIdx ]; 2.259 if( subMatrix->alreadyCopied ) 2.260 - SSR__free( subMatrix->array, animPr ); 
2.261 - SSR__free( subMatrix, animPr ); 2.262 + VPThread__free( subMatrix->array, animPr ); 2.263 + VPThread__free( subMatrix, animPr ); 2.264 } 2.265 } 2.266 - SSR__free( subMatrices, animPr ); 2.267 + VPThread__free( subMatrices, animPr ); 2.268 } 2.269 2.270 2.271 @@ -510,12 +538,12 @@ 2.272 VirtProcr *animPr ) 2.273 { float32 residualAcc = 0; 2.274 int numSlices, i, *startVals, sizeOfSlice, endCondition; 2.275 - SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr); 2.276 + SlicingStruc *slicingStruc = VPThread__malloc(sizeof(SlicingStruc), animPr); 2.277 2.278 //calc size of matrix need to hold start vals -- 2.279 numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); 2.280 2.281 - startVals = SSR__malloc_to( (numSlices + 1) * sizeof(int32), animPr ); 2.282 + startVals = VPThread__malloc( (numSlices + 1) * sizeof(int32), animPr ); 2.283 2.284 //Calc the upper limit of start value -- when get above this, end loop 2.285 // by saving highest value of the matrix dimension to access, plus 1 2.286 @@ -549,8 +577,8 @@ 2.287 void 2.288 freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ) 2.289 { 2.290 - SSR__free( slicingStruc->startVals, animPr ); 2.291 - SSR__free( slicingStruc, animPr ); 2.292 + VPThread__free( slicingStruc->startVals, animPr ); 2.293 + VPThread__free( slicingStruc, animPr ); 2.294 } 2.295 2.296 2.297 @@ -561,9 +589,9 @@ 2.298 float32 *res, *left, *right; 2.299 2.300 //setup inputs 2.301 - left = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); 2.302 - right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); 2.303 - res = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); 2.304 + left = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr ); 2.305 + right = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr ); 2.306 + res = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr ); 2.307 2.308 for( r = 0; r < 5; r++ ) 2.309 { 2.310 @@ -575,7 +603,7 @@ 2.311 } 2.312 2.313 //do primitive 2.314 - 
SSR__start_primitive(); //for now, just takes time stamp 2.315 + VPThread__start_primitive(); //for now, just takes time stamp 2.316 for( r = 0; r < 5; r++ ) 2.317 { 2.318 for( c = 0; c < 5; c++ ) 2.319 @@ -587,11 +615,11 @@ 2.320 } 2.321 } 2.322 numCycles = 2.323 - SSR__end_primitive_and_give_cycles(); 2.324 + VPThread__end_primitive_and_give_cycles(); 2.325 2.326 - SSR__free( left, animPr ); 2.327 - SSR__free( right, animPr ); 2.328 - SSR__free( res, animPr ); 2.329 + VPThread__free( left, animPr ); 2.330 + VPThread__free( right, animPr ); 2.331 + VPThread__free( res, animPr ); 2.332 2.333 return numCycles; 2.334 }
3.1 --- a/src/Application/VPThread__Matrix_Mult/EntryPoint.c Wed Nov 10 22:26:57 2010 -0800 3.2 +++ b/src/Application/VPThread__Matrix_Mult/EntryPoint.c Tue Nov 16 16:02:51 2010 +0100 3.3 @@ -8,7 +8,7 @@ 3.4 3.5 #include <math.h> 3.6 3.7 -#include "SSR_Matrix_Mult.h" 3.8 +#include "VPThread__Matrix_Mult.h" 3.9 3.10 3.11 3.12 @@ -54,8 +54,8 @@ 3.13 3.14 //create divider processor, start doing the work, and wait till done 3.15 //This function is the "border crossing" between normal code and SSR 3.16 - SSR__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs, 3.17 - dividerParams ); 3.18 + VPThread__create_seed_procr_and_do_work(&divideWorkIntoSubMatrixPairProcrs, 3.19 + dividerParams ); 3.20 3.21 free( dividerParams ); 3.22 return resMatrix;
4.1 --- a/src/Application/VPThread__Matrix_Mult/Result_Pr.c Wed Nov 10 22:26:57 2010 -0800 4.2 +++ b/src/Application/VPThread__Matrix_Mult/Result_Pr.c Tue Nov 16 16:02:51 2010 +0100 4.3 @@ -6,7 +6,7 @@ 4.4 * 4.5 */ 4.6 4.7 -#include "SSR_Matrix_Mult.h" 4.8 +#include "VPThread__Matrix_Mult.h" 4.9 4.10 //===================== 4.11 void inline 4.12 @@ -26,13 +26,30 @@ 4.13 *After the count reaches the point that all results have been received, it 4.14 * returns the result matrix and dissipates. 4.15 */ 4.16 -void gatherResults( void *_params, VirtProcr *animatingPr ) 4.17 +void gatherResults( void *_params, VirtProcr *animatingThd ) 4.18 { VirtProcr *dividerPr; 4.19 ResultsParams *params; 4.20 int row, col, numRows, numCols, numSubMatrixPairs, count=0; 4.21 float32 *resultArray; 4.22 void *msg; 4.23 SMPairParams *resParams; 4.24 + //====================== thread stuff ======================= 4.25 + MatrixMultGlobals *globals =(MatrixMultGlobals *)VPThread__give_globals(); 4.26 + 4.27 + 4.28 + //get vector-comm lock before loop, so that this thd keeps lock after 4.29 + // one wait until it enters the next wait -- forces see-saw btwn 4.30 + // waiters and signalers -- wait-signal-wait-signal-... 
4.31 + VPThread__mutex_lock( globals->vector_mutex, animatingThd ); 4.32 + 4.33 + //Tell divider that have the vector lock -- so it's sure won't miss any 4.34 + // signals from the vector-threads it's about to create 4.35 + //Don't need a signal variable -- this thd can't be created until 4.36 + // divider thd already has the start lock 4.37 + VPThread__mutex_lock( globals->start_mutex, animatingThd );//finish wait 4.38 + VPThread__cond_signal( globals->start_cond, animatingThd ); 4.39 + VPThread__mutex_unlock( globals->start_mutex, animatingThd );//finish wait 4.40 + //=========================================================== 4.41 4.42 DEBUG( dbgAppFlow, "start resultPr\n") 4.43 4.44 @@ -47,9 +64,16 @@ 4.45 4.46 while( count < numSubMatrixPairs ) 4.47 { 4.48 - msg = SSR__receive_type_to( RESULTS_MSG, animatingPr ); 4.49 + //receive a vector-result from a vector-thread 4.50 + VPThread__cond_wait( globals->vector_cond, animatingThd ); 4.51 4.52 - resParams = (SMPairParams *)msg; 4.53 + //At this point, animating thread owns the vector lock, so all 4.54 + // pairs trying to signal they have a result are waiting to get that 4.55 + // lock -- only one gets it at a time, and when signal, this thd 4.56 + // gets the lock and does the body of this loop, then when does the 4.57 + // wait again, that releases the lock for next pair-thread to get it 4.58 + resParams = globals->currSMPairParams; 4.59 + 4.60 accumulateResult( resultArray, resParams->partialResultArray, 4.61 resParams->leftSubMatrix->origStartRow, 4.62 resParams->leftSubMatrix->numRows, 4.63 @@ -57,22 +81,22 @@ 4.64 resParams->rightSubMatrix->numCols, 4.65 resParams->rightSubMatrix->origMatrix->numCols ); 4.66 4.67 - SSR__free( resParams->partialResultArray, animatingPr ); 4.68 + VPThread__free( resParams->partialResultArray, animatingThd ); 4.69 4.70 //there is only one copy of results procr, so can update numUsesLeft 4.71 // without concurrency worries. 
When zero, free the sub-matrix 4.72 resParams->leftSubMatrix->numUsesLeft -= 1; 4.73 if( resParams->leftSubMatrix->numUsesLeft == 0 ) 4.74 { 4.75 - SSR__free( resParams->leftSubMatrix->array, animatingPr ); 4.76 - SSR__free( resParams->leftSubMatrix, animatingPr ); 4.77 + VPThread__free( resParams->leftSubMatrix->array, animatingThd ); 4.78 + VPThread__free( resParams->leftSubMatrix, animatingThd ); 4.79 } 4.80 4.81 resParams->rightSubMatrix->numUsesLeft -= 1; 4.82 if( resParams->rightSubMatrix->numUsesLeft == 0 ) 4.83 { 4.84 - SSR__free( resParams->rightSubMatrix->array, animatingPr ); 4.85 - SSR__free( resParams->rightSubMatrix, animatingPr ); 4.86 + VPThread__free( resParams->rightSubMatrix->array, animatingThd ); 4.87 + VPThread__free( resParams->rightSubMatrix, animatingThd ); 4.88 } 4.89 4.90 //count of how many sub-matrix pairs accumulated so know when done 4.91 @@ -82,9 +106,20 @@ 4.92 //Done -- could just dissipate -- SSR will wait for all processors to 4.93 // dissipate before shutting down, and thereby making results avaial to 4.94 // outside, so no need to stop the divider from dissipating, so no need 4.95 - // to send a hand-shake message to it -- bug makes debug easier 4.96 - SSR__send_from_to( NULL, animatingPr, dividerPr ); 4.97 - SSR__dissipate_procr( animatingPr ); //frees any data owned by procr 4.98 + // to send a hand-shake message to it -- but makes debug easier 4.99 + //However, following pattern, so all comms done, release lock 4.100 + VPThread__mutex_unlock( globals->vector_mutex, animatingThd ); 4.101 + 4.102 + //Send result to divider (seed) thread 4.103 + // note, divider thd had to hold the results-comm lock before creating 4.104 + // this thread, to be sure no race 4.105 + VPThread__mutex_lock( globals->results_mutex, animatingThd ); 4.106 + //globals->results = resultMatrixArray; 4.107 + VPThread__cond_signal( globals->results_cond, animatingThd ); 4.108 + VPThread__mutex_unlock( globals->results_mutex, animatingThd ); //releases 
4.109 + //divider thread from its wait, at point this executes 4.110 + 4.111 + VPThread__dissipate_thread( animatingThd ); //frees any data owned by procr 4.112 } 4.113 4.114 void inline
5.1 --- a/src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h Wed Nov 10 22:26:57 2010 -0800 5.2 +++ b/src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h Tue Nov 16 16:02:51 2010 +0100 5.3 @@ -8,7 +8,7 @@ 5.4 5.5 #include <stdio.h> 5.6 5.7 -#include "../../SSR_lib/SSR.h" 5.8 +#include "../../VPThread_lib/VPThread.h" 5.9 #include "../Matrix_Mult.h" 5.10 5.11 5.12 @@ -81,6 +81,29 @@ 5.13 RESULTS_MSG = 1 5.14 }; 5.15 5.16 + 5.17 +typedef struct 5.18 + { 5.19 + //for communicating sub-matrix-pair results to results Thd 5.20 + int32 vector_mutex; 5.21 + int32 vector_cond; 5.22 + SMPairParams *currSMPairParams; 5.23 + 5.24 + //for communicating results array back to seed (divider) Thd 5.25 + int32 results_mutex; 5.26 + int32 results_cond; 5.27 + float32 *results; 5.28 + 5.29 + //for ensuring results thd has vector lock before making vector thds 5.30 + int32 start_mutex; 5.31 + int32 start_cond; 5.32 + 5.33 + Matrix *rightMatrix; 5.34 + Matrix *resultMatrix; 5.35 + } 5.36 +MatrixMultGlobals; 5.37 + 5.38 + 5.39 //============================= Processor Functions ========================= 5.40 void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr ); 5.41 void calcSubMatrixProduct( void *data, VirtProcr *animatingPr );
6.1 --- a/src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c Wed Nov 10 22:26:57 2010 -0800 6.2 +++ b/src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c Tue Nov 16 16:02:51 2010 +0100 6.3 @@ -8,7 +8,7 @@ 6.4 6.5 #include <string.h> 6.6 6.7 -#include "SSR_Matrix_Mult.h" 6.8 +#include "VPThread__Matrix_Mult.h" 6.9 6.10 6.11 6.12 @@ -50,6 +50,7 @@ 6.13 VirtProcr *resultPr; 6.14 float32 *leftArray, *rightArray, *resArray; 6.15 SubMatrix *leftSubMatrix, *rightSubMatrix; 6.16 + MatrixMultGlobals *globals =(MatrixMultGlobals *)VPThread__give_globals(); 6.17 6.18 DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID) 6.19 int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx", 6.20 @@ -72,7 +73,7 @@ 6.21 6.22 int32 6.23 resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); 6.24 - resArray = SSR__malloc_to( resSize, animatingPr ); 6.25 + resArray = VPThread__malloc( resSize, animatingPr ); 6.26 memset( resArray, 0, resSize ); 6.27 6.28 6.29 @@ -91,8 +92,15 @@ 6.30 6.31 VMS__record_interval_end_in_probe( subMatrixProbe ); 6.32 6.33 - SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr ); 6.34 - SSR__dissipate_procr( animatingPr ); 6.35 + //Send result to results thread 6.36 + //This pattern works 'cause only get lock when results thd inside wait 6.37 + VPThread__mutex_lock( globals->vector_mutex, animatingPr ); 6.38 + globals->currSMPairParams = params; 6.39 + VPThread__cond_signal( globals->vector_cond, animatingPr ); 6.40 + VPThread__mutex_unlock( globals->vector_mutex, animatingPr );//release 6.41 + //wait-er -- cond_signal implemented such that wait-er gets lock, no other 6.42 + 6.43 + VPThread__dissipate_thread( animatingPr ); 6.44 } 6.45 6.46 6.47 @@ -226,7 +234,7 @@ 6.48 float32 *origArray, *subArray; 6.49 6.50 if( subMatrix->alreadyCopied ) return; 6.51 - SSR__start_singleton( copyMatrixSingleton, &&EndOfTransSingleton, animPr); 6.52 + VPThread__start_singleton( copyTransposeSingleton, animPr); 
6.53 6.54 origMatrix = subMatrix->origMatrix; 6.55 origArray = origMatrix->array; 6.56 @@ -236,7 +244,7 @@ 6.57 origStartCol = subMatrix->origStartCol; 6.58 origStride = origMatrix->numCols; 6.59 6.60 - subArray = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr); 6.61 + subArray = VPThread__malloc( numRows * numCols *sizeof(float32),animPr); 6.62 subMatrix->array = subArray; 6.63 6.64 //copy values from orig matrix to local 6.65 @@ -244,9 +252,8 @@ 6.66 origStartRow, origStartCol, origStride, 6.67 subArray, origArray ); 6.68 6.69 - subMatrix->alreadyCopied = TRUE; //must be last thing before label 6.70 - EndOfTransSingleton: 6.71 - return; 6.72 + VPThread__end_singleton( copyTransposeSingleton, animPr); 6.73 + subMatrix->alreadyCopied = TRUE; //anywhere after singleton work finished 6.74 } 6.75 6.76 6.77 @@ -259,13 +266,12 @@ 6.78 6.79 //This lets only a single VP execute the code between start and 6.80 // end -- using start and end so that work runs outside the master. 6.81 - //Inside, if a second VP ever executes the start, it will be returned 6.82 - // from the end-point. 6.83 - //Note, for non-GCC, can add a second SSR call at the end, and inside 6.84 - // that one, look at the stack at the return addr & save that in an 6.85 - // array indexed by singletonID 6.86 - if( subMatrix->alreadyCopied ) return; 6.87 - SSR__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton, animPr ); 6.88 + //If a second VP ever executes the start, it will be returned 6.89 + // from the end-point. If it executions start after another but before 6.90 + // that other has finished, this one will remain suspended until the 6.91 + // other finishes, then be resumed from the end-point. 
6.92 + if( subMatrix->alreadyCopied ) return; //an optimization -- set below 6.93 + VPThread__start_singleton( copyMatrixSingleton, animPr ); 6.94 6.95 6.96 origMatrix = subMatrix->origMatrix; 6.97 @@ -276,7 +282,7 @@ 6.98 origStartCol = subMatrix->origStartCol; 6.99 origStride = origMatrix->numCols; 6.100 6.101 - subArray = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr); 6.102 + subArray = VPThread__malloc( numRows * numCols *sizeof(float32),animPr); 6.103 subMatrix->array = subArray; 6.104 6.105 //copy values from orig matrix to local 6.106 @@ -293,7 +299,6 @@ 6.107 } 6.108 } 6.109 6.110 - subMatrix->alreadyCopied = TRUE; //must be last thing before label 6.111 - EndOfCopySingleton: 6.112 - return; 6.113 + subMatrix->alreadyCopied = TRUE; //must be after singleton work finished 6.114 + VPThread__end_singleton( copyMatrixSingleton, animPr ); 6.115 }
