# HG changeset patch # User Me # Date 1289919771 -3600 # Node ID 133633d1c10f32526f25affc7bac6287652070f9 # Parent 8d14fe28a7828920362a23fbfde81a877651e403 First version modified from SSR copy -- not working yet diff -r 8d14fe28a782 -r 133633d1c10f src/Application/Matrix_Mult.h --- a/src/Application/Matrix_Mult.h Wed Nov 10 22:26:57 2010 -0800 +++ b/src/Application/Matrix_Mult.h Tue Nov 16 16:02:51 2010 +0100 @@ -10,7 +10,7 @@ #include #include -#include "../SSR_lib/VMS/VMS_primitive_data_types.h" +#include "../VPThread_lib/VMS/VMS_primitive_data_types.h" #include "ParamHelper/Param.h" //============================== Structures ============================== diff -r 8d14fe28a782 -r 133633d1c10f src/Application/VPThread__Matrix_Mult/Divide_Pr.c --- a/src/Application/VPThread__Matrix_Mult/Divide_Pr.c Wed Nov 10 22:26:57 2010 -0800 +++ b/src/Application/VPThread__Matrix_Mult/Divide_Pr.c Tue Nov 16 16:02:51 2010 +0100 @@ -7,7 +7,7 @@ */ -#include "SSR_Matrix_Mult.h" +#include "VPThread__Matrix_Mult.h" #include #include @@ -65,11 +65,11 @@ * processors, * then does a receive of a message from the result processor that gives * the divider ownership of the result matrix. - * Finally, the divider returns the result matrix out of the SSR system. + * Finally, the divider returns the result matrix out of the VPThread system. * * Divider chooses the size of sub-matrices via an algorithm that tries to * keep the minimum work above a threshold. 
The threshold is machine- - * dependent, so ask SSR for min work-unit time to get a + * dependent, so ask VPThread for min work-unit time to get a * given overhead * * Divide min work-unit cycles by measured-cycles for one matrix-cell @@ -114,21 +114,22 @@ */ void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, - VirtProcr *animPr ) + VirtProcr *animatingThd ) { VirtProcr *resultPr; DividerParams *dividerParams; ResultsParams *resultsParams; Matrix *leftMatrix, *rightMatrix, *resultMatrix; void *msg; SlicingStrucCarrier *slicingStrucCarrier; - float32 *resultArray; //points to array inside result matrix - + float32 *resultArray; //points to array inside result matrix + MatrixMultGlobals *globals; + DEBUG( dbgAppFlow, "start divide\n") int32 divideProbe = VMS__create_single_interval_probe( "divideProbe", - animPr ); - VMS__record_sched_choice_into_probe( divideProbe, animPr ); + animatingThd ); + VMS__record_sched_choice_into_probe( divideProbe, animatingThd ); VMS__record_interval_start_in_probe( divideProbe ); //=========== Setup -- make local copies of ptd-to-things, malloc, aso @@ -160,9 +161,9 @@ //transpose the right matrix float32 * - transRightArray = SSR__malloc_to( rightMatrix->numRows * - rightMatrix->numCols * sizeof(float32), - animPr ); + transRightArray = + VPThread__malloc( rightMatrix->numRows * rightMatrix->numCols * + sizeof(float32), animatingThd ); //copy values from orig matrix to local copyTranspose( rightMatrix->numRows, rightMatrix->numCols, @@ -182,34 +183,60 @@ //The ideal size is the one takes the number of cycles to calculate // such that calc time is equal or greater than min work-unit size slicingStrucCarrier = - calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr ); + calcIdealSizeAndSliceDimensions(leftMatrix,rightMatrix,animatingThd); //Make the results processor, now that know how many to wait for - resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr ); + resultsParams = VPThread__malloc( 
sizeof(ResultsParams), animatingThd ); resultsParams->numSubMatrixPairs = slicingStrucCarrier->leftRowSlices->numVals * slicingStrucCarrier->rightColSlices->numVals * slicingStrucCarrier->vecSlices->numVals; - resultsParams->dividerPr = animPr; + resultsParams->dividerPr = animatingThd; resultsParams->numCols = rightMatrix->numCols; resultsParams->numRows = leftMatrix->numRows; resultsParams->resultArray = resultArray; + //========== Set up global vars, including conds and mutexes ========== + globals = VMS__malloc( sizeof(MatrixMultGlobals) ); + VPThread__set_globals_to( globals ); - resultPr = - SSR__create_procr_with( &gatherResults, resultsParams, animPr); + globals->results_mutex = VPThread__make_mutex( animatingThd ); + globals->results_cond = VPThread__make_cond( globals->results_mutex, + animatingThd ); + globals->vector_mutex = VPThread__make_mutex( animatingThd ); + globals->vector_cond = VPThread__make_cond( globals->vector_mutex, + animatingThd ); + + globals->start_mutex = VPThread__make_mutex( animatingThd ); + globals->start_cond = VPThread__make_cond( globals->start_mutex, + animatingThd ); + //====================================================================== + + //get results-comm lock before create results-thd, to ensure it can't + // signal that results are available before this thd is waiting on cond + VPThread__mutex_lock( globals->results_mutex, animatingThd ); + + //also get the start lock & use to ensure no vector threads send a + // signal before the results thread is waiting on vector cond + VPThread__mutex_lock( globals->start_mutex, animatingThd ); + + + VPThread__create_thread( &gatherResults, resultsParams, animatingThd ); + + //Now wait for results thd to signal that it has vector lock + VPThread__cond_wait( globals->start_cond, animatingThd ); + VPThread__mutex_unlock( globals->start_mutex, animatingThd );//done w/lock + //Make the sub-matrices, and pair them up, and make processor to // calc product of each pair. 
makeSubMatricesAndProcrs( leftMatrix, rightMatrix, slicingStrucCarrier, - resultPr, animPr); + resultPr, animatingThd); - //result array is allocated externally, so no message from resultPr - // however, do have to wait before printing out stats, so wait - // for an empty handshake message - msg = SSR__receive_from_to( resultPr, animPr ); - } + //Wait for results thread to say results are good + VPThread__cond_wait( globals->results_cond, animatingThd ); + } //=============== Work done -- send results back ================= @@ -220,11 +247,13 @@ VMS__record_interval_end_in_probe( divideProbe ); VMS__print_stats_of_all_probes(); - //nothing left to do so dissipate, SSR will wait to shutdown and hence - // make results available to outside until all the processors have - // dissipated -- so no need to wait for results processor + //nothing left to do so dissipate, VPThread will wait to shutdown, + // making results available to outside, until all the processors have + // dissipated -- so actually no need to wait for results processor + //However, following the pattern, so done with comm, release lock + VPThread__mutex_unlock( globals->results_mutex, animatingThd ); - SSR__dissipate_procr( animPr ); //all procrs dissipate self at end + VPThread__dissipate_thread( animatingThd ); //all procrs dissipate self at end //when all of the processors have dissipated, the "create seed and do // work" call in the entry point function returns } @@ -237,7 +266,7 @@ float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; SlicingStrucCarrier *slicingStrucCarrier = - SSR__malloc_to(sizeof(SlicingStrucCarrier), animPr); + VPThread__malloc(sizeof(SlicingStrucCarrier), animPr); int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; float64 numPrimitiveOpsInMinWorkUnit; @@ -245,11 +274,11 @@ //======= Calc ideal size of min-sized sub-matrix ======== - //ask SSR for the number of cycles of the minimum work unit, at given + 
//ask VPThread for the number of cycles of the minimum work unit, at given // percent overhead then add a guess at overhead from this divider - minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 ); + minWorkUnitCycles = VPThread__giveMinWorkUnitCycles( .05 ); - //ask SSR for number of cycles of the "primitive" op of matrix mult + //ask VPThread for number of cycles of the "primitive" op of matrix mult primitiveCycles = measureMatrixMultPrimitive( animPr ); numPrimitiveOpsInMinWorkUnit = @@ -259,7 +288,7 @@ // then multiply by 5 because the primitive is 5x5 idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); - idealNumWorkUnits = SSR__giveIdealNumWorkUnits(); + idealNumWorkUnits = VPThread__giveIdealNumWorkUnits(); idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); idealSizeOfSide2 *= 0.6; //finer granularity to help load balance @@ -312,7 +341,7 @@ leftRowSlices = slicingStrucCarrier->leftRowSlices; vecSlices = slicingStrucCarrier->vecSlices; rightColSlices = slicingStrucCarrier->rightColSlices; - SSR__free( slicingStrucCarrier, animPr ); + VPThread__free( slicingStrucCarrier, animPr ); //================ Make sub-matrices, given the slicing ================ SubMatrix **leftSubMatrices, **rightSubMatrices; @@ -363,7 +392,7 @@ numLeftColIdxs = numColIdxs; numRightColIdxs = numVecIdxs; - numCores = SSR__give_number_of_cores_to_schedule_onto(); + numCores = VPThread__give_number_of_cores_to_schedule_onto(); numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; leftOverFraction = 0; @@ -380,7 +409,7 @@ for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) { //Make the processor for the pair of sub-matrices - subMatrixPairParams = SSR__malloc_to( sizeof(SMPairParams), + subMatrixPairParams = VPThread__malloc( sizeof(SMPairParams), animatingPr); subMatrixPairParams->leftSubMatrix = leftSubMatrices[ leftRowIdxOffset + vecIdx ]; @@ -388,10 +417,10 @@ subMatrixPairParams->rightSubMatrix = rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; 
- subMatrixPairParams->resultPr = resultPr; + subMatrixPairParams->resultPr = resultPr fix_this; //put all pairs from the same vector onto same core - SSR__create_procr_with_affinity( &calcSubMatrixProduct, + VPThread__create_thread_with_affinity( &calcSubMatrixProduct, subMatrixPairParams, animatingPr, coreToScheduleOnto ); @@ -424,7 +453,6 @@ } } - } @@ -447,7 +475,7 @@ rowStartVals = rowSlices->startVals; colStartVals = colSlices->startVals; - subMatrices = SSR__malloc_to(numRowIdxs * numColIdxs * sizeof(SubMatrix*), + subMatrices = VPThread__malloc(numRowIdxs * numColIdxs * sizeof(SubMatrix*), animPr ); for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) @@ -463,7 +491,7 @@ startCol = colStartVals[colIdx]; endCol = colStartVals[colIdx + 1] -1; - newSubMatrix = SSR__malloc_to( sizeof(SubMatrix), animPr ); + newSubMatrix = VPThread__malloc( sizeof(SubMatrix), animPr ); newSubMatrix->numRows = endRow - startRow +1; newSubMatrix->numCols = endCol - startCol +1; newSubMatrix->origMatrix = origMatrix; @@ -496,11 +524,11 @@ { subMatrix = subMatrices[ rowOffset + colIdx ]; if( subMatrix->alreadyCopied ) - SSR__free( subMatrix->array, animPr ); - SSR__free( subMatrix, animPr ); + VPThread__free( subMatrix->array, animPr ); + VPThread__free( subMatrix, animPr ); } } - SSR__free( subMatrices, animPr ); + VPThread__free( subMatrices, animPr ); } @@ -510,12 +538,12 @@ VirtProcr *animPr ) { float32 residualAcc = 0; int numSlices, i, *startVals, sizeOfSlice, endCondition; - SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr); + SlicingStruc *slicingStruc = VPThread__malloc(sizeof(SlicingStruc), animPr); //calc size of matrix need to hold start vals -- numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); - startVals = SSR__malloc_to( (numSlices + 1) * sizeof(int32), animPr ); + startVals = VPThread__malloc( (numSlices + 1) * sizeof(int32), animPr ); //Calc the upper limit of start value -- when get above this, end loop // by saving highest 
value of the matrix dimension to access, plus 1 @@ -549,8 +577,8 @@ void freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ) { - SSR__free( slicingStruc->startVals, animPr ); - SSR__free( slicingStruc, animPr ); + VPThread__free( slicingStruc->startVals, animPr ); + VPThread__free( slicingStruc, animPr ); } @@ -561,9 +589,9 @@ float32 *res, *left, *right; //setup inputs - left = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); - right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); - res = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); + left = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr ); + right = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr ); + res = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr ); for( r = 0; r < 5; r++ ) { @@ -575,7 +603,7 @@ } //do primitive - SSR__start_primitive(); //for now, just takes time stamp + VPThread__start_primitive(); //for now, just takes time stamp for( r = 0; r < 5; r++ ) { for( c = 0; c < 5; c++ ) @@ -587,11 +615,11 @@ } } numCycles = - SSR__end_primitive_and_give_cycles(); + VPThread__end_primitive_and_give_cycles(); - SSR__free( left, animPr ); - SSR__free( right, animPr ); - SSR__free( res, animPr ); + VPThread__free( left, animPr ); + VPThread__free( right, animPr ); + VPThread__free( res, animPr ); return numCycles; } diff -r 8d14fe28a782 -r 133633d1c10f src/Application/VPThread__Matrix_Mult/EntryPoint.c --- a/src/Application/VPThread__Matrix_Mult/EntryPoint.c Wed Nov 10 22:26:57 2010 -0800 +++ b/src/Application/VPThread__Matrix_Mult/EntryPoint.c Tue Nov 16 16:02:51 2010 +0100 @@ -8,7 +8,7 @@ #include -#include "SSR_Matrix_Mult.h" +#include "VPThread__Matrix_Mult.h" @@ -54,8 +54,8 @@ //create divider processor, start doing the work, and wait till done //This function is the "border crossing" between normal code and SSR - SSR__create_seed_procr_and_do_work( ÷WorkIntoSubMatrixPairProcrs, - dividerParams ); + 
VPThread__create_seed_procr_and_do_work(÷WorkIntoSubMatrixPairProcrs, + dividerParams ); free( dividerParams ); return resMatrix; diff -r 8d14fe28a782 -r 133633d1c10f src/Application/VPThread__Matrix_Mult/Result_Pr.c --- a/src/Application/VPThread__Matrix_Mult/Result_Pr.c Wed Nov 10 22:26:57 2010 -0800 +++ b/src/Application/VPThread__Matrix_Mult/Result_Pr.c Tue Nov 16 16:02:51 2010 +0100 @@ -6,7 +6,7 @@ * */ -#include "SSR_Matrix_Mult.h" +#include "VPThread__Matrix_Mult.h" //===================== void inline @@ -26,13 +26,30 @@ *After the count reaches the point that all results have been received, it * returns the result matrix and dissipates. */ -void gatherResults( void *_params, VirtProcr *animatingPr ) +void gatherResults( void *_params, VirtProcr *animatingThd ) { VirtProcr *dividerPr; ResultsParams *params; int row, col, numRows, numCols, numSubMatrixPairs, count=0; float32 *resultArray; void *msg; SMPairParams *resParams; + //====================== thread stuff ======================= + MatrixMultGlobals *globals =(MatrixMultGlobals *)VPThread__give_globals(); + + + //get vector-comm lock before loop, so that this thd keeps lock after + // one wait until it enters the next wait -- forces see-saw btwn + // waiters and signalers -- wait-signal-wait-signal-... 
+ VPThread__mutex_lock( globals->vector_mutex, animatingThd ); + + //Tell divider that have the vector lock -- so it's sure won't miss any + // signals from the vector-threads it's about to create + //Don't need a signal variable -- this thd can't be created until + // divider thd already has the start lock + VPThread__mutex_lock( globals->start_mutex, animatingThd );//finish wait + VPThread__cond_signal( globals->start_cond, animatingThd ); + VPThread__mutex_unlock( globals->start_mutex, animatingThd );//finish wait + //=========================================================== DEBUG( dbgAppFlow, "start resultPr\n") @@ -47,9 +64,16 @@ while( count < numSubMatrixPairs ) { - msg = SSR__receive_type_to( RESULTS_MSG, animatingPr ); + //receive a vector-result from a vector-thread + VPThread__cond_wait( globals->vector_cond, animatingThd ); - resParams = (SMPairParams *)msg; + //At this point, animating thread owns the vector lock, so all + // pairs trying to signal they have a result are waiting to get that + // lock -- only one gets it at a time, and when signal, this thd + // gets the lock and does the body of this loop, then when does the + // wait again, that releases the lock for next pair-thread to get it + resParams = globals->currSMPairParams; + accumulateResult( resultArray, resParams->partialResultArray, resParams->leftSubMatrix->origStartRow, resParams->leftSubMatrix->numRows, @@ -57,22 +81,22 @@ resParams->rightSubMatrix->numCols, resParams->rightSubMatrix->origMatrix->numCols ); - SSR__free( resParams->partialResultArray, animatingPr ); + VPThread__free( resParams->partialResultArray, animatingThd ); //there is only one copy of results procr, so can update numUsesLeft // without concurrency worries. 
When zero, free the sub-matrix resParams->leftSubMatrix->numUsesLeft -= 1; if( resParams->leftSubMatrix->numUsesLeft == 0 ) { - SSR__free( resParams->leftSubMatrix->array, animatingPr ); - SSR__free( resParams->leftSubMatrix, animatingPr ); + VPThread__free( resParams->leftSubMatrix->array, animatingThd ); + VPThread__free( resParams->leftSubMatrix, animatingThd ); } resParams->rightSubMatrix->numUsesLeft -= 1; if( resParams->rightSubMatrix->numUsesLeft == 0 ) { - SSR__free( resParams->rightSubMatrix->array, animatingPr ); - SSR__free( resParams->rightSubMatrix, animatingPr ); + VPThread__free( resParams->rightSubMatrix->array, animatingThd ); + VPThread__free( resParams->rightSubMatrix, animatingThd ); } //count of how many sub-matrix pairs accumulated so know when done @@ -82,9 +106,20 @@ //Done -- could just dissipate -- SSR will wait for all processors to // dissipate before shutting down, and thereby making results avaial to // outside, so no need to stop the divider from dissipating, so no need - // to send a hand-shake message to it -- bug makes debug easier - SSR__send_from_to( NULL, animatingPr, dividerPr ); - SSR__dissipate_procr( animatingPr ); //frees any data owned by procr + // to send a hand-shake message to it -- but makes debug easier + //However, following pattern, so all comms done, release lock + VPThread__mutex_unlock( globals->vector_mutex, animatingThd ); + + //Send result to divider (seed) thread + // note, divider thd had to hold the results-comm lock before creating + // this thread, to be sure no race + VPThread__mutex_lock( globals->results_mutex, animatingThd ); + //globals->results = resultMatrixArray; + VPThread__cond_signal( globals->results_cond, animatingThd ); + VPThread__mutex_unlock( globals->results_mutex, animatingThd ); //releases + //divider thread from its wait, at point this executes + + VPThread__dissipate_thread( animatingThd ); //frees any data owned by procr } void inline diff -r 8d14fe28a782 -r 133633d1c10f 
src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h --- a/src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h Wed Nov 10 22:26:57 2010 -0800 +++ b/src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h Tue Nov 16 16:02:51 2010 +0100 @@ -8,7 +8,7 @@ #include -#include "../../SSR_lib/SSR.h" +#include "../../VPThread_lib/VPThread.h" #include "../Matrix_Mult.h" @@ -81,6 +81,29 @@ RESULTS_MSG = 1 }; + +typedef struct + { + //for communicating sub-matrix-pair results to results Thd + int32 vector_mutex; + int32 vector_cond; + SMPairParams *currSMPairParams; + + //for communicating results array back to seed (divider) Thd + int32 results_mutex; + int32 results_cond; + float32 *results; + + //for ensuring results thd has vector lock before making vector thds + int32 start_mutex; + int32 start_cond; + + Matrix *rightMatrix; + Matrix *resultMatrix; + } +MatrixMultGlobals; + + //============================= Processor Functions ========================= void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr ); void calcSubMatrixProduct( void *data, VirtProcr *animatingPr ); diff -r 8d14fe28a782 -r 133633d1c10f src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c --- a/src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c Wed Nov 10 22:26:57 2010 -0800 +++ b/src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c Tue Nov 16 16:02:51 2010 +0100 @@ -8,7 +8,7 @@ #include -#include "SSR_Matrix_Mult.h" +#include "VPThread__Matrix_Mult.h" @@ -50,6 +50,7 @@ VirtProcr *resultPr; float32 *leftArray, *rightArray, *resArray; SubMatrix *leftSubMatrix, *rightSubMatrix; + MatrixMultGlobals *globals =(MatrixMultGlobals *)VPThread__give_globals(); DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID) int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx", @@ -72,7 +73,7 @@ int32 resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); - resArray = SSR__malloc_to( resSize, animatingPr ); + resArray = 
VPThread__malloc( resSize, animatingPr ); memset( resArray, 0, resSize ); @@ -91,8 +92,15 @@ VMS__record_interval_end_in_probe( subMatrixProbe ); - SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr ); - SSR__dissipate_procr( animatingPr ); + //Send result to results thread + //This pattern works 'cause only get lock when results thd inside wait + VPThread__mutex_lock( globals->vector_mutex, animatingPr ); + globals->currSMPairParams = params; + VPThread__cond_signal( globals->vector_cond, animatingPr ); + VPThread__mutex_unlock( globals->vector_mutex, animatingPr );//release + //wait-er -- cond_signal implemented such that wait-er gets lock, no other + + VPThread__dissipate_thread( animatingPr ); } @@ -226,7 +234,7 @@ float32 *origArray, *subArray; if( subMatrix->alreadyCopied ) return; - SSR__start_singleton( copyMatrixSingleton, &&EndOfTransSingleton, animPr); + VPThread__start_singleton( copyTransposeSingleton, animPr); origMatrix = subMatrix->origMatrix; origArray = origMatrix->array; @@ -236,7 +244,7 @@ origStartCol = subMatrix->origStartCol; origStride = origMatrix->numCols; - subArray = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr); + subArray = VPThread__malloc( numRows * numCols *sizeof(float32),animPr); subMatrix->array = subArray; //copy values from orig matrix to local @@ -244,9 +252,8 @@ origStartRow, origStartCol, origStride, subArray, origArray ); - subMatrix->alreadyCopied = TRUE; //must be last thing before label - EndOfTransSingleton: - return; + VPThread__end_singleton( copyTransposeSingleton, animPr); + subMatrix->alreadyCopied = TRUE; //anywhere after singleton work finished } @@ -259,13 +266,12 @@ //This lets only a single VP execute the code between start and // end -- using start and end so that work runs outside the master. - //Inside, if a second VP ever executes the start, it will be returned - // from the end-point. 
- //Note, for non-GCC, can add a second SSR call at the end, and inside - // that one, look at the stack at the return addr & save that in an - // array indexed by singletonID - if( subMatrix->alreadyCopied ) return; - SSR__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton, animPr ); + //If a second VP ever executes the start, it will be returned + // from the end-point. If it executes the start after another but before + // that other has finished, this one will remain suspended until the + // other finishes, then be resumed from the end-point. + if( subMatrix->alreadyCopied ) return; //an optimization -- set below + VPThread__start_singleton( copyMatrixSingleton, animPr ); origMatrix = subMatrix->origMatrix; @@ -276,7 +282,7 @@ origStartCol = subMatrix->origStartCol; origStride = origMatrix->numCols; - subArray = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr); + subArray = VPThread__malloc( numRows * numCols *sizeof(float32),animPr); subMatrix->array = subArray; //copy values from orig matrix to local @@ -293,7 +299,6 @@ } } - subMatrix->alreadyCopied = TRUE; //must be last thing before label - EndOfCopySingleton: - return; + subMatrix->alreadyCopied = TRUE; //must be after singleton work finished + VPThread__end_singleton( copyMatrixSingleton, animPr ); }