# HG changeset patch # User Me # Date 1288742450 25200 # Node ID 4e14e2663af9065983d3817bdf62480829f6428b # Parent f33a9cba5d890cd27ac0cd5d6cdad1839568f102 Fixed concurrency bug -- added singleton to SSR -- works! 3.4x speedup diff -r f33a9cba5d89 -r 4e14e2663af9 src/Application/SSR_Matrix_Mult/Divide_Pr.c --- a/src/Application/SSR_Matrix_Mult/Divide_Pr.c Thu Oct 14 17:10:17 2010 -0700 +++ b/src/Application/SSR_Matrix_Mult/Divide_Pr.c Tue Nov 02 17:00:50 2010 -0700 @@ -18,20 +18,28 @@ #define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000 -int -measureMatrixMultPrimitive(); - +//=========================================================================== +int inline +measureMatrixMultPrimitive( VirtProcr *animPr ); SlicingStrucCarrier * -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix ); +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, + VirtProcr *animPr ); SlicingStruc * -sliceUpDimension( float32 idealSizeOfPiece, int startVal, int endVal ); +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, + VirtProcr *animPr ); + +void +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ); SubMatrix ** createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, - Matrix *origMatrix ); + int32 numUses, Matrix *origMatrix, VirtProcr *animPr ); +void +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + SubMatrix **subMatrices, VirtProcr *animPr ); void pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, @@ -105,44 +113,59 @@ */ void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, - VirtProcr *animatingPr ) + VirtProcr *animPr ) { VirtProcr *resultPr; DividerParams *dividerParams; ResultsParams *resultsParams; Matrix *leftMatrix, *rightMatrix, *resultMatrix; void *msg; SlicingStrucCarrier *slicingStrucCarrier; - float32 *resultArray; //points to array to be put inside result - // matrix + float32 *resultArray; //points to array inside result matrix - 
PRINT_DEBUG("start divide\n") + DEBUG("start divide\n") + int32 + divideProbe = VMS__create_single_interval_probe( "divideProbe", + animPr ); + VMS__record_sched_choice_into_probe( divideProbe, animPr ); + VMS__record_interval_start_in_probe( divideProbe ); //=========== Setup -- make local copies of ptd-to-things, malloc, aso + int32 numResRows, numResCols, vectLength; dividerParams = (DividerParams *)_dividerParams; leftMatrix = dividerParams->leftMatrix; rightMatrix = dividerParams->rightMatrix; + vectLength = leftMatrix->numCols; + numResRows = leftMatrix->numRows; + numResCols = rightMatrix->numCols; + resultArray = dividerParams->resultMatrix->array; //============== Do either sequential mult or do division ============== //Check if input matrices too small -- if yes, just do sequential - if( leftMatrix->numRows * leftMatrix->numCols * rightMatrix->numCols - < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) //curoff is determined by overhead - // of this divider -- relatively machine-independent - { int32 vectLength, numResRows, numResCols; + //Cutoff is determined by overhead of this divider -- relatively + // machine-independent + if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * + (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) + { + //====== Do sequential multiply on a single core + DEBUG("doing sequential") - //====== Do sequential multiply on a single core + //have to transpose the right matrix first + float32 * + transRightArray = SSR__malloc_to( rightMatrix->numRows * + rightMatrix->numCols * + sizeof(float32), animPr ); - vectLength = leftMatrix->numCols; - numResRows = leftMatrix->numRows; - numResCols = rightMatrix->numCols; - - resultArray = malloc( numResRows * numResCols * sizeof(float32) ); - - multiplyMatrixArrays( vectLength, numResRows, numResCols, + //copy values from orig matrix to local + copyTranspose( rightMatrix->numRows, rightMatrix->numCols, + 0, 0, rightMatrix->numRows, + transRightArray, rightMatrix->array ); + + 
multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, leftMatrix->array, rightMatrix->array, resultArray ); } @@ -155,65 +178,62 @@ //The ideal size is the one takes the number of cycles to calculate // such that calc time is equal or greater than min work-unit size slicingStrucCarrier = - calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix ); + calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr ); //Make the results processor, now that know how many to wait for - resultsParams = SSR__malloc_size_to(sizeof(ResultsParams),animatingPr); - resultsParams->dividerPr = animatingPr; + resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr ); resultsParams->numSubMatrixPairs = slicingStrucCarrier->leftRowSlices->numVals * slicingStrucCarrier->rightColSlices->numVals * slicingStrucCarrier->vecSlices->numVals; - resultsParams->numCols = rightMatrix->numCols; - resultsParams->numRows = leftMatrix->numRows; + resultsParams->dividerPr = animPr; + resultsParams->numCols = rightMatrix->numCols; + resultsParams->numRows = leftMatrix->numRows; + resultsParams->resultArray = resultArray; + resultPr = - SSR__create_procr_with( &gatherResults, resultsParams, animatingPr); + SSR__create_procr_with( &gatherResults, resultsParams, animPr); //Make the sub-matrices, and pair them up, and make processor to // calc product of each pair. 
makeSubMatricesAndProcrs( leftMatrix, rightMatrix, slicingStrucCarrier, - resultPr, animatingPr); + resultPr, animPr); - //Get result from result procr - msg = SSR__receive_from_to( resultPr, animatingPr ); - resultArray = (float32 *) msg; - } + //result array is allocated externally, so no message from resultPr + // however, do have to wait before printing out stats, so wait + // for an empty handshake message + msg = SSR__receive_from_to( resultPr, animPr ); + } //=============== Work done -- send results back ================= - //prepare results to persist outside of SSR when return from entry pt - //The results of the all the work have to be linked-to from the data - // struc given to the seed procr -- this divide func is animated by - // that seed procr, so have to link results to the _dividerParams. - resultMatrix = SSR__malloc_size_to(sizeof(Matrix),animatingPr); - resultMatrix->array = resultArray; - resultMatrix->numCols = rightMatrix->numCols; - resultMatrix->numRows = leftMatrix->numRows; + DEBUG_MSG( dbgAppFlow, "end divide\n") + VMS__record_interval_end_in_probe( divideProbe ); + VMS__print_stats_of_all_probes(); - dividerParams->resultMatrix = resultMatrix; - SSR__transfer_ownership_to_outside( msg ); //so not freed - SSR__transfer_ownership_to_outside( resultMatrix ); + //nothing left to do so dissipate, SSR will wait to shutdown and hence + // make results available to outside until all the processors have + // dissipated -- so no need to wait for results processor - PRINT_DEBUG("end divide\n") - - SSR__dissipate_procr( animatingPr ); //all procrs dissipate self at end + SSR__dissipate_procr( animPr ); //all procrs dissipate self at end //when all of the processors have dissipated, the "create seed and do // work" call in the entry point function returns } SlicingStrucCarrier * -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix ) -{ +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, + VirtProcr *animPr 
) + { float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; SlicingStrucCarrier *slicingStrucCarrier = - malloc(sizeof(SlicingStrucCarrier)); + SSR__malloc_to(sizeof(SlicingStrucCarrier), animPr); int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; float64 numPrimitiveOpsInMinWorkUnit; @@ -226,7 +246,7 @@ minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 ); //ask SSR for number of cycles of the "primitive" op of matrix mult - primitiveCycles = measureMatrixMultPrimitive(); + primitiveCycles = measureMatrixMultPrimitive( animPr ); numPrimitiveOpsInMinWorkUnit = (float64)minWorkUnitCycles / (float64)primitiveCycles; @@ -238,6 +258,7 @@ idealNumWorkUnits = SSR__giveIdealNumWorkUnits(); idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); + idealSizeOfSide2 *= 0.6; //finer granularity to help load balance if( idealSizeOfSide1 > idealSizeOfSide2 ) idealSizeOfSide = idealSizeOfSide1; @@ -261,41 +282,47 @@ endRightCol = rightMatrix->numCols -1; leftRowSlices = - sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow ); + sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr ); vecSlices = - sliceUpDimension( idealSizeOfSide, startVec, endVec ); + sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr ); rightColSlices = - sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol ); + sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr); slicingStrucCarrier->leftRowSlices = leftRowSlices; slicingStrucCarrier->vecSlices = vecSlices; slicingStrucCarrier->rightColSlices = rightColSlices; return slicingStrucCarrier; -} + } void makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, SlicingStrucCarrier *slicingStrucCarrier, - VirtProcr *resultPr, VirtProcr *animatingPr ) + VirtProcr *resultPr, VirtProcr *animPr ) { SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; leftRowSlices = slicingStrucCarrier->leftRowSlices; 
vecSlices = slicingStrucCarrier->vecSlices; rightColSlices = slicingStrucCarrier->rightColSlices; + SSR__free( slicingStrucCarrier, animPr ); //================ Make sub-matrices, given the slicing ================ SubMatrix **leftSubMatrices, **rightSubMatrices; leftSubMatrices = - createSubMatrices( leftRowSlices, vecSlices, - leftMatrix ); + createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals, + leftMatrix, animPr ); + //double_check_that_always_numRows_in_right_same_as_numCols_in_left(); rightSubMatrices = - createSubMatrices( vecSlices, rightColSlices, - rightMatrix ); + createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals, + rightMatrix, animPr ); + + freeSlicingStruc( leftRowSlices, animPr ); + freeSlicingStruc( vecSlices, animPr ); + freeSlicingStruc( rightColSlices, animPr ); //============== pair the sub-matrices and make processors ============== int32 numRowIdxs, numColIdxs, numVecIdxs; @@ -308,7 +335,7 @@ numRowIdxs, numColIdxs, numVecIdxs, resultPr, - animatingPr ); + animPr ); } @@ -326,21 +353,30 @@ int32 numLeftColIdxs, numRightColIdxs; int32 leftRowIdxOffset; SMPairParams *subMatrixPairParams; + float32 numToPutOntoEachCore, leftOverFraction; + int32 numCores, coreToScheduleOnto, numVecOnCurrCore; numLeftColIdxs = numColIdxs; numRightColIdxs = numVecIdxs; + numCores = SSR__give_number_of_cores_to_schedule_onto(); + + numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; + leftOverFraction = 0; + numVecOnCurrCore = 0; + coreToScheduleOnto = 0; + for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) { leftRowIdxOffset = resRowIdx * numLeftColIdxs; for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) { - + for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) { //Make the processor for the pair of sub-matrices - subMatrixPairParams = SSR__malloc_size_to(sizeof(SMPairParams), + subMatrixPairParams = SSR__malloc_to( sizeof(SMPairParams), animatingPr); subMatrixPairParams->leftSubMatrix = leftSubMatrices[ 
leftRowIdxOffset + vecIdx ]; @@ -350,9 +386,36 @@ subMatrixPairParams->resultPr = resultPr; - SSR__create_procr_with( &calcSubMatrixProduct, - subMatrixPairParams, - animatingPr ); + //put all pairs from the same vector onto same core + SSR__create_procr_with_affinity( &calcSubMatrixProduct, + subMatrixPairParams, + animatingPr, + coreToScheduleOnto ); + } + + //Trying to distribute the subMatrix-vectors across the cores, so + // that each core gets the same number of vectors, with a max + // imbalance of 1 vector more on some cores than others + numVecOnCurrCore += 1; + if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 ) + { + //deal with fractional part, to ensure that imbalance is 1 max + // IE, core with most has only 1 more than core with least + leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore; + if( leftOverFraction >= 1 ) + { leftOverFraction -= 1; + numVecOnCurrCore = -1; + } + else + { numVecOnCurrCore = 0; + } + //Move to next core, max core-value to incr to is numCores -1 + if( coreToScheduleOnto >= numCores -1 ) + { coreToScheduleOnto = 0; + } + else + { coreToScheduleOnto += 1; + } } } } @@ -365,7 +428,7 @@ */ SubMatrix ** createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, - Matrix *origMatrix ) + int32 numUses, Matrix *origMatrix, VirtProcr *animPr ) { int32 numRowIdxs, numColIdxs, rowIdx, colIdx; int32 startRow, endRow, startCol, endCol; @@ -379,7 +442,8 @@ rowStartVals = rowSlices->startVals; colStartVals = colSlices->startVals; - subMatrices = malloc( numRowIdxs * numColIdxs * sizeof(SubMatrix *) ); + subMatrices = SSR__malloc_to(numRowIdxs * numColIdxs * sizeof(SubMatrix*), + animPr ); for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) { @@ -394,13 +458,14 @@ startCol = colStartVals[colIdx]; endCol = colStartVals[colIdx + 1] -1; - newSubMatrix = malloc( sizeof(SubMatrix) ); + newSubMatrix = SSR__malloc_to( sizeof(SubMatrix), animPr ); newSubMatrix->numRows = endRow - startRow +1; newSubMatrix->numCols = 
endCol - startCol +1; newSubMatrix->origMatrix = origMatrix; newSubMatrix->origStartRow = startRow; newSubMatrix->origStartCol = startCol; newSubMatrix->alreadyCopied = FALSE; + newSubMatrix->numUsesLeft = numUses; //can free after this many subMatrices[ rowOffset + colIdx ] = newSubMatrix; } @@ -409,18 +474,43 @@ } +void +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + SubMatrix **subMatrices, VirtProcr *animPr ) + { + int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; + SubMatrix *subMatrix; + + numRowIdxs = rowSlices->numVals; + numColIdxs = colSlices->numVals; + + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) + { + rowOffset = rowIdx * numColIdxs; + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) + { + subMatrix = subMatrices[ rowOffset + colIdx ]; + if( subMatrix->alreadyCopied ) + SSR__free( subMatrix->array, animPr ); + SSR__free( subMatrix, animPr ); + } + } + SSR__free( subMatrices, animPr ); + } + SlicingStruc * -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal ) +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, + VirtProcr *animPr ) { float32 residualAcc = 0; int numSlices, i, *startVals, sizeOfSlice, endCondition; - SlicingStruc *slicingStruc = malloc( sizeof(SlicingStruc) ); + SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr); //calc size of matrix need to hold start vals -- numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); - startVals = malloc( (numSlices + 1) * sizeof(int32) ); + startVals = SSR__malloc_to( (numSlices + 1) * sizeof(int32), animPr ); //Calc the upper limit of start value -- when get above this, end loop // by saving highest value of the matrix dimension to access, plus 1 @@ -451,17 +541,24 @@ return slicingStruc; } +void +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ) + { + SSR__free( slicingStruc->startVals, animPr ); + SSR__free( slicingStruc, animPr ); + } + int inline -measureMatrixMultPrimitive() 
+measureMatrixMultPrimitive( VirtProcr *animPr ) { int r, c, v, numCycles; float32 *res, *left, *right; //setup inputs - left = malloc( 5 * 5 * sizeof( float32 ) ); - right = malloc( 5 * 5 * sizeof( float32 ) ); - res = malloc( 5 * 5 * sizeof( float32 ) ); + left = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); + right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); + res = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); for( r = 0; r < 5; r++ ) { @@ -485,8 +582,11 @@ } } numCycles = - SSR__end_primitive_and_give_cycles(); + SSR__end_primitive_and_give_cycles(); + + SSR__free( left, animPr ); + SSR__free( right, animPr ); + SSR__free( res, animPr ); return numCycles; } - diff -r f33a9cba5d89 -r 4e14e2663af9 src/Application/SSR_Matrix_Mult/EntryPoint.c --- a/src/Application/SSR_Matrix_Mult/EntryPoint.c Thu Oct 14 17:10:17 2010 -0700 +++ b/src/Application/SSR_Matrix_Mult/EntryPoint.c Tue Nov 02 17:00:50 2010 -0700 @@ -30,6 +30,7 @@ multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) { Matrix *resMatrix; DividerParams *dividerParams; + int32 numResRows, numResCols; dividerParams = malloc( sizeof( DividerParams ) ); @@ -37,13 +38,25 @@ dividerParams->rightMatrix = rightMatrix; + numResRows = leftMatrix->numRows; + numResCols = rightMatrix->numCols; + + //VMS has its own separate internal malloc, so to get results out, + // have to pass in empty array for it to fill up + //The alternative is internally telling SSR make external space to use + resMatrix = malloc( sizeof(Matrix) ); + resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); + resMatrix->numCols = rightMatrix->numCols; + resMatrix->numRows = leftMatrix->numRows; + + + dividerParams->resultMatrix = resMatrix; + //create divider processor, start doing the work, and wait till done //This function is the "border crossing" between normal code and SSR SSR__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs, dividerParams ); - //get result matrix and return it 
- resMatrix = dividerParams->resultMatrix; free( dividerParams ); return resMatrix; } diff -r f33a9cba5d89 -r 4e14e2663af9 src/Application/SSR_Matrix_Mult/Result_Pr.c --- a/src/Application/SSR_Matrix_Mult/Result_Pr.c Thu Oct 14 17:10:17 2010 -0700 +++ b/src/Application/SSR_Matrix_Mult/Result_Pr.c Tue Nov 02 17:00:50 2010 -0700 @@ -8,6 +8,7 @@ #include "SSR_Matrix_Mult.h" +//===================== void inline accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, int32 startRow, @@ -16,6 +17,7 @@ int32 numCols, int32 numOrigCols ); +//=========================================================================== /*The Result Processor gets a message from each of the vector processors, * puts the result from the message in its location in the result- @@ -32,7 +34,7 @@ void *msg; SMPairParams *resParams; - PRINT_DEBUG("start resultPr\n") + DEBUG("start resultPr\n") params = (ResultsParams *)_params; dividerPr = params->dividerPr; @@ -40,8 +42,7 @@ numRows = params->numRows; numCols = params->numCols; - resultArray = SSR__malloc_size_to( numRows * numCols * sizeof(float32), - animatingPr ); + resultArray = params->resultArray; //zero out the results array -- will be accumulating, so must start 0 for( row = 0; row < numRows; row++ ) @@ -57,24 +58,45 @@ msg = SSR__receive_type_to( RESULTS_MSG, animatingPr ); resParams = (SMPairParams *)msg; - accumulateResult( resultArray, resParams->resultArray, + accumulateResult( resultArray, resParams->partialResultArray, resParams->leftSubMatrix->origStartRow, resParams->leftSubMatrix->numRows, resParams->rightSubMatrix->origStartCol, resParams->rightSubMatrix->numCols, resParams->rightSubMatrix->origMatrix->numCols ); + + SSR__free( resParams->partialResultArray, animatingPr ); + + //there is only one copy of results procr, so can update numUsesLeft + // without concurrency worries. 
When zero, free the sub-matrix + resParams->leftSubMatrix->numUsesLeft -= 1; + if( resParams->leftSubMatrix->numUsesLeft == 0 ) + { + SSR__free( resParams->leftSubMatrix->array, animatingPr ); + SSR__free( resParams->leftSubMatrix, animatingPr ); + } + + resParams->rightSubMatrix->numUsesLeft -= 1; + if( resParams->rightSubMatrix->numUsesLeft == 0 ) + { + SSR__free( resParams->rightSubMatrix->array, animatingPr ); + SSR__free( resParams->rightSubMatrix, animatingPr ); + } + + //count of how many sub-matrix pairs accumulated so know when done count++; } - //if were real lang, would have auto-nested transfer -- but HelloWorld - // language, so have to transfer ownership of each allocated block of - // locations separately - SSR__transfer_ownership_of_from_to( resultArray, animatingPr, dividerPr ); - SSR__send_from_to( resultArray, animatingPr, dividerPr ); + + //Done -- could just dissipate -- SSR will wait for all processors to + // dissipate before shutting down, and thereby making results available to + // outside, so no need to stop the divider from dissipating, so no need + // to send a hand-shake message to it -- but makes debug easier + SSR__send_from_to( NULL, animatingPr, dividerPr ); SSR__dissipate_procr( animatingPr ); //frees any data owned by procr } void inline -accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, +accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray, int32 startRow, int32 numRows, int32 startCol, @@ -86,8 +108,8 @@ { for( col = 0; col < numCols; col++ ) { - resultArray[ (row + startRow) * numOrigCols + col + startCol ] += - subMatrixResultArray[ row * numCols + col ]; + resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] += + subMatrixPairResultArray[ row * numCols + col ]; } } diff -r f33a9cba5d89 -r 4e14e2663af9 src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h --- a/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h Thu Oct 14 17:10:17 2010 -0700 +++ 
b/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h Tue Nov 02 17:00:50 2010 -0700 @@ -17,8 +17,10 @@ #define COLS_IN_BLOCK 32 #define VEC_IN_BLOCK 32 +#define copyMatrixSingleton 1 +#define copyTransposeSingleton 2 -#define PRINT_DEBUG(msg) //printf(msg); fflush(stdin); +#define DEBUG(msg) //printf(msg); fflush(stdin); //============================== Structures ============================== typedef struct @@ -35,6 +37,7 @@ int numRows; int numCols; int numSubMatrixPairs; + float32 *resultArray; } ResultsParams; @@ -46,6 +49,7 @@ int32 origStartRow; int32 origStartCol; int32 alreadyCopied; + int32 numUsesLeft; //have update via message to avoid multiple writers float32 *array; //2D, but dynamically sized, so use addr arith } SubMatrix; @@ -54,7 +58,7 @@ { VirtProcr *resultPr; SubMatrix *leftSubMatrix; SubMatrix *rightSubMatrix; - float32 *resultArray; + float32 *partialResultArray; } SMPairParams; diff -r f33a9cba5d89 -r 4e14e2663af9 src/Application/SSR_Matrix_Mult/subMatrix_Pr.c --- a/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c Thu Oct 14 17:10:17 2010 -0700 +++ b/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c Tue Nov 02 17:00:50 2010 -0700 @@ -10,10 +10,10 @@ void inline -copyFromOrig( SubMatrix *subMatrix ); +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); void inline -copyTransposeFromOrig( SubMatrix *subMatrix ); +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); void inline multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, @@ -24,7 +24,7 @@ int resStride, int inpStride ); void inline -multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols, +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, float32 *leftArray, float32 *rightArray, float32 *resArray ); @@ -48,7 +48,7 @@ float32 *leftArray, *rightArray, *resArray; SubMatrix *leftSubMatrix, *rightSubMatrix; - PRINT_DEBUG("start sub-matrix mult\n") + DEBUG("start sub-matrix mult\n") params = (SMPairParams 
*)data; resultPr = params->resultPr; @@ -56,14 +56,15 @@ rightSubMatrix = params->rightSubMatrix; //make sure the input sub-matrices have been copied out of orig - copyFromOrig( leftSubMatrix ); - copyTransposeFromOrig( rightSubMatrix ); + //do it here, inside sub-matrix pair to hopefully gain reuse in cache + copyFromOrig( leftSubMatrix, animatingPr ); + copyTransposeFromOrig( rightSubMatrix, animatingPr ); leftArray = leftSubMatrix->array; rightArray = rightSubMatrix->array; - resArray = malloc( leftSubMatrix->numRows * rightSubMatrix->numCols * - sizeof( float32 ) ); + resArray = SSR__malloc_to(leftSubMatrix->numRows * rightSubMatrix->numCols + * sizeof( float32 ), animatingPr ); int32 numResRows, numResCols, vectLength; @@ -72,12 +73,12 @@ numResRows = leftSubMatrix->numRows; numResCols = rightSubMatrix->numCols; - multiplyMatrixArrays( vectLength, numResRows, numResCols, - leftArray, rightArray, + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, + leftArray, rightArray, resArray ); //send result to result processor - params->resultArray = resArray; + params->partialResultArray = resArray; SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr ); SSR__dissipate_procr( animatingPr ); } @@ -95,7 +96,8 @@ * */ void inline -multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols, +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, + int32 numResCols, float32 *leftArray, float32 *rightArray, float32 *resArray ) { @@ -172,29 +174,15 @@ } } + +/*Reuse this in divider when do the sequential multiply case + */ void inline -copyTransposeFromOrig( SubMatrix *subMatrix ) - { int numCols, numRows, origStartRow, origStartCol, origStride, stride; - Matrix *origMatrix; - float32 *origArray, *subArray; - - if( subMatrix->alreadyCopied ) return; - - subMatrix->alreadyCopied = TRUE; - - origMatrix = subMatrix->origMatrix; - origArray = origMatrix->array; - numCols = subMatrix->numCols; - numRows = subMatrix->numRows; 
- stride = numRows; - origStartRow = subMatrix->origStartRow; - origStartCol = subMatrix->origStartCol; - origStride = origMatrix->numCols; - - subArray = malloc( numRows * numCols * sizeof(float32) ); - subMatrix->array = subArray; - - //copy values from orig matrix to local +copyTranspose( int32 numRows, int32 numCols, + int32 origStartRow, int32 origStartCol, int32 origStride, + float32 *subArray, float32 *origArray ) + { int32 stride = numRows; + int row, col, origOffset; for( row = 0; row < numRows; row++ ) { @@ -203,21 +191,60 @@ { //transpose means swap row & col -- traverse orig matrix normally // but put into reversed place in local array -- means the - // stride is the num rows now, so col * numRows + row + // stride is the numRows now, so col * numRows + row subArray[ col * stride + row ] = origArray[ origOffset + col ]; - } + } } } void inline -copyFromOrig( SubMatrix *subMatrix ) +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) + { int numCols, numRows, origStartRow, origStartCol, origStride, stride; + Matrix *origMatrix; + float32 *origArray, *subArray; + + if( subMatrix->alreadyCopied ) return; + SSR__start_singleton( copyMatrixSingleton, &&EndOfTransSingleton, animPr); + + origMatrix = subMatrix->origMatrix; + origArray = origMatrix->array; + numCols = subMatrix->numCols; + numRows = subMatrix->numRows; + origStartRow = subMatrix->origStartRow; + origStartCol = subMatrix->origStartCol; + origStride = origMatrix->numCols; + + subArray = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr); + subMatrix->array = subArray; + + //copy values from orig matrix to local + copyTranspose( numRows, numCols, + origStartRow, origStartCol, origStride, + subArray, origArray ); + + subMatrix->alreadyCopied = TRUE; //must be last thing before label + EndOfTransSingleton: + return; + } + + +void inline +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) { int numCols, numRows, origStartRow, origStartCol, stride, origStride; Matrix 
*origMatrix; float32 *origArray, *subArray; + + //This lets only a single VP execute the code between start and + // end -- using start and end so that work runs outside the master. + //Inside, if a second VP ever executes the start, it will be returned + // from the end-point. + //Note, for non-GCC, can add a second SSR call at the end, and inside + // that one, look at the stack at the return addr & save that in an + // array indexed by singletonID if( subMatrix->alreadyCopied ) return; + SSR__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton, animPr ); - subMatrix->alreadyCopied = TRUE; origMatrix = subMatrix->origMatrix; origArray = origMatrix->array; @@ -225,13 +252,14 @@ numRows = subMatrix->numRows; origStartRow = subMatrix->origStartRow; origStartCol = subMatrix->origStartCol; - stride = numCols; origStride = origMatrix->numCols; - subArray = malloc( numRows * numCols * sizeof(float32) ); + subArray = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr); subMatrix->array = subArray; //copy values from orig matrix to local + stride = numCols; + int row, col, offset, origOffset; for( row = 0; row < numRows; row++ ) { @@ -242,4 +270,8 @@ subArray[ offset + col ] = origArray[ origOffset + col ]; } } + + subMatrix->alreadyCopied = TRUE; //must be last thing before label + EndOfCopySingleton: + return; }