Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VCilk > VCilk__Blocked_Matrix_Mult__Bench
changeset 2:5311bd32c3cb
Bug-free blocked Matrix Mult in VCilk -- uses Nov 8 version of VMS
author | Me |
---|---|
date | Mon, 08 Nov 2010 03:59:51 -0800 |
parents | 06c80cd3c303 |
children | 12bcb9728e1c |
files | src/Application/VCilk__Matrix_Mult/Divide_Pr.c src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h src/Application/VCilk__Matrix_Mult/Vector_Pr.c src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c |
diffstat | 4 files changed, 122 insertions(+), 84 deletions(-) [+] |
line diff
1.1 --- a/src/Application/VCilk__Matrix_Mult/Divide_Pr.c Sun Nov 07 06:55:26 2010 -0800 1.2 +++ b/src/Application/VCilk__Matrix_Mult/Divide_Pr.c Mon Nov 08 03:59:51 2010 -0800 1.3 @@ -10,6 +10,7 @@ 1.4 #include "VCilk__Matrix_Mult.h" 1.5 #include <math.h> 1.6 #include <sys/time.h> 1.7 +#include <string.h> 1.8 1.9 //The time to compute this many result values should equal the time to 1.10 // perform this division on a matrix of size gives that many result calcs 1.11 @@ -126,25 +127,29 @@ 1.12 float32 *resultArray; //points to array to be put inside result 1.13 // matrix 1.14 1.15 - PRINT_DEBUG("start divide\n") 1.16 + DEBUG( dbgAppFlow, "start divide\n") 1.17 1.18 int32 1.19 divideProbe = VMS__create_single_interval_probe( "divideProbe", 1.20 animPr ); 1.21 VMS__record_sched_choice_into_probe( divideProbe, animPr ); 1.22 VMS__record_interval_start_in_probe( divideProbe ); 1.23 - 1.24 + 1.25 //=========== Setup -- make local copies of ptd-to-things, malloc, aso 1.26 - int32 numResRows, numResCols; 1.27 + int32 numResRows, numResCols, vectLength; 1.28 1.29 dividerParams = (DividerParams *)_dividerParams; 1.30 1.31 leftMatrix = dividerParams->leftMatrix; 1.32 rightMatrix = dividerParams->rightMatrix; 1.33 1.34 + vectLength = leftMatrix->numCols; 1.35 numResRows = leftMatrix->numRows; 1.36 numResCols = rightMatrix->numCols; 1.37 resultArray = dividerParams->resultMatrix->array; 1.38 + 1.39 + //zero the result array 1.40 + memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); 1.41 1.42 1.43 //============== Do either sequential mult or do division ============== 1.44 @@ -157,11 +162,21 @@ 1.45 { int32 vectLength; 1.46 1.47 //====== Do sequential multiply on a single core 1.48 - PRINT_DEBUG("doing sequential") 1.49 - vectLength = leftMatrix->numCols; 1.50 + DEBUG( dbgAppFlow, "doing sequential") 1.51 1.52 - multiplyMatrixArrays( vectLength, numResRows, numResCols, 1.53 - leftMatrix->array, rightMatrix->array, 1.54 + //transpose the right matrix 1.55 + float32 * 1.56 + transRightArray = VCilk__malloc( rightMatrix->numRows * 1.57 + rightMatrix->numCols * 1.58 + sizeof(float32), animPr ); 1.59 + 1.60 + //copy values from orig matrix to local 1.61 + copyTranspose( rightMatrix->numRows, rightMatrix->numCols, 1.62 + 0, 0, rightMatrix->numRows, 1.63 + transRightArray, rightMatrix->array ); 1.64 + 1.65 + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 1.66 + leftMatrix->array, transRightArray, 1.67 resultArray ); 1.68 } 1.69 else 1.70 @@ -176,17 +191,6 @@ 1.71 calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr); 1.72 1.73 1.74 - //Make the results processor, now that know how many to wait for 1.75 -/* 1.76 - resultsParams = VCilk__malloc( sizeof(ResultsParams) ); 1.77 - resultsParams->dividerPr = animatingPr; 1.78 - resultsParams->numSubMatrixPairs = 1.79 - slicingStrucCarrier->leftRowSlices->numVals * 1.80 - slicingStrucCarrier->rightColSlices->numVals * 1.81 - slicingStrucCarrier->vecSlices->numVals; 1.82 - resultsParams->numCols = rightMatrix->numCols; 1.83 - resultsParams->numRows = leftMatrix->numRows; 1.84 -*/ 1.85 1.86 //Make the sub-matrices, and pair them up, then spawn processors to 1.87 // calc product of each pair. 1.88 @@ -204,10 +208,10 @@ 1.89 // system, by entry-point Fn, and passed in through dividerParams. 1.90 //So, nothing to do to send results back -- they're seen by side-effect 1.91 1.92 - PRINT_DEBUG("end divide\n") 1.93 + DEBUG( dbgAppFlow, "end divide\n") 1.94 1.95 VMS__record_interval_end_in_probe( divideProbe ); 1.96 - VMS__print_stats_of_all_probes; 1.97 + VMS__print_stats_of_all_probes(); 1.98 1.99 VCilk__dissipate_procr( animPr ); //all procrs dissipate self at end 1.100 //when all of the processors have dissipated, the "create seed and do 1.101 @@ -247,7 +251,7 @@ 1.102 idealNumWorkUnits = VCilk__giveIdealNumWorkUnits(); 1.103 1.104 idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); 1.105 - idealSizeOfSide2 *= 0.6; //finer granularity to help load balance 1.106 + idealSizeOfSide2 *= 0.8; //finer granularity to help load balance 1.107 1.108 if( idealSizeOfSide1 > idealSizeOfSide2 ) 1.109 idealSizeOfSide = idealSizeOfSide1; 1.110 @@ -287,7 +291,7 @@ 1.111 } 1.112 1.113 1.114 -void 1.115 +void inline 1.116 makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, 1.117 SlicingStrucCarrier *slicingStrucCarrier, 1.118 float32 *resultArray, VirtProcr *animPr ) 1.119 @@ -308,6 +312,10 @@ 1.120 createSubMatrices( vecSlices, rightColSlices, 1.121 rightMatrix, animPr ); 1.122 1.123 + freeSlicingStruc( leftRowSlices, animPr ); 1.124 + freeSlicingStruc( vecSlices, animPr ); 1.125 + freeSlicingStruc( rightColSlices, animPr ); 1.126 + 1.127 //============== pair the sub-matrices and make processors ============== 1.128 int32 numRowIdxs, numColIdxs, numVecIdxs; 1.129 1.130 @@ -321,11 +329,6 @@ 1.131 //It syncs inside, so know all work is done now: free the sub-matrices 1.132 freeSubMatrices( leftRowSlices, vecSlices, leftSubMatrices, animPr ); 1.133 freeSubMatrices( vecSlices, rightColSlices, rightSubMatrices, animPr ); 1.134 - 1.135 - 1.136 - freeSlicingStruc( leftRowSlices, animPr ); 1.137 - freeSlicingStruc( vecSlices, animPr ); 1.138 - freeSlicingStruc( rightColSlices, animPr ); 1.139 } 1.140 1.141 1.142 @@ -336,7 +339,7 @@ 1.143 * core 1.144 * 1.145 */ 1.146 -void 1.147 +void inline 1.148 pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, 1.149 SubMatrix **rightSubMatrices, 1.150 int32 numRowIdxs, int32 numColIdxs,
2.1 --- a/src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h Sun Nov 07 06:55:26 2010 -0800 2.2 +++ b/src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h Mon Nov 08 03:59:51 2010 -0800 2.3 @@ -10,6 +10,7 @@ 2.4 2.5 #include "../../VCilk_lib/VCilk.h" 2.6 #include "../Matrix_Mult.h" 2.7 +#include "../../VCilk_lib/VMS/VMS.h" 2.8 2.9 2.10 //=============================== Defines ============================== 2.11 @@ -17,8 +18,8 @@ 2.12 #define COLS_IN_BLOCK 32 2.13 #define VEC_IN_BLOCK 32 2.14 2.15 - 2.16 -//#define PRINT_DEBUG(msg) //printf(msg); fflush(stdin); 2.17 +#define copyMatrixSingleton 1 2.18 +#define copyTransposeSingleton 2 2.19 2.20 //============================== Structures ============================== 2.21 typedef struct 2.22 @@ -56,7 +57,7 @@ 2.23 { VirtProcr *resultPr; 2.24 SubMatrix *leftSubMatrix; 2.25 SubMatrix *rightSubMatrix; 2.26 - float32 *resultArray; 2.27 + float32 *partialResultArray; 2.28 } 2.29 SMPairParams; 2.30
3.1 --- a/src/Application/VCilk__Matrix_Mult/Vector_Pr.c Sun Nov 07 06:55:26 2010 -0800 3.2 +++ b/src/Application/VCilk__Matrix_Mult/Vector_Pr.c Mon Nov 08 03:59:51 2010 -0800 3.3 @@ -33,10 +33,12 @@ 3.4 3.5 vecParams = (VecParams *)_vecParams; 3.6 3.7 + #ifdef TURN_ON_DEBUG_PROBES 3.8 int32 subMatrixVectorProbe = 3.9 VMS__create_single_interval_probe( "subMtxVect", animPr ); 3.10 VMS__record_sched_choice_into_probe( subMatrixVectorProbe, animPr ); 3.11 VMS__record_interval_start_in_probe( subMatrixVectorProbe ); 3.12 + #endif 3.13 3.14 3.15 numVecIdxs = vecParams->numVecIdxs; 3.16 @@ -55,7 +57,7 @@ 3.17 for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) 3.18 { 3.19 //Make the processor for the pair of sub-matrices 3.20 - subMatrixPairParams = VCilk__malloc(sizeof(SMPairParams),animPr); 3.21 + subMatrixPairParams = VCilk__malloc( sizeof(SMPairParams), animPr ); 3.22 subMatrixPairParams->leftSubMatrix = 3.23 leftSubMatrices[ leftRowIdxOffset + vecIdx ]; 3.24 3.25 @@ -75,7 +77,7 @@ 3.26 { 3.27 subMatrixPairParams = vecOfSubMatrixParams[ vecIdx ]; 3.28 3.29 - accumulateResult( resultArray, subMatrixPairParams->resultArray, 3.30 + accumulateResult( resultArray, subMatrixPairParams->partialResultArray, 3.31 subMatrixPairParams->leftSubMatrix->origStartRow, 3.32 subMatrixPairParams->leftSubMatrix->numRows, 3.33 subMatrixPairParams->rightSubMatrix->origStartCol, 3.34 @@ -87,14 +89,16 @@ 3.35 // core and re-use that array, and prevents writes from causing 3.36 // thrashing of the cache -- as long as array big enough, the copy 3.37 // overhead is miniscule vs the size-of-side reuse of each byte 3.38 - VCilk__free( subMatrixPairParams->resultArray, animPr ); 3.39 + VCilk__free( subMatrixPairParams->partialResultArray, animPr ); 3.40 VCilk__free( subMatrixPairParams, animPr ); 3.41 } 3.42 VCilk__free( vecOfSubMatrixParams, animPr ); 3.43 VCilk__free( vecParams, animPr ); 3.44 3.45 + #ifdef TURN_ON_DEBUG_PROBES 3.46 VMS__record_interval_end_in_probe( subMatrixVectorProbe ); 3.47 - 3.48 + #endif 3.49 + 3.50 VCilk__dissipate_procr( animPr ); 3.51 } 3.52
4.1 --- a/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c Sun Nov 07 06:55:26 2010 -0800 4.2 +++ b/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c Mon Nov 08 03:59:51 2010 -0800 4.3 @@ -6,6 +6,8 @@ 4.4 * 4.5 */ 4.6 4.7 +#include <string.h> 4.8 + 4.9 #include "VCilk__Matrix_Mult.h" 4.10 4.11 4.12 @@ -24,7 +26,7 @@ 4.13 int resStride, int inpStride ); 4.14 4.15 void inline 4.16 -multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols, 4.17 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, 4.18 float32 *leftArray, float32 *rightArray, 4.19 float32 *resArray ); 4.20 4.21 @@ -48,11 +50,14 @@ 4.22 float32 *leftArray, *rightArray, *resArray; 4.23 SubMatrix *leftSubMatrix, *rightSubMatrix; 4.24 4.25 - PRINT_DEBUG("start sub-matrix mult\n") 4.26 - int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx", 4.27 - animatingPr); 4.28 + 4.29 + DEBUG( dbgAppFlow, "start sub-matrix mult\n") 4.30 + #ifdef TURN_ON_DEBUG_PROBES 4.31 + int32 subMatrixProbe = 4.32 + VMS__create_single_interval_probe( "subMtx", animatingPr); 4.33 VMS__record_sched_choice_into_probe( subMatrixProbe, animatingPr ); 4.34 VMS__record_interval_start_in_probe( subMatrixProbe ); 4.35 + #endif 4.36 4.37 params = (SMPairParams *)data; 4.38 // resultPr = params->resultPr; 4.39 @@ -90,14 +95,16 @@ 4.40 numResRows = leftSubMatrix->numRows; 4.41 numResCols = rightSubMatrix->numCols; 4.42 4.43 - multiplyMatrixArrays( vectLength, numResRows, numResCols, 4.44 + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 4.45 leftArray, rightArray, 4.46 resArray ); 4.47 4.48 //send result by side-effect 4.49 - params->resultArray = resArray; 4.50 + params->partialResultArray = resArray; 4.51 4.52 + #ifdef TURN_ON_DEBUG_PROBES 4.53 VMS__record_interval_end_in_probe( subMatrixProbe ); 4.54 + #endif 4.55 4.56 VCilk__dissipate_procr( animatingPr ); 4.57 } 4.58 @@ -120,9 +127,10 @@ 4.59 * of the two input matrices. 4.60 */ 4.61 void inline 4.62 -multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols, 4.63 - float32 *leftArray, float32 *rightArray, 4.64 - float32 *resArray ) 4.65 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, 4.66 + int32 numResCols, 4.67 + float32 *leftArray, float32 *rightArray, 4.68 + float32 *resArray ) 4.69 { 4.70 int resStride, inpStride; 4.71 int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; 4.72 @@ -133,19 +141,17 @@ 4.73 for( resStartRow = 0; resStartRow < numResRows; ) 4.74 { 4.75 resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 4.76 - if( resEndRow > numResRows ) resEndRow = numResRows; 4.77 + if( resEndRow > numResRows ) resEndRow = numResRows -1; 4.78 4.79 for( resStartCol = 0; resStartCol < numResCols; ) 4.80 { 4.81 resEndCol = resStartCol + COLS_IN_BLOCK -1; 4.82 - if( resEndCol > numResCols ) resEndCol = numResCols; 4.83 - resStartCol = resEndCol +1; 4.84 - 4.85 + if( resEndCol > numResCols ) resEndCol = numResCols -1; 4.86 4.87 for( startVec = 0; startVec < vecLength; ) 4.88 { 4.89 endVec = startVec + VEC_IN_BLOCK -1; 4.90 - if( endVec > vecLength ) endVec = vecLength; 4.91 + if( endVec > vecLength ) endVec = vecLength -1; 4.92 4.93 //By having the "vector" of sub-blocks in a sub-block slice 4.94 // be marched down in inner loop, are re-using the result 4.95 @@ -183,12 +189,12 @@ 4.96 int leftOffset, rightOffset; 4.97 float32 result; 4.98 4.99 - //The result row is used only for the left matrix, res col for the right 4.100 + //The result row is used for the left matrix, res col for the right 4.101 for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) 4.102 - { 4.103 + { 4.104 for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) 4.105 - { 4.106 - leftOffset = resRow * inpStride;//left & right inp strides always same 4.107 + { 4.108 + leftOffset = resRow * inpStride;//left & right inp strides same 4.109 rightOffset = resCol * inpStride;// because right is transposed 4.110 result = 0; 4.111 for( vec = startVec; vec <= endVec; vec++ ) 4.112 @@ -196,40 +202,21 @@ 4.113 result += 4.114 leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; 4.115 } 4.116 - 4.117 + 4.118 resArray[ resRow * resStride + resCol ] += result; 4.119 } 4.120 } 4.121 } 4.122 4.123 -/*Note: don't do the copy when create, because create on a different core -- 4.124 - * do the copy on the core that will do the calcs, then try to keep work on 4.125 - * that core that reuses the array data 4.126 + 4.127 +/*Reuse this in divider when do the sequential multiply case 4.128 */ 4.129 void inline 4.130 -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 4.131 - { int numCols, numRows, origStartRow, origStartCol, origStride, stride; 4.132 - Matrix *origMatrix; 4.133 - float32 *origArray, *subArray; 4.134 +copyTranspose( int32 numRows, int32 numCols, 4.135 + int32 origStartRow, int32 origStartCol, int32 origStride, 4.136 + float32 *subArray, float32 *origArray ) 4.137 + { int32 stride = numRows; 4.138 4.139 - if( subMatrix->alreadyCopied ) return; 4.140 - 4.141 - subMatrix->alreadyCopied = TRUE; 4.142 - 4.143 - origMatrix = subMatrix->origMatrix; 4.144 - origArray = origMatrix->array; 4.145 - numCols = subMatrix->numCols; 4.146 - numRows = subMatrix->numRows; 4.147 - stride = numRows; 4.148 - origStartRow = subMatrix->origStartRow; 4.149 - origStartCol = subMatrix->origStartCol; 4.150 - origStride = origMatrix->numCols; 4.151 - 4.152 - //This is free in Divide pr after all calcs are done 4.153 - subArray = VCilk__malloc( numRows * numCols * sizeof(float32), animPr ); 4.154 - subMatrix->array = subArray; 4.155 - 4.156 - //copy values from orig matrix to local 4.157 int row, col, origOffset; 4.158 for( row = 0; row < numRows; row++ ) 4.159 { 4.160 @@ -238,21 +225,60 @@ 4.161 { 4.162 //transpose means swap row & col -- traverse orig matrix normally 4.163 // but put into reversed place in local array -- means the 4.164 - // stride is the num rows now, so col * numRows + row 4.165 + // stride is the numRows now, so col * numRows + row 4.166 subArray[ col * stride + row ] = origArray[ origOffset + col ]; 4.167 - } 4.168 + } 4.169 } 4.170 } 4.171 4.172 void inline 4.173 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 4.174 + { int numCols, numRows, origStartRow, origStartCol, origStride, stride; 4.175 + Matrix *origMatrix; 4.176 + float32 *origArray, *subArray; 4.177 + 4.178 + if( subMatrix->alreadyCopied ) return; 4.179 + VCilk__start_singleton(copyMatrixSingleton,&&EndOfTranspSingleton,animPr); 4.180 + 4.181 + origMatrix = subMatrix->origMatrix; 4.182 + origArray = origMatrix->array; 4.183 + numCols = subMatrix->numCols; 4.184 + numRows = subMatrix->numRows; 4.185 + origStartRow = subMatrix->origStartRow; 4.186 + origStartCol = subMatrix->origStartCol; 4.187 + origStride = origMatrix->numCols; 4.188 + 4.189 + subArray = VCilk__malloc( numRows * numCols *sizeof(float32),animPr); 4.190 + subMatrix->array = subArray; 4.191 + 4.192 + //copy values from orig matrix to local 4.193 + copyTranspose( numRows, numCols, 4.194 + origStartRow, origStartCol, origStride, 4.195 + subArray, origArray ); 4.196 + 4.197 + subMatrix->alreadyCopied = TRUE; //must be last thing before label 4.198 + EndOfTranspSingleton: 4.199 + return; 4.200 + } 4.201 + 4.202 + 4.203 +void inline 4.204 copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 4.205 { int numCols, numRows, origStartRow, origStartCol, stride, origStride; 4.206 Matrix *origMatrix; 4.207 float32 *origArray, *subArray; 4.208 4.209 + 4.210 + //This lets only a single VP execute the code between start and 4.211 + // end -- using start and end so that work runs outside the master. 4.212 + //Inside, if a second VP ever executes the start, it will be returned 4.213 + // from the end-point. 4.214 + //Note, for non-GCC, can add a second SSR call at the end, and inside 4.215 + // that one, look at the stack at the return addr & save that in an 4.216 + // array indexed by singletonID 4.217 if( subMatrix->alreadyCopied ) return; 4.218 + VCilk__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton,animPr); 4.219 4.220 - subMatrix->alreadyCopied = TRUE; 4.221 4.222 origMatrix = subMatrix->origMatrix; 4.223 origArray = origMatrix->array; 4.224 @@ -260,14 +286,14 @@ 4.225 numRows = subMatrix->numRows; 4.226 origStartRow = subMatrix->origStartRow; 4.227 origStartCol = subMatrix->origStartCol; 4.228 - stride = numCols; 4.229 origStride = origMatrix->numCols; 4.230 4.231 - //This is freed in Divide pr after all calcs are done 4.232 subArray = VCilk__malloc( numRows * numCols *sizeof(float32),animPr); 4.233 subMatrix->array = subArray; 4.234 4.235 //copy values from orig matrix to local 4.236 + stride = numCols; 4.237 + 4.238 int row, col, offset, origOffset; 4.239 for( row = 0; row < numRows; row++ ) 4.240 { 4.241 @@ -278,4 +304,8 @@ 4.242 subArray[ offset + col ] = origArray[ origOffset + col ]; 4.243 } 4.244 } 4.245 + 4.246 + subMatrix->alreadyCopied = TRUE; //must be last thing before label 4.247 + EndOfCopySingleton: 4.248 + return; 4.249 }