# HG changeset patch # User Me # Date 1288917855 25200 # Node ID cbd8db6b8657e3e3c30f35c6552284b3b244733f # Parent 4e14e2663af9065983d3817bdf62480829f6428b Fixed last bugs in matrix multiply code -- gives correct answers consistently Needed to add initializing result matrices to 0 'cause accumulating and fixed bug in sequential bypass where passed the wrong array and fixed problem with end-conditions in blocked multiply loop nest diff -r 4e14e2663af9 -r cbd8db6b8657 src/Application/SSR_Matrix_Mult/Divide_Pr.c --- a/src/Application/SSR_Matrix_Mult/Divide_Pr.c Tue Nov 02 17:00:50 2010 -0700 +++ b/src/Application/SSR_Matrix_Mult/Divide_Pr.c Thu Nov 04 17:44:15 2010 -0700 @@ -122,7 +122,7 @@ SlicingStrucCarrier *slicingStrucCarrier; float32 *resultArray; //points to array inside result matrix - DEBUG("start divide\n") + DEBUG( dbgAppFlow, "start divide\n") int32 divideProbe = VMS__create_single_interval_probe( "divideProbe", @@ -152,13 +152,16 @@ (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) { //====== Do sequential multiply on a single core - DEBUG("doing sequential") + DEBUG( dbgAppFlow, "doing sequential") - //have to transpose the right matrix first + //zero the result array + memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); + + //transpose the right matrix float32 * - transRightArray = SSR__malloc_to( rightMatrix->numRows * - rightMatrix->numCols * - sizeof(float32), animPr ); + transRightArray = SSR__malloc_to( rightMatrix->numRows * + rightMatrix->numCols * sizeof(float32), + animPr ); //copy values from orig matrix to local copyTranspose( rightMatrix->numRows, rightMatrix->numCols, @@ -166,7 +169,7 @@ transRightArray, rightMatrix->array ); multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, - leftMatrix->array, rightMatrix->array, + leftMatrix->array, transRightArray, resultArray ); } else @@ -211,7 +214,7 @@ //=============== Work done -- send results back ================= - DEBUG_MSG( dbgAppFlow, "end 
divide\n") + DEBUG( dbgAppFlow, "end divide\n") VMS__record_interval_end_in_probe( divideProbe ); VMS__print_stats_of_all_probes(); @@ -417,6 +420,7 @@ { coreToScheduleOnto += 1; } } + } } diff -r 4e14e2663af9 -r cbd8db6b8657 src/Application/SSR_Matrix_Mult/Result_Pr.c --- a/src/Application/SSR_Matrix_Mult/Result_Pr.c Tue Nov 02 17:00:50 2010 -0700 +++ b/src/Application/SSR_Matrix_Mult/Result_Pr.c Thu Nov 04 17:44:15 2010 -0700 @@ -34,7 +34,7 @@ void *msg; SMPairParams *resParams; - DEBUG("start resultPr\n") + DEBUG( dbgAppFlow, "start resultPr\n") params = (ResultsParams *)_params; dividerPr = params->dividerPr; diff -r 4e14e2663af9 -r cbd8db6b8657 src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h --- a/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h Tue Nov 02 17:00:50 2010 -0700 +++ b/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h Thu Nov 04 17:44:15 2010 -0700 @@ -20,8 +20,6 @@ #define copyMatrixSingleton 1 #define copyTransposeSingleton 2 -#define DEBUG(msg) //printf(msg); fflush(stdin); - //============================== Structures ============================== typedef struct { diff -r 4e14e2663af9 -r cbd8db6b8657 src/Application/SSR_Matrix_Mult/subMatrix_Pr.c --- a/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c Tue Nov 02 17:00:50 2010 -0700 +++ b/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c Thu Nov 04 17:44:15 2010 -0700 @@ -48,7 +48,7 @@ float32 *leftArray, *rightArray, *resArray; SubMatrix *leftSubMatrix, *rightSubMatrix; - DEBUG("start sub-matrix mult\n") + DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID) params = (SMPairParams *)data; resultPr = params->resultPr; @@ -63,8 +63,10 @@ leftArray = leftSubMatrix->array; rightArray = rightSubMatrix->array; - resArray = SSR__malloc_to(leftSubMatrix->numRows * rightSubMatrix->numCols - * sizeof( float32 ), animatingPr ); + int32 + resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); + resArray = SSR__malloc_to( resSize, animatingPr ); + memset( resArray, 
0, resSize ); int32 numResRows, numResCols, vectLength; @@ -84,97 +86,107 @@ } -/*Divides into 32x32 sub-matrices, 3 of which fit into 32KB L1 cache - * Would be nice to embed this within another level that divided into + +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into + * the 32KB L1 cache. + *Would be nice to embed this within another level that divided into * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache * *Eventually want these divisions to be automatic, using DKU pattern - * embedded into SSR, and with VMS controlling the divisions according to - * the cache sizes, which it knows about. - *And, want VMS to work with language to split among main-mems, so a socket + * embedded into VMS and exposed in the language, and with VMS controlling the + * divisions according to the cache sizes, which it knows about. + *Also, want VMS to work with language to split among main-mems, so a socket * only cranks on data in its local segment of main mem * + *So, outer two loops determine start and end points within the result matrix. + * Inside that, a loop determines the start and end points along the shared dimensions + * of the two input matrices. 
*/ void inline multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, - float32 *leftArray, float32 *rightArray, - float32 *resArray ) + float32 *leftArray, float32 *rightArray, + float32 *resArray ) { int resStride, inpStride; - int startRow, startCol, endRow, endCol, startVec, endVec; + int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; resStride = numResCols; inpStride = vecLength; - for( startRow = 0; startRow < numResRows; ) + for( resStartRow = 0; resStartRow < numResRows; ) { - endRow = startRow + ROWS_IN_BLOCK; - if( endRow > numResRows ) endRow = numResRows; + resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 + if( resEndRow >= numResRows ) resEndRow = numResRows -1; //end is inclusive: clamp to last valid row - for( startCol = 0; startCol < numResCols; ) + for( resStartCol = 0; resStartCol < numResCols; ) { - endCol = startCol + COLS_IN_BLOCK; - if( endCol > numResCols ) endCol = numResCols; + resEndCol = resStartCol + COLS_IN_BLOCK -1; + if( resEndCol >= numResCols ) resEndCol = numResCols -1; //end is inclusive: clamp to last valid col for( startVec = 0; startVec < vecLength; ) { - endVec = startVec + VEC_IN_BLOCK; - if( endVec > vecLength ) endVec = vecLength; + endVec = startVec + VEC_IN_BLOCK -1; + if( endVec >= vecLength ) endVec = vecLength -1; //end is inclusive: clamp to last valid vec index //By having the "vector" of sub-blocks in a sub-block slice // be marched down in inner loop, are re-using the result - // matrix, which stays in L1 cache -- can only re-use one of - // the three, so this is the most important -- avoids writing + // matrix, which stays in L1 cache and re-using the left sub-mat + // which repeats for each right sub-mat -- can only re-use two of + // the three, so result is the most important -- avoids writing // dirty blocks until those result-locations fully done //Row and Col is position in result matrix -- so row and vec // for left array, then vec and col for right array multiplySubBlocksTransposed( leftArray, rightArray, resArray, - startRow, endRow, - startCol, endCol, + resStartRow, resEndRow, 
+ resStartCol, resEndCol, startVec, endVec, resStride, inpStride ); - startVec = endVec; + startVec = endVec +1; } - startCol = endCol; + resStartCol = resEndCol +1; } - startRow = endRow; + resStartRow = resEndRow +1; } } + void inline multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, float32 *resArray, - int startRow, int endRow, - int startCol, int endCol, + int resStartRow, int resEndRow, + int resStartCol, int resEndCol, int startVec, int endVec, int resStride, int inpStride ) { - int row, col, vec; + int resRow, resCol, vec; int leftOffset, rightOffset; float32 result; - - for( row = startRow; row < endRow; row++ ) - { - for( col = startCol; col < endCol; col++ ) - { - leftOffset = row * inpStride;//left & right inp strides always same - rightOffset = col * inpStride;// because right is transposed + + //The result row is used only for the left matrix, res col for the right + for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) + { + for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) + { + leftOffset = resRow * inpStride;//left & right inp strides always same + rightOffset = resCol * inpStride;// because right is transposed result = 0; - for( vec = startVec; vec < endVec; vec++ ) + for( vec = startVec; vec <= endVec; vec++ ) { result += leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; } - - resArray[ row * resStride + col ] += result; + + resArray[ resRow * resStride + resCol ] += result; } } } + + /*Reuse this in divider when do the sequential multiply case */ void inline diff -r 4e14e2663af9 -r cbd8db6b8657 src/Application/main.c --- a/src/Application/main.c Tue Nov 02 17:00:50 2010 -0700 +++ b/src/Application/main.c Thu Nov 04 17:44:15 2010 -0700 @@ -19,6 +19,8 @@ { Matrix *leftMatrix, *rightMatrix, *resultMatrix; ParamBag *paramBag; + printf( "arguments: %s | %s\n", argv[0], argv[1] ); + paramBag = makeParamBag(); readParamFileIntoBag( argv[1], paramBag ); initialize_Input_Matrices_Via( &leftMatrix, 
&rightMatrix, paramBag );