# HG changeset patch # User Me # Date 1289141726 28800 # Node ID 06c80cd3c3039bcbe762c8c238cef488658b4d69 # Parent 56e17dcfc0c32aaa3cabf7c19208cd2345b501f1 Fixed bugs in multiply code -- has correct loop nest now and initializes result diff -r 56e17dcfc0c3 -r 06c80cd3c303 src/Application/VCilk__Matrix_Mult/Divide_Pr.c --- a/src/Application/VCilk__Matrix_Mult/Divide_Pr.c Sat Oct 30 20:43:49 2010 -0700 +++ b/src/Application/VCilk__Matrix_Mult/Divide_Pr.c Sun Nov 07 06:55:26 2010 -0800 @@ -128,7 +128,6 @@ PRINT_DEBUG("start divide\n") -//TODO: VMS__create_block_of_probes_with_idxs( 0, 2 ); int32 divideProbe = VMS__create_single_interval_probe( "divideProbe", animPr ); diff -r 56e17dcfc0c3 -r 06c80cd3c303 src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c --- a/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c Sat Oct 30 20:43:49 2010 -0700 +++ b/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c Sun Nov 07 06:55:26 2010 -0800 @@ -78,8 +78,10 @@ // thrashing of the cache -- as long as array big enough, the copy // overhead is small because each byte is reused size-of-side times //This is freed in the vector processor - resArray = VCilk__malloc(leftSubMatrix->numRows * rightSubMatrix->numCols* - sizeof( float32 ), animatingPr ); + int32 + resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); + resArray = VCilk__malloc( resSize, animatingPr ); + memset( resArray, 0, resSize ); int32 numResRows, numResCols, vectLength; @@ -101,16 +103,21 @@ } -/*Divides into 32x32 sub-matrices, 3 of which fit into 32KB L1 cache - * Would be nice to embed this within another level that divided into + +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into + * the 32KB L1 cache. + *Would be nice to embed this within another level that divided into * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache * *Eventually want these divisions to be automatic, using DKU pattern - * embedded into VCilk, and with VMS controlling the divisions according to - * the cache sizes, which it knows about. - *And, want VMS to work with language to split among main-mems, so a socket + * embedded into VMS and exposed in the language, and with VMS controlling the + * divisions according to the cache sizes, which it knows about. + *Also, want VMS to work with language to split among main-mems, so a socket * only cranks on data in its local segment of main mem * + *So, outer two loops determine start and end points within the result matrix. + * Inside that, a loop dets the start and end points along the shared dimensions + * of the two input matrices. */ void inline multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols, @@ -118,74 +125,79 @@ float32 *resArray ) { int resStride, inpStride; - int startRow, startCol, endRow, endCol, startVec, endVec; + int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; resStride = numResCols; inpStride = vecLength; - for( startRow = 0; startRow < numResRows; ) + for( resStartRow = 0; resStartRow < numResRows; ) { - endRow = startRow + ROWS_IN_BLOCK; - if( endRow > numResRows ) endRow = numResRows; + resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 + if( resEndRow > numResRows ) resEndRow = numResRows; - for( startCol = 0; startCol < numResCols; ) + for( resStartCol = 0; resStartCol < numResCols; ) { - endCol = startCol + COLS_IN_BLOCK; - if( endCol > numResCols ) endCol = numResCols; + resEndCol = resStartCol + COLS_IN_BLOCK -1; + if( resEndCol > numResCols ) resEndCol = numResCols; + resStartCol = resEndCol +1; + for( startVec = 0; startVec < vecLength; ) { - endVec = startVec + VEC_IN_BLOCK; + endVec = startVec + VEC_IN_BLOCK -1; if( endVec > vecLength ) endVec = vecLength; //By having the "vector" of sub-blocks in a sub-block slice // be marched down in inner loop, are re-using the result - // matrix, which stays in L1 cache -- can only re-use one of - // the three, so this is the most important -- avoids writing + // matrix, which stays in L1 cache and re-using the left sub-mat + // which repeats for each right sub-mat -- can only re-use two of + // the three, so result is the most important -- avoids writing // dirty blocks until those result-locations fully done //Row and Col is position in result matrix -- so row and vec // for left array, then vec and col for right array multiplySubBlocksTransposed( leftArray, rightArray, resArray, - startRow, endRow, - startCol, endCol, + resStartRow, resEndRow, + resStartCol, resEndCol, startVec, endVec, resStride, inpStride ); - startVec = endVec; + startVec = endVec +1; } - startCol = endCol; + resStartCol = resEndCol +1; } - startRow = endRow; + resStartRow = resEndRow +1; } } + void inline multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, float32 *resArray, - int startRow, int endRow, - int startCol, int endCol, + int resStartRow, int resEndRow, + int resStartCol, int resEndCol, int startVec, int endVec, int resStride, int inpStride ) { - int row, col, vec; + int resRow, resCol, vec; int leftOffset, rightOffset; float32 result; - - for( row = startRow; row < endRow; row++ ) + + //The result row is used only for the left matrix, res col for the right + for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) { - for( col = startCol; col < endCol; col++ ) + for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) { - leftOffset = row * inpStride;//left & right inp strides always same - rightOffset = col * inpStride;// because right is transposed + leftOffset = resRow * inpStride;//left & right inp strides always same + rightOffset = resCol * inpStride;// because right is transposed result = 0; - for( vec = startVec; vec < endVec; vec++ ) + for( vec = startVec; vec <= endVec; vec++ ) { result += leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; } - resArray[ row * resStride + col ] += result; + resArray[ resRow * resStride + resCol ] += result; } } } @@ -214,7 +226,7 @@ origStride = origMatrix->numCols; //This is free in Divide pr after all calcs are done - subArray = VCilk__malloc( numRows * numCols * sizeof(float32),animPr); + subArray = VCilk__malloc( numRows * numCols * sizeof(float32), animPr ); subMatrix->array = subArray; //copy values from orig matrix to local diff -r 56e17dcfc0c3 -r 06c80cd3c303 src/Application/main.c --- a/src/Application/main.c Sat Oct 30 20:43:49 2010 -0700 +++ b/src/Application/main.c Sun Nov 07 06:55:26 2010 -0800 @@ -20,6 +20,7 @@ ParamBag *paramBag; paramBag = makeParamBag(); + printf("arguments: %s | %s | %s | %s\n", argv[0], argv[1], argv[2], argv[3] ); readParamFileIntoBag( argv[1], paramBag ); initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );