Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VCilk > VCilk__Blocked_Matrix_Mult__Bench
changeset 1:06c80cd3c303
Fixed bugs in multiply code -- has correct loop nest now and initializes result
| author | Me |
|---|---|
| date | Sun, 07 Nov 2010 06:55:26 -0800 |
| parents | 56e17dcfc0c3 |
| children | 5311bd32c3cb |
| files | src/Application/VCilk__Matrix_Mult/Divide_Pr.c src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c src/Application/main.c |
| diffstat | 3 files changed, 46 insertions(+), 34 deletions(-) [+] |
line diff
```
1.1 --- a/src/Application/VCilk__Matrix_Mult/Divide_Pr.c Sat Oct 30 20:43:49 2010 -0700
1.2 +++ b/src/Application/VCilk__Matrix_Mult/Divide_Pr.c Sun Nov 07 06:55:26 2010 -0800
1.3 @@ -128,7 +128,6 @@
1.4
1.5 PRINT_DEBUG("start divide\n")
1.6
1.7 -//TODO: VMS__create_block_of_probes_with_idxs( 0, 2 );
1.8 int32
1.9 divideProbe = VMS__create_single_interval_probe( "divideProbe",
1.10 animPr );
```
```
2.1 --- a/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c Sat Oct 30 20:43:49 2010 -0700
2.2 +++ b/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c Sun Nov 07 06:55:26 2010 -0800
2.3 @@ -78,8 +78,10 @@
2.4 // thrashing of the cache -- as long as array big enough, the copy
2.5 // overhead is small because each byte is reused size-of-side times
2.6 //This is freed in the vector processor
2.7 - resArray = VCilk__malloc(leftSubMatrix->numRows * rightSubMatrix->numCols*
2.8 - sizeof( float32 ), animatingPr );
2.9 + int32
2.10 + resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32);
2.11 + resArray = VCilk__malloc( resSize, animatingPr );
2.12 + memset( resArray, 0, resSize );
2.13
2.14
2.15 int32 numResRows, numResCols, vectLength;
2.16 @@ -101,16 +103,21 @@
2.17 }
2.18
2.19
2.20 -/*Divides into 32x32 sub-matrices, 3 of which fit into 32KB L1 cache
2.21 - * Would be nice to embed this within another level that divided into
2.22 +
2.23 +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into
2.24 + * the 32KB L1 cache.
2.25 + *Would be nice to embed this within another level that divided into
2.26 * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
2.27 *
2.28 *Eventually want these divisions to be automatic, using DKU pattern
2.29 - * embedded into VCilk, and with VMS controlling the divisions according to
2.30 - * the cache sizes, which it knows about.
2.31 - *And, want VMS to work with language to split among main-mems, so a socket
2.32 + * embedded into VMS and exposed in the language, and with VMS controlling the
2.33 + * divisions according to the cache sizes, which it knows about.
2.34 + *Also, want VMS to work with language to split among main-mems, so a socket
2.35 * only cranks on data in its local segment of main mem
2.36 *
2.37 + *So, outer two loops determine start and end points within the result matrix.
2.38 + * Inside that, a loop dets the start and end points along the shared dimensions
2.39 + * of the two input matrices.
2.40 */
2.41 void inline
2.42 multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols,
2.43 @@ -118,74 +125,79 @@
2.44 float32 *resArray )
2.45 {
2.46 int resStride, inpStride;
2.47 - int startRow, startCol, endRow, endCol, startVec, endVec;
2.48 + int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
2.49
2.50 resStride = numResCols;
2.51 inpStride = vecLength;
2.52
2.53 - for( startRow = 0; startRow < numResRows; )
2.54 + for( resStartRow = 0; resStartRow < numResRows; )
2.55 {
2.56 - endRow = startRow + ROWS_IN_BLOCK;
2.57 - if( endRow > numResRows ) endRow = numResRows;
2.58 + resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1
2.59 + if( resEndRow > numResRows ) resEndRow = numResRows;
2.60
2.61 - for( startCol = 0; startCol < numResCols; )
2.62 + for( resStartCol = 0; resStartCol < numResCols; )
2.63 {
2.64 - endCol = startCol + COLS_IN_BLOCK;
2.65 - if( endCol > numResCols ) endCol = numResCols;
2.66 + resEndCol = resStartCol + COLS_IN_BLOCK -1;
2.67 + if( resEndCol > numResCols ) resEndCol = numResCols;
2.68 + resStartCol = resEndCol +1;
2.69 +
2.70
2.71 for( startVec = 0; startVec < vecLength; )
2.72 {
2.73 - endVec = startVec + VEC_IN_BLOCK;
2.74 + endVec = startVec + VEC_IN_BLOCK -1;
2.75 if( endVec > vecLength ) endVec = vecLength;
2.76
2.77 //By having the "vector" of sub-blocks in a sub-block slice
2.78 // be marched down in inner loop, are re-using the result
2.79 - // matrix, which stays in L1 cache -- can only re-use one of
2.80 - // the three, so this is the most important -- avoids writing
2.81 + // matrix, which stays in L1 cache and re-using the left sub-mat
2.82 + // which repeats for each right sub-mat -- can only re-use two of
2.83 + // the three, so result is the most important -- avoids writing
2.84 // dirty blocks until those result-locations fully done
2.85 //Row and Col is position in result matrix -- so row and vec
2.86 // for left array, then vec and col for right array
2.87 multiplySubBlocksTransposed( leftArray, rightArray,
2.88 resArray,
2.89 - startRow, endRow,
2.90 - startCol, endCol,
2.91 + resStartRow, resEndRow,
2.92 + resStartCol, resEndCol,
2.93 startVec, endVec,
2.94 resStride, inpStride );
2.95 - startVec = endVec;
2.96 + startVec = endVec +1;
2.97 }
2.98 - startCol = endCol;
2.99 + resStartCol = resEndCol +1;
2.100 }
2.101 - startRow = endRow;
2.102 + resStartRow = resEndRow +1;
2.103 }
2.104 }
2.105
2.106
2.107 +
2.108 void inline
2.109 multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
2.110 float32 *resArray,
2.111 - int startRow, int endRow,
2.112 - int startCol, int endCol,
2.113 + int resStartRow, int resEndRow,
2.114 + int resStartCol, int resEndCol,
2.115 int startVec, int endVec,
2.116 int resStride, int inpStride )
2.117 {
2.118 - int row, col, vec;
2.119 + int resRow, resCol, vec;
2.120 int leftOffset, rightOffset;
2.121 float32 result;
2.122 -
2.123 - for( row = startRow; row < endRow; row++ )
2.124 +
2.125 + //The result row is used only for the left matrix, res col for the right
2.126 + for( resCol = resStartCol; resCol <= resEndCol; resCol++ )
2.127 {
2.128 - for( col = startCol; col < endCol; col++ )
2.129 + for( resRow = resStartRow; resRow <= resEndRow; resRow++ )
2.130 {
2.131 - leftOffset = row * inpStride;//left & right inp strides always same
2.132 - rightOffset = col * inpStride;// because right is transposed
2.133 + leftOffset = resRow * inpStride;//left & right inp strides always same
2.134 + rightOffset = resCol * inpStride;// because right is transposed
2.135 result = 0;
2.136 - for( vec = startVec; vec < endVec; vec++ )
2.137 + for( vec = startVec; vec <= endVec; vec++ )
2.138 {
2.139 result +=
2.140 leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
2.141 }
2.142
2.143 - resArray[ row * resStride + col ] += result;
2.144 + resArray[ resRow * resStride + resCol ] += result;
2.145 }
2.146 }
2.147 }
2.148 @@ -214,7 +226,7 @@
2.149 origStride = origMatrix->numCols;
2.150
2.151 //This is free in Divide pr after all calcs are done
2.152 - subArray = VCilk__malloc( numRows * numCols * sizeof(float32),animPr);
2.153 + subArray = VCilk__malloc( numRows * numCols * sizeof(float32), animPr );
2.154 subMatrix->array = subArray;
2.155
2.156 //copy values from orig matrix to local
```
```
3.1 --- a/src/Application/main.c Sat Oct 30 20:43:49 2010 -0700
3.2 +++ b/src/Application/main.c Sun Nov 07 06:55:26 2010 -0800
3.3 @@ -20,6 +20,7 @@
3.4 ParamBag *paramBag;
3.5
3.6 paramBag = makeParamBag();
3.7 + printf("arguments: %s | %s | %s | %s\n", argv[0], argv[1], argv[2], argv[3] );
3.8 readParamFileIntoBag( argv[1], paramBag );
3.9 initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
3.10
```
