Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > C > Cilk > Cilk__Matrix_Mult__Bench
changeset 3:bf7331ed394e tip
Working version of blocked matrix mult -- same as SSR, VCilk & VPThread
| author | Me |
|---|---|
| date | Wed, 10 Nov 2010 06:07:54 -0800 |
| parents | ec0629f70ee5 |
| children | |
| files | src/Application/CILK__Matrix_Mult/CILK__Matrix_Mult.h src/Application/CILK__Matrix_Mult/Divide_Pr.cilk src/Application/CILK__Matrix_Mult/EntryPoint.cilk src/Application/CILK__Matrix_Mult/Vector_Pr.cilk src/Application/CILK__Matrix_Mult/subMatrix_Pr.cilk src/Application/Makefile src/Application/Matrix_Mult.c src/Application/Matrix_Mult.h src/Application/main.cilk |
| diffstat | 9 files changed, 1199 insertions(+), 206 deletions(-) [+] |
line diff
1.1 --- a/src/Application/CILK__Matrix_Mult/CILK__Matrix_Mult.h Tue Oct 26 19:34:03 2010 -0700 1.2 +++ b/src/Application/CILK__Matrix_Mult/CILK__Matrix_Mult.h Wed Nov 10 06:07:54 2010 -0800 1.3 @@ -3,14 +3,23 @@ 1.4 * Licensed under GNU General Public License version 2 1.5 */ 1.6 1.7 -#ifndef _VPThread__MATRIX_MULT_H_ 1.8 -#define _VPThread__MATRIX_MULT_H_ 1.9 +#ifndef _Cilk__MATRIX_MULT_H_ 1.10 +#define _Cilk__MATRIX_MULT_H_ 1.11 1.12 #include <stdio.h> 1.13 1.14 #include "VMS_primitive_data_types.h" 1.15 #include "../Matrix_Mult.h" 1.16 1.17 +//=============================== Defines ============================== 1.18 +#define ROWS_IN_BLOCK 32 1.19 +#define COLS_IN_BLOCK 32 1.20 +#define VEC_IN_BLOCK 32 1.21 + 1.22 +#define copyMatrixSingleton 1 1.23 +#define copyTransposeSingleton 2 1.24 + 1.25 + 1.26 //============================== Structures ============================== 1.27 typedef struct 1.28 { 1.29 @@ -20,44 +29,55 @@ 1.30 } 1.31 DividerParams; 1.32 1.33 -typedef struct 1.34 - { 1.35 - int numRows; 1.36 - int numCols; 1.37 +typedef 1.38 +struct 1.39 + { int32 numRows; 1.40 + int32 numCols; 1.41 + Matrix *origMatrix; 1.42 + int32 origStartRow; 1.43 + int32 origStartCol; 1.44 + int32 alreadyCopied; 1.45 + float32 *array; //2D, but dynamically sized, so use addr arith 1.46 } 1.47 -ResultsParams; 1.48 +SubMatrix; 1.49 1.50 typedef struct 1.51 { 1.52 - int myCol; 1.53 - int myRow; 1.54 - int vectLength; 1.55 - Matrix *leftMatrix; 1.56 - Matrix *rightMatrix; 1.57 - float32 result; 1.58 + SubMatrix *leftSubMatrix; 1.59 + SubMatrix *rightSubMatrix; 1.60 + float32 *partialResultArray; 1.61 } 1.62 -VectorParams; 1.63 +SMPairParams; 1.64 + 1.65 +typedef 1.66 +struct 1.67 + { int32 numVals; 1.68 + int32 *startVals; 1.69 + } 1.70 +SlicingStruc; 1.71 + 1.72 +typedef 1.73 +struct 1.74 + { 1.75 + SlicingStruc *leftRowSlices; 1.76 + SlicingStruc *vecSlices; 1.77 + SlicingStruc *rightColSlices; 1.78 + } 1.79 +SlicingStrucCarrier; 1.80 1.81 typedef struct 1.82 { 1.83 - //for communicating vector results to results Thd 1.84 - int32 vector_mutex; 1.85 - int32 vector_cond; 1.86 - VectorParams *currVector; 1.87 - 1.88 - //for communicating results array back to seed (divider) Thd 1.89 - int32 results_mutex; 1.90 - int32 results_cond; 1.91 - float32 *results; 1.92 - 1.93 - //for ensuring results thd has vector lock before making vector thds 1.94 - int32 start_mutex; 1.95 - int32 start_cond; 1.96 - 1.97 - Matrix *rightMatrix; 1.98 - Matrix *resultMatrix; 1.99 + int32 numVecIdxs; 1.100 + int32 numRightColIdxs; 1.101 + int32 leftRowIdxOffset; 1.102 + int32 resColIdx; 1.103 + SubMatrix **leftSubMatrices; 1.104 + SubMatrix **rightSubMatrices; 1.105 + float32 *resultArray; 1.106 + int32 coreToRunOn; 1.107 + int32 vecID; 1.108 } 1.109 -MatrixMultGlobals; 1.110 +VecParams; 1.111 1.112 1.113 //============================= Processor Functions ========================= 1.114 @@ -67,8 +87,9 @@ 1.115 1.116 1.117 //================================ Entry Point ============================== 1.118 -//cilk Matrix *\ 1.119 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); 1.120 +//cilk 1.121 +//Matrix * 1.122 +//multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); 1.123 1.124 1.125 #endif /*_VPThread__MATRIX_MULT_H_*/
2.1 --- a/src/Application/CILK__Matrix_Mult/Divide_Pr.cilk Tue Oct 26 19:34:03 2010 -0700 2.2 +++ b/src/Application/CILK__Matrix_Mult/Divide_Pr.cilk Wed Nov 10 06:07:54 2010 -0800 2.3 @@ -1,72 +1,610 @@ 2.4 -/* 2.5 - * Copyright 2009 OpenSourceStewardshipFoundation.org 2.6 - * Licensed under GNU General Public License version 2 2.7 - * 2.8 - * Author: seanhalle@yahoo.com 2.9 - * 2.10 - */ 2.11 - 2.12 - 2.13 -#include "CILK__Matrix_Mult.h" 2.14 - 2.15 -cilk float32 calcVector( void * ); 2.16 - 2.17 -/*Divider creates one processor for every row-col pair. 2.18 - * It hands them: 2.19 +/* 2.20 + * Copyright 2009 OpenSourceStewardshipFoundation.org 2.21 + * Licensed under GNU General Public License version 2 2.22 + * 2.23 + * Author: seanhalle@yahoo.com 2.24 + * 2.25 + */ 2.26 + 2.27 + 2.28 +#include "CILK__Matrix_Mult.h" 2.29 +#include "VMS_primitive_data_types.h" 2.30 + 2.31 +#include <math.h> 2.32 +#include <sys/time.h> 2.33 +#include <string.h> 2.34 +#include <malloc.h> 2.35 + 2.36 + //The time to compute this many result values should equal the time to 2.37 + // perform this division on a matrix of size gives that many result calcs 2.38 + //IE, size this so that sequential time to calc equals divide time 2.39 + // find the value by experimenting -- but divide time and calc time scale 2.40 + // same way, so this value should remain valid across hardware 2.41 + //Divide time is about 800us on 2.4Ghz core2Quad laptop core 2.42 + //num cells is the cube of a side, when have two square matrices 2.43 +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 100000 /* about 46x46 */ 2.44 + 2.45 + //Cilk doesn't have VCilk's facilities, so define constants 2.46 +#define MIN_WORK_UNIT_CYCLES 100000 2.47 +#define IDEAL_NUM_WORK_UNITS 20 2.48 +#define NUMBER_OF_CORES_TO_SPAWN_ONTO 4 2.49 + 2.50 +//=============================== External ================================= 2.51 +cilk void calcVectorOfSubMatrices( void *params ); 2.52 + 2.53 +void inline 2.54 +copyTranspose( int32 numRows, int32 numCols, 2.55 + int32 origStartRow, int32 origStartCol, int32 origStride, 2.56 + float32 *subArray, float32 *origArray ); 2.57 + 2.58 +void inline 2.59 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, 2.60 + float32 *leftArray, float32 *rightArray, 2.61 + float32 *resArray ); 2.62 + 2.63 + 2.64 +void 2.65 +startTimeInterval(); 2.66 + 2.67 +void 2.68 +endIntervalAndPrintTime(); 2.69 + 2.70 + 2.71 +//============================= Within-File =============================== 2.72 +int inline 2.73 +measureMatrixMultPrimitive(); 2.74 + 2.75 +SlicingStrucCarrier * 2.76 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix ); 2.77 + 2.78 +SlicingStruc * 2.79 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal ); 2.80 + 2.81 +void 2.82 +freeSlicingStruc( SlicingStruc *slicingStruc ); 2.83 + 2.84 +SubMatrix ** 2.85 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 2.86 + Matrix *origMatrix, int32 transposeTheMatrix ); 2.87 + 2.88 +void 2.89 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 2.90 + SubMatrix **subMatrices ); 2.91 + 2.92 +cilk void 2.93 +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, 2.94 + SubMatrix **rightSubMatrices, 2.95 + int32 numRowIdxs, int32 numColIdxs, 2.96 + int32 numVecIdxs, 2.97 + float32 *resultArray ); 2.98 + 2.99 +cilk void 2.100 +makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, 2.101 + SlicingStrucCarrier *slicingStrucCarrier, 2.102 + float32 *resultArray ); 2.103 + 2.104 +//=========================================================================== 2.105 + 2.106 +/*Divider creates one processor for every sub-matrix 2.107 + * It hands them: 2.108 * the name of the result processor that they should send their results to, 2.109 - * the left and right matrices, and the row and col they should multiply 2.110 - * the length of the vector 2.111 - * It first creates the result processor, then all the vector processors, 2.112 + * the left and right matrices, and the rows and cols they should multiply 2.113 + * It first creates the result processor, then all the sub-matrixPair 2.114 + * processors, 2.115 * then does a receive of a message from the result processor that gives 2.116 * the divider ownership of the result matrix. 2.117 - * Finally, the divider returns the result matrix out of the VPThread system. 2.118 - */ 2.119 - 2.120 -cilk void divideIntoVectors( void *_dividerParams ) 2.121 - { 2.122 - DividerParams *dividerParams; 2.123 - VectorParams *vectParams; 2.124 - Matrix *leftMatrix, *rightMatrix, *resultMatrix; 2.125 - int32 numCells, numCols, mrow, mcol; 2.126 - float32 *resultMatrixArray; 2.127 - 2.128 - dividerParams = (DividerParams *)_dividerParams; 2.129 - 2.130 - leftMatrix = dividerParams->leftMatrix; 2.131 - rightMatrix = dividerParams->rightMatrix; 2.132 - 2.133 - 2.134 - numCols = rightMatrix->numCols; 2.135 - 2.136 - numCells = leftMatrix->numRows * rightMatrix->numCols; 2.137 - resultMatrixArray = malloc( numCells * sizeof( float32 ) ); 2.138 - 2.139 - 2.140 - //spawn vector calcs 2.141 - for( mrow = 0; mrow < leftMatrix->numRows; mrow++ ) 2.142 - { for( mcol = 0; mcol < rightMatrix->numCols; mcol++ ) 2.143 - { 2.144 - vectParams = malloc( sizeof(VectorParams) ); 2.145 - vectParams->myCol = mcol; 2.146 - vectParams->myRow = mrow; 2.147 - vectParams->vectLength = leftMatrix->numCols; 2.148 - vectParams->leftMatrix = leftMatrix; 2.149 - vectParams->rightMatrix = rightMatrix; 2.150 - 2.151 - 2.152 - resultMatrixArray[ mrow * numCols + mcol ] = spawn calcVector( vectParams ); 2.153 - } 2.154 - } 2.155 - 2.156 - sync; 2.157 - 2.158 - 2.159 - //The results of the all the work have to be linked-to from the data 2.160 - // struc given to the seed procr -- this divide func is animated by 2.161 - // that seed procr, so have to link results to the _dividerParams. 2.162 - resultMatrix = malloc( sizeof(Matrix) ); 2.163 - resultMatrix->numCols = rightMatrix->numCols; 2.164 - resultMatrix->numRows = leftMatrix->numRows; 2.165 - dividerParams->resultMatrix = resultMatrix; 2.166 - resultMatrix->matrix = resultMatrixArray; 2.167 + * Finally, the divider returns the result matrix out of the VCilk system. 2.168 + * 2.169 + * Divider chooses the size of sub-matrices via an algorithm that tries to 2.170 + * keep the minimum work above a threshold. The threshold is machine- 2.171 + * dependent, so ask VCilk for min work-unit time to get a 2.172 + * given overhead 2.173 + * 2.174 + * Divide min work-unit cycles by measured-cycles for one matrix-cell 2.175 + * product -- gives the number of products need to have in min size 2.176 + * matrix. 2.177 + * 2.178 + * So then, take cubed root of this to get the size of a side of min sub- 2.179 + * matrix. That is the size of the ideal square sub-matrix -- so tile 2.180 + * up the two input matrices into ones as close as possible to that size, 2.181 + * and create the pairs of sub-matrices. 2.182 + * 2.183 + *======================== STRATEGIC OVERVIEW ======================= 2.184 + * 2.185 + *This division is a bit tricky, because have to create things in advance 2.186 + * that it's not at first obvious need to be created.. 2.187 + * 2.188 + *First slice up each dimension -- three of them.. this is because will have 2.189 + * to create the sub-matrix's data-structures before pairing the sub-matrices 2.190 + * with each other -- so, have three dimensions to slice up before can 2.191 + * create the sub-matrix data-strucs -- also, have to be certain that the 2.192 + * cols of the left input have the exact same slicing as the rows of the 2.193 + * left matrix, so just to be sure, do the slicing calc once, then use it 2.194 + * for both. 2.195 + * 2.196 + *So, goes like this: 2.197 + *1) calculate the start & end values of each dimension in each matrix. 2.198 + *2) use those values to create sub-matrix structures 2.199 + *3) combine sub-matrices into pairs, as the tasks to perform. 2.200 + * 2.201 + *Have to calculate separately from creating the sub-matrices because of the 2.202 + * nature of the nesting -- would either end up creating the same sub-matrix 2.203 + * multiple times, or else would have to put in detection of whether had 2.204 + * made a particular one already if tried to combine steps 1 and 2. 2.205 + * 2.206 + *Step 3 has to be separate because of the nesting, as well -- same reason, 2.207 + * would either create same sub-matrix multiple times, or else have to 2.208 + * add detection of whether was already created. 2.209 + * 2.210 + *Another way to look at it: there's one level of loop to divide dimensions, 2.211 + * two levels of nesting to create sub-matrices, and three levels to pair 2.212 + * up the sub-matrices. 2.213 + */ 2.214 + 2.215 +cilk void 2.216 +divideWorkIntoSubMatrixPairProcrs( void *_dividerParams ) 2.217 + { 2.218 + DividerParams *dividerParams; 2.219 + Matrix *leftMatrix, *rightMatrix; 2.220 + 2.221 + SlicingStrucCarrier *slicingStrucCarrier; 2.222 + float32 *resultArray; //points to array to be put inside result 2.223 + // matrix 2.224 + int32 numResRows, numResCols, vectLength; 2.225 + 2.226 + 2.227 + 2.228 + startTimeInterval(); 2.229 + 2.230 + 2.231 + //=========== Setup -- make local copies of ptd-to-things, malloc, aso 2.232 + 2.233 +// printf("\nin divider\n"); 2.234 + dividerParams = (DividerParams *)_dividerParams; 2.235 + 2.236 + leftMatrix = dividerParams->leftMatrix; 2.237 + rightMatrix = dividerParams->rightMatrix; 2.238 + 2.239 + vectLength = leftMatrix->numCols; 2.240 + numResRows = leftMatrix->numRows; 2.241 + numResCols = rightMatrix->numCols; 2.242 + resultArray = dividerParams->resultMatrix->array; 2.243 + 2.244 + //zero the result array 2.245 + memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); 2.246 + 2.247 + 2.248 + //============== Do either sequential mult or do division ============== 2.249 + 2.250 + //Check if input matrices too small -- if yes, just do sequential 2.251 + //Cutoff is determined by overhead of this divider -- relatively 2.252 + // machine-independent 2.253 + if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * 2.254 + (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) 2.255 + { int32 vectLength; 2.256 + 2.257 + //====== Do sequential multiply on a single core 2.258 + 2.259 + //transpose the right matrix 2.260 + float32 * 2.261 + transRightArray = malloc( rightMatrix->numRows * 2.262 + rightMatrix->numCols * 2.263 + sizeof(float32) ); 2.264 + 2.265 + //copy values from orig matrix to local 2.266 + copyTranspose( rightMatrix->numRows, rightMatrix->numCols, 2.267 + 0, 0, rightMatrix->numRows, 2.268 + transRightArray, rightMatrix->array ); 2.269 + 2.270 + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 2.271 + leftMatrix->array, transRightArray, 2.272 + resultArray ); 2.273 + } 2.274 + else 2.275 + { 2.276 + //====== Do parallel multiply across cores 2.277 + 2.278 + //Calc the ideal size of sub-matrix and slice up the dimensions of 2.279 + // the two matrices. 2.280 + //The ideal size is the one takes the number of cycles to calculate 2.281 + // such that calc time is equal or greater than min work-unit size 2.282 + slicingStrucCarrier = 2.283 + calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix ); 2.284 + 2.285 + 2.286 + //Make the sub-matrices, and pair them up, then spawn processors to 2.287 + // calc product of each pair. 2.288 +// printf("first spawn\n"); 2.289 + spawn makeSubMatricesAndSpawnAndSync( leftMatrix, rightMatrix, 2.290 + slicingStrucCarrier, 2.291 + resultArray ); 2.292 + sync; 2.293 + //The result array will get filled in by the spawned children 2.294 + } 2.295 + 2.296 + 2.297 + //=============== Work done -- send results back ================= 2.298 + 2.299 + 2.300 + endIntervalAndPrintTime(); 2.301 + 2.302 +// printf("done with divider\n"); 2.303 + 2.304 + //results sent back by side-effect 2.305 +} 2.306 + 2.307 + 2.308 +cilk void 2.309 +makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, 2.310 + SlicingStrucCarrier *slicingStrucCarrier, 2.311 + float32 *resultArray ) 2.312 + { 2.313 + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 2.314 + SubMatrix **leftSubMatrices, **rightSubMatrices; 2.315 + int32 numRowIdxs, numColIdxs, numVecIdxs; 2.316 + 2.317 + leftRowSlices = slicingStrucCarrier->leftRowSlices; 2.318 + vecSlices = slicingStrucCarrier->vecSlices; 2.319 + rightColSlices = slicingStrucCarrier->rightColSlices; 2.320 + free( slicingStrucCarrier ); 2.321 + 2.322 + //================ Make sub-matrices, given the slicing ================ 2.323 +// printf("about to create submatrices\n"); 2.324 + leftSubMatrices = 2.325 + createSubMatrices( leftRowSlices, vecSlices, 2.326 + leftMatrix, FALSE ); 2.327 + rightSubMatrices = 2.328 + createSubMatrices( vecSlices, rightColSlices, 2.329 + rightMatrix, TRUE ); 2.330 + 2.331 + //============== pair the sub-matrices and make processors ============== 2.332 + 2.333 + numRowIdxs = leftRowSlices->numVals; 2.334 + numColIdxs = rightColSlices->numVals; 2.335 + numVecIdxs = vecSlices->numVals; 2.336 +// printf("about to spawn %d, %d, %d\n", numRowIdxs, numColIdxs, numVecIdxs); 2.337 + spawn pairUpSubMatricesAndSpawnAndSync( leftSubMatrices, rightSubMatrices, 2.338 + numRowIdxs, numColIdxs, numVecIdxs, 2.339 + resultArray ); 2.340 + sync; 2.341 +// printf("done with sub matrices spawn and sync\n"); 2.342 + freeSubMatrices( leftRowSlices, vecSlices, leftSubMatrices ); 2.343 + freeSubMatrices( vecSlices, rightColSlices, rightSubMatrices ); 2.344 + 2.345 +// printf("done freeing sub matrices\n"); 2.346 + //It syncs inside, so know all work is done now: free the sub-matrices 2.347 + freeSlicingStruc( leftRowSlices ); 2.348 + freeSlicingStruc( vecSlices ); 2.349 + freeSlicingStruc( rightColSlices ); 2.350 + 2.351 +// printf("done freeing slicing strucs\n"); 2.352 } 2.353 + 2.354 + 2.355 + 2.356 + 2.357 +/* numRows*colsPerRow/numCores = numToPutOntoEachCore; 2.358 + * put all from a given row onto same core, until exhaust allotment for that 2.359 + * core 2.360 + * 2.361 + */ 2.362 +cilk void 2.363 +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, 2.364 + SubMatrix **rightSubMatrices, 2.365 + int32 numRowIdxs, int32 numColIdxs, 2.366 + int32 numVecIdxs, 2.367 + float32 *resultArray ) 2.368 + { 2.369 + int32 resRowIdx, resColIdx; 2.370 + int32 numLeftColIdxs, numRightColIdxs; 2.371 + int32 leftRowIdxOffset; 2.372 + VecParams *vecParams; 2.373 + float32 numToPutOntoEachCore, leftOverFraction; 2.374 + int32 numCores, currCore, numOnCurrCore, numVecs = 0; 2.375 + 2.376 + numLeftColIdxs = numColIdxs; 2.377 + numRightColIdxs = numVecIdxs; 2.378 + 2.379 + numCores = NUMBER_OF_CORES_TO_SPAWN_ONTO; 2.380 + 2.381 + numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; 2.382 + leftOverFraction = 0; 2.383 + numOnCurrCore = 0; 2.384 + currCore = 0; 2.385 + 2.386 + resRowIdx = 0; 2.387 +// printf("spawning vects, numOnEachCore: %f\n", numToPutOntoEachCore); 2.388 + for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) 2.389 + { 2.390 + leftRowIdxOffset = resRowIdx * numLeftColIdxs; 2.391 + 2.392 + for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) 2.393 + { 2.394 + vecParams = malloc( sizeof(VecParams) ); 2.395 + 2.396 + vecParams->numVecIdxs = numVecIdxs; 2.397 + vecParams->numRightColIdxs = numRightColIdxs; 2.398 + vecParams->leftRowIdxOffset = leftRowIdxOffset; 2.399 + vecParams->resColIdx = resColIdx; 2.400 + vecParams->leftSubMatrices = leftSubMatrices; 2.401 + vecParams->rightSubMatrices = rightSubMatrices; 2.402 + vecParams->resultArray = resultArray; 2.403 + vecParams->coreToRunOn = currCore; 2.404 + vecParams->vecID = numVecs++; 2.405 + 2.406 +// printf("spawning vect %d\n", numVecs-1); fflush(stdin); 2.407 + spawn calcVectorOfSubMatrices( vecParams ); 2.408 + 2.409 + numOnCurrCore += 1; 2.410 + if( numOnCurrCore + leftOverFraction >= numToPutOntoEachCore - 1 ) 2.411 + { 2.412 + //deal with fractional part, to ensure that imbalance is 1 max 2.413 + // IE, core with most has only 1 more than core with least 2.414 + leftOverFraction += numToPutOntoEachCore - numOnCurrCore; 2.415 + if( leftOverFraction >= 1 ) 2.416 + { leftOverFraction -= 1; 2.417 + numOnCurrCore = -1; 2.418 + } 2.419 + else 2.420 + { numOnCurrCore = 0; 2.421 + } 2.422 + //Move to next core, max core-value to incr to is numCores -1 2.423 + if( currCore >= numCores -1 ) 2.424 + { currCore = 0; 2.425 + } 2.426 + else 2.427 + { currCore += 1; 2.428 + } 2.429 + } 2.430 + } 2.431 + } 2.432 + 2.433 + //Free Note: vector of sub-matrices does its own free-ing, even vec-params 2.434 + 2.435 +// printf("done with making vectors\n", numToPutOntoEachCore); 2.436 + 2.437 + sync; 2.438 + 2.439 + //free the sub-matrices in Fn that called this one 2.440 + } 2.441 + 2.442 + 2.443 +/*Walk through the two slice-strucs, making sub-matrix strucs as go 2.444 + */ 2.445 +SubMatrix ** 2.446 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 2.447 + Matrix *origMatrix, int32 transposeTheMatrix ) 2.448 + { 2.449 + int32 numRowIdxs, numColIdxs, rowIdx, colIdx; 2.450 + int32 startRow, endRow, startCol, endCol; 2.451 + int32 *rowStartVals, *colStartVals; 2.452 + int32 rowOffset, dummy; 2.453 + SubMatrix **subMatrices, *newSubMatrix; 2.454 + 2.455 + numRowIdxs = rowSlices->numVals; 2.456 + numColIdxs = colSlices->numVals; 2.457 + 2.458 + rowStartVals = rowSlices->startVals; 2.459 + colStartVals = colSlices->startVals; 2.460 + 2.461 + subMatrices = malloc( numRowIdxs * numColIdxs *sizeof(SubMatrix *) ); 2.462 + 2.463 + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 2.464 + { 2.465 + rowOffset = rowIdx * numColIdxs; 2.466 + 2.467 + startRow = rowStartVals[rowIdx]; 2.468 + endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is 2.469 + // at last valid idx + 1 & is 2.470 + // 1 greater than end value 2.471 + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 2.472 + { 2.473 + startCol = colStartVals[colIdx]; 2.474 + endCol = colStartVals[colIdx + 1] -1; 2.475 + 2.476 + newSubMatrix = malloc( sizeof(SubMatrix) ); 2.477 + newSubMatrix->numRows = endRow - startRow +1; 2.478 + newSubMatrix->numCols = endCol - startCol +1; 2.479 + newSubMatrix->origMatrix = origMatrix; 2.480 + newSubMatrix->origStartRow = startRow; 2.481 + newSubMatrix->origStartCol = startCol; 2.482 + //no parallel singleton in Cilk, so copy here 2.483 + if( transposeTheMatrix ) 2.484 + { copyTransposeFromOrig( newSubMatrix ); 2.485 + } 2.486 + else 2.487 + { copyFromOrig( newSubMatrix ); 2.488 + } 2.489 +// printf("just copied: %X", newSubMatrix->array); 2.490 + newSubMatrix->alreadyCopied = TRUE; 2.491 + 2.492 + subMatrices[ rowOffset + colIdx ] = newSubMatrix; 2.493 + } 2.494 + } 2.495 + return subMatrices; 2.496 + } 2.497 + 2.498 +void 2.499 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 2.500 + SubMatrix **subMatrices ) 2.501 + { 2.502 + int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; 2.503 + SubMatrix *subMatrix; 2.504 + 2.505 + numRowIdxs = rowSlices->numVals; 2.506 + numColIdxs = colSlices->numVals; 2.507 + 2.508 + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 2.509 + { 2.510 + rowOffset = rowIdx * numColIdxs; 2.511 + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 2.512 + { 2.513 + subMatrix = subMatrices[ rowOffset + colIdx ]; 2.514 + if( subMatrix->alreadyCopied ) 2.515 + free( subMatrix->array ); 2.516 + free( subMatrix ); 2.517 + } 2.518 + } 2.519 + free( subMatrices ); 2.520 + } 2.521 + 2.522 + 2.523 +SlicingStrucCarrier * 2.524 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix ) 2.525 +{ 2.526 + float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; 2.527 + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 2.528 + int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol; 2.529 + 2.530 + SlicingStrucCarrier *slicingStrucCarrier = 2.531 + malloc(sizeof(SlicingStrucCarrier) ); 2.532 + 2.533 + int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; 2.534 + float64 numPrimitiveOpsInMinWorkUnit; 2.535 + 2.536 + 2.537 + //======= Calc ideal size of min-sized sub-matrix ======== 2.538 + 2.539 + //ask VCilk for the number of cycles of the minimum work unit, at given 2.540 + // percent overhead then add a guess at overhead from this divider 2.541 + minWorkUnitCycles = MIN_WORK_UNIT_CYCLES; 2.542 + 2.543 + //ask VCilk for number of cycles of the "primitive" op of matrix mult 2.544 + primitiveCycles = measureMatrixMultPrimitive( ); 2.545 + 2.546 + numPrimitiveOpsInMinWorkUnit = 2.547 + (float64)minWorkUnitCycles / (float64)primitiveCycles; 2.548 + 2.549 + //take cubed root -- that's number of these in a "side" of sub-matrix 2.550 + // then multiply by 5 because the primitive is 5x5 2.551 + idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); 2.552 + 2.553 + idealNumWorkUnits = IDEAL_NUM_WORK_UNITS; 2.554 + 2.555 + idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); 2.556 + idealSizeOfSide2 *= 0.8; //finer granularity to help load balance 2.557 + 2.558 + if( idealSizeOfSide1 > idealSizeOfSide2 ) 2.559 + idealSizeOfSide = idealSizeOfSide1; 2.560 + else 2.561 + idealSizeOfSide = idealSizeOfSide2; 2.562 + 2.563 + 2.564 + //============ Slice up dimensions, now that know target size =========== 2.565 + 2.566 + //Tell the slicer the target size of a side (floating pt), the start 2.567 + // value to start slicing at, and the end value to stop slicing at 2.568 + //It returns an array of start value of each chunk, plus number of them 2.569 + 2.570 + startLeftRow = 0; 2.571 + endLeftRow = leftMatrix->numRows -1; 2.572 + startVec = 0; 2.573 + endVec = leftMatrix->numCols -1; 2.574 + startRightCol = 0; 2.575 + endRightCol = rightMatrix->numCols -1; 2.576 + 2.577 + leftRowSlices = 2.578 + sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow ); 2.579 + 2.580 + vecSlices = 2.581 + sliceUpDimension( idealSizeOfSide, startVec, endVec ); 2.582 + 2.583 + rightColSlices = 2.584 + sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol ); 2.585 + 2.586 + slicingStrucCarrier->leftRowSlices = leftRowSlices; 2.587 + slicingStrucCarrier->vecSlices = vecSlices; 2.588 + slicingStrucCarrier->rightColSlices = rightColSlices; 2.589 + 2.590 + return slicingStrucCarrier; 2.591 +} 2.592 + 2.593 + 2.594 +SlicingStruc * 2.595 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal ) 2.596 + { float32 residualAcc = 0; 2.597 + int numSlices, i, *startVals, sizeOfSlice, endCondition; 2.598 + SlicingStruc *slicingStruc = malloc( sizeof(SlicingStruc) ); 2.599 + 2.600 + //calc size of matrix need to hold start vals -- 2.601 + numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); 2.602 + 2.603 + startVals = malloc( (numSlices + 1) * sizeof(int32) ); 2.604 + 2.605 + //Calc the upper limit of start value -- when get above this, end loop 2.606 + // by saving highest value of the matrix dimension to access, plus 1 2.607 + // as the start point of the imaginary slice following the last one 2.608 + //Plus 1 because go up to value but not include when process last slice 2.609 + //The stopping condition is half-a-size less than highest value because 2.610 + // don't want any pieces smaller than half the ideal size -- just tack 2.611 + // little ones onto end of last one 2.612 + endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size 2.613 + for( i = 0; startVal <= endVal; i++ ) 2.614 + { 2.615 + startVals[i] = startVal; 2.616 + residualAcc += idealSizeOfSide; 2.617 + sizeOfSlice = (int)residualAcc; 2.618 + residualAcc -= (float32)sizeOfSlice; 2.619 + startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. 2.620 + 2.621 + if( startVal > endCondition ) 2.622 + { startVal = endVal + 1; 2.623 + startVals[ i + 1 ] = startVal; 2.624 + } 2.625 + } 2.626 + 2.627 + slicingStruc->startVals = startVals; 2.628 + slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 2.629 + // which means is num sub-matrices in dim 2.630 + // also == idx of the fake start just above 2.631 + return slicingStruc; 2.632 + } 2.633 + 2.634 +void 2.635 +freeSlicingStruc( SlicingStruc *slicingStruc ) 2.636 + { 2.637 + free( slicingStruc->startVals ); 2.638 + free( slicingStruc ); 2.639 + } 2.640 + 2.641 + 2.642 +int inline 2.643 +measureMatrixMultPrimitive() 2.644 + { 2.645 + int r, c, v, numCycles; 2.646 + float32 *res, *left, *right; 2.647 + 2.648 + //setup inputs 2.649 + left = malloc( 5 * 5 * sizeof( float32 ) ); 2.650 + right = malloc( 5 * 5 * sizeof( float32 ) ); 2.651 + res = malloc( 5 * 5 * sizeof( float32 ) ); 2.652 + 2.653 + for( r = 0; r < 5; r++ ) 2.654 + { 2.655 + for( c = 0; c < 5; c++ ) 2.656 + { 2.657 + left[ r * 5 + c ] = r; 2.658 + right[ r * 5 + c ] = c; 2.659 + } 2.660 + } 2.661 + 2.662 + //do primitive 2.663 +// VCilk__start_primitive(); //for now, just takes time stamp 2.664 + for( r = 0; r < 5; r++ ) 2.665 + { 2.666 + for( c = 0; c < 5; c++ ) 2.667 + { 2.668 + for( v = 0; v < 5; v++ ) 2.669 + { 2.670 + res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; 2.671 + } 2.672 + } 2.673 + } 2.674 +// numCycles = VCilk__end_primitive_and_give_cycles(); 2.675 + 2.676 + free( left ); 2.677 + free( right ); 2.678 + free( res ); 2.679 + 2.680 + return numCycles; 2.681 + }
3.1 --- a/src/Application/CILK__Matrix_Mult/EntryPoint.cilk Tue Oct 26 19:34:03 2010 -0700 3.2 +++ b/src/Application/CILK__Matrix_Mult/EntryPoint.cilk Wed Nov 10 06:07:54 2010 -0800 3.3 @@ -1,16 +1,22 @@ 3.4 /* 3.5 - * Copyright 2009 OpenSourceCodeStewardshipFoundation.org 3.6 + * Copyright 2009 OpenSourceStewardshipFoundation.org 3.7 * Licensed under GNU General Public License version 2 3.8 * 3.9 * Author: seanhalle@yahoo.com 3.10 * 3.11 */ 3.12 3.13 +#include <math.h> 3.14 + 3.15 #include "CILK__Matrix_Mult.h" 3.16 3.17 -cilk void divideIntoVectors( void * ); 3.18 +//========================================================================== 3.19 +cilk void 3.20 +divideWorkIntoSubMatrixPairProcrs( void *_dividerParams ); 3.21 3.22 -/*Every VPThread system has an "entry point" function that creates the first 3.23 +//========================================================================== 3.24 + 3.25 +/*Every VCilk system has an "entry point" function that creates the first 3.26 * processor, which starts the chain of creating more processors.. 3.27 * eventually all of the processors will dissipate themselves, and 3.28 * return. 3.29 @@ -19,27 +25,41 @@ 3.30 * functions do: 3.31 *1) it creates the params for the seed processor, from the 3.32 * parameters passed into the entry-point function 3.33 - *2) it calls VPThread__create_seed_procr_and_do_work 3.34 + *2) it calls VCilk__create_seed_procr_and_do_work 3.35 *3) it gets the return value from the params struc, frees the params struc, 3.36 * and returns the value from the function 3.37 * 3.38 */ 3.39 -cilk 3.40 -Matrix * 3.41 +cilk Matrix * 3.42 multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) 3.43 { Matrix *resMatrix; 3.44 DividerParams *dividerParams; 3.45 + int32 numResRows, numResCols; 3.46 3.47 - 3.48 +// printf("entry point"); 3.49 dividerParams = malloc( sizeof( DividerParams ) ); 3.50 dividerParams->leftMatrix = leftMatrix; 3.51 dividerParams->rightMatrix = rightMatrix; 3.52 + 3.53 + numResRows = leftMatrix->numRows; 3.54 + numResCols = rightMatrix->numCols; 3.55 + resMatrix = malloc( sizeof(Matrix) ); 3.56 + resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); 3.57 + resMatrix->numCols = rightMatrix->numCols; 3.58 + resMatrix->numRows = leftMatrix->numRows; 3.59 3.60 - spawn divideIntoVectors( dividerParams ); 3.61 + 3.62 + dividerParams->resultMatrix = resMatrix; 3.63 + 3.64 + //create divider processor, start doing the work, and wait till done 3.65 + //This function is the "border crossing" between normal code and VCilk 3.66 + //VCilk__create_seed_procr_and_do_work( ÷WorkIntoSubMatrixPairProcrs,\ 3.67 + dividerParams ); 3.68 + spawn divideWorkIntoSubMatrixPairProcrs( dividerParams ); 3.69 + 3.70 sync; 3.71 - 3.72 - //get result matrix and return it 3.73 - resMatrix = dividerParams->resultMatrix; 3.74 + 3.75 + //return result matrix 3.76 free( dividerParams ); 3.77 return resMatrix; 3.78 }
4.1 --- a/src/Application/CILK__Matrix_Mult/Vector_Pr.cilk Tue Oct 26 19:34:03 2010 -0700 4.2 +++ b/src/Application/CILK__Matrix_Mult/Vector_Pr.cilk Wed Nov 10 06:07:54 2010 -0800 4.3 @@ -1,48 +1,119 @@ 4.4 -/* 4.5 - * Copyright 2009 OpenSourceCodeStewardshipFoundation.org 4.6 +/* 4.7 + * Copyright 2009 OpenSourceStewardshipFoundation.org 4.8 * Licensed under GNU General Public License version 2 4.9 * 4.10 - * Author: SeanHalle@yahoo.com 4.11 + * Author: seanhalle@yahoo.com 4.12 * 4.13 */ 4.14 4.15 + 4.16 #include "CILK__Matrix_Mult.h" 4.17 +#include <math.h> 4.18 +#include <stdlib.h> 4.19 +#include <malloc.h> 4.20 4.21 -/*A Vector processor is created with an environment that holds two matrices, 4.22 - * the row and col that it owns, and the name of a result gathering 4.23 - * processor. 4.24 - *It calculates its vector product then sends the result to the result 4.25 - * processor, which puts it into the result matrix and returns that matrix 4.26 - * when all is done. 4.27 - */ 4.28 -cilk 4.29 -float32 4.30 -calcVector( void *data ) 4.31 - { 4.32 - VectorParams *params; 4.33 - int myRow, myCol, vectLength, pos; 4.34 - float32 *leftMatrixArray, *rightMatrixArray, result = 0.0; 4.35 - Matrix *leftMatrix, *rightMatrix; 4.36 +//=========================================================================== 4.37 4.38 - params = (VectorParams *)data; 4.39 - myCol = params->myCol; 4.40 - myRow = params->myRow; 4.41 - vectLength = params->vectLength; 4.42 - leftMatrix = params->leftMatrix; 4.43 - rightMatrix = params->rightMatrix; 4.44 - leftMatrixArray = leftMatrix->matrix; 4.45 - rightMatrixArray = rightMatrix->matrix; 4.46 - //===================== DEBUG ====================== 4.47 - #ifdef PRINT_DEBUG 4.48 - if( myCol == 0 ) 4.49 - printf("start vector: %d, %d\n", myRow, myCol ); fflush(stdin); 4.50 - #endif 4.51 - //==================================================== 4.52 +cilk void calcSubMatrixProduct( void *data ); 4.53 4.54 - for( pos = 0; pos < vectLength; pos++ ) 4.55 + 4.56 +void inline 4.57 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, 4.58 + int32 startRow, 4.59 + int32 numRows, 4.60 + int32 startCol, 4.61 + int32 numCols, 4.62 + int32 numOrigCols ); 4.63 + 4.64 + 4.65 +//=========================================================================== 4.66 + 4.67 +cilk void 4.68 +calcVectorOfSubMatrices( void *_vecParams ) 4.69 + { int32 numVecIdxs, leftRowIdxOffset, numRightColIdxs, resColIdx; 4.70 + SubMatrix **leftSubMatrices, **rightSubMatrices; 4.71 + float32 *resultArray; 4.72 + int32 vecIdx, coreWithAffinity; 4.73 + SMPairParams *subMatrixPairParams, **vecOfSubMatrixParams; 4.74 + VecParams *vecParams; 4.75 + 4.76 + vecParams = (VecParams *)_vecParams; 4.77 + 4.78 + numVecIdxs = vecParams->numVecIdxs; 4.79 + numRightColIdxs = vecParams->numRightColIdxs; 4.80 + leftRowIdxOffset = vecParams->leftRowIdxOffset; 4.81 + resColIdx = vecParams->resColIdx; 4.82 + leftSubMatrices = vecParams->leftSubMatrices; 4.83 + rightSubMatrices = vecParams->rightSubMatrices; 4.84 + resultArray = vecParams->resultArray; 4.85 + coreWithAffinity = vecParams->coreToRunOn; 4.86 + 4.87 +// printf("inside vector %d, %d\n", leftRowIdxOffset, resColIdx ); 4.88 + 4.89 + vecOfSubMatrixParams = malloc( numVecIdxs * sizeof(SMPairParams *) ); 4.90 + if( vecOfSubMatrixParams == 0 ){printf("malloc error"); exit(1);} 4.91 + 4.92 + for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) 4.93 { 4.94 - result += leftMatrixArray[ myRow * vectLength + pos ] * 4.95 - rightMatrixArray[ pos * vectLength + myCol]; 4.96 + //Make the processor for the pair of sub-matrices 4.97 + subMatrixPairParams = malloc( sizeof(SMPairParams) ); 4.98 + subMatrixPairParams->leftSubMatrix = 4.99 + leftSubMatrices[ leftRowIdxOffset + vecIdx ]; 4.100 + 4.101 + subMatrixPairParams->rightSubMatrix = 4.102 + rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; 4.103 + 4.104 +// printf("about to spawn pair %X\n", (int)subMatrixPairParams->leftSubMatrix->array); 4.105 + spawn calcSubMatrixProduct( subMatrixPairParams ); 4.106 +// printf("done with spawn\n"); 4.107 + vecOfSubMatrixParams[ vecIdx ] = subMatrixPairParams; 4.108 } 4.109 - return result; 4.110 + 4.111 +// printf("done spawning product pairs\n"); 4.112 + sync; 4.113 + 4.114 + //now accumulate individual result matrices into final result matrix 4.115 + for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) 4.116 + { 4.117 + subMatrixPairParams = vecOfSubMatrixParams[ vecIdx ]; 4.118 + 4.119 + accumulateResult( resultArray, subMatrixPairParams->partialResultArray, 4.120 + subMatrixPairParams->leftSubMatrix->origStartRow, 4.121 + subMatrixPairParams->leftSubMatrix->numRows, 4.122 + subMatrixPairParams->rightSubMatrix->origStartCol, 4.123 + subMatrixPairParams->rightSubMatrix->numCols, 4.124 + subMatrixPairParams->rightSubMatrix->origMatrix->numCols); 4.125 + 4.126 + //Note, resultArray is made on the core that produces the results 4.127 + // that gives chance to set affinity so all in vector run on same 4.128 + // core and re-use that array, and prevents writes from causing 4.129 + // thrashing of the cache -- as long as array big enough, the copy 4.130 + // overhead is miniscule vs the size-of-side reuse of each byte 4.131 + free( subMatrixPairParams->partialResultArray ); 4.132 + free( subMatrixPairParams ); 4.133 + } 4.134 + free( vecOfSubMatrixParams ); 4.135 + free( vecParams ); 4.136 } 4.137 + 4.138 + 4.139 + 4.140 +void inline 4.141 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, 4.142 + int32 startRow, 4.143 + int32 numRows, 4.144 + int32 startCol, 4.145 + int32 numCols, 4.146 + int32 numOrigCols ) 4.147 + { int32 row, col; 4.148 + 4.149 + for( row = 0; row < numRows; row++ ) 4.150 + { 4.151 + for( col = 0; col < numCols; col++ ) 4.152 + { 4.153 + resultArray[ (row + startRow) * numOrigCols + col + startCol ] += 4.154 + subMatrixResultArray[ row * numCols + col ]; 4.155 + } 4.156 + } 4.157 + 4.158 + }
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 5.2 +++ b/src/Application/CILK__Matrix_Mult/subMatrix_Pr.cilk Wed Nov 10 06:07:54 2010 -0800 5.3 @@ -0,0 +1,301 @@ 5.4 +/* 5.5 + * Copyright 2010 OpenSourceStewardshipFoundation.org 5.6 + * Licensed under GNU General Public License version 2 5.7 + * 5.8 + * Author: SeanHalle@yahoo.com 5.9 + * 5.10 + */ 5.11 + 5.12 +#include <string.h> 5.13 + 5.14 +#include "CILK__Matrix_Mult.h" 5.15 + 5.16 + 5.17 +void inline 5.18 +copyFromOrig( SubMatrix *subMatrix ); 5.19 + 5.20 +void inline 5.21 +copyTransposeFromOrig( SubMatrix *subMatrix ); 5.22 + 5.23 +void inline 5.24 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 5.25 + float32 *resArray, 5.26 + int startRow, int endRow, 5.27 + int startCol, int endCol, 5.28 + int startVec, int endVec, 5.29 + int resStride, int inpStride ); 5.30 + 5.31 +void inline 5.32 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, 5.33 + float32 *leftArray, float32 *rightArray, 5.34 + float32 *resArray ); 5.35 + 5.36 + 5.37 +/*A processor is created with an environment that holds two matrices, 5.38 + * the row and col that it owns, and the name of a result gathering 5.39 + * processor. 5.40 + *It calculates the product of two sub-portions of the input matrices 5.41 + * by using Intel's mkl library for single-core. 5.42 + * 5.43 + *This demonstrates using optimized single-threaded code inside scheduled 5.44 + * work-units. 5.45 + * 5.46 + *When done, it sends the result to the result processor 5.47 + */ 5.48 +cilk void 5.49 +calcSubMatrixProduct( void *data ) 5.50 + { 5.51 + SMPairParams *params; 5.52 + float32 *leftArray, *rightArray, *resArray; 5.53 + SubMatrix *leftSubMatrix, *rightSubMatrix; 5.54 + int32 resSize; 5.55 + int32 numResRows, numResCols, vectLength; 5.56 + 5.57 +// printf("inside submatrix-pair\n"); 5.58 + params = (SMPairParams *)data; 5.59 + leftSubMatrix = params->leftSubMatrix; 5.60 + rightSubMatrix = params->rightSubMatrix; 5.61 + 5.62 + 5.63 + //make sure the input sub-matrices have been copied out of orig 5.64 + //copyFromOrig( leftSubMatrix, animatingPr ); 5.65 + //copyTransposeFromOrig( rightSubMatrix, animatingPr ); 5.66 + 5.67 + leftArray = leftSubMatrix->array; 5.68 + rightArray = rightSubMatrix->array; 5.69 + 5.70 + //make this array here, on the core that computes the results 5.71 + // with Cilk's semantics, have to have separate result array for each 5.72 + // spawned processor -- unless want to change the spawn and sync 5.73 + // pattern, such that spawn one from each vector, then sync, then 5.74 + // another, and so forth -- this will cause idle time due to imbalance 5.75 + // in matrix sizes 5.76 + //This also gives chance to set affinity so all in vector run on same 5.77 + // core and re-use the accumulation array, 5.78 + //As a side-benefit, it also prevents writes from causing 5.79 + // thrashing of the cache -- as long as array big enough, the copy 5.80 + // overhead is small because each byte is reused size-of-side times 5.81 + //This is freed in the vector processor 5.82 + resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); 5.83 + resArray = malloc( resSize ); 5.84 + memset( resArray, 0, resSize ); 5.85 + 5.86 + 5.87 + vectLength = leftSubMatrix->numCols; 5.88 + numResRows = leftSubMatrix->numRows; 5.89 + numResCols = rightSubMatrix->numCols; 5.90 + 5.91 +// printf("just before multiply arrays %X\n", leftArray); 5.92 + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 5.93 + leftArray, rightArray, 5.94 + resArray ); 5.95 + 5.96 + //send result by side-effect 5.97 + params->partialResultArray = resArray; 5.98 + } 5.99 + 5.100 + 5.101 + 5.102 +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into 5.103 + * the 32KB L1 cache. 5.104 + *Would be nice to embed this within another level that divided into 5.105 + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache 5.106 + * 5.107 + *Eventually want these divisions to be automatic, using DKU pattern 5.108 + * embedded into VMS and exposed in the language, and with VMS controlling the 5.109 + * divisions according to the cache sizes, which it knows about. 5.110 + *Also, want VMS to work with language to split among main-mems, so a socket 5.111 + * only cranks on data in its local segment of main mem 5.112 + * 5.113 + *So, outer two loops determine start and end points within the result matrix. 5.114 + * Inside that, a loop dets the start and end points along the shared dimensions 5.115 + * of the two input matrices. 5.116 + */ 5.117 +void inline 5.118 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, 5.119 + int32 numResCols, 5.120 + float32 *leftArray, float32 *rightArray, 5.121 + float32 *resArray ) 5.122 + { 5.123 + int resStride, inpStride; 5.124 + int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; 5.125 + 5.126 + resStride = numResCols; 5.127 + inpStride = vecLength; 5.128 + 5.129 + for( resStartRow = 0; resStartRow < numResRows; ) 5.130 + { 5.131 + resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 5.132 + if( resEndRow > numResRows ) resEndRow = numResRows -1; 5.133 + 5.134 + for( resStartCol = 0; resStartCol < numResCols; ) 5.135 + { 5.136 + resEndCol = resStartCol + COLS_IN_BLOCK -1; 5.137 + if( resEndCol > numResCols ) resEndCol = numResCols -1; 5.138 + 5.139 + for( startVec = 0; startVec < vecLength; ) 5.140 + { 5.141 + endVec = startVec + VEC_IN_BLOCK -1; 5.142 + if( endVec > vecLength ) endVec = vecLength -1; 5.143 + 5.144 +// printf("just before multiply sub-blocks %X\n", leftArray); 5.145 + //By having the "vector" of sub-blocks in a sub-block slice 5.146 + // be marched down in inner loop, are re-using the result 5.147 + // matrix, which stays in L1 cache and re-using the left sub-mat 5.148 + // which repeats for each right sub-mat -- can only re-use two of 5.149 + // the three, so result is the most important -- avoids writing 5.150 + // dirty blocks until those result-locations fully done 5.151 + //Row and Col is position in result matrix -- so row and vec 5.152 + // for left array, then vec and col for right array 5.153 + multiplySubBlocksTransposed( leftArray, rightArray, 5.154 + resArray, 5.155 + resStartRow, resEndRow, 5.156 + resStartCol, resEndCol, 5.157 + startVec, endVec, 5.158 + resStride, inpStride ); 5.159 + startVec = endVec +1; 5.160 + } 5.161 + resStartCol = resEndCol +1; 5.162 + } 5.163 + resStartRow = resEndRow +1; 5.164 + } 5.165 + } 5.166 + 5.167 + 5.168 + 5.169 +void inline 5.170 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 5.171 + float32 *resArray, 5.172 + int resStartRow, int resEndRow, 5.173 + int resStartCol, int resEndCol, 5.174 + int startVec, int endVec, 5.175 + int resStride, int inpStride ) 5.176 + { 5.177 + int resRow, resCol, vec; 5.178 + int leftOffset, rightOffset; 5.179 + float32 result; 5.180 + 5.181 +// printf("start col, row | end col, row: %d, %d, %d, %d\n", resStartCol, resStartRow, resEndCol, resEndRow); 5.182 + //The result row is used for the left matrix, res col for the right 5.183 + for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) 5.184 + { 5.185 + for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) 5.186 + { 5.187 + leftOffset = resRow * inpStride;//left & right inp strides same 5.188 + rightOffset = resCol * inpStride;// because right is transposed 5.189 + result = 0; 5.190 + for( vec = startVec; vec <= endVec; vec++ ) 5.191 + { 5.192 + result += 5.193 + leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; 5.194 + } 5.195 + 5.196 + resArray[ resRow * resStride + resCol ] += result; 5.197 + } 5.198 + } 5.199 + } 5.200 + 5.201 + 5.202 +/*Reuse this in divider when do the sequential multiply case 5.203 + */ 5.204 +void inline 5.205 +copyTranspose( int32 numRows, int32 numCols, 5.206 + int32 origStartRow, int32 origStartCol, int32 origStride, 5.207 + float32 *subArray, float32 *origArray ) 5.208 + { int32 stride; 5.209 + int32 row, col, origOffset; 5.210 + 5.211 + stride = numRows; 5.212 + for( row = 0; row < numRows; row++ ) 5.213 + { 5.214 + origOffset = (row + origStartRow) * origStride + origStartCol; 5.215 + for( col = 0; col < numCols; col++ ) 5.216 + { 5.217 + //transpose means swap row & col -- traverse orig matrix normally 5.218 + // but put into reversed place in local array -- means the 5.219 + // stride is the numRows now, so col * numRows + row 5.220 + subArray[ col * stride + row ] = origArray[ origOffset + col ]; 5.221 + } 5.222 + } 5.223 + } 5.224 + 5.225 +void inline 5.226 +copyTransposeFromOrig( SubMatrix *subMatrix ) 5.227 + { int numCols, numRows, origStartRow, origStartCol, origStride, stride; 5.228 + Matrix *origMatrix; 5.229 + float32 *origArray, *subArray; 5.230 + 5.231 + 5.232 +// if( subMatrix->alreadyCopied ) return; 5.233 +// VCilk__start_singleton(copyMatrixSingleton,&&EndOfTranspSingleton,animPr); 5.234 + 5.235 + origMatrix = subMatrix->origMatrix; 5.236 + origArray = origMatrix->array; 5.237 + numCols = subMatrix->numCols; 5.238 + numRows = subMatrix->numRows; 5.239 + origStartRow = subMatrix->origStartRow; 5.240 + origStartCol = subMatrix->origStartCol; 5.241 + origStride = origMatrix->numCols; 5.242 + 5.243 + subArray = malloc( numRows * numCols *sizeof(float32) ); 5.244 + subMatrix->array = subArray; 5.245 +// printf("copying transpose %X\n", subArray); 5.246 + 5.247 + //copy values from orig matrix to local 5.248 + copyTranspose( numRows, numCols, 5.249 + origStartRow, origStartCol, origStride, 5.250 + subArray, origArray ); 5.251 + 5.252 + subMatrix->alreadyCopied = TRUE; //must be last thing before label 5.253 +// EndOfTranspSingleton: 5.254 + return; 5.255 + } 5.256 + 5.257 + 5.258 +void inline 5.259 +copyFromOrig( SubMatrix *subMatrix ) 5.260 + { int numCols, numRows, origStartRow, origStartCol, stride, origStride; 5.261 + Matrix *origMatrix; 5.262 + float32 *origArray, *subArray; 5.263 + int32 row, col, offset, origOffset; 5.264 + 5.265 + //This lets only a single VP execute the code between start and 5.266 + // end -- using start and end so that work runs outside the master. 5.267 + //Inside, if a second VP ever executes the start, it will be returned 5.268 + // from the end-point. 5.269 + //Note, for non-GCC, can add a second SSR call at the end, and inside 5.270 + // that one, look at the stack at the return addr & save that in an 5.271 + // array indexed by singletonID 5.272 +// if( subMatrix->alreadyCopied ) return; 5.273 +// VCilk__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton,animPr); 5.274 + 5.275 + 5.276 + origMatrix = subMatrix->origMatrix; 5.277 + origArray = origMatrix->array; 5.278 + numCols = subMatrix->numCols; 5.279 + numRows = subMatrix->numRows; 5.280 + origStartRow = subMatrix->origStartRow; 5.281 + origStartCol = subMatrix->origStartCol; 5.282 + origStride = origMatrix->numCols; 5.283 + 5.284 + subArray = malloc( numRows * numCols *sizeof(float32) ); 5.285 + subMatrix->array = subArray; 5.286 +// printf("copying normal %X\n", subArray); 5.287 + 5.288 + //copy values from orig matrix to local 5.289 + stride = numCols; 5.290 + 5.291 + for( row = 0; row < numRows; row++ ) 5.292 + { 5.293 + offset = row * stride; 5.294 + origOffset = (row + origStartRow) * origStride + origStartCol; 5.295 + for( col = 0; col < numCols; col++ ) 5.296 + { 5.297 + subArray[ offset + col ] = origArray[ origOffset + col ]; 5.298 + } 5.299 + } 5.300 + 5.301 + subMatrix->alreadyCopied = TRUE; //must be last thing before label 5.302 +// EndOfCopySingleton: 5.303 + return; 5.304 + }
6.1 --- a/src/Application/Makefile Tue Oct 26 19:34:03 2010 -0700 6.2 +++ b/src/Application/Makefile Wed Nov 10 06:07:54 2010 -0800 6.3 @@ -1,35 +1,51 @@ 6.4 +# 6.5 +# Copyright Nov 6, 2010 OpenSourceStewardshipFoundation.org 6.6 +# Licensed under GNU General Public License version 2 6.7 +# 6.8 +# author seanhalle@yahoo.com 6.9 6.10 6.11 CILK_SOURCE = \ 6.12 CILK__Matrix_Mult/EntryPoint.cilk \ 6.13 CILK__Matrix_Mult/Divide_Pr.cilk \ 6.14 CILK__Matrix_Mult/Vector_Pr.cilk \ 6.15 + CILK__Matrix_Mult/subMatrix_Pr.cilk \ 6.16 main.cilk 6.17 6.18 C_SOURCE = \ 6.19 matrix_mult.c \ 6.20 - ParamHelper/ParamBag.c\ 6.21 + ParamHelper/ParamBag.c \ 6.22 ParamHelper/ReadParamsFromFile.c 6.23 6.24 +#The next two rules make a new string, with the same names as the 6.25 +# source, but the endings changed to .o 6.26 +#The third concatenates the two source-file strings 6.27 C_OBJS = $(C_SOURCE:.c=.o) 6.28 6.29 CILK_OBJS = $(CILK_SOURCE:.cilk=.o) 6.30 6.31 -OBJECTS = $(C_SOURCE) $(CILK_SOURCE) 6.32 +OBJECTS = $(C_OBJS) $(CILK_OBJS) 6.33 + 6.34 6.35 #Make has the built-in variable "$<" which is the source file 6.36 # and "$@" which is the target for that source 6.37 +#The first rule says for each file in C_SOURCE, put the 6.38 +# name in place of $< and expand 6.39 +# C_OBJS in place of $@ and run the gcc command, when files are 6.40 +# out of date 6.41 $(C_OBJS): $(C_SOURCE) 6.42 - gcc -c $< -o $@ 6.43 + gcc -c $*.c -o $*.o; 6.44 6.45 $(CILK_OBJS): $(CILK_SOURCE) 6.46 - gcc -c $< -o $@ 6.47 + cilkc -c $*.cilk -o $*.o 6.48 6.49 all: $(OBJECTS) 6.50 cilkc $(OBJECTS) -o CILK_Linux__Matrix_Mult; \ 6.51 cp CILK_Linux__Matrix_Mult ~/D/2__INRIA_OMP/1__Development/2__runs_and_data/executables 6.52 6.53 6.54 +clean: 6.55 + rm *.o; rm ParamHelper/*.o; rm CILK__Matrix_Mult/*.o 6.56 6.57 #================================================================ 6.58 #Other stuff tried/played_with/copied 6.59 @@ -62,8 +78,6 @@ 6.60 #================================================================ 6.61 # playing with below.. 6.62 6.63 -#7C9A-RV6P-3XE2-JV99-426K-2K 6.64 - 6.65 #rule for inferring that the .cilk file is the source for .o file 6.66 # and how to create the .o from the .cilk 6.67 #%.o : %.cilk
7.1 --- a/src/Application/Matrix_Mult.c Tue Oct 26 19:34:03 2010 -0700 7.2 +++ b/src/Application/Matrix_Mult.c Wed Nov 10 06:07:54 2010 -0800 7.3 @@ -61,7 +61,7 @@ 7.4 7.5 numRows = matrixStruc->numRows; 7.6 numCols = matrixStruc->numCols; 7.7 - matrixStart = matrixStruc->matrix; 7.8 + matrixStart = matrixStruc->array; 7.9 7.10 file = fopen( matrixFileName, "r" ); 7.11 if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} 7.12 @@ -131,7 +131,7 @@ 7.13 retMatrix = malloc( sizeof( Matrix ) ); 7.14 retMatrix->numRows = numRows; 7.15 retMatrix->numCols = numCols; 7.16 - retMatrix->matrix = malloc( numRows * numCols * sizeof(float32) ); 7.17 + retMatrix->array = malloc( numRows * numCols * sizeof(float32) ); 7.18 7.19 return retMatrix; 7.20 } 7.21 @@ -142,24 +142,26 @@ 7.22 } 7.23 void 7.24 freeMatrix( Matrix * matrix ) 7.25 - { free( matrix->matrix ); 7.26 + { free( matrix->array ); 7.27 free( matrix ); 7.28 } 7.29 7.30 void 7.31 printMatrix( Matrix *matrix ) 7.32 - { int r, c, numRows, numCols; 7.33 + { 7.34 + int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; 7.35 float32 *matrixArray; 7.36 7.37 - numRows = matrix->numRows; 7.38 - numCols = matrix->numCols; 7.39 - matrixArray = matrix->matrix; 7.40 + numRows = rowsToPrint = matrix->numRows; 7.41 + numCols = colsToPrint = matrix->numCols; 7.42 + matrixArray = matrix->array; 7.43 7.44 - for( r = 0; r < numRows; r++ ) 7.45 - { for( c = 0; c < numCols; c++ ) 7.46 - { printf( "%f | ", *(matrixArray + r*numCols + c) ); 7.47 + rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed 7.48 + colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed 7.49 + for( r = 0; r < numRows; r += rowIncr ) 7.50 + { for( c = 0; c < numCols; c += colIncr ) 7.51 + { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); 7.52 } 7.53 printf("\n"); 7.54 } 7.55 } 7.56 -
8.1 --- a/src/Application/Matrix_Mult.h Tue Oct 26 19:34:03 2010 -0700 8.2 +++ b/src/Application/Matrix_Mult.h Wed Nov 10 06:07:54 2010 -0800 8.3 @@ -19,7 +19,7 @@ 8.4 struct 8.5 { int32 numRows; 8.6 int32 numCols; 8.7 - float32 *matrix; //2D, but dynamically sized, so use addr arith 8.8 + float32 *array; //2D, but dynamically sized, so use addr arith 8.9 } 8.10 Matrix; 8.11
9.1 --- a/src/Application/main.cilk Tue Oct 26 19:34:03 2010 -0700 9.2 +++ b/src/Application/main.cilk Wed Nov 10 06:07:54 2010 -0800 9.3 @@ -1,41 +1,67 @@ 9.4 -/* 9.5 - * Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org 9.6 +/* 9.7 + * Copyright 2010 OpenSourcStewardshipFoundation.org 9.8 * Licensed under GNU General Public License version 2 9.9 - * 9.10 - * author seanhalle@yahoo.com 9.11 - */ 9.12 - 9.13 -#include <malloc.h> 9.14 -#include <stdlib.h> 9.15 - 9.16 -#include "Matrix_Mult.h" 9.17 -#include "CILK__Matrix_Mult/CILK__Matrix_Mult.h" 9.18 - 9.19 -cilk Matrix * 9.20 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); 9.21 - 9.22 -/** 9.23 - *Matrix multiply program written using VMS_HW piggy-back language 9.24 - * 9.25 - */ 9.26 -cilk 9.27 -int main( int argc, char **argv ) 9.28 - { Matrix *leftMatrix, *rightMatrix, *resultMatrix; 9.29 - ParamBag *paramBag; 9.30 - 9.31 - 9.32 - paramBag = makeParamBag(); 9.33 - readParamFileIntoBag( argv[1], paramBag ); 9.34 - initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); 9.35 - 9.36 - resultMatrix = spawn multiplyTheseMatrices( leftMatrix, rightMatrix ); 9.37 - sync; 9.38 - 9.39 - printf("\nresult matrix: \n"); 9.40 - 9.41 -// printMatrix( resultMatrix ); 9.42 - 9.43 -// VPThread__print_stats(); 9.44 - 9.45 - exit(0); //cleans up 9.46 - } 9.47 + * 9.48 + * author seanhalle@yahoo.com 9.49 + */ 9.50 + 9.51 +#include <malloc.h> 9.52 +#include <stdlib.h> 9.53 + 9.54 +#include "Matrix_Mult.h" 9.55 +#include "CILK__Matrix_Mult/CILK__Matrix_Mult.h" 9.56 + 9.57 + 9.58 + //single global var -- just get it done 9.59 +struct timeval startStamp; 9.60 + 9.61 + 9.62 +void 9.63 +startTimeInterval() 9.64 + { 9.65 + gettimeofday( &startStamp, NULL); 9.66 + } 9.67 + 9.68 + 9.69 +void 9.70 +endIntervalAndPrintTime() 9.71 + { 9.72 + struct timeval endStamp; 9.73 + float64 startSecs, endSecs, intervalSecs; 9.74 + 9.75 + gettimeofday( &endStamp, NULL); 9.76 + 9.77 + startSecs = startStamp.tv_sec + ( startStamp.tv_usec / 1000000.0 ); 9.78 + endSecs = endStamp.tv_sec + ( endStamp.tv_usec / 1000000.0 ); 9.79 + 9.80 + intervalSecs = endSecs - startSecs; 9.81 + printf("Interval: %f", intervalSecs); 9.82 + } 9.83 + 9.84 + 9.85 +cilk Matrix * 9.86 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); 9.87 + 9.88 +/** 9.89 + *Matrix multiply program written using VMS_HW piggy-back language 9.90 + * 9.91 + */ 9.92 +cilk 9.93 +int main( int argc, char **argv ) 9.94 + { Matrix *leftMatrix, *rightMatrix, *resultMatrix; 9.95 + ParamBag *paramBag; 9.96 + 9.97 + 9.98 + paramBag = makeParamBag(); 9.99 + readParamFileIntoBag( argv[1], paramBag ); 9.100 + initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); 9.101 + 9.102 + resultMatrix = spawn multiplyTheseMatrices( leftMatrix, rightMatrix ); 9.103 + sync; 9.104 + 9.105 + printf("\nresult matrix: \n"); 9.106 + 9.107 +// printMatrix( resultMatrix ); 9.108 + 9.109 + exit(0); //cleans up 9.110 + }
