Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VCilk > VCilk__Blocked_Matrix_Mult__Bench
changeset 5:e223756d0f0c tip
Fixed uninitialized variable and removed warnings
| author | Merten Sach <msach@mailbox.tu-berlin.de> |
|---|---|
| date | Wed, 11 May 2011 15:58:04 +0200 |
| parents | ecba4ae0be7a |
| children | |
| files | src/Application/Matrix_Mult.c src/Application/VCilk__Matrix_Mult/Divide_Pr.c src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c src/Application/main.c |
| diffstat | 5 files changed, 1207 insertions(+), 1204 deletions(-) [+] |
line diff
1.1 --- a/src/Application/Matrix_Mult.c Wed May 11 15:40:54 2011 +0200 1.2 +++ b/src/Application/Matrix_Mult.c Wed May 11 15:58:04 2011 +0200 1.3 @@ -1,167 +1,167 @@ 1.4 -/* 1.5 1.6 - * Copyright 2009 OpenSourceStewardshipFoundation.org 1.7 1.8 - * Licensed under GNU General Public License version 2 1.9 1.10 - * 1.11 1.12 - * Author: seanhalle@yahoo.com 1.13 1.14 - * 1.15 1.16 - * Created on November 15, 2009, 2:35 AM 1.17 1.18 - */ 1.19 1.20 - 1.21 1.22 -#include <malloc.h> 1.23 1.24 -#include <stdlib.h> 1.25 1.26 - 1.27 1.28 -#include "Matrix_Mult.h" 1.29 1.30 -#include "ParamHelper/Param.h" 1.31 1.32 - 1.33 1.34 - 1.35 1.36 - 1.37 1.38 - void 1.39 1.40 -initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 1.41 1.42 - ParamBag *paramBag ) 1.43 1.44 - { char *leftMatrixFileName, *rightMatrixFileName; 1.45 1.46 - int leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols; 1.47 1.48 - 1.49 1.50 - ParamStruc *param; 1.51 1.52 - param = getParamFromBag( "leftMatrixRows", paramBag ); 1.53 1.54 - leftMatrixRows = param->intValue; 1.55 1.56 - param = getParamFromBag( "leftMatrixCols", paramBag ); 1.57 1.58 - leftMatrixCols = param->intValue; 1.59 1.60 - *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols ); 1.61 1.62 - 1.63 1.64 - param = getParamFromBag( "leftMatrixFileName", paramBag ); 1.65 1.66 - leftMatrixFileName = param->strValue; //no need to copy 1.67 1.68 - read_Matrix_From_File( *leftMatrix, leftMatrixFileName ); 1.69 1.70 - 1.71 1.72 - param = getParamFromBag( "rightMatrixRows", paramBag ); 1.73 1.74 - rightMatrixRows = param->intValue; 1.75 1.76 - param = getParamFromBag( "rightMatrixCols", paramBag ); 1.77 1.78 - rightMatrixCols = param->intValue; 1.79 1.80 - *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols ); 1.81 1.82 - 1.83 1.84 - param = getParamFromBag( "rightMatrixFileName", paramBag ); 1.85 1.86 - rightMatrixFileName = param->strValue; 1.87 1.88 - read_Matrix_From_File( *rightMatrix, rightMatrixFileName ); 1.89 1.90 - } 1.91 1.92 - 1.93 1.94 - 1.95 1.96 -void parseLineIntoRow( char *line, float32* row ); 1.97 1.98 - 1.99 1.100 - 1.101 1.102 - void 1.103 1.104 -read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ) 1.105 1.106 - { int row, maxRead, numRows, numCols; 1.107 1.108 - float32 *matrixStart; 1.109 1.110 - size_t lineSz = 0; 1.111 1.112 - FILE *file; 1.113 1.114 - char *line = NULL; 1.115 1.116 - 1.117 1.118 - lineSz = 50000; //max length of line in a matrix data file 1.119 1.120 - line = (char *) malloc( lineSz ); 1.121 1.122 - if( line == NULL ) printf( "no mem for matrix line" ); 1.123 1.124 - 1.125 1.126 - numRows = matrixStruc->numRows; 1.127 1.128 - numCols = matrixStruc->numCols; 1.129 1.130 - matrixStart = matrixStruc->array; 1.131 1.132 - 1.133 1.134 - file = fopen( matrixFileName, "r" ); 1.135 1.136 - if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} 1.137 1.138 - fseek( file, 0, SEEK_SET ); 1.139 1.140 - for( row = 0; row < numRows; row++ ) 1.141 1.142 - { 1.143 1.144 - if( feof( file ) ) printf( "file ran out too soon" ); 1.145 1.146 - maxRead = getline( &line, &lineSz, file ); 1.147 1.148 - if( maxRead == -1 ) printf( "prob reading mat line"); 1.149 1.150 - 1.151 1.152 - if( *line == '\n') continue; //blank line 1.153 1.154 - if( *line == '/' ) continue; //comment line 1.155 1.156 - 1.157 1.158 - parseLineIntoRow( line, matrixStart + row * numCols ); 1.159 1.160 - } 1.161 1.162 - free( line ); 1.163 1.164 - } 1.165 1.166 - 1.167 1.168 -/*This function relies on each line having the proper number of cols. It 1.169 1.170 - * doesn't check, nor enforce, so if the file is improperly formatted it 1.171 1.172 - * can write over unrelated memory 1.173 1.174 - */ 1.175 1.176 - void 1.177 1.178 -parseLineIntoRow( char *line, float32* row ) 1.179 1.180 - { 1.181 1.182 - char *valueStr, *searchPos; 1.183 1.184 - 1.185 1.186 - //read the float values 1.187 1.188 - searchPos = valueStr = line; //start 1.189 1.190 - 1.191 1.192 - for( ; *searchPos != 0; searchPos++) //bit dangerous, should use buff len 1.193 1.194 - { 1.195 1.196 - if( *searchPos == '\n' ) //last col.. relying on well-formatted file 1.197 1.198 - { *searchPos = 0; 1.199 1.200 - *row = atof( valueStr ); 1.201 1.202 - break; //end FOR loop 1.203 1.204 - } 1.205 1.206 - if( *searchPos == ',' ) 1.207 1.208 - { *searchPos = 0; //mark end of string 1.209 1.210 - *row = (float32) atof( valueStr ); 1.211 1.212 - row += 1; //address arith 1.213 1.214 - //skip any spaces before digits.. use searchPos + 1 to skip the 0 1.215 1.216 - for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++); 1.217 1.218 - valueStr = searchPos + 1; 1.219 1.220 - } 1.221 1.222 - } 1.223 1.224 - } 1.225 1.226 - 1.227 1.228 - //========================================================================== 1.229 1.230 - 1.231 1.232 -/*In the "_Flat" version of constructor, do only malloc of the top data struc 1.233 1.234 - * and set values in that top-level. Don't malloc any sub-structures. 1.235 1.236 - */ 1.237 1.238 - Matrix * 1.239 1.240 -makeMatrix_Flat( int32 numRows, int32 numCols ) 1.241 1.242 - { Matrix * retMatrix; 1.243 1.244 - retMatrix = malloc( sizeof( Matrix ) ); 1.245 1.246 - retMatrix->numRows = numRows; 1.247 1.248 - retMatrix->numCols = numCols; 1.249 1.250 - 1.251 1.252 - return retMatrix; 1.253 1.254 - } 1.255 1.256 - 1.257 1.258 - Matrix * 1.259 1.260 -makeMatrix_WithResMat( int32 numRows, int32 numCols ) 1.261 1.262 - { Matrix * retMatrix; 1.263 1.264 - retMatrix = malloc( sizeof( Matrix ) ); 1.265 1.266 - retMatrix->numRows = numRows; 1.267 1.268 - retMatrix->numCols = numCols; 1.269 1.270 - retMatrix->array = malloc( numRows * numCols * sizeof(float32) ); 1.271 1.272 - 1.273 1.274 - return retMatrix; 1.275 1.276 - } 1.277 1.278 - 1.279 1.280 - void 1.281 1.282 -freeMatrix_Flat( Matrix * matrix ) 1.283 1.284 - { //( matrix ); 1.285 1.286 - } 1.287 1.288 - void 1.289 1.290 -freeMatrix( Matrix * matrix ) 1.291 1.292 - { free( matrix->array ); 1.293 1.294 - free( matrix ); 1.295 1.296 - } 1.297 1.298 - 1.299 1.300 -void 1.301 1.302 -printMatrix( Matrix *matrix ) 1.303 1.304 - { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; 1.305 1.306 - float32 *matrixArray; 1.307 1.308 - 1.309 1.310 - numRows = rowsToPrint = matrix->numRows; 1.311 1.312 - numCols = colsToPrint = matrix->numCols; 1.313 1.314 - matrixArray = matrix->array; 1.315 1.316 - 1.317 1.318 - rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed 1.319 1.320 - colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed 1.321 1.322 - for( r = 0; r < numRows; r += rowIncr ) 1.323 1.324 - { for( c = 0; c < numCols; c += colIncr ) 1.325 1.326 - { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); 1.327 1.328 - } 1.329 1.330 - printf("\n"); 1.331 1.332 - } 1.333 1.334 - } 1.335 1.336 - 1.337 1.338 +/* 1.339 + * Copyright 2009 OpenSourceStewardshipFoundation.org 1.340 + * Licensed under GNU General Public License version 2 1.341 + * 1.342 + * Author: seanhalle@yahoo.com 1.343 + * 1.344 + * Created on November 15, 2009, 2:35 AM 1.345 + */ 1.346 + 1.347 +#include <malloc.h> 1.348 +#include <stdlib.h> 1.349 + 1.350 +#include "Matrix_Mult.h" 1.351 +#include "ParamHelper/Param.h" 1.352 + 1.353 + 1.354 + 1.355 + void 1.356 +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 1.357 + ParamBag *paramBag ) 1.358 + { char *leftMatrixFileName, *rightMatrixFileName; 1.359 + int leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols; 1.360 + 1.361 + ParamStruc *param; 1.362 + param = getParamFromBag( "leftMatrixRows", paramBag ); 1.363 + leftMatrixRows = param->intValue; 1.364 + param = getParamFromBag( "leftMatrixCols", paramBag ); 1.365 + leftMatrixCols = param->intValue; 1.366 + *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols ); 1.367 + 1.368 + param = getParamFromBag( "leftMatrixFileName", paramBag ); 1.369 + leftMatrixFileName = param->strValue; //no need to copy 1.370 + read_Matrix_From_File( *leftMatrix, leftMatrixFileName ); 1.371 + 1.372 + param = getParamFromBag( "rightMatrixRows", paramBag ); 1.373 + rightMatrixRows = param->intValue; 1.374 + param = getParamFromBag( "rightMatrixCols", paramBag ); 1.375 + rightMatrixCols = param->intValue; 1.376 + *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols ); 1.377 + 1.378 + param = getParamFromBag( "rightMatrixFileName", paramBag ); 1.379 + rightMatrixFileName = param->strValue; 1.380 + read_Matrix_From_File( *rightMatrix, rightMatrixFileName ); 1.381 + } 1.382 + 1.383 + 1.384 +void parseLineIntoRow( char *line, float32* row ); 1.385 + 1.386 + 1.387 + void 1.388 +read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ) 1.389 + { int row, maxRead, numRows, numCols; 1.390 + float32 *matrixStart; 1.391 + size_t lineSz = 0; 1.392 + FILE *file; 1.393 + char *line = NULL; 1.394 + 1.395 + lineSz = 50000; //max length of line in a matrix data file 1.396 + line = (char *) malloc( lineSz ); 1.397 + if( line == NULL ) printf( "no mem for matrix line" ); 1.398 + 1.399 + numRows = matrixStruc->numRows; 1.400 + numCols = matrixStruc->numCols; 1.401 + matrixStart = matrixStruc->array; 1.402 + 1.403 + file = fopen( matrixFileName, "r" ); 1.404 + if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} 1.405 + fseek( file, 0, SEEK_SET ); 1.406 + for( row = 0; row < numRows; row++ ) 1.407 + { 1.408 + if( feof( file ) ) printf( "file ran out too soon" ); 1.409 + maxRead = getline( &line, &lineSz, file ); 1.410 + if( maxRead == -1 ) printf( "prob reading mat line"); 1.411 + 1.412 + if( *line == '\n') continue; //blank line 1.413 + if( *line == '/' ) continue; //comment line 1.414 + 1.415 + parseLineIntoRow( line, matrixStart + row * numCols ); 1.416 + } 1.417 + free( line ); 1.418 + } 1.419 + 1.420 +/*This function relies on each line having the proper number of cols. It 1.421 + * doesn't check, nor enforce, so if the file is improperly formatted it 1.422 + * can write over unrelated memory 1.423 + */ 1.424 + void 1.425 +parseLineIntoRow( char *line, float32* row ) 1.426 + { 1.427 + char *valueStr, *searchPos; 1.428 + 1.429 + //read the float values 1.430 + searchPos = valueStr = line; //start 1.431 + 1.432 + for( ; *searchPos != 0; searchPos++) //bit dangerous, should use buff len 1.433 + { 1.434 + if( *searchPos == '\n' ) //last col.. relying on well-formatted file 1.435 + { *searchPos = 0; 1.436 + *row = atof( valueStr ); 1.437 + break; //end FOR loop 1.438 + } 1.439 + if( *searchPos == ',' ) 1.440 + { *searchPos = 0; //mark end of string 1.441 + *row = (float32) atof( valueStr ); 1.442 + row += 1; //address arith 1.443 + //skip any spaces before digits.. use searchPos + 1 to skip the 0 1.444 + for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++); 1.445 + valueStr = searchPos + 1; 1.446 + } 1.447 + } 1.448 + } 1.449 + 1.450 + //========================================================================== 1.451 + 1.452 +/*In the "_Flat" version of constructor, do only malloc of the top data struc 1.453 + * and set values in that top-level. Don't malloc any sub-structures. 1.454 + */ 1.455 + Matrix * 1.456 +makeMatrix_Flat( int32 numRows, int32 numCols ) 1.457 + { Matrix * retMatrix; 1.458 + retMatrix = malloc( sizeof( Matrix ) ); 1.459 + retMatrix->numRows = numRows; 1.460 + retMatrix->numCols = numCols; 1.461 + 1.462 + return retMatrix; 1.463 + } 1.464 + 1.465 + Matrix * 1.466 +makeMatrix_WithResMat( int32 numRows, int32 numCols ) 1.467 + { Matrix * retMatrix; 1.468 + retMatrix = malloc( sizeof( Matrix ) ); 1.469 + retMatrix->numRows = numRows; 1.470 + retMatrix->numCols = numCols; 1.471 + retMatrix->array = malloc( numRows * numCols * sizeof(float32) ); 1.472 + 1.473 + return retMatrix; 1.474 + } 1.475 + 1.476 + void 1.477 +freeMatrix_Flat( Matrix * matrix ) 1.478 + { //( matrix ); 1.479 + } 1.480 + void 1.481 +freeMatrix( Matrix * matrix ) 1.482 + { free( matrix->array ); 1.483 + free( matrix ); 1.484 + } 1.485 + 1.486 +void 1.487 +printMatrix( Matrix *matrix ) 1.488 + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; 1.489 + float32 *matrixArray; 1.490 + 1.491 + numRows = rowsToPrint = matrix->numRows; 1.492 + numCols = colsToPrint = matrix->numCols; 1.493 + matrixArray = matrix->array; 1.494 + 1.495 + rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed 1.496 + colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed 1.497 + for( r = 0; r < numRows; r += rowIncr ) 1.498 + { for( c = 0; c < numCols; c += colIncr ) 1.499 + { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); 1.500 + } 1.501 + printf("\n"); 1.502 + } 1.503 + } 1.504 +
2.1 --- a/src/Application/VCilk__Matrix_Mult/Divide_Pr.c Wed May 11 15:40:54 2011 +0200 2.2 +++ b/src/Application/VCilk__Matrix_Mult/Divide_Pr.c Wed May 11 15:58:04 2011 +0200 2.3 @@ -1,588 +1,591 @@ 2.4 -/* 2.5 2.6 - * Copyright 2009 OpenSourceStewardshipFoundation.org 2.7 2.8 - * Licensed under GNU General Public License version 2 2.9 2.10 - * 2.11 2.12 - * Author: seanhalle@yahoo.com 2.13 2.14 - * 2.15 2.16 - */ 2.17 2.18 - 2.19 2.20 - 2.21 2.22 -#include "VCilk__Matrix_Mult.h" 2.23 2.24 -#include <math.h> 2.25 2.26 -#include <sys/time.h> 2.27 2.28 -#include <string.h> 2.29 2.30 - 2.31 2.32 - //The time to compute this many result values should equal the time to 2.33 2.34 - // perform this division on a matrix of size gives that many result calcs 2.35 2.36 - //IE, size this so that sequential time to calc equals divide time 2.37 2.38 - // find the value by experimenting -- but divide time and calc time scale 2.39 2.40 - // same way, so this value should remain valid across hardware 2.41 2.42 - //Divide time is about 800us on 2.4Ghz core2Quad laptop core 2.43 2.44 - //num cells is the cube of a side, when have two square matrices 2.45 2.46 -#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 100000 /* about 46x46 */ 2.47 2.48 - 2.49 2.50 - 2.51 2.52 -//=========================================================================== 2.53 2.54 -int inline 2.55 2.56 -measureMatrixMultPrimitive( VirtProcr *animPr ); 2.57 2.58 - 2.59 2.60 -SlicingStrucCarrier * 2.61 2.62 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 2.63 2.64 - VirtProcr *animPr ); 2.65 2.66 - 2.67 2.68 -SlicingStruc * 2.69 2.70 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 2.71 2.72 - VirtProcr *animPr ); 2.73 2.74 - 2.75 2.76 -void 2.77 2.78 -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ); 2.79 2.80 - 2.81 2.82 -SubMatrix ** 2.83 2.84 -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 2.85 2.86 - Matrix *origMatrix, VirtProcr *animPr ); 2.87 2.88 - 2.89 2.90 -void 2.91 2.92 -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 2.93 2.94 - SubMatrix **subMatrices, VirtProcr *animPr ); 2.95 2.96 - 2.97 2.98 -void 2.99 2.100 -pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, 2.101 2.102 - SubMatrix **rightSubMatrices, 2.103 2.104 - int32 numRowIdxs, int32 numColIdxs, 2.105 2.106 - int32 numVecIdxs, 2.107 2.108 - float32 *resultArray, 2.109 2.110 - VirtProcr *animatingPr ); 2.111 2.112 - 2.113 2.114 -void 2.115 2.116 -makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, 2.117 2.118 - SlicingStrucCarrier *slicingStrucCarrier, 2.119 2.120 - float32 *resultArray, VirtProcr *animatingPr ); 2.121 2.122 - 2.123 2.124 -//=========================================================================== 2.125 2.126 - 2.127 2.128 -/*Divider creates one processor for every sub-matrix 2.129 2.130 - * It hands them: 2.131 2.132 - * the name of the result processor that they should send their results to, 2.133 2.134 - * the left and right matrices, and the rows and cols they should multiply 2.135 2.136 - * It first creates the result processor, then all the sub-matrixPair 2.137 2.138 - * processors, 2.139 2.140 - * then does a receive of a message from the result processor that gives 2.141 2.142 - * the divider ownership of the result matrix. 2.143 2.144 - * Finally, the divider returns the result matrix out of the VCilk system. 2.145 2.146 - * 2.147 2.148 - * Divider chooses the size of sub-matrices via an algorithm that tries to 2.149 2.150 - * keep the minimum work above a threshold. The threshold is machine- 2.151 2.152 - * dependent, so ask VCilk for min work-unit time to get a 2.153 2.154 - * given overhead 2.155 2.156 - * 2.157 2.158 - * Divide min work-unit cycles by measured-cycles for one matrix-cell 2.159 2.160 - * product -- gives the number of products need to have in min size 2.161 2.162 - * matrix. 2.163 2.164 - * 2.165 2.166 - * So then, take cubed root of this to get the size of a side of min sub- 2.167 2.168 - * matrix. That is the size of the ideal square sub-matrix -- so tile 2.169 2.170 - * up the two input matrices into ones as close as possible to that size, 2.171 2.172 - * and create the pairs of sub-matrices. 2.173 2.174 - * 2.175 2.176 - *======================== STRATEGIC OVERVIEW ======================= 2.177 2.178 - * 2.179 2.180 - *This division is a bit tricky, because have to create things in advance 2.181 2.182 - * that it's not at first obvious need to be created.. 2.183 2.184 - * 2.185 2.186 - *First slice up each dimension -- three of them.. this is because will have 2.187 2.188 - * to create the sub-matrix's data-structures before pairing the sub-matrices 2.189 2.190 - * with each other -- so, have three dimensions to slice up before can 2.191 2.192 - * create the sub-matrix data-strucs -- also, have to be certain that the 2.193 2.194 - * cols of the left input have the exact same slicing as the rows of the 2.195 2.196 - * left matrix, so just to be sure, do the slicing calc once, then use it 2.197 2.198 - * for both. 2.199 2.200 - * 2.201 2.202 - *So, goes like this: 2.203 2.204 - *1) calculate the start & end values of each dimension in each matrix. 2.205 2.206 - *2) use those values to create sub-matrix structures 2.207 2.208 - *3) combine sub-matrices into pairs, as the tasks to perform. 2.209 2.210 - * 2.211 2.212 - *Have to calculate separately from creating the sub-matrices because of the 2.213 2.214 - * nature of the nesting -- would either end up creating the same sub-matrix 2.215 2.216 - * multiple times, or else would have to put in detection of whether had 2.217 2.218 - * made a particular one already if tried to combine steps 1 and 2. 2.219 2.220 - * 2.221 2.222 - *Step 3 has to be separate because of the nesting, as well -- same reason, 2.223 2.224 - * would either create same sub-matrix multiple times, or else have to 2.225 2.226 - * add detection of whether was already created. 2.227 2.228 - * 2.229 2.230 - *Another way to look at it: there's one level of loop to divide dimensions, 2.231 2.232 - * two levels of nesting to create sub-matrices, and three levels to pair 2.233 2.234 - * up the sub-matrices. 2.235 2.236 - */ 2.237 2.238 - 2.239 2.240 -void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, 2.241 2.242 - VirtProcr *animPr ) 2.243 2.244 - { 2.245 2.246 - DividerParams *dividerParams; 2.247 2.248 - ResultsParams *resultsParams; 2.249 2.250 - Matrix *leftMatrix, *rightMatrix, *resultMatrix; 2.251 2.252 - void *msg; 2.253 2.254 - SlicingStrucCarrier *slicingStrucCarrier; 2.255 2.256 - float32 *resultArray; //points to array to be put inside result 2.257 2.258 - // matrix 2.259 2.260 - 2.261 2.262 - DEBUG( dbgAppFlow, "start divide\n") 2.263 2.264 - 2.265 2.266 - int32 2.267 2.268 - divideProbe = VMS__create_single_interval_probe( "divideProbe", 2.269 2.270 - animPr ); 2.271 2.272 - VMS__record_sched_choice_into_probe( divideProbe, animPr ); 2.273 2.274 - VMS__record_interval_start_in_probe( divideProbe ); 2.275 2.276 - 2.277 2.278 - //=========== Setup -- make local copies of ptd-to-things, malloc, aso 2.279 2.280 - int32 numResRows, numResCols, vectLength; 2.281 2.282 - 2.283 2.284 - dividerParams = (DividerParams *)_dividerParams; 2.285 2.286 - 2.287 2.288 - leftMatrix = dividerParams->leftMatrix; 2.289 2.290 - rightMatrix = dividerParams->rightMatrix; 2.291 2.292 - 2.293 2.294 - vectLength = leftMatrix->numCols; 2.295 2.296 - numResRows = leftMatrix->numRows; 2.297 2.298 - numResCols = rightMatrix->numCols; 2.299 2.300 - resultArray = dividerParams->resultMatrix->array; 2.301 2.302 - 2.303 2.304 - //zero the result array 2.305 2.306 - memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); 2.307 2.308 - 2.309 2.310 - 2.311 2.312 - //============== Do either sequential mult or do division ============== 2.313 2.314 - 2.315 2.316 - //Check if input matrices too small -- if yes, just do sequential 2.317 2.318 - //Cutoff is determined by overhead of this divider -- relatively 2.319 2.320 - // machine-independent 2.321 2.322 - if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * 2.323 2.324 - (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) 2.325 2.326 - { int32 vectLength; 2.327 2.328 - 2.329 2.330 - //====== Do sequential multiply on a single core 2.331 2.332 - DEBUG( dbgAppFlow, "doing sequential") 2.333 2.334 - 2.335 2.336 - //transpose the right matrix 2.337 2.338 - float32 * 2.339 2.340 - transRightArray = VCilk__malloc( rightMatrix->numRows * 2.341 2.342 - rightMatrix->numCols * 2.343 2.344 - sizeof(float32), animPr ); 2.345 2.346 - 2.347 2.348 - //copy values from orig matrix to local 2.349 2.350 - copyTranspose( rightMatrix->numRows, rightMatrix->numCols, 2.351 2.352 - 0, 0, rightMatrix->numRows, 2.353 2.354 - transRightArray, rightMatrix->array ); 2.355 2.356 - 2.357 2.358 - multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 2.359 2.360 - leftMatrix->array, transRightArray, 2.361 2.362 - resultArray ); 2.363 2.364 - } 2.365 2.366 - else 2.367 2.368 - { 2.369 2.370 - //====== Do parallel multiply across cores 2.371 2.372 - 2.373 2.374 - //Calc the ideal size of sub-matrix and slice up the dimensions of 2.375 2.376 - // the two matrices. 2.377 2.378 - //The ideal size is the one takes the number of cycles to calculate 2.379 2.380 - // such that calc time is equal or greater than min work-unit size 2.381 2.382 - slicingStrucCarrier = 2.383 2.384 - calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr); 2.385 2.386 - 2.387 2.388 - 2.389 2.390 - 2.391 2.392 - //Make the sub-matrices, and pair them up, then spawn processors to 2.393 2.394 - // calc product of each pair. 2.395 2.396 - makeSubMatricesAndSpawnAndSync( leftMatrix, rightMatrix, 2.397 2.398 - slicingStrucCarrier, 2.399 2.400 - resultArray, animPr); 2.401 2.402 - //The result array will get filled in by the spawned children 2.403 2.404 - } 2.405 2.406 - 2.407 2.408 - 2.409 2.410 - //=============== Work done -- send results back ================= 2.411 2.412 - 2.413 2.414 - 2.415 2.416 - //results have been saved into an array that was made outside the VMS 2.417 2.418 - // system, by entry-point Fn, and passed in through dividerParams. 2.419 2.420 - //So, nothing to do to send results back -- they're seen by side-effect 2.421 2.422 - 2.423 2.424 - DEBUG( dbgAppFlow, "*** end divide ***\n") 2.425 2.426 - 2.427 2.428 - VMS__record_interval_end_in_probe( divideProbe ); 2.429 2.430 - VMS__print_stats_of_all_probes(); 2.431 2.432 - 2.433 2.434 - VCilk__dissipate_procr( animPr ); //all procrs dissipate self at end 2.435 2.436 - //when all of the processors have dissipated, the "create seed and do 2.437 2.438 - // work" call in the entry point function returns 2.439 2.440 - } 2.441 2.442 - 2.443 2.444 - 2.445 2.446 -SlicingStrucCarrier * 2.447 2.448 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 2.449 2.450 - VirtProcr *animPr ) 2.451 2.452 -{ 2.453 2.454 - float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; 2.455 2.456 - SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 2.457 2.458 - SlicingStrucCarrier *slicingStrucCarrier = 2.459 2.460 - VCilk__malloc(sizeof(SlicingStrucCarrier), animPr ); 2.461 2.462 - 2.463 2.464 - int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; 2.465 2.466 - float64 numPrimitiveOpsInMinWorkUnit; 2.467 2.468 - 2.469 2.470 - 2.471 2.472 - //======= Calc ideal size of min-sized sub-matrix ======== 2.473 2.474 - 2.475 2.476 - //ask VCilk for the number of cycles of the minimum work unit, at given 2.477 2.478 - // percent overhead then add a guess at overhead from this divider 2.479 2.480 - minWorkUnitCycles = VCilk__giveMinWorkUnitCycles( .05 ); 2.481 2.482 - 2.483 2.484 - //ask VCilk for number of cycles of the "primitive" op of matrix mult 2.485 2.486 - primitiveCycles = measureMatrixMultPrimitive( animPr ); 2.487 2.488 - 2.489 2.490 - numPrimitiveOpsInMinWorkUnit = 2.491 2.492 - (float64)minWorkUnitCycles / (float64)primitiveCycles; 2.493 2.494 - 2.495 2.496 - //take cubed root -- that's number of these in a "side" of sub-matrix 2.497 2.498 - // then multiply by 5 because the primitive is 5x5 2.499 2.500 - idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); 2.501 2.502 - 2.503 2.504 - idealNumWorkUnits = VCilk__giveIdealNumWorkUnits(); 2.505 2.506 - 2.507 2.508 - idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); 2.509 2.510 - idealSizeOfSide2 *= 0.8; //finer granularity to help load balance 2.511 2.512 - 2.513 2.514 - if( idealSizeOfSide1 > idealSizeOfSide2 ) 2.515 2.516 - idealSizeOfSide = idealSizeOfSide1; 2.517 2.518 - else 2.519 2.520 - idealSizeOfSide = idealSizeOfSide2; 2.521 2.522 - 2.523 2.524 - //The multiply inner loop blocks the array to fit into L1 cache 2.525 2.526 -// if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK; 2.527 2.528 - 2.529 2.530 - //============ Slice up dimensions, now that know target size =========== 2.531 2.532 - 2.533 2.534 - //Tell the slicer the target size of a side (floating pt), the start 2.535 2.536 - // value to start slicing at, and the end value to stop slicing at 2.537 2.538 - //It returns an array of start value of each chunk, plus number of them 2.539 2.540 - int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol; 2.541 2.542 - startLeftRow = 0; 2.543 2.544 - endLeftRow = leftMatrix->numRows -1; 2.545 2.546 - startVec = 0; 2.547 2.548 - endVec = leftMatrix->numCols -1; 2.549 2.550 - startRightCol = 0; 2.551 2.552 - endRightCol = rightMatrix->numCols -1; 2.553 2.554 - 2.555 2.556 - leftRowSlices = 2.557 2.558 - sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr ); 2.559 2.560 - 2.561 2.562 - vecSlices = 2.563 2.564 - sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr ); 2.565 2.566 - 2.567 2.568 - rightColSlices = 2.569 2.570 - sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr); 2.571 2.572 - 2.573 2.574 - slicingStrucCarrier->leftRowSlices = leftRowSlices; 2.575 2.576 - slicingStrucCarrier->vecSlices = vecSlices; 2.577 2.578 - slicingStrucCarrier->rightColSlices = rightColSlices; 2.579 2.580 - 2.581 2.582 - DEBUG1( dbgAppFlow, "leftRowSlices %d | ", leftRowSlices->numVals ); 2.583 2.584 - DEBUG1( dbgAppFlow, "rightColSlices %d | ",rightColSlices->numVals); 2.585 2.586 - DEBUG1( dbgAppFlow, "vecSlices %d\n", vecSlices->numVals ); 2.587 2.588 - return slicingStrucCarrier; 2.589 2.590 -} 2.591 2.592 - 2.593 2.594 - 2.595 2.596 -void inline 2.597 2.598 -makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, 2.599 2.600 - SlicingStrucCarrier *slicingStrucCarrier, 2.601 2.602 - float32 *resultArray, VirtProcr *animPr ) 2.603 2.604 - { 2.605 2.606 - SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 2.607 2.608 - 2.609 2.610 - leftRowSlices = slicingStrucCarrier->leftRowSlices; 2.611 2.612 - vecSlices = slicingStrucCarrier->vecSlices; 2.613 2.614 - rightColSlices = slicingStrucCarrier->rightColSlices; 2.615 2.616 - VCilk__free( slicingStrucCarrier, animPr ); 2.617 2.618 - 2.619 2.620 - //================ Make sub-matrices, given the slicing ================ 2.621 2.622 - SubMatrix **leftSubMatrices, **rightSubMatrices; 2.623 2.624 - leftSubMatrices = 2.625 2.626 - createSubMatrices( leftRowSlices, vecSlices, 2.627 2.628 - leftMatrix, animPr ); 2.629 2.630 - rightSubMatrices = 2.631 2.632 - createSubMatrices( vecSlices, rightColSlices, 2.633 2.634 - rightMatrix, animPr ); 2.635 2.636 - 2.637 2.638 - freeSlicingStruc( leftRowSlices, animPr ); 2.639 2.640 - freeSlicingStruc( vecSlices, animPr ); 2.641 2.642 - freeSlicingStruc( rightColSlices, animPr ); 2.643 2.644 - 2.645 2.646 - //============== pair the sub-matrices and make processors ============== 2.647 2.648 - int32 numRowIdxs, numColIdxs, numVecIdxs; 2.649 2.650 - 2.651 2.652 - numRowIdxs = leftRowSlices->numVals; 2.653 2.654 - numColIdxs = rightColSlices->numVals; 2.655 2.656 - numVecIdxs = vecSlices->numVals; 2.657 2.658 - pairUpSubMatricesAndSpawnAndSync( leftSubMatrices, rightSubMatrices, 2.659 2.660 - numRowIdxs, numColIdxs, numVecIdxs, 2.661 2.662 - resultArray, 2.663 2.664 - animPr ); 2.665 2.666 - //It syncs inside, so know all work is done now: free the sub-matrices 2.667 2.668 - freeSubMatrices( leftRowSlices, vecSlices, leftSubMatrices, animPr ); 2.669 2.670 - freeSubMatrices( vecSlices, rightColSlices, rightSubMatrices, animPr ); 2.671 2.672 - } 2.673 2.674 - 2.675 2.676 - 2.677 2.678 - 2.679 2.680 - 2.681 2.682 -/* numRows*colsPerRow/numCores = numToPutOntoEachCore; 2.683 2.684 - * put all from a given row onto same core, until exhaust allotment for that 2.685 2.686 - * core 2.687 2.688 - * 2.689 2.690 - */ 2.691 2.692 -void inline 2.693 2.694 -pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, 2.695 2.696 - SubMatrix **rightSubMatrices, 2.697 2.698 - int32 numRowIdxs, int32 numColIdxs, 2.699 2.700 - int32 numVecIdxs, 2.701 2.702 - float32 *resultArray, 2.703 2.704 - VirtProcr *animatingPr ) 2.705 2.706 - { 2.707 2.708 - int32 resRowIdx, resColIdx; 2.709 2.710 - int32 numLeftColIdxs, numRightColIdxs; 2.711 2.712 - int32 leftRowIdxOffset; 2.713 2.714 - VecParams *vecParams; 2.715 2.716 - float32 numToPutOntoEachCore, leftOverFraction; 2.717 2.718 - int32 numCores, currCore, numOnCurrCore; 2.719 2.720 - 2.721 2.722 - numLeftColIdxs = numColIdxs; 2.723 2.724 - numRightColIdxs = numVecIdxs; 2.725 2.726 - 2.727 2.728 - numCores = VCilk__give_number_of_cores_to_spawn_onto(); 2.729 2.730 - 2.731 2.732 - numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; 2.733 2.734 - leftOverFraction = 0; 2.735 2.736 - numOnCurrCore = 0; 2.737 2.738 - currCore = 0; 2.739 2.740 - 2.741 2.742 - for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) 2.743 2.744 - { 2.745 2.746 - leftRowIdxOffset = resRowIdx * numLeftColIdxs; 2.747 2.748 - 2.749 2.750 - for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) 2.751 2.752 - { 2.753 2.754 - vecParams = VCilk__malloc( sizeof(VecParams), animatingPr ); 2.755 2.756 - 2.757 2.758 - vecParams->numVecIdxs = numVecIdxs; 2.759 2.760 - vecParams->numRightColIdxs = numRightColIdxs; 2.761 2.762 - vecParams->leftRowIdxOffset = leftRowIdxOffset; 2.763 2.764 - vecParams->resColIdx = resColIdx; 2.765 2.766 - vecParams->leftSubMatrices = leftSubMatrices; 2.767 2.768 - vecParams->rightSubMatrices = rightSubMatrices; 2.769 2.770 - vecParams->resultArray = resultArray; 2.771 2.772 - vecParams->coreToRunOn = currCore; 2.773 2.774 - 2.775 2.776 - VCilk__spawn( currCore, &calcVectorOfSubMatrices, vecParams, 2.777 2.778 - animatingPr ); 2.779 2.780 - 2.781 2.782 - numOnCurrCore += 1; 2.783 2.784 - if( numOnCurrCore + leftOverFraction >= numToPutOntoEachCore - 1 ) 2.785 2.786 - { 2.787 2.788 - //deal with fractional part, to ensure that imbalance is 1 max 2.789 2.790 - // IE, core with most has only 1 more than core with least 2.791 2.792 - leftOverFraction += numToPutOntoEachCore - numOnCurrCore; 2.793 2.794 - if( leftOverFraction >= 1 ) 2.795 2.796 - { leftOverFraction -= 1; 2.797 2.798 - numOnCurrCore = -1; 2.799 2.800 - } 2.801 2.802 - else 2.803 2.804 - { numOnCurrCore = 0; 2.805 2.806 - } 2.807 2.808 - //Move to next core, max core-value to incr to is numCores -1 2.809 2.810 - if( currCore >= numCores -1 ) 2.811 2.812 - { currCore = 0; 2.813 2.814 - } 2.815 2.816 - else 2.817 2.818 - { currCore += 1; 2.819 2.820 - } 2.821 2.822 - } 2.823 2.824 - } 2.825 2.826 - } 2.827 2.828 - 2.829 2.830 - //Free Note: vector of sub-matrices does its own free-ing, even vec-params 2.831 2.832 - 2.833 2.834 -//TODO: timeToSpawnProbe = VMS__get_probe_by_name( "timeToSpawnProbe" ); 2.835 2.836 -// VMS__end_interval_on_probe( timeToSpawnProbe ); 2.837 2.838 - 2.839 2.840 - VCilk__sync( animatingPr ); 2.841 2.842 - 2.843 2.844 - //free the sub-matrices in Fn that called this one 2.845 2.846 - } 2.847 2.848 - 2.849 2.850 - 2.851 2.852 -/*Walk through the two slice-strucs, making sub-matrix strucs as go 2.853 2.854 - */ 2.855 2.856 -SubMatrix ** 2.857 2.858 -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 2.859 2.860 - Matrix *origMatrix, VirtProcr *animPr ) 2.861 2.862 - { 2.863 2.864 - int32 numRowIdxs, numColIdxs, rowIdx, colIdx; 2.865 2.866 - int32 startRow, endRow, startCol, endCol; 2.867 2.868 - int32 *rowStartVals, *colStartVals; 2.869 2.870 - int32 rowOffset; 2.871 2.872 - SubMatrix **subMatrices, *newSubMatrix; 2.873 2.874 - 2.875 2.876 - numRowIdxs = rowSlices->numVals; 2.877 2.878 - numColIdxs = colSlices->numVals; 2.879 2.880 - 2.881 2.882 - rowStartVals = rowSlices->startVals; 2.883 2.884 - colStartVals = colSlices->startVals; 2.885 2.886 - 2.887 2.888 - subMatrices = VCilk__malloc( numRowIdxs * numColIdxs *sizeof(SubMatrix *), 2.889 2.890 - animPr ); 2.891 2.892 - 2.893 2.894 - for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 2.895 2.896 - { 2.897 2.898 - rowOffset = rowIdx * numColIdxs; 2.899 2.900 - 2.901 2.902 - startRow = rowStartVals[rowIdx]; 2.903 2.904 - endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is 2.905 2.906 - // at last valid idx + 1 & is 2.907 2.908 - // 1 greater than end value 2.909 2.910 - for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 2.911 2.912 - { 2.913 2.914 - startCol = colStartVals[colIdx]; 2.915 2.916 - endCol = colStartVals[colIdx + 1] -1; 2.917 2.918 - 2.919 2.920 - newSubMatrix = VCilk__malloc( sizeof(SubMatrix), animPr ); 2.921 2.922 - newSubMatrix->numRows = endRow - startRow +1; 2.923 2.924 - newSubMatrix->numCols = endCol - startCol +1; 2.925 2.926 - newSubMatrix->origMatrix = origMatrix; 2.927 2.928 - newSubMatrix->origStartRow = startRow; 2.929 2.930 - newSubMatrix->origStartCol = startCol; 2.931 2.932 - newSubMatrix->alreadyCopied = FALSE; 2.933 2.934 - 2.935 2.936 - subMatrices[ rowOffset + colIdx ] = newSubMatrix; 2.937 2.938 - } 2.939 2.940 - } 2.941 2.942 - return subMatrices; 2.943 2.944 - } 2.945 2.946 - 2.947 2.948 -void 2.949 2.950 -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 2.951 2.952 - SubMatrix **subMatrices, VirtProcr *animPr ) 2.953 2.954 - { 2.955 2.956 - int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; 2.957 2.958 - SubMatrix *subMatrix; 2.959 2.960 - 2.961 2.962 - numRowIdxs = rowSlices->numVals; 2.963 2.964 - numColIdxs = colSlices->numVals; 2.965 2.966 - 2.967 2.968 - for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 2.969 2.970 - { 2.971 2.972 - rowOffset = rowIdx * numColIdxs; 2.973 2.974 - for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 2.975 2.976 - { 2.977 2.978 - subMatrix = subMatrices[ rowOffset + colIdx ]; 2.979 2.980 - if( subMatrix->alreadyCopied ) 2.981 2.982 - VCilk__free( subMatrix->array, animPr ); 2.983 2.984 - VCilk__free( subMatrix, animPr ); 2.985 2.986 - } 2.987 2.988 - } 2.989 2.990 - VCilk__free( subMatrices, animPr ); 2.991 2.992 - } 2.993 2.994 - 2.995 2.996 - 2.997 2.998 - 2.999 2.1000 -SlicingStruc * 2.1001 2.1002 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 2.1003 2.1004 - VirtProcr *animPr ) 2.1005 2.1006 - { float32 residualAcc = 0; 2.1007 2.1008 - int numSlices, i, *startVals, sizeOfSlice, endCondition; 2.1009 2.1010 - SlicingStruc *slicingStruc = VCilk__malloc( sizeof(SlicingStruc), animPr); 2.1011 2.1012 - 2.1013 2.1014 - //calc size of matrix need to hold start vals -- 2.1015 2.1016 - numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); 2.1017 2.1018 - 2.1019 2.1020 - startVals = VCilk__malloc( (numSlices + 1) * sizeof(int32), animPr ); 2.1021 2.1022 - 2.1023 2.1024 - //Calc the upper limit of start value -- when get above this, end loop 2.1025 2.1026 - // by saving highest value of the matrix dimension to access, plus 1 2.1027 2.1028 - // as the start point of the imaginary slice following the last one 2.1029 2.1030 - //Plus 1 because go up to value but not include when process last slice 2.1031 2.1032 - //The stopping condition is half-a-size less than highest value because 2.1033 2.1034 - // don't want any pieces smaller than half the ideal size -- just tack 2.1035 2.1036 - // little ones onto end of last one 2.1037 2.1038 - endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size 2.1039 2.1040 - for( i = 0; startVal <= endVal; i++ ) 2.1041 2.1042 - { 2.1043 2.1044 - startVals[i] = startVal; 2.1045 2.1046 - residualAcc += idealSizeOfSide; 2.1047 2.1048 - sizeOfSlice = (int)residualAcc; 2.1049 2.1050 - residualAcc -= (float32)sizeOfSlice; 2.1051 2.1052 - startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. 2.1053 2.1054 - 2.1055 2.1056 - if( startVal > endCondition ) 2.1057 2.1058 - { startVal = endVal + 1; 2.1059 2.1060 - startVals[ i + 1 ] = startVal; 2.1061 2.1062 - } 2.1063 2.1064 - } 2.1065 2.1066 - 2.1067 2.1068 - slicingStruc->startVals = startVals; 2.1069 2.1070 - slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 2.1071 2.1072 - // which means is num sub-matrices in dim 2.1073 2.1074 - // also == idx of the fake start just above 2.1075 2.1076 - return slicingStruc; 2.1077 2.1078 - } 2.1079 2.1080 - 2.1081 2.1082 -void 2.1083 2.1084 -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ) 2.1085 2.1086 - { 2.1087 2.1088 - VCilk__free( slicingStruc->startVals, animPr ); 2.1089 2.1090 - VCilk__free( slicingStruc, animPr ); 2.1091 2.1092 - } 2.1093 2.1094 - 2.1095 2.1096 - 2.1097 2.1098 -int inline 2.1099 2.1100 -measureMatrixMultPrimitive( VirtProcr *animPr ) 2.1101 2.1102 - { 2.1103 2.1104 - int r, c, v, numCycles; 2.1105 2.1106 - float32 *res, *left, *right; 2.1107 2.1108 - 2.1109 2.1110 - //setup inputs 2.1111 2.1112 - left = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); 2.1113 2.1114 - right = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); 2.1115 2.1116 - res = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); 2.1117 2.1118 - 2.1119 2.1120 - for( r = 0; r < 5; r++ ) 2.1121 2.1122 - { 2.1123 2.1124 - for( c = 0; c < 5; c++ ) 2.1125 2.1126 - { 2.1127 2.1128 - left[ r * 5 + c ] = r; 2.1129 2.1130 - right[ r * 5 + c ] = c; 2.1131 2.1132 - } 2.1133 2.1134 - } 2.1135 2.1136 - 2.1137 2.1138 - //do primitive 2.1139 2.1140 - VCilk__start_primitive(); //for now, just takes time stamp 2.1141 2.1142 - for( r = 0; r < 5; r++ ) 2.1143 2.1144 - { 2.1145 2.1146 - for( c = 0; c < 5; c++ ) 2.1147 2.1148 - { 2.1149 2.1150 - for( v = 0; v < 5; v++ ) 2.1151 2.1152 - { 2.1153 2.1154 - res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; 2.1155 2.1156 - } 2.1157 2.1158 - } 2.1159 2.1160 - } 2.1161 2.1162 - numCycles = 2.1163 2.1164 - VCilk__end_primitive_and_give_cycles(); 2.1165 2.1166 - 2.1167 2.1168 - VCilk__free( left, animPr ); 2.1169 2.1170 - VCilk__free( right, animPr ); 2.1171 2.1172 - VCilk__free( res, animPr ); 2.1173 2.1174 - 2.1175 2.1176 - return numCycles; 2.1177 2.1178 - } 2.1179 2.1180 +/* 2.1181 + * Copyright 2009 OpenSourceStewardshipFoundation.org 2.1182 + * Licensed under GNU General Public License version 2 2.1183 + * 2.1184 + * Author: seanhalle@yahoo.com 2.1185 + * 2.1186 + */ 2.1187 + 2.1188 + 2.1189 +#include "VCilk__Matrix_Mult.h" 2.1190 +#include <math.h> 2.1191 +#include <sys/time.h> 2.1192 +#include <string.h> 2.1193 + 2.1194 + //The time to compute this many result values should equal the time to 2.1195 + // perform this division on a matrix of size gives that many result calcs 2.1196 + //IE, size this so that sequential time to calc equals divide time 2.1197 + // find the value by experimenting -- but divide time and calc time scale 2.1198 + // same way, so this value should remain valid across hardware 2.1199 + //Divide time is about 800us on 2.4Ghz core2Quad laptop core 2.1200 + //num cells is the cube of a side, when have two square matrices 2.1201 +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 100000 /* about 46x46 */ 2.1202 + 2.1203 + 2.1204 +//=========================================================================== 2.1205 +int inline 2.1206 +measureMatrixMultPrimitive( VirtProcr *animPr ); 2.1207 + 2.1208 +SlicingStrucCarrier * 2.1209 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 2.1210 + VirtProcr *animPr ); 2.1211 + 2.1212 +SlicingStruc * 2.1213 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 2.1214 + VirtProcr *animPr ); 2.1215 + 2.1216 +void 2.1217 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ); 2.1218 + 2.1219 +SubMatrix ** 2.1220 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 2.1221 + Matrix *origMatrix, VirtProcr *animPr ); 2.1222 + 2.1223 +void 2.1224 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 2.1225 + SubMatrix **subMatrices, VirtProcr *animPr ); 2.1226 + 2.1227 +void 2.1228 +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, 2.1229 + SubMatrix **rightSubMatrices, 2.1230 + int32 numRowIdxs, int32 numColIdxs, 2.1231 + int32 numVecIdxs, 2.1232 + float32 *resultArray, 2.1233 + VirtProcr *animatingPr ); 2.1234 + 2.1235 +void 2.1236 +makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, 2.1237 + SlicingStrucCarrier *slicingStrucCarrier, 2.1238 + float32 *resultArray, VirtProcr *animatingPr ); 2.1239 + 2.1240 +//=========================================================================== 2.1241 + 2.1242 +/*Divider creates one processor for every sub-matrix 2.1243 + * It hands them: 2.1244 + * the name of the result processor that they should send their results to, 2.1245 + * the left and right matrices, and the rows and cols they should multiply 2.1246 + * It first creates the result processor, then all the sub-matrixPair 2.1247 + * processors, 2.1248 + * then does a receive of a message from the result processor that gives 2.1249 + * the divider ownership of the result matrix. 2.1250 + * Finally, the divider returns the result matrix out of the VCilk system. 2.1251 + * 2.1252 + * Divider chooses the size of sub-matrices via an algorithm that tries to 2.1253 + * keep the minimum work above a threshold. The threshold is machine- 2.1254 + * dependent, so ask VCilk for min work-unit time to get a 2.1255 + * given overhead 2.1256 + * 2.1257 + * Divide min work-unit cycles by measured-cycles for one matrix-cell 2.1258 + * product -- gives the number of products need to have in min size 2.1259 + * matrix. 2.1260 + * 2.1261 + * So then, take cubed root of this to get the size of a side of min sub- 2.1262 + * matrix. That is the size of the ideal square sub-matrix -- so tile 2.1263 + * up the two input matrices into ones as close as possible to that size, 2.1264 + * and create the pairs of sub-matrices. 2.1265 + * 2.1266 + *======================== STRATEGIC OVERVIEW ======================= 2.1267 + * 2.1268 + *This division is a bit tricky, because have to create things in advance 2.1269 + * that it's not at first obvious need to be created.. 2.1270 + * 2.1271 + *First slice up each dimension -- three of them.. this is because will have 2.1272 + * to create the sub-matrix's data-structures before pairing the sub-matrices 2.1273 + * with each other -- so, have three dimensions to slice up before can 2.1274 + * create the sub-matrix data-strucs -- also, have to be certain that the 2.1275 + * cols of the left input have the exact same slicing as the rows of the 2.1276 + * left matrix, so just to be sure, do the slicing calc once, then use it 2.1277 + * for both. 2.1278 + * 2.1279 + *So, goes like this: 2.1280 + *1) calculate the start & end values of each dimension in each matrix. 2.1281 + *2) use those values to create sub-matrix structures 2.1282 + *3) combine sub-matrices into pairs, as the tasks to perform. 2.1283 + * 2.1284 + *Have to calculate separately from creating the sub-matrices because of the 2.1285 + * nature of the nesting -- would either end up creating the same sub-matrix 2.1286 + * multiple times, or else would have to put in detection of whether had 2.1287 + * made a particular one already if tried to combine steps 1 and 2. 2.1288 + * 2.1289 + *Step 3 has to be separate because of the nesting, as well -- same reason, 2.1290 + * would either create same sub-matrix multiple times, or else have to 2.1291 + * add detection of whether was already created. 2.1292 + * 2.1293 + *Another way to look at it: there's one level of loop to divide dimensions, 2.1294 + * two levels of nesting to create sub-matrices, and three levels to pair 2.1295 + * up the sub-matrices. 2.1296 + */ 2.1297 + 2.1298 +void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, 2.1299 + VirtProcr *animPr ) 2.1300 + { 2.1301 + DividerParams *dividerParams; 2.1302 + ResultsParams *resultsParams; 2.1303 + Matrix *leftMatrix, *rightMatrix, *resultMatrix; 2.1304 + void *msg; 2.1305 + SlicingStrucCarrier *slicingStrucCarrier; 2.1306 + float32 *resultArray; //points to array to be put inside result 2.1307 + // matrix 2.1308 + 2.1309 + DEBUG( dbgAppFlow, "start divide\n") 2.1310 + 2.1311 + int32 2.1312 + divideProbe = VMS__create_single_interval_probe( "divideProbe", 2.1313 + animPr ); 2.1314 + VMS__record_sched_choice_into_probe( divideProbe, animPr ); 2.1315 + VMS__record_interval_start_in_probe( divideProbe ); 2.1316 + 2.1317 + //=========== Setup -- make local copies of ptd-to-things, malloc, aso 2.1318 + int32 numResRows, numResCols, vectLength; 2.1319 + 2.1320 + dividerParams = (DividerParams *)_dividerParams; 2.1321 + 2.1322 + leftMatrix = dividerParams->leftMatrix; 2.1323 + rightMatrix = dividerParams->rightMatrix; 2.1324 + 2.1325 + vectLength = leftMatrix->numCols; 2.1326 + numResRows = leftMatrix->numRows; 2.1327 + numResCols = rightMatrix->numCols; 2.1328 + resultArray = dividerParams->resultMatrix->array; 2.1329 + 2.1330 + //zero the result array 2.1331 + memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); 2.1332 + 2.1333 + 2.1334 + //============== Do either sequential mult or do division ============== 2.1335 + 2.1336 + //Check if input matrices too small -- if yes, just do sequential 2.1337 + //Cutoff is determined by overhead of this divider -- relatively 2.1338 + // machine-independent 2.1339 + if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * 2.1340 + (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) 2.1341 + { int32 vectLength; 2.1342 + 2.1343 + //====== Do sequential multiply on a single core 2.1344 + DEBUG( dbgAppFlow, "doing sequential") 2.1345 + 2.1346 + //transpose the right matrix 2.1347 + float32 * 2.1348 + transRightArray = VCilk__malloc( rightMatrix->numRows * 2.1349 + rightMatrix->numCols * 2.1350 + sizeof(float32), animPr ); 2.1351 + 2.1352 + //copy values from orig matrix to local 2.1353 + copyTranspose( rightMatrix->numRows, rightMatrix->numCols, 2.1354 + 0, 0, rightMatrix->numRows, 2.1355 + transRightArray, rightMatrix->array ); 2.1356 + 2.1357 + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 2.1358 + leftMatrix->array, transRightArray, 2.1359 + resultArray ); 2.1360 + } 2.1361 + else 2.1362 + { 2.1363 + //====== Do parallel multiply across cores 2.1364 + 2.1365 + //Calc the ideal size of sub-matrix and slice up the dimensions of 2.1366 + // the two matrices. 2.1367 + //The ideal size is the one takes the number of cycles to calculate 2.1368 + // such that calc time is equal or greater than min work-unit size 2.1369 + slicingStrucCarrier = 2.1370 + calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr); 2.1371 + 2.1372 + 2.1373 + 2.1374 + //Make the sub-matrices, and pair them up, then spawn processors to 2.1375 + // calc product of each pair. 2.1376 + makeSubMatricesAndSpawnAndSync( leftMatrix, rightMatrix, 2.1377 + slicingStrucCarrier, 2.1378 + resultArray, animPr); 2.1379 + //The result array will get filled in by the spawned children 2.1380 + } 2.1381 + 2.1382 + 2.1383 + //=============== Work done -- send results back ================= 2.1384 + 2.1385 + 2.1386 + //results have been saved into an array that was made outside the VMS 2.1387 + // system, by entry-point Fn, and passed in through dividerParams. 2.1388 + //So, nothing to do to send results back -- they're seen by side-effect 2.1389 + 2.1390 + DEBUG( dbgAppFlow, "*** end divide ***\n") 2.1391 + 2.1392 + VMS__record_interval_end_in_probe( divideProbe ); 2.1393 + VMS__print_stats_of_all_probes(); 2.1394 + 2.1395 + VCilk__dissipate_procr( animPr ); //all procrs dissipate self at end 2.1396 + //when all of the processors have dissipated, the "create seed and do 2.1397 + // work" call in the entry point function returns 2.1398 + } 2.1399 + 2.1400 + 2.1401 +SlicingStrucCarrier * 2.1402 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 2.1403 + VirtProcr *animPr ) 2.1404 +{ 2.1405 + float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; 2.1406 + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 2.1407 + SlicingStrucCarrier *slicingStrucCarrier = 2.1408 + VCilk__malloc(sizeof(SlicingStrucCarrier), animPr ); 2.1409 + 2.1410 + int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; 2.1411 + float64 numPrimitiveOpsInMinWorkUnit; 2.1412 + 2.1413 + 2.1414 + //======= Calc ideal size of min-sized sub-matrix ======== 2.1415 + 2.1416 + //ask VCilk for the number of cycles of the minimum work unit, at given 2.1417 + // percent overhead then add a guess at overhead from this divider 2.1418 + minWorkUnitCycles = VCilk__giveMinWorkUnitCycles( .05 ); 2.1419 + 2.1420 + //ask VCilk for number of cycles of the "primitive" op of matrix mult 2.1421 + primitiveCycles = measureMatrixMultPrimitive( animPr ); 2.1422 + 2.1423 + numPrimitiveOpsInMinWorkUnit = 2.1424 + (float64)minWorkUnitCycles / (float64)primitiveCycles; 2.1425 + 2.1426 + //take cubed root -- that's number of these in a "side" of sub-matrix 2.1427 + // then multiply by 5 because the primitive is 5x5 2.1428 + idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); 2.1429 + 2.1430 + idealNumWorkUnits = VCilk__giveIdealNumWorkUnits(); 2.1431 + 2.1432 + idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); 2.1433 + idealSizeOfSide2 *= 0.8; //finer granularity to help load balance 2.1434 + 2.1435 + if( idealSizeOfSide1 > idealSizeOfSide2 ) 2.1436 + idealSizeOfSide = idealSizeOfSide1; 2.1437 + else 2.1438 + idealSizeOfSide = idealSizeOfSide2; 2.1439 + 2.1440 + //The multiply inner loop blocks the array to fit into L1 cache 2.1441 +// if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK; 2.1442 + 2.1443 + //============ Slice up dimensions, now that know target size =========== 2.1444 + 2.1445 + //Tell the slicer the target size of a side (floating pt), the start 2.1446 + // value to start slicing at, and the end value to stop slicing at 2.1447 + //It returns an array of start value of each chunk, plus number of them 2.1448 + int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol; 2.1449 + startLeftRow = 0; 2.1450 + endLeftRow = leftMatrix->numRows -1; 2.1451 + startVec = 0; 2.1452 + endVec = leftMatrix->numCols -1; 2.1453 + startRightCol = 0; 2.1454 + endRightCol = rightMatrix->numCols -1; 2.1455 + 2.1456 + leftRowSlices = 2.1457 + sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr ); 2.1458 + 2.1459 + vecSlices = 2.1460 + sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr ); 2.1461 + 2.1462 + rightColSlices = 2.1463 + sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr); 2.1464 + 2.1465 + slicingStrucCarrier->leftRowSlices = leftRowSlices; 2.1466 + slicingStrucCarrier->vecSlices = vecSlices; 2.1467 + slicingStrucCarrier->rightColSlices = rightColSlices; 2.1468 + 2.1469 + DEBUG1( dbgAppFlow, "leftRowSlices %d | ", leftRowSlices->numVals ); 2.1470 + DEBUG1( dbgAppFlow, "rightColSlices %d | ",rightColSlices->numVals); 2.1471 + DEBUG1( dbgAppFlow, "vecSlices %d\n", vecSlices->numVals ); 2.1472 + return slicingStrucCarrier; 2.1473 +} 2.1474 + 2.1475 + 2.1476 +void inline 2.1477 +makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, 2.1478 + SlicingStrucCarrier *slicingStrucCarrier, 2.1479 + float32 *resultArray, VirtProcr *animPr ) 2.1480 + { 2.1481 + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 2.1482 + 2.1483 + leftRowSlices = slicingStrucCarrier->leftRowSlices; 2.1484 + vecSlices = slicingStrucCarrier->vecSlices; 2.1485 + rightColSlices = slicingStrucCarrier->rightColSlices; 2.1486 + VCilk__free( slicingStrucCarrier, animPr ); 2.1487 + 2.1488 + //================ Make sub-matrices, given the slicing ================ 2.1489 + SubMatrix **leftSubMatrices, **rightSubMatrices; 2.1490 + leftSubMatrices = 2.1491 + createSubMatrices( leftRowSlices, vecSlices, 2.1492 + leftMatrix, animPr ); 2.1493 + rightSubMatrices = 2.1494 + createSubMatrices( vecSlices, rightColSlices, 2.1495 + rightMatrix, animPr ); 2.1496 + 2.1497 + freeSlicingStruc( leftRowSlices, animPr ); 2.1498 + freeSlicingStruc( vecSlices, animPr ); 2.1499 + freeSlicingStruc( rightColSlices, animPr ); 2.1500 + 2.1501 + //============== pair the sub-matrices and make processors ============== 2.1502 + int32 numRowIdxs, numColIdxs, numVecIdxs; 2.1503 + 2.1504 + numRowIdxs = leftRowSlices->numVals; 2.1505 + numColIdxs = rightColSlices->numVals; 2.1506 + numVecIdxs = vecSlices->numVals; 2.1507 + pairUpSubMatricesAndSpawnAndSync( leftSubMatrices, rightSubMatrices, 2.1508 + numRowIdxs, numColIdxs, numVecIdxs, 2.1509 + resultArray, 2.1510 + animPr ); 2.1511 + //It syncs inside, so know all work is done now: free the sub-matrices 2.1512 + freeSubMatrices( leftRowSlices, vecSlices, leftSubMatrices, animPr ); 2.1513 + freeSubMatrices( vecSlices, rightColSlices, rightSubMatrices, animPr ); 2.1514 + } 2.1515 + 2.1516 + 2.1517 + 2.1518 + 2.1519 +/* numRows*colsPerRow/numCores = numToPutOntoEachCore; 2.1520 + * put all from a given row onto same core, until exhaust allotment for that 2.1521 + * core 2.1522 + * 2.1523 + */ 2.1524 +void inline 2.1525 +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, 2.1526 + SubMatrix **rightSubMatrices, 2.1527 + int32 numRowIdxs, int32 numColIdxs, 2.1528 + int32 numVecIdxs, 2.1529 + float32 *resultArray, 2.1530 + VirtProcr *animatingPr ) 2.1531 + { 2.1532 + int32 resRowIdx, resColIdx; 2.1533 + int32 numLeftColIdxs, numRightColIdxs; 2.1534 + int32 leftRowIdxOffset; 2.1535 + VecParams *vecParams; 2.1536 + float32 numToPutOntoEachCore, leftOverFraction; 2.1537 + int32 numCores, currCore, numOnCurrCore; 2.1538 + 2.1539 + numLeftColIdxs = numColIdxs; 2.1540 + numRightColIdxs = numVecIdxs; 2.1541 + 2.1542 + numCores = VCilk__give_number_of_cores_to_spawn_onto(); 2.1543 + 2.1544 + numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; 2.1545 + leftOverFraction = 0; 2.1546 + numOnCurrCore = 0; 2.1547 + currCore = 0; 2.1548 + 2.1549 + for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) 2.1550 + { 2.1551 + leftRowIdxOffset = resRowIdx * numLeftColIdxs; 2.1552 + 2.1553 + for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) 2.1554 + { 2.1555 + vecParams = VCilk__malloc( sizeof(VecParams), animatingPr ); 2.1556 + 2.1557 + vecParams->numVecIdxs = numVecIdxs; 2.1558 + vecParams->numRightColIdxs = numRightColIdxs; 2.1559 + vecParams->leftRowIdxOffset = leftRowIdxOffset; 2.1560 + vecParams->resColIdx = resColIdx; 2.1561 + vecParams->leftSubMatrices = leftSubMatrices; 2.1562 + vecParams->rightSubMatrices = rightSubMatrices; 2.1563 + vecParams->resultArray = resultArray; 2.1564 + vecParams->coreToRunOn = currCore; 2.1565 + 2.1566 + VCilk__spawn( currCore, &calcVectorOfSubMatrices, vecParams, 2.1567 + animatingPr ); 2.1568 + 2.1569 + numOnCurrCore += 1; 2.1570 + if( numOnCurrCore + leftOverFraction >= numToPutOntoEachCore - 1 ) 2.1571 + { 2.1572 + //deal with fractional part, to ensure that imbalance is 1 max 2.1573 + // IE, core with most has only 1 more than core with least 2.1574 + leftOverFraction += numToPutOntoEachCore - numOnCurrCore; 2.1575 + if( leftOverFraction >= 1 ) 2.1576 + { leftOverFraction -= 1; 2.1577 + numOnCurrCore = -1; 2.1578 + } 2.1579 + else 2.1580 + { numOnCurrCore = 0; 2.1581 + } 2.1582 + //Move to next core, max core-value to incr to is numCores -1 2.1583 + if( currCore >= numCores -1 ) 2.1584 + { currCore = 0; 2.1585 + } 2.1586 + else 2.1587 + { currCore += 1; 2.1588 + } 2.1589 + } 2.1590 + } 2.1591 + } 2.1592 + 2.1593 + //Free Note: vector of sub-matrices does its own free-ing, even vec-params 2.1594 + 2.1595 +//TODO: timeToSpawnProbe = VMS__get_probe_by_name( "timeToSpawnProbe" ); 2.1596 +// VMS__end_interval_on_probe( timeToSpawnProbe ); 2.1597 + 2.1598 + VCilk__sync( animatingPr ); 2.1599 + 2.1600 + //free the sub-matrices in Fn that called this one 2.1601 + } 2.1602 + 2.1603 + 2.1604 +/*Walk through the two slice-strucs, making sub-matrix strucs as go 2.1605 + */ 2.1606 +SubMatrix ** 2.1607 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 2.1608 + Matrix *origMatrix, VirtProcr *animPr ) 2.1609 + { 2.1610 + int32 numRowIdxs, numColIdxs, rowIdx, colIdx; 2.1611 + int32 startRow, endRow, startCol, endCol; 2.1612 + int32 *rowStartVals, *colStartVals; 2.1613 + int32 rowOffset; 2.1614 + SubMatrix **subMatrices, *newSubMatrix; 2.1615 + 2.1616 + numRowIdxs = rowSlices->numVals; 2.1617 + numColIdxs = colSlices->numVals; 2.1618 + 2.1619 + rowStartVals = rowSlices->startVals; 2.1620 + colStartVals = colSlices->startVals; 2.1621 + 2.1622 + subMatrices = VCilk__malloc( numRowIdxs * numColIdxs *sizeof(SubMatrix *), 2.1623 + animPr ); 2.1624 + 2.1625 + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 2.1626 + { 2.1627 + rowOffset = rowIdx * numColIdxs; 2.1628 + 2.1629 + startRow = rowStartVals[rowIdx]; 2.1630 + endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is 2.1631 + // at last valid idx + 1 & is 2.1632 + // 1 greater than end value 2.1633 + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 2.1634 + { 2.1635 + startCol = colStartVals[colIdx]; 2.1636 + endCol = colStartVals[colIdx + 1] -1; 2.1637 + 2.1638 + newSubMatrix = VCilk__malloc( sizeof(SubMatrix), animPr ); 2.1639 + newSubMatrix->numRows = endRow - startRow +1; 2.1640 + newSubMatrix->numCols = endCol - startCol +1; 2.1641 + newSubMatrix->origMatrix = origMatrix; 2.1642 + newSubMatrix->origStartRow = startRow; 2.1643 + newSubMatrix->origStartCol = startCol; 2.1644 + newSubMatrix->alreadyCopied = FALSE; 2.1645 + //Prevent uninitialized memory 2.1646 + newSubMatrix->copySingleton = NULL; 2.1647 + newSubMatrix->copyTransSingleton = NULL; 2.1648 + 2.1649 + subMatrices[ rowOffset + colIdx ] = newSubMatrix; 2.1650 + } 2.1651 + } 2.1652 + return subMatrices; 2.1653 + } 2.1654 + 2.1655 +void 2.1656 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 2.1657 + SubMatrix **subMatrices, VirtProcr *animPr ) 2.1658 + { 2.1659 + int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; 2.1660 + SubMatrix *subMatrix; 2.1661 + 2.1662 + numRowIdxs = rowSlices->numVals; 2.1663 + numColIdxs = colSlices->numVals; 2.1664 + 2.1665 + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 2.1666 + { 2.1667 + rowOffset = rowIdx * numColIdxs; 2.1668 + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 2.1669 + { 2.1670 + subMatrix = subMatrices[ rowOffset + colIdx ]; 2.1671 + if( subMatrix->alreadyCopied ) 2.1672 + VCilk__free( subMatrix->array, animPr ); 2.1673 + VCilk__free( subMatrix, animPr ); 2.1674 + } 2.1675 + } 2.1676 + VCilk__free( subMatrices, animPr ); 2.1677 + } 2.1678 + 2.1679 + 2.1680 + 2.1681 +SlicingStruc * 2.1682 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 2.1683 + VirtProcr *animPr ) 2.1684 + { float32 residualAcc = 0; 2.1685 + int numSlices, i, *startVals, sizeOfSlice, endCondition; 2.1686 + SlicingStruc *slicingStruc = VCilk__malloc( sizeof(SlicingStruc), animPr); 2.1687 + 2.1688 + //calc size of matrix need to hold start vals -- 2.1689 + numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); 2.1690 + 2.1691 + startVals = VCilk__malloc( (numSlices + 1) * sizeof(int32), animPr ); 2.1692 + 2.1693 + //Calc the upper limit of start value -- when get above this, end loop 2.1694 + // by saving highest value of the matrix dimension to access, plus 1 2.1695 + // as the start point of the imaginary slice following the last one 2.1696 + //Plus 1 because go up to value but not include when process last slice 2.1697 + //The stopping condition is half-a-size less than highest value because 2.1698 + // don't want any pieces smaller than half the ideal size -- just tack 2.1699 + // little ones onto end of last one 2.1700 + endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size 2.1701 + for( i = 0; startVal <= endVal; i++ ) 2.1702 + { 2.1703 + startVals[i] = startVal; 2.1704 + residualAcc += idealSizeOfSide; 2.1705 + sizeOfSlice = (int)residualAcc; 2.1706 + residualAcc -= (float32)sizeOfSlice; 2.1707 + startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. 2.1708 + 2.1709 + if( startVal > endCondition ) 2.1710 + { startVal = endVal + 1; 2.1711 + startVals[ i + 1 ] = startVal; 2.1712 + } 2.1713 + } 2.1714 + 2.1715 + slicingStruc->startVals = startVals; 2.1716 + slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 2.1717 + // which means is num sub-matrices in dim 2.1718 + // also == idx of the fake start just above 2.1719 + return slicingStruc; 2.1720 + } 2.1721 + 2.1722 +void 2.1723 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ) 2.1724 + { 2.1725 + VCilk__free( slicingStruc->startVals, animPr ); 2.1726 + VCilk__free( slicingStruc, animPr ); 2.1727 + } 2.1728 + 2.1729 + 2.1730 +int inline 2.1731 +measureMatrixMultPrimitive( VirtProcr *animPr ) 2.1732 + { 2.1733 + int r, c, v, numCycles; 2.1734 + float32 *res, *left, *right; 2.1735 + 2.1736 + //setup inputs 2.1737 + left = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); 2.1738 + right = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); 2.1739 + res = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); 2.1740 + 2.1741 + for( r = 0; r < 5; r++ ) 2.1742 + { 2.1743 + for( c = 0; c < 5; c++ ) 2.1744 + { 2.1745 + left[ r * 5 + c ] = r; 2.1746 + right[ r * 5 + c ] = c; 2.1747 + } 2.1748 + } 2.1749 + 2.1750 + //do primitive 2.1751 + VCilk__start_primitive(); //for now, just takes time stamp 2.1752 + for( r = 0; r < 5; r++ ) 2.1753 + { 2.1754 + for( c = 0; c < 5; c++ ) 2.1755 + { 2.1756 + for( v = 0; v < 5; v++ ) 2.1757 + { 2.1758 + res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; 2.1759 + } 2.1760 + } 2.1761 + } 2.1762 + numCycles = 2.1763 + VCilk__end_primitive_and_give_cycles(); 2.1764 + 2.1765 + VCilk__free( left, animPr ); 2.1766 + VCilk__free( right, animPr ); 2.1767 + VCilk__free( res, animPr ); 2.1768 + 2.1769 + return numCycles; 2.1770 + }
3.1 --- a/src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h Wed May 11 15:40:54 2011 +0200 3.2 +++ b/src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h Wed May 11 15:58:04 2011 +0200 3.3 @@ -1,106 +1,106 @@ 3.4 -/* 3.5 3.6 - * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 3.7 3.8 - * Licensed under GNU General Public License version 2 3.9 3.10 - */ 3.11 3.12 - 3.13 3.14 -#ifndef _VCilk_MATRIX_MULT_H_ 3.15 3.16 -#define _VCilk_MATRIX_MULT_H_ 3.17 3.18 - 3.19 3.20 -#include <stdio.h> 3.21 3.22 - 3.23 3.24 -#include "../../VCilk_lib/VCilk.h" 3.25 3.26 -#include "../Matrix_Mult.h" 3.27 3.28 -#include "../../VCilk_lib/VMS/VMS.h" 3.29 3.30 - 3.31 3.32 - 3.33 3.34 -//=============================== Defines ============================== 3.35 3.36 -#define ROWS_IN_BLOCK 32 3.37 3.38 -#define COLS_IN_BLOCK 32 3.39 3.40 -#define VEC_IN_BLOCK 32 3.41 3.42 - 3.43 3.44 -#define copyMatrixSingleton 1 3.45 3.46 -#define copyTransposeSingleton 2 3.47 3.48 - 3.49 3.50 -//============================== Structures ============================== 3.51 3.52 -typedef struct 3.53 3.54 - { 3.55 3.56 - Matrix *leftMatrix; 3.57 3.58 - Matrix *rightMatrix; 3.59 3.60 - Matrix *resultMatrix; 3.61 3.62 - 3.63 3.64 - TSCount numTSCsToExe; 3.65 3.66 - } 3.67 3.68 -DividerParams; 3.69 3.70 - 3.71 3.72 -typedef struct 3.73 3.74 - { 3.75 3.76 - VirtProcr *dividerPr; 3.77 3.78 - int numRows; 3.79 3.80 - int numCols; 3.81 3.82 - int numSubMatrixPairs; 3.83 3.84 - } 3.85 3.86 -ResultsParams; 3.87 3.88 - 3.89 3.90 -typedef 3.91 3.92 -struct 3.93 3.94 - { int32 numRows; 3.95 3.96 - int32 numCols; 3.97 3.98 - Matrix *origMatrix; 3.99 3.100 - int32 origStartRow; 3.101 3.102 - int32 origStartCol; 3.103 3.104 - int32 alreadyCopied; 3.105 3.106 - VCilkSingleton *copySingleton; 3.107 3.108 - VCilkSingleton *copyTransSingleton; 3.109 3.110 - float32 *array; //2D, but dynamically sized, so use addr arith 3.111 3.112 - } 3.113 3.114 -SubMatrix; 3.115 3.116 - 3.117 3.118 -typedef struct 3.119 3.120 - { VirtProcr *resultPr; 3.121 3.122 - SubMatrix *leftSubMatrix; 3.123 3.124 - SubMatrix *rightSubMatrix; 3.125 3.126 - float32 *partialResultArray; 3.127 3.128 - } 3.129 3.130 -SMPairParams; 3.131 3.132 - 3.133 3.134 -typedef 3.135 3.136 -struct 3.137 3.138 - { int32 numVals; 3.139 3.140 - int32 *startVals; 3.141 3.142 - } 3.143 3.144 -SlicingStruc; 3.145 3.146 - 3.147 3.148 -typedef 3.149 3.150 -struct 3.151 3.152 - { 3.153 3.154 - SlicingStruc *leftRowSlices; 3.155 3.156 - SlicingStruc *vecSlices; 3.157 3.158 - SlicingStruc *rightColSlices; 3.159 3.160 - } 3.161 3.162 -SlicingStrucCarrier; 3.163 3.164 - 3.165 3.166 -typedef struct 3.167 3.168 - { 3.169 3.170 - int32 numVecIdxs; 3.171 3.172 - int32 numRightColIdxs; 3.173 3.174 - int32 leftRowIdxOffset; 3.175 3.176 - int32 resColIdx; 3.177 3.178 - SubMatrix **leftSubMatrices; 3.179 3.180 - SubMatrix **rightSubMatrices; 3.181 3.182 - float32 *resultArray; 3.183 3.184 - int32 coreToRunOn; 3.185 3.186 - } 3.187 3.188 -VecParams; 3.189 3.190 - 3.191 3.192 -//============================= Processor Functions ========================= 3.193 3.194 -void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr ); 3.195 3.196 -void calcSubMatrixProduct( void *data, VirtProcr *animatingPr ); 3.197 3.198 -void calcVectorOfSubMatrices( void *_vecParams, VirtProcr *animatingPr ); 3.199 3.200 - 3.201 3.202 - 3.203 3.204 -//================================ Entry Point ============================== 3.205 3.206 -Matrix * 3.207 3.208 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); 3.209 3.210 - 3.211 3.212 - 3.213 3.214 -#endif /*_VCilk_MATRIX_MULT_H_*/ 3.215 3.216 +/* 3.217 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 3.218 + * Licensed under GNU General Public License version 2 3.219 + */ 3.220 + 3.221 +#ifndef _VCilk_MATRIX_MULT_H_ 3.222 +#define _VCilk_MATRIX_MULT_H_ 3.223 + 3.224 +#include <stdio.h> 3.225 + 3.226 +#include "../../VCilk_lib/VCilk.h" 3.227 +#include "../Matrix_Mult.h" 3.228 +#include "../../VCilk_lib/VMS/VMS.h" 3.229 + 3.230 + 3.231 +//=============================== Defines ============================== 3.232 +#define ROWS_IN_BLOCK 32 3.233 +#define COLS_IN_BLOCK 32 3.234 +#define VEC_IN_BLOCK 32 3.235 + 3.236 +#define copyMatrixSingleton 1 3.237 +#define copyTransposeSingleton 2 3.238 + 3.239 +//============================== Structures ============================== 3.240 +typedef struct 3.241 + { 3.242 + Matrix *leftMatrix; 3.243 + Matrix *rightMatrix; 3.244 + Matrix *resultMatrix; 3.245 + 3.246 + TSCount numTSCsToExe; 3.247 + } 3.248 +DividerParams; 3.249 + 3.250 +typedef struct 3.251 + { 3.252 + VirtProcr *dividerPr; 3.253 + int numRows; 3.254 + int numCols; 3.255 + int numSubMatrixPairs; 3.256 + } 3.257 +ResultsParams; 3.258 + 3.259 +typedef 3.260 +struct 3.261 + { int32 numRows; 3.262 + int32 numCols; 3.263 + Matrix *origMatrix; 3.264 + int32 origStartRow; 3.265 + int32 origStartCol; 3.266 + int32 alreadyCopied; 3.267 + VCilkSingleton *copySingleton; 3.268 + VCilkSingleton *copyTransSingleton; 3.269 + float32 *array; //2D, but dynamically sized, so use addr arith 3.270 + } 3.271 +SubMatrix; 3.272 + 3.273 +typedef struct 3.274 + { VirtProcr *resultPr; 3.275 + SubMatrix *leftSubMatrix; 3.276 + SubMatrix *rightSubMatrix; 3.277 + float32 *partialResultArray; 3.278 + } 3.279 +SMPairParams; 3.280 + 3.281 +typedef 3.282 +struct 3.283 + { int32 numVals; 3.284 + int32 *startVals; 3.285 + } 3.286 +SlicingStruc; 3.287 + 3.288 +typedef 3.289 +struct 3.290 + { 3.291 + SlicingStruc *leftRowSlices; 3.292 + SlicingStruc *vecSlices; 3.293 + SlicingStruc *rightColSlices; 3.294 + } 3.295 +SlicingStrucCarrier; 3.296 + 3.297 +typedef struct 3.298 + { 3.299 + int32 numVecIdxs; 3.300 + int32 numRightColIdxs; 3.301 + int32 leftRowIdxOffset; 3.302 + int32 resColIdx; 3.303 + SubMatrix **leftSubMatrices; 3.304 + SubMatrix **rightSubMatrices; 3.305 + float32 *resultArray; 3.306 + int32 coreToRunOn; 3.307 + } 3.308 +VecParams; 3.309 + 3.310 +//============================= Processor Functions ========================= 3.311 +void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr ); 3.312 +void calcSubMatrixProduct( void *data, VirtProcr *animatingPr ); 3.313 +void calcVectorOfSubMatrices( void *_vecParams, VirtProcr *animatingPr ); 3.314 + 3.315 + 3.316 +//================================ Entry Point ============================== 3.317 +Matrix * 3.318 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); 3.319 + 3.320 + 3.321 +#endif /*_VCilk_MATRIX_MULT_H_*/
4.1 --- a/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c Wed May 11 15:40:54 2011 +0200 4.2 +++ b/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c Wed May 11 15:58:04 2011 +0200 4.3 @@ -1,308 +1,308 @@ 4.4 -/* 4.5 4.6 - * Copyright 2009 OpenSourceStewardshipFoundation.org 4.7 4.8 - * Licensed under GNU General Public License version 2 4.9 4.10 - * 4.11 4.12 - * Author: SeanHalle@yahoo.com 4.13 4.14 - * 4.15 4.16 - */ 4.17 4.18 - 4.19 4.20 -#include <string.h> 4.21 4.22 - 4.23 4.24 -#include "VCilk__Matrix_Mult.h" 4.25 4.26 - 4.27 4.28 - 4.29 4.30 -void inline 4.31 4.32 -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 4.33 4.34 - 4.35 4.36 -void inline 4.37 4.38 -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 4.39 4.40 - 4.41 4.42 -void inline 4.43 4.44 -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 4.45 4.46 - float32 *resArray, 4.47 4.48 - int startRow, int endRow, 4.49 4.50 - int startCol, int endCol, 4.51 4.52 - int startVec, int endVec, 4.53 4.54 - int resStride, int inpStride ); 4.55 4.56 - 4.57 4.58 -void inline 4.59 4.60 -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, 4.61 4.62 - float32 *leftArray, float32 *rightArray, 4.63 4.64 - float32 *resArray ); 4.65 4.66 - 4.67 4.68 - 4.69 4.70 -/*A processor is created with an environment that holds two matrices, 4.71 4.72 - * the row and col that it owns, and the name of a result gathering 4.73 4.74 - * processor. 4.75 4.76 - *It calculates the product of two sub-portions of the input matrices 4.77 4.78 - * by using Intel's mkl library for single-core. 4.79 4.80 - * 4.81 4.82 - *This demonstrates using optimized single-threaded code inside scheduled 4.83 4.84 - * work-units. 4.85 4.86 - * 4.87 4.88 - *When done, it sends the result to the result processor 4.89 4.90 - */ 4.91 4.92 -void 4.93 4.94 -calcSubMatrixProduct( void *data, VirtProcr *animPr ) 4.95 4.96 - { 4.97 4.98 - SMPairParams *params; 4.99 4.100 - VirtProcr *resultPr; 4.101 4.102 - float32 *leftArray, *rightArray, *resArray; 4.103 4.104 - SubMatrix *leftSubMatrix, *rightSubMatrix; 4.105 4.106 - 4.107 4.108 - 4.109 4.110 - DEBUG1( dbgAppFlow, "start sub-matrix mult %d\n", animPr->procrID) 4.111 4.112 - #ifdef TURN_ON_DEBUG_PROBES 4.113 4.114 - int32 subMatrixProbe = 4.115 4.116 - VMS__create_single_interval_probe( "subMtx", animPr); 4.117 4.118 - VMS__record_sched_choice_into_probe( subMatrixProbe, animPr ); 4.119 4.120 - VMS__record_interval_start_in_probe( subMatrixProbe ); 4.121 4.122 - #endif 4.123 4.124 - 4.125 4.126 - params = (SMPairParams *)data; 4.127 4.128 -// resultPr = params->resultPr; 4.129 4.130 - leftSubMatrix = params->leftSubMatrix; 4.131 4.132 - rightSubMatrix = params->rightSubMatrix; 4.133 4.134 - 4.135 4.136 - //make sure the input sub-matrices have been copied out of orig 4.137 4.138 - copyFromOrig( leftSubMatrix, animPr ); 4.139 4.140 - copyTransposeFromOrig( rightSubMatrix, animPr ); 4.141 4.142 - 4.143 4.144 - leftArray = leftSubMatrix->array; 4.145 4.146 - rightArray = rightSubMatrix->array; 4.147 4.148 - 4.149 4.150 - //make this array here, on the core that computes the results 4.151 4.152 - // with Cilk's semantics, have to have separate result array for each 4.153 4.154 - // spawned processor -- unless want to change the spawn and sync 4.155 4.156 - // pattern, such that spawn one from each vector, then sync, then 4.157 4.158 - // another, and so forth -- this will cause idle time due to imbalance 4.159 4.160 - // in matrix sizes 4.161 4.162 - //This also gives chance to set affinity so all in vector run on same 4.163 4.164 - // core and re-use the accumulation array, 4.165 4.166 - //As a side-benefit, it also prevents writes from causing 4.167 4.168 - // thrashing of the cache -- as long as array big enough, the copy 4.169 4.170 - // overhead is small because each byte is reused size-of-side times 4.171 4.172 - //This is freed in the vector processor 4.173 4.174 - int32 4.175 4.176 - resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); 4.177 4.178 - resArray = VCilk__malloc( resSize, animPr ); 4.179 4.180 - memset( resArray, 0, resSize ); 4.181 4.182 - 4.183 4.184 - 4.185 4.186 - int32 numResRows, numResCols, vectLength; 4.187 4.188 - 4.189 4.190 - vectLength = leftSubMatrix->numCols; 4.191 4.192 - numResRows = leftSubMatrix->numRows; 4.193 4.194 - numResCols = rightSubMatrix->numCols; 4.195 4.196 - 4.197 4.198 - multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 4.199 4.200 - leftArray, rightArray, 4.201 4.202 - resArray ); 4.203 4.204 - 4.205 4.206 - //send result by side-effect 4.207 4.208 - params->partialResultArray = resArray; 4.209 4.210 - 4.211 4.212 - #ifdef TURN_ON_DEBUG_PROBES 4.213 4.214 - VMS__record_interval_end_in_probe( subMatrixProbe ); 4.215 4.216 - #endif 4.217 4.218 - 4.219 4.220 - DEBUG1( dbgAppFlow, "end sub-matrix mult %d\n", animPr->procrID) 4.221 4.222 - VCilk__dissipate_procr( animPr ); 4.223 4.224 - } 4.225 4.226 - 4.227 4.228 - 4.229 4.230 - 4.231 4.232 -/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into 4.233 4.234 - * the 32KB L1 cache. 4.235 4.236 - *Would be nice to embed this within another level that divided into 4.237 4.238 - * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache 4.239 4.240 - * 4.241 4.242 - *Eventually want these divisions to be automatic, using DKU pattern 4.243 4.244 - * embedded into VMS and exposed in the language, and with VMS controlling the 4.245 4.246 - * divisions according to the cache sizes, which it knows about. 4.247 4.248 - *Also, want VMS to work with language to split among main-mems, so a socket 4.249 4.250 - * only cranks on data in its local segment of main mem 4.251 4.252 - * 4.253 4.254 - *So, outer two loops determine start and end points within the result matrix. 4.255 4.256 - * Inside that, a loop dets the start and end points along the shared dimensions 4.257 4.258 - * of the two input matrices. 4.259 4.260 - */ 4.261 4.262 -void inline 4.263 4.264 -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, 4.265 4.266 - int32 numResCols, 4.267 4.268 - float32 *leftArray, float32 *rightArray, 4.269 4.270 - float32 *resArray ) 4.271 4.272 - { 4.273 4.274 - int resStride, inpStride; 4.275 4.276 - int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; 4.277 4.278 - 4.279 4.280 - resStride = numResCols; 4.281 4.282 - inpStride = vecLength; 4.283 4.284 - 4.285 4.286 - for( resStartRow = 0; resStartRow < numResRows; ) 4.287 4.288 - { 4.289 4.290 - resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 4.291 4.292 - if( resEndRow > numResRows ) resEndRow = numResRows -1; 4.293 4.294 - 4.295 4.296 - for( resStartCol = 0; resStartCol < numResCols; ) 4.297 4.298 - { 4.299 4.300 - resEndCol = resStartCol + COLS_IN_BLOCK -1; 4.301 4.302 - if( resEndCol > numResCols ) resEndCol = numResCols -1; 4.303 4.304 - 4.305 4.306 - for( startVec = 0; startVec < vecLength; ) 4.307 4.308 - { 4.309 4.310 - endVec = startVec + VEC_IN_BLOCK -1; 4.311 4.312 - if( endVec > vecLength ) endVec = vecLength -1; 4.313 4.314 - 4.315 4.316 - //By having the "vector" of sub-blocks in a sub-block slice 4.317 4.318 - // be marched down in inner loop, are re-using the result 4.319 4.320 - // matrix, which stays in L1 cache and re-using the left sub-mat 4.321 4.322 - // which repeats for each right sub-mat -- can only re-use two of 4.323 4.324 - // the three, so result is the most important -- avoids writing 4.325 4.326 - // dirty blocks until those result-locations fully done 4.327 4.328 - //Row and Col is position in result matrix -- so row and vec 4.329 4.330 - // for left array, then vec and col for right array 4.331 4.332 - multiplySubBlocksTransposed( leftArray, rightArray, 4.333 4.334 - resArray, 4.335 4.336 - resStartRow, resEndRow, 4.337 4.338 - resStartCol, resEndCol, 4.339 4.340 - startVec, endVec, 4.341 4.342 - resStride, inpStride ); 4.343 4.344 - startVec = endVec +1; 4.345 4.346 - } 4.347 4.348 - resStartCol = resEndCol +1; 4.349 4.350 - } 4.351 4.352 - resStartRow = resEndRow +1; 4.353 4.354 - } 4.355 4.356 - } 4.357 4.358 - 4.359 4.360 - 4.361 4.362 - 4.363 4.364 -void inline 4.365 4.366 -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 4.367 4.368 - float32 *resArray, 4.369 4.370 - int resStartRow, int resEndRow, 4.371 4.372 - int resStartCol, int resEndCol, 4.373 4.374 - int startVec, int endVec, 4.375 4.376 - int resStride, int inpStride ) 4.377 4.378 - { 4.379 4.380 - int resRow, resCol, vec; 4.381 4.382 - int leftOffset, rightOffset; 4.383 4.384 - float32 result; 4.385 4.386 - 4.387 4.388 - //The result row is used for the left matrix, res col for the right 4.389 4.390 - for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) 4.391 4.392 - { 4.393 4.394 - for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) 4.395 4.396 - { 4.397 4.398 - leftOffset = resRow * inpStride;//left & right inp strides same 4.399 4.400 - rightOffset = resCol * inpStride;// because right is transposed 4.401 4.402 - result = 0; 4.403 4.404 - for( vec = startVec; vec <= endVec; vec++ ) 4.405 4.406 - { 4.407 4.408 - result += 4.409 4.410 - leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; 4.411 4.412 - } 4.413 4.414 - 4.415 4.416 - resArray[ resRow * resStride + resCol ] += result; 4.417 4.418 - } 4.419 4.420 - } 4.421 4.422 - } 4.423 4.424 - 4.425 4.426 - 4.427 4.428 -/*Reuse this in divider when do the sequential multiply case 4.429 4.430 - */ 4.431 4.432 -void inline 4.433 4.434 -copyTranspose( int32 numRows, int32 numCols, 4.435 4.436 - int32 origStartRow, int32 origStartCol, int32 origStride, 4.437 4.438 - float32 *subArray, float32 *origArray ) 4.439 4.440 - { int32 stride = numRows; 4.441 4.442 - 4.443 4.444 - int row, col, origOffset; 4.445 4.446 - for( row = 0; row < numRows; row++ ) 4.447 4.448 - { 4.449 4.450 - origOffset = (row + origStartRow) * origStride + origStartCol; 4.451 4.452 - for( col = 0; col < numCols; col++ ) 4.453 4.454 - { 4.455 4.456 - //transpose means swap row & col -- traverse orig matrix normally 4.457 4.458 - // but put into reversed place in local array -- means the 4.459 4.460 - // stride is the numRows now, so col * numRows + row 4.461 4.462 - subArray[ col * stride + row ] = origArray[ origOffset + col ]; 4.463 4.464 - } 4.465 4.466 - } 4.467 4.468 - } 4.469 4.470 - 4.471 4.472 -void inline 4.473 4.474 -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 4.475 4.476 - { int numCols, numRows, origStartRow, origStartCol, origStride, stride; 4.477 4.478 - Matrix *origMatrix; 4.479 4.480 - float32 *origArray, *subArray; 4.481 4.482 - 4.483 4.484 - VCilk__start_data_singleton( &(subMatrix->copyTransSingleton), animPr ); 4.485 4.486 - 4.487 4.488 - origMatrix = subMatrix->origMatrix; 4.489 4.490 - origArray = origMatrix->array; 4.491 4.492 - numCols = subMatrix->numCols; 4.493 4.494 - numRows = subMatrix->numRows; 4.495 4.496 - origStartRow = subMatrix->origStartRow; 4.497 4.498 - origStartCol = subMatrix->origStartCol; 4.499 4.500 - origStride = origMatrix->numCols; 4.501 4.502 - 4.503 4.504 - subArray = VCilk__malloc( numRows * numCols *sizeof(float32),animPr); 4.505 4.506 - subMatrix->array = subArray; 4.507 4.508 - 4.509 4.510 - //copy values from orig matrix to local 4.511 4.512 - copyTranspose( numRows, numCols, 4.513 4.514 - origStartRow, origStartCol, origStride, 4.515 4.516 - subArray, origArray ); 4.517 4.518 - 4.519 4.520 - VCilk__end_data_singleton( &(subMatrix->copyTransSingleton), animPr ); 4.521 4.522 - return; 4.523 4.524 - } 4.525 4.526 - 4.527 4.528 - 4.529 4.530 -void inline 4.531 4.532 -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 4.533 4.534 - { int numCols, numRows, origStartRow, origStartCol, stride, origStride; 4.535 4.536 - Matrix *origMatrix; 4.537 4.538 - float32 *origArray, *subArray; 4.539 4.540 - 4.541 4.542 - 4.543 4.544 - //This lets only a single VP execute the code between start and 4.545 4.546 - // end -- using start and end so that work runs outside the master. 4.547 4.548 - //Inside, if a second VP ever executes the start, it will be returned 4.549 4.550 - // from the end-point. 4.551 4.552 - //Note, for non-GCC, can add a second SSR call at the end, and inside 4.553 4.554 - // that one, look at the stack at the return addr & save that in an 4.555 4.556 - // array indexed by singletonID 4.557 4.558 - VCilk__start_data_singleton( &(subMatrix->copySingleton), animPr ); 4.559 4.560 - 4.561 4.562 - 4.563 4.564 - origMatrix = subMatrix->origMatrix; 4.565 4.566 - origArray = origMatrix->array; 4.567 4.568 - numCols = subMatrix->numCols; 4.569 4.570 - numRows = subMatrix->numRows; 4.571 4.572 - origStartRow = subMatrix->origStartRow; 4.573 4.574 - origStartCol = subMatrix->origStartCol; 4.575 4.576 - origStride = origMatrix->numCols; 4.577 4.578 - 4.579 4.580 - subArray = VCilk__malloc( numRows * numCols *sizeof(float32),animPr); 4.581 4.582 - subMatrix->array = subArray; 4.583 4.584 - 4.585 4.586 - //copy values from orig matrix to local 4.587 4.588 - stride = numCols; 4.589 4.590 - 4.591 4.592 - int row, col, offset, origOffset; 4.593 4.594 - for( row = 0; row < numRows; row++ ) 4.595 4.596 - { 4.597 4.598 - offset = row * stride; 4.599 4.600 - origOffset = (row + origStartRow) * origStride + origStartCol; 4.601 4.602 - for( col = 0; col < numCols; col++ ) 4.603 4.604 - { 4.605 4.606 - subArray[ offset + col ] = origArray[ origOffset + col ]; 4.607 4.608 - } 4.609 4.610 - } 4.611 4.612 - VCilk__end_data_singleton( &(subMatrix->copySingleton), animPr ); 4.613 4.614 - 4.615 4.616 - return; 4.617 4.618 - } 4.619 4.620 +/* 4.621 + * Copyright 2009 OpenSourceStewardshipFoundation.org 4.622 + * Licensed under GNU General Public License version 2 4.623 + * 4.624 + * Author: SeanHalle@yahoo.com 4.625 + * 4.626 + */ 4.627 + 4.628 +#include <string.h> 4.629 + 4.630 +#include "VCilk__Matrix_Mult.h" 4.631 + 4.632 + 4.633 +void inline 4.634 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 4.635 + 4.636 +void inline 4.637 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 4.638 + 4.639 +void inline 4.640 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 4.641 + float32 *resArray, 4.642 + int startRow, int endRow, 4.643 + int startCol, int endCol, 4.644 + int startVec, int endVec, 4.645 + int resStride, int inpStride ); 4.646 + 4.647 +void inline 4.648 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, 4.649 + float32 *leftArray, float32 *rightArray, 4.650 + float32 *resArray ); 4.651 + 4.652 + 4.653 +/*A processor is created with an environment that holds two matrices, 4.654 + * the row and col that it owns, and the name of a result gathering 4.655 + * processor. 4.656 + *It calculates the product of two sub-portions of the input matrices 4.657 + * by using Intel's mkl library for single-core. 4.658 + * 4.659 + *This demonstrates using optimized single-threaded code inside scheduled 4.660 + * work-units. 4.661 + * 4.662 + *When done, it sends the result to the result processor 4.663 + */ 4.664 +void 4.665 +calcSubMatrixProduct( void *data, VirtProcr *animPr ) 4.666 + { 4.667 + SMPairParams *params; 4.668 + VirtProcr *resultPr; 4.669 + float32 *leftArray, *rightArray, *resArray; 4.670 + SubMatrix *leftSubMatrix, *rightSubMatrix; 4.671 + 4.672 + 4.673 + DEBUG1( dbgAppFlow, "start sub-matrix mult %d\n", animPr->procrID) 4.674 + #ifdef TURN_ON_DEBUG_PROBES 4.675 + int32 subMatrixProbe = 4.676 + VMS__create_single_interval_probe( "subMtx", animPr); 4.677 + VMS__record_sched_choice_into_probe( subMatrixProbe, animPr ); 4.678 + VMS__record_interval_start_in_probe( subMatrixProbe ); 4.679 + #endif 4.680 + 4.681 + params = (SMPairParams *)data; 4.682 +// resultPr = params->resultPr; 4.683 + leftSubMatrix = params->leftSubMatrix; 4.684 + rightSubMatrix = params->rightSubMatrix; 4.685 + 4.686 + //make sure the input sub-matrices have been copied out of orig 4.687 + copyFromOrig( leftSubMatrix, animPr ); 4.688 + copyTransposeFromOrig( rightSubMatrix, animPr ); 4.689 + 4.690 + leftArray = leftSubMatrix->array; 4.691 + rightArray = rightSubMatrix->array; 4.692 + 4.693 + //make this array here, on the core that computes the results 4.694 + // with Cilk's semantics, have to have separate result array for each 4.695 + // spawned processor -- unless want to change the spawn and sync 4.696 + // pattern, such that spawn one from each vector, then sync, then 4.697 + // another, and so forth -- this will cause idle time due to imbalance 4.698 + // in matrix sizes 4.699 + //This also gives chance to set affinity so all in vector run on same 4.700 + // core and re-use the accumulation array, 4.701 + //As a side-benefit, it also prevents writes from causing 4.702 + // thrashing of the cache -- as long as array big enough, the copy 4.703 + // overhead is small because each byte is reused size-of-side times 4.704 + //This is freed in the vector processor 4.705 + int32 4.706 + resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); 4.707 + resArray = VCilk__malloc( resSize, animPr ); 4.708 + memset( resArray, 0, resSize ); 4.709 + 4.710 + 4.711 + int32 numResRows, numResCols, vectLength; 4.712 + 4.713 + vectLength = leftSubMatrix->numCols; 4.714 + numResRows = leftSubMatrix->numRows; 4.715 + numResCols = rightSubMatrix->numCols; 4.716 + 4.717 + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 4.718 + leftArray, rightArray, 4.719 + resArray ); 4.720 + 4.721 + //send result by side-effect 4.722 + params->partialResultArray = resArray; 4.723 + 4.724 + #ifdef TURN_ON_DEBUG_PROBES 4.725 + VMS__record_interval_end_in_probe( subMatrixProbe ); 4.726 + #endif 4.727 + 4.728 + DEBUG1( dbgAppFlow, "end sub-matrix mult %d\n", animPr->procrID) 4.729 + VCilk__dissipate_procr( animPr ); 4.730 + } 4.731 + 4.732 + 4.733 + 4.734 +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into 4.735 + * the 32KB L1 cache. 4.736 + *Would be nice to embed this within another level that divided into 4.737 + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache 4.738 + * 4.739 + *Eventually want these divisions to be automatic, using DKU pattern 4.740 + * embedded into VMS and exposed in the language, and with VMS controlling the 4.741 + * divisions according to the cache sizes, which it knows about. 4.742 + *Also, want VMS to work with language to split among main-mems, so a socket 4.743 + * only cranks on data in its local segment of main mem 4.744 + * 4.745 + *So, outer two loops determine start and end points within the result matrix. 4.746 + * Inside that, a loop dets the start and end points along the shared dimensions 4.747 + * of the two input matrices. 4.748 + */ 4.749 +void inline 4.750 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, 4.751 + int32 numResCols, 4.752 + float32 *leftArray, float32 *rightArray, 4.753 + float32 *resArray ) 4.754 + { 4.755 + int resStride, inpStride; 4.756 + int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; 4.757 + 4.758 + resStride = numResCols; 4.759 + inpStride = vecLength; 4.760 + 4.761 + for( resStartRow = 0; resStartRow < numResRows; ) 4.762 + { 4.763 + resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 4.764 + if( resEndRow > numResRows ) resEndRow = numResRows -1; 4.765 + 4.766 + for( resStartCol = 0; resStartCol < numResCols; ) 4.767 + { 4.768 + resEndCol = resStartCol + COLS_IN_BLOCK -1; 4.769 + if( resEndCol > numResCols ) resEndCol = numResCols -1; 4.770 + 4.771 + for( startVec = 0; startVec < vecLength; ) 4.772 + { 4.773 + endVec = startVec + VEC_IN_BLOCK -1; 4.774 + if( endVec > vecLength ) endVec = vecLength -1; 4.775 + 4.776 + //By having the "vector" of sub-blocks in a sub-block slice 4.777 + // be marched down in inner loop, are re-using the result 4.778 + // matrix, which stays in L1 cache and re-using the left sub-mat 4.779 + // which repeats for each right sub-mat -- can only re-use two of 4.780 + // the three, so result is the most important -- avoids writing 4.781 + // dirty blocks until those result-locations fully done 4.782 + //Row and Col is position in result matrix -- so row and vec 4.783 + // for left array, then vec and col for right array 4.784 + multiplySubBlocksTransposed( leftArray, rightArray, 4.785 + resArray, 4.786 + resStartRow, resEndRow, 4.787 + resStartCol, resEndCol, 4.788 + startVec, endVec, 4.789 + resStride, inpStride ); 4.790 + startVec = endVec +1; 4.791 + } 4.792 + resStartCol = resEndCol +1; 4.793 + } 4.794 + resStartRow = resEndRow +1; 4.795 + } 4.796 + } 4.797 + 4.798 + 4.799 + 4.800 +void inline 4.801 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 4.802 + float32 *resArray, 4.803 + int resStartRow, int resEndRow, 4.804 + int resStartCol, int resEndCol, 4.805 + int startVec, int endVec, 4.806 + int resStride, int inpStride ) 4.807 + { 4.808 + int resRow, resCol, vec; 4.809 + int leftOffset, rightOffset; 4.810 + float32 result; 4.811 + 4.812 + //The result row is used for the left matrix, res col for the right 4.813 + for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) 4.814 + { 4.815 + for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) 4.816 + { 4.817 + leftOffset = resRow * inpStride;//left & right inp strides same 4.818 + rightOffset = resCol * inpStride;// because right is transposed 4.819 + result = 0; 4.820 + for( vec = startVec; vec <= endVec; vec++ ) 4.821 + { 4.822 + result += 4.823 + leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; 4.824 + } 4.825 + 4.826 + resArray[ resRow * resStride + resCol ] += result; 4.827 + } 4.828 + } 4.829 + } 4.830 + 4.831 + 4.832 +/*Reuse this in divider when do the sequential multiply case 4.833 + */ 4.834 +void inline 4.835 +copyTranspose( int32 numRows, int32 numCols, 4.836 + int32 origStartRow, int32 origStartCol, int32 origStride, 4.837 + float32 *subArray, float32 *origArray ) 4.838 + { int32 stride = numRows; 4.839 + 4.840 + int row, col, origOffset; 4.841 + for( row = 0; row < numRows; row++ ) 4.842 + { 4.843 + origOffset = (row + origStartRow) * origStride + origStartCol; 4.844 + for( col = 0; col < numCols; col++ ) 4.845 + { 4.846 + //transpose means swap row & col -- traverse orig matrix normally 4.847 + // but put into reversed place in local array -- means the 4.848 + // stride is the numRows now, so col * numRows + row 4.849 + subArray[ col * stride + row ] = origArray[ origOffset + col ]; 4.850 + } 4.851 + } 4.852 + } 4.853 + 4.854 +void inline 4.855 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 4.856 + { int numCols, numRows, origStartRow, origStartCol, origStride, stride; 4.857 + Matrix *origMatrix; 4.858 + float32 *origArray, *subArray; 4.859 + 4.860 + VCilk__start_data_singleton( &(subMatrix->copyTransSingleton), animPr ); 4.861 + 4.862 + origMatrix = subMatrix->origMatrix; 4.863 + origArray = origMatrix->array; 4.864 + numCols = subMatrix->numCols; 4.865 + numRows = subMatrix->numRows; 4.866 + origStartRow = subMatrix->origStartRow; 4.867 + origStartCol = subMatrix->origStartCol; 4.868 + origStride = origMatrix->numCols; 4.869 + 4.870 + subArray = VCilk__malloc( numRows * numCols *sizeof(float32),animPr); 4.871 + subMatrix->array = subArray; 4.872 + 4.873 + //copy values from orig matrix to local 4.874 + copyTranspose( numRows, numCols, 4.875 + origStartRow, origStartCol, origStride, 4.876 + subArray, origArray ); 4.877 + 4.878 + VCilk__end_data_singleton( &(subMatrix->copyTransSingleton), animPr ); 4.879 + return; 4.880 + } 4.881 + 4.882 + 4.883 +void inline 4.884 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 4.885 + { int numCols, numRows, origStartRow, origStartCol, stride, origStride; 4.886 + Matrix *origMatrix; 4.887 + float32 *origArray, *subArray; 4.888 + 4.889 + 4.890 + //This lets only a single VP execute the code between start and 4.891 + // end -- using start and end so that work runs outside the master. 4.892 + //Inside, if a second VP ever executes the start, it will be returned 4.893 + // from the end-point. 4.894 + //Note, for non-GCC, can add a second SSR call at the end, and inside 4.895 + // that one, look at the stack at the return addr & save that in an 4.896 + // array indexed by singletonID 4.897 + VCilk__start_data_singleton( &(subMatrix->copySingleton), animPr ); 4.898 + 4.899 + 4.900 + origMatrix = subMatrix->origMatrix; 4.901 + origArray = origMatrix->array; 4.902 + numCols = subMatrix->numCols; 4.903 + numRows = subMatrix->numRows; 4.904 + origStartRow = subMatrix->origStartRow; 4.905 + origStartCol = subMatrix->origStartCol; 4.906 + origStride = origMatrix->numCols; 4.907 + 4.908 + subArray = VCilk__malloc( numRows * numCols *sizeof(float32),animPr); 4.909 + subMatrix->array = subArray; 4.910 + 4.911 + //copy values from orig matrix to local 4.912 + stride = numCols; 4.913 + 4.914 + int row, col, offset, origOffset; 4.915 + for( row = 0; row < numRows; row++ ) 4.916 + { 4.917 + offset = row * stride; 4.918 + origOffset = (row + origStartRow) * origStride + origStartCol; 4.919 + for( col = 0; col < numCols; col++ ) 4.920 + { 4.921 + subArray[ offset + col ] = origArray[ origOffset + col ]; 4.922 + } 4.923 + } 4.924 + VCilk__end_data_singleton( &(subMatrix->copySingleton), animPr ); 4.925 + 4.926 + return; 4.927 + }
5.1 --- a/src/Application/main.c Wed May 11 15:40:54 2011 +0200 5.2 +++ b/src/Application/main.c Wed May 11 15:58:04 2011 +0200 5.3 @@ -1,35 +1,35 @@ 5.4 -/* 5.5 5.6 - * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 5.7 5.8 - * Licensed under GNU General Public License version 2 5.9 5.10 - * 5.11 5.12 - * author seanhalle@yahoo.com 5.13 5.14 - */ 5.15 5.16 - 5.17 5.18 -#include <malloc.h> 5.19 5.20 -#include <stdlib.h> 5.21 5.22 - 5.23 5.24 -#include "Matrix_Mult.h" 5.25 5.26 -#include "VCilk__Matrix_Mult/VCilk__Matrix_Mult.h" 5.27 5.28 - 5.29 5.30 -/** 5.31 5.32 - *Matrix multiply program written using VMS_HW piggy-back language 5.33 5.34 - * 5.35 5.36 - */ 5.37 5.38 -int main( int argc, char **argv ) 5.39 5.40 - { Matrix *leftMatrix, *rightMatrix, *resultMatrix; 5.41 5.42 - ParamBag *paramBag; 5.43 5.44 - 5.45 5.46 - paramBag = makeParamBag(); 5.47 5.48 - printf("arguments: %s | %s | %s | %s\n", argv[0], argv[1], argv[2], argv[3] ); 5.49 5.50 - readParamFileIntoBag( argv[1], paramBag ); 5.51 5.52 - initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); 5.53 5.54 - 5.55 5.56 - resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix ); 5.57 5.58 - 5.59 5.60 -// printf("\nresult matrix: \n"); \ 5.61 5.62 - printMatrix( resultMatrix ); 5.63 5.64 - 5.65 5.66 -// VCilk__print_stats(); 5.67 5.68 - fflush(stdin); 5.69 5.70 - exit(0); //cleans up 5.71 5.72 - } 5.73 5.74 +/* 5.75 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 5.76 + * Licensed under GNU General Public License version 2 5.77 + * 5.78 + * author seanhalle@yahoo.com 5.79 + */ 5.80 + 5.81 +#include <malloc.h> 5.82 +#include <stdlib.h> 5.83 + 5.84 +#include "Matrix_Mult.h" 5.85 +#include "VCilk__Matrix_Mult/VCilk__Matrix_Mult.h" 5.86 + 5.87 +/** 5.88 + *Matrix multiply program written using VMS_HW piggy-back language 5.89 + * 5.90 + */ 5.91 +int main( int argc, char **argv ) 5.92 + { Matrix *leftMatrix, *rightMatrix, *resultMatrix; 5.93 + ParamBag *paramBag; 5.94 + 5.95 + paramBag = makeParamBag(); 5.96 + printf("arguments: %s | %s | %s | %s\n", argv[0], argv[1], argv[2], argv[3] ); 5.97 + readParamFileIntoBag( argv[1], paramBag ); 5.98 + initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); 5.99 + 5.100 + resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix ); 5.101 + 5.102 +// printf("\nresult matrix: \n"); \ 5.103 + printMatrix( resultMatrix ); 5.104 + 5.105 +// VCilk__print_stats(); 5.106 + fflush(stdin); 5.107 + exit(0); //cleans up 5.108 + }
