Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VCilk > VCilk__Blocked_Matrix_Mult__Bench
changeset 4:ecba4ae0be7a
Seans development tree
| author | Merten Sach <msach@mailbox.tu-berlin.de> |
|---|---|
| date | Wed, 11 May 2011 15:40:54 +0200 |
| parents | 12bcb9728e1c |
| children | e223756d0f0c |
| files | src/Application/Matrix_Mult.c src/Application/Matrix_Mult.h src/Application/VCilk__Matrix_Mult/Divide_Pr.c src/Application/VCilk__Matrix_Mult/EntryPoint.c src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h src/Application/VCilk__Matrix_Mult/Vector_Pr.c src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c src/Application/main.c |
| diffstat | 8 files changed, 1469 insertions(+), 1471 deletions(-) [+] |
line diff
1.1 --- a/src/Application/Matrix_Mult.c Mon Nov 15 12:08:19 2010 -0800 1.2 +++ b/src/Application/Matrix_Mult.c Wed May 11 15:40:54 2011 +0200 1.3 @@ -1,167 +1,167 @@ 1.4 -/* 1.5 - * Copyright 2009 OpenSourceStewardshipFoundation.org 1.6 - * Licensed under GNU General Public License version 2 1.7 - * 1.8 - * Author: seanhalle@yahoo.com 1.9 - * 1.10 - * Created on November 15, 2009, 2:35 AM 1.11 - */ 1.12 - 1.13 -#include <malloc.h> 1.14 -#include <stdlib.h> 1.15 - 1.16 -#include "Matrix_Mult.h" 1.17 -#include "ParamHelper/Param.h" 1.18 - 1.19 - 1.20 - 1.21 - void 1.22 -initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 1.23 - ParamBag *paramBag ) 1.24 - { char *leftMatrixFileName, *rightMatrixFileName; 1.25 - int leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols; 1.26 - 1.27 - ParamStruc *param; 1.28 - param = getParamFromBag( "leftMatrixRows", paramBag ); 1.29 - leftMatrixRows = param->intValue; 1.30 - param = getParamFromBag( "leftMatrixCols", paramBag ); 1.31 - leftMatrixCols = param->intValue; 1.32 - *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols ); 1.33 - 1.34 - param = getParamFromBag( "leftMatrixFileName", paramBag ); 1.35 - leftMatrixFileName = param->strValue; //no need to copy 1.36 - read_Matrix_From_File( *leftMatrix, leftMatrixFileName ); 1.37 - 1.38 - param = getParamFromBag( "rightMatrixRows", paramBag ); 1.39 - rightMatrixRows = param->intValue; 1.40 - param = getParamFromBag( "rightMatrixCols", paramBag ); 1.41 - rightMatrixCols = param->intValue; 1.42 - *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols ); 1.43 - 1.44 - param = getParamFromBag( "rightMatrixFileName", paramBag ); 1.45 - rightMatrixFileName = param->strValue; 1.46 - read_Matrix_From_File( *rightMatrix, rightMatrixFileName ); 1.47 - } 1.48 - 1.49 - 1.50 -void parseLineIntoRow( char *line, float32* row ); 1.51 - 1.52 - 1.53 - void 1.54 -read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ) 1.55 - { int row, maxRead, numRows, numCols; 1.56 - float32 *matrixStart; 1.57 - size_t lineSz = 0; 1.58 - FILE *file; 1.59 - char *line = NULL; 1.60 - 1.61 - lineSz = 50000; //max length of line in a matrix data file 1.62 - line = (char *) malloc( lineSz ); 1.63 - if( line == NULL ) printf( "no mem for matrix line" ); 1.64 - 1.65 - numRows = matrixStruc->numRows; 1.66 - numCols = matrixStruc->numCols; 1.67 - matrixStart = matrixStruc->array; 1.68 - 1.69 - file = fopen( matrixFileName, "r" ); 1.70 - if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} 1.71 - fseek( file, 0, SEEK_SET ); 1.72 - for( row = 0; row < numRows; row++ ) 1.73 - { 1.74 - if( feof( file ) ) printf( "file ran out too soon" ); 1.75 - maxRead = getline( &line, &lineSz, file ); 1.76 - if( maxRead == -1 ) printf( "prob reading mat line"); 1.77 - 1.78 - if( *line == '\n') continue; //blank line 1.79 - if( *line == '/' ) continue; //comment line 1.80 - 1.81 - parseLineIntoRow( line, matrixStart + row * numCols ); 1.82 - } 1.83 - free( line ); 1.84 - } 1.85 - 1.86 -/*This function relies on each line having the proper number of cols. It 1.87 - * doesn't check, nor enforce, so if the file is improperly formatted it 1.88 - * can write over unrelated memory 1.89 - */ 1.90 - void 1.91 -parseLineIntoRow( char *line, float32* row ) 1.92 - { 1.93 - char *valueStr, *searchPos; 1.94 - 1.95 - //read the float values 1.96 - searchPos = valueStr = line; //start 1.97 - 1.98 - for( ; *searchPos != 0; searchPos++) //bit dangerous, should use buff len 1.99 - { 1.100 - if( *searchPos == '\n' ) //last col.. relying on well-formatted file 1.101 - { *searchPos = 0; 1.102 - *row = atof( valueStr ); 1.103 - break; //end FOR loop 1.104 - } 1.105 - if( *searchPos == ',' ) 1.106 - { *searchPos = 0; //mark end of string 1.107 - *row = (float32) atof( valueStr ); 1.108 - row += 1; //address arith 1.109 - //skip any spaces before digits.. use searchPos + 1 to skip the 0 1.110 - for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++); 1.111 - valueStr = searchPos + 1; 1.112 - } 1.113 - } 1.114 - } 1.115 - 1.116 - //========================================================================== 1.117 - 1.118 -/*In the "_Flat" version of constructor, do only malloc of the top data struc 1.119 - * and set values in that top-level. Don't malloc any sub-structures. 1.120 - */ 1.121 - Matrix * 1.122 -makeMatrix_Flat( int32 numRows, int32 numCols ) 1.123 - { Matrix * retMatrix; 1.124 - retMatrix = malloc( sizeof( Matrix ) ); 1.125 - retMatrix->numRows = numRows; 1.126 - retMatrix->numCols = numCols; 1.127 - 1.128 - return retMatrix; 1.129 - } 1.130 - 1.131 - Matrix * 1.132 -makeMatrix_WithResMat( int32 numRows, int32 numCols ) 1.133 - { Matrix * retMatrix; 1.134 - retMatrix = malloc( sizeof( Matrix ) ); 1.135 - retMatrix->numRows = numRows; 1.136 - retMatrix->numCols = numCols; 1.137 - retMatrix->array = malloc( numRows * numCols * sizeof(float32) ); 1.138 - 1.139 - return retMatrix; 1.140 - } 1.141 - 1.142 - void 1.143 -freeMatrix_Flat( Matrix * matrix ) 1.144 - { //( matrix ); 1.145 - } 1.146 - void 1.147 -freeMatrix( Matrix * matrix ) 1.148 - { free( matrix->array ); 1.149 - free( matrix ); 1.150 - } 1.151 - 1.152 -void 1.153 -printMatrix( Matrix *matrix ) 1.154 - { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; 1.155 - float32 *matrixArray; 1.156 - 1.157 - numRows = rowsToPrint = matrix->numRows; 1.158 - numCols = colsToPrint = matrix->numCols; 1.159 - matrixArray = matrix->array; 1.160 - 1.161 - rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed 1.162 - colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed 1.163 - for( r = 0; r < numRows; r += rowIncr ) 1.164 - { for( c = 0; c < numCols; c += colIncr ) 1.165 - { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); 1.166 - } 1.167 - printf("\n"); 1.168 - } 1.169 - } 1.170 - 1.171 +/* 1.172 1.173 + * Copyright 2009 OpenSourceStewardshipFoundation.org 1.174 1.175 + * Licensed under GNU General Public License version 2 1.176 1.177 + * 1.178 1.179 + * Author: seanhalle@yahoo.com 1.180 1.181 + * 1.182 1.183 + * Created on November 15, 2009, 2:35 AM 1.184 1.185 + */ 1.186 1.187 + 1.188 1.189 +#include <malloc.h> 1.190 1.191 +#include <stdlib.h> 1.192 1.193 + 1.194 1.195 +#include "Matrix_Mult.h" 1.196 1.197 +#include "ParamHelper/Param.h" 1.198 1.199 + 1.200 1.201 + 1.202 1.203 + 1.204 1.205 + void 1.206 1.207 +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 1.208 1.209 + ParamBag *paramBag ) 1.210 1.211 + { char *leftMatrixFileName, *rightMatrixFileName; 1.212 1.213 + int leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols; 1.214 1.215 + 1.216 1.217 + ParamStruc *param; 1.218 1.219 + param = getParamFromBag( "leftMatrixRows", paramBag ); 1.220 1.221 + leftMatrixRows = param->intValue; 1.222 1.223 + param = getParamFromBag( "leftMatrixCols", paramBag ); 1.224 1.225 + leftMatrixCols = param->intValue; 1.226 1.227 + *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols ); 1.228 1.229 + 1.230 1.231 + param = getParamFromBag( "leftMatrixFileName", paramBag ); 1.232 1.233 + leftMatrixFileName = param->strValue; //no need to copy 1.234 1.235 + read_Matrix_From_File( *leftMatrix, leftMatrixFileName ); 1.236 1.237 + 1.238 1.239 + param = getParamFromBag( "rightMatrixRows", paramBag ); 1.240 1.241 + rightMatrixRows = param->intValue; 1.242 1.243 + param = getParamFromBag( "rightMatrixCols", paramBag ); 1.244 1.245 + rightMatrixCols = param->intValue; 1.246 1.247 + *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols ); 1.248 1.249 + 1.250 1.251 + param = getParamFromBag( "rightMatrixFileName", paramBag ); 1.252 1.253 + rightMatrixFileName = param->strValue; 1.254 1.255 + read_Matrix_From_File( *rightMatrix, rightMatrixFileName ); 1.256 1.257 + } 1.258 1.259 + 1.260 1.261 + 1.262 1.263 +void parseLineIntoRow( char *line, float32* row ); 1.264 1.265 + 1.266 1.267 + 1.268 1.269 + void 1.270 1.271 +read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ) 1.272 1.273 + { int row, maxRead, numRows, numCols; 1.274 1.275 + float32 *matrixStart; 1.276 1.277 + size_t lineSz = 0; 1.278 1.279 + FILE *file; 1.280 1.281 + char *line = NULL; 1.282 1.283 + 1.284 1.285 + lineSz = 50000; //max length of line in a matrix data file 1.286 1.287 + line = (char *) malloc( lineSz ); 1.288 1.289 + if( line == NULL ) printf( "no mem for matrix line" ); 1.290 1.291 + 1.292 1.293 + numRows = matrixStruc->numRows; 1.294 1.295 + numCols = matrixStruc->numCols; 1.296 1.297 + matrixStart = matrixStruc->array; 1.298 1.299 + 1.300 1.301 + file = fopen( matrixFileName, "r" ); 1.302 1.303 + if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} 1.304 1.305 + fseek( file, 0, SEEK_SET ); 1.306 1.307 + for( row = 0; row < numRows; row++ ) 1.308 1.309 + { 1.310 1.311 + if( feof( file ) ) printf( "file ran out too soon" ); 1.312 1.313 + maxRead = getline( &line, &lineSz, file ); 1.314 1.315 + if( maxRead == -1 ) printf( "prob reading mat line"); 1.316 1.317 + 1.318 1.319 + if( *line == '\n') continue; //blank line 1.320 1.321 + if( *line == '/' ) continue; //comment line 1.322 1.323 + 1.324 1.325 + parseLineIntoRow( line, matrixStart + row * numCols ); 1.326 1.327 + } 1.328 1.329 + free( line ); 1.330 1.331 + } 1.332 1.333 + 1.334 1.335 +/*This function relies on each line having the proper number of cols. It 1.336 1.337 + * doesn't check, nor enforce, so if the file is improperly formatted it 1.338 1.339 + * can write over unrelated memory 1.340 1.341 + */ 1.342 1.343 + void 1.344 1.345 +parseLineIntoRow( char *line, float32* row ) 1.346 1.347 + { 1.348 1.349 + char *valueStr, *searchPos; 1.350 1.351 + 1.352 1.353 + //read the float values 1.354 1.355 + searchPos = valueStr = line; //start 1.356 1.357 + 1.358 1.359 + for( ; *searchPos != 0; searchPos++) //bit dangerous, should use buff len 1.360 1.361 + { 1.362 1.363 + if( *searchPos == '\n' ) //last col.. relying on well-formatted file 1.364 1.365 + { *searchPos = 0; 1.366 1.367 + *row = atof( valueStr ); 1.368 1.369 + break; //end FOR loop 1.370 1.371 + } 1.372 1.373 + if( *searchPos == ',' ) 1.374 1.375 + { *searchPos = 0; //mark end of string 1.376 1.377 + *row = (float32) atof( valueStr ); 1.378 1.379 + row += 1; //address arith 1.380 1.381 + //skip any spaces before digits.. use searchPos + 1 to skip the 0 1.382 1.383 + for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++); 1.384 1.385 + valueStr = searchPos + 1; 1.386 1.387 + } 1.388 1.389 + } 1.390 1.391 + } 1.392 1.393 + 1.394 1.395 + //========================================================================== 1.396 1.397 + 1.398 1.399 +/*In the "_Flat" version of constructor, do only malloc of the top data struc 1.400 1.401 + * and set values in that top-level. Don't malloc any sub-structures. 1.402 1.403 + */ 1.404 1.405 + Matrix * 1.406 1.407 +makeMatrix_Flat( int32 numRows, int32 numCols ) 1.408 1.409 + { Matrix * retMatrix; 1.410 1.411 + retMatrix = malloc( sizeof( Matrix ) ); 1.412 1.413 + retMatrix->numRows = numRows; 1.414 1.415 + retMatrix->numCols = numCols; 1.416 1.417 + 1.418 1.419 + return retMatrix; 1.420 1.421 + } 1.422 1.423 + 1.424 1.425 + Matrix * 1.426 1.427 +makeMatrix_WithResMat( int32 numRows, int32 numCols ) 1.428 1.429 + { Matrix * retMatrix; 1.430 1.431 + retMatrix = malloc( sizeof( Matrix ) ); 1.432 1.433 + retMatrix->numRows = numRows; 1.434 1.435 + retMatrix->numCols = numCols; 1.436 1.437 + retMatrix->array = malloc( numRows * numCols * sizeof(float32) ); 1.438 1.439 + 1.440 1.441 + return retMatrix; 1.442 1.443 + } 1.444 1.445 + 1.446 1.447 + void 1.448 1.449 +freeMatrix_Flat( Matrix * matrix ) 1.450 1.451 + { //( matrix ); 1.452 1.453 + } 1.454 1.455 + void 1.456 1.457 +freeMatrix( Matrix * matrix ) 1.458 1.459 + { free( matrix->array ); 1.460 1.461 + free( matrix ); 1.462 1.463 + } 1.464 1.465 + 1.466 1.467 +void 1.468 1.469 +printMatrix( Matrix *matrix ) 1.470 1.471 + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; 1.472 1.473 + float32 *matrixArray; 1.474 1.475 + 1.476 1.477 + numRows = rowsToPrint = matrix->numRows; 1.478 1.479 + numCols = colsToPrint = matrix->numCols; 1.480 1.481 + matrixArray = matrix->array; 1.482 1.483 + 1.484 1.485 + rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed 1.486 1.487 + colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed 1.488 1.489 + for( r = 0; r < numRows; r += rowIncr ) 1.490 1.491 + { for( c = 0; c < numCols; c += colIncr ) 1.492 1.493 + { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); 1.494 1.495 + } 1.496 1.497 + printf("\n"); 1.498 1.499 + } 1.500 1.501 + } 1.502 1.503 + 1.504
2.1 --- a/src/Application/Matrix_Mult.h Mon Nov 15 12:08:19 2010 -0800 2.2 +++ b/src/Application/Matrix_Mult.h Wed May 11 15:40:54 2011 +0200 2.3 @@ -1,77 +1,77 @@ 2.4 -/* 2.5 - * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 2.6 - * Licensed under GNU General Public License version 2 2.7 - */ 2.8 - 2.9 -#ifndef MATRIX_MULT_H_ 2.10 -#define MATRIX_MULT_H_ 2.11 - 2.12 -#include <stdio.h> 2.13 -#include <unistd.h> 2.14 -#include <malloc.h> 2.15 - 2.16 -#include "../VCilk_lib/VMS/VMS_primitive_data_types.h" 2.17 -#include "ParamHelper/Param.h" 2.18 - 2.19 -//============================== Structures ============================== 2.20 - 2.21 -typedef 2.22 -struct 2.23 - { int32 numRows; 2.24 - int32 numCols; 2.25 - float32 *array; //2D, but dynamically sized, so use addr arith 2.26 - } 2.27 -Matrix; 2.28 - 2.29 -/* This is the "appSpecificPiece" that is carried inside a DKUPiece. 2.30 - * In the DKUPiece data struc it is declared to be of type "void *". This 2.31 - * allows the application to define any data structure it wants and put it 2.32 - * into a DKUPiece. 2.33 - * When the app specific info is used, it is in app code, so it is cast to 2.34 - * the correct type to tell the compiler how to access fields. 2.35 - * This keeps all app-specific things out of the DKU directory, as per the 2.36 - * DKU standard. */ 2.37 -typedef 2.38 -struct 2.39 - { 2.40 - // pointers to shared data.. the result matrix must be created when the 2.41 - // left and right matrices are put into the root ancestor DKUPiece. 2.42 - Matrix * leftMatrix; 2.43 - Matrix * rightMatrix; 2.44 - Matrix * resultMatrix; 2.45 - 2.46 - // define the starting and ending boundaries for this piece of the 2.47 - // result matrix. These are derivable from the left and right 2.48 - // matrices, but included them for readability of code. 2.49 - int prodStartRow, prodEndRow; 2.50 - int prodStartCol, prodEndCol; 2.51 - // Start and end of the portion of the left matrix that contributes to 2.52 - // this piece of the product 2.53 - int leftStartRow, leftEndRow; 2.54 - int leftStartCol, leftEndCol; 2.55 - // Start and end of the portion of the right matrix that contributes to 2.56 - // this piece of the product 2.57 - int rightStartRow, rightEndRow; 2.58 - int rightStartCol, rightEndCol; 2.59 - } 2.60 -MatrixProdPiece; 2.61 - 2.62 -//============================== Functions ================================ 2.63 -void readFile(); 2.64 - 2.65 -Matrix *makeMatrix( int32 numRows, int32 numCols ); 2.66 -Matrix *makeMatrix_Flat( int32 numRows, int32 numCols ); 2.67 -Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols ); 2.68 -void freeMatrix_Flat( Matrix * matrix ); 2.69 -void freeMatrix( Matrix * matrix ); 2.70 -void printMatrix( Matrix *matrix ); 2.71 - 2.72 -void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ); 2.73 - 2.74 -void 2.75 -initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 2.76 - ParamBag *paramBag ); 2.77 - 2.78 -//=========================================================================== 2.79 - 2.80 -#endif /*MATRIX_MULT_H_*/ 2.81 +/* 2.82 2.83 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 2.84 2.85 + * Licensed under GNU General Public License version 2 2.86 2.87 + */ 2.88 2.89 + 2.90 2.91 +#ifndef MATRIX_MULT_H_ 2.92 2.93 +#define MATRIX_MULT_H_ 2.94 2.95 + 2.96 2.97 +#include <stdio.h> 2.98 2.99 +#include <unistd.h> 2.100 2.101 +#include <malloc.h> 2.102 2.103 + 2.104 2.105 +#include "../VCilk_lib/VMS/VMS_primitive_data_types.h" 2.106 2.107 +#include "ParamHelper/Param.h" 2.108 2.109 + 2.110 2.111 +//============================== Structures ============================== 2.112 2.113 + 2.114 2.115 +typedef 2.116 2.117 +struct 2.118 2.119 + { int32 numRows; 2.120 2.121 + int32 numCols; 2.122 2.123 + float32 *array; //2D, but dynamically sized, so use addr arith 2.124 2.125 + } 2.126 2.127 +Matrix; 2.128 2.129 + 2.130 2.131 +/* This is the "appSpecificPiece" that is carried inside a DKUPiece. 2.132 2.133 + * In the DKUPiece data struc it is declared to be of type "void *". This 2.134 2.135 + * allows the application to define any data structure it wants and put it 2.136 2.137 + * into a DKUPiece. 2.138 2.139 + * When the app specific info is used, it is in app code, so it is cast to 2.140 2.141 + * the correct type to tell the compiler how to access fields. 2.142 2.143 + * This keeps all app-specific things out of the DKU directory, as per the 2.144 2.145 + * DKU standard. */ 2.146 2.147 +typedef 2.148 2.149 +struct 2.150 2.151 + { 2.152 2.153 + // pointers to shared data.. the result matrix must be created when the 2.154 2.155 + // left and right matrices are put into the root ancestor DKUPiece. 2.156 2.157 + Matrix * leftMatrix; 2.158 2.159 + Matrix * rightMatrix; 2.160 2.161 + Matrix * resultMatrix; 2.162 2.163 + 2.164 2.165 + // define the starting and ending boundaries for this piece of the 2.166 2.167 + // result matrix. These are derivable from the left and right 2.168 2.169 + // matrices, but included them for readability of code. 2.170 2.171 + int prodStartRow, prodEndRow; 2.172 2.173 + int prodStartCol, prodEndCol; 2.174 2.175 + // Start and end of the portion of the left matrix that contributes to 2.176 2.177 + // this piece of the product 2.178 2.179 + int leftStartRow, leftEndRow; 2.180 2.181 + int leftStartCol, leftEndCol; 2.182 2.183 + // Start and end of the portion of the right matrix that contributes to 2.184 2.185 + // this piece of the product 2.186 2.187 + int rightStartRow, rightEndRow; 2.188 2.189 + int rightStartCol, rightEndCol; 2.190 2.191 + } 2.192 2.193 +MatrixProdPiece; 2.194 2.195 + 2.196 2.197 +//============================== Functions ================================ 2.198 2.199 +void readFile(); 2.200 2.201 + 2.202 2.203 +Matrix *makeMatrix( int32 numRows, int32 numCols ); 2.204 2.205 +Matrix *makeMatrix_Flat( int32 numRows, int32 numCols ); 2.206 2.207 +Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols ); 2.208 2.209 +void freeMatrix_Flat( Matrix * matrix ); 2.210 2.211 +void freeMatrix( Matrix * matrix ); 2.212 2.213 +void printMatrix( Matrix *matrix ); 2.214 2.215 + 2.216 2.217 +void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ); 2.218 2.219 + 2.220 2.221 +void 2.222 2.223 +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 2.224 2.225 + ParamBag *paramBag ); 2.226 2.227 + 2.228 2.229 +//=========================================================================== 2.230 2.231 + 2.232 2.233 +#endif /*MATRIX_MULT_H_*/ 2.234
3.1 --- a/src/Application/VCilk__Matrix_Mult/Divide_Pr.c Mon Nov 15 12:08:19 2010 -0800 3.2 +++ b/src/Application/VCilk__Matrix_Mult/Divide_Pr.c Wed May 11 15:40:54 2011 +0200 3.3 @@ -1,588 +1,588 @@ 3.4 -/* 3.5 - * Copyright 2009 OpenSourceStewardshipFoundation.org 3.6 - * Licensed under GNU General Public License version 2 3.7 - * 3.8 - * Author: seanhalle@yahoo.com 3.9 - * 3.10 - */ 3.11 - 3.12 - 3.13 -#include "VCilk__Matrix_Mult.h" 3.14 -#include <math.h> 3.15 -#include <sys/time.h> 3.16 -#include <string.h> 3.17 - 3.18 - //The time to compute this many result values should equal the time to 3.19 - // perform this division on a matrix of size gives that many result calcs 3.20 - //IE, size this so that sequential time to calc equals divide time 3.21 - // find the value by experimenting -- but divide time and calc time scale 3.22 - // same way, so this value should remain valid across hardware 3.23 - //Divide time is about 800us on 2.4Ghz core2Quad laptop core 3.24 - //num cells is the cube of a side, when have two square matrices 3.25 -#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 100000 /* about 46x46 */ 3.26 - 3.27 - 3.28 -//=========================================================================== 3.29 -int inline 3.30 -measureMatrixMultPrimitive( VirtProcr *animPr ); 3.31 - 3.32 -SlicingStrucCarrier * 3.33 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 3.34 - VirtProcr *animPr ); 3.35 - 3.36 -SlicingStruc * 3.37 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 3.38 - VirtProcr *animPr ); 3.39 - 3.40 -void 3.41 -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ); 3.42 - 3.43 -SubMatrix ** 3.44 -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 3.45 - Matrix *origMatrix, VirtProcr *animPr ); 3.46 - 3.47 -void 3.48 -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 3.49 - SubMatrix **subMatrices, VirtProcr *animPr ); 3.50 - 3.51 -void 3.52 -pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, 3.53 - SubMatrix **rightSubMatrices, 3.54 - int32 numRowIdxs, int32 numColIdxs, 3.55 - int32 numVecIdxs, 3.56 - float32 *resultArray, 3.57 - VirtProcr *animatingPr ); 3.58 - 3.59 -void 3.60 -makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, 3.61 - SlicingStrucCarrier *slicingStrucCarrier, 3.62 - float32 *resultArray, VirtProcr *animatingPr ); 3.63 - 3.64 -//=========================================================================== 3.65 - 3.66 -/*Divider creates one processor for every sub-matrix 3.67 - * It hands them: 3.68 - * the name of the result processor that they should send their results to, 3.69 - * the left and right matrices, and the rows and cols they should multiply 3.70 - * It first creates the result processor, then all the sub-matrixPair 3.71 - * processors, 3.72 - * then does a receive of a message from the result processor that gives 3.73 - * the divider ownership of the result matrix. 3.74 - * Finally, the divider returns the result matrix out of the VCilk system. 3.75 - * 3.76 - * Divider chooses the size of sub-matrices via an algorithm that tries to 3.77 - * keep the minimum work above a threshold. The threshold is machine- 3.78 - * dependent, so ask VCilk for min work-unit time to get a 3.79 - * given overhead 3.80 - * 3.81 - * Divide min work-unit cycles by measured-cycles for one matrix-cell 3.82 - * product -- gives the number of products need to have in min size 3.83 - * matrix. 3.84 - * 3.85 - * So then, take cubed root of this to get the size of a side of min sub- 3.86 - * matrix. That is the size of the ideal square sub-matrix -- so tile 3.87 - * up the two input matrices into ones as close as possible to that size, 3.88 - * and create the pairs of sub-matrices. 3.89 - * 3.90 - *======================== STRATEGIC OVERVIEW ======================= 3.91 - * 3.92 - *This division is a bit tricky, because have to create things in advance 3.93 - * that it's not at first obvious need to be created.. 3.94 - * 3.95 - *First slice up each dimension -- three of them.. this is because will have 3.96 - * to create the sub-matrix's data-structures before pairing the sub-matrices 3.97 - * with each other -- so, have three dimensions to slice up before can 3.98 - * create the sub-matrix data-strucs -- also, have to be certain that the 3.99 - * cols of the left input have the exact same slicing as the rows of the 3.100 - * left matrix, so just to be sure, do the slicing calc once, then use it 3.101 - * for both. 3.102 - * 3.103 - *So, goes like this: 3.104 - *1) calculate the start & end values of each dimension in each matrix. 3.105 - *2) use those values to create sub-matrix structures 3.106 - *3) combine sub-matrices into pairs, as the tasks to perform. 3.107 - * 3.108 - *Have to calculate separately from creating the sub-matrices because of the 3.109 - * nature of the nesting -- would either end up creating the same sub-matrix 3.110 - * multiple times, or else would have to put in detection of whether had 3.111 - * made a particular one already if tried to combine steps 1 and 2. 3.112 - * 3.113 - *Step 3 has to be separate because of the nesting, as well -- same reason, 3.114 - * would either create same sub-matrix multiple times, or else have to 3.115 - * add detection of whether was already created. 3.116 - * 3.117 - *Another way to look at it: there's one level of loop to divide dimensions, 3.118 - * two levels of nesting to create sub-matrices, and three levels to pair 3.119 - * up the sub-matrices. 3.120 - */ 3.121 - 3.122 -void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, 3.123 - VirtProcr *animPr ) 3.124 - { 3.125 - DividerParams *dividerParams; 3.126 - ResultsParams *resultsParams; 3.127 - Matrix *leftMatrix, *rightMatrix, *resultMatrix; 3.128 - void *msg; 3.129 - SlicingStrucCarrier *slicingStrucCarrier; 3.130 - float32 *resultArray; //points to array to be put inside result 3.131 - // matrix 3.132 - 3.133 - DEBUG( dbgAppFlow, "start divide\n") 3.134 - 3.135 - int32 3.136 - divideProbe = VMS__create_single_interval_probe( "divideProbe", 3.137 - animPr ); 3.138 - VMS__record_sched_choice_into_probe( divideProbe, animPr ); 3.139 - VMS__record_interval_start_in_probe( divideProbe ); 3.140 - 3.141 - //=========== Setup -- make local copies of ptd-to-things, malloc, aso 3.142 - int32 numResRows, numResCols, vectLength; 3.143 - 3.144 - dividerParams = (DividerParams *)_dividerParams; 3.145 - 3.146 - leftMatrix = dividerParams->leftMatrix; 3.147 - rightMatrix = dividerParams->rightMatrix; 3.148 - 3.149 - vectLength = leftMatrix->numCols; 3.150 - numResRows = leftMatrix->numRows; 3.151 - numResCols = rightMatrix->numCols; 3.152 - resultArray = dividerParams->resultMatrix->array; 3.153 - 3.154 - //zero the result array 3.155 - memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); 3.156 - 3.157 - 3.158 - //============== Do either sequential mult or do division ============== 3.159 - 3.160 - //Check if input matrices too small -- if yes, just do sequential 3.161 - //Cutoff is determined by overhead of this divider -- relatively 3.162 - // machine-independent 3.163 - if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * 3.164 - (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) 3.165 - { int32 vectLength; 3.166 - 3.167 - //====== Do sequential multiply on a single core 3.168 - DEBUG( dbgAppFlow, "doing sequential") 3.169 - 3.170 - //transpose the right matrix 3.171 - float32 * 3.172 - transRightArray = VCilk__malloc( rightMatrix->numRows * 3.173 - rightMatrix->numCols * 3.174 - sizeof(float32), animPr ); 3.175 - 3.176 - //copy values from orig matrix to local 3.177 - copyTranspose( rightMatrix->numRows, rightMatrix->numCols, 3.178 - 0, 0, rightMatrix->numRows, 3.179 - transRightArray, rightMatrix->array ); 3.180 - 3.181 - multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 3.182 - leftMatrix->array, transRightArray, 3.183 - resultArray ); 3.184 - } 3.185 - else 3.186 - { 3.187 - //====== Do parallel multiply across cores 3.188 - 3.189 - //Calc the ideal size of sub-matrix and slice up the dimensions of 3.190 - // the two matrices. 3.191 - //The ideal size is the one takes the number of cycles to calculate 3.192 - // such that calc time is equal or greater than min work-unit size 3.193 - slicingStrucCarrier = 3.194 - calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr); 3.195 - 3.196 - 3.197 - 3.198 - //Make the sub-matrices, and pair them up, then spawn processors to 3.199 - // calc product of each pair. 3.200 - makeSubMatricesAndSpawnAndSync( leftMatrix, rightMatrix, 3.201 - slicingStrucCarrier, 3.202 - resultArray, animPr); 3.203 - //The result array will get filled in by the spawned children 3.204 - } 3.205 - 3.206 - 3.207 - //=============== Work done -- send results back ================= 3.208 - 3.209 - 3.210 - //results have been saved into an array that was made outside the VMS 3.211 - // system, by entry-point Fn, and passed in through dividerParams. 3.212 - //So, nothing to do to send results back -- they're seen by side-effect 3.213 - 3.214 - DEBUG( dbgAppFlow, "*** end divide ***\n") 3.215 - 3.216 - VMS__record_interval_end_in_probe( divideProbe ); 3.217 - VMS__print_stats_of_all_probes(); 3.218 - 3.219 - VCilk__dissipate_procr( animPr ); //all procrs dissipate self at end 3.220 - //when all of the processors have dissipated, the "create seed and do 3.221 - // work" call in the entry point function returns 3.222 - } 3.223 - 3.224 - 3.225 -SlicingStrucCarrier * 3.226 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 3.227 - VirtProcr *animPr ) 3.228 -{ 3.229 - float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; 3.230 - SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 3.231 - SlicingStrucCarrier *slicingStrucCarrier = 3.232 - VCilk__malloc(sizeof(SlicingStrucCarrier), animPr ); 3.233 - 3.234 - int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; 3.235 - float64 numPrimitiveOpsInMinWorkUnit; 3.236 - 3.237 - 3.238 - //======= Calc ideal size of min-sized sub-matrix ======== 3.239 - 3.240 - //ask VCilk for the number of cycles of the minimum work unit, at given 3.241 - // percent overhead then add a guess at overhead from this divider 3.242 - minWorkUnitCycles = VCilk__giveMinWorkUnitCycles( .05 ); 3.243 - 3.244 - //ask VCilk for number of cycles of the "primitive" op of matrix mult 3.245 - primitiveCycles = measureMatrixMultPrimitive( animPr ); 3.246 - 3.247 - numPrimitiveOpsInMinWorkUnit = 3.248 - (float64)minWorkUnitCycles / (float64)primitiveCycles; 3.249 - 3.250 - //take cubed root -- that's number of these in a "side" of sub-matrix 3.251 - // then multiply by 5 because the primitive is 5x5 3.252 - idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); 3.253 - 3.254 - idealNumWorkUnits = VCilk__giveIdealNumWorkUnits(); 3.255 - 3.256 - idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); 3.257 - idealSizeOfSide2 *= 0.8; //finer granularity to help load balance 3.258 - 3.259 - if( idealSizeOfSide1 > idealSizeOfSide2 ) 3.260 - idealSizeOfSide = idealSizeOfSide1; 3.261 - else 3.262 - idealSizeOfSide = idealSizeOfSide2; 3.263 - 3.264 - //The multiply inner loop blocks the array to fit into L1 cache 3.265 -// if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK; 3.266 - 3.267 - //============ Slice up dimensions, now that know target size =========== 3.268 - 3.269 - //Tell the slicer the target size of a side (floating pt), the start 3.270 - // value to start slicing at, and the end value to stop slicing at 3.271 - //It returns an array of start value of each chunk, plus number of them 3.272 - int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol; 3.273 - startLeftRow = 0; 3.274 - endLeftRow = leftMatrix->numRows -1; 3.275 - startVec = 0; 3.276 - endVec = leftMatrix->numCols -1; 3.277 - startRightCol = 0; 3.278 - endRightCol = rightMatrix->numCols -1; 3.279 - 3.280 - leftRowSlices = 3.281 - sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr ); 3.282 - 3.283 - vecSlices = 3.284 - sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr ); 3.285 - 3.286 - rightColSlices = 3.287 - sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr); 3.288 - 3.289 - slicingStrucCarrier->leftRowSlices = leftRowSlices; 3.290 - slicingStrucCarrier->vecSlices = vecSlices; 3.291 - slicingStrucCarrier->rightColSlices = rightColSlices; 3.292 - 3.293 - DEBUG1( dbgAppFlow, "leftRowSlices %d | ", leftRowSlices->numVals ); 3.294 - DEBUG1( dbgAppFlow, "rightColSlices %d | ",rightColSlices->numVals); 3.295 - DEBUG1( dbgAppFlow, "vecSlices %d\n", vecSlices->numVals ); 3.296 - return slicingStrucCarrier; 3.297 -} 3.298 - 3.299 - 3.300 -void inline 3.301 -makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, 3.302 - SlicingStrucCarrier *slicingStrucCarrier, 3.303 - float32 *resultArray, VirtProcr *animPr ) 3.304 - { 3.305 - SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 3.306 - 3.307 - leftRowSlices = slicingStrucCarrier->leftRowSlices; 3.308 - vecSlices = slicingStrucCarrier->vecSlices; 3.309 - rightColSlices = slicingStrucCarrier->rightColSlices; 3.310 - VCilk__free( slicingStrucCarrier, animPr ); 3.311 - 3.312 - //================ Make sub-matrices, given the slicing ================ 3.313 - SubMatrix **leftSubMatrices, **rightSubMatrices; 3.314 - leftSubMatrices = 3.315 - createSubMatrices( leftRowSlices, vecSlices, 3.316 - leftMatrix, animPr ); 3.317 - rightSubMatrices = 3.318 - createSubMatrices( vecSlices, rightColSlices, 3.319 - rightMatrix, animPr ); 3.320 - 3.321 - freeSlicingStruc( leftRowSlices, animPr ); 3.322 - freeSlicingStruc( vecSlices, animPr ); 3.323 - freeSlicingStruc( rightColSlices, animPr ); 3.324 - 3.325 - //============== pair the sub-matrices and make processors ============== 3.326 - int32 numRowIdxs, numColIdxs, numVecIdxs; 3.327 - 3.328 - numRowIdxs = leftRowSlices->numVals; 3.329 - numColIdxs = rightColSlices->numVals; 3.330 - numVecIdxs = vecSlices->numVals; 3.331 - pairUpSubMatricesAndSpawnAndSync( leftSubMatrices, rightSubMatrices, 3.332 - numRowIdxs, numColIdxs, numVecIdxs, 3.333 - resultArray, 3.334 - animPr ); 3.335 - //It syncs inside, so know all work is done now: free the sub-matrices 3.336 - freeSubMatrices( leftRowSlices, vecSlices, leftSubMatrices, animPr ); 3.337 - freeSubMatrices( vecSlices, rightColSlices, rightSubMatrices, animPr ); 3.338 - } 3.339 - 3.340 - 3.341 - 3.342 - 3.343 -/* numRows*colsPerRow/numCores = numToPutOntoEachCore; 3.344 - * put all from a given row onto same core, until exhaust allotment for that 3.345 - * core 3.346 - * 3.347 - */ 3.348 -void inline 3.349 -pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, 3.350 - SubMatrix **rightSubMatrices, 3.351 - int32 numRowIdxs, int32 numColIdxs, 3.352 - int32 numVecIdxs, 3.353 - float32 *resultArray, 3.354 - VirtProcr *animatingPr ) 3.355 - { 3.356 - int32 resRowIdx, resColIdx; 3.357 - int32 numLeftColIdxs, numRightColIdxs; 3.358 - int32 leftRowIdxOffset; 3.359 - VecParams *vecParams; 3.360 - float32 numToPutOntoEachCore, leftOverFraction; 3.361 - int32 numCores, currCore, numOnCurrCore; 3.362 - 3.363 - numLeftColIdxs = numColIdxs; 3.364 - numRightColIdxs = numVecIdxs; 3.365 - 3.366 - numCores = VCilk__give_number_of_cores_to_spawn_onto(); 3.367 - 3.368 - numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; 3.369 - leftOverFraction = 0; 3.370 - numOnCurrCore = 0; 3.371 - currCore = 0; 3.372 - 3.373 - for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) 3.374 - { 3.375 - leftRowIdxOffset = resRowIdx * numLeftColIdxs; 3.376 - 3.377 - for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) 3.378 - { 3.379 - vecParams = VCilk__malloc( sizeof(VecParams), animatingPr ); 3.380 - 3.381 - vecParams->numVecIdxs = numVecIdxs; 3.382 - vecParams->numRightColIdxs = numRightColIdxs; 3.383 - vecParams->leftRowIdxOffset = leftRowIdxOffset; 3.384 - vecParams->resColIdx = resColIdx; 3.385 - vecParams->leftSubMatrices = leftSubMatrices; 3.386 - vecParams->rightSubMatrices = rightSubMatrices; 3.387 - vecParams->resultArray = resultArray; 3.388 - vecParams->coreToRunOn = currCore; 3.389 - 3.390 - VCilk__spawn( currCore, &calcVectorOfSubMatrices, vecParams, 3.391 - animatingPr ); 3.392 - 3.393 - numOnCurrCore += 1; 3.394 - if( numOnCurrCore + leftOverFraction >= numToPutOntoEachCore - 1 ) 3.395 - { 3.396 - //deal with fractional part, to ensure that imbalance is 1 max 3.397 - // IE, core with most has only 1 more than core with least 3.398 - leftOverFraction += numToPutOntoEachCore - numOnCurrCore; 3.399 - if( leftOverFraction >= 1 ) 3.400 - { leftOverFraction -= 1; 3.401 - numOnCurrCore = -1; 3.402 - } 3.403 - else 3.404 - { numOnCurrCore = 0; 3.405 - } 3.406 - //Move to next core, max core-value to incr to is numCores -1 3.407 - if( currCore >= numCores -1 ) 3.408 - { currCore = 0; 3.409 - } 3.410 - else 3.411 - { currCore += 1; 3.412 - } 3.413 - } 3.414 - } 3.415 - } 3.416 - 3.417 - //Free Note: vector of sub-matrices does its own free-ing, even vec-params 3.418 - 3.419 -//TODO: timeToSpawnProbe = VMS__get_probe_by_name( "timeToSpawnProbe" ); 3.420 -// VMS__end_interval_on_probe( timeToSpawnProbe ); 3.421 - 3.422 - VCilk__sync( animatingPr ); 3.423 - 3.424 - //free the sub-matrices in Fn that called this one 3.425 - } 3.426 - 3.427 - 3.428 -/*Walk through the two slice-strucs, making sub-matrix strucs as go 3.429 - */ 3.430 -SubMatrix ** 3.431 -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 3.432 - Matrix *origMatrix, VirtProcr *animPr ) 3.433 - { 3.434 - int32 numRowIdxs, numColIdxs, rowIdx, colIdx; 3.435 - int32 startRow, endRow, startCol, endCol; 3.436 - int32 *rowStartVals, *colStartVals; 3.437 - int32 rowOffset; 3.438 - SubMatrix **subMatrices, *newSubMatrix; 3.439 - 3.440 - numRowIdxs = rowSlices->numVals; 3.441 - numColIdxs = colSlices->numVals; 3.442 - 3.443 - rowStartVals = rowSlices->startVals; 3.444 - colStartVals = colSlices->startVals; 3.445 - 3.446 - subMatrices = VCilk__malloc( numRowIdxs * numColIdxs *sizeof(SubMatrix *), 3.447 - animPr ); 3.448 - 3.449 - for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 3.450 - { 3.451 - rowOffset = rowIdx * numColIdxs; 3.452 - 3.453 - startRow = rowStartVals[rowIdx]; 3.454 - endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is 3.455 - // at last valid idx + 1 & is 3.456 - // 1 greater than end value 3.457 - for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 3.458 - { 3.459 - startCol = colStartVals[colIdx]; 3.460 - endCol = colStartVals[colIdx + 1] -1; 3.461 - 3.462 - newSubMatrix = VCilk__malloc( sizeof(SubMatrix), animPr ); 3.463 - newSubMatrix->numRows = endRow - startRow +1; 3.464 - newSubMatrix->numCols = endCol - startCol +1; 3.465 - newSubMatrix->origMatrix = origMatrix; 3.466 - newSubMatrix->origStartRow = startRow; 3.467 - newSubMatrix->origStartCol = startCol; 3.468 - newSubMatrix->alreadyCopied = FALSE; 3.469 - 3.470 - subMatrices[ rowOffset + colIdx ] = newSubMatrix; 3.471 - } 3.472 - } 3.473 - return subMatrices; 3.474 - } 3.475 - 3.476 -void 3.477 -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 3.478 - SubMatrix **subMatrices, VirtProcr *animPr ) 3.479 - { 3.480 - int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; 3.481 - SubMatrix *subMatrix; 3.482 - 3.483 - numRowIdxs = rowSlices->numVals; 3.484 - numColIdxs = colSlices->numVals; 3.485 - 3.486 - for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 3.487 - { 3.488 - rowOffset = rowIdx * numColIdxs; 3.489 - for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 3.490 - { 3.491 - subMatrix = subMatrices[ rowOffset + colIdx ]; 3.492 - if( subMatrix->alreadyCopied ) 3.493 - VCilk__free( subMatrix->array, animPr ); 3.494 - VCilk__free( subMatrix, animPr ); 3.495 - } 3.496 - } 3.497 - VCilk__free( subMatrices, animPr ); 3.498 - } 3.499 - 3.500 - 3.501 - 3.502 -SlicingStruc * 3.503 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 3.504 - VirtProcr *animPr ) 3.505 - { float32 residualAcc = 0; 3.506 - int numSlices, i, *startVals, sizeOfSlice, endCondition; 3.507 - SlicingStruc *slicingStruc = VCilk__malloc( sizeof(SlicingStruc), animPr); 3.508 - 3.509 - //calc size of matrix need to hold start vals -- 3.510 - numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); 3.511 - 3.512 - startVals = VCilk__malloc( (numSlices + 1) * sizeof(int32), animPr ); 3.513 - 3.514 - //Calc the upper limit of start value -- when get above this, end loop 3.515 - // by saving highest value of the matrix dimension to access, plus 1 3.516 - // as the start point of the imaginary slice following the last one 3.517 - //Plus 1 because go up to value but not include when process last slice 3.518 - //The stopping condition is half-a-size less than highest value because 3.519 - // don't want any pieces smaller than half the ideal size -- just tack 3.520 - // little ones onto end of last one 3.521 - endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size 3.522 - for( i = 0; startVal <= endVal; i++ ) 3.523 - { 3.524 - startVals[i] = startVal; 3.525 - residualAcc += idealSizeOfSide; 3.526 - sizeOfSlice = (int)residualAcc; 3.527 - residualAcc -= (float32)sizeOfSlice; 3.528 - startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. 3.529 - 3.530 - if( startVal > endCondition ) 3.531 - { startVal = endVal + 1; 3.532 - startVals[ i + 1 ] = startVal; 3.533 - } 3.534 - } 3.535 - 3.536 - slicingStruc->startVals = startVals; 3.537 - slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 3.538 - // which means is num sub-matrices in dim 3.539 - // also == idx of the fake start just above 3.540 - return slicingStruc; 3.541 - } 3.542 - 3.543 -void 3.544 -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ) 3.545 - { 3.546 - VCilk__free( slicingStruc->startVals, animPr ); 3.547 - VCilk__free( slicingStruc, animPr ); 3.548 - } 3.549 - 3.550 - 3.551 -int inline 3.552 -measureMatrixMultPrimitive( VirtProcr *animPr ) 3.553 - { 3.554 - int r, c, v, numCycles; 3.555 - float32 *res, *left, *right; 3.556 - 3.557 - //setup inputs 3.558 - left = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); 3.559 - right = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); 3.560 - res = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); 3.561 - 3.562 - for( r = 0; r < 5; r++ ) 3.563 - { 3.564 - for( c = 0; c < 5; c++ ) 3.565 - { 3.566 - left[ r * 5 + c ] = r; 3.567 - right[ r * 5 + c ] = c; 3.568 - } 3.569 - } 3.570 - 3.571 - //do primitive 3.572 - VCilk__start_primitive(); //for now, just takes time stamp 3.573 - for( r = 0; r < 5; r++ ) 3.574 - { 3.575 - for( c = 0; c < 5; c++ ) 3.576 - { 3.577 - for( v = 0; v < 5; v++ ) 3.578 - { 3.579 - res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; 3.580 - } 3.581 - } 3.582 - } 3.583 - numCycles = 3.584 - VCilk__end_primitive_and_give_cycles(); 3.585 - 3.586 - VCilk__free( left, animPr ); 3.587 - VCilk__free( right, animPr ); 3.588 - VCilk__free( res, animPr ); 3.589 - 3.590 - return numCycles; 3.591 - } 3.592 +/* 3.593 3.594 + * Copyright 2009 OpenSourceStewardshipFoundation.org 3.595 3.596 + * Licensed under GNU General Public License version 2 3.597 3.598 + * 3.599 3.600 + * Author: seanhalle@yahoo.com 3.601 3.602 + * 3.603 3.604 + */ 3.605 3.606 + 3.607 3.608 + 3.609 3.610 +#include "VCilk__Matrix_Mult.h" 3.611 3.612 +#include <math.h> 3.613 3.614 +#include <sys/time.h> 3.615 3.616 +#include <string.h> 3.617 3.618 + 3.619 3.620 + //The time to compute this many result values should equal the time to 3.621 3.622 + // perform this division on a matrix of size gives that many result calcs 3.623 3.624 + //IE, size this so that sequential time to calc equals divide time 3.625 3.626 + // find the value by experimenting -- but divide time and calc time scale 3.627 3.628 + // same way, so this value should remain valid across hardware 3.629 3.630 + //Divide time is about 800us on 2.4Ghz core2Quad laptop core 3.631 3.632 + //num cells is the cube of a side, when have two square matrices 3.633 3.634 +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 100000 /* about 46x46 */ 3.635 3.636 + 3.637 3.638 + 3.639 3.640 +//=========================================================================== 3.641 3.642 +int inline 3.643 3.644 +measureMatrixMultPrimitive( VirtProcr *animPr ); 3.645 3.646 + 3.647 3.648 +SlicingStrucCarrier * 3.649 3.650 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 3.651 3.652 + VirtProcr *animPr ); 3.653 3.654 + 3.655 3.656 +SlicingStruc * 3.657 3.658 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 3.659 3.660 + VirtProcr *animPr ); 3.661 3.662 + 3.663 3.664 +void 3.665 3.666 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ); 3.667 3.668 + 3.669 3.670 +SubMatrix ** 3.671 3.672 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 3.673 3.674 + Matrix *origMatrix, VirtProcr *animPr ); 3.675 3.676 + 3.677 3.678 +void 3.679 3.680 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 3.681 3.682 + SubMatrix **subMatrices, VirtProcr *animPr ); 3.683 3.684 + 3.685 3.686 +void 3.687 3.688 +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, 3.689 3.690 + SubMatrix **rightSubMatrices, 3.691 3.692 + int32 numRowIdxs, int32 numColIdxs, 3.693 3.694 + int32 numVecIdxs, 3.695 3.696 + float32 *resultArray, 3.697 3.698 + VirtProcr *animatingPr ); 3.699 3.700 + 3.701 3.702 +void 3.703 3.704 +makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, 3.705 3.706 + SlicingStrucCarrier *slicingStrucCarrier, 3.707 3.708 + float32 *resultArray, VirtProcr *animatingPr ); 3.709 3.710 + 3.711 3.712 +//=========================================================================== 3.713 3.714 + 3.715 3.716 +/*Divider creates one processor for every sub-matrix 3.717 3.718 + * It hands them: 3.719 3.720 + * the name of the result processor that they should send their results to, 3.721 3.722 + * the left and right matrices, and the rows and cols they should multiply 3.723 3.724 + * It first creates the result processor, then all the sub-matrixPair 3.725 3.726 + * processors, 3.727 3.728 + * then does a receive of a message from the result processor that gives 3.729 3.730 + * the divider ownership of the result matrix. 3.731 3.732 + * Finally, the divider returns the result matrix out of the VCilk system. 3.733 3.734 + * 3.735 3.736 + * Divider chooses the size of sub-matrices via an algorithm that tries to 3.737 3.738 + * keep the minimum work above a threshold. The threshold is machine- 3.739 3.740 + * dependent, so ask VCilk for min work-unit time to get a 3.741 3.742 + * given overhead 3.743 3.744 + * 3.745 3.746 + * Divide min work-unit cycles by measured-cycles for one matrix-cell 3.747 3.748 + * product -- gives the number of products need to have in min size 3.749 3.750 + * matrix. 3.751 3.752 + * 3.753 3.754 + * So then, take cubed root of this to get the size of a side of min sub- 3.755 3.756 + * matrix. That is the size of the ideal square sub-matrix -- so tile 3.757 3.758 + * up the two input matrices into ones as close as possible to that size, 3.759 3.760 + * and create the pairs of sub-matrices. 3.761 3.762 + * 3.763 3.764 + *======================== STRATEGIC OVERVIEW ======================= 3.765 3.766 + * 3.767 3.768 + *This division is a bit tricky, because have to create things in advance 3.769 3.770 + * that it's not at first obvious need to be created.. 3.771 3.772 + * 3.773 3.774 + *First slice up each dimension -- three of them.. this is because will have 3.775 3.776 + * to create the sub-matrix's data-structures before pairing the sub-matrices 3.777 3.778 + * with each other -- so, have three dimensions to slice up before can 3.779 3.780 + * create the sub-matrix data-strucs -- also, have to be certain that the 3.781 3.782 + * cols of the left input have the exact same slicing as the rows of the 3.783 3.784 + * left matrix, so just to be sure, do the slicing calc once, then use it 3.785 3.786 + * for both. 3.787 3.788 + * 3.789 3.790 + *So, goes like this: 3.791 3.792 + *1) calculate the start & end values of each dimension in each matrix. 3.793 3.794 + *2) use those values to create sub-matrix structures 3.795 3.796 + *3) combine sub-matrices into pairs, as the tasks to perform. 3.797 3.798 + * 3.799 3.800 + *Have to calculate separately from creating the sub-matrices because of the 3.801 3.802 + * nature of the nesting -- would either end up creating the same sub-matrix 3.803 3.804 + * multiple times, or else would have to put in detection of whether had 3.805 3.806 + * made a particular one already if tried to combine steps 1 and 2. 3.807 3.808 + * 3.809 3.810 + *Step 3 has to be separate because of the nesting, as well -- same reason, 3.811 3.812 + * would either create same sub-matrix multiple times, or else have to 3.813 3.814 + * add detection of whether was already created. 3.815 3.816 + * 3.817 3.818 + *Another way to look at it: there's one level of loop to divide dimensions, 3.819 3.820 + * two levels of nesting to create sub-matrices, and three levels to pair 3.821 3.822 + * up the sub-matrices. 3.823 3.824 + */ 3.825 3.826 + 3.827 3.828 +void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, 3.829 3.830 + VirtProcr *animPr ) 3.831 3.832 + { 3.833 3.834 + DividerParams *dividerParams; 3.835 3.836 + ResultsParams *resultsParams; 3.837 3.838 + Matrix *leftMatrix, *rightMatrix, *resultMatrix; 3.839 3.840 + void *msg; 3.841 3.842 + SlicingStrucCarrier *slicingStrucCarrier; 3.843 3.844 + float32 *resultArray; //points to array to be put inside result 3.845 3.846 + // matrix 3.847 3.848 + 3.849 3.850 + DEBUG( dbgAppFlow, "start divide\n") 3.851 3.852 + 3.853 3.854 + int32 3.855 3.856 + divideProbe = VMS__create_single_interval_probe( "divideProbe", 3.857 3.858 + animPr ); 3.859 3.860 + VMS__record_sched_choice_into_probe( divideProbe, animPr ); 3.861 3.862 + VMS__record_interval_start_in_probe( divideProbe ); 3.863 3.864 + 3.865 3.866 + //=========== Setup -- make local copies of ptd-to-things, malloc, aso 3.867 3.868 + int32 numResRows, numResCols, vectLength; 3.869 3.870 + 3.871 3.872 + dividerParams = (DividerParams *)_dividerParams; 3.873 3.874 + 3.875 3.876 + leftMatrix = dividerParams->leftMatrix; 3.877 3.878 + rightMatrix = dividerParams->rightMatrix; 3.879 3.880 + 3.881 3.882 + vectLength = leftMatrix->numCols; 3.883 3.884 + numResRows = leftMatrix->numRows; 3.885 3.886 + numResCols = rightMatrix->numCols; 3.887 3.888 + resultArray = dividerParams->resultMatrix->array; 3.889 3.890 + 3.891 3.892 + //zero the result array 3.893 3.894 + memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); 3.895 3.896 + 3.897 3.898 + 3.899 3.900 + //============== Do either sequential mult or do division ============== 3.901 3.902 + 3.903 3.904 + //Check if input matrices too small -- if yes, just do sequential 3.905 3.906 + //Cutoff is determined by overhead of this divider -- relatively 3.907 3.908 + // machine-independent 3.909 3.910 + if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * 3.911 3.912 + (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) 3.913 3.914 + { int32 vectLength; 3.915 3.916 + 3.917 3.918 + //====== Do sequential multiply on a single core 3.919 3.920 + DEBUG( dbgAppFlow, "doing sequential") 3.921 3.922 + 3.923 3.924 + //transpose the right matrix 3.925 3.926 + float32 * 3.927 3.928 + transRightArray = VCilk__malloc( rightMatrix->numRows * 3.929 3.930 + rightMatrix->numCols * 3.931 3.932 + sizeof(float32), animPr ); 3.933 3.934 + 3.935 3.936 + //copy values from orig matrix to local 3.937 3.938 + copyTranspose( rightMatrix->numRows, rightMatrix->numCols, 3.939 3.940 + 0, 0, rightMatrix->numRows, 3.941 3.942 + transRightArray, rightMatrix->array ); 3.943 3.944 + 3.945 3.946 + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 3.947 3.948 + leftMatrix->array, transRightArray, 3.949 3.950 + resultArray ); 3.951 3.952 + } 3.953 3.954 + else 3.955 3.956 + { 3.957 3.958 + //====== Do parallel multiply across cores 3.959 3.960 + 3.961 3.962 + //Calc the ideal size of sub-matrix and slice up the dimensions of 3.963 3.964 + // the two matrices. 3.965 3.966 + //The ideal size is the one takes the number of cycles to calculate 3.967 3.968 + // such that calc time is equal or greater than min work-unit size 3.969 3.970 + slicingStrucCarrier = 3.971 3.972 + calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr); 3.973 3.974 + 3.975 3.976 + 3.977 3.978 + 3.979 3.980 + //Make the sub-matrices, and pair them up, then spawn processors to 3.981 3.982 + // calc product of each pair. 3.983 3.984 + makeSubMatricesAndSpawnAndSync( leftMatrix, rightMatrix, 3.985 3.986 + slicingStrucCarrier, 3.987 3.988 + resultArray, animPr); 3.989 3.990 + //The result array will get filled in by the spawned children 3.991 3.992 + } 3.993 3.994 + 3.995 3.996 + 3.997 3.998 + //=============== Work done -- send results back ================= 3.999 3.1000 + 3.1001 3.1002 + 3.1003 3.1004 + //results have been saved into an array that was made outside the VMS 3.1005 3.1006 + // system, by entry-point Fn, and passed in through dividerParams. 3.1007 3.1008 + //So, nothing to do to send results back -- they're seen by side-effect 3.1009 3.1010 + 3.1011 3.1012 + DEBUG( dbgAppFlow, "*** end divide ***\n") 3.1013 3.1014 + 3.1015 3.1016 + VMS__record_interval_end_in_probe( divideProbe ); 3.1017 3.1018 + VMS__print_stats_of_all_probes(); 3.1019 3.1020 + 3.1021 3.1022 + VCilk__dissipate_procr( animPr ); //all procrs dissipate self at end 3.1023 3.1024 + //when all of the processors have dissipated, the "create seed and do 3.1025 3.1026 + // work" call in the entry point function returns 3.1027 3.1028 + } 3.1029 3.1030 + 3.1031 3.1032 + 3.1033 3.1034 +SlicingStrucCarrier * 3.1035 3.1036 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 3.1037 3.1038 + VirtProcr *animPr ) 3.1039 3.1040 +{ 3.1041 3.1042 + float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; 3.1043 3.1044 + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 3.1045 3.1046 + SlicingStrucCarrier *slicingStrucCarrier = 3.1047 3.1048 + VCilk__malloc(sizeof(SlicingStrucCarrier), animPr ); 3.1049 3.1050 + 3.1051 3.1052 + int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; 3.1053 3.1054 + float64 numPrimitiveOpsInMinWorkUnit; 3.1055 3.1056 + 3.1057 3.1058 + 3.1059 3.1060 + //======= Calc ideal size of min-sized sub-matrix ======== 3.1061 3.1062 + 3.1063 3.1064 + //ask VCilk for the number of cycles of the minimum work unit, at given 3.1065 3.1066 + // percent overhead then add a guess at overhead from this divider 3.1067 3.1068 + minWorkUnitCycles = VCilk__giveMinWorkUnitCycles( .05 ); 3.1069 3.1070 + 3.1071 3.1072 + //ask VCilk for number of cycles of the "primitive" op of matrix mult 3.1073 3.1074 + primitiveCycles = measureMatrixMultPrimitive( animPr ); 3.1075 3.1076 + 3.1077 3.1078 + numPrimitiveOpsInMinWorkUnit = 3.1079 3.1080 + (float64)minWorkUnitCycles / (float64)primitiveCycles; 3.1081 3.1082 + 3.1083 3.1084 + //take cubed root -- that's number of these in a "side" of sub-matrix 3.1085 3.1086 + // then multiply by 5 because the primitive is 5x5 3.1087 3.1088 + idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); 3.1089 3.1090 + 3.1091 3.1092 + idealNumWorkUnits = VCilk__giveIdealNumWorkUnits(); 3.1093 3.1094 + 3.1095 3.1096 + idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); 3.1097 3.1098 + idealSizeOfSide2 *= 0.8; //finer granularity to help load balance 3.1099 3.1100 + 3.1101 3.1102 + if( idealSizeOfSide1 > idealSizeOfSide2 ) 3.1103 3.1104 + idealSizeOfSide = idealSizeOfSide1; 3.1105 3.1106 + else 3.1107 3.1108 + idealSizeOfSide = idealSizeOfSide2; 3.1109 3.1110 + 3.1111 3.1112 + //The multiply inner loop blocks the array to fit into L1 cache 3.1113 3.1114 +// if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK; 3.1115 3.1116 + 3.1117 3.1118 + //============ Slice up dimensions, now that know target size =========== 3.1119 3.1120 + 3.1121 3.1122 + //Tell the slicer the target size of a side (floating pt), the start 3.1123 3.1124 + // value to start slicing at, and the end value to stop slicing at 3.1125 3.1126 + //It returns an array of start value of each chunk, plus number of them 3.1127 3.1128 + int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol; 3.1129 3.1130 + startLeftRow = 0; 3.1131 3.1132 + endLeftRow = leftMatrix->numRows -1; 3.1133 3.1134 + startVec = 0; 3.1135 3.1136 + endVec = leftMatrix->numCols -1; 3.1137 3.1138 + startRightCol = 0; 3.1139 3.1140 + endRightCol = rightMatrix->numCols -1; 3.1141 3.1142 + 3.1143 3.1144 + leftRowSlices = 3.1145 3.1146 + sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr ); 3.1147 3.1148 + 3.1149 3.1150 + vecSlices = 3.1151 3.1152 + sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr ); 3.1153 3.1154 + 3.1155 3.1156 + rightColSlices = 3.1157 3.1158 + sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr); 3.1159 3.1160 + 3.1161 3.1162 + slicingStrucCarrier->leftRowSlices = leftRowSlices; 3.1163 3.1164 + slicingStrucCarrier->vecSlices = vecSlices; 3.1165 3.1166 + slicingStrucCarrier->rightColSlices = rightColSlices; 3.1167 3.1168 + 3.1169 3.1170 + DEBUG1( dbgAppFlow, "leftRowSlices %d | ", leftRowSlices->numVals ); 3.1171 3.1172 + DEBUG1( dbgAppFlow, "rightColSlices %d | ",rightColSlices->numVals); 3.1173 3.1174 + DEBUG1( dbgAppFlow, "vecSlices %d\n", vecSlices->numVals ); 3.1175 3.1176 + return slicingStrucCarrier; 3.1177 3.1178 +} 3.1179 3.1180 + 3.1181 3.1182 + 3.1183 3.1184 +void inline 3.1185 3.1186 +makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, 3.1187 3.1188 + SlicingStrucCarrier *slicingStrucCarrier, 3.1189 3.1190 + float32 *resultArray, VirtProcr *animPr ) 3.1191 3.1192 + { 3.1193 3.1194 + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 3.1195 3.1196 + 3.1197 3.1198 + leftRowSlices = slicingStrucCarrier->leftRowSlices; 3.1199 3.1200 + vecSlices = slicingStrucCarrier->vecSlices; 3.1201 3.1202 + rightColSlices = slicingStrucCarrier->rightColSlices; 3.1203 3.1204 + VCilk__free( slicingStrucCarrier, animPr ); 3.1205 3.1206 + 3.1207 3.1208 + //================ Make sub-matrices, given the slicing ================ 3.1209 3.1210 + SubMatrix **leftSubMatrices, **rightSubMatrices; 3.1211 3.1212 + leftSubMatrices = 3.1213 3.1214 + createSubMatrices( leftRowSlices, vecSlices, 3.1215 3.1216 + leftMatrix, animPr ); 3.1217 3.1218 + rightSubMatrices = 3.1219 3.1220 + createSubMatrices( vecSlices, rightColSlices, 3.1221 3.1222 + rightMatrix, animPr ); 3.1223 3.1224 + 3.1225 3.1226 + freeSlicingStruc( leftRowSlices, animPr ); 3.1227 3.1228 + freeSlicingStruc( vecSlices, animPr ); 3.1229 3.1230 + freeSlicingStruc( rightColSlices, animPr ); 3.1231 3.1232 + 3.1233 3.1234 + //============== pair the sub-matrices and make processors ============== 3.1235 3.1236 + int32 numRowIdxs, numColIdxs, numVecIdxs; 3.1237 3.1238 + 3.1239 3.1240 + numRowIdxs = leftRowSlices->numVals; 3.1241 3.1242 + numColIdxs = rightColSlices->numVals; 3.1243 3.1244 + numVecIdxs = vecSlices->numVals; 3.1245 3.1246 + pairUpSubMatricesAndSpawnAndSync( leftSubMatrices, rightSubMatrices, 3.1247 3.1248 + numRowIdxs, numColIdxs, numVecIdxs, 3.1249 3.1250 + resultArray, 3.1251 3.1252 + animPr ); 3.1253 3.1254 + //It syncs inside, so know all work is done now: free the sub-matrices 3.1255 3.1256 + freeSubMatrices( leftRowSlices, vecSlices, leftSubMatrices, animPr ); 3.1257 3.1258 + freeSubMatrices( vecSlices, rightColSlices, rightSubMatrices, animPr ); 3.1259 3.1260 + } 3.1261 3.1262 + 3.1263 3.1264 + 3.1265 3.1266 + 3.1267 3.1268 + 3.1269 3.1270 +/* numRows*colsPerRow/numCores = numToPutOntoEachCore; 3.1271 3.1272 + * put all from a given row onto same core, until exhaust allotment for that 3.1273 3.1274 + * core 3.1275 3.1276 + * 3.1277 3.1278 + */ 3.1279 3.1280 +void inline 3.1281 3.1282 +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, 3.1283 3.1284 + SubMatrix **rightSubMatrices, 3.1285 3.1286 + int32 numRowIdxs, int32 numColIdxs, 3.1287 3.1288 + int32 numVecIdxs, 3.1289 3.1290 + float32 *resultArray, 3.1291 3.1292 + VirtProcr *animatingPr ) 3.1293 3.1294 + { 3.1295 3.1296 + int32 resRowIdx, resColIdx; 3.1297 3.1298 + int32 numLeftColIdxs, numRightColIdxs; 3.1299 3.1300 + int32 leftRowIdxOffset; 3.1301 3.1302 + VecParams *vecParams; 3.1303 3.1304 + float32 numToPutOntoEachCore, leftOverFraction; 3.1305 3.1306 + int32 numCores, currCore, numOnCurrCore; 3.1307 3.1308 + 3.1309 3.1310 + numLeftColIdxs = numColIdxs; 3.1311 3.1312 + numRightColIdxs = numVecIdxs; 3.1313 3.1314 + 3.1315 3.1316 + numCores = VCilk__give_number_of_cores_to_spawn_onto(); 3.1317 3.1318 + 3.1319 3.1320 + numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; 3.1321 3.1322 + leftOverFraction = 0; 3.1323 3.1324 + numOnCurrCore = 0; 3.1325 3.1326 + currCore = 0; 3.1327 3.1328 + 3.1329 3.1330 + for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) 3.1331 3.1332 + { 3.1333 3.1334 + leftRowIdxOffset = resRowIdx * numLeftColIdxs; 3.1335 3.1336 + 3.1337 3.1338 + for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) 3.1339 3.1340 + { 3.1341 3.1342 + vecParams = VCilk__malloc( sizeof(VecParams), animatingPr ); 3.1343 3.1344 + 3.1345 3.1346 + vecParams->numVecIdxs = numVecIdxs; 3.1347 3.1348 + vecParams->numRightColIdxs = numRightColIdxs; 3.1349 3.1350 + vecParams->leftRowIdxOffset = leftRowIdxOffset; 3.1351 3.1352 + vecParams->resColIdx = resColIdx; 3.1353 3.1354 + vecParams->leftSubMatrices = leftSubMatrices; 3.1355 3.1356 + vecParams->rightSubMatrices = rightSubMatrices; 3.1357 3.1358 + vecParams->resultArray = resultArray; 3.1359 3.1360 + vecParams->coreToRunOn = currCore; 3.1361 3.1362 + 3.1363 3.1364 + VCilk__spawn( currCore, &calcVectorOfSubMatrices, vecParams, 3.1365 3.1366 + animatingPr ); 3.1367 3.1368 + 3.1369 3.1370 + numOnCurrCore += 1; 3.1371 3.1372 + if( numOnCurrCore + leftOverFraction >= numToPutOntoEachCore - 1 ) 3.1373 3.1374 + { 3.1375 3.1376 + //deal with fractional part, to ensure that imbalance is 1 max 3.1377 3.1378 + // IE, core with most has only 1 more than core with least 3.1379 3.1380 + leftOverFraction += numToPutOntoEachCore - numOnCurrCore; 3.1381 3.1382 + if( leftOverFraction >= 1 ) 3.1383 3.1384 + { leftOverFraction -= 1; 3.1385 3.1386 + numOnCurrCore = -1; 3.1387 3.1388 + } 3.1389 3.1390 + else 3.1391 3.1392 + { numOnCurrCore = 0; 3.1393 3.1394 + } 3.1395 3.1396 + //Move to next core, max core-value to incr to is numCores -1 3.1397 3.1398 + if( currCore >= numCores -1 ) 3.1399 3.1400 + { currCore = 0; 3.1401 3.1402 + } 3.1403 3.1404 + else 3.1405 3.1406 + { currCore += 1; 3.1407 3.1408 + } 3.1409 3.1410 + } 3.1411 3.1412 + } 3.1413 3.1414 + } 3.1415 3.1416 + 3.1417 3.1418 + //Free Note: vector of sub-matrices does its own free-ing, even vec-params 3.1419 3.1420 + 3.1421 3.1422 +//TODO: timeToSpawnProbe = VMS__get_probe_by_name( "timeToSpawnProbe" ); 3.1423 3.1424 +// VMS__end_interval_on_probe( timeToSpawnProbe ); 3.1425 3.1426 + 3.1427 3.1428 + VCilk__sync( animatingPr ); 3.1429 3.1430 + 3.1431 3.1432 + //free the sub-matrices in Fn that called this one 3.1433 3.1434 + } 3.1435 3.1436 + 3.1437 3.1438 + 3.1439 3.1440 +/*Walk through the two slice-strucs, making sub-matrix strucs as go 3.1441 3.1442 + */ 3.1443 3.1444 +SubMatrix ** 3.1445 3.1446 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 3.1447 3.1448 + Matrix *origMatrix, VirtProcr *animPr ) 3.1449 3.1450 + { 3.1451 3.1452 + int32 numRowIdxs, numColIdxs, rowIdx, colIdx; 3.1453 3.1454 + int32 startRow, endRow, startCol, endCol; 3.1455 3.1456 + int32 *rowStartVals, *colStartVals; 3.1457 3.1458 + int32 rowOffset; 3.1459 3.1460 + SubMatrix **subMatrices, *newSubMatrix; 3.1461 3.1462 + 3.1463 3.1464 + numRowIdxs = rowSlices->numVals; 3.1465 3.1466 + numColIdxs = colSlices->numVals; 3.1467 3.1468 + 3.1469 3.1470 + rowStartVals = rowSlices->startVals; 3.1471 3.1472 + colStartVals = colSlices->startVals; 3.1473 3.1474 + 3.1475 3.1476 + subMatrices = VCilk__malloc( numRowIdxs * numColIdxs *sizeof(SubMatrix *), 3.1477 3.1478 + animPr ); 3.1479 3.1480 + 3.1481 3.1482 + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 3.1483 3.1484 + { 3.1485 3.1486 + rowOffset = rowIdx * numColIdxs; 3.1487 3.1488 + 3.1489 3.1490 + startRow = rowStartVals[rowIdx]; 3.1491 3.1492 + endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is 3.1493 3.1494 + // at last valid idx + 1 & is 3.1495 3.1496 + // 1 greater than end value 3.1497 3.1498 + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 3.1499 3.1500 + { 3.1501 3.1502 + startCol = colStartVals[colIdx]; 3.1503 3.1504 + endCol = colStartVals[colIdx + 1] -1; 3.1505 3.1506 + 3.1507 3.1508 + newSubMatrix = VCilk__malloc( sizeof(SubMatrix), animPr ); 3.1509 3.1510 + newSubMatrix->numRows = endRow - startRow +1; 3.1511 3.1512 + newSubMatrix->numCols = endCol - startCol +1; 3.1513 3.1514 + newSubMatrix->origMatrix = origMatrix; 3.1515 3.1516 + newSubMatrix->origStartRow = startRow; 3.1517 3.1518 + newSubMatrix->origStartCol = startCol; 3.1519 3.1520 + newSubMatrix->alreadyCopied = FALSE; 3.1521 3.1522 + 3.1523 3.1524 + subMatrices[ rowOffset + colIdx ] = newSubMatrix; 3.1525 3.1526 + } 3.1527 3.1528 + } 3.1529 3.1530 + return subMatrices; 3.1531 3.1532 + } 3.1533 3.1534 + 3.1535 3.1536 +void 3.1537 3.1538 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 3.1539 3.1540 + SubMatrix **subMatrices, VirtProcr *animPr ) 3.1541 3.1542 + { 3.1543 3.1544 + int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; 3.1545 3.1546 + SubMatrix *subMatrix; 3.1547 3.1548 + 3.1549 3.1550 + numRowIdxs = rowSlices->numVals; 3.1551 3.1552 + numColIdxs = colSlices->numVals; 3.1553 3.1554 + 3.1555 3.1556 + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 3.1557 3.1558 + { 3.1559 3.1560 + rowOffset = rowIdx * numColIdxs; 3.1561 3.1562 + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 3.1563 3.1564 + { 3.1565 3.1566 + subMatrix = subMatrices[ rowOffset + colIdx ]; 3.1567 3.1568 + if( subMatrix->alreadyCopied ) 3.1569 3.1570 + VCilk__free( subMatrix->array, animPr ); 3.1571 3.1572 + VCilk__free( subMatrix, animPr ); 3.1573 3.1574 + } 3.1575 3.1576 + } 3.1577 3.1578 + VCilk__free( subMatrices, animPr ); 3.1579 3.1580 + } 3.1581 3.1582 + 3.1583 3.1584 + 3.1585 3.1586 + 3.1587 3.1588 +SlicingStruc * 3.1589 3.1590 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 3.1591 3.1592 + VirtProcr *animPr ) 3.1593 3.1594 + { float32 residualAcc = 0; 3.1595 3.1596 + int numSlices, i, *startVals, sizeOfSlice, endCondition; 3.1597 3.1598 + SlicingStruc *slicingStruc = VCilk__malloc( sizeof(SlicingStruc), animPr); 3.1599 3.1600 + 3.1601 3.1602 + //calc size of matrix need to hold start vals -- 3.1603 3.1604 + numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); 3.1605 3.1606 + 3.1607 3.1608 + startVals = VCilk__malloc( (numSlices + 1) * sizeof(int32), animPr ); 3.1609 3.1610 + 3.1611 3.1612 + //Calc the upper limit of start value -- when get above this, end loop 3.1613 3.1614 + // by saving highest value of the matrix dimension to access, plus 1 3.1615 3.1616 + // as the start point of the imaginary slice following the last one 3.1617 3.1618 + //Plus 1 because go up to value but not include when process last slice 3.1619 3.1620 + //The stopping condition is half-a-size less than highest value because 3.1621 3.1622 + // don't want any pieces smaller than half the ideal size -- just tack 3.1623 3.1624 + // little ones onto end of last one 3.1625 3.1626 + endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size 3.1627 3.1628 + for( i = 0; startVal <= endVal; i++ ) 3.1629 3.1630 + { 3.1631 3.1632 + startVals[i] = startVal; 3.1633 3.1634 + residualAcc += idealSizeOfSide; 3.1635 3.1636 + sizeOfSlice = (int)residualAcc; 3.1637 3.1638 + residualAcc -= (float32)sizeOfSlice; 3.1639 3.1640 + startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. 3.1641 3.1642 + 3.1643 3.1644 + if( startVal > endCondition ) 3.1645 3.1646 + { startVal = endVal + 1; 3.1647 3.1648 + startVals[ i + 1 ] = startVal; 3.1649 3.1650 + } 3.1651 3.1652 + } 3.1653 3.1654 + 3.1655 3.1656 + slicingStruc->startVals = startVals; 3.1657 3.1658 + slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 3.1659 3.1660 + // which means is num sub-matrices in dim 3.1661 3.1662 + // also == idx of the fake start just above 3.1663 3.1664 + return slicingStruc; 3.1665 3.1666 + } 3.1667 3.1668 + 3.1669 3.1670 +void 3.1671 3.1672 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ) 3.1673 3.1674 + { 3.1675 3.1676 + VCilk__free( slicingStruc->startVals, animPr ); 3.1677 3.1678 + VCilk__free( slicingStruc, animPr ); 3.1679 3.1680 + } 3.1681 3.1682 + 3.1683 3.1684 + 3.1685 3.1686 +int inline 3.1687 3.1688 +measureMatrixMultPrimitive( VirtProcr *animPr ) 3.1689 3.1690 + { 3.1691 3.1692 + int r, c, v, numCycles; 3.1693 3.1694 + float32 *res, *left, *right; 3.1695 3.1696 + 3.1697 3.1698 + //setup inputs 3.1699 3.1700 + left = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); 3.1701 3.1702 + right = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); 3.1703 3.1704 + res = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); 3.1705 3.1706 + 3.1707 3.1708 + for( r = 0; r < 5; r++ ) 3.1709 3.1710 + { 3.1711 3.1712 + for( c = 0; c < 5; c++ ) 3.1713 3.1714 + { 3.1715 3.1716 + left[ r * 5 + c ] = r; 3.1717 3.1718 + right[ r * 5 + c ] = c; 3.1719 3.1720 + } 3.1721 3.1722 + } 3.1723 3.1724 + 3.1725 3.1726 + //do primitive 3.1727 3.1728 + VCilk__start_primitive(); //for now, just takes time stamp 3.1729 3.1730 + for( r = 0; r < 5; r++ ) 3.1731 3.1732 + { 3.1733 3.1734 + for( c = 0; c < 5; c++ ) 3.1735 3.1736 + { 3.1737 3.1738 + for( v = 0; v < 5; v++ ) 3.1739 3.1740 + { 3.1741 3.1742 + res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; 3.1743 3.1744 + } 3.1745 3.1746 + } 3.1747 3.1748 + } 3.1749 3.1750 + numCycles = 3.1751 3.1752 + VCilk__end_primitive_and_give_cycles(); 3.1753 3.1754 + 3.1755 3.1756 + VCilk__free( left, animPr ); 3.1757 3.1758 + VCilk__free( right, animPr ); 3.1759 3.1760 + VCilk__free( res, animPr ); 3.1761 3.1762 + 3.1763 3.1764 + return numCycles; 3.1765 3.1766 + } 3.1767
4.1 --- a/src/Application/VCilk__Matrix_Mult/EntryPoint.c Mon Nov 15 12:08:19 2010 -0800 4.2 +++ b/src/Application/VCilk__Matrix_Mult/EntryPoint.c Wed May 11 15:40:54 2011 +0200 4.3 @@ -1,58 +1,58 @@ 4.4 -/* 4.5 - * Copyright 2009 OpenSourceStewardshipFoundation.org 4.6 - * Licensed under GNU General Public License version 2 4.7 - * 4.8 - * Author: seanhalle@yahoo.com 4.9 - * 4.10 - */ 4.11 - 4.12 -#include <math.h> 4.13 - 4.14 -#include "VCilk__Matrix_Mult.h" 4.15 - 4.16 - 4.17 - 4.18 -/*Every VCilk system has an "entry point" function that creates the first 4.19 - * processor, which starts the chain of creating more processors.. 4.20 - * eventually all of the processors will dissipate themselves, and 4.21 - * return. 4.22 - * 4.23 - *This entry-point function follows the same pattern as all entry-point 4.24 - * functions do: 4.25 - *1) it creates the params for the seed processor, from the 4.26 - * parameters passed into the entry-point function 4.27 - *2) it calls VCilk__create_seed_procr_and_do_work 4.28 - *3) it gets the return value from the params struc, frees the params struc, 4.29 - * and returns the value from the function 4.30 - * 4.31 - */ 4.32 -Matrix * 4.33 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) 4.34 - { Matrix *resMatrix; 4.35 - DividerParams *dividerParams; 4.36 - int32 numResRows, numResCols; 4.37 - 4.38 - 4.39 - dividerParams = malloc( sizeof( DividerParams ) ); 4.40 - dividerParams->leftMatrix = leftMatrix; 4.41 - dividerParams->rightMatrix = rightMatrix; 4.42 - 4.43 - numResRows = leftMatrix->numRows; 4.44 - numResCols = rightMatrix->numCols; 4.45 - resMatrix = malloc( sizeof(Matrix) ); 4.46 - resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); 4.47 - resMatrix->numCols = rightMatrix->numCols; 4.48 - resMatrix->numRows = leftMatrix->numRows; 4.49 - 4.50 - 4.51 - dividerParams->resultMatrix = resMatrix; 4.52 - 4.53 - //create divider processor, start doing the work, and wait till done 4.54 - //This function is the "border crossing" between normal code and VCilk 4.55 - VCilk__create_seed_procr_and_do_work( ÷WorkIntoSubMatrixPairProcrs, 4.56 - dividerParams ); 4.57 - 4.58 - //get result matrix and return it 4.59 - free( dividerParams ); 4.60 - return resMatrix; 4.61 - } 4.62 +/* 4.63 4.64 + * Copyright 2009 OpenSourceStewardshipFoundation.org 4.65 4.66 + * Licensed under GNU General Public License version 2 4.67 4.68 + * 4.69 4.70 + * Author: seanhalle@yahoo.com 4.71 4.72 + * 4.73 4.74 + */ 4.75 4.76 + 4.77 4.78 +#include <math.h> 4.79 4.80 + 4.81 4.82 +#include "VCilk__Matrix_Mult.h" 4.83 4.84 + 4.85 4.86 + 4.87 4.88 + 4.89 4.90 +/*Every VCilk system has an "entry point" function that creates the first 4.91 4.92 + * processor, which starts the chain of creating more processors.. 4.93 4.94 + * eventually all of the processors will dissipate themselves, and 4.95 4.96 + * return. 4.97 4.98 + * 4.99 4.100 + *This entry-point function follows the same pattern as all entry-point 4.101 4.102 + * functions do: 4.103 4.104 + *1) it creates the params for the seed processor, from the 4.105 4.106 + * parameters passed into the entry-point function 4.107 4.108 + *2) it calls VCilk__create_seed_procr_and_do_work 4.109 4.110 + *3) it gets the return value from the params struc, frees the params struc, 4.111 4.112 + * and returns the value from the function 4.113 4.114 + * 4.115 4.116 + */ 4.117 4.118 +Matrix * 4.119 4.120 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) 4.121 4.122 + { Matrix *resMatrix; 4.123 4.124 + DividerParams *dividerParams; 4.125 4.126 + int32 numResRows, numResCols; 4.127 4.128 + 4.129 4.130 + 4.131 4.132 + dividerParams = malloc( sizeof( DividerParams ) ); 4.133 4.134 + dividerParams->leftMatrix = leftMatrix; 4.135 4.136 + dividerParams->rightMatrix = rightMatrix; 4.137 4.138 + 4.139 4.140 + numResRows = leftMatrix->numRows; 4.141 4.142 + numResCols = rightMatrix->numCols; 4.143 4.144 + resMatrix = malloc( sizeof(Matrix) ); 4.145 4.146 + resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); 4.147 4.148 + resMatrix->numCols = rightMatrix->numCols; 4.149 4.150 + resMatrix->numRows = leftMatrix->numRows; 4.151 4.152 + 4.153 4.154 + 4.155 4.156 + dividerParams->resultMatrix = resMatrix; 4.157 4.158 + 4.159 4.160 + //create divider processor, start doing the work, and wait till done 4.161 4.162 + //This function is the "border crossing" between normal code and VCilk 4.163 4.164 + VCilk__create_seed_procr_and_do_work( ÷WorkIntoSubMatrixPairProcrs, 4.165 4.166 + dividerParams ); 4.167 4.168 + 4.169 4.170 + //get result matrix and return it 4.171 4.172 + free( dividerParams ); 4.173 4.174 + return resMatrix; 4.175 4.176 + } 4.177
5.1 --- a/src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h Mon Nov 15 12:08:19 2010 -0800 5.2 +++ b/src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h Wed May 11 15:40:54 2011 +0200 5.3 @@ -1,104 +1,106 @@ 5.4 -/* 5.5 - * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 5.6 - * Licensed under GNU General Public License version 2 5.7 - */ 5.8 - 5.9 -#ifndef _VCilk_MATRIX_MULT_H_ 5.10 -#define _VCilk_MATRIX_MULT_H_ 5.11 - 5.12 -#include <stdio.h> 5.13 - 5.14 -#include "../../VCilk_lib/VCilk.h" 5.15 -#include "../Matrix_Mult.h" 5.16 -#include "../../VCilk_lib/VMS/VMS.h" 5.17 - 5.18 - 5.19 -//=============================== Defines ============================== 5.20 -#define ROWS_IN_BLOCK 32 5.21 -#define COLS_IN_BLOCK 32 5.22 -#define VEC_IN_BLOCK 32 5.23 - 5.24 -#define copyMatrixSingleton 1 5.25 -#define copyTransposeSingleton 2 5.26 - 5.27 -//============================== Structures ============================== 5.28 -typedef struct 5.29 - { 5.30 - Matrix *leftMatrix; 5.31 - Matrix *rightMatrix; 5.32 - Matrix *resultMatrix; 5.33 - 5.34 - TSCount numTSCsToExe; 5.35 - } 5.36 -DividerParams; 5.37 - 5.38 -typedef struct 5.39 - { 5.40 - VirtProcr *dividerPr; 5.41 - int numRows; 5.42 - int numCols; 5.43 - int numSubMatrixPairs; 5.44 - } 5.45 -ResultsParams; 5.46 - 5.47 -typedef 5.48 -struct 5.49 - { int32 numRows; 5.50 - int32 numCols; 5.51 - Matrix *origMatrix; 5.52 - int32 origStartRow; 5.53 - int32 origStartCol; 5.54 - int32 alreadyCopied; 5.55 - float32 *array; //2D, but dynamically sized, so use addr arith 5.56 - } 5.57 -SubMatrix; 5.58 - 5.59 -typedef struct 5.60 - { VirtProcr *resultPr; 5.61 - SubMatrix *leftSubMatrix; 5.62 - SubMatrix *rightSubMatrix; 5.63 - float32 *partialResultArray; 5.64 - } 5.65 -SMPairParams; 5.66 - 5.67 -typedef 5.68 -struct 5.69 - { int32 numVals; 5.70 - int32 *startVals; 5.71 - } 5.72 -SlicingStruc; 5.73 - 5.74 -typedef 5.75 -struct 5.76 - { 5.77 - SlicingStruc *leftRowSlices; 5.78 - SlicingStruc *vecSlices; 5.79 - SlicingStruc *rightColSlices; 5.80 - } 5.81 -SlicingStrucCarrier; 5.82 - 5.83 -typedef struct 5.84 - { 5.85 - int32 numVecIdxs; 5.86 - int32 numRightColIdxs; 5.87 - int32 leftRowIdxOffset; 5.88 - int32 resColIdx; 5.89 - SubMatrix **leftSubMatrices; 5.90 - SubMatrix **rightSubMatrices; 5.91 - float32 *resultArray; 5.92 - int32 coreToRunOn; 5.93 - } 5.94 -VecParams; 5.95 - 5.96 -//============================= Processor Functions ========================= 5.97 -void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr ); 5.98 -void calcSubMatrixProduct( void *data, VirtProcr *animatingPr ); 5.99 -void calcVectorOfSubMatrices( void *_vecParams, VirtProcr *animatingPr ); 5.100 - 5.101 - 5.102 -//================================ Entry Point ============================== 5.103 -Matrix * 5.104 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); 5.105 - 5.106 - 5.107 -#endif /*_VCilk_MATRIX_MULT_H_*/ 5.108 +/* 5.109 5.110 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 5.111 5.112 + * Licensed under GNU General Public License version 2 5.113 5.114 + */ 5.115 5.116 + 5.117 5.118 +#ifndef _VCilk_MATRIX_MULT_H_ 5.119 5.120 +#define _VCilk_MATRIX_MULT_H_ 5.121 5.122 + 5.123 5.124 +#include <stdio.h> 5.125 5.126 + 5.127 5.128 +#include "../../VCilk_lib/VCilk.h" 5.129 5.130 +#include "../Matrix_Mult.h" 5.131 5.132 +#include "../../VCilk_lib/VMS/VMS.h" 5.133 5.134 + 5.135 5.136 + 5.137 5.138 +//=============================== Defines ============================== 5.139 5.140 +#define ROWS_IN_BLOCK 32 5.141 5.142 +#define COLS_IN_BLOCK 32 5.143 5.144 +#define VEC_IN_BLOCK 32 5.145 5.146 + 5.147 5.148 +#define copyMatrixSingleton 1 5.149 5.150 +#define copyTransposeSingleton 2 5.151 5.152 + 5.153 5.154 +//============================== Structures ============================== 5.155 5.156 +typedef struct 5.157 5.158 + { 5.159 5.160 + Matrix *leftMatrix; 5.161 5.162 + Matrix *rightMatrix; 5.163 5.164 + Matrix *resultMatrix; 5.165 5.166 + 5.167 5.168 + TSCount numTSCsToExe; 5.169 5.170 + } 5.171 5.172 +DividerParams; 5.173 5.174 + 5.175 5.176 +typedef struct 5.177 5.178 + { 5.179 5.180 + VirtProcr *dividerPr; 5.181 5.182 + int numRows; 5.183 5.184 + int numCols; 5.185 5.186 + int numSubMatrixPairs; 5.187 5.188 + } 5.189 5.190 +ResultsParams; 5.191 5.192 + 5.193 5.194 +typedef 5.195 5.196 +struct 5.197 5.198 + { int32 numRows; 5.199 5.200 + int32 numCols; 5.201 5.202 + Matrix *origMatrix; 5.203 5.204 + int32 origStartRow; 5.205 5.206 + int32 origStartCol; 5.207 5.208 + int32 alreadyCopied; 5.209 5.210 + VCilkSingleton *copySingleton; 5.211 5.212 + VCilkSingleton *copyTransSingleton; 5.213 5.214 + float32 *array; //2D, but dynamically sized, so use addr arith 5.215 5.216 + } 5.217 5.218 +SubMatrix; 5.219 5.220 + 5.221 5.222 +typedef struct 5.223 5.224 + { VirtProcr *resultPr; 5.225 5.226 + SubMatrix *leftSubMatrix; 5.227 5.228 + SubMatrix *rightSubMatrix; 5.229 5.230 + float32 *partialResultArray; 5.231 5.232 + } 5.233 5.234 +SMPairParams; 5.235 5.236 + 5.237 5.238 +typedef 5.239 5.240 +struct 5.241 5.242 + { int32 numVals; 5.243 5.244 + int32 *startVals; 5.245 5.246 + } 5.247 5.248 +SlicingStruc; 5.249 5.250 + 5.251 5.252 +typedef 5.253 5.254 +struct 5.255 5.256 + { 5.257 5.258 + SlicingStruc *leftRowSlices; 5.259 5.260 + SlicingStruc *vecSlices; 5.261 5.262 + SlicingStruc *rightColSlices; 5.263 5.264 + } 5.265 5.266 +SlicingStrucCarrier; 5.267 5.268 + 5.269 5.270 +typedef struct 5.271 5.272 + { 5.273 5.274 + int32 numVecIdxs; 5.275 5.276 + int32 numRightColIdxs; 5.277 5.278 + int32 leftRowIdxOffset; 5.279 5.280 + int32 resColIdx; 5.281 5.282 + SubMatrix **leftSubMatrices; 5.283 5.284 + SubMatrix **rightSubMatrices; 5.285 5.286 + float32 *resultArray; 5.287 5.288 + int32 coreToRunOn; 5.289 5.290 + } 5.291 5.292 +VecParams; 5.293 5.294 + 5.295 5.296 +//============================= Processor Functions ========================= 5.297 5.298 +void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr ); 5.299 5.300 +void calcSubMatrixProduct( void *data, VirtProcr *animatingPr ); 5.301 5.302 +void calcVectorOfSubMatrices( void *_vecParams, VirtProcr *animatingPr ); 5.303 5.304 + 5.305 5.306 + 5.307 5.308 +//================================ Entry Point ============================== 5.309 5.310 +Matrix * 5.311 5.312 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); 5.313 5.314 + 5.315 5.316 + 5.317 5.318 +#endif /*_VCilk_MATRIX_MULT_H_*/ 5.319
6.1 --- a/src/Application/VCilk__Matrix_Mult/Vector_Pr.c Mon Nov 15 12:08:19 2010 -0800 6.2 +++ b/src/Application/VCilk__Matrix_Mult/Vector_Pr.c Wed May 11 15:40:54 2011 +0200 6.3 @@ -1,130 +1,130 @@ 6.4 -/* 6.5 - * Copyright 2009 OpenSourceStewardshipFoundation.org 6.6 - * Licensed under GNU General Public License version 2 6.7 - * 6.8 - * Author: seanhalle@yahoo.com 6.9 - * 6.10 - */ 6.11 - 6.12 - 6.13 -#include "VCilk__Matrix_Mult.h" 6.14 -#include <math.h> 6.15 -#include <stdlib.h> 6.16 - 6.17 - 6.18 -void inline 6.19 -accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, 6.20 - int32 startRow, 6.21 - int32 numRows, 6.22 - int32 startCol, 6.23 - int32 numCols, 6.24 - int32 numOrigCols ); 6.25 - 6.26 - 6.27 -//=========================================================================== 6.28 - 6.29 -void 6.30 -calcVectorOfSubMatrices( void *_vecParams, VirtProcr *animPr ) 6.31 - { int32 numVecIdxs, leftRowIdxOffset, numRightColIdxs, resColIdx; 6.32 - SubMatrix **leftSubMatrices, **rightSubMatrices; 6.33 - float32 *resultArray; 6.34 - int32 vecIdx, coreWithAffinity; 6.35 - SMPairParams *subMatrixPairParams, **vecOfSubMatrixParams; 6.36 - VecParams *vecParams; 6.37 - 6.38 - vecParams = (VecParams *)_vecParams; 6.39 - 6.40 - DEBUG1( dbgAppFlow, "start vector %d\n", animPr->procrID) 6.41 - #ifdef TURN_ON_DEBUG_PROBES 6.42 - int32 subMatrixVectorProbe = 6.43 - VMS__create_single_interval_probe( "subMtxVect", animPr ); 6.44 - VMS__record_sched_choice_into_probe( subMatrixVectorProbe, animPr ); 6.45 - VMS__record_interval_start_in_probe( subMatrixVectorProbe ); 6.46 - #endif 6.47 - 6.48 - 6.49 - numVecIdxs = vecParams->numVecIdxs; 6.50 - numRightColIdxs = vecParams->numRightColIdxs; 6.51 - leftRowIdxOffset = vecParams->leftRowIdxOffset; 6.52 - resColIdx = vecParams->resColIdx; 6.53 - leftSubMatrices = vecParams->leftSubMatrices; 6.54 - rightSubMatrices = vecParams->rightSubMatrices; 6.55 - resultArray = vecParams->resultArray; 6.56 - coreWithAffinity = vecParams->coreToRunOn; 6.57 - 6.58 - vecOfSubMatrixParams = VCilk__malloc( numVecIdxs * sizeof(SMPairParams *), 6.59 - animPr ); 6.60 - if( vecOfSubMatrixParams == 0 ){printf("malloc error"); exit(1);} 6.61 - 6.62 - for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) 6.63 - { 6.64 - //Make the processor for the pair of sub-matrices 6.65 - subMatrixPairParams = VCilk__malloc( sizeof(SMPairParams), animPr ); 6.66 - subMatrixPairParams->leftSubMatrix = 6.67 - leftSubMatrices[ leftRowIdxOffset + vecIdx ]; 6.68 - 6.69 - subMatrixPairParams->rightSubMatrix = 6.70 - rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; 6.71 - 6.72 - VCilk__spawn( coreWithAffinity, &calcSubMatrixProduct, 6.73 - subMatrixPairParams, animPr ); 6.74 - 6.75 - vecOfSubMatrixParams[ vecIdx ] = subMatrixPairParams; 6.76 - } 6.77 - 6.78 - DEBUG1( dbgAppFlow, "before sync in vector %d\n", animPr->procrID) 6.79 - VCilk__sync( animPr ); 6.80 - DEBUG1( dbgAppFlow, "**after sync in vector %d\n", animPr->procrID) 6.81 - 6.82 - //now accumulate individual result matrices into final result matrix 6.83 - for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) 6.84 - { 6.85 - subMatrixPairParams = vecOfSubMatrixParams[ vecIdx ]; 6.86 - 6.87 - accumulateResult( resultArray, subMatrixPairParams->partialResultArray, 6.88 - subMatrixPairParams->leftSubMatrix->origStartRow, 6.89 - subMatrixPairParams->leftSubMatrix->numRows, 6.90 - subMatrixPairParams->rightSubMatrix->origStartCol, 6.91 - subMatrixPairParams->rightSubMatrix->numCols, 6.92 - subMatrixPairParams->rightSubMatrix->origMatrix->numCols); 6.93 - 6.94 - //Note, resultArray is made on the core that produces the results 6.95 - // that gives chance to set affinity so all in vector run on same 6.96 - // core and re-use that array, and prevents writes from causing 6.97 - // thrashing of the cache -- as long as array big enough, the copy 6.98 - // overhead is miniscule vs the size-of-side reuse of each byte 6.99 - VCilk__free( subMatrixPairParams->partialResultArray, animPr ); 6.100 - VCilk__free( subMatrixPairParams, animPr ); 6.101 - } 6.102 - VCilk__free( vecOfSubMatrixParams, animPr ); 6.103 - VCilk__free( vecParams, animPr ); 6.104 - 6.105 - #ifdef TURN_ON_DEBUG_PROBES 6.106 - VMS__record_interval_end_in_probe( subMatrixVectorProbe ); 6.107 - #endif 6.108 - 6.109 - DEBUG1( dbgAppFlow, "end vector %d\n", animPr->procrID) 6.110 - VCilk__dissipate_procr( animPr ); 6.111 - } 6.112 - 6.113 - 6.114 - 6.115 -void inline 6.116 -accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, 6.117 - int32 startRow, 6.118 - int32 numRows, 6.119 - int32 startCol, 6.120 - int32 numCols, 6.121 - int32 numOrigCols ) 6.122 - { int32 row, col; 6.123 - 6.124 - for( row = 0; row < numRows; row++ ) 6.125 - { 6.126 - for( col = 0; col < numCols; col++ ) 6.127 - { 6.128 - resultArray[ (row + startRow) * numOrigCols + col + startCol ] += 6.129 - subMatrixResultArray[ row * numCols + col ]; 6.130 - } 6.131 - } 6.132 - 6.133 - } 6.134 +/* 6.135 6.136 + * Copyright 2009 OpenSourceStewardshipFoundation.org 6.137 6.138 + * Licensed under GNU General Public License version 2 6.139 6.140 + * 6.141 6.142 + * Author: seanhalle@yahoo.com 6.143 6.144 + * 6.145 6.146 + */ 6.147 6.148 + 6.149 6.150 + 6.151 6.152 +#include "VCilk__Matrix_Mult.h" 6.153 6.154 +#include <math.h> 6.155 6.156 +#include <stdlib.h> 6.157 6.158 + 6.159 6.160 + 6.161 6.162 +void inline 6.163 6.164 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, 6.165 6.166 + int32 startRow, 6.167 6.168 + int32 numRows, 6.169 6.170 + int32 startCol, 6.171 6.172 + int32 numCols, 6.173 6.174 + int32 numOrigCols ); 6.175 6.176 + 6.177 6.178 + 6.179 6.180 +//=========================================================================== 6.181 6.182 + 6.183 6.184 +void 6.185 6.186 +calcVectorOfSubMatrices( void *_vecParams, VirtProcr *animPr ) 6.187 6.188 + { int32 numVecIdxs, leftRowIdxOffset, numRightColIdxs, resColIdx; 6.189 6.190 + SubMatrix **leftSubMatrices, **rightSubMatrices; 6.191 6.192 + float32 *resultArray; 6.193 6.194 + int32 vecIdx, coreWithAffinity; 6.195 6.196 + SMPairParams *subMatrixPairParams, **vecOfSubMatrixParams; 6.197 6.198 + VecParams *vecParams; 6.199 6.200 + 6.201 6.202 + vecParams = (VecParams *)_vecParams; 6.203 6.204 + 6.205 6.206 + DEBUG1( dbgAppFlow, "start vector %d\n", animPr->procrID) 6.207 6.208 + #ifdef TURN_ON_DEBUG_PROBES 6.209 6.210 + int32 subMatrixVectorProbe = 6.211 6.212 + VMS__create_single_interval_probe( "subMtxVect", animPr ); 6.213 6.214 + VMS__record_sched_choice_into_probe( subMatrixVectorProbe, animPr ); 6.215 6.216 + VMS__record_interval_start_in_probe( subMatrixVectorProbe ); 6.217 6.218 + #endif 6.219 6.220 + 6.221 6.222 + 6.223 6.224 + numVecIdxs = vecParams->numVecIdxs; 6.225 6.226 + numRightColIdxs = vecParams->numRightColIdxs; 6.227 6.228 + leftRowIdxOffset = vecParams->leftRowIdxOffset; 6.229 6.230 + resColIdx = vecParams->resColIdx; 6.231 6.232 + leftSubMatrices = vecParams->leftSubMatrices; 6.233 6.234 + rightSubMatrices = vecParams->rightSubMatrices; 6.235 6.236 + resultArray = vecParams->resultArray; 6.237 6.238 + coreWithAffinity = vecParams->coreToRunOn; 6.239 6.240 + 6.241 6.242 + vecOfSubMatrixParams = VCilk__malloc( numVecIdxs * sizeof(SMPairParams *), 6.243 6.244 + animPr ); 6.245 6.246 + if( vecOfSubMatrixParams == 0 ){printf("malloc error"); exit(1);} 6.247 6.248 + 6.249 6.250 + for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) 6.251 6.252 + { 6.253 6.254 + //Make the processor for the pair of sub-matrices 6.255 6.256 + subMatrixPairParams = VCilk__malloc( sizeof(SMPairParams), animPr ); 6.257 6.258 + subMatrixPairParams->leftSubMatrix = 6.259 6.260 + leftSubMatrices[ leftRowIdxOffset + vecIdx ]; 6.261 6.262 + 6.263 6.264 + subMatrixPairParams->rightSubMatrix = 6.265 6.266 + rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; 6.267 6.268 + 6.269 6.270 + VCilk__spawn( coreWithAffinity, &calcSubMatrixProduct, 6.271 6.272 + subMatrixPairParams, animPr ); 6.273 6.274 + 6.275 6.276 + vecOfSubMatrixParams[ vecIdx ] = subMatrixPairParams; 6.277 6.278 + } 6.279 6.280 + 6.281 6.282 + DEBUG1( dbgAppFlow, "before sync in vector %d\n", animPr->procrID) 6.283 6.284 + VCilk__sync( animPr ); 6.285 6.286 + DEBUG1( dbgAppFlow, "**after sync in vector %d\n", animPr->procrID) 6.287 6.288 + 6.289 6.290 + //now accumulate individual result matrices into final result matrix 6.291 6.292 + for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) 6.293 6.294 + { 6.295 6.296 + subMatrixPairParams = vecOfSubMatrixParams[ vecIdx ]; 6.297 6.298 + 6.299 6.300 + accumulateResult( resultArray, subMatrixPairParams->partialResultArray, 6.301 6.302 + subMatrixPairParams->leftSubMatrix->origStartRow, 6.303 6.304 + subMatrixPairParams->leftSubMatrix->numRows, 6.305 6.306 + subMatrixPairParams->rightSubMatrix->origStartCol, 6.307 6.308 + subMatrixPairParams->rightSubMatrix->numCols, 6.309 6.310 + subMatrixPairParams->rightSubMatrix->origMatrix->numCols); 6.311 6.312 + 6.313 6.314 + //Note, resultArray is made on the core that produces the results 6.315 6.316 + // that gives chance to set affinity so all in vector run on same 6.317 6.318 + // core and re-use that array, and prevents writes from causing 6.319 6.320 + // thrashing of the cache -- as long as array big enough, the copy 6.321 6.322 + // overhead is miniscule vs the size-of-side reuse of each byte 6.323 6.324 + VCilk__free( subMatrixPairParams->partialResultArray, animPr ); 6.325 6.326 + VCilk__free( subMatrixPairParams, animPr ); 6.327 6.328 + } 6.329 6.330 + VCilk__free( vecOfSubMatrixParams, animPr ); 6.331 6.332 + VCilk__free( vecParams, animPr ); 6.333 6.334 + 6.335 6.336 + #ifdef TURN_ON_DEBUG_PROBES 6.337 6.338 + VMS__record_interval_end_in_probe( subMatrixVectorProbe ); 6.339 6.340 + #endif 6.341 6.342 + 6.343 6.344 + DEBUG1( dbgAppFlow, "end vector %d\n", animPr->procrID) 6.345 6.346 + VCilk__dissipate_procr( animPr ); 6.347 6.348 + } 6.349 6.350 + 6.351 6.352 + 6.353 6.354 + 6.355 6.356 +void inline 6.357 6.358 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, 6.359 6.360 + int32 startRow, 6.361 6.362 + int32 numRows, 6.363 6.364 + int32 startCol, 6.365 6.366 + int32 numCols, 6.367 6.368 + int32 numOrigCols ) 6.369 6.370 + { int32 row, col; 6.371 6.372 + 6.373 6.374 + for( row = 0; row < numRows; row++ ) 6.375 6.376 + { 6.377 6.378 + for( col = 0; col < numCols; col++ ) 6.379 6.380 + { 6.381 6.382 + resultArray[ (row + startRow) * numOrigCols + col + startCol ] += 6.383 6.384 + subMatrixResultArray[ row * numCols + col ]; 6.385 6.386 + } 6.387 6.388 + } 6.389 6.390 + 6.391 6.392 + } 6.393
7.1 --- a/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c Mon Nov 15 12:08:19 2010 -0800 7.2 +++ b/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c Wed May 11 15:40:54 2011 +0200 7.3 @@ -1,312 +1,308 @@ 7.4 -/* 7.5 - * Copyright 2009 OpenSourceStewardshipFoundation.org 7.6 - * Licensed under GNU General Public License version 2 7.7 - * 7.8 - * Author: SeanHalle@yahoo.com 7.9 - * 7.10 - */ 7.11 - 7.12 -#include <string.h> 7.13 - 7.14 -#include "VCilk__Matrix_Mult.h" 7.15 - 7.16 - 7.17 -void inline 7.18 -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 7.19 - 7.20 -void inline 7.21 -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 7.22 - 7.23 -void inline 7.24 -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 7.25 - float32 *resArray, 7.26 - int startRow, int endRow, 7.27 - int startCol, int endCol, 7.28 - int startVec, int endVec, 7.29 - int resStride, int inpStride ); 7.30 - 7.31 -void inline 7.32 -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, 7.33 - float32 *leftArray, float32 *rightArray, 7.34 - float32 *resArray ); 7.35 - 7.36 - 7.37 -/*A processor is created with an environment that holds two matrices, 7.38 - * the row and col that it owns, and the name of a result gathering 7.39 - * processor. 7.40 - *It calculates the product of two sub-portions of the input matrices 7.41 - * by using Intel's mkl library for single-core. 7.42 - * 7.43 - *This demonstrates using optimized single-threaded code inside scheduled 7.44 - * work-units. 7.45 - * 7.46 - *When done, it sends the result to the result processor 7.47 - */ 7.48 -void 7.49 -calcSubMatrixProduct( void *data, VirtProcr *animPr ) 7.50 - { 7.51 - SMPairParams *params; 7.52 - VirtProcr *resultPr; 7.53 - float32 *leftArray, *rightArray, *resArray; 7.54 - SubMatrix *leftSubMatrix, *rightSubMatrix; 7.55 - 7.56 - 7.57 - DEBUG1( dbgAppFlow, "start sub-matrix mult %d\n", animPr->procrID) 7.58 - #ifdef TURN_ON_DEBUG_PROBES 7.59 - int32 subMatrixProbe = 7.60 - VMS__create_single_interval_probe( "subMtx", animPr); 7.61 - VMS__record_sched_choice_into_probe( subMatrixProbe, animPr ); 7.62 - VMS__record_interval_start_in_probe( subMatrixProbe ); 7.63 - #endif 7.64 - 7.65 - params = (SMPairParams *)data; 7.66 -// resultPr = params->resultPr; 7.67 - leftSubMatrix = params->leftSubMatrix; 7.68 - rightSubMatrix = params->rightSubMatrix; 7.69 - 7.70 - //make sure the input sub-matrices have been copied out of orig 7.71 - copyFromOrig( leftSubMatrix, animPr ); 7.72 - copyTransposeFromOrig( rightSubMatrix, animPr ); 7.73 - 7.74 - leftArray = leftSubMatrix->array; 7.75 - rightArray = rightSubMatrix->array; 7.76 - 7.77 - //make this array here, on the core that computes the results 7.78 - // with Cilk's semantics, have to have separate result array for each 7.79 - // spawned processor -- unless want to change the spawn and sync 7.80 - // pattern, such that spawn one from each vector, then sync, then 7.81 - // another, and so forth -- this will cause idle time due to imbalance 7.82 - // in matrix sizes 7.83 - //This also gives chance to set affinity so all in vector run on same 7.84 - // core and re-use the accumulation array, 7.85 - //As a side-benefit, it also prevents writes from causing 7.86 - // thrashing of the cache -- as long as array big enough, the copy 7.87 - // overhead is small because each byte is reused size-of-side times 7.88 - //This is freed in the vector processor 7.89 - int32 7.90 - resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); 7.91 - resArray = VCilk__malloc( resSize, animPr ); 7.92 - memset( resArray, 0, resSize ); 7.93 - 7.94 - 7.95 - int32 numResRows, numResCols, vectLength; 7.96 - 7.97 - vectLength = leftSubMatrix->numCols; 7.98 - numResRows = leftSubMatrix->numRows; 7.99 - numResCols = rightSubMatrix->numCols; 7.100 - 7.101 - multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 7.102 - leftArray, rightArray, 7.103 - resArray ); 7.104 - 7.105 - //send result by side-effect 7.106 - params->partialResultArray = resArray; 7.107 - 7.108 - #ifdef TURN_ON_DEBUG_PROBES 7.109 - VMS__record_interval_end_in_probe( subMatrixProbe ); 7.110 - #endif 7.111 - 7.112 - DEBUG1( dbgAppFlow, "end sub-matrix mult %d\n", animPr->procrID) 7.113 - VCilk__dissipate_procr( animPr ); 7.114 - } 7.115 - 7.116 - 7.117 - 7.118 -/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into 7.119 - * the 32KB L1 cache. 7.120 - *Would be nice to embed this within another level that divided into 7.121 - * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache 7.122 - * 7.123 - *Eventually want these divisions to be automatic, using DKU pattern 7.124 - * embedded into VMS and exposed in the language, and with VMS controlling the 7.125 - * divisions according to the cache sizes, which it knows about. 7.126 - *Also, want VMS to work with language to split among main-mems, so a socket 7.127 - * only cranks on data in its local segment of main mem 7.128 - * 7.129 - *So, outer two loops determine start and end points within the result matrix. 7.130 - * Inside that, a loop dets the start and end points along the shared dimensions 7.131 - * of the two input matrices. 7.132 - */ 7.133 -void inline 7.134 -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, 7.135 - int32 numResCols, 7.136 - float32 *leftArray, float32 *rightArray, 7.137 - float32 *resArray ) 7.138 - { 7.139 - int resStride, inpStride; 7.140 - int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; 7.141 - 7.142 - resStride = numResCols; 7.143 - inpStride = vecLength; 7.144 - 7.145 - for( resStartRow = 0; resStartRow < numResRows; ) 7.146 - { 7.147 - resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 7.148 - if( resEndRow > numResRows ) resEndRow = numResRows -1; 7.149 - 7.150 - for( resStartCol = 0; resStartCol < numResCols; ) 7.151 - { 7.152 - resEndCol = resStartCol + COLS_IN_BLOCK -1; 7.153 - if( resEndCol > numResCols ) resEndCol = numResCols -1; 7.154 - 7.155 - for( startVec = 0; startVec < vecLength; ) 7.156 - { 7.157 - endVec = startVec + VEC_IN_BLOCK -1; 7.158 - if( endVec > vecLength ) endVec = vecLength -1; 7.159 - 7.160 - //By having the "vector" of sub-blocks in a sub-block slice 7.161 - // be marched down in inner loop, are re-using the result 7.162 - // matrix, which stays in L1 cache and re-using the left sub-mat 7.163 - // which repeats for each right sub-mat -- can only re-use two of 7.164 - // the three, so result is the most important -- avoids writing 7.165 - // dirty blocks until those result-locations fully done 7.166 - //Row and Col is position in result matrix -- so row and vec 7.167 - // for left array, then vec and col for right array 7.168 - multiplySubBlocksTransposed( leftArray, rightArray, 7.169 - resArray, 7.170 - resStartRow, resEndRow, 7.171 - resStartCol, resEndCol, 7.172 - startVec, endVec, 7.173 - resStride, inpStride ); 7.174 - startVec = endVec +1; 7.175 - } 7.176 - resStartCol = resEndCol +1; 7.177 - } 7.178 - resStartRow = resEndRow +1; 7.179 - } 7.180 - } 7.181 - 7.182 - 7.183 - 7.184 -void inline 7.185 -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 7.186 - float32 *resArray, 7.187 - int resStartRow, int resEndRow, 7.188 - int resStartCol, int resEndCol, 7.189 - int startVec, int endVec, 7.190 - int resStride, int inpStride ) 7.191 - { 7.192 - int resRow, resCol, vec; 7.193 - int leftOffset, rightOffset; 7.194 - float32 result; 7.195 - 7.196 - //The result row is used for the left matrix, res col for the right 7.197 - for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) 7.198 - { 7.199 - for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) 7.200 - { 7.201 - leftOffset = resRow * inpStride;//left & right inp strides same 7.202 - rightOffset = resCol * inpStride;// because right is transposed 7.203 - result = 0; 7.204 - for( vec = startVec; vec <= endVec; vec++ ) 7.205 - { 7.206 - result += 7.207 - leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; 7.208 - } 7.209 - 7.210 - resArray[ resRow * resStride + resCol ] += result; 7.211 - } 7.212 - } 7.213 - } 7.214 - 7.215 - 7.216 -/*Reuse this in divider when do the sequential multiply case 7.217 - */ 7.218 -void inline 7.219 -copyTranspose( int32 numRows, int32 numCols, 7.220 - int32 origStartRow, int32 origStartCol, int32 origStride, 7.221 - float32 *subArray, float32 *origArray ) 7.222 - { int32 stride = numRows; 7.223 - 7.224 - int row, col, origOffset; 7.225 - for( row = 0; row < numRows; row++ ) 7.226 - { 7.227 - origOffset = (row + origStartRow) * origStride + origStartCol; 7.228 - for( col = 0; col < numCols; col++ ) 7.229 - { 7.230 - //transpose means swap row & col -- traverse orig matrix normally 7.231 - // but put into reversed place in local array -- means the 7.232 - // stride is the numRows now, so col * numRows + row 7.233 - subArray[ col * stride + row ] = origArray[ origOffset + col ]; 7.234 - } 7.235 - } 7.236 - } 7.237 - 7.238 -void inline 7.239 -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 7.240 - { int numCols, numRows, origStartRow, origStartCol, origStride, stride; 7.241 - Matrix *origMatrix; 7.242 - float32 *origArray, *subArray; 7.243 - 7.244 - if( subMatrix->alreadyCopied ) return; 7.245 - VCilk__start_singleton(copyMatrixSingleton,&&EndOfTranspSingleton,animPr); 7.246 - 7.247 - origMatrix = subMatrix->origMatrix; 7.248 - origArray = origMatrix->array; 7.249 - numCols = subMatrix->numCols; 7.250 - numRows = subMatrix->numRows; 7.251 - origStartRow = subMatrix->origStartRow; 7.252 - origStartCol = subMatrix->origStartCol; 7.253 - origStride = origMatrix->numCols; 7.254 - 7.255 - subArray = VCilk__malloc( numRows * numCols *sizeof(float32),animPr); 7.256 - subMatrix->array = subArray; 7.257 - 7.258 - //copy values from orig matrix to local 7.259 - copyTranspose( numRows, numCols, 7.260 - origStartRow, origStartCol, origStride, 7.261 - subArray, origArray ); 7.262 - 7.263 - subMatrix->alreadyCopied = TRUE; //must be last thing before label 7.264 - EndOfTranspSingleton: 7.265 - return; 7.266 - } 7.267 - 7.268 - 7.269 -void inline 7.270 -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 7.271 - { int numCols, numRows, origStartRow, origStartCol, stride, origStride; 7.272 - Matrix *origMatrix; 7.273 - float32 *origArray, *subArray; 7.274 - 7.275 - 7.276 - //This lets only a single VP execute the code between start and 7.277 - // end -- using start and end so that work runs outside the master. 7.278 - //Inside, if a second VP ever executes the start, it will be returned 7.279 - // from the end-point. 7.280 - //Note, for non-GCC, can add a second SSR call at the end, and inside 7.281 - // that one, look at the stack at the return addr & save that in an 7.282 - // array indexed by singletonID 7.283 - if( subMatrix->alreadyCopied ) return; 7.284 - VCilk__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton,animPr); 7.285 - 7.286 - 7.287 - origMatrix = subMatrix->origMatrix; 7.288 - origArray = origMatrix->array; 7.289 - numCols = subMatrix->numCols; 7.290 - numRows = subMatrix->numRows; 7.291 - origStartRow = subMatrix->origStartRow; 7.292 - origStartCol = subMatrix->origStartCol; 7.293 - origStride = origMatrix->numCols; 7.294 - 7.295 - subArray = VCilk__malloc( numRows * numCols *sizeof(float32),animPr); 7.296 - subMatrix->array = subArray; 7.297 - 7.298 - //copy values from orig matrix to local 7.299 - stride = numCols; 7.300 - 7.301 - int row, col, offset, origOffset; 7.302 - for( row = 0; row < numRows; row++ ) 7.303 - { 7.304 - offset = row * stride; 7.305 - origOffset = (row + origStartRow) * origStride + origStartCol; 7.306 - for( col = 0; col < numCols; col++ ) 7.307 - { 7.308 - subArray[ offset + col ] = origArray[ origOffset + col ]; 7.309 - } 7.310 - } 7.311 - 7.312 - subMatrix->alreadyCopied = TRUE; //must be last thing before label 7.313 - EndOfCopySingleton: 7.314 - return; 7.315 - } 7.316 +/* 7.317 7.318 + * Copyright 2009 OpenSourceStewardshipFoundation.org 7.319 7.320 + * Licensed under GNU General Public License version 2 7.321 7.322 + * 7.323 7.324 + * Author: SeanHalle@yahoo.com 7.325 7.326 + * 7.327 7.328 + */ 7.329 7.330 + 7.331 7.332 +#include <string.h> 7.333 7.334 + 7.335 7.336 +#include "VCilk__Matrix_Mult.h" 7.337 7.338 + 7.339 7.340 + 7.341 7.342 +void inline 7.343 7.344 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 7.345 7.346 + 7.347 7.348 +void inline 7.349 7.350 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 7.351 7.352 + 7.353 7.354 +void inline 7.355 7.356 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 7.357 7.358 + float32 *resArray, 7.359 7.360 + int startRow, int endRow, 7.361 7.362 + int startCol, int endCol, 7.363 7.364 + int startVec, int endVec, 7.365 7.366 + int resStride, int inpStride ); 7.367 7.368 + 7.369 7.370 +void inline 7.371 7.372 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, 7.373 7.374 + float32 *leftArray, float32 *rightArray, 7.375 7.376 + float32 *resArray ); 7.377 7.378 + 7.379 7.380 + 7.381 7.382 +/*A processor is created with an environment that holds two matrices, 7.383 7.384 + * the row and col that it owns, and the name of a result gathering 7.385 7.386 + * processor. 7.387 7.388 + *It calculates the product of two sub-portions of the input matrices 7.389 7.390 + * by using Intel's mkl library for single-core. 7.391 7.392 + * 7.393 7.394 + *This demonstrates using optimized single-threaded code inside scheduled 7.395 7.396 + * work-units. 7.397 7.398 + * 7.399 7.400 + *When done, it sends the result to the result processor 7.401 7.402 + */ 7.403 7.404 +void 7.405 7.406 +calcSubMatrixProduct( void *data, VirtProcr *animPr ) 7.407 7.408 + { 7.409 7.410 + SMPairParams *params; 7.411 7.412 + VirtProcr *resultPr; 7.413 7.414 + float32 *leftArray, *rightArray, *resArray; 7.415 7.416 + SubMatrix *leftSubMatrix, *rightSubMatrix; 7.417 7.418 + 7.419 7.420 + 7.421 7.422 + DEBUG1( dbgAppFlow, "start sub-matrix mult %d\n", animPr->procrID) 7.423 7.424 + #ifdef TURN_ON_DEBUG_PROBES 7.425 7.426 + int32 subMatrixProbe = 7.427 7.428 + VMS__create_single_interval_probe( "subMtx", animPr); 7.429 7.430 + VMS__record_sched_choice_into_probe( subMatrixProbe, animPr ); 7.431 7.432 + VMS__record_interval_start_in_probe( subMatrixProbe ); 7.433 7.434 + #endif 7.435 7.436 + 7.437 7.438 + params = (SMPairParams *)data; 7.439 7.440 +// resultPr = params->resultPr; 7.441 7.442 + leftSubMatrix = params->leftSubMatrix; 7.443 7.444 + rightSubMatrix = params->rightSubMatrix; 7.445 7.446 + 7.447 7.448 + //make sure the input sub-matrices have been copied out of orig 7.449 7.450 + copyFromOrig( leftSubMatrix, animPr ); 7.451 7.452 + copyTransposeFromOrig( rightSubMatrix, animPr ); 7.453 7.454 + 7.455 7.456 + leftArray = leftSubMatrix->array; 7.457 7.458 + rightArray = rightSubMatrix->array; 7.459 7.460 + 7.461 7.462 + //make this array here, on the core that computes the results 7.463 7.464 + // with Cilk's semantics, have to have separate result array for each 7.465 7.466 + // spawned processor -- unless want to change the spawn and sync 7.467 7.468 + // pattern, such that spawn one from each vector, then sync, then 7.469 7.470 + // another, and so forth -- this will cause idle time due to imbalance 7.471 7.472 + // in matrix sizes 7.473 7.474 + //This also gives chance to set affinity so all in vector run on same 7.475 7.476 + // core and re-use the accumulation array, 7.477 7.478 + //As a side-benefit, it also prevents writes from causing 7.479 7.480 + // thrashing of the cache -- as long as array big enough, the copy 7.481 7.482 + // overhead is small because each byte is reused size-of-side times 7.483 7.484 + //This is freed in the vector processor 7.485 7.486 + int32 7.487 7.488 + resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); 7.489 7.490 + resArray = VCilk__malloc( resSize, animPr ); 7.491 7.492 + memset( resArray, 0, resSize ); 7.493 7.494 + 7.495 7.496 + 7.497 7.498 + int32 numResRows, numResCols, vectLength; 7.499 7.500 + 7.501 7.502 + vectLength = leftSubMatrix->numCols; 7.503 7.504 + numResRows = leftSubMatrix->numRows; 7.505 7.506 + numResCols = rightSubMatrix->numCols; 7.507 7.508 + 7.509 7.510 + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 7.511 7.512 + leftArray, rightArray, 7.513 7.514 + resArray ); 7.515 7.516 + 7.517 7.518 + //send result by side-effect 7.519 7.520 + params->partialResultArray = resArray; 7.521 7.522 + 7.523 7.524 + #ifdef TURN_ON_DEBUG_PROBES 7.525 7.526 + VMS__record_interval_end_in_probe( subMatrixProbe ); 7.527 7.528 + #endif 7.529 7.530 + 7.531 7.532 + DEBUG1( dbgAppFlow, "end sub-matrix mult %d\n", animPr->procrID) 7.533 7.534 + VCilk__dissipate_procr( animPr ); 7.535 7.536 + } 7.537 7.538 + 7.539 7.540 + 7.541 7.542 + 7.543 7.544 +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into 7.545 7.546 + * the 32KB L1 cache. 7.547 7.548 + *Would be nice to embed this within another level that divided into 7.549 7.550 + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache 7.551 7.552 + * 7.553 7.554 + *Eventually want these divisions to be automatic, using DKU pattern 7.555 7.556 + * embedded into VMS and exposed in the language, and with VMS controlling the 7.557 7.558 + * divisions according to the cache sizes, which it knows about. 7.559 7.560 + *Also, want VMS to work with language to split among main-mems, so a socket 7.561 7.562 + * only cranks on data in its local segment of main mem 7.563 7.564 + * 7.565 7.566 + *So, outer two loops determine start and end points within the result matrix. 7.567 7.568 + * Inside that, a loop dets the start and end points along the shared dimensions 7.569 7.570 + * of the two input matrices. 7.571 7.572 + */ 7.573 7.574 +void inline 7.575 7.576 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, 7.577 7.578 + int32 numResCols, 7.579 7.580 + float32 *leftArray, float32 *rightArray, 7.581 7.582 + float32 *resArray ) 7.583 7.584 + { 7.585 7.586 + int resStride, inpStride; 7.587 7.588 + int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; 7.589 7.590 + 7.591 7.592 + resStride = numResCols; 7.593 7.594 + inpStride = vecLength; 7.595 7.596 + 7.597 7.598 + for( resStartRow = 0; resStartRow < numResRows; ) 7.599 7.600 + { 7.601 7.602 + resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 7.603 7.604 + if( resEndRow > numResRows ) resEndRow = numResRows -1; 7.605 7.606 + 7.607 7.608 + for( resStartCol = 0; resStartCol < numResCols; ) 7.609 7.610 + { 7.611 7.612 + resEndCol = resStartCol + COLS_IN_BLOCK -1; 7.613 7.614 + if( resEndCol > numResCols ) resEndCol = numResCols -1; 7.615 7.616 + 7.617 7.618 + for( startVec = 0; startVec < vecLength; ) 7.619 7.620 + { 7.621 7.622 + endVec = startVec + VEC_IN_BLOCK -1; 7.623 7.624 + if( endVec > vecLength ) endVec = vecLength -1; 7.625 7.626 + 7.627 7.628 + //By having the "vector" of sub-blocks in a sub-block slice 7.629 7.630 + // be marched down in inner loop, are re-using the result 7.631 7.632 + // matrix, which stays in L1 cache and re-using the left sub-mat 7.633 7.634 + // which repeats for each right sub-mat -- can only re-use two of 7.635 7.636 + // the three, so result is the most important -- avoids writing 7.637 7.638 + // dirty blocks until those result-locations fully done 7.639 7.640 + //Row and Col is position in result matrix -- so row and vec 7.641 7.642 + // for left array, then vec and col for right array 7.643 7.644 + multiplySubBlocksTransposed( leftArray, rightArray, 7.645 7.646 + resArray, 7.647 7.648 + resStartRow, resEndRow, 7.649 7.650 + resStartCol, resEndCol, 7.651 7.652 + startVec, endVec, 7.653 7.654 + resStride, inpStride ); 7.655 7.656 + startVec = endVec +1; 7.657 7.658 + } 7.659 7.660 + resStartCol = resEndCol +1; 7.661 7.662 + } 7.663 7.664 + resStartRow = resEndRow +1; 7.665 7.666 + } 7.667 7.668 + } 7.669 7.670 + 7.671 7.672 + 7.673 7.674 + 7.675 7.676 +void inline 7.677 7.678 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 7.679 7.680 + float32 *resArray, 7.681 7.682 + int resStartRow, int resEndRow, 7.683 7.684 + int resStartCol, int resEndCol, 7.685 7.686 + int startVec, int endVec, 7.687 7.688 + int resStride, int inpStride ) 7.689 7.690 + { 7.691 7.692 + int resRow, resCol, vec; 7.693 7.694 + int leftOffset, rightOffset; 7.695 7.696 + float32 result; 7.697 7.698 + 7.699 7.700 + //The result row is used for the left matrix, res col for the right 7.701 7.702 + for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) 7.703 7.704 + { 7.705 7.706 + for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) 7.707 7.708 + { 7.709 7.710 + leftOffset = resRow * inpStride;//left & right inp strides same 7.711 7.712 + rightOffset = resCol * inpStride;// because right is transposed 7.713 7.714 + result = 0; 7.715 7.716 + for( vec = startVec; vec <= endVec; vec++ ) 7.717 7.718 + { 7.719 7.720 + result += 7.721 7.722 + leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; 7.723 7.724 + } 7.725 7.726 + 7.727 7.728 + resArray[ resRow * resStride + resCol ] += result; 7.729 7.730 + } 7.731 7.732 + } 7.733 7.734 + } 7.735 7.736 + 7.737 7.738 + 7.739 7.740 +/*Reuse this in divider when do the sequential multiply case 7.741 7.742 + */ 7.743 7.744 +void inline 7.745 7.746 +copyTranspose( int32 numRows, int32 numCols, 7.747 7.748 + int32 origStartRow, int32 origStartCol, int32 origStride, 7.749 7.750 + float32 *subArray, float32 *origArray ) 7.751 7.752 + { int32 stride = numRows; 7.753 7.754 + 7.755 7.756 + int row, col, origOffset; 7.757 7.758 + for( row = 0; row < numRows; row++ ) 7.759 7.760 + { 7.761 7.762 + origOffset = (row + origStartRow) * origStride + origStartCol; 7.763 7.764 + for( col = 0; col < numCols; col++ ) 7.765 7.766 + { 7.767 7.768 + //transpose means swap row & col -- traverse orig matrix normally 7.769 7.770 + // but put into reversed place in local array -- means the 7.771 7.772 + // stride is the numRows now, so col * numRows + row 7.773 7.774 + subArray[ col * stride + row ] = origArray[ origOffset + col ]; 7.775 7.776 + } 7.777 7.778 + } 7.779 7.780 + } 7.781 7.782 + 7.783 7.784 +void inline 7.785 7.786 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 7.787 7.788 + { int numCols, numRows, origStartRow, origStartCol, origStride, stride; 7.789 7.790 + Matrix *origMatrix; 7.791 7.792 + float32 *origArray, *subArray; 7.793 7.794 + 7.795 7.796 + VCilk__start_data_singleton( &(subMatrix->copyTransSingleton), animPr ); 7.797 7.798 + 7.799 7.800 + origMatrix = subMatrix->origMatrix; 7.801 7.802 + origArray = origMatrix->array; 7.803 7.804 + numCols = subMatrix->numCols; 7.805 7.806 + numRows = subMatrix->numRows; 7.807 7.808 + origStartRow = subMatrix->origStartRow; 7.809 7.810 + origStartCol = subMatrix->origStartCol; 7.811 7.812 + origStride = origMatrix->numCols; 7.813 7.814 + 7.815 7.816 + subArray = VCilk__malloc( numRows * numCols *sizeof(float32),animPr); 7.817 7.818 + subMatrix->array = subArray; 7.819 7.820 + 7.821 7.822 + //copy values from orig matrix to local 7.823 7.824 + copyTranspose( numRows, numCols, 7.825 7.826 + origStartRow, origStartCol, origStride, 7.827 7.828 + subArray, origArray ); 7.829 7.830 + 7.831 7.832 + VCilk__end_data_singleton( &(subMatrix->copyTransSingleton), animPr ); 7.833 7.834 + return; 7.835 7.836 + } 7.837 7.838 + 7.839 7.840 + 7.841 7.842 +void inline 7.843 7.844 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 7.845 7.846 + { int numCols, numRows, origStartRow, origStartCol, stride, origStride; 7.847 7.848 + Matrix *origMatrix; 7.849 7.850 + float32 *origArray, *subArray; 7.851 7.852 + 7.853 7.854 + 7.855 7.856 + //This lets only a single VP execute the code between start and 7.857 7.858 + // end -- using start and end so that work runs outside the master. 7.859 7.860 + //Inside, if a second VP ever executes the start, it will be returned 7.861 7.862 + // from the end-point. 7.863 7.864 + //Note, for non-GCC, can add a second SSR call at the end, and inside 7.865 7.866 + // that one, look at the stack at the return addr & save that in an 7.867 7.868 + // array indexed by singletonID 7.869 7.870 + VCilk__start_data_singleton( &(subMatrix->copySingleton), animPr ); 7.871 7.872 + 7.873 7.874 + 7.875 7.876 + origMatrix = subMatrix->origMatrix; 7.877 7.878 + origArray = origMatrix->array; 7.879 7.880 + numCols = subMatrix->numCols; 7.881 7.882 + numRows = subMatrix->numRows; 7.883 7.884 + origStartRow = subMatrix->origStartRow; 7.885 7.886 + origStartCol = subMatrix->origStartCol; 7.887 7.888 + origStride = origMatrix->numCols; 7.889 7.890 + 7.891 7.892 + subArray = VCilk__malloc( numRows * numCols *sizeof(float32),animPr); 7.893 7.894 + subMatrix->array = subArray; 7.895 7.896 + 7.897 7.898 + //copy values from orig matrix to local 7.899 7.900 + stride = numCols; 7.901 7.902 + 7.903 7.904 + int row, col, offset, origOffset; 7.905 7.906 + for( row = 0; row < numRows; row++ ) 7.907 7.908 + { 7.909 7.910 + offset = row * stride; 7.911 7.912 + origOffset = (row + origStartRow) * origStride + origStartCol; 7.913 7.914 + for( col = 0; col < numCols; col++ ) 7.915 7.916 + { 7.917 7.918 + subArray[ offset + col ] = origArray[ origOffset + col ]; 7.919 7.920 + } 7.921 7.922 + } 7.923 7.924 + VCilk__end_data_singleton( &(subMatrix->copySingleton), animPr ); 7.925 7.926 + 7.927 7.928 + return; 7.929 7.930 + } 7.931
8.1 --- a/src/Application/main.c Mon Nov 15 12:08:19 2010 -0800 8.2 +++ b/src/Application/main.c Wed May 11 15:40:54 2011 +0200 8.3 @@ -1,35 +1,35 @@ 8.4 -/* 8.5 - * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 8.6 - * Licensed under GNU General Public License version 2 8.7 - * 8.8 - * author seanhalle@yahoo.com 8.9 - */ 8.10 - 8.11 -#include <malloc.h> 8.12 -#include <stdlib.h> 8.13 - 8.14 -#include "Matrix_Mult.h" 8.15 -#include "VCilk__Matrix_Mult/VCilk__Matrix_Mult.h" 8.16 - 8.17 -/** 8.18 - *Matrix multiply program written using VMS_HW piggy-back language 8.19 - * 8.20 - */ 8.21 -int main( int argc, char **argv ) 8.22 - { Matrix *leftMatrix, *rightMatrix, *resultMatrix; 8.23 - ParamBag *paramBag; 8.24 - 8.25 - paramBag = makeParamBag(); 8.26 - printf("arguments: %s | %s | %s | %s\n", argv[0], argv[1], argv[2], argv[3] ); 8.27 - readParamFileIntoBag( argv[1], paramBag ); 8.28 - initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); 8.29 - 8.30 - resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix ); 8.31 - 8.32 -// printf("\nresult matrix: \n"); \ 8.33 - printMatrix( resultMatrix ); 8.34 - 8.35 -// VCilk__print_stats(); 8.36 - fflush(stdin); 8.37 - exit(0); //cleans up 8.38 - } 8.39 +/* 8.40 8.41 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 8.42 8.43 + * Licensed under GNU General Public License version 2 8.44 8.45 + * 8.46 8.47 + * author seanhalle@yahoo.com 8.48 8.49 + */ 8.50 8.51 + 8.52 8.53 +#include <malloc.h> 8.54 8.55 +#include <stdlib.h> 8.56 8.57 + 8.58 8.59 +#include "Matrix_Mult.h" 8.60 8.61 +#include "VCilk__Matrix_Mult/VCilk__Matrix_Mult.h" 8.62 8.63 + 8.64 8.65 +/** 8.66 8.67 + *Matrix multiply program written using VMS_HW piggy-back language 8.68 8.69 + * 8.70 8.71 + */ 8.72 8.73 +int main( int argc, char **argv ) 8.74 8.75 + { Matrix *leftMatrix, *rightMatrix, *resultMatrix; 8.76 8.77 + ParamBag *paramBag; 8.78 8.79 + 8.80 8.81 + paramBag = makeParamBag(); 8.82 8.83 + printf("arguments: %s | %s | %s | %s\n", argv[0], argv[1], argv[2], argv[3] ); 8.84 8.85 + readParamFileIntoBag( argv[1], paramBag ); 8.86 8.87 + initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); 8.88 8.89 + 8.90 8.91 + resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix ); 8.92 8.93 + 8.94 8.95 +// printf("\nresult matrix: \n"); \ 8.96 8.97 + printMatrix( resultMatrix ); 8.98 8.99 + 8.100 8.101 +// VCilk__print_stats(); 8.102 8.103 + fflush(stdin); 8.104 8.105 + exit(0); //cleans up 8.106 8.107 + } 8.108
