Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VCilk > VCilk__Blocked_Matrix_Mult__Bench
changeset 0:56e17dcfc0c3
Initial add -- works, together with VMS pin2Core with vmalloc and probes
| author | Me |
|---|---|
| date | Sat, 30 Oct 2010 20:43:49 -0700 |
| parents | |
| children | 06c80cd3c303 |
| files | .hgignore src/Application/Matrix_Mult.c src/Application/Matrix_Mult.h src/Application/VCilk__Matrix_Mult/Divide_Pr.c src/Application/VCilk__Matrix_Mult/EntryPoint.c src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h src/Application/VCilk__Matrix_Mult/Vector_Pr.c src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c src/Application/main.c |
| diffstat | 9 files changed, 1417 insertions(+), 0 deletions(-) [+] |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/.hgignore Sat Oct 30 20:43:49 2010 -0700 1.3 @@ -0,0 +1,5 @@ 1.4 +nbproject 1.5 +build 1.6 +dist 1.7 +.dep.inc 1.8 +Makefile
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 2.2 +++ b/src/Application/Matrix_Mult.c Sat Oct 30 20:43:49 2010 -0700 2.3 @@ -0,0 +1,167 @@ 2.4 +/* 2.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 2.6 + * Licensed under GNU General Public License version 2 2.7 + * 2.8 + * Author: seanhalle@yahoo.com 2.9 + * 2.10 + * Created on November 15, 2009, 2:35 AM 2.11 + */ 2.12 + 2.13 +#include <malloc.h> 2.14 +#include <stdlib.h> 2.15 + 2.16 +#include "Matrix_Mult.h" 2.17 +#include "ParamHelper/Param.h" 2.18 + 2.19 + 2.20 + 2.21 + void 2.22 +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 2.23 + ParamBag *paramBag ) 2.24 + { char *leftMatrixFileName, *rightMatrixFileName; 2.25 + int leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols; 2.26 + 2.27 + ParamStruc *param; 2.28 + param = getParamFromBag( "leftMatrixRows", paramBag ); 2.29 + leftMatrixRows = param->intValue; 2.30 + param = getParamFromBag( "leftMatrixCols", paramBag ); 2.31 + leftMatrixCols = param->intValue; 2.32 + *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols ); 2.33 + 2.34 + param = getParamFromBag( "leftMatrixFileName", paramBag ); 2.35 + leftMatrixFileName = param->strValue; //no need to copy 2.36 + read_Matrix_From_File( *leftMatrix, leftMatrixFileName ); 2.37 + 2.38 + param = getParamFromBag( "rightMatrixRows", paramBag ); 2.39 + rightMatrixRows = param->intValue; 2.40 + param = getParamFromBag( "rightMatrixCols", paramBag ); 2.41 + rightMatrixCols = param->intValue; 2.42 + *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols ); 2.43 + 2.44 + param = getParamFromBag( "rightMatrixFileName", paramBag ); 2.45 + rightMatrixFileName = param->strValue; 2.46 + read_Matrix_From_File( *rightMatrix, rightMatrixFileName ); 2.47 + } 2.48 + 2.49 + 2.50 +void parseLineIntoRow( char *line, float32* row ); 2.51 + 2.52 + 2.53 + void 2.54 +read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ) 2.55 + { int row, maxRead, numRows, numCols; 2.56 + float32 *matrixStart; 2.57 + size_t lineSz = 0; 2.58 + FILE *file; 2.59 + char *line = NULL; 2.60 + 2.61 + lineSz = 50000; //max length of line in a matrix data file 2.62 + line = (char *) malloc( lineSz ); 2.63 + if( line == NULL ) printf( "no mem for matrix line" ); 2.64 + 2.65 + numRows = matrixStruc->numRows; 2.66 + numCols = matrixStruc->numCols; 2.67 + matrixStart = matrixStruc->array; 2.68 + 2.69 + file = fopen( matrixFileName, "r" ); 2.70 + if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} 2.71 + fseek( file, 0, SEEK_SET ); 2.72 + for( row = 0; row < numRows; row++ ) 2.73 + { 2.74 + if( feof( file ) ) printf( "file ran out too soon" ); 2.75 + maxRead = getline( &line, &lineSz, file ); 2.76 + if( maxRead == -1 ) printf( "prob reading mat line"); 2.77 + 2.78 + if( *line == '\n') continue; //blank line 2.79 + if( *line == '/' ) continue; //comment line 2.80 + 2.81 + parseLineIntoRow( line, matrixStart + row * numCols ); 2.82 + } 2.83 + free( line ); 2.84 + } 2.85 + 2.86 +/*This function relies on each line having the proper number of cols. It 2.87 + * doesn't check, nor enforce, so if the file is improperly formatted it 2.88 + * can write over unrelated memory 2.89 + */ 2.90 + void 2.91 +parseLineIntoRow( char *line, float32* row ) 2.92 + { 2.93 + char *valueStr, *searchPos; 2.94 + 2.95 + //read the float values 2.96 + searchPos = valueStr = line; //start 2.97 + 2.98 + for( ; *searchPos != 0; searchPos++) //bit dangerous, should use buff len 2.99 + { 2.100 + if( *searchPos == '\n' ) //last col.. relying on well-formatted file 2.101 + { *searchPos = 0; 2.102 + *row = atof( valueStr ); 2.103 + break; //end FOR loop 2.104 + } 2.105 + if( *searchPos == ',' ) 2.106 + { *searchPos = 0; //mark end of string 2.107 + *row = (float32) atof( valueStr ); 2.108 + row += 1; //address arith 2.109 + //skip any spaces before digits.. use searchPos + 1 to skip the 0 2.110 + for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++); 2.111 + valueStr = searchPos + 1; 2.112 + } 2.113 + } 2.114 + } 2.115 + 2.116 + //========================================================================== 2.117 + 2.118 +/*In the "_Flat" version of constructor, do only malloc of the top data struc 2.119 + * and set values in that top-level. Don't malloc any sub-structures. 2.120 + */ 2.121 + Matrix * 2.122 +makeMatrix_Flat( int32 numRows, int32 numCols ) 2.123 + { Matrix * retMatrix; 2.124 + retMatrix = malloc( sizeof( Matrix ) ); 2.125 + retMatrix->numRows = numRows; 2.126 + retMatrix->numCols = numCols; 2.127 + 2.128 + return retMatrix; 2.129 + } 2.130 + 2.131 + Matrix * 2.132 +makeMatrix_WithResMat( int32 numRows, int32 numCols ) 2.133 + { Matrix * retMatrix; 2.134 + retMatrix = malloc( sizeof( Matrix ) ); 2.135 + retMatrix->numRows = numRows; 2.136 + retMatrix->numCols = numCols; 2.137 + retMatrix->array = malloc( numRows * numCols * sizeof(float32) ); 2.138 + 2.139 + return retMatrix; 2.140 + } 2.141 + 2.142 + void 2.143 +freeMatrix_Flat( Matrix * matrix ) 2.144 + { //( matrix ); 2.145 + } 2.146 + void 2.147 +freeMatrix( Matrix * matrix ) 2.148 + { free( matrix->array ); 2.149 + free( matrix ); 2.150 + } 2.151 + 2.152 +void 2.153 +printMatrix( Matrix *matrix ) 2.154 + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; 2.155 + float32 *matrixArray; 2.156 + 2.157 + numRows = rowsToPrint = matrix->numRows; 2.158 + numCols = colsToPrint = matrix->numCols; 2.159 + matrixArray = matrix->array; 2.160 + 2.161 + rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed 2.162 + colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed 2.163 + for( r = 0; r < numRows; r += rowIncr ) 2.164 + { for( c = 0; c < numCols; c += colIncr ) 2.165 + { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); 2.166 + } 2.167 + printf("\n"); 2.168 + } 2.169 + } 2.170 +
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 3.2 +++ b/src/Application/Matrix_Mult.h Sat Oct 30 20:43:49 2010 -0700 3.3 @@ -0,0 +1,77 @@ 3.4 +/* 3.5 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 3.6 + * Licensed under GNU General Public License version 2 3.7 + */ 3.8 + 3.9 +#ifndef MATRIX_MULT_H_ 3.10 +#define MATRIX_MULT_H_ 3.11 + 3.12 +#include <stdio.h> 3.13 +#include <unistd.h> 3.14 +#include <malloc.h> 3.15 + 3.16 +#include "../VCilk_lib/VMS/VMS_primitive_data_types.h" 3.17 +#include "ParamHelper/Param.h" 3.18 + 3.19 +//============================== Structures ============================== 3.20 + 3.21 +typedef 3.22 +struct 3.23 + { int32 numRows; 3.24 + int32 numCols; 3.25 + float32 *array; //2D, but dynamically sized, so use addr arith 3.26 + } 3.27 +Matrix; 3.28 + 3.29 +/* This is the "appSpecificPiece" that is carried inside a DKUPiece. 3.30 + * In the DKUPiece data struc it is declared to be of type "void *". This 3.31 + * allows the application to define any data structure it wants and put it 3.32 + * into a DKUPiece. 3.33 + * When the app specific info is used, it is in app code, so it is cast to 3.34 + * the correct type to tell the compiler how to access fields. 3.35 + * This keeps all app-specific things out of the DKU directory, as per the 3.36 + * DKU standard. */ 3.37 +typedef 3.38 +struct 3.39 + { 3.40 + // pointers to shared data.. the result matrix must be created when the 3.41 + // left and right matrices are put into the root ancestor DKUPiece. 3.42 + Matrix * leftMatrix; 3.43 + Matrix * rightMatrix; 3.44 + Matrix * resultMatrix; 3.45 + 3.46 + // define the starting and ending boundaries for this piece of the 3.47 + // result matrix. These are derivable from the left and right 3.48 + // matrices, but included them for readability of code. 3.49 + int prodStartRow, prodEndRow; 3.50 + int prodStartCol, prodEndCol; 3.51 + // Start and end of the portion of the left matrix that contributes to 3.52 + // this piece of the product 3.53 + int leftStartRow, leftEndRow; 3.54 + int leftStartCol, leftEndCol; 3.55 + // Start and end of the portion of the right matrix that contributes to 3.56 + // this piece of the product 3.57 + int rightStartRow, rightEndRow; 3.58 + int rightStartCol, rightEndCol; 3.59 + } 3.60 +MatrixProdPiece; 3.61 + 3.62 +//============================== Functions ================================ 3.63 +void readFile(); 3.64 + 3.65 +Matrix *makeMatrix( int32 numRows, int32 numCols ); 3.66 +Matrix *makeMatrix_Flat( int32 numRows, int32 numCols ); 3.67 +Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols ); 3.68 +void freeMatrix_Flat( Matrix * matrix ); 3.69 +void freeMatrix( Matrix * matrix ); 3.70 +void printMatrix( Matrix *matrix ); 3.71 + 3.72 +void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ); 3.73 + 3.74 +void 3.75 +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 3.76 + ParamBag *paramBag ); 3.77 + 3.78 +//=========================================================================== 3.79 + 3.80 +#endif /*MATRIX_MULT_H_*/
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 4.2 +++ b/src/Application/VCilk__Matrix_Mult/Divide_Pr.c Sat Oct 30 20:43:49 2010 -0700 4.3 @@ -0,0 +1,583 @@ 4.4 +/* 4.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 4.6 + * Licensed under GNU General Public License version 2 4.7 + * 4.8 + * Author: seanhalle@yahoo.com 4.9 + * 4.10 + */ 4.11 + 4.12 + 4.13 +#include "VCilk__Matrix_Mult.h" 4.14 +#include <math.h> 4.15 +#include <sys/time.h> 4.16 + 4.17 + //The time to compute this many result values should equal the time to 4.18 + // perform this division on a matrix of size gives that many result calcs 4.19 + //IE, size this so that sequential time to calc equals divide time 4.20 + // find the value by experimenting -- but divide time and calc time scale 4.21 + // same way, so this value should remain valid across hardware 4.22 + //Divide time is about 800us on 2.4Ghz core2Quad laptop core 4.23 + //num cells is the cube of a side, when have two square matrices 4.24 +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 100000 /* about 46x46 */ 4.25 + 4.26 + 4.27 +//=========================================================================== 4.28 +int inline 4.29 +measureMatrixMultPrimitive( VirtProcr *animPr ); 4.30 + 4.31 +SlicingStrucCarrier * 4.32 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 4.33 + VirtProcr *animPr ); 4.34 + 4.35 +SlicingStruc * 4.36 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 4.37 + VirtProcr *animPr ); 4.38 + 4.39 +void 4.40 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ); 4.41 + 4.42 +SubMatrix ** 4.43 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 4.44 + Matrix *origMatrix, VirtProcr *animPr ); 4.45 + 4.46 +void 4.47 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 4.48 + SubMatrix **subMatrices, VirtProcr *animPr ); 4.49 + 4.50 +void 4.51 +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, 4.52 + SubMatrix **rightSubMatrices, 4.53 + int32 numRowIdxs, int32 numColIdxs, 4.54 + int32 numVecIdxs, 4.55 + float32 *resultArray, 4.56 + VirtProcr *animatingPr ); 4.57 + 4.58 +void 4.59 +makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, 4.60 + SlicingStrucCarrier *slicingStrucCarrier, 4.61 + float32 *resultArray, VirtProcr *animatingPr ); 4.62 + 4.63 +//=========================================================================== 4.64 + 4.65 +/*Divider creates one processor for every sub-matrix 4.66 + * It hands them: 4.67 + * the name of the result processor that they should send their results to, 4.68 + * the left and right matrices, and the rows and cols they should multiply 4.69 + * It first creates the result processor, then all the sub-matrixPair 4.70 + * processors, 4.71 + * then does a receive of a message from the result processor that gives 4.72 + * the divider ownership of the result matrix. 4.73 + * Finally, the divider returns the result matrix out of the VCilk system. 4.74 + * 4.75 + * Divider chooses the size of sub-matrices via an algorithm that tries to 4.76 + * keep the minimum work above a threshold. The threshold is machine- 4.77 + * dependent, so ask VCilk for min work-unit time to get a 4.78 + * given overhead 4.79 + * 4.80 + * Divide min work-unit cycles by measured-cycles for one matrix-cell 4.81 + * product -- gives the number of products need to have in min size 4.82 + * matrix. 4.83 + * 4.84 + * So then, take cubed root of this to get the size of a side of min sub- 4.85 + * matrix. That is the size of the ideal square sub-matrix -- so tile 4.86 + * up the two input matrices into ones as close as possible to that size, 4.87 + * and create the pairs of sub-matrices. 4.88 + * 4.89 + *======================== STRATEGIC OVERVIEW ======================= 4.90 + * 4.91 + *This division is a bit tricky, because have to create things in advance 4.92 + * that it's not at first obvious need to be created.. 4.93 + * 4.94 + *First slice up each dimension -- three of them.. this is because will have 4.95 + * to create the sub-matrix's data-structures before pairing the sub-matrices 4.96 + * with each other -- so, have three dimensions to slice up before can 4.97 + * create the sub-matrix data-strucs -- also, have to be certain that the 4.98 + * cols of the left input have the exact same slicing as the rows of the 4.99 + * left matrix, so just to be sure, do the slicing calc once, then use it 4.100 + * for both. 4.101 + * 4.102 + *So, goes like this: 4.103 + *1) calculate the start & end values of each dimension in each matrix. 4.104 + *2) use those values to create sub-matrix structures 4.105 + *3) combine sub-matrices into pairs, as the tasks to perform. 4.106 + * 4.107 + *Have to calculate separately from creating the sub-matrices because of the 4.108 + * nature of the nesting -- would either end up creating the same sub-matrix 4.109 + * multiple times, or else would have to put in detection of whether had 4.110 + * made a particular one already if tried to combine steps 1 and 2. 4.111 + * 4.112 + *Step 3 has to be separate because of the nesting, as well -- same reason, 4.113 + * would either create same sub-matrix multiple times, or else have to 4.114 + * add detection of whether was already created. 4.115 + * 4.116 + *Another way to look at it: there's one level of loop to divide dimensions, 4.117 + * two levels of nesting to create sub-matrices, and three levels to pair 4.118 + * up the sub-matrices. 4.119 + */ 4.120 + 4.121 +void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, 4.122 + VirtProcr *animPr ) 4.123 + { 4.124 + DividerParams *dividerParams; 4.125 + ResultsParams *resultsParams; 4.126 + Matrix *leftMatrix, *rightMatrix, *resultMatrix; 4.127 + void *msg; 4.128 + SlicingStrucCarrier *slicingStrucCarrier; 4.129 + float32 *resultArray; //points to array to be put inside result 4.130 + // matrix 4.131 + 4.132 + PRINT_DEBUG("start divide\n") 4.133 + 4.134 +//TODO: VMS__create_block_of_probes_with_idxs( 0, 2 ); 4.135 + int32 4.136 + divideProbe = VMS__create_single_interval_probe( "divideProbe", 4.137 + animPr ); 4.138 + VMS__record_sched_choice_into_probe( divideProbe, animPr ); 4.139 + VMS__record_interval_start_in_probe( divideProbe ); 4.140 + 4.141 + //=========== Setup -- make local copies of ptd-to-things, malloc, aso 4.142 + int32 numResRows, numResCols; 4.143 + 4.144 + dividerParams = (DividerParams *)_dividerParams; 4.145 + 4.146 + leftMatrix = dividerParams->leftMatrix; 4.147 + rightMatrix = dividerParams->rightMatrix; 4.148 + 4.149 + numResRows = leftMatrix->numRows; 4.150 + numResCols = rightMatrix->numCols; 4.151 + resultArray = dividerParams->resultMatrix->array; 4.152 + 4.153 + 4.154 + //============== Do either sequential mult or do division ============== 4.155 + 4.156 + //Check if input matrices too small -- if yes, just do sequential 4.157 + //Cutoff is determined by overhead of this divider -- relatively 4.158 + // machine-independent 4.159 + if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * 4.160 + (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) 4.161 + { int32 vectLength; 4.162 + 4.163 + //====== Do sequential multiply on a single core 4.164 + PRINT_DEBUG("doing sequential") 4.165 + vectLength = leftMatrix->numCols; 4.166 + 4.167 + multiplyMatrixArrays( vectLength, numResRows, numResCols, 4.168 + leftMatrix->array, rightMatrix->array, 4.169 + resultArray ); 4.170 + } 4.171 + else 4.172 + { 4.173 + //====== Do parallel multiply across cores 4.174 + 4.175 + //Calc the ideal size of sub-matrix and slice up the dimensions of 4.176 + // the two matrices. 4.177 + //The ideal size is the one takes the number of cycles to calculate 4.178 + // such that calc time is equal or greater than min work-unit size 4.179 + slicingStrucCarrier = 4.180 + calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr); 4.181 + 4.182 + 4.183 + //Make the results processor, now that know how many to wait for 4.184 +/* 4.185 + resultsParams = VCilk__malloc( sizeof(ResultsParams) ); 4.186 + resultsParams->dividerPr = animatingPr; 4.187 + resultsParams->numSubMatrixPairs = 4.188 + slicingStrucCarrier->leftRowSlices->numVals * 4.189 + slicingStrucCarrier->rightColSlices->numVals * 4.190 + slicingStrucCarrier->vecSlices->numVals; 4.191 + resultsParams->numCols = rightMatrix->numCols; 4.192 + resultsParams->numRows = leftMatrix->numRows; 4.193 +*/ 4.194 + 4.195 + //Make the sub-matrices, and pair them up, then spawn processors to 4.196 + // calc product of each pair. 4.197 + makeSubMatricesAndSpawnAndSync( leftMatrix, rightMatrix, 4.198 + slicingStrucCarrier, 4.199 + resultArray, animPr); 4.200 + //The result array will get filled in by the spawned children 4.201 + } 4.202 + 4.203 + 4.204 + //=============== Work done -- send results back ================= 4.205 + 4.206 + 4.207 + //results have been saved into an array that was made outside the VMS 4.208 + // system, by entry-point Fn, and passed in through dividerParams. 4.209 + //So, nothing to do to send results back -- they're seen by side-effect 4.210 + 4.211 + PRINT_DEBUG("end divide\n") 4.212 + 4.213 + VMS__record_interval_end_in_probe( divideProbe ); 4.214 + VMS__print_stats_of_all_probes; 4.215 + 4.216 + VCilk__dissipate_procr( animPr ); //all procrs dissipate self at end 4.217 + //when all of the processors have dissipated, the "create seed and do 4.218 + // work" call in the entry point function returns 4.219 + } 4.220 + 4.221 + 4.222 +SlicingStrucCarrier * 4.223 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 4.224 + VirtProcr *animPr ) 4.225 +{ 4.226 + float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; 4.227 + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 4.228 + SlicingStrucCarrier *slicingStrucCarrier = 4.229 + VCilk__malloc(sizeof(SlicingStrucCarrier), animPr ); 4.230 + 4.231 + int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; 4.232 + float64 numPrimitiveOpsInMinWorkUnit; 4.233 + 4.234 + 4.235 + //======= Calc ideal size of min-sized sub-matrix ======== 4.236 + 4.237 + //ask VCilk for the number of cycles of the minimum work unit, at given 4.238 + // percent overhead then add a guess at overhead from this divider 4.239 + minWorkUnitCycles = VCilk__giveMinWorkUnitCycles( .05 ); 4.240 + 4.241 + //ask VCilk for number of cycles of the "primitive" op of matrix mult 4.242 + primitiveCycles = measureMatrixMultPrimitive( animPr ); 4.243 + 4.244 + numPrimitiveOpsInMinWorkUnit = 4.245 + (float64)minWorkUnitCycles / (float64)primitiveCycles; 4.246 + 4.247 + //take cubed root -- that's number of these in a "side" of sub-matrix 4.248 + // then multiply by 5 because the primitive is 5x5 4.249 + idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); 4.250 + 4.251 + idealNumWorkUnits = VCilk__giveIdealNumWorkUnits(); 4.252 + 4.253 + idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); 4.254 + idealSizeOfSide2 *= 0.6; //finer granularity to help load balance 4.255 + 4.256 + if( idealSizeOfSide1 > idealSizeOfSide2 ) 4.257 + idealSizeOfSide = idealSizeOfSide1; 4.258 + else 4.259 + idealSizeOfSide = idealSizeOfSide2; 4.260 + 4.261 + //The multiply inner loop blocks the array to fit into L1 cache 4.262 +// if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK; 4.263 + 4.264 + //============ Slice up dimensions, now that know target size =========== 4.265 + 4.266 + //Tell the slicer the target size of a side (floating pt), the start 4.267 + // value to start slicing at, and the end value to stop slicing at 4.268 + //It returns an array of start value of each chunk, plus number of them 4.269 + int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol; 4.270 + startLeftRow = 0; 4.271 + endLeftRow = leftMatrix->numRows -1; 4.272 + startVec = 0; 4.273 + endVec = leftMatrix->numCols -1; 4.274 + startRightCol = 0; 4.275 + endRightCol = rightMatrix->numCols -1; 4.276 + 4.277 + leftRowSlices = 4.278 + sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr ); 4.279 + 4.280 + vecSlices = 4.281 + sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr ); 4.282 + 4.283 + rightColSlices = 4.284 + sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr); 4.285 + 4.286 + slicingStrucCarrier->leftRowSlices = leftRowSlices; 4.287 + slicingStrucCarrier->vecSlices = vecSlices; 4.288 + slicingStrucCarrier->rightColSlices = rightColSlices; 4.289 + 4.290 + return slicingStrucCarrier; 4.291 +} 4.292 + 4.293 + 4.294 +void 4.295 +makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, 4.296 + SlicingStrucCarrier *slicingStrucCarrier, 4.297 + float32 *resultArray, VirtProcr *animPr ) 4.298 + { 4.299 + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 4.300 + 4.301 + leftRowSlices = slicingStrucCarrier->leftRowSlices; 4.302 + vecSlices = slicingStrucCarrier->vecSlices; 4.303 + rightColSlices = slicingStrucCarrier->rightColSlices; 4.304 + VCilk__free( slicingStrucCarrier, animPr ); 4.305 + 4.306 + //================ Make sub-matrices, given the slicing ================ 4.307 + SubMatrix **leftSubMatrices, **rightSubMatrices; 4.308 + leftSubMatrices = 4.309 + createSubMatrices( leftRowSlices, vecSlices, 4.310 + leftMatrix, animPr ); 4.311 + rightSubMatrices = 4.312 + createSubMatrices( vecSlices, rightColSlices, 4.313 + rightMatrix, animPr ); 4.314 + 4.315 + //============== pair the sub-matrices and make processors ============== 4.316 + int32 numRowIdxs, numColIdxs, numVecIdxs; 4.317 + 4.318 + numRowIdxs = leftRowSlices->numVals; 4.319 + numColIdxs = rightColSlices->numVals; 4.320 + numVecIdxs = vecSlices->numVals; 4.321 + pairUpSubMatricesAndSpawnAndSync( leftSubMatrices, rightSubMatrices, 4.322 + numRowIdxs, numColIdxs, numVecIdxs, 4.323 + resultArray, 4.324 + animPr ); 4.325 + //It syncs inside, so know all work is done now: free the sub-matrices 4.326 + freeSubMatrices( leftRowSlices, vecSlices, leftSubMatrices, animPr ); 4.327 + freeSubMatrices( vecSlices, rightColSlices, rightSubMatrices, animPr ); 4.328 + 4.329 + 4.330 + freeSlicingStruc( leftRowSlices, animPr ); 4.331 + freeSlicingStruc( vecSlices, animPr ); 4.332 + freeSlicingStruc( rightColSlices, animPr ); 4.333 + } 4.334 + 4.335 + 4.336 + 4.337 + 4.338 +/* numRows*colsPerRow/numCores = numToPutOntoEachCore; 4.339 + * put all from a given row onto same core, until exhaust allotment for that 4.340 + * core 4.341 + * 4.342 + */ 4.343 +void 4.344 +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, 4.345 + SubMatrix **rightSubMatrices, 4.346 + int32 numRowIdxs, int32 numColIdxs, 4.347 + int32 numVecIdxs, 4.348 + float32 *resultArray, 4.349 + VirtProcr *animatingPr ) 4.350 + { 4.351 + int32 resRowIdx, resColIdx; 4.352 + int32 numLeftColIdxs, numRightColIdxs; 4.353 + int32 leftRowIdxOffset; 4.354 + VecParams *vecParams; 4.355 + float32 numToPutOntoEachCore, leftOverFraction; 4.356 + int32 numCores, currCore, numOnCurrCore; 4.357 + 4.358 + numLeftColIdxs = numColIdxs; 4.359 + numRightColIdxs = numVecIdxs; 4.360 + 4.361 + numCores = VCilk__give_number_of_cores_to_spawn_onto(); 4.362 + 4.363 + numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; 4.364 + leftOverFraction = 0; 4.365 + numOnCurrCore = 0; 4.366 + currCore = 0; 4.367 + 4.368 + for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) 4.369 + { 4.370 + leftRowIdxOffset = resRowIdx * numLeftColIdxs; 4.371 + 4.372 + for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) 4.373 + { 4.374 + vecParams = VCilk__malloc( sizeof(VecParams), animatingPr ); 4.375 + 4.376 + vecParams->numVecIdxs = numVecIdxs; 4.377 + vecParams->numRightColIdxs = numRightColIdxs; 4.378 + vecParams->leftRowIdxOffset = leftRowIdxOffset; 4.379 + vecParams->resColIdx = resColIdx; 4.380 + vecParams->leftSubMatrices = leftSubMatrices; 4.381 + vecParams->rightSubMatrices = rightSubMatrices; 4.382 + vecParams->resultArray = resultArray; 4.383 + vecParams->coreToRunOn = currCore; 4.384 + 4.385 + VCilk__spawn( currCore, &calcVectorOfSubMatrices, vecParams, 4.386 + animatingPr ); 4.387 + 4.388 + numOnCurrCore += 1; 4.389 + if( numOnCurrCore + leftOverFraction >= numToPutOntoEachCore - 1 ) 4.390 + { 4.391 + //deal with fractional part, to ensure that imbalance is 1 max 4.392 + // IE, core with most has only 1 more than core with least 4.393 + leftOverFraction += numToPutOntoEachCore - numOnCurrCore; 4.394 + if( leftOverFraction >= 1 ) 4.395 + { leftOverFraction -= 1; 4.396 + numOnCurrCore = -1; 4.397 + } 4.398 + else 4.399 + { numOnCurrCore = 0; 4.400 + } 4.401 + //Move to next core, max core-value to incr to is numCores -1 4.402 + if( currCore >= numCores -1 ) 4.403 + { currCore = 0; 4.404 + } 4.405 + else 4.406 + { currCore += 1; 4.407 + } 4.408 + } 4.409 + } 4.410 + } 4.411 + 4.412 + //Free Note: vector of sub-matrices does its own free-ing, even vec-params 4.413 + 4.414 +//TODO: timeToSpawnProbe = VMS__get_probe_by_name( "timeToSpawnProbe" ); 4.415 +// VMS__end_interval_on_probe( timeToSpawnProbe ); 4.416 + 4.417 + VCilk__sync( animatingPr ); 4.418 + 4.419 + //free the sub-matrices in Fn that called this one 4.420 + } 4.421 + 4.422 + 4.423 +/*Walk through the two slice-strucs, making sub-matrix strucs as go 4.424 + */ 4.425 +SubMatrix ** 4.426 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 4.427 + Matrix *origMatrix, VirtProcr *animPr ) 4.428 + { 4.429 + int32 numRowIdxs, numColIdxs, rowIdx, colIdx; 4.430 + int32 startRow, endRow, startCol, endCol; 4.431 + int32 *rowStartVals, *colStartVals; 4.432 + int32 rowOffset; 4.433 + SubMatrix **subMatrices, *newSubMatrix; 4.434 + 4.435 + numRowIdxs = rowSlices->numVals; 4.436 + numColIdxs = colSlices->numVals; 4.437 + 4.438 + rowStartVals = rowSlices->startVals; 4.439 + colStartVals = colSlices->startVals; 4.440 + 4.441 + subMatrices = VCilk__malloc( numRowIdxs * numColIdxs *sizeof(SubMatrix *), 4.442 + animPr ); 4.443 + 4.444 + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 4.445 + { 4.446 + rowOffset = rowIdx * numColIdxs; 4.447 + 4.448 + startRow = rowStartVals[rowIdx]; 4.449 + endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is 4.450 + // at last valid idx + 1 & is 4.451 + // 1 greater than end value 4.452 + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 4.453 + { 4.454 + startCol = colStartVals[colIdx]; 4.455 + endCol = colStartVals[colIdx + 1] -1; 4.456 + 4.457 + newSubMatrix = VCilk__malloc( sizeof(SubMatrix), animPr ); 4.458 + newSubMatrix->numRows = endRow - startRow +1; 4.459 + newSubMatrix->numCols = endCol - startCol +1; 4.460 + newSubMatrix->origMatrix = origMatrix; 4.461 + newSubMatrix->origStartRow = startRow; 4.462 + newSubMatrix->origStartCol = startCol; 4.463 + newSubMatrix->alreadyCopied = FALSE; 4.464 + 4.465 + subMatrices[ rowOffset + colIdx ] = newSubMatrix; 4.466 + } 4.467 + } 4.468 + return subMatrices; 4.469 + } 4.470 + 4.471 +void 4.472 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 4.473 + SubMatrix **subMatrices, VirtProcr *animPr ) 4.474 + { 4.475 + int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; 4.476 + SubMatrix *subMatrix; 4.477 + 4.478 + numRowIdxs = rowSlices->numVals; 4.479 + numColIdxs = colSlices->numVals; 4.480 + 4.481 + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 4.482 + { 4.483 + rowOffset = rowIdx * numColIdxs; 4.484 + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 4.485 + { 4.486 + subMatrix = subMatrices[ rowOffset + colIdx ]; 4.487 + if( subMatrix->alreadyCopied ) 4.488 + VCilk__free( subMatrix->array, animPr ); 4.489 + VCilk__free( subMatrix, animPr ); 4.490 + } 4.491 + } 4.492 + VCilk__free( subMatrices, animPr ); 4.493 + } 4.494 + 4.495 + 4.496 + 4.497 +SlicingStruc * 4.498 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 4.499 + VirtProcr *animPr ) 4.500 + { float32 residualAcc = 0; 4.501 + int numSlices, i, *startVals, sizeOfSlice, endCondition; 4.502 + SlicingStruc *slicingStruc = VCilk__malloc( sizeof(SlicingStruc), animPr); 4.503 + 4.504 + //calc size of matrix need to hold start vals -- 4.505 + numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); 4.506 + 4.507 + startVals = VCilk__malloc( (numSlices + 1) * sizeof(int32), animPr ); 4.508 + 4.509 + //Calc the upper limit of start value -- when get above this, end loop 4.510 + // by saving highest value of the matrix dimension to access, plus 1 4.511 + // as the start point of the imaginary slice following the last one 4.512 + //Plus 1 because go up to value but not include when process last slice 4.513 + //The stopping condition is half-a-size less than highest value because 4.514 + // don't want any pieces smaller than half the ideal size -- just tack 4.515 + // little ones onto end of last one 4.516 + endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size 4.517 + for( i = 0; startVal <= endVal; i++ ) 4.518 + { 4.519 + startVals[i] = startVal; 4.520 + residualAcc += idealSizeOfSide; 4.521 + sizeOfSlice = (int)residualAcc; 4.522 + residualAcc -= (float32)sizeOfSlice; 4.523 + startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. 4.524 + 4.525 + if( startVal > endCondition ) 4.526 + { startVal = endVal + 1; 4.527 + startVals[ i + 1 ] = startVal; 4.528 + } 4.529 + } 4.530 + 4.531 + slicingStruc->startVals = startVals; 4.532 + slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 4.533 + // which means is num sub-matrices in dim 4.534 + // also == idx of the fake start just above 4.535 + return slicingStruc; 4.536 + } 4.537 + 4.538 +void 4.539 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ) 4.540 + { 4.541 + VCilk__free( slicingStruc->startVals, animPr ); 4.542 + VCilk__free( slicingStruc, animPr ); 4.543 + } 4.544 + 4.545 + 4.546 +int inline 4.547 +measureMatrixMultPrimitive( VirtProcr *animPr ) 4.548 + { 4.549 + int r, c, v, numCycles; 4.550 + float32 *res, *left, *right; 4.551 + 4.552 + //setup inputs 4.553 + left = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); 4.554 + right = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); 4.555 + res = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); 4.556 + 4.557 + for( r = 0; r < 5; r++ ) 4.558 + { 4.559 + for( c = 0; c < 5; c++ ) 4.560 + { 4.561 + left[ r * 5 + c ] = r; 4.562 + right[ r * 5 + c ] = c; 4.563 + } 4.564 + } 4.565 + 4.566 + //do primitive 4.567 + VCilk__start_primitive(); //for now, just takes time stamp 4.568 + for( r = 0; r < 5; r++ ) 4.569 + { 4.570 + for( c = 0; c < 5; c++ ) 4.571 + { 4.572 + for( v = 0; v < 5; v++ ) 4.573 + { 4.574 + res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; 4.575 + } 4.576 + } 4.577 + } 4.578 + numCycles = 4.579 + VCilk__end_primitive_and_give_cycles(); 4.580 + 4.581 + VCilk__free( left, animPr ); 4.582 + VCilk__free( right, animPr ); 4.583 + VCilk__free( res, animPr ); 4.584 + 4.585 + return numCycles; 4.586 + }
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 5.2 +++ b/src/Application/VCilk__Matrix_Mult/EntryPoint.c Sat Oct 30 20:43:49 2010 -0700 5.3 @@ -0,0 +1,58 @@ 5.4 +/* 5.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 5.6 + * Licensed under GNU General Public License version 2 5.7 + * 5.8 + * Author: seanhalle@yahoo.com 5.9 + * 5.10 + */ 5.11 + 5.12 +#include <math.h> 5.13 + 5.14 +#include "VCilk__Matrix_Mult.h" 5.15 + 5.16 + 5.17 + 5.18 +/*Every VCilk system has an "entry point" function that creates the first 5.19 + * processor, which starts the chain of creating more processors.. 5.20 + * eventually all of the processors will dissipate themselves, and 5.21 + * return. 5.22 + * 5.23 + *This entry-point function follows the same pattern as all entry-point 5.24 + * functions do: 5.25 + *1) it creates the params for the seed processor, from the 5.26 + * parameters passed into the entry-point function 5.27 + *2) it calls VCilk__create_seed_procr_and_do_work 5.28 + *3) it gets the return value from the params struc, frees the params struc, 5.29 + * and returns the value from the function 5.30 + * 5.31 + */ 5.32 +Matrix * 5.33 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) 5.34 + { Matrix *resMatrix; 5.35 + DividerParams *dividerParams; 5.36 + int32 numResRows, numResCols; 5.37 + 5.38 + 5.39 + dividerParams = malloc( sizeof( DividerParams ) ); 5.40 + dividerParams->leftMatrix = leftMatrix; 5.41 + dividerParams->rightMatrix = rightMatrix; 5.42 + 5.43 + numResRows = leftMatrix->numRows; 5.44 + numResCols = rightMatrix->numCols; 5.45 + resMatrix = malloc( sizeof(Matrix) ); 5.46 + resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); 5.47 + resMatrix->numCols = rightMatrix->numCols; 5.48 + resMatrix->numRows = leftMatrix->numRows; 5.49 + 5.50 + 5.51 + dividerParams->resultMatrix = resMatrix; 5.52 + 5.53 + //create divider processor, start doing the work, and wait till done 5.54 + //This function is the "border crossing" between normal code and VCilk 5.55 + VCilk__create_seed_procr_and_do_work( ÷WorkIntoSubMatrixPairProcrs, 5.56 + dividerParams ); 5.57 + 5.58 + //get result matrix and return it 5.59 + free( dividerParams ); 5.60 + return resMatrix; 5.61 + }
6.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 6.2 +++ b/src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h Sat Oct 30 20:43:49 2010 -0700 6.3 @@ -0,0 +1,103 @@ 6.4 +/* 6.5 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 6.6 + * Licensed under GNU General Public License version 2 6.7 + */ 6.8 + 6.9 +#ifndef _VCilk_MATRIX_MULT_H_ 6.10 +#define _VCilk_MATRIX_MULT_H_ 6.11 + 6.12 +#include <stdio.h> 6.13 + 6.14 +#include "../../VCilk_lib/VCilk.h" 6.15 +#include "../Matrix_Mult.h" 6.16 + 6.17 + 6.18 +//=============================== Defines ============================== 6.19 +#define ROWS_IN_BLOCK 32 6.20 +#define COLS_IN_BLOCK 32 6.21 +#define VEC_IN_BLOCK 32 6.22 + 6.23 + 6.24 +//#define PRINT_DEBUG(msg) //printf(msg); fflush(stdin); 6.25 + 6.26 +//============================== Structures ============================== 6.27 +typedef struct 6.28 + { 6.29 + Matrix *leftMatrix; 6.30 + Matrix *rightMatrix; 6.31 + Matrix *resultMatrix; 6.32 + 6.33 + TSCount numTSCsToExe; 6.34 + } 6.35 +DividerParams; 6.36 + 6.37 +typedef struct 6.38 + { 6.39 + VirtProcr *dividerPr; 6.40 + int numRows; 6.41 + int numCols; 6.42 + int numSubMatrixPairs; 6.43 + } 6.44 +ResultsParams; 6.45 + 6.46 +typedef 6.47 +struct 6.48 + { int32 numRows; 6.49 + int32 numCols; 6.50 + Matrix *origMatrix; 6.51 + int32 origStartRow; 6.52 + int32 origStartCol; 6.53 + int32 alreadyCopied; 6.54 + float32 *array; //2D, but dynamically sized, so use addr arith 6.55 + } 6.56 +SubMatrix; 6.57 + 6.58 +typedef struct 6.59 + { VirtProcr *resultPr; 6.60 + SubMatrix *leftSubMatrix; 6.61 + SubMatrix *rightSubMatrix; 6.62 + float32 *resultArray; 6.63 + } 6.64 +SMPairParams; 6.65 + 6.66 +typedef 6.67 +struct 6.68 + { int32 numVals; 6.69 + int32 *startVals; 6.70 + } 6.71 +SlicingStruc; 6.72 + 6.73 +typedef 6.74 +struct 6.75 + { 6.76 + SlicingStruc *leftRowSlices; 6.77 + SlicingStruc *vecSlices; 6.78 + SlicingStruc *rightColSlices; 6.79 + } 6.80 +SlicingStrucCarrier; 6.81 + 6.82 +typedef struct 6.83 + { 6.84 + int32 numVecIdxs; 6.85 + int32 numRightColIdxs; 6.86 + int32 leftRowIdxOffset; 6.87 + int32 resColIdx; 6.88 + SubMatrix **leftSubMatrices; 6.89 + SubMatrix **rightSubMatrices; 6.90 + float32 *resultArray; 6.91 + int32 coreToRunOn; 6.92 + } 6.93 +VecParams; 6.94 + 6.95 +//============================= Processor Functions ========================= 6.96 +void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr ); 6.97 +void calcSubMatrixProduct( void *data, VirtProcr *animatingPr ); 6.98 +void calcVectorOfSubMatrices( void *_vecParams, VirtProcr *animatingPr ); 6.99 + 6.100 + 6.101 +//================================ Entry Point ============================== 6.102 +Matrix * 6.103 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); 6.104 + 6.105 + 6.106 +#endif /*_VCilk_MATRIX_MULT_H_*/
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 7.2 +++ b/src/Application/VCilk__Matrix_Mult/Vector_Pr.c Sat Oct 30 20:43:49 2010 -0700 7.3 @@ -0,0 +1,121 @@ 7.4 +/* 7.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 7.6 + * Licensed under GNU General Public License version 2 7.7 + * 7.8 + * Author: seanhalle@yahoo.com 7.9 + * 7.10 + */ 7.11 + 7.12 + 7.13 +#include "VCilk__Matrix_Mult.h" 7.14 +#include <math.h> 7.15 + 7.16 + 7.17 +void inline 7.18 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, 7.19 + int32 startRow, 7.20 + int32 numRows, 7.21 + int32 startCol, 7.22 + int32 numCols, 7.23 + int32 numOrigCols ); 7.24 + 7.25 + 7.26 +//=========================================================================== 7.27 + 7.28 +void 7.29 +calcVectorOfSubMatrices( void *_vecParams, VirtProcr *animPr ) 7.30 + { int32 numVecIdxs, leftRowIdxOffset, numRightColIdxs, resColIdx; 7.31 + SubMatrix **leftSubMatrices, **rightSubMatrices; 7.32 + float32 *resultArray; 7.33 + int32 vecIdx, coreWithAffinity; 7.34 + SMPairParams *subMatrixPairParams, **vecOfSubMatrixParams; 7.35 + VecParams *vecParams; 7.36 + 7.37 + vecParams = (VecParams *)_vecParams; 7.38 + 7.39 + int32 subMatrixVectorProbe = 7.40 + VMS__create_single_interval_probe( "subMtxVect", animPr ); 7.41 + VMS__record_sched_choice_into_probe( subMatrixVectorProbe, animPr ); 7.42 + VMS__record_interval_start_in_probe( subMatrixVectorProbe ); 7.43 + 7.44 + 7.45 + numVecIdxs = vecParams->numVecIdxs; 7.46 + numRightColIdxs = vecParams->numRightColIdxs; 7.47 + leftRowIdxOffset = vecParams->leftRowIdxOffset; 7.48 + resColIdx = vecParams->resColIdx; 7.49 + leftSubMatrices = vecParams->leftSubMatrices; 7.50 + rightSubMatrices = vecParams->rightSubMatrices; 7.51 + resultArray = vecParams->resultArray; 7.52 + coreWithAffinity = vecParams->coreToRunOn; 7.53 + 7.54 + vecOfSubMatrixParams = VCilk__malloc( numVecIdxs * sizeof(SMPairParams *), 7.55 + animPr ); 7.56 + if( vecOfSubMatrixParams == 0 ){printf("malloc error"); exit(1);} 7.57 + 7.58 + for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) 7.59 + { 7.60 + //Make the processor for the pair of sub-matrices 7.61 + subMatrixPairParams = VCilk__malloc(sizeof(SMPairParams),animPr); 7.62 + subMatrixPairParams->leftSubMatrix = 7.63 + leftSubMatrices[ leftRowIdxOffset + vecIdx ]; 7.64 + 7.65 + subMatrixPairParams->rightSubMatrix = 7.66 + rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; 7.67 + 7.68 + VCilk__spawn( coreWithAffinity, &calcSubMatrixProduct, 7.69 + subMatrixPairParams, animPr ); 7.70 + 7.71 + vecOfSubMatrixParams[ vecIdx ] = subMatrixPairParams; 7.72 + } 7.73 + 7.74 + VCilk__sync( animPr ); 7.75 + 7.76 + //now accumulate individual result matrices into final result matrix 7.77 + for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) 7.78 + { 7.79 + subMatrixPairParams = vecOfSubMatrixParams[ vecIdx ]; 7.80 + 7.81 + accumulateResult( resultArray, subMatrixPairParams->resultArray, 7.82 + subMatrixPairParams->leftSubMatrix->origStartRow, 7.83 + subMatrixPairParams->leftSubMatrix->numRows, 7.84 + subMatrixPairParams->rightSubMatrix->origStartCol, 7.85 + subMatrixPairParams->rightSubMatrix->numCols, 7.86 + subMatrixPairParams->rightSubMatrix->origMatrix->numCols); 7.87 + 7.88 + //Note, resultArray is made on the core that produces the results 7.89 + // that gives chance to set affinity so all in vector run on same 7.90 + // core and re-use that array, and prevents writes from causing 7.91 + // thrashing of the cache -- as long as array big enough, the copy 7.92 + // overhead is miniscule vs the size-of-side reuse of each byte 7.93 + VCilk__free( subMatrixPairParams->resultArray, animPr ); 7.94 + VCilk__free( subMatrixPairParams, animPr ); 7.95 + } 7.96 + VCilk__free( vecOfSubMatrixParams, animPr ); 7.97 + VCilk__free( vecParams, animPr ); 7.98 + 7.99 + VMS__record_interval_end_in_probe( subMatrixVectorProbe ); 7.100 + 7.101 + VCilk__dissipate_procr( animPr ); 7.102 + } 7.103 + 7.104 + 7.105 + 7.106 +void inline 7.107 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, 7.108 + int32 startRow, 7.109 + int32 numRows, 7.110 + int32 startCol, 7.111 + int32 numCols, 7.112 + int32 numOrigCols ) 7.113 + { int32 row, col; 7.114 + 7.115 + for( row = 0; row < numRows; row++ ) 7.116 + { 7.117 + for( col = 0; col < numCols; col++ ) 7.118 + { 7.119 + resultArray[ (row + startRow) * numOrigCols + col + startCol ] += 7.120 + subMatrixResultArray[ row * numCols + col ]; 7.121 + } 7.122 + } 7.123 + 7.124 + }
8.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 8.2 +++ b/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c Sat Oct 30 20:43:49 2010 -0700 8.3 @@ -0,0 +1,269 @@ 8.4 +/* 8.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 8.6 + * Licensed under GNU General Public License version 2 8.7 + * 8.8 + * Author: SeanHalle@yahoo.com 8.9 + * 8.10 + */ 8.11 + 8.12 +#include "VCilk__Matrix_Mult.h" 8.13 + 8.14 + 8.15 +void inline 8.16 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 8.17 + 8.18 +void inline 8.19 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 8.20 + 8.21 +void inline 8.22 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 8.23 + float32 *resArray, 8.24 + int startRow, int endRow, 8.25 + int startCol, int endCol, 8.26 + int startVec, int endVec, 8.27 + int resStride, int inpStride ); 8.28 + 8.29 +void inline 8.30 +multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols, 8.31 + float32 *leftArray, float32 *rightArray, 8.32 + float32 *resArray ); 8.33 + 8.34 + 8.35 +/*A processor is created with an environment that holds two matrices, 8.36 + * the row and col that it owns, and the name of a result gathering 8.37 + * processor. 8.38 + *It calculates the product of two sub-portions of the input matrices 8.39 + * by using Intel's mkl library for single-core. 8.40 + * 8.41 + *This demonstrates using optimized single-threaded code inside scheduled 8.42 + * work-units. 8.43 + * 8.44 + *When done, it sends the result to the result processor 8.45 + */ 8.46 +void 8.47 +calcSubMatrixProduct( void *data, VirtProcr *animatingPr ) 8.48 + { 8.49 + SMPairParams *params; 8.50 + VirtProcr *resultPr; 8.51 + float32 *leftArray, *rightArray, *resArray; 8.52 + SubMatrix *leftSubMatrix, *rightSubMatrix; 8.53 + 8.54 + PRINT_DEBUG("start sub-matrix mult\n") 8.55 + int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx", 8.56 + animatingPr); 8.57 + VMS__record_sched_choice_into_probe( subMatrixProbe, animatingPr ); 8.58 + VMS__record_interval_start_in_probe( subMatrixProbe ); 8.59 + 8.60 + params = (SMPairParams *)data; 8.61 +// resultPr = params->resultPr; 8.62 + leftSubMatrix = params->leftSubMatrix; 8.63 + rightSubMatrix = params->rightSubMatrix; 8.64 + 8.65 + //make sure the input sub-matrices have been copied out of orig 8.66 + copyFromOrig( leftSubMatrix, animatingPr ); 8.67 + copyTransposeFromOrig( rightSubMatrix, animatingPr ); 8.68 + 8.69 + leftArray = leftSubMatrix->array; 8.70 + rightArray = rightSubMatrix->array; 8.71 + 8.72 + //make this array here, on the core that computes the results 8.73 + // with Cilk's semantics, have to have separate result array for each 8.74 + // spawned processor -- unless want to change the spawn and sync 8.75 + // pattern, such that spawn one from each vector, then sync, then 8.76 + // another, and so forth -- this will cause idle time due to imbalance 8.77 + // in matrix sizes 8.78 + //This also gives chance to set affinity so all in vector run on same 8.79 + // core and re-use the accumulation array, 8.80 + //As a side-benefit, it also prevents writes from causing 8.81 + // thrashing of the cache -- as long as array big enough, the copy 8.82 + // overhead is small because each byte is reused size-of-side times 8.83 + //This is freed in the vector processor 8.84 + resArray = VCilk__malloc(leftSubMatrix->numRows * rightSubMatrix->numCols* 8.85 + sizeof( float32 ), animatingPr ); 8.86 + 8.87 + 8.88 + int32 numResRows, numResCols, vectLength; 8.89 + 8.90 + vectLength = leftSubMatrix->numCols; 8.91 + numResRows = leftSubMatrix->numRows; 8.92 + numResCols = rightSubMatrix->numCols; 8.93 + 8.94 + multiplyMatrixArrays( vectLength, numResRows, numResCols, 8.95 + leftArray, rightArray, 8.96 + resArray ); 8.97 + 8.98 + //send result by side-effect 8.99 + params->resultArray = resArray; 8.100 + 8.101 + VMS__record_interval_end_in_probe( subMatrixProbe ); 8.102 + 8.103 + VCilk__dissipate_procr( animatingPr ); 8.104 + } 8.105 + 8.106 + 8.107 +/*Divides into 32x32 sub-matrices, 3 of which fit into 32KB L1 cache 8.108 + * Would be nice to embed this within another level that divided into 8.109 + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache 8.110 + * 8.111 + *Eventually want these divisions to be automatic, using DKU pattern 8.112 + * embedded into VCilk, and with VMS controlling the divisions according to 8.113 + * the cache sizes, which it knows about. 8.114 + *And, want VMS to work with language to split among main-mems, so a socket 8.115 + * only cranks on data in its local segment of main mem 8.116 + * 8.117 + */ 8.118 +void inline 8.119 +multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols, 8.120 + float32 *leftArray, float32 *rightArray, 8.121 + float32 *resArray ) 8.122 + { 8.123 + int resStride, inpStride; 8.124 + int startRow, startCol, endRow, endCol, startVec, endVec; 8.125 + 8.126 + resStride = numResCols; 8.127 + inpStride = vecLength; 8.128 + 8.129 + for( startRow = 0; startRow < numResRows; ) 8.130 + { 8.131 + endRow = startRow + ROWS_IN_BLOCK; 8.132 + if( endRow > numResRows ) endRow = numResRows; 8.133 + 8.134 + for( startCol = 0; startCol < numResCols; ) 8.135 + { 8.136 + endCol = startCol + COLS_IN_BLOCK; 8.137 + if( endCol > numResCols ) endCol = numResCols; 8.138 + 8.139 + for( startVec = 0; startVec < vecLength; ) 8.140 + { 8.141 + endVec = startVec + VEC_IN_BLOCK; 8.142 + if( endVec > vecLength ) endVec = vecLength; 8.143 + 8.144 + //By having the "vector" of sub-blocks in a sub-block slice 8.145 + // be marched down in inner loop, are re-using the result 8.146 + // matrix, which stays in L1 cache -- can only re-use one of 8.147 + // the three, so this is the most important -- avoids writing 8.148 + // dirty blocks until those result-locations fully done 8.149 + //Row and Col is position in result matrix -- so row and vec 8.150 + // for left array, then vec and col for right array 8.151 + multiplySubBlocksTransposed( leftArray, rightArray, 8.152 + resArray, 8.153 + startRow, endRow, 8.154 + startCol, endCol, 8.155 + startVec, endVec, 8.156 + resStride, inpStride ); 8.157 + startVec = endVec; 8.158 + } 8.159 + startCol = endCol; 8.160 + } 8.161 + startRow = endRow; 8.162 + } 8.163 + } 8.164 + 8.165 + 8.166 +void inline 8.167 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 8.168 + float32 *resArray, 8.169 + int startRow, int endRow, 8.170 + int startCol, int endCol, 8.171 + int startVec, int endVec, 8.172 + int resStride, int inpStride ) 8.173 + { 8.174 + int row, col, vec; 8.175 + int leftOffset, rightOffset; 8.176 + float32 result; 8.177 + 8.178 + for( row = startRow; row < endRow; row++ ) 8.179 + { 8.180 + for( col = startCol; col < endCol; col++ ) 8.181 + { 8.182 + leftOffset = row * inpStride;//left & right inp strides always same 8.183 + rightOffset = col * inpStride;// because right is transposed 8.184 + result = 0; 8.185 + for( vec = startVec; vec < endVec; vec++ ) 8.186 + { 8.187 + result += 8.188 + leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; 8.189 + } 8.190 + 8.191 + resArray[ row * resStride + col ] += result; 8.192 + } 8.193 + } 8.194 + } 8.195 + 8.196 +/*Note: don't do the copy when create, because create on a different core -- 8.197 + * do the copy on the core that will do the calcs, then try to keep work on 8.198 + * that core that reuses the array data 8.199 + */ 8.200 +void inline 8.201 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 8.202 + { int numCols, numRows, origStartRow, origStartCol, origStride, stride; 8.203 + Matrix *origMatrix; 8.204 + float32 *origArray, *subArray; 8.205 + 8.206 + if( subMatrix->alreadyCopied ) return; 8.207 + 8.208 + subMatrix->alreadyCopied = TRUE; 8.209 + 8.210 + origMatrix = subMatrix->origMatrix; 8.211 + origArray = origMatrix->array; 8.212 + numCols = subMatrix->numCols; 8.213 + numRows = subMatrix->numRows; 8.214 + stride = numRows; 8.215 + origStartRow = subMatrix->origStartRow; 8.216 + origStartCol = subMatrix->origStartCol; 8.217 + origStride = origMatrix->numCols; 8.218 + 8.219 + //This is free in Divide pr after all calcs are done 8.220 + subArray = VCilk__malloc( numRows * numCols * sizeof(float32),animPr); 8.221 + subMatrix->array = subArray; 8.222 + 8.223 + //copy values from orig matrix to local 8.224 + int row, col, origOffset; 8.225 + for( row = 0; row < numRows; row++ ) 8.226 + { 8.227 + origOffset = (row + origStartRow) * origStride + origStartCol; 8.228 + for( col = 0; col < numCols; col++ ) 8.229 + { 8.230 + //transpose means swap row & col -- traverse orig matrix normally 8.231 + // but put into reversed place in local array -- means the 8.232 + // stride is the num rows now, so col * numRows + row 8.233 + subArray[ col * stride + row ] = origArray[ origOffset + col ]; 8.234 + } 8.235 + } 8.236 + } 8.237 + 8.238 +void inline 8.239 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 8.240 + { int numCols, numRows, origStartRow, origStartCol, stride, origStride; 8.241 + Matrix *origMatrix; 8.242 + float32 *origArray, *subArray; 8.243 + 8.244 + if( subMatrix->alreadyCopied ) return; 8.245 + 8.246 + subMatrix->alreadyCopied = TRUE; 8.247 + 8.248 + origMatrix = subMatrix->origMatrix; 8.249 + origArray = origMatrix->array; 8.250 + numCols = subMatrix->numCols; 8.251 + numRows = subMatrix->numRows; 8.252 + origStartRow = subMatrix->origStartRow; 8.253 + origStartCol = subMatrix->origStartCol; 8.254 + stride = numCols; 8.255 + origStride = origMatrix->numCols; 8.256 + 8.257 + //This is freed in Divide pr after all calcs are done 8.258 + subArray = VCilk__malloc( numRows * numCols *sizeof(float32),animPr); 8.259 + subMatrix->array = subArray; 8.260 + 8.261 + //copy values from orig matrix to local 8.262 + int row, col, offset, origOffset; 8.263 + for( row = 0; row < numRows; row++ ) 8.264 + { 8.265 + offset = row * stride; 8.266 + origOffset = (row + origStartRow) * origStride + origStartCol; 8.267 + for( col = 0; col < numCols; col++ ) 8.268 + { 8.269 + subArray[ offset + col ] = origArray[ origOffset + col ]; 8.270 + } 8.271 + } 8.272 + }
9.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 9.2 +++ b/src/Application/main.c Sat Oct 30 20:43:49 2010 -0700 9.3 @@ -0,0 +1,34 @@ 9.4 +/* 9.5 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 9.6 + * Licensed under GNU General Public License version 2 9.7 + * 9.8 + * author seanhalle@yahoo.com 9.9 + */ 9.10 + 9.11 +#include <malloc.h> 9.12 +#include <stdlib.h> 9.13 + 9.14 +#include "Matrix_Mult.h" 9.15 +#include "VCilk__Matrix_Mult/VCilk__Matrix_Mult.h" 9.16 + 9.17 +/** 9.18 + *Matrix multiply program written using VMS_HW piggy-back language 9.19 + * 9.20 + */ 9.21 +int main( int argc, char **argv ) 9.22 + { Matrix *leftMatrix, *rightMatrix, *resultMatrix; 9.23 + ParamBag *paramBag; 9.24 + 9.25 + paramBag = makeParamBag(); 9.26 + readParamFileIntoBag( argv[1], paramBag ); 9.27 + initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); 9.28 + 9.29 + resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix ); 9.30 + 9.31 + printf("\nresult matrix: \n"); 9.32 + printMatrix( resultMatrix ); 9.33 + 9.34 +// VCilk__print_stats(); 9.35 + 9.36 + exit(0); //cleans up 9.37 + }
