Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > Vthread > Vthread__Blocked_Matrix_Mult__Bench
changeset 0:8d14fe28a782
Initial add -- copied from SSR version
| author | SeanHalle |
|---|---|
| date | Wed, 10 Nov 2010 22:26:57 -0800 |
| parents | |
| children | 133633d1c10f |
| files | .hgignore src/Application/Matrix_Mult.c src/Application/Matrix_Mult.h src/Application/VPThread__Matrix_Mult/Divide_Pr.c src/Application/VPThread__Matrix_Mult/EntryPoint.c src/Application/VPThread__Matrix_Mult/Result_Pr.c src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c src/Application/main.c |
| diffstat | 9 files changed, 1446 insertions(+), 0 deletions(-) [+] |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/.hgignore Wed Nov 10 22:26:57 2010 -0800 1.3 @@ -0,0 +1,5 @@ 1.4 +nbproject 1.5 +dist 1.6 +build 1.7 +Makefile 1.8 +.dep.inc
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 2.2 +++ b/src/Application/Matrix_Mult.c Wed Nov 10 22:26:57 2010 -0800 2.3 @@ -0,0 +1,167 @@ 2.4 +/* 2.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 2.6 + * Licensed under GNU General Public License version 2 2.7 + * 2.8 + * Author: seanhalle@yahoo.com 2.9 + * 2.10 + * Created on November 15, 2009, 2:35 AM 2.11 + */ 2.12 + 2.13 +#include <malloc.h> 2.14 +#include <stdlib.h> 2.15 + 2.16 +#include "Matrix_Mult.h" 2.17 +#include "ParamHelper/Param.h" 2.18 + 2.19 + 2.20 + 2.21 + void 2.22 +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 2.23 + ParamBag *paramBag ) 2.24 + { char *leftMatrixFileName, *rightMatrixFileName; 2.25 + int leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols; 2.26 + 2.27 + ParamStruc *param; 2.28 + param = getParamFromBag( "leftMatrixRows", paramBag ); 2.29 + leftMatrixRows = param->intValue; 2.30 + param = getParamFromBag( "leftMatrixCols", paramBag ); 2.31 + leftMatrixCols = param->intValue; 2.32 + *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols ); 2.33 + 2.34 + param = getParamFromBag( "leftMatrixFileName", paramBag ); 2.35 + leftMatrixFileName = param->strValue; //no need to copy 2.36 + read_Matrix_From_File( *leftMatrix, leftMatrixFileName ); 2.37 + 2.38 + param = getParamFromBag( "rightMatrixRows", paramBag ); 2.39 + rightMatrixRows = param->intValue; 2.40 + param = getParamFromBag( "rightMatrixCols", paramBag ); 2.41 + rightMatrixCols = param->intValue; 2.42 + *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols ); 2.43 + 2.44 + param = getParamFromBag( "rightMatrixFileName", paramBag ); 2.45 + rightMatrixFileName = param->strValue; 2.46 + read_Matrix_From_File( *rightMatrix, rightMatrixFileName ); 2.47 + } 2.48 + 2.49 + 2.50 +void parseLineIntoRow( char *line, float32* row ); 2.51 + 2.52 + 2.53 + void 2.54 +read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ) 2.55 + { int row, maxRead, numRows, numCols; 2.56 + float32 *matrixStart; 2.57 + size_t lineSz = 0; 2.58 + FILE *file; 2.59 + char *line = NULL; 2.60 + 2.61 + lineSz = 50000; //max length of line in a matrix data file 2.62 + line = (char *) malloc( lineSz ); 2.63 + if( line == NULL ) printf( "no mem for matrix line" ); 2.64 + 2.65 + numRows = matrixStruc->numRows; 2.66 + numCols = matrixStruc->numCols; 2.67 + matrixStart = matrixStruc->array; 2.68 + 2.69 + file = fopen( matrixFileName, "r" ); 2.70 + if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} 2.71 + fseek( file, 0, SEEK_SET ); 2.72 + for( row = 0; row < numRows; row++ ) 2.73 + { 2.74 + if( feof( file ) ) printf( "file ran out too soon" ); 2.75 + maxRead = getline( &line, &lineSz, file ); 2.76 + if( maxRead == -1 ) printf( "prob reading mat line"); 2.77 + 2.78 + if( *line == '\n') continue; //blank line 2.79 + if( *line == '/' ) continue; //comment line 2.80 + 2.81 + parseLineIntoRow( line, matrixStart + row * numCols ); 2.82 + } 2.83 + free( line ); 2.84 + } 2.85 + 2.86 +/*This function relies on each line having the proper number of cols. It 2.87 + * doesn't check, nor enforce, so if the file is improperly formatted it 2.88 + * can write over unrelated memory 2.89 + */ 2.90 + void 2.91 +parseLineIntoRow( char *line, float32* row ) 2.92 + { 2.93 + char *valueStr, *searchPos; 2.94 + 2.95 + //read the float values 2.96 + searchPos = valueStr = line; //start 2.97 + 2.98 + for( ; *searchPos != 0; searchPos++) //bit dangerous, should use buff len 2.99 + { 2.100 + if( *searchPos == '\n' ) //last col.. relying on well-formatted file 2.101 + { *searchPos = 0; 2.102 + *row = atof( valueStr ); 2.103 + break; //end FOR loop 2.104 + } 2.105 + if( *searchPos == ',' ) 2.106 + { *searchPos = 0; //mark end of string 2.107 + *row = (float32) atof( valueStr ); 2.108 + row += 1; //address arith 2.109 + //skip any spaces before digits.. use searchPos + 1 to skip the 0 2.110 + for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++); 2.111 + valueStr = searchPos + 1; 2.112 + } 2.113 + } 2.114 + } 2.115 + 2.116 + //========================================================================== 2.117 + 2.118 +/*In the "_Flat" version of constructor, do only malloc of the top data struc 2.119 + * and set values in that top-level. Don't malloc any sub-structures. 2.120 + */ 2.121 + Matrix * 2.122 +makeMatrix_Flat( int32 numRows, int32 numCols ) 2.123 + { Matrix * retMatrix; 2.124 + retMatrix = malloc( sizeof( Matrix ) ); 2.125 + retMatrix->numRows = numRows; 2.126 + retMatrix->numCols = numCols; 2.127 + 2.128 + return retMatrix; 2.129 + } 2.130 + 2.131 + Matrix * 2.132 +makeMatrix_WithResMat( int32 numRows, int32 numCols ) 2.133 + { Matrix * retMatrix; 2.134 + retMatrix = malloc( sizeof( Matrix ) ); 2.135 + retMatrix->numRows = numRows; 2.136 + retMatrix->numCols = numCols; 2.137 + retMatrix->array = malloc( numRows * numCols * sizeof(float32) ); 2.138 + 2.139 + return retMatrix; 2.140 + } 2.141 + 2.142 + void 2.143 +freeMatrix_Flat( Matrix * matrix ) 2.144 + { //( matrix ); 2.145 + } 2.146 + void 2.147 +freeMatrix( Matrix * matrix ) 2.148 + { free( matrix->array ); 2.149 + free( matrix ); 2.150 + } 2.151 + 2.152 +void 2.153 +printMatrix( Matrix *matrix ) 2.154 + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; 2.155 + float32 *matrixArray; 2.156 + 2.157 + numRows = rowsToPrint = matrix->numRows; 2.158 + numCols = colsToPrint = matrix->numCols; 2.159 + matrixArray = matrix->array; 2.160 + 2.161 + rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed 2.162 + colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed 2.163 + for( r = 0; r < numRows; r += rowIncr ) 2.164 + { for( c = 0; c < numCols; c += colIncr ) 2.165 + { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); 2.166 + } 2.167 + printf("\n"); 2.168 + } 2.169 + } 2.170 +
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 3.2 +++ b/src/Application/Matrix_Mult.h Wed Nov 10 22:26:57 2010 -0800 3.3 @@ -0,0 +1,77 @@ 3.4 +/* 3.5 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 3.6 + * Licensed under GNU General Public License version 2 3.7 + */ 3.8 + 3.9 +#ifndef MATRIX_MULT_H_ 3.10 +#define MATRIX_MULT_H_ 3.11 + 3.12 +#include <stdio.h> 3.13 +#include <unistd.h> 3.14 +#include <malloc.h> 3.15 + 3.16 +#include "../SSR_lib/VMS/VMS_primitive_data_types.h" 3.17 +#include "ParamHelper/Param.h" 3.18 + 3.19 +//============================== Structures ============================== 3.20 + 3.21 +typedef 3.22 +struct 3.23 + { int32 numRows; 3.24 + int32 numCols; 3.25 + float32 *array; //2D, but dynamically sized, so use addr arith 3.26 + } 3.27 +Matrix; 3.28 + 3.29 +/* This is the "appSpecificPiece" that is carried inside a DKUPiece. 3.30 + * In the DKUPiece data struc it is declared to be of type "void *". This 3.31 + * allows the application to define any data structure it wants and put it 3.32 + * into a DKUPiece. 3.33 + * When the app specific info is used, it is in app code, so it is cast to 3.34 + * the correct type to tell the compiler how to access fields. 3.35 + * This keeps all app-specific things out of the DKU directory, as per the 3.36 + * DKU standard. */ 3.37 +typedef 3.38 +struct 3.39 + { 3.40 + // pointers to shared data.. the result matrix must be created when the 3.41 + // left and right matrices are put into the root ancestor DKUPiece. 3.42 + Matrix * leftMatrix; 3.43 + Matrix * rightMatrix; 3.44 + Matrix * resultMatrix; 3.45 + 3.46 + // define the starting and ending boundaries for this piece of the 3.47 + // result matrix. These are derivable from the left and right 3.48 + // matrices, but included them for readability of code. 3.49 + int prodStartRow, prodEndRow; 3.50 + int prodStartCol, prodEndCol; 3.51 + // Start and end of the portion of the left matrix that contributes to 3.52 + // this piece of the product 3.53 + int leftStartRow, leftEndRow; 3.54 + int leftStartCol, leftEndCol; 3.55 + // Start and end of the portion of the right matrix that contributes to 3.56 + // this piece of the product 3.57 + int rightStartRow, rightEndRow; 3.58 + int rightStartCol, rightEndCol; 3.59 + } 3.60 +MatrixProdPiece; 3.61 + 3.62 +//============================== Functions ================================ 3.63 +void readFile(); 3.64 + 3.65 +Matrix *makeMatrix( int32 numRows, int32 numCols ); 3.66 +Matrix *makeMatrix_Flat( int32 numRows, int32 numCols ); 3.67 +Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols ); 3.68 +void freeMatrix_Flat( Matrix * matrix ); 3.69 +void freeMatrix( Matrix * matrix ); 3.70 +void printMatrix( Matrix *matrix ); 3.71 + 3.72 +void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ); 3.73 + 3.74 +void 3.75 +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 3.76 + ParamBag *paramBag ); 3.77 + 3.78 +//=========================================================================== 3.79 + 3.80 +#endif /*MATRIX_MULT_H_*/
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 4.2 +++ b/src/Application/VPThread__Matrix_Mult/Divide_Pr.c Wed Nov 10 22:26:57 2010 -0800 4.3 @@ -0,0 +1,597 @@ 4.4 +/* 4.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 4.6 + * Licensed under GNU General Public License version 2 4.7 + * 4.8 + * Author: seanhalle@yahoo.com 4.9 + * 4.10 + */ 4.11 + 4.12 + 4.13 +#include "SSR_Matrix_Mult.h" 4.14 +#include <math.h> 4.15 +#include <string.h> 4.16 + 4.17 + //The time to compute this many result values should equal the time to 4.18 + // perform this division on a matrix of size gives that many result calcs 4.19 + //IE, size this so that sequential time to calc equals divide time 4.20 + // find the value by experimenting -- but divide time and calc time scale 4.21 + // same way, so this value should remain valid across hardware 4.22 +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000 4.23 + 4.24 + 4.25 +//=========================================================================== 4.26 +int inline 4.27 +measureMatrixMultPrimitive( VirtProcr *animPr ); 4.28 + 4.29 +SlicingStrucCarrier * 4.30 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 4.31 + VirtProcr *animPr ); 4.32 + 4.33 +SlicingStruc * 4.34 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 4.35 + VirtProcr *animPr ); 4.36 + 4.37 +void 4.38 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ); 4.39 + 4.40 +SubMatrix ** 4.41 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 4.42 + int32 numUses, Matrix *origMatrix, VirtProcr *animPr ); 4.43 + 4.44 +void 4.45 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 4.46 + SubMatrix **subMatrices, VirtProcr *animPr ); 4.47 + 4.48 +void 4.49 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, 4.50 + SubMatrix **rightSubMatrices, 4.51 + int32 numRowIdxs, int32 numColIdxs, 4.52 + int32 numVecIdxs, 4.53 + VirtProcr *resultPr, 4.54 + VirtProcr *animatingPr ); 4.55 + 4.56 +void 4.57 +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, 4.58 + SlicingStrucCarrier *slicingStrucCarrier, 4.59 + VirtProcr *resultPr, VirtProcr *animatingPr ); 4.60 + 4.61 + 4.62 + 4.63 +/*Divider creates one processor for every sub-matrix 4.64 + * It hands them: 4.65 + * the name of the result processor that they should send their results to, 4.66 + * the left and right matrices, and the rows and cols they should multiply 4.67 + * It first creates the result processor, then all the sub-matrixPair 4.68 + * processors, 4.69 + * then does a receive of a message from the result processor that gives 4.70 + * the divider ownership of the result matrix. 4.71 + * Finally, the divider returns the result matrix out of the SSR system. 4.72 + * 4.73 + * Divider chooses the size of sub-matrices via an algorithm that tries to 4.74 + * keep the minimum work above a threshold. The threshold is machine- 4.75 + * dependent, so ask SSR for min work-unit time to get a 4.76 + * given overhead 4.77 + * 4.78 + * Divide min work-unit cycles by measured-cycles for one matrix-cell 4.79 + * product -- gives the number of products need to have in min size 4.80 + * matrix. 4.81 + * 4.82 + * So then, take cubed root of this to get the size of a side of min sub- 4.83 + * matrix. That is the size of the ideal square sub-matrix -- so tile 4.84 + * up the two input matrices into ones as close as possible to that size, 4.85 + * and create the pairs of sub-matrices. 4.86 + * 4.87 + *======================== STRATEGIC OVERVIEW ======================= 4.88 + * 4.89 + *This division is a bit tricky, because have to create things in advance 4.90 + * that it's not at first obvious need to be created.. 4.91 + * 4.92 + *First slice up each dimension -- three of them.. this is because will have 4.93 + * to create the sub-matrix's data-structures before pairing the sub-matrices 4.94 + * with each other -- so, have three dimensions to slice up before can 4.95 + * create the sub-matrix data-strucs -- also, have to be certain that the 4.96 + * cols of the left input have the exact same slicing as the rows of the 4.97 + * left matrix, so just to be sure, do the slicing calc once, then use it 4.98 + * for both. 4.99 + * 4.100 + *So, goes like this: 4.101 + *1) calculate the start & end values of each dimension in each matrix. 4.102 + *2) use those values to create sub-matrix structures 4.103 + *3) combine sub-matrices into pairs, as the tasks to perform. 4.104 + * 4.105 + *Have to calculate separately from creating the sub-matrices because of the 4.106 + * nature of the nesting -- would either end up creating the same sub-matrix 4.107 + * multiple times, or else would have to put in detection of whether had 4.108 + * made a particular one already if tried to combine steps 1 and 2. 4.109 + * 4.110 + *Step 3 has to be separate because of the nesting, as well -- same reason, 4.111 + * would either create same sub-matrix multiple times, or else have to 4.112 + * add detection of whether was already created. 4.113 + * 4.114 + *Another way to look at it: there's one level of loop to divide dimensions, 4.115 + * two levels of nesting to create sub-matrices, and three levels to pair 4.116 + * up the sub-matrices. 4.117 + */ 4.118 + 4.119 +void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, 4.120 + VirtProcr *animPr ) 4.121 + { VirtProcr *resultPr; 4.122 + DividerParams *dividerParams; 4.123 + ResultsParams *resultsParams; 4.124 + Matrix *leftMatrix, *rightMatrix, *resultMatrix; 4.125 + void *msg; 4.126 + SlicingStrucCarrier *slicingStrucCarrier; 4.127 + float32 *resultArray; //points to array inside result matrix 4.128 + 4.129 + DEBUG( dbgAppFlow, "start divide\n") 4.130 + 4.131 + int32 4.132 + divideProbe = VMS__create_single_interval_probe( "divideProbe", 4.133 + animPr ); 4.134 + VMS__record_sched_choice_into_probe( divideProbe, animPr ); 4.135 + VMS__record_interval_start_in_probe( divideProbe ); 4.136 + 4.137 + //=========== Setup -- make local copies of ptd-to-things, malloc, aso 4.138 + int32 numResRows, numResCols, vectLength; 4.139 + 4.140 + dividerParams = (DividerParams *)_dividerParams; 4.141 + 4.142 + leftMatrix = dividerParams->leftMatrix; 4.143 + rightMatrix = dividerParams->rightMatrix; 4.144 + 4.145 + vectLength = leftMatrix->numCols; 4.146 + numResRows = leftMatrix->numRows; 4.147 + numResCols = rightMatrix->numCols; 4.148 + resultArray = dividerParams->resultMatrix->array; 4.149 + 4.150 + //zero the result array 4.151 + memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); 4.152 + 4.153 + //============== Do either sequential mult or do division ============== 4.154 + 4.155 + //Check if input matrices too small -- if yes, just do sequential 4.156 + //Cutoff is determined by overhead of this divider -- relatively 4.157 + // machine-independent 4.158 + if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * 4.159 + (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) 4.160 + { 4.161 + //====== Do sequential multiply on a single core 4.162 + DEBUG( dbgAppFlow, "doing sequential") 4.163 + 4.164 + //transpose the right matrix 4.165 + float32 * 4.166 + transRightArray = SSR__malloc_to( rightMatrix->numRows * 4.167 + rightMatrix->numCols * sizeof(float32), 4.168 + animPr ); 4.169 + 4.170 + //copy values from orig matrix to local 4.171 + copyTranspose( rightMatrix->numRows, rightMatrix->numCols, 4.172 + 0, 0, rightMatrix->numRows, 4.173 + transRightArray, rightMatrix->array ); 4.174 + 4.175 + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 4.176 + leftMatrix->array, transRightArray, 4.177 + resultArray ); 4.178 + } 4.179 + else 4.180 + { 4.181 + //====== Do parallel multiply across cores 4.182 + 4.183 + //Calc the ideal size of sub-matrix and slice up the dimensions of 4.184 + // the two matrices. 4.185 + //The ideal size is the one takes the number of cycles to calculate 4.186 + // such that calc time is equal or greater than min work-unit size 4.187 + slicingStrucCarrier = 4.188 + calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr ); 4.189 + 4.190 + //Make the results processor, now that know how many to wait for 4.191 + resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr ); 4.192 + resultsParams->numSubMatrixPairs = 4.193 + slicingStrucCarrier->leftRowSlices->numVals * 4.194 + slicingStrucCarrier->rightColSlices->numVals * 4.195 + slicingStrucCarrier->vecSlices->numVals; 4.196 + resultsParams->dividerPr = animPr; 4.197 + resultsParams->numCols = rightMatrix->numCols; 4.198 + resultsParams->numRows = leftMatrix->numRows; 4.199 + resultsParams->resultArray = resultArray; 4.200 + 4.201 + 4.202 + resultPr = 4.203 + SSR__create_procr_with( &gatherResults, resultsParams, animPr); 4.204 + 4.205 + //Make the sub-matrices, and pair them up, and make processor to 4.206 + // calc product of each pair. 4.207 + makeSubMatricesAndProcrs( leftMatrix, rightMatrix, 4.208 + slicingStrucCarrier, 4.209 + resultPr, animPr); 4.210 + 4.211 + //result array is allocated externally, so no message from resultPr 4.212 + // however, do have to wait before printing out stats, so wait 4.213 + // for an empty handshake message 4.214 + msg = SSR__receive_from_to( resultPr, animPr ); 4.215 + } 4.216 + 4.217 + 4.218 + //=============== Work done -- send results back ================= 4.219 + 4.220 + 4.221 + DEBUG( dbgAppFlow, "end divide\n") 4.222 + 4.223 + VMS__record_interval_end_in_probe( divideProbe ); 4.224 + VMS__print_stats_of_all_probes(); 4.225 + 4.226 + //nothing left to do so dissipate, SSR will wait to shutdown and hence 4.227 + // make results available to outside until all the processors have 4.228 + // dissipated -- so no need to wait for results processor 4.229 + 4.230 + SSR__dissipate_procr( animPr ); //all procrs dissipate self at end 4.231 + //when all of the processors have dissipated, the "create seed and do 4.232 + // work" call in the entry point function returns 4.233 + } 4.234 + 4.235 + 4.236 +SlicingStrucCarrier * 4.237 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 4.238 + VirtProcr *animPr ) 4.239 + { 4.240 + float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; 4.241 + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 4.242 + SlicingStrucCarrier *slicingStrucCarrier = 4.243 + SSR__malloc_to(sizeof(SlicingStrucCarrier), animPr); 4.244 + 4.245 + int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; 4.246 + float64 numPrimitiveOpsInMinWorkUnit; 4.247 + 4.248 + 4.249 + //======= Calc ideal size of min-sized sub-matrix ======== 4.250 + 4.251 + //ask SSR for the number of cycles of the minimum work unit, at given 4.252 + // percent overhead then add a guess at overhead from this divider 4.253 + minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 ); 4.254 + 4.255 + //ask SSR for number of cycles of the "primitive" op of matrix mult 4.256 + primitiveCycles = measureMatrixMultPrimitive( animPr ); 4.257 + 4.258 + numPrimitiveOpsInMinWorkUnit = 4.259 + (float64)minWorkUnitCycles / (float64)primitiveCycles; 4.260 + 4.261 + //take cubed root -- that's number of these in a "side" of sub-matrix 4.262 + // then multiply by 5 because the primitive is 5x5 4.263 + idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); 4.264 + 4.265 + idealNumWorkUnits = SSR__giveIdealNumWorkUnits(); 4.266 + 4.267 + idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); 4.268 + idealSizeOfSide2 *= 0.6; //finer granularity to help load balance 4.269 + 4.270 + if( idealSizeOfSide1 > idealSizeOfSide2 ) 4.271 + idealSizeOfSide = idealSizeOfSide1; 4.272 + else 4.273 + idealSizeOfSide = idealSizeOfSide2; 4.274 + 4.275 + //The multiply inner loop blocks the array to fit into L1 cache 4.276 +// if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK; 4.277 + 4.278 + //============ Slice up dimensions, now that know target size =========== 4.279 + 4.280 + //Tell the slicer the target size of a side (floating pt), the start 4.281 + // value to start slicing at, and the end value to stop slicing at 4.282 + //It returns an array of start value of each chunk, plus number of them 4.283 + int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol; 4.284 + startLeftRow = 0; 4.285 + endLeftRow = leftMatrix->numRows -1; 4.286 + startVec = 0; 4.287 + endVec = leftMatrix->numCols -1; 4.288 + startRightCol = 0; 4.289 + endRightCol = rightMatrix->numCols -1; 4.290 + 4.291 + leftRowSlices = 4.292 + sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr ); 4.293 + 4.294 + vecSlices = 4.295 + sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr ); 4.296 + 4.297 + rightColSlices = 4.298 + sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr); 4.299 + 4.300 + slicingStrucCarrier->leftRowSlices = leftRowSlices; 4.301 + slicingStrucCarrier->vecSlices = vecSlices; 4.302 + slicingStrucCarrier->rightColSlices = rightColSlices; 4.303 + 4.304 + return slicingStrucCarrier; 4.305 + } 4.306 + 4.307 + 4.308 +void 4.309 +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, 4.310 + SlicingStrucCarrier *slicingStrucCarrier, 4.311 + VirtProcr *resultPr, VirtProcr *animPr ) 4.312 + { 4.313 + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 4.314 + 4.315 + leftRowSlices = slicingStrucCarrier->leftRowSlices; 4.316 + vecSlices = slicingStrucCarrier->vecSlices; 4.317 + rightColSlices = slicingStrucCarrier->rightColSlices; 4.318 + SSR__free( slicingStrucCarrier, animPr ); 4.319 + 4.320 + //================ Make sub-matrices, given the slicing ================ 4.321 + SubMatrix **leftSubMatrices, **rightSubMatrices; 4.322 + leftSubMatrices = 4.323 + createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals, 4.324 + leftMatrix, animPr ); 4.325 + //double_check_that_always_numRows_in_right_same_as_numCols_in_left(); 4.326 + rightSubMatrices = 4.327 + createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals, 4.328 + rightMatrix, animPr ); 4.329 + 4.330 + freeSlicingStruc( leftRowSlices, animPr ); 4.331 + freeSlicingStruc( vecSlices, animPr ); 4.332 + freeSlicingStruc( rightColSlices, animPr ); 4.333 + 4.334 + //============== pair the sub-matrices and make processors ============== 4.335 + int32 numRowIdxs, numColIdxs, numVecIdxs; 4.336 + 4.337 + numRowIdxs = leftRowSlices->numVals; 4.338 + numColIdxs = rightColSlices->numVals; 4.339 + numVecIdxs = vecSlices->numVals; 4.340 + pairUpSubMatricesAndMakeProcessors( leftSubMatrices, 4.341 + rightSubMatrices, 4.342 + numRowIdxs, numColIdxs, 4.343 + numVecIdxs, 4.344 + resultPr, 4.345 + animPr ); 4.346 + } 4.347 + 4.348 + 4.349 + 4.350 + 4.351 +void 4.352 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, 4.353 + SubMatrix **rightSubMatrices, 4.354 + int32 numRowIdxs, int32 numColIdxs, 4.355 + int32 numVecIdxs, 4.356 + VirtProcr *resultPr, 4.357 + VirtProcr *animatingPr ) 4.358 + { 4.359 + int32 resRowIdx, resColIdx, vecIdx; 4.360 + int32 numLeftColIdxs, numRightColIdxs; 4.361 + int32 leftRowIdxOffset; 4.362 + SMPairParams *subMatrixPairParams; 4.363 + float32 numToPutOntoEachCore, leftOverFraction; 4.364 + int32 numCores, coreToScheduleOnto, numVecOnCurrCore; 4.365 + 4.366 + numLeftColIdxs = numColIdxs; 4.367 + numRightColIdxs = numVecIdxs; 4.368 + 4.369 + numCores = SSR__give_number_of_cores_to_schedule_onto(); 4.370 + 4.371 + numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; 4.372 + leftOverFraction = 0; 4.373 + numVecOnCurrCore = 0; 4.374 + coreToScheduleOnto = 0; 4.375 + 4.376 + for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) 4.377 + { 4.378 + leftRowIdxOffset = resRowIdx * numLeftColIdxs; 4.379 + 4.380 + for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) 4.381 + { 4.382 + 4.383 + for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) 4.384 + { 4.385 + //Make the processor for the pair of sub-matrices 4.386 + subMatrixPairParams = SSR__malloc_to( sizeof(SMPairParams), 4.387 + animatingPr); 4.388 + subMatrixPairParams->leftSubMatrix = 4.389 + leftSubMatrices[ leftRowIdxOffset + vecIdx ]; 4.390 + 4.391 + subMatrixPairParams->rightSubMatrix = 4.392 + rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; 4.393 + 4.394 + subMatrixPairParams->resultPr = resultPr; 4.395 + 4.396 + //put all pairs from the same vector onto same core 4.397 + SSR__create_procr_with_affinity( &calcSubMatrixProduct, 4.398 + subMatrixPairParams, 4.399 + animatingPr, 4.400 + coreToScheduleOnto ); 4.401 + } 4.402 + 4.403 + //Trying to distribute the subMatrix-vectors across the cores, so 4.404 + // that each core gets the same number of vectors, with a max 4.405 + // imbalance of 1 vector more on some cores than others 4.406 + numVecOnCurrCore += 1; 4.407 + if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 ) 4.408 + { 4.409 + //deal with fractional part, to ensure that imbalance is 1 max 4.410 + // IE, core with most has only 1 more than core with least 4.411 + leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore; 4.412 + if( leftOverFraction >= 1 ) 4.413 + { leftOverFraction -= 1; 4.414 + numVecOnCurrCore = -1; 4.415 + } 4.416 + else 4.417 + { numVecOnCurrCore = 0; 4.418 + } 4.419 + //Move to next core, max core-value to incr to is numCores -1 4.420 + if( coreToScheduleOnto >= numCores -1 ) 4.421 + { coreToScheduleOnto = 0; 4.422 + } 4.423 + else 4.424 + { coreToScheduleOnto += 1; 4.425 + } 4.426 + } 4.427 + 4.428 + } 4.429 + } 4.430 + 4.431 + } 4.432 + 4.433 + 4.434 + 4.435 +/*Walk through the two slice-strucs, making sub-matrix strucs as go 4.436 + */ 4.437 +SubMatrix ** 4.438 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 4.439 + int32 numUses, Matrix *origMatrix, VirtProcr *animPr ) 4.440 + { 4.441 + int32 numRowIdxs, numColIdxs, rowIdx, colIdx; 4.442 + int32 startRow, endRow, startCol, endCol; 4.443 + int32 *rowStartVals, *colStartVals; 4.444 + int32 rowOffset; 4.445 + SubMatrix **subMatrices, *newSubMatrix; 4.446 + 4.447 + numRowIdxs = rowSlices->numVals; 4.448 + numColIdxs = colSlices->numVals; 4.449 + 4.450 + rowStartVals = rowSlices->startVals; 4.451 + colStartVals = colSlices->startVals; 4.452 + 4.453 + subMatrices = SSR__malloc_to(numRowIdxs * numColIdxs * sizeof(SubMatrix*), 4.454 + animPr ); 4.455 + 4.456 + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 4.457 + { 4.458 + rowOffset = rowIdx * numColIdxs; 4.459 + 4.460 + startRow = rowStartVals[rowIdx]; 4.461 + endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is 4.462 + // at last valid idx + 1 & is 4.463 + // 1 greater than end value 4.464 + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 4.465 + { 4.466 + startCol = colStartVals[colIdx]; 4.467 + endCol = colStartVals[colIdx + 1] -1; 4.468 + 4.469 + newSubMatrix = SSR__malloc_to( sizeof(SubMatrix), animPr ); 4.470 + newSubMatrix->numRows = endRow - startRow +1; 4.471 + newSubMatrix->numCols = endCol - startCol +1; 4.472 + newSubMatrix->origMatrix = origMatrix; 4.473 + newSubMatrix->origStartRow = startRow; 4.474 + newSubMatrix->origStartCol = startCol; 4.475 + newSubMatrix->alreadyCopied = FALSE; 4.476 + newSubMatrix->numUsesLeft = numUses; //can free after this many 4.477 + 4.478 + subMatrices[ rowOffset + colIdx ] = newSubMatrix; 4.479 + } 4.480 + } 4.481 + return subMatrices; 4.482 + } 4.483 + 4.484 + 4.485 +void 4.486 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 4.487 + SubMatrix **subMatrices, VirtProcr *animPr ) 4.488 + { 4.489 + int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; 4.490 + SubMatrix *subMatrix; 4.491 + 4.492 + numRowIdxs = rowSlices->numVals; 4.493 + numColIdxs = colSlices->numVals; 4.494 + 4.495 + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 4.496 + { 4.497 + rowOffset = rowIdx * numColIdxs; 4.498 + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 4.499 + { 4.500 + subMatrix = subMatrices[ rowOffset + colIdx ]; 4.501 + if( subMatrix->alreadyCopied ) 4.502 + SSR__free( subMatrix->array, animPr ); 4.503 + SSR__free( subMatrix, animPr ); 4.504 + } 4.505 + } 4.506 + SSR__free( subMatrices, animPr ); 4.507 + } 4.508 + 4.509 + 4.510 + 4.511 +SlicingStruc * 4.512 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 4.513 + VirtProcr *animPr ) 4.514 + { float32 residualAcc = 0; 4.515 + int numSlices, i, *startVals, sizeOfSlice, endCondition; 4.516 + SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr); 4.517 + 4.518 + //calc size of matrix need to hold start vals -- 4.519 + numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); 4.520 + 4.521 + startVals = SSR__malloc_to( (numSlices + 1) * sizeof(int32), animPr ); 4.522 + 4.523 + //Calc the upper limit of start value -- when get above this, end loop 4.524 + // by saving highest value of the matrix dimension to access, plus 1 4.525 + // as the start point of the imaginary slice following the last one 4.526 + //Plus 1 because go up to value but not include when process last slice 4.527 + //The stopping condition is half-a-size less than highest value because 4.528 + // don't want any pieces smaller than half the ideal size -- just tack 4.529 + // little ones onto end of last one 4.530 + endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size 4.531 + for( i = 0; startVal <= endVal; i++ ) 4.532 + { 4.533 + startVals[i] = startVal; 4.534 + residualAcc += idealSizeOfSide; 4.535 + sizeOfSlice = (int)residualAcc; 4.536 + residualAcc -= (float32)sizeOfSlice; 4.537 + startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. 4.538 + 4.539 + if( startVal > endCondition ) 4.540 + { startVal = endVal + 1; 4.541 + startVals[ i + 1 ] = startVal; 4.542 + } 4.543 + } 4.544 + 4.545 + slicingStruc->startVals = startVals; 4.546 + slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 4.547 + // which means is num sub-matrices in dim 4.548 + // also == idx of the fake start just above 4.549 + return slicingStruc; 4.550 + } 4.551 + 4.552 +void 4.553 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ) 4.554 + { 4.555 + SSR__free( slicingStruc->startVals, animPr ); 4.556 + SSR__free( slicingStruc, animPr ); 4.557 + } 4.558 + 4.559 + 4.560 +int inline 4.561 +measureMatrixMultPrimitive( VirtProcr *animPr ) 4.562 + { 4.563 + int r, c, v, numCycles; 4.564 + float32 *res, *left, *right; 4.565 + 4.566 + //setup inputs 4.567 + left = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); 4.568 + right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); 4.569 + res = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); 4.570 + 4.571 + for( r = 0; r < 5; r++ ) 4.572 + { 4.573 + for( c = 0; c < 5; c++ ) 4.574 + { 4.575 + left[ r * 5 + c ] = r; 4.576 + right[ r * 5 + c ] = c; 4.577 + } 4.578 + } 4.579 + 4.580 + //do primitive 4.581 + SSR__start_primitive(); //for now, just takes time stamp 4.582 + for( r = 0; r < 5; r++ ) 4.583 + { 4.584 + for( c = 0; c < 5; c++ ) 4.585 + { 4.586 + for( v = 0; v < 5; v++ ) 4.587 + { 4.588 + res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; 4.589 + } 4.590 + } 4.591 + } 4.592 + numCycles = 4.593 + SSR__end_primitive_and_give_cycles(); 4.594 + 4.595 + SSR__free( left, animPr ); 4.596 + SSR__free( right, animPr ); 4.597 + SSR__free( res, animPr ); 4.598 + 4.599 + return numCycles; 4.600 + }
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 5.2 +++ b/src/Application/VPThread__Matrix_Mult/EntryPoint.c Wed Nov 10 22:26:57 2010 -0800 5.3 @@ -0,0 +1,62 @@ 5.4 +/* 5.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 5.6 + * Licensed under GNU General Public License version 2 5.7 + * 5.8 + * Author: seanhalle@yahoo.com 5.9 + * 5.10 + */ 5.11 + 5.12 +#include <math.h> 5.13 + 5.14 +#include "SSR_Matrix_Mult.h" 5.15 + 5.16 + 5.17 + 5.18 +/*Every SSR system has an "entry point" function that creates the first 5.19 + * processor, which starts the chain of creating more processors.. 5.20 + * eventually all of the processors will dissipate themselves, and 5.21 + * return. 5.22 + * 5.23 + *This entry-point function follows the same pattern as all entry-point 5.24 + * functions do: 5.25 + *1) it creates the params for the seed processor, from the 5.26 + * parameters passed into the entry-point function 5.27 + *2) it calls SSR__create_seed_procr_and_do_work 5.28 + *3) it gets the return value from the params struc, frees the params struc, 5.29 + * and returns the value from the function 5.30 + * 5.31 + */ 5.32 +Matrix * 5.33 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) 5.34 + { Matrix *resMatrix; 5.35 + DividerParams *dividerParams; 5.36 + int32 numResRows, numResCols; 5.37 + 5.38 + 5.39 + dividerParams = malloc( sizeof( DividerParams ) ); 5.40 + dividerParams->leftMatrix = leftMatrix; 5.41 + dividerParams->rightMatrix = rightMatrix; 5.42 + 5.43 + 5.44 + numResRows = leftMatrix->numRows; 5.45 + numResCols = rightMatrix->numCols; 5.46 + 5.47 + //VMS has its own separate internal malloc, so to get results out, 5.48 + // have to pass in empty array for it to fill up 5.49 + //The alternative is internally telling SSR make external space to use 5.50 + resMatrix = malloc( sizeof(Matrix) ); 5.51 + resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); 5.52 + resMatrix->numCols = rightMatrix->numCols; 5.53 + resMatrix->numRows = leftMatrix->numRows; 5.54 + 5.55 + 5.56 + dividerParams->resultMatrix = resMatrix; 5.57 + 5.58 + //create divider processor, start doing the work, and wait till done 5.59 + //This function is the "border crossing" between normal code and SSR 5.60 + SSR__create_seed_procr_and_do_work( ÷WorkIntoSubMatrixPairProcrs, 5.61 + dividerParams ); 5.62 + 5.63 + free( dividerParams ); 5.64 + return resMatrix; 5.65 + }
6.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 6.2 +++ b/src/Application/VPThread__Matrix_Mult/Result_Pr.c Wed Nov 10 22:26:57 2010 -0800 6.3 @@ -0,0 +1,108 @@ 6.4 +/* 6.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 6.6 + * Licensed under GNU General Public License version 2 6.7 + * 6.8 + * Author: seanhalle@yahoo.com 6.9 + * 6.10 + */ 6.11 + 6.12 +#include "SSR_Matrix_Mult.h" 6.13 + 6.14 +//===================== 6.15 +void inline 6.16 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, 6.17 + int32 startRow, 6.18 + int32 numRows, 6.19 + int32 startCol, 6.20 + int32 numCols, 6.21 + int32 numOrigCols ); 6.22 + 6.23 +//=========================================================================== 6.24 + 6.25 +/*The Result Processor gets a message from each of the vector processors, 6.26 + * puts the result from the message in its location in the result- 6.27 + * matrix, and increments the count of results. 6.28 + * 6.29 + *After the count reaches the point that all results have been received, it 6.30 + * returns the result matrix and dissipates. 6.31 + */ 6.32 +void gatherResults( void *_params, VirtProcr *animatingPr ) 6.33 + { VirtProcr *dividerPr; 6.34 + ResultsParams *params; 6.35 + int row, col, numRows, numCols, numSubMatrixPairs, count=0; 6.36 + float32 *resultArray; 6.37 + void *msg; 6.38 + SMPairParams *resParams; 6.39 + 6.40 + DEBUG( dbgAppFlow, "start resultPr\n") 6.41 + 6.42 + params = (ResultsParams *)_params; 6.43 + dividerPr = params->dividerPr; 6.44 + numSubMatrixPairs = params->numSubMatrixPairs; 6.45 + numRows = params->numRows; 6.46 + numCols = params->numCols; 6.47 + 6.48 + resultArray = params->resultArray; 6.49 + 6.50 + 6.51 + while( count < numSubMatrixPairs ) 6.52 + { 6.53 + msg = SSR__receive_type_to( RESULTS_MSG, animatingPr ); 6.54 + 6.55 + resParams = (SMPairParams *)msg; 6.56 + accumulateResult( resultArray, resParams->partialResultArray, 6.57 + resParams->leftSubMatrix->origStartRow, 6.58 + resParams->leftSubMatrix->numRows, 6.59 + resParams->rightSubMatrix->origStartCol, 6.60 + resParams->rightSubMatrix->numCols, 6.61 + resParams->rightSubMatrix->origMatrix->numCols ); 6.62 + 6.63 + SSR__free( resParams->partialResultArray, animatingPr ); 6.64 + 6.65 + //there is only one copy of results procr, so can update numUsesLeft 6.66 + // without concurrency worries. When zero, free the sub-matrix 6.67 + resParams->leftSubMatrix->numUsesLeft -= 1; 6.68 + if( resParams->leftSubMatrix->numUsesLeft == 0 ) 6.69 + { 6.70 + SSR__free( resParams->leftSubMatrix->array, animatingPr ); 6.71 + SSR__free( resParams->leftSubMatrix, animatingPr ); 6.72 + } 6.73 + 6.74 + resParams->rightSubMatrix->numUsesLeft -= 1; 6.75 + if( resParams->rightSubMatrix->numUsesLeft == 0 ) 6.76 + { 6.77 + SSR__free( resParams->rightSubMatrix->array, animatingPr ); 6.78 + SSR__free( resParams->rightSubMatrix, animatingPr ); 6.79 + } 6.80 + 6.81 + //count of how many sub-matrix pairs accumulated so know when done 6.82 + count++; 6.83 + } 6.84 + 6.85 + //Done -- could just dissipate -- SSR will wait for all processors to 6.86 + // dissipate before shutting down, and thereby making results avaial to 6.87 + // outside, so no need to stop the divider from dissipating, so no need 6.88 + // to send a hand-shake message to it -- bug makes debug easier 6.89 + SSR__send_from_to( NULL, animatingPr, dividerPr ); 6.90 + SSR__dissipate_procr( animatingPr ); //frees any data owned by procr 6.91 + } 6.92 + 6.93 +void inline 6.94 +accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray, 6.95 + int32 startRow, 6.96 + int32 numRows, 6.97 + int32 startCol, 6.98 + int32 numCols, 6.99 + int32 numOrigCols ) 6.100 + { int32 row, col; 6.101 + 6.102 + for( row = 0; row < numRows; row++ ) 6.103 + { 6.104 + for( col = 0; col < numCols; col++ ) 6.105 + { 6.106 + resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] += 6.107 + subMatrixPairResultArray[ row * numCols + col ]; 6.108 + } 6.109 + } 6.110 + 6.111 + }
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 7.2 +++ b/src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h Wed Nov 10 22:26:57 2010 -0800 7.3 @@ -0,0 +1,95 @@ 7.4 +/* 7.5 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 7.6 + * Licensed under GNU General Public License version 2 7.7 + */ 7.8 + 7.9 +#ifndef _SSR_MATRIX_MULT_H_ 7.10 +#define _SSR_MATRIX_MULT_H_ 7.11 + 7.12 +#include <stdio.h> 7.13 + 7.14 +#include "../../SSR_lib/SSR.h" 7.15 +#include "../Matrix_Mult.h" 7.16 + 7.17 + 7.18 +//=============================== Defines ============================== 7.19 +#define ROWS_IN_BLOCK 32 7.20 +#define COLS_IN_BLOCK 32 7.21 +#define VEC_IN_BLOCK 32 7.22 + 7.23 +#define copyMatrixSingleton 1 7.24 +#define copyTransposeSingleton 2 7.25 + 7.26 +//============================== Structures ============================== 7.27 +typedef struct 7.28 + { 7.29 + Matrix *leftMatrix; 7.30 + Matrix *rightMatrix; 7.31 + Matrix *resultMatrix; 7.32 + } 7.33 +DividerParams; 7.34 + 7.35 +typedef struct 7.36 + { 7.37 + VirtProcr *dividerPr; 7.38 + int numRows; 7.39 + int numCols; 7.40 + int numSubMatrixPairs; 7.41 + float32 *resultArray; 7.42 + } 7.43 +ResultsParams; 7.44 + 7.45 +typedef 7.46 +struct 7.47 + { int32 numRows; 7.48 + int32 numCols; 7.49 + Matrix *origMatrix; 7.50 + int32 origStartRow; 7.51 + int32 origStartCol; 7.52 + int32 alreadyCopied; 7.53 + int32 numUsesLeft; //have update via message to avoid multiple writers 7.54 + float32 *array; //2D, but dynamically sized, so use addr arith 7.55 + } 7.56 +SubMatrix; 7.57 + 7.58 +typedef struct 7.59 + { VirtProcr *resultPr; 7.60 + SubMatrix *leftSubMatrix; 7.61 + SubMatrix *rightSubMatrix; 7.62 + float32 *partialResultArray; 7.63 + } 7.64 +SMPairParams; 7.65 + 7.66 +typedef 7.67 +struct 7.68 + { int32 numVals; 7.69 + int32 *startVals; 7.70 + } 7.71 +SlicingStruc; 7.72 + 7.73 +typedef 7.74 +struct 7.75 + { 7.76 + SlicingStruc *leftRowSlices; 7.77 + SlicingStruc *vecSlices; 7.78 + SlicingStruc *rightColSlices; 7.79 + } 7.80 +SlicingStrucCarrier; 7.81 + 7.82 +enum MMMsgType 7.83 + { 7.84 + RESULTS_MSG = 1 7.85 + }; 7.86 + 7.87 +//============================= Processor Functions ========================= 7.88 +void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr ); 7.89 +void calcSubMatrixProduct( void *data, VirtProcr *animatingPr ); 7.90 +void gatherResults( void *data, VirtProcr *animatingPr ); 7.91 + 7.92 + 7.93 +//================================ Entry Point ============================== 7.94 +Matrix * 7.95 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); 7.96 + 7.97 + 7.98 +#endif /*_SSR_MATRIX_MULT_H_*/
8.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 8.2 +++ b/src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c Wed Nov 10 22:26:57 2010 -0800 8.3 @@ -0,0 +1,299 @@ 8.4 +/* 8.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 8.6 + * Licensed under GNU General Public License version 2 8.7 + * 8.8 + * Author: SeanHalle@yahoo.com 8.9 + * 8.10 + */ 8.11 + 8.12 +#include <string.h> 8.13 + 8.14 +#include "SSR_Matrix_Mult.h" 8.15 + 8.16 + 8.17 + 8.18 +void inline 8.19 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 8.20 + 8.21 +void inline 8.22 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 8.23 + 8.24 +void inline 8.25 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 8.26 + float32 *resArray, 8.27 + int startRow, int endRow, 8.28 + int startCol, int endCol, 8.29 + int startVec, int endVec, 8.30 + int resStride, int inpStride ); 8.31 + 8.32 +void inline 8.33 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, 8.34 + float32 *leftArray, float32 *rightArray, 8.35 + float32 *resArray ); 8.36 + 8.37 + 8.38 +/*A processor is created with an environment that holds two matrices, 8.39 + * the row and col that it owns, and the name of a result gathering 8.40 + * processor. 8.41 + *It calculates the product of two sub-portions of the input matrices 8.42 + * by using Intel's mkl library for single-core. 8.43 + * 8.44 + *This demonstrates using optimized single-threaded code inside scheduled 8.45 + * work-units. 8.46 + * 8.47 + *When done, it sends the result to the result processor 8.48 + */ 8.49 +void 8.50 +calcSubMatrixProduct( void *data, VirtProcr *animatingPr ) 8.51 + { 8.52 + SMPairParams *params; 8.53 + VirtProcr *resultPr; 8.54 + float32 *leftArray, *rightArray, *resArray; 8.55 + SubMatrix *leftSubMatrix, *rightSubMatrix; 8.56 + 8.57 + DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID) 8.58 + int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx", 8.59 + animatingPr); 8.60 + VMS__record_sched_choice_into_probe( subMatrixProbe, animatingPr ); 8.61 + VMS__record_interval_start_in_probe( subMatrixProbe ); 8.62 + 8.63 + params = (SMPairParams *)data; 8.64 + resultPr = params->resultPr; 8.65 + leftSubMatrix = params->leftSubMatrix; 8.66 + rightSubMatrix = params->rightSubMatrix; 8.67 + 8.68 + //make sure the input sub-matrices have been copied out of orig 8.69 + //do it here, inside sub-matrix pair to hopefully gain reuse in cache 8.70 + copyFromOrig( leftSubMatrix, animatingPr ); 8.71 + copyTransposeFromOrig( rightSubMatrix, animatingPr ); 8.72 + 8.73 + leftArray = leftSubMatrix->array; 8.74 + rightArray = rightSubMatrix->array; 8.75 + 8.76 + int32 8.77 + resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); 8.78 + resArray = SSR__malloc_to( resSize, animatingPr ); 8.79 + memset( resArray, 0, resSize ); 8.80 + 8.81 + 8.82 + int32 numResRows, numResCols, vectLength; 8.83 + 8.84 + vectLength = leftSubMatrix->numCols; 8.85 + numResRows = leftSubMatrix->numRows; 8.86 + numResCols = rightSubMatrix->numCols; 8.87 + 8.88 + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 8.89 + leftArray, rightArray, 8.90 + resArray ); 8.91 + 8.92 + //send result to result processor 8.93 + params->partialResultArray = resArray; 8.94 + 8.95 + VMS__record_interval_end_in_probe( subMatrixProbe ); 8.96 + 8.97 + SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr ); 8.98 + SSR__dissipate_procr( animatingPr ); 8.99 + } 8.100 + 8.101 + 8.102 + 8.103 +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into 8.104 + * the 32KB L1 cache. 8.105 + *Would be nice to embed this within another level that divided into 8.106 + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache 8.107 + * 8.108 + *Eventually want these divisions to be automatic, using DKU pattern 8.109 + * embedded into VMS and exposed in the language, and with VMS controlling the 8.110 + * divisions according to the cache sizes, which it knows about. 8.111 + *Also, want VMS to work with language to split among main-mems, so a socket 8.112 + * only cranks on data in its local segment of main mem 8.113 + * 8.114 + *So, outer two loops determine start and end points within the result matrix. 8.115 + * Inside that, a loop dets the start and end points along the shared dimensions 8.116 + * of the two input matrices. 8.117 + */ 8.118 +void inline 8.119 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, 8.120 + int32 numResCols, 8.121 + float32 *leftArray, float32 *rightArray, 8.122 + float32 *resArray ) 8.123 + { 8.124 + int resStride, inpStride; 8.125 + int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; 8.126 + 8.127 + resStride = numResCols; 8.128 + inpStride = vecLength; 8.129 + 8.130 + for( resStartRow = 0; resStartRow < numResRows; ) 8.131 + { 8.132 + resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 8.133 + if( resEndRow > numResRows ) resEndRow = numResRows -1; 8.134 + 8.135 + for( resStartCol = 0; resStartCol < numResCols; ) 8.136 + { 8.137 + resEndCol = resStartCol + COLS_IN_BLOCK -1; 8.138 + if( resEndCol > numResCols ) resEndCol = numResCols -1; 8.139 + 8.140 + for( startVec = 0; startVec < vecLength; ) 8.141 + { 8.142 + endVec = startVec + VEC_IN_BLOCK -1; 8.143 + if( endVec > vecLength ) endVec = vecLength -1; 8.144 + 8.145 + //By having the "vector" of sub-blocks in a sub-block slice 8.146 + // be marched down in inner loop, are re-using the result 8.147 + // matrix, which stays in L1 cache and re-using the left sub-mat 8.148 + // which repeats for each right sub-mat -- can only re-use two of 8.149 + // the three, so result is the most important -- avoids writing 8.150 + // dirty blocks until those result-locations fully done 8.151 + //Row and Col is position in result matrix -- so row and vec 8.152 + // for left array, then vec and col for right array 8.153 + multiplySubBlocksTransposed( leftArray, rightArray, 8.154 + resArray, 8.155 + resStartRow, resEndRow, 8.156 + resStartCol, resEndCol, 8.157 + startVec, endVec, 8.158 + resStride, inpStride ); 8.159 + startVec = endVec +1; 8.160 + } 8.161 + resStartCol = resEndCol +1; 8.162 + } 8.163 + resStartRow = resEndRow +1; 8.164 + } 8.165 + } 8.166 + 8.167 + 8.168 + 8.169 +void inline 8.170 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 8.171 + float32 *resArray, 8.172 + int resStartRow, int resEndRow, 8.173 + int resStartCol, int resEndCol, 8.174 + int startVec, int endVec, 8.175 + int resStride, int inpStride ) 8.176 + { 8.177 + int resRow, resCol, vec; 8.178 + int leftOffset, rightOffset; 8.179 + float32 result; 8.180 + 8.181 + //The result row is used only for the left matrix, res col for the right 8.182 + for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) 8.183 + { 8.184 + for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) 8.185 + { 8.186 + leftOffset = resRow * inpStride;//left & right inp strides always same 8.187 + rightOffset = resCol * inpStride;// because right is transposed 8.188 + result = 0; 8.189 + for( vec = startVec; vec <= endVec; vec++ ) 8.190 + { 8.191 + result += 8.192 + leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; 8.193 + } 8.194 + 8.195 + resArray[ resRow * resStride + resCol ] += result; 8.196 + } 8.197 + } 8.198 + } 8.199 + 8.200 + 8.201 + 8.202 + 8.203 +/*Reuse this in divider when do the sequential multiply case 8.204 + */ 8.205 +void inline 8.206 +copyTranspose( int32 numRows, int32 numCols, 8.207 + int32 origStartRow, int32 origStartCol, int32 origStride, 8.208 + float32 *subArray, float32 *origArray ) 8.209 + { int32 stride = numRows; 8.210 + 8.211 + int row, col, origOffset; 8.212 + for( row = 0; row < numRows; row++ ) 8.213 + { 8.214 + origOffset = (row + origStartRow) * origStride + origStartCol; 8.215 + for( col = 0; col < numCols; col++ ) 8.216 + { 8.217 + //transpose means swap row & col -- traverse orig matrix normally 8.218 + // but put into reversed place in local array -- means the 8.219 + // stride is the numRows now, so col * numRows + row 8.220 + subArray[ col * stride + row ] = origArray[ origOffset + col ]; 8.221 + } 8.222 + } 8.223 + } 8.224 + 8.225 +void inline 8.226 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 8.227 + { int numCols, numRows, origStartRow, origStartCol, origStride, stride; 8.228 + Matrix *origMatrix; 8.229 + float32 *origArray, *subArray; 8.230 + 8.231 + if( subMatrix->alreadyCopied ) return; 8.232 + SSR__start_singleton( copyMatrixSingleton, &&EndOfTransSingleton, animPr); 8.233 + 8.234 + origMatrix = subMatrix->origMatrix; 8.235 + origArray = origMatrix->array; 8.236 + numCols = subMatrix->numCols; 8.237 + numRows = subMatrix->numRows; 8.238 + origStartRow = subMatrix->origStartRow; 8.239 + origStartCol = subMatrix->origStartCol; 8.240 + origStride = origMatrix->numCols; 8.241 + 8.242 + subArray = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr); 8.243 + subMatrix->array = subArray; 8.244 + 8.245 + //copy values from orig matrix to local 8.246 + copyTranspose( numRows, numCols, 8.247 + origStartRow, origStartCol, origStride, 8.248 + subArray, origArray ); 8.249 + 8.250 + subMatrix->alreadyCopied = TRUE; //must be last thing before label 8.251 + EndOfTransSingleton: 8.252 + return; 8.253 + } 8.254 + 8.255 + 8.256 +void inline 8.257 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 8.258 + { int numCols, numRows, origStartRow, origStartCol, stride, origStride; 8.259 + Matrix *origMatrix; 8.260 + float32 *origArray, *subArray; 8.261 + 8.262 + 8.263 + //This lets only a single VP execute the code between start and 8.264 + // end -- using start and end so that work runs outside the master. 8.265 + //Inside, if a second VP ever executes the start, it will be returned 8.266 + // from the end-point. 8.267 + //Note, for non-GCC, can add a second SSR call at the end, and inside 8.268 + // that one, look at the stack at the return addr & save that in an 8.269 + // array indexed by singletonID 8.270 + if( subMatrix->alreadyCopied ) return; 8.271 + SSR__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton, animPr ); 8.272 + 8.273 + 8.274 + origMatrix = subMatrix->origMatrix; 8.275 + origArray = origMatrix->array; 8.276 + numCols = subMatrix->numCols; 8.277 + numRows = subMatrix->numRows; 8.278 + origStartRow = subMatrix->origStartRow; 8.279 + origStartCol = subMatrix->origStartCol; 8.280 + origStride = origMatrix->numCols; 8.281 + 8.282 + subArray = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr); 8.283 + subMatrix->array = subArray; 8.284 + 8.285 + //copy values from orig matrix to local 8.286 + stride = numCols; 8.287 + 8.288 + int row, col, offset, origOffset; 8.289 + for( row = 0; row < numRows; row++ ) 8.290 + { 8.291 + offset = row * stride; 8.292 + origOffset = (row + origStartRow) * origStride + origStartCol; 8.293 + for( col = 0; col < numCols; col++ ) 8.294 + { 8.295 + subArray[ offset + col ] = origArray[ origOffset + col ]; 8.296 + } 8.297 + } 8.298 + 8.299 + subMatrix->alreadyCopied = TRUE; //must be last thing before label 8.300 + EndOfCopySingleton: 8.301 + return; 8.302 + }
9.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 9.2 +++ b/src/Application/main.c Wed Nov 10 22:26:57 2010 -0800 9.3 @@ -0,0 +1,36 @@ 9.4 +/* 9.5 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 9.6 + * Licensed under GNU General Public License version 2 9.7 + * 9.8 + * author seanhalle@yahoo.com 9.9 + */ 9.10 + 9.11 +#include <malloc.h> 9.12 +#include <stdlib.h> 9.13 + 9.14 +#include "Matrix_Mult.h" 9.15 +#include "SSR_Matrix_Mult/SSR_Matrix_Mult.h" 9.16 + 9.17 +/** 9.18 + *Matrix multiply program written using VMS_HW piggy-back language 9.19 + * 9.20 + */ 9.21 +int main( int argc, char **argv ) 9.22 + { Matrix *leftMatrix, *rightMatrix, *resultMatrix; 9.23 + ParamBag *paramBag; 9.24 + 9.25 + printf( "arguments: %s | %s\n", argv[0], argv[1] ); 9.26 + 9.27 + paramBag = makeParamBag(); 9.28 + readParamFileIntoBag( argv[1], paramBag ); 9.29 + initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); 9.30 + 9.31 + resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix ); 9.32 + 9.33 + printf("\nresult matrix: \n"); 9.34 + printMatrix( resultMatrix ); 9.35 + 9.36 +// SSR__print_stats(); 9.37 + 9.38 + exit(0); //cleans up 9.39 + }
