Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > SSR > SSR__Blocked_Matrix_Mult__Bench
changeset 10:387f3084d9bb
Changed dir structure to new project structure
| author | Me@portablequad |
|---|---|
| date | Tue, 07 Feb 2012 14:07:38 -0800 |
| parents | 95c02c4ad998 |
| children | ca572fdc9a80 |
| files | .hgeol Matrix_Mult.c Matrix_Mult.h SSR_Matrix_Mult/Divide_Pr.c SSR_Matrix_Mult/EntryPoint.c SSR_Matrix_Mult/Result_Pr.c SSR_Matrix_Mult/SSR_Matrix_Mult.h SSR_Matrix_Mult/subMatrix_Pr.c main.c src/Application/Matrix_Mult.c src/Application/Matrix_Mult.h src/Application/SSR_Matrix_Mult/Divide_Pr.c src/Application/SSR_Matrix_Mult/EntryPoint.c src/Application/SSR_Matrix_Mult/Result_Pr.c src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h src/Application/SSR_Matrix_Mult/subMatrix_Pr.c src/Application/main.c |
| diffstat | 17 files changed, 1484 insertions(+), 1470 deletions(-) [+] |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/.hgeol Tue Feb 07 14:07:38 2012 -0800 1.3 @@ -0,0 +1,14 @@ 1.4 + 1.5 +[patterns] 1.6 +**.py = native 1.7 +**.txt = native 1.8 +**.c = native 1.9 +**.h = native 1.10 +**.cpp = native 1.11 +**.java = native 1.12 +**.class = bin 1.13 +**.jar = bin 1.14 +**.sh = native 1.15 +**.pl = native 1.16 +**.jpg = bin 1.17 +**.gif = bin
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 2.2 +++ b/Matrix_Mult.c Tue Feb 07 14:07:38 2012 -0800 2.3 @@ -0,0 +1,167 @@ 2.4 +/* 2.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 2.6 + * Licensed under GNU General Public License version 2 2.7 + * 2.8 + * Author: seanhalle@yahoo.com 2.9 + * 2.10 + * Created on November 15, 2009, 2:35 AM 2.11 + */ 2.12 + 2.13 +#include <malloc.h> 2.14 +#include <stdlib.h> 2.15 + 2.16 +#include "Matrix_Mult.h" 2.17 +#include "ParamHelper/Param.h" 2.18 + 2.19 + 2.20 + 2.21 + void 2.22 +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 2.23 + ParamBag *paramBag ) 2.24 + { char *leftMatrixFileName, *rightMatrixFileName; 2.25 + int leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols; 2.26 + 2.27 + ParamStruc *param; 2.28 + param = getParamFromBag( "leftMatrixRows", paramBag ); 2.29 + leftMatrixRows = param->intValue; 2.30 + param = getParamFromBag( "leftMatrixCols", paramBag ); 2.31 + leftMatrixCols = param->intValue; 2.32 + *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols ); 2.33 + 2.34 + param = getParamFromBag( "leftMatrixFileName", paramBag ); 2.35 + leftMatrixFileName = param->strValue; //no need to copy 2.36 + read_Matrix_From_File( *leftMatrix, leftMatrixFileName ); 2.37 + 2.38 + param = getParamFromBag( "rightMatrixRows", paramBag ); 2.39 + rightMatrixRows = param->intValue; 2.40 + param = getParamFromBag( "rightMatrixCols", paramBag ); 2.41 + rightMatrixCols = param->intValue; 2.42 + *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols ); 2.43 + 2.44 + param = getParamFromBag( "rightMatrixFileName", paramBag ); 2.45 + rightMatrixFileName = param->strValue; 2.46 + read_Matrix_From_File( *rightMatrix, rightMatrixFileName ); 2.47 + } 2.48 + 2.49 + 2.50 +void parseLineIntoRow( char *line, float32* row ); 2.51 + 2.52 + 2.53 + void 2.54 +read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ) 2.55 + { int row, maxRead, numRows, numCols; 2.56 + float32 *matrixStart; 2.57 + size_t lineSz = 0; 2.58 + FILE *file; 2.59 + char *line = NULL; 2.60 + 2.61 + lineSz = 50000; //max length of line in a matrix data file 2.62 + line = (char *) malloc( lineSz ); 2.63 + if( line == NULL ) printf( "no mem for matrix line" ); 2.64 + 2.65 + numRows = matrixStruc->numRows; 2.66 + numCols = matrixStruc->numCols; 2.67 + matrixStart = matrixStruc->array; 2.68 + 2.69 + file = fopen( matrixFileName, "r" ); 2.70 + if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} 2.71 + fseek( file, 0, SEEK_SET ); 2.72 + for( row = 0; row < numRows; row++ ) 2.73 + { 2.74 + if( feof( file ) ) printf( "file ran out too soon" ); 2.75 + maxRead = getline( &line, &lineSz, file ); 2.76 + if( maxRead == -1 ) printf( "prob reading mat line"); 2.77 + 2.78 + if( *line == '\n') continue; //blank line 2.79 + if( *line == '/' ) continue; //comment line 2.80 + 2.81 + parseLineIntoRow( line, matrixStart + row * numCols ); 2.82 + } 2.83 + free( line ); 2.84 + } 2.85 + 2.86 +/*This function relies on each line having the proper number of cols. It 2.87 + * doesn't check, nor enforce, so if the file is improperly formatted it 2.88 + * can write over unrelated memory 2.89 + */ 2.90 + void 2.91 +parseLineIntoRow( char *line, float32* row ) 2.92 + { 2.93 + char *valueStr, *searchPos; 2.94 + 2.95 + //read the float values 2.96 + searchPos = valueStr = line; //start 2.97 + 2.98 + for( ; *searchPos != 0; searchPos++) //bit dangerous, should use buff len 2.99 + { 2.100 + if( *searchPos == '\n' ) //last col.. relying on well-formatted file 2.101 + { *searchPos = 0; 2.102 + *row = atof( valueStr ); 2.103 + break; //end FOR loop 2.104 + } 2.105 + if( *searchPos == ',' ) 2.106 + { *searchPos = 0; //mark end of string 2.107 + *row = (float32) atof( valueStr ); 2.108 + row += 1; //address arith 2.109 + //skip any spaces before digits.. use searchPos + 1 to skip the 0 2.110 + for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++); 2.111 + valueStr = searchPos + 1; 2.112 + } 2.113 + } 2.114 + } 2.115 + 2.116 + //========================================================================== 2.117 + 2.118 +/*In the "_Flat" version of constructor, do only malloc of the top data struc 2.119 + * and set values in that top-level. Don't malloc any sub-structures. 2.120 + */ 2.121 + Matrix * 2.122 +makeMatrix_Flat( int32 numRows, int32 numCols ) 2.123 + { Matrix * retMatrix; 2.124 + retMatrix = malloc( sizeof( Matrix ) ); 2.125 + retMatrix->numRows = numRows; 2.126 + retMatrix->numCols = numCols; 2.127 + 2.128 + return retMatrix; 2.129 + } 2.130 + 2.131 + Matrix * 2.132 +makeMatrix_WithResMat( int32 numRows, int32 numCols ) 2.133 + { Matrix * retMatrix; 2.134 + retMatrix = malloc( sizeof( Matrix ) ); 2.135 + retMatrix->numRows = numRows; 2.136 + retMatrix->numCols = numCols; 2.137 + retMatrix->array = malloc( numRows * numCols * sizeof(float32) ); 2.138 + 2.139 + return retMatrix; 2.140 + } 2.141 + 2.142 + void 2.143 +freeMatrix_Flat( Matrix * matrix ) 2.144 + { //( matrix ); 2.145 + } 2.146 + void 2.147 +freeMatrix( Matrix * matrix ) 2.148 + { free( matrix->array ); 2.149 + free( matrix ); 2.150 + } 2.151 + 2.152 +void 2.153 +printMatrix( Matrix *matrix ) 2.154 + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; 2.155 + float32 *matrixArray; 2.156 + 2.157 + numRows = rowsToPrint = matrix->numRows; 2.158 + numCols = colsToPrint = matrix->numCols; 2.159 + matrixArray = matrix->array; 2.160 + 2.161 + rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed 2.162 + colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed 2.163 + for( r = 0; r < numRows; r += rowIncr ) 2.164 + { for( c = 0; c < numCols; c += colIncr ) 2.165 + { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); 2.166 + } 2.167 + printf("\n"); 2.168 + } 2.169 + } 2.170 +
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 3.2 +++ b/Matrix_Mult.h Tue Feb 07 14:07:38 2012 -0800 3.3 @@ -0,0 +1,77 @@ 3.4 +/* 3.5 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 3.6 + * Licensed under GNU General Public License version 2 3.7 + */ 3.8 + 3.9 +#ifndef MATRIX_MULT_H_ 3.10 +#define MATRIX_MULT_H_ 3.11 + 3.12 +#include <stdio.h> 3.13 +#include <unistd.h> 3.14 +#include <malloc.h> 3.15 + 3.16 +#include "../SSR_lib/VMS/VMS_primitive_data_types.h" 3.17 +#include "ParamHelper/Param.h" 3.18 + 3.19 +//============================== Structures ============================== 3.20 + 3.21 +typedef 3.22 +struct 3.23 + { int32 numRows; 3.24 + int32 numCols; 3.25 + float32 *array; //2D, but dynamically sized, so use addr arith 3.26 + } 3.27 +Matrix; 3.28 + 3.29 +/* This is the "appSpecificPiece" that is carried inside a DKUPiece. 3.30 + * In the DKUPiece data struc it is declared to be of type "void *". This 3.31 + * allows the application to define any data structure it wants and put it 3.32 + * into a DKUPiece. 3.33 + * When the app specific info is used, it is in app code, so it is cast to 3.34 + * the correct type to tell the compiler how to access fields. 3.35 + * This keeps all app-specific things out of the DKU directory, as per the 3.36 + * DKU standard. */ 3.37 +typedef 3.38 +struct 3.39 + { 3.40 + // pointers to shared data.. the result matrix must be created when the 3.41 + // left and right matrices are put into the root ancestor DKUPiece. 3.42 + Matrix * leftMatrix; 3.43 + Matrix * rightMatrix; 3.44 + Matrix * resultMatrix; 3.45 + 3.46 + // define the starting and ending boundaries for this piece of the 3.47 + // result matrix. These are derivable from the left and right 3.48 + // matrices, but included them for readability of code. 3.49 + int prodStartRow, prodEndRow; 3.50 + int prodStartCol, prodEndCol; 3.51 + // Start and end of the portion of the left matrix that contributes to 3.52 + // this piece of the product 3.53 + int leftStartRow, leftEndRow; 3.54 + int leftStartCol, leftEndCol; 3.55 + // Start and end of the portion of the right matrix that contributes to 3.56 + // this piece of the product 3.57 + int rightStartRow, rightEndRow; 3.58 + int rightStartCol, rightEndCol; 3.59 + } 3.60 +MatrixProdPiece; 3.61 + 3.62 +//============================== Functions ================================ 3.63 +void readFile(); 3.64 + 3.65 +Matrix *makeMatrix( int32 numRows, int32 numCols ); 3.66 +Matrix *makeMatrix_Flat( int32 numRows, int32 numCols ); 3.67 +Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols ); 3.68 +void freeMatrix_Flat( Matrix * matrix ); 3.69 +void freeMatrix( Matrix * matrix ); 3.70 +void printMatrix( Matrix *matrix ); 3.71 + 3.72 +void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ); 3.73 + 3.74 +void 3.75 +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 3.76 + ParamBag *paramBag ); 3.77 + 3.78 +//=========================================================================== 3.79 + 3.80 +#endif /*MATRIX_MULT_H_*/
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 4.2 +++ b/SSR_Matrix_Mult/Divide_Pr.c Tue Feb 07 14:07:38 2012 -0800 4.3 @@ -0,0 +1,603 @@ 4.4 +/* 4.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 4.6 + * Licensed under GNU General Public License version 2 4.7 + * 4.8 + * Author: seanhalle@yahoo.com 4.9 + * 4.10 + */ 4.11 + 4.12 + 4.13 +#include "SSR_Matrix_Mult.h" 4.14 +#include <math.h> 4.15 +#include <string.h> 4.16 + 4.17 + //The time to compute this many result values should equal the time to 4.18 + // perform this division on a matrix of size gives that many result calcs 4.19 + //IE, size this so that sequential time to calc equals divide time 4.20 + // find the value by experimenting -- but divide time and calc time scale 4.21 + // same way, so this value should remain valid across hardware 4.22 +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000 4.23 + 4.24 + 4.25 +//=========================================================================== 4.26 +int inline 4.27 +measureMatrixMultPrimitive( VirtProcr *animPr ); 4.28 + 4.29 +SlicingStrucCarrier * 4.30 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 4.31 + VirtProcr *animPr ); 4.32 + 4.33 +SlicingStruc * 4.34 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 4.35 + VirtProcr *animPr ); 4.36 + 4.37 +void 4.38 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ); 4.39 + 4.40 +SubMatrix ** 4.41 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 4.42 + int32 numUses, Matrix *origMatrix, VirtProcr *animPr ); 4.43 + 4.44 +void 4.45 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 4.46 + SubMatrix **subMatrices, VirtProcr *animPr ); 4.47 + 4.48 +void 4.49 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, 4.50 + SubMatrix **rightSubMatrices, 4.51 + int32 numRowIdxs, int32 numColIdxs, 4.52 + int32 numVecIdxs, 4.53 + VirtProcr *resultPr, 4.54 + VirtProcr *animatingPr ); 4.55 + 4.56 +void 4.57 +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, 4.58 + SlicingStrucCarrier *slicingStrucCarrier, 4.59 + VirtProcr *resultPr, VirtProcr *animatingPr ); 4.60 + 4.61 + 4.62 + 4.63 +/*Divider creates one processor for every sub-matrix 4.64 + * It hands them: 4.65 + * the name of the result processor that they should send their results to, 4.66 + * the left and right matrices, and the rows and cols they should multiply 4.67 + * It first creates the result processor, then all the sub-matrixPair 4.68 + * processors, 4.69 + * then does a receive of a message from the result processor that gives 4.70 + * the divider ownership of the result matrix. 4.71 + * Finally, the divider returns the result matrix out of the SSR system. 4.72 + * 4.73 + * Divider chooses the size of sub-matrices via an algorithm that tries to 4.74 + * keep the minimum work above a threshold. The threshold is machine- 4.75 + * dependent, so ask SSR for min work-unit time to get a 4.76 + * given overhead 4.77 + * 4.78 + * Divide min work-unit cycles by measured-cycles for one matrix-cell 4.79 + * product -- gives the number of products need to have in min size 4.80 + * matrix. 4.81 + * 4.82 + * So then, take cubed root of this to get the size of a side of min sub- 4.83 + * matrix. That is the size of the ideal square sub-matrix -- so tile 4.84 + * up the two input matrices into ones as close as possible to that size, 4.85 + * and create the pairs of sub-matrices. 4.86 + * 4.87 + *======================== STRATEGIC OVERVIEW ======================= 4.88 + * 4.89 + *This division is a bit tricky, because have to create things in advance 4.90 + * that it's not at first obvious need to be created.. 4.91 + * 4.92 + *First slice up each dimension -- three of them.. this is because will have 4.93 + * to create the sub-matrix's data-structures before pairing the sub-matrices 4.94 + * with each other -- so, have three dimensions to slice up before can 4.95 + * create the sub-matrix data-strucs -- also, have to be certain that the 4.96 + * cols of the left input have the exact same slicing as the rows of the 4.97 + * left matrix, so just to be sure, do the slicing calc once, then use it 4.98 + * for both. 4.99 + * 4.100 + *So, goes like this: 4.101 + *1) calculate the start & end values of each dimension in each matrix. 4.102 + *2) use those values to create sub-matrix structures 4.103 + *3) combine sub-matrices into pairs, as the tasks to perform. 4.104 + * 4.105 + *Have to calculate separately from creating the sub-matrices because of the 4.106 + * nature of the nesting -- would either end up creating the same sub-matrix 4.107 + * multiple times, or else would have to put in detection of whether had 4.108 + * made a particular one already if tried to combine steps 1 and 2. 4.109 + * 4.110 + *Step 3 has to be separate because of the nesting, as well -- same reason, 4.111 + * would either create same sub-matrix multiple times, or else have to 4.112 + * add detection of whether was already created. 4.113 + * 4.114 + *Another way to look at it: there's one level of loop to divide dimensions, 4.115 + * two levels of nesting to create sub-matrices, and three levels to pair 4.116 + * up the sub-matrices. 4.117 + */ 4.118 + 4.119 +void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, 4.120 + VirtProcr *animPr ) 4.121 + { VirtProcr *resultPr; 4.122 + DividerParams *dividerParams; 4.123 + ResultsParams *resultsParams; 4.124 + Matrix *leftMatrix, *rightMatrix, *resultMatrix; 4.125 + void *msg; 4.126 + SlicingStrucCarrier *slicingStrucCarrier; 4.127 + float32 *resultArray; //points to array inside result matrix 4.128 + 4.129 + DEBUG( dbgAppFlow, "start divide\n") 4.130 + 4.131 + int32 4.132 + divideProbe = VMS__create_single_interval_probe( "divideProbe", 4.133 + animPr ); 4.134 + VMS__record_sched_choice_into_probe( divideProbe, animPr ); 4.135 + VMS__record_interval_start_in_probe( divideProbe ); 4.136 + 4.137 + //=========== Setup -- make local copies of ptd-to-things, malloc, aso 4.138 + int32 numResRows, numResCols, vectLength; 4.139 + 4.140 + dividerParams = (DividerParams *)_dividerParams; 4.141 + 4.142 + leftMatrix = dividerParams->leftMatrix; 4.143 + rightMatrix = dividerParams->rightMatrix; 4.144 + 4.145 + vectLength = leftMatrix->numCols; 4.146 + numResRows = leftMatrix->numRows; 4.147 + numResCols = rightMatrix->numCols; 4.148 + resultArray = dividerParams->resultMatrix->array; 4.149 + 4.150 + //zero the result array 4.151 + memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); 4.152 + 4.153 + //============== Do either sequential mult or do division ============== 4.154 + 4.155 + //Check if input matrices too small -- if yes, just do sequential 4.156 + //Cutoff is determined by overhead of this divider -- relatively 4.157 + // machine-independent 4.158 + if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * 4.159 + (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) 4.160 + { 4.161 + //====== Do sequential multiply on a single core 4.162 + DEBUG( dbgAppFlow, "doing sequential") 4.163 + 4.164 + //transpose the right matrix 4.165 + float32 * 4.166 + transRightArray = SSR__malloc_to( rightMatrix->numRows * 4.167 + rightMatrix->numCols * sizeof(float32), 4.168 + animPr ); 4.169 + 4.170 + //copy values from orig matrix to local 4.171 + copyTranspose( rightMatrix->numRows, rightMatrix->numCols, 4.172 + 0, 0, rightMatrix->numRows, 4.173 + transRightArray, rightMatrix->array ); 4.174 + 4.175 + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 4.176 + leftMatrix->array, transRightArray, 4.177 + resultArray ); 4.178 + } 4.179 + else 4.180 + { 4.181 + //====== Do parallel multiply across cores 4.182 + 4.183 + //Calc the ideal size of sub-matrix and slice up the dimensions of 4.184 + // the two matrices. 4.185 + //The ideal size is the one takes the number of cycles to calculate 4.186 + // such that calc time is equal or greater than min work-unit size 4.187 + slicingStrucCarrier = 4.188 + calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr ); 4.189 + 4.190 + //Make the results processor, now that know how many to wait for 4.191 + resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr ); 4.192 + resultsParams->numSubMatrixPairs = 4.193 + slicingStrucCarrier->leftRowSlices->numVals * 4.194 + slicingStrucCarrier->rightColSlices->numVals * 4.195 + slicingStrucCarrier->vecSlices->numVals; 4.196 + resultsParams->dividerPr = animPr; 4.197 + resultsParams->numCols = rightMatrix->numCols; 4.198 + resultsParams->numRows = leftMatrix->numRows; 4.199 + resultsParams->resultArray = resultArray; 4.200 + 4.201 + 4.202 + resultPr = 4.203 + SSR__create_procr_with( &gatherResults, resultsParams, animPr); 4.204 + 4.205 + //Make the sub-matrices, and pair them up, and make processor to 4.206 + // calc product of each pair. 4.207 + makeSubMatricesAndProcrs( leftMatrix, rightMatrix, 4.208 + slicingStrucCarrier, 4.209 + resultPr, animPr); 4.210 + 4.211 + //result array is allocated externally, so no message from resultPr 4.212 + // however, do have to wait before printing out stats, so wait 4.213 + // for an empty handshake message 4.214 + msg = SSR__receive_from_to( resultPr, animPr ); 4.215 + } 4.216 + 4.217 + 4.218 + //=============== Work done -- send results back ================= 4.219 + 4.220 + 4.221 + DEBUG( dbgAppFlow, "end divide\n") 4.222 + 4.223 + VMS__record_interval_end_in_probe( divideProbe ); 4.224 + VMS__print_stats_of_all_probes(); 4.225 + 4.226 + //nothing left to do so dissipate, SSR will wait to shutdown and hence 4.227 + // make results available to outside until all the processors have 4.228 + // dissipated -- so no need to wait for results processor 4.229 + 4.230 + SSR__dissipate_procr( animPr ); //all procrs dissipate self at end 4.231 + //when all of the processors have dissipated, the "create seed and do 4.232 + // work" call in the entry point function returns 4.233 + } 4.234 + 4.235 + 4.236 +SlicingStrucCarrier * 4.237 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 4.238 + VirtProcr *animPr ) 4.239 + { 4.240 + float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; 4.241 + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 4.242 + SlicingStrucCarrier *slicingStrucCarrier = 4.243 + SSR__malloc_to(sizeof(SlicingStrucCarrier), animPr); 4.244 + 4.245 + int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; 4.246 + float64 numPrimitiveOpsInMinWorkUnit; 4.247 + 4.248 + 4.249 + //======= Calc ideal size of min-sized sub-matrix ======== 4.250 + 4.251 + //ask SSR for the number of cycles of the minimum work unit, at given 4.252 + // percent overhead then add a guess at overhead from this divider 4.253 + minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 ); 4.254 + 4.255 + //ask SSR for number of cycles of the "primitive" op of matrix mult 4.256 + primitiveCycles = measureMatrixMultPrimitive( animPr ); 4.257 + 4.258 + numPrimitiveOpsInMinWorkUnit = 4.259 + (float64)minWorkUnitCycles / (float64)primitiveCycles; 4.260 + 4.261 + //take cubed root -- that's number of these in a "side" of sub-matrix 4.262 + // then multiply by 5 because the primitive is 5x5 4.263 + idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); 4.264 + 4.265 + idealNumWorkUnits = SSR__giveIdealNumWorkUnits(); 4.266 + 4.267 + idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); 4.268 + idealSizeOfSide2 *= 0.6; //finer granularity to help load balance 4.269 + 4.270 + if( idealSizeOfSide1 > idealSizeOfSide2 ) 4.271 + idealSizeOfSide = idealSizeOfSide1; 4.272 + else 4.273 + idealSizeOfSide = idealSizeOfSide2; 4.274 + 4.275 + //The multiply inner loop blocks the array to fit into L1 cache 4.276 +// if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK; 4.277 + 4.278 + //============ Slice up dimensions, now that know target size =========== 4.279 + 4.280 + //Tell the slicer the target size of a side (floating pt), the start 4.281 + // value to start slicing at, and the end value to stop slicing at 4.282 + //It returns an array of start value of each chunk, plus number of them 4.283 + int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol; 4.284 + startLeftRow = 0; 4.285 + endLeftRow = leftMatrix->numRows -1; 4.286 + startVec = 0; 4.287 + endVec = leftMatrix->numCols -1; 4.288 + startRightCol = 0; 4.289 + endRightCol = rightMatrix->numCols -1; 4.290 + 4.291 + leftRowSlices = 4.292 + sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr ); 4.293 + 4.294 + vecSlices = 4.295 + sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr ); 4.296 + 4.297 + rightColSlices = 4.298 + sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr); 4.299 + 4.300 + slicingStrucCarrier->leftRowSlices = leftRowSlices; 4.301 + slicingStrucCarrier->vecSlices = vecSlices; 4.302 + slicingStrucCarrier->rightColSlices = rightColSlices; 4.303 + 4.304 + return slicingStrucCarrier; 4.305 + } 4.306 + 4.307 + 4.308 +void 4.309 +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, 4.310 + SlicingStrucCarrier *slicingStrucCarrier, 4.311 + VirtProcr *resultPr, VirtProcr *animPr ) 4.312 + { 4.313 + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 4.314 + 4.315 + leftRowSlices = slicingStrucCarrier->leftRowSlices; 4.316 + vecSlices = slicingStrucCarrier->vecSlices; 4.317 + rightColSlices = slicingStrucCarrier->rightColSlices; 4.318 + SSR__free( slicingStrucCarrier, animPr ); 4.319 + 4.320 + //================ Make sub-matrices, given the slicing ================ 4.321 + SubMatrix **leftSubMatrices, **rightSubMatrices; 4.322 + leftSubMatrices = 4.323 + createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals, 4.324 + leftMatrix, animPr ); 4.325 + //double_check_that_always_numRows_in_right_same_as_numCols_in_left(); 4.326 + rightSubMatrices = 4.327 + createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals, 4.328 + rightMatrix, animPr ); 4.329 + 4.330 + 4.331 + //============== pair the sub-matrices and make processors ============== 4.332 + int32 numRowIdxs, numColIdxs, numVecIdxs; 4.333 + 4.334 + numRowIdxs = leftRowSlices->numVals; 4.335 + numColIdxs = rightColSlices->numVals; 4.336 + numVecIdxs = vecSlices->numVals; 4.337 + 4.338 + 4.339 + freeSlicingStruc( leftRowSlices, animPr ); 4.340 + freeSlicingStruc( vecSlices, animPr ); 4.341 + freeSlicingStruc( rightColSlices, animPr ); 4.342 + 4.343 + pairUpSubMatricesAndMakeProcessors( leftSubMatrices, 4.344 + rightSubMatrices, 4.345 + numRowIdxs, numColIdxs, 4.346 + numVecIdxs, 4.347 + resultPr, 4.348 + animPr ); 4.349 + } 4.350 + 4.351 + 4.352 + 4.353 + 4.354 +void 4.355 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, 4.356 + SubMatrix **rightSubMatrices, 4.357 + int32 numRowIdxs, int32 numColIdxs, 4.358 + int32 numVecIdxs, 4.359 + VirtProcr *resultPr, 4.360 + VirtProcr *animatingPr ) 4.361 + { 4.362 + int32 resRowIdx, resColIdx, vecIdx; 4.363 + int32 numLeftColIdxs, numRightColIdxs; 4.364 + int32 leftRowIdxOffset; 4.365 + SMPairParams *subMatrixPairParams; 4.366 + float32 numToPutOntoEachCore, leftOverFraction; 4.367 + int32 numCores, coreToScheduleOnto, numVecOnCurrCore; 4.368 + 4.369 + numLeftColIdxs = numColIdxs; 4.370 + numRightColIdxs = numVecIdxs; 4.371 + 4.372 + numCores = SSR__give_number_of_cores_to_schedule_onto(); 4.373 + 4.374 + numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; 4.375 + leftOverFraction = 0; 4.376 + numVecOnCurrCore = 0; 4.377 + coreToScheduleOnto = 0; 4.378 + 4.379 + for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) 4.380 + { 4.381 + leftRowIdxOffset = resRowIdx * numLeftColIdxs; 4.382 + 4.383 + for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) 4.384 + { 4.385 + 4.386 + for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) 4.387 + { 4.388 + //Make the processor for the pair of sub-matrices 4.389 + subMatrixPairParams = SSR__malloc_to( sizeof(SMPairParams), 4.390 + animatingPr); 4.391 + subMatrixPairParams->leftSubMatrix = 4.392 + leftSubMatrices[ leftRowIdxOffset + vecIdx ]; 4.393 + 4.394 + subMatrixPairParams->rightSubMatrix = 4.395 + rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; 4.396 + 4.397 + subMatrixPairParams->resultPr = resultPr; 4.398 + 4.399 + //put all pairs from the same vector onto same core 4.400 + SSR__create_procr_with_affinity( &calcSubMatrixProduct, 4.401 + subMatrixPairParams, 4.402 + animatingPr, 4.403 + coreToScheduleOnto ); 4.404 + } 4.405 + 4.406 + //Trying to distribute the subMatrix-vectors across the cores, so 4.407 + // that each core gets the same number of vectors, with a max 4.408 + // imbalance of 1 vector more on some cores than others 4.409 + numVecOnCurrCore += 1; 4.410 + if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 ) 4.411 + { 4.412 + //deal with fractional part, to ensure that imbalance is 1 max 4.413 + // IE, core with most has only 1 more than core with least 4.414 + leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore; 4.415 + if( leftOverFraction >= 1 ) 4.416 + { leftOverFraction -= 1; 4.417 + numVecOnCurrCore = -1; 4.418 + } 4.419 + else 4.420 + { numVecOnCurrCore = 0; 4.421 + } 4.422 + //Move to next core, max core-value to incr to is numCores -1 4.423 + if( coreToScheduleOnto >= numCores -1 ) 4.424 + { coreToScheduleOnto = 0; 4.425 + } 4.426 + else 4.427 + { coreToScheduleOnto += 1; 4.428 + } 4.429 + } 4.430 + 4.431 + } 4.432 + } 4.433 + 4.434 + } 4.435 + 4.436 + 4.437 + 4.438 +/*Walk through the two slice-strucs, making sub-matrix strucs as go 4.439 + */ 4.440 +SubMatrix ** 4.441 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 4.442 + int32 numUses, Matrix *origMatrix, VirtProcr *animPr ) 4.443 + { 4.444 + int32 numRowIdxs, numColIdxs, rowIdx, colIdx; 4.445 + int32 startRow, endRow, startCol, endCol; 4.446 + int32 *rowStartVals, *colStartVals; 4.447 + int32 rowOffset; 4.448 + SubMatrix **subMatrices, *newSubMatrix; 4.449 + 4.450 + numRowIdxs = rowSlices->numVals; 4.451 + numColIdxs = colSlices->numVals; 4.452 + 4.453 + rowStartVals = rowSlices->startVals; 4.454 + colStartVals = colSlices->startVals; 4.455 + 4.456 + subMatrices = SSR__malloc_to(numRowIdxs * numColIdxs * sizeof(SubMatrix*), 4.457 + animPr ); 4.458 + 4.459 + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 4.460 + { 4.461 + rowOffset = rowIdx * numColIdxs; 4.462 + 4.463 + startRow = rowStartVals[rowIdx]; 4.464 + endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is 4.465 + // at last valid idx + 1 & is 4.466 + // 1 greater than end value 4.467 + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 4.468 + { 4.469 + startCol = colStartVals[colIdx]; 4.470 + endCol = colStartVals[colIdx + 1] -1; 4.471 + 4.472 + newSubMatrix = SSR__malloc_to( sizeof(SubMatrix), animPr ); 4.473 + newSubMatrix->numRows = endRow - startRow +1; 4.474 + newSubMatrix->numCols = endCol - startCol +1; 4.475 + newSubMatrix->origMatrix = origMatrix; 4.476 + newSubMatrix->origStartRow = startRow; 4.477 + newSubMatrix->origStartCol = startCol; 4.478 + newSubMatrix->copySingleton = NULL; 4.479 + newSubMatrix->numUsesLeft = numUses; //can free after this many 4.480 + //Prevent uninitialized memory 4.481 + newSubMatrix->copySingleton = NULL; 4.482 + newSubMatrix->copyTransSingleton = NULL; 4.483 + 4.484 + subMatrices[ rowOffset + colIdx ] = newSubMatrix; 4.485 + } 4.486 + } 4.487 + return subMatrices; 4.488 + } 4.489 + 4.490 + 4.491 +void 4.492 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 4.493 + SubMatrix **subMatrices, VirtProcr *animPr ) 4.494 + { 4.495 + int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; 4.496 + SubMatrix *subMatrix; 4.497 + 4.498 + numRowIdxs = rowSlices->numVals; 4.499 + numColIdxs = colSlices->numVals; 4.500 + 4.501 + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 4.502 + { 4.503 + rowOffset = rowIdx * numColIdxs; 4.504 + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 4.505 + { 4.506 + subMatrix = subMatrices[ rowOffset + colIdx ]; 4.507 + if( subMatrix->alreadyCopied ) 4.508 + SSR__free( subMatrix->array, animPr ); 4.509 + SSR__free( subMatrix, animPr ); 4.510 + } 4.511 + } 4.512 + SSR__free( subMatrices, animPr ); 4.513 + } 4.514 + 4.515 + 4.516 + 4.517 +SlicingStruc * 4.518 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 4.519 + VirtProcr *animPr ) 4.520 + { float32 residualAcc = 0; 4.521 + int numSlices, i, *startVals, sizeOfSlice, endCondition; 4.522 + SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr); 4.523 + 4.524 + //calc size of matrix need to hold start vals -- 4.525 + numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); 4.526 + 4.527 + startVals = SSR__malloc_to( (numSlices + 1) * sizeof(int32), animPr ); 4.528 + 4.529 + //Calc the upper limit of start value -- when get above this, end loop 4.530 + // by saving highest value of the matrix dimension to access, plus 1 4.531 + // as the start point of the imaginary slice following the last one 4.532 + //Plus 1 because go up to value but not include when process last slice 4.533 + //The stopping condition is half-a-size less than highest value because 4.534 + // don't want any pieces smaller than half the ideal size -- just tack 4.535 + // little ones onto end of last one 4.536 + endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size 4.537 + for( i = 0; startVal <= endVal; i++ ) 4.538 + { 4.539 + startVals[i] = startVal; 4.540 + residualAcc += idealSizeOfSide; 4.541 + sizeOfSlice = (int)residualAcc; 4.542 + residualAcc -= (float32)sizeOfSlice; 4.543 + startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. 4.544 + 4.545 + if( startVal > endCondition ) 4.546 + { startVal = endVal + 1; 4.547 + startVals[ i + 1 ] = startVal; 4.548 + } 4.549 + } 4.550 + 4.551 + slicingStruc->startVals = startVals; 4.552 + slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 4.553 + // which means is num sub-matrices in dim 4.554 + // also == idx of the fake start just above 4.555 + return slicingStruc; 4.556 + } 4.557 + 4.558 +void 4.559 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ) 4.560 + { 4.561 + SSR__free( slicingStruc->startVals, animPr ); 4.562 + SSR__free( slicingStruc, animPr ); 4.563 + } 4.564 + 4.565 + 4.566 +int inline 4.567 +measureMatrixMultPrimitive( VirtProcr *animPr ) 4.568 + { 4.569 + int r, c, v, numCycles; 4.570 + float32 *res, *left, *right; 4.571 + 4.572 + //setup inputs 4.573 + left = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); 4.574 + right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); 4.575 + res = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); 4.576 + 4.577 + for( r = 0; r < 5; r++ ) 4.578 + { 4.579 + for( c = 0; c < 5; c++ ) 4.580 + { 4.581 + left[ r * 5 + c ] = r; 4.582 + right[ r * 5 + c ] = c; 4.583 + } 4.584 + } 4.585 + 4.586 + //do primitive 4.587 + SSR__start_primitive(); //for now, just takes time stamp 4.588 + for( r = 0; r < 5; r++ ) 4.589 + { 4.590 + for( c = 0; c < 5; c++ ) 4.591 + { 4.592 + for( v = 0; v < 5; v++ ) 4.593 + { 4.594 + res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; 4.595 + } 4.596 + } 4.597 + } 4.598 + numCycles = 4.599 + SSR__end_primitive_and_give_cycles(); 4.600 + 4.601 + SSR__free( left, animPr ); 4.602 + SSR__free( right, animPr ); 4.603 + SSR__free( res, animPr ); 4.604 + 4.605 + return numCycles; 4.606 + }
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 5.2 +++ b/SSR_Matrix_Mult/EntryPoint.c Tue Feb 07 14:07:38 2012 -0800 5.3 @@ -0,0 +1,62 @@ 5.4 +/* 5.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 5.6 + * Licensed under GNU General Public License version 2 5.7 + * 5.8 + * Author: seanhalle@yahoo.com 5.9 + * 5.10 + */ 5.11 + 5.12 +#include <math.h> 5.13 + 5.14 +#include "SSR_Matrix_Mult.h" 5.15 + 5.16 + 5.17 + 5.18 +/*Every SSR system has an "entry point" function that creates the first 5.19 + * processor, which starts the chain of creating more processors.. 5.20 + * eventually all of the processors will dissipate themselves, and 5.21 + * return. 5.22 + * 5.23 + *This entry-point function follows the same pattern as all entry-point 5.24 + * functions do: 5.25 + *1) it creates the params for the seed processor, from the 5.26 + * parameters passed into the entry-point function 5.27 + *2) it calls SSR__create_seed_procr_and_do_work 5.28 + *3) it gets the return value from the params struc, frees the params struc, 5.29 + * and returns the value from the function 5.30 + * 5.31 + */ 5.32 +Matrix * 5.33 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) 5.34 + { Matrix *resMatrix; 5.35 + DividerParams *dividerParams; 5.36 + int32 numResRows, numResCols; 5.37 + 5.38 + 5.39 + dividerParams = malloc( sizeof( DividerParams ) ); 5.40 + dividerParams->leftMatrix = leftMatrix; 5.41 + dividerParams->rightMatrix = rightMatrix; 5.42 + 5.43 + 5.44 + numResRows = leftMatrix->numRows; 5.45 + numResCols = rightMatrix->numCols; 5.46 + 5.47 + //VMS has its own separate internal malloc, so to get results out, 5.48 + // have to pass in empty array for it to fill up 5.49 + //The alternative is internally telling SSR make external space to use 5.50 + resMatrix = malloc( sizeof(Matrix) ); 5.51 + resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); 5.52 + resMatrix->numCols = rightMatrix->numCols; 5.53 + resMatrix->numRows = leftMatrix->numRows; 5.54 + 5.55 + 5.56 + dividerParams->resultMatrix = resMatrix; 5.57 + 5.58 + //create divider processor, start doing the work, and wait till done 5.59 + //This function is the "border crossing" between normal code and SSR 5.60 + SSR__create_seed_procr_and_do_work( ÷WorkIntoSubMatrixPairProcrs, 5.61 + dividerParams ); 5.62 + 5.63 + free( dividerParams ); 5.64 + return resMatrix; 5.65 + }
6.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 6.2 +++ b/SSR_Matrix_Mult/Result_Pr.c Tue Feb 07 14:07:38 2012 -0800 6.3 @@ -0,0 +1,108 @@ 6.4 +/* 6.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 6.6 + * Licensed under GNU General Public License version 2 6.7 + * 6.8 + * Author: seanhalle@yahoo.com 6.9 + * 6.10 + */ 6.11 + 6.12 +#include "SSR_Matrix_Mult.h" 6.13 + 6.14 +//===================== 6.15 +void inline 6.16 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, 6.17 + int32 startRow, 6.18 + int32 numRows, 6.19 + int32 startCol, 6.20 + int32 numCols, 6.21 + int32 numOrigCols ); 6.22 + 6.23 +//=========================================================================== 6.24 + 6.25 +/*The Result Processor gets a message from each of the vector processors, 6.26 + * puts the result from the message in its location in the result- 6.27 + * matrix, and increments the count of results. 6.28 + * 6.29 + *After the count reaches the point that all results have been received, it 6.30 + * returns the result matrix and dissipates. 6.31 + */ 6.32 +void gatherResults( void *_params, VirtProcr *animatingPr ) 6.33 + { VirtProcr *dividerPr; 6.34 + ResultsParams *params; 6.35 + int row, col, numRows, numCols, numSubMatrixPairs, count=0; 6.36 + float32 *resultArray; 6.37 + void *msg; 6.38 + SMPairParams *resParams; 6.39 + 6.40 + DEBUG( dbgAppFlow, "start resultPr\n") 6.41 + 6.42 + params = (ResultsParams *)_params; 6.43 + dividerPr = params->dividerPr; 6.44 + numSubMatrixPairs = params->numSubMatrixPairs; 6.45 + numRows = params->numRows; 6.46 + numCols = params->numCols; 6.47 + 6.48 + resultArray = params->resultArray; 6.49 + 6.50 + 6.51 + while( count < numSubMatrixPairs ) 6.52 + { 6.53 + msg = SSR__receive_type_to( RESULTS_MSG, animatingPr ); 6.54 + 6.55 + resParams = (SMPairParams *)msg; 6.56 + accumulateResult( resultArray, resParams->partialResultArray, 6.57 + resParams->leftSubMatrix->origStartRow, 6.58 + resParams->leftSubMatrix->numRows, 6.59 + resParams->rightSubMatrix->origStartCol, 6.60 + resParams->rightSubMatrix->numCols, 6.61 + resParams->rightSubMatrix->origMatrix->numCols ); 6.62 + 6.63 + SSR__free( resParams->partialResultArray, animatingPr ); 6.64 + 6.65 + //there is only one copy of results procr, so can update numUsesLeft 6.66 + // without concurrency worries. When zero, free the sub-matrix 6.67 + resParams->leftSubMatrix->numUsesLeft -= 1; 6.68 + if( resParams->leftSubMatrix->numUsesLeft == 0 ) 6.69 + { 6.70 + SSR__free( resParams->leftSubMatrix->array, animatingPr ); 6.71 + SSR__free( resParams->leftSubMatrix, animatingPr ); 6.72 + } 6.73 + 6.74 + resParams->rightSubMatrix->numUsesLeft -= 1; 6.75 + if( resParams->rightSubMatrix->numUsesLeft == 0 ) 6.76 + { 6.77 + SSR__free( resParams->rightSubMatrix->array, animatingPr ); 6.78 + SSR__free( resParams->rightSubMatrix, animatingPr ); 6.79 + } 6.80 + 6.81 + //count of how many sub-matrix pairs accumulated so know when done 6.82 + count++; 6.83 + } 6.84 + 6.85 + //Done -- could just dissipate -- SSR will wait for all processors to 6.86 + // dissipate before shutting down, and thereby making results avaial to 6.87 + // outside, so no need to stop the divider from dissipating, so no need 6.88 + // to send a hand-shake message to it -- bug makes debug easier 6.89 + SSR__send_from_to( NULL, animatingPr, dividerPr ); 6.90 + SSR__dissipate_procr( animatingPr ); //frees any data owned by procr 6.91 + } 6.92 + 6.93 +void inline 6.94 +accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray, 6.95 + int32 startRow, 6.96 + int32 numRows, 6.97 + int32 startCol, 6.98 + int32 numCols, 6.99 + int32 numOrigCols ) 6.100 + { int32 row, col; 6.101 + 6.102 + for( row = 0; row < numRows; row++ ) 6.103 + { 6.104 + for( col = 0; col < numCols; col++ ) 6.105 + { 6.106 + resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] += 6.107 + subMatrixPairResultArray[ row * numCols + col ]; 6.108 + } 6.109 + } 6.110 + 6.111 + }
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 7.2 +++ b/SSR_Matrix_Mult/SSR_Matrix_Mult.h Tue Feb 07 14:07:38 2012 -0800 7.3 @@ -0,0 +1,97 @@ 7.4 +/* 7.5 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 7.6 + * Licensed under GNU General Public License version 2 7.7 + */ 7.8 + 7.9 +#ifndef _SSR_MATRIX_MULT_H_ 7.10 +#define _SSR_MATRIX_MULT_H_ 7.11 + 7.12 +#include <stdio.h> 7.13 + 7.14 +#include "../../SSR_lib/SSR.h" 7.15 +#include "../Matrix_Mult.h" 7.16 + 7.17 + 7.18 +//=============================== Defines ============================== 7.19 +#define ROWS_IN_BLOCK 32 7.20 +#define COLS_IN_BLOCK 32 7.21 +#define VEC_IN_BLOCK 32 7.22 + 7.23 +#define copyMatrixSingleton 1 7.24 +#define copyTransposeSingleton 2 7.25 + 7.26 +//============================== Structures ============================== 7.27 +typedef struct 7.28 + { 7.29 + Matrix *leftMatrix; 7.30 + Matrix *rightMatrix; 7.31 + Matrix *resultMatrix; 7.32 + } 7.33 +DividerParams; 7.34 + 7.35 +typedef struct 7.36 + { 7.37 + VirtProcr *dividerPr; 7.38 + int numRows; 7.39 + int numCols; 7.40 + int numSubMatrixPairs; 7.41 + float32 *resultArray; 7.42 + } 7.43 +ResultsParams; 7.44 + 7.45 +typedef 7.46 +struct 7.47 + { int32 numRows; 7.48 + int32 numCols; 7.49 + Matrix *origMatrix; 7.50 + int32 origStartRow; 7.51 + int32 origStartCol; 7.52 + int32 alreadyCopied; 7.53 + int32 numUsesLeft; //have update via message to avoid multiple writers 7.54 + SSRSingleton *copySingleton; 7.55 + SSRSingleton *copyTransSingleton; 7.56 + float32 *array; //2D, but dynamically sized, so use addr arith 7.57 + } 7.58 +SubMatrix; 7.59 + 7.60 +typedef struct 7.61 + { VirtProcr *resultPr; 7.62 + SubMatrix *leftSubMatrix; 7.63 + SubMatrix *rightSubMatrix; 7.64 + float32 *partialResultArray; 7.65 + } 7.66 +SMPairParams; 7.67 + 7.68 +typedef 7.69 +struct 7.70 + { int32 numVals; 7.71 + int32 *startVals; 7.72 + } 7.73 +SlicingStruc; 7.74 + 7.75 +typedef 7.76 +struct 7.77 + { 7.78 + SlicingStruc *leftRowSlices; 7.79 + SlicingStruc *vecSlices; 7.80 + SlicingStruc *rightColSlices; 7.81 + } 7.82 +SlicingStrucCarrier; 7.83 + 7.84 +enum MMMsgType 7.85 + { 7.86 + RESULTS_MSG = 1 7.87 + }; 7.88 + 7.89 +//============================= Processor Functions ========================= 7.90 +void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr ); 7.91 +void calcSubMatrixProduct( void *data, VirtProcr *animatingPr ); 7.92 +void gatherResults( void *data, VirtProcr *animatingPr ); 7.93 + 7.94 + 7.95 +//================================ Entry Point ============================== 7.96 +Matrix * 7.97 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); 7.98 + 7.99 + 7.100 +#endif /*_SSR_MATRIX_MULT_H_*/
8.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 8.2 +++ b/SSR_Matrix_Mult/subMatrix_Pr.c Tue Feb 07 14:07:38 2012 -0800 8.3 @@ -0,0 +1,319 @@ 8.4 +/* 8.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 8.6 + * Licensed under GNU General Public License version 2 8.7 + * 8.8 + * Author: SeanHalle@yahoo.com 8.9 + * 8.10 + */ 8.11 + 8.12 +#include <string.h> 8.13 + 8.14 +#include "SSR_Matrix_Mult.h" 8.15 + 8.16 + 8.17 + 8.18 +void inline 8.19 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 8.20 + 8.21 +void inline 8.22 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 8.23 + 8.24 +void inline 8.25 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 8.26 + float32 *resArray, 8.27 + int startRow, int endRow, 8.28 + int startCol, int endCol, 8.29 + int startVec, int endVec, 8.30 + int resStride, int inpStride ); 8.31 + 8.32 +void inline 8.33 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, 8.34 + float32 *leftArray, float32 *rightArray, 8.35 + float32 *resArray ); 8.36 + 8.37 + 8.38 +/*A processor is created with an environment that holds two matrices, 8.39 + * the row and col that it owns, and the name of a result gathering 8.40 + * processor. 8.41 + *It calculates the product of two sub-portions of the input matrices 8.42 + * by using Intel's mkl library for single-core. 8.43 + * 8.44 + *This demonstrates using optimized single-threaded code inside scheduled 8.45 + * work-units. 8.46 + * 8.47 + *When done, it sends the result to the result processor 8.48 + */ 8.49 +void 8.50 +calcSubMatrixProduct( void *data, VirtProcr *animatingPr ) 8.51 + { 8.52 + SMPairParams *params; 8.53 + VirtProcr *resultPr; 8.54 + float32 *leftArray, *rightArray, *resArray; 8.55 + SubMatrix *leftSubMatrix, *rightSubMatrix; 8.56 + 8.57 + DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID) 8.58 + #ifdef TURN_ON_DEBUG_PROBES 8.59 + int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx", 8.60 + animatingPr); 8.61 + VMS__record_sched_choice_into_probe( subMatrixProbe, animatingPr ); 8.62 + VMS__record_interval_start_in_probe( subMatrixProbe ); 8.63 + #endif 8.64 + 8.65 + params = (SMPairParams *)data; 8.66 + resultPr = params->resultPr; 8.67 + leftSubMatrix = params->leftSubMatrix; 8.68 + rightSubMatrix = params->rightSubMatrix; 8.69 + 8.70 + //make sure the input sub-matrices have been copied out of orig 8.71 + //do it here, inside sub-matrix pair to hopefully gain reuse in cache 8.72 + copyFromOrig( leftSubMatrix, animatingPr ); 8.73 + copyTransposeFromOrig( rightSubMatrix, animatingPr ); 8.74 + 8.75 + leftArray = leftSubMatrix->array; 8.76 + rightArray = rightSubMatrix->array; 8.77 + 8.78 + int32 8.79 + resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); 8.80 + resArray = SSR__malloc_to( resSize, animatingPr ); 8.81 + memset( resArray, 0, resSize ); 8.82 + 8.83 + 8.84 + int32 numResRows, numResCols, vectLength; 8.85 + 8.86 + vectLength = leftSubMatrix->numCols; 8.87 + numResRows = leftSubMatrix->numRows; 8.88 + numResCols = rightSubMatrix->numCols; 8.89 + 8.90 + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 8.91 + leftArray, rightArray, 8.92 + resArray ); 8.93 + 8.94 + //send result to result processor 8.95 + params->partialResultArray = resArray; 8.96 + 8.97 + #ifdef TURN_ON_DEBUG_PROBES 8.98 + VMS__record_interval_end_in_probe( subMatrixProbe ); 8.99 + #endif 8.100 + 8.101 + SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr ); 8.102 + SSR__dissipate_procr( animatingPr ); 8.103 + } 8.104 + 8.105 + 8.106 + 8.107 +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into 8.108 + * the 32KB L1 cache. 8.109 + *Would be nice to embed this within another level that divided into 8.110 + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache 8.111 + * 8.112 + *Eventually want these divisions to be automatic, using DKU pattern 8.113 + * embedded into VMS and exposed in the language, and with VMS controlling the 8.114 + * divisions according to the cache sizes, which it knows about. 8.115 + *Also, want VMS to work with language to split among main-mems, so a socket 8.116 + * only cranks on data in its local segment of main mem 8.117 + * 8.118 + *So, outer two loops determine start and end points within the result matrix. 8.119 + * Inside that, a loop dets the start and end points along the shared dimensions 8.120 + * of the two input matrices. 8.121 + */ 8.122 +void inline 8.123 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, 8.124 + int32 numResCols, 8.125 + float32 *leftArray, float32 *rightArray, 8.126 + float32 *resArray ) 8.127 + { 8.128 + int resStride, inpStride; 8.129 + int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; 8.130 + 8.131 + resStride = numResCols; 8.132 + inpStride = vecLength; 8.133 + 8.134 + for( resStartRow = 0; resStartRow < numResRows; ) 8.135 + { 8.136 + resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 8.137 + if( resEndRow > numResRows ) resEndRow = numResRows -1; 8.138 + 8.139 + for( resStartCol = 0; resStartCol < numResCols; ) 8.140 + { 8.141 + resEndCol = resStartCol + COLS_IN_BLOCK -1; 8.142 + if( resEndCol > numResCols ) resEndCol = numResCols -1; 8.143 + 8.144 + for( startVec = 0; startVec < vecLength; ) 8.145 + { 8.146 + endVec = startVec + VEC_IN_BLOCK -1; 8.147 + if( endVec > vecLength ) endVec = vecLength -1; 8.148 + 8.149 + //By having the "vector" of sub-blocks in a sub-block slice 8.150 + // be marched down in inner loop, are re-using the result 8.151 + // matrix, which stays in L1 cache and re-using the left sub-mat 8.152 + // which repeats for each right sub-mat -- can only re-use two of 8.153 + // the three, so result is the most important -- avoids writing 8.154 + // dirty blocks until those result-locations fully done 8.155 + //Row and Col is position in result matrix -- so row and vec 8.156 + // for left array, then vec and col for right array 8.157 + multiplySubBlocksTransposed( leftArray, rightArray, 8.158 + resArray, 8.159 + resStartRow, resEndRow, 8.160 + resStartCol, resEndCol, 8.161 + startVec, endVec, 8.162 + resStride, inpStride ); 8.163 + startVec = endVec +1; 8.164 + } 8.165 + resStartCol = resEndCol +1; 8.166 + } 8.167 + resStartRow = resEndRow +1; 8.168 + } 8.169 + } 8.170 + 8.171 + 8.172 + 8.173 +void inline 8.174 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 8.175 + float32 *resArray, 8.176 + int resStartRow, int resEndRow, 8.177 + int resStartCol, int resEndCol, 8.178 + int startVec, int endVec, 8.179 + int resStride, int inpStride ) 8.180 + { 8.181 + int resRow, resCol, vec; 8.182 + int leftOffset, rightOffset; 8.183 + float32 result; 8.184 + 8.185 + //The result row is used only for the left matrix, res col for the right 8.186 + for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) 8.187 + { 8.188 + for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) 8.189 + { 8.190 + leftOffset = resRow * inpStride;//left & right inp strides always same 8.191 + rightOffset = resCol * inpStride;// because right is transposed 8.192 + result = 0; 8.193 + for( vec = startVec; vec <= endVec; vec++ ) 8.194 + { 8.195 + result += 8.196 + leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; 8.197 + } 8.198 + 8.199 + resArray[ resRow * resStride + resCol ] += result; 8.200 + } 8.201 + } 8.202 + } 8.203 + 8.204 + 8.205 + 8.206 + 8.207 +/*Reuse this in divider when do the sequential multiply case 8.208 + */ 8.209 +void inline 8.210 +copyTranspose( int32 numRows, int32 numCols, 8.211 + int32 origStartRow, int32 origStartCol, int32 origStride, 8.212 + float32 *subArray, float32 *origArray ) 8.213 + { int32 stride = numRows; 8.214 + 8.215 + int row, col, origOffset; 8.216 + for( row = 0; row < numRows; row++ ) 8.217 + { 8.218 + origOffset = (row + origStartRow) * origStride + origStartCol; 8.219 + for( col = 0; col < numCols; col++ ) 8.220 + { 8.221 + //transpose means swap row & col -- traverse orig matrix normally 8.222 + // but put into reversed place in local array -- means the 8.223 + // stride is the numRows now, so col * numRows + row 8.224 + subArray[ col * stride + row ] = origArray[ origOffset + col ]; 8.225 + } 8.226 + } 8.227 + } 8.228 + 8.229 +void inline 8.230 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 8.231 + { int numCols, numRows, origStartRow, origStartCol, origStride, stride; 8.232 + Matrix *origMatrix; 8.233 + float32 *origArray, *subArray; 8.234 + 8.235 +// if( subMatrix->copyTransSingleton && \ 8.236 +// subMatrix->copyTransSingleton->hasFinished ) \ 8.237 +// return; 8.238 + SSR__start_data_singleton( &(subMatrix->copyTransSingleton), animPr ); 8.239 + 8.240 + if( subMatrix->copyTransSingleton->hasFinished ) 8.241 + { 8.242 + printf("error!"); 8.243 + } 8.244 + 8.245 + origMatrix = subMatrix->origMatrix; 8.246 + origArray = origMatrix->array; 8.247 + numCols = subMatrix->numCols; 8.248 + numRows = subMatrix->numRows; 8.249 + origStartRow = subMatrix->origStartRow; 8.250 + origStartCol = subMatrix->origStartCol; 8.251 + origStride = origMatrix->numCols; 8.252 + 8.253 + subArray = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr); 8.254 + subMatrix->array = subArray; 8.255 + 8.256 + //copy values from orig matrix to local 8.257 + copyTranspose( numRows, numCols, 8.258 + origStartRow, origStartCol, origStride, 8.259 + subArray, origArray ); 8.260 + 8.261 + SSR__end_data_singleton( &(subMatrix->copyTransSingleton), animPr ); 8.262 + 8.263 + return; 8.264 + } 8.265 + 8.266 + 8.267 +void inline 8.268 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 8.269 + { int numCols, numRows, origStartRow, origStartCol, stride, origStride; 8.270 + Matrix *origMatrix; 8.271 + float32 *origArray, *subArray; 8.272 + 8.273 + 8.274 + //This lets only a single VP execute the code between start and 8.275 + // end -- using start and end so that work runs outside the master. 8.276 + //Inside, if a second VP ever executes the start, it will be returned 8.277 + // from the end-point. 8.278 + //Note, for non-GCC, can add a second SSR call at the end, and inside 8.279 + // that one, look at the stack at the return addr & save that in an 8.280 + // array indexed by singletonID 8.281 +// if( subMatrix->copySingleton && subMatrix->copySingleton->hasFinished )\ 8.282 + return; 8.283 + SSR__start_data_singleton( &(subMatrix->copySingleton), animPr ); 8.284 + if( subMatrix->copySingleton->endInstrAddr ) 8.285 + { 8.286 + printf("error!"); 8.287 + } 8.288 + 8.289 + if( subMatrix->copySingleton->hasFinished ) 8.290 + { 8.291 + printf("error!"); 8.292 + } 8.293 + 8.294 + origMatrix = subMatrix->origMatrix; 8.295 + origArray = origMatrix->array; 8.296 + numCols = subMatrix->numCols; 8.297 + numRows = subMatrix->numRows; 8.298 + origStartRow = subMatrix->origStartRow; 8.299 + origStartCol = subMatrix->origStartCol; 8.300 + origStride = origMatrix->numCols; 8.301 + 8.302 + subArray = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr); 8.303 + subMatrix->array = subArray; 8.304 + 8.305 + //copy values from orig matrix to local 8.306 + stride = numCols; 8.307 + 8.308 + int row, col, offset, origOffset; 8.309 + for( row = 0; row < numRows; row++ ) 8.310 + { 8.311 + offset = row * stride; 8.312 + origOffset = (row + origStartRow) * origStride + origStartCol; 8.313 + for( col = 0; col < numCols; col++ ) 8.314 + { 8.315 + subArray[ offset + col ] = origArray[ origOffset + col ]; 8.316 + } 8.317 + } 8.318 + 8.319 + SSR__end_data_singleton( &(subMatrix->copySingleton), animPr ); 8.320 + 8.321 + return; 8.322 + }
9.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 9.2 +++ b/main.c Tue Feb 07 14:07:38 2012 -0800 9.3 @@ -0,0 +1,37 @@ 9.4 +/* 9.5 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 9.6 + * Licensed under GNU General Public License version 2 9.7 + * 9.8 + * author seanhalle@yahoo.com 9.9 + */ 9.10 + 9.11 +#include <malloc.h> 9.12 +#include <stdlib.h> 9.13 + 9.14 +#include "Matrix_Mult.h" 9.15 +#include "SSR_Matrix_Mult/SSR_Matrix_Mult.h" 9.16 + 9.17 +char __ProgrammName[] = "Blocked Matrix Multiply"; 9.18 +char __DataSet[255]; 9.19 +/** 9.20 + * 9.21 + */ 9.22 +int main( int argc, char **argv ) 9.23 + { Matrix *leftMatrix, *rightMatrix, *resultMatrix; 9.24 + ParamBag *paramBag; 9.25 + 9.26 + printf( "arguments: %s | %s\n", argv[0], argv[1] ); 9.27 + 9.28 + paramBag = makeParamBag(); 9.29 + readParamFileIntoBag( argv[1], paramBag ); 9.30 + initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); 9.31 + 9.32 + resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix ); 9.33 + 9.34 + printf("\nresult matrix: \n"); 9.35 + printMatrix( resultMatrix ); 9.36 +// SSR__print_stats(); 9.37 + fflush(stdin); 9.38 + 9.39 + exit(0); //cleans up 9.40 + }
10.1 --- a/src/Application/Matrix_Mult.c Wed Sep 07 13:06:25 2011 +0200 10.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 10.3 @@ -1,167 +0,0 @@ 10.4 -/* 10.5 - * Copyright 2009 OpenSourceStewardshipFoundation.org 10.6 - * Licensed under GNU General Public License version 2 10.7 - * 10.8 - * Author: seanhalle@yahoo.com 10.9 - * 10.10 - * Created on November 15, 2009, 2:35 AM 10.11 - */ 10.12 - 10.13 -#include <malloc.h> 10.14 -#include <stdlib.h> 10.15 - 10.16 -#include "Matrix_Mult.h" 10.17 -#include "ParamHelper/Param.h" 10.18 - 10.19 - 10.20 - 10.21 - void 10.22 -initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 10.23 - ParamBag *paramBag ) 10.24 - { char *leftMatrixFileName, *rightMatrixFileName; 10.25 - int leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols; 10.26 - 10.27 - ParamStruc *param; 10.28 - param = getParamFromBag( "leftMatrixRows", paramBag ); 10.29 - leftMatrixRows = param->intValue; 10.30 - param = getParamFromBag( "leftMatrixCols", paramBag ); 10.31 - leftMatrixCols = param->intValue; 10.32 - *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols ); 10.33 - 10.34 - param = getParamFromBag( "leftMatrixFileName", paramBag ); 10.35 - leftMatrixFileName = param->strValue; //no need to copy 10.36 - read_Matrix_From_File( *leftMatrix, leftMatrixFileName ); 10.37 - 10.38 - param = getParamFromBag( "rightMatrixRows", paramBag ); 10.39 - rightMatrixRows = param->intValue; 10.40 - param = getParamFromBag( "rightMatrixCols", paramBag ); 10.41 - rightMatrixCols = param->intValue; 10.42 - *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols ); 10.43 - 10.44 - param = getParamFromBag( "rightMatrixFileName", paramBag ); 10.45 - rightMatrixFileName = param->strValue; 10.46 - read_Matrix_From_File( *rightMatrix, rightMatrixFileName ); 10.47 - } 10.48 - 10.49 - 10.50 -void parseLineIntoRow( char *line, float32* row ); 10.51 - 10.52 - 10.53 - void 10.54 -read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ) 10.55 - { int row, maxRead, numRows, numCols; 10.56 - float32 *matrixStart; 10.57 - size_t lineSz = 0; 10.58 - FILE *file; 10.59 - char *line = NULL; 10.60 - 10.61 - lineSz = 50000; //max length of line in a matrix data file 10.62 - line = (char *) malloc( lineSz ); 10.63 - if( line == NULL ) printf( "no mem for matrix line" ); 10.64 - 10.65 - numRows = matrixStruc->numRows; 10.66 - numCols = matrixStruc->numCols; 10.67 - matrixStart = matrixStruc->array; 10.68 - 10.69 - file = fopen( matrixFileName, "r" ); 10.70 - if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} 10.71 - fseek( file, 0, SEEK_SET ); 10.72 - for( row = 0; row < numRows; row++ ) 10.73 - { 10.74 - if( feof( file ) ) printf( "file ran out too soon" ); 10.75 - maxRead = getline( &line, &lineSz, file ); 10.76 - if( maxRead == -1 ) printf( "prob reading mat line"); 10.77 - 10.78 - if( *line == '\n') continue; //blank line 10.79 - if( *line == '/' ) continue; //comment line 10.80 - 10.81 - parseLineIntoRow( line, matrixStart + row * numCols ); 10.82 - } 10.83 - free( line ); 10.84 - } 10.85 - 10.86 -/*This function relies on each line having the proper number of cols. It 10.87 - * doesn't check, nor enforce, so if the file is improperly formatted it 10.88 - * can write over unrelated memory 10.89 - */ 10.90 - void 10.91 -parseLineIntoRow( char *line, float32* row ) 10.92 - { 10.93 - char *valueStr, *searchPos; 10.94 - 10.95 - //read the float values 10.96 - searchPos = valueStr = line; //start 10.97 - 10.98 - for( ; *searchPos != 0; searchPos++) //bit dangerous, should use buff len 10.99 - { 10.100 - if( *searchPos == '\n' ) //last col.. relying on well-formatted file 10.101 - { *searchPos = 0; 10.102 - *row = atof( valueStr ); 10.103 - break; //end FOR loop 10.104 - } 10.105 - if( *searchPos == ',' ) 10.106 - { *searchPos = 0; //mark end of string 10.107 - *row = (float32) atof( valueStr ); 10.108 - row += 1; //address arith 10.109 - //skip any spaces before digits.. use searchPos + 1 to skip the 0 10.110 - for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++); 10.111 - valueStr = searchPos + 1; 10.112 - } 10.113 - } 10.114 - } 10.115 - 10.116 - //========================================================================== 10.117 - 10.118 -/*In the "_Flat" version of constructor, do only malloc of the top data struc 10.119 - * and set values in that top-level. Don't malloc any sub-structures. 10.120 - */ 10.121 - Matrix * 10.122 -makeMatrix_Flat( int32 numRows, int32 numCols ) 10.123 - { Matrix * retMatrix; 10.124 - retMatrix = malloc( sizeof( Matrix ) ); 10.125 - retMatrix->numRows = numRows; 10.126 - retMatrix->numCols = numCols; 10.127 - 10.128 - return retMatrix; 10.129 - } 10.130 - 10.131 - Matrix * 10.132 -makeMatrix_WithResMat( int32 numRows, int32 numCols ) 10.133 - { Matrix * retMatrix; 10.134 - retMatrix = malloc( sizeof( Matrix ) ); 10.135 - retMatrix->numRows = numRows; 10.136 - retMatrix->numCols = numCols; 10.137 - retMatrix->array = malloc( numRows * numCols * sizeof(float32) ); 10.138 - 10.139 - return retMatrix; 10.140 - } 10.141 - 10.142 - void 10.143 -freeMatrix_Flat( Matrix * matrix ) 10.144 - { //( matrix ); 10.145 - } 10.146 - void 10.147 -freeMatrix( Matrix * matrix ) 10.148 - { free( matrix->array ); 10.149 - free( matrix ); 10.150 - } 10.151 - 10.152 -void 10.153 -printMatrix( Matrix *matrix ) 10.154 - { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; 10.155 - float32 *matrixArray; 10.156 - 10.157 - numRows = rowsToPrint = matrix->numRows; 10.158 - numCols = colsToPrint = matrix->numCols; 10.159 - matrixArray = matrix->array; 10.160 - 10.161 - rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed 10.162 - colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed 10.163 - for( r = 0; r < numRows; r += rowIncr ) 10.164 - { for( c = 0; c < numCols; c += colIncr ) 10.165 - { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); 10.166 - } 10.167 - printf("\n"); 10.168 - } 10.169 - } 10.170 -
11.1 --- a/src/Application/Matrix_Mult.h Wed Sep 07 13:06:25 2011 +0200 11.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 11.3 @@ -1,77 +0,0 @@ 11.4 -/* 11.5 - * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 11.6 - * Licensed under GNU General Public License version 2 11.7 - */ 11.8 - 11.9 -#ifndef MATRIX_MULT_H_ 11.10 -#define MATRIX_MULT_H_ 11.11 - 11.12 -#include <stdio.h> 11.13 -#include <unistd.h> 11.14 -#include <malloc.h> 11.15 - 11.16 -#include "../SSR_lib/VMS/VMS_primitive_data_types.h" 11.17 -#include "ParamHelper/Param.h" 11.18 - 11.19 -//============================== Structures ============================== 11.20 - 11.21 -typedef 11.22 -struct 11.23 - { int32 numRows; 11.24 - int32 numCols; 11.25 - float32 *array; //2D, but dynamically sized, so use addr arith 11.26 - } 11.27 -Matrix; 11.28 - 11.29 -/* This is the "appSpecificPiece" that is carried inside a DKUPiece. 11.30 - * In the DKUPiece data struc it is declared to be of type "void *". This 11.31 - * allows the application to define any data structure it wants and put it 11.32 - * into a DKUPiece. 11.33 - * When the app specific info is used, it is in app code, so it is cast to 11.34 - * the correct type to tell the compiler how to access fields. 11.35 - * This keeps all app-specific things out of the DKU directory, as per the 11.36 - * DKU standard. */ 11.37 -typedef 11.38 -struct 11.39 - { 11.40 - // pointers to shared data.. the result matrix must be created when the 11.41 - // left and right matrices are put into the root ancestor DKUPiece. 11.42 - Matrix * leftMatrix; 11.43 - Matrix * rightMatrix; 11.44 - Matrix * resultMatrix; 11.45 - 11.46 - // define the starting and ending boundaries for this piece of the 11.47 - // result matrix. These are derivable from the left and right 11.48 - // matrices, but included them for readability of code. 11.49 - int prodStartRow, prodEndRow; 11.50 - int prodStartCol, prodEndCol; 11.51 - // Start and end of the portion of the left matrix that contributes to 11.52 - // this piece of the product 11.53 - int leftStartRow, leftEndRow; 11.54 - int leftStartCol, leftEndCol; 11.55 - // Start and end of the portion of the right matrix that contributes to 11.56 - // this piece of the product 11.57 - int rightStartRow, rightEndRow; 11.58 - int rightStartCol, rightEndCol; 11.59 - } 11.60 -MatrixProdPiece; 11.61 - 11.62 -//============================== Functions ================================ 11.63 -void readFile(); 11.64 - 11.65 -Matrix *makeMatrix( int32 numRows, int32 numCols ); 11.66 -Matrix *makeMatrix_Flat( int32 numRows, int32 numCols ); 11.67 -Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols ); 11.68 -void freeMatrix_Flat( Matrix * matrix ); 11.69 -void freeMatrix( Matrix * matrix ); 11.70 -void printMatrix( Matrix *matrix ); 11.71 - 11.72 -void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ); 11.73 - 11.74 -void 11.75 -initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 11.76 - ParamBag *paramBag ); 11.77 - 11.78 -//=========================================================================== 11.79 - 11.80 -#endif /*MATRIX_MULT_H_*/
12.1 --- a/src/Application/SSR_Matrix_Mult/Divide_Pr.c Wed Sep 07 13:06:25 2011 +0200 12.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 12.3 @@ -1,603 +0,0 @@ 12.4 -/* 12.5 - * Copyright 2009 OpenSourceStewardshipFoundation.org 12.6 - * Licensed under GNU General Public License version 2 12.7 - * 12.8 - * Author: seanhalle@yahoo.com 12.9 - * 12.10 - */ 12.11 - 12.12 - 12.13 -#include "SSR_Matrix_Mult.h" 12.14 -#include <math.h> 12.15 -#include <string.h> 12.16 - 12.17 - //The time to compute this many result values should equal the time to 12.18 - // perform this division on a matrix of size gives that many result calcs 12.19 - //IE, size this so that sequential time to calc equals divide time 12.20 - // find the value by experimenting -- but divide time and calc time scale 12.21 - // same way, so this value should remain valid across hardware 12.22 -#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000 12.23 - 12.24 - 12.25 -//=========================================================================== 12.26 -int inline 12.27 -measureMatrixMultPrimitive( VirtProcr *animPr ); 12.28 - 12.29 -SlicingStrucCarrier * 12.30 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 12.31 - VirtProcr *animPr ); 12.32 - 12.33 -SlicingStruc * 12.34 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 12.35 - VirtProcr *animPr ); 12.36 - 12.37 -void 12.38 -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ); 12.39 - 12.40 -SubMatrix ** 12.41 -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 12.42 - int32 numUses, Matrix *origMatrix, VirtProcr *animPr ); 12.43 - 12.44 -void 12.45 -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 12.46 - SubMatrix **subMatrices, VirtProcr *animPr ); 12.47 - 12.48 -void 12.49 -pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, 12.50 - SubMatrix **rightSubMatrices, 12.51 - int32 numRowIdxs, int32 numColIdxs, 12.52 - int32 numVecIdxs, 12.53 - VirtProcr *resultPr, 12.54 - VirtProcr *animatingPr ); 12.55 - 12.56 -void 12.57 -makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, 12.58 - SlicingStrucCarrier *slicingStrucCarrier, 12.59 - VirtProcr *resultPr, VirtProcr *animatingPr ); 12.60 - 12.61 - 12.62 - 12.63 -/*Divider creates one processor for every sub-matrix 12.64 - * It hands them: 12.65 - * the name of the result processor that they should send their results to, 12.66 - * the left and right matrices, and the rows and cols they should multiply 12.67 - * It first creates the result processor, then all the sub-matrixPair 12.68 - * processors, 12.69 - * then does a receive of a message from the result processor that gives 12.70 - * the divider ownership of the result matrix. 12.71 - * Finally, the divider returns the result matrix out of the SSR system. 12.72 - * 12.73 - * Divider chooses the size of sub-matrices via an algorithm that tries to 12.74 - * keep the minimum work above a threshold. The threshold is machine- 12.75 - * dependent, so ask SSR for min work-unit time to get a 12.76 - * given overhead 12.77 - * 12.78 - * Divide min work-unit cycles by measured-cycles for one matrix-cell 12.79 - * product -- gives the number of products need to have in min size 12.80 - * matrix. 12.81 - * 12.82 - * So then, take cubed root of this to get the size of a side of min sub- 12.83 - * matrix. That is the size of the ideal square sub-matrix -- so tile 12.84 - * up the two input matrices into ones as close as possible to that size, 12.85 - * and create the pairs of sub-matrices. 12.86 - * 12.87 - *======================== STRATEGIC OVERVIEW ======================= 12.88 - * 12.89 - *This division is a bit tricky, because have to create things in advance 12.90 - * that it's not at first obvious need to be created.. 12.91 - * 12.92 - *First slice up each dimension -- three of them.. this is because will have 12.93 - * to create the sub-matrix's data-structures before pairing the sub-matrices 12.94 - * with each other -- so, have three dimensions to slice up before can 12.95 - * create the sub-matrix data-strucs -- also, have to be certain that the 12.96 - * cols of the left input have the exact same slicing as the rows of the 12.97 - * left matrix, so just to be sure, do the slicing calc once, then use it 12.98 - * for both. 12.99 - * 12.100 - *So, goes like this: 12.101 - *1) calculate the start & end values of each dimension in each matrix. 12.102 - *2) use those values to create sub-matrix structures 12.103 - *3) combine sub-matrices into pairs, as the tasks to perform. 12.104 - * 12.105 - *Have to calculate separately from creating the sub-matrices because of the 12.106 - * nature of the nesting -- would either end up creating the same sub-matrix 12.107 - * multiple times, or else would have to put in detection of whether had 12.108 - * made a particular one already if tried to combine steps 1 and 2. 12.109 - * 12.110 - *Step 3 has to be separate because of the nesting, as well -- same reason, 12.111 - * would either create same sub-matrix multiple times, or else have to 12.112 - * add detection of whether was already created. 12.113 - * 12.114 - *Another way to look at it: there's one level of loop to divide dimensions, 12.115 - * two levels of nesting to create sub-matrices, and three levels to pair 12.116 - * up the sub-matrices. 12.117 - */ 12.118 - 12.119 -void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, 12.120 - VirtProcr *animPr ) 12.121 - { VirtProcr *resultPr; 12.122 - DividerParams *dividerParams; 12.123 - ResultsParams *resultsParams; 12.124 - Matrix *leftMatrix, *rightMatrix, *resultMatrix; 12.125 - void *msg; 12.126 - SlicingStrucCarrier *slicingStrucCarrier; 12.127 - float32 *resultArray; //points to array inside result matrix 12.128 - 12.129 - DEBUG( dbgAppFlow, "start divide\n") 12.130 - 12.131 - int32 12.132 - divideProbe = VMS__create_single_interval_probe( "divideProbe", 12.133 - animPr ); 12.134 - VMS__record_sched_choice_into_probe( divideProbe, animPr ); 12.135 - VMS__record_interval_start_in_probe( divideProbe ); 12.136 - 12.137 - //=========== Setup -- make local copies of ptd-to-things, malloc, aso 12.138 - int32 numResRows, numResCols, vectLength; 12.139 - 12.140 - dividerParams = (DividerParams *)_dividerParams; 12.141 - 12.142 - leftMatrix = dividerParams->leftMatrix; 12.143 - rightMatrix = dividerParams->rightMatrix; 12.144 - 12.145 - vectLength = leftMatrix->numCols; 12.146 - numResRows = leftMatrix->numRows; 12.147 - numResCols = rightMatrix->numCols; 12.148 - resultArray = dividerParams->resultMatrix->array; 12.149 - 12.150 - //zero the result array 12.151 - memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); 12.152 - 12.153 - //============== Do either sequential mult or do division ============== 12.154 - 12.155 - //Check if input matrices too small -- if yes, just do sequential 12.156 - //Cutoff is determined by overhead of this divider -- relatively 12.157 - // machine-independent 12.158 - if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * 12.159 - (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) 12.160 - { 12.161 - //====== Do sequential multiply on a single core 12.162 - DEBUG( dbgAppFlow, "doing sequential") 12.163 - 12.164 - //transpose the right matrix 12.165 - float32 * 12.166 - transRightArray = SSR__malloc_to( rightMatrix->numRows * 12.167 - rightMatrix->numCols * sizeof(float32), 12.168 - animPr ); 12.169 - 12.170 - //copy values from orig matrix to local 12.171 - copyTranspose( rightMatrix->numRows, rightMatrix->numCols, 12.172 - 0, 0, rightMatrix->numRows, 12.173 - transRightArray, rightMatrix->array ); 12.174 - 12.175 - multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 12.176 - leftMatrix->array, transRightArray, 12.177 - resultArray ); 12.178 - } 12.179 - else 12.180 - { 12.181 - //====== Do parallel multiply across cores 12.182 - 12.183 - //Calc the ideal size of sub-matrix and slice up the dimensions of 12.184 - // the two matrices. 12.185 - //The ideal size is the one takes the number of cycles to calculate 12.186 - // such that calc time is equal or greater than min work-unit size 12.187 - slicingStrucCarrier = 12.188 - calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr ); 12.189 - 12.190 - //Make the results processor, now that know how many to wait for 12.191 - resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr ); 12.192 - resultsParams->numSubMatrixPairs = 12.193 - slicingStrucCarrier->leftRowSlices->numVals * 12.194 - slicingStrucCarrier->rightColSlices->numVals * 12.195 - slicingStrucCarrier->vecSlices->numVals; 12.196 - resultsParams->dividerPr = animPr; 12.197 - resultsParams->numCols = rightMatrix->numCols; 12.198 - resultsParams->numRows = leftMatrix->numRows; 12.199 - resultsParams->resultArray = resultArray; 12.200 - 12.201 - 12.202 - resultPr = 12.203 - SSR__create_procr_with( &gatherResults, resultsParams, animPr); 12.204 - 12.205 - //Make the sub-matrices, and pair them up, and make processor to 12.206 - // calc product of each pair. 12.207 - makeSubMatricesAndProcrs( leftMatrix, rightMatrix, 12.208 - slicingStrucCarrier, 12.209 - resultPr, animPr); 12.210 - 12.211 - //result array is allocated externally, so no message from resultPr 12.212 - // however, do have to wait before printing out stats, so wait 12.213 - // for an empty handshake message 12.214 - msg = SSR__receive_from_to( resultPr, animPr ); 12.215 - } 12.216 - 12.217 - 12.218 - //=============== Work done -- send results back ================= 12.219 - 12.220 - 12.221 - DEBUG( dbgAppFlow, "end divide\n") 12.222 - 12.223 - VMS__record_interval_end_in_probe( divideProbe ); 12.224 - VMS__print_stats_of_all_probes(); 12.225 - 12.226 - //nothing left to do so dissipate, SSR will wait to shutdown and hence 12.227 - // make results available to outside until all the processors have 12.228 - // dissipated -- so no need to wait for results processor 12.229 - 12.230 - SSR__dissipate_procr( animPr ); //all procrs dissipate self at end 12.231 - //when all of the processors have dissipated, the "create seed and do 12.232 - // work" call in the entry point function returns 12.233 - } 12.234 - 12.235 - 12.236 -SlicingStrucCarrier * 12.237 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 12.238 - VirtProcr *animPr ) 12.239 - { 12.240 - float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; 12.241 - SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 12.242 - SlicingStrucCarrier *slicingStrucCarrier = 12.243 - SSR__malloc_to(sizeof(SlicingStrucCarrier), animPr); 12.244 - 12.245 - int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; 12.246 - float64 numPrimitiveOpsInMinWorkUnit; 12.247 - 12.248 - 12.249 - //======= Calc ideal size of min-sized sub-matrix ======== 12.250 - 12.251 - //ask SSR for the number of cycles of the minimum work unit, at given 12.252 - // percent overhead then add a guess at overhead from this divider 12.253 - minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 ); 12.254 - 12.255 - //ask SSR for number of cycles of the "primitive" op of matrix mult 12.256 - primitiveCycles = measureMatrixMultPrimitive( animPr ); 12.257 - 12.258 - numPrimitiveOpsInMinWorkUnit = 12.259 - (float64)minWorkUnitCycles / (float64)primitiveCycles; 12.260 - 12.261 - //take cubed root -- that's number of these in a "side" of sub-matrix 12.262 - // then multiply by 5 because the primitive is 5x5 12.263 - idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); 12.264 - 12.265 - idealNumWorkUnits = SSR__giveIdealNumWorkUnits(); 12.266 - 12.267 - idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); 12.268 - idealSizeOfSide2 *= 0.6; //finer granularity to help load balance 12.269 - 12.270 - if( idealSizeOfSide1 > idealSizeOfSide2 ) 12.271 - idealSizeOfSide = idealSizeOfSide1; 12.272 - else 12.273 - idealSizeOfSide = idealSizeOfSide2; 12.274 - 12.275 - //The multiply inner loop blocks the array to fit into L1 cache 12.276 -// if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK; 12.277 - 12.278 - //============ Slice up dimensions, now that know target size =========== 12.279 - 12.280 - //Tell the slicer the target size of a side (floating pt), the start 12.281 - // value to start slicing at, and the end value to stop slicing at 12.282 - //It returns an array of start value of each chunk, plus number of them 12.283 - int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol; 12.284 - startLeftRow = 0; 12.285 - endLeftRow = leftMatrix->numRows -1; 12.286 - startVec = 0; 12.287 - endVec = leftMatrix->numCols -1; 12.288 - startRightCol = 0; 12.289 - endRightCol = rightMatrix->numCols -1; 12.290 - 12.291 - leftRowSlices = 12.292 - sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr ); 12.293 - 12.294 - vecSlices = 12.295 - sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr ); 12.296 - 12.297 - rightColSlices = 12.298 - sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr); 12.299 - 12.300 - slicingStrucCarrier->leftRowSlices = leftRowSlices; 12.301 - slicingStrucCarrier->vecSlices = vecSlices; 12.302 - slicingStrucCarrier->rightColSlices = rightColSlices; 12.303 - 12.304 - return slicingStrucCarrier; 12.305 - } 12.306 - 12.307 - 12.308 -void 12.309 -makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, 12.310 - SlicingStrucCarrier *slicingStrucCarrier, 12.311 - VirtProcr *resultPr, VirtProcr *animPr ) 12.312 - { 12.313 - SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 12.314 - 12.315 - leftRowSlices = slicingStrucCarrier->leftRowSlices; 12.316 - vecSlices = slicingStrucCarrier->vecSlices; 12.317 - rightColSlices = slicingStrucCarrier->rightColSlices; 12.318 - SSR__free( slicingStrucCarrier, animPr ); 12.319 - 12.320 - //================ Make sub-matrices, given the slicing ================ 12.321 - SubMatrix **leftSubMatrices, **rightSubMatrices; 12.322 - leftSubMatrices = 12.323 - createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals, 12.324 - leftMatrix, animPr ); 12.325 - //double_check_that_always_numRows_in_right_same_as_numCols_in_left(); 12.326 - rightSubMatrices = 12.327 - createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals, 12.328 - rightMatrix, animPr ); 12.329 - 12.330 - 12.331 - //============== pair the sub-matrices and make processors ============== 12.332 - int32 numRowIdxs, numColIdxs, numVecIdxs; 12.333 - 12.334 - numRowIdxs = leftRowSlices->numVals; 12.335 - numColIdxs = rightColSlices->numVals; 12.336 - numVecIdxs = vecSlices->numVals; 12.337 - 12.338 - 12.339 - freeSlicingStruc( leftRowSlices, animPr ); 12.340 - freeSlicingStruc( vecSlices, animPr ); 12.341 - freeSlicingStruc( rightColSlices, animPr ); 12.342 - 12.343 - pairUpSubMatricesAndMakeProcessors( leftSubMatrices, 12.344 - rightSubMatrices, 12.345 - numRowIdxs, numColIdxs, 12.346 - numVecIdxs, 12.347 - resultPr, 12.348 - animPr ); 12.349 - } 12.350 - 12.351 - 12.352 - 12.353 - 12.354 -void 12.355 -pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, 12.356 - SubMatrix **rightSubMatrices, 12.357 - int32 numRowIdxs, int32 numColIdxs, 12.358 - int32 numVecIdxs, 12.359 - VirtProcr *resultPr, 12.360 - VirtProcr *animatingPr ) 12.361 - { 12.362 - int32 resRowIdx, resColIdx, vecIdx; 12.363 - int32 numLeftColIdxs, numRightColIdxs; 12.364 - int32 leftRowIdxOffset; 12.365 - SMPairParams *subMatrixPairParams; 12.366 - float32 numToPutOntoEachCore, leftOverFraction; 12.367 - int32 numCores, coreToScheduleOnto, numVecOnCurrCore; 12.368 - 12.369 - numLeftColIdxs = numColIdxs; 12.370 - numRightColIdxs = numVecIdxs; 12.371 - 12.372 - numCores = SSR__give_number_of_cores_to_schedule_onto(); 12.373 - 12.374 - numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; 12.375 - leftOverFraction = 0; 12.376 - numVecOnCurrCore = 0; 12.377 - coreToScheduleOnto = 0; 12.378 - 12.379 - for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) 12.380 - { 12.381 - leftRowIdxOffset = resRowIdx * numLeftColIdxs; 12.382 - 12.383 - for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) 12.384 - { 12.385 - 12.386 - for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) 12.387 - { 12.388 - //Make the processor for the pair of sub-matrices 12.389 - subMatrixPairParams = SSR__malloc_to( sizeof(SMPairParams), 12.390 - animatingPr); 12.391 - subMatrixPairParams->leftSubMatrix = 12.392 - leftSubMatrices[ leftRowIdxOffset + vecIdx ]; 12.393 - 12.394 - subMatrixPairParams->rightSubMatrix = 12.395 - rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; 12.396 - 12.397 - subMatrixPairParams->resultPr = resultPr; 12.398 - 12.399 - //put all pairs from the same vector onto same core 12.400 - SSR__create_procr_with_affinity( &calcSubMatrixProduct, 12.401 - subMatrixPairParams, 12.402 - animatingPr, 12.403 - coreToScheduleOnto ); 12.404 - } 12.405 - 12.406 - //Trying to distribute the subMatrix-vectors across the cores, so 12.407 - // that each core gets the same number of vectors, with a max 12.408 - // imbalance of 1 vector more on some cores than others 12.409 - numVecOnCurrCore += 1; 12.410 - if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 ) 12.411 - { 12.412 - //deal with fractional part, to ensure that imbalance is 1 max 12.413 - // IE, core with most has only 1 more than core with least 12.414 - leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore; 12.415 - if( leftOverFraction >= 1 ) 12.416 - { leftOverFraction -= 1; 12.417 - numVecOnCurrCore = -1; 12.418 - } 12.419 - else 12.420 - { numVecOnCurrCore = 0; 12.421 - } 12.422 - //Move to next core, max core-value to incr to is numCores -1 12.423 - if( coreToScheduleOnto >= numCores -1 ) 12.424 - { coreToScheduleOnto = 0; 12.425 - } 12.426 - else 12.427 - { coreToScheduleOnto += 1; 12.428 - } 12.429 - } 12.430 - 12.431 - } 12.432 - } 12.433 - 12.434 - } 12.435 - 12.436 - 12.437 - 12.438 -/*Walk through the two slice-strucs, making sub-matrix strucs as go 12.439 - */ 12.440 -SubMatrix ** 12.441 -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 12.442 - int32 numUses, Matrix *origMatrix, VirtProcr *animPr ) 12.443 - { 12.444 - int32 numRowIdxs, numColIdxs, rowIdx, colIdx; 12.445 - int32 startRow, endRow, startCol, endCol; 12.446 - int32 *rowStartVals, *colStartVals; 12.447 - int32 rowOffset; 12.448 - SubMatrix **subMatrices, *newSubMatrix; 12.449 - 12.450 - numRowIdxs = rowSlices->numVals; 12.451 - numColIdxs = colSlices->numVals; 12.452 - 12.453 - rowStartVals = rowSlices->startVals; 12.454 - colStartVals = colSlices->startVals; 12.455 - 12.456 - subMatrices = SSR__malloc_to(numRowIdxs * numColIdxs * sizeof(SubMatrix*), 12.457 - animPr ); 12.458 - 12.459 - for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 12.460 - { 12.461 - rowOffset = rowIdx * numColIdxs; 12.462 - 12.463 - startRow = rowStartVals[rowIdx]; 12.464 - endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is 12.465 - // at last valid idx + 1 & is 12.466 - // 1 greater than end value 12.467 - for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 12.468 - { 12.469 - startCol = colStartVals[colIdx]; 12.470 - endCol = colStartVals[colIdx + 1] -1; 12.471 - 12.472 - newSubMatrix = SSR__malloc_to( sizeof(SubMatrix), animPr ); 12.473 - newSubMatrix->numRows = endRow - startRow +1; 12.474 - newSubMatrix->numCols = endCol - startCol +1; 12.475 - newSubMatrix->origMatrix = origMatrix; 12.476 - newSubMatrix->origStartRow = startRow; 12.477 - newSubMatrix->origStartCol = startCol; 12.478 - newSubMatrix->copySingleton = NULL; 12.479 - newSubMatrix->numUsesLeft = numUses; //can free after this many 12.480 - //Prevent uninitialized memory 12.481 - newSubMatrix->copySingleton = NULL; 12.482 - newSubMatrix->copyTransSingleton = NULL; 12.483 - 12.484 - subMatrices[ rowOffset + colIdx ] = newSubMatrix; 12.485 - } 12.486 - } 12.487 - return subMatrices; 12.488 - } 12.489 - 12.490 - 12.491 -void 12.492 -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 12.493 - SubMatrix **subMatrices, VirtProcr *animPr ) 12.494 - { 12.495 - int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; 12.496 - SubMatrix *subMatrix; 12.497 - 12.498 - numRowIdxs = rowSlices->numVals; 12.499 - numColIdxs = colSlices->numVals; 12.500 - 12.501 - for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 12.502 - { 12.503 - rowOffset = rowIdx * numColIdxs; 12.504 - for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 12.505 - { 12.506 - subMatrix = subMatrices[ rowOffset + colIdx ]; 12.507 - if( subMatrix->alreadyCopied ) 12.508 - SSR__free( subMatrix->array, animPr ); 12.509 - SSR__free( subMatrix, animPr ); 12.510 - } 12.511 - } 12.512 - SSR__free( subMatrices, animPr ); 12.513 - } 12.514 - 12.515 - 12.516 - 12.517 -SlicingStruc * 12.518 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 12.519 - VirtProcr *animPr ) 12.520 - { float32 residualAcc = 0; 12.521 - int numSlices, i, *startVals, sizeOfSlice, endCondition; 12.522 - SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr); 12.523 - 12.524 - //calc size of matrix need to hold start vals -- 12.525 - numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); 12.526 - 12.527 - startVals = SSR__malloc_to( (numSlices + 1) * sizeof(int32), animPr ); 12.528 - 12.529 - //Calc the upper limit of start value -- when get above this, end loop 12.530 - // by saving highest value of the matrix dimension to access, plus 1 12.531 - // as the start point of the imaginary slice following the last one 12.532 - //Plus 1 because go up to value but not include when process last slice 12.533 - //The stopping condition is half-a-size less than highest value because 12.534 - // don't want any pieces smaller than half the ideal size -- just tack 12.535 - // little ones onto end of last one 12.536 - endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size 12.537 - for( i = 0; startVal <= endVal; i++ ) 12.538 - { 12.539 - startVals[i] = startVal; 12.540 - residualAcc += idealSizeOfSide; 12.541 - sizeOfSlice = (int)residualAcc; 12.542 - residualAcc -= (float32)sizeOfSlice; 12.543 - startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. 12.544 - 12.545 - if( startVal > endCondition ) 12.546 - { startVal = endVal + 1; 12.547 - startVals[ i + 1 ] = startVal; 12.548 - } 12.549 - } 12.550 - 12.551 - slicingStruc->startVals = startVals; 12.552 - slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 12.553 - // which means is num sub-matrices in dim 12.554 - // also == idx of the fake start just above 12.555 - return slicingStruc; 12.556 - } 12.557 - 12.558 -void 12.559 -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ) 12.560 - { 12.561 - SSR__free( slicingStruc->startVals, animPr ); 12.562 - SSR__free( slicingStruc, animPr ); 12.563 - } 12.564 - 12.565 - 12.566 -int inline 12.567 -measureMatrixMultPrimitive( VirtProcr *animPr ) 12.568 - { 12.569 - int r, c, v, numCycles; 12.570 - float32 *res, *left, *right; 12.571 - 12.572 - //setup inputs 12.573 - left = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); 12.574 - right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); 12.575 - res = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); 12.576 - 12.577 - for( r = 0; r < 5; r++ ) 12.578 - { 12.579 - for( c = 0; c < 5; c++ ) 12.580 - { 12.581 - left[ r * 5 + c ] = r; 12.582 - right[ r * 5 + c ] = c; 12.583 - } 12.584 - } 12.585 - 12.586 - //do primitive 12.587 - SSR__start_primitive(); //for now, just takes time stamp 12.588 - for( r = 0; r < 5; r++ ) 12.589 - { 12.590 - for( c = 0; c < 5; c++ ) 12.591 - { 12.592 - for( v = 0; v < 5; v++ ) 12.593 - { 12.594 - res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; 12.595 - } 12.596 - } 12.597 - } 12.598 - numCycles = 12.599 - SSR__end_primitive_and_give_cycles(); 12.600 - 12.601 - SSR__free( left, animPr ); 12.602 - SSR__free( right, animPr ); 12.603 - SSR__free( res, animPr ); 12.604 - 12.605 - return numCycles; 12.606 - }
13.1 --- a/src/Application/SSR_Matrix_Mult/EntryPoint.c Wed Sep 07 13:06:25 2011 +0200 13.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 13.3 @@ -1,62 +0,0 @@ 13.4 -/* 13.5 - * Copyright 2009 OpenSourceStewardshipFoundation.org 13.6 - * Licensed under GNU General Public License version 2 13.7 - * 13.8 - * Author: seanhalle@yahoo.com 13.9 - * 13.10 - */ 13.11 - 13.12 -#include <math.h> 13.13 - 13.14 -#include "SSR_Matrix_Mult.h" 13.15 - 13.16 - 13.17 - 13.18 -/*Every SSR system has an "entry point" function that creates the first 13.19 - * processor, which starts the chain of creating more processors.. 13.20 - * eventually all of the processors will dissipate themselves, and 13.21 - * return. 13.22 - * 13.23 - *This entry-point function follows the same pattern as all entry-point 13.24 - * functions do: 13.25 - *1) it creates the params for the seed processor, from the 13.26 - * parameters passed into the entry-point function 13.27 - *2) it calls SSR__create_seed_procr_and_do_work 13.28 - *3) it gets the return value from the params struc, frees the params struc, 13.29 - * and returns the value from the function 13.30 - * 13.31 - */ 13.32 -Matrix * 13.33 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) 13.34 - { Matrix *resMatrix; 13.35 - DividerParams *dividerParams; 13.36 - int32 numResRows, numResCols; 13.37 - 13.38 - 13.39 - dividerParams = malloc( sizeof( DividerParams ) ); 13.40 - dividerParams->leftMatrix = leftMatrix; 13.41 - dividerParams->rightMatrix = rightMatrix; 13.42 - 13.43 - 13.44 - numResRows = leftMatrix->numRows; 13.45 - numResCols = rightMatrix->numCols; 13.46 - 13.47 - //VMS has its own separate internal malloc, so to get results out, 13.48 - // have to pass in empty array for it to fill up 13.49 - //The alternative is internally telling SSR make external space to use 13.50 - resMatrix = malloc( sizeof(Matrix) ); 13.51 - resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); 13.52 - resMatrix->numCols = rightMatrix->numCols; 13.53 - resMatrix->numRows = leftMatrix->numRows; 13.54 - 13.55 - 13.56 - dividerParams->resultMatrix = resMatrix; 13.57 - 13.58 - //create divider processor, start doing the work, and wait till done 13.59 - //This function is the "border crossing" between normal code and SSR 13.60 - SSR__create_seed_procr_and_do_work( ÷WorkIntoSubMatrixPairProcrs, 13.61 - dividerParams ); 13.62 - 13.63 - free( dividerParams ); 13.64 - return resMatrix; 13.65 - }
14.1 --- a/src/Application/SSR_Matrix_Mult/Result_Pr.c Wed Sep 07 13:06:25 2011 +0200 14.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 14.3 @@ -1,108 +0,0 @@ 14.4 -/* 14.5 - * Copyright 2009 OpenSourceStewardshipFoundation.org 14.6 - * Licensed under GNU General Public License version 2 14.7 - * 14.8 - * Author: seanhalle@yahoo.com 14.9 - * 14.10 - */ 14.11 - 14.12 -#include "SSR_Matrix_Mult.h" 14.13 - 14.14 -//===================== 14.15 -void inline 14.16 -accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, 14.17 - int32 startRow, 14.18 - int32 numRows, 14.19 - int32 startCol, 14.20 - int32 numCols, 14.21 - int32 numOrigCols ); 14.22 - 14.23 -//=========================================================================== 14.24 - 14.25 -/*The Result Processor gets a message from each of the vector processors, 14.26 - * puts the result from the message in its location in the result- 14.27 - * matrix, and increments the count of results. 14.28 - * 14.29 - *After the count reaches the point that all results have been received, it 14.30 - * returns the result matrix and dissipates. 14.31 - */ 14.32 -void gatherResults( void *_params, VirtProcr *animatingPr ) 14.33 - { VirtProcr *dividerPr; 14.34 - ResultsParams *params; 14.35 - int row, col, numRows, numCols, numSubMatrixPairs, count=0; 14.36 - float32 *resultArray; 14.37 - void *msg; 14.38 - SMPairParams *resParams; 14.39 - 14.40 - DEBUG( dbgAppFlow, "start resultPr\n") 14.41 - 14.42 - params = (ResultsParams *)_params; 14.43 - dividerPr = params->dividerPr; 14.44 - numSubMatrixPairs = params->numSubMatrixPairs; 14.45 - numRows = params->numRows; 14.46 - numCols = params->numCols; 14.47 - 14.48 - resultArray = params->resultArray; 14.49 - 14.50 - 14.51 - while( count < numSubMatrixPairs ) 14.52 - { 14.53 - msg = SSR__receive_type_to( RESULTS_MSG, animatingPr ); 14.54 - 14.55 - resParams = (SMPairParams *)msg; 14.56 - accumulateResult( resultArray, resParams->partialResultArray, 14.57 - resParams->leftSubMatrix->origStartRow, 14.58 - resParams->leftSubMatrix->numRows, 14.59 - resParams->rightSubMatrix->origStartCol, 14.60 - resParams->rightSubMatrix->numCols, 14.61 - resParams->rightSubMatrix->origMatrix->numCols ); 14.62 - 14.63 - SSR__free( resParams->partialResultArray, animatingPr ); 14.64 - 14.65 - //there is only one copy of results procr, so can update numUsesLeft 14.66 - // without concurrency worries. When zero, free the sub-matrix 14.67 - resParams->leftSubMatrix->numUsesLeft -= 1; 14.68 - if( resParams->leftSubMatrix->numUsesLeft == 0 ) 14.69 - { 14.70 - SSR__free( resParams->leftSubMatrix->array, animatingPr ); 14.71 - SSR__free( resParams->leftSubMatrix, animatingPr ); 14.72 - } 14.73 - 14.74 - resParams->rightSubMatrix->numUsesLeft -= 1; 14.75 - if( resParams->rightSubMatrix->numUsesLeft == 0 ) 14.76 - { 14.77 - SSR__free( resParams->rightSubMatrix->array, animatingPr ); 14.78 - SSR__free( resParams->rightSubMatrix, animatingPr ); 14.79 - } 14.80 - 14.81 - //count of how many sub-matrix pairs accumulated so know when done 14.82 - count++; 14.83 - } 14.84 - 14.85 - //Done -- could just dissipate -- SSR will wait for all processors to 14.86 - // dissipate before shutting down, and thereby making results avaial to 14.87 - // outside, so no need to stop the divider from dissipating, so no need 14.88 - // to send a hand-shake message to it -- bug makes debug easier 14.89 - SSR__send_from_to( NULL, animatingPr, dividerPr ); 14.90 - SSR__dissipate_procr( animatingPr ); //frees any data owned by procr 14.91 - } 14.92 - 14.93 -void inline 14.94 -accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray, 14.95 - int32 startRow, 14.96 - int32 numRows, 14.97 - int32 startCol, 14.98 - int32 numCols, 14.99 - int32 numOrigCols ) 14.100 - { int32 row, col; 14.101 - 14.102 - for( row = 0; row < numRows; row++ ) 14.103 - { 14.104 - for( col = 0; col < numCols; col++ ) 14.105 - { 14.106 - resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] += 14.107 - subMatrixPairResultArray[ row * numCols + col ]; 14.108 - } 14.109 - } 14.110 - 14.111 - }
15.1 --- a/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h Wed Sep 07 13:06:25 2011 +0200 15.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 15.3 @@ -1,97 +0,0 @@ 15.4 -/* 15.5 - * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 15.6 - * Licensed under GNU General Public License version 2 15.7 - */ 15.8 - 15.9 -#ifndef _SSR_MATRIX_MULT_H_ 15.10 -#define _SSR_MATRIX_MULT_H_ 15.11 - 15.12 -#include <stdio.h> 15.13 - 15.14 -#include "../../SSR_lib/SSR.h" 15.15 -#include "../Matrix_Mult.h" 15.16 - 15.17 - 15.18 -//=============================== Defines ============================== 15.19 -#define ROWS_IN_BLOCK 32 15.20 -#define COLS_IN_BLOCK 32 15.21 -#define VEC_IN_BLOCK 32 15.22 - 15.23 -#define copyMatrixSingleton 1 15.24 -#define copyTransposeSingleton 2 15.25 - 15.26 -//============================== Structures ============================== 15.27 -typedef struct 15.28 - { 15.29 - Matrix *leftMatrix; 15.30 - Matrix *rightMatrix; 15.31 - Matrix *resultMatrix; 15.32 - } 15.33 -DividerParams; 15.34 - 15.35 -typedef struct 15.36 - { 15.37 - VirtProcr *dividerPr; 15.38 - int numRows; 15.39 - int numCols; 15.40 - int numSubMatrixPairs; 15.41 - float32 *resultArray; 15.42 - } 15.43 -ResultsParams; 15.44 - 15.45 -typedef 15.46 -struct 15.47 - { int32 numRows; 15.48 - int32 numCols; 15.49 - Matrix *origMatrix; 15.50 - int32 origStartRow; 15.51 - int32 origStartCol; 15.52 - int32 alreadyCopied; 15.53 - int32 numUsesLeft; //have update via message to avoid multiple writers 15.54 - SSRSingleton *copySingleton; 15.55 - SSRSingleton *copyTransSingleton; 15.56 - float32 *array; //2D, but dynamically sized, so use addr arith 15.57 - } 15.58 -SubMatrix; 15.59 - 15.60 -typedef struct 15.61 - { VirtProcr *resultPr; 15.62 - SubMatrix *leftSubMatrix; 15.63 - SubMatrix *rightSubMatrix; 15.64 - float32 *partialResultArray; 15.65 - } 15.66 -SMPairParams; 15.67 - 15.68 -typedef 15.69 -struct 15.70 - { int32 numVals; 15.71 - int32 *startVals; 15.72 - } 15.73 -SlicingStruc; 15.74 - 15.75 -typedef 15.76 -struct 15.77 - { 15.78 - SlicingStruc *leftRowSlices; 15.79 - SlicingStruc *vecSlices; 15.80 - SlicingStruc *rightColSlices; 15.81 - } 15.82 -SlicingStrucCarrier; 15.83 - 15.84 -enum MMMsgType 15.85 - { 15.86 - RESULTS_MSG = 1 15.87 - }; 15.88 - 15.89 -//============================= Processor Functions ========================= 15.90 -void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr ); 15.91 -void calcSubMatrixProduct( void *data, VirtProcr *animatingPr ); 15.92 -void gatherResults( void *data, VirtProcr *animatingPr ); 15.93 - 15.94 - 15.95 -//================================ Entry Point ============================== 15.96 -Matrix * 15.97 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); 15.98 - 15.99 - 15.100 -#endif /*_SSR_MATRIX_MULT_H_*/
16.1 --- a/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c Wed Sep 07 13:06:25 2011 +0200 16.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 16.3 @@ -1,319 +0,0 @@ 16.4 -/* 16.5 - * Copyright 2009 OpenSourceStewardshipFoundation.org 16.6 - * Licensed under GNU General Public License version 2 16.7 - * 16.8 - * Author: SeanHalle@yahoo.com 16.9 - * 16.10 - */ 16.11 - 16.12 -#include <string.h> 16.13 - 16.14 -#include "SSR_Matrix_Mult.h" 16.15 - 16.16 - 16.17 - 16.18 -void inline 16.19 -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 16.20 - 16.21 -void inline 16.22 -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 16.23 - 16.24 -void inline 16.25 -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 16.26 - float32 *resArray, 16.27 - int startRow, int endRow, 16.28 - int startCol, int endCol, 16.29 - int startVec, int endVec, 16.30 - int resStride, int inpStride ); 16.31 - 16.32 -void inline 16.33 -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, 16.34 - float32 *leftArray, float32 *rightArray, 16.35 - float32 *resArray ); 16.36 - 16.37 - 16.38 -/*A processor is created with an environment that holds two matrices, 16.39 - * the row and col that it owns, and the name of a result gathering 16.40 - * processor. 16.41 - *It calculates the product of two sub-portions of the input matrices 16.42 - * by using Intel's mkl library for single-core. 16.43 - * 16.44 - *This demonstrates using optimized single-threaded code inside scheduled 16.45 - * work-units. 16.46 - * 16.47 - *When done, it sends the result to the result processor 16.48 - */ 16.49 -void 16.50 -calcSubMatrixProduct( void *data, VirtProcr *animatingPr ) 16.51 - { 16.52 - SMPairParams *params; 16.53 - VirtProcr *resultPr; 16.54 - float32 *leftArray, *rightArray, *resArray; 16.55 - SubMatrix *leftSubMatrix, *rightSubMatrix; 16.56 - 16.57 - DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID) 16.58 - #ifdef TURN_ON_DEBUG_PROBES 16.59 - int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx", 16.60 - animatingPr); 16.61 - VMS__record_sched_choice_into_probe( subMatrixProbe, animatingPr ); 16.62 - VMS__record_interval_start_in_probe( subMatrixProbe ); 16.63 - #endif 16.64 - 16.65 - params = (SMPairParams *)data; 16.66 - resultPr = params->resultPr; 16.67 - leftSubMatrix = params->leftSubMatrix; 16.68 - rightSubMatrix = params->rightSubMatrix; 16.69 - 16.70 - //make sure the input sub-matrices have been copied out of orig 16.71 - //do it here, inside sub-matrix pair to hopefully gain reuse in cache 16.72 - copyFromOrig( leftSubMatrix, animatingPr ); 16.73 - copyTransposeFromOrig( rightSubMatrix, animatingPr ); 16.74 - 16.75 - leftArray = leftSubMatrix->array; 16.76 - rightArray = rightSubMatrix->array; 16.77 - 16.78 - int32 16.79 - resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); 16.80 - resArray = SSR__malloc_to( resSize, animatingPr ); 16.81 - memset( resArray, 0, resSize ); 16.82 - 16.83 - 16.84 - int32 numResRows, numResCols, vectLength; 16.85 - 16.86 - vectLength = leftSubMatrix->numCols; 16.87 - numResRows = leftSubMatrix->numRows; 16.88 - numResCols = rightSubMatrix->numCols; 16.89 - 16.90 - multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 16.91 - leftArray, rightArray, 16.92 - resArray ); 16.93 - 16.94 - //send result to result processor 16.95 - params->partialResultArray = resArray; 16.96 - 16.97 - #ifdef TURN_ON_DEBUG_PROBES 16.98 - VMS__record_interval_end_in_probe( subMatrixProbe ); 16.99 - #endif 16.100 - 16.101 - SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr ); 16.102 - SSR__dissipate_procr( animatingPr ); 16.103 - } 16.104 - 16.105 - 16.106 - 16.107 -/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into 16.108 - * the 32KB L1 cache. 16.109 - *Would be nice to embed this within another level that divided into 16.110 - * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache 16.111 - * 16.112 - *Eventually want these divisions to be automatic, using DKU pattern 16.113 - * embedded into VMS and exposed in the language, and with VMS controlling the 16.114 - * divisions according to the cache sizes, which it knows about. 16.115 - *Also, want VMS to work with language to split among main-mems, so a socket 16.116 - * only cranks on data in its local segment of main mem 16.117 - * 16.118 - *So, outer two loops determine start and end points within the result matrix. 16.119 - * Inside that, a loop dets the start and end points along the shared dimensions 16.120 - * of the two input matrices. 16.121 - */ 16.122 -void inline 16.123 -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, 16.124 - int32 numResCols, 16.125 - float32 *leftArray, float32 *rightArray, 16.126 - float32 *resArray ) 16.127 - { 16.128 - int resStride, inpStride; 16.129 - int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; 16.130 - 16.131 - resStride = numResCols; 16.132 - inpStride = vecLength; 16.133 - 16.134 - for( resStartRow = 0; resStartRow < numResRows; ) 16.135 - { 16.136 - resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 16.137 - if( resEndRow > numResRows ) resEndRow = numResRows -1; 16.138 - 16.139 - for( resStartCol = 0; resStartCol < numResCols; ) 16.140 - { 16.141 - resEndCol = resStartCol + COLS_IN_BLOCK -1; 16.142 - if( resEndCol > numResCols ) resEndCol = numResCols -1; 16.143 - 16.144 - for( startVec = 0; startVec < vecLength; ) 16.145 - { 16.146 - endVec = startVec + VEC_IN_BLOCK -1; 16.147 - if( endVec > vecLength ) endVec = vecLength -1; 16.148 - 16.149 - //By having the "vector" of sub-blocks in a sub-block slice 16.150 - // be marched down in inner loop, are re-using the result 16.151 - // matrix, which stays in L1 cache and re-using the left sub-mat 16.152 - // which repeats for each right sub-mat -- can only re-use two of 16.153 - // the three, so result is the most important -- avoids writing 16.154 - // dirty blocks until those result-locations fully done 16.155 - //Row and Col is position in result matrix -- so row and vec 16.156 - // for left array, then vec and col for right array 16.157 - multiplySubBlocksTransposed( leftArray, rightArray, 16.158 - resArray, 16.159 - resStartRow, resEndRow, 16.160 - resStartCol, resEndCol, 16.161 - startVec, endVec, 16.162 - resStride, inpStride ); 16.163 - startVec = endVec +1; 16.164 - } 16.165 - resStartCol = resEndCol +1; 16.166 - } 16.167 - resStartRow = resEndRow +1; 16.168 - } 16.169 - } 16.170 - 16.171 - 16.172 - 16.173 -void inline 16.174 -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 16.175 - float32 *resArray, 16.176 - int resStartRow, int resEndRow, 16.177 - int resStartCol, int resEndCol, 16.178 - int startVec, int endVec, 16.179 - int resStride, int inpStride ) 16.180 - { 16.181 - int resRow, resCol, vec; 16.182 - int leftOffset, rightOffset; 16.183 - float32 result; 16.184 - 16.185 - //The result row is used only for the left matrix, res col for the right 16.186 - for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) 16.187 - { 16.188 - for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) 16.189 - { 16.190 - leftOffset = resRow * inpStride;//left & right inp strides always same 16.191 - rightOffset = resCol * inpStride;// because right is transposed 16.192 - result = 0; 16.193 - for( vec = startVec; vec <= endVec; vec++ ) 16.194 - { 16.195 - result += 16.196 - leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; 16.197 - } 16.198 - 16.199 - resArray[ resRow * resStride + resCol ] += result; 16.200 - } 16.201 - } 16.202 - } 16.203 - 16.204 - 16.205 - 16.206 - 16.207 -/*Reuse this in divider when do the sequential multiply case 16.208 - */ 16.209 -void inline 16.210 -copyTranspose( int32 numRows, int32 numCols, 16.211 - int32 origStartRow, int32 origStartCol, int32 origStride, 16.212 - float32 *subArray, float32 *origArray ) 16.213 - { int32 stride = numRows; 16.214 - 16.215 - int row, col, origOffset; 16.216 - for( row = 0; row < numRows; row++ ) 16.217 - { 16.218 - origOffset = (row + origStartRow) * origStride + origStartCol; 16.219 - for( col = 0; col < numCols; col++ ) 16.220 - { 16.221 - //transpose means swap row & col -- traverse orig matrix normally 16.222 - // but put into reversed place in local array -- means the 16.223 - // stride is the numRows now, so col * numRows + row 16.224 - subArray[ col * stride + row ] = origArray[ origOffset + col ]; 16.225 - } 16.226 - } 16.227 - } 16.228 - 16.229 -void inline 16.230 -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 16.231 - { int numCols, numRows, origStartRow, origStartCol, origStride, stride; 16.232 - Matrix *origMatrix; 16.233 - float32 *origArray, *subArray; 16.234 - 16.235 -// if( subMatrix->copyTransSingleton && \ 16.236 -// subMatrix->copyTransSingleton->hasFinished ) \ 16.237 -// return; 16.238 - SSR__start_data_singleton( &(subMatrix->copyTransSingleton), animPr ); 16.239 - 16.240 - if( subMatrix->copyTransSingleton->hasFinished ) 16.241 - { 16.242 - printf("error!"); 16.243 - } 16.244 - 16.245 - origMatrix = subMatrix->origMatrix; 16.246 - origArray = origMatrix->array; 16.247 - numCols = subMatrix->numCols; 16.248 - numRows = subMatrix->numRows; 16.249 - origStartRow = subMatrix->origStartRow; 16.250 - origStartCol = subMatrix->origStartCol; 16.251 - origStride = origMatrix->numCols; 16.252 - 16.253 - subArray = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr); 16.254 - subMatrix->array = subArray; 16.255 - 16.256 - //copy values from orig matrix to local 16.257 - copyTranspose( numRows, numCols, 16.258 - origStartRow, origStartCol, origStride, 16.259 - subArray, origArray ); 16.260 - 16.261 - SSR__end_data_singleton( &(subMatrix->copyTransSingleton), animPr ); 16.262 - 16.263 - return; 16.264 - } 16.265 - 16.266 - 16.267 -void inline 16.268 -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 16.269 - { int numCols, numRows, origStartRow, origStartCol, stride, origStride; 16.270 - Matrix *origMatrix; 16.271 - float32 *origArray, *subArray; 16.272 - 16.273 - 16.274 - //This lets only a single VP execute the code between start and 16.275 - // end -- using start and end so that work runs outside the master. 16.276 - //Inside, if a second VP ever executes the start, it will be returned 16.277 - // from the end-point. 16.278 - //Note, for non-GCC, can add a second SSR call at the end, and inside 16.279 - // that one, look at the stack at the return addr & save that in an 16.280 - // array indexed by singletonID 16.281 -// if( subMatrix->copySingleton && subMatrix->copySingleton->hasFinished )\ 16.282 - return; 16.283 - SSR__start_data_singleton( &(subMatrix->copySingleton), animPr ); 16.284 - if( subMatrix->copySingleton->endInstrAddr ) 16.285 - { 16.286 - printf("error!"); 16.287 - } 16.288 - 16.289 - if( subMatrix->copySingleton->hasFinished ) 16.290 - { 16.291 - printf("error!"); 16.292 - } 16.293 - 16.294 - origMatrix = subMatrix->origMatrix; 16.295 - origArray = origMatrix->array; 16.296 - numCols = subMatrix->numCols; 16.297 - numRows = subMatrix->numRows; 16.298 - origStartRow = subMatrix->origStartRow; 16.299 - origStartCol = subMatrix->origStartCol; 16.300 - origStride = origMatrix->numCols; 16.301 - 16.302 - subArray = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr); 16.303 - subMatrix->array = subArray; 16.304 - 16.305 - //copy values from orig matrix to local 16.306 - stride = numCols; 16.307 - 16.308 - int row, col, offset, origOffset; 16.309 - for( row = 0; row < numRows; row++ ) 16.310 - { 16.311 - offset = row * stride; 16.312 - origOffset = (row + origStartRow) * origStride + origStartCol; 16.313 - for( col = 0; col < numCols; col++ ) 16.314 - { 16.315 - subArray[ offset + col ] = origArray[ origOffset + col ]; 16.316 - } 16.317 - } 16.318 - 16.319 - SSR__end_data_singleton( &(subMatrix->copySingleton), animPr ); 16.320 - 16.321 - return; 16.322 - }
17.1 --- a/src/Application/main.c Wed Sep 07 13:06:25 2011 +0200 17.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 17.3 @@ -1,37 +0,0 @@ 17.4 -/* 17.5 - * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 17.6 - * Licensed under GNU General Public License version 2 17.7 - * 17.8 - * author seanhalle@yahoo.com 17.9 - */ 17.10 - 17.11 -#include <malloc.h> 17.12 -#include <stdlib.h> 17.13 - 17.14 -#include "Matrix_Mult.h" 17.15 -#include "SSR_Matrix_Mult/SSR_Matrix_Mult.h" 17.16 - 17.17 -char __ProgrammName[] = "Blocked Matrix Multiply"; 17.18 -char __DataSet[255]; 17.19 -/** 17.20 - * 17.21 - */ 17.22 -int main( int argc, char **argv ) 17.23 - { Matrix *leftMatrix, *rightMatrix, *resultMatrix; 17.24 - ParamBag *paramBag; 17.25 - 17.26 - printf( "arguments: %s | %s\n", argv[0], argv[1] ); 17.27 - 17.28 - paramBag = makeParamBag(); 17.29 - readParamFileIntoBag( argv[1], paramBag ); 17.30 - initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); 17.31 - 17.32 - resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix ); 17.33 - 17.34 - printf("\nresult matrix: \n"); 17.35 - printMatrix( resultMatrix ); 17.36 -// SSR__print_stats(); 17.37 - fflush(stdin); 17.38 - 17.39 - exit(0); //cleans up 17.40 - }
