Vthread__Blocked_Matrix_Mult__Bench
changeset 15:69928a38d5af tip
updating to most recent repository structure -- not working yet
| author | Sean Halle <seanhalle@yahoo.com> |
|---|---|
| date | Mon, 17 Sep 2012 18:19:52 -0700 |
| parents | f8d3f16e70c8 |
| children | |
| files | .hgeol .hgignore .hgtags Matrix_Mult.c Matrix_Mult.h Vthread__Matrix_Mult/Divide_Pr.c Vthread__Matrix_Mult/EntryPoint.c Vthread__Matrix_Mult/Result_Pr.c Vthread__Matrix_Mult/Vthread__Matrix_Mult.h Vthread__Matrix_Mult/subMatrix_Pr.c __brch__default main.c src/Application/Matrix_Mult.c src/Application/Matrix_Mult.h src/Application/VPThread__Matrix_Mult/Divide_Pr.c src/Application/VPThread__Matrix_Mult/EntryPoint.c src/Application/VPThread__Matrix_Mult/Result_Pr.c src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c src/Application/main.c |
| diffstat | 20 files changed, 2090 insertions(+), 1588 deletions(-) |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/.hgeol Mon Sep 17 18:19:52 2012 -0700 1.3 @@ -0,0 +1,14 @@ 1.4 + 1.5 +[patterns] 1.6 +**.py = native 1.7 +**.txt = native 1.8 +**.c = native 1.9 +**.h = native 1.10 +**.cpp = native 1.11 +**.java = native 1.12 +**.class = bin 1.13 +**.jar = bin 1.14 +**.sh = native 1.15 +**.pl = native 1.16 +**.jpg = bin 1.17 +**.gif = bin
2.1 --- a/.hgignore Fri Sep 16 20:12:34 2011 +0200 2.2 +++ b/.hgignore Mon Sep 17 18:19:52 2012 -0700 2.3 @@ -1,7 +1,10 @@ 2.4 -nbproject 2.5 -histograms 2.6 -dist 2.7 -build 2.8 -Makefile 2.9 -.dep.inc 2.10 -scripts 2.11 +syntax: glob 2.12 + 2.13 +histograms 2.14 +nbproject 2.15 +build 2.16 +dist 2.17 +c-ray-mt 2.18 +*.ppm 2.19 +*.o 2.20 +*~
3.1 --- a/.hgtags Fri Sep 16 20:12:34 2011 +0200 3.2 +++ b/.hgtags Mon Sep 17 18:19:52 2012 -0700 3.3 @@ -1,7 +1,1 @@ 3.4 -3e26de9d3e1e66f0556b2664e71740e6ed009eee V0 3.5 -3e26de9d3e1e66f0556b2664e71740e6ed009eee V0 3.6 -0000000000000000000000000000000000000000 V0 3.7 -0000000000000000000000000000000000000000 V0 3.8 -b6e07082767ab984194db038102e2edd7d1917fd V0 3.9 -b6e07082767ab984194db038102e2edd7d1917fd V0 3.10 -0000000000000000000000000000000000000000 V0 3.11 +bef7a9083bd4f8b1fa4e171a36383e78a5f475d5 version_that_goes_into_paper
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 4.2 +++ b/Matrix_Mult.c Mon Sep 17 18:19:52 2012 -0700 4.3 @@ -0,0 +1,167 @@ 4.4 +/* 4.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 4.6 + * Licensed under GNU General Public License version 2 4.7 + * 4.8 + * Author: seanhalle@yahoo.com 4.9 + * 4.10 + * Created on November 15, 2009, 2:35 AM 4.11 + */ 4.12 + 4.13 +#include <malloc.h> 4.14 +#include <stdlib.h> 4.15 + 4.16 +#include "Matrix_Mult.h" 4.17 +#include "C_Libraries/ParamHelper/Param.h" 4.18 + 4.19 + 4.20 + 4.21 + void 4.22 +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 4.23 + ParamBag *paramBag ) 4.24 + { char *leftMatrixFileName, *rightMatrixFileName; 4.25 + int leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols; 4.26 + 4.27 + ParamStruc *param; 4.28 + param = getParamFromBag( "leftMatrixRows", paramBag ); 4.29 + leftMatrixRows = param->intValue; 4.30 + param = getParamFromBag( "leftMatrixCols", paramBag ); 4.31 + leftMatrixCols = param->intValue; 4.32 + *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols ); 4.33 + 4.34 + param = getParamFromBag( "leftMatrixFileName", paramBag ); 4.35 + leftMatrixFileName = param->strValue; //no need to copy 4.36 + read_Matrix_From_File( *leftMatrix, leftMatrixFileName ); 4.37 + 4.38 + param = getParamFromBag( "rightMatrixRows", paramBag ); 4.39 + rightMatrixRows = param->intValue; 4.40 + param = getParamFromBag( "rightMatrixCols", paramBag ); 4.41 + rightMatrixCols = param->intValue; 4.42 + *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols ); 4.43 + 4.44 + param = getParamFromBag( "rightMatrixFileName", paramBag ); 4.45 + rightMatrixFileName = param->strValue; 4.46 + read_Matrix_From_File( *rightMatrix, rightMatrixFileName ); 4.47 + } 4.48 + 4.49 + 4.50 +void parseLineIntoRow( char *line, float32* row ); 4.51 + 4.52 + 4.53 + void 4.54 +read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ) 4.55 + { int row, maxRead, numRows, numCols; 4.56 + float32 *matrixStart; 4.57 + size_t lineSz = 0; 4.58 + FILE *file; 4.59 + char *line = NULL; 4.60 + 4.61 + lineSz = 50000; //max length of line in a matrix data file 4.62 + line = (char *) malloc( lineSz ); 4.63 + if( line == NULL ) printf( "no mem for matrix line" ); 4.64 + 4.65 + numRows = matrixStruc->numRows; 4.66 + numCols = matrixStruc->numCols; 4.67 + matrixStart = matrixStruc->array; 4.68 + 4.69 + file = fopen( matrixFileName, "r" ); 4.70 + if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} 4.71 + fseek( file, 0, SEEK_SET ); 4.72 + for( row = 0; row < numRows; row++ ) 4.73 + { 4.74 + if( feof( file ) ) printf( "file ran out too soon" ); 4.75 + maxRead = getline( &line, &lineSz, file ); 4.76 + if( maxRead == -1 ) printf( "prob reading mat line"); 4.77 + 4.78 + if( *line == '\n') continue; //blank line 4.79 + if( *line == '/' ) continue; //comment line 4.80 + 4.81 + parseLineIntoRow( line, matrixStart + row * numCols ); 4.82 + } 4.83 + free( line ); 4.84 + } 4.85 + 4.86 +/*This function relies on each line having the proper number of cols. 
It 4.87 + * doesn't check, nor enforce, so if the file is improperly formatted it 4.88 + * can write over unrelated memory 4.89 + */ 4.90 + void 4.91 +parseLineIntoRow( char *line, float32* row ) 4.92 + { 4.93 + char *valueStr, *searchPos; 4.94 + 4.95 + //read the float values 4.96 + searchPos = valueStr = line; //start 4.97 + 4.98 + for( ; *searchPos != 0; searchPos++) //bit dangerous, should use buff len 4.99 + { 4.100 + if( *searchPos == '\n' ) //last col.. relying on well-formatted file 4.101 + { *searchPos = 0; 4.102 + *row = atof( valueStr ); 4.103 + break; //end FOR loop 4.104 + } 4.105 + if( *searchPos == ',' ) 4.106 + { *searchPos = 0; //mark end of string 4.107 + *row = (float32) atof( valueStr ); 4.108 + row += 1; //address arith 4.109 + //skip any spaces before digits.. use searchPos + 1 to skip the 0 4.110 + for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++); 4.111 + valueStr = searchPos + 1; 4.112 + } 4.113 + } 4.114 + } 4.115 + 4.116 + //========================================================================== 4.117 + 4.118 +/*In the "_Flat" version of constructor, do only malloc of the top data struc 4.119 + * and set values in that top-level. Don't malloc any sub-structures. 4.120 + */ 4.121 + Matrix * 4.122 +makeMatrix_Flat( int32 numRows, int32 numCols ) 4.123 + { Matrix * retMatrix; 4.124 + retMatrix = malloc( sizeof( Matrix ) ); 4.125 + retMatrix->numRows = numRows; 4.126 + retMatrix->numCols = numCols; 4.127 + 4.128 + return retMatrix; 4.129 + } 4.130 + 4.131 + Matrix * 4.132 +makeMatrix_WithResMat( int32 numRows, int32 numCols ) 4.133 + { Matrix * retMatrix; 4.134 + retMatrix = malloc( sizeof( Matrix ) ); 4.135 + retMatrix->numRows = numRows; 4.136 + retMatrix->numCols = numCols; 4.137 + retMatrix->array = malloc( numRows * numCols * sizeof(float32) ); 4.138 + 4.139 + return retMatrix; 4.140 + } 4.141 + 4.142 + void 4.143 +freeMatrix_Flat( Matrix * matrix ) 4.144 + { //( matrix ); 4.145 + } 4.146 + void 4.147 +freeMatrix( Matrix * matrix ) 4.148 + { free( matrix->array ); 4.149 + free( matrix ); 4.150 + } 4.151 + 4.152 +void 4.153 +printMatrix( Matrix *matrix ) 4.154 + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; 4.155 + float32 *matrixArray; 4.156 + 4.157 + numRows = rowsToPrint = matrix->numRows; 4.158 + numCols = colsToPrint = matrix->numCols; 4.159 + matrixArray = matrix->array; 4.160 + 4.161 + rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed 4.162 + colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed 4.163 + for( r = 0; r < numRows; r += rowIncr ) 4.164 + { for( c = 0; c < numCols; c += colIncr ) 4.165 + { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); 4.166 + } 4.167 + printf("\n"); 4.168 + } 4.169 + } 4.170 +
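As its comment warns, parseLineIntoRow above trusts the file to be well formed. For comparison, a bounds-checked variant of the same comma-splitting parse is sketched below -- a standalone illustration with a hypothetical name (parse_row_checked) and plain float in place of float32, not part of this changeset. Capping writes at maxCols removes the over-write hazard the comment describes.

```c
#include <stdio.h>
#include <stdlib.h>

/* Parse up to maxCols comma-separated floats from line into row.
 * Returns the number of values stored, so a malformed line cannot
 * write past the end of the row buffer. */
static int parse_row_checked( const char *line, float *row, int maxCols )
 { int n = 0;
   char *end;
   while( n < maxCols )
    { row[n++] = strtof( line, &end );
      if( end == line ) break;        /* no digits parsed -- stop   */
      line = end;
      while( *line == ' ' ) line++;   /* skip spaces after number   */
      if( *line != ',' ) break;       /* newline or end of string   */
      line++;                         /* step past the comma        */
    }
   return n;
 }

int main( void )
 { float row[4];
   int n = parse_row_checked( "1.0, 2.5, 3.75", row, 4 );
   printf( "parsed %d values, last = %3.2f\n", n, row[n-1] );
   return 0;
 }
```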
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 5.2 +++ b/Matrix_Mult.h Mon Sep 17 18:19:52 2012 -0700 5.3 @@ -0,0 +1,77 @@ 5.4 +/* 5.5 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 5.6 + * Licensed under GNU General Public License version 2 5.7 + */ 5.8 + 5.9 +#ifndef MATRIX_MULT_H_ 5.10 +#define MATRIX_MULT_H_ 5.11 + 5.12 +#include <stdio.h> 5.13 +#include <unistd.h> 5.14 +#include <malloc.h> 5.15 + 5.16 +#include "VMS_impl/VMS_primitive_data_types.h" 5.17 +#include "C_Libraries/ParamHelper/Param.h" 5.18 + 5.19 +//============================== Structures ============================== 5.20 + 5.21 +typedef 5.22 +struct 5.23 + { int32 numRows; 5.24 + int32 numCols; 5.25 + float32 *array; //2D, but dynamically sized, so use addr arith 5.26 + } 5.27 +Matrix; 5.28 + 5.29 +/* This is the "appSpecificPiece" that is carried inside a DKUPiece. 5.30 + * In the DKUPiece data struc it is declared to be of type "void *". This 5.31 + * allows the application to define any data structure it wants and put it 5.32 + * into a DKUPiece. 5.33 + * When the app specific info is used, it is in app code, so it is cast to 5.34 + * the correct type to tell the compiler how to access fields. 5.35 + * This keeps all app-specific things out of the DKU directory, as per the 5.36 + * DKU standard. */ 5.37 +typedef 5.38 +struct 5.39 + { 5.40 + // pointers to shared data.. the result matrix must be created when the 5.41 + // left and right matrices are put into the root ancestor DKUPiece. 5.42 + Matrix * leftMatrix; 5.43 + Matrix * rightMatrix; 5.44 + Matrix * resultMatrix; 5.45 + 5.46 + // define the starting and ending boundaries for this piece of the 5.47 + // result matrix. These are derivable from the left and right 5.48 + // matrices, but included them for readability of code. 5.49 + int prodStartRow, prodEndRow; 5.50 + int prodStartCol, prodEndCol; 5.51 + // Start and end of the portion of the left matrix that contributes to 5.52 + // this piece of the product 5.53 + int leftStartRow, leftEndRow; 5.54 + int leftStartCol, leftEndCol; 5.55 + // Start and end of the portion of the right matrix that contributes to 5.56 + // this piece of the product 5.57 + int rightStartRow, rightEndRow; 5.58 + int rightStartCol, rightEndCol; 5.59 + } 5.60 +MatrixProdPiece; 5.61 + 5.62 +//============================== Functions ================================ 5.63 +void readFile(); 5.64 + 5.65 +Matrix *makeMatrix( int32 numRows, int32 numCols ); 5.66 +Matrix *makeMatrix_Flat( int32 numRows, int32 numCols ); 5.67 +Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols ); 5.68 +void freeMatrix_Flat( Matrix * matrix ); 5.69 +void freeMatrix( Matrix * matrix ); 5.70 +void printMatrix( Matrix *matrix ); 5.71 + 5.72 +void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ); 5.73 + 5.74 +void 5.75 +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 5.76 + ParamBag *paramBag ); 5.77 + 5.78 +//=========================================================================== 5.79 + 5.80 +#endif /*MATRIX_MULT_H_*/
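As a quick orientation for the API this header declares, here is a minimal hypothetical caller. It assumes the definitions from Matrix_Mult.c above and that the VMS headers the .h pulls in are on the include path; it is not part of the changeset.

```c
#include "Matrix_Mult.h"

/* Minimal usage of the Matrix API declared above: build a 3x3
 * identity matrix by hand, print it, and free it. */
int main( void )
 { int r, c;
   Matrix *m = makeMatrix_WithResMat( 3, 3 );
   for( r = 0; r < 3; r++ )
      for( c = 0; c < 3; c++ )
         m->array[ r * 3 + c ] = (r == c) ? 1.0f : 0.0f;
   printMatrix( m );
   freeMatrix( m );
   return 0;
 }
```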
6.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 6.2 +++ b/Vthread__Matrix_Mult/Divide_Pr.c Mon Sep 17 18:19:52 2012 -0700 6.3 @@ -0,0 +1,646 @@ 6.4 +/* 6.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 6.6 + * Licensed under GNU General Public License version 2 6.7 + * 6.8 + * Author: seanhalle@yahoo.com 6.9 + * 6.10 + */ 6.11 + 6.12 + 6.13 +#include "Vthread__Matrix_Mult.h" 6.14 +#include <math.h> 6.15 +#include <string.h> 6.16 +#include "Vthread_impl/Vthread.h" 6.17 + 6.18 + //The time to compute this many result values should equal the time to 6.19 + // perform this division on a matrix of size gives that many result calcs 6.20 + //IE, size this so that sequential time to calc equals divide time 6.21 + // find the value by experimenting -- but divide time and calc time scale 6.22 + // same way, so this value should remain valid across hardware 6.23 +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000 6.24 + 6.25 + 6.26 +//=========================================================================== 6.27 +int inline 6.28 +measureMatrixMultPrimitive( SlaveVP *animPr ); 6.29 + 6.30 +SlicingStrucCarrier * 6.31 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 6.32 + SlaveVP *animPr ); 6.33 + 6.34 +SlicingStruc * 6.35 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 6.36 + SlaveVP *animPr ); 6.37 + 6.38 +void 6.39 +freeSlicingStruc( SlicingStruc *slicingStruc, SlaveVP *animPr ); 6.40 + 6.41 +SubMatrix ** 6.42 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 6.43 + int32 numUses, Matrix *origMatrix, SlaveVP *animPr ); 6.44 + 6.45 +void 6.46 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 6.47 + SubMatrix **subMatrices, SlaveVP *animPr ); 6.48 + 6.49 +void 6.50 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, 6.51 + SubMatrix **rightSubMatrices, 6.52 + int32 numRowIdxs, int32 numColIdxs, 6.53 + int32 numVecIdxs, 6.54 + SlaveVP *resultPr, 6.55 + SlaveVP *animatingPr ); 6.56 + 6.57 +void 6.58 +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, 6.59 + SlicingStrucCarrier *slicingStrucCarrier, 6.60 + SlaveVP *resultPr, SlaveVP *animatingPr ); 6.61 + 6.62 +void inline 6.63 +copyTranspose( int32 numRows, int32 numCols, 6.64 + int32 origStartRow, int32 origStartCol, int32 origStride, 6.65 + float32 *subArray, float32 *origArray ); 6.66 + 6.67 +void inline 6.68 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, 6.69 + int32 numResCols, 6.70 + float32 *leftArray, float32 *rightArray, 6.71 + float32 *resArray ); 6.72 + 6.73 + 6.74 + 6.75 +/*Divider creates one processor for every sub-matrix 6.76 + * It hands them: 6.77 + * the name of the result processor that they should send their results to, 6.78 + * the left and right matrices, and the rows and cols they should multiply 6.79 + * It first creates the result processor, then all the sub-matrixPair 6.80 + * processors, 6.81 + * then does a receive of a message from the result processor that gives 6.82 + * the divider ownership of the result matrix. 6.83 + * Finally, the divider returns the result matrix out of the Vthread system. 6.84 + * 6.85 + * Divider chooses the size of sub-matrices via an algorithm that tries to 6.86 + * keep the minimum work above a threshold. 
The threshold is machine- 6.87 + * dependent, so ask Vthread for min work-unit time to get a 6.88 + * given overhead 6.89 + * 6.90 + * Divide min work-unit cycles by measured-cycles for one matrix-cell 6.91 + * product -- gives the number of products need to have in min size 6.92 + * matrix. 6.93 + * 6.94 + * So then, take cubed root of this to get the size of a side of min sub- 6.95 + * matrix. That is the size of the ideal square sub-matrix -- so tile 6.96 + * up the two input matrices into ones as close as possible to that size, 6.97 + * and create the pairs of sub-matrices. 6.98 + * 6.99 + *======================== STRATEGIC OVERVIEW ======================= 6.100 + * 6.101 + *This division is a bit tricky, because have to create things in advance 6.102 + * that it's not at first obvious need to be created.. 6.103 + * 6.104 + *First slice up each dimension -- three of them.. this is because will have 6.105 + * to create the sub-matrix's data-structures before pairing the sub-matrices 6.106 + * with each other -- so, have three dimensions to slice up before can 6.107 + * create the sub-matrix data-strucs -- also, have to be certain that the 6.108 + * cols of the left input have the exact same slicing as the rows of the 6.109 + * right matrix, so just to be sure, do the slicing calc once, then use it 6.110 + * for both. 6.111 + * 6.112 + *So, goes like this: 6.113 + *1) calculate the start & end values of each dimension in each matrix. 6.114 + *2) use those values to create sub-matrix structures 6.115 + *3) combine sub-matrices into pairs, as the tasks to perform. 6.116 + * 6.117 + *Have to calculate separately from creating the sub-matrices because of the 6.118 + * nature of the nesting -- would either end up creating the same sub-matrix 6.119 + * multiple times, or else would have to put in detection of whether had 6.120 + * made a particular one already if tried to combine steps 1 and 2. 6.121 + * 6.122 + *Step 3 has to be separate because of the nesting, as well -- same reason, 6.123 + * would either create same sub-matrix multiple times, or else have to 6.124 + * add detection of whether was already created. 6.125 + * 6.126 + *Another way to look at it: there's one level of loop to divide dimensions, 6.127 + * two levels of nesting to create sub-matrices, and three levels to pair 6.128 + up the sub-matrices.
6.129 + */ 6.130 + 6.131 +void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, 6.132 + SlaveVP *animatingThd ) 6.133 + { SlaveVP *resultPr; 6.134 + DividerParams *dividerParams; 6.135 + ResultsParams *resultsParams; 6.136 + Matrix *leftMatrix, *rightMatrix; 6.137 + SlicingStrucCarrier *slicingStrucCarrier; 6.138 + float32 *resultArray; //points to array inside result matrix 6.139 + MatrixMultGlobals *globals; 6.140 + 6.141 + DEBUG( dbgAppFlow, "start divide\n") 6.142 + 6.143 + int32 6.144 + divideProbe = VMS_App__create_single_interval_probe( "divideProbe", 6.145 + animatingThd ); 6.146 + VMS_App__record_sched_choice_into_probe( divideProbe, animatingThd ); 6.147 + VMS_App__record_interval_start_in_probe( divideProbe ); 6.148 + 6.149 + //=========== Setup -- make local copies of ptd-to-things, malloc, aso 6.150 + int32 numResRows, numResCols, vectLength; 6.151 + 6.152 + dividerParams = (DividerParams *)_dividerParams; 6.153 + 6.154 + leftMatrix = dividerParams->leftMatrix; 6.155 + rightMatrix = dividerParams->rightMatrix; 6.156 + 6.157 + vectLength = leftMatrix->numCols; 6.158 + numResRows = leftMatrix->numRows; 6.159 + numResCols = rightMatrix->numCols; 6.160 + resultArray = dividerParams->resultMatrix->array; 6.161 + 6.162 + //zero the result array 6.163 + memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); 6.164 + 6.165 + //============== Do either sequential mult or do division ============== 6.166 + 6.167 + //Check if input matrices too small -- if yes, just do sequential 6.168 + //Cutoff is determined by overhead of this divider -- relatively 6.169 + // machine-independent 6.170 + if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * 6.171 + (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) 6.172 + { 6.173 + //====== Do sequential multiply on a single core 6.174 + DEBUG( dbgAppFlow, "doing sequential") 6.175 + 6.176 + //transpose the right matrix 6.177 + float32 * 6.178 + transRightArray = 6.179 + Vthread__malloc( (size_t)rightMatrix->numRows * 6.180 + (size_t)rightMatrix->numCols * 6.181 + sizeof(float32), animatingThd ); 6.182 + 6.183 + //copy values from orig matrix to local 6.184 + copyTranspose( rightMatrix->numRows, rightMatrix->numCols, 6.185 + 0, 0, rightMatrix->numRows, 6.186 + transRightArray, rightMatrix->array ); 6.187 + 6.188 + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 6.189 + leftMatrix->array, transRightArray, 6.190 + resultArray ); 6.191 + } 6.192 + else 6.193 + { 6.194 + //====== Do parallel multiply across cores 6.195 + 6.196 + DEBUG( dbgAppFlow, "divider: do parallel mult\n") 6.197 + //Calc the ideal size of sub-matrix and slice up the dimensions of 6.198 + // the two matrices. 
6.199 + //The ideal size is the one takes the number of cycles to calculate 6.200 + // such that calc time is equal or greater than min work-unit size 6.201 + slicingStrucCarrier = 6.202 + calcIdealSizeAndSliceDimensions(leftMatrix,rightMatrix,animatingThd); 6.203 + 6.204 + //Make the results processor, now that know how many to wait for 6.205 + resultsParams = Vthread__malloc( sizeof(ResultsParams), animatingThd ); 6.206 + resultsParams->numSubMatrixPairs = 6.207 + slicingStrucCarrier->leftRowSlices->numVals * 6.208 + slicingStrucCarrier->rightColSlices->numVals * 6.209 + slicingStrucCarrier->vecSlices->numVals; 6.210 + resultsParams->dividerPr = animatingThd; 6.211 + resultsParams->numCols = rightMatrix->numCols; 6.212 + resultsParams->numRows = leftMatrix->numRows; 6.213 + resultsParams->resultArray = resultArray; 6.214 + 6.215 + //========== Set up global vars, including conds and mutexes ========== 6.216 + globals = VMS_App__malloc( sizeof(MatrixMultGlobals) ); 6.217 + Vthread__set_globals_to( globals ); 6.218 + 6.219 + globals->results_mutex = Vthread__make_mutex( animatingThd ); 6.220 + globals->results_cond = Vthread__make_cond( globals->results_mutex, 6.221 + animatingThd ); 6.222 + 6.223 + globals->vector_mutex = Vthread__make_mutex( animatingThd ); 6.224 + globals->vector_cond = Vthread__make_cond( globals->vector_mutex, 6.225 + animatingThd ); 6.226 + 6.227 + globals->start_mutex = Vthread__make_mutex( animatingThd ); 6.228 + globals->start_cond = Vthread__make_cond( globals->start_mutex, 6.229 + animatingThd ); 6.230 + //====================================================================== 6.231 + 6.232 + DEBUG( dbgAppFlow, "divider: made mutexes and conds\n") 6.233 + //get results-comm lock before create results-thd, to ensure it can't 6.234 + // signal that results are available before this thd is waiting on cond 6.235 + Vthread__mutex_lock( globals->results_mutex, animatingThd ); 6.236 + 6.237 + //also get the start lock & use to ensure no vector threads send a 6.238 + // signal before the results thread is waiting on vector cond 6.239 + Vthread__mutex_lock( globals->start_mutex, animatingThd ); 6.240 + 6.241 + 6.242 + DEBUG( dbgAppFlow, "divider: make result thread\n") 6.243 + Vthread__create_thread( &gatherResults, resultsParams, animatingThd ); 6.244 + 6.245 + //Now wait for results thd to signal that it has vector lock 6.246 + Vthread__cond_wait( globals->start_cond, animatingThd ); 6.247 + Vthread__mutex_unlock( globals->start_mutex, animatingThd );//done w/lock 6.248 + DEBUG( dbgAppFlow, "divider: make sub-matrices\n") 6.249 + 6.250 + //Make the sub-matrices, and pair them up, and make processor to 6.251 + // calc product of each pair. 
makeSubMatricesAndProcrs( leftMatrix, rightMatrix, 6.253 + slicingStrucCarrier, 6.254 + resultPr, animatingThd); 6.255 + 6.256 + //Wait for results thread to say results are good 6.257 + Vthread__cond_wait( globals->results_cond, animatingThd ); 6.258 + } 6.259 + 6.260 + 6.261 + //=============== Work done -- send results back ================= 6.262 + 6.263 + 6.264 + DEBUG( dbgAppFlow, "end divide\n") 6.265 + 6.266 + VMS_App__record_interval_end_in_probe( divideProbe ); 6.267 + VMS_App__print_stats_of_all_probes(); 6.268 + 6.269 + //nothing left to do so dissipate, Vthread will wait to shutdown, 6.270 + // making results available to outside, until all the processors have 6.271 + // dissipated -- so actually no need to wait for results processor 6.272 + //However, following the pattern, so done with comm, release lock 6.273 + Vthread__mutex_unlock( globals->results_mutex, animatingThd ); 6.274 + 6.275 + Vthread__dissipate_thread( animatingThd ); //all procrs dissipate self at end 6.276 + //when all of the processors have dissipated, the "create seed and do 6.277 + // work" call in the entry point function returns 6.278 + } 6.279 + 6.280 + 6.281 +SlicingStrucCarrier * 6.282 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 6.283 + SlaveVP *animPr ) 6.284 + { 6.285 + float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; 6.286 + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 6.287 + SlicingStrucCarrier *slicingStrucCarrier = 6.288 + Vthread__malloc(sizeof(SlicingStrucCarrier), animPr); 6.289 + 6.290 + int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; 6.291 + float64 numPrimitiveOpsInMinWorkUnit; 6.292 + 6.293 + 6.294 + //======= Calc ideal size of min-sized sub-matrix ======== 6.295 + 6.296 + //ask Vthread for the number of cycles of the minimum work unit, at given 6.297 + // percent overhead then add a guess at overhead from this divider 6.298 + minWorkUnitCycles = Vthread__giveMinWorkUnitCycles( .05 ); 6.299 + 6.300 + //ask Vthread for number of cycles of the "primitive" op of matrix mult 6.301 + primitiveCycles = measureMatrixMultPrimitive( animPr ); 6.302 + 6.303 + numPrimitiveOpsInMinWorkUnit = 6.304 + (float64)minWorkUnitCycles / (float64)primitiveCycles; 6.305 + 6.306 + //take cubed root -- that's number of these in a "side" of sub-matrix 6.307 + // then multiply by 5 because the primitive is 5x5 6.308 + idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); 6.309 + 6.310 + idealNumWorkUnits = Vthread__giveIdealNumWorkUnits(); 6.311 + 6.312 + idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); 6.313 + idealSizeOfSide2 *= 0.6; //finer granularity to help load balance 6.314 + 6.315 + if( idealSizeOfSide1 > idealSizeOfSide2 ) 6.316 + idealSizeOfSide = idealSizeOfSide1; 6.317 + else 6.318 + idealSizeOfSide = idealSizeOfSide2; 6.319 + 6.320 + //The multiply inner loop blocks the array to fit into L1 cache 6.321 +// if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK; 6.322 + 6.323 + //============ Slice up dimensions, now that know target size =========== 6.324 + 6.325 + //Tell the slicer the target size of a side (floating pt), the start 6.326 + // value to start slicing at, and the end value to stop slicing at 6.327 + //It returns an array of start value of each chunk, plus number of them 6.328 + int32 startLeftRow, endLeftRow,
startVec,endVec,startRightCol,endRightCol; 6.329 + startLeftRow = 0; 6.330 + endLeftRow = leftMatrix->numRows -1; 6.331 + startVec = 0; 6.332 + endVec = leftMatrix->numCols -1; 6.333 + startRightCol = 0; 6.334 + endRightCol = rightMatrix->numCols -1; 6.335 + 6.336 + leftRowSlices = 6.337 + sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr ); 6.338 + 6.339 + vecSlices = 6.340 + sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr ); 6.341 + 6.342 + rightColSlices = 6.343 + sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr); 6.344 + 6.345 + slicingStrucCarrier->leftRowSlices = leftRowSlices; 6.346 + slicingStrucCarrier->vecSlices = vecSlices; 6.347 + slicingStrucCarrier->rightColSlices = rightColSlices; 6.348 + 6.349 + return slicingStrucCarrier; 6.350 + } 6.351 + 6.352 + 6.353 +void 6.354 +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, 6.355 + SlicingStrucCarrier *slicingStrucCarrier, 6.356 + SlaveVP *resultPr, SlaveVP *animPr ) 6.357 + { 6.358 + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 6.359 + 6.360 + leftRowSlices = slicingStrucCarrier->leftRowSlices; 6.361 + vecSlices = slicingStrucCarrier->vecSlices; 6.362 + rightColSlices = slicingStrucCarrier->rightColSlices; 6.363 + Vthread__free( slicingStrucCarrier, animPr ); 6.364 + 6.365 + //================ Make sub-matrices, given the slicing ================ 6.366 + SubMatrix **leftSubMatrices, **rightSubMatrices; 6.367 + leftSubMatrices = 6.368 + createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals, 6.369 + leftMatrix, animPr ); 6.370 + //double_check_that_always_numRows_in_right_same_as_numCols_in_left(); 6.371 + rightSubMatrices = 6.372 + createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals, 6.373 + rightMatrix, animPr ); 6.374 + 6.375 + 6.376 + //============== pair the sub-matrices and make processors ============== 6.377 + int32 numRowIdxs, numColIdxs, numVecIdxs; 6.378 + 6.379 + numRowIdxs = leftRowSlices->numVals; 6.380 + numColIdxs = rightColSlices->numVals; 6.381 + numVecIdxs = vecSlices->numVals; 6.382 + 6.383 + freeSlicingStruc( leftRowSlices, animPr ); 6.384 + freeSlicingStruc( vecSlices, animPr ); 6.385 + freeSlicingStruc( rightColSlices, animPr ); 6.386 + 6.387 + pairUpSubMatricesAndMakeProcessors( leftSubMatrices, 6.388 + rightSubMatrices, 6.389 + numRowIdxs, numColIdxs, 6.390 + numVecIdxs, 6.391 + resultPr, 6.392 + animPr ); 6.393 + } 6.394 + 6.395 + 6.396 + 6.397 + 6.398 +void 6.399 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, 6.400 + SubMatrix **rightSubMatrices, 6.401 + int32 numRowIdxs, int32 numColIdxs, 6.402 + int32 numVecIdxs, 6.403 + SlaveVP *resultPr, 6.404 + SlaveVP *animatingPr ) 6.405 + { 6.406 + int32 resRowIdx, resColIdx, vecIdx; 6.407 + int32 numLeftColIdxs, numRightColIdxs; 6.408 + int32 leftRowIdxOffset; 6.409 + SMPairParams *subMatrixPairParams; 6.410 + float32 numToPutOntoEachCore, leftOverFraction; 6.411 + int32 numCores, coreToScheduleOnto, numVecOnCurrCore; 6.412 + 6.413 + numLeftColIdxs = numColIdxs; 6.414 + numRightColIdxs = numVecIdxs; 6.415 + 6.416 + numCores = Vthread__give_number_of_cores_to_schedule_onto(); 6.417 + 6.418 + numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; 6.419 + leftOverFraction = 0; 6.420 + numVecOnCurrCore = 0; 6.421 + coreToScheduleOnto = 0; 6.422 + 6.423 + for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) 6.424 + { 6.425 + leftRowIdxOffset = resRowIdx * numLeftColIdxs; 6.426 + 6.427 + for( resColIdx = 0; resColIdx < numColIdxs; 
resColIdx++ ) 6.428 + { 6.429 + 6.430 + for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) 6.431 + { 6.432 + //Make the processor for the pair of sub-matrices 6.433 + subMatrixPairParams = Vthread__malloc( sizeof(SMPairParams), 6.434 + animatingPr); 6.435 + subMatrixPairParams->leftSubMatrix = 6.436 + leftSubMatrices[ leftRowIdxOffset + vecIdx ]; 6.437 + 6.438 + subMatrixPairParams->rightSubMatrix = 6.439 + rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; 6.440 + 6.441 + //subMatrixPairParams->resultPr = resultPr; 6.442 + 6.443 + //put all pairs from the same vector onto same core 6.444 + Vthread__create_thread_with_affinity( &calcSubMatrixProduct, 6.445 + subMatrixPairParams, 6.446 + animatingPr, 6.447 + coreToScheduleOnto ); 6.448 + } 6.449 + 6.450 + //Trying to distribute the subMatrix-vectors across the cores, so 6.451 + // that each core gets the same number of vectors, with a max 6.452 + // imbalance of 1 vector more on some cores than others 6.453 + numVecOnCurrCore += 1; 6.454 + if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 ) 6.455 + { 6.456 + //deal with fractional part, to ensure that imbalance is 1 max 6.457 + // IE, core with most has only 1 more than core with least 6.458 + leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore; 6.459 + if( leftOverFraction >= 1 ) 6.460 + { leftOverFraction -= 1; 6.461 + numVecOnCurrCore = -1; 6.462 + } 6.463 + else 6.464 + { numVecOnCurrCore = 0; 6.465 + } 6.466 + //Move to next core, max core-value to incr to is numCores -1 6.467 + if( coreToScheduleOnto >= numCores -1 ) 6.468 + { coreToScheduleOnto = 0; 6.469 + } 6.470 + else 6.471 + { coreToScheduleOnto += 1; 6.472 + } 6.473 + } 6.474 + 6.475 + } 6.476 + } 6.477 + } 6.478 + 6.479 + 6.480 + 6.481 +/*Walk through the two slice-strucs, making sub-matrix strucs as go 6.482 + */ 6.483 +SubMatrix ** 6.484 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 6.485 + int32 numUses, Matrix *origMatrix, SlaveVP *animPr ) 6.486 + { 6.487 + int32 numRowIdxs, numColIdxs, rowIdx, colIdx; 6.488 + int32 startRow, endRow, startCol, endCol; 6.489 + int32 *rowStartVals, *colStartVals; 6.490 + int32 rowOffset; 6.491 + SubMatrix **subMatrices, *newSubMatrix; 6.492 + 6.493 + numRowIdxs = rowSlices->numVals; 6.494 + numColIdxs = colSlices->numVals; 6.495 + 6.496 + rowStartVals = rowSlices->startVals; 6.497 + colStartVals = colSlices->startVals; 6.498 + 6.499 + subMatrices = Vthread__malloc((size_t)numRowIdxs * 6.500 + (size_t)numColIdxs * sizeof(SubMatrix*),animPr ); 6.501 + 6.502 + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 6.503 + { 6.504 + rowOffset = rowIdx * numColIdxs; 6.505 + 6.506 + startRow = rowStartVals[rowIdx]; 6.507 + endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is 6.508 + // at last valid idx + 1 & is 6.509 + // 1 greater than end value 6.510 + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 6.511 + { 6.512 + startCol = colStartVals[colIdx]; 6.513 + endCol = colStartVals[colIdx + 1] -1; 6.514 + 6.515 + newSubMatrix = Vthread__malloc( sizeof(SubMatrix), animPr ); 6.516 + newSubMatrix->numRows = endRow - startRow +1; 6.517 + newSubMatrix->numCols = endCol - startCol +1; 6.518 + newSubMatrix->origMatrix = origMatrix; 6.519 + newSubMatrix->origStartRow = startRow; 6.520 + newSubMatrix->origStartCol = startCol; 6.521 + newSubMatrix->alreadyCopied = FALSE; 6.522 + newSubMatrix->numUsesLeft = numUses; //can free after this many 6.523 + //Prevent uninitialized memory 6.524 + newSubMatrix->copySingleton = NULL; 6.525 + 
newSubMatrix->copyTransSingleton = NULL; 6.526 + 6.527 + subMatrices[ rowOffset + colIdx ] = newSubMatrix; 6.528 + } 6.529 + } 6.530 + return subMatrices; 6.531 + } 6.532 + 6.533 + 6.534 +void 6.535 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 6.536 + SubMatrix **subMatrices, SlaveVP *animPr ) 6.537 + { 6.538 + int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; 6.539 + SubMatrix *subMatrix; 6.540 + 6.541 + numRowIdxs = rowSlices->numVals; 6.542 + numColIdxs = colSlices->numVals; 6.543 + 6.544 + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 6.545 + { 6.546 + rowOffset = rowIdx * numColIdxs; 6.547 + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 6.548 + { 6.549 + subMatrix = subMatrices[ rowOffset + colIdx ]; 6.550 + if( subMatrix->alreadyCopied ) 6.551 + Vthread__free( subMatrix->array, animPr ); 6.552 + Vthread__free( subMatrix, animPr ); 6.553 + } 6.554 + } 6.555 + Vthread__free( subMatrices, animPr ); 6.556 + } 6.557 + 6.558 + 6.559 + 6.560 +SlicingStruc * 6.561 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 6.562 + SlaveVP *animPr ) 6.563 + { float32 residualAcc = 0; 6.564 + int numSlices, i, *startVals, sizeOfSlice, endCondition; 6.565 + SlicingStruc *slicingStruc = Vthread__malloc(sizeof(SlicingStruc), animPr); 6.566 + 6.567 + //calc size of matrix need to hold start vals -- 6.568 + numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); 6.569 + 6.570 + startVals = Vthread__malloc( ((size_t)numSlices + 1) * sizeof(int32), animPr ); 6.571 + 6.572 + //Calc the upper limit of start value -- when get above this, end loop 6.573 + // by saving highest value of the matrix dimension to access, plus 1 6.574 + // as the start point of the imaginary slice following the last one 6.575 + //Plus 1 because go up to value but not include when process last slice 6.576 + //The stopping condition is half-a-size less than highest value because 6.577 + // don't want any pieces smaller than half the ideal size -- just tack 6.578 + // little ones onto end of last one 6.579 + endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size 6.580 + for( i = 0; startVal <= endVal; i++ ) 6.581 + { 6.582 + startVals[i] = startVal; 6.583 + residualAcc += idealSizeOfSide; 6.584 + sizeOfSlice = (int)residualAcc; 6.585 + residualAcc -= (float32)sizeOfSlice; 6.586 + startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. 
6.587 + 6.588 + if( startVal > endCondition ) 6.589 + { startVal = endVal + 1; 6.590 + startVals[ i + 1 ] = startVal; 6.591 + } 6.592 + } 6.593 + 6.594 + slicingStruc->startVals = startVals; 6.595 + slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 6.596 + // which means is num sub-matrices in dim 6.597 + // also == idx of the fake start just above 6.598 + return slicingStruc; 6.599 + } 6.600 + 6.601 +void 6.602 +freeSlicingStruc( SlicingStruc *slicingStruc, SlaveVP *animPr ) 6.603 + { 6.604 + Vthread__free( slicingStruc->startVals, animPr ); 6.605 + Vthread__free( slicingStruc, animPr ); 6.606 + } 6.607 + 6.608 + 6.609 +int inline 6.610 +measureMatrixMultPrimitive( SlaveVP *animPr ) 6.611 + { 6.612 + int r, c, v, numCycles; 6.613 + float32 *res, *left, *right; 6.614 + 6.615 + //setup inputs 6.616 + left = Vthread__malloc( 5 * 5 * sizeof( float32 ), animPr ); 6.617 + right = Vthread__malloc( 5 * 5 * sizeof( float32 ), animPr ); 6.618 + res = Vthread__malloc( 5 * 5 * sizeof( float32 ), animPr ); 6.619 + 6.620 + for( r = 0; r < 5; r++ ) 6.621 + { 6.622 + for( c = 0; c < 5; c++ ) 6.623 + { 6.624 + left[ r * 5 + c ] = r; 6.625 + right[ r * 5 + c ] = c; 6.626 + } 6.627 + } 6.628 + 6.629 + //do primitive 6.630 + Vthread__start_primitive(); //for now, just takes time stamp 6.631 + for( r = 0; r < 5; r++ ) 6.632 + { 6.633 + for( c = 0; c < 5; c++ ) 6.634 + { 6.635 + for( v = 0; v < 5; v++ ) 6.636 + { 6.637 + res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; 6.638 + } 6.639 + } 6.640 + } 6.641 + numCycles = 6.642 + Vthread__end_primitive_and_give_cycles(); 6.643 + 6.644 + Vthread__free( left, animPr ); 6.645 + Vthread__free( right, animPr ); 6.646 + Vthread__free( res, animPr ); 6.647 + 6.648 + return numCycles; 6.649 + }
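sliceUpDimension above spreads a fractional ideal side length over integer-sized slices with a residual accumulator, and folds any tail shorter than half the ideal size into the last slice. The same arithmetic in a standalone sketch (hypothetical name slice_dimension, plain malloc standing in for Vthread__malloc, not part of the changeset):

```c
#include <stdio.h>
#include <stdlib.h>

/* Same residual-accumulator slicing as sliceUpDimension, minus the
 * Vthread allocator: returns start indices, with a "fake" start one
 * past endVal appended, and writes the slice count to *numVals. */
static int *slice_dimension( float idealSize, int startVal, int endVal,
                             int *numVals )
 { float residual = 0;
   int i, size, endCondition;
   int *starts = malloc( ((size_t)((endVal - startVal + 1) / idealSize) + 2)
                         * sizeof(int) );
   endCondition = endVal - (int)(idealSize / 2);
   for( i = 0; startVal <= endVal; i++ )
    { starts[i] = startVal;
      residual += idealSize;
      size      = (int)residual;
      residual -= (float)size;
      startVal += size;
      if( startVal > endCondition )       /* fold short tail into last */
       { startVal = endVal + 1;
         starts[i + 1] = startVal;
       }
    }
   *numVals = i;
   return starts;
 }

int main( void )
 { int n, j, *s = slice_dimension( 2.5f, 0, 9, &n );  /* 10 values  */
   for( j = 0; j <= n; j++ ) printf( "%d ", s[j] );   /* 0 2 5 7 10 */
   printf( "(numVals = %d)\n", n );
   free( s );
   return 0;
 }
```

For an ideal size of 2.5 over indices 0..9 this yields starts 0, 2, 5, 7 plus the fake start 10, i.e. slice sizes 2, 3, 2, 3.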
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 7.2 +++ b/Vthread__Matrix_Mult/EntryPoint.c Mon Sep 17 18:19:52 2012 -0700 7.3 @@ -0,0 +1,120 @@ 7.4 +/* 7.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 7.6 + * Licensed under GNU General Public License version 2 7.7 + * 7.8 + * Author: seanhalle@yahoo.com 7.9 + * 7.10 + */ 7.11 + 7.12 +#include <math.h> 7.13 + 7.14 +#include "Vthread__Matrix_Mult.h" 7.15 + 7.16 + 7.17 + 7.18 +/*Every program for a VMS-based language has an "entry point" function that 7.19 + * creates the first 7.20 + * processor, which starts the chain of creating more processors.. 7.21 + * eventually all of the processors will dissipate themselves, and 7.22 + * return. 7.23 + * 7.24 + *This entry-point function follows the same pattern as all entry-point 7.25 + * functions do: 7.26 + *1) it creates the params for the seed processor, from the 7.27 + * parameters passed into the entry-point function 7.28 + *2) it calls create_seed_slaveVP_and_do_work 7.29 + *3) it gets the return value from the params struc, frees the params struc, 7.30 + * and returns the value from the function 7.31 + * 7.32 + */ 7.33 +Matrix * 7.34 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) 7.35 + { Matrix *resMatrix; 7.36 + DividerParams *dividerParams; 7.37 + int32 numResRows, numResCols; 7.38 + 7.39 + 7.40 + dividerParams = malloc( sizeof( DividerParams ) ); 7.41 + dividerParams->leftMatrix = leftMatrix; 7.42 + dividerParams->rightMatrix = rightMatrix; 7.43 + 7.44 + 7.45 + numResRows = leftMatrix->numRows; 7.46 + numResCols = rightMatrix->numCols; 7.47 + 7.48 + //VMS has its own separate internal malloc, so to get results out, 7.49 + // have to pass in empty array for it to fill up 7.50 + //The alternative is internally telling SSR make external space to use 7.51 + resMatrix = malloc( sizeof(Matrix) ); 7.52 + resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); 7.53 + resMatrix->numCols = rightMatrix->numCols; 7.54 + resMatrix->numRows = leftMatrix->numRows; 7.55 + 7.56 + 7.57 + dividerParams->resultMatrix = resMatrix; 7.58 + 7.59 + //create divider processor, start doing the work, and wait till done 7.60 + //This function is the "border crossing" between normal code and SSR 7.61 + Vthread__create_seed_slaveVP_and_do_work(&divideWorkIntoSubMatrixPairProcrs, 7.62 + dividerParams ); 7.63 + 7.64 + free( dividerParams ); 7.65 + return resMatrix; 7.66 + } 7.67 + 7.68 + 7.69 +Matrix * 7.70 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) 7.71 + { Matrix *resMatrix; 7.72 + DividerParams *dividerParams; 7.73 + int32 numResRows, numResCols; 7.74 + 7.75 + 7.76 + dividerParams = malloc( sizeof( DividerParams ) ); 7.77 + dividerParams->leftMatrix = leftMatrix; 7.78 + dividerParams->rightMatrix = rightMatrix; 7.79 + 7.80 + 7.81 + numResRows = leftMatrix->numRows; 7.82 + numResCols = rightMatrix->numCols; 7.83 + 7.84 + //VMS has its own separate internal malloc, so to get results out, 7.85 + // have to pass in empty array for it to fill up 7.86 + //The alternative is internally telling SSR make external space to use 7.87 + resMatrix = malloc( sizeof(Matrix) ); 7.88 + resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); 7.89 + resMatrix->numCols = rightMatrix->numCols; 7.90 + resMatrix->numRows = leftMatrix->numRows; 7.91 + 7.92 + 7.93 + dividerParams->resultMatrix = resMatrix; 7.94 + 7.95 + //Vthread__start_lang_running(); 7.96 + //VMS__start_lang_running( Vthread ); 7.97 + 7.98 +/* 7.99 + matrixMultProcessID = 7.100 +
Vthread__spawn_program_on_data(&divideWorkIntoSubMatrixPairProcrs, 7.101 + dividerParams); 7.102 +*/ 7.103 + VMS__start_VMS_running( ); 7.104 + 7.105 + VMSProcessID matrixMultProcessID; 7.106 + 7.107 + matrixMultProcessID = 7.108 + VMS__spawn_program_on_data_in_Lang( &prog_seed_fn, data, Vthread_lang ); 7.109 + 7.110 + resMatrix = VMS__give_results_when_done_for( matrixMultProcessID ); 7.111 + 7.112 +// Vthread__give_results_when_done_for( matrixMultProcessID ); 7.113 + 7.114 + //VMS__shutdown_lang( Vthread ); 7.115 + 7.116 + //Vthread__shutdown_lang(); 7.117 + 7.118 + VMS__shutdown(); 7.119 + 7.120 + free( dividerParams ); 7.121 + 7.122 + return resMatrix; 7.123 + }
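From the application's side, the entry point above is plain C in and plain C out. A hypothetical caller sketch follows -- it is not part of the changeset, and since the commit message says this revision is not working yet, it documents the intended usage rather than tested behavior:

```c
#include "Vthread__Matrix_Mult.h"

/* Multiply a 2x3 by a 3x2 matrix through the entry point above.
 * The caller never touches Vthread -- it just owns plain Matrix
 * structures on the normal heap. */
int main( void )
 { int i;
   Matrix *left  = makeMatrix_WithResMat( 2, 3 );
   Matrix *right = makeMatrix_WithResMat( 3, 2 );
   for( i = 0; i < 6; i++ )
    { left->array[i]  = (float32)i;
      right->array[i] = 1.0f;
    }
   Matrix *result = multiplyTheseMatrices( left, right );
   printMatrix( result );  /* each result cell is a row-sum of left */
   freeMatrix( left ); freeMatrix( right ); freeMatrix( result );
   return 0;
 }
```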
8.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 8.2 +++ b/Vthread__Matrix_Mult/Result_Pr.c Mon Sep 17 18:19:52 2012 -0700 8.3 @@ -0,0 +1,142 @@ 8.4 +/* 8.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 8.6 + * Licensed under GNU General Public License version 2 8.7 + * 8.8 + * Author: seanhalle@yahoo.com 8.9 + * 8.10 + */ 8.11 + 8.12 +#include "Vthread__Matrix_Mult.h" 8.13 + 8.14 +//===================== 8.15 +void inline 8.16 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, 8.17 + int32 startRow, 8.18 + int32 numRows, 8.19 + int32 startCol, 8.20 + int32 numCols, 8.21 + int32 numOrigCols ); 8.22 + 8.23 +//=========================================================================== 8.24 + 8.25 +/*The Result Processor gets a message from each of the vector processors, 8.26 + * puts the result from the message in its location in the result- 8.27 + * matrix, and increments the count of results. 8.28 + * 8.29 + *After the count reaches the point that all results have been received, it 8.30 + * returns the result matrix and dissipates. 8.31 + */ 8.32 +void gatherResults( void *_params, SlaveVP *animatingThd ) 8.33 + { SlaveVP *dividerPr; 8.34 + ResultsParams *params; 8.35 + int numRows, numCols, numSubMatrixPairs, count=0; 8.36 + float32 *resultArray; 8.37 + SMPairParams *resParams; 8.38 + //====================== thread stuff ======================= 8.39 + MatrixMultGlobals *globals =(MatrixMultGlobals *)Vthread__give_globals(); 8.40 + 8.41 + 8.42 + //get vector-comm lock before loop, so that this thd keeps lock after 8.43 + // one wait until it enters the next wait -- forces see-saw btwn 8.44 + // waiters and signalers -- wait-signal-wait-signal-... 8.45 + Vthread__mutex_lock( globals->vector_mutex, animatingThd ); 8.46 + 8.47 + //Tell divider that have the vector lock -- so it's sure won't miss any 8.48 + // signals from the vector-threads it's about to create 8.49 + //Don't need a signal variable -- this thd can't be created until 8.50 + // divider thd already has the start lock 8.51 + Vthread__mutex_lock( globals->start_mutex, animatingThd );//finish wait 8.52 + Vthread__cond_signal( globals->start_cond, animatingThd ); 8.53 + Vthread__mutex_unlock( globals->start_mutex, animatingThd );//finish wait 8.54 + //=========================================================== 8.55 + 8.56 + DEBUG( dbgAppFlow, "start resultPr\n") 8.57 + 8.58 + params = (ResultsParams *)_params; 8.59 + dividerPr = params->dividerPr; 8.60 + numSubMatrixPairs = params->numSubMatrixPairs; 8.61 + numRows = params->numRows; 8.62 + numCols = params->numCols; 8.63 + 8.64 + resultArray = params->resultArray; 8.65 + 8.66 + 8.67 + while( count < numSubMatrixPairs ) 8.68 + { 8.69 + //receive a vector-result from a vector-thread 8.70 + Vthread__cond_wait( globals->vector_cond, animatingThd ); 8.71 + 8.72 + //At this point, animating thread owns the vector lock, so all 8.73 + // pairs trying to signal they have a result are waiting to get that 8.74 + // lock -- only one gets it at a time, and when signal, this thd 8.75 + // gets the lock and does the body of this loop, then when does the 8.76 + // wait again, that releases the lock for next pair-thread to get it 8.77 + resParams = globals->currSMPairParams; 8.78 + 8.79 + accumulateResult( resultArray, resParams->partialResultArray, 8.80 + resParams->leftSubMatrix->origStartRow, 8.81 + resParams->leftSubMatrix->numRows, 8.82 + resParams->rightSubMatrix->origStartCol, 8.83 + resParams->rightSubMatrix->numCols, 8.84 + 
resParams->rightSubMatrix->origMatrix->numCols ); 8.85 + 8.86 + Vthread__free( resParams->partialResultArray, animatingThd ); 8.87 + 8.88 + //there is only one copy of results procr, so can update numUsesLeft 8.89 + // without concurrency worries. When zero, free the sub-matrix 8.90 + resParams->leftSubMatrix->numUsesLeft -= 1; 8.91 + if( resParams->leftSubMatrix->numUsesLeft == 0 ) 8.92 + { 8.93 + Vthread__free( resParams->leftSubMatrix->array, animatingThd ); 8.94 + Vthread__free( resParams->leftSubMatrix, animatingThd ); 8.95 + } 8.96 + 8.97 + resParams->rightSubMatrix->numUsesLeft -= 1; 8.98 + if( resParams->rightSubMatrix->numUsesLeft == 0 ) 8.99 + { 8.100 + Vthread__free( resParams->rightSubMatrix->array, animatingThd ); 8.101 + Vthread__free( resParams->rightSubMatrix, animatingThd ); 8.102 + } 8.103 + 8.104 + //count of how many sub-matrix pairs accumulated so know when done 8.105 + count++; 8.106 + } 8.107 + 8.108 + //Done -- could just dissipate -- SSR will wait for all processors to 8.109 + // dissipate before shutting down, and thereby making results avaial to 8.110 + // outside, so no need to stop the divider from dissipating, so no need 8.111 + // to send a hand-shake message to it -- but makes debug easier 8.112 + //However, following pattern, so all comms done, release lock 8.113 + Vthread__mutex_unlock( globals->vector_mutex, animatingThd ); 8.114 + 8.115 + //Send result to divider (seed) thread 8.116 + // note, divider thd had to hold the results-comm lock before creating 8.117 + // this thread, to be sure no race 8.118 + Vthread__mutex_lock( globals->results_mutex, animatingThd ); 8.119 + //globals->results = resultMatrixArray; 8.120 + Vthread__cond_signal( globals->results_cond, animatingThd ); 8.121 + Vthread__mutex_unlock( globals->results_mutex, animatingThd ); //releases 8.122 + //divider thread from its wait, at point this executes 8.123 + 8.124 + Vthread__dissipate_thread( animatingThd ); //frees any data owned by procr 8.125 + } 8.126 + 8.127 +void inline 8.128 +accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray, 8.129 + int32 startRow, 8.130 + int32 numRows, 8.131 + int32 startCol, 8.132 + int32 numCols, 8.133 + int32 numOrigCols ) 8.134 + { int32 row, col; 8.135 + 8.136 + for( row = 0; row < numRows; row++ ) 8.137 + { 8.138 + for( col = 0; col < numCols; col++ ) 8.139 + { 8.140 + resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] += 8.141 + subMatrixPairResultArray[ row * numCols + col ]; 8.142 + } 8.143 + } 8.144 + 8.145 + }
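The locking choreography in gatherResults -- hold the lock, wait on the condition, let exactly one signaler in, loop -- maps closely onto POSIX condition variables. Below is a plain-pthreads rendition of the same one-slot rendezvous, an illustration rather than the Vthread API; the explicit ready flag is added because POSIX waits may wake spuriously, a case the comments here say Vthread's cond_signal semantics avoid:

```c
#include <pthread.h>
#include <stdio.h>

/* Plain-pthreads rendition of the vector-result rendezvous used by
 * gatherResults: workers take the lock, publish one result, signal,
 * and unlock; the gatherer sleeps on the condition in between. */
static pthread_mutex_t lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  avail = PTHREAD_COND_INITIALIZER;
static pthread_cond_t  taken = PTHREAD_COND_INITIALIZER;
static int   ready = 0;
static float currResult;

static void *worker( void *arg )
 { pthread_mutex_lock( &lock );
   while( ready ) pthread_cond_wait( &taken, &lock ); /* slot free?  */
   currResult = *(float *)arg;                        /* publish     */
   ready = 1;
   pthread_cond_signal( &avail );
   pthread_mutex_unlock( &lock );
   return NULL;
 }

int main( void )
 { pthread_t thds[3];
   float vals[3] = { 1.5f, 2.5f, 3.5f };
   int i, count;
   for( i = 0; i < 3; i++ ) pthread_create( &thds[i], NULL, worker, &vals[i] );
   pthread_mutex_lock( &lock );
   for( count = 0; count < 3; count++ )
    { while( !ready ) pthread_cond_wait( &avail, &lock ); /* gather    */
      printf( "got %3.1f\n", currResult );
      ready = 0;
      pthread_cond_signal( &taken );                      /* free slot */
    }
   pthread_mutex_unlock( &lock );
   for( i = 0; i < 3; i++ ) pthread_join( thds[i], NULL );
   return 0;
 }
```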
9.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 9.2 +++ b/Vthread__Matrix_Mult/Vthread__Matrix_Mult.h Mon Sep 17 18:19:52 2012 -0700 9.3 @@ -0,0 +1,120 @@ 9.4 +/* 9.5 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 9.6 + * Licensed under GNU General Public License version 2 9.7 + */ 9.8 + 9.9 +#ifndef _SSR_MATRIX_MULT_H_ 9.10 +#define _SSR_MATRIX_MULT_H_ 9.11 + 9.12 +#include <stdio.h> 9.13 + 9.14 +#include "Vthread_impl/Vthread.h" 9.15 +#include "../Matrix_Mult.h" 9.16 + 9.17 + 9.18 +//=============================== Defines ============================== 9.19 +#define ROWS_IN_BLOCK 32 9.20 +#define COLS_IN_BLOCK 32 9.21 +#define VEC_IN_BLOCK 32 9.22 + 9.23 +#define copyMatrixSingleton 1 9.24 +#define copyTransposeSingleton 2 9.25 + 9.26 +//============================== Structures ============================== 9.27 +typedef struct 9.28 + { 9.29 + Matrix *leftMatrix; 9.30 + Matrix *rightMatrix; 9.31 + Matrix *resultMatrix; 9.32 + } 9.33 +DividerParams; 9.34 + 9.35 +typedef struct 9.36 + { 9.37 + SlaveVP *dividerPr; 9.38 + int numRows; 9.39 + int numCols; 9.40 + int numSubMatrixPairs; 9.41 + float32 *resultArray; 9.42 + } 9.43 +ResultsParams; 9.44 + 9.45 +typedef 9.46 +struct 9.47 + { int32 numRows; 9.48 + int32 numCols; 9.49 + Matrix *origMatrix; 9.50 + int32 origStartRow; 9.51 + int32 origStartCol; 9.52 + int32 alreadyCopied; 9.53 + VthdSingleton *copySingleton; 9.54 + VthdSingleton *copyTransSingleton; 9.55 + int32 numUsesLeft; //have update via message to avoid multiple writers 9.56 + float32 *array; //2D, but dynamically sized, so use addr arith 9.57 + } 9.58 +SubMatrix; 9.59 + 9.60 +typedef struct 9.61 + { SlaveVP *resultPr; 9.62 + SubMatrix *leftSubMatrix; 9.63 + SubMatrix *rightSubMatrix; 9.64 + float32 *partialResultArray; 9.65 + } 9.66 +SMPairParams; 9.67 + 9.68 +typedef 9.69 +struct 9.70 + { int32 numVals; 9.71 + int32 *startVals; 9.72 + } 9.73 +SlicingStruc; 9.74 + 9.75 +typedef 9.76 +struct 9.77 + { 9.78 + SlicingStruc *leftRowSlices; 9.79 + SlicingStruc *vecSlices; 9.80 + SlicingStruc *rightColSlices; 9.81 + } 9.82 +SlicingStrucCarrier; 9.83 + 9.84 +enum MMMsgType 9.85 + { 9.86 + RESULTS_MSG = 1 9.87 + }; 9.88 + 9.89 + 9.90 +typedef struct 9.91 + { 9.92 + //for communicating sub-matrix-pair results to results Thd 9.93 + int32 vector_mutex; 9.94 + int32 vector_cond; 9.95 + SMPairParams *currSMPairParams; 9.96 + 9.97 + //for communicating results array back to seed (divider) Thd 9.98 + int32 results_mutex; 9.99 + int32 results_cond; 9.100 + float32 *results; 9.101 + 9.102 + //for ensuring results thd has vector lock before making vector thds 9.103 + int32 start_mutex; 9.104 + int32 start_cond; 9.105 + 9.106 + Matrix *rightMatrix; 9.107 + Matrix *resultMatrix; 9.108 + } 9.109 +MatrixMultGlobals; 9.110 + 9.111 + 9.112 +//============================= Processor Functions ========================= 9.113 +void divideWorkIntoSubMatrixPairProcrs( void *data, SlaveVP *animatingPr ); 9.114 +void calcSubMatrixProduct( void *data, SlaveVP *animatingPr ); 9.115 +void gatherResults( void *data, SlaveVP *animatingPr ); 9.116 + 9.117 + 9.118 +//================================ Entry Point ============================== 9.119 +Matrix * 9.120 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); 9.121 + 9.122 + 9.123 +#endif /*_SSR_MATRIX_MULT_H_*/
10.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 10.2 +++ b/Vthread__Matrix_Mult/subMatrix_Pr.c Mon Sep 17 18:19:52 2012 -0700 10.3 @@ -0,0 +1,310 @@ 10.4 +/* 10.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 10.6 + * Licensed under GNU General Public License version 2 10.7 + * 10.8 + * Author: SeanHalle@yahoo.com 10.9 + * 10.10 + */ 10.11 + 10.12 +#include <string.h> 10.13 + 10.14 +#include "Vthread__Matrix_Mult.h" 10.15 + 10.16 +//Declarations 10.17 +//=================================================== 10.18 + 10.19 +void inline 10.20 +copyFromOrig( SubMatrix *subMatrix, SlaveVP *animPr ); 10.21 + 10.22 +void inline 10.23 +copyTransposeFromOrig( SubMatrix *subMatrix, SlaveVP *animPr ); 10.24 + 10.25 +void inline 10.26 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 10.27 + float32 *resArray, 10.28 + int startRow, int endRow, 10.29 + int startCol, int endCol, 10.30 + int startVec, int endVec, 10.31 + int resStride, int inpStride ); 10.32 + 10.33 +void inline 10.34 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, 10.35 + float32 *leftArray, float32 *rightArray, 10.36 + float32 *resArray ); 10.37 + 10.38 +//=================================================== 10.39 + 10.40 +/*A processor is created with an environment that holds two matrices, 10.41 + * the row and col that it owns, and the name of a result gathering 10.42 + * processor. 10.43 + *It calculates the product of two sub-portions of the input matrices 10.44 + * by using Intel's mkl library for single-core. 10.45 + * 10.46 + *This demonstrates using optimized single-threaded code inside scheduled 10.47 + * work-units. 10.48 + * 10.49 + *When done, it sends the result to the result processor 10.50 + */ 10.51 +void 10.52 +calcSubMatrixProduct( void *data, SlaveVP *animatingPr ) 10.53 + { 10.54 + SMPairParams *params; 10.55 + SlaveVP *resultPr; 10.56 + float32 *leftArray, *rightArray, *resArray; 10.57 + SubMatrix *leftSubMatrix, *rightSubMatrix; 10.58 + MatrixMultGlobals *globals =(MatrixMultGlobals *)Vthread__give_globals(); 10.59 + 10.60 + DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID) 10.61 + #ifdef TURN_ON_DEBUG_PROBES 10.62 + int32 subMatrixProbe = VMS_App__create_single_interval_probe( "subMtx", 10.63 + animatingPr); 10.64 + VMS_App__record_sched_choice_into_probe( subMatrixProbe, animatingPr ); 10.65 + VMS_App__record_interval_start_in_probe( subMatrixProbe ); 10.66 + #endif 10.67 + 10.68 + params = (SMPairParams *)data; 10.69 + resultPr = params->resultPr; 10.70 + leftSubMatrix = params->leftSubMatrix; 10.71 + rightSubMatrix = params->rightSubMatrix; 10.72 + 10.73 + //make sure the input sub-matrices have been copied out of orig 10.74 + //do it here, inside sub-matrix pair to hopefully gain reuse in cache 10.75 + copyFromOrig( leftSubMatrix, animatingPr ); 10.76 + copyTransposeFromOrig( rightSubMatrix, animatingPr ); 10.77 + 10.78 + leftArray = leftSubMatrix->array; 10.79 + rightArray = rightSubMatrix->array; 10.80 + 10.81 + size_t resSize = (size_t)leftSubMatrix->numRows * 10.82 + (size_t)rightSubMatrix->numCols * sizeof(float32); 10.83 + resArray = Vthread__malloc( resSize, animatingPr ); 10.84 + memset( resArray, 0, resSize ); 10.85 + 10.86 + 10.87 + int32 numResRows, numResCols, vectLength; 10.88 + 10.89 + vectLength = leftSubMatrix->numCols; 10.90 + numResRows = leftSubMatrix->numRows; 10.91 + numResCols = rightSubMatrix->numCols; 10.92 + 10.93 + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 10.94 + 
leftArray, rightArray, 10.95 + resArray ); 10.96 + 10.97 + //send result to result processor 10.98 + params->partialResultArray = resArray; 10.99 + 10.100 + #ifdef TURN_ON_DEBUG_PROBES 10.101 + VMS_App__record_interval_end_in_probe( subMatrixProbe ); 10.102 + #endif 10.103 + 10.104 + //Send result to results thread 10.105 + //This pattern works 'cause only get lock when results thd inside wait 10.106 + Vthread__mutex_lock( globals->vector_mutex, animatingPr ); 10.107 + globals->currSMPairParams = params; 10.108 + Vthread__cond_signal( globals->vector_cond, animatingPr ); 10.109 + Vthread__mutex_unlock( globals->vector_mutex, animatingPr );//release 10.110 + //wait-er -- cond_signal implemented such that wait-er gets lock, no other 10.111 + 10.112 + DEBUG1(dbgAppFlow, "end sub-matrix mult: %d\n", animatingPr->procrID) 10.113 + Vthread__dissipate_thread( animatingPr ); 10.114 + } 10.115 + 10.116 + 10.117 + 10.118 +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into 10.119 + * the 32KB L1 cache. 10.120 + *Would be nice to embed this within another level that divided into 10.121 + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache 10.122 + * 10.123 + *Eventually want these divisions to be automatic, using DKU pattern 10.124 + * embedded into VMS and exposed in the language, and with VMS controlling the 10.125 + * divisions according to the cache sizes, which it knows about. 10.126 + *Also, want VMS to work with language to split among main-mems, so a socket 10.127 + * only cranks on data in its local segment of main mem 10.128 + * 10.129 + *So, outer two loops determine start and end points within the result matrix. 10.130 + * Inside that, a loop dets the start and end points along the shared dimensions 10.131 + * of the two input matrices. 
10.132 + */ 10.133 +void inline 10.134 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, 10.135 + int32 numResCols, 10.136 + float32 *leftArray, float32 *rightArray, 10.137 + float32 *resArray ) 10.138 + { 10.139 + int resStride, inpStride; 10.140 + int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; 10.141 + 10.142 + resStride = numResCols; 10.143 + inpStride = vecLength; 10.144 + 10.145 + for( resStartRow = 0; resStartRow < numResRows; ) 10.146 + { 10.147 + resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 10.148 + if( resEndRow > numResRows ) resEndRow = numResRows -1; 10.149 + 10.150 + for( resStartCol = 0; resStartCol < numResCols; ) 10.151 + { 10.152 + resEndCol = resStartCol + COLS_IN_BLOCK -1; 10.153 + if( resEndCol > numResCols ) resEndCol = numResCols -1; 10.154 + 10.155 + for( startVec = 0; startVec < vecLength; ) 10.156 + { 10.157 + endVec = startVec + VEC_IN_BLOCK -1; 10.158 + if( endVec > vecLength ) endVec = vecLength -1; 10.159 + 10.160 + //By having the "vector" of sub-blocks in a sub-block slice 10.161 + // be marched down in inner loop, are re-using the result 10.162 + // matrix, which stays in L1 cache and re-using the left sub-mat 10.163 + // which repeats for each right sub-mat -- can only re-use two of 10.164 + // the three, so result is the most important -- avoids writing 10.165 + // dirty blocks until those result-locations fully done 10.166 + //Row and Col is position in result matrix -- so row and vec 10.167 + // for left array, then vec and col for right array 10.168 + multiplySubBlocksTransposed( leftArray, rightArray, 10.169 + resArray, 10.170 + resStartRow, resEndRow, 10.171 + resStartCol, resEndCol, 10.172 + startVec, endVec, 10.173 + resStride, inpStride ); 10.174 + startVec = endVec +1; 10.175 + } 10.176 + resStartCol = resEndCol +1; 10.177 + } 10.178 + resStartRow = resEndRow +1; 10.179 + } 10.180 + } 10.181 + 10.182 + 10.183 + 10.184 +void inline 10.185 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 10.186 + float32 *resArray, 10.187 + int resStartRow, int resEndRow, 10.188 + int resStartCol, int resEndCol, 10.189 + int startVec, int endVec, 10.190 + int resStride, int inpStride ) 10.191 + { 10.192 + int resRow, resCol, vec; 10.193 + int leftOffset, rightOffset; 10.194 + float32 result; 10.195 + 10.196 + //The result row is used only for the left matrix, res col for the right 10.197 + for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) 10.198 + { 10.199 + for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) 10.200 + { 10.201 + leftOffset = resRow * inpStride;//left & right inp strides always same 10.202 + rightOffset = resCol * inpStride;// because right is transposed 10.203 + result = 0; 10.204 + for( vec = startVec; vec <= endVec; vec++ ) 10.205 + { 10.206 + result += 10.207 + leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; 10.208 + } 10.209 + 10.210 + resArray[ resRow * resStride + resCol ] += result; 10.211 + } 10.212 + } 10.213 + } 10.214 + 10.215 + 10.216 + 10.217 + 10.218 +/*Reuse this in divider when do the sequential multiply case 10.219 + */ 10.220 +void inline 10.221 +copyTranspose( int32 numRows, int32 numCols, 10.222 + int32 origStartRow, int32 origStartCol, int32 origStride, 10.223 + float32 *subArray, float32 *origArray ) 10.224 + { int32 stride = numRows; 10.225 + 10.226 + int row, col, origOffset; 10.227 + for( row = 0; row < numRows; row++ ) 10.228 + { 10.229 + origOffset = (row + origStartRow) * origStride + origStartCol; 10.230 + 
for( col = 0; col < numCols; col++ ) 10.231 + { 10.232 + //transpose means swap row & col -- traverse orig matrix normally 10.233 + // but put into reversed place in local array -- means the 10.234 + // stride is the numRows now, so col * numRows + row 10.235 + subArray[ col * stride + row ] = origArray[ origOffset + col ]; 10.236 + } 10.237 + } 10.238 + } 10.239 + 10.240 +void inline 10.241 +copyTransposeFromOrig( SubMatrix *subMatrix, SlaveVP *animPr ) 10.242 + { int numCols, numRows, origStartRow, origStartCol, origStride; 10.243 + Matrix *origMatrix; 10.244 + float32 *origArray, *subArray; 10.245 + 10.246 + Vthread__start_data_singleton( subMatrix->copyTransSingleton, animPr ); 10.247 + 10.248 + origMatrix = subMatrix->origMatrix; 10.249 + origArray = origMatrix->array; 10.250 + numCols = subMatrix->numCols; 10.251 + numRows = subMatrix->numRows; 10.252 + origStartRow = subMatrix->origStartRow; 10.253 + origStartCol = subMatrix->origStartCol; 10.254 + origStride = origMatrix->numCols; 10.255 + 10.256 + subArray = Vthread__malloc( (size_t)numRows * 10.257 + (size_t)numCols *sizeof(float32),animPr); 10.258 + subMatrix->array = subArray; 10.259 + 10.260 + //copy values from orig matrix to local 10.261 + copyTranspose( numRows, numCols, 10.262 + origStartRow, origStartCol, origStride, 10.263 + subArray, origArray ); 10.264 + 10.265 + Vthread__end_data_singleton( subMatrix->copyTransSingleton, animPr ); 10.266 + 10.267 + } 10.268 + 10.269 + 10.270 +void inline 10.271 +copyFromOrig( SubMatrix *subMatrix, SlaveVP *animPr ) 10.272 + { int numCols, numRows, origStartRow, origStartCol, stride, origStride; 10.273 + Matrix *origMatrix; 10.274 + float32 *origArray, *subArray; 10.275 + 10.276 + 10.277 + //This lets only a single VP execute the code between start and 10.278 + // end -- using start and end so that work runs outside the master. 10.279 + //If a second VP ever executes the start, it will be returned 10.280 + // from the end-point. If its execution starts after another but before 10.281 + // that other has finished, this one will remain suspended until the 10.282 + // other finishes, then be resumed from the end-point. 10.283 + Vthread__start_data_singleton( subMatrix->copySingleton, animPr ); 10.284 + 10.285 + 10.286 + origMatrix = subMatrix->origMatrix; 10.287 + origArray = origMatrix->array; 10.288 + numCols = subMatrix->numCols; 10.289 + numRows = subMatrix->numRows; 10.290 + origStartRow = subMatrix->origStartRow; 10.291 + origStartCol = subMatrix->origStartCol; 10.292 + origStride = origMatrix->numCols; 10.293 + 10.294 + subArray = Vthread__malloc( (size_t)numRows * 10.295 + (size_t)numCols *sizeof(float32),animPr); 10.296 + subMatrix->array = subArray; 10.297 + 10.298 + //copy values from orig matrix to local 10.299 + stride = numCols; 10.300 + 10.301 + int row, col, offset, origOffset; 10.302 + for( row = 0; row < numRows; row++ ) 10.303 + { 10.304 + offset = row * stride; 10.305 + origOffset = (row + origStartRow) * origStride + origStartCol; 10.306 + for( col = 0; col < numCols; col++ ) 10.307 + { 10.308 + subArray[ offset + col ] = origArray[ origOffset + col ]; 10.309 + } 10.310 + } 10.311 + 10.312 + Vthread__end_data_singleton( subMatrix->copySingleton, animPr ); 10.313 + }
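The blocking above, boiled down: the vec run of sub-blocks is marched in the innermost block loop so the result block stays hot in L1, and the right matrix is pre-transposed so both operands are walked row-wise. A minimal stand-alone sketch of the same loop order in plain C -- N and BLOCK are made-up stand-ins for the matrix side and ROWS_IN_BLOCK/COLS_IN_BLOCK/VEC_IN_BLOCK, with none of the Vthread machinery:

#include <stdio.h>
#include <string.h>

#define N     8   /* hypothetical matrix side */
#define BLOCK 4   /* hypothetical block side  */

static void
multBlockedTransposed( const float *left, const float *rightT, float *res )
 { int r0, c0, v0, r, c, v, rEnd, cEnd, vEnd;
   memset( res, 0, sizeof(float) * N * N );
   for( r0 = 0; r0 < N; r0 += BLOCK )
    { rEnd = r0 + BLOCK > N ? N : r0 + BLOCK;
      for( c0 = 0; c0 < N; c0 += BLOCK )
       { cEnd = c0 + BLOCK > N ? N : c0 + BLOCK;
         for( v0 = 0; v0 < N; v0 += BLOCK )      /* vec marched in inner loop */
          { vEnd = v0 + BLOCK > N ? N : v0 + BLOCK;
            for( c = c0; c < cEnd; c++ )
               for( r = r0; r < rEnd; r++ )
                { float acc = 0;
                  for( v = v0; v < vEnd; v++ )   /* both operands walk rows */
                     acc += left[ r*N + v ] * rightT[ c*N + v ];
                  res[ r*N + c ] += acc;         /* result block stays in L1 */
                }
          }
       }
    }
 }

int main( void )
 { float left[N*N], rightT[N*N], res[N*N];
   int i;
   for( i = 0; i < N*N; i++ ) { left[i] = (float)(i % N); rightT[i] = (float)(i / N); }
   multBlockedTransposed( left, rightT, res );
   printf( "res[0][0] = %f\n", res[0] );
   return 0;
 }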
11.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 11.2 +++ b/__brch__default Mon Sep 17 18:19:52 2012 -0700 11.3 @@ -0,0 +1,3 @@ 11.4 +This branch is for the project structure defined Jan 2012.. the #includes reflect this directory structure. 11.5 + 11.6 +The default branch of the application directory is likely to be the only branch
12.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 12.2 +++ b/main.c Mon Sep 17 18:19:52 2012 -0700 12.3 @@ -0,0 +1,480 @@ 12.4 +/* 12.5 + * 12.6 + */ 12.7 +#include <stdio.h> 12.8 +#include <stdlib.h> 12.9 +#include <string.h> 12.10 +#include <math.h> 12.11 +#include <ctype.h> 12.12 +#include <errno.h> 12.13 +#include <pthread.h> 12.14 +#include <unistd.h> 12.15 +#include "VMS_Implementations/Vthread_impl/VPThread.h" 12.16 +#include "C_Libraries/Queue_impl/PrivateQueue.h" 12.17 + 12.18 +#include <linux/perf_event.h> 12.19 +#include <linux/prctl.h> 12.20 +#include <sys/syscall.h> 12.21 + 12.22 +#undef DEBUG 12.23 +//#define DEBUG 12.24 + 12.25 +#define MEASURE_PERF 12.26 + 12.27 +#if !defined(unix) && !defined(__unix__) 12.28 +#ifdef __MACH__ 12.29 +#define unix 1 12.30 +#define __unix__ 1 12.31 +#endif /* __MACH__ */ 12.32 +#endif /* unix */ 12.33 + 12.34 +/* find the appropriate way to define explicitly sized types */ 12.35 +/* for C99 or GNU libc (also mach's libc) we can use stdint.h */ 12.36 +#if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__) 12.37 +#include <stdint.h> 12.38 +#elif defined(unix) || defined(__unix__) /* some UNIX systems have them in sys/types.h */ 12.39 +#include <sys/types.h> 12.40 +#elif defined(__WIN32__) || defined(WIN32) /* the nameless one */ 12.41 +typedef unsigned __int8 uint8_t; 12.42 +typedef unsigned __int32 uint32_t; 12.43 +#endif /* sized type detection */ 12.44 + 12.45 +/* provide a millisecond-resolution timer for each system */ 12.46 +#if defined(unix) || defined(__unix__) 12.47 +#include <time.h> 12.48 +#include <sys/time.h> 12.49 +unsigned long get_msec(void) { 12.50 + static struct timeval timeval, first_timeval; 12.51 + 12.52 + gettimeofday(&timeval, 0); 12.53 + if(first_timeval.tv_sec == 0) { 12.54 + first_timeval = timeval; 12.55 + return 0; 12.56 + } 12.57 + return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000; 12.58 +} 12.59 +#elif defined(__WIN32__) || defined(WIN32) 12.60 +#include <windows.h> 12.61 +unsigned long get_msec(void) { 12.62 + return GetTickCount(); 12.63 +} 12.64 +#else 12.65 +//#error "I don't know how to measure time on your platform" 12.66 +#endif 12.67 + 12.68 +//======================== Defines ========================= 12.69 +typedef struct perfData measurement_t; 12.70 +struct perfData{ 12.71 + uint64 cycles; 12.72 + uint64 instructions; 12.73 +}; 12.74 + 12.75 +const char *usage = { 12.76 + "Usage: malloc_test [options]\n" 12.77 + " Spawns a number of threads and allocates memory.\n\n" 12.78 + "Options:\n" 12.79 + " -t <num> how many threads to use (default: 1). 
This is internally multiplied by the number of cores.\n" 12.80 + " -o <num> repeat workload and sync operation <m> times\n" 12.81 + " -i <num> size of workload, repeat <n> times\n" 12.82 + " -h this help screen\n\n" 12.83 +}; 12.84 + 12.85 +struct barrier_t 12.86 +{ 12.87 + int counter; 12.88 + int nthreads; 12.89 + int32 mutex; 12.90 + int32 cond; 12.91 + measurement_t endBarrierCycles; 12.92 + 12.93 +}; 12.94 +typedef struct barrier_t barrier; 12.95 + 12.96 +void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr) 12.97 + { 12.98 + barr->counter = 0; 12.99 + barr->nthreads = nthreads; 12.100 + barr->mutex = VPThread__make_mutex(animatingPr); 12.101 + barr->cond = VPThread__make_cond(barr->mutex, animatingPr); 12.102 + } 12.103 + 12.104 +int cycles_counter_main_fd; 12.105 +void inline barrier_wait(barrier *barr, VirtProcr *animatingPr) 12.106 + { int i; 12.107 + 12.108 + VPThread__mutex_lock(barr->mutex, animatingPr); 12.109 + barr->counter++; 12.110 + if(barr->counter == barr->nthreads) 12.111 + { 12.112 +#ifdef MEASURE_PERF 12.113 + read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \ 12.114 + sizeof(barr->endBarrierCycles.cycles)); 12.115 +#endif 12.116 + 12.117 + barr->counter = 0; 12.118 + for(i=0; i < barr->nthreads; i++) 12.119 + VPThread__cond_signal(barr->cond, animatingPr); 12.120 + } 12.121 + else 12.122 + { VPThread__cond_wait(barr->cond, animatingPr); 12.123 + } 12.124 + VPThread__mutex_unlock(barr->mutex, animatingPr); 12.125 + } 12.126 + 12.127 + 12.128 + 12.129 +typedef struct 12.130 + { struct barrier_t* barrier; 12.131 + uint64_t totalWorkCycles; 12.132 + uint64_t totalBadCycles; 12.133 + uint64_t totalSyncCycles; 12.134 + uint64_t totalBadSyncCycles; 12.135 + uint64 numGoodSyncs; 12.136 + uint64 numGoodTasks; 12.137 + } 12.138 +WorkerParams; 12.139 + 12.140 + 12.141 +typedef struct 12.142 + { measurement_t *startExeCycles; 12.143 + measurement_t *endExeCycles; 12.144 + } 12.145 +BenchParams; 12.146 + 12.147 +//======================== Globals ========================= 12.148 +char __ProgrammName[] = "overhead_test"; 12.149 +char __DataSet[255]; 12.150 + 12.151 +int outer_iters, inner_iters, num_threads; 12.152 +size_t chunk_size = 0; 12.153 + 12.154 +int cycles_counter_fd[NUM_CORES]; 12.155 +struct perf_event_attr* hw_event; 12.156 + 12.157 +WorkerParams *workerParamsArray; 12.158 + 12.159 +//======================== App Code ========================= 12.160 +/* 12.161 + * Workload 12.162 + */ 12.163 + 12.164 +#define saveCyclesAndInstrs(core,cycles) do{ \ 12.165 + int cycles_fd = cycles_counter_fd[core]; \ 12.166 + int nread; \ 12.167 + \ 12.168 + nread = read(cycles_fd,&(cycles),sizeof(cycles)); \ 12.169 + if(nread<0){ \ 12.170 + perror("Error reading cycles counter"); \ 12.171 + cycles = 0; \ 12.172 + } \ 12.173 +} while (0) //macro magic for scoping 12.174 + 12.175 + 12.176 +double 12.177 +worker_TLF(void* _params, VirtProcr* animatingPr) 12.178 + { 12.179 + int i,o; 12.180 + WorkerParams* params = (WorkerParams*)_params; 12.181 + unsigned int totalWorkCycles = 0, totalBadCycles = 0; 12.182 + unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0; 12.183 + unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0; 12.184 + double workspace2=0.0; 12.185 + int32 privateMutex = VPThread__make_mutex(animatingPr); 12.186 + 12.187 + int cpuid = sched_getcpu(); 12.188 + 12.189 + measurement_t startWorkload, endWorkload, startWorkload2, endWorkload2; 12.190 + uint64 numCycles; 12.191 + for(o=0; o < outer_iters; o++) 12.192 + { 12.193 
+#ifdef MEASURE_PERF 12.194 + saveCyclesAndInstrs(cpuid,startWorkload.cycles); 12.195 +#endif 12.196 + 12.197 + //workload task 12.198 + for(i=0; i < inner_iters; i++) 12.199 + { 12.200 + workspace1 += (workspace1 + 32)/2; 12.201 + workspace2 += (workspace2 + 23.2)/1.4; 12.202 + } 12.203 + 12.204 +#ifdef MEASURE_PERF 12.205 + saveCyclesAndInstrs(cpuid,endWorkload.cycles); 12.206 + numCycles = endWorkload.cycles - startWorkload.cycles; 12.207 + //sanity check (400K is about 20K iters) 12.208 + if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;} 12.209 + else {totalBadCycles += numCycles; } 12.210 +#endif 12.211 + 12.212 + //mutex access often causes switch to different Slave VP 12.213 + VPThread__mutex_lock(privateMutex, animatingPr); 12.214 + 12.215 +/* 12.216 + saveCyclesAndInstrs(cpuid,startWorkload2.cycles); 12.217 + //Task 12.218 + for(i=0; i < inner_iters; i++) 12.219 + { 12.220 + workspace1 += (workspace1 + 32)/2; 12.221 + workspace2 += (workspace2 + 23.2)/1.4; 12.222 + } 12.223 + 12.224 + saveCyclesAndInstrs(cpuid,endWorkload2.cycles); 12.225 + numCycles = endWorkload2.cycles - startWorkload2.cycles; 12.226 + //sanity check (400K is about 20K iters) 12.227 + if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;} 12.228 + else {totalBadCycles += numCycles; } 12.229 + 12.230 +*/ 12.231 + VPThread__mutex_unlock(privateMutex, animatingPr); 12.232 + } 12.233 + 12.234 + params->totalWorkCycles = totalWorkCycles; 12.235 + params->totalBadCycles = totalBadCycles; 12.236 + params->numGoodTasks = numGoodTasks; 12.237 + params->totalSyncCycles = totalSyncCycles; 12.238 + params->totalBadSyncCycles = totalBadSyncCycles; 12.239 + params->numGoodSyncs = numGoodSyncs; 12.240 +/* 12.241 + params->totalSyncCycles = VMS__give_num_plugin_cycles(); 12.242 + params->totalBadSyncCycles = 0; 12.243 + params->numGoodSyncs = VMS__give_num_plugin_animations(); 12.244 +*/ 12.245 + 12.246 + 12.247 + //Wait for all threads to end 12.248 + barrier_wait(params->barrier, animatingPr); 12.249 + 12.250 + //Shutdown worker 12.251 + VPThread__dissipate_thread(animatingPr); 12.252 + 12.253 + //below return never reached --> there for gcc 12.254 + return (workspace1 + workspace2); //to prevent gcc from optimizing work out 12.255 + } 12.256 + 12.257 + 12.258 +/* this is run after the VMS is set up */ 12.259 +void benchmark(void *_params, VirtProcr *animatingPr) 12.260 + { 12.261 + int i, cpuID; 12.262 + struct barrier_t barr; 12.263 + BenchParams *params; 12.264 + 12.265 + params = (BenchParams *)_params; 12.266 + 12.267 + barrier_init(&barr, num_threads+1, animatingPr); 12.268 + 12.269 + //prepare input 12.270 + for(i=0; i<num_threads; i++) 12.271 + { 12.272 + workerParamsArray[i].barrier = &barr; 12.273 + } 12.274 + 12.275 + //save cycles before execution of threads, to get total exe cycles 12.276 + measurement_t *startExeCycles, *endExeCycles; 12.277 + startExeCycles = params->startExeCycles; 12.278 + 12.279 +#ifdef MEASURE_PERF 12.280 + int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles), 12.281 + sizeof(startExeCycles->cycles)); 12.282 + if(nread<0) perror("Error reading cycles counter"); 12.283 +#endif 12.284 + 12.285 + //create (which starts running) all threads 12.286 + for(i=0; i<num_threads; i++) 12.287 + { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr); 12.288 + } 12.289 + //wait for all threads to finish 12.290 + barrier_wait(&barr, animatingPr); 12.291 + 12.292 +#ifdef MEASURE_PERF 12.293 + //endBarrierCycles read in 
barrier_wait()! Merten, email me if you want to change this 12.294 + params->endExeCycles->cycles = barr.endBarrierCycles.cycles; 12.295 +#endif 12.296 + 12.297 + 12.298 +/* 12.299 + uint64_t overallWorkCycles = 0; 12.300 + for(i=0; i<num_threads; i++){ 12.301 + printf("WorkCycles: %lu\n",input[i].totalWorkCycles); 12.302 + overallWorkCycles += input[i].totalWorkCycles; 12.303 + } 12.304 + 12.305 + printf("Sum across threads of work cycles: %lu\n", overallWorkCycles); 12.306 + printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles); 12.307 + printf("Runtime/Workcycle Ratio %lu\n", 12.308 + ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles); 12.309 +*/ 12.310 + 12.311 + //====================================================== 12.312 + 12.313 + VPThread__dissipate_thread(animatingPr); 12.314 + } 12.315 + 12.316 +int main(int argc, char **argv) 12.317 + { 12.318 + int i; 12.319 + 12.320 + //set global static variables, based on cmd-line args 12.321 + for(i=1; i<argc; i++) 12.322 + { 12.323 + if(argv[i][0] == '-' && argv[i][2] == 0) 12.324 + { 12.325 + switch(argv[i][1]) 12.326 + { 12.327 + case 't': 12.328 + if(!isdigit(argv[++i][0])) 12.329 + { 12.330 + fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n"); 12.331 + return EXIT_FAILURE; 12.332 + } 12.333 + num_threads = atoi(argv[i]); 12.334 + if(!num_threads) 12.335 + { 12.336 + fprintf(stderr, "invalid number of threads specified: %d\n", num_threads); 12.337 + return EXIT_FAILURE; 12.338 + } 12.339 + break; 12.340 + case 'o': 12.341 + if(!isdigit(argv[++i][0])) 12.342 + { 12.343 + fputs("-o must be followed by a number\n", stderr); 12.344 + return EXIT_FAILURE; 12.345 + } 12.346 + outer_iters = atoi(argv[i]); 12.347 + break; 12.348 + case 'i': 12.349 + if(!isdigit(argv[++i][0])) 12.350 + { 12.351 + fputs("-i must be followed by a number (workload size)\n", stderr); 12.352 + return EXIT_FAILURE; 12.353 + } 12.354 + inner_iters = atoi(argv[i]); 12.355 + break; 12.356 + case 'h': 12.357 + fputs(usage, stdout); 12.358 + return 0; 12.359 + 12.360 + default: 12.361 + fprintf(stderr, "unrecognized argument: %s\n", argv[i]); 12.362 + fputs(usage, stderr); 12.363 + return EXIT_FAILURE; 12.364 + }//switch 12.365 + }//if arg 12.366 + else 12.367 + { 12.368 + fprintf(stderr, "unrecognized argument: %s\n", argv[i]); 12.369 + fputs(usage, stderr); 12.370 + return EXIT_FAILURE; 12.371 + } 12.372 + }//for 12.373 + 12.374 + 12.375 +#ifdef MEASURE_PERF 12.376 + //setup performance counters 12.377 + hw_event = malloc(sizeof(struct perf_event_attr)); 12.378 + memset(hw_event,0,sizeof(struct perf_event_attr)); 12.379 + 12.380 + hw_event->type = PERF_TYPE_HARDWARE; 12.381 + hw_event->size = sizeof(struct perf_event_attr); 12.382 + hw_event->disabled = 0; 12.383 + hw_event->freq = 0; 12.384 + hw_event->inherit = 1; /* children inherit it */ 12.385 + hw_event->pinned = 1; /* says this virt counter must always be on HW */ 12.386 + hw_event->exclusive = 0; /* only group on PMU */ 12.387 + hw_event->exclude_user = 0; /* don't count user */ 12.388 + hw_event->exclude_kernel = 1; /* don't count kernel */ 12.389 + hw_event->exclude_hv = 1; /* ditto hypervisor */ 12.390 + hw_event->exclude_idle = 1; /* don't count when idle */ 12.391 + hw_event->mmap = 0; /* include mmap data */ 12.392 + hw_event->comm = 0; /* include comm data */ 12.393 + 12.394 + hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles 12.395 + 12.396 + int cpuID, retries; 12.397 + 12.398 + for( cpuID = 0; cpuID < NUM_CORES; cpuID++ ) 12.399 + { retries = 0; 12.400 
+ do 12.401 + { retries += 1; 12.402 + cycles_counter_fd[cpuID] = 12.403 + syscall(__NR_perf_event_open, hw_event, 12.404 + 0,//pid_t: 0 is "pid of calling process" 12.405 + cpuID,//int: cpu, the value returned by "CPUID" instr(?) 12.406 + -1,//int: group_fd, -1 is "leader" or independent 12.407 + 0//unsigned long: flags 12.408 + ); 12.409 + } 12.410 + while(cycles_counter_fd[cpuID]<0 && retries < 100); 12.411 + if(retries >= 100) 12.412 + { 12.413 + fprintf(stderr,"On core %d: ",cpuID); 12.414 + perror("Failed to open cycles counter"); 12.415 + } 12.416 + } 12.417 + 12.418 + //Set up counter to accumulate total cycles to process, across all CPUs 12.419 + 12.420 + retries = 0; 12.421 + do 12.422 + { retries += 1; 12.423 + cycles_counter_main_fd = 12.424 + syscall(__NR_perf_event_open, hw_event, 12.425 + 0,//pid_t: 0 is "pid of calling process" 12.426 + -1,//int: cpu, -1 means accumulate from all cores 12.427 + -1,//int: group_fd, -1 is "leader" == independent 12.428 + 0//unsigned long: flags 12.429 + ); 12.430 + } 12.431 + while(cycles_counter_main_fd<0 && retries < 100); 12.432 + if(retries >= 100) 12.433 + { 12.434 + fprintf(stderr,"in main "); 12.435 + perror("Failed to open cycles counter"); 12.436 + } 12.437 +#endif 12.438 + 12.439 + measurement_t startExeCycles, endExeCycles; 12.440 + BenchParams *benchParams; 12.441 + 12.442 + benchParams = malloc(sizeof(BenchParams)); 12.443 + 12.444 + benchParams->startExeCycles = &startExeCycles; 12.445 + benchParams->endExeCycles = &endExeCycles; 12.446 + 12.447 + workerParamsArray = (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) ); 12.448 + if(workerParamsArray == NULL ) printf("error mallocing worker params array\n"); 12.449 + 12.450 + 12.451 + //This is the transition to the VMS runtime 12.452 + VPThread__create_seed_procr_and_do_work( &benchmark, benchParams ); 12.453 + 12.454 +#ifdef MEASURE_PERF 12.455 + uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0; 12.456 + uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0; 12.457 + for(i=0; i<num_threads; i++){ 12.458 + printf("WorkCycles: %lu\n",workerParamsArray[i].totalWorkCycles); 12.459 +// printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks); 12.460 +// printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles); 12.461 +// printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs); 12.462 + totalWorkCyclesAcrossCores += workerParamsArray[i].totalWorkCycles; 12.463 + totalBadCyclesAcrossCores += workerParamsArray[i].totalBadCycles; 12.464 + totalSyncCyclesAcrossCores += workerParamsArray[i].totalSyncCycles; 12.465 + totalBadSyncCyclesAcrossCores += workerParamsArray[i].totalBadSyncCycles; 12.466 + } 12.467 + 12.468 + uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles; 12.469 + totalExeCycles -= totalBadCyclesAcrossCores; 12.470 + uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores; 12.471 + int32 numSyncs = outer_iters * num_threads * 2; 12.472 + printf("Total Execution Cycles: %lu\n", totalExeCycles); 12.473 + printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores); 12.474 + printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores); 12.475 +// printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores); 12.476 + printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs ); 12.477 + printf("ExeCycles/WorkCycles Ratio %f\n", 12.478 + (double)totalExeCycles / 
(double)totalWorkCyclesAcrossCores); 12.479 +#else 12.480 + printf("No measurement done!\n"); 12.481 +#endif 12.482 + return 0; 12.483 + }
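The measurement scheme in main.c is the stock Linux perf_event pattern: open one PERF_COUNT_HW_CPU_CYCLES counter per target with the perf_event_open syscall, then read() a 64-bit count before and after the region of interest, as saveCyclesAndInstrs does above. A stripped-down stand-alone sketch of just that pattern -- the spin workload is arbitrary and error handling is minimal:

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main( void )
 { struct perf_event_attr attr;
   uint64_t start, end;
   volatile double w = 0.0;
   int i, fd;

   memset( &attr, 0, sizeof(attr) );
   attr.type = PERF_TYPE_HARDWARE;
   attr.size = sizeof(attr);
   attr.config = PERF_COUNT_HW_CPU_CYCLES;
   attr.exclude_kernel = 1;          /* user-mode cycles only */
   attr.exclude_hv = 1;

   /* pid 0 = calling process, cpu -1 = any CPU, group -1 = independent */
   fd = syscall( __NR_perf_event_open, &attr, 0, -1, -1, 0 );
   if( fd < 0 ) { perror( "perf_event_open" ); return 1; }

   read( fd, &start, sizeof(start) );
   for( i = 0; i < 1000000; i++ ) w += (w + 23.2) / 1.4;  /* same flavor of workload as worker_TLF */
   read( fd, &end, sizeof(end) );

   printf( "cycles: %llu\n", (unsigned long long)(end - start) );
   return 0;
 }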
13.1 --- a/src/Application/Matrix_Mult.c Fri Sep 16 20:12:34 2011 +0200 13.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 13.3 @@ -1,167 +0,0 @@ 13.4 -/* 13.5 - * Copyright 2009 OpenSourceStewardshipFoundation.org 13.6 - * Licensed under GNU General Public License version 2 13.7 - * 13.8 - * Author: seanhalle@yahoo.com 13.9 - * 13.10 - * Created on November 15, 2009, 2:35 AM 13.11 - */ 13.12 - 13.13 -#include <malloc.h> 13.14 -#include <stdlib.h> 13.15 - 13.16 -#include "Matrix_Mult.h" 13.17 -#include "ParamHelper/Param.h" 13.18 - 13.19 - 13.20 - 13.21 - void 13.22 -initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 13.23 - ParamBag *paramBag ) 13.24 - { char *leftMatrixFileName, *rightMatrixFileName; 13.25 - int leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols; 13.26 - 13.27 - ParamStruc *param; 13.28 - param = getParamFromBag( "leftMatrixRows", paramBag ); 13.29 - leftMatrixRows = param->intValue; 13.30 - param = getParamFromBag( "leftMatrixCols", paramBag ); 13.31 - leftMatrixCols = param->intValue; 13.32 - *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols ); 13.33 - 13.34 - param = getParamFromBag( "leftMatrixFileName", paramBag ); 13.35 - leftMatrixFileName = param->strValue; //no need to copy 13.36 - read_Matrix_From_File( *leftMatrix, leftMatrixFileName ); 13.37 - 13.38 - param = getParamFromBag( "rightMatrixRows", paramBag ); 13.39 - rightMatrixRows = param->intValue; 13.40 - param = getParamFromBag( "rightMatrixCols", paramBag ); 13.41 - rightMatrixCols = param->intValue; 13.42 - *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols ); 13.43 - 13.44 - param = getParamFromBag( "rightMatrixFileName", paramBag ); 13.45 - rightMatrixFileName = param->strValue; 13.46 - read_Matrix_From_File( *rightMatrix, rightMatrixFileName ); 13.47 - } 13.48 - 13.49 - 13.50 -void parseLineIntoRow( char *line, float32* row ); 13.51 - 13.52 - 13.53 - void 13.54 -read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ) 13.55 - { int row, maxRead, numRows, numCols; 13.56 - float32 *matrixStart; 13.57 - size_t lineSz = 0; 13.58 - FILE *file; 13.59 - char *line = NULL; 13.60 - 13.61 - lineSz = 50000; //max length of line in a matrix data file 13.62 - line = (char *) malloc( lineSz ); 13.63 - if( line == NULL ) printf( "no mem for matrix line" ); 13.64 - 13.65 - numRows = matrixStruc->numRows; 13.66 - numCols = matrixStruc->numCols; 13.67 - matrixStart = matrixStruc->array; 13.68 - 13.69 - file = fopen( matrixFileName, "r" ); 13.70 - if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} 13.71 - fseek( file, 0, SEEK_SET ); 13.72 - for( row = 0; row < numRows; row++ ) 13.73 - { 13.74 - if( feof( file ) ) printf( "file ran out too soon" ); 13.75 - maxRead = getline( &line, &lineSz, file ); 13.76 - if( maxRead == -1 ) printf( "prob reading mat line"); 13.77 - 13.78 - if( *line == '\n') continue; //blank line 13.79 - if( *line == '/' ) continue; //comment line 13.80 - 13.81 - parseLineIntoRow( line, matrixStart + row * numCols ); 13.82 - } 13.83 - free( line ); 13.84 - } 13.85 - 13.86 -/*This function relies on each line having the proper number of cols. 
It 13.87 - * doesn't check, nor enforce, so if the file is improperly formatted it 13.88 - * can write over unrelated memory 13.89 - */ 13.90 - void 13.91 -parseLineIntoRow( char *line, float32* row ) 13.92 - { 13.93 - char *valueStr, *searchPos; 13.94 - 13.95 - //read the float values 13.96 - searchPos = valueStr = line; //start 13.97 - 13.98 - for( ; *searchPos != 0; searchPos++) //bit dangerous, should use buff len 13.99 - { 13.100 - if( *searchPos == '\n' ) //last col.. relying on well-formatted file 13.101 - { *searchPos = 0; 13.102 - *row = atof( valueStr ); 13.103 - break; //end FOR loop 13.104 - } 13.105 - if( *searchPos == ',' ) 13.106 - { *searchPos = 0; //mark end of string 13.107 - *row = (float32) atof( valueStr ); 13.108 - row += 1; //address arith 13.109 - //skip any spaces before digits.. use searchPos + 1 to skip the 0 13.110 - for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++); 13.111 - valueStr = searchPos + 1; 13.112 - } 13.113 - } 13.114 - } 13.115 - 13.116 - //========================================================================== 13.117 - 13.118 -/*In the "_Flat" version of constructor, do only malloc of the top data struc 13.119 - * and set values in that top-level. Don't malloc any sub-structures. 13.120 - */ 13.121 - Matrix * 13.122 -makeMatrix_Flat( int32 numRows, int32 numCols ) 13.123 - { Matrix * retMatrix; 13.124 - retMatrix = malloc( sizeof( Matrix ) ); 13.125 - retMatrix->numRows = numRows; 13.126 - retMatrix->numCols = numCols; 13.127 - 13.128 - return retMatrix; 13.129 - } 13.130 - 13.131 - Matrix * 13.132 -makeMatrix_WithResMat( int32 numRows, int32 numCols ) 13.133 - { Matrix * retMatrix; 13.134 - retMatrix = malloc( sizeof( Matrix ) ); 13.135 - retMatrix->numRows = numRows; 13.136 - retMatrix->numCols = numCols; 13.137 - retMatrix->array = malloc( numRows * numCols * sizeof(float32) ); 13.138 - 13.139 - return retMatrix; 13.140 - } 13.141 - 13.142 - void 13.143 -freeMatrix_Flat( Matrix * matrix ) 13.144 - { //( matrix ); 13.145 - } 13.146 - void 13.147 -freeMatrix( Matrix * matrix ) 13.148 - { free( matrix->array ); 13.149 - free( matrix ); 13.150 - } 13.151 - 13.152 -void 13.153 -printMatrix( Matrix *matrix ) 13.154 - { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; 13.155 - float32 *matrixArray; 13.156 - 13.157 - numRows = rowsToPrint = matrix->numRows; 13.158 - numCols = colsToPrint = matrix->numCols; 13.159 - matrixArray = matrix->array; 13.160 - 13.161 - rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed 13.162 - colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed 13.163 - for( r = 0; r < numRows; r += rowIncr ) 13.164 - { for( c = 0; c < numCols; c += colIncr ) 13.165 - { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); 13.166 - } 13.167 - printf("\n"); 13.168 - } 13.169 - } 13.170 -
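The comment above warns that parseLineIntoRow neither checks nor enforces the column count, so a malformed file can write over unrelated memory. The obvious hardening is to pass the expected count and stop at it; a sketch using strtof(), where parseLineIntoRow_safe and its numCols parameter are illustrative names, not from the repo:

#include <stdlib.h>

/* parse at most numCols comma-separated floats from line into row */
void
parseLineIntoRow_safe( const char *line, float *row, int numCols )
 { char *next;
   int col;
   for( col = 0; col < numCols; col++ )
    { row[col] = strtof( line, &next );
      if( next == line ) return;       /* no number parsed -- stop early */
      line = next;
      while( *line == ',' || *line == ' ' ) line++;   /* skip separators */
    }
 }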
14.1 --- a/src/Application/Matrix_Mult.h Fri Sep 16 20:12:34 2011 +0200 14.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 14.3 @@ -1,77 +0,0 @@ 14.4 -/* 14.5 - * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 14.6 - * Licensed under GNU General Public License version 2 14.7 - */ 14.8 - 14.9 -#ifndef MATRIX_MULT_H_ 14.10 -#define MATRIX_MULT_H_ 14.11 - 14.12 -#include <stdio.h> 14.13 -#include <unistd.h> 14.14 -#include <malloc.h> 14.15 - 14.16 -#include "../VPThread_lib/VMS/VMS_primitive_data_types.h" 14.17 -#include "ParamHelper/Param.h" 14.18 - 14.19 -//============================== Structures ============================== 14.20 - 14.21 -typedef 14.22 -struct 14.23 - { int32 numRows; 14.24 - int32 numCols; 14.25 - float32 *array; //2D, but dynamically sized, so use addr arith 14.26 - } 14.27 -Matrix; 14.28 - 14.29 -/* This is the "appSpecificPiece" that is carried inside a DKUPiece. 14.30 - * In the DKUPiece data struc it is declared to be of type "void *". This 14.31 - * allows the application to define any data structure it wants and put it 14.32 - * into a DKUPiece. 14.33 - * When the app specific info is used, it is in app code, so it is cast to 14.34 - * the correct type to tell the compiler how to access fields. 14.35 - * This keeps all app-specific things out of the DKU directory, as per the 14.36 - * DKU standard. */ 14.37 -typedef 14.38 -struct 14.39 - { 14.40 - // pointers to shared data.. the result matrix must be created when the 14.41 - // left and right matrices are put into the root ancestor DKUPiece. 14.42 - Matrix * leftMatrix; 14.43 - Matrix * rightMatrix; 14.44 - Matrix * resultMatrix; 14.45 - 14.46 - // define the starting and ending boundaries for this piece of the 14.47 - // result matrix. These are derivable from the left and right 14.48 - // matrices, but included them for readability of code. 14.49 - int prodStartRow, prodEndRow; 14.50 - int prodStartCol, prodEndCol; 14.51 - // Start and end of the portion of the left matrix that contributes to 14.52 - // this piece of the product 14.53 - int leftStartRow, leftEndRow; 14.54 - int leftStartCol, leftEndCol; 14.55 - // Start and end of the portion of the right matrix that contributes to 14.56 - // this piece of the product 14.57 - int rightStartRow, rightEndRow; 14.58 - int rightStartCol, rightEndCol; 14.59 - } 14.60 -MatrixProdPiece; 14.61 - 14.62 -//============================== Functions ================================ 14.63 -void readFile(); 14.64 - 14.65 -Matrix *makeMatrix( int32 numRows, int32 numCols ); 14.66 -Matrix *makeMatrix_Flat( int32 numRows, int32 numCols ); 14.67 -Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols ); 14.68 -void freeMatrix_Flat( Matrix * matrix ); 14.69 -void freeMatrix( Matrix * matrix ); 14.70 -void printMatrix( Matrix *matrix ); 14.71 - 14.72 -void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ); 14.73 - 14.74 -void 14.75 -initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 14.76 - ParamBag *paramBag ); 14.77 - 14.78 -//=========================================================================== 14.79 - 14.80 -#endif /*MATRIX_MULT_H_*/
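The Matrix struct above keeps its 2D data in one dynamically sized, row-major array and indexes it by address arithmetic. Spelling the convention out as a helper -- MAT_AT is our illustrative name, not from the header:

/* row-major element access for the Matrix struct above */
#define MAT_AT( m, r, c )  ( (m)->array[ (r) * (m)->numCols + (c) ] )

/* e.g. the inner step of a naive multiply reads:
 *    MAT_AT( res, r, c ) += MAT_AT( left, r, v ) * MAT_AT( right, v, c );  */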
15.1 --- a/src/Application/VPThread__Matrix_Mult/Divide_Pr.c Fri Sep 16 20:12:34 2011 +0200 15.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 15.3 @@ -1,646 +0,0 @@ 15.4 -/* 15.5 - * Copyright 2009 OpenSourceStewardshipFoundation.org 15.6 - * Licensed under GNU General Public License version 2 15.7 - * 15.8 - * Author: seanhalle@yahoo.com 15.9 - * 15.10 - */ 15.11 - 15.12 - 15.13 -#include "VPThread__Matrix_Mult.h" 15.14 -#include <math.h> 15.15 -#include <string.h> 15.16 -#include "../../VPThread_lib/VPThread.h" 15.17 - 15.18 - //The time to compute this many result values should equal the time to 15.19 - // perform this division on a matrix of size gives that many result calcs 15.20 - //IE, size this so that sequential time to calc equals divide time 15.21 - // find the value by experimenting -- but divide time and calc time scale 15.22 - // same way, so this value should remain valid across hardware 15.23 -#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000 15.24 - 15.25 - 15.26 -//=========================================================================== 15.27 -int inline 15.28 -measureMatrixMultPrimitive( VirtProcr *animPr ); 15.29 - 15.30 -SlicingStrucCarrier * 15.31 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 15.32 - VirtProcr *animPr ); 15.33 - 15.34 -SlicingStruc * 15.35 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 15.36 - VirtProcr *animPr ); 15.37 - 15.38 -void 15.39 -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ); 15.40 - 15.41 -SubMatrix ** 15.42 -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 15.43 - int32 numUses, Matrix *origMatrix, VirtProcr *animPr ); 15.44 - 15.45 -void 15.46 -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 15.47 - SubMatrix **subMatrices, VirtProcr *animPr ); 15.48 - 15.49 -void 15.50 -pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, 15.51 - SubMatrix **rightSubMatrices, 15.52 - int32 numRowIdxs, int32 numColIdxs, 15.53 - int32 numVecIdxs, 15.54 - VirtProcr *resultPr, 15.55 - VirtProcr *animatingPr ); 15.56 - 15.57 -void 15.58 -makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, 15.59 - SlicingStrucCarrier *slicingStrucCarrier, 15.60 - VirtProcr *resultPr, VirtProcr *animatingPr ); 15.61 - 15.62 -void inline 15.63 -copyTranspose( int32 numRows, int32 numCols, 15.64 - int32 origStartRow, int32 origStartCol, int32 origStride, 15.65 - float32 *subArray, float32 *origArray ); 15.66 - 15.67 -void inline 15.68 -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, 15.69 - int32 numResCols, 15.70 - float32 *leftArray, float32 *rightArray, 15.71 - float32 *resArray ); 15.72 - 15.73 - 15.74 - 15.75 -/*Divider creates one processor for every sub-matrix 15.76 - * It hands them: 15.77 - * the name of the result processor that they should send their results to, 15.78 - * the left and right matrices, and the rows and cols they should multiply 15.79 - * It first creates the result processor, then all the sub-matrixPair 15.80 - * processors, 15.81 - * then does a receive of a message from the result processor that gives 15.82 - * the divider ownership of the result matrix. 15.83 - * Finally, the divider returns the result matrix out of the VPThread system. 15.84 - * 15.85 - * Divider chooses the size of sub-matrices via an algorithm that tries to 15.86 - * keep the minimum work above a threshold. 
The threshold is machine- 15.87 - * dependent, so ask VPThread for min work-unit time to get a 15.88 - * given overhead 15.89 - * 15.90 - * Divide min work-unit cycles by measured-cycles for one matrix-cell 15.91 - * product -- gives the number of products need to have in min size 15.92 - * matrix. 15.93 - * 15.94 - * So then, take cubed root of this to get the size of a side of min sub- 15.95 - * matrix. That is the size of the ideal square sub-matrix -- so tile 15.96 - * up the two input matrices into ones as close as possible to that size, 15.97 - * and create the pairs of sub-matrices. 15.98 - * 15.99 - *======================== STRATEGIC OVERVIEW ======================= 15.100 - * 15.101 - *This division is a bit tricky, because have to create things in advance 15.102 - * that it's not at first obvious need to be created.. 15.103 - * 15.104 - *First slice up each dimension -- three of them.. this is because will have 15.105 - * to create the sub-matrix's data-structures before pairing the sub-matrices 15.106 - * with each other -- so, have three dimensions to slice up before can 15.107 - * create the sub-matrix data-strucs -- also, have to be certain that the 15.108 - * cols of the left input have the exact same slicing as the rows of the 15.109 - * right matrix, so just to be sure, do the slicing calc once, then use it 15.110 - * for both. 15.111 - * 15.112 - *So, goes like this: 15.113 - *1) calculate the start & end values of each dimension in each matrix. 15.114 - *2) use those values to create sub-matrix structures 15.115 - *3) combine sub-matrices into pairs, as the tasks to perform. 15.116 - * 15.117 - *Have to calculate separately from creating the sub-matrices because of the 15.118 - * nature of the nesting -- would either end up creating the same sub-matrix 15.119 - * multiple times, or else would have to put in detection of whether had 15.120 - * made a particular one already if tried to combine steps 1 and 2. 15.121 - * 15.122 - *Step 3 has to be separate because of the nesting, as well -- same reason, 15.123 - * would either create same sub-matrix multiple times, or else have to 15.124 - * add detection of whether was already created. 15.125 - * 15.126 - *Another way to look at it: there's one level of loop to divide dimensions, 15.127 - * two levels of nesting to create sub-matrices, and three levels to pair 15.128 - * up the sub-matrices. 
15.129 - */ 15.130 - 15.131 -void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, 15.132 - VirtProcr *animatingThd ) 15.133 - { VirtProcr *resultPr; 15.134 - DividerParams *dividerParams; 15.135 - ResultsParams *resultsParams; 15.136 - Matrix *leftMatrix, *rightMatrix; 15.137 - SlicingStrucCarrier *slicingStrucCarrier; 15.138 - float32 *resultArray; //points to array inside result matrix 15.139 - MatrixMultGlobals *globals; 15.140 - 15.141 - DEBUG( dbgAppFlow, "start divide\n") 15.142 - 15.143 - int32 15.144 - divideProbe = VMS__create_single_interval_probe( "divideProbe", 15.145 - animatingThd ); 15.146 - VMS__record_sched_choice_into_probe( divideProbe, animatingThd ); 15.147 - VMS__record_interval_start_in_probe( divideProbe ); 15.148 - 15.149 - //=========== Setup -- make local copies of ptd-to-things, malloc, aso 15.150 - int32 numResRows, numResCols, vectLength; 15.151 - 15.152 - dividerParams = (DividerParams *)_dividerParams; 15.153 - 15.154 - leftMatrix = dividerParams->leftMatrix; 15.155 - rightMatrix = dividerParams->rightMatrix; 15.156 - 15.157 - vectLength = leftMatrix->numCols; 15.158 - numResRows = leftMatrix->numRows; 15.159 - numResCols = rightMatrix->numCols; 15.160 - resultArray = dividerParams->resultMatrix->array; 15.161 - 15.162 - //zero the result array 15.163 - memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); 15.164 - 15.165 - //============== Do either sequential mult or do division ============== 15.166 - 15.167 - //Check if input matrices too small -- if yes, just do sequential 15.168 - //Cutoff is determined by overhead of this divider -- relatively 15.169 - // machine-independent 15.170 - if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * 15.171 - (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) 15.172 - { 15.173 - //====== Do sequential multiply on a single core 15.174 - DEBUG( dbgAppFlow, "doing sequential") 15.175 - 15.176 - //transpose the right matrix 15.177 - float32 * 15.178 - transRightArray = 15.179 - VPThread__malloc( (size_t)rightMatrix->numRows * 15.180 - (size_t)rightMatrix->numCols * 15.181 - sizeof(float32), animatingThd ); 15.182 - 15.183 - //copy values from orig matrix to local 15.184 - copyTranspose( rightMatrix->numRows, rightMatrix->numCols, 15.185 - 0, 0, rightMatrix->numRows, 15.186 - transRightArray, rightMatrix->array ); 15.187 - 15.188 - multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 15.189 - leftMatrix->array, transRightArray, 15.190 - resultArray ); 15.191 - } 15.192 - else 15.193 - { 15.194 - //====== Do parallel multiply across cores 15.195 - 15.196 - DEBUG( dbgAppFlow, "divider: do parallel mult\n") 15.197 - //Calc the ideal size of sub-matrix and slice up the dimensions of 15.198 - // the two matrices. 
15.199 - //The ideal size is the one takes the number of cycles to calculate 15.200 - // such that calc time is equal or greater than min work-unit size 15.201 - slicingStrucCarrier = 15.202 - calcIdealSizeAndSliceDimensions(leftMatrix,rightMatrix,animatingThd); 15.203 - 15.204 - //Make the results processor, now that know how many to wait for 15.205 - resultsParams = VPThread__malloc( sizeof(ResultsParams), animatingThd ); 15.206 - resultsParams->numSubMatrixPairs = 15.207 - slicingStrucCarrier->leftRowSlices->numVals * 15.208 - slicingStrucCarrier->rightColSlices->numVals * 15.209 - slicingStrucCarrier->vecSlices->numVals; 15.210 - resultsParams->dividerPr = animatingThd; 15.211 - resultsParams->numCols = rightMatrix->numCols; 15.212 - resultsParams->numRows = leftMatrix->numRows; 15.213 - resultsParams->resultArray = resultArray; 15.214 - 15.215 - //========== Set up global vars, including conds and mutexes ========== 15.216 - globals = VMS__malloc( sizeof(MatrixMultGlobals) ); 15.217 - VPThread__set_globals_to( globals ); 15.218 - 15.219 - globals->results_mutex = VPThread__make_mutex( animatingThd ); 15.220 - globals->results_cond = VPThread__make_cond( globals->results_mutex, 15.221 - animatingThd ); 15.222 - 15.223 - globals->vector_mutex = VPThread__make_mutex( animatingThd ); 15.224 - globals->vector_cond = VPThread__make_cond( globals->vector_mutex, 15.225 - animatingThd ); 15.226 - 15.227 - globals->start_mutex = VPThread__make_mutex( animatingThd ); 15.228 - globals->start_cond = VPThread__make_cond( globals->start_mutex, 15.229 - animatingThd ); 15.230 - //====================================================================== 15.231 - 15.232 - DEBUG( dbgAppFlow, "divider: made mutexes and conds\n") 15.233 - //get results-comm lock before create results-thd, to ensure it can't 15.234 - // signal that results are available before this thd is waiting on cond 15.235 - VPThread__mutex_lock( globals->results_mutex, animatingThd ); 15.236 - 15.237 - //also get the start lock & use to ensure no vector threads send a 15.238 - // signal before the results thread is waiting on vector cond 15.239 - VPThread__mutex_lock( globals->start_mutex, animatingThd ); 15.240 - 15.241 - 15.242 - DEBUG( dbgAppFlow, "divider: make result thread\n") 15.243 - VPThread__create_thread( &gatherResults, resultsParams, animatingThd ); 15.244 - 15.245 - //Now wait for results thd to signal that it has vector lock 15.246 - VPThread__cond_wait( globals->start_cond, animatingThd ); 15.247 - VPThread__mutex_unlock( globals->start_mutex, animatingThd );//done w/lock 15.248 - DEBUG( dbgAppFlow, "divider: make sub-matrices\n") 15.249 - 15.250 - //Make the sub-matrices, and pair them up, and make processor to 15.251 - // calc product of each pair. 
15.252 - makeSubMatricesAndProcrs( leftMatrix, rightMatrix, 15.253 - slicingStrucCarrier, 15.254 - resultPr, animatingThd); 15.255 - 15.256 - //Wait for results thread to say results are good 15.257 - VPThread__cond_wait( globals->results_cond, animatingThd ); 15.258 - } 15.259 - 15.260 - 15.261 - //=============== Work done -- send results back ================= 15.262 - 15.263 - 15.264 - DEBUG( dbgAppFlow, "end divide\n") 15.265 - 15.266 - VMS__record_interval_end_in_probe( divideProbe ); 15.267 - VMS__print_stats_of_all_probes(); 15.268 - 15.269 - //nothing left to do so dissipate, VPThread will wait to shutdown, 15.270 - // making results available to outside, until all the processors have 15.271 - // dissipated -- so actually no need to wait for results processor 15.272 - //However, following the pattern, so done with comm, release lock 15.273 - VPThread__mutex_unlock( globals->results_mutex, animatingThd ); 15.274 - 15.275 - VPThread__dissipate_thread( animatingThd ); //all procrs dissipate self at end 15.276 - //when all of the processors have dissipated, the "create seed and do 15.277 - // work" call in the entry point function returns 15.278 - } 15.279 - 15.280 - 15.281 -SlicingStrucCarrier * 15.282 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 15.283 - VirtProcr *animPr ) 15.284 - { 15.285 - float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; 15.286 - SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 15.287 - SlicingStrucCarrier *slicingStrucCarrier = 15.288 - VPThread__malloc(sizeof(SlicingStrucCarrier), animPr); 15.289 - 15.290 - int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; 15.291 - float64 numPrimitiveOpsInMinWorkUnit; 15.292 - 15.293 - 15.294 - //======= Calc ideal size of min-sized sub-matrix ======== 15.295 - 15.296 - //ask VPThread for the number of cycles of the minimum work unit, at given 15.297 - // percent overhead then add a guess at overhead from this divider 15.298 - minWorkUnitCycles = VPThread__giveMinWorkUnitCycles( .05 ); 15.299 - 15.300 - //ask VPThread for number of cycles of the "primitive" op of matrix mult 15.301 - primitiveCycles = measureMatrixMultPrimitive( animPr ); 15.302 - 15.303 - numPrimitiveOpsInMinWorkUnit = 15.304 - (float64)minWorkUnitCycles / (float64)primitiveCycles; 15.305 - 15.306 - //take cubed root -- that's number of these in a "side" of sub-matrix 15.307 - // then multiply by 5 because the primitive is 5x5 15.308 - idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); 15.309 - 15.310 - idealNumWorkUnits = VPThread__giveIdealNumWorkUnits(); 15.311 - 15.312 - idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); 15.313 - idealSizeOfSide2 *= 0.6; //finer granularity to help load balance 15.314 - 15.315 - if( idealSizeOfSide1 > idealSizeOfSide2 ) 15.316 - idealSizeOfSide = idealSizeOfSide1; 15.317 - else 15.318 - idealSizeOfSide = idealSizeOfSide2; 15.319 - 15.320 - //The multiply inner loop blocks the array to fit into L1 cache 15.321 -// if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK; 15.322 - 15.323 - //============ Slice up dimensions, now that know target size =========== 15.324 - 15.325 - //Tell the slicer the target size of a side (floating pt), the start 15.326 - // value to start slicing at, and the end value to stop slicing at 15.327 - //It returns an array of start value of each 
chunk, plus number of them 15.328 - int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol; 15.329 - startLeftRow = 0; 15.330 - endLeftRow = leftMatrix->numRows -1; 15.331 - startVec = 0; 15.332 - endVec = leftMatrix->numCols -1; 15.333 - startRightCol = 0; 15.334 - endRightCol = rightMatrix->numCols -1; 15.335 - 15.336 - leftRowSlices = 15.337 - sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr ); 15.338 - 15.339 - vecSlices = 15.340 - sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr ); 15.341 - 15.342 - rightColSlices = 15.343 - sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr); 15.344 - 15.345 - slicingStrucCarrier->leftRowSlices = leftRowSlices; 15.346 - slicingStrucCarrier->vecSlices = vecSlices; 15.347 - slicingStrucCarrier->rightColSlices = rightColSlices; 15.348 - 15.349 - return slicingStrucCarrier; 15.350 - } 15.351 - 15.352 - 15.353 -void 15.354 -makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, 15.355 - SlicingStrucCarrier *slicingStrucCarrier, 15.356 - VirtProcr *resultPr, VirtProcr *animPr ) 15.357 - { 15.358 - SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 15.359 - 15.360 - leftRowSlices = slicingStrucCarrier->leftRowSlices; 15.361 - vecSlices = slicingStrucCarrier->vecSlices; 15.362 - rightColSlices = slicingStrucCarrier->rightColSlices; 15.363 - VPThread__free( slicingStrucCarrier, animPr ); 15.364 - 15.365 - //================ Make sub-matrices, given the slicing ================ 15.366 - SubMatrix **leftSubMatrices, **rightSubMatrices; 15.367 - leftSubMatrices = 15.368 - createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals, 15.369 - leftMatrix, animPr ); 15.370 - //double_check_that_always_numRows_in_right_same_as_numCols_in_left(); 15.371 - rightSubMatrices = 15.372 - createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals, 15.373 - rightMatrix, animPr ); 15.374 - 15.375 - 15.376 - //============== pair the sub-matrices and make processors ============== 15.377 - int32 numRowIdxs, numColIdxs, numVecIdxs; 15.378 - 15.379 - numRowIdxs = leftRowSlices->numVals; 15.380 - numColIdxs = rightColSlices->numVals; 15.381 - numVecIdxs = vecSlices->numVals; 15.382 - 15.383 - freeSlicingStruc( leftRowSlices, animPr ); 15.384 - freeSlicingStruc( vecSlices, animPr ); 15.385 - freeSlicingStruc( rightColSlices, animPr ); 15.386 - 15.387 - pairUpSubMatricesAndMakeProcessors( leftSubMatrices, 15.388 - rightSubMatrices, 15.389 - numRowIdxs, numColIdxs, 15.390 - numVecIdxs, 15.391 - resultPr, 15.392 - animPr ); 15.393 - } 15.394 - 15.395 - 15.396 - 15.397 - 15.398 -void 15.399 -pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, 15.400 - SubMatrix **rightSubMatrices, 15.401 - int32 numRowIdxs, int32 numColIdxs, 15.402 - int32 numVecIdxs, 15.403 - VirtProcr *resultPr, 15.404 - VirtProcr *animatingPr ) 15.405 - { 15.406 - int32 resRowIdx, resColIdx, vecIdx; 15.407 - int32 numLeftColIdxs, numRightColIdxs; 15.408 - int32 leftRowIdxOffset; 15.409 - SMPairParams *subMatrixPairParams; 15.410 - float32 numToPutOntoEachCore, leftOverFraction; 15.411 - int32 numCores, coreToScheduleOnto, numVecOnCurrCore; 15.412 - 15.413 - numLeftColIdxs = numColIdxs; 15.414 - numRightColIdxs = numVecIdxs; 15.415 - 15.416 - numCores = VPThread__give_number_of_cores_to_schedule_onto(); 15.417 - 15.418 - numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; 15.419 - leftOverFraction = 0; 15.420 - numVecOnCurrCore = 0; 15.421 - coreToScheduleOnto = 0; 15.422 - 15.423 - for( resRowIdx 
= 0; resRowIdx < numRowIdxs; resRowIdx++ ) 15.424 - { 15.425 - leftRowIdxOffset = resRowIdx * numLeftColIdxs; 15.426 - 15.427 - for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) 15.428 - { 15.429 - 15.430 - for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) 15.431 - { 15.432 - //Make the processor for the pair of sub-matrices 15.433 - subMatrixPairParams = VPThread__malloc( sizeof(SMPairParams), 15.434 - animatingPr); 15.435 - subMatrixPairParams->leftSubMatrix = 15.436 - leftSubMatrices[ leftRowIdxOffset + vecIdx ]; 15.437 - 15.438 - subMatrixPairParams->rightSubMatrix = 15.439 - rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; 15.440 - 15.441 - //subMatrixPairParams->resultPr = resultPr; 15.442 - 15.443 - //put all pairs from the same vector onto same core 15.444 - VPThread__create_thread_with_affinity( &calcSubMatrixProduct, 15.445 - subMatrixPairParams, 15.446 - animatingPr, 15.447 - coreToScheduleOnto ); 15.448 - } 15.449 - 15.450 - //Trying to distribute the subMatrix-vectors across the cores, so 15.451 - // that each core gets the same number of vectors, with a max 15.452 - // imbalance of 1 vector more on some cores than others 15.453 - numVecOnCurrCore += 1; 15.454 - if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 ) 15.455 - { 15.456 - //deal with fractional part, to ensure that imbalance is 1 max 15.457 - // IE, core with most has only 1 more than core with least 15.458 - leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore; 15.459 - if( leftOverFraction >= 1 ) 15.460 - { leftOverFraction -= 1; 15.461 - numVecOnCurrCore = -1; 15.462 - } 15.463 - else 15.464 - { numVecOnCurrCore = 0; 15.465 - } 15.466 - //Move to next core, max core-value to incr to is numCores -1 15.467 - if( coreToScheduleOnto >= numCores -1 ) 15.468 - { coreToScheduleOnto = 0; 15.469 - } 15.470 - else 15.471 - { coreToScheduleOnto += 1; 15.472 - } 15.473 - } 15.474 - 15.475 - } 15.476 - } 15.477 - } 15.478 - 15.479 - 15.480 - 15.481 -/*Walk through the two slice-strucs, making sub-matrix strucs as go 15.482 - */ 15.483 -SubMatrix ** 15.484 -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 15.485 - int32 numUses, Matrix *origMatrix, VirtProcr *animPr ) 15.486 - { 15.487 - int32 numRowIdxs, numColIdxs, rowIdx, colIdx; 15.488 - int32 startRow, endRow, startCol, endCol; 15.489 - int32 *rowStartVals, *colStartVals; 15.490 - int32 rowOffset; 15.491 - SubMatrix **subMatrices, *newSubMatrix; 15.492 - 15.493 - numRowIdxs = rowSlices->numVals; 15.494 - numColIdxs = colSlices->numVals; 15.495 - 15.496 - rowStartVals = rowSlices->startVals; 15.497 - colStartVals = colSlices->startVals; 15.498 - 15.499 - subMatrices = VPThread__malloc((size_t)numRowIdxs * 15.500 - (size_t)numColIdxs * sizeof(SubMatrix*),animPr ); 15.501 - 15.502 - for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 15.503 - { 15.504 - rowOffset = rowIdx * numColIdxs; 15.505 - 15.506 - startRow = rowStartVals[rowIdx]; 15.507 - endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is 15.508 - // at last valid idx + 1 & is 15.509 - // 1 greater than end value 15.510 - for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 15.511 - { 15.512 - startCol = colStartVals[colIdx]; 15.513 - endCol = colStartVals[colIdx + 1] -1; 15.514 - 15.515 - newSubMatrix = VPThread__malloc( sizeof(SubMatrix), animPr ); 15.516 - newSubMatrix->numRows = endRow - startRow +1; 15.517 - newSubMatrix->numCols = endCol - startCol +1; 15.518 - newSubMatrix->origMatrix = origMatrix; 15.519 - newSubMatrix->origStartRow = 
startRow; 15.520 - newSubMatrix->origStartCol = startCol; 15.521 - newSubMatrix->alreadyCopied = FALSE; 15.522 - newSubMatrix->numUsesLeft = numUses; //can free after this many 15.523 - //Prevent uninitialized memory 15.524 - newSubMatrix->copySingleton = NULL; 15.525 - newSubMatrix->copyTransSingleton = NULL; 15.526 - 15.527 - subMatrices[ rowOffset + colIdx ] = newSubMatrix; 15.528 - } 15.529 - } 15.530 - return subMatrices; 15.531 - } 15.532 - 15.533 - 15.534 -void 15.535 -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 15.536 - SubMatrix **subMatrices, VirtProcr *animPr ) 15.537 - { 15.538 - int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; 15.539 - SubMatrix *subMatrix; 15.540 - 15.541 - numRowIdxs = rowSlices->numVals; 15.542 - numColIdxs = colSlices->numVals; 15.543 - 15.544 - for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 15.545 - { 15.546 - rowOffset = rowIdx * numColIdxs; 15.547 - for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 15.548 - { 15.549 - subMatrix = subMatrices[ rowOffset + colIdx ]; 15.550 - if( subMatrix->alreadyCopied ) 15.551 - VPThread__free( subMatrix->array, animPr ); 15.552 - VPThread__free( subMatrix, animPr ); 15.553 - } 15.554 - } 15.555 - VPThread__free( subMatrices, animPr ); 15.556 - } 15.557 - 15.558 - 15.559 - 15.560 -SlicingStruc * 15.561 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 15.562 - VirtProcr *animPr ) 15.563 - { float32 residualAcc = 0; 15.564 - int numSlices, i, *startVals, sizeOfSlice, endCondition; 15.565 - SlicingStruc *slicingStruc = VPThread__malloc(sizeof(SlicingStruc), animPr); 15.566 - 15.567 - //calc size of matrix need to hold start vals -- 15.568 - numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); 15.569 - 15.570 - startVals = VPThread__malloc( ((size_t)numSlices + 1) * sizeof(int32), animPr ); 15.571 - 15.572 - //Calc the upper limit of start value -- when get above this, end loop 15.573 - // by saving highest value of the matrix dimension to access, plus 1 15.574 - // as the start point of the imaginary slice following the last one 15.575 - //Plus 1 because go up to value but not include when process last slice 15.576 - //The stopping condition is half-a-size less than highest value because 15.577 - // don't want any pieces smaller than half the ideal size -- just tack 15.578 - // little ones onto end of last one 15.579 - endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size 15.580 - for( i = 0; startVal <= endVal; i++ ) 15.581 - { 15.582 - startVals[i] = startVal; 15.583 - residualAcc += idealSizeOfSide; 15.584 - sizeOfSlice = (int)residualAcc; 15.585 - residualAcc -= (float32)sizeOfSlice; 15.586 - startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. 
15.587 - 15.588 - if( startVal > endCondition ) 15.589 - { startVal = endVal + 1; 15.590 - startVals[ i + 1 ] = startVal; 15.591 - } 15.592 - } 15.593 - 15.594 - slicingStruc->startVals = startVals; 15.595 - slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 15.596 - // which means is num sub-matrices in dim 15.597 - // also == idx of the fake start just above 15.598 - return slicingStruc; 15.599 - } 15.600 - 15.601 -void 15.602 -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ) 15.603 - { 15.604 - VPThread__free( slicingStruc->startVals, animPr ); 15.605 - VPThread__free( slicingStruc, animPr ); 15.606 - } 15.607 - 15.608 - 15.609 -int inline 15.610 -measureMatrixMultPrimitive( VirtProcr *animPr ) 15.611 - { 15.612 - int r, c, v, numCycles; 15.613 - float32 *res, *left, *right; 15.614 - 15.615 - //setup inputs 15.616 - left = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr ); 15.617 - right = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr ); 15.618 - res = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr ); 15.619 - 15.620 - for( r = 0; r < 5; r++ ) 15.621 - { 15.622 - for( c = 0; c < 5; c++ ) 15.623 - { 15.624 - left[ r * 5 + c ] = r; 15.625 - right[ r * 5 + c ] = c; 15.626 - } 15.627 - } 15.628 - 15.629 - //do primitive 15.630 - VPThread__start_primitive(); //for now, just takes time stamp 15.631 - for( r = 0; r < 5; r++ ) 15.632 - { 15.633 - for( c = 0; c < 5; c++ ) 15.634 - { 15.635 - for( v = 0; v < 5; v++ ) 15.636 - { 15.637 - res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; 15.638 - } 15.639 - } 15.640 - } 15.641 - numCycles = 15.642 - VPThread__end_primitive_and_give_cycles(); 15.643 - 15.644 - VPThread__free( left, animPr ); 15.645 - VPThread__free( right, animPr ); 15.646 - VPThread__free( res, animPr ); 15.647 - 15.648 - return numCycles; 15.649 - }
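The residualAcc trick in sliceUpDimension above generalizes: accumulate the floating-point ideal size, take the integer part as this slice's size, and carry the fraction forward, so the integer slice sizes average out to the target and differ by at most one. A stand-alone sketch with made-up numbers -- it assumes a target of at least 1 and omits the half-size tail-merge the code above adds:

#include <stdio.h>

int main( void )
 { float target = 2.6f, residual = 0.0f;
   int start = 0, len = 13, size;
   while( start < len )
    { residual += target;              /* carry fraction from prior slices */
      size = (int)residual;
      residual -= (float)size;
      if( start + size > len ) size = len - start;   /* clamp final slice */
      printf( "slice at %d, size %d\n", start, size );
      start += size;
    }
   return 0;                           /* prints sizes 2,3,2,3,3 -- sums to 13 */
 }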
16.1 --- a/src/Application/VPThread__Matrix_Mult/EntryPoint.c Fri Sep 16 20:12:34 2011 +0200 16.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 16.3 @@ -1,62 +0,0 @@ 16.4 -/* 16.5 - * Copyright 2009 OpenSourceStewardshipFoundation.org 16.6 - * Licensed under GNU General Public License version 2 16.7 - * 16.8 - * Author: seanhalle@yahoo.com 16.9 - * 16.10 - */ 16.11 - 16.12 -#include <math.h> 16.13 - 16.14 -#include "VPThread__Matrix_Mult.h" 16.15 - 16.16 - 16.17 - 16.18 -/*Every SSR system has an "entry point" function that creates the first 16.19 - * processor, which starts the chain of creating more processors.. 16.20 - * eventually all of the processors will dissipate themselves, and 16.21 - * return. 16.22 - * 16.23 - *This entry-point function follows the same pattern as all entry-point 16.24 - * functions do: 16.25 - *1) it creates the params for the seed processor, from the 16.26 - * parameters passed into the entry-point function 16.27 - *2) it calls SSR__create_seed_procr_and_do_work 16.28 - *3) it gets the return value from the params struc, frees the params struc, 16.29 - * and returns the value from the function 16.30 - * 16.31 - */ 16.32 -Matrix * 16.33 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) 16.34 - { Matrix *resMatrix; 16.35 - DividerParams *dividerParams; 16.36 - int32 numResRows, numResCols; 16.37 - 16.38 - 16.39 - dividerParams = malloc( sizeof( DividerParams ) ); 16.40 - dividerParams->leftMatrix = leftMatrix; 16.41 - dividerParams->rightMatrix = rightMatrix; 16.42 - 16.43 - 16.44 - numResRows = leftMatrix->numRows; 16.45 - numResCols = rightMatrix->numCols; 16.46 - 16.47 - //VMS has its own separate internal malloc, so to get results out, 16.48 - // have to pass in empty array for it to fill up 16.49 - //The alternative is internally telling SSR make external space to use 16.50 - resMatrix = malloc( sizeof(Matrix) ); 16.51 - resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); 16.52 - resMatrix->numCols = rightMatrix->numCols; 16.53 - resMatrix->numRows = leftMatrix->numRows; 16.54 - 16.55 - 16.56 - dividerParams->resultMatrix = resMatrix; 16.57 - 16.58 - //create divider processor, start doing the work, and wait till done 16.59 - //This function is the "border crossing" between normal code and SSR 16.60 - VPThread__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs, 16.61 - dividerParams ); 16.62 - 16.63 - free( dividerParams ); 16.64 - return resMatrix; 16.65 - }
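The three-step entry-point pattern the comment above spells out, reduced to a skeleton: build the seed params, cross the border into the runtime, read the result back out and free. MyParams, doWork, and runViaVPThread are illustrative names; only VPThread__create_seed_procr_and_do_work is from the repo, and VPThread.h is assumed included for VirtProcr:

#include <stdlib.h>

typedef struct { int input; int result; } MyParams;   /* hypothetical params struc */

void doWork( void *_params, VirtProcr *animPr );      /* the seed processor's function */

int
runViaVPThread( int input )
 { MyParams *params = malloc( sizeof(MyParams) );     /* 1) params for the seed procr */
   params->input = input;
   VPThread__create_seed_procr_and_do_work( &doWork, params );  /* 2) border crossing */
   int result = params->result;                       /* 3) pull result out, free */
   free( params );
   return result;
 }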
17.1 --- a/src/Application/VPThread__Matrix_Mult/Result_Pr.c Fri Sep 16 20:12:34 2011 +0200 17.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 17.3 @@ -1,142 +0,0 @@ 17.4 -/* 17.5 - * Copyright 2009 OpenSourceStewardshipFoundation.org 17.6 - * Licensed under GNU General Public License version 2 17.7 - * 17.8 - * Author: seanhalle@yahoo.com 17.9 - * 17.10 - */ 17.11 - 17.12 -#include "VPThread__Matrix_Mult.h" 17.13 - 17.14 -//===================== 17.15 -void inline 17.16 -accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, 17.17 - int32 startRow, 17.18 - int32 numRows, 17.19 - int32 startCol, 17.20 - int32 numCols, 17.21 - int32 numOrigCols ); 17.22 - 17.23 -//=========================================================================== 17.24 - 17.25 -/*The Result Processor gets a message from each of the vector processors, 17.26 - * puts the result from the message in its location in the result- 17.27 - * matrix, and increments the count of results. 17.28 - * 17.29 - *After the count reaches the point that all results have been received, it 17.30 - * returns the result matrix and dissipates. 17.31 - */ 17.32 -void gatherResults( void *_params, VirtProcr *animatingThd ) 17.33 - { VirtProcr *dividerPr; 17.34 - ResultsParams *params; 17.35 - int numRows, numCols, numSubMatrixPairs, count=0; 17.36 - float32 *resultArray; 17.37 - SMPairParams *resParams; 17.38 - //====================== thread stuff ======================= 17.39 - MatrixMultGlobals *globals =(MatrixMultGlobals *)VPThread__give_globals(); 17.40 - 17.41 - 17.42 - //get vector-comm lock before loop, so that this thd keeps lock after 17.43 - // one wait until it enters the next wait -- forces see-saw btwn 17.44 - // waiters and signalers -- wait-signal-wait-signal-... 17.45 - VPThread__mutex_lock( globals->vector_mutex, animatingThd ); 17.46 - 17.47 - //Tell divider that have the vector lock -- so it's sure won't miss any 17.48 - // signals from the vector-threads it's about to create 17.49 - //Don't need a signal variable -- this thd can't be created until 17.50 - // divider thd already has the start lock 17.51 - VPThread__mutex_lock( globals->start_mutex, animatingThd );//finish wait 17.52 - VPThread__cond_signal( globals->start_cond, animatingThd ); 17.53 - VPThread__mutex_unlock( globals->start_mutex, animatingThd );//finish wait 17.54 - //=========================================================== 17.55 - 17.56 - DEBUG( dbgAppFlow, "start resultPr\n") 17.57 - 17.58 - params = (ResultsParams *)_params; 17.59 - dividerPr = params->dividerPr; 17.60 - numSubMatrixPairs = params->numSubMatrixPairs; 17.61 - numRows = params->numRows; 17.62 - numCols = params->numCols; 17.63 - 17.64 - resultArray = params->resultArray; 17.65 - 17.66 - 17.67 - while( count < numSubMatrixPairs ) 17.68 - { 17.69 - //receive a vector-result from a vector-thread 17.70 - VPThread__cond_wait( globals->vector_cond, animatingThd ); 17.71 - 17.72 - //At this point, animating thread owns the vector lock, so all 17.73 - // pairs trying to signal they have a result are waiting to get that 17.74 - // lock -- only one gets it at a time, and when signal, this thd 17.75 - // gets the lock and does the body of this loop, then when does the 17.76 - // wait again, that releases the lock for next pair-thread to get it 17.77 - resParams = globals->currSMPairParams; 17.78 - 17.79 - accumulateResult( resultArray, resParams->partialResultArray, 17.80 - resParams->leftSubMatrix->origStartRow, 17.81 - resParams->leftSubMatrix->numRows, 17.82 - 
resParams->rightSubMatrix->origStartCol, 17.83 - resParams->rightSubMatrix->numCols, 17.84 - resParams->rightSubMatrix->origMatrix->numCols ); 17.85 - 17.86 - VPThread__free( resParams->partialResultArray, animatingThd ); 17.87 - 17.88 - //there is only one copy of results procr, so can update numUsesLeft 17.89 - // without concurrency worries. When zero, free the sub-matrix 17.90 - resParams->leftSubMatrix->numUsesLeft -= 1; 17.91 - if( resParams->leftSubMatrix->numUsesLeft == 0 ) 17.92 - { 17.93 - VPThread__free( resParams->leftSubMatrix->array, animatingThd ); 17.94 - VPThread__free( resParams->leftSubMatrix, animatingThd ); 17.95 - } 17.96 - 17.97 - resParams->rightSubMatrix->numUsesLeft -= 1; 17.98 - if( resParams->rightSubMatrix->numUsesLeft == 0 ) 17.99 - { 17.100 - VPThread__free( resParams->rightSubMatrix->array, animatingThd ); 17.101 - VPThread__free( resParams->rightSubMatrix, animatingThd ); 17.102 - } 17.103 - 17.104 - //count of how many sub-matrix pairs accumulated so know when done 17.105 - count++; 17.106 - } 17.107 - 17.108 - //Done -- could just dissipate -- SSR will wait for all processors to 17.109 - // dissipate before shutting down, thereby making results available to 17.110 - // the outside, so no need to stop the divider from dissipating, so no need 17.111 - // to send a hand-shake message to it -- but it makes debug easier 17.112 - //However, following the pattern: all comms are done, so release the lock 17.113 - VPThread__mutex_unlock( globals->vector_mutex, animatingThd ); 17.114 - 17.115 - //Send result to divider (seed) thread 17.116 - // note, divider thd had to hold the results-comm lock before creating 17.117 - // this thread, to be sure no race 17.118 - VPThread__mutex_lock( globals->results_mutex, animatingThd ); 17.119 - //globals->results = resultMatrixArray; 17.120 - VPThread__cond_signal( globals->results_cond, animatingThd ); 17.121 - VPThread__mutex_unlock( globals->results_mutex, animatingThd ); //releases 17.122 - //divider thread from its wait, at the point this executes 17.123 - 17.124 - VPThread__dissipate_thread( animatingThd ); //frees any data owned by procr 17.125 - } 17.126 - 17.127 -void inline 17.128 -accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray, 17.129 - int32 startRow, 17.130 - int32 numRows, 17.131 - int32 startCol, 17.132 - int32 numCols, 17.133 - int32 numOrigCols ) 17.134 - { int32 row, col; 17.135 - 17.136 - for( row = 0; row < numRows; row++ ) 17.137 - { 17.138 - for( col = 0; col < numCols; col++ ) 17.139 - { 17.140 - resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] += 17.141 - subMatrixPairResultArray[ row * numCols + col ]; 17.142 - } 17.143 - } 17.144 - 17.145 - }
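The locking comments in the removed gatherResults carry the key design point: the results thread owns vector_mutex except while parked inside cond_wait, so pair-threads can publish currSMPairParams only one at a time, each publish alternating with one pass of the consumer loop (the "see-saw"). Stripped to its shape, assuming the types and VPThread calls declared in the header above -- consume, total, and the function names are placeholders, not repository code:

    void consume( SMPairParams *p );   /* placeholder for the accumulate step */

    static void
    consumerLoop( MatrixMultGlobals *globals, VirtProcr *thd, int total )
        { int count = 0;
          VPThread__mutex_lock( globals->vector_mutex, thd );
          while( count < total )
              { VPThread__cond_wait( globals->vector_cond, thd ); /* drops the lock
                                        while parked, owns it again when signaled */
                consume( globals->currSMPairParams ); /* producers are locked out */
                count++;
              }
          VPThread__mutex_unlock( globals->vector_mutex, thd );
        }

    static void
    producerPublish( MatrixMultGlobals *globals, VirtProcr *thd, SMPairParams *mine )
        { /* acquirable only while the consumer is inside cond_wait */
          VPThread__mutex_lock( globals->vector_mutex, thd );
          globals->currSMPairParams = mine;             /* publish exactly one */
          VPThread__cond_signal( globals->vector_cond, thd );
          VPThread__mutex_unlock( globals->vector_mutex, thd );
        }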
18.1 --- a/src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h Fri Sep 16 20:12:34 2011 +0200 18.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 18.3 @@ -1,120 +0,0 @@ 18.4 -/* 18.5 - * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 18.6 - * Licensed under GNU General Public License version 2 18.7 - */ 18.8 - 18.9 -#ifndef _SSR_MATRIX_MULT_H_ 18.10 -#define _SSR_MATRIX_MULT_H_ 18.11 - 18.12 -#include <stdio.h> 18.13 - 18.14 -#include "../../VPThread_lib/VPThread.h" 18.15 -#include "../Matrix_Mult.h" 18.16 - 18.17 - 18.18 -//=============================== Defines ============================== 18.19 -#define ROWS_IN_BLOCK 32 18.20 -#define COLS_IN_BLOCK 32 18.21 -#define VEC_IN_BLOCK 32 18.22 - 18.23 -#define copyMatrixSingleton 1 18.24 -#define copyTransposeSingleton 2 18.25 - 18.26 -//============================== Structures ============================== 18.27 -typedef struct 18.28 - { 18.29 - Matrix *leftMatrix; 18.30 - Matrix *rightMatrix; 18.31 - Matrix *resultMatrix; 18.32 - } 18.33 -DividerParams; 18.34 - 18.35 -typedef struct 18.36 - { 18.37 - VirtProcr *dividerPr; 18.38 - int numRows; 18.39 - int numCols; 18.40 - int numSubMatrixPairs; 18.41 - float32 *resultArray; 18.42 - } 18.43 -ResultsParams; 18.44 - 18.45 -typedef 18.46 -struct 18.47 - { int32 numRows; 18.48 - int32 numCols; 18.49 - Matrix *origMatrix; 18.50 - int32 origStartRow; 18.51 - int32 origStartCol; 18.52 - int32 alreadyCopied; 18.53 - VPThdSingleton *copySingleton; 18.54 - VPThdSingleton *copyTransSingleton; 18.55 - int32 numUsesLeft; //have update via message to avoid multiple writers 18.56 - float32 *array; //2D, but dynamically sized, so use addr arith 18.57 - } 18.58 -SubMatrix; 18.59 - 18.60 -typedef struct 18.61 - { VirtProcr *resultPr; 18.62 - SubMatrix *leftSubMatrix; 18.63 - SubMatrix *rightSubMatrix; 18.64 - float32 *partialResultArray; 18.65 - } 18.66 -SMPairParams; 18.67 - 18.68 -typedef 18.69 -struct 18.70 - { int32 numVals; 18.71 - int32 *startVals; 18.72 - } 18.73 -SlicingStruc; 18.74 - 18.75 -typedef 18.76 -struct 18.77 - { 18.78 - SlicingStruc *leftRowSlices; 18.79 - SlicingStruc *vecSlices; 18.80 - SlicingStruc *rightColSlices; 18.81 - } 18.82 -SlicingStrucCarrier; 18.83 - 18.84 -enum MMMsgType 18.85 - { 18.86 - RESULTS_MSG = 1 18.87 - }; 18.88 - 18.89 - 18.90 -typedef struct 18.91 - { 18.92 - //for communicating sub-matrix-pair results to results Thd 18.93 - int32 vector_mutex; 18.94 - int32 vector_cond; 18.95 - SMPairParams *currSMPairParams; 18.96 - 18.97 - //for communicating results array back to seed (divider) Thd 18.98 - int32 results_mutex; 18.99 - int32 results_cond; 18.100 - float32 *results; 18.101 - 18.102 - //for ensuring results thd has vector lock before making vector thds 18.103 - int32 start_mutex; 18.104 - int32 start_cond; 18.105 - 18.106 - Matrix *rightMatrix; 18.107 - Matrix *resultMatrix; 18.108 - } 18.109 -MatrixMultGlobals; 18.110 - 18.111 - 18.112 -//============================= Processor Functions ========================= 18.113 -void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr ); 18.114 -void calcSubMatrixProduct( void *data, VirtProcr *animatingPr ); 18.115 -void gatherResults( void *data, VirtProcr *animatingPr ); 18.116 - 18.117 - 18.118 -//================================ Entry Point ============================== 18.119 -Matrix * 18.120 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); 18.121 - 18.122 - 18.123 -#endif /*_SSR_MATRIX_MULT_H_*/
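The 32x32 block sizes defined in the removed header are what make the cache comment in subMatrix_Pr.c below work out: one 32x32 block of 4-byte float32 is 4 KiB, so the left, right, and result blocks together occupy 12 KiB of a 32 KiB L1. A compile-time check of that arithmetic -- a sketch using C11 static_assert, with the cache size taken from the comments, not measured:

    #include <assert.h>

    enum { BLK = 32, FLOAT32_BYTES = 4, L1_BYTES = 32 * 1024 };

    /* left + right + result blocks: 3 * 32 * 32 * 4 = 12288 bytes */
    static_assert( 3 * BLK * BLK * FLOAT32_BYTES <= L1_BYTES,
                   "three 32x32 float32 blocks must fit in a 32 KiB L1" );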
19.1 --- a/src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c Fri Sep 16 20:12:34 2011 +0200 19.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 19.3 @@ -1,310 +0,0 @@ 19.4 -/* 19.5 - * Copyright 2009 OpenSourceStewardshipFoundation.org 19.6 - * Licensed under GNU General Public License version 2 19.7 - * 19.8 - * Author: SeanHalle@yahoo.com 19.9 - * 19.10 - */ 19.11 - 19.12 -#include <string.h> 19.13 - 19.14 -#include "VPThread__Matrix_Mult.h" 19.15 - 19.16 -//Declarations 19.17 -//=================================================== 19.18 - 19.19 -void inline 19.20 -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 19.21 - 19.22 -void inline 19.23 -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 19.24 - 19.25 -void inline 19.26 -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 19.27 - float32 *resArray, 19.28 - int startRow, int endRow, 19.29 - int startCol, int endCol, 19.30 - int startVec, int endVec, 19.31 - int resStride, int inpStride ); 19.32 - 19.33 -void inline 19.34 -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, 19.35 - float32 *leftArray, float32 *rightArray, 19.36 - float32 *resArray ); 19.37 - 19.38 -//=================================================== 19.39 - 19.40 -/*A processor is created with an environment that holds two matrices, 19.41 - * the row and col that it owns, and the name of a result gathering 19.42 - * processor. 19.43 - *It calculates the product of two sub-portions of the input matrices 19.44 - * by using Intel's mkl library for single-core. 19.45 - * 19.46 - *This demonstrates using optimized single-threaded code inside scheduled 19.47 - * work-units. 19.48 - * 19.49 - *When done, it sends the result to the result processor 19.50 - */ 19.51 -void 19.52 -calcSubMatrixProduct( void *data, VirtProcr *animatingPr ) 19.53 - { 19.54 - SMPairParams *params; 19.55 - VirtProcr *resultPr; 19.56 - float32 *leftArray, *rightArray, *resArray; 19.57 - SubMatrix *leftSubMatrix, *rightSubMatrix; 19.58 - MatrixMultGlobals *globals =(MatrixMultGlobals *)VPThread__give_globals(); 19.59 - 19.60 - DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID) 19.61 - #ifdef TURN_ON_DEBUG_PROBES 19.62 - int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx", 19.63 - animatingPr); 19.64 - VMS__record_sched_choice_into_probe( subMatrixProbe, animatingPr ); 19.65 - VMS__record_interval_start_in_probe( subMatrixProbe ); 19.66 - #endif 19.67 - 19.68 - params = (SMPairParams *)data; 19.69 - resultPr = params->resultPr; 19.70 - leftSubMatrix = params->leftSubMatrix; 19.71 - rightSubMatrix = params->rightSubMatrix; 19.72 - 19.73 - //make sure the input sub-matrices have been copied out of orig 19.74 - //do it here, inside sub-matrix pair to hopefully gain reuse in cache 19.75 - copyFromOrig( leftSubMatrix, animatingPr ); 19.76 - copyTransposeFromOrig( rightSubMatrix, animatingPr ); 19.77 - 19.78 - leftArray = leftSubMatrix->array; 19.79 - rightArray = rightSubMatrix->array; 19.80 - 19.81 - size_t resSize = (size_t)leftSubMatrix->numRows * 19.82 - (size_t)rightSubMatrix->numCols * sizeof(float32); 19.83 - resArray = VPThread__malloc( resSize, animatingPr ); 19.84 - memset( resArray, 0, resSize ); 19.85 - 19.86 - 19.87 - int32 numResRows, numResCols, vectLength; 19.88 - 19.89 - vectLength = leftSubMatrix->numCols; 19.90 - numResRows = leftSubMatrix->numRows; 19.91 - numResCols = rightSubMatrix->numCols; 19.92 - 19.93 - multiplyMatrixArraysTransposed( vectLength, numResRows, 
numResCols, 19.94 - leftArray, rightArray, 19.95 - resArray ); 19.96 - 19.97 - //send result to result processor 19.98 - params->partialResultArray = resArray; 19.99 - 19.100 - #ifdef TURN_ON_DEBUG_PROBES 19.101 - VMS__record_interval_end_in_probe( subMatrixProbe ); 19.102 - #endif 19.103 - 19.104 - //Send result to results thread 19.105 - //This pattern works 'cause only get lock when results thd inside wait 19.106 - VPThread__mutex_lock( globals->vector_mutex, animatingPr ); 19.107 - globals->currSMPairParams = params; 19.108 - VPThread__cond_signal( globals->vector_cond, animatingPr ); 19.109 - VPThread__mutex_unlock( globals->vector_mutex, animatingPr );//release 19.110 - //wait-er -- cond_signal implemented such that wait-er gets lock, no other 19.111 - 19.112 - DEBUG1(dbgAppFlow, "end sub-matrix mult: %d\n", animatingPr->procrID) 19.113 - VPThread__dissipate_thread( animatingPr ); 19.114 - } 19.115 - 19.116 - 19.117 - 19.118 -/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into 19.119 - * the 32KB L1 cache. 19.120 - *Would be nice to embed this within another level that divided into 19.121 - * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache 19.122 - * 19.123 - *Eventually want these divisions to be automatic, using DKU pattern 19.124 - * embedded into VMS and exposed in the language, and with VMS controlling the 19.125 - * divisions according to the cache sizes, which it knows about. 19.126 - *Also, want VMS to work with language to split among main-mems, so a socket 19.127 - * only cranks on data in its local segment of main mem 19.128 - * 19.129 - *So, outer two loops determine start and end points within the result matrix. 19.130 - * Inside that, a loop dets the start and end points along the shared dimensions 19.131 - * of the two input matrices. 
19.132 - */ 19.133 -void inline 19.134 -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, 19.135 - int32 numResCols, 19.136 - float32 *leftArray, float32 *rightArray, 19.137 - float32 *resArray ) 19.138 - { 19.139 - int resStride, inpStride; 19.140 - int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; 19.141 - 19.142 - resStride = numResCols; 19.143 - inpStride = vecLength; 19.144 - 19.145 - for( resStartRow = 0; resStartRow < numResRows; ) 19.146 - { 19.147 - resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 19.148 - if( resEndRow > numResRows ) resEndRow = numResRows -1; 19.149 - 19.150 - for( resStartCol = 0; resStartCol < numResCols; ) 19.151 - { 19.152 - resEndCol = resStartCol + COLS_IN_BLOCK -1; 19.153 - if( resEndCol > numResCols ) resEndCol = numResCols -1; 19.154 - 19.155 - for( startVec = 0; startVec < vecLength; ) 19.156 - { 19.157 - endVec = startVec + VEC_IN_BLOCK -1; 19.158 - if( endVec > vecLength ) endVec = vecLength -1; 19.159 - 19.160 - //By having the "vector" of sub-blocks in a sub-block slice 19.161 - // be marched down in inner loop, are re-using the result 19.162 - // matrix, which stays in L1 cache and re-using the left sub-mat 19.163 - // which repeats for each right sub-mat -- can only re-use two of 19.164 - // the three, so result is the most important -- avoids writing 19.165 - // dirty blocks until those result-locations fully done 19.166 - //Row and Col is position in result matrix -- so row and vec 19.167 - // for left array, then vec and col for right array 19.168 - multiplySubBlocksTransposed( leftArray, rightArray, 19.169 - resArray, 19.170 - resStartRow, resEndRow, 19.171 - resStartCol, resEndCol, 19.172 - startVec, endVec, 19.173 - resStride, inpStride ); 19.174 - startVec = endVec +1; 19.175 - } 19.176 - resStartCol = resEndCol +1; 19.177 - } 19.178 - resStartRow = resEndRow +1; 19.179 - } 19.180 - } 19.181 - 19.182 - 19.183 - 19.184 -void inline 19.185 -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 19.186 - float32 *resArray, 19.187 - int resStartRow, int resEndRow, 19.188 - int resStartCol, int resEndCol, 19.189 - int startVec, int endVec, 19.190 - int resStride, int inpStride ) 19.191 - { 19.192 - int resRow, resCol, vec; 19.193 - int leftOffset, rightOffset; 19.194 - float32 result; 19.195 - 19.196 - //The result row is used only for the left matrix, res col for the right 19.197 - for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) 19.198 - { 19.199 - for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) 19.200 - { 19.201 - leftOffset = resRow * inpStride;//left & right inp strides always same 19.202 - rightOffset = resCol * inpStride;// because right is transposed 19.203 - result = 0; 19.204 - for( vec = startVec; vec <= endVec; vec++ ) 19.205 - { 19.206 - result += 19.207 - leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; 19.208 - } 19.209 - 19.210 - resArray[ resRow * resStride + resCol ] += result; 19.211 - } 19.212 - } 19.213 - } 19.214 - 19.215 - 19.216 - 19.217 - 19.218 -/*Reuse this in divider when do the sequential multiply case 19.219 - */ 19.220 -void inline 19.221 -copyTranspose( int32 numRows, int32 numCols, 19.222 - int32 origStartRow, int32 origStartCol, int32 origStride, 19.223 - float32 *subArray, float32 *origArray ) 19.224 - { int32 stride = numRows; 19.225 - 19.226 - int row, col, origOffset; 19.227 - for( row = 0; row < numRows; row++ ) 19.228 - { 19.229 - origOffset = (row + origStartRow) * origStride + origStartCol; 19.230 - 
for( col = 0; col < numCols; col++ ) 19.231 - { 19.232 - //transpose means swap row & col -- traverse orig matrix normally 19.233 - // but put into reversed place in local array -- means the 19.234 - // stride is the numRows now, so col * numRows + row 19.235 - subArray[ col * stride + row ] = origArray[ origOffset + col ]; 19.236 - } 19.237 - } 19.238 - } 19.239 - 19.240 -void inline 19.241 -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 19.242 - { int numCols, numRows, origStartRow, origStartCol, origStride; 19.243 - Matrix *origMatrix; 19.244 - float32 *origArray, *subArray; 19.245 - 19.246 - VPThread__start_data_singleton( &(subMatrix->copyTransSingleton), animPr ); 19.247 - 19.248 - origMatrix = subMatrix->origMatrix; 19.249 - origArray = origMatrix->array; 19.250 - numCols = subMatrix->numCols; 19.251 - numRows = subMatrix->numRows; 19.252 - origStartRow = subMatrix->origStartRow; 19.253 - origStartCol = subMatrix->origStartCol; 19.254 - origStride = origMatrix->numCols; 19.255 - 19.256 - subArray = VPThread__malloc( (size_t)numRows * 19.257 - (size_t)numCols *sizeof(float32),animPr); 19.258 - subMatrix->array = subArray; 19.259 - 19.260 - //copy values from orig matrix to local 19.261 - copyTranspose( numRows, numCols, 19.262 - origStartRow, origStartCol, origStride, 19.263 - subArray, origArray ); 19.264 - 19.265 - VPThread__end_data_singleton( &(subMatrix->copyTransSingleton), animPr ); 19.266 - 19.267 - } 19.268 - 19.269 - 19.270 -void inline 19.271 -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 19.272 - { int numCols, numRows, origStartRow, origStartCol, stride, origStride; 19.273 - Matrix *origMatrix; 19.274 - float32 *origArray, *subArray; 19.275 - 19.276 - 19.277 - //This lets only a single VP execute the code between start and 19.278 - // end -- using start and end so that work runs outside the master. 19.279 - //If a second VP ever executes the start, it will be returned 19.280 - // from the end-point. If its execution starts after another but before 19.281 - // that other has finished, this one will remain suspended until the 19.282 - // other finishes, then be resumed from the end-point. 19.283 - VPThread__start_data_singleton( &(subMatrix->copySingleton), animPr ); 19.284 - 19.285 - 19.286 - origMatrix = subMatrix->origMatrix; 19.287 - origArray = origMatrix->array; 19.288 - numCols = subMatrix->numCols; 19.289 - numRows = subMatrix->numRows; 19.290 - origStartRow = subMatrix->origStartRow; 19.291 - origStartCol = subMatrix->origStartCol; 19.292 - origStride = origMatrix->numCols; 19.293 - 19.294 - subArray = VPThread__malloc( (size_t)numRows * 19.295 - (size_t)numCols *sizeof(float32),animPr); 19.296 - subMatrix->array = subArray; 19.297 - 19.298 - //copy values from orig matrix to local 19.299 - stride = numCols; 19.300 - 19.301 - int row, col, offset, origOffset; 19.302 - for( row = 0; row < numRows; row++ ) 19.303 - { 19.304 - offset = row * stride; 19.305 - origOffset = (row + origStartRow) * origStride + origStartCol; 19.306 - for( col = 0; col < numCols; col++ ) 19.307 - { 19.308 - subArray[ offset + col ] = origArray[ origOffset + col ]; 19.309 - } 19.310 - } 19.311 - 19.312 - VPThread__end_data_singleton( &(subMatrix->copySingleton), animPr ); 19.313 - }
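One boundary case worth flagging in the removed blocking loops above: the clamp "if( resEndRow > numResRows ) resEndRow = numResRows -1;" leaves resEndRow == numResRows unclamped, and the block loops iterate with <=, so any dimension of the form 32k+31 (a 63-row result, say, where the second block gets resEndRow = 63) runs one row, column, or vector element past the end. Testing with >= closes the off-by-one; a sketch of the corrected clamp, where clampedEnd is an illustrative helper and not a function from this repository:

    /* inclusive, zero-based end index of a block starting at startIdx,
     * clamped so it never reaches dimSize (the first invalid index) */
    static int
    clampedEnd( int startIdx, int blockSize, int dimSize )
        { int endIdx = startIdx + blockSize - 1;
          if( endIdx >= dimSize ) endIdx = dimSize - 1;   /* >= rather than > */
          return endIdx;
        }

    /* e.g. resEndRow = clampedEnd( resStartRow, ROWS_IN_BLOCK, numResRows );
     *      endVec    = clampedEnd( startVec,    VEC_IN_BLOCK,  vecLength  ); */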
20.1 --- a/src/Application/main.c Fri Sep 16 20:12:34 2011 +0200 20.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 20.3 @@ -1,50 +0,0 @@ 20.4 -/* 20.5 - * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 20.6 - * Licensed under GNU General Public License version 2 20.7 - * 20.8 - * author seanhalle@yahoo.com 20.9 - */ 20.10 - 20.11 -#include <malloc.h> 20.12 -#include <stdlib.h> 20.13 - 20.14 -#include "Matrix_Mult.h" 20.15 -#include "VPThread__Matrix_Mult/VPThread__Matrix_Mult.h" 20.16 - 20.17 -char __ProgrammName[] = "Blocked Matrix Multiply"; 20.18 -char __DataSet[255]; 20.19 - 20.20 -void printParams(ParamBag *paramBag) 20.21 -{ 20.22 - snprintf((char*)&__DataSet, 255, 20.23 - "#\tLeft Matrix %d x %d,\n#\tRight Matrix %d x %d", 20.24 - getParamFromBag("leftMatrixRows", paramBag)->intValue, 20.25 - getParamFromBag("leftMatrixCols", paramBag)->intValue, 20.26 - getParamFromBag("rightMatrixRows", paramBag)->intValue, 20.27 - getParamFromBag("rightMatrixCols", paramBag)->intValue); 20.28 -} 20.29 - 20.30 -/** 20.31 - *Matrix multiply program written using VMS_HW piggy-back language 20.32 - * 20.33 - */ 20.34 -int main( int argc, char **argv ) 20.35 - { Matrix *leftMatrix, *rightMatrix, *resultMatrix; 20.36 - ParamBag *paramBag; 20.37 - 20.38 - printf( "arguments: %s | %s\n", argv[0], argv[1] ); 20.39 - 20.40 - paramBag = makeParamBag(); 20.41 - readParamFileIntoBag( argv[1], paramBag ); 20.42 - initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); 20.43 - printParams(paramBag); 20.44 - 20.45 - resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix ); 20.46 - 20.47 - printf("\nresult matrix: \n"); 20.48 - printMatrix( resultMatrix ); 20.49 - 20.50 -// SSR__print_stats(); 20.51 - 20.52 - exit(0); //cleans up 20.53 - }
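The removed main dereferences argv[1] unconditionally, in the printf and again in readParamFileIntoBag, so running it with no argument passes a null pointer to both. A minimal guard, as a sketch to be called first in main (requireParamFileArg is an illustrative name):

    #include <stdio.h>
    #include <stdlib.h>

    /* exit with a usage message before argv[1] is ever touched */
    static void
    requireParamFileArg( int argc, char **argv )
        { if( argc < 2 )
              { fprintf( stderr, "usage: %s <param-file>\n", argv[0] );
                exit( 1 );
              }
        }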
