# HG changeset patch # User Sean Halle # Date 1347931192 25200 # Node ID 69928a38d5af4b121421054370a1869fe9b488c0 # Parent f8d3f16e70c89c2e8b005de3c0feddddd405d625 updating to most recent repository structure -- not working yet diff -r f8d3f16e70c8 -r 69928a38d5af .hgeol --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.hgeol Mon Sep 17 18:19:52 2012 -0700 @@ -0,0 +1,14 @@ + +[patterns] +**.py = native +**.txt = native +**.c = native +**.h = native +**.cpp = native +**.java = native +**.class = bin +**.jar = bin +**.sh = native +**.pl = native +**.jpg = bin +**.gif = bin diff -r f8d3f16e70c8 -r 69928a38d5af .hgignore --- a/.hgignore Fri Sep 16 20:12:34 2011 +0200 +++ b/.hgignore Mon Sep 17 18:19:52 2012 -0700 @@ -1,7 +1,10 @@ -nbproject -histograms -dist -build -Makefile -.dep.inc -scripts +syntax: glob + +histograms +nbproject +build +dist +c-ray-mt +*.ppm +*.o +*~ diff -r f8d3f16e70c8 -r 69928a38d5af .hgtags --- a/.hgtags Fri Sep 16 20:12:34 2011 +0200 +++ b/.hgtags Mon Sep 17 18:19:52 2012 -0700 @@ -1,7 +1,1 @@ -3e26de9d3e1e66f0556b2664e71740e6ed009eee V0 -3e26de9d3e1e66f0556b2664e71740e6ed009eee V0 -0000000000000000000000000000000000000000 V0 -0000000000000000000000000000000000000000 V0 -b6e07082767ab984194db038102e2edd7d1917fd V0 -b6e07082767ab984194db038102e2edd7d1917fd V0 -0000000000000000000000000000000000000000 V0 +bef7a9083bd4f8b1fa4e171a36383e78a5f475d5 version_that_goes_into_paper diff -r f8d3f16e70c8 -r 69928a38d5af Matrix_Mult.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Matrix_Mult.c Mon Sep 17 18:19:52 2012 -0700 @@ -0,0 +1,167 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + * Created on November 15, 2009, 2:35 AM + */ + +#include +#include + +#include "Matrix_Mult.h" +#include "C_Libraries/ParamHelper/Param.h" + + + + void +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, + ParamBag *paramBag ) + { char *leftMatrixFileName, *rightMatrixFileName; + int leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols; + + ParamStruc *param; + param = getParamFromBag( "leftMatrixRows", paramBag ); + leftMatrixRows = param->intValue; + param = getParamFromBag( "leftMatrixCols", paramBag ); + leftMatrixCols = param->intValue; + *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols ); + + param = getParamFromBag( "leftMatrixFileName", paramBag ); + leftMatrixFileName = param->strValue; //no need to copy + read_Matrix_From_File( *leftMatrix, leftMatrixFileName ); + + param = getParamFromBag( "rightMatrixRows", paramBag ); + rightMatrixRows = param->intValue; + param = getParamFromBag( "rightMatrixCols", paramBag ); + rightMatrixCols = param->intValue; + *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols ); + + param = getParamFromBag( "rightMatrixFileName", paramBag ); + rightMatrixFileName = param->strValue; + read_Matrix_From_File( *rightMatrix, rightMatrixFileName ); + } + + +void parseLineIntoRow( char *line, float32* row ); + + + void +read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ) + { int row, maxRead, numRows, numCols; + float32 *matrixStart; + size_t lineSz = 0; + FILE *file; + char *line = NULL; + + lineSz = 50000; //max length of line in a matrix data file + line = (char *) malloc( lineSz ); + if( line == NULL ) printf( "no mem for matrix line" ); + + numRows = matrixStruc->numRows; + numCols = matrixStruc->numCols; + matrixStart = matrixStruc->array; + + file = fopen( matrixFileName, "r" ); + if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} + fseek( file, 0, SEEK_SET ); + for( row = 0; row < numRows; row++ ) + { + if( feof( file ) ) printf( "file ran out too soon" ); + maxRead = getline( &line, &lineSz, file ); + if( maxRead == -1 ) printf( "prob reading mat line"); + + if( *line == '\n') continue; //blank line + if( *line == '/' ) continue; //comment line + + parseLineIntoRow( line, matrixStart + row * numCols ); + } + free( line ); + } + +/*This function relies on each line having the proper number of cols. It + * doesn't check, nor enforce, so if the file is improperly formatted it + * can write over unrelated memory + */ + void +parseLineIntoRow( char *line, float32* row ) + { + char *valueStr, *searchPos; + + //read the float values + searchPos = valueStr = line; //start + + for( ; *searchPos != 0; searchPos++) //bit dangerous, should use buff len + { + if( *searchPos == '\n' ) //last col.. relying on well-formatted file + { *searchPos = 0; + *row = atof( valueStr ); + break; //end FOR loop + } + if( *searchPos == ',' ) + { *searchPos = 0; //mark end of string + *row = (float32) atof( valueStr ); + row += 1; //address arith + //skip any spaces before digits.. use searchPos + 1 to skip the 0 + for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++); + valueStr = searchPos + 1; + } + } + } + + //========================================================================== + +/*In the "_Flat" version of constructor, do only malloc of the top data struc + * and set values in that top-level. Don't malloc any sub-structures. + */ + Matrix * +makeMatrix_Flat( int32 numRows, int32 numCols ) + { Matrix * retMatrix; + retMatrix = malloc( sizeof( Matrix ) ); + retMatrix->numRows = numRows; + retMatrix->numCols = numCols; + + return retMatrix; + } + + Matrix * +makeMatrix_WithResMat( int32 numRows, int32 numCols ) + { Matrix * retMatrix; + retMatrix = malloc( sizeof( Matrix ) ); + retMatrix->numRows = numRows; + retMatrix->numCols = numCols; + retMatrix->array = malloc( numRows * numCols * sizeof(float32) ); + + return retMatrix; + } + + void +freeMatrix_Flat( Matrix * matrix ) + { //( matrix ); + } + void +freeMatrix( Matrix * matrix ) + { free( matrix->array ); + free( matrix ); + } + +void +printMatrix( Matrix *matrix ) + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; + float32 *matrixArray; + + numRows = rowsToPrint = matrix->numRows; + numCols = colsToPrint = matrix->numCols; + matrixArray = matrix->array; + + rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed + colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed + for( r = 0; r < numRows; r += rowIncr ) + { for( c = 0; c < numCols; c += colIncr ) + { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); + } + printf("\n"); + } + } + diff -r f8d3f16e70c8 -r 69928a38d5af Matrix_Mult.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Matrix_Mult.h Mon Sep 17 18:19:52 2012 -0700 @@ -0,0 +1,77 @@ +/* + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + */ + +#ifndef MATRIX_MULT_H_ +#define MATRIX_MULT_H_ + +#include +#include +#include + +#include "VMS_impl/VMS_primitive_data_types.h" +#include "C_Libraries/ParamHelper/Param.h" + +//============================== Structures ============================== + +typedef +struct + { int32 numRows; + int32 numCols; + float32 *array; //2D, but dynamically sized, so use addr arith + } +Matrix; + +/* This is the "appSpecificPiece" that is carried inside a DKUPiece. + * In the DKUPiece data struc it is declared to be of type "void *". This + * allows the application to define any data structure it wants and put it + * into a DKUPiece. + * When the app specific info is used, it is in app code, so it is cast to + * the correct type to tell the compiler how to access fields. + * This keeps all app-specific things out of the DKU directory, as per the + * DKU standard. */ +typedef +struct + { + // pointers to shared data.. the result matrix must be created when the + // left and right matrices are put into the root ancestor DKUPiece. + Matrix * leftMatrix; + Matrix * rightMatrix; + Matrix * resultMatrix; + + // define the starting and ending boundaries for this piece of the + // result matrix. These are derivable from the left and right + // matrices, but included them for readability of code. + int prodStartRow, prodEndRow; + int prodStartCol, prodEndCol; + // Start and end of the portion of the left matrix that contributes to + // this piece of the product + int leftStartRow, leftEndRow; + int leftStartCol, leftEndCol; + // Start and end of the portion of the right matrix that contributes to + // this piece of the product + int rightStartRow, rightEndRow; + int rightStartCol, rightEndCol; + } +MatrixProdPiece; + +//============================== Functions ================================ +void readFile(); + +Matrix *makeMatrix( int32 numRows, int32 numCols ); +Matrix *makeMatrix_Flat( int32 numRows, int32 numCols ); +Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols ); +void freeMatrix_Flat( Matrix * matrix ); +void freeMatrix( Matrix * matrix ); +void printMatrix( Matrix *matrix ); + +void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ); + +void +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, + ParamBag *paramBag ); + +//=========================================================================== + +#endif /*MATRIX_MULT_H_*/ diff -r f8d3f16e70c8 -r 69928a38d5af Vthread__Matrix_Mult/Divide_Pr.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Vthread__Matrix_Mult/Divide_Pr.c Mon Sep 17 18:19:52 2012 -0700 @@ -0,0 +1,646 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + + +#include "Vthread__Matrix_Mult.h" +#include +#include +#include "Vthread_impl/Vthread.h" + + //The time to compute this many result values should equal the time to + // perform this division on a matrix of size gives that many result calcs + //IE, size this so that sequential time to calc equals divide time + // find the value by experimenting -- but divide time and calc time scale + // same way, so this value should remain valid across hardware +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000 + + +//=========================================================================== +int inline +measureMatrixMultPrimitive( SlaveVP *animPr ); + +SlicingStrucCarrier * +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, + SlaveVP *animPr ); + +SlicingStruc * +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, + SlaveVP *animPr ); + +void +freeSlicingStruc( SlicingStruc *slicingStruc, SlaveVP *animPr ); + +SubMatrix ** +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + int32 numUses, Matrix *origMatrix, SlaveVP *animPr ); + +void +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + SubMatrix **subMatrices, SlaveVP *animPr ); + +void +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, + SubMatrix **rightSubMatrices, + int32 numRowIdxs, int32 numColIdxs, + int32 numVecIdxs, + SlaveVP *resultPr, + SlaveVP *animatingPr ); + +void +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, + SlicingStrucCarrier *slicingStrucCarrier, + SlaveVP *resultPr, SlaveVP *animatingPr ); + +void inline +copyTranspose( int32 numRows, int32 numCols, + int32 origStartRow, int32 origStartCol, int32 origStride, + float32 *subArray, float32 *origArray ); + +void inline +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, + int32 numResCols, + float32 *leftArray, float32 *rightArray, + float32 *resArray ); + + + +/*Divider creates one processor for every sub-matrix + * It hands them: + * the name of the result processor that they should send their results to, + * the left and right matrices, and the rows and cols they should multiply + * It first creates the result processor, then all the sub-matrixPair + * processors, + * then does a receive of a message from the result processor that gives + * the divider ownership of the result matrix. + * Finally, the divider returns the result matrix out of the Vthread system. + * + * Divider chooses the size of sub-matrices via an algorithm that tries to + * keep the minimum work above a threshold. The threshold is machine- + * dependent, so ask Vthread for min work-unit time to get a + * given overhead + * + * Divide min work-unit cycles by measured-cycles for one matrix-cell + * product -- gives the number of products need to have in min size + * matrix. + * + * So then, take cubed root of this to get the size of a side of min sub- + * matrix. That is the size of the ideal square sub-matrix -- so tile + * up the two input matrices into ones as close as possible to that size, + * and create the pairs of sub-matrices. + * + *======================== STRATEGIC OVERVIEW ======================= + * + *This division is a bit tricky, because have to create things in advance + * that it's not at first obvious need to be created.. + * + *First slice up each dimension -- three of them.. this is because will have + * to create the sub-matrix's data-structures before pairing the sub-matrices + * with each other -- so, have three dimensions to slice up before can + * create the sub-matrix data-strucs -- also, have to be certain that the + * cols of the left input have the exact same slicing as the rows of the + * left matrix, so just to be sure, do the slicing calc once, then use it + * for both. + * + *So, goes like this: + *1) calculate the start & end values of each dimension in each matrix. + *2) use those values to create sub-matrix structures + *3) combine sub-matrices into pairs, as the tasks to perform. + * + *Have to calculate separately from creating the sub-matrices because of the + * nature of the nesting -- would either end up creating the same sub-matrix + * multiple times, or else would have to put in detection of whether had + * made a particular one already if tried to combine steps 1 and 2. + * + *Step 3 has to be separate because of the nesting, as well -- same reason, + * would either create same sub-matrix multiple times, or else have to + * add detection of whether was already created. + * + *Another way to look at it: there's one level of loop to divide dimensions, + * two levels of nesting to create sub-matrices, and three levels to pair + * up the sub-matrices. + */ + +void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, + SlaveVP *animatingThd ) + { SlaveVP *resultPr; + DividerParams *dividerParams; + ResultsParams *resultsParams; + Matrix *leftMatrix, *rightMatrix; + SlicingStrucCarrier *slicingStrucCarrier; + float32 *resultArray; //points to array inside result matrix + MatrixMultGlobals *globals; + + DEBUG( dbgAppFlow, "start divide\n") + + int32 + divideProbe = VMS_App__create_single_interval_probe( "divideProbe", + animatingThd ); + VMS_App__record_sched_choice_into_probe( divideProbe, animatingThd ); + VMS_App__record_interval_start_in_probe( divideProbe ); + + //=========== Setup -- make local copies of ptd-to-things, malloc, aso + int32 numResRows, numResCols, vectLength; + + dividerParams = (DividerParams *)_dividerParams; + + leftMatrix = dividerParams->leftMatrix; + rightMatrix = dividerParams->rightMatrix; + + vectLength = leftMatrix->numCols; + numResRows = leftMatrix->numRows; + numResCols = rightMatrix->numCols; + resultArray = dividerParams->resultMatrix->array; + + //zero the result array + memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); + + //============== Do either sequential mult or do division ============== + + //Check if input matrices too small -- if yes, just do sequential + //Cutoff is determined by overhead of this divider -- relatively + // machine-independent + if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * + (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) + { + //====== Do sequential multiply on a single core + DEBUG( dbgAppFlow, "doing sequential") + + //transpose the right matrix + float32 * + transRightArray = + Vthread__malloc( (size_t)rightMatrix->numRows * + (size_t)rightMatrix->numCols * + sizeof(float32), animatingThd ); + + //copy values from orig matrix to local + copyTranspose( rightMatrix->numRows, rightMatrix->numCols, + 0, 0, rightMatrix->numRows, + transRightArray, rightMatrix->array ); + + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, + leftMatrix->array, transRightArray, + resultArray ); + } + else + { + //====== Do parallel multiply across cores + + DEBUG( dbgAppFlow, "divider: do parallel mult\n") + //Calc the ideal size of sub-matrix and slice up the dimensions of + // the two matrices. + //The ideal size is the one takes the number of cycles to calculate + // such that calc time is equal or greater than min work-unit size + slicingStrucCarrier = + calcIdealSizeAndSliceDimensions(leftMatrix,rightMatrix,animatingThd); + + //Make the results processor, now that know how many to wait for + resultsParams = Vthread__malloc( sizeof(ResultsParams), animatingThd ); + resultsParams->numSubMatrixPairs = + slicingStrucCarrier->leftRowSlices->numVals * + slicingStrucCarrier->rightColSlices->numVals * + slicingStrucCarrier->vecSlices->numVals; + resultsParams->dividerPr = animatingThd; + resultsParams->numCols = rightMatrix->numCols; + resultsParams->numRows = leftMatrix->numRows; + resultsParams->resultArray = resultArray; + + //========== Set up global vars, including conds and mutexes ========== + globals = VMS_App__malloc( sizeof(MatrixMultGlobals) ); + Vthread__set_globals_to( globals ); + + globals->results_mutex = Vthread__make_mutex( animatingThd ); + globals->results_cond = Vthread__make_cond( globals->results_mutex, + animatingThd ); + + globals->vector_mutex = Vthread__make_mutex( animatingThd ); + globals->vector_cond = Vthread__make_cond( globals->vector_mutex, + animatingThd ); + + globals->start_mutex = Vthread__make_mutex( animatingThd ); + globals->start_cond = Vthread__make_cond( globals->start_mutex, + animatingThd ); + //====================================================================== + + DEBUG( dbgAppFlow, "divider: made mutexes and conds\n") + //get results-comm lock before create results-thd, to ensure it can't + // signal that results are available before this thd is waiting on cond + Vthread__mutex_lock( globals->results_mutex, animatingThd ); + + //also get the start lock & use to ensure no vector threads send a + // signal before the results thread is waiting on vector cond + Vthread__mutex_lock( globals->start_mutex, animatingThd ); + + + DEBUG( dbgAppFlow, "divider: make result thread\n") + Vthread__create_thread( &gatherResults, resultsParams, animatingThd ); + + //Now wait for results thd to signal that it has vector lock + Vthread__cond_wait( globals->start_cond, animatingThd ); + Vthread__mutex_unlock( globals->start_mutex, animatingThd );//done w/lock + DEBUG( dbgAppFlow, "divider: make sub-matrices\n") + + //Make the sub-matrices, and pair them up, and make processor to + // calc product of each pair. + makeSubMatricesAndProcrs( leftMatrix, rightMatrix, + slicingStrucCarrier, + resultPr, animatingThd); + + //Wait for results thread to say results are good + Vthread__cond_wait( globals->results_cond, animatingThd ); + } + + + //=============== Work done -- send results back ================= + + + DEBUG( dbgAppFlow, "end divide\n") + + VMS_App__record_interval_end_in_probe( divideProbe ); + VMS_App__print_stats_of_all_probes(); + + //nothing left to do so dissipate, Vthread will wait to shutdown, + // making results available to outside, until all the processors have + // dissipated -- so actually no need to wait for results processor + //However, following the pattern, so done with comm, release lock + Vthread__mutex_unlock( globals->results_mutex, animatingThd ); + + Vthread__dissipate_thread( animatingThd ); //all procrs dissipate self at end + //when all of the processors have dissipated, the "create seed and do + // work" call in the entry point function returns + } + + +SlicingStrucCarrier * +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, + SlaveVP *animPr ) + { + float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; + SlicingStrucCarrier *slicingStrucCarrier = + Vthread__malloc(sizeof(SlicingStrucCarrier), animPr); + + int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; + float64 numPrimitiveOpsInMinWorkUnit; + + + //======= Calc ideal size of min-sized sub-matrix ======== + + //ask Vthread for the number of cycles of the minimum work unit, at given + // percent overhead then add a guess at overhead from this divider + minWorkUnitCycles = Vthread__giveMinWorkUnitCycles( .05 ); + + //ask Vthread for number of cycles of the "primitive" op of matrix mult + primitiveCycles = measureMatrixMultPrimitive( animPr ); + + numPrimitiveOpsInMinWorkUnit = + (float64)minWorkUnitCycles / (float64)primitiveCycles; + + //take cubed root -- that's number of these in a "side" of sub-matrix + // then multiply by 5 because the primitive is 5x5src/Application/Vthread__Matrix_Mult/Divide_Pr.c:403: warning: implicit declaration of function ‘Vthread__give_number_of_cores_to_schedule_onto’ + idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); + + idealNumWorkUnits = Vthread__giveIdealNumWorkUnits(); + + idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); + idealSizeOfSide2 *= 0.6; //finer granularity to help load balance + + if( idealSizeOfSide1 > idealSizeOfSide2 ) + idealSizeOfSide = idealSizeOfSide1; + else + idealSizeOfSide = idealSizeOfSide2; + + //The multiply inner loop blocks the array to fit into L1 cache +// if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK; + + //============ Slice up dimensions, now that know target size =========== + + //Tell the slicer the target size of a side (floating pt), the start + // value to start slicing at, and the end value to stop slicing at + //It returns an array of start value of each chunk, plus number of them + int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol; + startLeftRow = 0; + endLeftRow = leftMatrix->numRows -1; + startVec = 0; + endVec = leftMatrix->numCols -1; + startRightCol = 0; + endRightCol = rightMatrix->numCols -1; + + leftRowSlices = + sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr ); + + vecSlices = + sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr ); + + rightColSlices = + sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr); + + slicingStrucCarrier->leftRowSlices = leftRowSlices; + slicingStrucCarrier->vecSlices = vecSlices; + slicingStrucCarrier->rightColSlices = rightColSlices; + + return slicingStrucCarrier; + } + + +void +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, + SlicingStrucCarrier *slicingStrucCarrier, + SlaveVP *resultPr, SlaveVP *animPr ) + { + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; + + leftRowSlices = slicingStrucCarrier->leftRowSlices; + vecSlices = slicingStrucCarrier->vecSlices; + rightColSlices = slicingStrucCarrier->rightColSlices; + Vthread__free( slicingStrucCarrier, animPr ); + + //================ Make sub-matrices, given the slicing ================ + SubMatrix **leftSubMatrices, **rightSubMatrices; + leftSubMatrices = + createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals, + leftMatrix, animPr ); + //double_check_that_always_numRows_in_right_same_as_numCols_in_left(); + rightSubMatrices = + createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals, + rightMatrix, animPr ); + + + //============== pair the sub-matrices and make processors ============== + int32 numRowIdxs, numColIdxs, numVecIdxs; + + numRowIdxs = leftRowSlices->numVals; + numColIdxs = rightColSlices->numVals; + numVecIdxs = vecSlices->numVals; + + freeSlicingStruc( leftRowSlices, animPr ); + freeSlicingStruc( vecSlices, animPr ); + freeSlicingStruc( rightColSlices, animPr ); + + pairUpSubMatricesAndMakeProcessors( leftSubMatrices, + rightSubMatrices, + numRowIdxs, numColIdxs, + numVecIdxs, + resultPr, + animPr ); + } + + + + +void +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, + SubMatrix **rightSubMatrices, + int32 numRowIdxs, int32 numColIdxs, + int32 numVecIdxs, + SlaveVP *resultPr, + SlaveVP *animatingPr ) + { + int32 resRowIdx, resColIdx, vecIdx; + int32 numLeftColIdxs, numRightColIdxs; + int32 leftRowIdxOffset; + SMPairParams *subMatrixPairParams; + float32 numToPutOntoEachCore, leftOverFraction; + int32 numCores, coreToScheduleOnto, numVecOnCurrCore; + + numLeftColIdxs = numColIdxs; + numRightColIdxs = numVecIdxs; + + numCores = Vthread__give_number_of_cores_to_schedule_onto(); + + numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; + leftOverFraction = 0; + numVecOnCurrCore = 0; + coreToScheduleOnto = 0; + + for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) + { + leftRowIdxOffset = resRowIdx * numLeftColIdxs; + + for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) + { + + for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) + { + //Make the processor for the pair of sub-matrices + subMatrixPairParams = Vthread__malloc( sizeof(SMPairParams), + animatingPr); + subMatrixPairParams->leftSubMatrix = + leftSubMatrices[ leftRowIdxOffset + vecIdx ]; + + subMatrixPairParams->rightSubMatrix = + rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; + + //subMatrixPairParams->resultPr = resultPr; + + //put all pairs from the same vector onto same core + Vthread__create_thread_with_affinity( &calcSubMatrixProduct, + subMatrixPairParams, + animatingPr, + coreToScheduleOnto ); + } + + //Trying to distribute the subMatrix-vectors across the cores, so + // that each core gets the same number of vectors, with a max + // imbalance of 1 vector more on some cores than others + numVecOnCurrCore += 1; + if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 ) + { + //deal with fractional part, to ensure that imbalance is 1 max + // IE, core with most has only 1 more than core with least + leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore; + if( leftOverFraction >= 1 ) + { leftOverFraction -= 1; + numVecOnCurrCore = -1; + } + else + { numVecOnCurrCore = 0; + } + //Move to next core, max core-value to incr to is numCores -1 + if( coreToScheduleOnto >= numCores -1 ) + { coreToScheduleOnto = 0; + } + else + { coreToScheduleOnto += 1; + } + } + + } + } + } + + + +/*Walk through the two slice-strucs, making sub-matrix strucs as go + */ +SubMatrix ** +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + int32 numUses, Matrix *origMatrix, SlaveVP *animPr ) + { + int32 numRowIdxs, numColIdxs, rowIdx, colIdx; + int32 startRow, endRow, startCol, endCol; + int32 *rowStartVals, *colStartVals; + int32 rowOffset; + SubMatrix **subMatrices, *newSubMatrix; + + numRowIdxs = rowSlices->numVals; + numColIdxs = colSlices->numVals; + + rowStartVals = rowSlices->startVals; + colStartVals = colSlices->startVals; + + subMatrices = Vthread__malloc((size_t)numRowIdxs * + (size_t)numColIdxs * sizeof(SubMatrix*),animPr ); + + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) + { + rowOffset = rowIdx * numColIdxs; + + startRow = rowStartVals[rowIdx]; + endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is + // at last valid idx + 1 & is + // 1 greater than end value + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) + { + startCol = colStartVals[colIdx]; + endCol = colStartVals[colIdx + 1] -1; + + newSubMatrix = Vthread__malloc( sizeof(SubMatrix), animPr ); + newSubMatrix->numRows = endRow - startRow +1; + newSubMatrix->numCols = endCol - startCol +1; + newSubMatrix->origMatrix = origMatrix; + newSubMatrix->origStartRow = startRow; + newSubMatrix->origStartCol = startCol; + newSubMatrix->alreadyCopied = FALSE; + newSubMatrix->numUsesLeft = numUses; //can free after this many + //Prevent uninitialized memory + newSubMatrix->copySingleton = NULL; + newSubMatrix->copyTransSingleton = NULL; + + subMatrices[ rowOffset + colIdx ] = newSubMatrix; + } + } + return subMatrices; + } + + +void +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + SubMatrix **subMatrices, SlaveVP *animPr ) + { + int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; + SubMatrix *subMatrix; + + numRowIdxs = rowSlices->numVals; + numColIdxs = colSlices->numVals; + + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) + { + rowOffset = rowIdx * numColIdxs; + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) + { + subMatrix = subMatrices[ rowOffset + colIdx ]; + if( subMatrix->alreadyCopied ) + Vthread__free( subMatrix->array, animPr ); + Vthread__free( subMatrix, animPr ); + } + } + Vthread__free( subMatrices, animPr ); + } + + + +SlicingStruc * +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, + SlaveVP *animPr ) + { float32 residualAcc = 0; + int numSlices, i, *startVals, sizeOfSlice, endCondition; + SlicingStruc *slicingStruc = Vthread__malloc(sizeof(SlicingStruc), animPr); + + //calc size of matrix need to hold start vals -- + numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); + + startVals = Vthread__malloc( ((size_t)numSlices + 1) * sizeof(int32), animPr ); + + //Calc the upper limit of start value -- when get above this, end loop + // by saving highest value of the matrix dimension to access, plus 1 + // as the start point of the imaginary slice following the last one + //Plus 1 because go up to value but not include when process last slice + //The stopping condition is half-a-size less than highest value because + // don't want any pieces smaller than half the ideal size -- just tack + // little ones onto end of last one + endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size + for( i = 0; startVal <= endVal; i++ ) + { + startVals[i] = startVal; + residualAcc += idealSizeOfSide; + sizeOfSlice = (int)residualAcc; + residualAcc -= (float32)sizeOfSlice; + startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. + + if( startVal > endCondition ) + { startVal = endVal + 1; + startVals[ i + 1 ] = startVal; + } + } + + slicingStruc->startVals = startVals; + slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 + // which means is num sub-matrices in dim + // also == idx of the fake start just above + return slicingStruc; + } + +void +freeSlicingStruc( SlicingStruc *slicingStruc, SlaveVP *animPr ) + { + Vthread__free( slicingStruc->startVals, animPr ); + Vthread__free( slicingStruc, animPr ); + } + + +int inline +measureMatrixMultPrimitive( SlaveVP *animPr ) + { + int r, c, v, numCycles; + float32 *res, *left, *right; + + //setup inputs + left = Vthread__malloc( 5 * 5 * sizeof( float32 ), animPr ); + right = Vthread__malloc( 5 * 5 * sizeof( float32 ), animPr ); + res = Vthread__malloc( 5 * 5 * sizeof( float32 ), animPr ); + + for( r = 0; r < 5; r++ ) + { + for( c = 0; c < 5; c++ ) + { + left[ r * 5 + c ] = r; + right[ r * 5 + c ] = c; + } + } + + //do primitive + Vthread__start_primitive(); //for now, just takes time stamp + for( r = 0; r < 5; r++ ) + { + for( c = 0; c < 5; c++ ) + { + for( v = 0; v < 5; v++ ) + { + res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; + } + } + } + numCycles = + Vthread__end_primitive_and_give_cycles(); + + Vthread__free( left, animPr ); + Vthread__free( right, animPr ); + Vthread__free( res, animPr ); + + return numCycles; + } diff -r f8d3f16e70c8 -r 69928a38d5af Vthread__Matrix_Mult/EntryPoint.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Vthread__Matrix_Mult/EntryPoint.c Mon Sep 17 18:19:52 2012 -0700 @@ -0,0 +1,120 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#include + +#include "Vthread__Matrix_Mult.h" + + + +/*Every program for a VMS-based language has an "entry point" function that + * creates the first + * processor, which starts the chain of creating more processors.. + * eventually all of the processors will dissipate themselves, and + * return. + * + *This entry-point function follows the same pattern as all entry-point + * functions do: + *1) it creates the params for the seed processor, from the + * parameters passed into the entry-point function + *2) it calls create_seed_slaveVP_and_do_work + *3) it gets the return value from the params struc, frees the params struc, + * and returns the value from the function + * + */ +Matrix * +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) + { Matrix *resMatrix; + DividerParams *dividerParams; + int32 numResRows, numResCols; + + + dividerParams = malloc( sizeof( DividerParams ) ); + dividerParams->leftMatrix = leftMatrix; + dividerParams->rightMatrix = rightMatrix; + + + numResRows = leftMatrix->numRows; + numResCols = rightMatrix->numCols; + + //VMS has its own separate internal malloc, so to get results out, + // have to pass in empty array for it to fill up + //The alternative is internally telling SSR make external space to use + resMatrix = malloc( sizeof(Matrix) ); + resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); + resMatrix->numCols = rightMatrix->numCols; + resMatrix->numRows = leftMatrix->numRows; + + + dividerParams->resultMatrix = resMatrix; + + //create divider processor, start doing the work, and wait till done + //This function is the "border crossing" between normal code and SSR + Vthread__create_seed_slaveVP_and_do_work(÷WorkIntoSubMatrixPairProcrs, + dividerParams ); + + free( dividerParams ); + return resMatrix; + } + + +Matrix * +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) + { Matrix *resMatrix; + DividerParams *dividerParams; + int32 numResRows, numResCols; + + + dividerParams = malloc( sizeof( DividerParams ) ); + dividerParams->leftMatrix = leftMatrix; + dividerParams->rightMatrix = rightMatrix; + + + numResRows = leftMatrix->numRows; + numResCols = rightMatrix->numCols; + + //VMS has its own separate internal malloc, so to get results out, + // have to pass in empty array for it to fill up + //The alternative is internally telling SSR make external space to use + resMatrix = malloc( sizeof(Matrix) ); + resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); + resMatrix->numCols = rightMatrix->numCols; + resMatrix->numRows = leftMatrix->numRows; + + + dividerParams->resultMatrix = resMatrix; + + //Vthread__start_lang_running(); + //VMS__start_lang_running( Vthread ); + +/* + matrixMultProcessID = + Vthread__spawn_program_on_data(÷WorkIntoSubMatrixPairProcrs, + dividerParams); +*/ + VMS__start_VMS_running( ); + + VMSProcessID matrixMultProcessID; + + matrixMultProcessID = + VMS__spawn_program_on_data_in_Lang( &prog_seed_fn, data, Vthread_lang ); + + resMatrix = VMS__give_results_when_done_for( matrixMultProcessID ); + +// Vthread__give_results_when_done_for( matrixMultProcessID ); + + //VMS__shutdown_lang( Vthread ); + + //Vthread__shutdown_lang(); + + VMS__shutdown(); + + free( dividerParams ); + + return resMatrix; + } diff -r f8d3f16e70c8 -r 69928a38d5af Vthread__Matrix_Mult/Result_Pr.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Vthread__Matrix_Mult/Result_Pr.c Mon Sep 17 18:19:52 2012 -0700 @@ -0,0 +1,142 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#include "Vthread__Matrix_Mult.h" + +//===================== +void inline +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, + int32 startRow, + int32 numRows, + int32 startCol, + int32 numCols, + int32 numOrigCols ); + +//=========================================================================== + +/*The Result Processor gets a message from each of the vector processors, + * puts the result from the message in its location in the result- + * matrix, and increments the count of results. + * + *After the count reaches the point that all results have been received, it + * returns the result matrix and dissipates. + */ +void gatherResults( void *_params, SlaveVP *animatingThd ) + { SlaveVP *dividerPr; + ResultsParams *params; + int numRows, numCols, numSubMatrixPairs, count=0; + float32 *resultArray; + SMPairParams *resParams; + //====================== thread stuff ======================= + MatrixMultGlobals *globals =(MatrixMultGlobals *)Vthread__give_globals(); + + + //get vector-comm lock before loop, so that this thd keeps lock after + // one wait until it enters the next wait -- forces see-saw btwn + // waiters and signalers -- wait-signal-wait-signal-... + Vthread__mutex_lock( globals->vector_mutex, animatingThd ); + + //Tell divider that have the vector lock -- so it's sure won't miss any + // signals from the vector-threads it's about to create + //Don't need a signal variable -- this thd can't be created until + // divider thd already has the start lock + Vthread__mutex_lock( globals->start_mutex, animatingThd );//finish wait + Vthread__cond_signal( globals->start_cond, animatingThd ); + Vthread__mutex_unlock( globals->start_mutex, animatingThd );//finish wait + //=========================================================== + + DEBUG( dbgAppFlow, "start resultPr\n") + + params = (ResultsParams *)_params; + dividerPr = params->dividerPr; + numSubMatrixPairs = params->numSubMatrixPairs; + numRows = params->numRows; + numCols = params->numCols; + + resultArray = params->resultArray; + + + while( count < numSubMatrixPairs ) + { + //receive a vector-result from a vector-thread + Vthread__cond_wait( globals->vector_cond, animatingThd ); + + //At this point, animating thread owns the vector lock, so all + // pairs trying to signal they have a result are waiting to get that + // lock -- only one gets it at a time, and when signal, this thd + // gets the lock and does the body of this loop, then when does the + // wait again, that releases the lock for next pair-thread to get it + resParams = globals->currSMPairParams; + + accumulateResult( resultArray, resParams->partialResultArray, + resParams->leftSubMatrix->origStartRow, + resParams->leftSubMatrix->numRows, + resParams->rightSubMatrix->origStartCol, + resParams->rightSubMatrix->numCols, + resParams->rightSubMatrix->origMatrix->numCols ); + + Vthread__free( resParams->partialResultArray, animatingThd ); + + //there is only one copy of results procr, so can update numUsesLeft + // without concurrency worries. When zero, free the sub-matrix + resParams->leftSubMatrix->numUsesLeft -= 1; + if( resParams->leftSubMatrix->numUsesLeft == 0 ) + { + Vthread__free( resParams->leftSubMatrix->array, animatingThd ); + Vthread__free( resParams->leftSubMatrix, animatingThd ); + } + + resParams->rightSubMatrix->numUsesLeft -= 1; + if( resParams->rightSubMatrix->numUsesLeft == 0 ) + { + Vthread__free( resParams->rightSubMatrix->array, animatingThd ); + Vthread__free( resParams->rightSubMatrix, animatingThd ); + } + + //count of how many sub-matrix pairs accumulated so know when done + count++; + } + + //Done -- could just dissipate -- SSR will wait for all processors to + // dissipate before shutting down, and thereby making results avaial to + // outside, so no need to stop the divider from dissipating, so no need + // to send a hand-shake message to it -- but makes debug easier + //However, following pattern, so all comms done, release lock + Vthread__mutex_unlock( globals->vector_mutex, animatingThd ); + + //Send result to divider (seed) thread + // note, divider thd had to hold the results-comm lock before creating + // this thread, to be sure no race + Vthread__mutex_lock( globals->results_mutex, animatingThd ); + //globals->results = resultMatrixArray; + Vthread__cond_signal( globals->results_cond, animatingThd ); + Vthread__mutex_unlock( globals->results_mutex, animatingThd ); //releases + //divider thread from its wait, at point this executes + + Vthread__dissipate_thread( animatingThd ); //frees any data owned by procr + } + +void inline +accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray, + int32 startRow, + int32 numRows, + int32 startCol, + int32 numCols, + int32 numOrigCols ) + { int32 row, col; + + for( row = 0; row < numRows; row++ ) + { + for( col = 0; col < numCols; col++ ) + { + resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] += + subMatrixPairResultArray[ row * numCols + col ]; + } + } + + } diff -r f8d3f16e70c8 -r 69928a38d5af Vthread__Matrix_Mult/Vthread__Matrix_Mult.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Vthread__Matrix_Mult/Vthread__Matrix_Mult.h Mon Sep 17 18:19:52 2012 -0700 @@ -0,0 +1,120 @@ +/* + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + */ + +#ifndef _SSR_MATRIX_MULT_H_ +#define _SSR_MATRIX_MULT_H_ + +#include + +#include "Vthread_impl/Vthread.h" +#include "../Matrix_Mult.h" + + +//=============================== Defines ============================== +#define ROWS_IN_BLOCK 32 +#define COLS_IN_BLOCK 32 +#define VEC_IN_BLOCK 32 + +#define copyMatrixSingleton 1 +#define copyTransposeSingleton 2 + +//============================== Structures ============================== +typedef struct + { + Matrix *leftMatrix; + Matrix *rightMatrix; + Matrix *resultMatrix; + } +DividerParams; + +typedef struct + { + SlaveVP *dividerPr; + int numRows; + int numCols; + int numSubMatrixPairs; + float32 *resultArray; + } +ResultsParams; + +typedef +struct + { int32 numRows; + int32 numCols; + Matrix *origMatrix; + int32 origStartRow; + int32 origStartCol; + int32 alreadyCopied; + VthdSingleton *copySingleton; + VthdSingleton *copyTransSingleton; + int32 numUsesLeft; //have update via message to avoid multiple writers + float32 *array; //2D, but dynamically sized, so use addr arith + } +SubMatrix; + +typedef struct + { SlaveVP *resultPr; + SubMatrix *leftSubMatrix; + SubMatrix *rightSubMatrix; + float32 *partialResultArray; + } +SMPairParams; + +typedef +struct + { int32 numVals; + int32 *startVals; + } +SlicingStruc; + +typedef +struct + { + SlicingStruc *leftRowSlices; + SlicingStruc *vecSlices; + SlicingStruc *rightColSlices; + } +SlicingStrucCarrier; + +enum MMMsgType + { + RESULTS_MSG = 1 + }; + + +typedef struct + { + //for communicating sub-matrix-pair results to results Thd + int32 vector_mutex; + int32 vector_cond; + SMPairParams *currSMPairParams; + + //for communicating results array back to seed (divider) Thd + int32 results_mutex; + int32 results_cond; + float32 *results; + + //for ensuring results thd has vector lock before making vector thds + int32 start_mutex; + int32 start_cond; + + Matrix *rightMatrix; + Matrix *resultMatrix; + } +MatrixMultGlobals; + + +//============================= Processor Functions ========================= +void divideWorkIntoSubMatrixPairProcrs( void *data, SlaveVP *animatingPr ); +void calcSubMatrixProduct( void *data, SlaveVP *animatingPr ); +void gatherResults( void *data, SlaveVP *animatingPr ); + + +//================================ Entry Point ============================== +Matrix * +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); + + +#endif /*_SSR_MATRIX_MULT_H_*/ diff -r f8d3f16e70c8 -r 69928a38d5af Vthread__Matrix_Mult/subMatrix_Pr.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Vthread__Matrix_Mult/subMatrix_Pr.c Mon Sep 17 18:19:52 2012 -0700 @@ -0,0 +1,310 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: SeanHalle@yahoo.com + * + */ + +#include + +#include "Vthread__Matrix_Mult.h" + +//Declarations +//=================================================== + +void inline +copyFromOrig( SubMatrix *subMatrix, SlaveVP *animPr ); + +void inline +copyTransposeFromOrig( SubMatrix *subMatrix, SlaveVP *animPr ); + +void inline +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, + float32 *resArray, + int startRow, int endRow, + int startCol, int endCol, + int startVec, int endVec, + int resStride, int inpStride ); + +void inline +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, + float32 *leftArray, float32 *rightArray, + float32 *resArray ); + +//=================================================== + +/*A processor is created with an environment that holds two matrices, + * the row and col that it owns, and the name of a result gathering + * processor. + *It calculates the product of two sub-portions of the input matrices + * by using Intel's mkl library for single-core. + * + *This demonstrates using optimized single-threaded code inside scheduled + * work-units. + * + *When done, it sends the result to the result processor + */ +void +calcSubMatrixProduct( void *data, SlaveVP *animatingPr ) + { + SMPairParams *params; + SlaveVP *resultPr; + float32 *leftArray, *rightArray, *resArray; + SubMatrix *leftSubMatrix, *rightSubMatrix; + MatrixMultGlobals *globals =(MatrixMultGlobals *)Vthread__give_globals(); + + DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID) + #ifdef TURN_ON_DEBUG_PROBES + int32 subMatrixProbe = VMS_App__create_single_interval_probe( "subMtx", + animatingPr); + VMS_App__record_sched_choice_into_probe( subMatrixProbe, animatingPr ); + VMS_App__record_interval_start_in_probe( subMatrixProbe ); + #endif + + params = (SMPairParams *)data; + resultPr = params->resultPr; + leftSubMatrix = params->leftSubMatrix; + rightSubMatrix = params->rightSubMatrix; + + //make sure the input sub-matrices have been copied out of orig + //do it here, inside sub-matrix pair to hopefully gain reuse in cache + copyFromOrig( leftSubMatrix, animatingPr ); + copyTransposeFromOrig( rightSubMatrix, animatingPr ); + + leftArray = leftSubMatrix->array; + rightArray = rightSubMatrix->array; + + size_t resSize = (size_t)leftSubMatrix->numRows * + (size_t)rightSubMatrix->numCols * sizeof(float32); + resArray = Vthread__malloc( resSize, animatingPr ); + memset( resArray, 0, resSize ); + + + int32 numResRows, numResCols, vectLength; + + vectLength = leftSubMatrix->numCols; + numResRows = leftSubMatrix->numRows; + numResCols = rightSubMatrix->numCols; + + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, + leftArray, rightArray, + resArray ); + + //send result to result processor + params->partialResultArray = resArray; + + #ifdef TURN_ON_DEBUG_PROBES + VMS_App__record_interval_end_in_probe( subMatrixProbe ); + #endif + + //Send result to results thread + //This pattern works 'cause only get lock when results thd inside wait + Vthread__mutex_lock( globals->vector_mutex, animatingPr ); + globals->currSMPairParams = params; + Vthread__cond_signal( globals->vector_cond, animatingPr ); + Vthread__mutex_unlock( globals->vector_mutex, animatingPr );//release + //wait-er -- cond_signal implemented such that wait-er gets lock, no other + + DEBUG1(dbgAppFlow, "end sub-matrix mult: %d\n", animatingPr->procrID) + Vthread__dissipate_thread( animatingPr ); + } + + + +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into + * the 32KB L1 cache. + *Would be nice to embed this within another level that divided into + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache + * + *Eventually want these divisions to be automatic, using DKU pattern + * embedded into VMS and exposed in the language, and with VMS controlling the + * divisions according to the cache sizes, which it knows about. + *Also, want VMS to work with language to split among main-mems, so a socket + * only cranks on data in its local segment of main mem + * + *So, outer two loops determine start and end points within the result matrix. + * Inside that, a loop dets the start and end points along the shared dimensions + * of the two input matrices. + */ +void inline +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, + int32 numResCols, + float32 *leftArray, float32 *rightArray, + float32 *resArray ) + { + int resStride, inpStride; + int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; + + resStride = numResCols; + inpStride = vecLength; + + for( resStartRow = 0; resStartRow < numResRows; ) + { + resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 + if( resEndRow > numResRows ) resEndRow = numResRows -1; + + for( resStartCol = 0; resStartCol < numResCols; ) + { + resEndCol = resStartCol + COLS_IN_BLOCK -1; + if( resEndCol > numResCols ) resEndCol = numResCols -1; + + for( startVec = 0; startVec < vecLength; ) + { + endVec = startVec + VEC_IN_BLOCK -1; + if( endVec > vecLength ) endVec = vecLength -1; + + //By having the "vector" of sub-blocks in a sub-block slice + // be marched down in inner loop, are re-using the result + // matrix, which stays in L1 cache and re-using the left sub-mat + // which repeats for each right sub-mat -- can only re-use two of + // the three, so result is the most important -- avoids writing + // dirty blocks until those result-locations fully done + //Row and Col is position in result matrix -- so row and vec + // for left array, then vec and col for right array + multiplySubBlocksTransposed( leftArray, rightArray, + resArray, + resStartRow, resEndRow, + resStartCol, resEndCol, + startVec, endVec, + resStride, inpStride ); + startVec = endVec +1; + } + resStartCol = resEndCol +1; + } + resStartRow = resEndRow +1; + } + } + + + +void inline +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, + float32 *resArray, + int resStartRow, int resEndRow, + int resStartCol, int resEndCol, + int startVec, int endVec, + int resStride, int inpStride ) + { + int resRow, resCol, vec; + int leftOffset, rightOffset; + float32 result; + + //The result row is used only for the left matrix, res col for the right + for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) + { + for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) + { + leftOffset = resRow * inpStride;//left & right inp strides always same + rightOffset = resCol * inpStride;// because right is transposed + result = 0; + for( vec = startVec; vec <= endVec; vec++ ) + { + result += + leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; + } + + resArray[ resRow * resStride + resCol ] += result; + } + } + } + + + + +/*Reuse this in divider when do the sequential multiply case + */ +void inline +copyTranspose( int32 numRows, int32 numCols, + int32 origStartRow, int32 origStartCol, int32 origStride, + float32 *subArray, float32 *origArray ) + { int32 stride = numRows; + + int row, col, origOffset; + for( row = 0; row < numRows; row++ ) + { + origOffset = (row + origStartRow) * origStride + origStartCol; + for( col = 0; col < numCols; col++ ) + { + //transpose means swap row & col -- traverse orig matrix normally + // but put into reversed place in local array -- means the + // stride is the numRows now, so col * numRows + row + subArray[ col * stride + row ] = origArray[ origOffset + col ]; + } + } + } + +void inline +copyTransposeFromOrig( SubMatrix *subMatrix, SlaveVP *animPr ) + { int numCols, numRows, origStartRow, origStartCol, origStride; + Matrix *origMatrix; + float32 *origArray, *subArray; + + Vthread__start_data_singleton( subMatrix->copyTransSingleton, animPr ); + + origMatrix = subMatrix->origMatrix; + origArray = origMatrix->array; + numCols = subMatrix->numCols; + numRows = subMatrix->numRows; + origStartRow = subMatrix->origStartRow; + origStartCol = subMatrix->origStartCol; + origStride = origMatrix->numCols; + + subArray = Vthread__malloc( (size_t)numRows * + (size_t)numCols *sizeof(float32),animPr); + subMatrix->array = subArray; + + //copy values from orig matrix to local + copyTranspose( numRows, numCols, + origStartRow, origStartCol, origStride, + subArray, origArray ); + + Vthread__end_data_singleton( subMatrix->copyTransSingleton, animPr ); + + } + + +void inline +copyFromOrig( SubMatrix *subMatrix, SlaveVP *animPr ) + { int numCols, numRows, origStartRow, origStartCol, stride, origStride; + Matrix *origMatrix; + float32 *origArray, *subArray; + + + //This lets only a single VP execute the code between start and + // end -- using start and end so that work runs outside the master. + //If a second VP ever executes the start, it will be returned + // from the end-point. If its execution starts after another but before + // that other has finished, this one will remain suspended until the + // other finishes, then be resumed from the end-point. + Vthread__start_data_singleton( subMatrix->copySingleton, animPr ); + + + origMatrix = subMatrix->origMatrix; + origArray = origMatrix->array; + numCols = subMatrix->numCols; + numRows = subMatrix->numRows; + origStartRow = subMatrix->origStartRow; + origStartCol = subMatrix->origStartCol; + origStride = origMatrix->numCols; + + subArray = Vthread__malloc( (size_t)numRows * + (size_t)numCols *sizeof(float32),animPr); + subMatrix->array = subArray; + + //copy values from orig matrix to local + stride = numCols; + + int row, col, offset, origOffset; + for( row = 0; row < numRows; row++ ) + { + offset = row * stride; + origOffset = (row + origStartRow) * origStride + origStartCol; + for( col = 0; col < numCols; col++ ) + { + subArray[ offset + col ] = origArray[ origOffset + col ]; + } + } + + Vthread__end_data_singleton( subMatrix->copySingleton, animPr ); + } diff -r f8d3f16e70c8 -r 69928a38d5af __brch__default --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/__brch__default Mon Sep 17 18:19:52 2012 -0700 @@ -0,0 +1,3 @@ +This branch is for the project structure defined Jan 2012.. the #includes reflect this directory structure. + +The default branch of the application directory is likely to be the only branch diff -r f8d3f16e70c8 -r 69928a38d5af main.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/main.c Mon Sep 17 18:19:52 2012 -0700 @@ -0,0 +1,480 @@ +/* + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "VMS_Implementations/Vthread_impl/VPThread.h" +#include "C_Libraries/Queue_impl/PrivateQueue.h" + +#include +#include +#include + +#undef DEBUG +//#define DEBUG + +#define MEASURE_PERF + +#if !defined(unix) && !defined(__unix__) +#ifdef __MACH__ +#define unix 1 +#define __unix__ 1 +#endif /* __MACH__ */ +#endif /* unix */ + +/* find the appropriate way to define explicitly sized types */ +/* for C99 or GNU libc (also mach's libc) we can use stdint.h */ +#if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__) +#include +#elif defined(unix) || defined(__unix__) /* some UNIX systems have them in sys/types.h */ +#include +#elif defined(__WIN32__) || defined(WIN32) /* the nameless one */ +typedef unsigned __int8 uint8_t; +typedef unsigned __int32 uint32_t; +#endif /* sized type detection */ + +/* provide a millisecond-resolution timer for each system */ +#if defined(unix) || defined(__unix__) +#include +#include +unsigned long get_msec(void) { + static struct timeval timeval, first_timeval; + + gettimeofday(&timeval, 0); + if(first_timeval.tv_sec == 0) { + first_timeval = timeval; + return 0; + } + return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000; +} +#elif defined(__WIN32__) || defined(WIN32) +#include +unsigned long get_msec(void) { + return GetTickCount(); +} +#else +//#error "I don't know how to measure time on your platform" +#endif + +//======================== Defines ========================= +typedef struct perfData measurement_t; +struct perfData{ + uint64 cycles; + uint64 instructions; +}; + +const char *usage = { + "Usage: malloc_test [options]\n" + " Spwans a number of threads and allocates memory.\n\n" + "Options:\n" + " -t how many threads to use (default: 1). This is internaly multiplied by the number of cores.\n" + " -o repeat workload and sync operation times\n" + " -i size of workload, repeat times\n" + " -h this help screen\n\n" +}; + +struct barrier_t +{ + int counter; + int nthreads; + int32 mutex; + int32 cond; + measurement_t endBarrierCycles; + +}; +typedef struct barrier_t barrier; + +void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr) + { + barr->counter = 0; + barr->nthreads = nthreads; + barr->mutex = VPThread__make_mutex(animatingPr); + barr->cond = VPThread__make_cond(barr->mutex, animatingPr); + } + +int cycles_counter_main_fd; +void inline barrier_wait(barrier *barr, VirtProcr *animatingPr) + { int i; + + VPThread__mutex_lock(barr->mutex, animatingPr); + barr->counter++; + if(barr->counter == barr->nthreads) + { +#ifdef MEASURE_PERF + read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \ + sizeof(barr->endBarrierCycles.cycles)); +#endif + + barr->counter = 0; + for(i=0; i < barr->nthreads; i++) + VPThread__cond_signal(barr->cond, animatingPr); + } + else + { VPThread__cond_wait(barr->cond, animatingPr); + } + VPThread__mutex_unlock(barr->mutex, animatingPr); + } + + + +typedef struct + { struct barrier_t* barrier; + uint64_t totalWorkCycles; + uint64_t totalBadCycles; + uint64_t totalSyncCycles; + uint64_t totalBadSyncCycles; + uint64 numGoodSyncs; + uint64 numGoodTasks; + } +WorkerParams; + + +typedef struct + { measurement_t *startExeCycles; + measurement_t *endExeCycles; + } +BenchParams; + +//======================== Globals ========================= +char __ProgrammName[] = "overhead_test"; +char __DataSet[255]; + +int outer_iters, inner_iters, num_threads; +size_t chunk_size = 0; + +int cycles_counter_fd[NUM_CORES]; +struct perf_event_attr* hw_event; + +WorkerParams *workerParamsArray; + +//======================== App Code ========================= +/* + * Workload + */ + +#define saveCyclesAndInstrs(core,cycles) do{ \ + int cycles_fd = cycles_counter_fd[core]; \ + int nread; \ + \ + nread = read(cycles_fd,&(cycles),sizeof(cycles)); \ + if(nread<0){ \ + perror("Error reading cycles counter"); \ + cycles = 0; \ + } \ +} while (0) //macro magic for scoping + + +double +worker_TLF(void* _params, VirtProcr* animatingPr) + { + int i,o; + WorkerParams* params = (WorkerParams*)_params; + unsigned int totalWorkCycles = 0, totalBadCycles = 0; + unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0; + unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0; + double workspace2=0.0; + int32 privateMutex = VPThread__make_mutex(animatingPr); + + int cpuid = sched_getcpu(); + + measurement_t startWorkload, endWorkload, startWorkload2, endWorkload2; + uint64 numCycles; + for(o=0; o < outer_iters; o++) + { +#ifdef MEASURE_PERF + saveCyclesAndInstrs(cpuid,startWorkload.cycles); +#endif + + //workltask + for(i=0; i < inner_iters; i++) + { + workspace1 += (workspace1 + 32)/2; + workspace2 += (workspace2 + 23.2)/1.4; + } + +#ifdef MEASURE_PERF + saveCyclesAndInstrs(cpuid,endWorkload.cycles); + numCycles = endWorkload.cycles - startWorkload.cycles; + //sanity check (400K is about 20K iters) + if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;} + else {totalBadCycles += numCycles; } +#endif + + //mutex access often causes switch to different Slave VP + VPThread__mutex_lock(privateMutex, animatingPr); + +/* + saveCyclesAndInstrs(cpuid,startWorkload2.cycles); + //Task + for(i=0; i < inner_iters; i++) + { + workspace1 += (workspace1 + 32)/2; + workspace2 += (workspace2 + 23.2)/1.4; + } + + saveCyclesAndInstrs(cpuid,endWorkload2.cycles); + numCycles = endWorkload2.cycles - startWorkload2.cycles; + //sanity check (400K is about 20K iters) + if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;} + else {totalBadCycles += numCycles; } + +*/ + VPThread__mutex_unlock(privateMutex, animatingPr); + } + + params->totalWorkCycles = totalWorkCycles; + params->totalBadCycles = totalBadCycles; + params->numGoodTasks = numGoodTasks; + params->totalSyncCycles = totalSyncCycles; + params->totalBadSyncCycles = totalBadSyncCycles; + params->numGoodSyncs = numGoodSyncs; +/* + params->totalSyncCycles = VMS__give_num_plugin_cycles(); + params->totalBadSyncCycles = 0; + params->numGoodSyncs = VMS__give_num_plugin_animations(); +*/ + + + //Wait for all threads to end + barrier_wait(params->barrier, animatingPr); + + //Shutdown worker + VPThread__dissipate_thread(animatingPr); + + //below return never reached --> there for gcc + return (workspace1 + workspace2); //to prevent gcc from optimizing work out + } + + +/* this is run after the VMS is set up*/ +void benchmark(void *_params, VirtProcr *animatingPr) + { + int i, cpuID; + struct barrier_t barr; + BenchParams *params; + + params = (BenchParams *)_params; + + barrier_init(&barr, num_threads+1, animatingPr); + + //prepare input + for(i=0; istartExeCycles; + +#ifdef MEASURE_PERF + int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles), + sizeof(startExeCycles->cycles)); + if(nread<0) perror("Error reading cycles counter"); +#endif + + //create (which starts running) all threads + for(i=0; iendExeCycles->cycles = barr.endBarrierCycles.cycles; +#endif + + +/* + uint64_t overallWorkCycles = 0; + for(i=0; itype = PERF_TYPE_HARDWARE; + hw_event->size = sizeof(hw_event); + hw_event->disabled = 0; + hw_event->freq = 0; + hw_event->inherit = 1; /* children inherit it */ + hw_event->pinned = 1; /* says this virt counter must always be on HW */ + hw_event->exclusive = 0; /* only group on PMU */ + hw_event->exclude_user = 0; /* don't count user */ + hw_event->exclude_kernel = 1; /* don't count kernel */ + hw_event->exclude_hv = 1; /* ditto hypervisor */ + hw_event->exclude_idle = 1; /* don't count when idle */ + hw_event->mmap = 0; /* include mmap data */ + hw_event->comm = 0; /* include comm data */ + + hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles + + int cpuID, retries; + + for( cpuID = 0; cpuID < NUM_CORES; cpuID++ ) + { retries = 0; + do + { retries += 1; + cycles_counter_fd[cpuID] = + syscall(__NR_perf_event_open, hw_event, + 0,//pid_t: 0 is "pid of calling process" + cpuID,//int: cpu, the value returned by "CPUID" instr(?) + -1,//int: group_fd, -1 is "leader" or independent + 0//unsigned long: flags + ); + } + while(cycles_counter_fd[cpuID]<0 && retries < 100); + if(retries >= 100) + { + fprintf(stderr,"On core %d: ",cpuID); + perror("Failed to open cycles counter"); + } + } + + //Set up counter to accumulate total cycles to process, across all CPUs + + retries = 0; + do + { retries += 1; + cycles_counter_main_fd = + syscall(__NR_perf_event_open, hw_event, + 0,//pid_t: 0 is "pid of calling process" + -1,//int: cpu, -1 means accumulate from all cores + -1,//int: group_fd, -1 is "leader" == independent + 0//unsigned long: flags + ); + } + while(cycles_counter_main_fd<0 && retries < 100); + if(retries >= 100) + { + fprintf(stderr,"in main "); + perror("Failed to open cycles counter"); + } +#endif + + measurement_t startExeCycles, endExeCycles; + BenchParams *benchParams; + + benchParams = malloc(sizeof(BenchParams)); + + benchParams->startExeCycles = &startExeCycles; + benchParams->endExeCycles = &endExeCycles; + + workerParamsArray = (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) ); + if(workerParamsArray == NULL ) printf("error mallocing worker params array\n"); + + + //This is the transition to the VMS runtime + VPThread__create_seed_procr_and_do_work( &benchmark, benchParams ); + +#ifdef MEASURE_PERF + uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0; + uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0; + for(i=0; i -#include - -#include "Matrix_Mult.h" -#include "ParamHelper/Param.h" - - - - void -initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, - ParamBag *paramBag ) - { char *leftMatrixFileName, *rightMatrixFileName; - int leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols; - - ParamStruc *param; - param = getParamFromBag( "leftMatrixRows", paramBag ); - leftMatrixRows = param->intValue; - param = getParamFromBag( "leftMatrixCols", paramBag ); - leftMatrixCols = param->intValue; - *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols ); - - param = getParamFromBag( "leftMatrixFileName", paramBag ); - leftMatrixFileName = param->strValue; //no need to copy - read_Matrix_From_File( *leftMatrix, leftMatrixFileName ); - - param = getParamFromBag( "rightMatrixRows", paramBag ); - rightMatrixRows = param->intValue; - param = getParamFromBag( "rightMatrixCols", paramBag ); - rightMatrixCols = param->intValue; - *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols ); - - param = getParamFromBag( "rightMatrixFileName", paramBag ); - rightMatrixFileName = param->strValue; - read_Matrix_From_File( *rightMatrix, rightMatrixFileName ); - } - - -void parseLineIntoRow( char *line, float32* row ); - - - void -read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ) - { int row, maxRead, numRows, numCols; - float32 *matrixStart; - size_t lineSz = 0; - FILE *file; - char *line = NULL; - - lineSz = 50000; //max length of line in a matrix data file - line = (char *) malloc( lineSz ); - if( line == NULL ) printf( "no mem for matrix line" ); - - numRows = matrixStruc->numRows; - numCols = matrixStruc->numCols; - matrixStart = matrixStruc->array; - - file = fopen( matrixFileName, "r" ); - if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} - fseek( file, 0, SEEK_SET ); - for( row = 0; row < numRows; row++ ) - { - if( feof( file ) ) printf( "file ran out too soon" ); - maxRead = getline( &line, &lineSz, file ); - if( maxRead == -1 ) printf( "prob reading mat line"); - - if( *line == '\n') continue; //blank line - if( *line == '/' ) continue; //comment line - - parseLineIntoRow( line, matrixStart + row * numCols ); - } - free( line ); - } - -/*This function relies on each line having the proper number of cols. It - * doesn't check, nor enforce, so if the file is improperly formatted it - * can write over unrelated memory - */ - void -parseLineIntoRow( char *line, float32* row ) - { - char *valueStr, *searchPos; - - //read the float values - searchPos = valueStr = line; //start - - for( ; *searchPos != 0; searchPos++) //bit dangerous, should use buff len - { - if( *searchPos == '\n' ) //last col.. relying on well-formatted file - { *searchPos = 0; - *row = atof( valueStr ); - break; //end FOR loop - } - if( *searchPos == ',' ) - { *searchPos = 0; //mark end of string - *row = (float32) atof( valueStr ); - row += 1; //address arith - //skip any spaces before digits.. use searchPos + 1 to skip the 0 - for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++); - valueStr = searchPos + 1; - } - } - } - - //========================================================================== - -/*In the "_Flat" version of constructor, do only malloc of the top data struc - * and set values in that top-level. Don't malloc any sub-structures. - */ - Matrix * -makeMatrix_Flat( int32 numRows, int32 numCols ) - { Matrix * retMatrix; - retMatrix = malloc( sizeof( Matrix ) ); - retMatrix->numRows = numRows; - retMatrix->numCols = numCols; - - return retMatrix; - } - - Matrix * -makeMatrix_WithResMat( int32 numRows, int32 numCols ) - { Matrix * retMatrix; - retMatrix = malloc( sizeof( Matrix ) ); - retMatrix->numRows = numRows; - retMatrix->numCols = numCols; - retMatrix->array = malloc( numRows * numCols * sizeof(float32) ); - - return retMatrix; - } - - void -freeMatrix_Flat( Matrix * matrix ) - { //( matrix ); - } - void -freeMatrix( Matrix * matrix ) - { free( matrix->array ); - free( matrix ); - } - -void -printMatrix( Matrix *matrix ) - { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; - float32 *matrixArray; - - numRows = rowsToPrint = matrix->numRows; - numCols = colsToPrint = matrix->numCols; - matrixArray = matrix->array; - - rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed - colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed - for( r = 0; r < numRows; r += rowIncr ) - { for( c = 0; c < numCols; c += colIncr ) - { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); - } - printf("\n"); - } - } - diff -r f8d3f16e70c8 -r 69928a38d5af src/Application/Matrix_Mult.h --- a/src/Application/Matrix_Mult.h Fri Sep 16 20:12:34 2011 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ -/* - * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org - * Licensed under GNU General Public License version 2 - */ - -#ifndef MATRIX_MULT_H_ -#define MATRIX_MULT_H_ - -#include -#include -#include - -#include "../VPThread_lib/VMS/VMS_primitive_data_types.h" -#include "ParamHelper/Param.h" - -//============================== Structures ============================== - -typedef -struct - { int32 numRows; - int32 numCols; - float32 *array; //2D, but dynamically sized, so use addr arith - } -Matrix; - -/* This is the "appSpecificPiece" that is carried inside a DKUPiece. - * In the DKUPiece data struc it is declared to be of type "void *". This - * allows the application to define any data structure it wants and put it - * into a DKUPiece. - * When the app specific info is used, it is in app code, so it is cast to - * the correct type to tell the compiler how to access fields. - * This keeps all app-specific things out of the DKU directory, as per the - * DKU standard. */ -typedef -struct - { - // pointers to shared data.. the result matrix must be created when the - // left and right matrices are put into the root ancestor DKUPiece. - Matrix * leftMatrix; - Matrix * rightMatrix; - Matrix * resultMatrix; - - // define the starting and ending boundaries for this piece of the - // result matrix. These are derivable from the left and right - // matrices, but included them for readability of code. - int prodStartRow, prodEndRow; - int prodStartCol, prodEndCol; - // Start and end of the portion of the left matrix that contributes to - // this piece of the product - int leftStartRow, leftEndRow; - int leftStartCol, leftEndCol; - // Start and end of the portion of the right matrix that contributes to - // this piece of the product - int rightStartRow, rightEndRow; - int rightStartCol, rightEndCol; - } -MatrixProdPiece; - -//============================== Functions ================================ -void readFile(); - -Matrix *makeMatrix( int32 numRows, int32 numCols ); -Matrix *makeMatrix_Flat( int32 numRows, int32 numCols ); -Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols ); -void freeMatrix_Flat( Matrix * matrix ); -void freeMatrix( Matrix * matrix ); -void printMatrix( Matrix *matrix ); - -void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ); - -void -initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, - ParamBag *paramBag ); - -//=========================================================================== - -#endif /*MATRIX_MULT_H_*/ diff -r f8d3f16e70c8 -r 69928a38d5af src/Application/VPThread__Matrix_Mult/Divide_Pr.c --- a/src/Application/VPThread__Matrix_Mult/Divide_Pr.c Fri Sep 16 20:12:34 2011 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,646 +0,0 @@ -/* - * Copyright 2009 OpenSourceStewardshipFoundation.org - * Licensed under GNU General Public License version 2 - * - * Author: seanhalle@yahoo.com - * - */ - - -#include "VPThread__Matrix_Mult.h" -#include -#include -#include "../../VPThread_lib/VPThread.h" - - //The time to compute this many result values should equal the time to - // perform this division on a matrix of size gives that many result calcs - //IE, size this so that sequential time to calc equals divide time - // find the value by experimenting -- but divide time and calc time scale - // same way, so this value should remain valid across hardware -#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000 - - -//=========================================================================== -int inline -measureMatrixMultPrimitive( VirtProcr *animPr ); - -SlicingStrucCarrier * -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, - VirtProcr *animPr ); - -SlicingStruc * -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, - VirtProcr *animPr ); - -void -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ); - -SubMatrix ** -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, - int32 numUses, Matrix *origMatrix, VirtProcr *animPr ); - -void -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, - SubMatrix **subMatrices, VirtProcr *animPr ); - -void -pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, - SubMatrix **rightSubMatrices, - int32 numRowIdxs, int32 numColIdxs, - int32 numVecIdxs, - VirtProcr *resultPr, - VirtProcr *animatingPr ); - -void -makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, - SlicingStrucCarrier *slicingStrucCarrier, - VirtProcr *resultPr, VirtProcr *animatingPr ); - -void inline -copyTranspose( int32 numRows, int32 numCols, - int32 origStartRow, int32 origStartCol, int32 origStride, - float32 *subArray, float32 *origArray ); - -void inline -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, - int32 numResCols, - float32 *leftArray, float32 *rightArray, - float32 *resArray ); - - - -/*Divider creates one processor for every sub-matrix - * It hands them: - * the name of the result processor that they should send their results to, - * the left and right matrices, and the rows and cols they should multiply - * It first creates the result processor, then all the sub-matrixPair - * processors, - * then does a receive of a message from the result processor that gives - * the divider ownership of the result matrix. - * Finally, the divider returns the result matrix out of the VPThread system. - * - * Divider chooses the size of sub-matrices via an algorithm that tries to - * keep the minimum work above a threshold. The threshold is machine- - * dependent, so ask VPThread for min work-unit time to get a - * given overhead - * - * Divide min work-unit cycles by measured-cycles for one matrix-cell - * product -- gives the number of products need to have in min size - * matrix. - * - * So then, take cubed root of this to get the size of a side of min sub- - * matrix. That is the size of the ideal square sub-matrix -- so tile - * up the two input matrices into ones as close as possible to that size, - * and create the pairs of sub-matrices. - * - *======================== STRATEGIC OVERVIEW ======================= - * - *This division is a bit tricky, because have to create things in advance - * that it's not at first obvious need to be created.. - * - *First slice up each dimension -- three of them.. this is because will have - * to create the sub-matrix's data-structures before pairing the sub-matrices - * with each other -- so, have three dimensions to slice up before can - * create the sub-matrix data-strucs -- also, have to be certain that the - * cols of the left input have the exact same slicing as the rows of the - * left matrix, so just to be sure, do the slicing calc once, then use it - * for both. - * - *So, goes like this: - *1) calculate the start & end values of each dimension in each matrix. - *2) use those values to create sub-matrix structures - *3) combine sub-matrices into pairs, as the tasks to perform. - * - *Have to calculate separately from creating the sub-matrices because of the - * nature of the nesting -- would either end up creating the same sub-matrix - * multiple times, or else would have to put in detection of whether had - * made a particular one already if tried to combine steps 1 and 2. - * - *Step 3 has to be separate because of the nesting, as well -- same reason, - * would either create same sub-matrix multiple times, or else have to - * add detection of whether was already created. - * - *Another way to look at it: there's one level of loop to divide dimensions, - * two levels of nesting to create sub-matrices, and three levels to pair - * up the sub-matrices. - */ - -void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, - VirtProcr *animatingThd ) - { VirtProcr *resultPr; - DividerParams *dividerParams; - ResultsParams *resultsParams; - Matrix *leftMatrix, *rightMatrix; - SlicingStrucCarrier *slicingStrucCarrier; - float32 *resultArray; //points to array inside result matrix - MatrixMultGlobals *globals; - - DEBUG( dbgAppFlow, "start divide\n") - - int32 - divideProbe = VMS__create_single_interval_probe( "divideProbe", - animatingThd ); - VMS__record_sched_choice_into_probe( divideProbe, animatingThd ); - VMS__record_interval_start_in_probe( divideProbe ); - - //=========== Setup -- make local copies of ptd-to-things, malloc, aso - int32 numResRows, numResCols, vectLength; - - dividerParams = (DividerParams *)_dividerParams; - - leftMatrix = dividerParams->leftMatrix; - rightMatrix = dividerParams->rightMatrix; - - vectLength = leftMatrix->numCols; - numResRows = leftMatrix->numRows; - numResCols = rightMatrix->numCols; - resultArray = dividerParams->resultMatrix->array; - - //zero the result array - memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); - - //============== Do either sequential mult or do division ============== - - //Check if input matrices too small -- if yes, just do sequential - //Cutoff is determined by overhead of this divider -- relatively - // machine-independent - if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * - (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) - { - //====== Do sequential multiply on a single core - DEBUG( dbgAppFlow, "doing sequential") - - //transpose the right matrix - float32 * - transRightArray = - VPThread__malloc( (size_t)rightMatrix->numRows * - (size_t)rightMatrix->numCols * - sizeof(float32), animatingThd ); - - //copy values from orig matrix to local - copyTranspose( rightMatrix->numRows, rightMatrix->numCols, - 0, 0, rightMatrix->numRows, - transRightArray, rightMatrix->array ); - - multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, - leftMatrix->array, transRightArray, - resultArray ); - } - else - { - //====== Do parallel multiply across cores - - DEBUG( dbgAppFlow, "divider: do parallel mult\n") - //Calc the ideal size of sub-matrix and slice up the dimensions of - // the two matrices. - //The ideal size is the one takes the number of cycles to calculate - // such that calc time is equal or greater than min work-unit size - slicingStrucCarrier = - calcIdealSizeAndSliceDimensions(leftMatrix,rightMatrix,animatingThd); - - //Make the results processor, now that know how many to wait for - resultsParams = VPThread__malloc( sizeof(ResultsParams), animatingThd ); - resultsParams->numSubMatrixPairs = - slicingStrucCarrier->leftRowSlices->numVals * - slicingStrucCarrier->rightColSlices->numVals * - slicingStrucCarrier->vecSlices->numVals; - resultsParams->dividerPr = animatingThd; - resultsParams->numCols = rightMatrix->numCols; - resultsParams->numRows = leftMatrix->numRows; - resultsParams->resultArray = resultArray; - - //========== Set up global vars, including conds and mutexes ========== - globals = VMS__malloc( sizeof(MatrixMultGlobals) ); - VPThread__set_globals_to( globals ); - - globals->results_mutex = VPThread__make_mutex( animatingThd ); - globals->results_cond = VPThread__make_cond( globals->results_mutex, - animatingThd ); - - globals->vector_mutex = VPThread__make_mutex( animatingThd ); - globals->vector_cond = VPThread__make_cond( globals->vector_mutex, - animatingThd ); - - globals->start_mutex = VPThread__make_mutex( animatingThd ); - globals->start_cond = VPThread__make_cond( globals->start_mutex, - animatingThd ); - //====================================================================== - - DEBUG( dbgAppFlow, "divider: made mutexes and conds\n") - //get results-comm lock before create results-thd, to ensure it can't - // signal that results are available before this thd is waiting on cond - VPThread__mutex_lock( globals->results_mutex, animatingThd ); - - //also get the start lock & use to ensure no vector threads send a - // signal before the results thread is waiting on vector cond - VPThread__mutex_lock( globals->start_mutex, animatingThd ); - - - DEBUG( dbgAppFlow, "divider: make result thread\n") - VPThread__create_thread( &gatherResults, resultsParams, animatingThd ); - - //Now wait for results thd to signal that it has vector lock - VPThread__cond_wait( globals->start_cond, animatingThd ); - VPThread__mutex_unlock( globals->start_mutex, animatingThd );//done w/lock - DEBUG( dbgAppFlow, "divider: make sub-matrices\n") - - //Make the sub-matrices, and pair them up, and make processor to - // calc product of each pair. - makeSubMatricesAndProcrs( leftMatrix, rightMatrix, - slicingStrucCarrier, - resultPr, animatingThd); - - //Wait for results thread to say results are good - VPThread__cond_wait( globals->results_cond, animatingThd ); - } - - - //=============== Work done -- send results back ================= - - - DEBUG( dbgAppFlow, "end divide\n") - - VMS__record_interval_end_in_probe( divideProbe ); - VMS__print_stats_of_all_probes(); - - //nothing left to do so dissipate, VPThread will wait to shutdown, - // making results available to outside, until all the processors have - // dissipated -- so actually no need to wait for results processor - //However, following the pattern, so done with comm, release lock - VPThread__mutex_unlock( globals->results_mutex, animatingThd ); - - VPThread__dissipate_thread( animatingThd ); //all procrs dissipate self at end - //when all of the processors have dissipated, the "create seed and do - // work" call in the entry point function returns - } - - -SlicingStrucCarrier * -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, - VirtProcr *animPr ) - { - float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; - SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; - SlicingStrucCarrier *slicingStrucCarrier = - VPThread__malloc(sizeof(SlicingStrucCarrier), animPr); - - int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; - float64 numPrimitiveOpsInMinWorkUnit; - - - //======= Calc ideal size of min-sized sub-matrix ======== - - //ask VPThread for the number of cycles of the minimum work unit, at given - // percent overhead then add a guess at overhead from this divider - minWorkUnitCycles = VPThread__giveMinWorkUnitCycles( .05 ); - - //ask VPThread for number of cycles of the "primitive" op of matrix mult - primitiveCycles = measureMatrixMultPrimitive( animPr ); - - numPrimitiveOpsInMinWorkUnit = - (float64)minWorkUnitCycles / (float64)primitiveCycles; - - //take cubed root -- that's number of these in a "side" of sub-matrix - // then multiply by 5 because the primitive is 5x5src/Application/VPThread__Matrix_Mult/Divide_Pr.c:403: warning: implicit declaration of function ‘VPThread__give_number_of_cores_to_schedule_onto’ - idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); - - idealNumWorkUnits = VPThread__giveIdealNumWorkUnits(); - - idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); - idealSizeOfSide2 *= 0.6; //finer granularity to help load balance - - if( idealSizeOfSide1 > idealSizeOfSide2 ) - idealSizeOfSide = idealSizeOfSide1; - else - idealSizeOfSide = idealSizeOfSide2; - - //The multiply inner loop blocks the array to fit into L1 cache -// if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK; - - //============ Slice up dimensions, now that know target size =========== - - //Tell the slicer the target size of a side (floating pt), the start - // value to start slicing at, and the end value to stop slicing at - //It returns an array of start value of each chunk, plus number of them - int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol; - startLeftRow = 0; - endLeftRow = leftMatrix->numRows -1; - startVec = 0; - endVec = leftMatrix->numCols -1; - startRightCol = 0; - endRightCol = rightMatrix->numCols -1; - - leftRowSlices = - sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr ); - - vecSlices = - sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr ); - - rightColSlices = - sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr); - - slicingStrucCarrier->leftRowSlices = leftRowSlices; - slicingStrucCarrier->vecSlices = vecSlices; - slicingStrucCarrier->rightColSlices = rightColSlices; - - return slicingStrucCarrier; - } - - -void -makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, - SlicingStrucCarrier *slicingStrucCarrier, - VirtProcr *resultPr, VirtProcr *animPr ) - { - SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; - - leftRowSlices = slicingStrucCarrier->leftRowSlices; - vecSlices = slicingStrucCarrier->vecSlices; - rightColSlices = slicingStrucCarrier->rightColSlices; - VPThread__free( slicingStrucCarrier, animPr ); - - //================ Make sub-matrices, given the slicing ================ - SubMatrix **leftSubMatrices, **rightSubMatrices; - leftSubMatrices = - createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals, - leftMatrix, animPr ); - //double_check_that_always_numRows_in_right_same_as_numCols_in_left(); - rightSubMatrices = - createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals, - rightMatrix, animPr ); - - - //============== pair the sub-matrices and make processors ============== - int32 numRowIdxs, numColIdxs, numVecIdxs; - - numRowIdxs = leftRowSlices->numVals; - numColIdxs = rightColSlices->numVals; - numVecIdxs = vecSlices->numVals; - - freeSlicingStruc( leftRowSlices, animPr ); - freeSlicingStruc( vecSlices, animPr ); - freeSlicingStruc( rightColSlices, animPr ); - - pairUpSubMatricesAndMakeProcessors( leftSubMatrices, - rightSubMatrices, - numRowIdxs, numColIdxs, - numVecIdxs, - resultPr, - animPr ); - } - - - - -void -pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, - SubMatrix **rightSubMatrices, - int32 numRowIdxs, int32 numColIdxs, - int32 numVecIdxs, - VirtProcr *resultPr, - VirtProcr *animatingPr ) - { - int32 resRowIdx, resColIdx, vecIdx; - int32 numLeftColIdxs, numRightColIdxs; - int32 leftRowIdxOffset; - SMPairParams *subMatrixPairParams; - float32 numToPutOntoEachCore, leftOverFraction; - int32 numCores, coreToScheduleOnto, numVecOnCurrCore; - - numLeftColIdxs = numColIdxs; - numRightColIdxs = numVecIdxs; - - numCores = VPThread__give_number_of_cores_to_schedule_onto(); - - numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; - leftOverFraction = 0; - numVecOnCurrCore = 0; - coreToScheduleOnto = 0; - - for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) - { - leftRowIdxOffset = resRowIdx * numLeftColIdxs; - - for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) - { - - for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) - { - //Make the processor for the pair of sub-matrices - subMatrixPairParams = VPThread__malloc( sizeof(SMPairParams), - animatingPr); - subMatrixPairParams->leftSubMatrix = - leftSubMatrices[ leftRowIdxOffset + vecIdx ]; - - subMatrixPairParams->rightSubMatrix = - rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; - - //subMatrixPairParams->resultPr = resultPr; - - //put all pairs from the same vector onto same core - VPThread__create_thread_with_affinity( &calcSubMatrixProduct, - subMatrixPairParams, - animatingPr, - coreToScheduleOnto ); - } - - //Trying to distribute the subMatrix-vectors across the cores, so - // that each core gets the same number of vectors, with a max - // imbalance of 1 vector more on some cores than others - numVecOnCurrCore += 1; - if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 ) - { - //deal with fractional part, to ensure that imbalance is 1 max - // IE, core with most has only 1 more than core with least - leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore; - if( leftOverFraction >= 1 ) - { leftOverFraction -= 1; - numVecOnCurrCore = -1; - } - else - { numVecOnCurrCore = 0; - } - //Move to next core, max core-value to incr to is numCores -1 - if( coreToScheduleOnto >= numCores -1 ) - { coreToScheduleOnto = 0; - } - else - { coreToScheduleOnto += 1; - } - } - - } - } - } - - - -/*Walk through the two slice-strucs, making sub-matrix strucs as go - */ -SubMatrix ** -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, - int32 numUses, Matrix *origMatrix, VirtProcr *animPr ) - { - int32 numRowIdxs, numColIdxs, rowIdx, colIdx; - int32 startRow, endRow, startCol, endCol; - int32 *rowStartVals, *colStartVals; - int32 rowOffset; - SubMatrix **subMatrices, *newSubMatrix; - - numRowIdxs = rowSlices->numVals; - numColIdxs = colSlices->numVals; - - rowStartVals = rowSlices->startVals; - colStartVals = colSlices->startVals; - - subMatrices = VPThread__malloc((size_t)numRowIdxs * - (size_t)numColIdxs * sizeof(SubMatrix*),animPr ); - - for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) - { - rowOffset = rowIdx * numColIdxs; - - startRow = rowStartVals[rowIdx]; - endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is - // at last valid idx + 1 & is - // 1 greater than end value - for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) - { - startCol = colStartVals[colIdx]; - endCol = colStartVals[colIdx + 1] -1; - - newSubMatrix = VPThread__malloc( sizeof(SubMatrix), animPr ); - newSubMatrix->numRows = endRow - startRow +1; - newSubMatrix->numCols = endCol - startCol +1; - newSubMatrix->origMatrix = origMatrix; - newSubMatrix->origStartRow = startRow; - newSubMatrix->origStartCol = startCol; - newSubMatrix->alreadyCopied = FALSE; - newSubMatrix->numUsesLeft = numUses; //can free after this many - //Prevent uninitialized memory - newSubMatrix->copySingleton = NULL; - newSubMatrix->copyTransSingleton = NULL; - - subMatrices[ rowOffset + colIdx ] = newSubMatrix; - } - } - return subMatrices; - } - - -void -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, - SubMatrix **subMatrices, VirtProcr *animPr ) - { - int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; - SubMatrix *subMatrix; - - numRowIdxs = rowSlices->numVals; - numColIdxs = colSlices->numVals; - - for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) - { - rowOffset = rowIdx * numColIdxs; - for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) - { - subMatrix = subMatrices[ rowOffset + colIdx ]; - if( subMatrix->alreadyCopied ) - VPThread__free( subMatrix->array, animPr ); - VPThread__free( subMatrix, animPr ); - } - } - VPThread__free( subMatrices, animPr ); - } - - - -SlicingStruc * -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, - VirtProcr *animPr ) - { float32 residualAcc = 0; - int numSlices, i, *startVals, sizeOfSlice, endCondition; - SlicingStruc *slicingStruc = VPThread__malloc(sizeof(SlicingStruc), animPr); - - //calc size of matrix need to hold start vals -- - numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); - - startVals = VPThread__malloc( ((size_t)numSlices + 1) * sizeof(int32), animPr ); - - //Calc the upper limit of start value -- when get above this, end loop - // by saving highest value of the matrix dimension to access, plus 1 - // as the start point of the imaginary slice following the last one - //Plus 1 because go up to value but not include when process last slice - //The stopping condition is half-a-size less than highest value because - // don't want any pieces smaller than half the ideal size -- just tack - // little ones onto end of last one - endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size - for( i = 0; startVal <= endVal; i++ ) - { - startVals[i] = startVal; - residualAcc += idealSizeOfSide; - sizeOfSlice = (int)residualAcc; - residualAcc -= (float32)sizeOfSlice; - startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. - - if( startVal > endCondition ) - { startVal = endVal + 1; - startVals[ i + 1 ] = startVal; - } - } - - slicingStruc->startVals = startVals; - slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 - // which means is num sub-matrices in dim - // also == idx of the fake start just above - return slicingStruc; - } - -void -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ) - { - VPThread__free( slicingStruc->startVals, animPr ); - VPThread__free( slicingStruc, animPr ); - } - - -int inline -measureMatrixMultPrimitive( VirtProcr *animPr ) - { - int r, c, v, numCycles; - float32 *res, *left, *right; - - //setup inputs - left = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr ); - right = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr ); - res = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr ); - - for( r = 0; r < 5; r++ ) - { - for( c = 0; c < 5; c++ ) - { - left[ r * 5 + c ] = r; - right[ r * 5 + c ] = c; - } - } - - //do primitive - VPThread__start_primitive(); //for now, just takes time stamp - for( r = 0; r < 5; r++ ) - { - for( c = 0; c < 5; c++ ) - { - for( v = 0; v < 5; v++ ) - { - res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; - } - } - } - numCycles = - VPThread__end_primitive_and_give_cycles(); - - VPThread__free( left, animPr ); - VPThread__free( right, animPr ); - VPThread__free( res, animPr ); - - return numCycles; - } diff -r f8d3f16e70c8 -r 69928a38d5af src/Application/VPThread__Matrix_Mult/EntryPoint.c --- a/src/Application/VPThread__Matrix_Mult/EntryPoint.c Fri Sep 16 20:12:34 2011 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,62 +0,0 @@ -/* - * Copyright 2009 OpenSourceStewardshipFoundation.org - * Licensed under GNU General Public License version 2 - * - * Author: seanhalle@yahoo.com - * - */ - -#include - -#include "VPThread__Matrix_Mult.h" - - - -/*Every SSR system has an "entry point" function that creates the first - * processor, which starts the chain of creating more processors.. - * eventually all of the processors will dissipate themselves, and - * return. - * - *This entry-point function follows the same pattern as all entry-point - * functions do: - *1) it creates the params for the seed processor, from the - * parameters passed into the entry-point function - *2) it calls SSR__create_seed_procr_and_do_work - *3) it gets the return value from the params struc, frees the params struc, - * and returns the value from the function - * - */ -Matrix * -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) - { Matrix *resMatrix; - DividerParams *dividerParams; - int32 numResRows, numResCols; - - - dividerParams = malloc( sizeof( DividerParams ) ); - dividerParams->leftMatrix = leftMatrix; - dividerParams->rightMatrix = rightMatrix; - - - numResRows = leftMatrix->numRows; - numResCols = rightMatrix->numCols; - - //VMS has its own separate internal malloc, so to get results out, - // have to pass in empty array for it to fill up - //The alternative is internally telling SSR make external space to use - resMatrix = malloc( sizeof(Matrix) ); - resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); - resMatrix->numCols = rightMatrix->numCols; - resMatrix->numRows = leftMatrix->numRows; - - - dividerParams->resultMatrix = resMatrix; - - //create divider processor, start doing the work, and wait till done - //This function is the "border crossing" between normal code and SSR - VPThread__create_seed_procr_and_do_work(÷WorkIntoSubMatrixPairProcrs, - dividerParams ); - - free( dividerParams ); - return resMatrix; - } diff -r f8d3f16e70c8 -r 69928a38d5af src/Application/VPThread__Matrix_Mult/Result_Pr.c --- a/src/Application/VPThread__Matrix_Mult/Result_Pr.c Fri Sep 16 20:12:34 2011 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,142 +0,0 @@ -/* - * Copyright 2009 OpenSourceStewardshipFoundation.org - * Licensed under GNU General Public License version 2 - * - * Author: seanhalle@yahoo.com - * - */ - -#include "VPThread__Matrix_Mult.h" - -//===================== -void inline -accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, - int32 startRow, - int32 numRows, - int32 startCol, - int32 numCols, - int32 numOrigCols ); - -//=========================================================================== - -/*The Result Processor gets a message from each of the vector processors, - * puts the result from the message in its location in the result- - * matrix, and increments the count of results. - * - *After the count reaches the point that all results have been received, it - * returns the result matrix and dissipates. - */ -void gatherResults( void *_params, VirtProcr *animatingThd ) - { VirtProcr *dividerPr; - ResultsParams *params; - int numRows, numCols, numSubMatrixPairs, count=0; - float32 *resultArray; - SMPairParams *resParams; - //====================== thread stuff ======================= - MatrixMultGlobals *globals =(MatrixMultGlobals *)VPThread__give_globals(); - - - //get vector-comm lock before loop, so that this thd keeps lock after - // one wait until it enters the next wait -- forces see-saw btwn - // waiters and signalers -- wait-signal-wait-signal-... - VPThread__mutex_lock( globals->vector_mutex, animatingThd ); - - //Tell divider that have the vector lock -- so it's sure won't miss any - // signals from the vector-threads it's about to create - //Don't need a signal variable -- this thd can't be created until - // divider thd already has the start lock - VPThread__mutex_lock( globals->start_mutex, animatingThd );//finish wait - VPThread__cond_signal( globals->start_cond, animatingThd ); - VPThread__mutex_unlock( globals->start_mutex, animatingThd );//finish wait - //=========================================================== - - DEBUG( dbgAppFlow, "start resultPr\n") - - params = (ResultsParams *)_params; - dividerPr = params->dividerPr; - numSubMatrixPairs = params->numSubMatrixPairs; - numRows = params->numRows; - numCols = params->numCols; - - resultArray = params->resultArray; - - - while( count < numSubMatrixPairs ) - { - //receive a vector-result from a vector-thread - VPThread__cond_wait( globals->vector_cond, animatingThd ); - - //At this point, animating thread owns the vector lock, so all - // pairs trying to signal they have a result are waiting to get that - // lock -- only one gets it at a time, and when signal, this thd - // gets the lock and does the body of this loop, then when does the - // wait again, that releases the lock for next pair-thread to get it - resParams = globals->currSMPairParams; - - accumulateResult( resultArray, resParams->partialResultArray, - resParams->leftSubMatrix->origStartRow, - resParams->leftSubMatrix->numRows, - resParams->rightSubMatrix->origStartCol, - resParams->rightSubMatrix->numCols, - resParams->rightSubMatrix->origMatrix->numCols ); - - VPThread__free( resParams->partialResultArray, animatingThd ); - - //there is only one copy of results procr, so can update numUsesLeft - // without concurrency worries. When zero, free the sub-matrix - resParams->leftSubMatrix->numUsesLeft -= 1; - if( resParams->leftSubMatrix->numUsesLeft == 0 ) - { - VPThread__free( resParams->leftSubMatrix->array, animatingThd ); - VPThread__free( resParams->leftSubMatrix, animatingThd ); - } - - resParams->rightSubMatrix->numUsesLeft -= 1; - if( resParams->rightSubMatrix->numUsesLeft == 0 ) - { - VPThread__free( resParams->rightSubMatrix->array, animatingThd ); - VPThread__free( resParams->rightSubMatrix, animatingThd ); - } - - //count of how many sub-matrix pairs accumulated so know when done - count++; - } - - //Done -- could just dissipate -- SSR will wait for all processors to - // dissipate before shutting down, and thereby making results avaial to - // outside, so no need to stop the divider from dissipating, so no need - // to send a hand-shake message to it -- but makes debug easier - //However, following pattern, so all comms done, release lock - VPThread__mutex_unlock( globals->vector_mutex, animatingThd ); - - //Send result to divider (seed) thread - // note, divider thd had to hold the results-comm lock before creating - // this thread, to be sure no race - VPThread__mutex_lock( globals->results_mutex, animatingThd ); - //globals->results = resultMatrixArray; - VPThread__cond_signal( globals->results_cond, animatingThd ); - VPThread__mutex_unlock( globals->results_mutex, animatingThd ); //releases - //divider thread from its wait, at point this executes - - VPThread__dissipate_thread( animatingThd ); //frees any data owned by procr - } - -void inline -accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray, - int32 startRow, - int32 numRows, - int32 startCol, - int32 numCols, - int32 numOrigCols ) - { int32 row, col; - - for( row = 0; row < numRows; row++ ) - { - for( col = 0; col < numCols; col++ ) - { - resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] += - subMatrixPairResultArray[ row * numCols + col ]; - } - } - - } diff -r f8d3f16e70c8 -r 69928a38d5af src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h --- a/src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h Fri Sep 16 20:12:34 2011 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,120 +0,0 @@ -/* - * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org - * Licensed under GNU General Public License version 2 - */ - -#ifndef _SSR_MATRIX_MULT_H_ -#define _SSR_MATRIX_MULT_H_ - -#include - -#include "../../VPThread_lib/VPThread.h" -#include "../Matrix_Mult.h" - - -//=============================== Defines ============================== -#define ROWS_IN_BLOCK 32 -#define COLS_IN_BLOCK 32 -#define VEC_IN_BLOCK 32 - -#define copyMatrixSingleton 1 -#define copyTransposeSingleton 2 - -//============================== Structures ============================== -typedef struct - { - Matrix *leftMatrix; - Matrix *rightMatrix; - Matrix *resultMatrix; - } -DividerParams; - -typedef struct - { - VirtProcr *dividerPr; - int numRows; - int numCols; - int numSubMatrixPairs; - float32 *resultArray; - } -ResultsParams; - -typedef -struct - { int32 numRows; - int32 numCols; - Matrix *origMatrix; - int32 origStartRow; - int32 origStartCol; - int32 alreadyCopied; - VPThdSingleton *copySingleton; - VPThdSingleton *copyTransSingleton; - int32 numUsesLeft; //have update via message to avoid multiple writers - float32 *array; //2D, but dynamically sized, so use addr arith - } -SubMatrix; - -typedef struct - { VirtProcr *resultPr; - SubMatrix *leftSubMatrix; - SubMatrix *rightSubMatrix; - float32 *partialResultArray; - } -SMPairParams; - -typedef -struct - { int32 numVals; - int32 *startVals; - } -SlicingStruc; - -typedef -struct - { - SlicingStruc *leftRowSlices; - SlicingStruc *vecSlices; - SlicingStruc *rightColSlices; - } -SlicingStrucCarrier; - -enum MMMsgType - { - RESULTS_MSG = 1 - }; - - -typedef struct - { - //for communicating sub-matrix-pair results to results Thd - int32 vector_mutex; - int32 vector_cond; - SMPairParams *currSMPairParams; - - //for communicating results array back to seed (divider) Thd - int32 results_mutex; - int32 results_cond; - float32 *results; - - //for ensuring results thd has vector lock before making vector thds - int32 start_mutex; - int32 start_cond; - - Matrix *rightMatrix; - Matrix *resultMatrix; - } -MatrixMultGlobals; - - -//============================= Processor Functions ========================= -void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr ); -void calcSubMatrixProduct( void *data, VirtProcr *animatingPr ); -void gatherResults( void *data, VirtProcr *animatingPr ); - - -//================================ Entry Point ============================== -Matrix * -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); - - -#endif /*_SSR_MATRIX_MULT_H_*/ diff -r f8d3f16e70c8 -r 69928a38d5af src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c --- a/src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c Fri Sep 16 20:12:34 2011 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,310 +0,0 @@ -/* - * Copyright 2009 OpenSourceStewardshipFoundation.org - * Licensed under GNU General Public License version 2 - * - * Author: SeanHalle@yahoo.com - * - */ - -#include - -#include "VPThread__Matrix_Mult.h" - -//Declarations -//=================================================== - -void inline -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); - -void inline -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); - -void inline -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, - float32 *resArray, - int startRow, int endRow, - int startCol, int endCol, - int startVec, int endVec, - int resStride, int inpStride ); - -void inline -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, - float32 *leftArray, float32 *rightArray, - float32 *resArray ); - -//=================================================== - -/*A processor is created with an environment that holds two matrices, - * the row and col that it owns, and the name of a result gathering - * processor. - *It calculates the product of two sub-portions of the input matrices - * by using Intel's mkl library for single-core. - * - *This demonstrates using optimized single-threaded code inside scheduled - * work-units. - * - *When done, it sends the result to the result processor - */ -void -calcSubMatrixProduct( void *data, VirtProcr *animatingPr ) - { - SMPairParams *params; - VirtProcr *resultPr; - float32 *leftArray, *rightArray, *resArray; - SubMatrix *leftSubMatrix, *rightSubMatrix; - MatrixMultGlobals *globals =(MatrixMultGlobals *)VPThread__give_globals(); - - DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID) - #ifdef TURN_ON_DEBUG_PROBES - int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx", - animatingPr); - VMS__record_sched_choice_into_probe( subMatrixProbe, animatingPr ); - VMS__record_interval_start_in_probe( subMatrixProbe ); - #endif - - params = (SMPairParams *)data; - resultPr = params->resultPr; - leftSubMatrix = params->leftSubMatrix; - rightSubMatrix = params->rightSubMatrix; - - //make sure the input sub-matrices have been copied out of orig - //do it here, inside sub-matrix pair to hopefully gain reuse in cache - copyFromOrig( leftSubMatrix, animatingPr ); - copyTransposeFromOrig( rightSubMatrix, animatingPr ); - - leftArray = leftSubMatrix->array; - rightArray = rightSubMatrix->array; - - size_t resSize = (size_t)leftSubMatrix->numRows * - (size_t)rightSubMatrix->numCols * sizeof(float32); - resArray = VPThread__malloc( resSize, animatingPr ); - memset( resArray, 0, resSize ); - - - int32 numResRows, numResCols, vectLength; - - vectLength = leftSubMatrix->numCols; - numResRows = leftSubMatrix->numRows; - numResCols = rightSubMatrix->numCols; - - multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, - leftArray, rightArray, - resArray ); - - //send result to result processor - params->partialResultArray = resArray; - - #ifdef TURN_ON_DEBUG_PROBES - VMS__record_interval_end_in_probe( subMatrixProbe ); - #endif - - //Send result to results thread - //This pattern works 'cause only get lock when results thd inside wait - VPThread__mutex_lock( globals->vector_mutex, animatingPr ); - globals->currSMPairParams = params; - VPThread__cond_signal( globals->vector_cond, animatingPr ); - VPThread__mutex_unlock( globals->vector_mutex, animatingPr );//release - //wait-er -- cond_signal implemented such that wait-er gets lock, no other - - DEBUG1(dbgAppFlow, "end sub-matrix mult: %d\n", animatingPr->procrID) - VPThread__dissipate_thread( animatingPr ); - } - - - -/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into - * the 32KB L1 cache. - *Would be nice to embed this within another level that divided into - * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache - * - *Eventually want these divisions to be automatic, using DKU pattern - * embedded into VMS and exposed in the language, and with VMS controlling the - * divisions according to the cache sizes, which it knows about. - *Also, want VMS to work with language to split among main-mems, so a socket - * only cranks on data in its local segment of main mem - * - *So, outer two loops determine start and end points within the result matrix. - * Inside that, a loop dets the start and end points along the shared dimensions - * of the two input matrices. - */ -void inline -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, - int32 numResCols, - float32 *leftArray, float32 *rightArray, - float32 *resArray ) - { - int resStride, inpStride; - int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; - - resStride = numResCols; - inpStride = vecLength; - - for( resStartRow = 0; resStartRow < numResRows; ) - { - resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 - if( resEndRow > numResRows ) resEndRow = numResRows -1; - - for( resStartCol = 0; resStartCol < numResCols; ) - { - resEndCol = resStartCol + COLS_IN_BLOCK -1; - if( resEndCol > numResCols ) resEndCol = numResCols -1; - - for( startVec = 0; startVec < vecLength; ) - { - endVec = startVec + VEC_IN_BLOCK -1; - if( endVec > vecLength ) endVec = vecLength -1; - - //By having the "vector" of sub-blocks in a sub-block slice - // be marched down in inner loop, are re-using the result - // matrix, which stays in L1 cache and re-using the left sub-mat - // which repeats for each right sub-mat -- can only re-use two of - // the three, so result is the most important -- avoids writing - // dirty blocks until those result-locations fully done - //Row and Col is position in result matrix -- so row and vec - // for left array, then vec and col for right array - multiplySubBlocksTransposed( leftArray, rightArray, - resArray, - resStartRow, resEndRow, - resStartCol, resEndCol, - startVec, endVec, - resStride, inpStride ); - startVec = endVec +1; - } - resStartCol = resEndCol +1; - } - resStartRow = resEndRow +1; - } - } - - - -void inline -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, - float32 *resArray, - int resStartRow, int resEndRow, - int resStartCol, int resEndCol, - int startVec, int endVec, - int resStride, int inpStride ) - { - int resRow, resCol, vec; - int leftOffset, rightOffset; - float32 result; - - //The result row is used only for the left matrix, res col for the right - for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) - { - for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) - { - leftOffset = resRow * inpStride;//left & right inp strides always same - rightOffset = resCol * inpStride;// because right is transposed - result = 0; - for( vec = startVec; vec <= endVec; vec++ ) - { - result += - leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; - } - - resArray[ resRow * resStride + resCol ] += result; - } - } - } - - - - -/*Reuse this in divider when do the sequential multiply case - */ -void inline -copyTranspose( int32 numRows, int32 numCols, - int32 origStartRow, int32 origStartCol, int32 origStride, - float32 *subArray, float32 *origArray ) - { int32 stride = numRows; - - int row, col, origOffset; - for( row = 0; row < numRows; row++ ) - { - origOffset = (row + origStartRow) * origStride + origStartCol; - for( col = 0; col < numCols; col++ ) - { - //transpose means swap row & col -- traverse orig matrix normally - // but put into reversed place in local array -- means the - // stride is the numRows now, so col * numRows + row - subArray[ col * stride + row ] = origArray[ origOffset + col ]; - } - } - } - -void inline -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) - { int numCols, numRows, origStartRow, origStartCol, origStride; - Matrix *origMatrix; - float32 *origArray, *subArray; - - VPThread__start_data_singleton( &(subMatrix->copyTransSingleton), animPr ); - - origMatrix = subMatrix->origMatrix; - origArray = origMatrix->array; - numCols = subMatrix->numCols; - numRows = subMatrix->numRows; - origStartRow = subMatrix->origStartRow; - origStartCol = subMatrix->origStartCol; - origStride = origMatrix->numCols; - - subArray = VPThread__malloc( (size_t)numRows * - (size_t)numCols *sizeof(float32),animPr); - subMatrix->array = subArray; - - //copy values from orig matrix to local - copyTranspose( numRows, numCols, - origStartRow, origStartCol, origStride, - subArray, origArray ); - - VPThread__end_data_singleton( &(subMatrix->copyTransSingleton), animPr ); - - } - - -void inline -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) - { int numCols, numRows, origStartRow, origStartCol, stride, origStride; - Matrix *origMatrix; - float32 *origArray, *subArray; - - - //This lets only a single VP execute the code between start and - // end -- using start and end so that work runs outside the master. - //If a second VP ever executes the start, it will be returned - // from the end-point. If its execution starts after another but before - // that other has finished, this one will remain suspended until the - // other finishes, then be resumed from the end-point. - VPThread__start_data_singleton( &(subMatrix->copySingleton), animPr ); - - - origMatrix = subMatrix->origMatrix; - origArray = origMatrix->array; - numCols = subMatrix->numCols; - numRows = subMatrix->numRows; - origStartRow = subMatrix->origStartRow; - origStartCol = subMatrix->origStartCol; - origStride = origMatrix->numCols; - - subArray = VPThread__malloc( (size_t)numRows * - (size_t)numCols *sizeof(float32),animPr); - subMatrix->array = subArray; - - //copy values from orig matrix to local - stride = numCols; - - int row, col, offset, origOffset; - for( row = 0; row < numRows; row++ ) - { - offset = row * stride; - origOffset = (row + origStartRow) * origStride + origStartCol; - for( col = 0; col < numCols; col++ ) - { - subArray[ offset + col ] = origArray[ origOffset + col ]; - } - } - - VPThread__end_data_singleton( &(subMatrix->copySingleton), animPr ); - } diff -r f8d3f16e70c8 -r 69928a38d5af src/Application/main.c --- a/src/Application/main.c Fri Sep 16 20:12:34 2011 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ -/* - * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org - * Licensed under GNU General Public License version 2 - * - * author seanhalle@yahoo.com - */ - -#include -#include - -#include "Matrix_Mult.h" -#include "VPThread__Matrix_Mult/VPThread__Matrix_Mult.h" - -char __ProgrammName[] = "Blocked Matrix Multiply"; -char __DataSet[255]; - -void printParams(ParamBag *paramBag) -{ - snprintf((char*)&__DataSet, 255, - "#\tLeft Matrix %d x %d,\n#\tRight Matrix %d x %d", - getParamFromBag("leftMatrixRows", paramBag)->intValue, - getParamFromBag("leftMatrixCols", paramBag)->intValue, - getParamFromBag("rightMatrixRows", paramBag)->intValue, - getParamFromBag("rightMatrixCols", paramBag)->intValue); -} - -/** - *Matrix multiply program written using VMS_HW piggy-back language - * - */ -int main( int argc, char **argv ) - { Matrix *leftMatrix, *rightMatrix, *resultMatrix; - ParamBag *paramBag; - - printf( "arguments: %s | %s\n", argv[0], argv[1] ); - - paramBag = makeParamBag(); - readParamFileIntoBag( argv[1], paramBag ); - initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); - printParams(paramBag); - - resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix ); - - printf("\nresult matrix: \n"); - printMatrix( resultMatrix ); - -// SSR__print_stats(); - - exit(0); //cleans up - }