# HG changeset patch # User SeanHalle # Date 1289456817 28800 # Node ID 8d14fe28a7828920362a23fbfde81a877651e403 Initial add -- copied from SSR version diff -r 000000000000 -r 8d14fe28a782 .hgignore --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.hgignore Wed Nov 10 22:26:57 2010 -0800 @@ -0,0 +1,5 @@ +nbproject +dist +build +Makefile +.dep.inc diff -r 000000000000 -r 8d14fe28a782 src/Application/Matrix_Mult.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Application/Matrix_Mult.c Wed Nov 10 22:26:57 2010 -0800 @@ -0,0 +1,167 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + * Created on November 15, 2009, 2:35 AM + */ + +#include +#include + +#include "Matrix_Mult.h" +#include "ParamHelper/Param.h" + + + + void +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, + ParamBag *paramBag ) + { char *leftMatrixFileName, *rightMatrixFileName; + int leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols; + + ParamStruc *param; + param = getParamFromBag( "leftMatrixRows", paramBag ); + leftMatrixRows = param->intValue; + param = getParamFromBag( "leftMatrixCols", paramBag ); + leftMatrixCols = param->intValue; + *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols ); + + param = getParamFromBag( "leftMatrixFileName", paramBag ); + leftMatrixFileName = param->strValue; //no need to copy + read_Matrix_From_File( *leftMatrix, leftMatrixFileName ); + + param = getParamFromBag( "rightMatrixRows", paramBag ); + rightMatrixRows = param->intValue; + param = getParamFromBag( "rightMatrixCols", paramBag ); + rightMatrixCols = param->intValue; + *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols ); + + param = getParamFromBag( "rightMatrixFileName", paramBag ); + rightMatrixFileName = param->strValue; + read_Matrix_From_File( *rightMatrix, rightMatrixFileName ); + } + + +void parseLineIntoRow( 
char *line, float32* row ); + +/*Fills matrixStruc->array from the text file matrixFileName, one matrix row per data line; blank lines and lines starting with '/' are skipped without using up a row. Exits if the file can't be opened, stops early if a line can't be read. */ + void +read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ) + { int row, maxRead, numRows, numCols; + float32 *matrixStart; + size_t lineSz = 0; + FILE *file; + char *line = NULL; + + lineSz = 50000; //max length of line in a matrix data file + line = (char *) malloc( lineSz ); + if( line == NULL ) printf( "no mem for matrix line" ); + + numRows = matrixStruc->numRows; + numCols = matrixStruc->numCols; + matrixStart = matrixStruc->array; + + file = fopen( matrixFileName, "r" ); + if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} + fseek( file, 0, SEEK_SET ); + for( row = 0; row < numRows; row++ ) + { + if( feof( file ) ) printf( "file ran out too soon" ); + maxRead = getline( &line, &lineSz, file ); + if( maxRead == -1 ) { printf( "prob reading mat line"); break; } //nothing new in line, so don't parse stale contents + + if( *line == '\n') { row--; continue; } //blank line -- holds no row, so don't count it + if( *line == '/' ) { row--; continue; } //comment line -- holds no row, so don't count it + + parseLineIntoRow( line, matrixStart + row * numCols ); + } + fclose( file ); free( line ); //release the file handle along with the line buffer + } + +/*This function relies on each line having the proper number of cols. It + * doesn't check, nor enforce, so if the file is improperly formatted it + * can write over unrelated memory + */ + void +parseLineIntoRow( char *line, float32* row ) + { + char *valueStr, *searchPos; + + //read the float values + searchPos = valueStr = line; //start + + for( ; *searchPos != 0; searchPos++) //bit dangerous, should use buff len + { + if( *searchPos == '\n' ) //last col.. relying on well-formatted file + { *searchPos = 0; + *row = atof( valueStr ); + break; //end FOR loop + } + if( *searchPos == ',' ) + { *searchPos = 0; //mark end of string + *row = (float32) atof( valueStr ); + row += 1; //address arith + //skip any spaces before digits.. 
use searchPos + 1 to skip the 0 + for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++); + valueStr = searchPos + 1; + } + } + } + + //========================================================================== + +/*In the "_Flat" version of constructor, do only malloc of the top data struc + * and set values in that top-level. Don't malloc any sub-structures. + */ + Matrix * +makeMatrix_Flat( int32 numRows, int32 numCols ) + { Matrix * retMatrix; + retMatrix = malloc( sizeof( Matrix ) ); + retMatrix->numRows = numRows; + retMatrix->numCols = numCols; + + return retMatrix; + } + + Matrix * +makeMatrix_WithResMat( int32 numRows, int32 numCols ) + { Matrix * retMatrix; + retMatrix = malloc( sizeof( Matrix ) ); + retMatrix->numRows = numRows; + retMatrix->numCols = numCols; + retMatrix->array = malloc( numRows * numCols * sizeof(float32) ); + + return retMatrix; + } + + void +freeMatrix_Flat( Matrix * matrix ) + { //( matrix ); + } + void +freeMatrix( Matrix * matrix ) + { free( matrix->array ); + free( matrix ); + } + +void +printMatrix( Matrix *matrix ) + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; + float32 *matrixArray; + + numRows = rowsToPrint = matrix->numRows; + numCols = colsToPrint = matrix->numCols; + matrixArray = matrix->array; + + rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed + colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed + for( r = 0; r < numRows; r += rowIncr ) + { for( c = 0; c < numCols; c += colIncr ) + { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); + } + printf("\n"); + } + } + diff -r 000000000000 -r 8d14fe28a782 src/Application/Matrix_Mult.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Application/Matrix_Mult.h Wed Nov 10 22:26:57 2010 -0800 @@ -0,0 +1,77 @@ +/* + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + */ + +#ifndef MATRIX_MULT_H_ +#define 
MATRIX_MULT_H_ + +#include +#include +#include + +#include "../SSR_lib/VMS/VMS_primitive_data_types.h" +#include "ParamHelper/Param.h" + +//============================== Structures ============================== + +typedef +struct + { int32 numRows; + int32 numCols; + float32 *array; //2D, but dynamically sized, so use addr arith + } +Matrix; + +/* This is the "appSpecificPiece" that is carried inside a DKUPiece. + * In the DKUPiece data struc it is declared to be of type "void *". This + * allows the application to define any data structure it wants and put it + * into a DKUPiece. + * When the app specific info is used, it is in app code, so it is cast to + * the correct type to tell the compiler how to access fields. + * This keeps all app-specific things out of the DKU directory, as per the + * DKU standard. */ +typedef +struct + { + // pointers to shared data.. the result matrix must be created when the + // left and right matrices are put into the root ancestor DKUPiece. + Matrix * leftMatrix; + Matrix * rightMatrix; + Matrix * resultMatrix; + + // define the starting and ending boundaries for this piece of the + // result matrix. These are derivable from the left and right + // matrices, but included them for readability of code. 
+ int prodStartRow, prodEndRow; + int prodStartCol, prodEndCol; + // Start and end of the portion of the left matrix that contributes to + // this piece of the product + int leftStartRow, leftEndRow; + int leftStartCol, leftEndCol; + // Start and end of the portion of the right matrix that contributes to + // this piece of the product + int rightStartRow, rightEndRow; + int rightStartCol, rightEndCol; + } +MatrixProdPiece; + +//============================== Functions ================================ +void readFile(); + +Matrix *makeMatrix( int32 numRows, int32 numCols ); +Matrix *makeMatrix_Flat( int32 numRows, int32 numCols ); +Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols ); +void freeMatrix_Flat( Matrix * matrix ); +void freeMatrix( Matrix * matrix ); +void printMatrix( Matrix *matrix ); + +void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ); + +void +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, + ParamBag *paramBag ); + +//=========================================================================== + +#endif /*MATRIX_MULT_H_*/ diff -r 000000000000 -r 8d14fe28a782 src/Application/VPThread__Matrix_Mult/Divide_Pr.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Application/VPThread__Matrix_Mult/Divide_Pr.c Wed Nov 10 22:26:57 2010 -0800 @@ -0,0 +1,597 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + + +#include "SSR_Matrix_Mult.h" +#include +#include + + //The time to compute this many result values should equal the time to + // perform this division on a matrix of size gives that many result calcs + //IE, size this so that sequential time to calc equals divide time + // find the value by experimenting -- but divide time and calc time scale + // same way, so this value should remain valid across hardware +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000 + + 
+//=========================================================================== +int inline +measureMatrixMultPrimitive( VirtProcr *animPr ); + +SlicingStrucCarrier * +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, + VirtProcr *animPr ); + +SlicingStruc * +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, + VirtProcr *animPr ); + +void +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ); + +SubMatrix ** +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + int32 numUses, Matrix *origMatrix, VirtProcr *animPr ); + +void +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + SubMatrix **subMatrices, VirtProcr *animPr ); + +void +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, + SubMatrix **rightSubMatrices, + int32 numRowIdxs, int32 numColIdxs, + int32 numVecIdxs, + VirtProcr *resultPr, + VirtProcr *animatingPr ); + +void +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, + SlicingStrucCarrier *slicingStrucCarrier, + VirtProcr *resultPr, VirtProcr *animatingPr ); + + + +/*Divider creates one processor for every sub-matrix + * It hands them: + * the name of the result processor that they should send their results to, + * the left and right matrices, and the rows and cols they should multiply + * It first creates the result processor, then all the sub-matrixPair + * processors, + * then does a receive of a message from the result processor that gives + * the divider ownership of the result matrix. + * Finally, the divider returns the result matrix out of the SSR system. + * + * Divider chooses the size of sub-matrices via an algorithm that tries to + * keep the minimum work above a threshold. 
The threshold is machine- + * dependent, so ask SSR for min work-unit time to get a + * given overhead + * + * Divide min work-unit cycles by measured-cycles for one matrix-cell + * product -- gives the number of products need to have in min size + * matrix. + * + * So then, take cube root of this to get the size of a side of min sub- + * matrix. That is the size of the ideal square sub-matrix -- so tile + * up the two input matrices into ones as close as possible to that size, + * and create the pairs of sub-matrices. + * + *======================== STRATEGIC OVERVIEW ======================= + * + *This division is a bit tricky, because have to create things in advance + * that it's not at first obvious need to be created.. + * + *First slice up each dimension -- three of them.. this is because will have + * to create the sub-matrix's data-structures before pairing the sub-matrices + * with each other -- so, have three dimensions to slice up before can + * create the sub-matrix data-strucs -- also, have to be certain that the + * cols of the left input have the exact same slicing as the rows of the + * right matrix, so just to be sure, do the slicing calc once, then use it + * for both. + * + *So, goes like this: + *1) calculate the start & end values of each dimension in each matrix. + *2) use those values to create sub-matrix structures + *3) combine sub-matrices into pairs, as the tasks to perform. + * + *Have to calculate separately from creating the sub-matrices because of the + * nature of the nesting -- would either end up creating the same sub-matrix + * multiple times, or else would have to put in detection of whether had + * made a particular one already if tried to combine steps 1 and 2. + * + *Step 3 has to be separate because of the nesting, as well -- same reason, + * would either create same sub-matrix multiple times, or else have to + * add detection of whether was already created. 
+ * + *Another way to look at it: there's one level of loop to divide dimensions, + * two levels of nesting to create sub-matrices, and three levels to pair + * up the sub-matrices. + */ + +void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, + VirtProcr *animPr ) + { VirtProcr *resultPr; + DividerParams *dividerParams; + ResultsParams *resultsParams; + Matrix *leftMatrix, *rightMatrix, *resultMatrix; + void *msg; + SlicingStrucCarrier *slicingStrucCarrier; + float32 *resultArray; //points to array inside result matrix + + DEBUG( dbgAppFlow, "start divide\n") + + int32 + divideProbe = VMS__create_single_interval_probe( "divideProbe", + animPr ); + VMS__record_sched_choice_into_probe( divideProbe, animPr ); + VMS__record_interval_start_in_probe( divideProbe ); + + //=========== Setup -- make local copies of ptd-to-things, malloc, aso + int32 numResRows, numResCols, vectLength; + + dividerParams = (DividerParams *)_dividerParams; + + leftMatrix = dividerParams->leftMatrix; + rightMatrix = dividerParams->rightMatrix; + + vectLength = leftMatrix->numCols; + numResRows = leftMatrix->numRows; + numResCols = rightMatrix->numCols; + resultArray = dividerParams->resultMatrix->array; + + //zero the result array + memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); + + //============== Do either sequential mult or do division ============== + + //Check if input matrices too small -- if yes, just do sequential + //Cutoff is determined by overhead of this divider -- relatively + // machine-independent + if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * + (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) + { + //====== Do sequential multiply on a single core + DEBUG( dbgAppFlow, "doing sequential") + + //transpose the right matrix + float32 * + transRightArray = SSR__malloc_to( rightMatrix->numRows * + rightMatrix->numCols * sizeof(float32), + animPr ); + + //copy values from orig matrix to local + copyTranspose( 
rightMatrix->numRows, rightMatrix->numCols, + 0, 0, rightMatrix->numRows, + transRightArray, rightMatrix->array ); + + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, + leftMatrix->array, transRightArray, + resultArray ); + } + else + { + //====== Do parallel multiply across cores + + //Calc the ideal size of sub-matrix and slice up the dimensions of + // the two matrices. + //The ideal size is the one takes the number of cycles to calculate + // such that calc time is equal or greater than min work-unit size + slicingStrucCarrier = + calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr ); + + //Make the results processor, now that know how many to wait for + resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr ); + resultsParams->numSubMatrixPairs = + slicingStrucCarrier->leftRowSlices->numVals * + slicingStrucCarrier->rightColSlices->numVals * + slicingStrucCarrier->vecSlices->numVals; + resultsParams->dividerPr = animPr; + resultsParams->numCols = rightMatrix->numCols; + resultsParams->numRows = leftMatrix->numRows; + resultsParams->resultArray = resultArray; + + + resultPr = + SSR__create_procr_with( &gatherResults, resultsParams, animPr); + + //Make the sub-matrices, and pair them up, and make processor to + // calc product of each pair. 
+ makeSubMatricesAndProcrs( leftMatrix, rightMatrix, + slicingStrucCarrier, + resultPr, animPr); + + //result array is allocated externally, so no message from resultPr + // however, do have to wait before printing out stats, so wait + // for an empty handshake message + msg = SSR__receive_from_to( resultPr, animPr ); + } + + + //=============== Work done -- send results back ================= + + + DEBUG( dbgAppFlow, "end divide\n") + + VMS__record_interval_end_in_probe( divideProbe ); + VMS__print_stats_of_all_probes(); + + //nothing left to do so dissipate, SSR will wait to shutdown and hence + // make results available to outside until all the processors have + // dissipated -- so no need to wait for results processor + + SSR__dissipate_procr( animPr ); //all procrs dissipate self at end + //when all of the processors have dissipated, the "create seed and do + // work" call in the entry point function returns + } + + +SlicingStrucCarrier * +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, + VirtProcr *animPr ) + { + float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; + SlicingStrucCarrier *slicingStrucCarrier = + SSR__malloc_to(sizeof(SlicingStrucCarrier), animPr); + + int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; + float64 numPrimitiveOpsInMinWorkUnit; + + + //======= Calc ideal size of min-sized sub-matrix ======== + + //ask SSR for the number of cycles of the minimum work unit, at given + // percent overhead then add a guess at overhead from this divider + minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 ); + + //ask SSR for number of cycles of the "primitive" op of matrix mult + primitiveCycles = measureMatrixMultPrimitive( animPr ); + + numPrimitiveOpsInMinWorkUnit = + (float64)minWorkUnitCycles / (float64)primitiveCycles; + + //take cubed root -- that's number of these in a "side" of sub-matrix + // then multiply by 5 because the primitive is 5x5 
+ idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); + + idealNumWorkUnits = SSR__giveIdealNumWorkUnits(); + + idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); + idealSizeOfSide2 *= 0.6; //finer granularity to help load balance + + if( idealSizeOfSide1 > idealSizeOfSide2 ) + idealSizeOfSide = idealSizeOfSide1; + else + idealSizeOfSide = idealSizeOfSide2; + + //The multiply inner loop blocks the array to fit into L1 cache +// if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK; + + //============ Slice up dimensions, now that know target size =========== + + //Tell the slicer the target size of a side (floating pt), the start + // value to start slicing at, and the end value to stop slicing at + //It returns an array of start value of each chunk, plus number of them + int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol; + startLeftRow = 0; + endLeftRow = leftMatrix->numRows -1; + startVec = 0; + endVec = leftMatrix->numCols -1; + startRightCol = 0; + endRightCol = rightMatrix->numCols -1; + + leftRowSlices = + sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr ); + + vecSlices = + sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr ); + + rightColSlices = + sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr); + + slicingStrucCarrier->leftRowSlices = leftRowSlices; + slicingStrucCarrier->vecSlices = vecSlices; + slicingStrucCarrier->rightColSlices = rightColSlices; + + return slicingStrucCarrier; + } + + +void +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, + SlicingStrucCarrier *slicingStrucCarrier, + VirtProcr *resultPr, VirtProcr *animPr ) + { + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; + + leftRowSlices = slicingStrucCarrier->leftRowSlices; + vecSlices = slicingStrucCarrier->vecSlices; + rightColSlices = slicingStrucCarrier->rightColSlices; + SSR__free( slicingStrucCarrier, animPr ); + + //================ Make 
sub-matrices, given the slicing ================ + SubMatrix **leftSubMatrices, **rightSubMatrices; + leftSubMatrices = + createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals, + leftMatrix, animPr ); + //double_check_that_always_numRows_in_right_same_as_numCols_in_left(); + rightSubMatrices = + createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals, + rightMatrix, animPr ); + + //============== pair the sub-matrices and make processors ============== + int32 numRowIdxs, numColIdxs, numVecIdxs; + + //read the slice counts out first -- they live inside the slicing strucs freed just below + numRowIdxs = leftRowSlices->numVals; + numColIdxs = rightColSlices->numVals; + numVecIdxs = vecSlices->numVals; + freeSlicingStruc( leftRowSlices, animPr ); + freeSlicingStruc( vecSlices, animPr ); + freeSlicingStruc( rightColSlices, animPr ); + pairUpSubMatricesAndMakeProcessors( leftSubMatrices, + rightSubMatrices, + numRowIdxs, numColIdxs, + numVecIdxs, + resultPr, + animPr ); + } + + + + +void +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, + SubMatrix **rightSubMatrices, + int32 numRowIdxs, int32 numColIdxs, + int32 numVecIdxs, + VirtProcr *resultPr, + VirtProcr *animatingPr ) + { + int32 resRowIdx, resColIdx, vecIdx; + int32 numLeftColIdxs, numRightColIdxs; + int32 leftRowIdxOffset; + SMPairParams *subMatrixPairParams; + float32 numToPutOntoEachCore, leftOverFraction; + int32 numCores, coreToScheduleOnto, numVecOnCurrCore; + //row strides must match the layout the sub-matrix arrays were created with + numLeftColIdxs = numVecIdxs; //left array was made from (row slices, vec slices) + numRightColIdxs = numColIdxs; //right array was made from (vec slices, col slices) + + numCores = SSR__give_number_of_cores_to_schedule_onto(); + + numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; + leftOverFraction = 0; + numVecOnCurrCore = 0; + coreToScheduleOnto = 0; + + for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) + { + leftRowIdxOffset = resRowIdx * numLeftColIdxs; + + for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) + { + + for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) + { + //Make the processor for the pair of sub-matrices + subMatrixPairParams = SSR__malloc_to( 
sizeof(SMPairParams), + animatingPr); + subMatrixPairParams->leftSubMatrix = + leftSubMatrices[ leftRowIdxOffset + vecIdx ]; + + subMatrixPairParams->rightSubMatrix = + rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; + + subMatrixPairParams->resultPr = resultPr; + + //put all pairs from the same vector onto same core + SSR__create_procr_with_affinity( &calcSubMatrixProduct, + subMatrixPairParams, + animatingPr, + coreToScheduleOnto ); + } + + //Trying to distribute the subMatrix-vectors across the cores, so + // that each core gets the same number of vectors, with a max + // imbalance of 1 vector more on some cores than others + numVecOnCurrCore += 1; + if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 ) + { + //deal with fractional part, to ensure that imbalance is 1 max + // IE, core with most has only 1 more than core with least + leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore; + if( leftOverFraction >= 1 ) + { leftOverFraction -= 1; + numVecOnCurrCore = -1; + } + else + { numVecOnCurrCore = 0; + } + //Move to next core, max core-value to incr to is numCores -1 + if( coreToScheduleOnto >= numCores -1 ) + { coreToScheduleOnto = 0; + } + else + { coreToScheduleOnto += 1; + } + } + + } + } + + } + + + +/*Walk through the two slice-strucs, making sub-matrix strucs as go + */ +SubMatrix ** +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + int32 numUses, Matrix *origMatrix, VirtProcr *animPr ) + { + int32 numRowIdxs, numColIdxs, rowIdx, colIdx; + int32 startRow, endRow, startCol, endCol; + int32 *rowStartVals, *colStartVals; + int32 rowOffset; + SubMatrix **subMatrices, *newSubMatrix; + + numRowIdxs = rowSlices->numVals; + numColIdxs = colSlices->numVals; + + rowStartVals = rowSlices->startVals; + colStartVals = colSlices->startVals; + + subMatrices = SSR__malloc_to(numRowIdxs * numColIdxs * sizeof(SubMatrix*), + animPr ); + + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) + { + rowOffset = rowIdx * 
numColIdxs; + + startRow = rowStartVals[rowIdx]; + endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is + // at last valid idx + 1 & is + // 1 greater than end value + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) + { + startCol = colStartVals[colIdx]; + endCol = colStartVals[colIdx + 1] -1; + + newSubMatrix = SSR__malloc_to( sizeof(SubMatrix), animPr ); + newSubMatrix->numRows = endRow - startRow +1; + newSubMatrix->numCols = endCol - startCol +1; + newSubMatrix->origMatrix = origMatrix; + newSubMatrix->origStartRow = startRow; + newSubMatrix->origStartCol = startCol; + newSubMatrix->alreadyCopied = FALSE; + newSubMatrix->numUsesLeft = numUses; //can free after this many + + subMatrices[ rowOffset + colIdx ] = newSubMatrix; + } + } + return subMatrices; + } + + +void +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + SubMatrix **subMatrices, VirtProcr *animPr ) + { + int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; + SubMatrix *subMatrix; + + numRowIdxs = rowSlices->numVals; + numColIdxs = colSlices->numVals; + + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) + { + rowOffset = rowIdx * numColIdxs; + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) + { + subMatrix = subMatrices[ rowOffset + colIdx ]; + if( subMatrix->alreadyCopied ) + SSR__free( subMatrix->array, animPr ); + SSR__free( subMatrix, animPr ); + } + } + SSR__free( subMatrices, animPr ); + } + + + +SlicingStruc * +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, + VirtProcr *animPr ) + { float32 residualAcc = 0; + int numSlices, i, *startVals, sizeOfSlice, endCondition; + SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr); + + //calc upper bound on number of slices, to size the array of start vals -- + if( idealSizeOfSide < 1 ) idealSizeOfSide = 1; //slices narrower than one index would never advance startVal + numSlices = (endVal -startVal +1) / (int32)idealSizeOfSide + 1; //every slice is at least (int)idealSizeOfSide wide + startVals = SSR__malloc_to( (numSlices + 1) * sizeof(int32), animPr ); + + //Calc the upper limit of start value -- when get above this, end loop + // by saving 
highest value of the matrix dimension to access, plus 1 + // as the start point of the imaginary slice following the last one + //Plus 1 because go up to value but not include when process last slice + //The stopping condition is half-a-size less than highest value because + // don't want any pieces smaller than half the ideal size -- just tack + // little ones onto end of last one + endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size + for( i = 0; startVal <= endVal; i++ ) + { + startVals[i] = startVal; + residualAcc += idealSizeOfSide; + sizeOfSlice = (int)residualAcc; + residualAcc -= (float32)sizeOfSlice; + startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. + + if( startVal > endCondition ) + { startVal = endVal + 1; + startVals[ i + 1 ] = startVal; + } + } + + slicingStruc->startVals = startVals; + slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 + // which means is num sub-matrices in dim + // also == idx of the fake start just above + return slicingStruc; + } + +void +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ) + { + SSR__free( slicingStruc->startVals, animPr ); + SSR__free( slicingStruc, animPr ); + } + + +int inline +measureMatrixMultPrimitive( VirtProcr *animPr ) + { + int r, c, v, numCycles; + float32 *res, *left, *right; + + //setup inputs + left = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); + right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); + res = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); + + for( r = 0; r < 5; r++ ) + { + for( c = 0; c < 5; c++ ) + { + left[ r * 5 + c ] = r; + right[ r * 5 + c ] = c; + } + } + + //do primitive + SSR__start_primitive(); //for now, just takes time stamp + for( r = 0; r < 5; r++ ) + { + for( c = 0; c < 5; c++ ) + { + for( v = 0; v < 5; v++ ) + { + res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; + } + } + } + numCycles = + SSR__end_primitive_and_give_cycles(); + + SSR__free( left, animPr ); + SSR__free( 
right, animPr ); + SSR__free( res, animPr ); + + return numCycles; + } diff -r 000000000000 -r 8d14fe28a782 src/Application/VPThread__Matrix_Mult/EntryPoint.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Application/VPThread__Matrix_Mult/EntryPoint.c Wed Nov 10 22:26:57 2010 -0800 @@ -0,0 +1,62 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#include + +#include "SSR_Matrix_Mult.h" + + + +/*Every SSR system has an "entry point" function that creates the first + * processor, which starts the chain of creating more processors.. + * eventually all of the processors will dissipate themselves, and + * return. + * + *This entry-point function follows the same pattern as all entry-point + * functions do: + *1) it creates the params for the seed processor, from the + * parameters passed into the entry-point function + *2) it calls SSR__create_seed_procr_and_do_work + *3) it gets the return value from the params struc, frees the params struc, + * and returns the value from the function + * + */ +Matrix * +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) + { Matrix *resMatrix; + DividerParams *dividerParams; + int32 numResRows, numResCols; + + + dividerParams = malloc( sizeof( DividerParams ) ); + dividerParams->leftMatrix = leftMatrix; + dividerParams->rightMatrix = rightMatrix; + + + numResRows = leftMatrix->numRows; + numResCols = rightMatrix->numCols; + + //VMS has its own separate internal malloc, so to get results out, + // have to pass in empty array for it to fill up + //The alternative is internally telling SSR make external space to use + resMatrix = malloc( sizeof(Matrix) ); + resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); + resMatrix->numCols = rightMatrix->numCols; + resMatrix->numRows = leftMatrix->numRows; + + + dividerParams->resultMatrix = resMatrix; + + //create divider processor, start doing 
the work, and wait till done + //This function is the "border crossing" between normal code and SSR + SSR__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs, + dividerParams ); + + free( dividerParams ); + return resMatrix; + } diff -r 000000000000 -r 8d14fe28a782 src/Application/VPThread__Matrix_Mult/Result_Pr.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Application/VPThread__Matrix_Mult/Result_Pr.c Wed Nov 10 22:26:57 2010 -0800 @@ -0,0 +1,108 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#include "SSR_Matrix_Mult.h" + +//===================== +void inline +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, + int32 startRow, + int32 numRows, + int32 startCol, + int32 numCols, + int32 numOrigCols ); + +//=========================================================================== + +/*The Result Processor gets a message from each of the vector processors, + * puts the result from the message in its location in the result- + * matrix, and increments the count of results. + * + *After the count reaches the point that all results have been received, it + * returns the result matrix and dissipates. 
+ */ +void gatherResults( void *_params, VirtProcr *animatingPr ) /* _params is treated as a ResultsParams pointer */ + { VirtProcr *dividerPr; + ResultsParams *params; + int row, col, numRows, numCols, numSubMatrixPairs, count=0; /* row, col, numRows, numCols are never read in this function */ + float32 *resultArray; + void *msg; + SMPairParams *resParams; + + DEBUG( dbgAppFlow, "start resultPr\n") + + params = (ResultsParams *)_params; + dividerPr = params->dividerPr; + numSubMatrixPairs = params->numSubMatrixPairs; /* number of RESULTS_MSG messages to wait for */ + numRows = params->numRows; + numCols = params->numCols; + + resultArray = params->resultArray; + + + while( count < numSubMatrixPairs ) + { + msg = SSR__receive_type_to( RESULTS_MSG, animatingPr ); + + resParams = (SMPairParams *)msg; + accumulateResult( resultArray, resParams->partialResultArray, + resParams->leftSubMatrix->origStartRow, + resParams->leftSubMatrix->numRows, + resParams->rightSubMatrix->origStartCol, + resParams->rightSubMatrix->numCols, + resParams->rightSubMatrix->origMatrix->numCols ); /* rows come from the left sub-matrix, cols and row stride from the right */ + + SSR__free( resParams->partialResultArray, animatingPr ); + + //there is only one copy of results procr, so can update numUsesLeft + // without concurrency worries. 
When zero, free the sub-matrix + resParams->leftSubMatrix->numUsesLeft -= 1; + if( resParams->leftSubMatrix->numUsesLeft == 0 ) + { + SSR__free( resParams->leftSubMatrix->array, animatingPr ); + SSR__free( resParams->leftSubMatrix, animatingPr ); + } + + resParams->rightSubMatrix->numUsesLeft -= 1; + if( resParams->rightSubMatrix->numUsesLeft == 0 ) + { + SSR__free( resParams->rightSubMatrix->array, animatingPr ); + SSR__free( resParams->rightSubMatrix, animatingPr ); + } + + //count of how many sub-matrix pairs accumulated so know when done + count++; + } + + //Done -- could just dissipate -- SSR will wait for all processors to + // dissipate before shutting down, and thereby making results available to + // outside, so no need to stop the divider from dissipating, so no need + // to send a hand-shake message to it -- but makes debug easier + SSR__send_from_to( NULL, animatingPr, dividerPr ); + SSR__dissipate_procr( animatingPr ); //frees any data owned by procr + } + +void inline /* adds a numRows x numCols partial result into resultArray at offset (startRow, startCol) */ +accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray, + int32 startRow, + int32 numRows, + int32 startCol, + int32 numCols, + int32 numOrigCols ) /* numOrigCols is used as the row stride of resultArray */ + { int32 row, col; + + for( row = 0; row < numRows; row++ ) + { + for( col = 0; col < numCols; col++ ) + { + resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] += + subMatrixPairResultArray[ row * numCols + col ]; /* partial result is indexed row-major with stride numCols */ + } + } + + } diff -r 000000000000 -r 8d14fe28a782 src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h Wed Nov 10 22:26:57 2010 -0800 @@ -0,0 +1,95 @@ +/* + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + */ + +#ifndef _SSR_MATRIX_MULT_H_ +#define _SSR_MATRIX_MULT_H_ + +#include + +#include "../../SSR_lib/SSR.h" +#include "../Matrix_Mult.h" + + +//=============================== Defines 
============================== +#define ROWS_IN_BLOCK 32 /* result rows per block in the blocked multiply */ +#define COLS_IN_BLOCK 32 /* result cols per block */ +#define VEC_IN_BLOCK 32 /* shared-dimension elements per block */ + +#define copyMatrixSingleton 1 /* IDs handed to SSR__start_singleton */ +#define copyTransposeSingleton 2 + +//============================== Structures ============================== +typedef struct + { + Matrix *leftMatrix; + Matrix *rightMatrix; + Matrix *resultMatrix; + } +DividerParams; + +typedef struct + { + VirtProcr *dividerPr; /* gatherResults sends it a NULL message once all results are in */ + int numRows; + int numCols; + int numSubMatrixPairs; /* how many RESULTS_MSG messages gatherResults waits for */ + float32 *resultArray; /* partial results are added into this */ + } +ResultsParams; + +typedef +struct + { int32 numRows; + int32 numCols; + Matrix *origMatrix; /* matrix this sub-matrix is copied out of */ + int32 origStartRow; /* position of this piece inside origMatrix */ + int32 origStartCol; + int32 alreadyCopied; /* set TRUE once array holds the copied values */ + int32 numUsesLeft; //have to update via message to avoid multiple writers + float32 *array; //2D, but dynamically sized, so use addr arith + } +SubMatrix; + +typedef struct + { VirtProcr *resultPr; /* where calcSubMatrixProduct sends its result */ + SubMatrix *leftSubMatrix; + SubMatrix *rightSubMatrix; + float32 *partialResultArray; /* set by calcSubMatrixProduct, freed by gatherResults */ + } +SMPairParams; + +typedef +struct + { int32 numVals; + int32 *startVals; + } +SlicingStruc; + +typedef +struct + { + SlicingStruc *leftRowSlices; + SlicingStruc *vecSlices; + SlicingStruc *rightColSlices; + } +SlicingStrucCarrier; + +enum MMMsgType + { + RESULTS_MSG = 1 /* type of the message calcSubMatrixProduct sends to the results procr */ + }; + +//============================= Processor Functions ========================= +void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr ); +void calcSubMatrixProduct( void *data, VirtProcr *animatingPr ); +void gatherResults( void *data, VirtProcr *animatingPr ); + + +//================================ Entry Point ============================== +Matrix * +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); + + +#endif /*_SSR_MATRIX_MULT_H_*/ diff -r 000000000000 -r 8d14fe28a782 src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c Wed Nov 10 22:26:57 2010 -0800 @@ -0,0 +1,299 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org 
+ * Licensed under GNU General Public License version 2 + * + * Author: SeanHalle@yahoo.com + * + */ + +#include + +#include "SSR_Matrix_Mult.h" + + + +void inline +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); + +void inline +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); + +void inline +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, + float32 *resArray, + int startRow, int endRow, + int startCol, int endCol, + int startVec, int endVec, + int resStride, int inpStride ); + +void inline +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, + float32 *leftArray, float32 *rightArray, + float32 *resArray ); + + +/*A processor is created with an environment that holds two matrices, + * the row and col that it owns, and the name of a result gathering + * processor. + *It calculates the product of two sub-portions of the input matrices + * by calling multiplyMatrixArraysTransposed, the blocked multiply below. + * + *This demonstrates using optimized single-threaded code inside scheduled + * work-units. 
+ * + *When done, it sends the result to the result processor + */ +void +calcSubMatrixProduct( void *data, VirtProcr *animatingPr ) + { + SMPairParams *params; + VirtProcr *resultPr; + float32 *leftArray, *rightArray, *resArray; + SubMatrix *leftSubMatrix, *rightSubMatrix; + + DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID) + int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx", + animatingPr); + VMS__record_sched_choice_into_probe( subMatrixProbe, animatingPr ); + VMS__record_interval_start_in_probe( subMatrixProbe ); + + params = (SMPairParams *)data; + resultPr = params->resultPr; + leftSubMatrix = params->leftSubMatrix; + rightSubMatrix = params->rightSubMatrix; + + //make sure the input sub-matrices have been copied out of orig + //do it here, inside sub-matrix pair to hopefully gain reuse in cache + copyFromOrig( leftSubMatrix, animatingPr ); + copyTransposeFromOrig( rightSubMatrix, animatingPr ); /* right operand is kept transposed */ + + leftArray = leftSubMatrix->array; + rightArray = rightSubMatrix->array; + + int32 + resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); + resArray = SSR__malloc_to( resSize, animatingPr ); + memset( resArray, 0, resSize ); /* the multiply adds into resArray, so it must start at zero */ + + + int32 numResRows, numResCols, vectLength; + + vectLength = leftSubMatrix->numCols; /* length of the dimension shared by left and right */ + numResRows = leftSubMatrix->numRows; + numResCols = rightSubMatrix->numCols; + + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, + leftArray, rightArray, + resArray ); + + //send result to result processor + params->partialResultArray = resArray; + + VMS__record_interval_end_in_probe( subMatrixProbe ); + + SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr ); /* params itself is the message; gatherResults frees partialResultArray */ + SSR__dissipate_procr( animatingPr ); + } + + + +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into + * the 32KB L1 cache. 
+ *Would be nice to embed this within another level that divided into + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache + * + *Eventually want these divisions to be automatic, using DKU pattern + * embedded into VMS and exposed in the language, and with VMS controlling the + * divisions according to the cache sizes, which it knows about. + *Also, want VMS to work with language to split among main-mems, so a socket + * only cranks on data in its local segment of main mem + * + *So, outer two loops determine start and end points within the result matrix. + * Inside that, a loop determines the start and end points along the shared dimensions + * of the two input matrices. All End values are inclusive indexes. + */ +void inline +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, + int32 numResCols, + float32 *leftArray, float32 *rightArray, + float32 *resArray ) + { + int resStride, inpStride; + int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; + + resStride = numResCols; + inpStride = vecLength; + + for( resStartRow = 0; resStartRow < numResRows; ) + { + resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 + if( resEndRow >= numResRows ) resEndRow = numResRows -1; /* >= because resEndRow is inclusive; > let row numResRows through */ + + for( resStartCol = 0; resStartCol < numResCols; ) + { + resEndCol = resStartCol + COLS_IN_BLOCK -1; + if( resEndCol >= numResCols ) resEndCol = numResCols -1; /* same inclusive-end clamp for cols */ + + for( startVec = 0; startVec < vecLength; ) + { + endVec = startVec + VEC_IN_BLOCK -1; + if( endVec >= vecLength ) endVec = vecLength -1; /* same inclusive-end clamp for the shared dimension */ + + //By having the "vector" of sub-blocks in a sub-block slice + // be marched down in inner loop, are re-using the result + // matrix, which stays in L1 cache and re-using the left sub-mat + // which repeats for each right sub-mat -- can only re-use two of + // the three, so result is the most important -- avoids writing + // dirty blocks until those result-locations fully done + //Row and Col is position in result matrix -- so row and vec + // for left array, then vec and col for right 
array + multiplySubBlocksTransposed( leftArray, rightArray, + resArray, + resStartRow, resEndRow, + resStartCol, resEndCol, + startVec, endVec, + resStride, inpStride ); + startVec = endVec +1; /* next block starts just past the inclusive end */ + } + resStartCol = resEndCol +1; + } + resStartRow = resEndRow +1; + } + } + + + +void inline /* adds one left block times one right block into resArray; End indexes are inclusive */ +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, + float32 *resArray, + int resStartRow, int resEndRow, + int resStartCol, int resEndCol, + int startVec, int endVec, + int resStride, int inpStride ) + { + int resRow, resCol, vec; + int leftOffset, rightOffset; + float32 result; + + //The result row is used only for the left matrix, res col for the right + for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) + { + for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) + { + leftOffset = resRow * inpStride;//left & right inp strides always same + rightOffset = resCol * inpStride;// because right is transposed + result = 0; + for( vec = startVec; vec <= endVec; vec++ ) + { + result += + leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; + } + + resArray[ resRow * resStride + resCol ] += result; /* += so successive vec blocks accumulate */ + } + } + } + + + + +/*Reuse this in divider when do the sequential multiply case + */ +void inline +copyTranspose( int32 numRows, int32 numCols, + int32 origStartRow, int32 origStartCol, int32 origStride, + float32 *subArray, float32 *origArray ) + { int32 stride = numRows; /* row length of the transposed copy */ + + int row, col, origOffset; + for( row = 0; row < numRows; row++ ) + { + origOffset = (row + origStartRow) * origStride + origStartCol; + for( col = 0; col < numCols; col++ ) + { + //transpose means swap row & col -- traverse orig matrix normally + // but put into reversed place in local array -- means the + // stride is the numRows now, so col * numRows + row + subArray[ col * stride + row ] = origArray[ origOffset + col ]; + } + } + } + +void inline +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) + { int numCols, numRows, origStartRow, origStartCol, 
origStride, stride; + Matrix *origMatrix; + float32 *origArray, *subArray; + + if( subMatrix->alreadyCopied ) return; + SSR__start_singleton( copyTransposeSingleton, &&EndOfTransSingleton, animPr); /* own ID: sharing copyFromOrig's ID would let whichever copy ran first suppress the other */ + + origMatrix = subMatrix->origMatrix; + origArray = origMatrix->array; + numCols = subMatrix->numCols; + numRows = subMatrix->numRows; + origStartRow = subMatrix->origStartRow; + origStartCol = subMatrix->origStartCol; + origStride = origMatrix->numCols; + + subArray = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr); + subMatrix->array = subArray; + + //copy values from orig matrix to local + copyTranspose( numRows, numCols, + origStartRow, origStartCol, origStride, + subArray, origArray ); + + subMatrix->alreadyCopied = TRUE; //must be last thing before label + EndOfTransSingleton: + return; + } + + +void inline +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) + { int numCols, numRows, origStartRow, origStartCol, stride, origStride; + Matrix *origMatrix; + float32 *origArray, *subArray; + + + //This lets only a single VP execute the code between start and + // end -- using start and end so that work runs outside the master. + //Inside, if a second VP ever executes the start, it will be returned + // from the end-point. 
+ //Note, for non-GCC, can add a second SSR call at the end, and inside + // that one, look at the stack at the return addr & save that in an + // array indexed by singletonID + if( subMatrix->alreadyCopied ) return; + SSR__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton, animPr ); + + + origMatrix = subMatrix->origMatrix; + origArray = origMatrix->array; + numCols = subMatrix->numCols; + numRows = subMatrix->numRows; + origStartRow = subMatrix->origStartRow; + origStartCol = subMatrix->origStartCol; + origStride = origMatrix->numCols; + + subArray = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr); + subMatrix->array = subArray; + + //copy values from orig matrix to local + stride = numCols; + + int row, col, offset, origOffset; + for( row = 0; row < numRows; row++ ) + { + offset = row * stride; + origOffset = (row + origStartRow) * origStride + origStartCol; + for( col = 0; col < numCols; col++ ) + { + subArray[ offset + col ] = origArray[ origOffset + col ]; + } + } + + subMatrix->alreadyCopied = TRUE; //must be last thing before label + EndOfCopySingleton: + return; + } diff -r 000000000000 -r 8d14fe28a782 src/Application/main.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Application/main.c Wed Nov 10 22:26:57 2010 -0800 @@ -0,0 +1,37 @@ +/* + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * author seanhalle@yahoo.com + */ + +#include +#include + +#include "Matrix_Mult.h" +#include "SSR_Matrix_Mult/SSR_Matrix_Mult.h" + +/** + *Matrix multiply program written using VMS_HW piggy-back language + * + */ +int main( int argc, char **argv ) + { Matrix *leftMatrix, *rightMatrix, *resultMatrix; + ParamBag *paramBag; + + if( argc < 2 ) { printf( "usage: %s paramFileName\n", argv[0] ); exit(1); } /* argv[1] is NULL without an argument; it is printed and passed on below */ + printf( "arguments: %s | %s\n", argv[0], argv[1] ); + + paramBag = makeParamBag(); + readParamFileIntoBag( argv[1], paramBag ); + initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); + + resultMatrix = multiplyTheseMatrices( 
leftMatrix, rightMatrix ); + + printf("\nresult matrix: \n"); + printMatrix( resultMatrix ); + +// SSR__print_stats(); + + exit(0); //cleans up + }