# HG changeset patch
# User Me
# Date 1288496629 25200
# Node ID 56e17dcfc0c32aaa3cabf7c19208cd2345b501f1
Initial add -- works, together with VMS pin2Core with vmalloc and probes

diff -r 000000000000 -r 56e17dcfc0c3 .hgignore
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgignore Sat Oct 30 20:43:49 2010 -0700
@@ -0,0 +1,5 @@
+nbproject
+build
+dist
+.dep.inc
+Makefile
diff -r 000000000000 -r 56e17dcfc0c3 src/Application/Matrix_Mult.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Application/Matrix_Mult.c Sat Oct 30 20:43:49 2010 -0700
@@ -0,0 +1,208 @@
+/*
+ * Copyright 2009 OpenSourceStewardshipFoundation.org
+ * Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ * Created on November 15, 2009, 2:35 AM
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "Matrix_Mult.h"
+#include "ParamHelper/Param.h"
+
+
+/*Builds both input matrices from the values in the param bag: for each
+ * matrix it reads the dimensions, allocates the matrix, then fills it from
+ * the named data file.  The new matrices come back via the two out-params.
+ *NOTE(review): assumes every named param is present in the bag -- confirm
+ * what getParamFromBag gives back for a missing name.
+ */
+ void
+initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix,
+                               ParamBag *paramBag )
+ { char *leftMatrixFileName, *rightMatrixFileName;
+   int   leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols;
+   ParamStruc *param;
+
+   param = getParamFromBag( "leftMatrixRows", paramBag );
+   leftMatrixRows = param->intValue;
+   param = getParamFromBag( "leftMatrixCols", paramBag );
+   leftMatrixCols = param->intValue;
+   *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols );
+
+   param = getParamFromBag( "leftMatrixFileName", paramBag );
+   leftMatrixFileName = param->strValue;  //no need to copy
+   read_Matrix_From_File( *leftMatrix, leftMatrixFileName );
+
+   param = getParamFromBag( "rightMatrixRows", paramBag );
+   rightMatrixRows = param->intValue;
+   param = getParamFromBag( "rightMatrixCols", paramBag );
+   rightMatrixCols = param->intValue;
+   *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols );
+
+   param = getParamFromBag( "rightMatrixFileName", paramBag );
+   rightMatrixFileName = param->strValue;
+   read_Matrix_From_File( *rightMatrix, rightMatrixFileName );
+ }
+
+
+void parseLineIntoRow( char *line, float32* row );
+
+/*Fills matrixStruc->array from a text file that holds one matrix row per
+ * line, values separated by commas.  Lines that start with a newline or a
+ * '/' are skipped, and do not use up a row of the matrix.
+ *Exits if the file can't be opened.  If the file runs out before all the
+ * rows are read, prints a message and leaves the remaining rows unfilled.
+ */
+ void
+read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName )
+ { int      row, numRows, numCols;
+   float32 *matrixStart;
+   size_t   lineSz = 0;
+   FILE    *file;
+   char    *line = NULL;   //getline allocates this, and grows it as needed
+
+   numRows     = matrixStruc->numRows;
+   numCols     = matrixStruc->numCols;
+   matrixStart = matrixStruc->array;
+
+   file = fopen( matrixFileName, "r" );
+   if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);}
+
+   row = 0;
+   while( row < numRows )
+    {
+      if( getline( &line, &lineSz, file ) == -1 )
+       { if( feof( file ) ) printf( "file ran out too soon" );
+         else               printf( "prob reading mat line");
+         break;            //line holds nothing new, so don't parse it
+       }
+      if( *line == '\n') continue;  //blank line
+      if( *line == '/' ) continue;  //comment line
+
+      parseLineIntoRow( line, matrixStart + row * numCols );
+      row++;               //only a line that held data uses up a row
+    }
+   free( line );
+   fclose( file );
+ }
+
+/*Splits one line of comma-separated values into floats, stored from row[0]
+ * upward.  The line is modified: each separator is overwritten with 0.
+ *This function relies on each line having the proper number of cols.  It
+ * doesn't check, nor enforce, so if the file is improperly formatted it
+ * can write over unrelated memory
+ */
+ void
+parseLineIntoRow( char *line, float32* row )
+ { char *valueStr, *searchPos;
+
+   searchPos = valueStr = line;  //valueStr is the start of the current value
+
+   for( ; *searchPos != 0; searchPos++)
+    {
+      if( *searchPos == '\n' )   //end of the data on this line
+       { *searchPos = 0;
+         break;
+       }
+      if( *searchPos == ',' )
+       { *searchPos = 0;         //mark end of string
+         *row = (float32) atof( valueStr );
+         row += 1;               //address arith
+            //skip any spaces before digits.. use searchPos + 1 to skip the 0
+         for( ; *(searchPos + 1) == ' '; searchPos++ );
+         valueStr = searchPos + 1;
+       }
+    }
+      //The last col ends at the newline or at the end of the string; store it
+      // in both cases, unless empty (as it is right after a trailing comma)
+   if( *valueStr != 0 ) *row = (float32) atof( valueStr );
+ }
+
+  //==========================================================================
+
+/*Shared by the constructors: a malloc that ends the program when there is no
+ * memory, the same way read_Matrix_From_File does for a file it can't open.
+ */
+ static void *
+mallocOrExit( size_t numBytes )
+ { void *mem = malloc( numBytes );
+   if( mem == NULL && numBytes != 0 ) { printf( "\nno mem for matrix\n" ); exit(1); }
+   return mem;
+ }
+
+/*In the "_Flat" version of constructor, do only malloc of the top data struc
+ * and set values in that top-level.  Don't malloc any sub-structures.
+ *The array pointer is set to NULL, rather than left uninitialized.
+ */
+ Matrix *
+makeMatrix_Flat( int32 numRows, int32 numCols )
+ { Matrix * retMatrix;
+   retMatrix = mallocOrExit( sizeof( Matrix ) );
+   retMatrix->numRows = numRows;
+   retMatrix->numCols = numCols;
+   retMatrix->array   = NULL;
+
+   return retMatrix;
+ }
+
+/*Makes the top-level struc plus an array of numRows * numCols floats, whose
+ * contents are not initialized.  freeMatrix releases both.
+ */
+ Matrix *
+makeMatrix_WithResMat( int32 numRows, int32 numCols )
+ { Matrix * retMatrix;
+   retMatrix = mallocOrExit( sizeof( Matrix ) );
+   retMatrix->numRows = numRows;
+   retMatrix->numCols = numCols;
+      //multiply as size_t, so that a big matrix can't overflow int32
+   retMatrix->array =
+      mallocOrExit( (size_t)numRows * (size_t)numCols * sizeof(float32) );
+
+   return retMatrix;
+ }
+
+/*Counterpart of makeMatrix_Flat.  Does nothing at present.
+ *NOTE(review): the top-level struc is not freed here -- confirm whether that
+ * is intended before counting on this to release memory.
+ */
+ void
+freeMatrix_Flat( Matrix * matrix )
+ { //( matrix );
+ }
+
+/*Frees the array, then the top-level struc.  A NULL matrix is ignored.
+ */
+ void
+freeMatrix( Matrix * matrix )
+ { if( matrix == NULL ) return;
+   free( matrix->array );
+   free( matrix );
+ }
+
+/*Prints a sample of the matrix: every rowIncr-th row and colIncr-th col,
+ * with the increments picked so 20 to 39 of each get printed (or all of
+ * them, when the matrix has fewer than 20).
+ */
+void
+printMatrix( Matrix *matrix )
+ { int r, c, numRows, numCols, rowIncr, colIncr;
+   float32 *matrixArray;
+
+   numRows = matrix->numRows;
+   numCols = matrix->numCols;
+   matrixArray = matrix->array;
+
+   rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed
+   colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed
+   for( r = 0; r < numRows; r += rowIncr )
+    { for( c = 0; c < numCols; c += colIncr )
+       { printf( "%3.1f | ", matrixArray[ r * numCols + c ] );
+       }
+      printf("\n");
+    }
+ }
+
diff -r 000000000000 -r 56e17dcfc0c3 src/Application/Matrix_Mult.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Application/Matrix_Mult.h Sat Oct 30 20:43:49 2010 -0700
@@ -0,0 +1,77 @@
+/*
+ * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
+ * Licensed under GNU General Public License version 2
+ */
+
+#ifndef MATRIX_MULT_H_
+#define 
MATRIX_MULT_H_ + +#include +#include +#include + +#include "../VCilk_lib/VMS/VMS_primitive_data_types.h" +#include "ParamHelper/Param.h" + +//============================== Structures ============================== + +typedef +struct + { int32 numRows; + int32 numCols; + float32 *array; //2D, but dynamically sized, so use addr arith + } +Matrix; + +/* This is the "appSpecificPiece" that is carried inside a DKUPiece. + * In the DKUPiece data struc it is declared to be of type "void *". This + * allows the application to define any data structure it wants and put it + * into a DKUPiece. + * When the app specific info is used, it is in app code, so it is cast to + * the correct type to tell the compiler how to access fields. + * This keeps all app-specific things out of the DKU directory, as per the + * DKU standard. */ +typedef +struct + { + // pointers to shared data.. the result matrix must be created when the + // left and right matrices are put into the root ancestor DKUPiece. + Matrix * leftMatrix; + Matrix * rightMatrix; + Matrix * resultMatrix; + + // define the starting and ending boundaries for this piece of the + // result matrix. These are derivable from the left and right + // matrices, but included them for readability of code. 
+ int prodStartRow, prodEndRow; + int prodStartCol, prodEndCol; + // Start and end of the portion of the left matrix that contributes to + // this piece of the product + int leftStartRow, leftEndRow; + int leftStartCol, leftEndCol; + // Start and end of the portion of the right matrix that contributes to + // this piece of the product + int rightStartRow, rightEndRow; + int rightStartCol, rightEndCol; + } +MatrixProdPiece; + +//============================== Functions ================================ +void readFile(); + +Matrix *makeMatrix( int32 numRows, int32 numCols ); +Matrix *makeMatrix_Flat( int32 numRows, int32 numCols ); +Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols ); +void freeMatrix_Flat( Matrix * matrix ); +void freeMatrix( Matrix * matrix ); +void printMatrix( Matrix *matrix ); + +void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ); + +void +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, + ParamBag *paramBag ); + +//=========================================================================== + +#endif /*MATRIX_MULT_H_*/ diff -r 000000000000 -r 56e17dcfc0c3 src/Application/VCilk__Matrix_Mult/Divide_Pr.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Application/VCilk__Matrix_Mult/Divide_Pr.c Sat Oct 30 20:43:49 2010 -0700 @@ -0,0 +1,583 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + + +#include "VCilk__Matrix_Mult.h" +#include +#include + + //The time to compute this many result values should equal the time to + // perform this division on a matrix of size gives that many result calcs + //IE, size this so that sequential time to calc equals divide time + // find the value by experimenting -- but divide time and calc time scale + // same way, so this value should remain valid across hardware + //Divide time is about 800us on 2.4Ghz core2Quad laptop core + //num cells is 
the cube of a side, when have two square matrices +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 100000 /* about 46x46 */ + + +//=========================================================================== +int inline +measureMatrixMultPrimitive( VirtProcr *animPr ); + +SlicingStrucCarrier * +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, + VirtProcr *animPr ); + +SlicingStruc * +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, + VirtProcr *animPr ); + +void +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ); + +SubMatrix ** +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + Matrix *origMatrix, VirtProcr *animPr ); + +void +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + SubMatrix **subMatrices, VirtProcr *animPr ); + +void +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, + SubMatrix **rightSubMatrices, + int32 numRowIdxs, int32 numColIdxs, + int32 numVecIdxs, + float32 *resultArray, + VirtProcr *animatingPr ); + +void +makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, + SlicingStrucCarrier *slicingStrucCarrier, + float32 *resultArray, VirtProcr *animatingPr ); + +//=========================================================================== + +/*Divider creates one processor for every sub-matrix + * It hands them: + * the name of the result processor that they should send their results to, + * the left and right matrices, and the rows and cols they should multiply + * It first creates the result processor, then all the sub-matrixPair + * processors, + * then does a receive of a message from the result processor that gives + * the divider ownership of the result matrix. + * Finally, the divider returns the result matrix out of the VCilk system. + * + * Divider chooses the size of sub-matrices via an algorithm that tries to + * keep the minimum work above a threshold. 
The threshold is machine-
+ *  dependent, so ask VCilk for min work-unit time to get a
+ *  given overhead
+ *
+ * Divide min work-unit cycles by measured-cycles for one matrix-cell
+ *  product -- gives the number of products need to have in min size
+ *  matrix.
+ *
+ * So then, take cube root of this to get the size of a side of min sub-
+ *  matrix.  That is the size of the ideal square sub-matrix -- so tile
+ *  up the two input matrices into ones as close as possible to that size,
+ *  and create the pairs of sub-matrices.
+ *
+ *========================  STRATEGIC OVERVIEW  =======================
+ *
+ *This division is a bit tricky, because have to create things in advance
+ * that it's not at first obvious need to be created..
+ *
+ *First slice up each dimension -- three of them.. this is because will have
+ * to create the sub-matrix's data-structures before pairing the sub-matrices
+ * with each other -- so, have three dimensions to slice up before can
+ * create the sub-matrix data-strucs -- also, have to be certain that the
+ * cols of the left input have the exact same slicing as the rows of the
+ * right matrix, so just to be sure, do the slicing calc once, then use it
+ * for both.
+ *
+ *So, goes like this:
+ *1) calculate the start & end values of each dimension in each matrix.
+ *2) use those values to create sub-matrix structures
+ *3) combine sub-matrices into pairs, as the tasks to perform.
+ *
+ *Have to calculate separately from creating the sub-matrices because of the
+ * nature of the nesting -- would either end up creating the same sub-matrix
+ * multiple times, or else would have to put in detection of whether had
+ * made a particular one already if tried to combine steps 1 and 2.
+ *
+ *Step 3 has to be separate because of the nesting, as well -- same reason,
+ * would either create same sub-matrix multiple times, or else have to
+ * add detection of whether was already created.
+ *
+ *Another way to look at it: there's one level of loop to divide dimensions,
+ * two levels of nesting to create sub-matrices, and three levels to pair
+ * up the sub-matrices.
+ */
+
+void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams,
+                                        VirtProcr *animPr )
+ {
+   DividerParams *dividerParams;
+   Matrix *leftMatrix, *rightMatrix;
+   SlicingStrucCarrier *slicingStrucCarrier;
+   float32 *resultArray; //points to array to be put inside result
+                         // matrix
+
+   PRINT_DEBUG("start divide\n")
+
+//TODO: VMS__create_block_of_probes_with_idxs( 0, 2 );
+   int32
+   divideProbe = VMS__create_single_interval_probe( "divideProbe",
+                                                    animPr );
+   VMS__record_sched_choice_into_probe( divideProbe, animPr );
+   VMS__record_interval_start_in_probe( divideProbe );
+
+      //=========== Setup -- make local copies of ptd-to-things, malloc, aso
+   int32 numResRows, numResCols;
+
+   dividerParams = (DividerParams *)_dividerParams;
+
+   leftMatrix = dividerParams->leftMatrix;
+   rightMatrix = dividerParams->rightMatrix;
+
+   numResRows = leftMatrix->numRows;
+   numResCols = rightMatrix->numCols;
+   resultArray = dividerParams->resultMatrix->array;
+
+
+      //============== Do either sequential mult or do division ==============
+
+      //Check if input matrices too small -- if yes, just do sequential
+      //Cutoff is determined by overhead of this divider -- relatively
+      // machine-independent
+      //The product is taken in float32, so it can't overflow an int32
+   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
+       (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
+    { int32 vectLength;
+
+         //====== Do sequential multiply on a single core
+      PRINT_DEBUG("doing sequential")
+      vectLength = leftMatrix->numCols;
+
+      multiplyMatrixArrays( vectLength, numResRows, numResCols,
+                            leftMatrix->array, rightMatrix->array,
+                            resultArray );
+    }
+   else
+    {
+         //====== Do parallel multiply across cores
+
+         //Calc the ideal size of sub-matrix and slice up the dimensions of
+         // the two matrices.
+         //The ideal size is the one takes the number of cycles to calculate
+         // such that calc time is equal or greater than min work-unit size
+      slicingStrucCarrier =
+         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr);
+
+
+         //Make the results processor, now that know how many to wait for
+/*
+      resultsParams = VCilk__malloc( sizeof(ResultsParams) );
+      resultsParams->dividerPr = animatingPr;
+      resultsParams->numSubMatrixPairs =
+         slicingStrucCarrier->leftRowSlices->numVals *
+         slicingStrucCarrier->rightColSlices->numVals *
+         slicingStrucCarrier->vecSlices->numVals;
+      resultsParams->numCols = rightMatrix->numCols;
+      resultsParams->numRows = leftMatrix->numRows;
+*/
+
+         //Make the sub-matrices, and pair them up, then spawn processors to
+         // calc product of each pair.
+      makeSubMatricesAndSpawnAndSync( leftMatrix, rightMatrix,
+                                      slicingStrucCarrier,
+                                      resultArray, animPr);
+         //The result array will get filled in by the spawned children
+    }
+
+
+      //=============== Work done -- send results back =================
+
+
+      //results have been saved into an array that was made outside the VMS
+      // system, by entry-point Fn, and passed in through dividerParams.
+      //So, nothing to do to send results back -- they're seen by side-effect
+
+   PRINT_DEBUG("end divide\n")
+
+   VMS__record_interval_end_in_probe( divideProbe );
+      //NOTE(review): no call parens below -- confirm it's a macro, else no-op
+   VMS__print_stats_of_all_probes;
+
+   VCilk__dissipate_procr( animPr );  //all procrs dissipate self at end
+      //when all of the processors have dissipated, the "create seed and do
+      // work" call in the entry point function returns
+ }
+
+/*Picks the target size for a side of a sub-matrix -- the larger of the size
+ * that makes one work-unit big enough and the size that gives the ideal
+ * number of work-units -- then slices up the three dimensions to that size.
+ */
+SlicingStrucCarrier *
+calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
+                                 VirtProcr *animPr )
+{
+   float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
+   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
+   SlicingStrucCarrier *slicingStrucCarrier =
+                         VCilk__malloc(sizeof(SlicingStrucCarrier), animPr );
+   int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
+   float64 numPrimitiveOpsInMinWorkUnit;
+
+      //======= Calc ideal size of min-sized sub-matrix ========
+      //ask VCilk for the number of cycles of the minimum work unit, at given
+      // percent overhead then add a guess at overhead from this divider
+   minWorkUnitCycles = VCilk__giveMinWorkUnitCycles( .05 );
+
+      //ask VCilk for number of cycles of the "primitive" op of matrix mult
+   primitiveCycles = measureMatrixMultPrimitive( animPr );
+   if( primitiveCycles < 1 ) primitiveCycles = 1;  //never divide by zero
+
+   numPrimitiveOpsInMinWorkUnit =
+      (float64)minWorkUnitCycles / (float64)primitiveCycles;
+
+      //take cubed root -- that's number of these in a "side" of sub-matrix
+      // then multiply by 5 because the primitive is 5x5
+   idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit );
+
+   idealNumWorkUnits = VCilk__giveIdealNumWorkUnits();
+   if( idealNumWorkUnits < 1 ) idealNumWorkUnits = 1; //else divides by zero
+
+   idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
+   idealSizeOfSide2 *= 0.6; //finer granularity to help load balance
+
+   if( idealSizeOfSide1 > idealSizeOfSide2 )
+      idealSizeOfSide = idealSizeOfSide1;
+   else
+      idealSizeOfSide = idealSizeOfSide2;
+
+      //The multiply inner loop blocks the array to fit into L1 cache
+//   if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK;
+
+      //============ Slice up dimensions, now that know target size ===========
+      //Tell the slicer the target size of a side (floating pt), the start
+      // value to start slicing at, and the end value to stop slicing at
+      //It returns an array of start value of each chunk, plus number of them
+   int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol;
+   startLeftRow  = 0;
+   endLeftRow    = leftMatrix->numRows -1;
+   startVec      = 0;
+   endVec        = leftMatrix->numCols -1;
+   startRightCol = 0;
+   endRightCol   = rightMatrix->numCols -1;
+
+   leftRowSlices =
+      sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr );
+
+   vecSlices =
+      sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr );
+
+   rightColSlices =
+      sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr);
+
+   slicingStrucCarrier->leftRowSlices  = leftRowSlices;
+   slicingStrucCarrier->vecSlices      = vecSlices;
+   slicingStrucCarrier->rightColSlices = rightColSlices;
+   return slicingStrucCarrier;
+}
+
+/*Makes both tables of sub-matrices, spawns the work on them, then frees all*/
+void
+makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix,
+                                SlicingStrucCarrier *slicingStrucCarrier,
+                                float32 *resultArray, VirtProcr *animPr )
+ {
+   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
+
+   leftRowSlices  = slicingStrucCarrier->leftRowSlices;
+   vecSlices      = slicingStrucCarrier->vecSlices;
+   rightColSlices = slicingStrucCarrier->rightColSlices;
+   VCilk__free( slicingStrucCarrier, animPr );
+
+      //================ Make sub-matrices, given the slicing ================
+   SubMatrix **leftSubMatrices, **rightSubMatrices;
+   leftSubMatrices =
+      createSubMatrices( leftRowSlices, vecSlices,
+                         leftMatrix, animPr );
+   rightSubMatrices =
+      createSubMatrices( vecSlices, rightColSlices,
+                         rightMatrix, animPr );
+
+      //============== pair the sub-matrices and make processors ==============
+   int32 numRowIdxs, numColIdxs, numVecIdxs;
+
+   numRowIdxs = leftRowSlices->numVals;
+   numColIdxs = rightColSlices->numVals;
+   numVecIdxs = 
vecSlices->numVals;
+   pairUpSubMatricesAndSpawnAndSync( leftSubMatrices, rightSubMatrices,
+                                     numRowIdxs, numColIdxs, numVecIdxs,
+                                     resultArray,
+                                     animPr );
+      //It syncs inside, so know all work is done now: free the sub-matrices
+   freeSubMatrices( leftRowSlices, vecSlices, leftSubMatrices, animPr );
+   freeSubMatrices( vecSlices, rightColSlices, rightSubMatrices, animPr );
+
+
+   freeSlicingStruc( leftRowSlices, animPr );
+   freeSlicingStruc( vecSlices, animPr );
+   freeSlicingStruc( rightColSlices, animPr );
+ }
+
+
+/*Spawns one calcVectorOfSubMatrices processor for each cell of the grid of
+ * result sub-matrices (numRowIdxs by numColIdxs), then syncs on them all.
+ *Each spawn is given the offset of its row of left sub-matrices and the idx
+ * of its col of right sub-matrices, and is assigned to a core this way:
+ * numRows*colsPerRow/numCores = numToPutOntoEachCore; put all from a given
+ * row onto same core, until exhaust allotment for that core
+ */
+void
+pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices,
+                                  SubMatrix **rightSubMatrices,
+                                  int32 numRowIdxs, int32 numColIdxs,
+                                  int32 numVecIdxs,
+                                  float32 *resultArray,
+                                  VirtProcr *animatingPr )
+ {
+   int32 resRowIdx, resColIdx;
+   int32 numLeftColIdxs, numRightColIdxs;
+   int32 leftRowIdxOffset;
+   VecParams *vecParams;
+   float32 numToPutOntoEachCore, leftOverFraction;
+   int32 numCores, currCore, numOnCurrCore;
+
+   numLeftColIdxs  = numVecIdxs; //left table is numRowIdxs x numVecIdxs
+   numRightColIdxs = numColIdxs; //right table is numVecIdxs x numColIdxs
+
+   numCores = VCilk__give_number_of_cores_to_spawn_onto();
+
+   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
+   leftOverFraction = 0;
+   numOnCurrCore = 0;
+   currCore = 0;
+
+   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
+    {
+      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
+
+      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
+       {
+         vecParams = VCilk__malloc( sizeof(VecParams), animatingPr );
+
+         vecParams->numVecIdxs       = numVecIdxs;
+         vecParams->numRightColIdxs  = numRightColIdxs;
+         vecParams->leftRowIdxOffset = leftRowIdxOffset;
+         vecParams->resColIdx        = resColIdx;
+         vecParams->leftSubMatrices  = leftSubMatrices;
+         vecParams->rightSubMatrices = rightSubMatrices;
+         vecParams->resultArray      = resultArray;
+         vecParams->coreToRunOn      = currCore;
+
+         VCilk__spawn( currCore, &calcVectorOfSubMatrices, vecParams,
+                       animatingPr );
+
+         numOnCurrCore += 1;
+         if( numOnCurrCore + leftOverFraction >= numToPutOntoEachCore - 1 )
+          {
+               //deal with fractional part, to ensure that imbalance is 1 max
+               // IE, core with most has only 1 more than core with least
+            leftOverFraction += numToPutOntoEachCore - numOnCurrCore;
+            if( leftOverFraction >= 1 )
+             { leftOverFraction -= 1;
+               numOnCurrCore = -1;
+             }
+            else
+             { numOnCurrCore = 0;
+             }
+               //Move to next core, max core-value to incr to is numCores -1
+            if( currCore >= numCores -1 )
+             { currCore = 0;
+             }
+            else
+             { currCore += 1;
+             }
+          }
+       }
+    }
+
+      //Free Note: vector of sub-matrices does its own free-ing, even vec-params
+
+//TODO: timeToSpawnProbe = VMS__get_probe_by_name( "timeToSpawnProbe" );
+//   VMS__end_interval_on_probe( timeToSpawnProbe );
+
+   VCilk__sync( animatingPr );
+
+      //free the sub-matrices in Fn that called this one
+ }
+
+/*Walk through the two slice-strucs, making sub-matrix strucs as go.  The
+ * strucs only record a position & size within origMatrix -- no data copied
+ */
+SubMatrix **
+createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
+                   Matrix *origMatrix, VirtProcr *animPr )
+ {
+   int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
+   int32 startRow, endRow, startCol, endCol;
+   int32 *rowStartVals, *colStartVals;
+   int32 rowOffset;
+   SubMatrix **subMatrices, *newSubMatrix;
+
+   numRowIdxs = rowSlices->numVals;
+   numColIdxs = colSlices->numVals;
+
+   rowStartVals = rowSlices->startVals;
+   colStartVals = colSlices->startVals;
+
+   subMatrices = VCilk__malloc( numRowIdxs * numColIdxs *sizeof(SubMatrix *),
+                                animPr );
+
+   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
+    {
+      rowOffset = rowIdx * numColIdxs;
+
+      startRow = rowStartVals[rowIdx];
+      endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is
+                                            // at last valid idx + 1 & is
+                                            // 1 greater than end value
+      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
+       {
+         startCol = colStartVals[colIdx];
+         endCol = colStartVals[colIdx + 1] -1;
+
+         newSubMatrix = VCilk__malloc( sizeof(SubMatrix), animPr );
+         newSubMatrix->numRows = endRow - startRow +1;
+         newSubMatrix->numCols = endCol - startCol +1;
+         newSubMatrix->origMatrix = origMatrix;
+         newSubMatrix->origStartRow = startRow;
+         newSubMatrix->origStartCol = startCol;
+         newSubMatrix->alreadyCopied = FALSE;
+
+         subMatrices[ rowOffset + colIdx ] = newSubMatrix;
+       }
+    }
+   return subMatrices;
+ }
+
+/*Frees every sub-matrix struc, its array if one was copied, then the table*/
+void
+freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
+                 SubMatrix **subMatrices, VirtProcr *animPr )
+ { int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset;
+   SubMatrix *subMatrix;
+
+   numRowIdxs = rowSlices->numVals;
+   numColIdxs = colSlices->numVals;
+
+   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
+    {
+      rowOffset = rowIdx * numColIdxs;
+      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
+       {
+         subMatrix = subMatrices[ rowOffset + colIdx ];
+         if( subMatrix->alreadyCopied )
+            VCilk__free( subMatrix->array, animPr );
+         VCilk__free( subMatrix, animPr );
+       }
+    }
+   VCilk__free( subMatrices, animPr );
+ }
+/*Cuts the index range startVal..endVal into slices about idealSizeOfSide
+ * wide.  Gives back the start index of each slice, followed by one "fake"
+ * start that is 1 past endVal, so slice i ends at startVals[i+1] - 1.  */
+SlicingStruc *
+sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
+                  VirtProcr *animPr )
+ { float32 residualAcc = 0;
+   int     numSlices, i, *startVals, sizeOfSlice, endCondition;
+   SlicingStruc *slicingStruc = VCilk__malloc( sizeof(SlicingStruc), animPr);
+
+      //Clamp the target size to between 1 and the length of the dimension:
+      // under 1 the loop below can't advance, over the length is one slice
+   numSlices = endVal - startVal + 1;  if( numSlices < 0 ) numSlices = 0;
+   if( idealSizeOfSide > numSlices ) idealSizeOfSide = numSlices;
+   if( !(idealSizeOfSide >= 1) )     idealSizeOfSide = 1; //also catches NaN
+      //Each slice is at least (int)idealSizeOfSide wide, which bounds how
+      // many there can be; +1 more for the "fake" start above the last one
+   numSlices = numSlices / (int)idealSizeOfSide + 1;
+   startVals = VCilk__malloc( (numSlices + 1) * sizeof(int32), animPr );
+
+      //Stop slicing half-a-size short of the highest value: a piece smaller
+      // than half the ideal size gets tacked onto the end of the last slice
+   endCondition = endVal - (int) (idealSizeOfSide/2); //end 
*value*, not size
+   for( i = 0; startVal <= endVal; i++ )
+    {
+      startVals[i] = startVal;
+      residualAcc += idealSizeOfSide;
+      sizeOfSlice = (int)residualAcc;
+      residualAcc -= (float32)sizeOfSlice;
+      startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..
+
+      if( startVal > endCondition )
+       { startVal = endVal + 1;
+         startVals[ i + 1 ] = startVal;
+       }
+    }
+
+   slicingStruc->startVals = startVals;
+   slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1
+                              // which means is num sub-matrices in dim
+                              // also == idx of the fake start just above
+   return slicingStruc;
+ }
+
+/*Frees the array of start values, then the struc itself*/
+void
+freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr )
+ { VCilk__free( slicingStruc->startVals, animPr );
+   VCilk__free( slicingStruc, animPr );
+ }
+
+/*Times one 5x5 by 5x5 matrix multiply and returns the number of cycles
+ * that VCilk__end_primitive_and_give_cycles reports for it.  The divider
+ * uses this to size sub-matrices.
+ */
+int inline
+measureMatrixMultPrimitive( VirtProcr *animPr )
+ { int      r, c, v, numCycles;
+   float32 *res, *left, *right;
+
+      //setup inputs
+   left  = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr );
+   right = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr );
+   res   = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr );
+
+   for( r = 0; r < 5; r++ )
+    { for( c = 0; c < 5; c++ )
+       { left[  r * 5 + c ] = r;
+         right[ r * 5 + c ] = c;
+         res[   r * 5 + c ] = 0;  //the products get accumulated into res
+       }
+    }
+
+      //do primitive
+   VCilk__start_primitive();  //for now, just takes time stamp
+   for( r = 0; r < 5; r++ )
+    {
+      for( c = 0; c < 5; c++ )
+       {
+         for( v = 0; v < 5; v++ )
+          { res[ r * 5 + c ] += left[ r * 5 + v ] * right[ v * 5 + c ];
+          }
+       }
+    }
+   numCycles =
+      VCilk__end_primitive_and_give_cycles();
+
+   VCilk__free( left, animPr );
+   VCilk__free( right, animPr );
+   VCilk__free( res, animPr );
+
+   return numCycles;
+ }
diff -r 000000000000 -r 56e17dcfc0c3 src/Application/VCilk__Matrix_Mult/EntryPoint.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Application/VCilk__Matrix_Mult/EntryPoint.c Sat Oct 30 20:43:49 2010 -0700
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2009 OpenSourceStewardshipFoundation.org
+ * Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ */
+
+#include <stdlib.h>
+
+#include "VCilk__Matrix_Mult.h"
+
+
+
+/*Every VCilk system has an "entry point" function that creates the first
+ * processor, which starts the chain of creating more processors..
+ * eventually all of the processors will dissipate themselves, and
+ * return.
+ *
+ *This entry-point function follows the same pattern as all entry-point
+ * functions do:
+ *1) it creates the params for the seed processor, from the
+ *    parameters passed into the entry-point function
+ *2) it calls VCilk__create_seed_procr_and_do_work
+ *3) it gets the return value from the params struc, frees the params struc,
+ *    and returns the value from the function
+ *
+ *Returns a newly made leftMatrix->numRows by rightMatrix->numCols matrix,
+ * which the caller frees with freeMatrix -- or NULL when out of memory.
+ */
+Matrix *
+multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
+ { Matrix        *resMatrix;
+   DividerParams *dividerParams;
+   int32          numResRows, numResCols;
+
+   numResRows = leftMatrix->numRows;
+   numResCols = rightMatrix->numCols;
+
+   dividerParams = calloc( 1, sizeof( DividerParams ) );
+   resMatrix     = calloc( 1, sizeof( Matrix ) );
+   if( dividerParams == NULL || resMatrix == NULL )
+    { free( dividerParams ); free( resMatrix );
+      return NULL;
+    }
+
+      //The result array has to start out as all zeros, because the results
+      // of the sub-matrix pairs get added into it -- so calloc, which also
+      // checks the size calculation for overflow
+   resMatrix->array = calloc( (size_t)numResRows,
+                              (size_t)numResCols * sizeof(float32) );
+   if( resMatrix->array == NULL && numResRows > 0 && numResCols > 0 )
+    { free( dividerParams ); free( resMatrix );
+      return NULL;
+    }
+   resMatrix->numRows = numResRows;
+   resMatrix->numCols = numResCols;
+
+   dividerParams->leftMatrix   = leftMatrix;
+   dividerParams->rightMatrix  = rightMatrix;
+   dividerParams->resultMatrix = resMatrix;
+
+      //create divider processor, start doing the work, and wait till done
+      //This function is the "border crossing" between normal code and VCilk
+   VCilk__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs,
+                                         dividerParams );
+
+      //get result matrix and return it
+   free( dividerParams );
+   return resMatrix;
+ }
diff -r 000000000000 -r 56e17dcfc0c3 src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h Sat Oct 30 20:43:49 2010 -0700
@@ -0,0 +1,103 @@
+/*
+ * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
+ * Licensed under GNU General 
Public License version 2 + */ + +#ifndef _VCilk_MATRIX_MULT_H_ +#define _VCilk_MATRIX_MULT_H_ + +#include + +#include "../../VCilk_lib/VCilk.h" +#include "../Matrix_Mult.h" + + +//=============================== Defines ============================== +#define ROWS_IN_BLOCK 32 +#define COLS_IN_BLOCK 32 +#define VEC_IN_BLOCK 32 + + +//#define PRINT_DEBUG(msg) //printf(msg); fflush(stdin); + +//============================== Structures ============================== +typedef struct + { + Matrix *leftMatrix; + Matrix *rightMatrix; + Matrix *resultMatrix; + + TSCount numTSCsToExe; + } +DividerParams; + +typedef struct + { + VirtProcr *dividerPr; + int numRows; + int numCols; + int numSubMatrixPairs; + } +ResultsParams; + +typedef +struct + { int32 numRows; + int32 numCols; + Matrix *origMatrix; + int32 origStartRow; + int32 origStartCol; + int32 alreadyCopied; + float32 *array; //2D, but dynamically sized, so use addr arith + } +SubMatrix; + +typedef struct + { VirtProcr *resultPr; + SubMatrix *leftSubMatrix; + SubMatrix *rightSubMatrix; + float32 *resultArray; + } +SMPairParams; + +typedef +struct + { int32 numVals; + int32 *startVals; + } +SlicingStruc; + +typedef +struct + { + SlicingStruc *leftRowSlices; + SlicingStruc *vecSlices; + SlicingStruc *rightColSlices; + } +SlicingStrucCarrier; + +typedef struct + { + int32 numVecIdxs; + int32 numRightColIdxs; + int32 leftRowIdxOffset; + int32 resColIdx; + SubMatrix **leftSubMatrices; + SubMatrix **rightSubMatrices; + float32 *resultArray; + int32 coreToRunOn; + } +VecParams; + +//============================= Processor Functions ========================= +void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr ); +void calcSubMatrixProduct( void *data, VirtProcr *animatingPr ); +void calcVectorOfSubMatrices( void *_vecParams, VirtProcr *animatingPr ); + + +//================================ Entry Point ============================== +Matrix * +multiplyTheseMatrices( Matrix *leftMatrix, Matrix 
*rightMatrix );
+
+
+#endif /*_VCilk_MATRIX_MULT_H_*/
diff -r 000000000000 -r 56e17dcfc0c3 src/Application/VCilk__Matrix_Mult/Vector_Pr.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Application/VCilk__Matrix_Mult/Vector_Pr.c Sat Oct 30 20:43:49 2010 -0700
@@ -0,0 +1,137 @@
+/*
+ * Copyright 2009 OpenSourceStewardshipFoundation.org
+ * Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ */
+
+
+#include "VCilk__Matrix_Mult.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+void inline
+accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
+                  int32    startRow,
+                  int32    numRows,
+                  int32    startCol,
+                  int32    numCols,
+                  int32    numOrigCols );
+
+
+//===========================================================================
+
+/*Processor function: produces one block of the result matrix as the sum of
+ * numVecIdxs sub-matrix products.
+ *
+ *Spawns one calcSubMatrixProduct processor per (left, right) sub-matrix
+ * pair, each with vecParams->coreToRunOn passed to VCilk__spawn, syncs,
+ * then adds every partial product into vecParams->resultArray.
+ *
+ *_vecParams points to a VecParams, which this function VCilk__free's, along
+ * with each SMPairParams it allocated and each partial-result array handed
+ * back in SMPairParams.resultArray.  Ends by dissipating animPr.
+ */
+void
+calcVectorOfSubMatrices( void *_vecParams, VirtProcr *animPr )
+ { int32         numVecIdxs, leftRowIdxOffset, numRightColIdxs, resColIdx;
+   SubMatrix   **leftSubMatrices, **rightSubMatrices;
+   float32      *resultArray;
+   int32         vecIdx, coreWithAffinity;
+   SMPairParams *subMatrixPairParams, **vecOfSubMatrixParams;
+   VecParams    *vecParams;
+
+   vecParams = (VecParams *)_vecParams;
+
+   int32 subMatrixVectorProbe =
+      VMS__create_single_interval_probe( "subMtxVect", animPr );
+   VMS__record_sched_choice_into_probe( subMatrixVectorProbe, animPr );
+   VMS__record_interval_start_in_probe( subMatrixVectorProbe );
+
+
+   numVecIdxs       = vecParams->numVecIdxs;
+   numRightColIdxs  = vecParams->numRightColIdxs;
+   leftRowIdxOffset = vecParams->leftRowIdxOffset;
+   resColIdx        = vecParams->resColIdx;
+   leftSubMatrices  = vecParams->leftSubMatrices;
+   rightSubMatrices = vecParams->rightSubMatrices;
+   resultArray      = vecParams->resultArray;
+   coreWithAffinity = vecParams->coreToRunOn;
+
+   vecOfSubMatrixParams = VCilk__malloc( numVecIdxs * sizeof(SMPairParams *),
+                                         animPr );
+   if( vecOfSubMatrixParams == 0 ){printf("malloc error"); exit(1);}
+
+   for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
+    {
+      //Make the processor for the pair of sub-matrices
+      subMatrixPairParams = VCilk__malloc(sizeof(SMPairParams),animPr);
+      if( subMatrixPairParams == 0 ){printf("malloc error"); exit(1);}
+      subMatrixPairParams->leftSubMatrix =
+         leftSubMatrices[ leftRowIdxOffset + vecIdx ];
+
+      subMatrixPairParams->rightSubMatrix =
+         rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
+
+      VCilk__spawn( coreWithAffinity, &calcSubMatrixProduct,
+                    subMatrixPairParams, animPr );
+
+      vecOfSubMatrixParams[ vecIdx ] = subMatrixPairParams;
+    }
+
+   VCilk__sync( animPr );
+
+   //now accumulate individual result matrices into final result matrix
+   for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
+    {
+      subMatrixPairParams = vecOfSubMatrixParams[ vecIdx ];
+
+      accumulateResult( resultArray, subMatrixPairParams->resultArray,
+                    subMatrixPairParams->leftSubMatrix->origStartRow,
+                    subMatrixPairParams->leftSubMatrix->numRows,
+                    subMatrixPairParams->rightSubMatrix->origStartCol,
+                    subMatrixPairParams->rightSubMatrix->numCols,
+                    subMatrixPairParams->rightSubMatrix->origMatrix->numCols);
+
+      //Note, resultArray is made on the core that produces the results
+      // that gives chance to set affinity so all in vector run on same
+      // core and re-use that array, and prevents writes from causing
+      // thrashing of the cache -- as long as array big enough, the copy
+      // overhead is minuscule vs the size-of-side reuse of each byte
+      VCilk__free( subMatrixPairParams->resultArray, animPr );
+      VCilk__free( subMatrixPairParams, animPr );
+    }
+   VCilk__free( vecOfSubMatrixParams, animPr );
+   VCilk__free( vecParams, animPr );
+
+   VMS__record_interval_end_in_probe( subMatrixVectorProbe );
+
+   VCilk__dissipate_procr( animPr );
+ }
+
+
+/*Adds a numRows x numCols sub-matrix result (row-major, numCols per row)
+ * into resultArray, a row-major array with numOrigCols values per row,
+ * placing its first value at (startRow, startCol).
+ *Only += is used, so resultArray must already hold valid values.
+ */
+void inline
+accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
+                  int32    startRow,
+                  int32    numRows,
+                  int32    startCol,
+                  int32    numCols,
+                  int32    numOrigCols )
+ { int32 row, col;
+
+   for( row = 0; row < numRows; row++ )
+    {
+      for( col = 0; col < numCols; col++ )
+       {
+         resultArray[ (row + startRow) * numOrigCols + col + startCol ] +=
+            subMatrixResultArray[ row * numCols + col ];
+       }
+    }
+
+ }
diff -r 000000000000 -r 56e17dcfc0c3 src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c Sat Oct 30 20:43:49 2010 -0700
@@ -0,0 +1,269 @@
+/*
+ * Copyright 2009 OpenSourceStewardshipFoundation.org
+ * Licensed under GNU General Public License version 2
+ *
+ * Author: SeanHalle@yahoo.com
+ *
+ */
+
+#include "VCilk__Matrix_Mult.h"
+
+
+void inline
+copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
+
+void inline
+copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
+
+void inline
+multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
+                             float32 *resArray,
+                             int startRow, int endRow,
+                             int startCol, int endCol,
+                             int startVec, int endVec,
+                             int resStride, int inpStride );
+
+void inline
+multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols,
+                      float32 *leftArray, float32 *rightArray,
+                      float32 *resArray );
+
+
+/*A processor is created with an environment (SMPairParams) that holds the
+ * two sub-matrices to multiply and a slot for the result array.
+ *It calculates the product of the two sub-portions of the input matrices
+ * with the single-threaded, cache-blocked multiplyMatrixArrays below.
+ * (No external math library is called in this file.)
+ *
+ *This demonstrates using optimized single-threaded code inside scheduled
+ * work-units.
+ *
+ *When done, it hands the result back by setting params->resultArray
+ */
+void
+calcSubMatrixProduct( void *data, VirtProcr *animatingPr )
+ {
+   SMPairParams *params;
+   int32         numResRows, numResCols, vectLength, resIdx;
+   float32      *leftArray, *rightArray, *resArray;
+   SubMatrix    *leftSubMatrix, *rightSubMatrix;
+
+         PRINT_DEBUG("start sub-matrix mult\n")
+   int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx",
+                                                            animatingPr);
+   VMS__record_sched_choice_into_probe( subMatrixProbe, animatingPr );
+   VMS__record_interval_start_in_probe( subMatrixProbe );
+
+   params = (SMPairParams *)data;
+      //params->resultPr is not read; the result goes back via resultArray
+   leftSubMatrix  = params->leftSubMatrix;
+   rightSubMatrix = params->rightSubMatrix;
+
+   //make sure the input sub-matrices have been copied out of orig
+   copyFromOrig( leftSubMatrix, animatingPr );
+   copyTransposeFromOrig( rightSubMatrix, animatingPr );
+
+   leftArray  = leftSubMatrix->array;
+   rightArray = rightSubMatrix->array;
+
+   //make this array here, on the core that computes the results
+   // with Cilk's semantics, have to have separate result array for each
+   // spawned processor -- unless want to change the spawn and sync
+   // pattern, such that spawn one from each vector, then sync, then
+   // another, and so forth -- this will cause idle time due to imbalance
+   // in matrix sizes
+   //This also gives chance to set affinity so all in vector run on same
+   // core and re-use the accumulation array,
+   //As a side-benefit, it also prevents writes from causing
+   // thrashing of the cache -- as long as array big enough, the copy
+   // overhead is small because each byte is reused size-of-side times
+   //This is freed in the vector processor
+   resArray = VCilk__malloc(leftSubMatrix->numRows * rightSubMatrix->numCols*
+                            sizeof( float32 ), animatingPr );
+
+   vectLength = leftSubMatrix->numCols;
+   numResRows = leftSubMatrix->numRows;
+   numResCols = rightSubMatrix->numCols;
+
+   //multiplyMatrixArrays only ever does += into resArray, so zero it first
+   for( resIdx = 0; resIdx < numResRows * numResCols; resIdx++ )
+      resArray[ resIdx ] = 0;
+   multiplyMatrixArrays( vectLength, numResRows, numResCols,
+                         leftArray, rightArray,
+                         resArray );
+
+   //send result by side-effect
+   params->resultArray = resArray;
+
+   VMS__record_interval_end_in_probe( subMatrixProbe );
+
+   VCilk__dissipate_procr( animatingPr );
+ }
+
+
+/*Accumulates (+=) leftArray x rightArray into resArray; the caller must
+ * initialize resArray.  leftArray is numResRows x vecLength row-major,
+ * rightArray is stored transposed (numResCols x vecLength).
+ *Divides into 32x32 sub-matrices, 3 of which fit into 32KB L1 cache
+ * Would be nice to embed this within another level that divided into
+ * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
+ *Eventually want these divisions to be automatic, using DKU pattern
+ * embedded into VCilk, and with VMS controlling the divisions according to
+ * the cache sizes; and want VMS to split among main-mems, so a socket
+ * only cranks on data in its local segment of main mem
+ */
+void inline
+multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols,
+                      float32 *leftArray, float32 *rightArray,
+                      float32 *resArray )
+ {
+   int resStride, inpStride;
+   int startRow, startCol, endRow, endCol, startVec, endVec;
+
+   resStride = numResCols;
+   inpStride = vecLength;
+
+   for( startRow = 0; startRow < numResRows; )
+    {
+      endRow = startRow + ROWS_IN_BLOCK;
+      if( endRow > numResRows ) endRow = numResRows;
+
+      for( startCol = 0; startCol < numResCols; )
+       {
+         endCol = startCol + COLS_IN_BLOCK;
+         if( endCol > numResCols ) endCol = numResCols;
+
+         for( startVec = 0; startVec < vecLength; )
+          {
+            endVec = startVec + VEC_IN_BLOCK;
+            if( endVec > vecLength ) endVec = vecLength;
+
+            //By having the "vector" of sub-blocks in a sub-block slice
+            // be marched down in inner loop, are re-using the result
+            // matrix, which stays in L1 cache -- can only re-use one of
+            // the three, so this is the most important -- avoids writing
+            // dirty blocks until those result-locations fully done
+            //Row and Col is position in result matrix -- so row and vec
+            // for left array, then vec and col for right array
+            multiplySubBlocksTransposed( leftArray, rightArray,
+                                         resArray,
+                                         startRow, endRow,
+                                         startCol, endCol,
+                                         startVec, endVec,
+                                         resStride, inpStride );
+            startVec = endVec;
+          }
+         startCol = endCol;
+       }
+      startRow = endRow;
+    }
+ }
+
+/*Adds one block's partial dot products into resArray; right is transposed*/
+void inline
+multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
+                             float32 *resArray,
+                             int startRow, int endRow,
+                             int startCol, int endCol,
+                             int startVec, int endVec,
+                             int resStride, int inpStride )
+ {
+   int row, col, vec;
+   int leftOffset, rightOffset;
+   float32 result;
+
+   for( row = startRow; row < endRow; row++ )
+    {
+      for( col = startCol; col < endCol; col++ )
+       {
+         leftOffset  = row * inpStride;//left & right inp strides always same
+         rightOffset = col * inpStride;// because right is transposed
+         result = 0;
+         for( vec = startVec; vec < endVec; vec++ )
+          {
+            result +=
+               leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
+          }
+
+         resArray[ row * resStride + col ] += result;
+       }
+    }
+ }
+
+/*Note: don't do the copy when create, because create on a different core --
+ * do the copy on the core that will do the calcs, then try to keep work on
+ * that core that reuses the array data
+ */
+void inline
+copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
+ { int numCols, numRows, origStartRow, origStartCol, origStride, stride;
+   Matrix *origMatrix;
+   float32 *origArray, *subArray;
+
+   if( subMatrix->alreadyCopied ) return;
+      //NOTE(review): the flag is set before subMatrix->array is filled, with
+      // no synchronization visible here -- confirm sharers cannot run at once
+   subMatrix->alreadyCopied = TRUE;
+   origMatrix   = subMatrix->origMatrix;
+   origArray    = origMatrix->array;
+   numCols      = subMatrix->numCols;
+   numRows      = subMatrix->numRows;
+   stride       = numRows;
+   origStartRow = subMatrix->origStartRow;
+   origStartCol = subMatrix->origStartCol;
+   origStride   = origMatrix->numCols;
+
+   //This is freed in Divide pr after all calcs are done
+   subArray = VCilk__malloc( numRows * numCols * sizeof(float32),animPr);
+   subMatrix->array = subArray;
+
+   //copy values from orig matrix to local
+   int row, col, origOffset;
+   for( row = 0; row < numRows; row++ )
+    {
+      origOffset = (row + origStartRow) * origStride + origStartCol;
+      for( col = 0; col < numCols; col++ )
+       {
+         //transpose means swap row & col -- traverse orig matrix normally
+         // but put into reversed place in local array -- means the
+         // stride is the num rows now, so col * numRows + row
+         subArray[ col * stride + row ] = origArray[ origOffset + col ];
+       }
+    }
+ }
+
+void inline
+copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
+ { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
+   Matrix *origMatrix;
+   float32 *origArray, *subArray;
+   //on first call, copies the sub-matrix's window of origMatrix, row-major
+   if( subMatrix->alreadyCopied ) return;
+      //NOTE(review): the flag is set before subMatrix->array is filled, with
+      // no synchronization visible here -- confirm sharers cannot run at once
+   subMatrix->alreadyCopied = TRUE;
+   origMatrix   = subMatrix->origMatrix;
+   origArray    = origMatrix->array;
+   numCols      = subMatrix->numCols;
+   numRows      = subMatrix->numRows;
+   origStartRow = subMatrix->origStartRow;
+   origStartCol = subMatrix->origStartCol;
+   stride       = numCols;
+   origStride   = origMatrix->numCols;
+
+   //This is freed in Divide pr after all calcs are done
+   subArray = VCilk__malloc( numRows * numCols *sizeof(float32),animPr);
+   subMatrix->array = subArray;
+
+   //copy values from orig matrix to local
+   int row, col, offset, origOffset;
+   for( row = 0; row < numRows; row++ )
+    {
+      offset     = row * stride;
+      origOffset = (row + origStartRow) * origStride + origStartCol;
+      for( col = 0; col < numCols; col++ )
+       {
+         subArray[ offset + col ] = origArray[ origOffset + col ];
+       }
+    }
+ }
diff -r 000000000000 -r 56e17dcfc0c3 src/Application/main.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Application/main.c Sat Oct 30 20:43:49 2010 -0700
@@ -0,0 +1,34 @@
+/*
+ * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
+ * Licensed under GNU General Public License version 2
+ *
+ * author seanhalle@yahoo.com
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "Matrix_Mult.h"
+#include "VCilk__Matrix_Mult/VCilk__Matrix_Mult.h"
+
+/**
+ *Matrix multiply program written using VMS_HW piggy-back language
+ *Usage: argv[1] names the param file that describes the two input matrices
+ */
+int main( int argc, char **argv )
+ { Matrix *leftMatrix, *rightMatrix, *resultMatrix;
+   ParamBag *paramBag;
+
+   if( argc < 2 ){printf("usage: <program> <paramFileName>\n"); exit(1);}
+   paramBag = makeParamBag();
+   readParamFileIntoBag( argv[1], paramBag );
+   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
+
+   resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix );
+
+   printf("\nresult matrix: \n");
+   printMatrix( resultMatrix );
+// VCilk__print_stats();
+
+   exit(0); //cleans up
+ }