# HG changeset patch # User Me # Date 1289398074 28800 # Node ID bf7331ed394e0017dab346fe1a99493f86ba29eb # Parent ec0629f70ee5f2ba5519a0707a7618980ef9b5db Working version of blocked matrix mult -- same as SSR, VCilk & VPThread diff -r ec0629f70ee5 -r bf7331ed394e src/Application/CILK__Matrix_Mult/CILK__Matrix_Mult.h --- a/src/Application/CILK__Matrix_Mult/CILK__Matrix_Mult.h Tue Oct 26 19:34:03 2010 -0700 +++ b/src/Application/CILK__Matrix_Mult/CILK__Matrix_Mult.h Wed Nov 10 06:07:54 2010 -0800 @@ -3,14 +3,23 @@ * Licensed under GNU General Public License version 2 */ -#ifndef _VPThread__MATRIX_MULT_H_ -#define _VPThread__MATRIX_MULT_H_ +#ifndef _Cilk__MATRIX_MULT_H_ +#define _Cilk__MATRIX_MULT_H_ #include #include "VMS_primitive_data_types.h" #include "../Matrix_Mult.h" +//=============================== Defines ============================== +#define ROWS_IN_BLOCK 32 +#define COLS_IN_BLOCK 32 +#define VEC_IN_BLOCK 32 + +#define copyMatrixSingleton 1 +#define copyTransposeSingleton 2 + + //============================== Structures ============================== typedef struct { @@ -20,44 +29,55 @@ } DividerParams; -typedef struct - { - int numRows; - int numCols; +typedef +struct + { int32 numRows; + int32 numCols; + Matrix *origMatrix; + int32 origStartRow; + int32 origStartCol; + int32 alreadyCopied; + float32 *array; //2D, but dynamically sized, so use addr arith } -ResultsParams; +SubMatrix; typedef struct { - int myCol; - int myRow; - int vectLength; - Matrix *leftMatrix; - Matrix *rightMatrix; - float32 result; + SubMatrix *leftSubMatrix; + SubMatrix *rightSubMatrix; + float32 *partialResultArray; } -VectorParams; +SMPairParams; + +typedef +struct + { int32 numVals; + int32 *startVals; + } +SlicingStruc; + +typedef +struct + { + SlicingStruc *leftRowSlices; + SlicingStruc *vecSlices; + SlicingStruc *rightColSlices; + } +SlicingStrucCarrier; typedef struct { - //for communicating vector results to results Thd - int32 vector_mutex; - int32 vector_cond; - VectorParams *currVector; - - //for communicating results array back to seed (divider) Thd - int32 results_mutex; - int32 results_cond; - float32 *results; - - //for ensuring results thd has vector lock before making vector thds - int32 start_mutex; - int32 start_cond; - - Matrix *rightMatrix; - Matrix *resultMatrix; + int32 numVecIdxs; + int32 numRightColIdxs; + int32 leftRowIdxOffset; + int32 resColIdx; + SubMatrix **leftSubMatrices; + SubMatrix **rightSubMatrices; + float32 *resultArray; + int32 coreToRunOn; + int32 vecID; } -MatrixMultGlobals; +VecParams; //============================= Processor Functions ========================= @@ -67,8 +87,9 @@ //================================ Entry Point ============================== -//cilk Matrix *\ -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); +//cilk +//Matrix * +//multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); #endif /*_VPThread__MATRIX_MULT_H_*/ diff -r ec0629f70ee5 -r bf7331ed394e src/Application/CILK__Matrix_Mult/Divide_Pr.cilk --- a/src/Application/CILK__Matrix_Mult/Divide_Pr.cilk Tue Oct 26 19:34:03 2010 -0700 +++ b/src/Application/CILK__Matrix_Mult/Divide_Pr.cilk Wed Nov 10 06:07:54 2010 -0800 @@ -1,72 +1,610 @@ -/* - * Copyright 2009 OpenSourceStewardshipFoundation.org - * Licensed under GNU General Public License version 2 - * - * Author: seanhalle@yahoo.com - * - */ - - -#include "CILK__Matrix_Mult.h" - -cilk float32 calcVector( void * ); - -/*Divider creates one processor for every row-col pair. - * It hands them: +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + + +#include "CILK__Matrix_Mult.h" +#include "VMS_primitive_data_types.h" + +#include +#include +#include +#include + + //The time to compute this many result values should equal the time to + // perform this division on a matrix of size gives that many result calcs + //IE, size this so that sequential time to calc equals divide time + // find the value by experimenting -- but divide time and calc time scale + // same way, so this value should remain valid across hardware + //Divide time is about 800us on 2.4Ghz core2Quad laptop core + //num cells is the cube of a side, when have two square matrices +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 100000 /* about 46x46 */ + + //Cilk doesn't have VCilk's facilities, so define constants +#define MIN_WORK_UNIT_CYCLES 100000 +#define IDEAL_NUM_WORK_UNITS 20 +#define NUMBER_OF_CORES_TO_SPAWN_ONTO 4 + +//=============================== External ================================= +cilk void calcVectorOfSubMatrices( void *params ); + +void inline +copyTranspose( int32 numRows, int32 numCols, + int32 origStartRow, int32 origStartCol, int32 origStride, + float32 *subArray, float32 *origArray ); + +void inline +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, + float32 *leftArray, float32 *rightArray, + float32 *resArray ); + + +void +startTimeInterval(); + +void +endIntervalAndPrintTime(); + + +//============================= Within-File =============================== +int inline +measureMatrixMultPrimitive(); + +SlicingStrucCarrier * +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix ); + +SlicingStruc * +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal ); + +void +freeSlicingStruc( SlicingStruc *slicingStruc ); + +SubMatrix ** +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + Matrix *origMatrix, int32 transposeTheMatrix ); + +void +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + SubMatrix **subMatrices ); + +cilk void +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, + SubMatrix **rightSubMatrices, + int32 numRowIdxs, int32 numColIdxs, + int32 numVecIdxs, + float32 *resultArray ); + +cilk void +makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, + SlicingStrucCarrier *slicingStrucCarrier, + float32 *resultArray ); + +//=========================================================================== + +/*Divider creates one processor for every sub-matrix + * It hands them: * the name of the result processor that they should send their results to, - * the left and right matrices, and the row and col they should multiply - * the length of the vector - * It first creates the result processor, then all the vector processors, + * the left and right matrices, and the rows and cols they should multiply + * It first creates the result processor, then all the sub-matrixPair + * processors, * then does a receive of a message from the result processor that gives * the divider ownership of the result matrix. - * Finally, the divider returns the result matrix out of the VPThread system. - */ - -cilk void divideIntoVectors( void *_dividerParams ) - { - DividerParams *dividerParams; - VectorParams *vectParams; - Matrix *leftMatrix, *rightMatrix, *resultMatrix; - int32 numCells, numCols, mrow, mcol; - float32 *resultMatrixArray; - - dividerParams = (DividerParams *)_dividerParams; - - leftMatrix = dividerParams->leftMatrix; - rightMatrix = dividerParams->rightMatrix; - - - numCols = rightMatrix->numCols; - - numCells = leftMatrix->numRows * rightMatrix->numCols; - resultMatrixArray = malloc( numCells * sizeof( float32 ) ); - - - //spawn vector calcs - for( mrow = 0; mrow < leftMatrix->numRows; mrow++ ) - { for( mcol = 0; mcol < rightMatrix->numCols; mcol++ ) - { - vectParams = malloc( sizeof(VectorParams) ); - vectParams->myCol = mcol; - vectParams->myRow = mrow; - vectParams->vectLength = leftMatrix->numCols; - vectParams->leftMatrix = leftMatrix; - vectParams->rightMatrix = rightMatrix; - - - resultMatrixArray[ mrow * numCols + mcol ] = spawn calcVector( vectParams ); - } - } - - sync; - - - //The results of the all the work have to be linked-to from the data - // struc given to the seed procr -- this divide func is animated by - // that seed procr, so have to link results to the _dividerParams. - resultMatrix = malloc( sizeof(Matrix) ); - resultMatrix->numCols = rightMatrix->numCols; - resultMatrix->numRows = leftMatrix->numRows; - dividerParams->resultMatrix = resultMatrix; - resultMatrix->matrix = resultMatrixArray; + * Finally, the divider returns the result matrix out of the VCilk system. + * + * Divider chooses the size of sub-matrices via an algorithm that tries to + * keep the minimum work above a threshold. The threshold is machine- + * dependent, so ask VCilk for min work-unit time to get a + * given overhead + * + * Divide min work-unit cycles by measured-cycles for one matrix-cell + * product -- gives the number of products need to have in min size + * matrix. + * + * So then, take cubed root of this to get the size of a side of min sub- + * matrix. That is the size of the ideal square sub-matrix -- so tile + * up the two input matrices into ones as close as possible to that size, + * and create the pairs of sub-matrices. + * + *======================== STRATEGIC OVERVIEW ======================= + * + *This division is a bit tricky, because have to create things in advance + * that it's not at first obvious need to be created.. + * + *First slice up each dimension -- three of them.. this is because will have + * to create the sub-matrix's data-structures before pairing the sub-matrices + * with each other -- so, have three dimensions to slice up before can + * create the sub-matrix data-strucs -- also, have to be certain that the + * cols of the left input have the exact same slicing as the rows of the + * left matrix, so just to be sure, do the slicing calc once, then use it + * for both. + * + *So, goes like this: + *1) calculate the start & end values of each dimension in each matrix. + *2) use those values to create sub-matrix structures + *3) combine sub-matrices into pairs, as the tasks to perform. + * + *Have to calculate separately from creating the sub-matrices because of the + * nature of the nesting -- would either end up creating the same sub-matrix + * multiple times, or else would have to put in detection of whether had + * made a particular one already if tried to combine steps 1 and 2. + * + *Step 3 has to be separate because of the nesting, as well -- same reason, + * would either create same sub-matrix multiple times, or else have to + * add detection of whether was already created. + * + *Another way to look at it: there's one level of loop to divide dimensions, + * two levels of nesting to create sub-matrices, and three levels to pair + * up the sub-matrices. + */ + +cilk void +divideWorkIntoSubMatrixPairProcrs( void *_dividerParams ) + { + DividerParams *dividerParams; + Matrix *leftMatrix, *rightMatrix; + + SlicingStrucCarrier *slicingStrucCarrier; + float32 *resultArray; //points to array to be put inside result + // matrix + int32 numResRows, numResCols, vectLength; + + + + startTimeInterval(); + + + //=========== Setup -- make local copies of ptd-to-things, malloc, aso + +// printf("\nin divider\n"); + dividerParams = (DividerParams *)_dividerParams; + + leftMatrix = dividerParams->leftMatrix; + rightMatrix = dividerParams->rightMatrix; + + vectLength = leftMatrix->numCols; + numResRows = leftMatrix->numRows; + numResCols = rightMatrix->numCols; + resultArray = dividerParams->resultMatrix->array; + + //zero the result array + memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); + + + //============== Do either sequential mult or do division ============== + + //Check if input matrices too small -- if yes, just do sequential + //Cutoff is determined by overhead of this divider -- relatively + // machine-independent + if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * + (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) + { int32 vectLength; + + //====== Do sequential multiply on a single core + + //transpose the right matrix + float32 * + transRightArray = malloc( rightMatrix->numRows * + rightMatrix->numCols * + sizeof(float32) ); + + //copy values from orig matrix to local + copyTranspose( rightMatrix->numRows, rightMatrix->numCols, + 0, 0, rightMatrix->numRows, + transRightArray, rightMatrix->array ); + + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, + leftMatrix->array, transRightArray, + resultArray ); + } + else + { + //====== Do parallel multiply across cores + + //Calc the ideal size of sub-matrix and slice up the dimensions of + // the two matrices. + //The ideal size is the one takes the number of cycles to calculate + // such that calc time is equal or greater than min work-unit size + slicingStrucCarrier = + calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix ); + + + //Make the sub-matrices, and pair them up, then spawn processors to + // calc product of each pair. +// printf("first spawn\n"); + spawn makeSubMatricesAndSpawnAndSync( leftMatrix, rightMatrix, + slicingStrucCarrier, + resultArray ); + sync; + //The result array will get filled in by the spawned children + } + + + //=============== Work done -- send results back ================= + + + endIntervalAndPrintTime(); + +// printf("done with divider\n"); + + //results sent back by side-effect +} + + +cilk void +makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, + SlicingStrucCarrier *slicingStrucCarrier, + float32 *resultArray ) + { + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; + SubMatrix **leftSubMatrices, **rightSubMatrices; + int32 numRowIdxs, numColIdxs, numVecIdxs; + + leftRowSlices = slicingStrucCarrier->leftRowSlices; + vecSlices = slicingStrucCarrier->vecSlices; + rightColSlices = slicingStrucCarrier->rightColSlices; + free( slicingStrucCarrier ); + + //================ Make sub-matrices, given the slicing ================ +// printf("about to create submatrices\n"); + leftSubMatrices = + createSubMatrices( leftRowSlices, vecSlices, + leftMatrix, FALSE ); + rightSubMatrices = + createSubMatrices( vecSlices, rightColSlices, + rightMatrix, TRUE ); + + //============== pair the sub-matrices and make processors ============== + + numRowIdxs = leftRowSlices->numVals; + numColIdxs = rightColSlices->numVals; + numVecIdxs = vecSlices->numVals; +// printf("about to spawn %d, %d, %d\n", numRowIdxs, numColIdxs, numVecIdxs); + spawn pairUpSubMatricesAndSpawnAndSync( leftSubMatrices, rightSubMatrices, + numRowIdxs, numColIdxs, numVecIdxs, + resultArray ); + sync; +// printf("done with sub matrices spawn and sync\n"); + freeSubMatrices( leftRowSlices, vecSlices, leftSubMatrices ); + freeSubMatrices( vecSlices, rightColSlices, rightSubMatrices ); + +// printf("done freeing sub matrices\n"); + //It syncs inside, so know all work is done now: free the sub-matrices + freeSlicingStruc( leftRowSlices ); + freeSlicingStruc( vecSlices ); + freeSlicingStruc( rightColSlices ); + +// printf("done freeing slicing strucs\n"); } + + + + +/* numRows*colsPerRow/numCores = numToPutOntoEachCore; + * put all from a given row onto same core, until exhaust allotment for that + * core + * + */ +cilk void +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, + SubMatrix **rightSubMatrices, + int32 numRowIdxs, int32 numColIdxs, + int32 numVecIdxs, + float32 *resultArray ) + { + int32 resRowIdx, resColIdx; + int32 numLeftColIdxs, numRightColIdxs; + int32 leftRowIdxOffset; + VecParams *vecParams; + float32 numToPutOntoEachCore, leftOverFraction; + int32 numCores, currCore, numOnCurrCore, numVecs = 0; + + numLeftColIdxs = numColIdxs; + numRightColIdxs = numVecIdxs; + + numCores = NUMBER_OF_CORES_TO_SPAWN_ONTO; + + numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; + leftOverFraction = 0; + numOnCurrCore = 0; + currCore = 0; + + resRowIdx = 0; +// printf("spawning vects, numOnEachCore: %f\n", numToPutOntoEachCore); + for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) + { + leftRowIdxOffset = resRowIdx * numLeftColIdxs; + + for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) + { + vecParams = malloc( sizeof(VecParams) ); + + vecParams->numVecIdxs = numVecIdxs; + vecParams->numRightColIdxs = numRightColIdxs; + vecParams->leftRowIdxOffset = leftRowIdxOffset; + vecParams->resColIdx = resColIdx; + vecParams->leftSubMatrices = leftSubMatrices; + vecParams->rightSubMatrices = rightSubMatrices; + vecParams->resultArray = resultArray; + vecParams->coreToRunOn = currCore; + vecParams->vecID = numVecs++; + +// printf("spawning vect %d\n", numVecs-1); fflush(stdin); + spawn calcVectorOfSubMatrices( vecParams ); + + numOnCurrCore += 1; + if( numOnCurrCore + leftOverFraction >= numToPutOntoEachCore - 1 ) + { + //deal with fractional part, to ensure that imbalance is 1 max + // IE, core with most has only 1 more than core with least + leftOverFraction += numToPutOntoEachCore - numOnCurrCore; + if( leftOverFraction >= 1 ) + { leftOverFraction -= 1; + numOnCurrCore = -1; + } + else + { numOnCurrCore = 0; + } + //Move to next core, max core-value to incr to is numCores -1 + if( currCore >= numCores -1 ) + { currCore = 0; + } + else + { currCore += 1; + } + } + } + } + + //Free Note: vector of sub-matrices does its own free-ing, even vec-params + +// printf("done with making vectors\n", numToPutOntoEachCore); + + sync; + + //free the sub-matrices in Fn that called this one + } + + +/*Walk through the two slice-strucs, making sub-matrix strucs as go + */ +SubMatrix ** +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + Matrix *origMatrix, int32 transposeTheMatrix ) + { + int32 numRowIdxs, numColIdxs, rowIdx, colIdx; + int32 startRow, endRow, startCol, endCol; + int32 *rowStartVals, *colStartVals; + int32 rowOffset, dummy; + SubMatrix **subMatrices, *newSubMatrix; + + numRowIdxs = rowSlices->numVals; + numColIdxs = colSlices->numVals; + + rowStartVals = rowSlices->startVals; + colStartVals = colSlices->startVals; + + subMatrices = malloc( numRowIdxs * numColIdxs *sizeof(SubMatrix *) ); + + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) + { + rowOffset = rowIdx * numColIdxs; + + startRow = rowStartVals[rowIdx]; + endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is + // at last valid idx + 1 & is + // 1 greater than end value + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) + { + startCol = colStartVals[colIdx]; + endCol = colStartVals[colIdx + 1] -1; + + newSubMatrix = malloc( sizeof(SubMatrix) ); + newSubMatrix->numRows = endRow - startRow +1; + newSubMatrix->numCols = endCol - startCol +1; + newSubMatrix->origMatrix = origMatrix; + newSubMatrix->origStartRow = startRow; + newSubMatrix->origStartCol = startCol; + //no parallel singleton in Cilk, so copy here + if( transposeTheMatrix ) + { copyTransposeFromOrig( newSubMatrix ); + } + else + { copyFromOrig( newSubMatrix ); + } +// printf("just copied: %X", newSubMatrix->array); + newSubMatrix->alreadyCopied = TRUE; + + subMatrices[ rowOffset + colIdx ] = newSubMatrix; + } + } + return subMatrices; + } + +void +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + SubMatrix **subMatrices ) + { + int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; + SubMatrix *subMatrix; + + numRowIdxs = rowSlices->numVals; + numColIdxs = colSlices->numVals; + + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) + { + rowOffset = rowIdx * numColIdxs; + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) + { + subMatrix = subMatrices[ rowOffset + colIdx ]; + if( subMatrix->alreadyCopied ) + free( subMatrix->array ); + free( subMatrix ); + } + } + free( subMatrices ); + } + + +SlicingStrucCarrier * +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix ) +{ + float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; + int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol; + + SlicingStrucCarrier *slicingStrucCarrier = + malloc(sizeof(SlicingStrucCarrier) ); + + int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; + float64 numPrimitiveOpsInMinWorkUnit; + + + //======= Calc ideal size of min-sized sub-matrix ======== + + //ask VCilk for the number of cycles of the minimum work unit, at given + // percent overhead then add a guess at overhead from this divider + minWorkUnitCycles = MIN_WORK_UNIT_CYCLES; + + //ask VCilk for number of cycles of the "primitive" op of matrix mult + primitiveCycles = measureMatrixMultPrimitive( ); + + numPrimitiveOpsInMinWorkUnit = + (float64)minWorkUnitCycles / (float64)primitiveCycles; + + //take cubed root -- that's number of these in a "side" of sub-matrix + // then multiply by 5 because the primitive is 5x5 + idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); + + idealNumWorkUnits = IDEAL_NUM_WORK_UNITS; + + idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); + idealSizeOfSide2 *= 0.8; //finer granularity to help load balance + + if( idealSizeOfSide1 > idealSizeOfSide2 ) + idealSizeOfSide = idealSizeOfSide1; + else + idealSizeOfSide = idealSizeOfSide2; + + + //============ Slice up dimensions, now that know target size =========== + + //Tell the slicer the target size of a side (floating pt), the start + // value to start slicing at, and the end value to stop slicing at + //It returns an array of start value of each chunk, plus number of them + + startLeftRow = 0; + endLeftRow = leftMatrix->numRows -1; + startVec = 0; + endVec = leftMatrix->numCols -1; + startRightCol = 0; + endRightCol = rightMatrix->numCols -1; + + leftRowSlices = + sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow ); + + vecSlices = + sliceUpDimension( idealSizeOfSide, startVec, endVec ); + + rightColSlices = + sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol ); + + slicingStrucCarrier->leftRowSlices = leftRowSlices; + slicingStrucCarrier->vecSlices = vecSlices; + slicingStrucCarrier->rightColSlices = rightColSlices; + + return slicingStrucCarrier; +} + + +SlicingStruc * +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal ) + { float32 residualAcc = 0; + int numSlices, i, *startVals, sizeOfSlice, endCondition; + SlicingStruc *slicingStruc = malloc( sizeof(SlicingStruc) ); + + //calc size of matrix need to hold start vals -- + numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); + + startVals = malloc( (numSlices + 1) * sizeof(int32) ); + + //Calc the upper limit of start value -- when get above this, end loop + // by saving highest value of the matrix dimension to access, plus 1 + // as the start point of the imaginary slice following the last one + //Plus 1 because go up to value but not include when process last slice + //The stopping condition is half-a-size less than highest value because + // don't want any pieces smaller than half the ideal size -- just tack + // little ones onto end of last one + endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size + for( i = 0; startVal <= endVal; i++ ) + { + startVals[i] = startVal; + residualAcc += idealSizeOfSide; + sizeOfSlice = (int)residualAcc; + residualAcc -= (float32)sizeOfSlice; + startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. + + if( startVal > endCondition ) + { startVal = endVal + 1; + startVals[ i + 1 ] = startVal; + } + } + + slicingStruc->startVals = startVals; + slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 + // which means is num sub-matrices in dim + // also == idx of the fake start just above + return slicingStruc; + } + +void +freeSlicingStruc( SlicingStruc *slicingStruc ) + { + free( slicingStruc->startVals ); + free( slicingStruc ); + } + + +int inline +measureMatrixMultPrimitive() + { + int r, c, v, numCycles; + float32 *res, *left, *right; + + //setup inputs + left = malloc( 5 * 5 * sizeof( float32 ) ); + right = malloc( 5 * 5 * sizeof( float32 ) ); + res = malloc( 5 * 5 * sizeof( float32 ) ); + + for( r = 0; r < 5; r++ ) + { + for( c = 0; c < 5; c++ ) + { + left[ r * 5 + c ] = r; + right[ r * 5 + c ] = c; + } + } + + //do primitive +// VCilk__start_primitive(); //for now, just takes time stamp + for( r = 0; r < 5; r++ ) + { + for( c = 0; c < 5; c++ ) + { + for( v = 0; v < 5; v++ ) + { + res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; + } + } + } +// numCycles = VCilk__end_primitive_and_give_cycles(); + + free( left ); + free( right ); + free( res ); + + return numCycles; + } diff -r ec0629f70ee5 -r bf7331ed394e src/Application/CILK__Matrix_Mult/EntryPoint.cilk --- a/src/Application/CILK__Matrix_Mult/EntryPoint.cilk Tue Oct 26 19:34:03 2010 -0700 +++ b/src/Application/CILK__Matrix_Mult/EntryPoint.cilk Wed Nov 10 06:07:54 2010 -0800 @@ -1,16 +1,22 @@ /* - * Copyright 2009 OpenSourceCodeStewardshipFoundation.org + * Copyright 2009 OpenSourceStewardshipFoundation.org * Licensed under GNU General Public License version 2 * * Author: seanhalle@yahoo.com * */ +#include + #include "CILK__Matrix_Mult.h" -cilk void divideIntoVectors( void * ); +//========================================================================== +cilk void +divideWorkIntoSubMatrixPairProcrs( void *_dividerParams ); -/*Every VPThread system has an "entry point" function that creates the first +//========================================================================== + +/*Every VCilk system has an "entry point" function that creates the first * processor, which starts the chain of creating more processors.. * eventually all of the processors will dissipate themselves, and * return. @@ -19,27 +25,41 @@ * functions do: *1) it creates the params for the seed processor, from the * parameters passed into the entry-point function - *2) it calls VPThread__create_seed_procr_and_do_work + *2) it calls VCilk__create_seed_procr_and_do_work *3) it gets the return value from the params struc, frees the params struc, * and returns the value from the function * */ -cilk -Matrix * +cilk Matrix * multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) { Matrix *resMatrix; DividerParams *dividerParams; + int32 numResRows, numResCols; - +// printf("entry point"); dividerParams = malloc( sizeof( DividerParams ) ); dividerParams->leftMatrix = leftMatrix; dividerParams->rightMatrix = rightMatrix; + + numResRows = leftMatrix->numRows; + numResCols = rightMatrix->numCols; + resMatrix = malloc( sizeof(Matrix) ); + resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); + resMatrix->numCols = rightMatrix->numCols; + resMatrix->numRows = leftMatrix->numRows; - spawn divideIntoVectors( dividerParams ); + + dividerParams->resultMatrix = resMatrix; + + //create divider processor, start doing the work, and wait till done + //This function is the "border crossing" between normal code and VCilk + //VCilk__create_seed_procr_and_do_work( ÷WorkIntoSubMatrixPairProcrs,\ + dividerParams ); + spawn divideWorkIntoSubMatrixPairProcrs( dividerParams ); + sync; - - //get result matrix and return it - resMatrix = dividerParams->resultMatrix; + + //return result matrix free( dividerParams ); return resMatrix; } diff -r ec0629f70ee5 -r bf7331ed394e src/Application/CILK__Matrix_Mult/Vector_Pr.cilk --- a/src/Application/CILK__Matrix_Mult/Vector_Pr.cilk Tue Oct 26 19:34:03 2010 -0700 +++ b/src/Application/CILK__Matrix_Mult/Vector_Pr.cilk Wed Nov 10 06:07:54 2010 -0800 @@ -1,48 +1,119 @@ -/* - * Copyright 2009 OpenSourceCodeStewardshipFoundation.org +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org * Licensed under GNU General Public License version 2 * - * Author: SeanHalle@yahoo.com + * Author: seanhalle@yahoo.com * */ + #include "CILK__Matrix_Mult.h" +#include +#include +#include -/*A Vector processor is created with an environment that holds two matrices, - * the row and col that it owns, and the name of a result gathering - * processor. - *It calculates its vector product then sends the result to the result - * processor, which puts it into the result matrix and returns that matrix - * when all is done. - */ -cilk -float32 -calcVector( void *data ) - { - VectorParams *params; - int myRow, myCol, vectLength, pos; - float32 *leftMatrixArray, *rightMatrixArray, result = 0.0; - Matrix *leftMatrix, *rightMatrix; +//=========================================================================== - params = (VectorParams *)data; - myCol = params->myCol; - myRow = params->myRow; - vectLength = params->vectLength; - leftMatrix = params->leftMatrix; - rightMatrix = params->rightMatrix; - leftMatrixArray = leftMatrix->matrix; - rightMatrixArray = rightMatrix->matrix; - //===================== DEBUG ====================== - #ifdef PRINT_DEBUG - if( myCol == 0 ) - printf("start vector: %d, %d\n", myRow, myCol ); fflush(stdin); - #endif - //==================================================== +cilk void calcSubMatrixProduct( void *data ); - for( pos = 0; pos < vectLength; pos++ ) + +void inline +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, + int32 startRow, + int32 numRows, + int32 startCol, + int32 numCols, + int32 numOrigCols ); + + +//=========================================================================== + +cilk void +calcVectorOfSubMatrices( void *_vecParams ) + { int32 numVecIdxs, leftRowIdxOffset, numRightColIdxs, resColIdx; + SubMatrix **leftSubMatrices, **rightSubMatrices; + float32 *resultArray; + int32 vecIdx, coreWithAffinity; + SMPairParams *subMatrixPairParams, **vecOfSubMatrixParams; + VecParams *vecParams; + + vecParams = (VecParams *)_vecParams; + + numVecIdxs = vecParams->numVecIdxs; + numRightColIdxs = vecParams->numRightColIdxs; + leftRowIdxOffset = vecParams->leftRowIdxOffset; + resColIdx = vecParams->resColIdx; + leftSubMatrices = vecParams->leftSubMatrices; + rightSubMatrices = vecParams->rightSubMatrices; + resultArray = vecParams->resultArray; + coreWithAffinity = vecParams->coreToRunOn; + +// printf("inside vector %d, %d\n", leftRowIdxOffset, resColIdx ); + + vecOfSubMatrixParams = malloc( numVecIdxs * sizeof(SMPairParams *) ); + if( vecOfSubMatrixParams == 0 ){printf("malloc error"); exit(1);} + + for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) { - result += leftMatrixArray[ myRow * vectLength + pos ] * - rightMatrixArray[ pos * vectLength + myCol]; + //Make the processor for the pair of sub-matrices + subMatrixPairParams = malloc( sizeof(SMPairParams) ); + subMatrixPairParams->leftSubMatrix = + leftSubMatrices[ leftRowIdxOffset + vecIdx ]; + + subMatrixPairParams->rightSubMatrix = + rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; + +// printf("about to spawn pair %X\n", (int)subMatrixPairParams->leftSubMatrix->array); + spawn calcSubMatrixProduct( subMatrixPairParams ); +// printf("done with spawn\n"); + vecOfSubMatrixParams[ vecIdx ] = subMatrixPairParams; } - return result; + +// printf("done spawning product pairs\n"); + sync; + + //now accumulate individual result matrices into final result matrix + for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) + { + subMatrixPairParams = vecOfSubMatrixParams[ vecIdx ]; + + accumulateResult( resultArray, subMatrixPairParams->partialResultArray, + subMatrixPairParams->leftSubMatrix->origStartRow, + subMatrixPairParams->leftSubMatrix->numRows, + subMatrixPairParams->rightSubMatrix->origStartCol, + subMatrixPairParams->rightSubMatrix->numCols, + subMatrixPairParams->rightSubMatrix->origMatrix->numCols); + + //Note, resultArray is made on the core that produces the results + // that gives chance to set affinity so all in vector run on same + // core and re-use that array, and prevents writes from causing + // thrashing of the cache -- as long as array big enough, the copy + // overhead is miniscule vs the size-of-side reuse of each byte + free( subMatrixPairParams->partialResultArray ); + free( subMatrixPairParams ); + } + free( vecOfSubMatrixParams ); + free( vecParams ); } + + + +void inline +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, + int32 startRow, + int32 numRows, + int32 startCol, + int32 numCols, + int32 numOrigCols ) + { int32 row, col; + + for( row = 0; row < numRows; row++ ) + { + for( col = 0; col < numCols; col++ ) + { + resultArray[ (row + startRow) * numOrigCols + col + startCol ] += + subMatrixResultArray[ row * numCols + col ]; + } + } + + } diff -r ec0629f70ee5 -r bf7331ed394e src/Application/CILK__Matrix_Mult/subMatrix_Pr.cilk --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Application/CILK__Matrix_Mult/subMatrix_Pr.cilk Wed Nov 10 06:07:54 2010 -0800 @@ -0,0 +1,301 @@ +/* + * Copyright 2010 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: SeanHalle@yahoo.com + * + */ + +#include + +#include "CILK__Matrix_Mult.h" + + +void inline +copyFromOrig( SubMatrix *subMatrix ); + +void inline +copyTransposeFromOrig( SubMatrix *subMatrix ); + +void inline +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, + float32 *resArray, + int startRow, int endRow, + int startCol, int endCol, + int startVec, int endVec, + int resStride, int inpStride ); + +void inline +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, + float32 *leftArray, float32 *rightArray, + float32 *resArray ); + + +/*A processor is created with an environment that holds two matrices, + * the row and col that it owns, and the name of a result gathering + * processor. + *It calculates the product of two sub-portions of the input matrices + * by using Intel's mkl library for single-core. + * + *This demonstrates using optimized single-threaded code inside scheduled + * work-units. + * + *When done, it sends the result to the result processor + */ +cilk void +calcSubMatrixProduct( void *data ) + { + SMPairParams *params; + float32 *leftArray, *rightArray, *resArray; + SubMatrix *leftSubMatrix, *rightSubMatrix; + int32 resSize; + int32 numResRows, numResCols, vectLength; + +// printf("inside submatrix-pair\n"); + params = (SMPairParams *)data; + leftSubMatrix = params->leftSubMatrix; + rightSubMatrix = params->rightSubMatrix; + + + //make sure the input sub-matrices have been copied out of orig + //copyFromOrig( leftSubMatrix, animatingPr ); + //copyTransposeFromOrig( rightSubMatrix, animatingPr ); + + leftArray = leftSubMatrix->array; + rightArray = rightSubMatrix->array; + + //make this array here, on the core that computes the results + // with Cilk's semantics, have to have separate result array for each + // spawned processor -- unless want to change the spawn and sync + // pattern, such that spawn one from each vector, then sync, then + // another, and so forth -- this will cause idle time due to imbalance + // in matrix sizes + //This also gives chance to set affinity so all in vector run on same + // core and re-use the accumulation array, + //As a side-benefit, it also prevents writes from causing + // thrashing of the cache -- as long as array big enough, the copy + // overhead is small because each byte is reused size-of-side times + //This is freed in the vector processor + resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); + resArray = malloc( resSize ); + memset( resArray, 0, resSize ); + + + vectLength = leftSubMatrix->numCols; + numResRows = leftSubMatrix->numRows; + numResCols = rightSubMatrix->numCols; + +// printf("just before multiply arrays %X\n", leftArray); + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, + leftArray, rightArray, + resArray ); + + //send result by side-effect + params->partialResultArray = resArray; + } + + + +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into + * the 32KB L1 cache. + *Would be nice to embed this within another level that divided into + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache + * + *Eventually want these divisions to be automatic, using DKU pattern + * embedded into VMS and exposed in the language, and with VMS controlling the + * divisions according to the cache sizes, which it knows about. + *Also, want VMS to work with language to split among main-mems, so a socket + * only cranks on data in its local segment of main mem + * + *So, outer two loops determine start and end points within the result matrix. + * Inside that, a loop dets the start and end points along the shared dimensions + * of the two input matrices. + */ +void inline +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, + int32 numResCols, + float32 *leftArray, float32 *rightArray, + float32 *resArray ) + { + int resStride, inpStride; + int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; + + resStride = numResCols; + inpStride = vecLength; + + for( resStartRow = 0; resStartRow < numResRows; ) + { + resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 + if( resEndRow > numResRows ) resEndRow = numResRows -1; + + for( resStartCol = 0; resStartCol < numResCols; ) + { + resEndCol = resStartCol + COLS_IN_BLOCK -1; + if( resEndCol > numResCols ) resEndCol = numResCols -1; + + for( startVec = 0; startVec < vecLength; ) + { + endVec = startVec + VEC_IN_BLOCK -1; + if( endVec > vecLength ) endVec = vecLength -1; + +// printf("just before multiply sub-blocks %X\n", leftArray); + //By having the "vector" of sub-blocks in a sub-block slice + // be marched down in inner loop, are re-using the result + // matrix, which stays in L1 cache and re-using the left sub-mat + // which repeats for each right sub-mat -- can only re-use two of + // the three, so result is the most important -- avoids writing + // dirty blocks until those result-locations fully done + //Row and Col is position in result matrix -- so row and vec + // for left array, then vec and col for right array + multiplySubBlocksTransposed( leftArray, rightArray, + resArray, + resStartRow, resEndRow, + resStartCol, resEndCol, + startVec, endVec, + resStride, inpStride ); + startVec = endVec +1; + } + resStartCol = resEndCol +1; + } + resStartRow = resEndRow +1; + } + } + + + +void inline +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, + float32 *resArray, + int resStartRow, int resEndRow, + int resStartCol, int resEndCol, + int startVec, int endVec, + int resStride, int inpStride ) + { + int resRow, resCol, vec; + int leftOffset, rightOffset; + float32 result; + +// printf("start col, row | end col, row: %d, %d, %d, %d\n", resStartCol, resStartRow, resEndCol, resEndRow); + //The result row is used for the left matrix, res col for the right + for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) + { + for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) + { + leftOffset = resRow * inpStride;//left & right inp strides same + rightOffset = resCol * inpStride;// because right is transposed + result = 0; + for( vec = startVec; vec <= endVec; vec++ ) + { + result += + leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; + } + + resArray[ resRow * resStride + resCol ] += result; + } + } + } + + +/*Reuse this in divider when do the sequential multiply case + */ +void inline +copyTranspose( int32 numRows, int32 numCols, + int32 origStartRow, int32 origStartCol, int32 origStride, + float32 *subArray, float32 *origArray ) + { int32 stride; + int32 row, col, origOffset; + + stride = numRows; + for( row = 0; row < numRows; row++ ) + { + origOffset = (row + origStartRow) * origStride + origStartCol; + for( col = 0; col < numCols; col++ ) + { + //transpose means swap row & col -- traverse orig matrix normally + // but put into reversed place in local array -- means the + // stride is the numRows now, so col * numRows + row + subArray[ col * stride + row ] = origArray[ origOffset + col ]; + } + } + } + +void inline +copyTransposeFromOrig( SubMatrix *subMatrix ) + { int numCols, numRows, origStartRow, origStartCol, origStride, stride; + Matrix *origMatrix; + float32 *origArray, *subArray; + + +// if( subMatrix->alreadyCopied ) return; +// VCilk__start_singleton(copyMatrixSingleton,&&EndOfTranspSingleton,animPr); + + origMatrix = subMatrix->origMatrix; + origArray = origMatrix->array; + numCols = subMatrix->numCols; + numRows = subMatrix->numRows; + origStartRow = subMatrix->origStartRow; + origStartCol = subMatrix->origStartCol; + origStride = origMatrix->numCols; + + subArray = malloc( numRows * numCols *sizeof(float32) ); + subMatrix->array = subArray; +// printf("copying transpose %X\n", subArray); + + //copy values from orig matrix to local + copyTranspose( numRows, numCols, + origStartRow, origStartCol, origStride, + subArray, origArray ); + + subMatrix->alreadyCopied = TRUE; //must be last thing before label +// EndOfTranspSingleton: + return; + } + + +void inline +copyFromOrig( SubMatrix *subMatrix ) + { int numCols, numRows, origStartRow, origStartCol, stride, origStride; + Matrix *origMatrix; + float32 *origArray, *subArray; + int32 row, col, offset, origOffset; + + //This lets only a single VP execute the code between start and + // end -- using start and end so that work runs outside the master. + //Inside, if a second VP ever executes the start, it will be returned + // from the end-point. + //Note, for non-GCC, can add a second SSR call at the end, and inside + // that one, look at the stack at the return addr & save that in an + // array indexed by singletonID +// if( subMatrix->alreadyCopied ) return; +// VCilk__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton,animPr); + + + origMatrix = subMatrix->origMatrix; + origArray = origMatrix->array; + numCols = subMatrix->numCols; + numRows = subMatrix->numRows; + origStartRow = subMatrix->origStartRow; + origStartCol = subMatrix->origStartCol; + origStride = origMatrix->numCols; + + subArray = malloc( numRows * numCols *sizeof(float32) ); + subMatrix->array = subArray; +// printf("copying normal %X\n", subArray); + + //copy values from orig matrix to local + stride = numCols; + + for( row = 0; row < numRows; row++ ) + { + offset = row * stride; + origOffset = (row + origStartRow) * origStride + origStartCol; + for( col = 0; col < numCols; col++ ) + { + subArray[ offset + col ] = origArray[ origOffset + col ]; + } + } + + subMatrix->alreadyCopied = TRUE; //must be last thing before label +// EndOfCopySingleton: + return; + } diff -r ec0629f70ee5 -r bf7331ed394e src/Application/Makefile --- a/src/Application/Makefile Tue Oct 26 19:34:03 2010 -0700 +++ b/src/Application/Makefile Wed Nov 10 06:07:54 2010 -0800 @@ -1,35 +1,51 @@ +# +# Copyright Nov 6, 2010 OpenSourceStewardshipFoundation.org +# Licensed under GNU General Public License version 2 +# +# author seanhalle@yahoo.com CILK_SOURCE = \ CILK__Matrix_Mult/EntryPoint.cilk \ CILK__Matrix_Mult/Divide_Pr.cilk \ CILK__Matrix_Mult/Vector_Pr.cilk \ + CILK__Matrix_Mult/subMatrix_Pr.cilk \ main.cilk C_SOURCE = \ matrix_mult.c \ - ParamHelper/ParamBag.c\ + ParamHelper/ParamBag.c \ ParamHelper/ReadParamsFromFile.c +#The next two rules make a new string, with the same names as the +# source, but the endings changed to .o +#The third concatenates the two source-file strings C_OBJS = $(C_SOURCE:.c=.o) CILK_OBJS = $(CILK_SOURCE:.cilk=.o) -OBJECTS = $(C_SOURCE) $(CILK_SOURCE) +OBJECTS = $(C_OBJS) $(CILK_OBJS) + #Make has the built-in variable "$<" which is the source file # and "$@" which is the target for that source +#The first rule says for each file in C_SOURCE, put the +# name in place of $< and expand +# C_OBJS in place of $@ and run the gcc command, when files are +# out of date $(C_OBJS): $(C_SOURCE) - gcc -c $< -o $@ + gcc -c $*.c -o $*.o; $(CILK_OBJS): $(CILK_SOURCE) - gcc -c $< -o $@ + cilkc -c $*.cilk -o $*.o all: $(OBJECTS) cilkc $(OBJECTS) -o CILK_Linux__Matrix_Mult; \ cp CILK_Linux__Matrix_Mult ~/D/2__INRIA_OMP/1__Development/2__runs_and_data/executables +clean: + rm *.o; rm ParamHelper/*.o; rm CILK__Matrix_Mult/*.o #================================================================ #Other stuff tried/played_with/copied @@ -62,8 +78,6 @@ #================================================================ # playing with below.. -#7C9A-RV6P-3XE2-JV99-426K-2K - #rule for inferring that the .cilk file is the source for .o file # and how to create the .o from the .cilk #%.o : %.cilk diff -r ec0629f70ee5 -r bf7331ed394e src/Application/Matrix_Mult.c --- a/src/Application/Matrix_Mult.c Tue Oct 26 19:34:03 2010 -0700 +++ b/src/Application/Matrix_Mult.c Wed Nov 10 06:07:54 2010 -0800 @@ -61,7 +61,7 @@ numRows = matrixStruc->numRows; numCols = matrixStruc->numCols; - matrixStart = matrixStruc->matrix; + matrixStart = matrixStruc->array; file = fopen( matrixFileName, "r" ); if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} @@ -131,7 +131,7 @@ retMatrix = malloc( sizeof( Matrix ) ); retMatrix->numRows = numRows; retMatrix->numCols = numCols; - retMatrix->matrix = malloc( numRows * numCols * sizeof(float32) ); + retMatrix->array = malloc( numRows * numCols * sizeof(float32) ); return retMatrix; } @@ -142,24 +142,26 @@ } void freeMatrix( Matrix * matrix ) - { free( matrix->matrix ); + { free( matrix->array ); free( matrix ); } void printMatrix( Matrix *matrix ) - { int r, c, numRows, numCols; + { + int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; float32 *matrixArray; - numRows = matrix->numRows; - numCols = matrix->numCols; - matrixArray = matrix->matrix; + numRows = rowsToPrint = matrix->numRows; + numCols = colsToPrint = matrix->numCols; + matrixArray = matrix->array; - for( r = 0; r < numRows; r++ ) - { for( c = 0; c < numCols; c++ ) - { printf( "%f | ", *(matrixArray + r*numCols + c) ); + rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed + colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed + for( r = 0; r < numRows; r += rowIncr ) + { for( c = 0; c < numCols; c += colIncr ) + { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); } printf("\n"); } } - diff -r ec0629f70ee5 -r bf7331ed394e src/Application/Matrix_Mult.h --- a/src/Application/Matrix_Mult.h Tue Oct 26 19:34:03 2010 -0700 +++ b/src/Application/Matrix_Mult.h Wed Nov 10 06:07:54 2010 -0800 @@ -19,7 +19,7 @@ struct { int32 numRows; int32 numCols; - float32 *matrix; //2D, but dynamically sized, so use addr arith + float32 *array; //2D, but dynamically sized, so use addr arith } Matrix; diff -r ec0629f70ee5 -r bf7331ed394e src/Application/main.cilk --- a/src/Application/main.cilk Tue Oct 26 19:34:03 2010 -0700 +++ b/src/Application/main.cilk Wed Nov 10 06:07:54 2010 -0800 @@ -1,41 +1,67 @@ -/* - * Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org +/* + * Copyright 2010 OpenSourcStewardshipFoundation.org * Licensed under GNU General Public License version 2 - * - * author seanhalle@yahoo.com - */ - -#include -#include - -#include "Matrix_Mult.h" -#include "CILK__Matrix_Mult/CILK__Matrix_Mult.h" - -cilk Matrix * -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); - -/** - *Matrix multiply program written using VMS_HW piggy-back language - * - */ -cilk -int main( int argc, char **argv ) - { Matrix *leftMatrix, *rightMatrix, *resultMatrix; - ParamBag *paramBag; - - - paramBag = makeParamBag(); - readParamFileIntoBag( argv[1], paramBag ); - initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); - - resultMatrix = spawn multiplyTheseMatrices( leftMatrix, rightMatrix ); - sync; - - printf("\nresult matrix: \n"); - -// printMatrix( resultMatrix ); - -// VPThread__print_stats(); - - exit(0); //cleans up - } + * + * author seanhalle@yahoo.com + */ + +#include +#include + +#include "Matrix_Mult.h" +#include "CILK__Matrix_Mult/CILK__Matrix_Mult.h" + + + //single global var -- just get it done +struct timeval startStamp; + + +void +startTimeInterval() + { + gettimeofday( &startStamp, NULL); + } + + +void +endIntervalAndPrintTime() + { + struct timeval endStamp; + float64 startSecs, endSecs, intervalSecs; + + gettimeofday( &endStamp, NULL); + + startSecs = startStamp.tv_sec + ( startStamp.tv_usec / 1000000.0 ); + endSecs = endStamp.tv_sec + ( endStamp.tv_usec / 1000000.0 ); + + intervalSecs = endSecs - startSecs; + printf("Interval: %f", intervalSecs); + } + + +cilk Matrix * +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); + +/** + *Matrix multiply program written using VMS_HW piggy-back language + * + */ +cilk +int main( int argc, char **argv ) + { Matrix *leftMatrix, *rightMatrix, *resultMatrix; + ParamBag *paramBag; + + + paramBag = makeParamBag(); + readParamFileIntoBag( argv[1], paramBag ); + initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); + + resultMatrix = spawn multiplyTheseMatrices( leftMatrix, rightMatrix ); + sync; + + printf("\nresult matrix: \n"); + +// printMatrix( resultMatrix ); + + exit(0); //cleans up + }