# HG changeset patch # User Merten Sach # Date 1305121254 -7200 # Node ID ecba4ae0be7ab069641d147bcad1a353aabf428d # Parent 12bcb9728e1ce53aa79780191261d2c530841096 Seans development tree diff -r 12bcb9728e1c -r ecba4ae0be7a src/Application/Matrix_Mult.c --- a/src/Application/Matrix_Mult.c Mon Nov 15 12:08:19 2010 -0800 +++ b/src/Application/Matrix_Mult.c Wed May 11 15:40:54 2011 +0200 @@ -1,167 +1,167 @@ -/* - * Copyright 2009 OpenSourceStewardshipFoundation.org - * Licensed under GNU General Public License version 2 - * - * Author: seanhalle@yahoo.com - * - * Created on November 15, 2009, 2:35 AM - */ - -#include -#include - -#include "Matrix_Mult.h" -#include "ParamHelper/Param.h" - - - - void -initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, - ParamBag *paramBag ) - { char *leftMatrixFileName, *rightMatrixFileName; - int leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols; - - ParamStruc *param; - param = getParamFromBag( "leftMatrixRows", paramBag ); - leftMatrixRows = param->intValue; - param = getParamFromBag( "leftMatrixCols", paramBag ); - leftMatrixCols = param->intValue; - *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols ); - - param = getParamFromBag( "leftMatrixFileName", paramBag ); - leftMatrixFileName = param->strValue; //no need to copy - read_Matrix_From_File( *leftMatrix, leftMatrixFileName ); - - param = getParamFromBag( "rightMatrixRows", paramBag ); - rightMatrixRows = param->intValue; - param = getParamFromBag( "rightMatrixCols", paramBag ); - rightMatrixCols = param->intValue; - *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols ); - - param = getParamFromBag( "rightMatrixFileName", paramBag ); - rightMatrixFileName = param->strValue; - read_Matrix_From_File( *rightMatrix, rightMatrixFileName ); - } - - -void parseLineIntoRow( char *line, float32* row ); - - - void -read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ) - { int row, maxRead, numRows, numCols; - float32 *matrixStart; - size_t lineSz = 0; - FILE *file; - char *line = NULL; - - lineSz = 50000; //max length of line in a matrix data file - line = (char *) malloc( lineSz ); - if( line == NULL ) printf( "no mem for matrix line" ); - - numRows = matrixStruc->numRows; - numCols = matrixStruc->numCols; - matrixStart = matrixStruc->array; - - file = fopen( matrixFileName, "r" ); - if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} - fseek( file, 0, SEEK_SET ); - for( row = 0; row < numRows; row++ ) - { - if( feof( file ) ) printf( "file ran out too soon" ); - maxRead = getline( &line, &lineSz, file ); - if( maxRead == -1 ) printf( "prob reading mat line"); - - if( *line == '\n') continue; //blank line - if( *line == '/' ) continue; //comment line - - parseLineIntoRow( line, matrixStart + row * numCols ); - } - free( line ); - } - -/*This function relies on each line having the proper number of cols. It - * doesn't check, nor enforce, so if the file is improperly formatted it - * can write over unrelated memory - */ - void -parseLineIntoRow( char *line, float32* row ) - { - char *valueStr, *searchPos; - - //read the float values - searchPos = valueStr = line; //start - - for( ; *searchPos != 0; searchPos++) //bit dangerous, should use buff len - { - if( *searchPos == '\n' ) //last col.. relying on well-formatted file - { *searchPos = 0; - *row = atof( valueStr ); - break; //end FOR loop - } - if( *searchPos == ',' ) - { *searchPos = 0; //mark end of string - *row = (float32) atof( valueStr ); - row += 1; //address arith - //skip any spaces before digits.. use searchPos + 1 to skip the 0 - for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++); - valueStr = searchPos + 1; - } - } - } - - //========================================================================== - -/*In the "_Flat" version of constructor, do only malloc of the top data struc - * and set values in that top-level. Don't malloc any sub-structures. - */ - Matrix * -makeMatrix_Flat( int32 numRows, int32 numCols ) - { Matrix * retMatrix; - retMatrix = malloc( sizeof( Matrix ) ); - retMatrix->numRows = numRows; - retMatrix->numCols = numCols; - - return retMatrix; - } - - Matrix * -makeMatrix_WithResMat( int32 numRows, int32 numCols ) - { Matrix * retMatrix; - retMatrix = malloc( sizeof( Matrix ) ); - retMatrix->numRows = numRows; - retMatrix->numCols = numCols; - retMatrix->array = malloc( numRows * numCols * sizeof(float32) ); - - return retMatrix; - } - - void -freeMatrix_Flat( Matrix * matrix ) - { //( matrix ); - } - void -freeMatrix( Matrix * matrix ) - { free( matrix->array ); - free( matrix ); - } - -void -printMatrix( Matrix *matrix ) - { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; - float32 *matrixArray; - - numRows = rowsToPrint = matrix->numRows; - numCols = colsToPrint = matrix->numCols; - matrixArray = matrix->array; - - rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed - colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed - for( r = 0; r < numRows; r += rowIncr ) - { for( c = 0; c < numCols; c += colIncr ) - { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); - } - printf("\n"); - } - } - +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + * Created on November 15, 2009, 2:35 AM + */ + +#include +#include + +#include "Matrix_Mult.h" +#include "ParamHelper/Param.h" + + + + void +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, + ParamBag *paramBag ) + { char *leftMatrixFileName, *rightMatrixFileName; + int leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols; + + ParamStruc *param; + param = getParamFromBag( "leftMatrixRows", paramBag ); + leftMatrixRows = param->intValue; + param = getParamFromBag( "leftMatrixCols", paramBag ); + leftMatrixCols = param->intValue; + *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols ); + + param = getParamFromBag( "leftMatrixFileName", paramBag ); + leftMatrixFileName = param->strValue; //no need to copy + read_Matrix_From_File( *leftMatrix, leftMatrixFileName ); + + param = getParamFromBag( "rightMatrixRows", paramBag ); + rightMatrixRows = param->intValue; + param = getParamFromBag( "rightMatrixCols", paramBag ); + rightMatrixCols = param->intValue; + *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols ); + + param = getParamFromBag( "rightMatrixFileName", paramBag ); + rightMatrixFileName = param->strValue; + read_Matrix_From_File( *rightMatrix, rightMatrixFileName ); + } + + +void parseLineIntoRow( char *line, float32* row ); + + + void +read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ) + { int row, maxRead, numRows, numCols; + float32 *matrixStart; + size_t lineSz = 0; + FILE *file; + char *line = NULL; + + lineSz = 50000; //max length of line in a matrix data file + line = (char *) malloc( lineSz ); + if( line == NULL ) printf( "no mem for matrix line" ); + + numRows = matrixStruc->numRows; + numCols = matrixStruc->numCols; + matrixStart = matrixStruc->array; + + file = fopen( matrixFileName, "r" ); + if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} + fseek( file, 0, SEEK_SET ); + for( row = 0; row < numRows; row++ ) + { + if( feof( file ) ) printf( "file ran out too soon" ); + maxRead = getline( &line, &lineSz, file ); + if( maxRead == -1 ) printf( "prob reading mat line"); + + if( *line == '\n') continue; //blank line + if( *line == '/' ) continue; //comment line + + parseLineIntoRow( line, matrixStart + row * numCols ); + } + free( line ); + } + +/*This function relies on each line having the proper number of cols. It + * doesn't check, nor enforce, so if the file is improperly formatted it + * can write over unrelated memory + */ + void +parseLineIntoRow( char *line, float32* row ) + { + char *valueStr, *searchPos; + + //read the float values + searchPos = valueStr = line; //start + + for( ; *searchPos != 0; searchPos++) //bit dangerous, should use buff len + { + if( *searchPos == '\n' ) //last col.. relying on well-formatted file + { *searchPos = 0; + *row = atof( valueStr ); + break; //end FOR loop + } + if( *searchPos == ',' ) + { *searchPos = 0; //mark end of string + *row = (float32) atof( valueStr ); + row += 1; //address arith + //skip any spaces before digits.. use searchPos + 1 to skip the 0 + for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++); + valueStr = searchPos + 1; + } + } + } + + //========================================================================== + +/*In the "_Flat" version of constructor, do only malloc of the top data struc + * and set values in that top-level. Don't malloc any sub-structures. + */ + Matrix * +makeMatrix_Flat( int32 numRows, int32 numCols ) + { Matrix * retMatrix; + retMatrix = malloc( sizeof( Matrix ) ); + retMatrix->numRows = numRows; + retMatrix->numCols = numCols; + + return retMatrix; + } + + Matrix * +makeMatrix_WithResMat( int32 numRows, int32 numCols ) + { Matrix * retMatrix; + retMatrix = malloc( sizeof( Matrix ) ); + retMatrix->numRows = numRows; + retMatrix->numCols = numCols; + retMatrix->array = malloc( numRows * numCols * sizeof(float32) ); + + return retMatrix; + } + + void +freeMatrix_Flat( Matrix * matrix ) + { //( matrix ); + } + void +freeMatrix( Matrix * matrix ) + { free( matrix->array ); + free( matrix ); + } + +void +printMatrix( Matrix *matrix ) + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; + float32 *matrixArray; + + numRows = rowsToPrint = matrix->numRows; + numCols = colsToPrint = matrix->numCols; + matrixArray = matrix->array; + + rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed + colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed + for( r = 0; r < numRows; r += rowIncr ) + { for( c = 0; c < numCols; c += colIncr ) + { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); + } + printf("\n"); + } + } + diff -r 12bcb9728e1c -r ecba4ae0be7a src/Application/Matrix_Mult.h --- a/src/Application/Matrix_Mult.h Mon Nov 15 12:08:19 2010 -0800 +++ b/src/Application/Matrix_Mult.h Wed May 11 15:40:54 2011 +0200 @@ -1,77 +1,77 @@ -/* - * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org - * Licensed under GNU General Public License version 2 - */ - -#ifndef MATRIX_MULT_H_ -#define MATRIX_MULT_H_ - -#include -#include -#include - -#include "../VCilk_lib/VMS/VMS_primitive_data_types.h" -#include "ParamHelper/Param.h" - -//============================== Structures ============================== - -typedef -struct - { int32 numRows; - int32 numCols; - float32 *array; //2D, but dynamically sized, so use addr arith - } -Matrix; - -/* This is the "appSpecificPiece" that is carried inside a DKUPiece. - * In the DKUPiece data struc it is declared to be of type "void *". This - * allows the application to define any data structure it wants and put it - * into a DKUPiece. - * When the app specific info is used, it is in app code, so it is cast to - * the correct type to tell the compiler how to access fields. - * This keeps all app-specific things out of the DKU directory, as per the - * DKU standard. */ -typedef -struct - { - // pointers to shared data.. the result matrix must be created when the - // left and right matrices are put into the root ancestor DKUPiece. - Matrix * leftMatrix; - Matrix * rightMatrix; - Matrix * resultMatrix; - - // define the starting and ending boundaries for this piece of the - // result matrix. These are derivable from the left and right - // matrices, but included them for readability of code. - int prodStartRow, prodEndRow; - int prodStartCol, prodEndCol; - // Start and end of the portion of the left matrix that contributes to - // this piece of the product - int leftStartRow, leftEndRow; - int leftStartCol, leftEndCol; - // Start and end of the portion of the right matrix that contributes to - // this piece of the product - int rightStartRow, rightEndRow; - int rightStartCol, rightEndCol; - } -MatrixProdPiece; - -//============================== Functions ================================ -void readFile(); - -Matrix *makeMatrix( int32 numRows, int32 numCols ); -Matrix *makeMatrix_Flat( int32 numRows, int32 numCols ); -Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols ); -void freeMatrix_Flat( Matrix * matrix ); -void freeMatrix( Matrix * matrix ); -void printMatrix( Matrix *matrix ); - -void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ); - -void -initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, - ParamBag *paramBag ); - -//=========================================================================== - -#endif /*MATRIX_MULT_H_*/ +/* + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + */ + +#ifndef MATRIX_MULT_H_ +#define MATRIX_MULT_H_ + +#include +#include +#include + +#include "../VCilk_lib/VMS/VMS_primitive_data_types.h" +#include "ParamHelper/Param.h" + +//============================== Structures ============================== + +typedef +struct + { int32 numRows; + int32 numCols; + float32 *array; //2D, but dynamically sized, so use addr arith + } +Matrix; + +/* This is the "appSpecificPiece" that is carried inside a DKUPiece. + * In the DKUPiece data struc it is declared to be of type "void *". This + * allows the application to define any data structure it wants and put it + * into a DKUPiece. + * When the app specific info is used, it is in app code, so it is cast to + * the correct type to tell the compiler how to access fields. + * This keeps all app-specific things out of the DKU directory, as per the + * DKU standard. */ +typedef +struct + { + // pointers to shared data.. the result matrix must be created when the + // left and right matrices are put into the root ancestor DKUPiece. + Matrix * leftMatrix; + Matrix * rightMatrix; + Matrix * resultMatrix; + + // define the starting and ending boundaries for this piece of the + // result matrix. These are derivable from the left and right + // matrices, but included them for readability of code. + int prodStartRow, prodEndRow; + int prodStartCol, prodEndCol; + // Start and end of the portion of the left matrix that contributes to + // this piece of the product + int leftStartRow, leftEndRow; + int leftStartCol, leftEndCol; + // Start and end of the portion of the right matrix that contributes to + // this piece of the product + int rightStartRow, rightEndRow; + int rightStartCol, rightEndCol; + } +MatrixProdPiece; + +//============================== Functions ================================ +void readFile(); + +Matrix *makeMatrix( int32 numRows, int32 numCols ); +Matrix *makeMatrix_Flat( int32 numRows, int32 numCols ); +Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols ); +void freeMatrix_Flat( Matrix * matrix ); +void freeMatrix( Matrix * matrix ); +void printMatrix( Matrix *matrix ); + +void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ); + +void +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, + ParamBag *paramBag ); + +//=========================================================================== + +#endif /*MATRIX_MULT_H_*/ diff -r 12bcb9728e1c -r ecba4ae0be7a src/Application/VCilk__Matrix_Mult/Divide_Pr.c --- a/src/Application/VCilk__Matrix_Mult/Divide_Pr.c Mon Nov 15 12:08:19 2010 -0800 +++ b/src/Application/VCilk__Matrix_Mult/Divide_Pr.c Wed May 11 15:40:54 2011 +0200 @@ -1,588 +1,588 @@ -/* - * Copyright 2009 OpenSourceStewardshipFoundation.org - * Licensed under GNU General Public License version 2 - * - * Author: seanhalle@yahoo.com - * - */ - - -#include "VCilk__Matrix_Mult.h" -#include -#include -#include - - //The time to compute this many result values should equal the time to - // perform this division on a matrix of size gives that many result calcs - //IE, size this so that sequential time to calc equals divide time - // find the value by experimenting -- but divide time and calc time scale - // same way, so this value should remain valid across hardware - //Divide time is about 800us on 2.4Ghz core2Quad laptop core - //num cells is the cube of a side, when have two square matrices -#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 100000 /* about 46x46 */ - - -//=========================================================================== -int inline -measureMatrixMultPrimitive( VirtProcr *animPr ); - -SlicingStrucCarrier * -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, - VirtProcr *animPr ); - -SlicingStruc * -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, - VirtProcr *animPr ); - -void -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ); - -SubMatrix ** -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, - Matrix *origMatrix, VirtProcr *animPr ); - -void -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, - SubMatrix **subMatrices, VirtProcr *animPr ); - -void -pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, - SubMatrix **rightSubMatrices, - int32 numRowIdxs, int32 numColIdxs, - int32 numVecIdxs, - float32 *resultArray, - VirtProcr *animatingPr ); - -void -makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, - SlicingStrucCarrier *slicingStrucCarrier, - float32 *resultArray, VirtProcr *animatingPr ); - -//=========================================================================== - -/*Divider creates one processor for every sub-matrix - * It hands them: - * the name of the result processor that they should send their results to, - * the left and right matrices, and the rows and cols they should multiply - * It first creates the result processor, then all the sub-matrixPair - * processors, - * then does a receive of a message from the result processor that gives - * the divider ownership of the result matrix. - * Finally, the divider returns the result matrix out of the VCilk system. - * - * Divider chooses the size of sub-matrices via an algorithm that tries to - * keep the minimum work above a threshold. The threshold is machine- - * dependent, so ask VCilk for min work-unit time to get a - * given overhead - * - * Divide min work-unit cycles by measured-cycles for one matrix-cell - * product -- gives the number of products need to have in min size - * matrix. - * - * So then, take cubed root of this to get the size of a side of min sub- - * matrix. That is the size of the ideal square sub-matrix -- so tile - * up the two input matrices into ones as close as possible to that size, - * and create the pairs of sub-matrices. - * - *======================== STRATEGIC OVERVIEW ======================= - * - *This division is a bit tricky, because have to create things in advance - * that it's not at first obvious need to be created.. - * - *First slice up each dimension -- three of them.. this is because will have - * to create the sub-matrix's data-structures before pairing the sub-matrices - * with each other -- so, have three dimensions to slice up before can - * create the sub-matrix data-strucs -- also, have to be certain that the - * cols of the left input have the exact same slicing as the rows of the - * left matrix, so just to be sure, do the slicing calc once, then use it - * for both. - * - *So, goes like this: - *1) calculate the start & end values of each dimension in each matrix. - *2) use those values to create sub-matrix structures - *3) combine sub-matrices into pairs, as the tasks to perform. - * - *Have to calculate separately from creating the sub-matrices because of the - * nature of the nesting -- would either end up creating the same sub-matrix - * multiple times, or else would have to put in detection of whether had - * made a particular one already if tried to combine steps 1 and 2. - * - *Step 3 has to be separate because of the nesting, as well -- same reason, - * would either create same sub-matrix multiple times, or else have to - * add detection of whether was already created. - * - *Another way to look at it: there's one level of loop to divide dimensions, - * two levels of nesting to create sub-matrices, and three levels to pair - * up the sub-matrices. - */ - -void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, - VirtProcr *animPr ) - { - DividerParams *dividerParams; - ResultsParams *resultsParams; - Matrix *leftMatrix, *rightMatrix, *resultMatrix; - void *msg; - SlicingStrucCarrier *slicingStrucCarrier; - float32 *resultArray; //points to array to be put inside result - // matrix - - DEBUG( dbgAppFlow, "start divide\n") - - int32 - divideProbe = VMS__create_single_interval_probe( "divideProbe", - animPr ); - VMS__record_sched_choice_into_probe( divideProbe, animPr ); - VMS__record_interval_start_in_probe( divideProbe ); - - //=========== Setup -- make local copies of ptd-to-things, malloc, aso - int32 numResRows, numResCols, vectLength; - - dividerParams = (DividerParams *)_dividerParams; - - leftMatrix = dividerParams->leftMatrix; - rightMatrix = dividerParams->rightMatrix; - - vectLength = leftMatrix->numCols; - numResRows = leftMatrix->numRows; - numResCols = rightMatrix->numCols; - resultArray = dividerParams->resultMatrix->array; - - //zero the result array - memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); - - - //============== Do either sequential mult or do division ============== - - //Check if input matrices too small -- if yes, just do sequential - //Cutoff is determined by overhead of this divider -- relatively - // machine-independent - if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * - (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) - { int32 vectLength; - - //====== Do sequential multiply on a single core - DEBUG( dbgAppFlow, "doing sequential") - - //transpose the right matrix - float32 * - transRightArray = VCilk__malloc( rightMatrix->numRows * - rightMatrix->numCols * - sizeof(float32), animPr ); - - //copy values from orig matrix to local - copyTranspose( rightMatrix->numRows, rightMatrix->numCols, - 0, 0, rightMatrix->numRows, - transRightArray, rightMatrix->array ); - - multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, - leftMatrix->array, transRightArray, - resultArray ); - } - else - { - //====== Do parallel multiply across cores - - //Calc the ideal size of sub-matrix and slice up the dimensions of - // the two matrices. - //The ideal size is the one takes the number of cycles to calculate - // such that calc time is equal or greater than min work-unit size - slicingStrucCarrier = - calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr); - - - - //Make the sub-matrices, and pair them up, then spawn processors to - // calc product of each pair. - makeSubMatricesAndSpawnAndSync( leftMatrix, rightMatrix, - slicingStrucCarrier, - resultArray, animPr); - //The result array will get filled in by the spawned children - } - - - //=============== Work done -- send results back ================= - - - //results have been saved into an array that was made outside the VMS - // system, by entry-point Fn, and passed in through dividerParams. - //So, nothing to do to send results back -- they're seen by side-effect - - DEBUG( dbgAppFlow, "*** end divide ***\n") - - VMS__record_interval_end_in_probe( divideProbe ); - VMS__print_stats_of_all_probes(); - - VCilk__dissipate_procr( animPr ); //all procrs dissipate self at end - //when all of the processors have dissipated, the "create seed and do - // work" call in the entry point function returns - } - - -SlicingStrucCarrier * -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, - VirtProcr *animPr ) -{ - float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; - SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; - SlicingStrucCarrier *slicingStrucCarrier = - VCilk__malloc(sizeof(SlicingStrucCarrier), animPr ); - - int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; - float64 numPrimitiveOpsInMinWorkUnit; - - - //======= Calc ideal size of min-sized sub-matrix ======== - - //ask VCilk for the number of cycles of the minimum work unit, at given - // percent overhead then add a guess at overhead from this divider - minWorkUnitCycles = VCilk__giveMinWorkUnitCycles( .05 ); - - //ask VCilk for number of cycles of the "primitive" op of matrix mult - primitiveCycles = measureMatrixMultPrimitive( animPr ); - - numPrimitiveOpsInMinWorkUnit = - (float64)minWorkUnitCycles / (float64)primitiveCycles; - - //take cubed root -- that's number of these in a "side" of sub-matrix - // then multiply by 5 because the primitive is 5x5 - idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); - - idealNumWorkUnits = VCilk__giveIdealNumWorkUnits(); - - idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); - idealSizeOfSide2 *= 0.8; //finer granularity to help load balance - - if( idealSizeOfSide1 > idealSizeOfSide2 ) - idealSizeOfSide = idealSizeOfSide1; - else - idealSizeOfSide = idealSizeOfSide2; - - //The multiply inner loop blocks the array to fit into L1 cache -// if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK; - - //============ Slice up dimensions, now that know target size =========== - - //Tell the slicer the target size of a side (floating pt), the start - // value to start slicing at, and the end value to stop slicing at - //It returns an array of start value of each chunk, plus number of them - int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol; - startLeftRow = 0; - endLeftRow = leftMatrix->numRows -1; - startVec = 0; - endVec = leftMatrix->numCols -1; - startRightCol = 0; - endRightCol = rightMatrix->numCols -1; - - leftRowSlices = - sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr ); - - vecSlices = - sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr ); - - rightColSlices = - sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr); - - slicingStrucCarrier->leftRowSlices = leftRowSlices; - slicingStrucCarrier->vecSlices = vecSlices; - slicingStrucCarrier->rightColSlices = rightColSlices; - - DEBUG1( dbgAppFlow, "leftRowSlices %d | ", leftRowSlices->numVals ); - DEBUG1( dbgAppFlow, "rightColSlices %d | ",rightColSlices->numVals); - DEBUG1( dbgAppFlow, "vecSlices %d\n", vecSlices->numVals ); - return slicingStrucCarrier; -} - - -void inline -makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, - SlicingStrucCarrier *slicingStrucCarrier, - float32 *resultArray, VirtProcr *animPr ) - { - SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; - - leftRowSlices = slicingStrucCarrier->leftRowSlices; - vecSlices = slicingStrucCarrier->vecSlices; - rightColSlices = slicingStrucCarrier->rightColSlices; - VCilk__free( slicingStrucCarrier, animPr ); - - //================ Make sub-matrices, given the slicing ================ - SubMatrix **leftSubMatrices, **rightSubMatrices; - leftSubMatrices = - createSubMatrices( leftRowSlices, vecSlices, - leftMatrix, animPr ); - rightSubMatrices = - createSubMatrices( vecSlices, rightColSlices, - rightMatrix, animPr ); - - freeSlicingStruc( leftRowSlices, animPr ); - freeSlicingStruc( vecSlices, animPr ); - freeSlicingStruc( rightColSlices, animPr ); - - //============== pair the sub-matrices and make processors ============== - int32 numRowIdxs, numColIdxs, numVecIdxs; - - numRowIdxs = leftRowSlices->numVals; - numColIdxs = rightColSlices->numVals; - numVecIdxs = vecSlices->numVals; - pairUpSubMatricesAndSpawnAndSync( leftSubMatrices, rightSubMatrices, - numRowIdxs, numColIdxs, numVecIdxs, - resultArray, - animPr ); - //It syncs inside, so know all work is done now: free the sub-matrices - freeSubMatrices( leftRowSlices, vecSlices, leftSubMatrices, animPr ); - freeSubMatrices( vecSlices, rightColSlices, rightSubMatrices, animPr ); - } - - - - -/* numRows*colsPerRow/numCores = numToPutOntoEachCore; - * put all from a given row onto same core, until exhaust allotment for that - * core - * - */ -void inline -pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, - SubMatrix **rightSubMatrices, - int32 numRowIdxs, int32 numColIdxs, - int32 numVecIdxs, - float32 *resultArray, - VirtProcr *animatingPr ) - { - int32 resRowIdx, resColIdx; - int32 numLeftColIdxs, numRightColIdxs; - int32 leftRowIdxOffset; - VecParams *vecParams; - float32 numToPutOntoEachCore, leftOverFraction; - int32 numCores, currCore, numOnCurrCore; - - numLeftColIdxs = numColIdxs; - numRightColIdxs = numVecIdxs; - - numCores = VCilk__give_number_of_cores_to_spawn_onto(); - - numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; - leftOverFraction = 0; - numOnCurrCore = 0; - currCore = 0; - - for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) - { - leftRowIdxOffset = resRowIdx * numLeftColIdxs; - - for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) - { - vecParams = VCilk__malloc( sizeof(VecParams), animatingPr ); - - vecParams->numVecIdxs = numVecIdxs; - vecParams->numRightColIdxs = numRightColIdxs; - vecParams->leftRowIdxOffset = leftRowIdxOffset; - vecParams->resColIdx = resColIdx; - vecParams->leftSubMatrices = leftSubMatrices; - vecParams->rightSubMatrices = rightSubMatrices; - vecParams->resultArray = resultArray; - vecParams->coreToRunOn = currCore; - - VCilk__spawn( currCore, &calcVectorOfSubMatrices, vecParams, - animatingPr ); - - numOnCurrCore += 1; - if( numOnCurrCore + leftOverFraction >= numToPutOntoEachCore - 1 ) - { - //deal with fractional part, to ensure that imbalance is 1 max - // IE, core with most has only 1 more than core with least - leftOverFraction += numToPutOntoEachCore - numOnCurrCore; - if( leftOverFraction >= 1 ) - { leftOverFraction -= 1; - numOnCurrCore = -1; - } - else - { numOnCurrCore = 0; - } - //Move to next core, max core-value to incr to is numCores -1 - if( currCore >= numCores -1 ) - { currCore = 0; - } - else - { currCore += 1; - } - } - } - } - - //Free Note: vector of sub-matrices does its own free-ing, even vec-params - -//TODO: timeToSpawnProbe = VMS__get_probe_by_name( "timeToSpawnProbe" ); -// VMS__end_interval_on_probe( timeToSpawnProbe ); - - VCilk__sync( animatingPr ); - - //free the sub-matrices in Fn that called this one - } - - -/*Walk through the two slice-strucs, making sub-matrix strucs as go - */ -SubMatrix ** -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, - Matrix *origMatrix, VirtProcr *animPr ) - { - int32 numRowIdxs, numColIdxs, rowIdx, colIdx; - int32 startRow, endRow, startCol, endCol; - int32 *rowStartVals, *colStartVals; - int32 rowOffset; - SubMatrix **subMatrices, *newSubMatrix; - - numRowIdxs = rowSlices->numVals; - numColIdxs = colSlices->numVals; - - rowStartVals = rowSlices->startVals; - colStartVals = colSlices->startVals; - - subMatrices = VCilk__malloc( numRowIdxs * numColIdxs *sizeof(SubMatrix *), - animPr ); - - for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) - { - rowOffset = rowIdx * numColIdxs; - - startRow = rowStartVals[rowIdx]; - endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is - // at last valid idx + 1 & is - // 1 greater than end value - for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) - { - startCol = colStartVals[colIdx]; - endCol = colStartVals[colIdx + 1] -1; - - newSubMatrix = VCilk__malloc( sizeof(SubMatrix), animPr ); - newSubMatrix->numRows = endRow - startRow +1; - newSubMatrix->numCols = endCol - startCol +1; - newSubMatrix->origMatrix = origMatrix; - newSubMatrix->origStartRow = startRow; - newSubMatrix->origStartCol = startCol; - newSubMatrix->alreadyCopied = FALSE; - - subMatrices[ rowOffset + colIdx ] = newSubMatrix; - } - } - return subMatrices; - } - -void -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, - SubMatrix **subMatrices, VirtProcr *animPr ) - { - int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; - SubMatrix *subMatrix; - - numRowIdxs = rowSlices->numVals; - numColIdxs = colSlices->numVals; - - for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) - { - rowOffset = rowIdx * numColIdxs; - for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) - { - subMatrix = subMatrices[ rowOffset + colIdx ]; - if( subMatrix->alreadyCopied ) - VCilk__free( subMatrix->array, animPr ); - VCilk__free( subMatrix, animPr ); - } - } - VCilk__free( subMatrices, animPr ); - } - - - -SlicingStruc * -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, - VirtProcr *animPr ) - { float32 residualAcc = 0; - int numSlices, i, *startVals, sizeOfSlice, endCondition; - SlicingStruc *slicingStruc = VCilk__malloc( sizeof(SlicingStruc), animPr); - - //calc size of matrix need to hold start vals -- - numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); - - startVals = VCilk__malloc( (numSlices + 1) * sizeof(int32), animPr ); - - //Calc the upper limit of start value -- when get above this, end loop - // by saving highest value of the matrix dimension to access, plus 1 - // as the start point of the imaginary slice following the last one - //Plus 1 because go up to value but not include when process last slice - //The stopping condition is half-a-size less than highest value because - // don't want any pieces smaller than half the ideal size -- just tack - // little ones onto end of last one - endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size - for( i = 0; startVal <= endVal; i++ ) - { - startVals[i] = startVal; - residualAcc += idealSizeOfSide; - sizeOfSlice = (int)residualAcc; - residualAcc -= (float32)sizeOfSlice; - startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. - - if( startVal > endCondition ) - { startVal = endVal + 1; - startVals[ i + 1 ] = startVal; - } - } - - slicingStruc->startVals = startVals; - slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 - // which means is num sub-matrices in dim - // also == idx of the fake start just above - return slicingStruc; - } - -void -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ) - { - VCilk__free( slicingStruc->startVals, animPr ); - VCilk__free( slicingStruc, animPr ); - } - - -int inline -measureMatrixMultPrimitive( VirtProcr *animPr ) - { - int r, c, v, numCycles; - float32 *res, *left, *right; - - //setup inputs - left = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); - right = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); - res = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); - - for( r = 0; r < 5; r++ ) - { - for( c = 0; c < 5; c++ ) - { - left[ r * 5 + c ] = r; - right[ r * 5 + c ] = c; - } - } - - //do primitive - VCilk__start_primitive(); //for now, just takes time stamp - for( r = 0; r < 5; r++ ) - { - for( c = 0; c < 5; c++ ) - { - for( v = 0; v < 5; v++ ) - { - res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; - } - } - } - numCycles = - VCilk__end_primitive_and_give_cycles(); - - VCilk__free( left, animPr ); - VCilk__free( right, animPr ); - VCilk__free( res, animPr ); - - return numCycles; - } +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + + +#include "VCilk__Matrix_Mult.h" +#include +#include +#include + + //The time to compute this many result values should equal the time to + // perform this division on a matrix of size gives that many result calcs + //IE, size this so that sequential time to calc equals divide time + // find the value by experimenting -- but divide time and calc time scale + // same way, so this value should remain valid across hardware + //Divide time is about 800us on 2.4Ghz core2Quad laptop core + //num cells is the cube of a side, when have two square matrices +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 100000 /* about 46x46 */ + + +//=========================================================================== +int inline +measureMatrixMultPrimitive( VirtProcr *animPr ); + +SlicingStrucCarrier * +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, + VirtProcr *animPr ); + +SlicingStruc * +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, + VirtProcr *animPr ); + +void +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ); + +SubMatrix ** +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + Matrix *origMatrix, VirtProcr *animPr ); + +void +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + SubMatrix **subMatrices, VirtProcr *animPr ); + +void +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, + SubMatrix **rightSubMatrices, + int32 numRowIdxs, int32 numColIdxs, + int32 numVecIdxs, + float32 *resultArray, + VirtProcr *animatingPr ); + +void +makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, + SlicingStrucCarrier *slicingStrucCarrier, + float32 *resultArray, VirtProcr *animatingPr ); + +//=========================================================================== + +/*Divider creates one processor for every sub-matrix + * It hands them: + * the name of the result processor that they should send their results to, + * the left and right matrices, and the rows and cols they should multiply + * It first creates the result processor, then all the sub-matrixPair + * processors, + * then does a receive of a message from the result processor that gives + * the divider ownership of the result matrix. + * Finally, the divider returns the result matrix out of the VCilk system. + * + * Divider chooses the size of sub-matrices via an algorithm that tries to + * keep the minimum work above a threshold. The threshold is machine- + * dependent, so ask VCilk for min work-unit time to get a + * given overhead + * + * Divide min work-unit cycles by measured-cycles for one matrix-cell + * product -- gives the number of products need to have in min size + * matrix. + * + * So then, take cubed root of this to get the size of a side of min sub- + * matrix. That is the size of the ideal square sub-matrix -- so tile + * up the two input matrices into ones as close as possible to that size, + * and create the pairs of sub-matrices. + * + *======================== STRATEGIC OVERVIEW ======================= + * + *This division is a bit tricky, because have to create things in advance + * that it's not at first obvious need to be created.. + * + *First slice up each dimension -- three of them.. this is because will have + * to create the sub-matrix's data-structures before pairing the sub-matrices + * with each other -- so, have three dimensions to slice up before can + * create the sub-matrix data-strucs -- also, have to be certain that the + * cols of the left input have the exact same slicing as the rows of the + * left matrix, so just to be sure, do the slicing calc once, then use it + * for both. + * + *So, goes like this: + *1) calculate the start & end values of each dimension in each matrix. + *2) use those values to create sub-matrix structures + *3) combine sub-matrices into pairs, as the tasks to perform. + * + *Have to calculate separately from creating the sub-matrices because of the + * nature of the nesting -- would either end up creating the same sub-matrix + * multiple times, or else would have to put in detection of whether had + * made a particular one already if tried to combine steps 1 and 2. + * + *Step 3 has to be separate because of the nesting, as well -- same reason, + * would either create same sub-matrix multiple times, or else have to + * add detection of whether was already created. + * + *Another way to look at it: there's one level of loop to divide dimensions, + * two levels of nesting to create sub-matrices, and three levels to pair + * up the sub-matrices. + */ + +void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, + VirtProcr *animPr ) + { + DividerParams *dividerParams; + ResultsParams *resultsParams; + Matrix *leftMatrix, *rightMatrix, *resultMatrix; + void *msg; + SlicingStrucCarrier *slicingStrucCarrier; + float32 *resultArray; //points to array to be put inside result + // matrix + + DEBUG( dbgAppFlow, "start divide\n") + + int32 + divideProbe = VMS__create_single_interval_probe( "divideProbe", + animPr ); + VMS__record_sched_choice_into_probe( divideProbe, animPr ); + VMS__record_interval_start_in_probe( divideProbe ); + + //=========== Setup -- make local copies of ptd-to-things, malloc, aso + int32 numResRows, numResCols, vectLength; + + dividerParams = (DividerParams *)_dividerParams; + + leftMatrix = dividerParams->leftMatrix; + rightMatrix = dividerParams->rightMatrix; + + vectLength = leftMatrix->numCols; + numResRows = leftMatrix->numRows; + numResCols = rightMatrix->numCols; + resultArray = dividerParams->resultMatrix->array; + + //zero the result array + memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); + + + //============== Do either sequential mult or do division ============== + + //Check if input matrices too small -- if yes, just do sequential + //Cutoff is determined by overhead of this divider -- relatively + // machine-independent + if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * + (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) + { int32 vectLength; + + //====== Do sequential multiply on a single core + DEBUG( dbgAppFlow, "doing sequential") + + //transpose the right matrix + float32 * + transRightArray = VCilk__malloc( rightMatrix->numRows * + rightMatrix->numCols * + sizeof(float32), animPr ); + + //copy values from orig matrix to local + copyTranspose( rightMatrix->numRows, rightMatrix->numCols, + 0, 0, rightMatrix->numRows, + transRightArray, rightMatrix->array ); + + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, + leftMatrix->array, transRightArray, + resultArray ); + } + else + { + //====== Do parallel multiply across cores + + //Calc the ideal size of sub-matrix and slice up the dimensions of + // the two matrices. + //The ideal size is the one takes the number of cycles to calculate + // such that calc time is equal or greater than min work-unit size + slicingStrucCarrier = + calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr); + + + + //Make the sub-matrices, and pair them up, then spawn processors to + // calc product of each pair. + makeSubMatricesAndSpawnAndSync( leftMatrix, rightMatrix, + slicingStrucCarrier, + resultArray, animPr); + //The result array will get filled in by the spawned children + } + + + //=============== Work done -- send results back ================= + + + //results have been saved into an array that was made outside the VMS + // system, by entry-point Fn, and passed in through dividerParams. + //So, nothing to do to send results back -- they're seen by side-effect + + DEBUG( dbgAppFlow, "*** end divide ***\n") + + VMS__record_interval_end_in_probe( divideProbe ); + VMS__print_stats_of_all_probes(); + + VCilk__dissipate_procr( animPr ); //all procrs dissipate self at end + //when all of the processors have dissipated, the "create seed and do + // work" call in the entry point function returns + } + + +SlicingStrucCarrier * +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, + VirtProcr *animPr ) +{ + float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; + SlicingStrucCarrier *slicingStrucCarrier = + VCilk__malloc(sizeof(SlicingStrucCarrier), animPr ); + + int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; + float64 numPrimitiveOpsInMinWorkUnit; + + + //======= Calc ideal size of min-sized sub-matrix ======== + + //ask VCilk for the number of cycles of the minimum work unit, at given + // percent overhead then add a guess at overhead from this divider + minWorkUnitCycles = VCilk__giveMinWorkUnitCycles( .05 ); + + //ask VCilk for number of cycles of the "primitive" op of matrix mult + primitiveCycles = measureMatrixMultPrimitive( animPr ); + + numPrimitiveOpsInMinWorkUnit = + (float64)minWorkUnitCycles / (float64)primitiveCycles; + + //take cubed root -- that's number of these in a "side" of sub-matrix + // then multiply by 5 because the primitive is 5x5 + idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); + + idealNumWorkUnits = VCilk__giveIdealNumWorkUnits(); + + idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); + idealSizeOfSide2 *= 0.8; //finer granularity to help load balance + + if( idealSizeOfSide1 > idealSizeOfSide2 ) + idealSizeOfSide = idealSizeOfSide1; + else + idealSizeOfSide = idealSizeOfSide2; + + //The multiply inner loop blocks the array to fit into L1 cache +// if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK; + + //============ Slice up dimensions, now that know target size =========== + + //Tell the slicer the target size of a side (floating pt), the start + // value to start slicing at, and the end value to stop slicing at + //It returns an array of start value of each chunk, plus number of them + int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol; + startLeftRow = 0; + endLeftRow = leftMatrix->numRows -1; + startVec = 0; + endVec = leftMatrix->numCols -1; + startRightCol = 0; + endRightCol = rightMatrix->numCols -1; + + leftRowSlices = + sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr ); + + vecSlices = + sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr ); + + rightColSlices = + sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr); + + slicingStrucCarrier->leftRowSlices = leftRowSlices; + slicingStrucCarrier->vecSlices = vecSlices; + slicingStrucCarrier->rightColSlices = rightColSlices; + + DEBUG1( dbgAppFlow, "leftRowSlices %d | ", leftRowSlices->numVals ); + DEBUG1( dbgAppFlow, "rightColSlices %d | ",rightColSlices->numVals); + DEBUG1( dbgAppFlow, "vecSlices %d\n", vecSlices->numVals ); + return slicingStrucCarrier; +} + + +void inline +makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix, + SlicingStrucCarrier *slicingStrucCarrier, + float32 *resultArray, VirtProcr *animPr ) + { + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; + + leftRowSlices = slicingStrucCarrier->leftRowSlices; + vecSlices = slicingStrucCarrier->vecSlices; + rightColSlices = slicingStrucCarrier->rightColSlices; + VCilk__free( slicingStrucCarrier, animPr ); + + //================ Make sub-matrices, given the slicing ================ + SubMatrix **leftSubMatrices, **rightSubMatrices; + leftSubMatrices = + createSubMatrices( leftRowSlices, vecSlices, + leftMatrix, animPr ); + rightSubMatrices = + createSubMatrices( vecSlices, rightColSlices, + rightMatrix, animPr ); + + freeSlicingStruc( leftRowSlices, animPr ); + freeSlicingStruc( vecSlices, animPr ); + freeSlicingStruc( rightColSlices, animPr ); + + //============== pair the sub-matrices and make processors ============== + int32 numRowIdxs, numColIdxs, numVecIdxs; + + numRowIdxs = leftRowSlices->numVals; + numColIdxs = rightColSlices->numVals; + numVecIdxs = vecSlices->numVals; + pairUpSubMatricesAndSpawnAndSync( leftSubMatrices, rightSubMatrices, + numRowIdxs, numColIdxs, numVecIdxs, + resultArray, + animPr ); + //It syncs inside, so know all work is done now: free the sub-matrices + freeSubMatrices( leftRowSlices, vecSlices, leftSubMatrices, animPr ); + freeSubMatrices( vecSlices, rightColSlices, rightSubMatrices, animPr ); + } + + + + +/* numRows*colsPerRow/numCores = numToPutOntoEachCore; + * put all from a given row onto same core, until exhaust allotment for that + * core + * + */ +void inline +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices, + SubMatrix **rightSubMatrices, + int32 numRowIdxs, int32 numColIdxs, + int32 numVecIdxs, + float32 *resultArray, + VirtProcr *animatingPr ) + { + int32 resRowIdx, resColIdx; + int32 numLeftColIdxs, numRightColIdxs; + int32 leftRowIdxOffset; + VecParams *vecParams; + float32 numToPutOntoEachCore, leftOverFraction; + int32 numCores, currCore, numOnCurrCore; + + numLeftColIdxs = numColIdxs; + numRightColIdxs = numVecIdxs; + + numCores = VCilk__give_number_of_cores_to_spawn_onto(); + + numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; + leftOverFraction = 0; + numOnCurrCore = 0; + currCore = 0; + + for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) + { + leftRowIdxOffset = resRowIdx * numLeftColIdxs; + + for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) + { + vecParams = VCilk__malloc( sizeof(VecParams), animatingPr ); + + vecParams->numVecIdxs = numVecIdxs; + vecParams->numRightColIdxs = numRightColIdxs; + vecParams->leftRowIdxOffset = leftRowIdxOffset; + vecParams->resColIdx = resColIdx; + vecParams->leftSubMatrices = leftSubMatrices; + vecParams->rightSubMatrices = rightSubMatrices; + vecParams->resultArray = resultArray; + vecParams->coreToRunOn = currCore; + + VCilk__spawn( currCore, &calcVectorOfSubMatrices, vecParams, + animatingPr ); + + numOnCurrCore += 1; + if( numOnCurrCore + leftOverFraction >= numToPutOntoEachCore - 1 ) + { + //deal with fractional part, to ensure that imbalance is 1 max + // IE, core with most has only 1 more than core with least + leftOverFraction += numToPutOntoEachCore - numOnCurrCore; + if( leftOverFraction >= 1 ) + { leftOverFraction -= 1; + numOnCurrCore = -1; + } + else + { numOnCurrCore = 0; + } + //Move to next core, max core-value to incr to is numCores -1 + if( currCore >= numCores -1 ) + { currCore = 0; + } + else + { currCore += 1; + } + } + } + } + + //Free Note: vector of sub-matrices does its own free-ing, even vec-params + +//TODO: timeToSpawnProbe = VMS__get_probe_by_name( "timeToSpawnProbe" ); +// VMS__end_interval_on_probe( timeToSpawnProbe ); + + VCilk__sync( animatingPr ); + + //free the sub-matrices in Fn that called this one + } + + +/*Walk through the two slice-strucs, making sub-matrix strucs as go + */ +SubMatrix ** +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + Matrix *origMatrix, VirtProcr *animPr ) + { + int32 numRowIdxs, numColIdxs, rowIdx, colIdx; + int32 startRow, endRow, startCol, endCol; + int32 *rowStartVals, *colStartVals; + int32 rowOffset; + SubMatrix **subMatrices, *newSubMatrix; + + numRowIdxs = rowSlices->numVals; + numColIdxs = colSlices->numVals; + + rowStartVals = rowSlices->startVals; + colStartVals = colSlices->startVals; + + subMatrices = VCilk__malloc( numRowIdxs * numColIdxs *sizeof(SubMatrix *), + animPr ); + + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) + { + rowOffset = rowIdx * numColIdxs; + + startRow = rowStartVals[rowIdx]; + endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is + // at last valid idx + 1 & is + // 1 greater than end value + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) + { + startCol = colStartVals[colIdx]; + endCol = colStartVals[colIdx + 1] -1; + + newSubMatrix = VCilk__malloc( sizeof(SubMatrix), animPr ); + newSubMatrix->numRows = endRow - startRow +1; + newSubMatrix->numCols = endCol - startCol +1; + newSubMatrix->origMatrix = origMatrix; + newSubMatrix->origStartRow = startRow; + newSubMatrix->origStartCol = startCol; + newSubMatrix->alreadyCopied = FALSE; + + subMatrices[ rowOffset + colIdx ] = newSubMatrix; + } + } + return subMatrices; + } + +void +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, + SubMatrix **subMatrices, VirtProcr *animPr ) + { + int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; + SubMatrix *subMatrix; + + numRowIdxs = rowSlices->numVals; + numColIdxs = colSlices->numVals; + + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) + { + rowOffset = rowIdx * numColIdxs; + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) + { + subMatrix = subMatrices[ rowOffset + colIdx ]; + if( subMatrix->alreadyCopied ) + VCilk__free( subMatrix->array, animPr ); + VCilk__free( subMatrix, animPr ); + } + } + VCilk__free( subMatrices, animPr ); + } + + + +SlicingStruc * +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, + VirtProcr *animPr ) + { float32 residualAcc = 0; + int numSlices, i, *startVals, sizeOfSlice, endCondition; + SlicingStruc *slicingStruc = VCilk__malloc( sizeof(SlicingStruc), animPr); + + //calc size of matrix need to hold start vals -- + numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); + + startVals = VCilk__malloc( (numSlices + 1) * sizeof(int32), animPr ); + + //Calc the upper limit of start value -- when get above this, end loop + // by saving highest value of the matrix dimension to access, plus 1 + // as the start point of the imaginary slice following the last one + //Plus 1 because go up to value but not include when process last slice + //The stopping condition is half-a-size less than highest value because + // don't want any pieces smaller than half the ideal size -- just tack + // little ones onto end of last one + endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size + for( i = 0; startVal <= endVal; i++ ) + { + startVals[i] = startVal; + residualAcc += idealSizeOfSide; + sizeOfSlice = (int)residualAcc; + residualAcc -= (float32)sizeOfSlice; + startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. + + if( startVal > endCondition ) + { startVal = endVal + 1; + startVals[ i + 1 ] = startVal; + } + } + + slicingStruc->startVals = startVals; + slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 + // which means is num sub-matrices in dim + // also == idx of the fake start just above + return slicingStruc; + } + +void +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ) + { + VCilk__free( slicingStruc->startVals, animPr ); + VCilk__free( slicingStruc, animPr ); + } + + +int inline +measureMatrixMultPrimitive( VirtProcr *animPr ) + { + int r, c, v, numCycles; + float32 *res, *left, *right; + + //setup inputs + left = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); + right = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); + res = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr ); + + for( r = 0; r < 5; r++ ) + { + for( c = 0; c < 5; c++ ) + { + left[ r * 5 + c ] = r; + right[ r * 5 + c ] = c; + } + } + + //do primitive + VCilk__start_primitive(); //for now, just takes time stamp + for( r = 0; r < 5; r++ ) + { + for( c = 0; c < 5; c++ ) + { + for( v = 0; v < 5; v++ ) + { + res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; + } + } + } + numCycles = + VCilk__end_primitive_and_give_cycles(); + + VCilk__free( left, animPr ); + VCilk__free( right, animPr ); + VCilk__free( res, animPr ); + + return numCycles; + } diff -r 12bcb9728e1c -r ecba4ae0be7a src/Application/VCilk__Matrix_Mult/EntryPoint.c --- a/src/Application/VCilk__Matrix_Mult/EntryPoint.c Mon Nov 15 12:08:19 2010 -0800 +++ b/src/Application/VCilk__Matrix_Mult/EntryPoint.c Wed May 11 15:40:54 2011 +0200 @@ -1,58 +1,58 @@ -/* - * Copyright 2009 OpenSourceStewardshipFoundation.org - * Licensed under GNU General Public License version 2 - * - * Author: seanhalle@yahoo.com - * - */ - -#include - -#include "VCilk__Matrix_Mult.h" - - - -/*Every VCilk system has an "entry point" function that creates the first - * processor, which starts the chain of creating more processors.. - * eventually all of the processors will dissipate themselves, and - * return. - * - *This entry-point function follows the same pattern as all entry-point - * functions do: - *1) it creates the params for the seed processor, from the - * parameters passed into the entry-point function - *2) it calls VCilk__create_seed_procr_and_do_work - *3) it gets the return value from the params struc, frees the params struc, - * and returns the value from the function - * - */ -Matrix * -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) - { Matrix *resMatrix; - DividerParams *dividerParams; - int32 numResRows, numResCols; - - - dividerParams = malloc( sizeof( DividerParams ) ); - dividerParams->leftMatrix = leftMatrix; - dividerParams->rightMatrix = rightMatrix; - - numResRows = leftMatrix->numRows; - numResCols = rightMatrix->numCols; - resMatrix = malloc( sizeof(Matrix) ); - resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); - resMatrix->numCols = rightMatrix->numCols; - resMatrix->numRows = leftMatrix->numRows; - - - dividerParams->resultMatrix = resMatrix; - - //create divider processor, start doing the work, and wait till done - //This function is the "border crossing" between normal code and VCilk - VCilk__create_seed_procr_and_do_work( ÷WorkIntoSubMatrixPairProcrs, - dividerParams ); - - //get result matrix and return it - free( dividerParams ); - return resMatrix; - } +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#include + +#include "VCilk__Matrix_Mult.h" + + + +/*Every VCilk system has an "entry point" function that creates the first + * processor, which starts the chain of creating more processors.. + * eventually all of the processors will dissipate themselves, and + * return. + * + *This entry-point function follows the same pattern as all entry-point + * functions do: + *1) it creates the params for the seed processor, from the + * parameters passed into the entry-point function + *2) it calls VCilk__create_seed_procr_and_do_work + *3) it gets the return value from the params struc, frees the params struc, + * and returns the value from the function + * + */ +Matrix * +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) + { Matrix *resMatrix; + DividerParams *dividerParams; + int32 numResRows, numResCols; + + + dividerParams = malloc( sizeof( DividerParams ) ); + dividerParams->leftMatrix = leftMatrix; + dividerParams->rightMatrix = rightMatrix; + + numResRows = leftMatrix->numRows; + numResCols = rightMatrix->numCols; + resMatrix = malloc( sizeof(Matrix) ); + resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); + resMatrix->numCols = rightMatrix->numCols; + resMatrix->numRows = leftMatrix->numRows; + + + dividerParams->resultMatrix = resMatrix; + + //create divider processor, start doing the work, and wait till done + //This function is the "border crossing" between normal code and VCilk + VCilk__create_seed_procr_and_do_work( ÷WorkIntoSubMatrixPairProcrs, + dividerParams ); + + //get result matrix and return it + free( dividerParams ); + return resMatrix; + } diff -r 12bcb9728e1c -r ecba4ae0be7a src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h --- a/src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h Mon Nov 15 12:08:19 2010 -0800 +++ b/src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h Wed May 11 15:40:54 2011 +0200 @@ -1,104 +1,106 @@ -/* - * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org - * Licensed under GNU General Public License version 2 - */ - -#ifndef _VCilk_MATRIX_MULT_H_ -#define _VCilk_MATRIX_MULT_H_ - -#include - -#include "../../VCilk_lib/VCilk.h" -#include "../Matrix_Mult.h" -#include "../../VCilk_lib/VMS/VMS.h" - - -//=============================== Defines ============================== -#define ROWS_IN_BLOCK 32 -#define COLS_IN_BLOCK 32 -#define VEC_IN_BLOCK 32 - -#define copyMatrixSingleton 1 -#define copyTransposeSingleton 2 - -//============================== Structures ============================== -typedef struct - { - Matrix *leftMatrix; - Matrix *rightMatrix; - Matrix *resultMatrix; - - TSCount numTSCsToExe; - } -DividerParams; - -typedef struct - { - VirtProcr *dividerPr; - int numRows; - int numCols; - int numSubMatrixPairs; - } -ResultsParams; - -typedef -struct - { int32 numRows; - int32 numCols; - Matrix *origMatrix; - int32 origStartRow; - int32 origStartCol; - int32 alreadyCopied; - float32 *array; //2D, but dynamically sized, so use addr arith - } -SubMatrix; - -typedef struct - { VirtProcr *resultPr; - SubMatrix *leftSubMatrix; - SubMatrix *rightSubMatrix; - float32 *partialResultArray; - } -SMPairParams; - -typedef -struct - { int32 numVals; - int32 *startVals; - } -SlicingStruc; - -typedef -struct - { - SlicingStruc *leftRowSlices; - SlicingStruc *vecSlices; - SlicingStruc *rightColSlices; - } -SlicingStrucCarrier; - -typedef struct - { - int32 numVecIdxs; - int32 numRightColIdxs; - int32 leftRowIdxOffset; - int32 resColIdx; - SubMatrix **leftSubMatrices; - SubMatrix **rightSubMatrices; - float32 *resultArray; - int32 coreToRunOn; - } -VecParams; - -//============================= Processor Functions ========================= -void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr ); -void calcSubMatrixProduct( void *data, VirtProcr *animatingPr ); -void calcVectorOfSubMatrices( void *_vecParams, VirtProcr *animatingPr ); - - -//================================ Entry Point ============================== -Matrix * -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); - - -#endif /*_VCilk_MATRIX_MULT_H_*/ +/* + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + */ + +#ifndef _VCilk_MATRIX_MULT_H_ +#define _VCilk_MATRIX_MULT_H_ + +#include + +#include "../../VCilk_lib/VCilk.h" +#include "../Matrix_Mult.h" +#include "../../VCilk_lib/VMS/VMS.h" + + +//=============================== Defines ============================== +#define ROWS_IN_BLOCK 32 +#define COLS_IN_BLOCK 32 +#define VEC_IN_BLOCK 32 + +#define copyMatrixSingleton 1 +#define copyTransposeSingleton 2 + +//============================== Structures ============================== +typedef struct + { + Matrix *leftMatrix; + Matrix *rightMatrix; + Matrix *resultMatrix; + + TSCount numTSCsToExe; + } +DividerParams; + +typedef struct + { + VirtProcr *dividerPr; + int numRows; + int numCols; + int numSubMatrixPairs; + } +ResultsParams; + +typedef +struct + { int32 numRows; + int32 numCols; + Matrix *origMatrix; + int32 origStartRow; + int32 origStartCol; + int32 alreadyCopied; + VCilkSingleton *copySingleton; + VCilkSingleton *copyTransSingleton; + float32 *array; //2D, but dynamically sized, so use addr arith + } +SubMatrix; + +typedef struct + { VirtProcr *resultPr; + SubMatrix *leftSubMatrix; + SubMatrix *rightSubMatrix; + float32 *partialResultArray; + } +SMPairParams; + +typedef +struct + { int32 numVals; + int32 *startVals; + } +SlicingStruc; + +typedef +struct + { + SlicingStruc *leftRowSlices; + SlicingStruc *vecSlices; + SlicingStruc *rightColSlices; + } +SlicingStrucCarrier; + +typedef struct + { + int32 numVecIdxs; + int32 numRightColIdxs; + int32 leftRowIdxOffset; + int32 resColIdx; + SubMatrix **leftSubMatrices; + SubMatrix **rightSubMatrices; + float32 *resultArray; + int32 coreToRunOn; + } +VecParams; + +//============================= Processor Functions ========================= +void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr ); +void calcSubMatrixProduct( void *data, VirtProcr *animatingPr ); +void calcVectorOfSubMatrices( void *_vecParams, VirtProcr *animatingPr ); + + +//================================ Entry Point ============================== +Matrix * +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); + + +#endif /*_VCilk_MATRIX_MULT_H_*/ diff -r 12bcb9728e1c -r ecba4ae0be7a src/Application/VCilk__Matrix_Mult/Vector_Pr.c --- a/src/Application/VCilk__Matrix_Mult/Vector_Pr.c Mon Nov 15 12:08:19 2010 -0800 +++ b/src/Application/VCilk__Matrix_Mult/Vector_Pr.c Wed May 11 15:40:54 2011 +0200 @@ -1,130 +1,130 @@ -/* - * Copyright 2009 OpenSourceStewardshipFoundation.org - * Licensed under GNU General Public License version 2 - * - * Author: seanhalle@yahoo.com - * - */ - - -#include "VCilk__Matrix_Mult.h" -#include -#include - - -void inline -accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, - int32 startRow, - int32 numRows, - int32 startCol, - int32 numCols, - int32 numOrigCols ); - - -//=========================================================================== - -void -calcVectorOfSubMatrices( void *_vecParams, VirtProcr *animPr ) - { int32 numVecIdxs, leftRowIdxOffset, numRightColIdxs, resColIdx; - SubMatrix **leftSubMatrices, **rightSubMatrices; - float32 *resultArray; - int32 vecIdx, coreWithAffinity; - SMPairParams *subMatrixPairParams, **vecOfSubMatrixParams; - VecParams *vecParams; - - vecParams = (VecParams *)_vecParams; - - DEBUG1( dbgAppFlow, "start vector %d\n", animPr->procrID) - #ifdef TURN_ON_DEBUG_PROBES - int32 subMatrixVectorProbe = - VMS__create_single_interval_probe( "subMtxVect", animPr ); - VMS__record_sched_choice_into_probe( subMatrixVectorProbe, animPr ); - VMS__record_interval_start_in_probe( subMatrixVectorProbe ); - #endif - - - numVecIdxs = vecParams->numVecIdxs; - numRightColIdxs = vecParams->numRightColIdxs; - leftRowIdxOffset = vecParams->leftRowIdxOffset; - resColIdx = vecParams->resColIdx; - leftSubMatrices = vecParams->leftSubMatrices; - rightSubMatrices = vecParams->rightSubMatrices; - resultArray = vecParams->resultArray; - coreWithAffinity = vecParams->coreToRunOn; - - vecOfSubMatrixParams = VCilk__malloc( numVecIdxs * sizeof(SMPairParams *), - animPr ); - if( vecOfSubMatrixParams == 0 ){printf("malloc error"); exit(1);} - - for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) - { - //Make the processor for the pair of sub-matrices - subMatrixPairParams = VCilk__malloc( sizeof(SMPairParams), animPr ); - subMatrixPairParams->leftSubMatrix = - leftSubMatrices[ leftRowIdxOffset + vecIdx ]; - - subMatrixPairParams->rightSubMatrix = - rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; - - VCilk__spawn( coreWithAffinity, &calcSubMatrixProduct, - subMatrixPairParams, animPr ); - - vecOfSubMatrixParams[ vecIdx ] = subMatrixPairParams; - } - - DEBUG1( dbgAppFlow, "before sync in vector %d\n", animPr->procrID) - VCilk__sync( animPr ); - DEBUG1( dbgAppFlow, "**after sync in vector %d\n", animPr->procrID) - - //now accumulate individual result matrices into final result matrix - for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) - { - subMatrixPairParams = vecOfSubMatrixParams[ vecIdx ]; - - accumulateResult( resultArray, subMatrixPairParams->partialResultArray, - subMatrixPairParams->leftSubMatrix->origStartRow, - subMatrixPairParams->leftSubMatrix->numRows, - subMatrixPairParams->rightSubMatrix->origStartCol, - subMatrixPairParams->rightSubMatrix->numCols, - subMatrixPairParams->rightSubMatrix->origMatrix->numCols); - - //Note, resultArray is made on the core that produces the results - // that gives chance to set affinity so all in vector run on same - // core and re-use that array, and prevents writes from causing - // thrashing of the cache -- as long as array big enough, the copy - // overhead is miniscule vs the size-of-side reuse of each byte - VCilk__free( subMatrixPairParams->partialResultArray, animPr ); - VCilk__free( subMatrixPairParams, animPr ); - } - VCilk__free( vecOfSubMatrixParams, animPr ); - VCilk__free( vecParams, animPr ); - - #ifdef TURN_ON_DEBUG_PROBES - VMS__record_interval_end_in_probe( subMatrixVectorProbe ); - #endif - - DEBUG1( dbgAppFlow, "end vector %d\n", animPr->procrID) - VCilk__dissipate_procr( animPr ); - } - - - -void inline -accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, - int32 startRow, - int32 numRows, - int32 startCol, - int32 numCols, - int32 numOrigCols ) - { int32 row, col; - - for( row = 0; row < numRows; row++ ) - { - for( col = 0; col < numCols; col++ ) - { - resultArray[ (row + startRow) * numOrigCols + col + startCol ] += - subMatrixResultArray[ row * numCols + col ]; - } - } - - } +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + + +#include "VCilk__Matrix_Mult.h" +#include +#include + + +void inline +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, + int32 startRow, + int32 numRows, + int32 startCol, + int32 numCols, + int32 numOrigCols ); + + +//=========================================================================== + +void +calcVectorOfSubMatrices( void *_vecParams, VirtProcr *animPr ) + { int32 numVecIdxs, leftRowIdxOffset, numRightColIdxs, resColIdx; + SubMatrix **leftSubMatrices, **rightSubMatrices; + float32 *resultArray; + int32 vecIdx, coreWithAffinity; + SMPairParams *subMatrixPairParams, **vecOfSubMatrixParams; + VecParams *vecParams; + + vecParams = (VecParams *)_vecParams; + + DEBUG1( dbgAppFlow, "start vector %d\n", animPr->procrID) + #ifdef TURN_ON_DEBUG_PROBES + int32 subMatrixVectorProbe = + VMS__create_single_interval_probe( "subMtxVect", animPr ); + VMS__record_sched_choice_into_probe( subMatrixVectorProbe, animPr ); + VMS__record_interval_start_in_probe( subMatrixVectorProbe ); + #endif + + + numVecIdxs = vecParams->numVecIdxs; + numRightColIdxs = vecParams->numRightColIdxs; + leftRowIdxOffset = vecParams->leftRowIdxOffset; + resColIdx = vecParams->resColIdx; + leftSubMatrices = vecParams->leftSubMatrices; + rightSubMatrices = vecParams->rightSubMatrices; + resultArray = vecParams->resultArray; + coreWithAffinity = vecParams->coreToRunOn; + + vecOfSubMatrixParams = VCilk__malloc( numVecIdxs * sizeof(SMPairParams *), + animPr ); + if( vecOfSubMatrixParams == 0 ){printf("malloc error"); exit(1);} + + for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) + { + //Make the processor for the pair of sub-matrices + subMatrixPairParams = VCilk__malloc( sizeof(SMPairParams), animPr ); + subMatrixPairParams->leftSubMatrix = + leftSubMatrices[ leftRowIdxOffset + vecIdx ]; + + subMatrixPairParams->rightSubMatrix = + rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; + + VCilk__spawn( coreWithAffinity, &calcSubMatrixProduct, + subMatrixPairParams, animPr ); + + vecOfSubMatrixParams[ vecIdx ] = subMatrixPairParams; + } + + DEBUG1( dbgAppFlow, "before sync in vector %d\n", animPr->procrID) + VCilk__sync( animPr ); + DEBUG1( dbgAppFlow, "**after sync in vector %d\n", animPr->procrID) + + //now accumulate individual result matrices into final result matrix + for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) + { + subMatrixPairParams = vecOfSubMatrixParams[ vecIdx ]; + + accumulateResult( resultArray, subMatrixPairParams->partialResultArray, + subMatrixPairParams->leftSubMatrix->origStartRow, + subMatrixPairParams->leftSubMatrix->numRows, + subMatrixPairParams->rightSubMatrix->origStartCol, + subMatrixPairParams->rightSubMatrix->numCols, + subMatrixPairParams->rightSubMatrix->origMatrix->numCols); + + //Note, resultArray is made on the core that produces the results + // that gives chance to set affinity so all in vector run on same + // core and re-use that array, and prevents writes from causing + // thrashing of the cache -- as long as array big enough, the copy + // overhead is miniscule vs the size-of-side reuse of each byte + VCilk__free( subMatrixPairParams->partialResultArray, animPr ); + VCilk__free( subMatrixPairParams, animPr ); + } + VCilk__free( vecOfSubMatrixParams, animPr ); + VCilk__free( vecParams, animPr ); + + #ifdef TURN_ON_DEBUG_PROBES + VMS__record_interval_end_in_probe( subMatrixVectorProbe ); + #endif + + DEBUG1( dbgAppFlow, "end vector %d\n", animPr->procrID) + VCilk__dissipate_procr( animPr ); + } + + + +void inline +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, + int32 startRow, + int32 numRows, + int32 startCol, + int32 numCols, + int32 numOrigCols ) + { int32 row, col; + + for( row = 0; row < numRows; row++ ) + { + for( col = 0; col < numCols; col++ ) + { + resultArray[ (row + startRow) * numOrigCols + col + startCol ] += + subMatrixResultArray[ row * numCols + col ]; + } + } + + } diff -r 12bcb9728e1c -r ecba4ae0be7a src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c --- a/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c Mon Nov 15 12:08:19 2010 -0800 +++ b/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c Wed May 11 15:40:54 2011 +0200 @@ -1,312 +1,308 @@ -/* - * Copyright 2009 OpenSourceStewardshipFoundation.org - * Licensed under GNU General Public License version 2 - * - * Author: SeanHalle@yahoo.com - * - */ - -#include - -#include "VCilk__Matrix_Mult.h" - - -void inline -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); - -void inline -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); - -void inline -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, - float32 *resArray, - int startRow, int endRow, - int startCol, int endCol, - int startVec, int endVec, - int resStride, int inpStride ); - -void inline -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, - float32 *leftArray, float32 *rightArray, - float32 *resArray ); - - -/*A processor is created with an environment that holds two matrices, - * the row and col that it owns, and the name of a result gathering - * processor. - *It calculates the product of two sub-portions of the input matrices - * by using Intel's mkl library for single-core. - * - *This demonstrates using optimized single-threaded code inside scheduled - * work-units. - * - *When done, it sends the result to the result processor - */ -void -calcSubMatrixProduct( void *data, VirtProcr *animPr ) - { - SMPairParams *params; - VirtProcr *resultPr; - float32 *leftArray, *rightArray, *resArray; - SubMatrix *leftSubMatrix, *rightSubMatrix; - - - DEBUG1( dbgAppFlow, "start sub-matrix mult %d\n", animPr->procrID) - #ifdef TURN_ON_DEBUG_PROBES - int32 subMatrixProbe = - VMS__create_single_interval_probe( "subMtx", animPr); - VMS__record_sched_choice_into_probe( subMatrixProbe, animPr ); - VMS__record_interval_start_in_probe( subMatrixProbe ); - #endif - - params = (SMPairParams *)data; -// resultPr = params->resultPr; - leftSubMatrix = params->leftSubMatrix; - rightSubMatrix = params->rightSubMatrix; - - //make sure the input sub-matrices have been copied out of orig - copyFromOrig( leftSubMatrix, animPr ); - copyTransposeFromOrig( rightSubMatrix, animPr ); - - leftArray = leftSubMatrix->array; - rightArray = rightSubMatrix->array; - - //make this array here, on the core that computes the results - // with Cilk's semantics, have to have separate result array for each - // spawned processor -- unless want to change the spawn and sync - // pattern, such that spawn one from each vector, then sync, then - // another, and so forth -- this will cause idle time due to imbalance - // in matrix sizes - //This also gives chance to set affinity so all in vector run on same - // core and re-use the accumulation array, - //As a side-benefit, it also prevents writes from causing - // thrashing of the cache -- as long as array big enough, the copy - // overhead is small because each byte is reused size-of-side times - //This is freed in the vector processor - int32 - resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); - resArray = VCilk__malloc( resSize, animPr ); - memset( resArray, 0, resSize ); - - - int32 numResRows, numResCols, vectLength; - - vectLength = leftSubMatrix->numCols; - numResRows = leftSubMatrix->numRows; - numResCols = rightSubMatrix->numCols; - - multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, - leftArray, rightArray, - resArray ); - - //send result by side-effect - params->partialResultArray = resArray; - - #ifdef TURN_ON_DEBUG_PROBES - VMS__record_interval_end_in_probe( subMatrixProbe ); - #endif - - DEBUG1( dbgAppFlow, "end sub-matrix mult %d\n", animPr->procrID) - VCilk__dissipate_procr( animPr ); - } - - - -/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into - * the 32KB L1 cache. - *Would be nice to embed this within another level that divided into - * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache - * - *Eventually want these divisions to be automatic, using DKU pattern - * embedded into VMS and exposed in the language, and with VMS controlling the - * divisions according to the cache sizes, which it knows about. - *Also, want VMS to work with language to split among main-mems, so a socket - * only cranks on data in its local segment of main mem - * - *So, outer two loops determine start and end points within the result matrix. - * Inside that, a loop dets the start and end points along the shared dimensions - * of the two input matrices. - */ -void inline -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, - int32 numResCols, - float32 *leftArray, float32 *rightArray, - float32 *resArray ) - { - int resStride, inpStride; - int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; - - resStride = numResCols; - inpStride = vecLength; - - for( resStartRow = 0; resStartRow < numResRows; ) - { - resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 - if( resEndRow > numResRows ) resEndRow = numResRows -1; - - for( resStartCol = 0; resStartCol < numResCols; ) - { - resEndCol = resStartCol + COLS_IN_BLOCK -1; - if( resEndCol > numResCols ) resEndCol = numResCols -1; - - for( startVec = 0; startVec < vecLength; ) - { - endVec = startVec + VEC_IN_BLOCK -1; - if( endVec > vecLength ) endVec = vecLength -1; - - //By having the "vector" of sub-blocks in a sub-block slice - // be marched down in inner loop, are re-using the result - // matrix, which stays in L1 cache and re-using the left sub-mat - // which repeats for each right sub-mat -- can only re-use two of - // the three, so result is the most important -- avoids writing - // dirty blocks until those result-locations fully done - //Row and Col is position in result matrix -- so row and vec - // for left array, then vec and col for right array - multiplySubBlocksTransposed( leftArray, rightArray, - resArray, - resStartRow, resEndRow, - resStartCol, resEndCol, - startVec, endVec, - resStride, inpStride ); - startVec = endVec +1; - } - resStartCol = resEndCol +1; - } - resStartRow = resEndRow +1; - } - } - - - -void inline -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, - float32 *resArray, - int resStartRow, int resEndRow, - int resStartCol, int resEndCol, - int startVec, int endVec, - int resStride, int inpStride ) - { - int resRow, resCol, vec; - int leftOffset, rightOffset; - float32 result; - - //The result row is used for the left matrix, res col for the right - for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) - { - for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) - { - leftOffset = resRow * inpStride;//left & right inp strides same - rightOffset = resCol * inpStride;// because right is transposed - result = 0; - for( vec = startVec; vec <= endVec; vec++ ) - { - result += - leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; - } - - resArray[ resRow * resStride + resCol ] += result; - } - } - } - - -/*Reuse this in divider when do the sequential multiply case - */ -void inline -copyTranspose( int32 numRows, int32 numCols, - int32 origStartRow, int32 origStartCol, int32 origStride, - float32 *subArray, float32 *origArray ) - { int32 stride = numRows; - - int row, col, origOffset; - for( row = 0; row < numRows; row++ ) - { - origOffset = (row + origStartRow) * origStride + origStartCol; - for( col = 0; col < numCols; col++ ) - { - //transpose means swap row & col -- traverse orig matrix normally - // but put into reversed place in local array -- means the - // stride is the numRows now, so col * numRows + row - subArray[ col * stride + row ] = origArray[ origOffset + col ]; - } - } - } - -void inline -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) - { int numCols, numRows, origStartRow, origStartCol, origStride, stride; - Matrix *origMatrix; - float32 *origArray, *subArray; - - if( subMatrix->alreadyCopied ) return; - VCilk__start_singleton(copyMatrixSingleton,&&EndOfTranspSingleton,animPr); - - origMatrix = subMatrix->origMatrix; - origArray = origMatrix->array; - numCols = subMatrix->numCols; - numRows = subMatrix->numRows; - origStartRow = subMatrix->origStartRow; - origStartCol = subMatrix->origStartCol; - origStride = origMatrix->numCols; - - subArray = VCilk__malloc( numRows * numCols *sizeof(float32),animPr); - subMatrix->array = subArray; - - //copy values from orig matrix to local - copyTranspose( numRows, numCols, - origStartRow, origStartCol, origStride, - subArray, origArray ); - - subMatrix->alreadyCopied = TRUE; //must be last thing before label - EndOfTranspSingleton: - return; - } - - -void inline -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) - { int numCols, numRows, origStartRow, origStartCol, stride, origStride; - Matrix *origMatrix; - float32 *origArray, *subArray; - - - //This lets only a single VP execute the code between start and - // end -- using start and end so that work runs outside the master. - //Inside, if a second VP ever executes the start, it will be returned - // from the end-point. - //Note, for non-GCC, can add a second SSR call at the end, and inside - // that one, look at the stack at the return addr & save that in an - // array indexed by singletonID - if( subMatrix->alreadyCopied ) return; - VCilk__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton,animPr); - - - origMatrix = subMatrix->origMatrix; - origArray = origMatrix->array; - numCols = subMatrix->numCols; - numRows = subMatrix->numRows; - origStartRow = subMatrix->origStartRow; - origStartCol = subMatrix->origStartCol; - origStride = origMatrix->numCols; - - subArray = VCilk__malloc( numRows * numCols *sizeof(float32),animPr); - subMatrix->array = subArray; - - //copy values from orig matrix to local - stride = numCols; - - int row, col, offset, origOffset; - for( row = 0; row < numRows; row++ ) - { - offset = row * stride; - origOffset = (row + origStartRow) * origStride + origStartCol; - for( col = 0; col < numCols; col++ ) - { - subArray[ offset + col ] = origArray[ origOffset + col ]; - } - } - - subMatrix->alreadyCopied = TRUE; //must be last thing before label - EndOfCopySingleton: - return; - } +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: SeanHalle@yahoo.com + * + */ + +#include + +#include "VCilk__Matrix_Mult.h" + + +void inline +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); + +void inline +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); + +void inline +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, + float32 *resArray, + int startRow, int endRow, + int startCol, int endCol, + int startVec, int endVec, + int resStride, int inpStride ); + +void inline +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, + float32 *leftArray, float32 *rightArray, + float32 *resArray ); + + +/*A processor is created with an environment that holds two matrices, + * the row and col that it owns, and the name of a result gathering + * processor. + *It calculates the product of two sub-portions of the input matrices + * by using Intel's mkl library for single-core. + * + *This demonstrates using optimized single-threaded code inside scheduled + * work-units. + * + *When done, it sends the result to the result processor + */ +void +calcSubMatrixProduct( void *data, VirtProcr *animPr ) + { + SMPairParams *params; + VirtProcr *resultPr; + float32 *leftArray, *rightArray, *resArray; + SubMatrix *leftSubMatrix, *rightSubMatrix; + + + DEBUG1( dbgAppFlow, "start sub-matrix mult %d\n", animPr->procrID) + #ifdef TURN_ON_DEBUG_PROBES + int32 subMatrixProbe = + VMS__create_single_interval_probe( "subMtx", animPr); + VMS__record_sched_choice_into_probe( subMatrixProbe, animPr ); + VMS__record_interval_start_in_probe( subMatrixProbe ); + #endif + + params = (SMPairParams *)data; +// resultPr = params->resultPr; + leftSubMatrix = params->leftSubMatrix; + rightSubMatrix = params->rightSubMatrix; + + //make sure the input sub-matrices have been copied out of orig + copyFromOrig( leftSubMatrix, animPr ); + copyTransposeFromOrig( rightSubMatrix, animPr ); + + leftArray = leftSubMatrix->array; + rightArray = rightSubMatrix->array; + + //make this array here, on the core that computes the results + // with Cilk's semantics, have to have separate result array for each + // spawned processor -- unless want to change the spawn and sync + // pattern, such that spawn one from each vector, then sync, then + // another, and so forth -- this will cause idle time due to imbalance + // in matrix sizes + //This also gives chance to set affinity so all in vector run on same + // core and re-use the accumulation array, + //As a side-benefit, it also prevents writes from causing + // thrashing of the cache -- as long as array big enough, the copy + // overhead is small because each byte is reused size-of-side times + //This is freed in the vector processor + int32 + resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); + resArray = VCilk__malloc( resSize, animPr ); + memset( resArray, 0, resSize ); + + + int32 numResRows, numResCols, vectLength; + + vectLength = leftSubMatrix->numCols; + numResRows = leftSubMatrix->numRows; + numResCols = rightSubMatrix->numCols; + + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, + leftArray, rightArray, + resArray ); + + //send result by side-effect + params->partialResultArray = resArray; + + #ifdef TURN_ON_DEBUG_PROBES + VMS__record_interval_end_in_probe( subMatrixProbe ); + #endif + + DEBUG1( dbgAppFlow, "end sub-matrix mult %d\n", animPr->procrID) + VCilk__dissipate_procr( animPr ); + } + + + +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into + * the 32KB L1 cache. + *Would be nice to embed this within another level that divided into + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache + * + *Eventually want these divisions to be automatic, using DKU pattern + * embedded into VMS and exposed in the language, and with VMS controlling the + * divisions according to the cache sizes, which it knows about. + *Also, want VMS to work with language to split among main-mems, so a socket + * only cranks on data in its local segment of main mem + * + *So, outer two loops determine start and end points within the result matrix. + * Inside that, a loop dets the start and end points along the shared dimensions + * of the two input matrices. + */ +void inline +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, + int32 numResCols, + float32 *leftArray, float32 *rightArray, + float32 *resArray ) + { + int resStride, inpStride; + int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; + + resStride = numResCols; + inpStride = vecLength; + + for( resStartRow = 0; resStartRow < numResRows; ) + { + resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 + if( resEndRow > numResRows ) resEndRow = numResRows -1; + + for( resStartCol = 0; resStartCol < numResCols; ) + { + resEndCol = resStartCol + COLS_IN_BLOCK -1; + if( resEndCol > numResCols ) resEndCol = numResCols -1; + + for( startVec = 0; startVec < vecLength; ) + { + endVec = startVec + VEC_IN_BLOCK -1; + if( endVec > vecLength ) endVec = vecLength -1; + + //By having the "vector" of sub-blocks in a sub-block slice + // be marched down in inner loop, are re-using the result + // matrix, which stays in L1 cache and re-using the left sub-mat + // which repeats for each right sub-mat -- can only re-use two of + // the three, so result is the most important -- avoids writing + // dirty blocks until those result-locations fully done + //Row and Col is position in result matrix -- so row and vec + // for left array, then vec and col for right array + multiplySubBlocksTransposed( leftArray, rightArray, + resArray, + resStartRow, resEndRow, + resStartCol, resEndCol, + startVec, endVec, + resStride, inpStride ); + startVec = endVec +1; + } + resStartCol = resEndCol +1; + } + resStartRow = resEndRow +1; + } + } + + + +void inline +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, + float32 *resArray, + int resStartRow, int resEndRow, + int resStartCol, int resEndCol, + int startVec, int endVec, + int resStride, int inpStride ) + { + int resRow, resCol, vec; + int leftOffset, rightOffset; + float32 result; + + //The result row is used for the left matrix, res col for the right + for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) + { + for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) + { + leftOffset = resRow * inpStride;//left & right inp strides same + rightOffset = resCol * inpStride;// because right is transposed + result = 0; + for( vec = startVec; vec <= endVec; vec++ ) + { + result += + leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; + } + + resArray[ resRow * resStride + resCol ] += result; + } + } + } + + +/*Reuse this in divider when do the sequential multiply case + */ +void inline +copyTranspose( int32 numRows, int32 numCols, + int32 origStartRow, int32 origStartCol, int32 origStride, + float32 *subArray, float32 *origArray ) + { int32 stride = numRows; + + int row, col, origOffset; + for( row = 0; row < numRows; row++ ) + { + origOffset = (row + origStartRow) * origStride + origStartCol; + for( col = 0; col < numCols; col++ ) + { + //transpose means swap row & col -- traverse orig matrix normally + // but put into reversed place in local array -- means the + // stride is the numRows now, so col * numRows + row + subArray[ col * stride + row ] = origArray[ origOffset + col ]; + } + } + } + +void inline +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) + { int numCols, numRows, origStartRow, origStartCol, origStride, stride; + Matrix *origMatrix; + float32 *origArray, *subArray; + + VCilk__start_data_singleton( &(subMatrix->copyTransSingleton), animPr ); + + origMatrix = subMatrix->origMatrix; + origArray = origMatrix->array; + numCols = subMatrix->numCols; + numRows = subMatrix->numRows; + origStartRow = subMatrix->origStartRow; + origStartCol = subMatrix->origStartCol; + origStride = origMatrix->numCols; + + subArray = VCilk__malloc( numRows * numCols *sizeof(float32),animPr); + subMatrix->array = subArray; + + //copy values from orig matrix to local + copyTranspose( numRows, numCols, + origStartRow, origStartCol, origStride, + subArray, origArray ); + + VCilk__end_data_singleton( &(subMatrix->copyTransSingleton), animPr ); + return; + } + + +void inline +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) + { int numCols, numRows, origStartRow, origStartCol, stride, origStride; + Matrix *origMatrix; + float32 *origArray, *subArray; + + + //This lets only a single VP execute the code between start and + // end -- using start and end so that work runs outside the master. + //Inside, if a second VP ever executes the start, it will be returned + // from the end-point. + //Note, for non-GCC, can add a second SSR call at the end, and inside + // that one, look at the stack at the return addr & save that in an + // array indexed by singletonID + VCilk__start_data_singleton( &(subMatrix->copySingleton), animPr ); + + + origMatrix = subMatrix->origMatrix; + origArray = origMatrix->array; + numCols = subMatrix->numCols; + numRows = subMatrix->numRows; + origStartRow = subMatrix->origStartRow; + origStartCol = subMatrix->origStartCol; + origStride = origMatrix->numCols; + + subArray = VCilk__malloc( numRows * numCols *sizeof(float32),animPr); + subMatrix->array = subArray; + + //copy values from orig matrix to local + stride = numCols; + + int row, col, offset, origOffset; + for( row = 0; row < numRows; row++ ) + { + offset = row * stride; + origOffset = (row + origStartRow) * origStride + origStartCol; + for( col = 0; col < numCols; col++ ) + { + subArray[ offset + col ] = origArray[ origOffset + col ]; + } + } + VCilk__end_data_singleton( &(subMatrix->copySingleton), animPr ); + + return; + } diff -r 12bcb9728e1c -r ecba4ae0be7a src/Application/main.c --- a/src/Application/main.c Mon Nov 15 12:08:19 2010 -0800 +++ b/src/Application/main.c Wed May 11 15:40:54 2011 +0200 @@ -1,35 +1,35 @@ -/* - * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org - * Licensed under GNU General Public License version 2 - * - * author seanhalle@yahoo.com - */ - -#include -#include - -#include "Matrix_Mult.h" -#include "VCilk__Matrix_Mult/VCilk__Matrix_Mult.h" - -/** - *Matrix multiply program written using VMS_HW piggy-back language - * - */ -int main( int argc, char **argv ) - { Matrix *leftMatrix, *rightMatrix, *resultMatrix; - ParamBag *paramBag; - - paramBag = makeParamBag(); - printf("arguments: %s | %s | %s | %s\n", argv[0], argv[1], argv[2], argv[3] ); - readParamFileIntoBag( argv[1], paramBag ); - initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); - - resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix ); - -// printf("\nresult matrix: \n"); \ - printMatrix( resultMatrix ); - -// VCilk__print_stats(); - fflush(stdin); - exit(0); //cleans up - } +/* + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * author seanhalle@yahoo.com + */ + +#include +#include + +#include "Matrix_Mult.h" +#include "VCilk__Matrix_Mult/VCilk__Matrix_Mult.h" + +/** + *Matrix multiply program written using VMS_HW piggy-back language + * + */ +int main( int argc, char **argv ) + { Matrix *leftMatrix, *rightMatrix, *resultMatrix; + ParamBag *paramBag; + + paramBag = makeParamBag(); + printf("arguments: %s | %s | %s | %s\n", argv[0], argv[1], argv[2], argv[3] ); + readParamFileIntoBag( argv[1], paramBag ); + initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); + + resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix ); + +// printf("\nresult matrix: \n"); \ + printMatrix( resultMatrix ); + +// VCilk__print_stats(); + fflush(stdin); + exit(0); //cleans up + }