Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VCilk > VCilk__Blocked_Matrix_Mult__Bench

changeset 0:56e17dcfc0c3
Initial add -- works, together with VMS pin2Core with vmalloc and probes
author: Me
date: Sat, 30 Oct 2010 20:43:49 -0700
children: 06c80cd3c303
files: .hgignore src/Application/Matrix_Mult.c src/Application/Matrix_Mult.h src/Application/VCilk__Matrix_Mult/Divide_Pr.c src/Application/VCilk__Matrix_Mult/EntryPoint.c src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h src/Application/VCilk__Matrix_Mult/Vector_Pr.c src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c src/Application/main.c
diffstat: 9 files changed, 1417 insertions(+), 0 deletions(-) [+]
[-]

.hgignore 5

src/Application/Matrix_Mult.c 167

src/Application/Matrix_Mult.h 77

src/Application/VCilk__Matrix_Mult/Divide_Pr.c 583

src/Application/VCilk__Matrix_Mult/EntryPoint.c 58

src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h 103

src/Application/VCilk__Matrix_Mult/Vector_Pr.c 121

src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c 269

src/Application/main.c 34 .hgignore 5 src/Application/Matrix_Mult.c 167 src/Application/Matrix_Mult.h 77 src/Application/VCilk__Matrix_Mult/Divide_Pr.c 583 src/Application/VCilk__Matrix_Mult/EntryPoint.c 58 src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h 103 src/Application/VCilk__Matrix_Mult/Vector_Pr.c 121 src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c 269 src/Application/main.c 34
.hgignore 5
src/Application/Matrix_Mult.c 167
src/Application/Matrix_Mult.h 77
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/.hgignore	Sat Oct 30 20:43:49 2010 -0700
     1.3 @@ -0,0 +1,5 @@
     1.4 +nbproject
     1.5 +build
     1.6 +dist
     1.7 +.dep.inc
     1.8 +Makefile

     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/src/Application/Matrix_Mult.c	Sat Oct 30 20:43:49 2010 -0700
     2.3 @@ -0,0 +1,167 @@
     2.4 +/*
     2.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     2.6 + *  Licensed under GNU General Public License version 2
     2.7 + *
     2.8 + * Author: seanhalle@yahoo.com
     2.9 + *
    2.10 + * Created on November 15, 2009, 2:35 AM
    2.11 + */
    2.12 +
    2.13 +#include <malloc.h>
    2.14 +#include <stdlib.h>
    2.15 +
    2.16 +#include "Matrix_Mult.h"
    2.17 +#include "ParamHelper/Param.h"
    2.18 +
    2.19 +
    2.20 + 
    2.21 + void
    2.22 +initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
    2.23 +                               ParamBag *paramBag )
    2.24 + { char *leftMatrixFileName, *rightMatrixFileName;
    2.25 +   int   leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols;
    2.26 +   
    2.27 +      ParamStruc *param;
    2.28 +      param = getParamFromBag( "leftMatrixRows", paramBag );
    2.29 +   leftMatrixRows = param->intValue;
    2.30 +      param = getParamFromBag( "leftMatrixCols", paramBag );
    2.31 +   leftMatrixCols = param->intValue;
    2.32 +   *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols );
    2.33 +   
    2.34 +      param = getParamFromBag( "leftMatrixFileName", paramBag );
    2.35 +   leftMatrixFileName = param->strValue;  //no need to copy
    2.36 +   read_Matrix_From_File( *leftMatrix,  leftMatrixFileName );
    2.37 +   
    2.38 +      param = getParamFromBag( "rightMatrixRows", paramBag );
    2.39 +   rightMatrixRows = param->intValue;
    2.40 +      param = getParamFromBag( "rightMatrixCols", paramBag );
    2.41 +   rightMatrixCols = param->intValue;
    2.42 +   *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols );
    2.43 +   
    2.44 +      param = getParamFromBag( "rightMatrixFileName", paramBag );
    2.45 +   rightMatrixFileName = param->strValue;
    2.46 +   read_Matrix_From_File( *rightMatrix, rightMatrixFileName );
    2.47 + }
    2.48 +
    2.49 +
    2.50 +void parseLineIntoRow( char *line, float32* row );
    2.51 +
    2.52 +
    2.53 + void
    2.54 +read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName )
    2.55 + { int    row, maxRead, numRows, numCols;
    2.56 +   float32 *matrixStart;
    2.57 +   size_t lineSz = 0;
    2.58 +   FILE  *file;
    2.59 +   char  *line = NULL;
    2.60 +   
    2.61 +   lineSz = 50000; //max length of line in a matrix data file
    2.62 +   line = (char *) malloc( lineSz );
    2.63 +   if( line == NULL ) printf( "no mem for matrix line" );
    2.64 +   
    2.65 +   numRows = matrixStruc->numRows;
    2.66 +   numCols = matrixStruc->numCols;
    2.67 +   matrixStart = matrixStruc->array;
    2.68 +
    2.69 +   file = fopen( matrixFileName, "r" );
    2.70 +   if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);}
    2.71 +   fseek( file, 0, SEEK_SET );
    2.72 +   for( row = 0; row < numRows; row++ )
    2.73 +    {
    2.74 +      if( feof( file ) )  printf( "file ran out too soon" );
    2.75 +      maxRead = getline( &line, &lineSz, file );
    2.76 +      if( maxRead == -1 ) printf( "prob reading mat line");
    2.77 +      
    2.78 +      if( *line == '\n') continue; //blank line
    2.79 +      if( *line == '/' ) continue; //comment line
    2.80 +      
    2.81 +      parseLineIntoRow( line, matrixStart + row * numCols );
    2.82 +    }
    2.83 +   free( line );
    2.84 + }
    2.85 +
    2.86 +/*This function relies on each line having the proper number of cols.  It
    2.87 + * doesn't check, nor enforce, so if the file is improperly formatted it
    2.88 + * can write over unrelated memory
    2.89 + */
    2.90 + void
    2.91 +parseLineIntoRow( char *line, float32* row )
    2.92 + {
    2.93 +   char *valueStr, *searchPos;
    2.94 +   
    2.95 +      //read the float values
    2.96 +   searchPos = valueStr = line; //start
    2.97 +   
    2.98 +   for( ; *searchPos != 0; searchPos++)  //bit dangerous, should use buff len
    2.99 +    {
   2.100 +      if( *searchPos == '\n' ) //last col..  relying on well-formatted file
   2.101 +       { *searchPos = 0;
   2.102 +         *row = atof( valueStr );
   2.103 +         break;                                    //end FOR loop
   2.104 +       }
   2.105 +      if( *searchPos == ',' )
   2.106 +       { *searchPos = 0;                           //mark end of string
   2.107 +         *row = (float32) atof( valueStr );
   2.108 +         row += 1;                                 //address arith
   2.109 +            //skip any spaces before digits.. use searchPos + 1 to skip the 0
   2.110 +         for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++);
   2.111 +         valueStr = searchPos + 1;
   2.112 +       }
   2.113 +    }
   2.114 + }
   2.115 +
   2.116 + //==========================================================================
   2.117 +
   2.118 +/*In the "_Flat" version of constructor, do only malloc of the top data struc
   2.119 + * and set values in that top-level.  Don't malloc any sub-structures.
   2.120 + */
   2.121 + Matrix *
   2.122 +makeMatrix_Flat( int32 numRows, int32 numCols )
   2.123 + { Matrix * retMatrix;
   2.124 +   retMatrix = malloc( sizeof( Matrix ) );
   2.125 +   retMatrix->numRows = numRows;
   2.126 +   retMatrix->numCols = numCols;
   2.127 +
   2.128 +   return retMatrix;
   2.129 + }
   2.130 +
   2.131 + Matrix *
   2.132 +makeMatrix_WithResMat( int32 numRows, int32 numCols )
   2.133 + { Matrix * retMatrix;
   2.134 +   retMatrix = malloc( sizeof( Matrix ) );
   2.135 +   retMatrix->numRows = numRows;
   2.136 +   retMatrix->numCols = numCols;
   2.137 +   retMatrix->array  = malloc( numRows * numCols * sizeof(float32) );
   2.138 +
   2.139 +   return retMatrix;
   2.140 + }
   2.141 +
   2.142 + void
   2.143 +freeMatrix_Flat( Matrix * matrix )
   2.144 + { //( matrix );
   2.145 + }
   2.146 + void
   2.147 +freeMatrix( Matrix * matrix )
   2.148 + { free( matrix->array );
   2.149 +   free( matrix );
   2.150 + }
   2.151 +
   2.152 +void
   2.153 +printMatrix( Matrix *matrix )
   2.154 + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr;
   2.155 +   float32 *matrixArray;
   2.156 +
   2.157 +   numRows = rowsToPrint = matrix->numRows;
   2.158 +   numCols = colsToPrint = matrix->numCols;
   2.159 +   matrixArray = matrix->array;
   2.160 +
   2.161 +   rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed
   2.162 +   colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed
   2.163 +   for( r = 0; r < numRows; r += rowIncr )
   2.164 +    { for( c = 0; c < numCols; c += colIncr )
   2.165 +       { printf( "%3.1f | ", matrixArray[ r * numCols + c ] );
   2.166 +       }
   2.167 +      printf("\n");
   2.168 +    }
   2.169 + }
   2.170 +

     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/src/Application/Matrix_Mult.h	Sat Oct 30 20:43:49 2010 -0700
     3.3 @@ -0,0 +1,77 @@
     3.4 +/*
     3.5 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     3.6 + *  Licensed under GNU General Public License version 2
     3.7 + */
     3.8 +
     3.9 +#ifndef MATRIX_MULT_H_
    3.10 +#define MATRIX_MULT_H_
    3.11 +
    3.12 +#include <stdio.h>
    3.13 +#include <unistd.h>
    3.14 +#include <malloc.h>
    3.15 +
    3.16 +#include "../VCilk_lib/VMS/VMS_primitive_data_types.h"
    3.17 +#include "ParamHelper/Param.h"
    3.18 +
    3.19 +//==============================  Structures  ==============================
    3.20 +
    3.21 +typedef
    3.22 +struct
    3.23 + { int32 numRows;
    3.24 +   int32 numCols;
    3.25 +   float32 *array;  //2D, but dynamically sized, so use addr arith
    3.26 + }
    3.27 +Matrix;
    3.28 +
    3.29 +/* This is the "appSpecificPiece" that is carried inside a DKUPiece.
    3.30 + *  In the DKUPiece data struc it is declared to be of type "void *".  This
    3.31 + *  allows the application to define any data structure it wants and put it
    3.32 + *  into a DKUPiece.
    3.33 + * When the app specific info is used, it is in app code, so it is cast to
    3.34 + *  the correct type to tell the compiler how to access fields.
    3.35 + * This keeps all app-specific things out of the DKU directory, as per the
    3.36 + *  DKU standard. */
    3.37 +typedef
    3.38 +struct
    3.39 + { 
    3.40 +      // pointers to shared data..  the result matrix must be created when the
    3.41 +      //  left and right matrices are put into the root ancestor DKUPiece.
    3.42 +   Matrix * leftMatrix;
    3.43 +   Matrix * rightMatrix;
    3.44 +   Matrix * resultMatrix;
    3.45 +
    3.46 +      // define the starting and ending boundaries for this piece of the
    3.47 +      //  result matrix.  These are derivable from the left and right
    3.48 +      //  matrices, but included them for readability of code.
    3.49 +   int prodStartRow, prodEndRow;
    3.50 +   int prodStartCol, prodEndCol;
    3.51 +      // Start and end of the portion of the left matrix that contributes to
    3.52 +      //  this piece of the product
    3.53 +   int leftStartRow, leftEndRow;
    3.54 +   int leftStartCol, leftEndCol;
    3.55 +      // Start and end of the portion of the right matrix that contributes to
    3.56 +      //  this piece of the product
    3.57 +   int rightStartRow, rightEndRow;
    3.58 +   int rightStartCol, rightEndCol;
    3.59 + }
    3.60 +MatrixProdPiece;
    3.61 +
    3.62 +//==============================  Functions  ================================
    3.63 +void readFile();
    3.64 +
    3.65 +Matrix *makeMatrix( int32 numRows, int32 numCols );
    3.66 +Matrix *makeMatrix_Flat( int32 numRows, int32 numCols );
    3.67 +Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols );
    3.68 +void    freeMatrix_Flat( Matrix * matrix );
    3.69 +void    freeMatrix( Matrix * matrix );
    3.70 +void    printMatrix( Matrix *matrix );
    3.71 +
    3.72 +void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName );
    3.73 +
    3.74 +void
    3.75 +initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
    3.76 +                              ParamBag *paramBag );
    3.77 +
    3.78 +//===========================================================================
    3.79 +
    3.80 +#endif /*MATRIX_MULT_H_*/

     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/src/Application/VCilk__Matrix_Mult/Divide_Pr.c	Sat Oct 30 20:43:49 2010 -0700
     4.3 @@ -0,0 +1,583 @@
     4.4 +/*
     4.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     4.6 + *  Licensed under GNU General Public License version 2
     4.7 + *
     4.8 + * Author: seanhalle@yahoo.com
     4.9 + *
    4.10 + */
    4.11 +
    4.12 +
    4.13 +#include "VCilk__Matrix_Mult.h"
    4.14 +#include <math.h>
    4.15 +#include <sys/time.h>
    4.16 +
    4.17 +   //The time to compute this many result values should equal the time to
    4.18 +   // perform this division on a matrix of size gives that many result calcs
    4.19 +   //IE, size this so that sequential time to calc equals divide time
    4.20 +   // find the value by experimenting -- but divide time and calc time scale
    4.21 +   // same way, so this value should remain valid across hardware
    4.22 +   //Divide time is about 800us on 2.4Ghz core2Quad laptop core
    4.23 +   //num cells is the cube of a side, when have two square matrices
    4.24 +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 100000 /* about 46x46 */
    4.25 +
    4.26 +
    4.27 +//===========================================================================
    4.28 +int inline
    4.29 +measureMatrixMultPrimitive( VirtProcr *animPr );
    4.30 +
    4.31 +SlicingStrucCarrier *
    4.32 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
    4.33 +                                 VirtProcr *animPr );
    4.34 +
    4.35 +SlicingStruc *
    4.36 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
    4.37 +                  VirtProcr *animPr );
    4.38 +
    4.39 +void
    4.40 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr );
    4.41 +
    4.42 +SubMatrix **
    4.43 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    4.44 +                   Matrix *origMatrix, VirtProcr *animPr );
    4.45 +
    4.46 +void
    4.47 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    4.48 +                 SubMatrix **subMatrices, VirtProcr *animPr );
    4.49 +
    4.50 +void
    4.51 +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices,
    4.52 +                                    SubMatrix **rightSubMatrices,
    4.53 +                                    int32 numRowIdxs, int32 numColIdxs,
    4.54 +                                    int32 numVecIdxs,
    4.55 +                                    float32 *resultArray,
    4.56 +                                    VirtProcr *animatingPr );
    4.57 +
    4.58 +void
    4.59 +makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix,
    4.60 +            SlicingStrucCarrier *slicingStrucCarrier,
    4.61 +            float32 *resultArray, VirtProcr *animatingPr );
    4.62 +
    4.63 +//===========================================================================
    4.64 +
    4.65 +/*Divider creates one processor for every sub-matrix
    4.66 + * It hands them:
    4.67 + *  the name of the result processor that they should send their results to,
    4.68 + *  the left and right matrices, and the rows and cols they should multiply
    4.69 + * It first creates the result processor, then all the sub-matrixPair
    4.70 + *  processors,
    4.71 + *  then does a receive of a message from the result processor that gives
    4.72 + *  the divider ownership of the result matrix.
    4.73 + * Finally, the divider returns the result matrix out of the VCilk system.
    4.74 + *
    4.75 + * Divider chooses the size of sub-matrices via an algorithm that tries to
    4.76 + *  keep the minimum work above a threshold.  The threshold is machine-
    4.77 + *  dependent, so ask VCilk for min work-unit time to get a
    4.78 + *  given overhead
    4.79 + *
    4.80 + * Divide min work-unit cycles by measured-cycles for one matrix-cell
    4.81 + *  product -- gives the number of products need to have in min size
    4.82 + *  matrix.
    4.83 + *
    4.84 + * So then, take cubed root of this to get the size of a side of min sub-
    4.85 + *  matrix.  That is the size of the ideal square sub-matrix -- so tile
    4.86 + *  up the two input matrices into ones as close as possible to that size,
    4.87 + *  and create the pairs of sub-matrices.
    4.88 + *
    4.89 + *========================  STRATEGIC OVERVIEW  =======================
    4.90 + *
    4.91 + *This division is a bit tricky, because have to create things in advance
    4.92 + * that it's not at first obvious need to be created..
    4.93 + *
    4.94 + *First slice up each dimension -- three of them..  this is because will have
    4.95 + * to create the sub-matrix's data-structures before pairing the sub-matrices
    4.96 + * with each other -- so, have three dimensions to slice up before can
    4.97 + * create the sub-matrix data-strucs -- also, have to be certain that the
    4.98 + * cols of the left input have the exact same slicing as the rows of the
    4.99 + * left matrix, so just to be sure, do the slicing calc once, then use it
   4.100 + * for both.
   4.101 + *
   4.102 + *So, goes like this:
   4.103 + *1) calculate the start & end values of each dimension in each matrix.
   4.104 + *2) use those values to create sub-matrix structures
   4.105 + *3) combine sub-matrices into pairs, as the tasks to perform.
   4.106 + *
   4.107 + *Have to calculate separately from creating the sub-matrices because of the
   4.108 + * nature of the nesting -- would either end up creating the same sub-matrix
   4.109 + * multiple times, or else would have to put in detection of whether had
   4.110 + * made a particular one already if tried to combine steps 1 and 2.
   4.111 + *
   4.112 + *Step 3 has to be separate because of the nesting, as well -- same reason,
   4.113 + * would either create same sub-matrix multiple times, or else have to
   4.114 + * add detection of whether was already created.
   4.115 + *
   4.116 + *Another way to look at it: there's one level of loop to divide dimensions,
   4.117 + * two levels of nesting to create sub-matrices, and three levels to pair
   4.118 + * up the sub-matrices.
   4.119 + */
   4.120 +
   4.121 +void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
   4.122 +                                        VirtProcr *animPr )
   4.123 + { 
   4.124 +   DividerParams   *dividerParams;
   4.125 +   ResultsParams   *resultsParams;
   4.126 +   Matrix          *leftMatrix, *rightMatrix, *resultMatrix;
   4.127 +   void            *msg;
   4.128 +   SlicingStrucCarrier *slicingStrucCarrier;
   4.129 +   float32         *resultArray; //points to array to be put inside result
   4.130 +                                 // matrix
   4.131 +   
   4.132 +         PRINT_DEBUG("start divide\n")
   4.133 +
   4.134 +//TODO:         VMS__create_block_of_probes_with_idxs( 0, 2 );
   4.135 +         int32
   4.136 +         divideProbe = VMS__create_single_interval_probe( "divideProbe",
   4.137 +                                                          animPr );
   4.138 +         VMS__record_sched_choice_into_probe( divideProbe, animPr );
   4.139 +         VMS__record_interval_start_in_probe( divideProbe );
   4.140 +         
   4.141 +   //=========== Setup -- make local copies of ptd-to-things, malloc, aso
   4.142 +   int32 numResRows, numResCols;
   4.143 +
   4.144 +   dividerParams   = (DividerParams *)_dividerParams;
   4.145 +   
   4.146 +   leftMatrix      = dividerParams->leftMatrix;
   4.147 +   rightMatrix     = dividerParams->rightMatrix;
   4.148 +
   4.149 +   numResRows  = leftMatrix->numRows;
   4.150 +   numResCols  = rightMatrix->numCols;
   4.151 +   resultArray = dividerParams->resultMatrix->array;
   4.152 +
   4.153 +   
   4.154 +   //==============  Do either sequential mult or do division ==============
   4.155 +
   4.156 +      //Check if input matrices too small -- if yes, just do sequential
   4.157 +      //Cutoff is determined by overhead of this divider -- relatively
   4.158 +      // machine-independent
   4.159 +   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
   4.160 +       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
   4.161 +    { int32 vectLength;
   4.162 +
   4.163 +      //====== Do sequential multiply on a single core
   4.164 +            PRINT_DEBUG("doing sequential")
   4.165 +      vectLength = leftMatrix->numCols;
   4.166 +
   4.167 +      multiplyMatrixArrays( vectLength, numResRows, numResCols,
   4.168 +                            leftMatrix->array, rightMatrix->array,
   4.169 +                            resultArray );
   4.170 +    }
   4.171 +   else
   4.172 +    {
   4.173 +      //====== Do parallel multiply across cores
   4.174 +
   4.175 +         //Calc the ideal size of sub-matrix and slice up the dimensions of
   4.176 +         // the two matrices.
   4.177 +         //The ideal size is the one takes the number of cycles to calculate
   4.178 +         // such that calc time is equal or greater than min work-unit size
   4.179 +      slicingStrucCarrier =
   4.180 +         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix,  animPr);
   4.181 +                                         
   4.182 +
   4.183 +         //Make the results processor, now that know how many to wait for
   4.184 +/*
   4.185 +      resultsParams = VCilk__malloc( sizeof(ResultsParams) );
   4.186 +      resultsParams->dividerPr = animatingPr;
   4.187 +      resultsParams->numSubMatrixPairs  =
   4.188 +         slicingStrucCarrier->leftRowSlices->numVals *
   4.189 +         slicingStrucCarrier->rightColSlices->numVals *
   4.190 +         slicingStrucCarrier->vecSlices->numVals;
   4.191 +      resultsParams->numCols   = rightMatrix->numCols;
   4.192 +      resultsParams->numRows   = leftMatrix->numRows;
   4.193 +*/
   4.194 +
   4.195 +         //Make the sub-matrices, and pair them up, then spawn processors to
   4.196 +         // calc product of each pair.
   4.197 +      makeSubMatricesAndSpawnAndSync( leftMatrix, rightMatrix,
   4.198 +                                      slicingStrucCarrier,
   4.199 +                                      resultArray, animPr);
   4.200 +         //The result array will get filled in by the spawned children
   4.201 +    }
   4.202 +
   4.203 +
   4.204 +   //===============  Work done -- send results back =================
   4.205 +
   4.206 +
   4.207 +      //results have been saved into an array that was made outside the VMS
   4.208 +      // system, by entry-point Fn, and passed in through dividerParams.
   4.209 +      //So, nothing to do to send results back -- they're seen by side-effect
   4.210 +
   4.211 +         PRINT_DEBUG("end divide\n")
   4.212 +
   4.213 +         VMS__record_interval_end_in_probe( divideProbe );
   4.214 +         VMS__print_stats_of_all_probes;
   4.215 +
   4.216 +   VCilk__dissipate_procr( animPr );  //all procrs dissipate self at end
   4.217 +      //when all of the processors have dissipated, the "create seed and do
   4.218 +      // work" call in the entry point function returns
   4.219 + }
   4.220 +
   4.221 +
   4.222 +SlicingStrucCarrier *
   4.223 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
   4.224 +                                 VirtProcr *animPr )
   4.225 +{
   4.226 +   float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
   4.227 +   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   4.228 +   SlicingStrucCarrier *slicingStrucCarrier =
   4.229 +                         VCilk__malloc(sizeof(SlicingStrucCarrier), animPr );
   4.230 +
   4.231 +   int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
   4.232 +   float64 numPrimitiveOpsInMinWorkUnit;
   4.233 +
   4.234 +
   4.235 +   //=======  Calc ideal size of min-sized sub-matrix  ========
   4.236 +
   4.237 +      //ask VCilk for the number of cycles of the minimum work unit, at given
   4.238 +      // percent overhead then add a guess at overhead from this divider
   4.239 +   minWorkUnitCycles = VCilk__giveMinWorkUnitCycles( .05 );
   4.240 +
   4.241 +      //ask VCilk for number of cycles of the "primitive" op of matrix mult
   4.242 +   primitiveCycles = measureMatrixMultPrimitive( animPr );
   4.243 +
   4.244 +   numPrimitiveOpsInMinWorkUnit =
   4.245 +      (float64)minWorkUnitCycles / (float64)primitiveCycles;
   4.246 +
   4.247 +      //take cubed root -- that's number of these in a "side" of sub-matrix
   4.248 +      // then multiply by 5 because the primitive is 5x5
   4.249 +   idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit );
   4.250 +
   4.251 +   idealNumWorkUnits = VCilk__giveIdealNumWorkUnits();
   4.252 +   
   4.253 +   idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
   4.254 +   idealSizeOfSide2 *= 0.6; //finer granularity to help load balance
   4.255 +
   4.256 +   if( idealSizeOfSide1 > idealSizeOfSide2 )
   4.257 +      idealSizeOfSide = idealSizeOfSide1;
   4.258 +   else
   4.259 +      idealSizeOfSide = idealSizeOfSide2;
   4.260 +
   4.261 +      //The multiply inner loop blocks the array to fit into L1 cache
   4.262 +//   if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK;
   4.263 +
   4.264 +   //============  Slice up dimensions, now that know target size ===========
   4.265 +
   4.266 +      //Tell the slicer the target size of a side (floating pt), the start
   4.267 +      // value to start slicing at, and the end value to stop slicing at
   4.268 +      //It returns an array of start value of each chunk, plus number of them
   4.269 +   int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol;
   4.270 +   startLeftRow  = 0;
   4.271 +   endLeftRow    = leftMatrix->numRows -1;
   4.272 +   startVec      = 0;
   4.273 +   endVec        = leftMatrix->numCols -1;
   4.274 +   startRightCol = 0;
   4.275 +   endRightCol   = rightMatrix->numCols -1;
   4.276 +
   4.277 +   leftRowSlices =
   4.278 +      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow, animPr );
   4.279 +
   4.280 +   vecSlices =
   4.281 +      sliceUpDimension( idealSizeOfSide,  startVec, endVec, animPr );
   4.282 +
   4.283 +   rightColSlices =
   4.284 +      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol,animPr);
   4.285 +
   4.286 +   slicingStrucCarrier->leftRowSlices  = leftRowSlices;
   4.287 +   slicingStrucCarrier->vecSlices      = vecSlices;
   4.288 +   slicingStrucCarrier->rightColSlices = rightColSlices;
   4.289 +
   4.290 +   return slicingStrucCarrier;
   4.291 +}
   4.292 +
   4.293 +
   4.294 +void
   4.295 +makeSubMatricesAndSpawnAndSync( Matrix  *leftMatrix,  Matrix    *rightMatrix,
   4.296 +                         SlicingStrucCarrier *slicingStrucCarrier,
   4.297 +                         float32 *resultArray, VirtProcr *animPr )
   4.298 + {
   4.299 +   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   4.300 +   
   4.301 +   leftRowSlices  = slicingStrucCarrier->leftRowSlices;
   4.302 +   vecSlices      = slicingStrucCarrier->vecSlices;
   4.303 +   rightColSlices = slicingStrucCarrier->rightColSlices;
   4.304 +   VCilk__free( slicingStrucCarrier, animPr );
   4.305 +   
   4.306 +   //================  Make sub-matrices, given the slicing  ================
   4.307 +   SubMatrix **leftSubMatrices, **rightSubMatrices;
   4.308 +   leftSubMatrices =
   4.309 +      createSubMatrices( leftRowSlices, vecSlices,
   4.310 +                         leftMatrix, animPr );
   4.311 +   rightSubMatrices =
   4.312 +      createSubMatrices( vecSlices, rightColSlices,
   4.313 +                         rightMatrix, animPr );
   4.314 +
   4.315 +   //==============  pair the sub-matrices and make processors ==============
   4.316 +   int32 numRowIdxs, numColIdxs, numVecIdxs;
   4.317 +
   4.318 +   numRowIdxs = leftRowSlices->numVals;
   4.319 +   numColIdxs = rightColSlices->numVals;
   4.320 +   numVecIdxs = vecSlices->numVals;
   4.321 +   pairUpSubMatricesAndSpawnAndSync( leftSubMatrices, rightSubMatrices,
   4.322 +                                     numRowIdxs, numColIdxs, numVecIdxs,
   4.323 +                                     resultArray,
   4.324 +                                     animPr );
   4.325 +      //It syncs inside, so know all work is done now: free the sub-matrices
   4.326 +   freeSubMatrices( leftRowSlices, vecSlices,  leftSubMatrices, animPr );
   4.327 +   freeSubMatrices( vecSlices, rightColSlices, rightSubMatrices, animPr );
   4.328 +
   4.329 +
   4.330 +   freeSlicingStruc( leftRowSlices, animPr );
   4.331 +   freeSlicingStruc( vecSlices, animPr );
   4.332 +   freeSlicingStruc( rightColSlices, animPr );
   4.333 + }
   4.334 +
   4.335 +
   4.336 +
   4.337 +
   4.338 +/* numRows*colsPerRow/numCores = numToPutOntoEachCore; 
   4.339 + * put all from a given row onto same core, until exhaust allotment for that
   4.340 + *  core
   4.341 + *
   4.342 + */
   4.343 +void
   4.344 +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices,
   4.345 +                                    SubMatrix **rightSubMatrices,
   4.346 +                                    int32 numRowIdxs, int32 numColIdxs,
   4.347 +                                    int32 numVecIdxs,
   4.348 +                                    float32 *resultArray,
   4.349 +                                    VirtProcr *animatingPr )
   4.350 + {
   4.351 +   int32 resRowIdx, resColIdx;
   4.352 +   int32 numLeftColIdxs, numRightColIdxs;
   4.353 +   int32 leftRowIdxOffset;
   4.354 +   VecParams *vecParams;
   4.355 +   float32 numToPutOntoEachCore, leftOverFraction;
   4.356 +   int32 numCores, currCore, numOnCurrCore;
   4.357 +
   4.358 +   numLeftColIdxs  = numColIdxs;
   4.359 +   numRightColIdxs = numVecIdxs;
   4.360 +
   4.361 +   numCores = VCilk__give_number_of_cores_to_spawn_onto();   
   4.362 +
   4.363 +   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
   4.364 +   leftOverFraction = 0;
   4.365 +   numOnCurrCore = 0;
   4.366 +   currCore = 0;
   4.367 +
   4.368 +   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
   4.369 +    {
   4.370 +      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
   4.371 +
   4.372 +      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
   4.373 +       {
   4.374 +         vecParams = VCilk__malloc( sizeof(VecParams), animatingPr );
   4.375 +         
   4.376 +         vecParams->numVecIdxs       = numVecIdxs;
   4.377 +         vecParams->numRightColIdxs  = numRightColIdxs;
   4.378 +         vecParams->leftRowIdxOffset = leftRowIdxOffset;
   4.379 +         vecParams->resColIdx        = resColIdx;
   4.380 +         vecParams->leftSubMatrices  = leftSubMatrices;
   4.381 +         vecParams->rightSubMatrices = rightSubMatrices;
   4.382 +         vecParams->resultArray      = resultArray;
   4.383 +         vecParams->coreToRunOn      = currCore;
   4.384 +
   4.385 +         VCilk__spawn( currCore, &calcVectorOfSubMatrices, vecParams,
   4.386 +                       animatingPr );
   4.387 +
   4.388 +         numOnCurrCore += 1;
   4.389 +         if( numOnCurrCore + leftOverFraction >= numToPutOntoEachCore - 1 )
   4.390 +          {
   4.391 +               //deal with fractional part, to ensure that imbalance is 1 max
   4.392 +               // IE, core with most has only 1 more than core with least
   4.393 +            leftOverFraction += numToPutOntoEachCore - numOnCurrCore;
   4.394 +            if( leftOverFraction >= 1 )
   4.395 +             { leftOverFraction -= 1;
   4.396 +               numOnCurrCore = -1;
   4.397 +             }
   4.398 +            else
   4.399 +             { numOnCurrCore = 0;
   4.400 +             }
   4.401 +               //Move to next core, max core-value to incr to is numCores -1
   4.402 +            if( currCore >= numCores -1 )
   4.403 +             { currCore = 0;
   4.404 +             }
   4.405 +            else
   4.406 +             { currCore += 1;
   4.407 +             }
   4.408 +          }
   4.409 +       }
   4.410 +    }
   4.411 +   
   4.412 +   //Free Note: vector of sub-matrices does its own free-ing, even vec-params
   4.413 +
   4.414 +//TODO: timeToSpawnProbe = VMS__get_probe_by_name( "timeToSpawnProbe" );
   4.415 +//      VMS__end_interval_on_probe( timeToSpawnProbe );
   4.416 +
   4.417 +   VCilk__sync( animatingPr );
   4.418 +
   4.419 +   //free the sub-matrices in Fn that called this one
   4.420 + }
   4.421 +
   4.422 +
   4.423 +/*Walk through the two slice-strucs, making sub-matrix strucs as go
   4.424 + */
   4.425 +SubMatrix **
   4.426 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   4.427 +                   Matrix *origMatrix, VirtProcr *animPr )
   4.428 + {
   4.429 +   int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
   4.430 +   int32 startRow, endRow, startCol, endCol;
   4.431 +   int32 *rowStartVals, *colStartVals;
   4.432 +   int32 rowOffset;
   4.433 +   SubMatrix **subMatrices, *newSubMatrix;
   4.434 +
   4.435 +   numRowIdxs = rowSlices->numVals;
   4.436 +   numColIdxs = colSlices->numVals;
   4.437 +
   4.438 +   rowStartVals = rowSlices->startVals;
   4.439 +   colStartVals = colSlices->startVals;
   4.440 +
   4.441 +   subMatrices = VCilk__malloc( numRowIdxs * numColIdxs *sizeof(SubMatrix *),
   4.442 +                                animPr );
   4.443 +
   4.444 +   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   4.445 +    {
   4.446 +      rowOffset = rowIdx * numColIdxs;
   4.447 +      
   4.448 +      startRow  = rowStartVals[rowIdx];
   4.449 +      endRow    = rowStartVals[rowIdx + 1] -1; //"fake" start above last is
   4.450 +                                               // at last valid idx + 1 & is
   4.451 +                                               // 1 greater than end value
   4.452 +      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
   4.453 +       {
   4.454 +         startCol = colStartVals[colIdx];
   4.455 +         endCol   = colStartVals[colIdx + 1] -1;
   4.456 +
   4.457 +         newSubMatrix = VCilk__malloc( sizeof(SubMatrix), animPr );
   4.458 +         newSubMatrix->numRows       = endRow - startRow +1;
   4.459 +         newSubMatrix->numCols       = endCol - startCol +1;
   4.460 +         newSubMatrix->origMatrix    = origMatrix;
   4.461 +         newSubMatrix->origStartRow  = startRow;
   4.462 +         newSubMatrix->origStartCol  = startCol;
   4.463 +         newSubMatrix->alreadyCopied = FALSE;
   4.464 +
   4.465 +         subMatrices[ rowOffset + colIdx ] = newSubMatrix;
   4.466 +       }
   4.467 +    }
   4.468 +   return subMatrices;
   4.469 + }
   4.470 +
   4.471 +void
   4.472 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   4.473 +                 SubMatrix **subMatrices, VirtProcr *animPr )
   4.474 + {
   4.475 +   int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset;
   4.476 +   SubMatrix *subMatrix;
   4.477 +
   4.478 +   numRowIdxs = rowSlices->numVals;
   4.479 +   numColIdxs = colSlices->numVals;
   4.480 +
   4.481 +   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   4.482 +    {
   4.483 +      rowOffset = rowIdx * numColIdxs;
   4.484 +      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
   4.485 +       {
   4.486 +         subMatrix = subMatrices[ rowOffset + colIdx ];
   4.487 +         if( subMatrix->alreadyCopied )
   4.488 +            VCilk__free( subMatrix->array, animPr );
   4.489 +         VCilk__free( subMatrix, animPr );
   4.490 +       }
   4.491 +    }
   4.492 +   VCilk__free( subMatrices, animPr );
   4.493 + }
   4.494 +
   4.495 +
   4.496 +
   4.497 +SlicingStruc *
   4.498 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
   4.499 +                  VirtProcr *animPr )
   4.500 + { float32 residualAcc = 0;
   4.501 +   int     numSlices, i, *startVals, sizeOfSlice, endCondition;
   4.502 +   SlicingStruc *slicingStruc = VCilk__malloc( sizeof(SlicingStruc), animPr);
   4.503 +
   4.504 +      //calc size of matrix need to hold start vals --
   4.505 +   numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
   4.506 +
   4.507 +   startVals = VCilk__malloc( (numSlices + 1) * sizeof(int32), animPr );
   4.508 +
   4.509 +      //Calc the upper limit of start value -- when get above this, end loop
   4.510 +      // by saving highest value of the matrix dimension to access, plus 1
   4.511 +      // as the start point of the imaginary slice following the last one
   4.512 +      //Plus 1 because go up to value but not include when process last slice
   4.513 +      //The stopping condition is half-a-size less than highest value because
   4.514 +      // don't want any pieces smaller than half the ideal size -- just tack
   4.515 +      // little ones onto end of last one
   4.516 +   endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size
   4.517 +   for( i = 0; startVal <= endVal; i++ )
   4.518 +    {
   4.519 +      startVals[i] = startVal;
   4.520 +      residualAcc += idealSizeOfSide;
   4.521 +      sizeOfSlice  = (int)residualAcc;
   4.522 +      residualAcc -= (float32)sizeOfSlice;
   4.523 +      startVal    += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..
   4.524 +
   4.525 +      if( startVal > endCondition )
   4.526 +       { startVal = endVal + 1;
   4.527 +         startVals[ i + 1 ] = startVal;
   4.528 +       }
   4.529 +    }
   4.530 +
   4.531 +   slicingStruc->startVals = startVals;
   4.532 +   slicingStruc->numVals   = i;  //loop incr'd, so == last valid start idx+1
   4.533 +                                 // which means is num sub-matrices in dim
   4.534 +                                 // also == idx of the fake start just above
   4.535 +   return slicingStruc;
   4.536 + }
   4.537 +
   4.538 +void
   4.539 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr )
   4.540 + {
   4.541 +   VCilk__free( slicingStruc->startVals, animPr );
   4.542 +   VCilk__free( slicingStruc, animPr );
   4.543 + }
   4.544 +
   4.545 +
   4.546 +int inline
   4.547 +measureMatrixMultPrimitive( VirtProcr *animPr )
   4.548 + {
   4.549 +   int r, c, v, numCycles;
   4.550 +   float32 *res, *left, *right;
   4.551 +
   4.552 +      //setup inputs
   4.553 +   left  = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr );
   4.554 +   right = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr );
   4.555 +   res   = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr );
   4.556 +
   4.557 +   for( r = 0; r < 5; r++ )
   4.558 +    {
   4.559 +      for( c = 0; c < 5; c++ )
   4.560 +       {
   4.561 +         left[  r * 5 + c ] = r;
   4.562 +         right[ r * 5 + c ] = c;
   4.563 +       }
   4.564 +    }
   4.565 +
   4.566 +      //do primitive
   4.567 +   VCilk__start_primitive();  //for now, just takes time stamp
   4.568 +   for( r = 0; r < 5; r++ )
   4.569 +    {
   4.570 +      for( c = 0; c < 5; c++ )
   4.571 +       {
   4.572 +         for( v = 0; v < 5; v++ )
   4.573 +          {
   4.574 +            res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ];
   4.575 +          }
   4.576 +       }
   4.577 +    }
   4.578 +   numCycles =
   4.579 +      VCilk__end_primitive_and_give_cycles(); 
   4.580 +
   4.581 +   VCilk__free( left, animPr );
   4.582 +   VCilk__free( right, animPr );
   4.583 +   VCilk__free( res, animPr );
   4.584 +   
   4.585 +   return numCycles;
   4.586 + }

     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/src/Application/VCilk__Matrix_Mult/EntryPoint.c	Sat Oct 30 20:43:49 2010 -0700
     5.3 @@ -0,0 +1,58 @@
     5.4 +/*
     5.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     5.6 + *  Licensed under GNU General Public License version 2
     5.7 + *
     5.8 + * Author: seanhalle@yahoo.com
     5.9 + *
    5.10 + */
    5.11 +
    5.12 +#include <math.h>
    5.13 +
    5.14 +#include "VCilk__Matrix_Mult.h"
    5.15 +
    5.16 +
    5.17 +
    5.18 +/*Every VCilk system has an "entry point" function that creates the first
    5.19 + * processor, which starts the chain of creating more processors..
    5.20 + * eventually all of the processors will dissipate themselves, and
    5.21 + * return.
    5.22 + *
    5.23 + *This entry-point function follows the same pattern as all entry-point
    5.24 + * functions do:
    5.25 + *1) it creates the params for the seed processor, from the
    5.26 + *    parameters passed into the entry-point function
    5.27 + *2) it calls VCilk__create_seed_procr_and_do_work
    5.28 + *3) it gets the return value from the params struc, frees the params struc,
    5.29 + *    and returns the value from the function
    5.30 + *
    5.31 + */
    5.32 +Matrix *
    5.33 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
    5.34 + { Matrix          *resMatrix;
    5.35 +   DividerParams   *dividerParams;
    5.36 +   int32            numResRows, numResCols;
    5.37 +
    5.38 +
    5.39 +   dividerParams              = malloc( sizeof( DividerParams ) );
    5.40 +   dividerParams->leftMatrix  = leftMatrix;
    5.41 +   dividerParams->rightMatrix = rightMatrix;
    5.42 + 
    5.43 +   numResRows  = leftMatrix->numRows;
    5.44 +   numResCols  = rightMatrix->numCols;
    5.45 +   resMatrix            = malloc( sizeof(Matrix) );
    5.46 +   resMatrix->array     = malloc( numResRows * numResCols * sizeof(float32));
    5.47 +   resMatrix->numCols   = rightMatrix->numCols;
    5.48 +   resMatrix->numRows   = leftMatrix->numRows;
    5.49 +
    5.50 +
    5.51 +   dividerParams->resultMatrix   = resMatrix;
    5.52 +
    5.53 +      //create divider processor, start doing the work, and wait till done
    5.54 +      //This function is the "border crossing" between normal code and VCilk
    5.55 +   VCilk__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs,
    5.56 +                                         dividerParams );
    5.57 +   
    5.58 +      //get result matrix and return it
    5.59 +   free( dividerParams );
    5.60 +   return resMatrix;
    5.61 + }

     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h	Sat Oct 30 20:43:49 2010 -0700
     6.3 @@ -0,0 +1,103 @@
     6.4 +/*
     6.5 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     6.6 + *  Licensed under GNU General Public License version 2
     6.7 + */
     6.8 +
     6.9 +#ifndef _VCilk_MATRIX_MULT_H_
    6.10 +#define _VCilk_MATRIX_MULT_H_
    6.11 +
    6.12 +#include <stdio.h>
    6.13 +
    6.14 +#include "../../VCilk_lib/VCilk.h"
    6.15 +#include "../Matrix_Mult.h"
    6.16 +
    6.17 +
    6.18 +//===============================  Defines  ==============================
    6.19 +#define ROWS_IN_BLOCK 32
    6.20 +#define COLS_IN_BLOCK 32
    6.21 +#define VEC_IN_BLOCK  32
    6.22 +
    6.23 +
    6.24 +//#define PRINT_DEBUG(msg) //printf(msg); fflush(stdin);
    6.25 +
    6.26 +//==============================  Structures  ==============================
    6.27 +typedef struct
    6.28 + {
    6.29 +   Matrix *leftMatrix;
    6.30 +   Matrix *rightMatrix;
    6.31 +   Matrix *resultMatrix;
    6.32 +   
    6.33 +   TSCount numTSCsToExe;
    6.34 + }
    6.35 +DividerParams;
    6.36 +
    6.37 +typedef struct
    6.38 + {
    6.39 +   VirtProcr *dividerPr;
    6.40 +   int numRows;
    6.41 +   int numCols;
    6.42 +   int numSubMatrixPairs;
    6.43 + }
    6.44 +ResultsParams;
    6.45 +
    6.46 +typedef
    6.47 +struct
    6.48 + { int32    numRows;
    6.49 +   int32    numCols;
    6.50 +   Matrix  *origMatrix;
    6.51 +   int32    origStartRow;
    6.52 +   int32    origStartCol;
    6.53 +   int32    alreadyCopied;
    6.54 +   float32 *array;  //2D, but dynamically sized, so use addr arith
    6.55 + }
    6.56 +SubMatrix;
    6.57 +
    6.58 +typedef struct
    6.59 + { VirtProcr *resultPr;
    6.60 +   SubMatrix *leftSubMatrix;
    6.61 +   SubMatrix *rightSubMatrix;
    6.62 +   float32   *resultArray;
    6.63 + }
    6.64 +SMPairParams;
    6.65 +
    6.66 +typedef
    6.67 +struct
    6.68 + { int32    numVals;
    6.69 +   int32   *startVals;
    6.70 + }
    6.71 +SlicingStruc;
    6.72 +
    6.73 +typedef
    6.74 +struct
    6.75 + {
    6.76 +   SlicingStruc *leftRowSlices;
    6.77 +   SlicingStruc *vecSlices;
    6.78 +   SlicingStruc *rightColSlices;
    6.79 + }
    6.80 +SlicingStrucCarrier;
    6.81 +
    6.82 +typedef struct
    6.83 + {
    6.84 +   int32 numVecIdxs;
    6.85 +   int32 numRightColIdxs;
    6.86 +   int32 leftRowIdxOffset;
    6.87 +   int32 resColIdx;
    6.88 +   SubMatrix **leftSubMatrices;
    6.89 +   SubMatrix **rightSubMatrices;
    6.90 +   float32 *resultArray;
    6.91 +   int32 coreToRunOn;
    6.92 + }
    6.93 +VecParams;
    6.94 +
    6.95 +//============================= Processor Functions =========================
    6.96 +void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr );
    6.97 +void calcSubMatrixProduct(              void *data, VirtProcr *animatingPr );
    6.98 +void calcVectorOfSubMatrices(     void *_vecParams, VirtProcr *animatingPr );
    6.99 +
   6.100 +
   6.101 +//================================ Entry Point ==============================
   6.102 +Matrix *
   6.103 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
   6.104 +
   6.105 +
   6.106 +#endif /*_VCilk_MATRIX_MULT_H_*/

     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/src/Application/VCilk__Matrix_Mult/Vector_Pr.c	Sat Oct 30 20:43:49 2010 -0700
     7.3 @@ -0,0 +1,121 @@
     7.4 +/*
     7.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     7.6 + *  Licensed under GNU General Public License version 2
     7.7 + *
     7.8 + * Author: seanhalle@yahoo.com
     7.9 + *
    7.10 + */
    7.11 +
    7.12 +
    7.13 +#include "VCilk__Matrix_Mult.h"
    7.14 +#include <math.h>
    7.15 +
    7.16 +
    7.17 +void inline
    7.18 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
    7.19 +                  int32    startRow,
    7.20 +                  int32    numRows,
    7.21 +                  int32    startCol,
    7.22 +                  int32    numCols,
    7.23 +                  int32    numOrigCols );
    7.24 +
    7.25 +
    7.26 +//===========================================================================
    7.27 +
    7.28 +void
    7.29 +calcVectorOfSubMatrices( void *_vecParams, VirtProcr *animPr )
    7.30 + { int32         numVecIdxs, leftRowIdxOffset, numRightColIdxs, resColIdx;
    7.31 +   SubMatrix   **leftSubMatrices, **rightSubMatrices;
    7.32 +   float32      *resultArray;
    7.33 +   int32         vecIdx, coreWithAffinity;
    7.34 +   SMPairParams *subMatrixPairParams, **vecOfSubMatrixParams;
    7.35 +   VecParams    *vecParams;
    7.36 +
    7.37 +   vecParams = (VecParams *)_vecParams;
    7.38 +
    7.39 +         int32 subMatrixVectorProbe =
    7.40 +                   VMS__create_single_interval_probe( "subMtxVect", animPr );
    7.41 +         VMS__record_sched_choice_into_probe( subMatrixVectorProbe, animPr );
    7.42 +         VMS__record_interval_start_in_probe( subMatrixVectorProbe );
    7.43 +
    7.44 +
    7.45 +   numVecIdxs       = vecParams->numVecIdxs;
    7.46 +   numRightColIdxs  = vecParams->numRightColIdxs;
    7.47 +   leftRowIdxOffset = vecParams->leftRowIdxOffset;
    7.48 +   resColIdx        = vecParams->resColIdx;
    7.49 +   leftSubMatrices  = vecParams->leftSubMatrices;
    7.50 +   rightSubMatrices = vecParams->rightSubMatrices;
    7.51 +   resultArray      = vecParams->resultArray;
    7.52 +   coreWithAffinity = vecParams->coreToRunOn;
    7.53 +         
    7.54 +   vecOfSubMatrixParams = VCilk__malloc( numVecIdxs * sizeof(SMPairParams *),
    7.55 +                                         animPr );
    7.56 +   if( vecOfSubMatrixParams == 0 ){printf("malloc error"); exit(1);}
    7.57 +
    7.58 +   for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
    7.59 +    {
    7.60 +         //Make the processor for the pair of sub-matrices
    7.61 +      subMatrixPairParams  = VCilk__malloc(sizeof(SMPairParams),animPr);
    7.62 +      subMatrixPairParams->leftSubMatrix  =
    7.63 +         leftSubMatrices[ leftRowIdxOffset + vecIdx ];
    7.64 +
    7.65 +      subMatrixPairParams->rightSubMatrix =
    7.66 +         rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
    7.67 +
    7.68 +      VCilk__spawn( coreWithAffinity,    &calcSubMatrixProduct,
    7.69 +                    subMatrixPairParams, animPr );
    7.70 +
    7.71 +      vecOfSubMatrixParams[ vecIdx ] = subMatrixPairParams;
    7.72 +    }
    7.73 +
    7.74 +   VCilk__sync( animPr );
    7.75 +
    7.76 +      //now accumulate individual result matrices into final result matrix
    7.77 +   for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
    7.78 +    {
    7.79 +      subMatrixPairParams = vecOfSubMatrixParams[ vecIdx ];
    7.80 +
    7.81 +      accumulateResult( resultArray, subMatrixPairParams->resultArray,
    7.82 +                        subMatrixPairParams->leftSubMatrix->origStartRow,
    7.83 +                        subMatrixPairParams->leftSubMatrix->numRows,
    7.84 +                        subMatrixPairParams->rightSubMatrix->origStartCol,
    7.85 +                        subMatrixPairParams->rightSubMatrix->numCols,
    7.86 +                   subMatrixPairParams->rightSubMatrix->origMatrix->numCols);
    7.87 +
    7.88 +         //Note, resultArray is made on the core that produces the results
    7.89 +         // that gives chance to set affinity so all in vector run on same
    7.90 +         // core and re-use that array, and prevents writes from causing
    7.91 +         // thrashing of the cache -- as long as array big enough, the copy
    7.92 +         // overhead is miniscule vs the size-of-side reuse of each byte
    7.93 +      VCilk__free( subMatrixPairParams->resultArray, animPr );
    7.94 +      VCilk__free( subMatrixPairParams, animPr );
    7.95 +    }
    7.96 +   VCilk__free( vecOfSubMatrixParams, animPr );
    7.97 +   VCilk__free( vecParams, animPr );
    7.98 +
    7.99 +         VMS__record_interval_end_in_probe( subMatrixVectorProbe );
   7.100 +
   7.101 +   VCilk__dissipate_procr( animPr );
   7.102 + }
   7.103 +
   7.104 +
   7.105 +
   7.106 +void inline
   7.107 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
   7.108 +                  int32    startRow,
   7.109 +                  int32    numRows,
   7.110 +                  int32    startCol,
   7.111 +                  int32    numCols,
   7.112 +                  int32    numOrigCols )
   7.113 + { int32 row, col;
   7.114 +
   7.115 +   for( row = 0; row < numRows; row++ )
   7.116 +    {
   7.117 +      for( col = 0; col < numCols; col++ )
   7.118 +       {
   7.119 +         resultArray[ (row + startRow) * numOrigCols + col + startCol ] +=
   7.120 +            subMatrixResultArray[ row * numCols + col ];
   7.121 +       }
   7.122 +    }
   7.123 +
   7.124 + }

     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c	Sat Oct 30 20:43:49 2010 -0700
     8.3 @@ -0,0 +1,269 @@
     8.4 +/* 
     8.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     8.6 + *  Licensed under GNU General Public License version 2
     8.7 + *
     8.8 + * Author: SeanHalle@yahoo.com
     8.9 + *
    8.10 + */
    8.11 +
    8.12 +#include "VCilk__Matrix_Mult.h"
    8.13 +
    8.14 +
    8.15 +void inline
    8.16 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
    8.17 +
    8.18 +void inline
    8.19 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
    8.20 +
    8.21 +void inline
    8.22 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
    8.23 +                     float32 *resArray,
    8.24 +                     int startRow,  int endRow,
    8.25 +                     int startCol,  int endCol,
    8.26 +                     int startVec,  int endVec,
    8.27 +                     int resStride, int inpStride );
    8.28 +
    8.29 +void inline
    8.30 +multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols,
    8.31 +                      float32 *leftArray, float32 *rightArray,
    8.32 +                      float32 *resArray );
    8.33 +
    8.34 +
    8.35 +/*A  processor is created with an environment that holds two matrices,
    8.36 + * the row and col that it owns, and the name of a result gathering
    8.37 + * processor.
    8.38 + *It calculates the product of two sub-portions of the input matrices
    8.39 + * by using Intel's mkl library for single-core.
    8.40 + *
    8.41 + *This demonstrates using optimized single-threaded code inside scheduled
    8.42 + * work-units.
    8.43 + *
    8.44 + *When done, it sends the result to the result processor
    8.45 + */
    8.46 +void
    8.47 +calcSubMatrixProduct( void *data, VirtProcr *animatingPr )
    8.48 + { 
    8.49 +   SMPairParams   *params;
    8.50 +   VirtProcr      *resultPr;
    8.51 +   float32        *leftArray,  *rightArray, *resArray;
    8.52 +   SubMatrix      *leftSubMatrix, *rightSubMatrix;
    8.53 +
    8.54 +         PRINT_DEBUG("start sub-matrix mult\n")
    8.55 +         int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx",
    8.56 +                                                                animatingPr);
    8.57 +         VMS__record_sched_choice_into_probe( subMatrixProbe, animatingPr );
    8.58 +         VMS__record_interval_start_in_probe( subMatrixProbe );
    8.59 +
    8.60 +   params         = (SMPairParams *)data;
    8.61 +//   resultPr       = params->resultPr;
    8.62 +   leftSubMatrix  = params->leftSubMatrix;
    8.63 +   rightSubMatrix = params->rightSubMatrix;
    8.64 +
    8.65 +      //make sure the input sub-matrices have been copied out of orig
    8.66 +   copyFromOrig( leftSubMatrix, animatingPr );
    8.67 +   copyTransposeFromOrig( rightSubMatrix, animatingPr );
    8.68 +   
    8.69 +   leftArray      = leftSubMatrix->array;
    8.70 +   rightArray     = rightSubMatrix->array;
    8.71 +
    8.72 +      //make this array here, on the core that computes the results
    8.73 +      // with Cilk's semantics, have to have separate result array for each
    8.74 +      // spawned processor -- unless want to change the spawn and sync
    8.75 +      // pattern, such that spawn one from each vector, then sync, then
    8.76 +      // another, and so forth -- this will cause idle time due to imbalance
    8.77 +      // in matrix sizes
    8.78 +      //This also gives chance to set affinity so all in vector run on same
    8.79 +      // core and re-use the accumulation array,
    8.80 +      //As a side-benefit, it also prevents writes from causing
    8.81 +      // thrashing of the cache -- as long as array big enough, the copy
    8.82 +      // overhead is small because each byte is reused size-of-side times
    8.83 +      //This is freed in the vector processor
    8.84 +   resArray = VCilk__malloc(leftSubMatrix->numRows * rightSubMatrix->numCols*
    8.85 +                           sizeof( float32 ), animatingPr );
    8.86 +
    8.87 +
    8.88 +   int32 numResRows, numResCols, vectLength;
    8.89 +   
    8.90 +   vectLength = leftSubMatrix->numCols;
    8.91 +   numResRows = leftSubMatrix->numRows;
    8.92 +   numResCols = rightSubMatrix->numCols;
    8.93 +
    8.94 +   multiplyMatrixArrays( vectLength, numResRows, numResCols,
    8.95 +                         leftArray, rightArray,
    8.96 +                         resArray );
    8.97 +
    8.98 +   //send result by side-effect
    8.99 +   params->resultArray = resArray;
   8.100 +
   8.101 +         VMS__record_interval_end_in_probe( subMatrixProbe );
   8.102 +         
   8.103 +   VCilk__dissipate_procr( animatingPr );
   8.104 + }
   8.105 +
   8.106 +
   8.107 +/*Divides into 32x32 sub-matrices, 3 of which fit into 32KB L1 cache
   8.108 + * Would be nice to embed this within another level that divided into
   8.109 + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
   8.110 + *
   8.111 + *Eventually want these divisions to be automatic, using DKU pattern
   8.112 + * embedded into VCilk, and with VMS controlling the divisions according to
   8.113 + * the cache sizes, which it knows about.
   8.114 + *And, want VMS to work with language to split among main-mems, so a socket
   8.115 + * only cranks on data in its local segment of main mem
   8.116 + *
   8.117 + */
   8.118 +void inline
   8.119 +multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols,
   8.120 +                      float32 *leftArray, float32 *rightArray,
   8.121 +                      float32 *resArray )
   8.122 + {
   8.123 +   int resStride, inpStride;
   8.124 +   int startRow, startCol, endRow, endCol, startVec, endVec;
   8.125 +
   8.126 +   resStride  = numResCols;
   8.127 +   inpStride  = vecLength;
   8.128 +
   8.129 +   for( startRow = 0; startRow < numResRows; )
   8.130 +    {
   8.131 +      endRow = startRow + ROWS_IN_BLOCK;
   8.132 +      if( endRow > numResRows ) endRow = numResRows;
   8.133 +
   8.134 +      for( startCol = 0; startCol < numResCols; )
   8.135 +       {
   8.136 +         endCol   = startCol + COLS_IN_BLOCK;
   8.137 +         if( endCol > numResCols ) endCol = numResCols;
   8.138 +
   8.139 +         for( startVec = 0; startVec < vecLength; )
   8.140 +          {
   8.141 +            endVec   = startVec + VEC_IN_BLOCK;
   8.142 +            if( endVec > vecLength ) endVec = vecLength;
   8.143 +
   8.144 +               //By having the "vector" of sub-blocks in a sub-block slice
   8.145 +               // be marched down in inner loop, are re-using the result
   8.146 +               // matrix, which stays in L1 cache -- can only re-use one of
   8.147 +               // the three, so this is the most important -- avoids writing
   8.148 +               // dirty blocks until those result-locations fully done
   8.149 +               //Row and Col is position in result matrix -- so row and vec
   8.150 +               // for left array, then vec and col for right array
   8.151 +            multiplySubBlocksTransposed( leftArray, rightArray,
   8.152 +                                         resArray,
   8.153 +                                         startRow,  endRow,
   8.154 +                                         startCol,  endCol,
   8.155 +                                         startVec,  endVec,
   8.156 +                                         resStride, inpStride );
   8.157 +            startVec = endVec;
   8.158 +          }
   8.159 +         startCol = endCol;
   8.160 +       }
   8.161 +      startRow = endRow;
   8.162 +    }
   8.163 + }
   8.164 +
   8.165 +
   8.166 +void inline
   8.167 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
   8.168 +                     float32 *resArray,
   8.169 +                     int startRow,  int endRow,
   8.170 +                     int startCol,  int endCol,
   8.171 +                     int startVec,  int endVec,
   8.172 +                     int resStride, int inpStride )
   8.173 + {
   8.174 +   int row,    col,        vec;
   8.175 +   int leftOffset, rightOffset;
   8.176 +   float32 result;
   8.177 +   
   8.178 +   for( row = startRow; row < endRow; row++ )
   8.179 +    { 
   8.180 +      for( col = startCol; col < endCol; col++ )
   8.181 +       { 
   8.182 +         leftOffset  = row * inpStride;//left & right inp strides always same
   8.183 +         rightOffset = col * inpStride;// because right is transposed
   8.184 +         result = 0;
   8.185 +         for( vec = startVec; vec < endVec; vec++ )
   8.186 +          {
   8.187 +            result +=
   8.188 +               leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
   8.189 +          }
   8.190 +         
   8.191 +         resArray[ row * resStride + col ] += result;
   8.192 +       }
   8.193 +    }
   8.194 + }
   8.195 +
   8.196 +/*Note: don't do the copy when create, because create on a different core --
   8.197 + * do the copy on the core that will do the calcs, then try to keep work on
   8.198 + * that core that reuses the array data
   8.199 + */
   8.200 +void inline
   8.201 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   8.202 + { int numCols, numRows, origStartRow, origStartCol, origStride, stride;
   8.203 +   Matrix *origMatrix;
   8.204 +   float32 *origArray, *subArray;
   8.205 +
   8.206 +   if( subMatrix->alreadyCopied ) return;
   8.207 +
   8.208 +   subMatrix->alreadyCopied = TRUE;
   8.209 +
   8.210 +   origMatrix   = subMatrix->origMatrix;
   8.211 +   origArray     = origMatrix->array;
   8.212 +   numCols      = subMatrix->numCols;
   8.213 +   numRows      = subMatrix->numRows;
   8.214 +   stride       = numRows;
   8.215 +   origStartRow = subMatrix->origStartRow;
   8.216 +   origStartCol = subMatrix->origStartCol;
   8.217 +   origStride   = origMatrix->numCols;
   8.218 +
   8.219 +      //This is free in Divide pr after all calcs are done
   8.220 +   subArray     = VCilk__malloc( numRows * numCols * sizeof(float32),animPr);
   8.221 +   subMatrix->array = subArray;
   8.222 +
   8.223 +      //copy values from orig matrix to local
   8.224 +   int row, col, origOffset;
   8.225 +   for( row = 0; row < numRows; row++ )
   8.226 +    {
   8.227 +      origOffset = (row + origStartRow) * origStride + origStartCol;
   8.228 +      for( col = 0; col < numCols; col++ )
   8.229 +       {
   8.230 +            //transpose means swap row & col -- traverse orig matrix normally
   8.231 +            // but put into reversed place in local array -- means the
   8.232 +            // stride is the num rows now, so col * numRows + row
   8.233 +         subArray[ col * stride + row ]  =  origArray[ origOffset + col ];
   8.234 +       }      
   8.235 +    }
   8.236 + }
   8.237 +
   8.238 +void inline
   8.239 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   8.240 + { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
   8.241 +   Matrix *origMatrix;
   8.242 +   float32 *origArray, *subArray;
   8.243 +
   8.244 +   if( subMatrix->alreadyCopied ) return;
   8.245 +
   8.246 +   subMatrix->alreadyCopied = TRUE;
   8.247 +
   8.248 +   origMatrix    = subMatrix->origMatrix;
   8.249 +   origArray     = origMatrix->array;
   8.250 +   numCols       = subMatrix->numCols;
   8.251 +   numRows       = subMatrix->numRows;
   8.252 +   origStartRow  = subMatrix->origStartRow;
   8.253 +   origStartCol  = subMatrix->origStartCol;
   8.254 +   stride        = numCols;
   8.255 +   origStride    = origMatrix->numCols;
   8.256 +
   8.257 +      //This is freed in Divide pr after all calcs are done
   8.258 +   subArray      = VCilk__malloc( numRows * numCols *sizeof(float32),animPr);
   8.259 +   subMatrix->array = subArray;
   8.260 +
   8.261 +      //copy values from orig matrix to local
   8.262 +   int row, col, offset, origOffset;
   8.263 +   for( row = 0; row < numRows; row++ )
   8.264 +    {
   8.265 +      offset     = row * stride;
   8.266 +      origOffset = (row + origStartRow) * origStride + origStartCol;
   8.267 +      for( col = 0; col < numCols; col++ )
   8.268 +       {
   8.269 +         subArray[ offset + col ]  =  origArray[ origOffset + col ];
   8.270 +       }
   8.271 +    }
   8.272 + }

     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/src/Application/main.c	Sat Oct 30 20:43:49 2010 -0700
     9.3 @@ -0,0 +1,34 @@
     9.4 +/*
     9.5 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     9.6 + *  Licensed under GNU General Public License version 2
     9.7 + *
     9.8 + * author seanhalle@yahoo.com
     9.9 + */
    9.10 +
    9.11 +#include <malloc.h>
    9.12 +#include <stdlib.h>
    9.13 +
    9.14 +#include "Matrix_Mult.h"
    9.15 +#include "VCilk__Matrix_Mult/VCilk__Matrix_Mult.h"
    9.16 +
    9.17 +/**
    9.18 + *Matrix multiply program written using VMS_HW piggy-back language
    9.19 + * 
    9.20 + */
    9.21 +int main( int argc, char **argv )
    9.22 + { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
    9.23 +   ParamBag    *paramBag;
    9.24 +   
    9.25 +   paramBag = makeParamBag();
    9.26 +   readParamFileIntoBag( argv[1], paramBag );
    9.27 +   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
    9.28 +   
    9.29 +   resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix );
    9.30 +
    9.31 +   printf("\nresult matrix: \n");
    9.32 +   printMatrix( resultMatrix );
    9.33 +   
    9.34 +//   VCilk__print_stats();
    9.35 +   
    9.36 +   exit(0); //cleans up
    9.37 + }