changeset 3:bf7331ed394e tip

Working version of blocked matrix mult -- same as SSR, VCilk & VPThread
author Me
date Wed, 10 Nov 2010 06:07:54 -0800
parents ec0629f70ee5
children
files src/Application/CILK__Matrix_Mult/CILK__Matrix_Mult.h src/Application/CILK__Matrix_Mult/Divide_Pr.cilk src/Application/CILK__Matrix_Mult/EntryPoint.cilk src/Application/CILK__Matrix_Mult/Vector_Pr.cilk src/Application/CILK__Matrix_Mult/subMatrix_Pr.cilk src/Application/Makefile src/Application/Matrix_Mult.c src/Application/Matrix_Mult.h src/Application/main.cilk
diffstat 9 files changed, 1199 insertions(+), 206 deletions(-) [+]
line diff
     1.1 --- a/src/Application/CILK__Matrix_Mult/CILK__Matrix_Mult.h	Tue Oct 26 19:34:03 2010 -0700
     1.2 +++ b/src/Application/CILK__Matrix_Mult/CILK__Matrix_Mult.h	Wed Nov 10 06:07:54 2010 -0800
     1.3 @@ -3,14 +3,23 @@
     1.4   *  Licensed under GNU General Public License version 2
     1.5   */
     1.6  
     1.7 -#ifndef _VPThread__MATRIX_MULT_H_
     1.8 -#define _VPThread__MATRIX_MULT_H_
     1.9 +#ifndef _Cilk__MATRIX_MULT_H_
    1.10 +#define _Cilk__MATRIX_MULT_H_
    1.11  
    1.12  #include <stdio.h>
    1.13  
    1.14  #include "VMS_primitive_data_types.h"
    1.15  #include "../Matrix_Mult.h"
    1.16  
    1.17 +//===============================  Defines  ==============================
    1.18 +#define ROWS_IN_BLOCK 32
    1.19 +#define COLS_IN_BLOCK 32
    1.20 +#define VEC_IN_BLOCK  32
    1.21 +
    1.22 +#define copyMatrixSingleton 1
    1.23 +#define copyTransposeSingleton 2
    1.24 +
    1.25 +
    1.26  //==============================  Structures  ==============================
    1.27  typedef struct
    1.28   {
    1.29 @@ -20,44 +29,55 @@
    1.30   }
    1.31  DividerParams;
    1.32  
    1.33 -typedef struct
    1.34 - {
    1.35 -   int numRows;
    1.36 -   int numCols;
    1.37 +typedef
    1.38 +struct
    1.39 + { int32    numRows;
    1.40 +   int32    numCols;
    1.41 +   Matrix  *origMatrix;
    1.42 +   int32    origStartRow;
    1.43 +   int32    origStartCol;
    1.44 +   int32    alreadyCopied;
    1.45 +   float32 *array;  //2D, but dynamically sized, so use addr arith
    1.46   }
    1.47 -ResultsParams;
    1.48 +SubMatrix;
    1.49  
    1.50  typedef struct
    1.51   { 
    1.52 -   int        myCol;
    1.53 -   int        myRow;
    1.54 -   int        vectLength;
    1.55 -   Matrix    *leftMatrix;
    1.56 -   Matrix    *rightMatrix;
    1.57 -   float32    result;
    1.58 +   SubMatrix *leftSubMatrix;
    1.59 +   SubMatrix *rightSubMatrix;
    1.60 +   float32   *partialResultArray;
    1.61   }
    1.62 -VectorParams;
    1.63 +SMPairParams;
    1.64 +
    1.65 +typedef
    1.66 +struct
    1.67 + { int32    numVals;
    1.68 +   int32   *startVals;
    1.69 + }
    1.70 +SlicingStruc;
    1.71 +
    1.72 +typedef
    1.73 +struct
    1.74 + {
    1.75 +   SlicingStruc *leftRowSlices;
    1.76 +   SlicingStruc *vecSlices;
    1.77 +   SlicingStruc *rightColSlices;
    1.78 + }
    1.79 +SlicingStrucCarrier;
    1.80  
    1.81  typedef struct
    1.82   {
    1.83 -      //for communicating vector results to results Thd
    1.84 -   int32         vector_mutex;
    1.85 -   int32         vector_cond;
    1.86 -   VectorParams *currVector;
    1.87 -
    1.88 -      //for communicating results array back to seed (divider) Thd
    1.89 -   int32         results_mutex;
    1.90 -   int32         results_cond;
    1.91 -   float32      *results;
    1.92 -
    1.93 -      //for ensuring results thd has vector lock before making vector thds
    1.94 -   int32         start_mutex;
    1.95 -   int32         start_cond;
    1.96 -
    1.97 -   Matrix *rightMatrix;
    1.98 -   Matrix *resultMatrix;
    1.99 +   int32 numVecIdxs;
   1.100 +   int32 numRightColIdxs;
   1.101 +   int32 leftRowIdxOffset;
   1.102 +   int32 resColIdx;
   1.103 +   SubMatrix **leftSubMatrices;
   1.104 +   SubMatrix **rightSubMatrices;
   1.105 +   float32 *resultArray;
   1.106 +   int32 coreToRunOn;
   1.107 +   int32 vecID;
   1.108   }
   1.109 -MatrixMultGlobals;
   1.110 +VecParams;
   1.111  
   1.112  
   1.113  //============================= Processor Functions =========================
   1.114 @@ -67,8 +87,9 @@
   1.115  
   1.116  
   1.117  //================================ Entry Point ==============================
   1.118 -//cilk Matrix *\
   1.119 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
   1.120 +//cilk 
   1.121 +//Matrix *
   1.122 +//multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
   1.123  
   1.124  
   1.125  #endif /*_VPThread__MATRIX_MULT_H_*/
     2.1 --- a/src/Application/CILK__Matrix_Mult/Divide_Pr.cilk	Tue Oct 26 19:34:03 2010 -0700
     2.2 +++ b/src/Application/CILK__Matrix_Mult/Divide_Pr.cilk	Wed Nov 10 06:07:54 2010 -0800
     2.3 @@ -1,72 +1,610 @@
     2.4 -/*
     2.5 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
     2.6 - *  Licensed under GNU General Public License version 2
     2.7 - *
     2.8 - * Author: seanhalle@yahoo.com
     2.9 - *
    2.10 - */
    2.11 -
    2.12 -
    2.13 -#include "CILK__Matrix_Mult.h"
    2.14 -
    2.15 -cilk float32 calcVector( void * );
    2.16 -
    2.17 -/*Divider creates one processor for every row-col pair.
    2.18 - * It hands them:
    2.19 +/*
    2.20 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
    2.21 + *  Licensed under GNU General Public License version 2
    2.22 + *
    2.23 + * Author: seanhalle@yahoo.com
    2.24 + *
    2.25 + */
    2.26 +
    2.27 +
    2.28 +#include "CILK__Matrix_Mult.h"
    2.29 +#include "VMS_primitive_data_types.h"
    2.30 +
    2.31 +#include <math.h>
    2.32 +#include <sys/time.h>
    2.33 +#include <string.h>
    2.34 +#include <malloc.h>
    2.35 +
    2.36 +   //The time to compute this many result values should equal the time to
    2.37 +   // perform this division on a matrix of size gives that many result calcs
    2.38 +   //IE, size this so that sequential time to calc equals divide time
    2.39 +   // find the value by experimenting -- but divide time and calc time scale
    2.40 +   // same way, so this value should remain valid across hardware
    2.41 +   //Divide time is about 800us on 2.4Ghz core2Quad laptop core
    2.42 +   //num cells is the cube of a side, when have two square matrices
    2.43 +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 100000 /* about 46x46 */
    2.44 +
    2.45 +   //Cilk doesn't have VCilk's facilities, so define constants
    2.46 +#define MIN_WORK_UNIT_CYCLES 100000
    2.47 +#define IDEAL_NUM_WORK_UNITS 20
    2.48 +#define NUMBER_OF_CORES_TO_SPAWN_ONTO 4
    2.49 +
    2.50 +//===============================  External =================================
    2.51 +cilk void calcVectorOfSubMatrices( void *params );
    2.52 +
    2.53 +void inline
    2.54 +copyTranspose( int32 numRows, int32 numCols,
    2.55 +               int32 origStartRow, int32 origStartCol, int32 origStride,
    2.56 +               float32 *subArray, float32 *origArray );
    2.57 +
    2.58 +void inline
    2.59 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols,
    2.60 +                      float32 *leftArray, float32 *rightArray,
    2.61 +                      float32 *resArray );
    2.62 +
    2.63 +
    2.64 +void
    2.65 +startTimeInterval();
    2.66 +
    2.67 +void
    2.68 +endIntervalAndPrintTime();
    2.69 +
    2.70 +
    2.71 +//=============================  Within-File  ===============================
    2.72 +int inline
    2.73 +measureMatrixMultPrimitive();
    2.74 +
    2.75 +SlicingStrucCarrier *
    2.76 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix );
    2.77 +
    2.78 +SlicingStruc *
    2.79 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal );
    2.80 +
    2.81 +void
    2.82 +freeSlicingStruc( SlicingStruc *slicingStruc );
    2.83 +
    2.84 +SubMatrix **
    2.85 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    2.86 +                   Matrix *origMatrix, int32 transposeTheMatrix );
    2.87 +
    2.88 +void
    2.89 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    2.90 +                 SubMatrix **subMatrices );
    2.91 +
    2.92 +cilk void
    2.93 +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices,
    2.94 +                                    SubMatrix **rightSubMatrices,
    2.95 +                                    int32 numRowIdxs, int32 numColIdxs,
    2.96 +                                    int32 numVecIdxs,
    2.97 +                                    float32 *resultArray );
    2.98 +
    2.99 +cilk void
   2.100 +makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix,
   2.101 +            SlicingStrucCarrier *slicingStrucCarrier,
   2.102 +            float32 *resultArray );
   2.103 +
   2.104 +//===========================================================================
   2.105 +
   2.106 +/*Divider creates one processor for every sub-matrix
   2.107 + * It hands them:
   2.108   *  the name of the result processor that they should send their results to,
   2.109 - *  the left and right matrices, and the row and col they should multiply
   2.110 - *  the length of the vector
   2.111 - * It first creates the result processor, then all the vector processors,
   2.112 + *  the left and right matrices, and the rows and cols they should multiply
   2.113 + * It first creates the result processor, then all the sub-matrixPair
   2.114 + *  processors,
   2.115   *  then does a receive of a message from the result processor that gives
   2.116   *  the divider ownership of the result matrix.
   2.117 - * Finally, the divider returns the result matrix out of the VPThread system.
   2.118 - */
   2.119 -
   2.120 -cilk void divideIntoVectors( void *_dividerParams )
   2.121 - { 
   2.122 -   DividerParams     *dividerParams;
   2.123 -   VectorParams      *vectParams;
   2.124 -   Matrix            *leftMatrix, *rightMatrix, *resultMatrix;
   2.125 -   int32              numCells, numCols, mrow, mcol;
   2.126 -   float32           *resultMatrixArray;
   2.127 -
   2.128 -   dividerParams   = (DividerParams *)_dividerParams;
   2.129 -   
   2.130 -   leftMatrix      = dividerParams->leftMatrix;
   2.131 -   rightMatrix     = dividerParams->rightMatrix;
   2.132 -
   2.133 -   
   2.134 -   numCols = rightMatrix->numCols;
   2.135 -
   2.136 -   numCells  = leftMatrix->numRows * rightMatrix->numCols;
   2.137 -   resultMatrixArray = malloc( numCells * sizeof( float32 ) );
   2.138 -
   2.139 -
   2.140 -      //spawn vector calcs
   2.141 -   for( mrow = 0; mrow < leftMatrix->numRows; mrow++ )
   2.142 -    { for( mcol = 0; mcol < rightMatrix->numCols; mcol++ )
   2.143 -       {
   2.144 -         vectParams              = malloc( sizeof(VectorParams) );
   2.145 -         vectParams->myCol       = mcol;
   2.146 -         vectParams->myRow       = mrow;
   2.147 -         vectParams->vectLength  = leftMatrix->numCols;
   2.148 -         vectParams->leftMatrix  = leftMatrix;
   2.149 -         vectParams->rightMatrix = rightMatrix;
   2.150 -
   2.151 -
   2.152 -         resultMatrixArray[ mrow * numCols + mcol ] = spawn calcVector( vectParams );
   2.153 -       }
   2.154 -    }
   2.155 -
   2.156 -   sync;
   2.157 -
   2.158 -   
   2.159 -      //The results of the all the work have to be linked-to from the data
   2.160 -      // struc given to the seed procr -- this divide func is animated by
   2.161 -      // that seed procr, so have to link results to the _dividerParams.
   2.162 -   resultMatrix            = malloc( sizeof(Matrix) );
   2.163 -   resultMatrix->numCols   = rightMatrix->numCols;
   2.164 -   resultMatrix->numRows   = leftMatrix->numRows;
   2.165 -   dividerParams->resultMatrix   = resultMatrix;
   2.166 -   resultMatrix->matrix          = resultMatrixArray;
   2.167 + * Finally, the divider returns the result matrix out of the VCilk system.
   2.168 + *
   2.169 + * Divider chooses the size of sub-matrices via an algorithm that tries to
   2.170 + *  keep the minimum work above a threshold.  The threshold is machine-
   2.171 + *  dependent, so ask VCilk for min work-unit time to get a
   2.172 + *  given overhead
   2.173 + *
   2.174 + * Divide min work-unit cycles by measured-cycles for one matrix-cell
   2.175 + *  product -- gives the number of products need to have in min size
   2.176 + *  matrix.
   2.177 + *
   2.178 + * So then, take cubed root of this to get the size of a side of min sub-
   2.179 + *  matrix.  That is the size of the ideal square sub-matrix -- so tile
   2.180 + *  up the two input matrices into ones as close as possible to that size,
   2.181 + *  and create the pairs of sub-matrices.
   2.182 + *
   2.183 + *========================  STRATEGIC OVERVIEW  =======================
   2.184 + *
   2.185 + *This division is a bit tricky, because have to create things in advance
   2.186 + * that it's not at first obvious need to be created..
   2.187 + *
   2.188 + *First slice up each dimension -- three of them..  this is because will have
   2.189 + * to create the sub-matrix's data-structures before pairing the sub-matrices
   2.190 + * with each other -- so, have three dimensions to slice up before can
   2.191 + * create the sub-matrix data-strucs -- also, have to be certain that the
   2.192 + * cols of the left input have the exact same slicing as the rows of the
   2.193 + * left matrix, so just to be sure, do the slicing calc once, then use it
   2.194 + * for both.
   2.195 + *
   2.196 + *So, goes like this:
   2.197 + *1) calculate the start & end values of each dimension in each matrix.
   2.198 + *2) use those values to create sub-matrix structures
   2.199 + *3) combine sub-matrices into pairs, as the tasks to perform.
   2.200 + *
   2.201 + *Have to calculate separately from creating the sub-matrices because of the
   2.202 + * nature of the nesting -- would either end up creating the same sub-matrix
   2.203 + * multiple times, or else would have to put in detection of whether had
   2.204 + * made a particular one already if tried to combine steps 1 and 2.
   2.205 + *
   2.206 + *Step 3 has to be separate because of the nesting, as well -- same reason,
   2.207 + * would either create same sub-matrix multiple times, or else have to
   2.208 + * add detection of whether was already created.
   2.209 + *
   2.210 + *Another way to look at it: there's one level of loop to divide dimensions,
   2.211 + * two levels of nesting to create sub-matrices, and three levels to pair
   2.212 + * up the sub-matrices.
   2.213 + */
   2.214 +
   2.215 +cilk void
   2.216 +divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams )
   2.217 + { 
   2.218 +   DividerParams   *dividerParams;
   2.219 +   Matrix          *leftMatrix, *rightMatrix;
   2.220 +   
   2.221 +   SlicingStrucCarrier *slicingStrucCarrier;
   2.222 +   float32         *resultArray; //points to array to be put inside result
   2.223 +                                 // matrix
   2.224 +   int32 numResRows, numResCols, vectLength;
   2.225 +   
   2.226 +
   2.227 +
   2.228 +   startTimeInterval();
   2.229 +
   2.230 +
   2.231 +   //=========== Setup -- make local copies of ptd-to-things, malloc, aso
   2.232 +
   2.233 +//         printf("\nin divider\n");
   2.234 +   dividerParams   = (DividerParams *)_dividerParams;
   2.235 +   
   2.236 +   leftMatrix      = dividerParams->leftMatrix;
   2.237 +   rightMatrix     = dividerParams->rightMatrix;
   2.238 +
   2.239 +   vectLength  = leftMatrix->numCols;
   2.240 +   numResRows  = leftMatrix->numRows;
   2.241 +   numResCols  = rightMatrix->numCols;
   2.242 +   resultArray = dividerParams->resultMatrix->array;
   2.243 +   
   2.244 +      //zero the result array
   2.245 +   memset( resultArray, 0, numResRows * numResCols * sizeof(float32) );
   2.246 +
   2.247 +   
   2.248 +   //==============  Do either sequential mult or do division ==============
   2.249 +
   2.250 +      //Check if input matrices too small -- if yes, just do sequential
   2.251 +      //Cutoff is determined by overhead of this divider -- relatively
   2.252 +      // machine-independent
   2.253 +   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
   2.254 +       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
   2.255 +    { int32 vectLength;
   2.256 +
   2.257 +      //====== Do sequential multiply on a single core
   2.258 +
   2.259 +         //transpose the right matrix
   2.260 +      float32 *
   2.261 +      transRightArray  = malloc( rightMatrix->numRows *
   2.262 +                                        rightMatrix->numCols *
   2.263 +                                        sizeof(float32) );
   2.264 +
   2.265 +         //copy values from orig matrix to local
   2.266 +      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
   2.267 +                     0, 0, rightMatrix->numRows,
   2.268 +                     transRightArray, rightMatrix->array );
   2.269 +
   2.270 +      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
   2.271 +                            leftMatrix->array, transRightArray,
   2.272 +                            resultArray );
   2.273 +    }
   2.274 +   else
   2.275 +    {
   2.276 +      //====== Do parallel multiply across cores
   2.277 +
   2.278 +         //Calc the ideal size of sub-matrix and slice up the dimensions of
   2.279 +         // the two matrices.
   2.280 +         //The ideal size is the one takes the number of cycles to calculate
   2.281 +         // such that calc time is equal or greater than min work-unit size
   2.282 +      slicingStrucCarrier =
   2.283 +         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix );
   2.284 +
   2.285 +
   2.286 +         //Make the sub-matrices, and pair them up, then spawn processors to
   2.287 +         // calc product of each pair.
   2.288 +//            printf("first spawn\n");
   2.289 +      spawn makeSubMatricesAndSpawnAndSync( leftMatrix, rightMatrix,
   2.290 +                                      slicingStrucCarrier,
   2.291 +                                      resultArray );
   2.292 +      sync;
   2.293 +         //The result array will get filled in by the spawned children
   2.294 +    }
   2.295 +
   2.296 +
   2.297 +   //===============  Work done -- send results back =================
   2.298 +
   2.299 +
   2.300 +   endIntervalAndPrintTime();
   2.301 +
   2.302 +//         printf("done with divider\n");
   2.303 +
   2.304 +      //results sent back by side-effect
   2.305 +}
   2.306 +
   2.307 +
   2.308 +cilk void
   2.309 +makeSubMatricesAndSpawnAndSync( Matrix  *leftMatrix,  Matrix    *rightMatrix,
   2.310 +                         SlicingStrucCarrier *slicingStrucCarrier,
   2.311 +                         float32 *resultArray )
   2.312 + {
   2.313 +   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   2.314 +   SubMatrix **leftSubMatrices, **rightSubMatrices;
   2.315 +   int32 numRowIdxs, numColIdxs, numVecIdxs;
   2.316 +   
   2.317 +   leftRowSlices  = slicingStrucCarrier->leftRowSlices;
   2.318 +   vecSlices      = slicingStrucCarrier->vecSlices;
   2.319 +   rightColSlices = slicingStrucCarrier->rightColSlices;
   2.320 +   free( slicingStrucCarrier );
   2.321 +   
   2.322 +   //================  Make sub-matrices, given the slicing  ================
   2.323 +//         printf("about to create submatrices\n");
   2.324 +   leftSubMatrices =
   2.325 +      createSubMatrices( leftRowSlices, vecSlices,
   2.326 +                         leftMatrix, FALSE );
   2.327 +   rightSubMatrices =
   2.328 +      createSubMatrices( vecSlices, rightColSlices,
   2.329 +                         rightMatrix, TRUE );
   2.330 +
   2.331 +   //==============  pair the sub-matrices and make processors ==============
   2.332 +
   2.333 +   numRowIdxs = leftRowSlices->numVals;
   2.334 +   numColIdxs = rightColSlices->numVals;
   2.335 +   numVecIdxs = vecSlices->numVals;
   2.336 +//         printf("about to spawn %d, %d, %d\n", numRowIdxs, numColIdxs, numVecIdxs);
   2.337 +   spawn pairUpSubMatricesAndSpawnAndSync( leftSubMatrices, rightSubMatrices,
   2.338 +                                     numRowIdxs, numColIdxs, numVecIdxs,
   2.339 +                                     resultArray );
   2.340 +   sync;
   2.341 +//         printf("done with sub matrices spawn and sync\n");
   2.342 +   freeSubMatrices( leftRowSlices, vecSlices,  leftSubMatrices );
   2.343 +   freeSubMatrices( vecSlices, rightColSlices, rightSubMatrices );
   2.344 +
   2.345 +//         printf("done freeing sub matrices\n");
   2.346 +      //It syncs inside, so know all work is done now: free the sub-matrices
   2.347 +   freeSlicingStruc( leftRowSlices );
   2.348 +   freeSlicingStruc( vecSlices );
   2.349 +   freeSlicingStruc( rightColSlices );
   2.350 +
   2.351 +//         printf("done freeing slicing strucs\n");
   2.352   }
   2.353 +
   2.354 +
   2.355 +
   2.356 +
   2.357 +/* numRows*colsPerRow/numCores = numToPutOntoEachCore; 
   2.358 + * put all from a given row onto same core, until exhaust allotment for that
   2.359 + *  core
   2.360 + *
   2.361 + */
   2.362 +cilk void
   2.363 +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices,
   2.364 +                                    SubMatrix **rightSubMatrices,
   2.365 +                                    int32 numRowIdxs, int32 numColIdxs,
   2.366 +                                    int32 numVecIdxs,
   2.367 +                                    float32 *resultArray )
   2.368 + {
   2.369 +   int32 resRowIdx, resColIdx;
   2.370 +   int32 numLeftColIdxs, numRightColIdxs;
   2.371 +   int32 leftRowIdxOffset;
   2.372 +   VecParams *vecParams;
   2.373 +   float32 numToPutOntoEachCore, leftOverFraction;
   2.374 +   int32 numCores, currCore, numOnCurrCore, numVecs = 0;
   2.375 +
   2.376 +   numLeftColIdxs  = numColIdxs;
   2.377 +   numRightColIdxs = numVecIdxs;
   2.378 +
   2.379 +   numCores = NUMBER_OF_CORES_TO_SPAWN_ONTO;
   2.380 +
   2.381 +   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
   2.382 +   leftOverFraction = 0;
   2.383 +   numOnCurrCore = 0;
   2.384 +   currCore = 0;
   2.385 +
   2.386 +   resRowIdx = 0;
   2.387 +//         printf("spawning vects, numOnEachCore: %f\n", numToPutOntoEachCore);
   2.388 +   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
   2.389 +    {
   2.390 +      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
   2.391 +
   2.392 +      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
   2.393 +       {
   2.394 +         vecParams = malloc( sizeof(VecParams) );
   2.395 +         
   2.396 +         vecParams->numVecIdxs       = numVecIdxs;
   2.397 +         vecParams->numRightColIdxs  = numRightColIdxs;
   2.398 +         vecParams->leftRowIdxOffset = leftRowIdxOffset;
   2.399 +         vecParams->resColIdx        = resColIdx;
   2.400 +         vecParams->leftSubMatrices  = leftSubMatrices;
   2.401 +         vecParams->rightSubMatrices = rightSubMatrices;
   2.402 +         vecParams->resultArray      = resultArray;
   2.403 +         vecParams->coreToRunOn      = currCore;
   2.404 +         vecParams->vecID            = numVecs++;
   2.405 +
   2.406 +//               printf("spawning vect %d\n", numVecs-1); fflush(stdin);
   2.407 +         spawn calcVectorOfSubMatrices( vecParams );
   2.408 +
   2.409 +         numOnCurrCore += 1;
   2.410 +         if( numOnCurrCore + leftOverFraction >= numToPutOntoEachCore - 1 )
   2.411 +          {
   2.412 +               //deal with fractional part, to ensure that imbalance is 1 max
   2.413 +               // IE, core with most has only 1 more than core with least
   2.414 +            leftOverFraction += numToPutOntoEachCore - numOnCurrCore;
   2.415 +            if( leftOverFraction >= 1 )
   2.416 +             { leftOverFraction -= 1;
   2.417 +               numOnCurrCore = -1;
   2.418 +             }
   2.419 +            else
   2.420 +             { numOnCurrCore = 0;
   2.421 +             }
   2.422 +               //Move to next core, max core-value to incr to is numCores -1
   2.423 +            if( currCore >= numCores -1 )
   2.424 +             { currCore = 0;
   2.425 +             }
   2.426 +            else
   2.427 +             { currCore += 1;
   2.428 +             }
   2.429 +          }
   2.430 +       }
   2.431 +    }
   2.432 +   
   2.433 +   //Free Note: vector of sub-matrices does its own free-ing, even vec-params
   2.434 +
   2.435 +//         printf("done with making vectors\n", numToPutOntoEachCore);
   2.436 +
   2.437 +   sync;
   2.438 +
   2.439 +   //free the sub-matrices in Fn that called this one
   2.440 + }
   2.441 +
   2.442 +
   2.443 +/*Walk through the two slice-strucs, making sub-matrix strucs as go
   2.444 + */
   2.445 +SubMatrix **
   2.446 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   2.447 +                   Matrix *origMatrix, int32 transposeTheMatrix )
   2.448 + {
   2.449 +   int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
   2.450 +   int32 startRow, endRow, startCol, endCol;
   2.451 +   int32 *rowStartVals, *colStartVals;
   2.452 +   int32 rowOffset, dummy;
   2.453 +   SubMatrix **subMatrices, *newSubMatrix;
   2.454 +
   2.455 +   numRowIdxs = rowSlices->numVals;
   2.456 +   numColIdxs = colSlices->numVals;
   2.457 +
   2.458 +   rowStartVals = rowSlices->startVals;
   2.459 +   colStartVals = colSlices->startVals;
   2.460 +
   2.461 +   subMatrices = malloc( numRowIdxs * numColIdxs *sizeof(SubMatrix *) );
   2.462 +
   2.463 +   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   2.464 +    {
   2.465 +      rowOffset = rowIdx * numColIdxs;
   2.466 +      
   2.467 +      startRow  = rowStartVals[rowIdx];
   2.468 +      endRow    = rowStartVals[rowIdx + 1] -1; //"fake" start above last is
   2.469 +                                               // at last valid idx + 1 & is
   2.470 +                                               // 1 greater than end value
   2.471 +      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
   2.472 +       {
   2.473 +         startCol = colStartVals[colIdx];
   2.474 +         endCol   = colStartVals[colIdx + 1] -1;
   2.475 +
   2.476 +         newSubMatrix = malloc( sizeof(SubMatrix) );
   2.477 +         newSubMatrix->numRows       = endRow - startRow +1;
   2.478 +         newSubMatrix->numCols       = endCol - startCol +1;
   2.479 +         newSubMatrix->origMatrix    = origMatrix;
   2.480 +         newSubMatrix->origStartRow  = startRow;
   2.481 +         newSubMatrix->origStartCol  = startCol;
   2.482 +            //no parallel singleton in Cilk, so copy here
   2.483 +         if( transposeTheMatrix )
   2.484 +          { copyTransposeFromOrig( newSubMatrix );
   2.485 +          }
   2.486 +         else
   2.487 +          { copyFromOrig( newSubMatrix );
   2.488 +          } 
   2.489 +//               printf("just copied: %X", newSubMatrix->array); 
   2.490 +         newSubMatrix->alreadyCopied = TRUE;
   2.491 +
   2.492 +         subMatrices[ rowOffset + colIdx ] = newSubMatrix;
   2.493 +       }
   2.494 +    }
   2.495 +   return subMatrices;
   2.496 + }
   2.497 +
   2.498 +void
   2.499 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   2.500 +                 SubMatrix **subMatrices )
   2.501 + {
   2.502 +   int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset;
   2.503 +   SubMatrix *subMatrix;
   2.504 +
   2.505 +   numRowIdxs = rowSlices->numVals;
   2.506 +   numColIdxs = colSlices->numVals;
   2.507 +
   2.508 +   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   2.509 +    {
   2.510 +      rowOffset = rowIdx * numColIdxs;
   2.511 +      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
   2.512 +       {
   2.513 +         subMatrix = subMatrices[ rowOffset + colIdx ];
   2.514 +         if( subMatrix->alreadyCopied )
   2.515 +            free( subMatrix->array );
   2.516 +         free( subMatrix );
   2.517 +       }
   2.518 +    }
   2.519 +   free( subMatrices );
   2.520 + }
   2.521 +
   2.522 +
   2.523 +SlicingStrucCarrier *
   2.524 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix )
   2.525 +{
   2.526 +   float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
   2.527 +   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   2.528 +   int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol;
   2.529 +
   2.530 +   SlicingStrucCarrier *slicingStrucCarrier =
   2.531 +                         malloc(sizeof(SlicingStrucCarrier) );
   2.532 +
   2.533 +   int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
   2.534 +   float64 numPrimitiveOpsInMinWorkUnit;
   2.535 +
   2.536 +
   2.537 +   //=======  Calc ideal size of min-sized sub-matrix  ========
   2.538 +
   2.539 +      //ask VCilk for the number of cycles of the minimum work unit, at given
   2.540 +      // percent overhead then add a guess at overhead from this divider
   2.541 +   minWorkUnitCycles = MIN_WORK_UNIT_CYCLES;
   2.542 +
   2.543 +      //ask VCilk for number of cycles of the "primitive" op of matrix mult
   2.544 +   primitiveCycles = measureMatrixMultPrimitive( );
   2.545 +
   2.546 +   numPrimitiveOpsInMinWorkUnit =
   2.547 +      (float64)minWorkUnitCycles / (float64)primitiveCycles;
   2.548 +
   2.549 +      //take cubed root -- that's number of these in a "side" of sub-matrix
   2.550 +      // then multiply by 5 because the primitive is 5x5
   2.551 +   idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit );
   2.552 +
   2.553 +   idealNumWorkUnits = IDEAL_NUM_WORK_UNITS;
   2.554 +   
   2.555 +   idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
   2.556 +   idealSizeOfSide2 *= 0.8; //finer granularity to help load balance
   2.557 +
   2.558 +   if( idealSizeOfSide1 > idealSizeOfSide2 )
   2.559 +      idealSizeOfSide = idealSizeOfSide1;
   2.560 +   else
   2.561 +      idealSizeOfSide = idealSizeOfSide2;
   2.562 +
   2.563 +
   2.564 +   //============  Slice up dimensions, now that know target size ===========
   2.565 +
   2.566 +      //Tell the slicer the target size of a side (floating pt), the start
   2.567 +      // value to start slicing at, and the end value to stop slicing at
   2.568 +      //It returns an array of start value of each chunk, plus number of them
   2.569 +
   2.570 +   startLeftRow  = 0;
   2.571 +   endLeftRow    = leftMatrix->numRows -1;
   2.572 +   startVec      = 0;
   2.573 +   endVec        = leftMatrix->numCols -1;
   2.574 +   startRightCol = 0;
   2.575 +   endRightCol   = rightMatrix->numCols -1;
   2.576 +
   2.577 +   leftRowSlices =
   2.578 +      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow );
   2.579 +
   2.580 +   vecSlices =
   2.581 +      sliceUpDimension( idealSizeOfSide,  startVec, endVec );
   2.582 +
   2.583 +   rightColSlices =
   2.584 +      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol );
   2.585 +
   2.586 +   slicingStrucCarrier->leftRowSlices  = leftRowSlices;
   2.587 +   slicingStrucCarrier->vecSlices      = vecSlices;
   2.588 +   slicingStrucCarrier->rightColSlices = rightColSlices;
   2.589 +
   2.590 +   return slicingStrucCarrier;
   2.591 +}
   2.592 +
   2.593 +
   2.594 +SlicingStruc *
   2.595 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal )
   2.596 + { float32 residualAcc = 0;
   2.597 +   int     numSlices, i, *startVals, sizeOfSlice, endCondition;
   2.598 +   SlicingStruc *slicingStruc = malloc( sizeof(SlicingStruc) );
   2.599 +
   2.600 +      //calc size of matrix need to hold start vals --
   2.601 +   numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
   2.602 +
   2.603 +   startVals = malloc( (numSlices + 1) * sizeof(int32) );
   2.604 +
   2.605 +      //Calc the upper limit of start value -- when get above this, end loop
   2.606 +      // by saving highest value of the matrix dimension to access, plus 1
   2.607 +      // as the start point of the imaginary slice following the last one
   2.608 +      //Plus 1 because go up to value but not include when process last slice
   2.609 +      //The stopping condition is half-a-size less than highest value because
   2.610 +      // don't want any pieces smaller than half the ideal size -- just tack
   2.611 +      // little ones onto end of last one
   2.612 +   endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size
   2.613 +   for( i = 0; startVal <= endVal; i++ )
   2.614 +    {
   2.615 +      startVals[i] = startVal;
   2.616 +      residualAcc += idealSizeOfSide;
   2.617 +      sizeOfSlice  = (int)residualAcc;
   2.618 +      residualAcc -= (float32)sizeOfSlice;
   2.619 +      startVal    += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..
   2.620 +
   2.621 +      if( startVal > endCondition )
   2.622 +       { startVal = endVal + 1;
   2.623 +         startVals[ i + 1 ] = startVal;
   2.624 +       }
   2.625 +    }
   2.626 +
   2.627 +   slicingStruc->startVals = startVals;
   2.628 +   slicingStruc->numVals   = i;  //loop incr'd, so == last valid start idx+1
   2.629 +                                 // which means is num sub-matrices in dim
   2.630 +                                 // also == idx of the fake start just above
   2.631 +   return slicingStruc;
   2.632 + }
   2.633 +
   2.634 +void
   2.635 +freeSlicingStruc( SlicingStruc *slicingStruc )
   2.636 + {
   2.637 +   free( slicingStruc->startVals );
   2.638 +   free( slicingStruc );
   2.639 + }
   2.640 +
   2.641 +
   2.642 +int inline
   2.643 +measureMatrixMultPrimitive()
   2.644 + {
   2.645 +   int r, c, v, numCycles;
   2.646 +   float32 *res, *left, *right;
   2.647 +
   2.648 +      //setup inputs
   2.649 +   left  = malloc( 5 * 5 * sizeof( float32 ) );
   2.650 +   right = malloc( 5 * 5 * sizeof( float32 ) );
   2.651 +   res   = malloc( 5 * 5 * sizeof( float32 ) );
   2.652 +
   2.653 +   for( r = 0; r < 5; r++ )
   2.654 +    {
   2.655 +      for( c = 0; c < 5; c++ )
   2.656 +       {
   2.657 +         left[  r * 5 + c ] = r;
   2.658 +         right[ r * 5 + c ] = c;
   2.659 +       }
   2.660 +    }
   2.661 +
   2.662 +      //do primitive
   2.663 +//   VCilk__start_primitive();  //for now, just takes time stamp
   2.664 +   for( r = 0; r < 5; r++ )
   2.665 +    {
   2.666 +      for( c = 0; c < 5; c++ )
   2.667 +       {
   2.668 +         for( v = 0; v < 5; v++ )
   2.669 +          {
   2.670 +            res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ];
   2.671 +          }
   2.672 +       }
   2.673 +    }
   2.674 +//   numCycles = VCilk__end_primitive_and_give_cycles();
   2.675 +
   2.676 +   free( left );
   2.677 +   free( right );
   2.678 +   free( res );
   2.679 +   
   2.680 +   return numCycles;
   2.681 + }
     3.1 --- a/src/Application/CILK__Matrix_Mult/EntryPoint.cilk	Tue Oct 26 19:34:03 2010 -0700
     3.2 +++ b/src/Application/CILK__Matrix_Mult/EntryPoint.cilk	Wed Nov 10 06:07:54 2010 -0800
     3.3 @@ -1,16 +1,22 @@
     3.4  /*
     3.5 - *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
     3.6 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     3.7   *  Licensed under GNU General Public License version 2
     3.8   *
     3.9   * Author: seanhalle@yahoo.com
    3.10   *
    3.11   */
    3.12  
    3.13 +#include <math.h>
    3.14 +
    3.15  #include "CILK__Matrix_Mult.h"
    3.16  
    3.17 -cilk void divideIntoVectors( void * );
    3.18 +//==========================================================================
    3.19 +cilk void
    3.20 +divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams );
    3.21  
    3.22 -/*Every VPThread system has an "entry point" function that creates the first
    3.23 +//==========================================================================
    3.24 +
    3.25 +/*Every VCilk system has an "entry point" function that creates the first
    3.26   * processor, which starts the chain of creating more processors..
    3.27   * eventually all of the processors will dissipate themselves, and
    3.28   * return.
    3.29 @@ -19,27 +25,41 @@
    3.30   * functions do:
    3.31   *1) it creates the params for the seed processor, from the
    3.32   *    parameters passed into the entry-point function
    3.33 - *2) it calls VPThread__create_seed_procr_and_do_work
    3.34 + *2) it calls VCilk__create_seed_procr_and_do_work
    3.35   *3) it gets the return value from the params struc, frees the params struc,
    3.36   *    and returns the value from the function
    3.37   *
    3.38   */
    3.39 -cilk
    3.40 -Matrix *
    3.41 +cilk Matrix *
    3.42  multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
    3.43   { Matrix          *resMatrix;
    3.44     DividerParams   *dividerParams;
    3.45 +   int32            numResRows, numResCols;
    3.46  
    3.47 -
    3.48 +//         printf("entry point");
    3.49     dividerParams              = malloc( sizeof( DividerParams ) );
    3.50     dividerParams->leftMatrix  = leftMatrix;
    3.51     dividerParams->rightMatrix = rightMatrix;
    3.52 + 
    3.53 +   numResRows  = leftMatrix->numRows;
    3.54 +   numResCols  = rightMatrix->numCols;
    3.55 +   resMatrix            = malloc( sizeof(Matrix) );
    3.56 +   resMatrix->array     = malloc( numResRows * numResCols * sizeof(float32));
    3.57 +   resMatrix->numCols   = rightMatrix->numCols;
    3.58 +   resMatrix->numRows   = leftMatrix->numRows;
    3.59  
    3.60 -   spawn divideIntoVectors( dividerParams );
    3.61 +
    3.62 +   dividerParams->resultMatrix   = resMatrix;
    3.63 +
    3.64 +      //create divider processor, start doing the work, and wait till done
    3.65 +      //This function is the "border crossing" between normal code and VCilk
    3.66 +   //VCilk__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs,\
    3.67 +                                         dividerParams );
    3.68 +   spawn divideWorkIntoSubMatrixPairProcrs( dividerParams );
    3.69 +
    3.70     sync;
    3.71 -
    3.72 -      //get result matrix and return it
    3.73 -   resMatrix = dividerParams->resultMatrix;
    3.74 +   
    3.75 +      //return result matrix
    3.76     free( dividerParams );
    3.77     return resMatrix;
    3.78   }
     4.1 --- a/src/Application/CILK__Matrix_Mult/Vector_Pr.cilk	Tue Oct 26 19:34:03 2010 -0700
     4.2 +++ b/src/Application/CILK__Matrix_Mult/Vector_Pr.cilk	Wed Nov 10 06:07:54 2010 -0800
     4.3 @@ -1,48 +1,119 @@
     4.4 -/* 
     4.5 - *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
     4.6 +/*
     4.7 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     4.8   *  Licensed under GNU General Public License version 2
     4.9   *
    4.10 - * Author: SeanHalle@yahoo.com
    4.11 + * Author: seanhalle@yahoo.com
    4.12   *
    4.13   */
    4.14  
    4.15 +
    4.16  #include "CILK__Matrix_Mult.h"
    4.17 +#include <math.h>
    4.18 +#include <stdlib.h>
    4.19 +#include <malloc.h>
    4.20  
    4.21 -/*A Vector processor is created with an environment that holds two matrices,
    4.22 - * the row and col that it owns, and the name of a result gathering
    4.23 - * processor.
    4.24 - *It calculates its vector product then sends the result to the result
    4.25 - * processor, which puts it into the result matrix and returns that matrix
    4.26 - * when all is done.
    4.27 - */
    4.28 -cilk
    4.29 -float32
    4.30 -calcVector( void *data )
    4.31 - { 
    4.32 -   VectorParams   *params;
    4.33 -   int             myRow, myCol, vectLength, pos;
    4.34 -   float32        *leftMatrixArray, *rightMatrixArray, result = 0.0;
    4.35 -   Matrix         *leftMatrix, *rightMatrix;
    4.36 +//===========================================================================
    4.37  
    4.38 -   params      = (VectorParams *)data;
    4.39 -   myCol       = params->myCol;
    4.40 -   myRow       = params->myRow;
    4.41 -   vectLength  = params->vectLength;
    4.42 -   leftMatrix  = params->leftMatrix;
    4.43 -   rightMatrix = params->rightMatrix;
    4.44 -   leftMatrixArray  = leftMatrix->matrix;
    4.45 -   rightMatrixArray = rightMatrix->matrix;
    4.46 -         //=====================  DEBUG  ======================
    4.47 -         #ifdef PRINT_DEBUG
    4.48 -         if( myCol == 0 )
    4.49 -            printf("start vector: %d, %d\n", myRow, myCol ); fflush(stdin);
    4.50 -         #endif
    4.51 -         //====================================================
    4.52 +cilk void calcSubMatrixProduct( void *data );
    4.53  
    4.54 -   for( pos = 0; pos < vectLength; pos++ )
    4.55 +
    4.56 +void inline
    4.57 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
    4.58 +                  int32    startRow,
    4.59 +                  int32    numRows,
    4.60 +                  int32    startCol,
    4.61 +                  int32    numCols,
    4.62 +                  int32    numOrigCols );
    4.63 +
    4.64 +
    4.65 +//===========================================================================
    4.66 +
    4.67 +cilk void
    4.68 +calcVectorOfSubMatrices( void *_vecParams )
    4.69 + { int32         numVecIdxs, leftRowIdxOffset, numRightColIdxs, resColIdx;
    4.70 +   SubMatrix   **leftSubMatrices, **rightSubMatrices;
    4.71 +   float32      *resultArray;
    4.72 +   int32         vecIdx, coreWithAffinity;
    4.73 +   SMPairParams *subMatrixPairParams, **vecOfSubMatrixParams;
    4.74 +   VecParams    *vecParams;
    4.75 +
    4.76 +   vecParams = (VecParams *)_vecParams;
    4.77 +
    4.78 +   numVecIdxs       = vecParams->numVecIdxs;
    4.79 +   numRightColIdxs  = vecParams->numRightColIdxs;
    4.80 +   leftRowIdxOffset = vecParams->leftRowIdxOffset;
    4.81 +   resColIdx        = vecParams->resColIdx;
    4.82 +   leftSubMatrices  = vecParams->leftSubMatrices;
    4.83 +   rightSubMatrices = vecParams->rightSubMatrices;
    4.84 +   resultArray      = vecParams->resultArray;
    4.85 +   coreWithAffinity = vecParams->coreToRunOn;
    4.86 +         
    4.87 +//         printf("inside vector %d, %d\n", leftRowIdxOffset, resColIdx );
    4.88 +
    4.89 +   vecOfSubMatrixParams = malloc( numVecIdxs * sizeof(SMPairParams *) );
    4.90 +   if( vecOfSubMatrixParams == 0 ){printf("malloc error"); exit(1);}
    4.91 +
    4.92 +   for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
    4.93      {
    4.94 -      result += leftMatrixArray[ myRow * vectLength + pos ]  *
    4.95 -                rightMatrixArray[ pos  * vectLength + myCol];
    4.96 +         //Make the processor for the pair of sub-matrices
    4.97 +      subMatrixPairParams  = malloc( sizeof(SMPairParams) );
    4.98 +      subMatrixPairParams->leftSubMatrix  =
    4.99 +         leftSubMatrices[ leftRowIdxOffset + vecIdx ];
   4.100 +
   4.101 +      subMatrixPairParams->rightSubMatrix =
   4.102 +         rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
   4.103 +
   4.104 +//            printf("about to spawn pair %X\n", (int)subMatrixPairParams->leftSubMatrix->array);
   4.105 +      spawn calcSubMatrixProduct( subMatrixPairParams );
   4.106 +//            printf("done with spawn\n");
   4.107 +      vecOfSubMatrixParams[ vecIdx ] = subMatrixPairParams;
   4.108      }
   4.109 -   return result;
   4.110 +
   4.111 +//         printf("done spawning product pairs\n");
   4.112 +   sync;
   4.113 +
   4.114 +      //now accumulate individual result matrices into final result matrix
   4.115 +   for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
   4.116 +    {
   4.117 +      subMatrixPairParams = vecOfSubMatrixParams[ vecIdx ];
   4.118 +
   4.119 +      accumulateResult( resultArray, subMatrixPairParams->partialResultArray,
   4.120 +                        subMatrixPairParams->leftSubMatrix->origStartRow,
   4.121 +                        subMatrixPairParams->leftSubMatrix->numRows,
   4.122 +                        subMatrixPairParams->rightSubMatrix->origStartCol,
   4.123 +                        subMatrixPairParams->rightSubMatrix->numCols,
   4.124 +                   subMatrixPairParams->rightSubMatrix->origMatrix->numCols);
   4.125 +
   4.126 +         //Note, resultArray is made on the core that produces the results
   4.127 +         // that gives chance to set affinity so all in vector run on same
   4.128 +         // core and re-use that array, and prevents writes from causing
   4.129 +         // thrashing of the cache -- as long as array big enough, the copy
   4.130 +         // overhead is miniscule vs the size-of-side reuse of each byte
   4.131 +      free( subMatrixPairParams->partialResultArray );
   4.132 +      free( subMatrixPairParams );
   4.133 +    }
   4.134 +   free( vecOfSubMatrixParams );
   4.135 +   free( vecParams );
   4.136   }
   4.137 +
   4.138 +
   4.139 +
   4.140 +void inline
   4.141 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
   4.142 +                  int32    startRow,
   4.143 +                  int32    numRows,
   4.144 +                  int32    startCol,
   4.145 +                  int32    numCols,
   4.146 +                  int32    numOrigCols )
   4.147 + { int32 row, col;
   4.148 +
   4.149 +   for( row = 0; row < numRows; row++ )
   4.150 +    {
   4.151 +      for( col = 0; col < numCols; col++ )
   4.152 +       {
   4.153 +         resultArray[ (row + startRow) * numOrigCols + col + startCol ] +=
   4.154 +            subMatrixResultArray[ row * numCols + col ];
   4.155 +       }
   4.156 +    }
   4.157 +
   4.158 + }
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/src/Application/CILK__Matrix_Mult/subMatrix_Pr.cilk	Wed Nov 10 06:07:54 2010 -0800
     5.3 @@ -0,0 +1,301 @@
     5.4 +/* 
     5.5 + *  Copyright 2010 OpenSourceStewardshipFoundation.org
     5.6 + *  Licensed under GNU General Public License version 2
     5.7 + *
     5.8 + * Author: SeanHalle@yahoo.com
     5.9 + *
    5.10 + */
    5.11 +
    5.12 +#include <string.h>
    5.13 +
    5.14 +#include "CILK__Matrix_Mult.h"
    5.15 +
    5.16 +
    5.17 +void inline
    5.18 +copyFromOrig( SubMatrix *subMatrix );
    5.19 +
    5.20 +void inline
    5.21 +copyTransposeFromOrig( SubMatrix *subMatrix );
    5.22 +
    5.23 +void inline
    5.24 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
    5.25 +                     float32 *resArray,
    5.26 +                     int startRow,  int endRow,
    5.27 +                     int startCol,  int endCol,
    5.28 +                     int startVec,  int endVec,
    5.29 +                     int resStride, int inpStride );
    5.30 +
    5.31 +void inline
    5.32 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols,
    5.33 +                      float32 *leftArray, float32 *rightArray,
    5.34 +                      float32 *resArray );
    5.35 +
    5.36 +
    5.37 +/*A  processor is created with an environment that holds two matrices,
    5.38 + * the row and col that it owns, and the name of a result gathering
    5.39 + * processor.
    5.40 + *It calculates the product of two sub-portions of the input matrices
    5.41 + * by using Intel's mkl library for single-core.
    5.42 + *
    5.43 + *This demonstrates using optimized single-threaded code inside scheduled
    5.44 + * work-units.
    5.45 + *
    5.46 + *When done, it sends the result to the result processor
    5.47 + */
    5.48 +cilk void
    5.49 +calcSubMatrixProduct( void *data )
    5.50 + { 
    5.51 +   SMPairParams   *params;
    5.52 +   float32        *leftArray,  *rightArray, *resArray;
    5.53 +   SubMatrix      *leftSubMatrix, *rightSubMatrix;
    5.54 +   int32           resSize;
    5.55 +   int32           numResRows, numResCols, vectLength;
    5.56 +
    5.57 +//         printf("inside submatrix-pair\n");
    5.58 +   params         = (SMPairParams *)data;
    5.59 +   leftSubMatrix  = params->leftSubMatrix;
    5.60 +   rightSubMatrix = params->rightSubMatrix;
    5.61 +
    5.62 +   
    5.63 +      //make sure the input sub-matrices have been copied out of orig
    5.64 +   //copyFromOrig( leftSubMatrix, animatingPr );
    5.65 +   //copyTransposeFromOrig( rightSubMatrix, animatingPr );
    5.66 +   
    5.67 +   leftArray      = leftSubMatrix->array;
    5.68 +   rightArray     = rightSubMatrix->array;
    5.69 +
    5.70 +      //make this array here, on the core that computes the results
    5.71 +      // with Cilk's semantics, have to have separate result array for each
    5.72 +      // spawned processor -- unless want to change the spawn and sync
    5.73 +      // pattern, such that spawn one from each vector, then sync, then
    5.74 +      // another, and so forth -- this will cause idle time due to imbalance
    5.75 +      // in matrix sizes
    5.76 +      //This also gives chance to set affinity so all in vector run on same
    5.77 +      // core and re-use the accumulation array,
    5.78 +      //As a side-benefit, it also prevents writes from causing
    5.79 +      // thrashing of the cache -- as long as array big enough, the copy
    5.80 +      // overhead is small because each byte is reused size-of-side times
    5.81 +      //This is freed in the vector processor
    5.82 +   resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32);
    5.83 +   resArray = malloc( resSize );
    5.84 +   memset( resArray, 0, resSize );
    5.85 +
    5.86 +   
    5.87 +   vectLength = leftSubMatrix->numCols;
    5.88 +   numResRows = leftSubMatrix->numRows;
    5.89 +   numResCols = rightSubMatrix->numCols;
    5.90 +
    5.91 +//         printf("just before multiply arrays %X\n", leftArray);
    5.92 +   multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
    5.93 +                         leftArray, rightArray,
    5.94 +                         resArray );
    5.95 +
    5.96 +   //send result by side-effect
    5.97 +   params->partialResultArray = resArray;
    5.98 + }
    5.99 +
   5.100 +
   5.101 +
   5.102 +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into
   5.103 + * the 32KB L1 cache.
   5.104 + *Would be nice to embed this within another level that divided into
   5.105 + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
   5.106 + *
   5.107 + *Eventually want these divisions to be automatic, using DKU pattern
   5.108 + * embedded into VMS and exposed in the language, and with VMS controlling the
   5.109 + * divisions according to the cache sizes, which it knows about.
   5.110 + *Also, want VMS to work with language to split among main-mems, so a socket
   5.111 + * only cranks on data in its local segment of main mem
   5.112 + *
   5.113 + *So, outer two loops determine start and end points within the result matrix.
   5.114 + * Inside that, a loop dets the start and end points along the shared dimensions
   5.115 + * of the two input matrices.
   5.116 + */
   5.117 +void inline
   5.118 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
   5.119 +                                int32 numResCols,
   5.120 +                                float32 *leftArray, float32 *rightArray,
   5.121 +                                float32 *resArray )
   5.122 + {
   5.123 +   int resStride, inpStride;
   5.124 +   int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
   5.125 +
   5.126 +   resStride  = numResCols;
   5.127 +   inpStride  = vecLength;
   5.128 +
   5.129 +   for( resStartRow = 0; resStartRow < numResRows; )
   5.130 +    {
   5.131 +      resEndRow = resStartRow + ROWS_IN_BLOCK -1;  //start at zero, so -1
   5.132 +      if( resEndRow > numResRows ) resEndRow = numResRows -1;
   5.133 +
   5.134 +      for( resStartCol = 0; resStartCol < numResCols; )
   5.135 +       {
   5.136 +         resEndCol   = resStartCol + COLS_IN_BLOCK -1;
   5.137 +         if( resEndCol > numResCols ) resEndCol = numResCols -1;
   5.138 +
   5.139 +         for( startVec = 0; startVec < vecLength; )
   5.140 +          {
   5.141 +            endVec   = startVec + VEC_IN_BLOCK -1;
   5.142 +            if( endVec > vecLength ) endVec = vecLength -1;
   5.143 +
   5.144 +//                  printf("just before multiply sub-blocks %X\n", leftArray);
   5.145 +               //By having the "vector" of sub-blocks in a sub-block slice
   5.146 +               // be marched down in inner loop, are re-using the result
   5.147 +               // matrix, which stays in L1 cache and re-using the left sub-mat
   5.148 +               // which repeats for each right sub-mat -- can only re-use two of
   5.149 +               // the three, so result is the most important -- avoids writing
   5.150 +               // dirty blocks until those result-locations fully done
   5.151 +               //Row and Col is position in result matrix -- so row and vec
   5.152 +               // for left array, then vec and col for right array
   5.153 +            multiplySubBlocksTransposed( leftArray, rightArray,
   5.154 +                                         resArray,
   5.155 +                                         resStartRow,  resEndRow,
   5.156 +                                         resStartCol,  resEndCol,
   5.157 +                                         startVec,  endVec,
   5.158 +                                         resStride, inpStride );
   5.159 +            startVec = endVec +1;
   5.160 +          }
   5.161 +         resStartCol = resEndCol +1;
   5.162 +       }
   5.163 +      resStartRow = resEndRow +1;
   5.164 +    }
   5.165 + }
   5.166 +
   5.167 +
   5.168 +
   5.169 +void inline
   5.170 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
   5.171 +                     float32 *resArray,
   5.172 +                     int resStartRow,  int resEndRow,
   5.173 +                     int resStartCol,  int resEndCol,
   5.174 +                     int startVec,  int endVec,
   5.175 +                     int resStride, int inpStride )
   5.176 + {
   5.177 +   int resRow,     resCol,        vec;
   5.178 +   int leftOffset, rightOffset;
   5.179 +   float32 result;
   5.180 +
   5.181 +//         printf("start col, row | end col, row: %d, %d, %d, %d\n", resStartCol, resStartRow, resEndCol, resEndRow);
   5.182 +      //The result row is used for the left matrix, res col for the right
   5.183 +   for( resCol = resStartCol; resCol <= resEndCol; resCol++ )
   5.184 +    {
   5.185 +      for( resRow = resStartRow; resRow <= resEndRow; resRow++ )
   5.186 +       {
   5.187 +         leftOffset  = resRow * inpStride;//left & right inp strides same
   5.188 +         rightOffset = resCol * inpStride;// because right is transposed
   5.189 +         result = 0;
   5.190 +         for( vec = startVec; vec <= endVec; vec++ )
   5.191 +          {
   5.192 +            result +=
   5.193 +               leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
   5.194 +          }
   5.195 +
   5.196 +         resArray[ resRow * resStride + resCol ] += result;
   5.197 +       }
   5.198 +    }
   5.199 + }
   5.200 +
   5.201 +
   5.202 +/*Reuse this in divider when do the sequential multiply case
   5.203 + */
   5.204 +void inline
   5.205 +copyTranspose( int32 numRows, int32 numCols,
   5.206 +               int32 origStartRow, int32 origStartCol, int32 origStride,
   5.207 +               float32 *subArray, float32 *origArray )
   5.208 + { int32 stride;
   5.209 +   int32 row, col, origOffset;
   5.210 +
   5.211 +   stride = numRows;
   5.212 +   for( row = 0; row < numRows; row++ )
   5.213 +    {
   5.214 +      origOffset = (row + origStartRow) * origStride + origStartCol;
   5.215 +      for( col = 0; col < numCols; col++ )
   5.216 +       {
   5.217 +            //transpose means swap row & col -- traverse orig matrix normally
   5.218 +            // but put into reversed place in local array -- means the
   5.219 +            // stride is the numRows now, so col * numRows + row
   5.220 +         subArray[ col * stride + row ]  =  origArray[ origOffset + col ];
   5.221 +       }
   5.222 +    }
   5.223 + }
   5.224 +
   5.225 +void inline
   5.226 +copyTransposeFromOrig( SubMatrix *subMatrix )
   5.227 + { int numCols, numRows, origStartRow, origStartCol, origStride, stride;
   5.228 +   Matrix *origMatrix;
   5.229 +   float32 *origArray, *subArray;
   5.230 +
   5.231 +
   5.232 +//   if( subMatrix->alreadyCopied ) return;
   5.233 +//   VCilk__start_singleton(copyMatrixSingleton,&&EndOfTranspSingleton,animPr);
   5.234 +
   5.235 +   origMatrix   = subMatrix->origMatrix;
   5.236 +   origArray    = origMatrix->array;
   5.237 +   numCols      = subMatrix->numCols;
   5.238 +   numRows      = subMatrix->numRows;
   5.239 +   origStartRow = subMatrix->origStartRow;
   5.240 +   origStartCol = subMatrix->origStartCol;
   5.241 +   origStride   = origMatrix->numCols;
   5.242 +
   5.243 +   subArray     = malloc( numRows * numCols *sizeof(float32) );
   5.244 +   subMatrix->array = subArray;
   5.245 +//         printf("copying transpose %X\n", subArray);
   5.246 +
   5.247 +      //copy values from orig matrix to local
   5.248 +   copyTranspose( numRows, numCols,
   5.249 +                  origStartRow, origStartCol, origStride,
   5.250 +                  subArray, origArray );
   5.251 +
   5.252 +   subMatrix->alreadyCopied = TRUE; //must be last thing before label
   5.253 +//   EndOfTranspSingleton:
   5.254 +   return;
   5.255 + }
   5.256 +
   5.257 +
   5.258 +void inline
   5.259 +copyFromOrig( SubMatrix *subMatrix )
   5.260 + { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
   5.261 +   Matrix *origMatrix;
   5.262 +   float32 *origArray, *subArray;
   5.263 +   int32 row, col, offset, origOffset;
   5.264 +
   5.265 +      //This lets only a single VP execute the code between start and
   5.266 +      // end -- using start and end so that work runs outside the master.
   5.267 +      //Inside, if a second VP ever executes the start, it will be returned
   5.268 +      // from the end-point.
   5.269 +      //Note, for non-GCC, can add a second SSR call at the end, and inside
   5.270 +      // that one, look at the stack at the return addr & save that in an
   5.271 +      // array indexed by singletonID
   5.272 +//   if( subMatrix->alreadyCopied ) return;
   5.273 +//   VCilk__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton,animPr);
   5.274 +
   5.275 +
   5.276 +   origMatrix    = subMatrix->origMatrix;
   5.277 +   origArray     = origMatrix->array;
   5.278 +   numCols       = subMatrix->numCols;
   5.279 +   numRows       = subMatrix->numRows;
   5.280 +   origStartRow  = subMatrix->origStartRow;
   5.281 +   origStartCol  = subMatrix->origStartCol;
   5.282 +   origStride    = origMatrix->numCols;
   5.283 +
   5.284 +   subArray      = malloc( numRows * numCols *sizeof(float32) );
   5.285 +   subMatrix->array = subArray;
   5.286 +//        printf("copying normal %X\n", subArray);
   5.287 +
   5.288 +      //copy values from orig matrix to local
   5.289 +   stride        = numCols;
   5.290 +
   5.291 +   for( row = 0; row < numRows; row++ )
   5.292 +    {
   5.293 +      offset     = row * stride;
   5.294 +      origOffset = (row + origStartRow) * origStride + origStartCol;
   5.295 +      for( col = 0; col < numCols; col++ )
   5.296 +       {
   5.297 +         subArray[ offset + col ]  =  origArray[ origOffset + col ];
   5.298 +       }
   5.299 +    }
   5.300 +
   5.301 +   subMatrix->alreadyCopied = TRUE; //must be last thing before label
   5.302 +//   EndOfCopySingleton:
   5.303 +   return;
   5.304 + }
     6.1 --- a/src/Application/Makefile	Tue Oct 26 19:34:03 2010 -0700
     6.2 +++ b/src/Application/Makefile	Wed Nov 10 06:07:54 2010 -0800
     6.3 @@ -1,35 +1,51 @@
     6.4 +#
     6.5 +#  Copyright Nov 6, 2010 OpenSourceStewardshipFoundation.org
     6.6 +#  Licensed under GNU General Public License version 2
     6.7 +#
     6.8 +# author seanhalle@yahoo.com
     6.9  
    6.10  
    6.11  CILK_SOURCE = \
    6.12     CILK__Matrix_Mult/EntryPoint.cilk \
    6.13     CILK__Matrix_Mult/Divide_Pr.cilk \
    6.14     CILK__Matrix_Mult/Vector_Pr.cilk \
    6.15 +   CILK__Matrix_Mult/subMatrix_Pr.cilk \
    6.16     main.cilk
    6.17  
    6.18  C_SOURCE = \
    6.19     matrix_mult.c \
    6.20 -   ParamHelper/ParamBag.c\
    6.21 +   ParamHelper/ParamBag.c \
    6.22     ParamHelper/ReadParamsFromFile.c
    6.23  
    6.24 +#The next two rules make a new string, with the same names as the
    6.25 +# source, but the endings changed to .o
    6.26 +#The third concatenates the two source-file strings
    6.27  C_OBJS = $(C_SOURCE:.c=.o)
    6.28  
    6.29  CILK_OBJS = $(CILK_SOURCE:.cilk=.o) 
    6.30  
    6.31 -OBJECTS = $(C_SOURCE) $(CILK_SOURCE)
    6.32 +OBJECTS = $(C_OBJS) $(CILK_OBJS)
    6.33 +
    6.34  
    6.35  #Make has the built-in variable "$<" which is the source file
    6.36  # and "$@" which is the target for that source
    6.37 +#The first rule says for each file in C_SOURCE, put the
    6.38 +# name in place of $< and expand
    6.39 +# C_OBJS in place of $@ and run the gcc command, when files are
    6.40 +# out of date
    6.41  $(C_OBJS): $(C_SOURCE)
    6.42 -	gcc -c $< -o $@
    6.43 +	gcc -c $*.c -o $*.o;
    6.44  
    6.45  $(CILK_OBJS): $(CILK_SOURCE)
    6.46 -	gcc -c $< -o $@
    6.47 +	cilkc -c $*.cilk -o $*.o
    6.48  
    6.49  all: $(OBJECTS)
    6.50  	cilkc $(OBJECTS) -o CILK_Linux__Matrix_Mult; \
    6.51  	cp CILK_Linux__Matrix_Mult ~/D/2__INRIA_OMP/1__Development/2__runs_and_data/executables
    6.52  
    6.53  
    6.54 +clean:
    6.55 +	rm *.o; rm ParamHelper/*.o; rm CILK__Matrix_Mult/*.o
    6.56  
    6.57  #================================================================
    6.58  #Other stuff tried/played_with/copied
    6.59 @@ -62,8 +78,6 @@
    6.60  #================================================================
    6.61  # playing with below..
    6.62  
    6.63 -#7C9A-RV6P-3XE2-JV99-426K-2K
    6.64 -
    6.65  #rule for inferring that the .cilk file is the source for .o file
    6.66  # and how to create the .o from the .cilk
    6.67  #%.o : %.cilk
     7.1 --- a/src/Application/Matrix_Mult.c	Tue Oct 26 19:34:03 2010 -0700
     7.2 +++ b/src/Application/Matrix_Mult.c	Wed Nov 10 06:07:54 2010 -0800
     7.3 @@ -61,7 +61,7 @@
     7.4     
     7.5     numRows = matrixStruc->numRows;
     7.6     numCols = matrixStruc->numCols;
     7.7 -   matrixStart = matrixStruc->matrix;
     7.8 +   matrixStart = matrixStruc->array;
     7.9  
    7.10     file = fopen( matrixFileName, "r" );
    7.11     if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);}
    7.12 @@ -131,7 +131,7 @@
    7.13     retMatrix = malloc( sizeof( Matrix ) );
    7.14     retMatrix->numRows = numRows;
    7.15     retMatrix->numCols = numCols;
    7.16 -   retMatrix->matrix  = malloc( numRows * numCols * sizeof(float32) );
    7.17 +   retMatrix->array  = malloc( numRows * numCols * sizeof(float32) );
    7.18  
    7.19     return retMatrix;
    7.20   }
    7.21 @@ -142,24 +142,26 @@
    7.22   }
    7.23   void
    7.24  freeMatrix( Matrix * matrix )
    7.25 - { free( matrix->matrix );
    7.26 + { free( matrix->array );
    7.27     free( matrix );
    7.28   }
    7.29  
    7.30  void
    7.31  printMatrix( Matrix *matrix )
    7.32 - { int r, c, numRows, numCols;
    7.33 + { 
    7.34 +   int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr;
    7.35     float32 *matrixArray;
    7.36  
    7.37 -   numRows = matrix->numRows;
    7.38 -   numCols = matrix->numCols;
    7.39 -   matrixArray = matrix->matrix;
    7.40 +   numRows = rowsToPrint = matrix->numRows;
    7.41 +   numCols = colsToPrint = matrix->numCols;
    7.42 +   matrixArray = matrix->array;
    7.43  
    7.44 -   for( r = 0; r < numRows; r++ )
    7.45 -    { for( c = 0; c < numCols; c++ )
    7.46 -       { printf( "%f | ", *(matrixArray + r*numCols + c) );
    7.47 +   rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed
    7.48 +   colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed
    7.49 +   for( r = 0; r < numRows; r += rowIncr )
    7.50 +    { for( c = 0; c < numCols; c += colIncr )
    7.51 +       { printf( "%3.1f | ", matrixArray[ r * numCols + c ] );
    7.52         }
    7.53        printf("\n");
    7.54      }
    7.55   }
    7.56 -
     8.1 --- a/src/Application/Matrix_Mult.h	Tue Oct 26 19:34:03 2010 -0700
     8.2 +++ b/src/Application/Matrix_Mult.h	Wed Nov 10 06:07:54 2010 -0800
     8.3 @@ -19,7 +19,7 @@
     8.4  struct
     8.5   { int32    numRows;
     8.6     int32    numCols;
     8.7 -   float32 *matrix;  //2D, but dynamically sized, so use addr arith
     8.8 +   float32 *array;  //2D, but dynamically sized, so use addr arith
     8.9   }
    8.10  Matrix;
    8.11  
     9.1 --- a/src/Application/main.cilk	Tue Oct 26 19:34:03 2010 -0700
     9.2 +++ b/src/Application/main.cilk	Wed Nov 10 06:07:54 2010 -0800
     9.3 @@ -1,41 +1,67 @@
     9.4 -/*
     9.5 - *  Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org
     9.6 +/*
     9.7 + *  Copyright 2010 OpenSourcStewardshipFoundation.org
     9.8   *  Licensed under GNU General Public License version 2
     9.9 - *
    9.10 - * author seanhalle@yahoo.com
    9.11 - */
    9.12 -
    9.13 -#include <malloc.h>
    9.14 -#include <stdlib.h>
    9.15 -
    9.16 -#include "Matrix_Mult.h"
    9.17 -#include "CILK__Matrix_Mult/CILK__Matrix_Mult.h"
    9.18 -
    9.19 -cilk Matrix * 
    9.20 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
    9.21 -
    9.22 -/**
    9.23 - *Matrix multiply program written using VMS_HW piggy-back language
    9.24 - * 
    9.25 - */
    9.26 -cilk
    9.27 -int main( int argc, char **argv )
    9.28 - { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
    9.29 -   ParamBag    *paramBag;
    9.30 -   
    9.31 -   
    9.32 -   paramBag = makeParamBag();
    9.33 -   readParamFileIntoBag( argv[1], paramBag );
    9.34 -   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
    9.35 -
    9.36 -   resultMatrix = spawn multiplyTheseMatrices( leftMatrix, rightMatrix );
    9.37 -   sync;
    9.38 -
    9.39 -   printf("\nresult matrix: \n");
    9.40 -
    9.41 -//   printMatrix( resultMatrix );
    9.42 -   
    9.43 -//   VPThread__print_stats();
    9.44 -   
    9.45 -   exit(0); //cleans up
    9.46 - }
    9.47 + *
    9.48 + * author seanhalle@yahoo.com
    9.49 + */
    9.50 +
    9.51 +#include <malloc.h>
    9.52 +#include <stdlib.h>
    9.53 +
    9.54 +#include "Matrix_Mult.h"
    9.55 +#include "CILK__Matrix_Mult/CILK__Matrix_Mult.h"
    9.56 +
    9.57 +
    9.58 +   //single global var -- just get it done
    9.59 +struct timeval startStamp;
    9.60 +
    9.61 +
    9.62 +void
    9.63 +startTimeInterval()
    9.64 + {
    9.65 +   gettimeofday( &startStamp, NULL);
    9.66 + }
    9.67 +
    9.68 +
    9.69 +void
    9.70 +endIntervalAndPrintTime()
    9.71 + { 
    9.72 +   struct timeval endStamp;
    9.73 +   float64 startSecs, endSecs, intervalSecs;
    9.74 +
    9.75 +   gettimeofday( &endStamp, NULL);
    9.76 +
    9.77 +   startSecs = startStamp.tv_sec + ( startStamp.tv_usec / 1000000.0 );
    9.78 +   endSecs   = endStamp.tv_sec   + ( endStamp.tv_usec / 1000000.0 );
    9.79 +
    9.80 +   intervalSecs  = endSecs - startSecs;
    9.81 +   printf("Interval: %f", intervalSecs);
    9.82 + }
    9.83 +
    9.84 +
    9.85 +cilk Matrix * 
    9.86 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
    9.87 +
    9.88 +/**
    9.89 + *Matrix multiply program written using VMS_HW piggy-back language
    9.90 + * 
    9.91 + */
    9.92 +cilk
    9.93 +int main( int argc, char **argv )
    9.94 + { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
    9.95 +   ParamBag    *paramBag;
    9.96 +   
    9.97 +   
    9.98 +   paramBag = makeParamBag();
    9.99 +   readParamFileIntoBag( argv[1], paramBag );
   9.100 +   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
   9.101 +
   9.102 +   resultMatrix = spawn multiplyTheseMatrices( leftMatrix, rightMatrix );
   9.103 +   sync;
   9.104 +
   9.105 +   printf("\nresult matrix: \n");
   9.106 +
   9.107 +//   printMatrix( resultMatrix );
   9.108 +   
   9.109 +   exit(0); //cleans up
   9.110 + }