# HG changeset patch
# User Me
# Date 1289398074 28800
# Node ID bf7331ed394e0017dab346fe1a99493f86ba29eb
# Parent  ec0629f70ee5f2ba5519a0707a7618980ef9b5db
Working version of blocked matrix mult -- same as SSR, VCilk & VPThread

diff -r ec0629f70ee5 -r bf7331ed394e src/Application/CILK__Matrix_Mult/CILK__Matrix_Mult.h
--- a/src/Application/CILK__Matrix_Mult/CILK__Matrix_Mult.h	Tue Oct 26 19:34:03 2010 -0700
+++ b/src/Application/CILK__Matrix_Mult/CILK__Matrix_Mult.h	Wed Nov 10 06:07:54 2010 -0800
@@ -3,14 +3,23 @@
  *  Licensed under GNU General Public License version 2
  */
 
-#ifndef _VPThread__MATRIX_MULT_H_
-#define _VPThread__MATRIX_MULT_H_
+#ifndef _Cilk__MATRIX_MULT_H_
+#define _Cilk__MATRIX_MULT_H_
 
 #include <stdio.h>
 
 #include "VMS_primitive_data_types.h"
 #include "../Matrix_Mult.h"
 
+//===============================  Defines  ==============================
+#define ROWS_IN_BLOCK 32
+#define COLS_IN_BLOCK 32
+#define VEC_IN_BLOCK  32
+
+#define copyMatrixSingleton 1
+#define copyTransposeSingleton 2
+
+
 //==============================  Structures  ==============================
 typedef struct
  {
@@ -20,44 +29,55 @@
  }
 DividerParams;
 
-typedef struct
- {
-   int numRows;
-   int numCols;
+typedef
+struct
+ { int32    numRows;
+   int32    numCols;
+   Matrix  *origMatrix;
+   int32    origStartRow;
+   int32    origStartCol;
+   int32    alreadyCopied;
+   float32 *array;  //2D, but dynamically sized, so use addr arith
  }
-ResultsParams;
+SubMatrix;
 
 typedef struct
  { 
-   int        myCol;
-   int        myRow;
-   int        vectLength;
-   Matrix    *leftMatrix;
-   Matrix    *rightMatrix;
-   float32    result;
+   SubMatrix *leftSubMatrix;
+   SubMatrix *rightSubMatrix;
+   float32   *partialResultArray;
  }
-VectorParams;
+SMPairParams;
+
+typedef
+struct
+ { int32    numVals;
+   int32   *startVals;
+ }
+SlicingStruc;
+
+typedef
+struct
+ {
+   SlicingStruc *leftRowSlices;
+   SlicingStruc *vecSlices;
+   SlicingStruc *rightColSlices;
+ }
+SlicingStrucCarrier;
 
 typedef struct
  {
-      //for communicating vector results to results Thd
-   int32         vector_mutex;
-   int32         vector_cond;
-   VectorParams *currVector;
-
-      //for communicating results array back to seed (divider) Thd
-   int32         results_mutex;
-   int32         results_cond;
-   float32      *results;
-
-      //for ensuring results thd has vector lock before making vector thds
-   int32         start_mutex;
-   int32         start_cond;
-
-   Matrix *rightMatrix;
-   Matrix *resultMatrix;
+   int32 numVecIdxs;
+   int32 numRightColIdxs;
+   int32 leftRowIdxOffset;
+   int32 resColIdx;
+   SubMatrix **leftSubMatrices;
+   SubMatrix **rightSubMatrices;
+   float32 *resultArray;
+   int32 coreToRunOn;
+   int32 vecID;
  }
-MatrixMultGlobals;
+VecParams;
 
 
 //============================= Processor Functions =========================
@@ -67,8 +87,9 @@
 
 
 //================================ Entry Point ==============================
-//cilk Matrix *\
-multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
+//cilk 
+//Matrix *
+//multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
 
 
 #endif /*_VPThread__MATRIX_MULT_H_*/
diff -r ec0629f70ee5 -r bf7331ed394e src/Application/CILK__Matrix_Mult/Divide_Pr.cilk
--- a/src/Application/CILK__Matrix_Mult/Divide_Pr.cilk	Tue Oct 26 19:34:03 2010 -0700
+++ b/src/Application/CILK__Matrix_Mult/Divide_Pr.cilk	Wed Nov 10 06:07:54 2010 -0800
@@ -1,72 +1,610 @@
-/*
- *  Copyright 2009 OpenSourceStewardshipFoundation.org
- *  Licensed under GNU General Public License version 2
- *
- * Author: seanhalle@yahoo.com
- *
- */
-
-
-#include "CILK__Matrix_Mult.h"
-
-cilk float32 calcVector( void * );
-
-/*Divider creates one processor for every row-col pair.
- * It hands them:
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ */
+
+
+#include "CILK__Matrix_Mult.h"
+#include "VMS_primitive_data_types.h"
+
+#include <math.h>
+#include <sys/time.h>
+#include <string.h>
+#include <malloc.h>
+
+   //The time to compute this many result values should equal the time it
+   // takes to perform this division on matrices that need that many calcs
+   //IE, size this so that sequential time to calc equals divide time
+   // find the value by experimenting -- but divide time and calc time scale
+   // same way, so this value should remain valid across hardware
+   //Divide time is about 800us on 2.4Ghz core2Quad laptop core
+   //num cells is the cube of a side, when have two square matrices
+#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 100000 /* about 46x46 */
+
+   //Cilk doesn't have VCilk's facilities, so define constants
+#define MIN_WORK_UNIT_CYCLES 100000
+#define IDEAL_NUM_WORK_UNITS 20
+#define NUMBER_OF_CORES_TO_SPAWN_ONTO 4
+
+//===============================  External =================================
+cilk void calcVectorOfSubMatrices( void *params );
+
+void inline
+copyTranspose( int32 numRows, int32 numCols,
+               int32 origStartRow, int32 origStartCol, int32 origStride,
+               float32 *subArray, float32 *origArray );
+
+void inline
+multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols,
+                      float32 *leftArray, float32 *rightArray,
+                      float32 *resArray );
+
+
+void
+startTimeInterval();
+
+void
+endIntervalAndPrintTime();
+
+
+//=============================  Within-File  ===============================
+int inline
+measureMatrixMultPrimitive();
+
+SlicingStrucCarrier *
+calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix );
+
+SlicingStruc *
+sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal );
+
+void
+freeSlicingStruc( SlicingStruc *slicingStruc );
+
+SubMatrix **
+createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
+                   Matrix *origMatrix, int32 transposeTheMatrix );
+
+void
+freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
+                 SubMatrix **subMatrices );
+
+cilk void
+pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices,
+                                    SubMatrix **rightSubMatrices,
+                                    int32 numRowIdxs, int32 numColIdxs,
+                                    int32 numVecIdxs,
+                                    float32 *resultArray );
+
+cilk void
+makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix,
+            SlicingStrucCarrier *slicingStrucCarrier,
+            float32 *resultArray );
+
+//===========================================================================
+
+/*Divider creates one processor for every sub-matrix
+ * It hands them:
  *  the name of the result processor that they should send their results to,
- *  the left and right matrices, and the row and col they should multiply
- *  the length of the vector
- * It first creates the result processor, then all the vector processors,
+ *  the left and right matrices, and the rows and cols they should multiply
+ * It first creates the result processor, then all the sub-matrixPair
+ *  processors,
  *  then does a receive of a message from the result processor that gives
  *  the divider ownership of the result matrix.
- * Finally, the divider returns the result matrix out of the VPThread system.
- */
-
-cilk void divideIntoVectors( void *_dividerParams )
- { 
-   DividerParams     *dividerParams;
-   VectorParams      *vectParams;
-   Matrix            *leftMatrix, *rightMatrix, *resultMatrix;
-   int32              numCells, numCols, mrow, mcol;
-   float32           *resultMatrixArray;
-
-   dividerParams   = (DividerParams *)_dividerParams;
-   
-   leftMatrix      = dividerParams->leftMatrix;
-   rightMatrix     = dividerParams->rightMatrix;
-
-   
-   numCols = rightMatrix->numCols;
-
-   numCells  = leftMatrix->numRows * rightMatrix->numCols;
-   resultMatrixArray = malloc( numCells * sizeof( float32 ) );
-
-
-      //spawn vector calcs
-   for( mrow = 0; mrow < leftMatrix->numRows; mrow++ )
-    { for( mcol = 0; mcol < rightMatrix->numCols; mcol++ )
-       {
-         vectParams              = malloc( sizeof(VectorParams) );
-         vectParams->myCol       = mcol;
-         vectParams->myRow       = mrow;
-         vectParams->vectLength  = leftMatrix->numCols;
-         vectParams->leftMatrix  = leftMatrix;
-         vectParams->rightMatrix = rightMatrix;
-
-
-         resultMatrixArray[ mrow * numCols + mcol ] = spawn calcVector( vectParams );
-       }
-    }
-
-   sync;
-
-   
-      //The results of the all the work have to be linked-to from the data
-      // struc given to the seed procr -- this divide func is animated by
-      // that seed procr, so have to link results to the _dividerParams.
-   resultMatrix            = malloc( sizeof(Matrix) );
-   resultMatrix->numCols   = rightMatrix->numCols;
-   resultMatrix->numRows   = leftMatrix->numRows;
-   dividerParams->resultMatrix   = resultMatrix;
-   resultMatrix->matrix          = resultMatrixArray;
+ * Finally, the divider returns the result matrix out of the VCilk system.
+ *
+ * Divider chooses the size of sub-matrices via an algorithm that tries to
+ *  keep the minimum work above a threshold.  The threshold is machine-
+ *  dependent, so ask VCilk for min work-unit time to get a
+ *  given overhead
+ *
+ * Divide min work-unit cycles by measured-cycles for one matrix-cell
+ *  product -- gives the number of products need to have in min size
+ *  matrix.
+ *
+ * So then, take cubed root of this to get the size of a side of min sub-
+ *  matrix.  That is the size of the ideal square sub-matrix -- so tile
+ *  up the two input matrices into ones as close as possible to that size,
+ *  and create the pairs of sub-matrices.
+ *
+ *========================  STRATEGIC OVERVIEW  =======================
+ *
+ *This division is a bit tricky, because have to create things in advance
+ * that it's not at first obvious need to be created..
+ *
+ *First slice up each dimension -- three of them..  this is because will have
+ * to create the sub-matrix's data-structures before pairing the sub-matrices
+ * with each other -- so, have three dimensions to slice up before can
+ * create the sub-matrix data-strucs -- also, have to be certain that the
+ * cols of the left input have the exact same slicing as the rows of the
+ * right matrix, so just to be sure, do the slicing calc once, then use it
+ * for both.
+ *
+ *So, goes like this:
+ *1) calculate the start & end values of each dimension in each matrix.
+ *2) use those values to create sub-matrix structures
+ *3) combine sub-matrices into pairs, as the tasks to perform.
+ *
+ *Have to calculate separately from creating the sub-matrices because of the
+ * nature of the nesting -- would either end up creating the same sub-matrix
+ * multiple times, or else would have to put in detection of whether had
+ * made a particular one already if tried to combine steps 1 and 2.
+ *
+ *Step 3 has to be separate because of the nesting, as well -- same reason,
+ * would either create same sub-matrix multiple times, or else have to
+ * add detection of whether was already created.
+ *
+ *Another way to look at it: there's one level of loop to divide dimensions,
+ * two levels of nesting to create sub-matrices, and three levels to pair
+ * up the sub-matrices.
+ */
+
+/*Seed function of the divider.  Multiplies the two matrices named in
+ * _dividerParams (a DividerParams *) and leaves the product in
+ * resultMatrix->array, which must already be allocated by the caller.
+ *Small inputs are multiplied right here; larger ones are sliced up and
+ * handed to spawned children, which this function waits for.
+ */
+cilk void
+divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams )
+ {
+   DividerParams   *dividerParams;
+   Matrix          *leftMatrix, *rightMatrix;
+   SlicingStrucCarrier *slicingStrucCarrier;
+   float32         *resultArray; //points to array to be put inside result
+                                 // matrix
+   int32 numResRows, numResCols, vectLength;
+
+   startTimeInterval();
+
+   //=========== Setup -- make local copies of ptd-to-things, malloc, aso
+
+   dividerParams   = (DividerParams *)_dividerParams;
+
+   leftMatrix      = dividerParams->leftMatrix;
+   rightMatrix     = dividerParams->rightMatrix;
+
+   vectLength  = leftMatrix->numCols;
+   numResRows  = leftMatrix->numRows;
+   numResCols  = rightMatrix->numCols;
+   resultArray = dividerParams->resultMatrix->array;
+
+      //zero the result array
+   memset( resultArray, 0, numResRows * numResCols * sizeof(float32) );
+
+
+   //==============  Do either sequential mult or do division ==============
+
+      //Check if input matrices too small -- if yes, just do sequential
+      //Cutoff is determined by overhead of this divider -- relatively
+      // machine-independent
+   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
+       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
+    {
+      //====== Do sequential multiply on a single core
+
+         //transpose the right matrix
+      float32 *
+      transRightArray  = malloc( rightMatrix->numRows *
+                                        rightMatrix->numCols *
+                                        sizeof(float32) );
+
+         //copy values from orig matrix to local
+         //NOTE(review): origStride is passed numRows -- confirm vs numCols
+      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
+                     0, 0, rightMatrix->numRows,
+                     transRightArray, rightMatrix->array );
+
+         //vectLength is the function-level one set during setup; this
+         // block must not declare its own, or an unset value is passed
+      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
+                            leftMatrix->array, transRightArray,
+                            resultArray );
+
+         //the transposed copy was only scratch space for the multiply
+      free( transRightArray );
+    }
+   else
+    {
+      //====== Do parallel multiply across cores
+
+         //Calc the ideal size of sub-matrix and slice up the dimensions of
+         // the two matrices.
+         //The ideal size is the one takes the number of cycles to calculate
+         // such that calc time is equal or greater than min work-unit size
+      slicingStrucCarrier =
+         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix );
+
+         //Make the sub-matrices, and pair them up, then spawn processors to
+         // calc product of each pair.  The callee frees the carrier.
+      spawn makeSubMatricesAndSpawnAndSync( leftMatrix, rightMatrix,
+                                      slicingStrucCarrier,
+                                      resultArray );
+      sync;
+         //The result array will get filled in by the spawned children
+    }
+
+   //===============  Work done -- send results back =================
+
+   endIntervalAndPrintTime();
+
+      //results sent back by side-effect
+}
+
+
+/*Second stage of the divider.  Unpacks the three slicings from the carrier
+ * (and frees the carrier), builds the left and right sub-matrix arrays
+ * from them, spawns the pairing stage and waits for it, then frees the
+ * sub-matrices and the slicing strucs.
+ *
+ *The vec slicing serves as the column slicing of the left matrix and as
+ * the row slicing of the right matrix.  The right sub-matrices are
+ * requested with the transpose flag set, the left ones without it.
+ */
+cilk void
+makeSubMatricesAndSpawnAndSync( Matrix  *leftMatrix,  Matrix    *rightMatrix,
+                         SlicingStrucCarrier *slicingStrucCarrier,
+                         float32 *resultArray )
+ {
+   SlicingStruc *rowCuts, *innerCuts, *colCuts;
+   SubMatrix   **leftTiles, **rightTiles;
+
+      //the carrier only transports the three pointers -- take them out
+      // and let go of it right away
+   rowCuts   = slicingStrucCarrier->leftRowSlices;
+   innerCuts = slicingStrucCarrier->vecSlices;
+   colCuts   = slicingStrucCarrier->rightColSlices;
+   free( slicingStrucCarrier );
+
+   //================  Make sub-matrices, given the slicing  ================
+   leftTiles  = createSubMatrices( rowCuts,   innerCuts, leftMatrix,  FALSE );
+   rightTiles = createSubMatrices( innerCuts, colCuts,   rightMatrix, TRUE );
+
+   //==============  pair the sub-matrices and make processors ==============
+   spawn pairUpSubMatricesAndSpawnAndSync( leftTiles, rightTiles,
+                                           rowCuts->numVals,
+                                           colCuts->numVals,
+                                           innerCuts->numVals,
+                                           resultArray );
+   sync;
+
+      //all children are done after the sync, so nothing reads these anymore
+   freeSubMatrices( rowCuts,   innerCuts, leftTiles );
+   freeSubMatrices( innerCuts, colCuts,   rightTiles );
+
+   freeSlicingStruc( rowCuts );
+   freeSlicingStruc( innerCuts );
+   freeSlicingStruc( colCuts );
+
+ }
+
+
+
+
+/* numRows*colsPerRow/numCores = numToPutOntoEachCore; 
+ * put all from a given row onto same core, until exhaust allotment for that
+ *  core
+ *
+ */
+/*Spawns one calcVectorOfSubMatrices child per (row slice, col slice) of the
+ * result, handing each a malloc'd VecParams, then syncs.  Each VecParams
+ * also records a core number, dealt out in consecutive runs.
+ */
+cilk void
+pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices,
+                                    SubMatrix **rightSubMatrices,
+                                    int32 numRowIdxs, int32 numColIdxs,
+                                    int32 numVecIdxs,
+                                    float32 *resultArray )
+ {
+   int32 resRowIdx, resColIdx;
+   int32 numLeftColIdxs, numRightColIdxs;
+   int32 leftRowIdxOffset;
+   VecParams *vecParams;
+   float32 numToPutOntoEachCore, leftOverFraction;
+   int32 numCores, currCore, numOnCurrCore, numVecs = 0;
+
+      //Row strides of the two flat sub-matrix tables, as createSubMatrices
+      // lays them out: left is (row slices) x (vec slices), so a row holds
+      // numVecIdxs entries; right is (vec slices) x (col slices), so a row
+      // holds numColIdxs entries.  Only equal when slice counts are equal.
+   numLeftColIdxs  = numVecIdxs;
+   numRightColIdxs = numColIdxs;
+
+   numCores = NUMBER_OF_CORES_TO_SPAWN_ONTO;
+   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
+   leftOverFraction = 0;
+   numOnCurrCore = 0;
+   currCore = 0;
+
+   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
+    {
+      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
+      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
+       {
+         vecParams = malloc( sizeof(VecParams) );
+         vecParams->numVecIdxs       = numVecIdxs;
+         vecParams->numRightColIdxs  = numRightColIdxs;
+         vecParams->leftRowIdxOffset = leftRowIdxOffset;
+         vecParams->resColIdx        = resColIdx;
+         vecParams->leftSubMatrices  = leftSubMatrices;
+         vecParams->rightSubMatrices = rightSubMatrices;
+         vecParams->resultArray      = resultArray;
+         vecParams->coreToRunOn      = currCore;
+         vecParams->vecID            = numVecs++;
+
+         spawn calcVectorOfSubMatrices( vecParams );
+
+         numOnCurrCore += 1;
+         if( numOnCurrCore + leftOverFraction >= numToPutOntoEachCore - 1 )
+          {
+               //deal with fractional part, to ensure that imbalance is 1 max
+               // IE, core with most has only 1 more than core with least
+            leftOverFraction += numToPutOntoEachCore - numOnCurrCore;
+            if( leftOverFraction >= 1 )
+             { leftOverFraction -= 1;
+               numOnCurrCore = -1;
+             }
+            else
+             { numOnCurrCore = 0;
+             }
+               //Move to next core, max core-value to incr to is numCores -1
+            if( currCore >= numCores -1 )
+             { currCore = 0;
+             }
+            else
+             { currCore += 1;
+             }
+          }
+       }
+    }
+
+   //Free Note: vector of sub-matrices does its own free-ing, even vec-params
+
+   sync;
+
+   //free the sub-matrices in Fn that called this one
+ }
+
+
+/*Walk through the two slice-strucs, making sub-matrix strucs as go
+ */
+SubMatrix **
+createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
+                   Matrix *origMatrix, int32 transposeTheMatrix )
+ {
+   int32  rowCount, colCount, r, c;
+   int32 *rowStarts, *colStarts;
+   int32  firstRow, rowsInSlice;
+   SubMatrix **table, **slot, *sub;
+
+      //Result is a flat malloc'd table of pointers, one per pair of
+      // (row slice, col slice), filled one row slice after the other --
+      // so the entry for pair (r, c) sits at index r * colCount + c.
+      //Each entry points to its own malloc'd SubMatrix.
+   rowCount  = rowSlices->numVals;
+   colCount  = colSlices->numVals;
+   rowStarts = rowSlices->startVals;
+   colStarts = colSlices->startVals;
+
+   table = malloc( rowCount * colCount * sizeof(SubMatrix *) );
+   slot  = table;
+
+   for( r = 0; r < rowCount; r++ )
+    {
+         //startVals has one more entry after the last slice's start, so
+         // [r + 1] is readable for every r; the difference of two
+         // neighbouring entries is the length of the slice
+      firstRow    = rowStarts[r];
+      rowsInSlice = rowStarts[r + 1] - firstRow;
+
+      for( c = 0; c < colCount; c++ )
+       {
+         sub = malloc( sizeof(SubMatrix) );
+         sub->numRows      = rowsInSlice;
+         sub->numCols      = colStarts[c + 1] - colStarts[c];
+         sub->origMatrix   = origMatrix;
+         sub->origStartRow = firstRow;
+         sub->origStartCol = colStarts[c];
+
+            //no parallel singleton in Cilk, so copy here
+            //NOTE(review): both copy helpers live outside this function;
+            // presumably they allocate and fill sub->array -- confirm
+         if( transposeTheMatrix )
+            copyTransposeFromOrig( sub );
+         else
+            copyFromOrig( sub );
+         sub->alreadyCopied = TRUE;
+
+         *slot++ = sub;
+       }
+    }
+   return table;
+ }
+
+/*Frees what createSubMatrices built for one matrix: every sub-matrix's
+ * array (only if its alreadyCopied flag is set), every sub-matrix struc,
+ * then the pointer table.  The slicings are only read, for the counts.
+ */
+void
+freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
+                 SubMatrix **subMatrices )
+ {
+   int32      numEntries, idx;
+   SubMatrix *curr;
+
+      //table is flat, one entry per (row slice, col slice) pair
+   numEntries = rowSlices->numVals * colSlices->numVals;
+
+   for( idx = 0; idx < numEntries; idx++ )
+    { curr = subMatrices[ idx ];
+      if( curr->alreadyCopied )
+       { free( curr->array );
+       }
+      free( curr );
+    }
+   free( subMatrices );
+ }
+
+
+/*Picks the target side length for sub-matrices, then cuts the three
+ * dimensions of the multiply with it: rows of the left matrix, the shared
+ * dimension (cols of the left matrix) and cols of the right matrix.
+ *Returns a malloc'd carrier that holds the three slicings.
+ *
+ *Two candidate side lengths are computed and the larger one is used:
+ * 1) cost based:  5 * cubed-root( MIN_WORK_UNIT_CYCLES / cycles reported
+ *    by measureMatrixMultPrimitive )
+ * 2) count based: 0.8 * rows of left matrix / rounded cubed-root of
+ *    IDEAL_NUM_WORK_UNITS
+ */
+SlicingStrucCarrier *
+calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix )
+{
+   SlicingStrucCarrier *carrier;
+   float32 sideFromCost, sideFromCount, targetSide;
+   float64 opsPerMinWorkUnit;
+   int     cyclesPerPrimitive;
+   int32   lastLeftRow, lastVec, lastRightCol;
+
+   carrier = malloc( sizeof(SlicingStrucCarrier) );
+
+
+   //=======  Calc ideal size of min-sized sub-matrix  ========
+
+      //Cilk has no facility to ask for these, so the min work-unit size
+      // is a constant and the primitive is measured by a local function
+   cyclesPerPrimitive = measureMatrixMultPrimitive( );
+
+      //how many primitive ops fit into the smallest worthwhile work unit
+   opsPerMinWorkUnit =
+      (float64)MIN_WORK_UNIT_CYCLES / (float64)cyclesPerPrimitive;
+
+      //take cubed root -- that's number of these in a "side" of sub-matrix
+      // then multiply by 5 because the primitive is 5x5
+   sideFromCost = 5 * cbrt( opsPerMinWorkUnit );
+
+      //side that splits the rows of the left matrix into about
+      // cubed-root( IDEAL_NUM_WORK_UNITS ) pieces
+   sideFromCount  = leftMatrix->numRows / rint(cbrt( IDEAL_NUM_WORK_UNITS ));
+   sideFromCount *= 0.8; //finer granularity to help load balance
+
+      //the larger candidate wins
+   targetSide = ( sideFromCost > sideFromCount ) ? sideFromCost
+                                                 : sideFromCount;
+
+
+   //============  Slice up dimensions, now that know target size ===========
+
+      //Tell the slicer the target size of a side (floating pt), the start
+      // value to start slicing at, and the end value to stop slicing at
+      //It returns an array of start value of each chunk, plus number of them
+      //Every dimension starts at index 0 and ends at its size minus one
+
+   lastLeftRow  = leftMatrix->numRows  - 1;
+   lastVec      = leftMatrix->numCols  - 1;
+   lastRightCol = rightMatrix->numCols - 1;
+
+   carrier->leftRowSlices  =
+      sliceUpDimension( targetSide, 0, lastLeftRow );
+
+   carrier->vecSlices      =
+      sliceUpDimension( targetSide, 0, lastVec );
+
+   carrier->rightColSlices =
+      sliceUpDimension( targetSide, 0, lastRightCol );
+
+   return carrier;
+}
+
+
+/*Cut the index range startVal..endVal (inclusive) into consecutive slices
+ * about idealSizeOfSide long.  In the returned struc, startVals holds the
+ * first index of each slice followed by one extra entry, endVal + 1;
+ * numVals is the number of slices.  A leftover shorter than half the
+ * ideal size is merged into the last slice rather than standing alone.
+ */
+SlicingStruc *
+sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal )
+ { float32 residualAcc = 0;
+   int     length, i, *startVals, sizeOfSlice, endCondition;
+   SlicingStruc *slicingStruc = malloc( sizeof(SlicingStruc) );
+
+   length = (endVal >= startVal) ? endVal - startVal + 1 : 0;
+      //Keep ideal size within 1..length (second test also catches NaN) so
+      // each slice advances at least one index and (int) cannot overflow
+   if( idealSizeOfSide > (float32)length ) idealSizeOfSide = (float32)length;
+   if( !(idealSizeOfSide >= 1) )          idealSizeOfSide = 1;
+
+      //One start value per slice plus the end marker.  Slices are at least
+      // one index long, so length + 1 entries always suffice; sizing from
+      // length / idealSizeOfSide rounded down can come out one too small
+   startVals = malloc( (length + 1) * sizeof(int32) );
+
+   endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size
+   for( i = 0; startVal <= endVal; i++ )
+    { startVals[i] = startVal;
+      residualAcc += idealSizeOfSide;
+      sizeOfSlice  = (int)residualAcc;
+      residualAcc -= (float32)sizeOfSlice;
+      startVal    += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..
+      if( startVal > endCondition )  //rest is too short to stand alone
+       { startVal = endVal + 1;
+         startVals[ i + 1 ] = startVal;  //marker just past the last slice
+       }
+    }
+   slicingStruc->startVals = startVals;
+   slicingStruc->numVals   = i;  //also == idx of the end marker
+   return slicingStruc;
+ }
+
+/*Frees a slicing struc together with its startVals array */
+void freeSlicingStruc( SlicingStruc *slicingStruc )
+ { int32 *starts = slicingStruc->startVals;
+   free( starts );
+   free( slicingStruc );
+ }
+
+
+/*Runs one 5x5 multiply loop nest on scratch arrays -- the "primitive" op
+ * that the divider sizes its sub-matrices by -- and returns numCycles.
+ */
+int inline
+measureMatrixMultPrimitive()
+ {
+   int r, c, v, numCycles;
+   float32 *res, *left, *right;
+
+      //setup inputs
+   left  = malloc( 5 * 5 * sizeof( float32 ) );
+   right = malloc( 5 * 5 * sizeof( float32 ) );
+   res   = malloc( 5 * 5 * sizeof( float32 ) );
+   for( r = 0; r < 5; r++ )
+    {
+      for( c = 0; c < 5; c++ )
+       {
+         left[  r * 5 + c ] = r;
+         right[ r * 5 + c ] = c;
+       }
+    }
+      //do primitive
+//   VCilk__start_primitive();  //for now, just takes time stamp
+   for( r = 0; r < 5; r++ )
+    {
+      for( c = 0; c < 5; c++ )
+       {
+         for( v = 0; v < 5; v++ )
+          {
+            res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ];
+          }
+       }
+    }
+//   numCycles = VCilk__end_primitive_and_give_cycles();
+   free( left );
+   free( right );
+   free( res );
+      //NOTE(review): numCycles is never assigned -- timing call is disabled
+   return numCycles;
+ }
diff -r ec0629f70ee5 -r bf7331ed394e src/Application/CILK__Matrix_Mult/EntryPoint.cilk
--- a/src/Application/CILK__Matrix_Mult/EntryPoint.cilk	Tue Oct 26 19:34:03 2010 -0700
+++ b/src/Application/CILK__Matrix_Mult/EntryPoint.cilk	Wed Nov 10 06:07:54 2010 -0800
@@ -1,16 +1,22 @@
 /*
- *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
  *  Licensed under GNU General Public License version 2
  *
  * Author: seanhalle@yahoo.com
  *
  */
 
+#include <math.h>
+
 #include "CILK__Matrix_Mult.h"
 
-cilk void divideIntoVectors( void * );
+//==========================================================================
+cilk void
+divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams );
 
-/*Every VPThread system has an "entry point" function that creates the first
+//==========================================================================
+
+/*Every VCilk system has an "entry point" function that creates the first
  * processor, which starts the chain of creating more processors..
  * eventually all of the processors will dissipate themselves, and
  * return.
@@ -19,27 +25,41 @@
  * functions do:
  *1) it creates the params for the seed processor, from the
  *    parameters passed into the entry-point function
- *2) it calls VPThread__create_seed_procr_and_do_work
+ *2) it calls VCilk__create_seed_procr_and_do_work
  *3) it gets the return value from the params struc, frees the params struc,
  *    and returns the value from the function
  *
  */
-cilk
-Matrix *
+cilk Matrix *
 multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
  { Matrix          *resMatrix;
    DividerParams   *dividerParams;
+   int32            numResRows, numResCols;
 
-
+//         printf("entry point");
    dividerParams              = malloc( sizeof( DividerParams ) );
    dividerParams->leftMatrix  = leftMatrix;
    dividerParams->rightMatrix = rightMatrix;
+ 
+   numResRows  = leftMatrix->numRows;
+   numResCols  = rightMatrix->numCols;
+   resMatrix            = malloc( sizeof(Matrix) );
+   resMatrix->array     = malloc( numResRows * numResCols * sizeof(float32));
+   resMatrix->numCols   = rightMatrix->numCols;
+   resMatrix->numRows   = leftMatrix->numRows;
 
-   spawn divideIntoVectors( dividerParams );
+
+   dividerParams->resultMatrix   = resMatrix;
+
+      //create divider processor, start doing the work, and wait till done
+      //This function is the "border crossing" between normal code and VCilk
+   //VCilk__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs,\
+                                         dividerParams );
+   spawn divideWorkIntoSubMatrixPairProcrs( dividerParams );
+
    sync;
-
-      //get result matrix and return it
-   resMatrix = dividerParams->resultMatrix;
+   
+      //return result matrix
    free( dividerParams );
    return resMatrix;
  }
diff -r ec0629f70ee5 -r bf7331ed394e src/Application/CILK__Matrix_Mult/Vector_Pr.cilk
--- a/src/Application/CILK__Matrix_Mult/Vector_Pr.cilk	Tue Oct 26 19:34:03 2010 -0700
+++ b/src/Application/CILK__Matrix_Mult/Vector_Pr.cilk	Wed Nov 10 06:07:54 2010 -0800
@@ -1,48 +1,119 @@
-/* 
- *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
  *  Licensed under GNU General Public License version 2
  *
- * Author: SeanHalle@yahoo.com
+ * Author: seanhalle@yahoo.com
  *
  */
 
+
 #include "CILK__Matrix_Mult.h"
+#include <math.h>
+#include <stdlib.h>
+#include <malloc.h>
 
-/*A Vector processor is created with an environment that holds two matrices,
- * the row and col that it owns, and the name of a result gathering
- * processor.
- *It calculates its vector product then sends the result to the result
- * processor, which puts it into the result matrix and returns that matrix
- * when all is done.
- */
-cilk
-float32
-calcVector( void *data )
- { 
-   VectorParams   *params;
-   int             myRow, myCol, vectLength, pos;
-   float32        *leftMatrixArray, *rightMatrixArray, result = 0.0;
-   Matrix         *leftMatrix, *rightMatrix;
+//===========================================================================
 
-   params      = (VectorParams *)data;
-   myCol       = params->myCol;
-   myRow       = params->myRow;
-   vectLength  = params->vectLength;
-   leftMatrix  = params->leftMatrix;
-   rightMatrix = params->rightMatrix;
-   leftMatrixArray  = leftMatrix->matrix;
-   rightMatrixArray = rightMatrix->matrix;
-         //=====================  DEBUG  ======================
-         #ifdef PRINT_DEBUG
-         if( myCol == 0 )
-            printf("start vector: %d, %d\n", myRow, myCol ); fflush(stdin);
-         #endif
-         //====================================================
+cilk void calcSubMatrixProduct( void *data );
 
-   for( pos = 0; pos < vectLength; pos++ )
+
+void inline
+accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
+                  int32    startRow,
+                  int32    numRows,
+                  int32    startCol,
+                  int32    numCols,
+                  int32    numOrigCols );
+
+
+//===========================================================================
+
+cilk void
+calcVectorOfSubMatrices( void *_vecParams )
+ { int32         numVecIdxs, leftRowIdxOffset, numRightColIdxs, resColIdx;
+   SubMatrix   **leftSubMatrices, **rightSubMatrices;
+   float32      *resultArray;
+   int32         vecIdx, coreWithAffinity;
+   SMPairParams *subMatrixPairParams, **vecOfSubMatrixParams;
+   VecParams    *vecParams;
+
+   vecParams = (VecParams *)_vecParams;
+
+   numVecIdxs       = vecParams->numVecIdxs;
+   numRightColIdxs  = vecParams->numRightColIdxs;
+   leftRowIdxOffset = vecParams->leftRowIdxOffset;
+   resColIdx        = vecParams->resColIdx;
+   leftSubMatrices  = vecParams->leftSubMatrices;
+   rightSubMatrices = vecParams->rightSubMatrices;
+   resultArray      = vecParams->resultArray;
+   coreWithAffinity = vecParams->coreToRunOn;
+         
+//         printf("inside vector %d, %d\n", leftRowIdxOffset, resColIdx );
+
+   vecOfSubMatrixParams = malloc( numVecIdxs * sizeof(SMPairParams *) );
+   if( vecOfSubMatrixParams == 0 ){printf("malloc error"); exit(1);}
+
+   for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
     {
-      result += leftMatrixArray[ myRow * vectLength + pos ]  *
-                rightMatrixArray[ pos  * vectLength + myCol];
+         //Make the processor for the pair of sub-matrices
+      subMatrixPairParams  = malloc( sizeof(SMPairParams) );
+      subMatrixPairParams->leftSubMatrix  =
+         leftSubMatrices[ leftRowIdxOffset + vecIdx ];
+
+      subMatrixPairParams->rightSubMatrix =
+         rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
+
+//            printf("about to spawn pair %X\n", (int)subMatrixPairParams->leftSubMatrix->array);
+      spawn calcSubMatrixProduct( subMatrixPairParams );
+//            printf("done with spawn\n");
+      vecOfSubMatrixParams[ vecIdx ] = subMatrixPairParams;
     }
-   return result;
+
+//         printf("done spawning product pairs\n");
+   sync;
+
+      //now accumulate individual result matrices into final result matrix
+   for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
+    {
+      subMatrixPairParams = vecOfSubMatrixParams[ vecIdx ];
+
+      accumulateResult( resultArray, subMatrixPairParams->partialResultArray,
+                        subMatrixPairParams->leftSubMatrix->origStartRow,
+                        subMatrixPairParams->leftSubMatrix->numRows,
+                        subMatrixPairParams->rightSubMatrix->origStartCol,
+                        subMatrixPairParams->rightSubMatrix->numCols,
+                   subMatrixPairParams->rightSubMatrix->origMatrix->numCols);
+
+         //Note, resultArray is made on the core that produces the results
+         // that gives chance to set affinity so all in vector run on same
+         // core and re-use that array, and prevents writes from causing
+         // thrashing of the cache -- as long as array big enough, the copy
+         // overhead is miniscule vs the size-of-side reuse of each byte
+      free( subMatrixPairParams->partialResultArray );
+      free( subMatrixPairParams );
+    }
+   free( vecOfSubMatrixParams );
+   free( vecParams );
  }
+
+
+
+void inline
+accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
+                  int32    startRow,
+                  int32    numRows,
+                  int32    startCol,
+                  int32    numCols,
+                  int32    numOrigCols )
+ { int32    r, c;
+   float32 *destRow, *srcRow;
+      //Adds the numRows x numCols partial result onto the window of
+      // resultArray that starts at (startRow, startCol); numOrigCols is
+      // used as the row stride of resultArray
+   for( r = 0; r < numRows; r++ )
+    { destRow = resultArray + (r + startRow) * numOrigCols + startCol;
+      srcRow  = subMatrixResultArray + r * numCols;
+      for( c = 0; c < numCols; c++ )
+         destRow[ c ] += srcRow[ c ];
+    }
+ }
diff -r ec0629f70ee5 -r bf7331ed394e src/Application/CILK__Matrix_Mult/subMatrix_Pr.cilk
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Application/CILK__Matrix_Mult/subMatrix_Pr.cilk	Wed Nov 10 06:07:54 2010 -0800
@@ -0,0 +1,301 @@
+/* 
+ *  Copyright 2010 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: SeanHalle@yahoo.com
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "CILK__Matrix_Mult.h"
+
+void inline
+copyFromOrig( SubMatrix *subMatrix );
+
+void inline
+copyTransposeFromOrig( SubMatrix *subMatrix );
+
+void inline
+multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
+                     float32 *resArray,
+                     int startRow,  int endRow,
+                     int startCol,  int endCol,
+                     int startVec,  int endVec,
+                     int resStride, int inpStride );
+
+void inline
+multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols,
+                      float32 *leftArray, float32 *rightArray,
+                      float32 *resArray );
+
+
+/*Work-unit that multiplies one pair of sub-matrices.
+ *data points to an SMPairParams that holds the left and right sub-matrix.
+ * Their arrays are used as they are -- the calls that would copy them out
+ * of the original matrices are commented out below -- and the right array
+ * is passed to a multiply that expects it transposed.
+ *The product is computed by the hand-written blocked multiply in this
+ * file, into an array that is allocated and zeroed here.
+ *
+ *The result is handed back by side-effect, in params->partialResultArray,
+ * and the process exits with "malloc error" if the array can't be made.
+ */
+cilk void
+calcSubMatrixProduct( void *data )
+ { 
+   SMPairParams   *params;
+   float32        *leftArray,  *rightArray, *resArray;
+   SubMatrix      *leftSubMatrix, *rightSubMatrix;
+   int32           resSize;
+   int32           numResRows, numResCols, vectLength;
+
+//         printf("inside submatrix-pair\n");
+   params         = (SMPairParams *)data;
+   leftSubMatrix  = params->leftSubMatrix;
+   rightSubMatrix = params->rightSubMatrix;
+
+      //inputs must already be copied out of orig -- copy calls are disabled
+   //copyFromOrig( leftSubMatrix, animatingPr );
+   //copyTransposeFromOrig( rightSubMatrix, animatingPr );
+   
+   leftArray      = leftSubMatrix->array;
+   rightArray     = rightSubMatrix->array;
+
+      //make this array here, on the core that computes the results
+      // with Cilk's semantics, have to have separate result array for each
+      // spawned processor -- unless want to change the spawn and sync
+      // pattern, such that spawn one from each vector, then sync, then
+      // another, and so forth -- this will cause idle time due to imbalance
+      // in matrix sizes
+      //This also gives chance to set affinity so all in vector run on same
+      // core and re-use the accumulation array,
+      //As a side-benefit, it also prevents writes from causing
+      // thrashing of the cache -- as long as array big enough, the copy
+      // overhead is small because each byte is reused size-of-side times
+      //This is freed in the vector processor
+   resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32);
+   resArray = malloc( resSize );
+   if( resArray == 0 ){printf("malloc error"); exit(1);}
+   memset( resArray, 0, resSize );
+
+   
+   vectLength = leftSubMatrix->numCols;
+   numResRows = leftSubMatrix->numRows;
+   numResCols = rightSubMatrix->numCols;
+
+//         printf("just before multiply arrays %X\n", leftArray);
+   multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
+                         leftArray, rightArray,
+                         resArray );
+
+   //send result by side-effect
+   params->partialResultArray = resArray;
+ }
+
+
+
+/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into
+ * the 32KB L1 cache.
+ *Would be nice to embed this within another level that divided into
+ * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
+ *
+ *Eventually want these divisions to be automatic, using DKU pattern
+ * embedded into VMS and exposed in the language, and with VMS controlling the
+ * divisions according to the cache sizes, which it knows about.
+ *Also, want VMS to work with language to split among main-mems, so a socket
+ * only cranks on data in its local segment of main mem
+ *
+ *So, outer two loops determine start and end points within the result matrix.
+ * Inside that, a loop dets the start and end points along the shared dimensions
+ * of the two input matrices.
+ */
+void inline
+multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
+                                int32 numResCols,
+                                float32 *leftArray, float32 *rightArray,
+                                float32 *resArray )
+ {    //rightArray is transposed, so vecLength is the stride of both inputs
+   int resStride, inpStride;
+   int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
+
+   resStride  = numResCols;
+   inpStride  = vecLength;
+
+   for( resStartRow = 0; resStartRow < numResRows; )
+    {    //end indexes are inclusive, so an end equal to the size is past it
+      resEndRow = resStartRow + ROWS_IN_BLOCK -1;  //start at zero, so -1
+      if( resEndRow >= numResRows ) resEndRow = numResRows -1;
+
+      for( resStartCol = 0; resStartCol < numResCols; )
+       {
+         resEndCol   = resStartCol + COLS_IN_BLOCK -1;
+         if( resEndCol >= numResCols ) resEndCol = numResCols -1;
+
+         for( startVec = 0; startVec < vecLength; )
+          {
+            endVec   = startVec + VEC_IN_BLOCK -1;
+            if( endVec >= vecLength ) endVec = vecLength -1;
+
+               //Each call adds one tile's contribution into resArray
+               //By having the "vector" of sub-blocks in a sub-block slice
+               // be marched down in inner loop, are re-using the result
+               // matrix, which stays in L1 cache and re-using the left sub-mat
+               // which repeats for each right sub-mat -- can only re-use two of
+               // the three, so result is the most important -- avoids writing
+               // dirty blocks until those result-locations fully done
+               //Row and Col is position in result matrix -- so row and vec
+               // for left array, then vec and col for right array
+            multiplySubBlocksTransposed( leftArray, rightArray,
+                                         resArray,
+                                         resStartRow,  resEndRow,
+                                         resStartCol,  resEndCol,
+                                         startVec,  endVec,
+                                         resStride, inpStride );
+            startVec = endVec +1;
+          }
+         resStartCol = resEndCol +1;
+       }
+      resStartRow = resEndRow +1;
+    }
+ }
+
+
+
+void inline
+multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
+                     float32 *resArray,
+                     int resStartRow,  int resEndRow,
+                     int resStartCol,  int resEndCol,
+                     int startVec,  int endVec,
+                     int resStride, int inpStride )
+ { int      row, col, idx;
+   float32 *leftRow, *rightRow;   //a row of left, a row of transposed right
+   float32  dotProduct;
+
+      //For every result position inside the tile, take the dot product,
+      // over the inclusive range startVec..endVec, of row "row" of left
+      // with row "col" of right.  Right is transposed, so both inputs are
+      // walked with the same stride
+   for( col = resStartCol; col <= resEndCol; col++ )
+    { rightRow = rightArray + col * inpStride;
+
+      for( row = resStartRow; row <= resEndRow; row++ )
+       { leftRow    = leftArray + row * inpStride;
+         dotProduct = 0;
+
+         for( idx = startVec; idx <= endVec; idx++ )
+            dotProduct += leftRow[ idx ] * rightRow[ idx ];
+
+            //add rather than store -- the caller's other vec-ranges
+            // contribute to this same result position
+         resArray[ row * resStride + col ] += dotProduct;
+       }
+    }
+ }
+
+
+/*Reuse this in divider when do the sequential multiply case
+ */
+void inline
+copyTranspose( int32 numRows, int32 numCols,
+               int32 origStartRow, int32 origStartCol, int32 origStride,
+               float32 *subArray, float32 *origArray )
+ { int32    r, c;
+   float32 *origRow;   //start of the current row's portion inside orig
+
+      //Read the numRows x numCols window of orig in normal row order, and
+      // write each value to the swapped (col, row) position of subArray.
+      //subArray ends up with numCols rows, each numRows long, which is
+      // why numRows is the stride on the destination side
+   for( r = 0; r < numRows; r++ )
+    { origRow = origArray + (r + origStartRow) * origStride + origStartCol;
+
+      for( c = 0; c < numCols; c++ )
+       {
+         subArray[ c * numRows + r ]  =  origRow[ c ];
+       }
+    }
+ }
+
+/*Allocates subMatrix->array and fills it with the transpose of the window
+ * of origMatrix that subMatrix describes, then sets alreadyCopied. */
+void inline
+copyTransposeFromOrig( SubMatrix *subMatrix )
+ { int numCols, numRows, origStartRow, origStartCol, origStride;
+   Matrix *origMatrix;
+   float32 *origArray, *subArray;
+//   if( subMatrix->alreadyCopied ) return;
+//   VCilk__start_singleton(copyMatrixSingleton,&&EndOfTranspSingleton,animPr);
+
+   origMatrix   = subMatrix->origMatrix;
+   origArray    = origMatrix->array;
+   numCols      = subMatrix->numCols;
+   numRows      = subMatrix->numRows;
+   origStartRow = subMatrix->origStartRow;
+   origStartCol = subMatrix->origStartCol;
+   origStride   = origMatrix->numCols;
+
+   subArray     = malloc( numRows * numCols *sizeof(float32) );
+   if( subArray == 0 ){printf("malloc error"); exit(1);}
+   subMatrix->array = subArray;
+
+      //copy values from orig matrix to local
+   copyTranspose( numRows, numCols,
+                  origStartRow, origStartCol, origStride,
+                  subArray, origArray );
+
+   subMatrix->alreadyCopied = TRUE; //must be last thing before label
+//   EndOfTranspSingleton:
+   return;
+ }
+
+
+/*Allocates subMatrix->array and fills it with a straight copy of the window
+ * of origMatrix that subMatrix describes, then sets alreadyCopied. */
+void inline
+copyFromOrig( SubMatrix *subMatrix )
+ { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
+   Matrix *origMatrix;
+   float32 *origArray, *subArray;
+   int32 row, col, offset, origOffset;
+
+      //The singleton below, when enabled, lets only a single VP execute the
+      // code between start and end -- using start and end so that work runs
+      // outside the master.
+      //Inside, if a second VP ever executes the start, it will be returned
+      // from the end-point.
+      //Note, for non-GCC, can add a second SSR call at the end, and inside
+      // that one, look at the stack at the return addr & save that in an
+//   if( subMatrix->alreadyCopied ) return;
+//   VCilk__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton,animPr);
+
+   origMatrix    = subMatrix->origMatrix;
+   origArray     = origMatrix->array;
+   numCols       = subMatrix->numCols;
+   numRows       = subMatrix->numRows;
+   origStartRow  = subMatrix->origStartRow;
+   origStartCol  = subMatrix->origStartCol;
+   origStride    = origMatrix->numCols;
+
+   subArray      = malloc( numRows * numCols *sizeof(float32) );
+   if( subArray == 0 ){printf("malloc error"); exit(1);}
+   subMatrix->array = subArray;
+
+      //copy values from orig matrix to local
+   stride        = numCols;
+   for( row = 0; row < numRows; row++ )
+    {
+      offset     = row * stride;
+      origOffset = (row + origStartRow) * origStride + origStartCol;
+      for( col = 0; col < numCols; col++ )
+       {
+         subArray[ offset + col ]  =  origArray[ origOffset + col ];
+       }
+    }
+
+   subMatrix->alreadyCopied = TRUE; //must be last thing before label
+//   EndOfCopySingleton:
+   return;
+ }
diff -r ec0629f70ee5 -r bf7331ed394e src/Application/Makefile
--- a/src/Application/Makefile	Tue Oct 26 19:34:03 2010 -0700
+++ b/src/Application/Makefile	Wed Nov 10 06:07:54 2010 -0800
@@ -1,35 +1,51 @@
+#
+#  Copyright Nov 6, 2010 OpenSourceStewardshipFoundation.org
+#  Licensed under GNU General Public License version 2
+#
+# author seanhalle@yahoo.com
 
 
 CILK_SOURCE = \
    CILK__Matrix_Mult/EntryPoint.cilk \
    CILK__Matrix_Mult/Divide_Pr.cilk \
    CILK__Matrix_Mult/Vector_Pr.cilk \
+   CILK__Matrix_Mult/subMatrix_Pr.cilk \
    main.cilk
 
 C_SOURCE = \
    matrix_mult.c \
-   ParamHelper/ParamBag.c\
+   ParamHelper/ParamBag.c \
    ParamHelper/ReadParamsFromFile.c
 
+#The next two rules make a new string, with the same names as the
+# source, but the endings changed to .o
+#The third concatenates the two source-file strings
 C_OBJS = $(C_SOURCE:.c=.o)
 
 CILK_OBJS = $(CILK_SOURCE:.cilk=.o) 
 
-OBJECTS = $(C_SOURCE) $(CILK_SOURCE)
+OBJECTS = $(C_OBJS) $(CILK_OBJS)
+
 
 #Make has the built-in variable "$<" which is the source file
 # and "$@" which is the target for that source
+#The first rule says for each file in C_SOURCE, put the
+# name in place of $< and expand
+# C_OBJS in place of $@ and run the gcc command, when files are
+# out of date
 $(C_OBJS): $(C_SOURCE)
-	gcc -c $< -o $@
+	gcc -c $*.c -o $*.o;
 
 $(CILK_OBJS): $(CILK_SOURCE)
-	gcc -c $< -o $@
+	cilkc -c $*.cilk -o $*.o
 
 all: $(OBJECTS)
 	cilkc $(OBJECTS) -o CILK_Linux__Matrix_Mult; \
 	cp CILK_Linux__Matrix_Mult ~/D/2__INRIA_OMP/1__Development/2__runs_and_data/executables
 
 
+clean:
+	rm *.o; rm ParamHelper/*.o; rm CILK__Matrix_Mult/*.o
 
 #================================================================
 #Other stuff tried/played_with/copied
@@ -62,8 +78,6 @@
 #================================================================
 # playing with below..
 
-#7C9A-RV6P-3XE2-JV99-426K-2K
-
 #rule for inferring that the .cilk file is the source for .o file
 # and how to create the .o from the .cilk
 #%.o : %.cilk
diff -r ec0629f70ee5 -r bf7331ed394e src/Application/Matrix_Mult.c
--- a/src/Application/Matrix_Mult.c	Tue Oct 26 19:34:03 2010 -0700
+++ b/src/Application/Matrix_Mult.c	Wed Nov 10 06:07:54 2010 -0800
@@ -61,7 +61,7 @@
    
    numRows = matrixStruc->numRows;
    numCols = matrixStruc->numCols;
-   matrixStart = matrixStruc->matrix;
+   matrixStart = matrixStruc->array;
 
    file = fopen( matrixFileName, "r" );
    if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);}
@@ -131,7 +131,7 @@
    retMatrix = malloc( sizeof( Matrix ) );
    retMatrix->numRows = numRows;
    retMatrix->numCols = numCols;
-   retMatrix->matrix  = malloc( numRows * numCols * sizeof(float32) );
+   retMatrix->array  = malloc( numRows * numCols * sizeof(float32) );
 
    return retMatrix;
  }
@@ -142,24 +142,26 @@
  }
  void
 freeMatrix( Matrix * matrix )
- { free( matrix->matrix );
+ { free( matrix->array );
    free( matrix );
  }
 
 void
 printMatrix( Matrix *matrix )
- { int r, c, numRows, numCols;
+ { 
+   int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr;
    float32 *matrixArray;
 
-   numRows = matrix->numRows;
-   numCols = matrix->numCols;
-   matrixArray = matrix->matrix;
+   numRows = rowsToPrint = matrix->numRows;
+   numCols = colsToPrint = matrix->numCols;
+   matrixArray = matrix->array;
 
-   for( r = 0; r < numRows; r++ )
-    { for( c = 0; c < numCols; c++ )
-       { printf( "%f | ", *(matrixArray + r*numCols + c) );
+   rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed
+   colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed
+   for( r = 0; r < numRows; r += rowIncr )
+    { for( c = 0; c < numCols; c += colIncr )
+       { printf( "%3.1f | ", matrixArray[ r * numCols + c ] );
        }
       printf("\n");
     }
  }
-
diff -r ec0629f70ee5 -r bf7331ed394e src/Application/Matrix_Mult.h
--- a/src/Application/Matrix_Mult.h	Tue Oct 26 19:34:03 2010 -0700
+++ b/src/Application/Matrix_Mult.h	Wed Nov 10 06:07:54 2010 -0800
@@ -19,7 +19,7 @@
 struct
  { int32    numRows;
    int32    numCols;
-   float32 *matrix;  //2D, but dynamically sized, so use addr arith
+   float32 *array;  //2D, but dynamically sized, so use addr arith
  }
 Matrix;
 
diff -r ec0629f70ee5 -r bf7331ed394e src/Application/main.cilk
--- a/src/Application/main.cilk	Tue Oct 26 19:34:03 2010 -0700
+++ b/src/Application/main.cilk	Wed Nov 10 06:07:54 2010 -0800
@@ -1,41 +1,67 @@
-/*
- *  Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org
+/*
+ *  Copyright 2010 OpenSourceStewardshipFoundation.org
  *  Licensed under GNU General Public License version 2
- *
- * author seanhalle@yahoo.com
- */
-
-#include <malloc.h>
-#include <stdlib.h>
-
-#include "Matrix_Mult.h"
-#include "CILK__Matrix_Mult/CILK__Matrix_Mult.h"
-
-cilk Matrix * 
-multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
-
-/**
- *Matrix multiply program written using VMS_HW piggy-back language
- * 
- */
-cilk
-int main( int argc, char **argv )
- { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
-   ParamBag    *paramBag;
-   
-   
-   paramBag = makeParamBag();
-   readParamFileIntoBag( argv[1], paramBag );
-   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
-
-   resultMatrix = spawn multiplyTheseMatrices( leftMatrix, rightMatrix );
-   sync;
-
-   printf("\nresult matrix: \n");
-
-//   printMatrix( resultMatrix );
-   
-//   VPThread__print_stats();
-   
-   exit(0); //cleans up
- }
+ *
+ * author seanhalle@yahoo.com
+ */
+
+#include <malloc.h>
+#include <stdlib.h>
+
+#include "Matrix_Mult.h"
+#include "CILK__Matrix_Mult/CILK__Matrix_Mult.h"
+
+
+   //single global var -- just get it done
+struct timeval startStamp;
+
+
+   //stores the current time of day into the global startStamp
+void startTimeInterval()
+ {
+   gettimeofday( &startStamp, NULL);
+ }
+
+
+void
+endIntervalAndPrintTime()
+ { struct timeval nowStamp;
+   float64        secsAtStart, secsAtEnd, elapsedSecs;
+
+      //stamp the end first, so the arithmetic below is not part of interval
+   gettimeofday( &nowStamp, NULL);
+
+      //fold whole seconds and microseconds into fractional seconds
+   secsAtStart = startStamp.tv_sec + ( startStamp.tv_usec / 1000000.0 );
+   secsAtEnd   = nowStamp.tv_sec   + ( nowStamp.tv_usec / 1000000.0 );
+   elapsedSecs = secsAtEnd - secsAtStart;
+   printf("Interval: %f", elapsedSecs);
+ }
+
+
+cilk Matrix * 
+multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
+
+/**
+ *Matrix multiply program written using VMS_HW piggy-back language
+ * 
+ */
+cilk
+int main( int argc, char **argv )
+ { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
+   ParamBag    *paramBag;
+      //argv[1] names the parameter file -- without it there is nothing to read
+   if( argc < 2 ){ printf("usage: give the param file as first argument\n"); exit(1); }
+   paramBag = makeParamBag();
+   readParamFileIntoBag( argv[1], paramBag );
+   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
+
+   resultMatrix = spawn multiplyTheseMatrices( leftMatrix, rightMatrix );
+   sync;
+
+   printf("\nresult matrix: \n");
+
+//   printMatrix( resultMatrix );
+   
+   exit(0); //cleans up
+ }