# HG changeset patch
# User Some Random Person <seanhalle@yahoo.com>
# Date 1337801959 25200
# Node ID 9cf4c84a309167f2ebf3e6ab3d3eb67d0c217a44

Initial add of copied code -- nonsense code still

diff -r 000000000000 -r 9cf4c84a3091 .hgeol
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgeol	Wed May 23 12:39:19 2012 -0700
@@ -0,0 +1,14 @@
+
+[patterns]
+**.py = native
+**.txt = native
+**.c = native
+**.h = native
+**.cpp = native
+**.java = native
+**.class = bin
+**.jar = bin
+**.sh = native
+**.pl = native
+**.jpg = bin
+**.gif = bin
diff -r 000000000000 -r 9cf4c84a3091 .hgignore
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgignore	Wed May 23 12:39:19 2012 -0700
@@ -0,0 +1,12 @@
+nbproject
+Makefile
+build
+dist
+src/Default
+src/.settings
+src/.cproject
+src/.project
+.dep.inc
+glob:.cproject
+glob:.project
+glob:Debug
diff -r 000000000000 -r 9cf4c84a3091 VSs__Hello_World/EntryPoint.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VSs__Hello_World/EntryPoint.c	Wed May 23 12:39:19 2012 -0700
@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ */
+
+#include <math.h>
+
+#include "SSR_Matrix_Mult.h"
+
+
+
+/*Entry point into SSR: multiplies leftMatrix by rightMatrix.
+ *
+ *Every SSR system has an "entry point" function that creates the first
+ * processor, which starts the chain of creating more processors..
+ * eventually all of the processors dissipate themselves, and the call
+ * into SSR returns.
+ *Steps, same pattern as every entry-point function:
+ *1) create the params for the seed processor from the arguments
+ *2) call SSR__create_seed_procr_and_do_work
+ *3) free the params struct and return the result
+ *Returns a malloc'd Matrix (left numRows x right numCols) that the caller
+ * owns, or NULL if an allocation fails.
+ */
+Matrix *
+multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
+ { Matrix          *resMatrix;
+   DividerParams   *dividerParams;
+   float32         *resArray;
+   int32            numResRows, numResCols;
+
+   numResRows  = leftMatrix->numRows;
+   numResCols  = rightMatrix->numCols;
+
+      //VMS has its own separate internal malloc, so to get results out,
+      // have to pass in empty array for it to fill up
+      //Size is computed in size_t: an int rows*cols product could overflow
+   dividerParams = malloc( sizeof( DividerParams ) );
+   resMatrix     = malloc( sizeof(Matrix) );
+   resArray      = malloc( (size_t)numResRows * numResCols * sizeof(float32) );
+   if( dividerParams == NULL || resMatrix == NULL || resArray == NULL )
+    { free( resArray );  free( resMatrix );  free( dividerParams );
+      return NULL;       //free(NULL) is a no-op, so no guards needed above
+    }
+   resMatrix->array   = resArray;
+   resMatrix->numCols = numResCols;
+   resMatrix->numRows = numResRows;
+
+   dividerParams->leftMatrix   = leftMatrix;
+   dividerParams->rightMatrix  = rightMatrix;
+   dividerParams->resultMatrix = resMatrix;
+
+      //create divider processor, start doing the work, and wait till done
+      //This function is the "border crossing" between normal code and SSR
+   SSR__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs,
+                                       dividerParams );
+   free( dividerParams );
+   return resMatrix;
+ }
diff -r 000000000000 -r 9cf4c84a3091 VSs__Hello_World/SeedVP.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VSs__Hello_World/SeedVP.c	Wed May 23 12:39:19 2012 -0700
@@ -0,0 +1,594 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ */
+
+
+#include <math.h>
+#include <string.h>
+#include "SSR_Matrix_Mult.h"
+
+   //The time to compute this many result values should equal the time to
+   // perform this division on a matrix of size gives that many result calcs
+   //IE, size this so that sequential time to calc equals divide time
+   // find the value by experimenting -- but divide time and calc time scale
+   // same way, so this value might remain the same across hardware
+#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000
+
+
+//===========================================================================
+int inline   //runs a 5x5x5 loop nest between SSR's start/end primitive calls
+measureMatrixMultPrimitive( SlaveVP *animPr );
+
+SlicingStrucCarrier *   //picks the target side length, slices all 3 dimensions
+calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
+                                 SlaveVP *animPr );
+
+SlicingStruc *   //cuts startVal..endVal into slices of about idealSizeOfSide
+sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
+                  SlaveVP *animPr );
+
+void   //frees the start-value array, then the struct itself
+freeSlicingStruc( SlicingStruc *slicingStruc, SlaveVP *animPr );
+
+SubMatrix **   //makes one SubMatrix per (row slice, col slice) of origMatrix
+createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
+                   int32 numUses, Matrix *origMatrix, SlaveVP *animPr );
+
+void   //frees all sub-matrices of the grid and the pointer array holding them
+freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
+                 SubMatrix **subMatrices, SlaveVP *animPr );
+
+void   //creates one calcSubMatrixProduct processor per sub-matrix pairing
+pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
+                                    SubMatrix **rightSubMatrices,
+                                    int32 numRowIdxs, int32 numColIdxs,
+                                    int32 numVecIdxs,
+                                    SlaveVP *resultPr,
+                                    SlaveVP *animatingPr );
+
+void   //builds both sets of sub-matrices, frees the slicing, then pairs up
+makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix,
+            SlicingStrucCarrier *slicingStrucCarrier,
+            SlaveVP *resultPr, SlaveVP *animatingPr );
+
+
+
+/*Divider creates one processor for every sub-matrix
+ * It hands them:
+ *  the name of the result processor that they should send their results to,
+ *  the left and right matrices, and the rows and cols they should multiply
+ * It first creates the result processor, then all the sub-matrixPair
+ *  processors,
+ *  then does a receive of a message from the result processor that gives
+ *  the divider ownership of the result matrix.
+ * Finally, the divider returns the result matrix out of the SSR system.
+ *
+ * Divider chooses the size of sub-matrices via an algorithm that tries to
+ *  keep the minimum work above a threshold.  The threshold is machine-
+ *  dependent, so ask SSR for min work-unit time to get a
+ *  given overhead
+ *
+ * Divide min work-unit cycles by measured-cycles for one matrix-cell
+ *  product -- gives the number of products need to have in min size
+ *  matrix.
+ *
+ * So then, take cubed root of this to get the size of a side of min sub-
+ *  matrix.  That is the size of the ideal square sub-matrix -- so tile
+ *  up the two input matrices into ones as close as possible to that size,
+ *  and create the pairs of sub-matrices.
+ *
+ *========================  STRATEGIC OVERVIEW  =======================
+ *
+ *This division is a bit tricky, because have to create things in advance
+ * that it's not at first obvious need to be created..
+ *
+ *First slice up each dimension -- three of them..  this is because will have
+ * to create the sub-matrix's data-structures before pairing the sub-matrices
+ * with each other -- so, have three dimensions to slice up before can
+ * create the sub-matrix data-strucs -- also, have to be certain that the
+ * cols of the left input have the exact same slicing as the rows of the
+ * right matrix, so just to be sure, do the slicing calc once, then use it
+ * for both.
+ *
+ *So, goes like this:
+ *1) calculate the start & end values of each dimension in each matrix.
+ *2) use those values to create sub-matrix structures
+ *3) combine sub-matrices into pairs, as the tasks to perform.
+ *
+ *Have to calculate separately from creating the sub-matrices because of the
+ * nature of the nesting -- would either end up creating the same sub-matrix
+ * multiple times, or else would have to put in detection of whether had
+ * made a particular one already if tried to combine steps 1 and 2.
+ *
+ *Step 3 has to be separate because of the nesting, as well -- same reason,
+ * would either create same sub-matrix multiple times, or else have to
+ * add detection of whether was already created.
+ *
+ *Another way to look at it: there's one level of loop to divide dimensions,
+ * two levels of nesting to create sub-matrices, and three levels to pair
+ * up the sub-matrices.
+ */
+/*Seed processor: multiplies the two matrices held in _dividerParams and
+ * leaves the product in dividerParams->resultMatrix->array.  Small inputs
+ * are multiplied right here; larger ones are split into sub-matrix pairs
+ * that each get their own processor.  Ends by dissipating animPr.
+ */
+void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
+                                        SlaveVP *animPr )
+ { SlaveVP       *resultPr;
+   DividerParams   *dividerParams;
+   ResultsParams   *resultsParams;
+   Matrix          *leftMatrix, *rightMatrix;
+   void            *msg;
+   SlicingStrucCarrier *slicingStrucCarrier;
+   float32         *resultArray; //points to array inside result matrix
+   
+         DEBUG__printf( dbgAppFlow, "start divide")
+
+         int32
+         divideProbe = VMS_App__create_single_interval_probe( "divideProbe",
+                                                          animPr );
+         VMS_App__record_sched_choice_into_probe( divideProbe, animPr );
+         VMS_App__record_interval_start_in_probe( divideProbe );
+
+   //=========== Setup -- make local copies of ptd-to-things, malloc, aso
+   int32 numResRows, numResCols, vectLength;
+
+   dividerParams   = (DividerParams *)_dividerParams;
+   
+   leftMatrix      = dividerParams->leftMatrix;
+   rightMatrix     = dividerParams->rightMatrix;
+
+   vectLength = leftMatrix->numCols;
+   numResRows = leftMatrix->numRows;
+   numResCols = rightMatrix->numCols;
+   resultArray     = dividerParams->resultMatrix->array;
+   
+      //zero the result array -- size in size_t, an int product could overflow
+   memset( resultArray, 0, (size_t)numResRows * numResCols * sizeof(float32) );
+
+   //==============  Do either sequential mult or do division ==============
+      //Check if input matrices too small -- if yes, just do sequential
+      //Cutoff is determined by overhead of this divider -- relatively
+      // machine-independent
+   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
+       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
+    {
+      //====== Do sequential multiply on a single core
+            DEBUG__printf( dbgAppFlow, "doing sequential")
+         //transpose the right matrix
+      float32 *
+      transRightArray  = SSR__malloc_to( rightMatrix->numRows * 
+                                         rightMatrix->numCols * sizeof(float32),
+                                         animPr );
+
+         //copy values from orig matrix to local
+      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
+                     0, 0, rightMatrix->numRows,
+                     transRightArray, rightMatrix->array );
+      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
+                            leftMatrix->array, transRightArray,
+                            resultArray );
+         //the transposed copy is private to this branch, so release it here
+      SSR__free( transRightArray, animPr );
+    }
+   else
+    {
+      //====== Do parallel multiply across cores
+         //Calc the ideal size of sub-matrix and slice up the dimensions of
+         // the two matrices.
+         //The ideal size is the one takes the number of cycles to calculate
+         // such that calc time is equal or greater than min work-unit size
+      slicingStrucCarrier =
+         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr );
+
+         //Make the results processor, now that know how many to wait for
+      resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr );
+      resultsParams->numSubMatrixPairs  =
+         slicingStrucCarrier->leftRowSlices->numVals *
+         slicingStrucCarrier->rightColSlices->numVals *
+         slicingStrucCarrier->vecSlices->numVals;
+      resultsParams->dividerPr   = animPr;
+      resultsParams->numCols     = rightMatrix->numCols;
+      resultsParams->numRows     = leftMatrix->numRows;
+      resultsParams->resultArray = resultArray;
+
+            DEBUG__printf(dbgAppFlow,"**create result Pr**")
+      resultPr =
+         SSR__create_procr_with( &gatherResults, resultsParams, animPr);
+
+         //Make the sub-matrices, and pair them up, and make processor to
+         // calc product of each pair.
+      makeSubMatricesAndProcrs( leftMatrix, rightMatrix,
+                                    slicingStrucCarrier,
+                                    resultPr, animPr);
+ 
+         //result array is allocated externally, so no message from resultPr
+         // however, do have to wait before printing out stats, so wait
+         // for an empty handshake message
+      msg = SSR__receive_from_to( resultPr, animPr );
+   }
+
+   //===============  Work done -- send results back =================
+
+         DEBUG__printf( dbgAppFlow, "end divide")
+
+         VMS_App__record_interval_end_in_probe( divideProbe );
+         VMS_App__print_stats_of_all_probes();
+
+      //nothing left to do so dissipate, SSR will wait to shutdown and hence
+      // make results available to outside until all the processors have
+      // dissipated -- so no need to wait for results processor
+   SSR__dissipate_procr( animPr );  //all procrs dissipate self at end
+      //when all of the processors have dissipated, the "create seed and do
+      // work" call in the entry point function returns
+ }
+
+
+/*Chooses the target side length of a sub-matrix, then slices up the three
+ * dimensions of the multiply to that size.
+ *
+ *Two candidate side lengths are computed and the larger one is used:
+ * 1) from overhead: cycles of the min work-unit (asked of SSR at .05)
+ *    divided by the measured cycles of one primitive, cube-rooted, times 5
+ * 2) from load balance: left rows divided by the rounded cube root of the
+ *    ideal number of work units, scaled by 0.6
+ *
+ *Returns a SlicingStrucCarrier, allocated via SSR__malloc_to, holding the
+ * slicing of the left rows, of the shared vector dimension, and of the
+ * right cols.
+ */
+SlicingStrucCarrier *
+calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
+                                 SlaveVP *animPr )
+ { SlicingStrucCarrier *carrier;
+   int     cyclesPerWorkUnit, cyclesPerPrimitive, targetNumWorkUnits;
+   float64 primitivesPerWorkUnit;
+   float32 sideFromOverhead, sideFromBalance, sideToUse;
+
+   carrier = SSR__malloc_to( sizeof(SlicingStrucCarrier), animPr );
+
+   //=======  Candidate 1: side length that amortizes overhead  ========
+
+      //SSR gives the cycles of the minimum work unit for the given
+      // overhead fraction
+   cyclesPerWorkUnit  = SSR__giveMinWorkUnitCycles( .05 );
+
+      //cycles taken by the "primitive" op of matrix mult, measured here
+   cyclesPerPrimitive = measureMatrixMultPrimitive( animPr );
+
+   primitivesPerWorkUnit =
+      (float64)cyclesPerWorkUnit / (float64)cyclesPerPrimitive;
+
+      //cube root gives the number of primitives along one side; the
+      // primitive is 5x5, hence the factor of 5
+   sideFromOverhead = 5 * cbrt( primitivesPerWorkUnit );
+
+   //=======  Candidate 2: side length that spreads the work  ========
+
+   targetNumWorkUnits = SSR__giveIdealNumWorkUnits();
+
+   sideFromBalance  = leftMatrix->numRows / rint(cbrt( targetNumWorkUnits ));
+   sideFromBalance *= 0.6; //finer granularity to help load balance
+
+   //=======  Use the larger of the two candidates  ========
+
+   sideToUse = ( sideFromOverhead > sideFromBalance ) ? sideFromOverhead
+                                                      : sideFromBalance;
+
+      //(a lower bound of ROWS_IN_BLOCK, so that the blocked inner loop of
+      // the multiply fits into L1 cache, is currently not applied)
+
+   //============  Slice up dimensions, now that know target size ===========
+
+      //The slicer is told the target size of a side (floating pt) plus the
+      // first and last index of the dimension, both inclusive
+      //All three dimensions start at 0 and end at their size minus 1
+   carrier->leftRowSlices  =
+      sliceUpDimension( sideToUse, 0, leftMatrix->numRows - 1, animPr );
+
+   carrier->vecSlices      =
+      sliceUpDimension( sideToUse, 0, leftMatrix->numCols - 1, animPr );
+
+   carrier->rightColSlices =
+      sliceUpDimension( sideToUse, 0, rightMatrix->numCols - 1, animPr );
+
+   return carrier;
+ }
+
+
+/*Builds the sub-matrix structs of both input matrices from the slicing,
+ * then has pairUpSubMatricesAndMakeProcessors create the processors.
+ *
+ *Consumes slicingStrucCarrier: the carrier and the three slicing structs
+ * inside it are all freed in here.
+ */
+void
+makeSubMatricesAndProcrs( Matrix    *leftMatrix, Matrix    *rightMatrix,
+            SlicingStrucCarrier *slicingStrucCarrier,
+            SlaveVP *resultPr,   SlaveVP *animPr )
+ { SlicingStruc *rowSlicing, *sharedSlicing, *colSlicing;
+   SubMatrix   **leftTiles, **rightTiles;
+   int32         numRowTiles, numSharedTiles, numColTiles;
+
+      //take the slicings out; after that the carrier is not needed anymore
+   rowSlicing    = slicingStrucCarrier->leftRowSlices;
+   sharedSlicing = slicingStrucCarrier->vecSlices;
+   colSlicing    = slicingStrucCarrier->rightColSlices;
+   SSR__free( slicingStrucCarrier, animPr );
+
+   numRowTiles    = rowSlicing->numVals;
+   numSharedTiles = sharedSlicing->numVals;
+   numColTiles    = colSlicing->numVals;
+
+   //================  Make sub-matrices, given the slicing  ================
+      //left matrix:  rows x shared dimension, numUses = count of col slices
+   leftTiles  = createSubMatrices( rowSlicing, sharedSlicing, numColTiles,
+                                   leftMatrix, animPr );
+      //right matrix: shared dimension x cols, numUses = count of row slices
+   rightTiles = createSubMatrices( sharedSlicing, colSlicing, numRowTiles,
+                                   rightMatrix, animPr );
+
+      //only the counts are needed from here on, so free the slicings
+   freeSlicingStruc( rowSlicing, animPr );
+   freeSlicingStruc( sharedSlicing, animPr );
+   freeSlicingStruc( colSlicing, animPr );
+
+   //==============  pair the sub-matrices and make processors ==============
+   pairUpSubMatricesAndMakeProcessors( leftTiles, rightTiles,
+                                       numRowTiles, numColTiles,
+                                       numSharedTiles, resultPr, animPr );
+ }
+
+
+
+
+/*Creates one processor per (result-row, result-col, vector) slice triple,
+ * handing each its left and right sub-matrix plus the result processor.
+ *Both arrays are row-major, as built by createSubMatrices: the left one is
+ * numRowIdxs x numVecIdxs, the right one is numVecIdxs x numColIdxs.
+ */
+void
+pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
+                                    SubMatrix **rightSubMatrices,
+                                    int32 numRowIdxs, int32 numColIdxs,
+                                    int32 numVecIdxs,
+                                    SlaveVP *resultPr,
+                                    SlaveVP *animatingPr )
+ {
+   int32 resRowIdx, resColIdx, vecIdx;
+   int32 numLeftColIdxs, numRightColIdxs;
+   int32 leftRowIdxOffset;
+   SMPairParams *subMatrixPairParams;
+   float32 numToPutOntoEachCore, leftOverFraction, numVecOnCurrCore;
+   int32 numCores, coreToAssignOnto;
+
+      //Row stride of each array == its number of columns: the left array
+      // has one column per vector slice, the right one per result column
+   numLeftColIdxs  = numVecIdxs;
+   numRightColIdxs = numColIdxs;
+   numCores = SSR__give_number_of_cores_to_schedule_onto();
+   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
+   leftOverFraction = 0;
+   numVecOnCurrCore = 0;
+   coreToAssignOnto = 0;
+
+   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
+    {
+      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
+      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
+       {
+         for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
+          {
+               //Make the processor for the pair of sub-matrices
+            subMatrixPairParams  = SSR__malloc_to( sizeof(SMPairParams),
+                                                               animatingPr);
+            subMatrixPairParams->leftSubMatrix  =
+               leftSubMatrices[ leftRowIdxOffset + vecIdx ];
+            subMatrixPairParams->rightSubMatrix =
+               rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
+            subMatrixPairParams->resultPr = resultPr;
+               //put all pairs from the same vector onto same core
+            SSR__create_procr_with_affinity( &calcSubMatrixProduct,
+                                             subMatrixPairParams,
+                                             animatingPr,
+                                             coreToAssignOnto );
+
+               //Trying to distribute the subMatrix-vectors across the cores, so
+               // that each core gets the same number of vectors, with a max
+               // imbalance of 1 vector more on some cores than others
+            numVecOnCurrCore += 1;                 //incr before checking, so
+            if( numVecOnCurrCore > numToPutOntoEachCore ) //actual num 1 less
+             {
+                  //NOTE(review): the share above is an integer division, and
+                  // leftOverFraction comes out negative here -- confirm intent
+                  //deal with fractional part, to ensure that imbalance is 1 max
+                  // IE, core with most has only 1 more than core with least
+               leftOverFraction = numToPutOntoEachCore - numVecOnCurrCore;
+               if( leftOverFraction > 1 ) ERROR("division alg messed up\n");
+               numVecOnCurrCore = leftOverFraction; //accumulates "extra"
+                  //Move to next core, max core-value to incr to is numCores -1
+               coreToAssignOnto += 1;
+               if( coreToAssignOnto >= numCores ) coreToAssignOnto = 0;
+             } //if
+          } //for( vecIdx
+       } //for( resColIdx
+    } //for( resRowIdx
+ }
+
+
+
+/*Walks the row and col slicings, making one SubMatrix struc per pairing.
+ *The strucs only record position and size within origMatrix, nothing is
+ * copied here.  Returns a row-major array of numRow x numCol pointers,
+ * where those counts are the numVals of rowSlices and colSlices.
+ */
+SubMatrix **
+createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
+                   int32 numUses, Matrix *origMatrix, SlaveVP *animPr )
+ {
+   int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
+   int32 startRow, endRow, startCol, endCol;
+   int32 *rowStartVals, *colStartVals;
+   int32 rowOffset;
+   SubMatrix **subMatrices, *newSubMatrix;
+
+   numRowIdxs = rowSlices->numVals;
+   numColIdxs = colSlices->numVals;
+   rowStartVals = rowSlices->startVals;
+   colStartVals = colSlices->startVals;
+
+   subMatrices = SSR__malloc_to(numRowIdxs * numColIdxs * sizeof(SubMatrix*),
+                                 animPr );
+
+   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
+    {
+      rowOffset = rowIdx * numColIdxs;
+      startRow  = rowStartVals[rowIdx];
+      endRow    = rowStartVals[rowIdx + 1] -1; //"fake" start above last is
+                                               // at last valid idx + 1 & is
+                                               // 1 greater than end value
+      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
+       {
+         startCol = colStartVals[colIdx];
+         endCol   = colStartVals[colIdx + 1] -1;
+         newSubMatrix = SSR__malloc_to( sizeof(SubMatrix), animPr );
+         newSubMatrix->numRows       = endRow - startRow +1;
+         newSubMatrix->numCols       = endCol - startCol +1;
+         newSubMatrix->origMatrix    = origMatrix;
+         newSubMatrix->origStartRow  = startRow;
+         newSubMatrix->origStartCol  = startCol;
+         newSubMatrix->numUsesLeft   = numUses; //can free after this many
+            //set every remaining field: freeSubMatrices reads alreadyCopied
+         newSubMatrix->alreadyCopied = 0;
+         newSubMatrix->array         = NULL;
+         newSubMatrix->copySingleton = NULL;
+         newSubMatrix->copyTransSingleton = NULL;
+         subMatrices[ rowOffset + colIdx ] = newSubMatrix;
+       }
+    }
+   return subMatrices;
+ }
+
+
+/*Frees every SubMatrix of the rowSlices x colSlices grid -- and the array
+ * of each one that is flagged alreadyCopied -- then the grid itself.
+ */
+void
+freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
+                 SubMatrix **subMatrices, SlaveVP *animPr )
+ { int32       rowsInGrid, colsInGrid, row, col;
+   SubMatrix **cursor, *curr;
+
+   rowsInGrid = rowSlices->numVals;
+   colsInGrid = colSlices->numVals;
+
+      //grid is row-major, so one running cursor visits it in index order
+   cursor = subMatrices;
+   for( row = 0; row < rowsInGrid; row++ )
+      for( col = 0; col < colsInGrid; col++ )
+       { curr = *cursor++;
+         if( curr->alreadyCopied )
+            SSR__free( curr->array, animPr );
+         SSR__free( curr, animPr );
+       }
+   SSR__free( subMatrices, animPr );
+ }
+
+
+
+/*Cuts the inclusive index range startVal..endVal into consecutive slices
+ * of about idealSizeOfSide each; a left-over of at most half the ideal
+ * size is tacked onto the last slice instead of becoming its own.
+ *Returns a SlicingStruc in which startVals[k] is the first index of slice
+ * k, numVals is the slice count and, for a non-empty range,
+ * startVals[numVals] is endVal + 1.
+ */
+SlicingStruc *
+sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
+                  SlaveVP *animPr )
+ { float32 residualAcc = 0;
+   int     numSlices, i, *startVals, sizeOfSlice, endCondition;
+   SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr);
+
+      //numSlices is rounded down, yet a left-over longer than half the
+      // ideal size becomes a slice too, so (for an ideal size >= 1) up to
+      // numSlices + 1 real starts get stored, plus the "fake" start: + 2
+   numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
+   startVals = SSR__malloc_to( (numSlices + 2) * sizeof(int32), animPr );
+
+      //A start value above endCondition leaves at most half the ideal size,
+      // so the slice just finished is extended to the end, and endVal + 1
+      // is stored as start of the imaginary slice following the last one
+   endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size
+   for( i = 0; startVal <= endVal; i++ )
+    {
+      startVals[i] = startVal;
+      residualAcc += idealSizeOfSide; //fractional part carries over
+      sizeOfSlice  = (int)residualAcc;
+      residualAcc -= (float32)sizeOfSlice;
+      startVal    += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..
+      if( startVal > endCondition )
+       { startVal = endVal + 1;  //also ends the loop
+         startVals[ i + 1 ] = startVal;
+       }
+    }
+   slicingStruc->startVals = startVals;
+   slicingStruc->numVals   = i;  //== number of slices, and == idx of the
+   return slicingStruc;          // fake start just above the last one
+ }
+
+void   //releases a SlicingStruc together with its start-value array
+freeSlicingStruc( SlicingStruc *slicingStruc, SlaveVP *animPr )
+ {
+   SSR__free( slicingStruc->startVals, animPr ); //array first: struct is needed to reach it
+   SSR__free( slicingStruc, animPr );
+ }
+
+
+inline int   //runs a 5x5x5 loop nest between SSR's start/end primitive calls
+measureMatrixMultPrimitive( SlaveVP *animPr )
+ {
+   int r, c, v, numCycles;
+   float32 *res, *left, *right;
+      //three 25-element float32 arrays from SSR__malloc_to, freed again below
+      //setup inputs: left[r][c] = r, right[r][c] = c; res is left unset
+   left  = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
+   right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
+   res   = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
+
+   for( r = 0; r < 5; r++ )
+    {
+      for( c = 0; c < 5; c++ )
+       {
+         left[  r * 5 + c ] = r;
+         right[ r * 5 + c ] = c;
+       }
+    }
+      //do primitive -- NOTE(review): the inner statement assigns ('=') and
+      // does not accumulate ('+='); presumably only the timing matters -- confirm
+   SSR__start_primitive();  //for now, just takes time stamp
+   for( r = 0; r < 5; r++ )
+    {
+      for( c = 0; c < 5; c++ )
+       {
+         for( v = 0; v < 5; v++ )
+          {
+            res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ];
+          }
+       }
+    }
+   numCycles =
+      SSR__end_primitive_and_give_cycles();
+      //the arrays were only needed for the timed loops
+   SSR__free( left, animPr );
+   SSR__free( right, animPr );
+   SSR__free( res, animPr );
+      //returned as int, whatever type SSR hands back
+   return numCycles;
+ }
+
diff -r 000000000000 -r 9cf4c84a3091 VSs__Hello_World/VSs__Hello_World.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VSs__Hello_World/VSs__Hello_World.h	Wed May 23 12:39:19 2012 -0700
@@ -0,0 +1,94 @@
+/*
+ *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ */
+
+#ifndef _SSR_MATRIX_MULT_H_
+#define _SSR_MATRIX_MULT_H_
+
+#include <stdio.h>
+
+#include "SSR_impl/SSR.h"
+#include "../Matrix_Mult.h"
+
+
+//===============================  Defines  ==============================
+#define ROWS_IN_BLOCK 32
+#define COLS_IN_BLOCK 32
+#define VEC_IN_BLOCK  32
+
+#define copyMatrixSingleton 1
+#define copyTransposeSingleton 2
+
+//==============================  Structures  ==============================
+typedef struct   //params of divideWorkIntoSubMatrixPairProcrs (the seed)
+ {
+   Matrix *leftMatrix;    //left operand of the multiply
+   Matrix *rightMatrix;   //right operand of the multiply
+   Matrix *resultMatrix;  //made by multiplyTheseMatrices; its array gets the product
+ }
+DividerParams;
+
+typedef struct   //params given to the gatherResults processor by the divider
+ {
+   SlaveVP *dividerPr;     //the divider that created the results processor
+   int numRows;            //set to numRows of the left matrix
+   int numCols;            //set to numCols of the right matrix
+   int numSubMatrixPairs;  //row slices * col slices * vector slices
+   float32 *resultArray;   //the array inside the result matrix
+ }
+ResultsParams;
+
+typedef struct   //one tile of an input matrix, filled in by createSubMatrices
+ { int32    numRows;        //rows in this tile
+   int32    numCols;        //cols in this tile
+   Matrix  *origMatrix;     //matrix the tile is a part of
+   int32    origStartRow;   //row of origMatrix at which the tile starts
+   int32    origStartCol;   //col of origMatrix at which the tile starts
+   int32    alreadyCopied;  //freeSubMatrices frees array only when this is set
+   int32    numUsesLeft; //have update via message to avoid multiple writers
+   SSRSingleton *copySingleton;      //NULL at creation
+   SSRSingleton *copyTransSingleton; //NULL at creation
+   float32 *array;  //2D, but dynamically sized, so use addr arith
+ }
+SubMatrix;
+
+typedef struct   //params handed to each calcSubMatrixProduct processor
+ { SlaveVP   *resultPr;            //the results processor made by the divider
+   SubMatrix *leftSubMatrix;       //tile taken from the left matrix
+   SubMatrix *rightSubMatrix;      //tile taken from the right matrix
+   float32   *partialResultArray;  //left unset by pairUpSubMatricesAndMakeProcessors
+ }
+SMPairParams;
+
+typedef struct   //slicing of one dimension, made by sliceUpDimension
+ { int32    numVals;    //number of slices
+   int32   *startVals;  //first index of each slice; entry [numVals] is one past the end
+ }
+SlicingStruc;
+
+typedef struct   //the three slicings made by calcIdealSizeAndSliceDimensions
+ {
+   SlicingStruc *leftRowSlices;   //rows of the left matrix
+   SlicingStruc *vecSlices;       //cols of the left matrix == rows of the right
+   SlicingStruc *rightColSlices;  //cols of the right matrix
+ }
+SlicingStrucCarrier;
+
+enum MMMsgType   //NOTE(review): presumably tags messages between processors -- confirm
+ {
+   RESULTS_MSG = 1   //no use of it is visible in the divider code
+ };
+
+//============================= Processor Functions =========================
+void divideWorkIntoSubMatrixPairProcrs( void *data, SlaveVP *animatingPr ); //seed; data is a DividerParams *
+void calcSubMatrixProduct(        void *data, SlaveVP *animatingPr ); //created with an SMPairParams *
+void gatherResults(     void *data, SlaveVP *animatingPr ); //created with a ResultsParams *
+
+
+//================================ Entry Point ==============================
+Matrix *   //allocates the result matrix and runs the multiply inside SSR
+multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
+
+
+#endif /*_SSR_MATRIX_MULT_H_*/
diff -r 000000000000 -r 9cf4c84a3091 __brch__default
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/__brch__default	Wed May 23 12:39:19 2012 -0700
@@ -0,0 +1,1 @@
+Applications normally have only the default branch -- they shouldn't be affected by any choices in VMS or language..
diff -r 000000000000 -r 9cf4c84a3091 main.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/main.c	Wed May 23 12:39:19 2012 -0700
@@ -0,0 +1,35 @@
+/*
+ *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * author seanhalle@yahoo.com
+ */
+
+#include <malloc.h>
+#include <stdlib.h>
+
+#include "Matrix_Mult.h"
+#include "SSR_Matrix_Mult/SSR_Matrix_Mult.h"
+
+/**Reads the parameter file named by argv[1], makes the two input matrices
+ * from it, multiplies them via multiplyTheseMatrices and prints the result.
+ */
+int main( int argc, char **argv )
+ { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
+   ParamBag    *paramBag;
+
+   if( argc < 2 ) //without this, argv[1] == NULL gets used as a string
+    { fprintf( stderr, "usage: <program> <parameter file>\n" );
+      exit(1);
+    }
+   DEBUG__printf2(TRUE, "arguments: %s | %s", argv[0], argv[1] );
+   paramBag = makeParamBag();
+   readParamFileIntoBag( argv[1], paramBag );
+   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
+
+   resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix );
+   printf("\nresult matrix: \n");
+   printMatrix( resultMatrix );
+   fflush(stdout); //was fflush(stdin): undefined for an input stream
+   exit(0); //cleans up
+ }