# HG changeset patch
# User Sean Halle <seanhalle@yahoo.com>
# Date 1347931192 25200
# Node ID 69928a38d5af4b121421054370a1869fe9b488c0
# Parent  f8d3f16e70c89c2e8b005de3c0feddddd405d625
updating to most recent repository structure -- not working yet

diff -r f8d3f16e70c8 -r 69928a38d5af .hgeol
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgeol	Mon Sep 17 18:19:52 2012 -0700
@@ -0,0 +1,14 @@
+
+[patterns]
+**.py = native
+**.txt = native
+**.c = native
+**.h = native
+**.cpp = native
+**.java = native
+**.class = bin
+**.jar = bin
+**.sh = native
+**.pl = native
+**.jpg = bin
+**.gif = bin
diff -r f8d3f16e70c8 -r 69928a38d5af .hgignore
--- a/.hgignore	Fri Sep 16 20:12:34 2011 +0200
+++ b/.hgignore	Mon Sep 17 18:19:52 2012 -0700
@@ -1,7 +1,10 @@
-nbproject
-histograms
-dist
-build
-Makefile
-.dep.inc
-scripts
+syntax: glob
+
+histograms
+nbproject
+build
+dist
+c-ray-mt
+*.ppm
+*.o
+*~
diff -r f8d3f16e70c8 -r 69928a38d5af .hgtags
--- a/.hgtags	Fri Sep 16 20:12:34 2011 +0200
+++ b/.hgtags	Mon Sep 17 18:19:52 2012 -0700
@@ -1,7 +1,1 @@
-3e26de9d3e1e66f0556b2664e71740e6ed009eee V0
-3e26de9d3e1e66f0556b2664e71740e6ed009eee V0
-0000000000000000000000000000000000000000 V0
-0000000000000000000000000000000000000000 V0
-b6e07082767ab984194db038102e2edd7d1917fd V0
-b6e07082767ab984194db038102e2edd7d1917fd V0
-0000000000000000000000000000000000000000 V0
+bef7a9083bd4f8b1fa4e171a36383e78a5f475d5 version_that_goes_into_paper
diff -r f8d3f16e70c8 -r 69928a38d5af Matrix_Mult.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Matrix_Mult.c	Mon Sep 17 18:19:52 2012 -0700
@@ -0,0 +1,167 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ * Created on November 15, 2009, 2:35 AM
+ */
+
+#include <malloc.h>
+#include <stdlib.h>
+
+#include "Matrix_Mult.h"
+#include "C_Libraries/ParamHelper/Param.h"
+
+
+ 
+ void
+initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
+                               ParamBag *paramBag )
+ { char *leftMatrixFileName, *rightMatrixFileName;
+   int   leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols;
+   
+      ParamStruc *param;
+      param = getParamFromBag( "leftMatrixRows", paramBag );
+   leftMatrixRows = param->intValue;
+      param = getParamFromBag( "leftMatrixCols", paramBag );
+   leftMatrixCols = param->intValue;
+   *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols );
+   
+      param = getParamFromBag( "leftMatrixFileName", paramBag );
+   leftMatrixFileName = param->strValue;  //no need to copy
+   read_Matrix_From_File( *leftMatrix,  leftMatrixFileName );
+   
+      param = getParamFromBag( "rightMatrixRows", paramBag );
+   rightMatrixRows = param->intValue;
+      param = getParamFromBag( "rightMatrixCols", paramBag );
+   rightMatrixCols = param->intValue;
+   *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols );
+   
+      param = getParamFromBag( "rightMatrixFileName", paramBag );
+   rightMatrixFileName = param->strValue;
+   read_Matrix_From_File( *rightMatrix, rightMatrixFileName );
+ }
+
+
+void parseLineIntoRow( char *line, float32* row );
+
+
+ void
+read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName )
+ { int    row, maxRead, numRows, numCols;
+   float32 *matrixStart;
+   size_t lineSz = 0;
+   FILE  *file;
+   char  *line = NULL;
+   
+   lineSz = 50000; //max length of line in a matrix data file
+   line = (char *) malloc( lineSz );
+   if( line == NULL ) printf( "no mem for matrix line" );
+   
+   numRows = matrixStruc->numRows;
+   numCols = matrixStruc->numCols;
+   matrixStart = matrixStruc->array;
+
+   file = fopen( matrixFileName, "r" );
+   if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);}
+   fseek( file, 0, SEEK_SET );
+   for( row = 0; row < numRows; row++ )
+    {
+      if( feof( file ) )  printf( "file ran out too soon" );
+      maxRead = getline( &line, &lineSz, file );
+      if( maxRead == -1 ) printf( "prob reading mat line");
+      
+      if( *line == '\n') continue; //blank line
+      if( *line == '/' ) continue; //comment line
+      
+      parseLineIntoRow( line, matrixStart + row * numCols );
+    }
+   free( line );
+ }
+
+/*This function relies on each line having the proper number of cols.  It
+ * doesn't check, nor enforce, so if the file is improperly formatted it
+ * can write over unrelated memory
+ */
+ void
+parseLineIntoRow( char *line, float32* row )
+ {
+   char *valueStr, *searchPos;
+   
+      //read the float values
+   searchPos = valueStr = line; //start
+   
+   for( ; *searchPos != 0; searchPos++)  //bit dangerous, should use buff len
+    {
+      if( *searchPos == '\n' ) //last col..  relying on well-formatted file
+       { *searchPos = 0;
+         *row = atof( valueStr );
+         break;                                    //end FOR loop
+       }
+      if( *searchPos == ',' )
+       { *searchPos = 0;                           //mark end of string
+         *row = (float32) atof( valueStr );
+         row += 1;                                 //address arith
+            //skip any spaces before digits.. use searchPos + 1 to skip the 0
+         for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++);
+         valueStr = searchPos + 1;
+       }
+    }
+ }
+
+ //==========================================================================
+
+/*In the "_Flat" version of constructor, do only malloc of the top data struc
+ * and set values in that top-level.  Don't malloc any sub-structures.
+ */
+ Matrix *
+makeMatrix_Flat( int32 numRows, int32 numCols )
+ { Matrix * retMatrix;
+   retMatrix = malloc( sizeof( Matrix ) );
+   retMatrix->numRows = numRows;
+   retMatrix->numCols = numCols;
+
+   return retMatrix;
+ }
+
+ Matrix *
+makeMatrix_WithResMat( int32 numRows, int32 numCols )
+ { Matrix * retMatrix;
+   retMatrix = malloc( sizeof( Matrix ) );
+   retMatrix->numRows = numRows;
+   retMatrix->numCols = numCols;
+   retMatrix->array  = malloc( numRows * numCols * sizeof(float32) );
+
+   return retMatrix;
+ }
+
+ void
+freeMatrix_Flat( Matrix * matrix )
+ { //( matrix );
+ }
+ void
+freeMatrix( Matrix * matrix )
+ { free( matrix->array );
+   free( matrix );
+ }
+
+void
+printMatrix( Matrix *matrix )
+ { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr;
+   float32 *matrixArray;
+
+   numRows = rowsToPrint = matrix->numRows;
+   numCols = colsToPrint = matrix->numCols;
+   matrixArray = matrix->array;
+
+   rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed
+   colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed
+   for( r = 0; r < numRows; r += rowIncr )
+    { for( c = 0; c < numCols; c += colIncr )
+       { printf( "%3.1f | ", matrixArray[ r * numCols + c ] );
+       }
+      printf("\n");
+    }
+ }
+
diff -r f8d3f16e70c8 -r 69928a38d5af Matrix_Mult.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Matrix_Mult.h	Mon Sep 17 18:19:52 2012 -0700
@@ -0,0 +1,77 @@
+/*
+ *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ */
+
+#ifndef MATRIX_MULT_H_
+#define MATRIX_MULT_H_
+
+#include <stdio.h>
+#include <unistd.h>
+#include <malloc.h>
+
+#include "VMS_impl/VMS_primitive_data_types.h"
+#include "C_Libraries/ParamHelper/Param.h"
+
+//==============================  Structures  ==============================
+
+typedef
+struct
+ { int32 numRows;
+   int32 numCols;
+   float32 *array;  //2D, but dynamically sized, so use addr arith
+ }
+Matrix;
+
+/* This is the "appSpecificPiece" that is carried inside a DKUPiece.
+ *  In the DKUPiece data struc it is declared to be of type "void *".  This
+ *  allows the application to define any data structure it wants and put it
+ *  into a DKUPiece.
+ * When the app specific info is used, it is in app code, so it is cast to
+ *  the correct type to tell the compiler how to access fields.
+ * This keeps all app-specific things out of the DKU directory, as per the
+ *  DKU standard. */
+typedef
+struct
+ { 
+      // pointers to shared data..  the result matrix must be created when the
+      //  left and right matrices are put into the root ancestor DKUPiece.
+   Matrix * leftMatrix;
+   Matrix * rightMatrix;
+   Matrix * resultMatrix;
+
+      // define the starting and ending boundaries for this piece of the
+      //  result matrix.  These are derivable from the left and right
+      //  matrices, but included them for readability of code.
+   int prodStartRow, prodEndRow;
+   int prodStartCol, prodEndCol;
+      // Start and end of the portion of the left matrix that contributes to
+      //  this piece of the product
+   int leftStartRow, leftEndRow;
+   int leftStartCol, leftEndCol;
+      // Start and end of the portion of the right matrix that contributes to
+      //  this piece of the product
+   int rightStartRow, rightEndRow;
+   int rightStartCol, rightEndCol;
+ }
+MatrixProdPiece;
+
+//==============================  Functions  ================================
+void readFile();
+
+Matrix *makeMatrix( int32 numRows, int32 numCols );
+Matrix *makeMatrix_Flat( int32 numRows, int32 numCols );
+Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols );
+void    freeMatrix_Flat( Matrix * matrix );
+void    freeMatrix( Matrix * matrix );
+void    printMatrix( Matrix *matrix );
+
+void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName );
+
+void
+initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
+                              ParamBag *paramBag );
+
+//===========================================================================
+
+#endif /*MATRIX_MULT_H_*/
diff -r f8d3f16e70c8 -r 69928a38d5af Vthread__Matrix_Mult/Divide_Pr.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Vthread__Matrix_Mult/Divide_Pr.c	Mon Sep 17 18:19:52 2012 -0700
@@ -0,0 +1,646 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ */
+
+
+#include "Vthread__Matrix_Mult.h"
+#include <math.h>
+#include <string.h>
+#include "Vthread_impl/Vthread.h"
+
+   //The time to compute this many result values should equal the time to
+   // perform this division on a matrix of size gives that many result calcs
+   //IE, size this so that sequential time to calc equals divide time
+   // find the value by experimenting -- but divide time and calc time scale
+   // same way, so this value should remain valid across hardware
+#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000
+
+
+//===========================================================================
+int inline
+measureMatrixMultPrimitive( SlaveVP *animPr );
+
+SlicingStrucCarrier *
+calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
+                                 SlaveVP *animPr );
+
+SlicingStruc *
+sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
+                  SlaveVP *animPr );
+
+void
+freeSlicingStruc( SlicingStruc *slicingStruc, SlaveVP *animPr );
+
+SubMatrix **
+createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
+                   int32 numUses, Matrix *origMatrix, SlaveVP *animPr );
+
+void
+freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
+                 SubMatrix **subMatrices, SlaveVP *animPr );
+
+void
+pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
+                                    SubMatrix **rightSubMatrices,
+                                    int32 numRowIdxs, int32 numColIdxs,
+                                    int32 numVecIdxs,
+                                    SlaveVP *resultPr,
+                                    SlaveVP *animatingPr );
+
+void
+makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix,
+            SlicingStrucCarrier *slicingStrucCarrier,
+            SlaveVP *resultPr, SlaveVP *animatingPr );
+
+void inline
+copyTranspose( int32 numRows, int32 numCols,
+               int32 origStartRow, int32 origStartCol, int32 origStride,
+               float32 *subArray, float32 *origArray );
+
+void inline
+multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
+                                int32 numResCols,
+                                float32 *leftArray, float32 *rightArray,
+                                float32 *resArray );
+
+
+
+/*Divider creates one processor for every sub-matrix
+ * It hands them:
+ *  the name of the result processor that they should send their results to,
+ *  the left and right matrices, and the rows and cols they should multiply
+ * It first creates the result processor, then all the sub-matrixPair
+ *  processors,
+ *  then does a receive of a message from the result processor that gives
+ *  the divider ownership of the result matrix.
+ * Finally, the divider returns the result matrix out of the Vthread system.
+ *
+ * Divider chooses the size of sub-matrices via an algorithm that tries to
+ *  keep the minimum work above a threshold.  The threshold is machine-
+ *  dependent, so ask Vthread for min work-unit time to get a
+ *  given overhead
+ *
+ * Divide min work-unit cycles by measured-cycles for one matrix-cell
+ *  product -- gives the number of products need to have in min size
+ *  matrix.
+ *
+ * So then, take cubed root of this to get the size of a side of min sub-
+ *  matrix.  That is the size of the ideal square sub-matrix -- so tile
+ *  up the two input matrices into ones as close as possible to that size,
+ *  and create the pairs of sub-matrices.
+ *
+ *========================  STRATEGIC OVERVIEW  =======================
+ *
+ *This division is a bit tricky, because have to create things in advance
+ * that it's not at first obvious need to be created..
+ *
+ *First slice up each dimension -- three of them..  this is because will have
+ * to create the sub-matrix's data-structures before pairing the sub-matrices
+ * with each other -- so, have three dimensions to slice up before can
+ * create the sub-matrix data-strucs -- also, have to be certain that the
+ * cols of the left input have the exact same slicing as the rows of the
+ * left matrix, so just to be sure, do the slicing calc once, then use it
+ * for both.
+ *
+ *So, goes like this:
+ *1) calculate the start & end values of each dimension in each matrix.
+ *2) use those values to create sub-matrix structures
+ *3) combine sub-matrices into pairs, as the tasks to perform.
+ *
+ *Have to calculate separately from creating the sub-matrices because of the
+ * nature of the nesting -- would either end up creating the same sub-matrix
+ * multiple times, or else would have to put in detection of whether had
+ * made a particular one already if tried to combine steps 1 and 2.
+ *
+ *Step 3 has to be separate because of the nesting, as well -- same reason,
+ * would either create same sub-matrix multiple times, or else have to
+ * add detection of whether was already created.
+ *
+ *Another way to look at it: there's one level of loop to divide dimensions,
+ * two levels of nesting to create sub-matrices, and three levels to pair
+ * up the sub-matrices.
+ */
+
+void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
+                                        SlaveVP *animatingThd )
+ { SlaveVP         *resultPr;
+   DividerParams   *dividerParams;
+   ResultsParams   *resultsParams;
+   Matrix          *leftMatrix, *rightMatrix;
+   SlicingStrucCarrier *slicingStrucCarrier;
+   float32             *resultArray; //points to array inside result matrix
+   MatrixMultGlobals   *globals;
+  
+         DEBUG( dbgAppFlow, "start divide\n")
+
+         int32
+         divideProbe = VMS_App__create_single_interval_probe( "divideProbe",
+                                                          animatingThd );
+         VMS_App__record_sched_choice_into_probe( divideProbe, animatingThd );
+         VMS_App__record_interval_start_in_probe( divideProbe );
+
+   //=========== Setup -- make local copies of ptd-to-things, malloc, aso
+   int32 numResRows, numResCols, vectLength;
+
+   dividerParams   = (DividerParams *)_dividerParams;
+   
+   leftMatrix      = dividerParams->leftMatrix;
+   rightMatrix     = dividerParams->rightMatrix;
+
+   vectLength = leftMatrix->numCols;
+   numResRows = leftMatrix->numRows;
+   numResCols = rightMatrix->numCols;
+   resultArray     = dividerParams->resultMatrix->array;
+   
+      //zero the result array
+   memset( resultArray, 0, numResRows * numResCols * sizeof(float32) );
+
+   //==============  Do either sequential mult or do division ==============
+
+      //Check if input matrices too small -- if yes, just do sequential
+      //Cutoff is determined by overhead of this divider -- relatively
+      // machine-independent
+   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
+       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
+    {
+      //====== Do sequential multiply on a single core
+            DEBUG( dbgAppFlow, "doing sequential")
+            
+         //transpose the right matrix
+      float32 *
+      transRightArray  = 
+         Vthread__malloc( (size_t)rightMatrix->numRows * 
+                                (size_t)rightMatrix->numCols *
+                                sizeof(float32), animatingThd );
+
+         //copy values from orig matrix to local
+      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
+                     0, 0, rightMatrix->numRows,
+                     transRightArray, rightMatrix->array );
+      
+      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
+                            leftMatrix->array, transRightArray,
+                            resultArray );
+    }
+   else
+    {
+      //====== Do parallel multiply across cores
+
+            DEBUG( dbgAppFlow, "divider: do parallel mult\n")
+         //Calc the ideal size of sub-matrix and slice up the dimensions of
+         // the two matrices.
+         //The ideal size is the one takes the number of cycles to calculate
+         // such that calc time is equal or greater than min work-unit size
+      slicingStrucCarrier =
+         calcIdealSizeAndSliceDimensions(leftMatrix,rightMatrix,animatingThd);
+
+         //Make the results processor, now that know how many to wait for
+      resultsParams = Vthread__malloc( sizeof(ResultsParams), animatingThd );
+      resultsParams->numSubMatrixPairs  =
+         slicingStrucCarrier->leftRowSlices->numVals *
+         slicingStrucCarrier->rightColSlices->numVals *
+         slicingStrucCarrier->vecSlices->numVals;
+      resultsParams->dividerPr   = animatingThd;
+      resultsParams->numCols     = rightMatrix->numCols;
+      resultsParams->numRows     = leftMatrix->numRows;
+      resultsParams->resultArray = resultArray;
+
+      //==========  Set up global vars, including conds and mutexes ==========
+      globals = VMS_App__malloc( sizeof(MatrixMultGlobals) );
+      Vthread__set_globals_to( globals );
+
+      globals->results_mutex = Vthread__make_mutex( animatingThd );
+      globals->results_cond  = Vthread__make_cond( globals->results_mutex,
+                                                               animatingThd );
+
+      globals->vector_mutex = Vthread__make_mutex( animatingThd );
+      globals->vector_cond  = Vthread__make_cond( globals->vector_mutex,
+                                                               animatingThd );
+
+      globals->start_mutex = Vthread__make_mutex( animatingThd );
+      globals->start_cond  = Vthread__make_cond( globals->start_mutex,
+                                                               animatingThd );
+      //======================================================================
+
+            DEBUG( dbgAppFlow, "divider: made mutexes and conds\n")
+         //get results-comm lock before create results-thd, to ensure it can't
+         // signal that results are available before this thd is waiting on cond
+      Vthread__mutex_lock( globals->results_mutex, animatingThd );
+
+         //also get the start lock & use to ensure no vector threads send a
+         // signal before the results thread is waiting on vector cond
+      Vthread__mutex_lock( globals->start_mutex, animatingThd );
+
+
+            DEBUG( dbgAppFlow, "divider: make result thread\n")
+      Vthread__create_thread( &gatherResults, resultsParams, animatingThd );
+
+         //Now wait for results thd to signal that it has vector lock
+      Vthread__cond_wait(  globals->start_cond,  animatingThd );
+      Vthread__mutex_unlock( globals->start_mutex, animatingThd );//done w/lock
+            DEBUG( dbgAppFlow, "divider: make sub-matrices\n")
+   
+         //Make the sub-matrices, and pair them up, and make processor to
+         // calc product of each pair.
+      makeSubMatricesAndProcrs( leftMatrix, rightMatrix,
+                                    slicingStrucCarrier,
+                                    resultPr, animatingThd);
+ 
+         //Wait for results thread to say results are good
+      Vthread__cond_wait(  globals->results_cond,  animatingThd );
+    }
+
+
+   //===============  Work done -- send results back =================
+
+
+         DEBUG( dbgAppFlow, "end divide\n")
+
+         VMS_App__record_interval_end_in_probe( divideProbe );
+         VMS_App__print_stats_of_all_probes();
+
+      //nothing left to do so dissipate, Vthread will wait to shutdown,
+      // making results available to outside, until all the processors have
+      // dissipated -- so actually no need to wait for results processor
+      //However, following the pattern, so done with comm, release lock
+   Vthread__mutex_unlock( globals->results_mutex, animatingThd );
+
+   Vthread__dissipate_thread( animatingThd );  //all procrs dissipate self at end
+      //when all of the processors have dissipated, the "create seed and do
+      // work" call in the entry point function returns
+ }
+
+
+SlicingStrucCarrier *
+calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
+                                 SlaveVP *animPr )
+ {
+   float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
+   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
+   SlicingStrucCarrier *slicingStrucCarrier =
+                       Vthread__malloc(sizeof(SlicingStrucCarrier), animPr);
+
+   int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
+   float64 numPrimitiveOpsInMinWorkUnit;
+
+
+   //=======  Calc ideal size of min-sized sub-matrix  ========
+
+      //ask Vthread for the number of cycles of the minimum work unit, at given
+      // percent overhead then add a guess at overhead from this divider
+   minWorkUnitCycles = Vthread__giveMinWorkUnitCycles( .05 );
+
+      //ask Vthread for number of cycles of the "primitive" op of matrix mult
+   primitiveCycles = measureMatrixMultPrimitive( animPr );
+
+   numPrimitiveOpsInMinWorkUnit =
+      (float64)minWorkUnitCycles / (float64)primitiveCycles;
+
+      //take cubed root -- that's number of these in a "side" of sub-matrix
+      // then multiply by 5 because the primitive is 5x5src/Application/Vthread__Matrix_Mult/Divide_Pr.c:403: warning: implicit declaration of function ‘Vthread__give_number_of_cores_to_schedule_onto’
+   idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit );
+
+   idealNumWorkUnits = Vthread__giveIdealNumWorkUnits();
+   
+   idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
+   idealSizeOfSide2 *= 0.6; //finer granularity to help load balance
+
+   if( idealSizeOfSide1 > idealSizeOfSide2 )
+      idealSizeOfSide = idealSizeOfSide1;
+   else
+      idealSizeOfSide = idealSizeOfSide2;
+
+      //The multiply inner loop blocks the array to fit into L1 cache
+//   if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK;
+
+   //============  Slice up dimensions, now that know target size ===========
+
+      //Tell the slicer the target size of a side (floating pt), the start
+      // value to start slicing at, and the end value to stop slicing at
+      //It returns an array of start value of each chunk, plus number of them
+   int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol;
+   startLeftRow  = 0;
+   endLeftRow    = leftMatrix->numRows -1;
+   startVec      = 0;
+   endVec        = leftMatrix->numCols -1;
+   startRightCol = 0;
+   endRightCol   = rightMatrix->numCols -1;
+
+   leftRowSlices =
+      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow, animPr );
+
+   vecSlices =
+      sliceUpDimension( idealSizeOfSide,  startVec, endVec, animPr );
+
+   rightColSlices =
+      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol,animPr);
+
+   slicingStrucCarrier->leftRowSlices  = leftRowSlices;
+   slicingStrucCarrier->vecSlices      = vecSlices;
+   slicingStrucCarrier->rightColSlices = rightColSlices;
+
+   return slicingStrucCarrier;
+ }
+
+
+void
+makeSubMatricesAndProcrs( Matrix    *leftMatrix, Matrix    *rightMatrix,
+            SlicingStrucCarrier *slicingStrucCarrier,
+            SlaveVP *resultPr,   SlaveVP *animPr )
+ {
+   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
+   
+   leftRowSlices  = slicingStrucCarrier->leftRowSlices;
+   vecSlices      = slicingStrucCarrier->vecSlices;
+   rightColSlices = slicingStrucCarrier->rightColSlices;
+   Vthread__free( slicingStrucCarrier, animPr );
+   
+   //================  Make sub-matrices, given the slicing  ================
+   SubMatrix **leftSubMatrices, **rightSubMatrices;
+   leftSubMatrices =
+      createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals,
+                         leftMatrix, animPr );
+   //double_check_that_always_numRows_in_right_same_as_numCols_in_left();
+   rightSubMatrices =
+      createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals,
+                         rightMatrix, animPr );
+
+
+   //==============  pair the sub-matrices and make processors ==============
+   int32 numRowIdxs, numColIdxs, numVecIdxs;
+
+   numRowIdxs = leftRowSlices->numVals;
+   numColIdxs = rightColSlices->numVals;
+   numVecIdxs = vecSlices->numVals;
+   
+   freeSlicingStruc( leftRowSlices, animPr );
+   freeSlicingStruc( vecSlices, animPr );
+   freeSlicingStruc( rightColSlices, animPr );
+   
+   pairUpSubMatricesAndMakeProcessors( leftSubMatrices,
+                                       rightSubMatrices,
+                                       numRowIdxs, numColIdxs,
+                                       numVecIdxs,
+                                       resultPr,
+                                       animPr );
+ }
+
+
+
+
+void
+pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
+                                    SubMatrix **rightSubMatrices,
+                                    int32 numRowIdxs, int32 numColIdxs,
+                                    int32 numVecIdxs,
+                                    SlaveVP *resultPr,
+                                    SlaveVP *animatingPr )
+ {
+   int32 resRowIdx, resColIdx, vecIdx;
+   int32 numLeftColIdxs, numRightColIdxs;
+   int32 leftRowIdxOffset;
+   SMPairParams *subMatrixPairParams;
+   float32 numToPutOntoEachCore, leftOverFraction;
+   int32 numCores, coreToScheduleOnto, numVecOnCurrCore;
+
+   numLeftColIdxs  = numColIdxs;
+   numRightColIdxs = numVecIdxs;
+
+   numCores = Vthread__give_number_of_cores_to_schedule_onto();
+
+   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
+   leftOverFraction = 0;
+   numVecOnCurrCore = 0;
+   coreToScheduleOnto = 0;
+
+   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
+    {
+      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
+
+      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
+       {
+         
+         for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
+          {
+               //Make the processor for the pair of sub-matrices
+            subMatrixPairParams  = Vthread__malloc( sizeof(SMPairParams),
+                                                               animatingPr);
+            subMatrixPairParams->leftSubMatrix  =
+               leftSubMatrices[ leftRowIdxOffset + vecIdx ];
+
+            subMatrixPairParams->rightSubMatrix =
+               rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
+
+            //subMatrixPairParams->resultPr = resultPr;
+
+               //put all pairs from the same vector onto same core
+            Vthread__create_thread_with_affinity( &calcSubMatrixProduct,
+                                                   subMatrixPairParams,
+                                                   animatingPr,
+                                                   coreToScheduleOnto );
+          }
+
+            //Trying to distribute the subMatrix-vectors across the cores, so
+            // that each core gets the same number of vectors, with a max
+            // imbalance of 1 vector more on some cores than others
+         numVecOnCurrCore += 1;
+         if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 )
+          {
+               //deal with fractional part, to ensure that imbalance is 1 max
+               // IE, core with most has only 1 more than core with least
+            leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore;
+            if( leftOverFraction >= 1 )
+             { leftOverFraction -= 1;
+               numVecOnCurrCore = -1;
+             }
+            else
+             { numVecOnCurrCore = 0;
+             }
+               //Move to next core, max core-value to incr to is numCores -1
+            if( coreToScheduleOnto >= numCores -1 )
+             { coreToScheduleOnto = 0;
+             }
+            else
+             { coreToScheduleOnto += 1;
+             }
+          }
+
+       }
+    }
+ }
+
+
+
+/*Walk through the two slice-strucs, making sub-matrix strucs as go
+ */
+SubMatrix **
+createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
+                   int32 numUses, Matrix *origMatrix, SlaveVP *animPr )
+ {
+   int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
+   int32 startRow, endRow, startCol, endCol;
+   int32 *rowStartVals, *colStartVals;
+   int32 rowOffset;
+   SubMatrix **subMatrices, *newSubMatrix;
+
+   numRowIdxs = rowSlices->numVals;
+   numColIdxs = colSlices->numVals;
+
+   rowStartVals = rowSlices->startVals;
+   colStartVals = colSlices->startVals;
+
+   subMatrices = Vthread__malloc((size_t)numRowIdxs *
+                            (size_t)numColIdxs * sizeof(SubMatrix*),animPr );
+
+   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
+    {
+      rowOffset = rowIdx * numColIdxs;
+      
+      startRow  = rowStartVals[rowIdx];
+      endRow    = rowStartVals[rowIdx + 1] -1; //"fake" start above last is
+                                               // at last valid idx + 1 & is
+                                               // 1 greater than end value
+      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
+       {
+         startCol = colStartVals[colIdx];
+         endCol   = colStartVals[colIdx + 1] -1;
+
+         newSubMatrix = Vthread__malloc( sizeof(SubMatrix), animPr );
+         newSubMatrix->numRows       = endRow - startRow +1;
+         newSubMatrix->numCols       = endCol - startCol +1;
+         newSubMatrix->origMatrix    = origMatrix;
+         newSubMatrix->origStartRow  = startRow;
+         newSubMatrix->origStartCol  = startCol;
+         newSubMatrix->alreadyCopied = FALSE;
+         newSubMatrix->numUsesLeft   = numUses; //can free after this many
+         //Prevent uninitialized memory
+         newSubMatrix->copySingleton = NULL;
+         newSubMatrix->copyTransSingleton = NULL;
+
+         subMatrices[ rowOffset + colIdx ] = newSubMatrix;
+       }
+    }
+   return subMatrices;
+ }
+
+
+void
+freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
+                 SubMatrix **subMatrices, SlaveVP *animPr )
+ {
+   int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset;
+   SubMatrix *subMatrix;
+
+   numRowIdxs = rowSlices->numVals;
+   numColIdxs = colSlices->numVals;
+
+   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
+    {
+      rowOffset = rowIdx * numColIdxs;
+      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
+       {
+         subMatrix = subMatrices[ rowOffset + colIdx ];
+         if( subMatrix->alreadyCopied )
+            Vthread__free( subMatrix->array, animPr );
+         Vthread__free( subMatrix, animPr );
+       }
+    }
+   Vthread__free( subMatrices, animPr );
+ }
+
+
+
+SlicingStruc *
+sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
+                  SlaveVP *animPr )
+ { float32 residualAcc = 0;
+   int     numSlices, i, *startVals, sizeOfSlice, endCondition;
+   SlicingStruc *slicingStruc = Vthread__malloc(sizeof(SlicingStruc), animPr);
+
+      //calc size of matrix need to hold start vals --
+   numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
+
+   startVals = Vthread__malloc( ((size_t)numSlices + 1) * sizeof(int32), animPr );
+
+      //Calc the upper limit of start value -- when get above this, end loop
+      // by saving highest value of the matrix dimension to access, plus 1
+      // as the start point of the imaginary slice following the last one
+      //Plus 1 because go up to value but not include when process last slice
+      //The stopping condition is half-a-size less than highest value because
+      // don't want any pieces smaller than half the ideal size -- just tack
+      // little ones onto end of last one
+   endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size
+   for( i = 0; startVal <= endVal; i++ )
+    {
+      startVals[i] = startVal;
+      residualAcc += idealSizeOfSide;
+      sizeOfSlice  = (int)residualAcc;
+      residualAcc -= (float32)sizeOfSlice;
+      startVal    += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..
+
+      if( startVal > endCondition )
+       { startVal = endVal + 1;
+         startVals[ i + 1 ] = startVal;
+       }
+    }
+
+   slicingStruc->startVals = startVals;
+   slicingStruc->numVals   = i;  //loop incr'd, so == last valid start idx+1
+                                 // which means is num sub-matrices in dim
+                                 // also == idx of the fake start just above
+   return slicingStruc;
+ }
+
+void
+freeSlicingStruc( SlicingStruc *slicingStruc, SlaveVP *animPr )
+ {
+   Vthread__free( slicingStruc->startVals, animPr );
+   Vthread__free( slicingStruc, animPr );
+ }
+
+
+int inline
+measureMatrixMultPrimitive( SlaveVP *animPr )
+ {
+   int r, c, v, numCycles;
+   float32 *res, *left, *right;
+
+      //setup inputs
+   left  = Vthread__malloc( 5 * 5 * sizeof( float32 ), animPr );
+   right = Vthread__malloc( 5 * 5 * sizeof( float32 ), animPr );
+   res   = Vthread__malloc( 5 * 5 * sizeof( float32 ), animPr );
+
+   for( r = 0; r < 5; r++ )
+    {
+      for( c = 0; c < 5; c++ )
+       {
+         left[  r * 5 + c ] = r;
+         right[ r * 5 + c ] = c;
+       }
+    }
+
+      //do primitive
+   Vthread__start_primitive();  //for now, just takes time stamp
+   for( r = 0; r < 5; r++ )
+    {
+      for( c = 0; c < 5; c++ )
+       {
+         for( v = 0; v < 5; v++ )
+          {
+            res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ];
+          }
+       }
+    }
+   numCycles =
+      Vthread__end_primitive_and_give_cycles();
+
+   Vthread__free( left, animPr );
+   Vthread__free( right, animPr );
+   Vthread__free( res, animPr );
+
+   return numCycles;
+ }
diff -r f8d3f16e70c8 -r 69928a38d5af Vthread__Matrix_Mult/EntryPoint.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Vthread__Matrix_Mult/EntryPoint.c	Mon Sep 17 18:19:52 2012 -0700
@@ -0,0 +1,120 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ */
+
+#include <math.h>
+
+#include "Vthread__Matrix_Mult.h"
+
+
+
+/*Every program for a VMS-based language has an "entry point" function that
+ * creates the first
+ * processor, which starts the chain of creating more processors..
+ * eventually all of the processors will dissipate themselves, and
+ * return.
+ *
+ *This entry-point function follows the same pattern as all entry-point
+ * functions do:
+ *1) it creates the params for the seed processor, from the
+ *    parameters passed into the entry-point function
+ *2) it calls create_seed_slaveVP_and_do_work
+ *3) it gets the return value from the params struc, frees the params struc,
+ *    and returns the value from the function
+ *
+ */
+Matrix *
+multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
+ { Matrix          *resMatrix;
+   DividerParams   *dividerParams;
+   int32            numResRows, numResCols;
+
+
+   dividerParams              = malloc( sizeof( DividerParams ) );
+   dividerParams->leftMatrix  = leftMatrix;
+   dividerParams->rightMatrix = rightMatrix;
+
+
+   numResRows  = leftMatrix->numRows;
+   numResCols  = rightMatrix->numCols;
+
+      //VMS has its own separate internal malloc, so to get results out,
+      // have to pass in empty array for it to fill up
+      //The alternative is internally telling SSR make external space to use
+   resMatrix            = malloc( sizeof(Matrix) );
+   resMatrix->array     = malloc( numResRows * numResCols * sizeof(float32));
+   resMatrix->numCols   = rightMatrix->numCols;
+   resMatrix->numRows   = leftMatrix->numRows;
+
+
+   dividerParams->resultMatrix   = resMatrix;
+
+      //create divider processor, start doing the work, and wait till done
+      //This function is the "border crossing" between normal code and SSR
+   Vthread__create_seed_slaveVP_and_do_work(&divideWorkIntoSubMatrixPairProcrs,
+                                             dividerParams );
+   
+   free( dividerParams );
+   return resMatrix;
+ }
+
+   
+Matrix *
+multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
+ { Matrix          *resMatrix;
+   DividerParams   *dividerParams;
+   int32            numResRows, numResCols;
+
+
+   dividerParams              = malloc( sizeof( DividerParams ) );
+   dividerParams->leftMatrix  = leftMatrix;
+   dividerParams->rightMatrix = rightMatrix;
+
+
+   numResRows  = leftMatrix->numRows;
+   numResCols  = rightMatrix->numCols;
+
+      //VMS has its own separate internal malloc, so to get results out,
+      // have to pass in empty array for it to fill up
+      //The alternative is internally telling SSR make external space to use
+   resMatrix            = malloc( sizeof(Matrix) );
+   resMatrix->array     = malloc( numResRows * numResCols * sizeof(float32));
+   resMatrix->numCols   = rightMatrix->numCols;
+   resMatrix->numRows   = leftMatrix->numRows;
+
+
+   dividerParams->resultMatrix   = resMatrix;
+
+   //Vthread__start_lang_running();
+   //VMS__start_lang_running( Vthread );
+   
+/*
+   matrixMultProcessID = 
+    Vthread__spawn_program_on_data(&divideWorkIntoSubMatrixPairProcrs,
+                                                                 dividerParams);
+*/
+   VMS__start_VMS_running( );
+   
+   VMSProcessID matrixMultProcessID;
+   
+   matrixMultProcessID =
+    VMS__spawn_program_on_data_in_Lang( &prog_seed_fn, data, Vthread_lang );
+   
+   resMatrix = VMS__give_results_when_done_for( matrixMultProcessID );
+   
+//    Vthread__give_results_when_done_for( matrixMultProcessID );
+
+   //VMS__shutdown_lang( Vthread );
+   
+   //Vthread__shutdown_lang();
+   
+   VMS__shutdown();
+   
+   free( dividerParams );
+   
+   return resMatrix;
+ }
diff -r f8d3f16e70c8 -r 69928a38d5af Vthread__Matrix_Mult/Result_Pr.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Vthread__Matrix_Mult/Result_Pr.c	Mon Sep 17 18:19:52 2012 -0700
@@ -0,0 +1,142 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ */
+
+#include "Vthread__Matrix_Mult.h"
+
+//=====================
+void inline
+accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
+                  int32    startRow,
+                  int32    numRows,
+                  int32    startCol,
+                  int32    numCols,
+                  int32    numOrigCols );
+
+//===========================================================================
+
+/*The Result Processor gets a message from each of the vector processors,
+ * puts the result from the message in its location in the result-
+ * matrix, and increments the count of results.
+ *
+ *After the count reaches the point that all results have been received, it
+ * returns the result matrix and dissipates.
+ */
+void gatherResults( void *_params, SlaveVP *animatingThd )
+ { SlaveVP *dividerPr;
+   ResultsParams  *params;
+   int            numRows, numCols, numSubMatrixPairs, count=0;
+   float32        *resultArray;
+   SMPairParams   *resParams;
+   //====================== thread stuff =======================
+   MatrixMultGlobals *globals =(MatrixMultGlobals *)Vthread__give_globals();
+
+
+      //get vector-comm lock before loop, so that this thd keeps lock after
+      // one wait until it enters the next wait -- forces see-saw btwn
+      // waiters and signalers -- wait-signal-wait-signal-...
+   Vthread__mutex_lock( globals->vector_mutex, animatingThd );
+
+      //Tell divider that have the vector lock -- so it's sure won't miss any
+      // signals from the vector-threads it's about to create
+      //Don't need a signal variable -- this thd can't be created until
+      // divider thd already has the start lock
+   Vthread__mutex_lock( globals->start_mutex, animatingThd );//finish wait
+   Vthread__cond_signal( globals->start_cond,  animatingThd );
+   Vthread__mutex_unlock( globals->start_mutex, animatingThd );//finish wait
+   //===========================================================
+
+         DEBUG( dbgAppFlow, "start resultPr\n")
+         
+   params    = (ResultsParams *)_params;
+   dividerPr = params->dividerPr;
+   numSubMatrixPairs = params->numSubMatrixPairs;
+   numRows = params->numRows;
+   numCols = params->numCols;
+
+   resultArray = params->resultArray;
+
+
+   while( count < numSubMatrixPairs )
+    {
+         //receive a vector-result from a vector-thread
+      Vthread__cond_wait(  globals->vector_cond,  animatingThd );
+
+         //At this point, animating thread owns the vector lock, so all
+         // pairs trying to signal they have a result are waiting to get that
+         // lock -- only one gets it at a time, and when signal, this thd
+         // gets the lock and does the body of this loop, then when does the
+         // wait again, that releases the lock for next pair-thread to get it
+      resParams = globals->currSMPairParams;
+
+      accumulateResult( resultArray, resParams->partialResultArray,
+                        resParams->leftSubMatrix->origStartRow,
+                        resParams->leftSubMatrix->numRows,
+                        resParams->rightSubMatrix->origStartCol,
+                        resParams->rightSubMatrix->numCols,
+                        resParams->rightSubMatrix->origMatrix->numCols );
+
+      Vthread__free( resParams->partialResultArray, animatingThd );
+      
+         //there is only one copy of results procr, so can update numUsesLeft
+         // without concurrency worries.  When zero, free the sub-matrix
+      resParams->leftSubMatrix->numUsesLeft -= 1;
+      if( resParams->leftSubMatrix->numUsesLeft == 0 )
+       {
+         Vthread__free( resParams->leftSubMatrix->array, animatingThd );
+         Vthread__free( resParams->leftSubMatrix, animatingThd );
+       }
+
+      resParams->rightSubMatrix->numUsesLeft -= 1;
+      if( resParams->rightSubMatrix->numUsesLeft == 0 )
+       {
+         Vthread__free( resParams->rightSubMatrix->array, animatingThd );
+         Vthread__free( resParams->rightSubMatrix, animatingThd );
+       }
+
+         //count of how many sub-matrix pairs accumulated so know when done
+      count++;
+    }
+
+      //Done -- could just dissipate -- SSR will wait for all processors to
+      // dissipate before shutting down, and thereby making results avaial to
+      // outside, so no need to stop the divider from dissipating, so no need
+      // to send a hand-shake message to it -- but makes debug easier
+      //However, following pattern, so all comms done, release lock
+   Vthread__mutex_unlock( globals->vector_mutex, animatingThd );
+
+      //Send result to divider (seed) thread
+      // note, divider thd had to hold the results-comm lock before creating
+      // this thread, to be sure no race
+   Vthread__mutex_lock(   globals->results_mutex, animatingThd );
+   //globals->results = resultMatrixArray;
+   Vthread__cond_signal(  globals->results_cond,  animatingThd );
+   Vthread__mutex_unlock( globals->results_mutex, animatingThd ); //releases
+   //divider thread from its wait, at point this executes
+
+   Vthread__dissipate_thread( animatingThd );  //frees any data owned by procr
+ }
+
+void inline
+accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray,
+                  int32    startRow,
+                  int32    numRows,
+                  int32    startCol,
+                  int32    numCols,
+                  int32    numOrigCols )
+ { int32 row, col;
+
+   for( row = 0; row < numRows; row++ )
+    {
+      for( col = 0; col < numCols; col++ )
+       {
+         resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] +=
+            subMatrixPairResultArray[ row * numCols + col ];
+       }
+    }
+
+ }
diff -r f8d3f16e70c8 -r 69928a38d5af Vthread__Matrix_Mult/Vthread__Matrix_Mult.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Vthread__Matrix_Mult/Vthread__Matrix_Mult.h	Mon Sep 17 18:19:52 2012 -0700
@@ -0,0 +1,120 @@
+/*
+ *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ */
+
+#ifndef _SSR_MATRIX_MULT_H_
+#define _SSR_MATRIX_MULT_H_
+
+#include <stdio.h>
+
+#include "Vthread_impl/Vthread.h"
+#include "../Matrix_Mult.h"
+
+
+//===============================  Defines  ==============================
+#define ROWS_IN_BLOCK 32
+#define COLS_IN_BLOCK 32
+#define VEC_IN_BLOCK  32
+
+#define copyMatrixSingleton 1
+#define copyTransposeSingleton 2
+
+//==============================  Structures  ==============================
+typedef struct
+ {
+   Matrix *leftMatrix;
+   Matrix *rightMatrix;
+   Matrix *resultMatrix;
+ }
+DividerParams;
+
+typedef struct
+ {
+   SlaveVP *dividerPr;
+   int numRows;
+   int numCols;
+   int numSubMatrixPairs;
+   float32 *resultArray;
+ }
+ResultsParams;
+
+typedef
+struct
+ { int32    numRows;
+   int32    numCols;
+   Matrix  *origMatrix;
+   int32    origStartRow;
+   int32    origStartCol;
+   int32    alreadyCopied;
+   VthdSingleton *copySingleton;
+   VthdSingleton *copyTransSingleton;
+   int32    numUsesLeft; //have update via message to avoid multiple writers
+   float32 *array;  //2D, but dynamically sized, so use addr arith
+ }
+SubMatrix;
+
+typedef struct
+ { SlaveVP *resultPr;
+   SubMatrix *leftSubMatrix;
+   SubMatrix *rightSubMatrix;
+   float32   *partialResultArray;
+ }
+SMPairParams;
+
+typedef
+struct
+ { int32    numVals;
+   int32   *startVals;
+ }
+SlicingStruc;
+
+typedef
+struct
+ {
+   SlicingStruc *leftRowSlices;
+   SlicingStruc *vecSlices;
+   SlicingStruc *rightColSlices;
+ }
+SlicingStrucCarrier;
+
+enum MMMsgType
+ {
+   RESULTS_MSG = 1
+ };
+
+ 
+typedef struct
+ {
+      //for communicating sub-matrix-pair results to results Thd
+   int32         vector_mutex;
+   int32         vector_cond;
+   SMPairParams *currSMPairParams;
+
+      //for communicating results array back to seed (divider) Thd
+   int32         results_mutex;
+   int32         results_cond;
+   float32      *results;
+
+      //for ensuring results thd has vector lock before making vector thds
+   int32         start_mutex;
+   int32         start_cond;
+
+   Matrix *rightMatrix;
+   Matrix *resultMatrix;
+ }
+MatrixMultGlobals;
+
+
+//============================= Processor Functions =========================
+void divideWorkIntoSubMatrixPairProcrs( void *data, SlaveVP *animatingPr );
+void calcSubMatrixProduct(        void *data, SlaveVP *animatingPr );
+void gatherResults(     void *data, SlaveVP *animatingPr );
+
+
+//================================ Entry Point ==============================
+Matrix *
+multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
+
+
+#endif /*_SSR_MATRIX_MULT_H_*/
diff -r f8d3f16e70c8 -r 69928a38d5af Vthread__Matrix_Mult/subMatrix_Pr.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Vthread__Matrix_Mult/subMatrix_Pr.c	Mon Sep 17 18:19:52 2012 -0700
@@ -0,0 +1,310 @@
+/* 
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: SeanHalle@yahoo.com
+ *
+ */
+
+#include <string.h>
+
+#include "Vthread__Matrix_Mult.h"
+
+//Declarations
+//===================================================
+
+void inline
+copyFromOrig( SubMatrix *subMatrix, SlaveVP *animPr );
+
+void inline
+copyTransposeFromOrig( SubMatrix *subMatrix, SlaveVP *animPr );
+
+void inline
+multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
+                     float32 *resArray,
+                     int startRow,  int endRow,
+                     int startCol,  int endCol,
+                     int startVec,  int endVec,
+                     int resStride, int inpStride );
+
+void inline
+multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols,
+                      float32 *leftArray, float32 *rightArray,
+                      float32 *resArray );
+
+//===================================================
+
+/*A  processor is created with an environment that holds two matrices,
+ * the row and col that it owns, and the name of a result gathering
+ * processor.
+ *It calculates the product of two sub-portions of the input matrices
+ * by using Intel's mkl library for single-core.
+ *
+ *This demonstrates using optimized single-threaded code inside scheduled
+ * work-units.
+ *
+ *When done, it sends the result to the result processor
+ */
+void
+calcSubMatrixProduct( void *data, SlaveVP *animatingPr )
+ { 
+   SMPairParams   *params;
+   SlaveVP      *resultPr;
+   float32        *leftArray,  *rightArray, *resArray;
+   SubMatrix      *leftSubMatrix, *rightSubMatrix;
+   MatrixMultGlobals *globals =(MatrixMultGlobals *)Vthread__give_globals();
+
+         DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID)
+         #ifdef TURN_ON_DEBUG_PROBES
+         int32 subMatrixProbe = VMS_App__create_single_interval_probe( "subMtx",
+                                                                animatingPr);
+         VMS_App__record_sched_choice_into_probe( subMatrixProbe, animatingPr );
+         VMS_App__record_interval_start_in_probe( subMatrixProbe );
+         #endif
+
+   params         = (SMPairParams *)data;
+   resultPr       = params->resultPr;
+   leftSubMatrix  = params->leftSubMatrix;
+   rightSubMatrix = params->rightSubMatrix;
+
+      //make sure the input sub-matrices have been copied out of orig
+      //do it here, inside sub-matrix pair to hopefully gain reuse in cache
+   copyFromOrig( leftSubMatrix, animatingPr );
+   copyTransposeFromOrig( rightSubMatrix, animatingPr );
+   
+   leftArray      = leftSubMatrix->array;
+   rightArray     = rightSubMatrix->array;
+
+   size_t resSize = (size_t)leftSubMatrix->numRows * 
+                        (size_t)rightSubMatrix->numCols * sizeof(float32);
+   resArray = Vthread__malloc( resSize, animatingPr );
+   memset( resArray, 0, resSize );
+
+
+   int32 numResRows, numResCols, vectLength;
+   
+   vectLength = leftSubMatrix->numCols;
+   numResRows = leftSubMatrix->numRows;
+   numResCols = rightSubMatrix->numCols;
+
+   multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
+                         leftArray,  rightArray,
+                         resArray );
+
+   //send result to result processor
+   params->partialResultArray = resArray;
+
+         #ifdef TURN_ON_DEBUG_PROBES
+         VMS_App__record_interval_end_in_probe( subMatrixProbe );
+         #endif
+
+      //Send result to results thread
+      //This pattern works 'cause only get lock when results thd inside wait
+   Vthread__mutex_lock(   globals->vector_mutex, animatingPr );
+   globals->currSMPairParams = params;
+   Vthread__cond_signal(  globals->vector_cond,  animatingPr );
+   Vthread__mutex_unlock( globals->vector_mutex, animatingPr );//release
+   //wait-er -- cond_signal implemented such that wait-er gets lock, no other
+
+         DEBUG1(dbgAppFlow, "end sub-matrix mult: %d\n", animatingPr->procrID)
+   Vthread__dissipate_thread( animatingPr );
+ }
+
+
+
+/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into
+ * the 32KB L1 cache.
+ *Would be nice to embed this within another level that divided into
+ * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
+ *
+ *Eventually want these divisions to be automatic, using DKU pattern
+ * embedded into VMS and exposed in the language, and with VMS controlling the
+ * divisions according to the cache sizes, which it knows about.
+ *Also, want VMS to work with language to split among main-mems, so a socket
+ * only cranks on data in its local segment of main mem
+ *
+ *So, outer two loops determine start and end points within the result matrix.
+ * Inside that, a loop dets the start and end points along the shared dimensions
+ * of the two input matrices.
+ */
+void inline
+multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
+                                int32 numResCols,
+                                float32 *leftArray, float32 *rightArray,
+                                float32 *resArray )
+ {
+   int resStride, inpStride;
+   int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
+
+   resStride  = numResCols;
+   inpStride  = vecLength;
+
+   for( resStartRow = 0; resStartRow < numResRows; )
+    {
+      resEndRow = resStartRow + ROWS_IN_BLOCK -1;  //start at zero, so -1
+      if( resEndRow > numResRows ) resEndRow = numResRows -1;
+
+      for( resStartCol = 0; resStartCol < numResCols; )
+       {
+         resEndCol   = resStartCol + COLS_IN_BLOCK -1;
+         if( resEndCol > numResCols ) resEndCol = numResCols -1;
+
+         for( startVec = 0; startVec < vecLength; )
+          {
+            endVec   = startVec + VEC_IN_BLOCK -1;
+            if( endVec > vecLength ) endVec = vecLength -1;
+
+               //By having the "vector" of sub-blocks in a sub-block slice
+               // be marched down in inner loop, are re-using the result
+               // matrix, which stays in L1 cache and re-using the left sub-mat
+               // which repeats for each right sub-mat -- can only re-use two of
+               // the three, so result is the most important -- avoids writing
+               // dirty blocks until those result-locations fully done
+               //Row and Col is position in result matrix -- so row and vec
+               // for left array, then vec and col for right array
+            multiplySubBlocksTransposed( leftArray, rightArray,
+                                         resArray,
+                                         resStartRow,  resEndRow,
+                                         resStartCol,  resEndCol,
+                                         startVec,  endVec,
+                                         resStride, inpStride );
+            startVec = endVec +1;
+          }
+         resStartCol = resEndCol +1;
+       }
+      resStartRow = resEndRow +1;
+    }
+ }
+
+
+
+void inline
+multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
+                     float32 *resArray,
+                     int resStartRow,  int resEndRow,
+                     int resStartCol,  int resEndCol,
+                     int startVec,  int endVec,
+                     int resStride, int inpStride )
+ {
+   int resRow,     resCol,        vec;
+   int leftOffset, rightOffset;
+   float32 result;
+
+      //The result row is used only for the left matrix, res col for the right
+   for( resCol = resStartCol; resCol <= resEndCol; resCol++ )
+    {
+      for( resRow = resStartRow; resRow <= resEndRow; resRow++ )
+       {
+         leftOffset  = resRow * inpStride;//left & right inp strides always same
+         rightOffset = resCol * inpStride;// because right is transposed
+         result = 0;
+         for( vec = startVec; vec <= endVec; vec++ )
+          {
+            result +=
+               leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
+          }
+
+         resArray[ resRow * resStride + resCol ] += result;
+       }
+    }
+ }
+
+
+
+
+/*Reuse this in divider when do the sequential multiply case
+ */
+void inline
+copyTranspose( int32 numRows, int32 numCols,
+               int32 origStartRow, int32 origStartCol, int32 origStride,
+               float32 *subArray, float32 *origArray )
+ { int32 stride = numRows;
+ 
+   int row, col, origOffset;
+   for( row = 0; row < numRows; row++ )
+    {
+      origOffset = (row + origStartRow) * origStride + origStartCol;
+      for( col = 0; col < numCols; col++ )
+       {
+            //transpose means swap row & col -- traverse orig matrix normally
+            // but put into reversed place in local array -- means the
+            // stride is the numRows now, so col * numRows + row
+         subArray[ col * stride + row ]  =  origArray[ origOffset + col ];
+       }
+    }
+ }
+
+void inline
+copyTransposeFromOrig( SubMatrix *subMatrix, SlaveVP *animPr )
+ { int numCols, numRows, origStartRow, origStartCol, origStride;
+   Matrix *origMatrix;
+   float32 *origArray, *subArray;
+
+   Vthread__start_data_singleton( subMatrix->copyTransSingleton, animPr );
+
+   origMatrix   = subMatrix->origMatrix;
+   origArray    = origMatrix->array;
+   numCols      = subMatrix->numCols;
+   numRows      = subMatrix->numRows;
+   origStartRow = subMatrix->origStartRow;
+   origStartCol = subMatrix->origStartCol;
+   origStride   = origMatrix->numCols;
+
+   subArray    = Vthread__malloc( (size_t)numRows * 
+                                        (size_t)numCols *sizeof(float32),animPr);
+   subMatrix->array = subArray;
+
+      //copy values from orig matrix to local
+   copyTranspose( numRows, numCols,
+                  origStartRow, origStartCol, origStride,
+                  subArray, origArray );
+
+   Vthread__end_data_singleton( subMatrix->copyTransSingleton, animPr );
+
+ }
+
+
+void inline
+copyFromOrig( SubMatrix *subMatrix, SlaveVP *animPr )
+ { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
+   Matrix *origMatrix;
+   float32 *origArray, *subArray;
+
+
+      //This lets only a single VP execute the code between start and
+      // end -- using start and end so that work runs outside the master.
+      //If a second VP ever executes the start, it will be returned
+      // from the end-point.  If its execution starts after another but before
+      // that other has finished, this one will remain suspended until the
+      // other finishes, then be resumed from the end-point.
+   Vthread__start_data_singleton( subMatrix->copySingleton, animPr );
+
+
+   origMatrix    = subMatrix->origMatrix;
+   origArray     = origMatrix->array;
+   numCols       = subMatrix->numCols;
+   numRows       = subMatrix->numRows;
+   origStartRow  = subMatrix->origStartRow;
+   origStartCol  = subMatrix->origStartCol;
+   origStride    = origMatrix->numCols;
+
+   subArray    = Vthread__malloc( (size_t)numRows * 
+                                        (size_t)numCols *sizeof(float32),animPr);
+   subMatrix->array = subArray;
+
+      //copy values from orig matrix to local
+   stride        = numCols;
+
+   int row, col, offset, origOffset;
+   for( row = 0; row < numRows; row++ )
+    {
+      offset     = row * stride;
+      origOffset = (row + origStartRow) * origStride + origStartCol;
+      for( col = 0; col < numCols; col++ )
+       {
+         subArray[ offset + col ]  =  origArray[ origOffset + col ];
+       }
+    }
+
+   Vthread__end_data_singleton( subMatrix->copySingleton, animPr );
+ }
diff -r f8d3f16e70c8 -r 69928a38d5af __brch__default
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/__brch__default	Mon Sep 17 18:19:52 2012 -0700
@@ -0,0 +1,3 @@
+This branch is for the project structure defined Jan 2012..  the #includes reflect this directory structure.
+
+The default branch of the application directory is likely to be the only branch 
diff -r f8d3f16e70c8 -r 69928a38d5af main.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/main.c	Mon Sep 17 18:19:52 2012 -0700
@@ -0,0 +1,480 @@
+/* 
+ * 
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <ctype.h>
+#include <errno.h>
+#include <pthread.h>
+#include <unistd.h>
+#include "VMS_Implementations/Vthread_impl/VPThread.h"
+#include "C_Libraries/Queue_impl/PrivateQueue.h"
+
+#include <linux/perf_event.h>
+#include <linux/prctl.h>
+#include <sys/syscall.h>
+
+#undef DEBUG
+//#define DEBUG
+
+#define MEASURE_PERF
+
+#if !defined(unix) && !defined(__unix__)
+#ifdef __MACH__
+#define unix		1
+#define __unix__	1
+#endif	/* __MACH__ */
+#endif	/* unix */
+
+/* find the appropriate way to define explicitly sized types */
+/* for C99 or GNU libc (also mach's libc) we can use stdint.h */
+#if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__)
+#include <stdint.h>
+#elif defined(unix) || defined(__unix__)	/* some UNIX systems have them in sys/types.h */
+#include <sys/types.h>
+#elif defined(__WIN32__) || defined(WIN32)	/* the nameless one */
+typedef unsigned __int8 uint8_t;
+typedef unsigned __int32 uint32_t;
+#endif	/* sized type detection */
+
+/* provide a millisecond-resolution timer for each system */
+#if defined(unix) || defined(__unix__)
+#include <time.h>
+#include <sys/time.h>
+unsigned long get_msec(void) {
+	static struct timeval timeval, first_timeval;
+
+	gettimeofday(&timeval, 0);
+	if(first_timeval.tv_sec == 0) {
+		first_timeval = timeval;
+		return 0;
+	}
+	return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000;
+}
+#elif defined(__WIN32__) || defined(WIN32)
+#include <windows.h>
+unsigned long get_msec(void) {
+	return GetTickCount();
+}
+#else
+//#error "I don't know how to measure time on your platform"
+#endif
+
+//======================== Defines =========================
+typedef struct perfData measurement_t;
+struct perfData{
+    uint64 cycles;
+    uint64 instructions;
+};
+
+const char *usage = {
+	"Usage: malloc_test [options]\n"
+	"  Spwans a number of threads and allocates memory.\n\n"
+	"Options:\n"
+	"  -t <num>   how many threads to use (default: 1). This is internaly multiplied by the number of cores.\n"
+	"  -o <num>   repeat workload and sync operation <m> times\n"
+        "  -i <num>   size of workload, repeat <n> times\n"     
+	"  -h         this help screen\n\n"
+};
+
+struct barrier_t
+{
+    int counter;
+    int nthreads;
+    int32 mutex;
+    int32 cond;
+    measurement_t endBarrierCycles;
+
+};
+typedef struct barrier_t barrier;
+
+void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr)
+ {
+   barr->counter = 0;
+   barr->nthreads = nthreads;
+   barr->mutex   = VPThread__make_mutex(animatingPr);
+   barr->cond    = VPThread__make_cond(barr->mutex, animatingPr);
+ }
+
+int cycles_counter_main_fd;
+void inline barrier_wait(barrier *barr, VirtProcr *animatingPr)
+ { int i;
+
+   VPThread__mutex_lock(barr->mutex, animatingPr);
+   barr->counter++;
+   if(barr->counter == barr->nthreads)
+    { 
+#ifdef MEASURE_PERF
+      read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \
+                sizeof(barr->endBarrierCycles.cycles));
+#endif
+       
+      barr->counter = 0;
+      for(i=0; i < barr->nthreads; i++)
+         VPThread__cond_signal(barr->cond, animatingPr);
+    }
+   else
+    { VPThread__cond_wait(barr->cond, animatingPr);
+    }
+   VPThread__mutex_unlock(barr->mutex, animatingPr);
+ }
+
+
+
+typedef struct
+ { struct barrier_t* barrier;
+   uint64_t  totalWorkCycles;
+   uint64_t  totalBadCycles;
+   uint64_t  totalSyncCycles;
+   uint64_t  totalBadSyncCycles;
+   uint64     numGoodSyncs;
+   uint64     numGoodTasks;
+ }
+WorkerParams;
+
+
+typedef struct
+ { measurement_t *startExeCycles;
+   measurement_t *endExeCycles;
+ }
+BenchParams;
+
+//======================== Globals =========================
+char __ProgrammName[] = "overhead_test";
+char __DataSet[255];
+
+int outer_iters, inner_iters, num_threads;
+size_t chunk_size = 0;
+
+int cycles_counter_fd[NUM_CORES];
+struct perf_event_attr* hw_event;
+
+WorkerParams *workerParamsArray;
+
+//======================== App Code =========================
+/*
+ * Workload
+ */
+
+#define saveCyclesAndInstrs(core,cycles) do{     \
+   int cycles_fd = cycles_counter_fd[core];             \
+   int nread;                                           \
+                                                        \
+   nread = read(cycles_fd,&(cycles),sizeof(cycles));    \
+   if(nread<0){                                         \
+       perror("Error reading cycles counter");          \
+       cycles = 0;                                      \
+   }                                                    \
+} while (0) //macro magic for scoping
+
+
+double
+worker_TLF(void* _params, VirtProcr* animatingPr)
+ {
+   int i,o;
+   WorkerParams* params = (WorkerParams*)_params;
+   unsigned int totalWorkCycles = 0, totalBadCycles = 0;
+   unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0;
+   unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0;
+   double workspace2=0.0;
+   int32 privateMutex = VPThread__make_mutex(animatingPr);
+   
+   int cpuid = sched_getcpu();
+   
+   measurement_t startWorkload, endWorkload, startWorkload2, endWorkload2;
+   uint64 numCycles;
+   for(o=0; o < outer_iters; o++)
+    {
+#ifdef MEASURE_PERF
+          saveCyclesAndInstrs(cpuid,startWorkload.cycles);
+#endif
+       
+      //workltask
+      for(i=0; i < inner_iters; i++)
+       {
+         workspace1 += (workspace1 + 32)/2;
+         workspace2 += (workspace2 + 23.2)/1.4;
+       }
+  
+#ifdef MEASURE_PERF
+          saveCyclesAndInstrs(cpuid,endWorkload.cycles);
+          numCycles = endWorkload.cycles - startWorkload.cycles;
+          //sanity check (400K is about 20K iters)
+          if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
+          else                     {totalBadCycles  += numCycles; }
+#endif
+
+      //mutex access often causes switch to different Slave VP
+      VPThread__mutex_lock(privateMutex, animatingPr);
+      
+/*
+          saveCyclesAndInstrs(cpuid,startWorkload2.cycles);
+      //Task
+      for(i=0; i < inner_iters; i++)
+       {
+         workspace1 += (workspace1 + 32)/2;
+         workspace2 += (workspace2 + 23.2)/1.4;
+       }
+      
+          saveCyclesAndInstrs(cpuid,endWorkload2.cycles);
+          numCycles = endWorkload2.cycles - startWorkload2.cycles;
+          //sanity check (400K is about 20K iters)
+          if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
+          else                     {totalBadCycles  += numCycles; }
+      
+*/
+      VPThread__mutex_unlock(privateMutex, animatingPr);
+    }
+
+   params->totalWorkCycles = totalWorkCycles;
+   params->totalBadCycles = totalBadCycles;
+   params->numGoodTasks   = numGoodTasks;
+   params->totalSyncCycles = totalSyncCycles;
+   params->totalBadSyncCycles = totalBadSyncCycles;
+   params->numGoodSyncs = numGoodSyncs;
+/*
+   params->totalSyncCycles = VMS__give_num_plugin_cycles();
+   params->totalBadSyncCycles = 0;
+   params->numGoodSyncs = VMS__give_num_plugin_animations();
+*/
+   
+   
+   //Wait for all threads to end
+   barrier_wait(params->barrier, animatingPr);
+   
+   //Shutdown worker
+   VPThread__dissipate_thread(animatingPr);
+   
+     //below return never reached --> there for gcc
+   return (workspace1 + workspace2);  //to prevent gcc from optimizing work out
+ }
+
+
+/* this is run after the VMS is set up*/
+void benchmark(void *_params, VirtProcr *animatingPr)
+ {
+   int i, cpuID;
+   struct barrier_t  barr;
+   BenchParams      *params;
+   
+   params = (BenchParams *)_params;
+
+   barrier_init(&barr, num_threads+1, animatingPr);
+      
+   //prepare input
+   for(i=0; i<num_threads; i++)
+    { 
+       workerParamsArray[i].barrier = &barr;
+    }
+     
+   //save cycles before execution of threads, to get total exe cycles
+   measurement_t *startExeCycles, *endExeCycles;
+   startExeCycles = params->startExeCycles;
+   
+#ifdef MEASURE_PERF
+   int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles),
+                sizeof(startExeCycles->cycles));
+   if(nread<0) perror("Error reading cycles counter");
+#endif
+   
+   //create (which starts running) all threads
+   for(i=0; i<num_threads; i++)
+    { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
+    }
+   //wait for all threads to finish
+   barrier_wait(&barr, animatingPr);
+  
+#ifdef MEASURE_PERF
+   //endBarrierCycles read in barrier_wait()!  Merten, email me if want to chg
+   params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
+#endif
+   
+
+/*
+   uint64_t overallWorkCycles = 0;
+   for(i=0; i<num_threads; i++){ 
+       printf("WorkCycles: %lu\n",input[i].totalWorkCycles);
+       overallWorkCycles += input[i].totalWorkCycles;
+    }
+   
+   printf("Sum across threads of work cycles: %lu\n", overallWorkCycles);
+   printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles);
+   printf("Runtime/Workcycle Ratio %lu\n", 
+   ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles);
+*/
+
+   //======================================================
+
+   VPThread__dissipate_thread(animatingPr);
+ }
+
+int main(int argc, char **argv)
+ {
+   int i;
+
+   //set global static variables, based on cmd-line args
+   for(i=1; i<argc; i++)
+    {
+      if(argv[i][0] == '-' && argv[i][2] == 0)
+       {
+         switch(argv[i][1])
+          {
+            case 't':
+               if(!isdigit(argv[++i][0]))
+                {
+                  fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n");
+                  return EXIT_FAILURE;
+                }
+               num_threads = atoi(argv[i]);
+               if(!num_threads)
+                {
+                  fprintf(stderr, "invalid number of threads specified: %d\n", num_threads);
+                  return EXIT_FAILURE;
+                }
+            break;
+            case 'o':
+               if(!isdigit(argv[++i][0]))
+                {
+                  fputs("-i must be followed by a number\n", stderr);
+                  return EXIT_FAILURE;
+                }
+               outer_iters = atoi(argv[i]);
+				break;
+            case 'i':
+               if(!isdigit(argv[++i][0]))
+                {
+                  fputs("-o must be followed by a number (workload size)\n", stderr);
+                  return EXIT_FAILURE;
+                }
+               inner_iters = atoi(argv[i]);
+				break;
+            case 'h':
+               fputs(usage, stdout);
+               return 0;
+				
+            default:
+               fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
+               fputs(usage, stderr);
+               return EXIT_FAILURE;
+          }//switch
+       }//if arg
+      else
+       {
+			fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
+			fputs(usage, stderr);
+			return EXIT_FAILURE;
+       }
+    }//for
+   
+   
+#ifdef MEASURE_PERF
+   //setup performance counters
+    hw_event = malloc(sizeof(struct perf_event_attr));
+    memset(hw_event,0,sizeof(struct perf_event_attr));
+    
+    hw_event->type = PERF_TYPE_HARDWARE;
+    hw_event->size = sizeof(hw_event);
+    hw_event->disabled = 0;
+    hw_event->freq = 0;
+    hw_event->inherit = 1; /* children inherit it   */
+    hw_event->pinned = 1; /* says this virt counter must always be on HW */
+    hw_event->exclusive = 0; /* only group on PMU     */
+    hw_event->exclude_user = 0; /* don't count user      */
+    hw_event->exclude_kernel = 1; /* don't count kernel  */
+    hw_event->exclude_hv = 1; /* ditto hypervisor      */
+    hw_event->exclude_idle = 1; /* don't count when idle */
+    hw_event->mmap = 0; /* include mmap data     */
+    hw_event->comm = 0; /* include comm data     */
+
+    hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles
+    
+    int cpuID, retries;
+
+   for( cpuID = 0; cpuID < NUM_CORES; cpuID++ )
+    { retries = 0;
+      do
+       { retries += 1;
+         cycles_counter_fd[cpuID] = 
+          syscall(__NR_perf_event_open, hw_event,
+                  0,//pid_t: 0 is "pid of calling process" 
+                  cpuID,//int: cpu, the value returned by "CPUID" instr(?)
+                  -1,//int: group_fd, -1 is "leader" or independent
+                  0//unsigned long: flags
+                 );
+       }
+      while(cycles_counter_fd[cpuID]<0 && retries < 100);
+      if(retries >= 100)
+       {
+         fprintf(stderr,"On core %d: ",cpuID);
+         perror("Failed to open cycles counter");
+       }
+    }
+
+   //Set up counter to accumulate total cycles to process, across all CPUs
+
+   retries = 0;
+   do
+    { retries += 1;
+      cycles_counter_main_fd = 
+       syscall(__NR_perf_event_open, hw_event,
+               0,//pid_t: 0 is "pid of calling process" 
+               -1,//int: cpu, -1 means accumulate from all cores
+               -1,//int: group_fd, -1 is "leader" == independent
+               0//unsigned long: flags
+              );
+    }
+   while(cycles_counter_main_fd<0 && retries < 100);
+   if(retries >= 100)
+    {
+      fprintf(stderr,"in main ");
+      perror("Failed to open cycles counter");
+    }
+#endif
+   
+   measurement_t startExeCycles, endExeCycles;
+   BenchParams *benchParams;
+   
+   benchParams = malloc(sizeof(BenchParams)); 
+   
+   benchParams->startExeCycles = &startExeCycles;
+   benchParams->endExeCycles   = &endExeCycles;
+   
+   workerParamsArray =  (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) );
+   if(workerParamsArray == NULL ) printf("error mallocing worker params array\n");
+   
+ 
+   //This is the transition to the VMS runtime
+   VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );
+   
+#ifdef MEASURE_PERF
+   uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
+   uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
+   for(i=0; i<num_threads; i++){ 
+       printf("WorkCycles: %lu\n",workerParamsArray[i].totalWorkCycles);
+//       printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
+//       printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
+//       printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
+       totalWorkCyclesAcrossCores += workerParamsArray[i].totalWorkCycles;
+       totalBadCyclesAcrossCores  += workerParamsArray[i].totalBadCycles;
+       totalSyncCyclesAcrossCores += workerParamsArray[i].totalSyncCycles;
+       totalBadSyncCyclesAcrossCores  += workerParamsArray[i].totalBadSyncCycles;
+    }
+
+   uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
+   totalExeCycles -= totalBadCyclesAcrossCores;
+   uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores;
+   int32  numSyncs = outer_iters * num_threads * 2;
+   printf("Total Execution Cycles: %lu\n", totalExeCycles);
+   printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
+   printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores);
+//   printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);
+   printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs );
+   printf("ExeCycles/WorkCycles Ratio %f\n", 
+          (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
+#else
+   printf("No measurement done!\n");
+#endif
+   return 0;
+ }
diff -r f8d3f16e70c8 -r 69928a38d5af src/Application/Matrix_Mult.c
--- a/src/Application/Matrix_Mult.c	Fri Sep 16 20:12:34 2011 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,167 +0,0 @@
-/*
- *  Copyright 2009 OpenSourceStewardshipFoundation.org
- *  Licensed under GNU General Public License version 2
- *
- * Author: seanhalle@yahoo.com
- *
- * Created on November 15, 2009, 2:35 AM
- */
-
-#include <malloc.h>
-#include <stdlib.h>
-
-#include "Matrix_Mult.h"
-#include "ParamHelper/Param.h"
-
-
- 
- void
-initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
-                               ParamBag *paramBag )
- { char *leftMatrixFileName, *rightMatrixFileName;
-   int   leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols;
-   
-      ParamStruc *param;
-      param = getParamFromBag( "leftMatrixRows", paramBag );
-   leftMatrixRows = param->intValue;
-      param = getParamFromBag( "leftMatrixCols", paramBag );
-   leftMatrixCols = param->intValue;
-   *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols );
-   
-      param = getParamFromBag( "leftMatrixFileName", paramBag );
-   leftMatrixFileName = param->strValue;  //no need to copy
-   read_Matrix_From_File( *leftMatrix,  leftMatrixFileName );
-   
-      param = getParamFromBag( "rightMatrixRows", paramBag );
-   rightMatrixRows = param->intValue;
-      param = getParamFromBag( "rightMatrixCols", paramBag );
-   rightMatrixCols = param->intValue;
-   *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols );
-   
-      param = getParamFromBag( "rightMatrixFileName", paramBag );
-   rightMatrixFileName = param->strValue;
-   read_Matrix_From_File( *rightMatrix, rightMatrixFileName );
- }
-
-
-void parseLineIntoRow( char *line, float32* row );
-
-
- void
-read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName )
- { int    row, maxRead, numRows, numCols;
-   float32 *matrixStart;
-   size_t lineSz = 0;
-   FILE  *file;
-   char  *line = NULL;
-   
-   lineSz = 50000; //max length of line in a matrix data file
-   line = (char *) malloc( lineSz );
-   if( line == NULL ) printf( "no mem for matrix line" );
-   
-   numRows = matrixStruc->numRows;
-   numCols = matrixStruc->numCols;
-   matrixStart = matrixStruc->array;
-
-   file = fopen( matrixFileName, "r" );
-   if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);}
-   fseek( file, 0, SEEK_SET );
-   for( row = 0; row < numRows; row++ )
-    {
-      if( feof( file ) )  printf( "file ran out too soon" );
-      maxRead = getline( &line, &lineSz, file );
-      if( maxRead == -1 ) printf( "prob reading mat line");
-      
-      if( *line == '\n') continue; //blank line
-      if( *line == '/' ) continue; //comment line
-      
-      parseLineIntoRow( line, matrixStart + row * numCols );
-    }
-   free( line );
- }
-
-/*This function relies on each line having the proper number of cols.  It
- * doesn't check, nor enforce, so if the file is improperly formatted it
- * can write over unrelated memory
- */
- void
-parseLineIntoRow( char *line, float32* row )
- {
-   char *valueStr, *searchPos;
-   
-      //read the float values
-   searchPos = valueStr = line; //start
-   
-   for( ; *searchPos != 0; searchPos++)  //bit dangerous, should use buff len
-    {
-      if( *searchPos == '\n' ) //last col..  relying on well-formatted file
-       { *searchPos = 0;
-         *row = atof( valueStr );
-         break;                                    //end FOR loop
-       }
-      if( *searchPos == ',' )
-       { *searchPos = 0;                           //mark end of string
-         *row = (float32) atof( valueStr );
-         row += 1;                                 //address arith
-            //skip any spaces before digits.. use searchPos + 1 to skip the 0
-         for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++);
-         valueStr = searchPos + 1;
-       }
-    }
- }
-
- //==========================================================================
-
-/*In the "_Flat" version of constructor, do only malloc of the top data struc
- * and set values in that top-level.  Don't malloc any sub-structures.
- */
- Matrix *
-makeMatrix_Flat( int32 numRows, int32 numCols )
- { Matrix * retMatrix;
-   retMatrix = malloc( sizeof( Matrix ) );
-   retMatrix->numRows = numRows;
-   retMatrix->numCols = numCols;
-
-   return retMatrix;
- }
-
- Matrix *
-makeMatrix_WithResMat( int32 numRows, int32 numCols )
- { Matrix * retMatrix;
-   retMatrix = malloc( sizeof( Matrix ) );
-   retMatrix->numRows = numRows;
-   retMatrix->numCols = numCols;
-   retMatrix->array  = malloc( numRows * numCols * sizeof(float32) );
-
-   return retMatrix;
- }
-
- void
-freeMatrix_Flat( Matrix * matrix )
- { //( matrix );
- }
- void
-freeMatrix( Matrix * matrix )
- { free( matrix->array );
-   free( matrix );
- }
-
-void
-printMatrix( Matrix *matrix )
- { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr;
-   float32 *matrixArray;
-
-   numRows = rowsToPrint = matrix->numRows;
-   numCols = colsToPrint = matrix->numCols;
-   matrixArray = matrix->array;
-
-   rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed
-   colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed
-   for( r = 0; r < numRows; r += rowIncr )
-    { for( c = 0; c < numCols; c += colIncr )
-       { printf( "%3.1f | ", matrixArray[ r * numCols + c ] );
-       }
-      printf("\n");
-    }
- }
-
diff -r f8d3f16e70c8 -r 69928a38d5af src/Application/Matrix_Mult.h
--- a/src/Application/Matrix_Mult.h	Fri Sep 16 20:12:34 2011 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,77 +0,0 @@
-/*
- *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
- *  Licensed under GNU General Public License version 2
- */
-
-#ifndef MATRIX_MULT_H_
-#define MATRIX_MULT_H_
-
-#include <stdio.h>
-#include <unistd.h>
-#include <malloc.h>
-
-#include "../VPThread_lib/VMS/VMS_primitive_data_types.h"
-#include "ParamHelper/Param.h"
-
-//==============================  Structures  ==============================
-
-typedef
-struct
- { int32 numRows;
-   int32 numCols;
-   float32 *array;  //2D, but dynamically sized, so use addr arith
- }
-Matrix;
-
-/* This is the "appSpecificPiece" that is carried inside a DKUPiece.
- *  In the DKUPiece data struc it is declared to be of type "void *".  This
- *  allows the application to define any data structure it wants and put it
- *  into a DKUPiece.
- * When the app specific info is used, it is in app code, so it is cast to
- *  the correct type to tell the compiler how to access fields.
- * This keeps all app-specific things out of the DKU directory, as per the
- *  DKU standard. */
-typedef
-struct
- { 
-      // pointers to shared data..  the result matrix must be created when the
-      //  left and right matrices are put into the root ancestor DKUPiece.
-   Matrix * leftMatrix;
-   Matrix * rightMatrix;
-   Matrix * resultMatrix;
-
-      // define the starting and ending boundaries for this piece of the
-      //  result matrix.  These are derivable from the left and right
-      //  matrices, but included them for readability of code.
-   int prodStartRow, prodEndRow;
-   int prodStartCol, prodEndCol;
-      // Start and end of the portion of the left matrix that contributes to
-      //  this piece of the product
-   int leftStartRow, leftEndRow;
-   int leftStartCol, leftEndCol;
-      // Start and end of the portion of the right matrix that contributes to
-      //  this piece of the product
-   int rightStartRow, rightEndRow;
-   int rightStartCol, rightEndCol;
- }
-MatrixProdPiece;
-
-//==============================  Functions  ================================
-void readFile();
-
-Matrix *makeMatrix( int32 numRows, int32 numCols );
-Matrix *makeMatrix_Flat( int32 numRows, int32 numCols );
-Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols );
-void    freeMatrix_Flat( Matrix * matrix );
-void    freeMatrix( Matrix * matrix );
-void    printMatrix( Matrix *matrix );
-
-void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName );
-
-void
-initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
-                              ParamBag *paramBag );
-
-//===========================================================================
-
-#endif /*MATRIX_MULT_H_*/
diff -r f8d3f16e70c8 -r 69928a38d5af src/Application/VPThread__Matrix_Mult/Divide_Pr.c
--- a/src/Application/VPThread__Matrix_Mult/Divide_Pr.c	Fri Sep 16 20:12:34 2011 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,646 +0,0 @@
-/*
- *  Copyright 2009 OpenSourceStewardshipFoundation.org
- *  Licensed under GNU General Public License version 2
- *
- * Author: seanhalle@yahoo.com
- *
- */
-
-
-#include "VPThread__Matrix_Mult.h"
-#include <math.h>
-#include <string.h>
-#include "../../VPThread_lib/VPThread.h"
-
-   //The time to compute this many result values should equal the time to
-   // perform this division on a matrix of size gives that many result calcs
-   //IE, size this so that sequential time to calc equals divide time
-   // find the value by experimenting -- but divide time and calc time scale
-   // same way, so this value should remain valid across hardware
-#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000
-
-
-//===========================================================================
-int inline
-measureMatrixMultPrimitive( VirtProcr *animPr );
-
-SlicingStrucCarrier *
-calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
-                                 VirtProcr *animPr );
-
-SlicingStruc *
-sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
-                  VirtProcr *animPr );
-
-void
-freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr );
-
-SubMatrix **
-createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
-                   int32 numUses, Matrix *origMatrix, VirtProcr *animPr );
-
-void
-freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
-                 SubMatrix **subMatrices, VirtProcr *animPr );
-
-void
-pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
-                                    SubMatrix **rightSubMatrices,
-                                    int32 numRowIdxs, int32 numColIdxs,
-                                    int32 numVecIdxs,
-                                    VirtProcr *resultPr,
-                                    VirtProcr *animatingPr );
-
-void
-makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix,
-            SlicingStrucCarrier *slicingStrucCarrier,
-            VirtProcr *resultPr, VirtProcr *animatingPr );
-
-void inline
-copyTranspose( int32 numRows, int32 numCols,
-               int32 origStartRow, int32 origStartCol, int32 origStride,
-               float32 *subArray, float32 *origArray );
-
-void inline
-multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
-                                int32 numResCols,
-                                float32 *leftArray, float32 *rightArray,
-                                float32 *resArray );
-
-
-
-/*Divider creates one processor for every sub-matrix
- * It hands them:
- *  the name of the result processor that they should send their results to,
- *  the left and right matrices, and the rows and cols they should multiply
- * It first creates the result processor, then all the sub-matrixPair
- *  processors,
- *  then does a receive of a message from the result processor that gives
- *  the divider ownership of the result matrix.
- * Finally, the divider returns the result matrix out of the VPThread system.
- *
- * Divider chooses the size of sub-matrices via an algorithm that tries to
- *  keep the minimum work above a threshold.  The threshold is machine-
- *  dependent, so ask VPThread for min work-unit time to get a
- *  given overhead
- *
- * Divide min work-unit cycles by measured-cycles for one matrix-cell
- *  product -- gives the number of products need to have in min size
- *  matrix.
- *
- * So then, take cubed root of this to get the size of a side of min sub-
- *  matrix.  That is the size of the ideal square sub-matrix -- so tile
- *  up the two input matrices into ones as close as possible to that size,
- *  and create the pairs of sub-matrices.
- *
- *========================  STRATEGIC OVERVIEW  =======================
- *
- *This division is a bit tricky, because have to create things in advance
- * that it's not at first obvious need to be created..
- *
- *First slice up each dimension -- three of them..  this is because will have
- * to create the sub-matrix's data-structures before pairing the sub-matrices
- * with each other -- so, have three dimensions to slice up before can
- * create the sub-matrix data-strucs -- also, have to be certain that the
- * cols of the left input have the exact same slicing as the rows of the
- * left matrix, so just to be sure, do the slicing calc once, then use it
- * for both.
- *
- *So, goes like this:
- *1) calculate the start & end values of each dimension in each matrix.
- *2) use those values to create sub-matrix structures
- *3) combine sub-matrices into pairs, as the tasks to perform.
- *
- *Have to calculate separately from creating the sub-matrices because of the
- * nature of the nesting -- would either end up creating the same sub-matrix
- * multiple times, or else would have to put in detection of whether had
- * made a particular one already if tried to combine steps 1 and 2.
- *
- *Step 3 has to be separate because of the nesting, as well -- same reason,
- * would either create same sub-matrix multiple times, or else have to
- * add detection of whether was already created.
- *
- *Another way to look at it: there's one level of loop to divide dimensions,
- * two levels of nesting to create sub-matrices, and three levels to pair
- * up the sub-matrices.
- */
-
-void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
-                                        VirtProcr *animatingThd )
- { VirtProcr       *resultPr;
-   DividerParams   *dividerParams;
-   ResultsParams   *resultsParams;
-   Matrix          *leftMatrix, *rightMatrix;
-   SlicingStrucCarrier *slicingStrucCarrier;
-   float32             *resultArray; //points to array inside result matrix
-   MatrixMultGlobals   *globals;
-  
-         DEBUG( dbgAppFlow, "start divide\n")
-
-         int32
-         divideProbe = VMS__create_single_interval_probe( "divideProbe",
-                                                          animatingThd );
-         VMS__record_sched_choice_into_probe( divideProbe, animatingThd );
-         VMS__record_interval_start_in_probe( divideProbe );
-
-   //=========== Setup -- make local copies of ptd-to-things, malloc, aso
-   int32 numResRows, numResCols, vectLength;
-
-   dividerParams   = (DividerParams *)_dividerParams;
-   
-   leftMatrix      = dividerParams->leftMatrix;
-   rightMatrix     = dividerParams->rightMatrix;
-
-   vectLength = leftMatrix->numCols;
-   numResRows = leftMatrix->numRows;
-   numResCols = rightMatrix->numCols;
-   resultArray     = dividerParams->resultMatrix->array;
-   
-      //zero the result array
-   memset( resultArray, 0, numResRows * numResCols * sizeof(float32) );
-
-   //==============  Do either sequential mult or do division ==============
-
-      //Check if input matrices too small -- if yes, just do sequential
-      //Cutoff is determined by overhead of this divider -- relatively
-      // machine-independent
-   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
-       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
-    {
-      //====== Do sequential multiply on a single core
-            DEBUG( dbgAppFlow, "doing sequential")
-            
-         //transpose the right matrix
-      float32 *
-      transRightArray  = 
-         VPThread__malloc( (size_t)rightMatrix->numRows * 
-                                (size_t)rightMatrix->numCols *
-                                sizeof(float32), animatingThd );
-
-         //copy values from orig matrix to local
-      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
-                     0, 0, rightMatrix->numRows,
-                     transRightArray, rightMatrix->array );
-      
-      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
-                            leftMatrix->array, transRightArray,
-                            resultArray );
-    }
-   else
-    {
-      //====== Do parallel multiply across cores
-
-            DEBUG( dbgAppFlow, "divider: do parallel mult\n")
-         //Calc the ideal size of sub-matrix and slice up the dimensions of
-         // the two matrices.
-         //The ideal size is the one takes the number of cycles to calculate
-         // such that calc time is equal or greater than min work-unit size
-      slicingStrucCarrier =
-         calcIdealSizeAndSliceDimensions(leftMatrix,rightMatrix,animatingThd);
-
-         //Make the results processor, now that know how many to wait for
-      resultsParams = VPThread__malloc( sizeof(ResultsParams), animatingThd );
-      resultsParams->numSubMatrixPairs  =
-         slicingStrucCarrier->leftRowSlices->numVals *
-         slicingStrucCarrier->rightColSlices->numVals *
-         slicingStrucCarrier->vecSlices->numVals;
-      resultsParams->dividerPr   = animatingThd;
-      resultsParams->numCols     = rightMatrix->numCols;
-      resultsParams->numRows     = leftMatrix->numRows;
-      resultsParams->resultArray = resultArray;
-
-      //==========  Set up global vars, including conds and mutexes ==========
-      globals = VMS__malloc( sizeof(MatrixMultGlobals) );
-      VPThread__set_globals_to( globals );
-
-      globals->results_mutex = VPThread__make_mutex( animatingThd );
-      globals->results_cond  = VPThread__make_cond( globals->results_mutex,
-                                                               animatingThd );
-
-      globals->vector_mutex = VPThread__make_mutex( animatingThd );
-      globals->vector_cond  = VPThread__make_cond( globals->vector_mutex,
-                                                               animatingThd );
-
-      globals->start_mutex = VPThread__make_mutex( animatingThd );
-      globals->start_cond  = VPThread__make_cond( globals->start_mutex,
-                                                               animatingThd );
-      //======================================================================
-
-            DEBUG( dbgAppFlow, "divider: made mutexes and conds\n")
-         //get results-comm lock before create results-thd, to ensure it can't
-         // signal that results are available before this thd is waiting on cond
-      VPThread__mutex_lock( globals->results_mutex, animatingThd );
-
-         //also get the start lock & use to ensure no vector threads send a
-         // signal before the results thread is waiting on vector cond
-      VPThread__mutex_lock( globals->start_mutex, animatingThd );
-
-
-            DEBUG( dbgAppFlow, "divider: make result thread\n")
-      VPThread__create_thread( &gatherResults, resultsParams, animatingThd );
-
-         //Now wait for results thd to signal that it has vector lock
-      VPThread__cond_wait(  globals->start_cond,  animatingThd );
-      VPThread__mutex_unlock( globals->start_mutex, animatingThd );//done w/lock
-            DEBUG( dbgAppFlow, "divider: make sub-matrices\n")
-   
-         //Make the sub-matrices, and pair them up, and make processor to
-         // calc product of each pair.
-      makeSubMatricesAndProcrs( leftMatrix, rightMatrix,
-                                    slicingStrucCarrier,
-                                    resultPr, animatingThd);
- 
-         //Wait for results thread to say results are good
-      VPThread__cond_wait(  globals->results_cond,  animatingThd );
-    }
-
-
-   //===============  Work done -- send results back =================
-
-
-         DEBUG( dbgAppFlow, "end divide\n")
-
-         VMS__record_interval_end_in_probe( divideProbe );
-         VMS__print_stats_of_all_probes();
-
-      //nothing left to do so dissipate, VPThread will wait to shutdown,
-      // making results available to outside, until all the processors have
-      // dissipated -- so actually no need to wait for results processor
-      //However, following the pattern, so done with comm, release lock
-   VPThread__mutex_unlock( globals->results_mutex, animatingThd );
-
-   VPThread__dissipate_thread( animatingThd );  //all procrs dissipate self at end
-      //when all of the processors have dissipated, the "create seed and do
-      // work" call in the entry point function returns
- }
-
-
-SlicingStrucCarrier *
-calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
-                                 VirtProcr *animPr )
- {
-   float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
-   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
-   SlicingStrucCarrier *slicingStrucCarrier =
-                       VPThread__malloc(sizeof(SlicingStrucCarrier), animPr);
-
-   int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
-   float64 numPrimitiveOpsInMinWorkUnit;
-
-
-   //=======  Calc ideal size of min-sized sub-matrix  ========
-
-      //ask VPThread for the number of cycles of the minimum work unit, at given
-      // percent overhead then add a guess at overhead from this divider
-   minWorkUnitCycles = VPThread__giveMinWorkUnitCycles( .05 );
-
-      //ask VPThread for number of cycles of the "primitive" op of matrix mult
-   primitiveCycles = measureMatrixMultPrimitive( animPr );
-
-   numPrimitiveOpsInMinWorkUnit =
-      (float64)minWorkUnitCycles / (float64)primitiveCycles;
-
-      //take cubed root -- that's number of these in a "side" of sub-matrix
-      // then multiply by 5 because the primitive is 5x5src/Application/VPThread__Matrix_Mult/Divide_Pr.c:403: warning: implicit declaration of function ‘VPThread__give_number_of_cores_to_schedule_onto’
-   idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit );
-
-   idealNumWorkUnits = VPThread__giveIdealNumWorkUnits();
-   
-   idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
-   idealSizeOfSide2 *= 0.6; //finer granularity to help load balance
-
-   if( idealSizeOfSide1 > idealSizeOfSide2 )
-      idealSizeOfSide = idealSizeOfSide1;
-   else
-      idealSizeOfSide = idealSizeOfSide2;
-
-      //The multiply inner loop blocks the array to fit into L1 cache
-//   if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK;
-
-   //============  Slice up dimensions, now that know target size ===========
-
-      //Tell the slicer the target size of a side (floating pt), the start
-      // value to start slicing at, and the end value to stop slicing at
-      //It returns an array of start value of each chunk, plus number of them
-   int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol;
-   startLeftRow  = 0;
-   endLeftRow    = leftMatrix->numRows -1;
-   startVec      = 0;
-   endVec        = leftMatrix->numCols -1;
-   startRightCol = 0;
-   endRightCol   = rightMatrix->numCols -1;
-
-   leftRowSlices =
-      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow, animPr );
-
-   vecSlices =
-      sliceUpDimension( idealSizeOfSide,  startVec, endVec, animPr );
-
-   rightColSlices =
-      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol,animPr);
-
-   slicingStrucCarrier->leftRowSlices  = leftRowSlices;
-   slicingStrucCarrier->vecSlices      = vecSlices;
-   slicingStrucCarrier->rightColSlices = rightColSlices;
-
-   return slicingStrucCarrier;
- }
-
-
-void
-makeSubMatricesAndProcrs( Matrix    *leftMatrix, Matrix    *rightMatrix,
-            SlicingStrucCarrier *slicingStrucCarrier,
-            VirtProcr *resultPr,   VirtProcr *animPr )
- {
-   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
-   
-   leftRowSlices  = slicingStrucCarrier->leftRowSlices;
-   vecSlices      = slicingStrucCarrier->vecSlices;
-   rightColSlices = slicingStrucCarrier->rightColSlices;
-   VPThread__free( slicingStrucCarrier, animPr );
-   
-   //================  Make sub-matrices, given the slicing  ================
-   SubMatrix **leftSubMatrices, **rightSubMatrices;
-   leftSubMatrices =
-      createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals,
-                         leftMatrix, animPr );
-   //double_check_that_always_numRows_in_right_same_as_numCols_in_left();
-   rightSubMatrices =
-      createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals,
-                         rightMatrix, animPr );
-
-
-   //==============  pair the sub-matrices and make processors ==============
-   int32 numRowIdxs, numColIdxs, numVecIdxs;
-
-   numRowIdxs = leftRowSlices->numVals;
-   numColIdxs = rightColSlices->numVals;
-   numVecIdxs = vecSlices->numVals;
-   
-   freeSlicingStruc( leftRowSlices, animPr );
-   freeSlicingStruc( vecSlices, animPr );
-   freeSlicingStruc( rightColSlices, animPr );
-   
-   pairUpSubMatricesAndMakeProcessors( leftSubMatrices,
-                                       rightSubMatrices,
-                                       numRowIdxs, numColIdxs,
-                                       numVecIdxs,
-                                       resultPr,
-                                       animPr );
- }
-
-
-
-
-void
-pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
-                                    SubMatrix **rightSubMatrices,
-                                    int32 numRowIdxs, int32 numColIdxs,
-                                    int32 numVecIdxs,
-                                    VirtProcr *resultPr,
-                                    VirtProcr *animatingPr )
- {
-   int32 resRowIdx, resColIdx, vecIdx;
-   int32 numLeftColIdxs, numRightColIdxs;
-   int32 leftRowIdxOffset;
-   SMPairParams *subMatrixPairParams;
-   float32 numToPutOntoEachCore, leftOverFraction;
-   int32 numCores, coreToScheduleOnto, numVecOnCurrCore;
-
-   numLeftColIdxs  = numColIdxs;
-   numRightColIdxs = numVecIdxs;
-
-   numCores = VPThread__give_number_of_cores_to_schedule_onto();
-
-   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
-   leftOverFraction = 0;
-   numVecOnCurrCore = 0;
-   coreToScheduleOnto = 0;
-
-   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
-    {
-      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
-
-      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
-       {
-         
-         for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
-          {
-               //Make the processor for the pair of sub-matrices
-            subMatrixPairParams  = VPThread__malloc( sizeof(SMPairParams),
-                                                               animatingPr);
-            subMatrixPairParams->leftSubMatrix  =
-               leftSubMatrices[ leftRowIdxOffset + vecIdx ];
-
-            subMatrixPairParams->rightSubMatrix =
-               rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
-
-            //subMatrixPairParams->resultPr = resultPr;
-
-               //put all pairs from the same vector onto same core
-            VPThread__create_thread_with_affinity( &calcSubMatrixProduct,
-                                                   subMatrixPairParams,
-                                                   animatingPr,
-                                                   coreToScheduleOnto );
-          }
-
-            //Trying to distribute the subMatrix-vectors across the cores, so
-            // that each core gets the same number of vectors, with a max
-            // imbalance of 1 vector more on some cores than others
-         numVecOnCurrCore += 1;
-         if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 )
-          {
-               //deal with fractional part, to ensure that imbalance is 1 max
-               // IE, core with most has only 1 more than core with least
-            leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore;
-            if( leftOverFraction >= 1 )
-             { leftOverFraction -= 1;
-               numVecOnCurrCore = -1;
-             }
-            else
-             { numVecOnCurrCore = 0;
-             }
-               //Move to next core, max core-value to incr to is numCores -1
-            if( coreToScheduleOnto >= numCores -1 )
-             { coreToScheduleOnto = 0;
-             }
-            else
-             { coreToScheduleOnto += 1;
-             }
-          }
-
-       }
-    }
- }
-
-
-
-/*Walk through the two slice-strucs, making sub-matrix strucs as go
- */
-SubMatrix **
-createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
-                   int32 numUses, Matrix *origMatrix, VirtProcr *animPr )
- {
-   int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
-   int32 startRow, endRow, startCol, endCol;
-   int32 *rowStartVals, *colStartVals;
-   int32 rowOffset;
-   SubMatrix **subMatrices, *newSubMatrix;
-
-   numRowIdxs = rowSlices->numVals;
-   numColIdxs = colSlices->numVals;
-
-   rowStartVals = rowSlices->startVals;
-   colStartVals = colSlices->startVals;
-
-   subMatrices = VPThread__malloc((size_t)numRowIdxs *
-                            (size_t)numColIdxs * sizeof(SubMatrix*),animPr );
-
-   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
-    {
-      rowOffset = rowIdx * numColIdxs;
-      
-      startRow  = rowStartVals[rowIdx];
-      endRow    = rowStartVals[rowIdx + 1] -1; //"fake" start above last is
-                                               // at last valid idx + 1 & is
-                                               // 1 greater than end value
-      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
-       {
-         startCol = colStartVals[colIdx];
-         endCol   = colStartVals[colIdx + 1] -1;
-
-         newSubMatrix = VPThread__malloc( sizeof(SubMatrix), animPr );
-         newSubMatrix->numRows       = endRow - startRow +1;
-         newSubMatrix->numCols       = endCol - startCol +1;
-         newSubMatrix->origMatrix    = origMatrix;
-         newSubMatrix->origStartRow  = startRow;
-         newSubMatrix->origStartCol  = startCol;
-         newSubMatrix->alreadyCopied = FALSE;
-         newSubMatrix->numUsesLeft   = numUses; //can free after this many
-         //Prevent uninitialized memory
-         newSubMatrix->copySingleton = NULL;
-         newSubMatrix->copyTransSingleton = NULL;
-
-         subMatrices[ rowOffset + colIdx ] = newSubMatrix;
-       }
-    }
-   return subMatrices;
- }
-
-
-void
-freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
-                 SubMatrix **subMatrices, VirtProcr *animPr )
- {
-   int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset;
-   SubMatrix *subMatrix;
-
-   numRowIdxs = rowSlices->numVals;
-   numColIdxs = colSlices->numVals;
-
-   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
-    {
-      rowOffset = rowIdx * numColIdxs;
-      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
-       {
-         subMatrix = subMatrices[ rowOffset + colIdx ];
-         if( subMatrix->alreadyCopied )
-            VPThread__free( subMatrix->array, animPr );
-         VPThread__free( subMatrix, animPr );
-       }
-    }
-   VPThread__free( subMatrices, animPr );
- }
-
-
-
-SlicingStruc *
-sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
-                  VirtProcr *animPr )
- { float32 residualAcc = 0;
-   int     numSlices, i, *startVals, sizeOfSlice, endCondition;
-   SlicingStruc *slicingStruc = VPThread__malloc(sizeof(SlicingStruc), animPr);
-
-      //calc size of matrix need to hold start vals --
-   numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
-
-   startVals = VPThread__malloc( ((size_t)numSlices + 1) * sizeof(int32), animPr );
-
-      //Calc the upper limit of start value -- when get above this, end loop
-      // by saving highest value of the matrix dimension to access, plus 1
-      // as the start point of the imaginary slice following the last one
-      //Plus 1 because go up to value but not include when process last slice
-      //The stopping condition is half-a-size less than highest value because
-      // don't want any pieces smaller than half the ideal size -- just tack
-      // little ones onto end of last one
-   endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size
-   for( i = 0; startVal <= endVal; i++ )
-    {
-      startVals[i] = startVal;
-      residualAcc += idealSizeOfSide;
-      sizeOfSlice  = (int)residualAcc;
-      residualAcc -= (float32)sizeOfSlice;
-      startVal    += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..
-
-      if( startVal > endCondition )
-       { startVal = endVal + 1;
-         startVals[ i + 1 ] = startVal;
-       }
-    }
-
-   slicingStruc->startVals = startVals;
-   slicingStruc->numVals   = i;  //loop incr'd, so == last valid start idx+1
-                                 // which means is num sub-matrices in dim
-                                 // also == idx of the fake start just above
-   return slicingStruc;
- }
-
-void
-freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr )
- {
-   VPThread__free( slicingStruc->startVals, animPr );
-   VPThread__free( slicingStruc, animPr );
- }
-
-
-int inline
-measureMatrixMultPrimitive( VirtProcr *animPr )
- {
-   int r, c, v, numCycles;
-   float32 *res, *left, *right;
-
-      //setup inputs
-   left  = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr );
-   right = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr );
-   res   = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr );
-
-   for( r = 0; r < 5; r++ )
-    {
-      for( c = 0; c < 5; c++ )
-       {
-         left[  r * 5 + c ] = r;
-         right[ r * 5 + c ] = c;
-       }
-    }
-
-      //do primitive
-   VPThread__start_primitive();  //for now, just takes time stamp
-   for( r = 0; r < 5; r++ )
-    {
-      for( c = 0; c < 5; c++ )
-       {
-         for( v = 0; v < 5; v++ )
-          {
-            res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ];
-          }
-       }
-    }
-   numCycles =
-      VPThread__end_primitive_and_give_cycles();
-
-   VPThread__free( left, animPr );
-   VPThread__free( right, animPr );
-   VPThread__free( res, animPr );
-
-   return numCycles;
- }
diff -r f8d3f16e70c8 -r 69928a38d5af src/Application/VPThread__Matrix_Mult/EntryPoint.c
--- a/src/Application/VPThread__Matrix_Mult/EntryPoint.c	Fri Sep 16 20:12:34 2011 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,62 +0,0 @@
-/*
- *  Copyright 2009 OpenSourceStewardshipFoundation.org
- *  Licensed under GNU General Public License version 2
- *
- * Author: seanhalle@yahoo.com
- *
- */
-
-#include <math.h>
-
-#include "VPThread__Matrix_Mult.h"
-
-
-
-/*Every SSR system has an "entry point" function that creates the first
- * processor, which starts the chain of creating more processors..
- * eventually all of the processors will dissipate themselves, and
- * return.
- *
- *This entry-point function follows the same pattern as all entry-point
- * functions do:
- *1) it creates the params for the seed processor, from the
- *    parameters passed into the entry-point function
- *2) it calls SSR__create_seed_procr_and_do_work
- *3) it gets the return value from the params struc, frees the params struc,
- *    and returns the value from the function
- *
- */
-Matrix *
-multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
- { Matrix          *resMatrix;
-   DividerParams   *dividerParams;
-   int32            numResRows, numResCols;
-
-
-   dividerParams              = malloc( sizeof( DividerParams ) );
-   dividerParams->leftMatrix  = leftMatrix;
-   dividerParams->rightMatrix = rightMatrix;
-
-
-   numResRows  = leftMatrix->numRows;
-   numResCols  = rightMatrix->numCols;
-
-      //VMS has its own separate internal malloc, so to get results out,
-      // have to pass in empty array for it to fill up
-      //The alternative is internally telling SSR make external space to use
-   resMatrix            = malloc( sizeof(Matrix) );
-   resMatrix->array     = malloc( numResRows * numResCols * sizeof(float32));
-   resMatrix->numCols   = rightMatrix->numCols;
-   resMatrix->numRows   = leftMatrix->numRows;
-
-
-   dividerParams->resultMatrix   = resMatrix;
-
-      //create divider processor, start doing the work, and wait till done
-      //This function is the "border crossing" between normal code and SSR
-   VPThread__create_seed_procr_and_do_work(&divideWorkIntoSubMatrixPairProcrs,
-                                           dividerParams );
-   
-   free( dividerParams );
-   return resMatrix;
- }
diff -r f8d3f16e70c8 -r 69928a38d5af src/Application/VPThread__Matrix_Mult/Result_Pr.c
--- a/src/Application/VPThread__Matrix_Mult/Result_Pr.c	Fri Sep 16 20:12:34 2011 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,142 +0,0 @@
-/*
- *  Copyright 2009 OpenSourceStewardshipFoundation.org
- *  Licensed under GNU General Public License version 2
- *
- * Author: seanhalle@yahoo.com
- *
- */
-
-#include "VPThread__Matrix_Mult.h"
-
-//=====================
-void inline
-accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
-                  int32    startRow,
-                  int32    numRows,
-                  int32    startCol,
-                  int32    numCols,
-                  int32    numOrigCols );
-
-//===========================================================================
-
-/*The Result Processor gets a message from each of the vector processors,
- * puts the result from the message in its location in the result-
- * matrix, and increments the count of results.
- *
- *After the count reaches the point that all results have been received, it
- * returns the result matrix and dissipates.
- */
-void gatherResults( void *_params, VirtProcr *animatingThd )
- { VirtProcr *dividerPr;
-   ResultsParams  *params;
-   int            numRows, numCols, numSubMatrixPairs, count=0;
-   float32        *resultArray;
-   SMPairParams   *resParams;
-   //====================== thread stuff =======================
-   MatrixMultGlobals *globals =(MatrixMultGlobals *)VPThread__give_globals();
-
-
-      //get vector-comm lock before loop, so that this thd keeps lock after
-      // one wait until it enters the next wait -- forces see-saw btwn
-      // waiters and signalers -- wait-signal-wait-signal-...
-   VPThread__mutex_lock( globals->vector_mutex, animatingThd );
-
-      //Tell divider that have the vector lock -- so it's sure won't miss any
-      // signals from the vector-threads it's about to create
-      //Don't need a signal variable -- this thd can't be created until
-      // divider thd already has the start lock
-   VPThread__mutex_lock( globals->start_mutex, animatingThd );//finish wait
-   VPThread__cond_signal( globals->start_cond,  animatingThd );
-   VPThread__mutex_unlock( globals->start_mutex, animatingThd );//finish wait
-   //===========================================================
-
-         DEBUG( dbgAppFlow, "start resultPr\n")
-         
-   params    = (ResultsParams *)_params;
-   dividerPr = params->dividerPr;
-   numSubMatrixPairs = params->numSubMatrixPairs;
-   numRows = params->numRows;
-   numCols = params->numCols;
-
-   resultArray = params->resultArray;
-
-
-   while( count < numSubMatrixPairs )
-    {
-         //receive a vector-result from a vector-thread
-      VPThread__cond_wait(  globals->vector_cond,  animatingThd );
-
-         //At this point, animating thread owns the vector lock, so all
-         // pairs trying to signal they have a result are waiting to get that
-         // lock -- only one gets it at a time, and when signal, this thd
-         // gets the lock and does the body of this loop, then when does the
-         // wait again, that releases the lock for next pair-thread to get it
-      resParams = globals->currSMPairParams;
-
-      accumulateResult( resultArray, resParams->partialResultArray,
-                        resParams->leftSubMatrix->origStartRow,
-                        resParams->leftSubMatrix->numRows,
-                        resParams->rightSubMatrix->origStartCol,
-                        resParams->rightSubMatrix->numCols,
-                        resParams->rightSubMatrix->origMatrix->numCols );
-
-      VPThread__free( resParams->partialResultArray, animatingThd );
-      
-         //there is only one copy of results procr, so can update numUsesLeft
-         // without concurrency worries.  When zero, free the sub-matrix
-      resParams->leftSubMatrix->numUsesLeft -= 1;
-      if( resParams->leftSubMatrix->numUsesLeft == 0 )
-       {
-         VPThread__free( resParams->leftSubMatrix->array, animatingThd );
-         VPThread__free( resParams->leftSubMatrix, animatingThd );
-       }
-
-      resParams->rightSubMatrix->numUsesLeft -= 1;
-      if( resParams->rightSubMatrix->numUsesLeft == 0 )
-       {
-         VPThread__free( resParams->rightSubMatrix->array, animatingThd );
-         VPThread__free( resParams->rightSubMatrix, animatingThd );
-       }
-
-         //count of how many sub-matrix pairs accumulated so know when done
-      count++;
-    }
-
-      //Done -- could just dissipate -- SSR will wait for all processors to
-      // dissipate before shutting down, and thereby making results avaial to
-      // outside, so no need to stop the divider from dissipating, so no need
-      // to send a hand-shake message to it -- but makes debug easier
-      //However, following pattern, so all comms done, release lock
-   VPThread__mutex_unlock( globals->vector_mutex, animatingThd );
-
-      //Send result to divider (seed) thread
-      // note, divider thd had to hold the results-comm lock before creating
-      // this thread, to be sure no race
-   VPThread__mutex_lock(   globals->results_mutex, animatingThd );
-   //globals->results = resultMatrixArray;
-   VPThread__cond_signal(  globals->results_cond,  animatingThd );
-   VPThread__mutex_unlock( globals->results_mutex, animatingThd ); //releases
-   //divider thread from its wait, at point this executes
-
-   VPThread__dissipate_thread( animatingThd );  //frees any data owned by procr
- }
-
-void inline
-accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray,
-                  int32    startRow,
-                  int32    numRows,
-                  int32    startCol,
-                  int32    numCols,
-                  int32    numOrigCols )
- { int32 row, col;
-
-   for( row = 0; row < numRows; row++ )
-    {
-      for( col = 0; col < numCols; col++ )
-       {
-         resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] +=
-            subMatrixPairResultArray[ row * numCols + col ];
-       }
-    }
-
- }
diff -r f8d3f16e70c8 -r 69928a38d5af src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h
--- a/src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h	Fri Sep 16 20:12:34 2011 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,120 +0,0 @@
-/*
- *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
- *  Licensed under GNU General Public License version 2
- */
-
-#ifndef _SSR_MATRIX_MULT_H_
-#define _SSR_MATRIX_MULT_H_
-
-#include <stdio.h>
-
-#include "../../VPThread_lib/VPThread.h"
-#include "../Matrix_Mult.h"
-
-
-//===============================  Defines  ==============================
-#define ROWS_IN_BLOCK 32
-#define COLS_IN_BLOCK 32
-#define VEC_IN_BLOCK  32
-
-#define copyMatrixSingleton 1
-#define copyTransposeSingleton 2
-
-//==============================  Structures  ==============================
-typedef struct
- {
-   Matrix *leftMatrix;
-   Matrix *rightMatrix;
-   Matrix *resultMatrix;
- }
-DividerParams;
-
-typedef struct
- {
-   VirtProcr *dividerPr;
-   int numRows;
-   int numCols;
-   int numSubMatrixPairs;
-   float32 *resultArray;
- }
-ResultsParams;
-
-typedef
-struct
- { int32    numRows;
-   int32    numCols;
-   Matrix  *origMatrix;
-   int32    origStartRow;
-   int32    origStartCol;
-   int32    alreadyCopied;
-   VPThdSingleton *copySingleton;
-   VPThdSingleton *copyTransSingleton;
-   int32    numUsesLeft; //have update via message to avoid multiple writers
-   float32 *array;  //2D, but dynamically sized, so use addr arith
- }
-SubMatrix;
-
-typedef struct
- { VirtProcr *resultPr;
-   SubMatrix *leftSubMatrix;
-   SubMatrix *rightSubMatrix;
-   float32   *partialResultArray;
- }
-SMPairParams;
-
-typedef
-struct
- { int32    numVals;
-   int32   *startVals;
- }
-SlicingStruc;
-
-typedef
-struct
- {
-   SlicingStruc *leftRowSlices;
-   SlicingStruc *vecSlices;
-   SlicingStruc *rightColSlices;
- }
-SlicingStrucCarrier;
-
-enum MMMsgType
- {
-   RESULTS_MSG = 1
- };
-
- 
-typedef struct
- {
-      //for communicating sub-matrix-pair results to results Thd
-   int32         vector_mutex;
-   int32         vector_cond;
-   SMPairParams *currSMPairParams;
-
-      //for communicating results array back to seed (divider) Thd
-   int32         results_mutex;
-   int32         results_cond;
-   float32      *results;
-
-      //for ensuring results thd has vector lock before making vector thds
-   int32         start_mutex;
-   int32         start_cond;
-
-   Matrix *rightMatrix;
-   Matrix *resultMatrix;
- }
-MatrixMultGlobals;
-
-
-//============================= Processor Functions =========================
-void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr );
-void calcSubMatrixProduct(        void *data, VirtProcr *animatingPr );
-void gatherResults(     void *data, VirtProcr *animatingPr );
-
-
-//================================ Entry Point ==============================
-Matrix *
-multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
-
-
-#endif /*_SSR_MATRIX_MULT_H_*/
diff -r f8d3f16e70c8 -r 69928a38d5af src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c
--- a/src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c	Fri Sep 16 20:12:34 2011 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,310 +0,0 @@
-/* 
- *  Copyright 2009 OpenSourceStewardshipFoundation.org
- *  Licensed under GNU General Public License version 2
- *
- * Author: SeanHalle@yahoo.com
- *
- */
-
-#include <string.h>
-
-#include "VPThread__Matrix_Mult.h"
-
-//Declarations
-//===================================================
-
-void inline
-copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
-
-void inline
-copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
-
-void inline
-multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
-                     float32 *resArray,
-                     int startRow,  int endRow,
-                     int startCol,  int endCol,
-                     int startVec,  int endVec,
-                     int resStride, int inpStride );
-
-void inline
-multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols,
-                      float32 *leftArray, float32 *rightArray,
-                      float32 *resArray );
-
-//===================================================
-
-/*A  processor is created with an environment that holds two matrices,
- * the row and col that it owns, and the name of a result gathering
- * processor.
- *It calculates the product of two sub-portions of the input matrices
- * by using Intel's mkl library for single-core.
- *
- *This demonstrates using optimized single-threaded code inside scheduled
- * work-units.
- *
- *When done, it sends the result to the result processor
- */
-void
-calcSubMatrixProduct( void *data, VirtProcr *animatingPr )
- { 
-   SMPairParams   *params;
-   VirtProcr      *resultPr;
-   float32        *leftArray,  *rightArray, *resArray;
-   SubMatrix      *leftSubMatrix, *rightSubMatrix;
-   MatrixMultGlobals *globals =(MatrixMultGlobals *)VPThread__give_globals();
-
-         DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID)
-         #ifdef TURN_ON_DEBUG_PROBES
-         int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx",
-                                                                animatingPr);
-         VMS__record_sched_choice_into_probe( subMatrixProbe, animatingPr );
-         VMS__record_interval_start_in_probe( subMatrixProbe );
-         #endif
-
-   params         = (SMPairParams *)data;
-   resultPr       = params->resultPr;
-   leftSubMatrix  = params->leftSubMatrix;
-   rightSubMatrix = params->rightSubMatrix;
-
-      //make sure the input sub-matrices have been copied out of orig
-      //do it here, inside sub-matrix pair to hopefully gain reuse in cache
-   copyFromOrig( leftSubMatrix, animatingPr );
-   copyTransposeFromOrig( rightSubMatrix, animatingPr );
-   
-   leftArray      = leftSubMatrix->array;
-   rightArray     = rightSubMatrix->array;
-
-   size_t resSize = (size_t)leftSubMatrix->numRows * 
-                        (size_t)rightSubMatrix->numCols * sizeof(float32);
-   resArray = VPThread__malloc( resSize, animatingPr );
-   memset( resArray, 0, resSize );
-
-
-   int32 numResRows, numResCols, vectLength;
-   
-   vectLength = leftSubMatrix->numCols;
-   numResRows = leftSubMatrix->numRows;
-   numResCols = rightSubMatrix->numCols;
-
-   multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
-                         leftArray,  rightArray,
-                         resArray );
-
-   //send result to result processor
-   params->partialResultArray = resArray;
-
-         #ifdef TURN_ON_DEBUG_PROBES
-         VMS__record_interval_end_in_probe( subMatrixProbe );
-         #endif
-
-      //Send result to results thread
-      //This pattern works 'cause only get lock when results thd inside wait
-   VPThread__mutex_lock(   globals->vector_mutex, animatingPr );
-   globals->currSMPairParams = params;
-   VPThread__cond_signal(  globals->vector_cond,  animatingPr );
-   VPThread__mutex_unlock( globals->vector_mutex, animatingPr );//release
-   //wait-er -- cond_signal implemented such that wait-er gets lock, no other
-
-         DEBUG1(dbgAppFlow, "end sub-matrix mult: %d\n", animatingPr->procrID)
-   VPThread__dissipate_thread( animatingPr );
- }
-
-
-
-/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into
- * the 32KB L1 cache.
- *Would be nice to embed this within another level that divided into
- * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
- *
- *Eventually want these divisions to be automatic, using DKU pattern
- * embedded into VMS and exposed in the language, and with VMS controlling the
- * divisions according to the cache sizes, which it knows about.
- *Also, want VMS to work with language to split among main-mems, so a socket
- * only cranks on data in its local segment of main mem
- *
- *So, outer two loops determine start and end points within the result matrix.
- * Inside that, a loop dets the start and end points along the shared dimensions
- * of the two input matrices.
- */
-void inline
-multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
-                                int32 numResCols,
-                                float32 *leftArray, float32 *rightArray,
-                                float32 *resArray )
- {
-   int resStride, inpStride;
-   int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
-
-   resStride  = numResCols;
-   inpStride  = vecLength;
-
-   for( resStartRow = 0; resStartRow < numResRows; )
-    {
-      resEndRow = resStartRow + ROWS_IN_BLOCK -1;  //start at zero, so -1
-      if( resEndRow > numResRows ) resEndRow = numResRows -1;
-
-      for( resStartCol = 0; resStartCol < numResCols; )
-       {
-         resEndCol   = resStartCol + COLS_IN_BLOCK -1;
-         if( resEndCol > numResCols ) resEndCol = numResCols -1;
-
-         for( startVec = 0; startVec < vecLength; )
-          {
-            endVec   = startVec + VEC_IN_BLOCK -1;
-            if( endVec > vecLength ) endVec = vecLength -1;
-
-               //By having the "vector" of sub-blocks in a sub-block slice
-               // be marched down in inner loop, are re-using the result
-               // matrix, which stays in L1 cache and re-using the left sub-mat
-               // which repeats for each right sub-mat -- can only re-use two of
-               // the three, so result is the most important -- avoids writing
-               // dirty blocks until those result-locations fully done
-               //Row and Col is position in result matrix -- so row and vec
-               // for left array, then vec and col for right array
-            multiplySubBlocksTransposed( leftArray, rightArray,
-                                         resArray,
-                                         resStartRow,  resEndRow,
-                                         resStartCol,  resEndCol,
-                                         startVec,  endVec,
-                                         resStride, inpStride );
-            startVec = endVec +1;
-          }
-         resStartCol = resEndCol +1;
-       }
-      resStartRow = resEndRow +1;
-    }
- }
-
-
-
-void inline
-multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
-                     float32 *resArray,
-                     int resStartRow,  int resEndRow,
-                     int resStartCol,  int resEndCol,
-                     int startVec,  int endVec,
-                     int resStride, int inpStride )
- {
-   int resRow,     resCol,        vec;
-   int leftOffset, rightOffset;
-   float32 result;
-
-      //The result row is used only for the left matrix, res col for the right
-   for( resCol = resStartCol; resCol <= resEndCol; resCol++ )
-    {
-      for( resRow = resStartRow; resRow <= resEndRow; resRow++ )
-       {
-         leftOffset  = resRow * inpStride;//left & right inp strides always same
-         rightOffset = resCol * inpStride;// because right is transposed
-         result = 0;
-         for( vec = startVec; vec <= endVec; vec++ )
-          {
-            result +=
-               leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
-          }
-
-         resArray[ resRow * resStride + resCol ] += result;
-       }
-    }
- }
-
-
-
-
-/*Reuse this in divider when do the sequential multiply case
- */
-void inline
-copyTranspose( int32 numRows, int32 numCols,
-               int32 origStartRow, int32 origStartCol, int32 origStride,
-               float32 *subArray, float32 *origArray )
- { int32 stride = numRows;
- 
-   int row, col, origOffset;
-   for( row = 0; row < numRows; row++ )
-    {
-      origOffset = (row + origStartRow) * origStride + origStartCol;
-      for( col = 0; col < numCols; col++ )
-       {
-            //transpose means swap row & col -- traverse orig matrix normally
-            // but put into reversed place in local array -- means the
-            // stride is the numRows now, so col * numRows + row
-         subArray[ col * stride + row ]  =  origArray[ origOffset + col ];
-       }
-    }
- }
-
-void inline
-copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
- { int numCols, numRows, origStartRow, origStartCol, origStride;
-   Matrix *origMatrix;
-   float32 *origArray, *subArray;
-
-   VPThread__start_data_singleton( &(subMatrix->copyTransSingleton), animPr );
-
-   origMatrix   = subMatrix->origMatrix;
-   origArray    = origMatrix->array;
-   numCols      = subMatrix->numCols;
-   numRows      = subMatrix->numRows;
-   origStartRow = subMatrix->origStartRow;
-   origStartCol = subMatrix->origStartCol;
-   origStride   = origMatrix->numCols;
-
-   subArray    = VPThread__malloc( (size_t)numRows * 
-                                        (size_t)numCols *sizeof(float32),animPr);
-   subMatrix->array = subArray;
-
-      //copy values from orig matrix to local
-   copyTranspose( numRows, numCols,
-                  origStartRow, origStartCol, origStride,
-                  subArray, origArray );
-
-   VPThread__end_data_singleton( &(subMatrix->copyTransSingleton), animPr );
-
- }
-
-
-void inline
-copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
- { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
-   Matrix *origMatrix;
-   float32 *origArray, *subArray;
-
-
-      //This lets only a single VP execute the code between start and
-      // end -- using start and end so that work runs outside the master.
-      //If a second VP ever executes the start, it will be returned
-      // from the end-point.  If its execution starts after another but before
-      // that other has finished, this one will remain suspended until the
-      // other finishes, then be resumed from the end-point.
-   VPThread__start_data_singleton( &(subMatrix->copySingleton), animPr );
-
-
-   origMatrix    = subMatrix->origMatrix;
-   origArray     = origMatrix->array;
-   numCols       = subMatrix->numCols;
-   numRows       = subMatrix->numRows;
-   origStartRow  = subMatrix->origStartRow;
-   origStartCol  = subMatrix->origStartCol;
-   origStride    = origMatrix->numCols;
-
-   subArray    = VPThread__malloc( (size_t)numRows * 
-                                        (size_t)numCols *sizeof(float32),animPr);
-   subMatrix->array = subArray;
-
-      //copy values from orig matrix to local
-   stride        = numCols;
-
-   int row, col, offset, origOffset;
-   for( row = 0; row < numRows; row++ )
-    {
-      offset     = row * stride;
-      origOffset = (row + origStartRow) * origStride + origStartCol;
-      for( col = 0; col < numCols; col++ )
-       {
-         subArray[ offset + col ]  =  origArray[ origOffset + col ];
-       }
-    }
-
-   VPThread__end_data_singleton( &(subMatrix->copySingleton), animPr );
- }
diff -r f8d3f16e70c8 -r 69928a38d5af src/Application/main.c
--- a/src/Application/main.c	Fri Sep 16 20:12:34 2011 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,50 +0,0 @@
-/*
- *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
- *  Licensed under GNU General Public License version 2
- *
- * author seanhalle@yahoo.com
- */
-
-#include <malloc.h>
-#include <stdlib.h>
-
-#include "Matrix_Mult.h"
-#include "VPThread__Matrix_Mult/VPThread__Matrix_Mult.h"
-
-char __ProgrammName[] = "Blocked Matrix Multiply";
-char __DataSet[255];
-
-void printParams(ParamBag *paramBag)
-{
-    snprintf((char*)&__DataSet, 255,
-                "#\tLeft Matrix %d x %d,\n#\tRight Matrix %d x %d",
-                getParamFromBag("leftMatrixRows", paramBag)->intValue,
-                getParamFromBag("leftMatrixCols", paramBag)->intValue,
-                getParamFromBag("rightMatrixRows", paramBag)->intValue,
-                getParamFromBag("rightMatrixCols", paramBag)->intValue);
-}
-
-/**
- *Matrix multiply program written using VMS_HW piggy-back language
- * 
- */
-int main( int argc, char **argv )
- { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
-   ParamBag    *paramBag;
-   
-   printf( "arguments: %s | %s\n", argv[0], argv[1] );
-
-   paramBag = makeParamBag();
-   readParamFileIntoBag( argv[1], paramBag );
-   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
-   printParams(paramBag);
-   
-   resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix );
-
-   printf("\nresult matrix: \n");
-   printMatrix( resultMatrix );
-   
-//   SSR__print_stats();
-   
-   exit(0); //cleans up
- }