# HG changeset patch
# User Me
# Date 1288742450 25200
# Node ID 4e14e2663af9065983d3817bdf62480829f6428b
# Parent  f33a9cba5d890cd27ac0cd5d6cdad1839568f102
Fixed concurrency bug -- added singleton to SSR -- works!  3.4x speedup

diff -r f33a9cba5d89 -r 4e14e2663af9 src/Application/SSR_Matrix_Mult/Divide_Pr.c
--- a/src/Application/SSR_Matrix_Mult/Divide_Pr.c	Thu Oct 14 17:10:17 2010 -0700
+++ b/src/Application/SSR_Matrix_Mult/Divide_Pr.c	Tue Nov 02 17:00:50 2010 -0700
@@ -18,20 +18,28 @@
 #define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000
 
 
-int
-measureMatrixMultPrimitive();
-
+//===========================================================================
+int inline
+measureMatrixMultPrimitive( VirtProcr *animPr );
 
 SlicingStrucCarrier *
-calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix );
+calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
+                                 VirtProcr *animPr );
 
 SlicingStruc *
-sliceUpDimension( float32 idealSizeOfPiece, int startVal, int endVal );
+sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
+                  VirtProcr *animPr );
+
+void
+freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr );
 
 SubMatrix **
 createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
-                   Matrix *origMatrix );
+                   int32 numUses, Matrix *origMatrix, VirtProcr *animPr );
 
+void
+freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
+                 SubMatrix **subMatrices, VirtProcr *animPr );
 
 void
 pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
@@ -105,44 +113,59 @@
  */
 
 void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
-                                        VirtProcr *animatingPr )
+                                        VirtProcr *animPr )
  { VirtProcr       *resultPr;
    DividerParams   *dividerParams;
    ResultsParams   *resultsParams;
    Matrix          *leftMatrix, *rightMatrix, *resultMatrix;
    void            *msg;
    SlicingStrucCarrier *slicingStrucCarrier;
-   float32         *resultArray; //points to array to be put inside result
-                                 // matrix
+   float32         *resultArray; //points to array inside result matrix
    
-         PRINT_DEBUG("start divide\n")
+         DEBUG("start divide\n")
 
+         int32
+         divideProbe = VMS__create_single_interval_probe( "divideProbe",
+                                                          animPr );
+         VMS__record_sched_choice_into_probe( divideProbe, animPr );
+         VMS__record_interval_start_in_probe( divideProbe );
 
    //=========== Setup -- make local copies of ptd-to-things, malloc, aso
+   int32 numResRows, numResCols, vectLength;
 
    dividerParams   = (DividerParams *)_dividerParams;
    
    leftMatrix      = dividerParams->leftMatrix;
    rightMatrix     = dividerParams->rightMatrix;
 
+   vectLength = leftMatrix->numCols;
+   numResRows = leftMatrix->numRows;
+   numResCols = rightMatrix->numCols;
+   resultArray     = dividerParams->resultMatrix->array;
 
    //==============  Do either sequential mult or do division ==============
 
       //Check if input matrices too small -- if yes, just do sequential
-   if( leftMatrix->numRows * leftMatrix->numCols * rightMatrix->numCols
-       < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) //curoff is determined by overhead
-       // of this divider -- relatively machine-independent
-    { int32 vectLength, numResRows, numResCols;
+      //Cutoff is determined by overhead of this divider -- relatively
+      // machine-independent
+   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
+       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
+    {
+      //====== Do sequential multiply on a single core
+            DEBUG("doing sequential")
 
-      //====== Do sequential multiply on a single core
+      //have to transpose the right matrix first
+      float32 *
+      transRightArray  = SSR__malloc_to( rightMatrix->numRows *
+                                         rightMatrix->numCols *
+                                         sizeof(float32),        animPr );
 
-      vectLength = leftMatrix->numCols;
-      numResRows = leftMatrix->numRows;
-      numResCols = rightMatrix->numCols;
-
-      resultArray = malloc( numResRows * numResCols * sizeof(float32) );
-
-      multiplyMatrixArrays( vectLength, numResRows, numResCols,
+         //copy right matrix into local, transposed; orig stride is numCols
+      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
+                     0, 0, rightMatrix->numCols,
+                     transRightArray, rightMatrix->array );
+         //multiply wants the transposed copy, not the original right array
+      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
-                            leftMatrix->array, rightMatrix->array,
+                            leftMatrix->array, transRightArray,
                             resultArray );
     }
@@ -155,65 +178,62 @@
          //The ideal size is the one takes the number of cycles to calculate
          // such that calc time is equal or greater than min work-unit size
       slicingStrucCarrier =
-         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix );
+         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr );
 
          //Make the results processor, now that know how many to wait for
-      resultsParams = SSR__malloc_size_to(sizeof(ResultsParams),animatingPr);
-      resultsParams->dividerPr = animatingPr;
+      resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr );
       resultsParams->numSubMatrixPairs  =
          slicingStrucCarrier->leftRowSlices->numVals *
          slicingStrucCarrier->rightColSlices->numVals *
          slicingStrucCarrier->vecSlices->numVals;
-      resultsParams->numCols   = rightMatrix->numCols;
-      resultsParams->numRows   = leftMatrix->numRows;
+      resultsParams->dividerPr   = animPr;
+      resultsParams->numCols     = rightMatrix->numCols;
+      resultsParams->numRows     = leftMatrix->numRows;
+      resultsParams->resultArray = resultArray;
+
 
       resultPr =
-         SSR__create_procr_with( &gatherResults, resultsParams, animatingPr);
+         SSR__create_procr_with( &gatherResults, resultsParams, animPr);
 
          //Make the sub-matrices, and pair them up, and make processor to
          // calc product of each pair.
       makeSubMatricesAndProcrs( leftMatrix, rightMatrix,
                                     slicingStrucCarrier,
-                                    resultPr, animatingPr);
+                                    resultPr, animPr);
  
-         //Get result from result procr
-      msg = SSR__receive_from_to( resultPr, animatingPr );
-      resultArray = (float32 *) msg;
-    }
+         //result array is allocated externally, so no message from resultPr
+         // however, do have to wait before printing out stats, so wait
+         // for an empty handshake message
+      msg = SSR__receive_from_to( resultPr, animPr );
+   }
 
 
    //===============  Work done -- send results back =================
 
 
-      //prepare results to persist outside of SSR when return from entry pt
-      //The results of the all the work have to be linked-to from the data
-      // struc given to the seed procr -- this divide func is animated by
-      // that seed procr, so have to link results to the _dividerParams.
-   resultMatrix            = SSR__malloc_size_to(sizeof(Matrix),animatingPr);
-   resultMatrix->array     = resultArray;
-   resultMatrix->numCols   = rightMatrix->numCols;
-   resultMatrix->numRows   = leftMatrix->numRows;
+         DEBUG_MSG( dbgAppFlow, "end divide\n")
 
+         VMS__record_interval_end_in_probe( divideProbe );
+         VMS__print_stats_of_all_probes();
 
-   dividerParams->resultMatrix   = resultMatrix;
-   SSR__transfer_ownership_to_outside( msg ); //so not freed
-   SSR__transfer_ownership_to_outside( resultMatrix );
+      //Nothing left to do, so dissipate.  SSR delays shutdown, and hence
+      // making results available to the outside, until all the processors
+      // have dissipated -- so no need to wait for the results processor
 
-         PRINT_DEBUG("end divide\n")
-
-   SSR__dissipate_procr( animatingPr );  //all procrs dissipate self at end
+   SSR__dissipate_procr( animPr );  //all procrs dissipate self at end
       //when all of the processors have dissipated, the "create seed and do
       // work" call in the entry point function returns
  }
 
 
 SlicingStrucCarrier *
-calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix )
-{
+calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
+                                 VirtProcr *animPr )
+ {
    float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
    SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
    SlicingStrucCarrier *slicingStrucCarrier =
-                                         malloc(sizeof(SlicingStrucCarrier));
+                         SSR__malloc_to(sizeof(SlicingStrucCarrier), animPr);
 
    int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
    float64 numPrimitiveOpsInMinWorkUnit;
@@ -226,7 +246,7 @@
    minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 );
 
       //ask SSR for number of cycles of the "primitive" op of matrix mult
-   primitiveCycles = measureMatrixMultPrimitive();
+   primitiveCycles = measureMatrixMultPrimitive( animPr );
 
    numPrimitiveOpsInMinWorkUnit =
       (float64)minWorkUnitCycles / (float64)primitiveCycles;
@@ -238,6 +258,7 @@
    idealNumWorkUnits = SSR__giveIdealNumWorkUnits();
    
    idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
+   idealSizeOfSide2 *= 0.6; //finer granularity to help load balance
 
    if( idealSizeOfSide1 > idealSizeOfSide2 )
       idealSizeOfSide = idealSizeOfSide1;
@@ -261,41 +282,47 @@
    endRightCol   = rightMatrix->numCols -1;
 
    leftRowSlices =
-      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow );
+      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow, animPr );
 
    vecSlices =
-      sliceUpDimension( idealSizeOfSide,  startVec, endVec );
+      sliceUpDimension( idealSizeOfSide,  startVec, endVec, animPr );
 
    rightColSlices =
-      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol );
+      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol,animPr);
 
    slicingStrucCarrier->leftRowSlices  = leftRowSlices;
    slicingStrucCarrier->vecSlices      = vecSlices;
    slicingStrucCarrier->rightColSlices = rightColSlices;
 
    return slicingStrucCarrier;
-}
+ }
 
 
 void
 makeSubMatricesAndProcrs( Matrix    *leftMatrix, Matrix    *rightMatrix,
             SlicingStrucCarrier *slicingStrucCarrier,
-            VirtProcr *resultPr,   VirtProcr *animatingPr )
+            VirtProcr *resultPr,   VirtProcr *animPr )
  {
    SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
    
    leftRowSlices  = slicingStrucCarrier->leftRowSlices;
    vecSlices      = slicingStrucCarrier->vecSlices;
    rightColSlices = slicingStrucCarrier->rightColSlices;
+   SSR__free( slicingStrucCarrier, animPr );
    
    //================  Make sub-matrices, given the slicing  ================
    SubMatrix **leftSubMatrices, **rightSubMatrices;
    leftSubMatrices =
-      createSubMatrices( leftRowSlices, vecSlices,
-                         leftMatrix );
+      createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals,
+                         leftMatrix, animPr );
+   //double_check_that_always_numRows_in_right_same_as_numCols_in_left();
    rightSubMatrices =
-      createSubMatrices( vecSlices, rightColSlices,
-                         rightMatrix );
+      createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals,
+                         rightMatrix, animPr );
+
+   freeSlicingStruc( leftRowSlices, animPr );
+   freeSlicingStruc( vecSlices, animPr );
+   freeSlicingStruc( rightColSlices, animPr );
 
    //==============  pair the sub-matrices and make processors ==============
    int32 numRowIdxs, numColIdxs, numVecIdxs;
@@ -308,7 +335,7 @@
                                        numRowIdxs, numColIdxs,
                                        numVecIdxs,
                                        resultPr,
-                                       animatingPr );
+                                       animPr );
  }
 
 
@@ -326,21 +353,30 @@
    int32 numLeftColIdxs, numRightColIdxs;
    int32 leftRowIdxOffset;
    SMPairParams *subMatrixPairParams;
+   float32 numToPutOntoEachCore, leftOverFraction;
+   int32 numCores, coreToScheduleOnto, numVecOnCurrCore;
 
    numLeftColIdxs  = numColIdxs;
    numRightColIdxs = numVecIdxs;
 
+   numCores = SSR__give_number_of_cores_to_schedule_onto();
+
+   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
+   leftOverFraction = 0;
+   numVecOnCurrCore = 0;
+   coreToScheduleOnto = 0;
+
    for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
     {
       leftRowIdxOffset = resRowIdx * numLeftColIdxs;
 
       for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
        {
-
+         
          for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
           {
                //Make the processor for the pair of sub-matrices
-            subMatrixPairParams  = SSR__malloc_size_to(sizeof(SMPairParams),
+            subMatrixPairParams  = SSR__malloc_to( sizeof(SMPairParams),
                                                                animatingPr);
             subMatrixPairParams->leftSubMatrix  =
                leftSubMatrices[ leftRowIdxOffset + vecIdx ];
@@ -350,9 +386,36 @@
 
             subMatrixPairParams->resultPr = resultPr;
 
-            SSR__create_procr_with( &calcSubMatrixProduct,
-                                    subMatrixPairParams,
-                                    animatingPr );
+               //put all pairs from the same vector onto same core
+            SSR__create_procr_with_affinity( &calcSubMatrixProduct,
+                                             subMatrixPairParams,
+                                             animatingPr,
+                                             coreToScheduleOnto );
+          }
+
+            //Trying to distribute the subMatrix-vectors across the cores, so
+            // that each core gets the same number of vectors, with a max
+            // imbalance of 1 vector more on some cores than others
+         numVecOnCurrCore += 1;
+         if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 )
+          {
+               //deal with fractional part, to ensure that imbalance is 1 max
+               // IE, core with most has only 1 more than core with least
+            leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore;
+            if( leftOverFraction >= 1 )
+             { leftOverFraction -= 1;
+               numVecOnCurrCore = -1;
+             }
+            else
+             { numVecOnCurrCore = 0;
+             }
+               //Move to next core, max core-value to incr to is numCores -1
+            if( coreToScheduleOnto >= numCores -1 )
+             { coreToScheduleOnto = 0;
+             }
+            else
+             { coreToScheduleOnto += 1;
+             }
           }
        }
     }
@@ -365,7 +428,7 @@
  */
 SubMatrix **
 createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
-                   Matrix *origMatrix )
+                   int32 numUses, Matrix *origMatrix, VirtProcr *animPr )
  {
    int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
    int32 startRow, endRow, startCol, endCol;
@@ -379,7 +442,8 @@
    rowStartVals = rowSlices->startVals;
    colStartVals = colSlices->startVals;
 
-   subMatrices = malloc( numRowIdxs * numColIdxs * sizeof(SubMatrix *) );
+   subMatrices = SSR__malloc_to(numRowIdxs * numColIdxs * sizeof(SubMatrix*),
+                                 animPr );
 
    for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
     {
@@ -394,13 +458,14 @@
          startCol = colStartVals[colIdx];
          endCol   = colStartVals[colIdx + 1] -1;
 
-         newSubMatrix = malloc( sizeof(SubMatrix) );
+         newSubMatrix = SSR__malloc_to( sizeof(SubMatrix), animPr );
          newSubMatrix->numRows       = endRow - startRow +1;
          newSubMatrix->numCols       = endCol - startCol +1;
          newSubMatrix->origMatrix    = origMatrix;
          newSubMatrix->origStartRow  = startRow;
          newSubMatrix->origStartCol  = startCol;
          newSubMatrix->alreadyCopied = FALSE;
+         newSubMatrix->numUsesLeft   = numUses; //can free after this many
 
          subMatrices[ rowOffset + colIdx ] = newSubMatrix;
        }
@@ -409,18 +474,43 @@
  }
 
 
+void
+freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
+                 SubMatrix **subMatrices, VirtProcr *animPr )
+ {    //Frees each SubMatrix in the grid, then the array of pointers itself
+   int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset;
+   SubMatrix *subMatrix;
+      //grid dimensions are the numVals fields of the two slicing structs
+   numRowIdxs = rowSlices->numVals;
+   numColIdxs = colSlices->numVals;
+      //pointer array is indexed row-major: rowIdx * numColIdxs + colIdx
+   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
+    {
+      rowOffset = rowIdx * numColIdxs;
+      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
+       {
+         subMatrix = subMatrices[ rowOffset + colIdx ];
+         if( subMatrix->alreadyCopied )  //data array is freed only if flagged
+            SSR__free( subMatrix->array, animPr );
+         SSR__free( subMatrix, animPr );
+       }
+    }
+   SSR__free( subMatrices, animPr );
+ }
+
 
 
 SlicingStruc *
-sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal )
+sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
+                  VirtProcr *animPr )
  { float32 residualAcc = 0;
    int     numSlices, i, *startVals, sizeOfSlice, endCondition;
-   SlicingStruc *slicingStruc = malloc( sizeof(SlicingStruc) );
+   SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr);
 
       //calc size of matrix need to hold start vals --
    numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
 
-   startVals = malloc( (numSlices + 1) * sizeof(int32) );
+   startVals = SSR__malloc_to( (numSlices + 1) * sizeof(int32), animPr );
 
       //Calc the upper limit of start value -- when get above this, end loop
       // by saving highest value of the matrix dimension to access, plus 1
@@ -451,17 +541,24 @@
    return slicingStruc;
  }
 
+void
+freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr )
+ {    //release the startVals array first, then the struct that points to it
+   SSR__free( slicingStruc->startVals, animPr );
+   SSR__free( slicingStruc, animPr );
+ }
+
 
 int inline
-measureMatrixMultPrimitive()
+measureMatrixMultPrimitive( VirtProcr *animPr )
  {
    int r, c, v, numCycles;
    float32 *res, *left, *right;
 
       //setup inputs
-   left  = malloc( 5 * 5 * sizeof( float32 ) );
-   right = malloc( 5 * 5 * sizeof( float32 ) );
-   res   = malloc( 5 * 5 * sizeof( float32 ) );
+   left  = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
+   right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
+   res   = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
 
    for( r = 0; r < 5; r++ )
     {
@@ -485,8 +582,11 @@
        }
     }
    numCycles =
-      SSR__end_primitive_and_give_cycles(); 
+      SSR__end_primitive_and_give_cycles();
+
+   SSR__free( left, animPr );
+   SSR__free( right, animPr );
+   SSR__free( res, animPr );
 
    return numCycles;
  }
-
diff -r f33a9cba5d89 -r 4e14e2663af9 src/Application/SSR_Matrix_Mult/EntryPoint.c
--- a/src/Application/SSR_Matrix_Mult/EntryPoint.c	Thu Oct 14 17:10:17 2010 -0700
+++ b/src/Application/SSR_Matrix_Mult/EntryPoint.c	Tue Nov 02 17:00:50 2010 -0700
@@ -30,6 +30,7 @@
 multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
  { Matrix          *resMatrix;
    DividerParams   *dividerParams;
+   int32            numResRows, numResCols;
 
 
    dividerParams              = malloc( sizeof( DividerParams ) );
@@ -37,13 +38,25 @@
    dividerParams->rightMatrix = rightMatrix;
 
 
+   numResRows  = leftMatrix->numRows;
+   numResCols  = rightMatrix->numCols;
+
+      //VMS has its own separate internal malloc, so to get results out,
+      // have to pass in empty array for it to fill up
+      //The alternative is internally telling SSR make external space to use
+   resMatrix            = malloc( sizeof(Matrix) );
+   resMatrix->array     = malloc( numResRows * numResCols * sizeof(float32));
+   resMatrix->numCols   = rightMatrix->numCols;
+   resMatrix->numRows   = leftMatrix->numRows;
+
+
+   dividerParams->resultMatrix   = resMatrix;
+
       //create divider processor, start doing the work, and wait till done
       //This function is the "border crossing" between normal code and SSR
    SSR__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs,
                                        dividerParams );
    
-      //get result matrix and return it
-   resMatrix = dividerParams->resultMatrix;
    free( dividerParams );
    return resMatrix;
  }
diff -r f33a9cba5d89 -r 4e14e2663af9 src/Application/SSR_Matrix_Mult/Result_Pr.c
--- a/src/Application/SSR_Matrix_Mult/Result_Pr.c	Thu Oct 14 17:10:17 2010 -0700
+++ b/src/Application/SSR_Matrix_Mult/Result_Pr.c	Tue Nov 02 17:00:50 2010 -0700
@@ -8,6 +8,7 @@
 
 #include "SSR_Matrix_Mult.h"
 
+//=====================
 void inline
 accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
                   int32    startRow,
@@ -16,6 +17,7 @@
                   int32    numCols,
                   int32    numOrigCols );
 
+//===========================================================================
 
 /*The Result Processor gets a message from each of the vector processors,
  * puts the result from the message in its location in the result-
@@ -32,7 +34,7 @@
    void           *msg;
    SMPairParams   *resParams;
 
-         PRINT_DEBUG("start resultPr\n")
+         DEBUG("start resultPr\n")
          
    params    = (ResultsParams *)_params;
    dividerPr = params->dividerPr;
@@ -40,8 +42,7 @@
    numRows = params->numRows;
    numCols = params->numCols;
 
-   resultArray = SSR__malloc_size_to( numRows * numCols * sizeof(float32),
-                                       animatingPr );
+   resultArray = params->resultArray;
 
       //zero out the results array -- will be accumulating, so must start 0
    for( row = 0; row < numRows; row++ )
@@ -57,24 +58,45 @@
       msg = SSR__receive_type_to( RESULTS_MSG, animatingPr );
 
       resParams = (SMPairParams *)msg;
-      accumulateResult( resultArray, resParams->resultArray,
+      accumulateResult( resultArray, resParams->partialResultArray,
                         resParams->leftSubMatrix->origStartRow,
                         resParams->leftSubMatrix->numRows,
                         resParams->rightSubMatrix->origStartCol,
                         resParams->rightSubMatrix->numCols,
                         resParams->rightSubMatrix->origMatrix->numCols );
+
+      SSR__free( resParams->partialResultArray, animatingPr );
+      
+         //there is only one copy of results procr, so can update numUsesLeft
+         // without concurrency worries.  When zero, free the sub-matrix
+      resParams->leftSubMatrix->numUsesLeft -= 1;
+      if( resParams->leftSubMatrix->numUsesLeft == 0 )
+       {
+         SSR__free( resParams->leftSubMatrix->array, animatingPr );
+         SSR__free( resParams->leftSubMatrix, animatingPr );
+       }
+
+      resParams->rightSubMatrix->numUsesLeft -= 1;
+      if( resParams->rightSubMatrix->numUsesLeft == 0 )
+       {
+         SSR__free( resParams->rightSubMatrix->array, animatingPr );
+         SSR__free( resParams->rightSubMatrix, animatingPr );
+       }
+
+         //count of how many sub-matrix pairs accumulated so know when done
       count++;
     }
-      //if were real lang, would have auto-nested transfer -- but HelloWorld
-      // language, so have to transfer ownership of each allocated block of
-      // locations separately
-   SSR__transfer_ownership_of_from_to( resultArray, animatingPr, dividerPr );
-   SSR__send_from_to( resultArray, animatingPr, dividerPr );
+
+      //Done -- could just dissipate -- SSR will wait for all processors to
+      // dissipate before shutting down and making results available to the
+      // outside, so no need to stop the divider from dissipating, so no need
+      // to send a hand-shake message to it -- but it makes debugging easier
+   SSR__send_from_to( NULL, animatingPr, dividerPr );
    SSR__dissipate_procr( animatingPr );  //frees any data owned by procr
  }
 
 void inline
-accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
+accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray,
                   int32    startRow,
                   int32    numRows,
                   int32    startCol,
@@ -86,8 +108,8 @@
     {
       for( col = 0; col < numCols; col++ )
        {
-         resultArray[ (row + startRow) * numOrigCols + col + startCol ] +=
-            subMatrixResultArray[ row * numCols + col ];
+         resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] +=
+            subMatrixPairResultArray[ row * numCols + col ];
        }
     }
 
diff -r f33a9cba5d89 -r 4e14e2663af9 src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h
--- a/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h	Thu Oct 14 17:10:17 2010 -0700
+++ b/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h	Tue Nov 02 17:00:50 2010 -0700
@@ -17,8 +17,10 @@
 #define COLS_IN_BLOCK 32
 #define VEC_IN_BLOCK  32
 
+#define copyMatrixSingleton 1
+#define copyTransposeSingleton 2
 
-#define PRINT_DEBUG(msg) //printf(msg); fflush(stdin);
+#define DEBUG(msg) //printf(msg); fflush(stdin);
 
 //==============================  Structures  ==============================
 typedef struct
@@ -35,6 +37,7 @@
    int numRows;
    int numCols;
    int numSubMatrixPairs;
+   float32 *resultArray;
  }
 ResultsParams;
 
@@ -46,6 +49,7 @@
    int32    origStartRow;
    int32    origStartCol;
    int32    alreadyCopied;
+   int32    numUsesLeft; //have update via message to avoid multiple writers
    float32 *array;  //2D, but dynamically sized, so use addr arith
  }
 SubMatrix;
@@ -54,7 +58,7 @@
  { VirtProcr *resultPr;
    SubMatrix *leftSubMatrix;
    SubMatrix *rightSubMatrix;
-   float32   *resultArray;
+   float32   *partialResultArray;
  }
 SMPairParams;
 
diff -r f33a9cba5d89 -r 4e14e2663af9 src/Application/SSR_Matrix_Mult/subMatrix_Pr.c
--- a/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c	Thu Oct 14 17:10:17 2010 -0700
+++ b/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c	Tue Nov 02 17:00:50 2010 -0700
@@ -10,10 +10,10 @@
 
 
 void inline
-copyFromOrig( SubMatrix *subMatrix );
+copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
 
 void inline
-copyTransposeFromOrig( SubMatrix *subMatrix );
+copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
 
 void inline
 multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
@@ -24,7 +24,7 @@
                      int resStride, int inpStride );
 
 void inline
-multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols,
+multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols,
                       float32 *leftArray, float32 *rightArray,
                       float32 *resArray );
 
@@ -48,7 +48,7 @@
    float32        *leftArray,  *rightArray, *resArray;
    SubMatrix      *leftSubMatrix, *rightSubMatrix;
 
-         PRINT_DEBUG("start sub-matrix mult\n")
+         DEBUG("start sub-matrix mult\n")
 
    params         = (SMPairParams *)data;
    resultPr       = params->resultPr;
@@ -56,14 +56,15 @@
    rightSubMatrix = params->rightSubMatrix;
 
       //make sure the input sub-matrices have been copied out of orig
-   copyFromOrig( leftSubMatrix );
-   copyTransposeFromOrig( rightSubMatrix );
+      //do it here, inside sub-matrix pair to hopefully gain reuse in cache
+   copyFromOrig( leftSubMatrix, animatingPr );
+   copyTransposeFromOrig( rightSubMatrix, animatingPr );
    
    leftArray      = leftSubMatrix->array;
    rightArray     = rightSubMatrix->array;
 
-   resArray = malloc( leftSubMatrix->numRows * rightSubMatrix->numCols *
-                         sizeof( float32 ) );
+   resArray = SSR__malloc_to(leftSubMatrix->numRows * rightSubMatrix->numCols
+                             * sizeof( float32 ), animatingPr );
 
 
    int32 numResRows, numResCols, vectLength;
@@ -72,12 +73,12 @@
    numResRows = leftSubMatrix->numRows;
    numResCols = rightSubMatrix->numCols;
 
-   multiplyMatrixArrays( vectLength, numResRows, numResCols,
-                         leftArray, rightArray,
+   multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
+                         leftArray,  rightArray,
                          resArray );
 
    //send result to result processor
-   params->resultArray = resArray;
+   params->partialResultArray = resArray;
    SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr );
    SSR__dissipate_procr( animatingPr );
  }
@@ -95,7 +96,8 @@
  *
  */
 void inline
-multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols,
+multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
+                                int32 numResCols,
                       float32 *leftArray, float32 *rightArray,
                       float32 *resArray )
  {
@@ -172,29 +174,15 @@
     }
  }
 
+
+/*Reuse this in divider when do the sequential multiply case
+ */
 void inline
-copyTransposeFromOrig( SubMatrix *subMatrix )
- { int numCols, numRows, origStartRow, origStartCol, origStride, stride;
-   Matrix *origMatrix;
-   float32 *origArray, *subArray;
-
-   if( subMatrix->alreadyCopied ) return;
-
-   subMatrix->alreadyCopied = TRUE;
-
-   origMatrix   = subMatrix->origMatrix;
-   origArray     = origMatrix->array;
-   numCols      = subMatrix->numCols;
-   numRows      = subMatrix->numRows;
-   stride       = numRows;
-   origStartRow = subMatrix->origStartRow;
-   origStartCol = subMatrix->origStartCol;
-   origStride   = origMatrix->numCols;
-
-   subArray      = malloc( numRows * numCols * sizeof(float32) );
-   subMatrix->array = subArray;
-
-      //copy values from orig matrix to local
+copyTranspose( int32 numRows, int32 numCols,
+               int32 origStartRow, int32 origStartCol, int32 origStride,
+               float32 *subArray, float32 *origArray )
+ { int32 stride = numRows;
+ 
    int row, col, origOffset;
    for( row = 0; row < numRows; row++ )
     {
@@ -203,21 +191,60 @@
        {
             //transpose means swap row & col -- traverse orig matrix normally
             // but put into reversed place in local array -- means the
-            // stride is the num rows now, so col * numRows + row
+            // stride is the numRows now, so col * numRows + row
          subArray[ col * stride + row ]  =  origArray[ origOffset + col ];
-       }      
+       }
     }
  }
 
 void inline
-copyFromOrig( SubMatrix *subMatrix )
+copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
+ { int numCols, numRows, origStartRow, origStartCol, origStride;
+   Matrix *origMatrix;
+   float32 *origArray, *subArray;
+      //Transposed copy uses its own singleton ID, separate from plain copy
+   if( subMatrix->alreadyCopied ) return;
+   SSR__start_singleton( copyTransposeSingleton, &&EndOfTransSingleton,
+                         animPr );
+   origMatrix   = subMatrix->origMatrix;
+   origArray    = origMatrix->array;
+   numCols      = subMatrix->numCols;
+   numRows      = subMatrix->numRows;
+   origStartRow = subMatrix->origStartRow;
+   origStartCol = subMatrix->origStartCol;
+   origStride   = origMatrix->numCols;
+
+   subArray     = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr);
+   subMatrix->array = subArray;
+
+      //copy values from orig matrix to local
+   copyTranspose( numRows, numCols,
+                  origStartRow, origStartCol, origStride,
+                  subArray, origArray );
+
+   subMatrix->alreadyCopied = TRUE; //must be last thing before label
+   EndOfTransSingleton:
+   return;
+ }
+
+
+void inline
+copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
  { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
    Matrix *origMatrix;
    float32 *origArray, *subArray;
 
+
+      //This lets only a single VP execute the code between start and
+      // end -- using start and end so that work runs outside the master.
+      //Inside, if a second VP ever executes the start, it will be returned
+      // from the end-point.
+      //Note, for non-GCC, can add a second SSR call at the end, and inside
+      // that one, look at the stack at the return addr & save that in an
+      // array indexed by singletonID
    if( subMatrix->alreadyCopied ) return;
+   SSR__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton, animPr );
 
-   subMatrix->alreadyCopied = TRUE;
 
    origMatrix    = subMatrix->origMatrix;
    origArray     = origMatrix->array;
@@ -225,13 +252,14 @@
    numRows       = subMatrix->numRows;
    origStartRow  = subMatrix->origStartRow;
    origStartCol  = subMatrix->origStartCol;
-   stride        = numCols;
    origStride    = origMatrix->numCols;
 
-   subArray      = malloc( numRows * numCols * sizeof(float32) );
+   subArray     = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr);
    subMatrix->array = subArray;
 
       //copy values from orig matrix to local
+   stride        = numCols;
+
    int row, col, offset, origOffset;
    for( row = 0; row < numRows; row++ )
     {
@@ -242,4 +270,8 @@
          subArray[ offset + col ]  =  origArray[ origOffset + col ];
        }
     }
+
+   subMatrix->alreadyCopied = TRUE; //must be last thing before label
+   EndOfCopySingleton:
+   return;
  }