# HG changeset patch # User Me # Date 1289919771 -3600 # Node ID 133633d1c10f32526f25affc7bac6287652070f9 # Parent 8d14fe28a7828920362a23fbfde81a877651e403 First version modified from SSR copy -- not working yet diff -r 8d14fe28a782 -r 133633d1c10f src/Application/Matrix_Mult.h --- a/src/Application/Matrix_Mult.h Wed Nov 10 22:26:57 2010 -0800 +++ b/src/Application/Matrix_Mult.h Tue Nov 16 16:02:51 2010 +0100 @@ -10,7 +10,7 @@ #include #include -#include "../SSR_lib/VMS/VMS_primitive_data_types.h" +#include "../VPThread_lib/VMS/VMS_primitive_data_types.h" #include "ParamHelper/Param.h" //============================== Structures ============================== diff -r 8d14fe28a782 -r 133633d1c10f src/Application/VPThread__Matrix_Mult/Divide_Pr.c --- a/src/Application/VPThread__Matrix_Mult/Divide_Pr.c Wed Nov 10 22:26:57 2010 -0800 +++ b/src/Application/VPThread__Matrix_Mult/Divide_Pr.c Tue Nov 16 16:02:51 2010 +0100 @@ -7,7 +7,7 @@ */ -#include "SSR_Matrix_Mult.h" +#include "VPThread__Matrix_Mult.h" #include #include @@ -65,11 +65,11 @@ * processors, * then does a receive of a message from the result processor that gives * the divider ownership of the result matrix. - * Finally, the divider returns the result matrix out of the SSR system. + * Finally, the divider returns the result matrix out of the VPThread system. * * Divider chooses the size of sub-matrices via an algorithm that tries to * keep the minimum work above a threshold. 
The threshold is machine- - * dependent, so ask SSR for min work-unit time to get a + * dependent, so ask VPThread for min work-unit time to get a * given overhead * * Divide min work-unit cycles by measured-cycles for one matrix-cell @@ -114,21 +114,22 @@ */ void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, - VirtProcr *animPr ) + VirtProcr *animatingThd ) { VirtProcr *resultPr; DividerParams *dividerParams; ResultsParams *resultsParams; Matrix *leftMatrix, *rightMatrix, *resultMatrix; void *msg; SlicingStrucCarrier *slicingStrucCarrier; - float32 *resultArray; //points to array inside result matrix - + float32 *resultArray; //points to array inside result matrix + MatrixMultGlobals *globals; + DEBUG( dbgAppFlow, "start divide\n") int32 divideProbe = VMS__create_single_interval_probe( "divideProbe", - animPr ); - VMS__record_sched_choice_into_probe( divideProbe, animPr ); + animatingThd ); + VMS__record_sched_choice_into_probe( divideProbe, animatingThd ); VMS__record_interval_start_in_probe( divideProbe ); //=========== Setup -- make local copies of ptd-to-things, malloc, aso @@ -160,9 +161,9 @@ //transpose the right matrix float32 * - transRightArray = SSR__malloc_to( rightMatrix->numRows * - rightMatrix->numCols * sizeof(float32), - animPr ); + transRightArray = + VPThread__malloc( rightMatrix->numRows * rightMatrix->numCols * + sizeof(float32), animatingThd ); //copy values from orig matrix to local copyTranspose( rightMatrix->numRows, rightMatrix->numCols, @@ -182,34 +183,60 @@ //The ideal size is the one takes the number of cycles to calculate // such that calc time is equal or greater than min work-unit size slicingStrucCarrier = - calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr ); + calcIdealSizeAndSliceDimensions(leftMatrix,rightMatrix,animatingThd); //Make the results processor, now that know how many to wait for - resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr ); + resultsParams = VPThread__malloc( 
sizeof(ResultsParams), animatingThd ); resultsParams->numSubMatrixPairs = slicingStrucCarrier->leftRowSlices->numVals * slicingStrucCarrier->rightColSlices->numVals * slicingStrucCarrier->vecSlices->numVals; - resultsParams->dividerPr = animPr; + resultsParams->dividerPr = animatingThd; resultsParams->numCols = rightMatrix->numCols; resultsParams->numRows = leftMatrix->numRows; resultsParams->resultArray = resultArray; + //========== Set up global vars, including conds and mutexes ========== + globals = VMS__malloc( sizeof(MatrixMultGlobals) ); + VPThread__set_globals_to( globals ); - resultPr = - SSR__create_procr_with( &gatherResults, resultsParams, animPr); + globals->results_mutex = VPThread__make_mutex( animatingThd ); + globals->results_cond = VPThread__make_cond( globals->results_mutex, + animatingThd ); + globals->vector_mutex = VPThread__make_mutex( animatingThd ); + globals->vector_cond = VPThread__make_cond( globals->vector_mutex, + animatingThd ); + + globals->start_mutex = VPThread__make_mutex( animatingThd ); + globals->start_cond = VPThread__make_cond( globals->start_mutex, + animatingThd ); + //====================================================================== + + //get results-comm lock before create results-thd, to ensure it can't + // signal that results are available before this thd is waiting on cond + VPThread__mutex_lock( globals->results_mutex, animatingThd ); + + //also get the start lock & use to ensure no vector threads send a + // signal before the results thread is waiting on vector cond + VPThread__mutex_lock( globals->start_mutex, animatingThd ); + + + VPThread__create_thread( &gatherResults, resultsParams, animatingThd ); + + //Now wait for results thd to signal that it has vector lock + VPThread__cond_wait( globals->start_cond, animatingThd ); + VPThread__mutex_unlock( globals->start_mutex, animatingThd );//done w/lock + //Make the sub-matrices, and pair them up, and make processor to // calc product of each pair. 
makeSubMatricesAndProcrs( leftMatrix, rightMatrix, slicingStrucCarrier, - resultPr, animPr); + resultPr, animatingThd); - //result array is allocated externally, so no message from resultPr - // however, do have to wait before printing out stats, so wait - // for an empty handshake message - msg = SSR__receive_from_to( resultPr, animPr ); - } + //Wait for results thread to say results are good + VPThread__cond_wait( globals->results_cond, animatingThd ); + } //=============== Work done -- send results back ================= @@ -220,11 +247,13 @@ VMS__record_interval_end_in_probe( divideProbe ); VMS__print_stats_of_all_probes(); - //nothing left to do so dissipate, SSR will wait to shutdown and hence - // make results available to outside until all the processors have - // dissipated -- so no need to wait for results processor + //nothing left to do so dissipate, VPThread will wait to shutdown, + // making results available to outside, until all the processors have + // dissipated -- so actually no need to wait for results processor + //However, following the pattern, so done with comm, release lock + VPThread__mutex_unlock( globals->results_mutex, animatingThd ); - SSR__dissipate_procr( animPr ); //all procrs dissipate self at end + VPThread__dissipate_thread( animatingThd ); //all procrs dissipate self at end //when all of the processors have dissipated, the "create seed and do // work" call in the entry point function returns } @@ -237,7 +266,7 @@ float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; SlicingStrucCarrier *slicingStrucCarrier = - SSR__malloc_to(sizeof(SlicingStrucCarrier), animPr); + VPThread__malloc(sizeof(SlicingStrucCarrier), animPr); int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; float64 numPrimitiveOpsInMinWorkUnit; @@ -245,11 +274,11 @@ //======= Calc ideal size of min-sized sub-matrix ======== - //ask SSR for the number of cycles of the minimum work unit, at given + 
//ask VPThread for the number of cycles of the minimum work unit, at given // percent overhead then add a guess at overhead from this divider - minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 ); + minWorkUnitCycles = VPThread__giveMinWorkUnitCycles( .05 ); - //ask SSR for number of cycles of the "primitive" op of matrix mult + //ask VPThread for number of cycles of the "primitive" op of matrix mult primitiveCycles = measureMatrixMultPrimitive( animPr ); numPrimitiveOpsInMinWorkUnit = @@ -259,7 +288,7 @@ // then multiply by 5 because the primitive is 5x5 idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); - idealNumWorkUnits = SSR__giveIdealNumWorkUnits(); + idealNumWorkUnits = VPThread__giveIdealNumWorkUnits(); idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); idealSizeOfSide2 *= 0.6; //finer granularity to help load balance @@ -312,7 +341,7 @@ leftRowSlices = slicingStrucCarrier->leftRowSlices; vecSlices = slicingStrucCarrier->vecSlices; rightColSlices = slicingStrucCarrier->rightColSlices; - SSR__free( slicingStrucCarrier, animPr ); + VPThread__free( slicingStrucCarrier, animPr ); //================ Make sub-matrices, given the slicing ================ SubMatrix **leftSubMatrices, **rightSubMatrices; @@ -363,7 +392,7 @@ numLeftColIdxs = numColIdxs; numRightColIdxs = numVecIdxs; - numCores = SSR__give_number_of_cores_to_schedule_onto(); + numCores = VPThread__give_number_of_cores_to_schedule_onto(); numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; leftOverFraction = 0; @@ -380,7 +409,7 @@ for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) { //Make the processor for the pair of sub-matrices - subMatrixPairParams = SSR__malloc_to( sizeof(SMPairParams), + subMatrixPairParams = VPThread__malloc( sizeof(SMPairParams), animatingPr); subMatrixPairParams->leftSubMatrix = leftSubMatrices[ leftRowIdxOffset + vecIdx ]; @@ -388,10 +417,10 @@ subMatrixPairParams->rightSubMatrix = rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; 
- subMatrixPairParams->resultPr = resultPr; + subMatrixPairParams->resultPr = resultPr fix_this; //put all pairs from the same vector onto same core - SSR__create_procr_with_affinity( &calcSubMatrixProduct, + VPThread__create_thread_with_affinity( &calcSubMatrixProduct, subMatrixPairParams, animatingPr, coreToScheduleOnto ); @@ -424,7 +453,6 @@ } } - } @@ -447,7 +475,7 @@ rowStartVals = rowSlices->startVals; colStartVals = colSlices->startVals; - subMatrices = SSR__malloc_to(numRowIdxs * numColIdxs * sizeof(SubMatrix*), + subMatrices = VPThread__malloc(numRowIdxs * numColIdxs * sizeof(SubMatrix*), animPr ); for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) @@ -463,7 +491,7 @@ startCol = colStartVals[colIdx]; endCol = colStartVals[colIdx + 1] -1; - newSubMatrix = SSR__malloc_to( sizeof(SubMatrix), animPr ); + newSubMatrix = VPThread__malloc( sizeof(SubMatrix), animPr ); newSubMatrix->numRows = endRow - startRow +1; newSubMatrix->numCols = endCol - startCol +1; newSubMatrix->origMatrix = origMatrix; @@ -496,11 +524,11 @@ { subMatrix = subMatrices[ rowOffset + colIdx ]; if( subMatrix->alreadyCopied ) - SSR__free( subMatrix->array, animPr ); - SSR__free( subMatrix, animPr ); + VPThread__free( subMatrix->array, animPr ); + VPThread__free( subMatrix, animPr ); } } - SSR__free( subMatrices, animPr ); + VPThread__free( subMatrices, animPr ); } @@ -510,12 +538,12 @@ VirtProcr *animPr ) { float32 residualAcc = 0; int numSlices, i, *startVals, sizeOfSlice, endCondition; - SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr); + SlicingStruc *slicingStruc = VPThread__malloc(sizeof(SlicingStruc), animPr); //calc size of matrix need to hold start vals -- numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); - startVals = SSR__malloc_to( (numSlices + 1) * sizeof(int32), animPr ); + startVals = VPThread__malloc( (numSlices + 1) * sizeof(int32), animPr ); //Calc the upper limit of start value -- when get above this, end loop // by saving highest 
value of the matrix dimension to access, plus 1 @@ -549,8 +577,8 @@ void freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ) { - SSR__free( slicingStruc->startVals, animPr ); - SSR__free( slicingStruc, animPr ); + VPThread__free( slicingStruc->startVals, animPr ); + VPThread__free( slicingStruc, animPr ); } @@ -561,9 +589,9 @@ float32 *res, *left, *right; //setup inputs - left = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); - right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); - res = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); + left = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr ); + right = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr ); + res = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr ); for( r = 0; r < 5; r++ ) { @@ -575,7 +603,7 @@ } //do primitive - SSR__start_primitive(); //for now, just takes time stamp + VPThread__start_primitive(); //for now, just takes time stamp for( r = 0; r < 5; r++ ) { for( c = 0; c < 5; c++ ) @@ -587,11 +615,11 @@ } } numCycles = - SSR__end_primitive_and_give_cycles(); + VPThread__end_primitive_and_give_cycles(); - SSR__free( left, animPr ); - SSR__free( right, animPr ); - SSR__free( res, animPr ); + VPThread__free( left, animPr ); + VPThread__free( right, animPr ); + VPThread__free( res, animPr ); return numCycles; } diff -r 8d14fe28a782 -r 133633d1c10f src/Application/VPThread__Matrix_Mult/EntryPoint.c --- a/src/Application/VPThread__Matrix_Mult/EntryPoint.c Wed Nov 10 22:26:57 2010 -0800 +++ b/src/Application/VPThread__Matrix_Mult/EntryPoint.c Tue Nov 16 16:02:51 2010 +0100 @@ -8,7 +8,7 @@ #include -#include "SSR_Matrix_Mult.h" +#include "VPThread__Matrix_Mult.h" @@ -54,8 +54,8 @@ //create divider processor, start doing the work, and wait till done //This function is the "border crossing" between normal code and SSR - SSR__create_seed_procr_and_do_work( ÷WorkIntoSubMatrixPairProcrs, - dividerParams ); + 
VPThread__create_seed_procr_and_do_work(÷WorkIntoSubMatrixPairProcrs, + dividerParams ); free( dividerParams ); return resMatrix; diff -r 8d14fe28a782 -r 133633d1c10f src/Application/VPThread__Matrix_Mult/Result_Pr.c --- a/src/Application/VPThread__Matrix_Mult/Result_Pr.c Wed Nov 10 22:26:57 2010 -0800 +++ b/src/Application/VPThread__Matrix_Mult/Result_Pr.c Tue Nov 16 16:02:51 2010 +0100 @@ -6,7 +6,7 @@ * */ -#include "SSR_Matrix_Mult.h" +#include "VPThread__Matrix_Mult.h" //===================== void inline @@ -26,13 +26,30 @@ *After the count reaches the point that all results have been received, it * returns the result matrix and dissipates. */ -void gatherResults( void *_params, VirtProcr *animatingPr ) +void gatherResults( void *_params, VirtProcr *animatingThd ) { VirtProcr *dividerPr; ResultsParams *params; int row, col, numRows, numCols, numSubMatrixPairs, count=0; float32 *resultArray; void *msg; SMPairParams *resParams; + //====================== thread stuff ======================= + MatrixMultGlobals *globals =(MatrixMultGlobals *)VPThread__give_globals(); + + + //get vector-comm lock before loop, so that this thd keeps lock after + // one wait until it enters the next wait -- forces see-saw btwn + // waiters and signalers -- wait-signal-wait-signal-... 
+ VPThread__mutex_lock( globals->vector_mutex, animatingThd ); + + //Tell divider that have the vector lock -- so it's sure won't miss any + // signals from the vector-threads it's about to create + //Don't need a signal variable -- this thd can't be created until + // divider thd already has the start lock + VPThread__mutex_lock( globals->start_mutex, animatingThd );//finish wait + VPThread__cond_signal( globals->start_cond, animatingThd ); + VPThread__mutex_unlock( globals->start_mutex, animatingThd );//finish wait + //=========================================================== DEBUG( dbgAppFlow, "start resultPr\n") @@ -47,9 +64,16 @@ while( count < numSubMatrixPairs ) { - msg = SSR__receive_type_to( RESULTS_MSG, animatingPr ); + //receive a vector-result from a vector-thread + VPThread__cond_wait( globals->vector_cond, animatingThd ); - resParams = (SMPairParams *)msg; + //At this point, animating thread owns the vector lock, so all + // pairs trying to signal they have a result are waiting to get that + // lock -- only one gets it at a time, and when signal, this thd + // gets the lock and does the body of this loop, then when does the + // wait again, that releases the lock for next pair-thread to get it + resParams = globals->currSMPairParams; + accumulateResult( resultArray, resParams->partialResultArray, resParams->leftSubMatrix->origStartRow, resParams->leftSubMatrix->numRows, @@ -57,22 +81,22 @@ resParams->rightSubMatrix->numCols, resParams->rightSubMatrix->origMatrix->numCols ); - SSR__free( resParams->partialResultArray, animatingPr ); + VPThread__free( resParams->partialResultArray, animatingThd ); //there is only one copy of results procr, so can update numUsesLeft // without concurrency worries. 
When zero, free the sub-matrix resParams->leftSubMatrix->numUsesLeft -= 1; if( resParams->leftSubMatrix->numUsesLeft == 0 ) { - SSR__free( resParams->leftSubMatrix->array, animatingPr ); - SSR__free( resParams->leftSubMatrix, animatingPr ); + VPThread__free( resParams->leftSubMatrix->array, animatingThd ); + VPThread__free( resParams->leftSubMatrix, animatingThd ); } resParams->rightSubMatrix->numUsesLeft -= 1; if( resParams->rightSubMatrix->numUsesLeft == 0 ) { - SSR__free( resParams->rightSubMatrix->array, animatingPr ); - SSR__free( resParams->rightSubMatrix, animatingPr ); + VPThread__free( resParams->rightSubMatrix->array, animatingThd ); + VPThread__free( resParams->rightSubMatrix, animatingThd ); } //count of how many sub-matrix pairs accumulated so know when done @@ -82,9 +106,20 @@ //Done -- could just dissipate -- SSR will wait for all processors to // dissipate before shutting down, and thereby making results avaial to // outside, so no need to stop the divider from dissipating, so no need - // to send a hand-shake message to it -- bug makes debug easier - SSR__send_from_to( NULL, animatingPr, dividerPr ); - SSR__dissipate_procr( animatingPr ); //frees any data owned by procr + // to send a hand-shake message to it -- but makes debug easier + //However, following pattern, so all comms done, release lock + VPThread__mutex_unlock( globals->vector_mutex, animatingThd ); + + //Send result to divider (seed) thread + // note, divider thd had to hold the results-comm lock before creating + // this thread, to be sure no race + VPThread__mutex_lock( globals->results_mutex, animatingThd ); + //globals->results = resultMatrixArray; + VPThread__cond_signal( globals->results_cond, animatingThd ); + VPThread__mutex_unlock( globals->results_mutex, animatingThd ); //releases + //divider thread from its wait, at point this executes + + VPThread__dissipate_thread( animatingThd ); //frees any data owned by procr } void inline diff -r 8d14fe28a782 -r 133633d1c10f 
src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h --- a/src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h Wed Nov 10 22:26:57 2010 -0800 +++ b/src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h Tue Nov 16 16:02:51 2010 +0100 @@ -8,7 +8,7 @@ #include -#include "../../SSR_lib/SSR.h" +#include "../../VPThread_lib/VPThread.h" #include "../Matrix_Mult.h" @@ -81,6 +81,29 @@ RESULTS_MSG = 1 }; + +typedef struct + { + //for communicating sub-matrix-pair results to results Thd + int32 vector_mutex; + int32 vector_cond; + SMPairParams *currSMPairParams; + + //for communicating results array back to seed (divider) Thd + int32 results_mutex; + int32 results_cond; + float32 *results; + + //for ensuring results thd has vector lock before making vector thds + int32 start_mutex; + int32 start_cond; + + Matrix *rightMatrix; + Matrix *resultMatrix; + } +MatrixMultGlobals; + + //============================= Processor Functions ========================= void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr ); void calcSubMatrixProduct( void *data, VirtProcr *animatingPr ); diff -r 8d14fe28a782 -r 133633d1c10f src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c --- a/src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c Wed Nov 10 22:26:57 2010 -0800 +++ b/src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c Tue Nov 16 16:02:51 2010 +0100 @@ -8,7 +8,7 @@ #include -#include "SSR_Matrix_Mult.h" +#include "VPThread__Matrix_Mult.h" @@ -50,6 +50,7 @@ VirtProcr *resultPr; float32 *leftArray, *rightArray, *resArray; SubMatrix *leftSubMatrix, *rightSubMatrix; + MatrixMultGlobals *globals =(MatrixMultGlobals *)VPThread__give_globals(); DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID) int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx", @@ -72,7 +73,7 @@ int32 resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); - resArray = SSR__malloc_to( resSize, animatingPr ); + resArray = 
VPThread__malloc( resSize, animatingPr ); memset( resArray, 0, resSize ); @@ -91,8 +92,15 @@ VMS__record_interval_end_in_probe( subMatrixProbe ); - SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr ); - SSR__dissipate_procr( animatingPr ); + //Send result to results thread + //This pattern works 'cause only get lock when results thd inside wait + VPThread__mutex_lock( globals->vector_mutex, animatingPr ); + globals->currSMPairParams = params; + VPThread__cond_signal( globals->vector_cond, animatingPr ); + VPThread__mutex_unlock( globals->vector_mutex, animatingPr );//release + //wait-er -- cond_signal implemented such that wait-er gets lock, no other + + VPThread__dissipate_thread( animatingPr ); } @@ -226,7 +234,7 @@ float32 *origArray, *subArray; if( subMatrix->alreadyCopied ) return; - SSR__start_singleton( copyMatrixSingleton, &&EndOfTransSingleton, animPr); + VPThread__start_singleton( copyTransposeSingleton, animPr); origMatrix = subMatrix->origMatrix; origArray = origMatrix->array; @@ -236,7 +244,7 @@ origStartCol = subMatrix->origStartCol; origStride = origMatrix->numCols; - subArray = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr); + subArray = VPThread__malloc( numRows * numCols *sizeof(float32),animPr); subMatrix->array = subArray; //copy values from orig matrix to local @@ -244,9 +252,8 @@ origStartRow, origStartCol, origStride, subArray, origArray ); - subMatrix->alreadyCopied = TRUE; //must be last thing before label - EndOfTransSingleton: - return; + VPThread__end_singleton( copyTransposeSingleton, animPr); + subMatrix->alreadyCopied = TRUE; //anywhere after singleton work finished } @@ -259,13 +266,12 @@ //This lets only a single VP execute the code between start and // end -- using start and end so that work runs outside the master. - //Inside, if a second VP ever executes the start, it will be returned - // from the end-point. 
- //Note, for non-GCC, can add a second SSR call at the end, and inside - // that one, look at the stack at the return addr & save that in an - // array indexed by singletonID - if( subMatrix->alreadyCopied ) return; - SSR__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton, animPr ); + //If a second VP ever executes the start, it will be returned + // from the end-point. If it executes the start after another but before + // that other has finished, this one will remain suspended until the + // other finishes, then be resumed from the end-point. + if( subMatrix->alreadyCopied ) return; //an optimization -- set below + VPThread__start_singleton( copyMatrixSingleton, animPr ); origMatrix = subMatrix->origMatrix; @@ -276,7 +282,7 @@ origStartCol = subMatrix->origStartCol; origStride = origMatrix->numCols; - subArray = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr); + subArray = VPThread__malloc( numRows * numCols *sizeof(float32),animPr); subMatrix->array = subArray; //copy values from orig matrix to local @@ -293,7 +299,6 @@ } } - subMatrix->alreadyCopied = TRUE; //must be last thing before label - EndOfCopySingleton: - return; + subMatrix->alreadyCopied = TRUE; //must be after singleton work finished + VPThread__end_singleton( copyMatrixSingleton, animPr ); }