changeset 1:133633d1c10f

First version modified from SSR copy -- not working yet
author Me
date Tue, 16 Nov 2010 16:02:51 +0100
parents 8d14fe28a782
children 46ceb3dd0f0a
files src/Application/Matrix_Mult.h src/Application/VPThread__Matrix_Mult/Divide_Pr.c src/Application/VPThread__Matrix_Mult/EntryPoint.c src/Application/VPThread__Matrix_Mult/Result_Pr.c src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c
diffstat 6 files changed, 182 insertions(+), 91 deletions(-) [+]
line diff
     1.1 --- a/src/Application/Matrix_Mult.h	Wed Nov 10 22:26:57 2010 -0800
     1.2 +++ b/src/Application/Matrix_Mult.h	Tue Nov 16 16:02:51 2010 +0100
     1.3 @@ -10,7 +10,7 @@
     1.4  #include <unistd.h>
     1.5  #include <malloc.h>
     1.6  
     1.7 -#include "../SSR_lib/VMS/VMS_primitive_data_types.h"
     1.8 +#include "../VPThread_lib/VMS/VMS_primitive_data_types.h"
     1.9  #include "ParamHelper/Param.h"
    1.10  
    1.11  //==============================  Structures  ==============================
     2.1 --- a/src/Application/VPThread__Matrix_Mult/Divide_Pr.c	Wed Nov 10 22:26:57 2010 -0800
     2.2 +++ b/src/Application/VPThread__Matrix_Mult/Divide_Pr.c	Tue Nov 16 16:02:51 2010 +0100
     2.3 @@ -7,7 +7,7 @@
     2.4   */
     2.5  
     2.6  
     2.7 -#include "SSR_Matrix_Mult.h"
     2.8 +#include "VPThread__Matrix_Mult.h"
     2.9  #include <math.h>
    2.10  #include <string.h>
    2.11  
    2.12 @@ -65,11 +65,11 @@
    2.13   *  processors,
    2.14   *  then does a receive of a message from the result processor that gives
    2.15   *  the divider ownership of the result matrix.
    2.16 - * Finally, the divider returns the result matrix out of the SSR system.
    2.17 + * Finally, the divider returns the result matrix out of the VPThread system.
    2.18   *
    2.19   * Divider chooses the size of sub-matrices via an algorithm that tries to
    2.20   *  keep the minimum work above a threshold.  The threshold is machine-
    2.21 - *  dependent, so ask SSR for min work-unit time to get a
    2.22 + *  dependent, so ask VPThread for min work-unit time to get a
    2.23   *  given overhead
    2.24   *
    2.25   * Divide min work-unit cycles by measured-cycles for one matrix-cell
    2.26 @@ -114,21 +114,22 @@
    2.27   */
    2.28  
    2.29  void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
    2.30 -                                        VirtProcr *animPr )
    2.31 +                                        VirtProcr *animatingThd )
    2.32   { VirtProcr       *resultPr;
    2.33     DividerParams   *dividerParams;
    2.34     ResultsParams   *resultsParams;
    2.35     Matrix          *leftMatrix, *rightMatrix, *resultMatrix;
    2.36     void            *msg;
    2.37     SlicingStrucCarrier *slicingStrucCarrier;
    2.38 -   float32         *resultArray; //points to array inside result matrix
    2.39 -   
    2.40 +   float32             *resultArray; //points to array inside result matrix
    2.41 +   MatrixMultGlobals   *globals;
    2.42 +  
    2.43           DEBUG( dbgAppFlow, "start divide\n")
    2.44  
    2.45           int32
    2.46           divideProbe = VMS__create_single_interval_probe( "divideProbe",
    2.47 -                                                          animPr );
    2.48 -         VMS__record_sched_choice_into_probe( divideProbe, animPr );
    2.49 +                                                          animatingThd );
    2.50 +         VMS__record_sched_choice_into_probe( divideProbe, animatingThd );
    2.51           VMS__record_interval_start_in_probe( divideProbe );
    2.52  
    2.53     //=========== Setup -- make local copies of ptd-to-things, malloc, aso
    2.54 @@ -160,9 +161,9 @@
    2.55              
    2.56           //transpose the right matrix
    2.57        float32 *
    2.58 -      transRightArray  = SSR__malloc_to( rightMatrix->numRows * 
    2.59 -                                         rightMatrix->numCols * sizeof(float32),
    2.60 -                                         animPr );
    2.61 +      transRightArray  = 
    2.62 +         VPThread__malloc( rightMatrix->numRows * rightMatrix->numCols *
    2.63 +                           sizeof(float32), animatingThd );
    2.64  
    2.65           //copy values from orig matrix to local
    2.66        copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
    2.67 @@ -182,34 +183,60 @@
    2.68           //The ideal size is the one takes the number of cycles to calculate
    2.69           // such that calc time is equal or greater than min work-unit size
    2.70        slicingStrucCarrier =
    2.71 -         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr );
    2.72 +         calcIdealSizeAndSliceDimensions(leftMatrix,rightMatrix,animatingThd);
    2.73  
    2.74           //Make the results processor, now that know how many to wait for
    2.75 -      resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr );
    2.76 +      resultsParams = VPThread__malloc( sizeof(ResultsParams), animatingThd );
    2.77        resultsParams->numSubMatrixPairs  =
    2.78           slicingStrucCarrier->leftRowSlices->numVals *
    2.79           slicingStrucCarrier->rightColSlices->numVals *
    2.80           slicingStrucCarrier->vecSlices->numVals;
    2.81 -      resultsParams->dividerPr   = animPr;
    2.82 +      resultsParams->dividerPr   = animatingThd;
    2.83        resultsParams->numCols     = rightMatrix->numCols;
    2.84        resultsParams->numRows     = leftMatrix->numRows;
    2.85        resultsParams->resultArray = resultArray;
    2.86  
    2.87 +      //==========  Set up global vars, including conds and mutexes ==========
    2.88 +      globals = VMS__malloc( sizeof(MatrixMultGlobals) );
    2.89 +      VPThread__set_globals_to( globals );
    2.90  
    2.91 -      resultPr =
    2.92 -         SSR__create_procr_with( &gatherResults, resultsParams, animPr);
    2.93 +      globals->results_mutex = VPThread__make_mutex( animatingThd );
    2.94 +      globals->results_cond  = VPThread__make_cond( globals->results_mutex,
    2.95 +                                                               animatingThd );
    2.96  
    2.97 +      globals->vector_mutex = VPThread__make_mutex( animatingThd );
    2.98 +      globals->vector_cond  = VPThread__make_cond( globals->vector_mutex,
    2.99 +                                                               animatingThd );
   2.100 +
   2.101 +      globals->start_mutex = VPThread__make_mutex( animatingThd );
   2.102 +      globals->start_cond  = VPThread__make_cond( globals->start_mutex,
   2.103 +                                                               animatingThd );
   2.104 +      //======================================================================
   2.105 +
   2.106 +         //get results-comm lock before create results-thd, to ensure it can't
   2.107 +         // signal that results are available before this thd is waiting on cond
   2.108 +      VPThread__mutex_lock( globals->results_mutex, animatingThd );
   2.109 +
   2.110 +         //also get the start lock & use to ensure no vector threads send a
   2.111 +         // signal before the results thread is waiting on vector cond
   2.112 +      VPThread__mutex_lock( globals->start_mutex, animatingThd );
   2.113 +
   2.114 +
   2.115 +      VPThread__create_thread( &gatherResults, resultsParams, animatingThd );
   2.116 +
   2.117 +         //Now wait for results thd to signal that it has vector lock
   2.118 +      VPThread__cond_wait(  globals->start_cond,  animatingThd );
   2.119 +      VPThread__mutex_unlock( globals->start_mutex, animatingThd );//done w/lock
   2.120 +   
   2.121           //Make the sub-matrices, and pair them up, and make processor to
   2.122           // calc product of each pair.
   2.123        makeSubMatricesAndProcrs( leftMatrix, rightMatrix,
   2.124                                      slicingStrucCarrier,
   2.125 -                                    resultPr, animPr);
   2.126 +                                    resultPr, animatingThd); //FIXME: resultPr is never assigned -- its creation was removed above; pass via globals or drop param
   2.127   
   2.128 -         //result array is allocated externally, so no message from resultPr
   2.129 -         // however, do have to wait before printing out stats, so wait
   2.130 -         // for an empty handshake message
   2.131 -      msg = SSR__receive_from_to( resultPr, animPr );
   2.132 -   }
   2.133 +         //Wait for results thread to say results are good
   2.134 +      VPThread__cond_wait(  globals->results_cond,  animatingThd );
   2.135 +   }
   2.136  
   2.137  
   2.138     //===============  Work done -- send results back =================
   2.139 @@ -220,11 +247,13 @@
   2.140           VMS__record_interval_end_in_probe( divideProbe );
   2.141           VMS__print_stats_of_all_probes();
   2.142  
   2.143 -      //nothing left to do so dissipate, SSR will wait to shutdown and hence
   2.144 -      // make results available to outside until all the processors have
   2.145 -      // dissipated -- so no need to wait for results processor
   2.146 +      //nothing left to do so dissipate, VPThread will wait to shutdown,
   2.147 +      // making results available to outside, until all the processors have
   2.148 +      // dissipated -- so actually no need to wait for results processor
   2.149 +      //However, following the pattern, so done with comm, release lock
   2.150 +   VPThread__mutex_unlock( globals->results_mutex, animatingThd );
   2.151  
   2.152 -   SSR__dissipate_procr( animPr );  //all procrs dissipate self at end
   2.153 +   VPThread__dissipate_thread( animatingThd );  //all procrs dissipate self at end
   2.154        //when all of the processors have dissipated, the "create seed and do
   2.155        // work" call in the entry point function returns
   2.156   }
   2.157 @@ -237,7 +266,7 @@
   2.158     float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
   2.159     SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   2.160     SlicingStrucCarrier *slicingStrucCarrier =
   2.161 -                         SSR__malloc_to(sizeof(SlicingStrucCarrier), animPr);
   2.162 +                         VPThread__malloc(sizeof(SlicingStrucCarrier), animPr);
   2.163  
   2.164     int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
   2.165     float64 numPrimitiveOpsInMinWorkUnit;
   2.166 @@ -245,11 +274,11 @@
   2.167  
   2.168     //=======  Calc ideal size of min-sized sub-matrix  ========
   2.169  
   2.170 -      //ask SSR for the number of cycles of the minimum work unit, at given
   2.171 +      //ask VPThread for the number of cycles of the minimum work unit, at given
   2.172        // percent overhead then add a guess at overhead from this divider
   2.173 -   minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 );
   2.174 +   minWorkUnitCycles = VPThread__giveMinWorkUnitCycles( .05 );
   2.175  
   2.176 -      //ask SSR for number of cycles of the "primitive" op of matrix mult
   2.177 +      //ask VPThread for number of cycles of the "primitive" op of matrix mult
   2.178     primitiveCycles = measureMatrixMultPrimitive( animPr );
   2.179  
   2.180     numPrimitiveOpsInMinWorkUnit =
   2.181 @@ -259,7 +288,7 @@
   2.182        // then multiply by 5 because the primitive is 5x5
   2.183     idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit );
   2.184  
   2.185 -   idealNumWorkUnits = SSR__giveIdealNumWorkUnits();
   2.186 +   idealNumWorkUnits = VPThread__giveIdealNumWorkUnits();
   2.187     
   2.188     idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
   2.189     idealSizeOfSide2 *= 0.6; //finer granularity to help load balance
   2.190 @@ -312,7 +341,7 @@
   2.191     leftRowSlices  = slicingStrucCarrier->leftRowSlices;
   2.192     vecSlices      = slicingStrucCarrier->vecSlices;
   2.193     rightColSlices = slicingStrucCarrier->rightColSlices;
   2.194 -   SSR__free( slicingStrucCarrier, animPr );
   2.195 +   VPThread__free( slicingStrucCarrier, animPr );
   2.196     
   2.197     //================  Make sub-matrices, given the slicing  ================
   2.198     SubMatrix **leftSubMatrices, **rightSubMatrices;
   2.199 @@ -363,7 +392,7 @@
   2.200     numLeftColIdxs  = numColIdxs;
   2.201     numRightColIdxs = numVecIdxs;
   2.202  
   2.203 -   numCores = SSR__give_number_of_cores_to_schedule_onto();
   2.204 +   numCores = VPThread__give_number_of_cores_to_schedule_onto();
   2.205  
   2.206     numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
   2.207     leftOverFraction = 0;
   2.208 @@ -380,7 +409,7 @@
   2.209           for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
   2.210            {
   2.211                 //Make the processor for the pair of sub-matrices
   2.212 -            subMatrixPairParams  = SSR__malloc_to( sizeof(SMPairParams),
   2.213 +            subMatrixPairParams  = VPThread__malloc( sizeof(SMPairParams),
   2.214                                                                 animatingPr);
   2.215              subMatrixPairParams->leftSubMatrix  =
   2.216                 leftSubMatrices[ leftRowIdxOffset + vecIdx ];
   2.217 @@ -388,10 +417,10 @@
   2.218              subMatrixPairParams->rightSubMatrix =
   2.219                 rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
   2.220  
   2.221 -            subMatrixPairParams->resultPr = resultPr;
   2.222 +            subMatrixPairParams->resultPr = resultPr; //FIXME: resultPr no longer created in this changeset -- results now signalled via globals->vector_cond
   2.223  
   2.224                 //put all pairs from the same vector onto same core
   2.225 -            SSR__create_procr_with_affinity( &calcSubMatrixProduct,
   2.226 +            VPThread__create_thread_with_affinity( &calcSubMatrixProduct,
   2.227                                               subMatrixPairParams,
   2.228                                               animatingPr,
   2.229                                               coreToScheduleOnto );
   2.230 @@ -424,7 +453,6 @@
   2.231   
   2.232         }
   2.233      }
   2.234 -
   2.235   }
   2.236  
   2.237  
   2.238 @@ -447,7 +475,7 @@
   2.239     rowStartVals = rowSlices->startVals;
   2.240     colStartVals = colSlices->startVals;
   2.241  
   2.242 -   subMatrices = SSR__malloc_to(numRowIdxs * numColIdxs * sizeof(SubMatrix*),
   2.243 +   subMatrices = VPThread__malloc(numRowIdxs * numColIdxs * sizeof(SubMatrix*),
   2.244                                   animPr );
   2.245  
   2.246     for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   2.247 @@ -463,7 +491,7 @@
   2.248           startCol = colStartVals[colIdx];
   2.249           endCol   = colStartVals[colIdx + 1] -1;
   2.250  
   2.251 -         newSubMatrix = SSR__malloc_to( sizeof(SubMatrix), animPr );
   2.252 +         newSubMatrix = VPThread__malloc( sizeof(SubMatrix), animPr );
   2.253           newSubMatrix->numRows       = endRow - startRow +1;
   2.254           newSubMatrix->numCols       = endCol - startCol +1;
   2.255           newSubMatrix->origMatrix    = origMatrix;
   2.256 @@ -496,11 +524,11 @@
   2.257         {
   2.258           subMatrix = subMatrices[ rowOffset + colIdx ];
   2.259           if( subMatrix->alreadyCopied )
   2.260 -            SSR__free( subMatrix->array, animPr );
   2.261 -         SSR__free( subMatrix, animPr );
   2.262 +            VPThread__free( subMatrix->array, animPr );
   2.263 +         VPThread__free( subMatrix, animPr );
   2.264         }
   2.265      }
   2.266 -   SSR__free( subMatrices, animPr );
   2.267 +   VPThread__free( subMatrices, animPr );
   2.268   }
   2.269  
   2.270  
   2.271 @@ -510,12 +538,12 @@
   2.272                    VirtProcr *animPr )
   2.273   { float32 residualAcc = 0;
   2.274     int     numSlices, i, *startVals, sizeOfSlice, endCondition;
   2.275 -   SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr);
   2.276 +   SlicingStruc *slicingStruc = VPThread__malloc(sizeof(SlicingStruc), animPr);
   2.277  
   2.278        //calc size of matrix need to hold start vals --
   2.279     numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
   2.280  
   2.281 -   startVals = SSR__malloc_to( (numSlices + 1) * sizeof(int32), animPr );
   2.282 +   startVals = VPThread__malloc( (numSlices + 1) * sizeof(int32), animPr );
   2.283  
   2.284        //Calc the upper limit of start value -- when get above this, end loop
   2.285        // by saving highest value of the matrix dimension to access, plus 1
   2.286 @@ -549,8 +577,8 @@
   2.287  void
   2.288  freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr )
   2.289   {
   2.290 -   SSR__free( slicingStruc->startVals, animPr );
   2.291 -   SSR__free( slicingStruc, animPr );
   2.292 +   VPThread__free( slicingStruc->startVals, animPr );
   2.293 +   VPThread__free( slicingStruc, animPr );
   2.294   }
   2.295  
   2.296  
   2.297 @@ -561,9 +589,9 @@
   2.298     float32 *res, *left, *right;
   2.299  
   2.300        //setup inputs
   2.301 -   left  = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
   2.302 -   right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
   2.303 -   res   = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
   2.304 +   left  = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr );
   2.305 +   right = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr );
   2.306 +   res   = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr );
   2.307  
   2.308     for( r = 0; r < 5; r++ )
   2.309      {
   2.310 @@ -575,7 +603,7 @@
   2.311      }
   2.312  
   2.313        //do primitive
   2.314 -   SSR__start_primitive();  //for now, just takes time stamp
   2.315 +   VPThread__start_primitive();  //for now, just takes time stamp
   2.316     for( r = 0; r < 5; r++ )
   2.317      {
   2.318        for( c = 0; c < 5; c++ )
   2.319 @@ -587,11 +615,11 @@
   2.320         }
   2.321      }
   2.322     numCycles =
   2.323 -      SSR__end_primitive_and_give_cycles();
   2.324 +      VPThread__end_primitive_and_give_cycles();
   2.325  
   2.326 -   SSR__free( left, animPr );
   2.327 -   SSR__free( right, animPr );
   2.328 -   SSR__free( res, animPr );
   2.329 +   VPThread__free( left, animPr );
   2.330 +   VPThread__free( right, animPr );
   2.331 +   VPThread__free( res, animPr );
   2.332  
   2.333     return numCycles;
   2.334   }
     3.1 --- a/src/Application/VPThread__Matrix_Mult/EntryPoint.c	Wed Nov 10 22:26:57 2010 -0800
     3.2 +++ b/src/Application/VPThread__Matrix_Mult/EntryPoint.c	Tue Nov 16 16:02:51 2010 +0100
     3.3 @@ -8,7 +8,7 @@
     3.4  
     3.5  #include <math.h>
     3.6  
     3.7 -#include "SSR_Matrix_Mult.h"
     3.8 +#include "VPThread__Matrix_Mult.h"
     3.9  
    3.10  
    3.11  
    3.12 @@ -54,8 +54,8 @@
    3.13  
    3.14        //create divider processor, start doing the work, and wait till done
    3.15        //This function is the "border crossing" between normal code and SSR
    3.16 -   SSR__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs,
    3.17 -                                       dividerParams );
    3.18 +   VPThread__create_seed_procr_and_do_work(&divideWorkIntoSubMatrixPairProcrs,
    3.19 +                                           dividerParams );
    3.20     
    3.21     free( dividerParams );
    3.22     return resMatrix;
     4.1 --- a/src/Application/VPThread__Matrix_Mult/Result_Pr.c	Wed Nov 10 22:26:57 2010 -0800
     4.2 +++ b/src/Application/VPThread__Matrix_Mult/Result_Pr.c	Tue Nov 16 16:02:51 2010 +0100
     4.3 @@ -6,7 +6,7 @@
     4.4   *
     4.5   */
     4.6  
     4.7 -#include "SSR_Matrix_Mult.h"
     4.8 +#include "VPThread__Matrix_Mult.h"
     4.9  
    4.10  //=====================
    4.11  void inline
    4.12 @@ -26,13 +26,30 @@
    4.13   *After the count reaches the point that all results have been received, it
    4.14   * returns the result matrix and dissipates.
    4.15   */
    4.16 -void gatherResults( void *_params, VirtProcr *animatingPr )
    4.17 +void gatherResults( void *_params, VirtProcr *animatingThd )
    4.18   { VirtProcr *dividerPr;
    4.19     ResultsParams  *params;
    4.20     int             row, col, numRows, numCols, numSubMatrixPairs, count=0;
    4.21     float32        *resultArray;
    4.22     void           *msg;
    4.23     SMPairParams   *resParams;
    4.24 +   //====================== thread stuff =======================
    4.25 +   MatrixMultGlobals *globals =(MatrixMultGlobals *)VPThread__give_globals();
    4.26 +
    4.27 +
    4.28 +      //get vector-comm lock before loop, so that this thd keeps lock after
    4.29 +      // one wait until it enters the next wait -- forces see-saw btwn
    4.30 +      // waiters and signalers -- wait-signal-wait-signal-...
    4.31 +   VPThread__mutex_lock( globals->vector_mutex, animatingThd );
    4.32 +
    4.33 +      //Tell divider that have the vector lock -- so it's sure won't miss any
    4.34 +      // signals from the vector-threads it's about to create
    4.35 +      //Don't need a signal variable -- this thd can't be created until
    4.36 +      // divider thd already has the start lock
   4.37 +   VPThread__mutex_lock( globals->start_mutex, animatingThd );//acquire before signal
    4.38 +   VPThread__cond_signal( globals->start_cond,  animatingThd );
    4.39 +   VPThread__mutex_unlock( globals->start_mutex, animatingThd );//finish wait
    4.40 +   //===========================================================
    4.41  
    4.42           DEBUG( dbgAppFlow, "start resultPr\n")
    4.43           
    4.44 @@ -47,9 +64,16 @@
    4.45  
    4.46     while( count < numSubMatrixPairs )
    4.47      {
    4.48 -      msg = SSR__receive_type_to( RESULTS_MSG, animatingPr );
    4.49 +         //receive a vector-result from a vector-thread
    4.50 +      VPThread__cond_wait(  globals->vector_cond,  animatingThd );
    4.51  
    4.52 -      resParams = (SMPairParams *)msg;
    4.53 +         //At this point, animating thread owns the vector lock, so all
    4.54 +         // pairs trying to signal they have a result are waiting to get that
    4.55 +         // lock -- only one gets it at a time, and when signal, this thd
    4.56 +         // gets the lock and does the body of this loop, then when does the
    4.57 +         // wait again, that releases the lock for next pair-thread to get it
    4.58 +      resParams = globals->currSMPairParams;
    4.59 +
    4.60        accumulateResult( resultArray, resParams->partialResultArray,
    4.61                          resParams->leftSubMatrix->origStartRow,
    4.62                          resParams->leftSubMatrix->numRows,
    4.63 @@ -57,22 +81,22 @@
    4.64                          resParams->rightSubMatrix->numCols,
    4.65                          resParams->rightSubMatrix->origMatrix->numCols );
    4.66  
    4.67 -      SSR__free( resParams->partialResultArray, animatingPr );
    4.68 +      VPThread__free( resParams->partialResultArray, animatingThd );
    4.69        
    4.70           //there is only one copy of results procr, so can update numUsesLeft
    4.71           // without concurrency worries.  When zero, free the sub-matrix
    4.72        resParams->leftSubMatrix->numUsesLeft -= 1;
    4.73        if( resParams->leftSubMatrix->numUsesLeft == 0 )
    4.74         {
    4.75 -         SSR__free( resParams->leftSubMatrix->array, animatingPr );
    4.76 -         SSR__free( resParams->leftSubMatrix, animatingPr );
    4.77 +         VPThread__free( resParams->leftSubMatrix->array, animatingThd );
    4.78 +         VPThread__free( resParams->leftSubMatrix, animatingThd );
    4.79         }
    4.80  
    4.81        resParams->rightSubMatrix->numUsesLeft -= 1;
    4.82        if( resParams->rightSubMatrix->numUsesLeft == 0 )
    4.83         {
    4.84 -         SSR__free( resParams->rightSubMatrix->array, animatingPr );
    4.85 -         SSR__free( resParams->rightSubMatrix, animatingPr );
    4.86 +         VPThread__free( resParams->rightSubMatrix->array, animatingThd );
    4.87 +         VPThread__free( resParams->rightSubMatrix, animatingThd );
    4.88         }
    4.89  
    4.90           //count of how many sub-matrix pairs accumulated so know when done
    4.91 @@ -82,9 +106,20 @@
    4.92        //Done -- could just dissipate -- SSR will wait for all processors to
    4.93        // dissipate before shutting down, and thereby making results avaial to
    4.94        // outside, so no need to stop the divider from dissipating, so no need
    4.95 -      // to send a hand-shake message to it -- bug makes debug easier
    4.96 -   SSR__send_from_to( NULL, animatingPr, dividerPr );
    4.97 -   SSR__dissipate_procr( animatingPr );  //frees any data owned by procr
    4.98 +      // to send a hand-shake message to it -- but makes debug easier
    4.99 +      //However, following pattern, so all comms done, release lock
   4.100 +   VPThread__mutex_unlock( globals->vector_mutex, animatingThd );
   4.101 +
   4.102 +      //Send result to divider (seed) thread
   4.103 +      // note, divider thd had to hold the results-comm lock before creating
   4.104 +      // this thread, to be sure no race
   4.105 +   VPThread__mutex_lock(   globals->results_mutex, animatingThd );
   4.106 +   //globals->results = resultMatrixArray;
   4.107 +   VPThread__cond_signal(  globals->results_cond,  animatingThd );
   4.108 +   VPThread__mutex_unlock( globals->results_mutex, animatingThd ); //releases
   4.109 +   // divider thread from its wait, at the point this executes
   4.110 +
   4.111 +   VPThread__dissipate_thread( animatingThd );  //frees any data owned by procr
   4.112   }
   4.113  
   4.114  void inline
     5.1 --- a/src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h	Wed Nov 10 22:26:57 2010 -0800
     5.2 +++ b/src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h	Tue Nov 16 16:02:51 2010 +0100
     5.3 @@ -8,7 +8,7 @@
     5.4  
     5.5  #include <stdio.h>
     5.6  
     5.7 -#include "../../SSR_lib/SSR.h"
     5.8 +#include "../../VPThread_lib/VPThread.h"
     5.9  #include "../Matrix_Mult.h"
    5.10  
    5.11  
    5.12 @@ -81,6 +81,29 @@
    5.13     RESULTS_MSG = 1
    5.14   };
    5.15  
    5.16 + 
    5.17 +typedef struct
    5.18 + {
    5.19 +      //for communicating sub-matrix-pair results to results Thd
    5.20 +   int32         vector_mutex;
    5.21 +   int32         vector_cond;
    5.22 +   SMPairParams *currSMPairParams;
    5.23 +
    5.24 +      //for communicating results array back to seed (divider) Thd
    5.25 +   int32         results_mutex;
    5.26 +   int32         results_cond;
    5.27 +   float32      *results;
    5.28 +
    5.29 +      //for ensuring results thd has vector lock before making vector thds
    5.30 +   int32         start_mutex;
    5.31 +   int32         start_cond;
    5.32 +
    5.33 +   Matrix *rightMatrix;
    5.34 +   Matrix *resultMatrix;
    5.35 + }
    5.36 +MatrixMultGlobals;
    5.37 +
    5.38 +
    5.39  //============================= Processor Functions =========================
    5.40  void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr );
    5.41  void calcSubMatrixProduct(        void *data, VirtProcr *animatingPr );
     6.1 --- a/src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c	Wed Nov 10 22:26:57 2010 -0800
     6.2 +++ b/src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c	Tue Nov 16 16:02:51 2010 +0100
     6.3 @@ -8,7 +8,7 @@
     6.4  
     6.5  #include <string.h>
     6.6  
     6.7 -#include "SSR_Matrix_Mult.h"
     6.8 +#include "VPThread__Matrix_Mult.h"
     6.9  
    6.10  
    6.11  
    6.12 @@ -50,6 +50,7 @@
    6.13     VirtProcr      *resultPr;
    6.14     float32        *leftArray,  *rightArray, *resArray;
    6.15     SubMatrix      *leftSubMatrix, *rightSubMatrix;
    6.16 +   MatrixMultGlobals *globals =(MatrixMultGlobals *)VPThread__give_globals();
    6.17  
    6.18           DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID)
    6.19           int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx",
    6.20 @@ -72,7 +73,7 @@
    6.21  
    6.22     int32
    6.23     resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32);
    6.24 -   resArray = SSR__malloc_to( resSize, animatingPr );
    6.25 +   resArray = VPThread__malloc( resSize, animatingPr );
    6.26     memset( resArray, 0, resSize );
    6.27  
    6.28  
    6.29 @@ -91,8 +92,15 @@
    6.30  
    6.31           VMS__record_interval_end_in_probe( subMatrixProbe );
    6.32  
    6.33 -   SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr );
    6.34 -   SSR__dissipate_procr( animatingPr );
    6.35 +      //Send result to results thread
    6.36 +      //This pattern works 'cause only get lock when results thd inside wait
    6.37 +   VPThread__mutex_lock(   globals->vector_mutex, animatingPr );
    6.38 +   globals->currSMPairParams = params;
    6.39 +   VPThread__cond_signal(  globals->vector_cond,  animatingPr );
    6.40 +   VPThread__mutex_unlock( globals->vector_mutex, animatingPr );//release
    6.41 +   //wait-er -- cond_signal implemented such that wait-er gets lock, no other
    6.42 +
    6.43 +   VPThread__dissipate_thread( animatingPr );
    6.44   }
    6.45  
    6.46  
    6.47 @@ -226,7 +234,7 @@
    6.48     float32 *origArray, *subArray;
    6.49  
    6.50     if( subMatrix->alreadyCopied ) return;
    6.51 -   SSR__start_singleton( copyMatrixSingleton, &&EndOfTransSingleton, animPr);
    6.52 +   VPThread__start_singleton( copyTransposeSingleton, animPr);
    6.53  
    6.54     origMatrix   = subMatrix->origMatrix;
    6.55     origArray    = origMatrix->array;
    6.56 @@ -236,7 +244,7 @@
    6.57     origStartCol = subMatrix->origStartCol;
    6.58     origStride   = origMatrix->numCols;
    6.59  
    6.60 -   subArray     = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr);
    6.61 +   subArray    = VPThread__malloc( numRows * numCols *sizeof(float32),animPr);
    6.62     subMatrix->array = subArray;
    6.63  
    6.64        //copy values from orig matrix to local
    6.65 @@ -244,9 +252,8 @@
    6.66                    origStartRow, origStartCol, origStride,
    6.67                    subArray, origArray );
    6.68  
    6.69 -   subMatrix->alreadyCopied = TRUE; //must be last thing before label
    6.70 -   EndOfTransSingleton:
    6.71 -   return;
    6.72 +   VPThread__end_singleton( copyTransposeSingleton, animPr);
    6.73 +   subMatrix->alreadyCopied = TRUE; //anywhere after singleton work finished
    6.74   }
    6.75  
    6.76  
    6.77 @@ -259,13 +266,12 @@
    6.78  
    6.79        //This lets only a single VP execute the code between start and
    6.80        // end -- using start and end so that work runs outside the master.
    6.81 -      //Inside, if a second VP ever executes the start, it will be returned
    6.82 -      // from the end-point.
    6.83 -      //Note, for non-GCC, can add a second SSR call at the end, and inside
    6.84 -      // that one, look at the stack at the return addr & save that in an
    6.85 -      // array indexed by singletonID
    6.86 -   if( subMatrix->alreadyCopied ) return;
    6.87 -   SSR__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton, animPr );
    6.88 +      //If a second VP ever executes the start, it will be returned
    6.89 +      // from the end-point.  If it executions start after another but before
    6.90 +      // that other has finished, this one will remain suspended until the
    6.91 +      // other finishes, then be resumed from the end-point.
    6.92 +   if( subMatrix->alreadyCopied ) return; //an optimization -- set below
    6.93 +   VPThread__start_singleton( copyMatrixSingleton, animPr );
    6.94  
    6.95  
    6.96     origMatrix    = subMatrix->origMatrix;
    6.97 @@ -276,7 +282,7 @@
    6.98     origStartCol  = subMatrix->origStartCol;
    6.99     origStride    = origMatrix->numCols;
   6.100  
   6.101 -   subArray     = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr);
   6.102 +   subArray    = VPThread__malloc( numRows * numCols *sizeof(float32),animPr);
   6.103     subMatrix->array = subArray;
   6.104  
   6.105        //copy values from orig matrix to local
   6.106 @@ -293,7 +299,6 @@
   6.107         }
   6.108      }
   6.109  
   6.110 -   subMatrix->alreadyCopied = TRUE; //must be last thing before label
   6.111 -   EndOfCopySingleton:
   6.112 -   return;
   6.113 +   subMatrix->alreadyCopied = TRUE; //must be after singleton work finished
   6.114 +   VPThread__end_singleton( copyMatrixSingleton, animPr );
   6.115   }