changeset 1:9ad1a6186956

First shot at application-interface
author Some Random Person <seanhalle@yahoo.com>
date Wed, 23 May 2012 14:24:18 -0700
parents 9cf4c84a3091
children a8a8c4193c9b
files VSs__Hello_World/EntryPoint.c VSs__Hello_World/SeedVP.c VSs__Hello_World/Task.c main.c
diffstat 4 files changed, 50 insertions(+), 624 deletions(-) [+]
line diff
     1.1 --- a/VSs__Hello_World/EntryPoint.c	Wed May 23 12:39:19 2012 -0700
     1.2 +++ b/VSs__Hello_World/EntryPoint.c	Wed May 23 14:24:18 2012 -0700
     1.3 @@ -12,7 +12,7 @@
     1.4  
     1.5  
     1.6  
     1.7 -/*Every SSR system has an "entry point" function that creates the first
     1.8 +/*This "entry point" function creates the first
     1.9   * processor, which starts the chain of creating more processors..
    1.10   * eventually all of the processors will dissipate themselves, and
    1.11   * return.
    1.12 @@ -26,37 +26,13 @@
    1.13   *    and returns the value from the function
    1.14   *
    1.15   */
    1.16 -Matrix *
    1.17 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
    1.18 - { Matrix          *resMatrix;
    1.19 -   DividerParams   *dividerParams;
    1.20 -   int32            numResRows, numResCols;
    1.21 -
    1.22 -
    1.23 -   dividerParams              = malloc( sizeof( DividerParams ) );
    1.24 -   dividerParams->leftMatrix  = leftMatrix;
    1.25 -   dividerParams->rightMatrix = rightMatrix;
    1.26 -
    1.27 -
    1.28 -   numResRows  = leftMatrix->numRows;
    1.29 -   numResCols  = rightMatrix->numCols;
    1.30 -
    1.31 -      //VMS has its own separate internal malloc, so to get results out,
    1.32 -      // have to pass in empty array for it to fill up
    1.33 -      //The alternative is internally telling SSR make external space to use
    1.34 -   resMatrix            = malloc( sizeof(Matrix) );
    1.35 -   resMatrix->array     = malloc( numResRows * numResCols * sizeof(float32));
    1.36 -   resMatrix->numCols   = rightMatrix->numCols;
    1.37 -   resMatrix->numRows   = leftMatrix->numRows;
    1.38 -
    1.39 -
    1.40 -   dividerParams->resultMatrix   = resMatrix;
    1.41 -
    1.42 -      //create divider processor, start doing the work, and wait till done
    1.43 +void
    1.44 +VSs__Hello_World( )
    1.45 + { 
    1.46 +      //create seed processor, start doing the work, and wait till done
    1.47        //This function is the "border crossing" between normal code and SSR
    1.48 -   SSR__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs,
    1.49 -                                       dividerParams );
    1.50 -   
    1.51 -   free( dividerParams );
    1.52 -   return resMatrix;
    1.53 +   VSs__create_seed_procr_and_do_work( &hello_world,
    1.54 +                                       NULL );
    1.55 +  
    1.56 +   return;
    1.57   }
     2.1 --- a/VSs__Hello_World/SeedVP.c	Wed May 23 12:39:19 2012 -0700
     2.2 +++ b/VSs__Hello_World/SeedVP.c	Wed May 23 14:24:18 2012 -0700
     2.3 @@ -9,586 +9,26 @@
     2.4  
     2.5  #include <math.h>
     2.6  #include <string.h>
     2.7 -#include "SSR_Matrix_Mult.h"
     2.8 +#include "VSs__Hello_World.h"
     2.9  
    2.10 -   //The time to compute this many result values should equal the time to
    2.11 -   // perform this division on a matrix of size gives that many result calcs
    2.12 -   //IE, size this so that sequential time to calc equals divide time
    2.13 -   // find the value by experimenting -- but divide time and calc time scale
    2.14 -   // same way, so this value might remain the same across hardware
    2.15 -#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000
    2.16 +void hello_world( void      *_params, SlaveVP *animPr )
    2.17 + { int32 i;
    2.18 +         DEBUG__printf( dbgAppFlow, "start hello_world");
    2.19  
    2.20 +   // create all the task types
    2.21 +   helloWorldTaskType = VMS_App__malloc( sizeof(VSsTaskType) );
    2.22 +   helloWorldTaskType->fn = &hello_world_task;
    2.23 +   helloWorldTaskType->numArgs = 2;
    2.24 +   helloWorldTaskType->argTypes = {NULL, IN};
    2.25 +   helloWorldTaskType->argSizes = {sizeof(int), 16*16*sizeof(float)};
    2.26  
    2.27 -//===========================================================================
    2.28 -int inline
    2.29 -measureMatrixMultPrimitive( SlaveVP *animPr );
    2.30 -
    2.31 -SlicingStrucCarrier *
    2.32 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
    2.33 -                                 SlaveVP *animPr );
    2.34 -
    2.35 -SlicingStruc *
    2.36 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
    2.37 -                  SlaveVP *animPr );
    2.38 -
    2.39 -void
    2.40 -freeSlicingStruc( SlicingStruc *slicingStruc, SlaveVP *animPr );
    2.41 -
    2.42 -SubMatrix **
    2.43 -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    2.44 -                   int32 numUses, Matrix *origMatrix, SlaveVP *animPr );
    2.45 -
    2.46 -void
    2.47 -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    2.48 -                 SubMatrix **subMatrices, SlaveVP *animPr );
    2.49 -
    2.50 -void
    2.51 -pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
    2.52 -                                    SubMatrix **rightSubMatrices,
    2.53 -                                    int32 numRowIdxs, int32 numColIdxs,
    2.54 -                                    int32 numVecIdxs,
    2.55 -                                    SlaveVP *resultPr,
    2.56 -                                    SlaveVP *animatingPr );
    2.57 -
    2.58 -void
    2.59 -makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix,
    2.60 -            SlicingStrucCarrier *slicingStrucCarrier,
    2.61 -            SlaveVP *resultPr, SlaveVP *animatingPr );
    2.62 -
    2.63 -
    2.64 -
    2.65 -/*Divider creates one processor for every sub-matrix
    2.66 - * It hands them:
    2.67 - *  the name of the result processor that they should send their results to,
    2.68 - *  the left and right matrices, and the rows and cols they should multiply
    2.69 - * It first creates the result processor, then all the sub-matrixPair
    2.70 - *  processors,
    2.71 - *  then does a receive of a message from the result processor that gives
    2.72 - *  the divider ownership of the result matrix.
    2.73 - * Finally, the divider returns the result matrix out of the SSR system.
    2.74 - *
    2.75 - * Divider chooses the size of sub-matrices via an algorithm that tries to
    2.76 - *  keep the minimum work above a threshold.  The threshold is machine-
    2.77 - *  dependent, so ask SSR for min work-unit time to get a
    2.78 - *  given overhead
    2.79 - *
    2.80 - * Divide min work-unit cycles by measured-cycles for one matrix-cell
    2.81 - *  product -- gives the number of products need to have in min size
    2.82 - *  matrix.
    2.83 - *
    2.84 - * So then, take cubed root of this to get the size of a side of min sub-
    2.85 - *  matrix.  That is the size of the ideal square sub-matrix -- so tile
    2.86 - *  up the two input matrices into ones as close as possible to that size,
    2.87 - *  and create the pairs of sub-matrices.
    2.88 - *
    2.89 - *========================  STRATEGIC OVERVIEW  =======================
    2.90 - *
    2.91 - *This division is a bit tricky, because have to create things in advance
    2.92 - * that it's not at first obvious need to be created..
    2.93 - *
    2.94 - *First slice up each dimension -- three of them..  this is because will have
    2.95 - * to create the sub-matrix's data-structures before pairing the sub-matrices
    2.96 - * with each other -- so, have three dimensions to slice up before can
    2.97 - * create the sub-matrix data-strucs -- also, have to be certain that the
    2.98 - * cols of the left input have the exact same slicing as the rows of the
    2.99 - * left matrix, so just to be sure, do the slicing calc once, then use it
   2.100 - * for both.
   2.101 - *
   2.102 - *So, goes like this:
   2.103 - *1) calculate the start & end values of each dimension in each matrix.
   2.104 - *2) use those values to create sub-matrix structures
   2.105 - *3) combine sub-matrices into pairs, as the tasks to perform.
   2.106 - *
   2.107 - *Have to calculate separately from creating the sub-matrices because of the
   2.108 - * nature of the nesting -- would either end up creating the same sub-matrix
   2.109 - * multiple times, or else would have to put in detection of whether had
   2.110 - * made a particular one already if tried to combine steps 1 and 2.
   2.111 - *
   2.112 - *Step 3 has to be separate because of the nesting, as well -- same reason,
   2.113 - * would either create same sub-matrix multiple times, or else have to
   2.114 - * add detection of whether was already created.
   2.115 - *
   2.116 - *Another way to look at it: there's one level of loop to divide dimensions,
   2.117 - * two levels of nesting to create sub-matrices, and three levels to pair
   2.118 - * up the sub-matrices.
   2.119 - */
   2.120 -void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
   2.121 -                                        SlaveVP *animPr )
   2.122 - { SlaveVP       *resultPr;
   2.123 -   DividerParams   *dividerParams;
   2.124 -   ResultsParams   *resultsParams;
   2.125 -   Matrix          *leftMatrix, *rightMatrix, *resultMatrix;
   2.126 -   void            *msg;
   2.127 -   SlicingStrucCarrier *slicingStrucCarrier;
   2.128 -   float32         *resultArray; //points to array inside result matrix
   2.129 +   HelloWorldArgs args; //allocate on stack, VSs copies internally
   2.130     
   2.131 -         DEBUG__printf( dbgAppFlow, "start divide")
   2.132 -
   2.133 -         int32
   2.134 -         divideProbe = VMS_App__create_single_interval_probe( "divideProbe",
   2.135 -                                                          animPr );
   2.136 -         VMS_App__record_sched_choice_into_probe( divideProbe, animPr );
   2.137 -         VMS_App__record_interval_start_in_probe( divideProbe );
   2.138 -
   2.139 -   //=========== Setup -- make local copies of ptd-to-things, malloc, aso
   2.140 -   int32 numResRows, numResCols, vectLength;
   2.141 -
   2.142 -   dividerParams   = (DividerParams *)_dividerParams;
   2.143 -   
   2.144 -   leftMatrix      = dividerParams->leftMatrix;
   2.145 -   rightMatrix     = dividerParams->rightMatrix;
   2.146 -
   2.147 -   vectLength = leftMatrix->numCols;
   2.148 -   numResRows = leftMatrix->numRows;
   2.149 -   numResCols = rightMatrix->numCols;
   2.150 -   resultArray     = dividerParams->resultMatrix->array;
   2.151 -   
   2.152 -      //zero the result array
   2.153 -   memset( resultArray, 0, numResRows * numResCols * sizeof(float32) );
   2.154 -
   2.155 -   //==============  Do either sequential mult or do division ==============
   2.156 -
   2.157 -      //Check if input matrices too small -- if yes, just do sequential
   2.158 -      //Cutoff is determined by overhead of this divider -- relatively
   2.159 -      // machine-independent
   2.160 -   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
   2.161 -       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
   2.162 +   for( i = 0; i < 5; i++ )
   2.163      {
   2.164 -      //====== Do sequential multiply on a single core
   2.165 -            DEBUG__printf( dbgAppFlow, "doing sequential")
   2.166 -            
   2.167 -         //transpose the right matrix
   2.168 -      float32 *
   2.169 -      transRightArray  = SSR__malloc_to( rightMatrix->numRows * 
   2.170 -                                         rightMatrix->numCols * sizeof(float32),
   2.171 -                                         animPr );
   2.172 -
   2.173 -         //copy values from orig matrix to local
   2.174 -      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
   2.175 -                     0, 0, rightMatrix->numRows,
   2.176 -                     transRightArray, rightMatrix->array );
   2.177 -      
   2.178 -      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
   2.179 -                            leftMatrix->array, transRightArray,
   2.180 -                            resultArray );
   2.181 -    }
   2.182 -   else
   2.183 -    {
   2.184 -      //====== Do parallel multiply across cores
   2.185 -
   2.186 -         //Calc the ideal size of sub-matrix and slice up the dimensions of
   2.187 -         // the two matrices.
   2.188 -         //The ideal size is the one takes the number of cycles to calculate
   2.189 -         // such that calc time is equal or greater than min work-unit size
   2.190 -      slicingStrucCarrier =
   2.191 -         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr );
   2.192 -
   2.193 -         //Make the results processor, now that know how many to wait for
   2.194 -      resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr );
   2.195 -      resultsParams->numSubMatrixPairs  =
   2.196 -         slicingStrucCarrier->leftRowSlices->numVals *
   2.197 -         slicingStrucCarrier->rightColSlices->numVals *
   2.198 -         slicingStrucCarrier->vecSlices->numVals;
   2.199 -      resultsParams->dividerPr   = animPr;
   2.200 -      resultsParams->numCols     = rightMatrix->numCols;
   2.201 -      resultsParams->numRows     = leftMatrix->numRows;
   2.202 -      resultsParams->resultArray = resultArray;
   2.203 -
   2.204 -            DEBUG__printf(dbgAppFlow,"**create result Pr**")
   2.205 -      resultPr =
   2.206 -         SSR__create_procr_with( &gatherResults, resultsParams, animPr);
   2.207 -
   2.208 -         //Make the sub-matrices, and pair them up, and make processor to
   2.209 -         // calc product of each pair.
   2.210 -      makeSubMatricesAndProcrs( leftMatrix, rightMatrix,
   2.211 -                                    slicingStrucCarrier,
   2.212 -                                    resultPr, animPr);
   2.213 - 
   2.214 -         //result array is allocated externally, so no message from resultPr
   2.215 -         // however, do have to wait before printing out stats, so wait
   2.216 -         // for an empty handshake message
   2.217 -      msg = SSR__receive_from_to( resultPr, animPr );
   2.218 -   }
   2.219 -
   2.220 -
   2.221 -   //===============  Work done -- send results back =================
   2.222 -
   2.223 -
   2.224 -         DEBUG__printf( dbgAppFlow, "end divide")
   2.225 -
   2.226 -         VMS_App__record_interval_end_in_probe( divideProbe );
   2.227 -         VMS_App__print_stats_of_all_probes();
   2.228 -
   2.229 -      //nothing left to do so dissipate, SSR will wait to shutdown and hence
   2.230 -      // make results available to outside until all the processors have
   2.231 -      // dissipated -- so no need to wait for results processor
   2.232 -
   2.233 -   SSR__dissipate_procr( animPr );  //all procrs dissipate self at end
   2.234 -      //when all of the processors have dissipated, the "create seed and do
   2.235 -      // work" call in the entry point function returns
   2.236 +      args.dummy1 = i;
   2.237 +      args.dummy2 = VMS_App__malloc()
   2.238 +      VSs__submit_task( VSsTaskType helloWorldTaskType, &args );
   2.239 +	}
   2.240   }
   2.241  
   2.242 -
   2.243 -SlicingStrucCarrier *
   2.244 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
   2.245 -                                 SlaveVP *animPr )
   2.246 - {
   2.247 -   float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
   2.248 -   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   2.249 -   SlicingStrucCarrier *slicingStrucCarrier =
   2.250 -                         SSR__malloc_to(sizeof(SlicingStrucCarrier), animPr);
   2.251 -
   2.252 -   int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
   2.253 -   float64 numPrimitiveOpsInMinWorkUnit;
   2.254 -
   2.255 -
   2.256 -   //=======  Calc ideal size of min-sized sub-matrix  ========
   2.257 -
   2.258 -      //ask SSR for the number of cycles of the minimum work unit, at given
   2.259 -      // percent overhead then add a guess at overhead from this divider
   2.260 -   minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 );
   2.261 -
   2.262 -      //ask SSR for number of cycles of the "primitive" op of matrix mult
   2.263 -   primitiveCycles = measureMatrixMultPrimitive( animPr );
   2.264 -
   2.265 -   numPrimitiveOpsInMinWorkUnit =
   2.266 -      (float64)minWorkUnitCycles / (float64)primitiveCycles;
   2.267 -
   2.268 -      //take cubed root -- that's number of these in a "side" of sub-matrix
   2.269 -      // then multiply by 5 because the primitive is 5x5
   2.270 -   idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit );
   2.271 -
   2.272 -   idealNumWorkUnits = SSR__giveIdealNumWorkUnits();
   2.273 -   
   2.274 -   idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
   2.275 -   idealSizeOfSide2 *= 0.6; //finer granularity to help load balance
   2.276 -
   2.277 -   if( idealSizeOfSide1 > idealSizeOfSide2 )
   2.278 -      idealSizeOfSide = idealSizeOfSide1;
   2.279 -   else
   2.280 -      idealSizeOfSide = idealSizeOfSide2;
   2.281 -
   2.282 -      //The multiply inner loop blocks the array to fit into L1 cache
   2.283 -//   if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK;
   2.284 -
   2.285 -   //============  Slice up dimensions, now that know target size ===========
   2.286 -
   2.287 -      //Tell the slicer the target size of a side (floating pt), the start
   2.288 -      // value to start slicing at, and the end value to stop slicing at
   2.289 -      //It returns an array of start value of each chunk, plus number of them
   2.290 -   int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol;
   2.291 -   startLeftRow  = 0;
   2.292 -   endLeftRow    = leftMatrix->numRows -1;
   2.293 -   startVec      = 0;
   2.294 -   endVec        = leftMatrix->numCols -1;
   2.295 -   startRightCol = 0;
   2.296 -   endRightCol   = rightMatrix->numCols -1;
   2.297 -
   2.298 -   leftRowSlices =
   2.299 -      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow, animPr );
   2.300 -
   2.301 -   vecSlices =
   2.302 -      sliceUpDimension( idealSizeOfSide,  startVec, endVec, animPr );
   2.303 -
   2.304 -   rightColSlices =
   2.305 -      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol,animPr);
   2.306 -
   2.307 -   slicingStrucCarrier->leftRowSlices  = leftRowSlices;
   2.308 -   slicingStrucCarrier->vecSlices      = vecSlices;
   2.309 -   slicingStrucCarrier->rightColSlices = rightColSlices;
   2.310 -
   2.311 -   return slicingStrucCarrier;
   2.312 - }
   2.313 -
   2.314 -
   2.315 -void
   2.316 -makeSubMatricesAndProcrs( Matrix    *leftMatrix, Matrix    *rightMatrix,
   2.317 -            SlicingStrucCarrier *slicingStrucCarrier,
   2.318 -            SlaveVP *resultPr,   SlaveVP *animPr )
   2.319 - {
   2.320 -   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   2.321 -   
   2.322 -   leftRowSlices  = slicingStrucCarrier->leftRowSlices;
   2.323 -   vecSlices      = slicingStrucCarrier->vecSlices;
   2.324 -   rightColSlices = slicingStrucCarrier->rightColSlices;
   2.325 -   SSR__free( slicingStrucCarrier, animPr );
   2.326 -   
   2.327 -   //================  Make sub-matrices, given the slicing  ================
   2.328 -   SubMatrix **leftSubMatrices, **rightSubMatrices;
   2.329 -   leftSubMatrices =
   2.330 -      createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals,
   2.331 -                         leftMatrix, animPr );
   2.332 -   //double_check_that_always_numRows_in_right_same_as_numCols_in_left();
   2.333 -   rightSubMatrices =
   2.334 -      createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals,
   2.335 -                         rightMatrix, animPr );
   2.336 -
   2.337 -
   2.338 -   //==============  pair the sub-matrices and make processors ==============
   2.339 -   int32 numRowIdxs, numColIdxs, numVecIdxs;
   2.340 -
   2.341 -   numRowIdxs = leftRowSlices->numVals;
   2.342 -   numColIdxs = rightColSlices->numVals;
   2.343 -   numVecIdxs = vecSlices->numVals;
   2.344 -   
   2.345 -   
   2.346 -   freeSlicingStruc( leftRowSlices, animPr );
   2.347 -   freeSlicingStruc( vecSlices, animPr );
   2.348 -   freeSlicingStruc( rightColSlices, animPr );
   2.349 -   
   2.350 -   pairUpSubMatricesAndMakeProcessors( leftSubMatrices,
   2.351 -                                       rightSubMatrices,
   2.352 -                                       numRowIdxs, numColIdxs,
   2.353 -                                       numVecIdxs,
   2.354 -                                       resultPr,
   2.355 -                                       animPr );
   2.356 - }
   2.357 -
   2.358 -
   2.359 -
   2.360 -
   2.361 -void
   2.362 -pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
   2.363 -                                    SubMatrix **rightSubMatrices,
   2.364 -                                    int32 numRowIdxs, int32 numColIdxs,
   2.365 -                                    int32 numVecIdxs,
   2.366 -                                    SlaveVP *resultPr,
   2.367 -                                    SlaveVP *animatingPr )
   2.368 - {
   2.369 -   int32 resRowIdx, resColIdx, vecIdx;
   2.370 -   int32 numLeftColIdxs, numRightColIdxs;
   2.371 -   int32 leftRowIdxOffset;
   2.372 -   SMPairParams *subMatrixPairParams;
   2.373 -   float32 numToPutOntoEachCore, leftOverFraction, numVecOnCurrCore;
   2.374 -   int32 numCores, coreToAssignOnto;
   2.375 -
   2.376 -   numLeftColIdxs  = numColIdxs;
   2.377 -   numRightColIdxs = numVecIdxs;
   2.378 -
   2.379 -   numCores = SSR__give_number_of_cores_to_schedule_onto();
   2.380 -
   2.381 -   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
   2.382 -   leftOverFraction = 0;
   2.383 -   numVecOnCurrCore = 0;
   2.384 -   coreToAssignOnto = 0;
   2.385 -
   2.386 -   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
   2.387 -    {
   2.388 -      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
   2.389 -
   2.390 -      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
   2.391 -       {
   2.392 -         
   2.393 -         for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
   2.394 -          {
   2.395 -               //Make the processor for the pair of sub-matrices
   2.396 -            subMatrixPairParams  = SSR__malloc_to( sizeof(SMPairParams),
   2.397 -                                                               animatingPr);
   2.398 -            subMatrixPairParams->leftSubMatrix  =
   2.399 -               leftSubMatrices[ leftRowIdxOffset + vecIdx ];
   2.400 -
   2.401 -            subMatrixPairParams->rightSubMatrix =
   2.402 -               rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
   2.403 -
   2.404 -            subMatrixPairParams->resultPr = resultPr;
   2.405 -
   2.406 -               //put all pairs from the same vector onto same core
   2.407 -            SSR__create_procr_with_affinity( &calcSubMatrixProduct,
   2.408 -                                             subMatrixPairParams,
   2.409 -                                             animatingPr,
   2.410 -                                             coreToAssignOnto );
   2.411 -
   2.412 -               //Trying to distribute the subMatrix-vectors across the cores, so
   2.413 -               // that each core gets the same number of vectors, with a max
   2.414 -               // imbalance of 1 vector more on some cores than others
   2.415 -            numVecOnCurrCore += 1;                 //incr before checking, so
   2.416 -            if( numVecOnCurrCore > numToPutOntoEachCore ) //actual num 1 less
   2.417 -             {
   2.418 -                  //deal with fractional part, to ensure that imbalance is 1 max
   2.419 -                  // IE, core with most has only 1 more than core with least
   2.420 -               leftOverFraction = numToPutOntoEachCore - numVecOnCurrCore;
   2.421 -               if( leftOverFraction > 1 ) ERROR("division alg messed up\n");
   2.422 -               numVecOnCurrCore = leftOverFraction; //accumulates "extra"
   2.423 -
   2.424 -                  //Move to next core, max core-value to incr to is numCores -1
   2.425 -               coreToAssignOnto += 1;
   2.426 -               if( coreToAssignOnto >= numCores ) coreToAssignOnto = 0;
   2.427 -             } //if
   2.428 -          } //for( vecIdx
   2.429 -       } //for( resColIdx
   2.430 -    } //for( resRowIdx
   2.431 -
   2.432 - }
   2.433 -
   2.434 -
   2.435 -
   2.436 -/*Walk through the two slice-strucs, making sub-matrix strucs as go
   2.437 - */
   2.438 -SubMatrix **
   2.439 -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   2.440 -                   int32 numUses, Matrix *origMatrix, SlaveVP *animPr )
   2.441 - {
   2.442 -   int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
   2.443 -   int32 startRow, endRow, startCol, endCol;
   2.444 -   int32 *rowStartVals, *colStartVals;
   2.445 -   int32 rowOffset;
   2.446 -   SubMatrix **subMatrices, *newSubMatrix;
   2.447 -
   2.448 -   numRowIdxs = rowSlices->numVals;
   2.449 -   numColIdxs = colSlices->numVals;
   2.450 -
   2.451 -   rowStartVals = rowSlices->startVals;
   2.452 -   colStartVals = colSlices->startVals;
   2.453 -
   2.454 -   subMatrices = SSR__malloc_to(numRowIdxs * numColIdxs * sizeof(SubMatrix*),
   2.455 -                                 animPr );
   2.456 -
   2.457 -   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   2.458 -    {
   2.459 -      rowOffset = rowIdx * numColIdxs;
   2.460 -      
   2.461 -      startRow  = rowStartVals[rowIdx];
   2.462 -      endRow    = rowStartVals[rowIdx + 1] -1; //"fake" start above last is
   2.463 -                                               // at last valid idx + 1 & is
   2.464 -                                               // 1 greater than end value
   2.465 -      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
   2.466 -       {
   2.467 -         startCol = colStartVals[colIdx];
   2.468 -         endCol   = colStartVals[colIdx + 1] -1;
   2.469 -
   2.470 -         newSubMatrix = SSR__malloc_to( sizeof(SubMatrix), animPr );
   2.471 -         newSubMatrix->numRows       = endRow - startRow +1;
   2.472 -         newSubMatrix->numCols       = endCol - startCol +1;
   2.473 -         newSubMatrix->origMatrix    = origMatrix;
   2.474 -         newSubMatrix->origStartRow  = startRow;
   2.475 -         newSubMatrix->origStartCol  = startCol;
   2.476 -         newSubMatrix->copySingleton = NULL;
   2.477 -         newSubMatrix->numUsesLeft   = numUses; //can free after this many
   2.478 -         //Prevent uninitialized memory
   2.479 -         newSubMatrix->copySingleton = NULL;
   2.480 -         newSubMatrix->copyTransSingleton = NULL;
   2.481 -
   2.482 -         subMatrices[ rowOffset + colIdx ] = newSubMatrix;
   2.483 -       }
   2.484 -    }
   2.485 -   return subMatrices;
   2.486 - }
   2.487 -
   2.488 -
   2.489 -void
   2.490 -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   2.491 -                 SubMatrix **subMatrices, SlaveVP *animPr )
   2.492 - {
   2.493 -   int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset;
   2.494 -   SubMatrix *subMatrix;
   2.495 -
   2.496 -   numRowIdxs = rowSlices->numVals;
   2.497 -   numColIdxs = colSlices->numVals;
   2.498 -
   2.499 -   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   2.500 -    {
   2.501 -      rowOffset = rowIdx * numColIdxs;
   2.502 -      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
   2.503 -       {
   2.504 -         subMatrix = subMatrices[ rowOffset + colIdx ];
   2.505 -         if( subMatrix->alreadyCopied )
   2.506 -            SSR__free( subMatrix->array, animPr );
   2.507 -         SSR__free( subMatrix, animPr );
   2.508 -       }
   2.509 -    }
   2.510 -   SSR__free( subMatrices, animPr );
   2.511 - }
   2.512 -
   2.513 -
   2.514 -
   2.515 -SlicingStruc *
   2.516 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
   2.517 -                  SlaveVP *animPr )
   2.518 - { float32 residualAcc = 0;
   2.519 -   int     numSlices, i, *startVals, sizeOfSlice, endCondition;
   2.520 -   SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr);
   2.521 -
   2.522 -      //calc size of matrix need to hold start vals --
   2.523 -   numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
   2.524 -
   2.525 -   startVals = SSR__malloc_to( (numSlices + 1) * sizeof(int32), animPr );
   2.526 -
   2.527 -      //Calc the upper limit of start value -- when get above this, end loop
   2.528 -      // by saving highest value of the matrix dimension to access, plus 1
   2.529 -      // as the start point of the imaginary slice following the last one
   2.530 -      //Plus 1 because go up to value but not include when process last slice
   2.531 -      //The stopping condition is half-a-size less than highest value because
   2.532 -      // don't want any pieces smaller than half the ideal size -- just tack
   2.533 -      // little ones onto end of last one
   2.534 -   endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size
   2.535 -   for( i = 0; startVal <= endVal; i++ )
   2.536 -    {
   2.537 -      startVals[i] = startVal;
   2.538 -      residualAcc += idealSizeOfSide;
   2.539 -      sizeOfSlice  = (int)residualAcc;
   2.540 -      residualAcc -= (float32)sizeOfSlice;
   2.541 -      startVal    += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..
   2.542 -
   2.543 -      if( startVal > endCondition )
   2.544 -       { startVal = endVal + 1;
   2.545 -         startVals[ i + 1 ] = startVal;
   2.546 -       }
   2.547 -    }
   2.548 -
   2.549 -   slicingStruc->startVals = startVals;
   2.550 -   slicingStruc->numVals   = i;  //loop incr'd, so == last valid start idx+1
   2.551 -                                 // which means is num sub-matrices in dim
   2.552 -                                 // also == idx of the fake start just above
   2.553 -   return slicingStruc;
   2.554 - }
   2.555 -
   2.556 -void
   2.557 -freeSlicingStruc( SlicingStruc *slicingStruc, SlaveVP *animPr )
   2.558 - {
   2.559 -   SSR__free( slicingStruc->startVals, animPr );
   2.560 -   SSR__free( slicingStruc, animPr );
   2.561 - }
   2.562 -
   2.563 -
   2.564 -inline int
   2.565 -measureMatrixMultPrimitive( SlaveVP *animPr )
   2.566 - {
   2.567 -   int r, c, v, numCycles;
   2.568 -   float32 *res, *left, *right;
   2.569 -
   2.570 -      //setup inputs
   2.571 -   left  = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
   2.572 -   right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
   2.573 -   res   = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
   2.574 -
   2.575 -   for( r = 0; r < 5; r++ )
   2.576 -    {
   2.577 -      for( c = 0; c < 5; c++ )
   2.578 -       {
   2.579 -         left[  r * 5 + c ] = r;
   2.580 -         right[ r * 5 + c ] = c;
   2.581 -       }
   2.582 -    }
   2.583 -
   2.584 -      //do primitive
   2.585 -   SSR__start_primitive();  //for now, just takes time stamp
   2.586 -   for( r = 0; r < 5; r++ )
   2.587 -    {
   2.588 -      for( c = 0; c < 5; c++ )
   2.589 -       {
   2.590 -         for( v = 0; v < 5; v++ )
   2.591 -          {
   2.592 -            res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ];
   2.593 -          }
   2.594 -       }
   2.595 -    }
   2.596 -   numCycles =
   2.597 -      SSR__end_primitive_and_give_cycles();
   2.598 -
   2.599 -   SSR__free( left, animPr );
   2.600 -   SSR__free( right, animPr );
   2.601 -   SSR__free( res, animPr );
   2.602 -
   2.603 -   return numCycles;
   2.604 - }
   2.605 -
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/VSs__Hello_World/Task.c	Wed May 23 14:24:18 2012 -0700
     3.3 @@ -0,0 +1,21 @@
     3.4 +/*
     3.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     3.6 + *  Licensed under GNU General Public License version 2
     3.7 + *
     3.8 + * Author: seanhalle@yahoo.com
     3.9 + *
    3.10 + */
    3.11 +
    3.12 +
    3.13 +#include <math.h>
    3.14 +#include <string.h>
    3.15 +#include "VSs__Hello_World.h"
    3.16 + 
    3.17 +void hello_world_task( void  *_args, SlaveVP *animPr )
    3.18 + { HelloWorldArgs *args;
    3.19 +		 
    3.20 +   args = (HelloWorldArgs *)_args;
    3.21 +   
    3.22 +   printf("Hello World: %d, %f", args->dummy1, args->dummy2);
    3.23 + }
    3.24 +
     4.1 --- a/main.c	Wed May 23 12:39:19 2012 -0700
     4.2 +++ b/main.c	Wed May 23 14:24:18 2012 -0700
     4.3 @@ -1,5 +1,5 @@
     4.4  /*
     4.5 - *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     4.6 + *  Copyright 2012 OpenSourceResearchInstitute.org
     4.7   *  Licensed under GNU General Public License version 2
     4.8   *
     4.9   * author seanhalle@yahoo.com
    4.10 @@ -8,28 +8,17 @@
    4.11  #include <malloc.h>
    4.12  #include <stdlib.h>
    4.13  
    4.14 -#include "Matrix_Mult.h"
    4.15 -#include "SSR_Matrix_Mult/SSR_Matrix_Mult.h"
    4.16 +#include "VSs__Hello_World/VSs__Hello_World.h"
    4.17  
    4.18  /**
    4.19   * 
    4.20   */
    4.21  int main( int argc, char **argv )
    4.22 - { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
    4.23 -   ParamBag    *paramBag;
    4.24 + { 
    4.25     
    4.26     DEBUG__printf2(TRUE, "arguments: %s | %s", argv[0], argv[1] );
    4.27 -
    4.28 -   paramBag = makeParamBag();
    4.29 -   readParamFileIntoBag( argv[1], paramBag );
    4.30 -   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
    4.31     
    4.32 -   resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix );
    4.33 -
    4.34 -   printf("\nresult matrix: \n");
    4.35 -   printMatrix( resultMatrix );
    4.36 -   
    4.37 -   fflush(stdin);
    4.38 +   VSs__Hello_World( );
    4.39     
    4.40     exit(0); //cleans up
    4.41   }