changeset 0:8d14fe28a782

Initial add -- copied from SSR version
author SeanHalle
date Wed, 10 Nov 2010 22:26:57 -0800
parents
children 133633d1c10f
files .hgignore src/Application/Matrix_Mult.c src/Application/Matrix_Mult.h src/Application/VPThread__Matrix_Mult/Divide_Pr.c src/Application/VPThread__Matrix_Mult/EntryPoint.c src/Application/VPThread__Matrix_Mult/Result_Pr.c src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c src/Application/main.c
diffstat 9 files changed, 1446 insertions(+), 0 deletions(-) [+]
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/.hgignore	Wed Nov 10 22:26:57 2010 -0800
     1.3 @@ -0,0 +1,5 @@
     1.4 +nbproject
     1.5 +dist
     1.6 +build
     1.7 +Makefile
     1.8 +.dep.inc
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/src/Application/Matrix_Mult.c	Wed Nov 10 22:26:57 2010 -0800
     2.3 @@ -0,0 +1,167 @@
     2.4 +/*
     2.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     2.6 + *  Licensed under GNU General Public License version 2
     2.7 + *
     2.8 + * Author: seanhalle@yahoo.com
     2.9 + *
    2.10 + * Created on November 15, 2009, 2:35 AM
    2.11 + */
    2.12 +
    2.13 +#include <malloc.h>
    2.14 +#include <stdlib.h>
    2.15 +
    2.16 +#include "Matrix_Mult.h"
    2.17 +#include "ParamHelper/Param.h"
    2.18 +
    2.19 +
    2.20 + 
    2.21 + void
    2.22 +initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
    2.23 +                               ParamBag *paramBag )
    2.24 + { char *leftMatrixFileName, *rightMatrixFileName;
    2.25 +   int   leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols;
    2.26 +   
    2.27 +      ParamStruc *param;
    2.28 +      param = getParamFromBag( "leftMatrixRows", paramBag );
    2.29 +   leftMatrixRows = param->intValue;
    2.30 +      param = getParamFromBag( "leftMatrixCols", paramBag );
    2.31 +   leftMatrixCols = param->intValue;
    2.32 +   *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols );
    2.33 +   
    2.34 +      param = getParamFromBag( "leftMatrixFileName", paramBag );
    2.35 +   leftMatrixFileName = param->strValue;  //no need to copy
    2.36 +   read_Matrix_From_File( *leftMatrix,  leftMatrixFileName );
    2.37 +   
    2.38 +      param = getParamFromBag( "rightMatrixRows", paramBag );
    2.39 +   rightMatrixRows = param->intValue;
    2.40 +      param = getParamFromBag( "rightMatrixCols", paramBag );
    2.41 +   rightMatrixCols = param->intValue;
    2.42 +   *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols );
    2.43 +   
    2.44 +      param = getParamFromBag( "rightMatrixFileName", paramBag );
    2.45 +   rightMatrixFileName = param->strValue;
    2.46 +   read_Matrix_From_File( *rightMatrix, rightMatrixFileName );
    2.47 + }
    2.48 +
    2.49 +
    2.50 +void parseLineIntoRow( char *line, float32* row );
    2.51 +
    2.52 +
    2.53 + void
    2.54 +read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName )
    2.55 + { int    row, maxRead, numRows, numCols;
    2.56 +   float32 *matrixStart;
    2.57 +   size_t lineSz = 0;
    2.58 +   FILE  *file;
    2.59 +   char  *line = NULL;
    2.60 +   
    2.61 +   lineSz = 50000; //max length of line in a matrix data file
    2.62 +   line = (char *) malloc( lineSz );
    2.63 +   if( line == NULL ) printf( "no mem for matrix line" );
    2.64 +   
    2.65 +   numRows = matrixStruc->numRows;
    2.66 +   numCols = matrixStruc->numCols;
    2.67 +   matrixStart = matrixStruc->array;
    2.68 +
    2.69 +   file = fopen( matrixFileName, "r" );
    2.70 +   if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);}
    2.71 +   fseek( file, 0, SEEK_SET );
    2.72 +   for( row = 0; row < numRows; row++ )
    2.73 +    {
    2.74 +      if( feof( file ) )  printf( "file ran out too soon" );
    2.75 +      maxRead = getline( &line, &lineSz, file );
    2.76 +      if( maxRead == -1 ) printf( "prob reading mat line");
    2.77 +      
    2.78 +      if( *line == '\n') continue; //blank line
    2.79 +      if( *line == '/' ) continue; //comment line
    2.80 +      
    2.81 +      parseLineIntoRow( line, matrixStart + row * numCols );
    2.82 +    }
    2.83 +   free( line );
    2.84 + }
    2.85 +
    2.86 +/*This function relies on each line having the proper number of cols.  It
    2.87 + * doesn't check, nor enforce, so if the file is improperly formatted it
    2.88 + * can write over unrelated memory
    2.89 + */
    2.90 + void
    2.91 +parseLineIntoRow( char *line, float32* row )
    2.92 + {
    2.93 +   char *valueStr, *searchPos;
    2.94 +   
    2.95 +      //read the float values
    2.96 +   searchPos = valueStr = line; //start
    2.97 +   
    2.98 +   for( ; *searchPos != 0; searchPos++)  //bit dangerous, should use buff len
    2.99 +    {
   2.100 +      if( *searchPos == '\n' ) //last col..  relying on well-formatted file
   2.101 +       { *searchPos = 0;
   2.102 +         *row = atof( valueStr );
   2.103 +         break;                                    //end FOR loop
   2.104 +       }
   2.105 +      if( *searchPos == ',' )
   2.106 +       { *searchPos = 0;                           //mark end of string
   2.107 +         *row = (float32) atof( valueStr );
   2.108 +         row += 1;                                 //address arith
   2.109 +            //skip any spaces before digits.. use searchPos + 1 to skip the 0
   2.110 +         for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++);
   2.111 +         valueStr = searchPos + 1;
   2.112 +       }
   2.113 +    }
   2.114 + }
   2.115 +
   2.116 + //==========================================================================
   2.117 +
   2.118 +/*In the "_Flat" version of constructor, do only malloc of the top data struc
   2.119 + * and set values in that top-level.  Don't malloc any sub-structures.
   2.120 + */
   2.121 + Matrix *
   2.122 +makeMatrix_Flat( int32 numRows, int32 numCols )
   2.123 + { Matrix * retMatrix;
   2.124 +   retMatrix = malloc( sizeof( Matrix ) );
   2.125 +   retMatrix->numRows = numRows;
   2.126 +   retMatrix->numCols = numCols;
   2.127 +
   2.128 +   return retMatrix;
   2.129 + }
   2.130 +
   2.131 + Matrix *
   2.132 +makeMatrix_WithResMat( int32 numRows, int32 numCols )
   2.133 + { Matrix * retMatrix;
   2.134 +   retMatrix = malloc( sizeof( Matrix ) );
   2.135 +   retMatrix->numRows = numRows;
   2.136 +   retMatrix->numCols = numCols;
   2.137 +   retMatrix->array  = malloc( numRows * numCols * sizeof(float32) );
   2.138 +
   2.139 +   return retMatrix;
   2.140 + }
   2.141 +
   2.142 + void
   2.143 +freeMatrix_Flat( Matrix * matrix )
   2.144 + { //( matrix );
   2.145 + }
   2.146 + void
   2.147 +freeMatrix( Matrix * matrix )
   2.148 + { free( matrix->array );
   2.149 +   free( matrix );
   2.150 + }
   2.151 +
   2.152 +void
   2.153 +printMatrix( Matrix *matrix )
   2.154 + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr;
   2.155 +   float32 *matrixArray;
   2.156 +
   2.157 +   numRows = rowsToPrint = matrix->numRows;
   2.158 +   numCols = colsToPrint = matrix->numCols;
   2.159 +   matrixArray = matrix->array;
   2.160 +
   2.161 +   rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed
   2.162 +   colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed
   2.163 +   for( r = 0; r < numRows; r += rowIncr )
   2.164 +    { for( c = 0; c < numCols; c += colIncr )
   2.165 +       { printf( "%3.1f | ", matrixArray[ r * numCols + c ] );
   2.166 +       }
   2.167 +      printf("\n");
   2.168 +    }
   2.169 + }
   2.170 +
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/src/Application/Matrix_Mult.h	Wed Nov 10 22:26:57 2010 -0800
     3.3 @@ -0,0 +1,77 @@
     3.4 +/*
     3.5 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     3.6 + *  Licensed under GNU General Public License version 2
     3.7 + */
     3.8 +
     3.9 +#ifndef MATRIX_MULT_H_
    3.10 +#define MATRIX_MULT_H_
    3.11 +
    3.12 +#include <stdio.h>
    3.13 +#include <unistd.h>
    3.14 +#include <malloc.h>
    3.15 +
    3.16 +#include "../SSR_lib/VMS/VMS_primitive_data_types.h"
    3.17 +#include "ParamHelper/Param.h"
    3.18 +
    3.19 +//==============================  Structures  ==============================
    3.20 +
    3.21 +typedef
    3.22 +struct
    3.23 + { int32 numRows;
    3.24 +   int32 numCols;
    3.25 +   float32 *array;  //2D, but dynamically sized, so use addr arith
    3.26 + }
    3.27 +Matrix;
    3.28 +
    3.29 +/* This is the "appSpecificPiece" that is carried inside a DKUPiece.
    3.30 + *  In the DKUPiece data struc it is declared to be of type "void *".  This
    3.31 + *  allows the application to define any data structure it wants and put it
    3.32 + *  into a DKUPiece.
    3.33 + * When the app specific info is used, it is in app code, so it is cast to
    3.34 + *  the correct type to tell the compiler how to access fields.
    3.35 + * This keeps all app-specific things out of the DKU directory, as per the
    3.36 + *  DKU standard. */
    3.37 +typedef
    3.38 +struct
    3.39 + { 
    3.40 +      // pointers to shared data..  the result matrix must be created when the
    3.41 +      //  left and right matrices are put into the root ancestor DKUPiece.
    3.42 +   Matrix * leftMatrix;
    3.43 +   Matrix * rightMatrix;
    3.44 +   Matrix * resultMatrix;
    3.45 +
    3.46 +      // define the starting and ending boundaries for this piece of the
    3.47 +      //  result matrix.  These are derivable from the left and right
    3.48 +      //  matrices, but included them for readability of code.
    3.49 +   int prodStartRow, prodEndRow;
    3.50 +   int prodStartCol, prodEndCol;
    3.51 +      // Start and end of the portion of the left matrix that contributes to
    3.52 +      //  this piece of the product
    3.53 +   int leftStartRow, leftEndRow;
    3.54 +   int leftStartCol, leftEndCol;
    3.55 +      // Start and end of the portion of the right matrix that contributes to
    3.56 +      //  this piece of the product
    3.57 +   int rightStartRow, rightEndRow;
    3.58 +   int rightStartCol, rightEndCol;
    3.59 + }
    3.60 +MatrixProdPiece;
    3.61 +
    3.62 +//==============================  Functions  ================================
    3.63 +void readFile();
    3.64 +
    3.65 +Matrix *makeMatrix( int32 numRows, int32 numCols );
    3.66 +Matrix *makeMatrix_Flat( int32 numRows, int32 numCols );
    3.67 +Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols );
    3.68 +void    freeMatrix_Flat( Matrix * matrix );
    3.69 +void    freeMatrix( Matrix * matrix );
    3.70 +void    printMatrix( Matrix *matrix );
    3.71 +
    3.72 +void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName );
    3.73 +
    3.74 +void
    3.75 +initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
    3.76 +                              ParamBag *paramBag );
    3.77 +
    3.78 +//===========================================================================
    3.79 +
    3.80 +#endif /*MATRIX_MULT_H_*/
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/src/Application/VPThread__Matrix_Mult/Divide_Pr.c	Wed Nov 10 22:26:57 2010 -0800
     4.3 @@ -0,0 +1,597 @@
     4.4 +/*
     4.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     4.6 + *  Licensed under GNU General Public License version 2
     4.7 + *
     4.8 + * Author: seanhalle@yahoo.com
     4.9 + *
    4.10 + */
    4.11 +
    4.12 +
    4.13 +#include "SSR_Matrix_Mult.h"
    4.14 +#include <math.h>
    4.15 +#include <string.h>
    4.16 +
    4.17 +   //The time to compute this many result values should equal the time to
    4.18 +   // perform this division on a matrix of size gives that many result calcs
    4.19 +   //IE, size this so that sequential time to calc equals divide time
    4.20 +   // find the value by experimenting -- but divide time and calc time scale
    4.21 +   // same way, so this value should remain valid across hardware
    4.22 +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000
    4.23 +
    4.24 +
    4.25 +//===========================================================================
    4.26 +int inline
    4.27 +measureMatrixMultPrimitive( VirtProcr *animPr );
    4.28 +
    4.29 +SlicingStrucCarrier *
    4.30 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
    4.31 +                                 VirtProcr *animPr );
    4.32 +
    4.33 +SlicingStruc *
    4.34 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
    4.35 +                  VirtProcr *animPr );
    4.36 +
    4.37 +void
    4.38 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr );
    4.39 +
    4.40 +SubMatrix **
    4.41 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    4.42 +                   int32 numUses, Matrix *origMatrix, VirtProcr *animPr );
    4.43 +
    4.44 +void
    4.45 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    4.46 +                 SubMatrix **subMatrices, VirtProcr *animPr );
    4.47 +
    4.48 +void
    4.49 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
    4.50 +                                    SubMatrix **rightSubMatrices,
    4.51 +                                    int32 numRowIdxs, int32 numColIdxs,
    4.52 +                                    int32 numVecIdxs,
    4.53 +                                    VirtProcr *resultPr,
    4.54 +                                    VirtProcr *animatingPr );
    4.55 +
    4.56 +void
    4.57 +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix,
    4.58 +            SlicingStrucCarrier *slicingStrucCarrier,
    4.59 +            VirtProcr *resultPr, VirtProcr *animatingPr );
    4.60 +
    4.61 +
    4.62 +
    4.63 +/*Divider creates one processor for every sub-matrix
    4.64 + * It hands them:
    4.65 + *  the name of the result processor that they should send their results to,
    4.66 + *  the left and right matrices, and the rows and cols they should multiply
    4.67 + * It first creates the result processor, then all the sub-matrixPair
    4.68 + *  processors,
    4.69 + *  then does a receive of a message from the result processor that gives
    4.70 + *  the divider ownership of the result matrix.
    4.71 + * Finally, the divider returns the result matrix out of the SSR system.
    4.72 + *
    4.73 + * Divider chooses the size of sub-matrices via an algorithm that tries to
    4.74 + *  keep the minimum work above a threshold.  The threshold is machine-
    4.75 + *  dependent, so ask SSR for min work-unit time to get a
    4.76 + *  given overhead
    4.77 + *
    4.78 + * Divide min work-unit cycles by measured-cycles for one matrix-cell
    4.79 + *  product -- gives the number of products need to have in min size
    4.80 + *  matrix.
    4.81 + *
    4.82 + * So then, take cubed root of this to get the size of a side of min sub-
    4.83 + *  matrix.  That is the size of the ideal square sub-matrix -- so tile
    4.84 + *  up the two input matrices into ones as close as possible to that size,
    4.85 + *  and create the pairs of sub-matrices.
    4.86 + *
    4.87 + *========================  STRATEGIC OVERVIEW  =======================
    4.88 + *
    4.89 + *This division is a bit tricky, because have to create things in advance
    4.90 + * that it's not at first obvious need to be created..
    4.91 + *
    4.92 + *First slice up each dimension -- three of them..  this is because will have
    4.93 + * to create the sub-matrix's data-structures before pairing the sub-matrices
    4.94 + * with each other -- so, have three dimensions to slice up before can
    4.95 + * create the sub-matrix data-strucs -- also, have to be certain that the
    4.96 + * cols of the left input have the exact same slicing as the rows of the
    4.97 + * left matrix, so just to be sure, do the slicing calc once, then use it
    4.98 + * for both.
    4.99 + *
   4.100 + *So, goes like this:
   4.101 + *1) calculate the start & end values of each dimension in each matrix.
   4.102 + *2) use those values to create sub-matrix structures
   4.103 + *3) combine sub-matrices into pairs, as the tasks to perform.
   4.104 + *
   4.105 + *Have to calculate separately from creating the sub-matrices because of the
   4.106 + * nature of the nesting -- would either end up creating the same sub-matrix
   4.107 + * multiple times, or else would have to put in detection of whether had
   4.108 + * made a particular one already if tried to combine steps 1 and 2.
   4.109 + *
   4.110 + *Step 3 has to be separate because of the nesting, as well -- same reason,
   4.111 + * would either create same sub-matrix multiple times, or else have to
   4.112 + * add detection of whether was already created.
   4.113 + *
   4.114 + *Another way to look at it: there's one level of loop to divide dimensions,
   4.115 + * two levels of nesting to create sub-matrices, and three levels to pair
   4.116 + * up the sub-matrices.
   4.117 + */
   4.118 +
   4.119 +void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
   4.120 +                                        VirtProcr *animPr )
   4.121 + { VirtProcr       *resultPr;
   4.122 +   DividerParams   *dividerParams;
   4.123 +   ResultsParams   *resultsParams;
   4.124 +   Matrix          *leftMatrix, *rightMatrix, *resultMatrix;
   4.125 +   void            *msg;
   4.126 +   SlicingStrucCarrier *slicingStrucCarrier;
   4.127 +   float32         *resultArray; //points to array inside result matrix
   4.128 +   
   4.129 +         DEBUG( dbgAppFlow, "start divide\n")
   4.130 +
   4.131 +         int32
   4.132 +         divideProbe = VMS__create_single_interval_probe( "divideProbe",
   4.133 +                                                          animPr );
   4.134 +         VMS__record_sched_choice_into_probe( divideProbe, animPr );
   4.135 +         VMS__record_interval_start_in_probe( divideProbe );
   4.136 +
   4.137 +   //=========== Setup -- make local copies of ptd-to-things, malloc, aso
   4.138 +   int32 numResRows, numResCols, vectLength;
   4.139 +
   4.140 +   dividerParams   = (DividerParams *)_dividerParams;
   4.141 +   
   4.142 +   leftMatrix      = dividerParams->leftMatrix;
   4.143 +   rightMatrix     = dividerParams->rightMatrix;
   4.144 +
   4.145 +   vectLength = leftMatrix->numCols;
   4.146 +   numResRows = leftMatrix->numRows;
   4.147 +   numResCols = rightMatrix->numCols;
   4.148 +   resultArray     = dividerParams->resultMatrix->array;
   4.149 +   
   4.150 +      //zero the result array
   4.151 +   memset( resultArray, 0, numResRows * numResCols * sizeof(float32) );
   4.152 +
   4.153 +   //==============  Do either sequential mult or do division ==============
   4.154 +
   4.155 +      //Check if input matrices too small -- if yes, just do sequential
   4.156 +      //Cutoff is determined by overhead of this divider -- relatively
   4.157 +      // machine-independent
   4.158 +   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
   4.159 +       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
   4.160 +    {
   4.161 +      //====== Do sequential multiply on a single core
   4.162 +            DEBUG( dbgAppFlow, "doing sequential")
   4.163 +            
   4.164 +         //transpose the right matrix
   4.165 +      float32 *
   4.166 +      transRightArray  = SSR__malloc_to( rightMatrix->numRows * 
   4.167 +                                         rightMatrix->numCols * sizeof(float32),
   4.168 +                                         animPr );
   4.169 +
   4.170 +         //copy values from orig matrix to local
   4.171 +      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
   4.172 +                     0, 0, rightMatrix->numRows,
   4.173 +                     transRightArray, rightMatrix->array );
   4.174 +      
   4.175 +      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
   4.176 +                            leftMatrix->array, transRightArray,
   4.177 +                            resultArray );
   4.178 +    }
   4.179 +   else
   4.180 +    {
   4.181 +      //====== Do parallel multiply across cores
   4.182 +
   4.183 +         //Calc the ideal size of sub-matrix and slice up the dimensions of
   4.184 +         // the two matrices.
   4.185 +         //The ideal size is the one takes the number of cycles to calculate
   4.186 +         // such that calc time is equal or greater than min work-unit size
   4.187 +      slicingStrucCarrier =
   4.188 +         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr );
   4.189 +
   4.190 +         //Make the results processor, now that know how many to wait for
   4.191 +      resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr );
   4.192 +      resultsParams->numSubMatrixPairs  =
   4.193 +         slicingStrucCarrier->leftRowSlices->numVals *
   4.194 +         slicingStrucCarrier->rightColSlices->numVals *
   4.195 +         slicingStrucCarrier->vecSlices->numVals;
   4.196 +      resultsParams->dividerPr   = animPr;
   4.197 +      resultsParams->numCols     = rightMatrix->numCols;
   4.198 +      resultsParams->numRows     = leftMatrix->numRows;
   4.199 +      resultsParams->resultArray = resultArray;
   4.200 +
   4.201 +
   4.202 +      resultPr =
   4.203 +         SSR__create_procr_with( &gatherResults, resultsParams, animPr);
   4.204 +
   4.205 +         //Make the sub-matrices, and pair them up, and make processor to
   4.206 +         // calc product of each pair.
   4.207 +      makeSubMatricesAndProcrs( leftMatrix, rightMatrix,
   4.208 +                                    slicingStrucCarrier,
   4.209 +                                    resultPr, animPr);
   4.210 + 
   4.211 +         //result array is allocated externally, so no message from resultPr
   4.212 +         // however, do have to wait before printing out stats, so wait
   4.213 +         // for an empty handshake message
   4.214 +      msg = SSR__receive_from_to( resultPr, animPr );
   4.215 +   }
   4.216 +
   4.217 +
   4.218 +   //===============  Work done -- send results back =================
   4.219 +
   4.220 +
   4.221 +         DEBUG( dbgAppFlow, "end divide\n")
   4.222 +
   4.223 +         VMS__record_interval_end_in_probe( divideProbe );
   4.224 +         VMS__print_stats_of_all_probes();
   4.225 +
   4.226 +      //nothing left to do so dissipate, SSR will wait to shutdown and hence
   4.227 +      // make results available to outside until all the processors have
   4.228 +      // dissipated -- so no need to wait for results processor
   4.229 +
   4.230 +   SSR__dissipate_procr( animPr );  //all procrs dissipate self at end
   4.231 +      //when all of the processors have dissipated, the "create seed and do
   4.232 +      // work" call in the entry point function returns
   4.233 + }
   4.234 +
   4.235 +
   4.236 +SlicingStrucCarrier *
   4.237 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
   4.238 +                                 VirtProcr *animPr )
   4.239 + {
   4.240 +   float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
   4.241 +   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   4.242 +   SlicingStrucCarrier *slicingStrucCarrier =
   4.243 +                         SSR__malloc_to(sizeof(SlicingStrucCarrier), animPr);
   4.244 +
   4.245 +   int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
   4.246 +   float64 numPrimitiveOpsInMinWorkUnit;
   4.247 +
   4.248 +
   4.249 +   //=======  Calc ideal size of min-sized sub-matrix  ========
   4.250 +
   4.251 +      //ask SSR for the number of cycles of the minimum work unit, at given
   4.252 +      // percent overhead then add a guess at overhead from this divider
   4.253 +   minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 );
   4.254 +
   4.255 +      //ask SSR for number of cycles of the "primitive" op of matrix mult
   4.256 +   primitiveCycles = measureMatrixMultPrimitive( animPr );
   4.257 +
   4.258 +   numPrimitiveOpsInMinWorkUnit =
   4.259 +      (float64)minWorkUnitCycles / (float64)primitiveCycles;
   4.260 +
   4.261 +      //take cubed root -- that's number of these in a "side" of sub-matrix
   4.262 +      // then multiply by 5 because the primitive is 5x5
   4.263 +   idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit );
   4.264 +
   4.265 +   idealNumWorkUnits = SSR__giveIdealNumWorkUnits();
   4.266 +   
   4.267 +   idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
   4.268 +   idealSizeOfSide2 *= 0.6; //finer granularity to help load balance
   4.269 +
   4.270 +   if( idealSizeOfSide1 > idealSizeOfSide2 )
   4.271 +      idealSizeOfSide = idealSizeOfSide1;
   4.272 +   else
   4.273 +      idealSizeOfSide = idealSizeOfSide2;
   4.274 +
   4.275 +      //The multiply inner loop blocks the array to fit into L1 cache
   4.276 +//   if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK;
   4.277 +
   4.278 +   //============  Slice up dimensions, now that know target size ===========
   4.279 +
   4.280 +      //Tell the slicer the target size of a side (floating pt), the start
   4.281 +      // value to start slicing at, and the end value to stop slicing at
   4.282 +      //It returns an array of start value of each chunk, plus number of them
   4.283 +   int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol;
   4.284 +   startLeftRow  = 0;
   4.285 +   endLeftRow    = leftMatrix->numRows -1;
   4.286 +   startVec      = 0;
   4.287 +   endVec        = leftMatrix->numCols -1;
   4.288 +   startRightCol = 0;
   4.289 +   endRightCol   = rightMatrix->numCols -1;
   4.290 +
   4.291 +   leftRowSlices =
   4.292 +      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow, animPr );
   4.293 +
   4.294 +   vecSlices =
   4.295 +      sliceUpDimension( idealSizeOfSide,  startVec, endVec, animPr );
   4.296 +
   4.297 +   rightColSlices =
   4.298 +      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol,animPr);
   4.299 +
   4.300 +   slicingStrucCarrier->leftRowSlices  = leftRowSlices;
   4.301 +   slicingStrucCarrier->vecSlices      = vecSlices;
   4.302 +   slicingStrucCarrier->rightColSlices = rightColSlices;
   4.303 +
   4.304 +   return slicingStrucCarrier;
   4.305 + }
   4.306 +
   4.307 +
   4.308 +void
   4.309 +makeSubMatricesAndProcrs( Matrix    *leftMatrix, Matrix    *rightMatrix,
   4.310 +            SlicingStrucCarrier *slicingStrucCarrier,
   4.311 +            VirtProcr *resultPr,   VirtProcr *animPr )
   4.312 + {
   4.313 +   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   4.314 +   
   4.315 +   leftRowSlices  = slicingStrucCarrier->leftRowSlices;
   4.316 +   vecSlices      = slicingStrucCarrier->vecSlices;
   4.317 +   rightColSlices = slicingStrucCarrier->rightColSlices;
   4.318 +   SSR__free( slicingStrucCarrier, animPr );
   4.319 +   
   4.320 +   //================  Make sub-matrices, given the slicing  ================
   4.321 +   SubMatrix **leftSubMatrices, **rightSubMatrices;
   4.322 +   leftSubMatrices =
   4.323 +      createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals,
   4.324 +                         leftMatrix, animPr );
   4.325 +   //double_check_that_always_numRows_in_right_same_as_numCols_in_left();
   4.326 +   rightSubMatrices =
   4.327 +      createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals,
   4.328 +                         rightMatrix, animPr );
   4.329 +
   4.330 +   freeSlicingStruc( leftRowSlices, animPr );
   4.331 +   freeSlicingStruc( vecSlices, animPr );
   4.332 +   freeSlicingStruc( rightColSlices, animPr );
   4.333 +
   4.334 +   //==============  pair the sub-matrices and make processors ==============
   4.335 +   int32 numRowIdxs, numColIdxs, numVecIdxs;
   4.336 +
   4.337 +   numRowIdxs = leftRowSlices->numVals;
   4.338 +   numColIdxs = rightColSlices->numVals;
   4.339 +   numVecIdxs = vecSlices->numVals;
   4.340 +   pairUpSubMatricesAndMakeProcessors( leftSubMatrices,
   4.341 +                                       rightSubMatrices,
   4.342 +                                       numRowIdxs, numColIdxs,
   4.343 +                                       numVecIdxs,
   4.344 +                                       resultPr,
   4.345 +                                       animPr );
   4.346 + }
   4.347 +
   4.348 +
   4.349 +
   4.350 +
   4.351 +void
   4.352 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
   4.353 +                                    SubMatrix **rightSubMatrices,
   4.354 +                                    int32 numRowIdxs, int32 numColIdxs,
   4.355 +                                    int32 numVecIdxs,
   4.356 +                                    VirtProcr *resultPr,
   4.357 +                                    VirtProcr *animatingPr )
   4.358 + {
   4.359 +   int32 resRowIdx, resColIdx, vecIdx;
   4.360 +   int32 numLeftColIdxs, numRightColIdxs;
   4.361 +   int32 leftRowIdxOffset;
   4.362 +   SMPairParams *subMatrixPairParams;
   4.363 +   float32 numToPutOntoEachCore, leftOverFraction;
   4.364 +   int32 numCores, coreToScheduleOnto, numVecOnCurrCore;
   4.365 +
   4.366 +   numLeftColIdxs  = numColIdxs;
   4.367 +   numRightColIdxs = numVecIdxs;
   4.368 +
   4.369 +   numCores = SSR__give_number_of_cores_to_schedule_onto();
   4.370 +
   4.371 +   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
   4.372 +   leftOverFraction = 0;
   4.373 +   numVecOnCurrCore = 0;
   4.374 +   coreToScheduleOnto = 0;
   4.375 +
   4.376 +   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
   4.377 +    {
   4.378 +      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
   4.379 +
   4.380 +      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
   4.381 +       {
   4.382 +         
   4.383 +         for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
   4.384 +          {
   4.385 +               //Make the processor for the pair of sub-matrices
   4.386 +            subMatrixPairParams  = SSR__malloc_to( sizeof(SMPairParams),
   4.387 +                                                               animatingPr);
   4.388 +            subMatrixPairParams->leftSubMatrix  =
   4.389 +               leftSubMatrices[ leftRowIdxOffset + vecIdx ];
   4.390 +
   4.391 +            subMatrixPairParams->rightSubMatrix =
   4.392 +               rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
   4.393 +
   4.394 +            subMatrixPairParams->resultPr = resultPr;
   4.395 +
   4.396 +               //put all pairs from the same vector onto same core
   4.397 +            SSR__create_procr_with_affinity( &calcSubMatrixProduct,
   4.398 +                                             subMatrixPairParams,
   4.399 +                                             animatingPr,
   4.400 +                                             coreToScheduleOnto );
   4.401 +          }
   4.402 +
   4.403 +            //Trying to distribute the subMatrix-vectors across the cores, so
   4.404 +            // that each core gets the same number of vectors, with a max
   4.405 +            // imbalance of 1 vector more on some cores than others
   4.406 +         numVecOnCurrCore += 1;
   4.407 +         if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 )
   4.408 +          {
   4.409 +               //deal with fractional part, to ensure that imbalance is 1 max
   4.410 +               // IE, core with most has only 1 more than core with least
   4.411 +            leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore;
   4.412 +            if( leftOverFraction >= 1 )
   4.413 +             { leftOverFraction -= 1;
   4.414 +               numVecOnCurrCore = -1;
   4.415 +             }
   4.416 +            else
   4.417 +             { numVecOnCurrCore = 0;
   4.418 +             }
   4.419 +               //Move to next core, max core-value to incr to is numCores -1
   4.420 +            if( coreToScheduleOnto >= numCores -1 )
   4.421 +             { coreToScheduleOnto = 0;
   4.422 +             }
   4.423 +            else
   4.424 +             { coreToScheduleOnto += 1;
   4.425 +             }
   4.426 +          }
   4.427 + 
   4.428 +       }
   4.429 +    }
   4.430 +
   4.431 + }
   4.432 +
   4.433 +
   4.434 +
   4.435 +/*Walk through the two slice-strucs, making sub-matrix strucs as go
   4.436 + */
   4.437 +SubMatrix **
   4.438 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   4.439 +                   int32 numUses, Matrix *origMatrix, VirtProcr *animPr )
   4.440 + {
   4.441 +   int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
   4.442 +   int32 startRow, endRow, startCol, endCol;
   4.443 +   int32 *rowStartVals, *colStartVals;
   4.444 +   int32 rowOffset;
   4.445 +   SubMatrix **subMatrices, *newSubMatrix;
   4.446 +
   4.447 +   numRowIdxs = rowSlices->numVals;
   4.448 +   numColIdxs = colSlices->numVals;
   4.449 +
   4.450 +   rowStartVals = rowSlices->startVals;
   4.451 +   colStartVals = colSlices->startVals;
   4.452 +
   4.453 +   subMatrices = SSR__malloc_to(numRowIdxs * numColIdxs * sizeof(SubMatrix*),
   4.454 +                                 animPr );
   4.455 +
   4.456 +   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   4.457 +    {
   4.458 +      rowOffset = rowIdx * numColIdxs;
   4.459 +      
   4.460 +      startRow  = rowStartVals[rowIdx];
   4.461 +      endRow    = rowStartVals[rowIdx + 1] -1; //"fake" start above last is
   4.462 +                                               // at last valid idx + 1 & is
   4.463 +                                               // 1 greater than end value
   4.464 +      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
   4.465 +       {
   4.466 +         startCol = colStartVals[colIdx];
   4.467 +         endCol   = colStartVals[colIdx + 1] -1;
   4.468 +
   4.469 +         newSubMatrix = SSR__malloc_to( sizeof(SubMatrix), animPr );
   4.470 +         newSubMatrix->numRows       = endRow - startRow +1;
   4.471 +         newSubMatrix->numCols       = endCol - startCol +1;
   4.472 +         newSubMatrix->origMatrix    = origMatrix;
   4.473 +         newSubMatrix->origStartRow  = startRow;
   4.474 +         newSubMatrix->origStartCol  = startCol;
   4.475 +         newSubMatrix->alreadyCopied = FALSE;
   4.476 +         newSubMatrix->numUsesLeft   = numUses; //can free after this many
   4.477 +
   4.478 +         subMatrices[ rowOffset + colIdx ] = newSubMatrix;
   4.479 +       }
   4.480 +    }
   4.481 +   return subMatrices;
   4.482 + }
   4.483 +
   4.484 +
   4.485 +void
   4.486 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   4.487 +                 SubMatrix **subMatrices, VirtProcr *animPr )
   4.488 + {
   4.489 +   int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset;
   4.490 +   SubMatrix *subMatrix;
   4.491 +
   4.492 +   numRowIdxs = rowSlices->numVals;
   4.493 +   numColIdxs = colSlices->numVals;
   4.494 +
   4.495 +   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   4.496 +    {
   4.497 +      rowOffset = rowIdx * numColIdxs;
   4.498 +      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
   4.499 +       {
   4.500 +         subMatrix = subMatrices[ rowOffset + colIdx ];
   4.501 +         if( subMatrix->alreadyCopied )
   4.502 +            SSR__free( subMatrix->array, animPr );
   4.503 +         SSR__free( subMatrix, animPr );
   4.504 +       }
   4.505 +    }
   4.506 +   SSR__free( subMatrices, animPr );
   4.507 + }
   4.508 +
   4.509 +
   4.510 +
   4.511 +SlicingStruc *
   4.512 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
   4.513 +                  VirtProcr *animPr )
   4.514 + { float32 residualAcc = 0;
   4.515 +   int     numSlices, i, *startVals, sizeOfSlice, endCondition;
   4.516 +   SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr);
   4.517 +
   4.518 +      //calc size of matrix need to hold start vals --
   4.519 +   numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
   4.520 +
   4.521 +   startVals = SSR__malloc_to( (numSlices + 1) * sizeof(int32), animPr );
   4.522 +
   4.523 +      //Calc the upper limit of start value -- when get above this, end loop
   4.524 +      // by saving highest value of the matrix dimension to access, plus 1
   4.525 +      // as the start point of the imaginary slice following the last one
   4.526 +      //Plus 1 because go up to value but not include when process last slice
   4.527 +      //The stopping condition is half-a-size less than highest value because
   4.528 +      // don't want any pieces smaller than half the ideal size -- just tack
   4.529 +      // little ones onto end of last one
   4.530 +   endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size
   4.531 +   for( i = 0; startVal <= endVal; i++ )
   4.532 +    {
   4.533 +      startVals[i] = startVal;
   4.534 +      residualAcc += idealSizeOfSide;
   4.535 +      sizeOfSlice  = (int)residualAcc;
   4.536 +      residualAcc -= (float32)sizeOfSlice;
   4.537 +      startVal    += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..
   4.538 +
   4.539 +      if( startVal > endCondition )
   4.540 +       { startVal = endVal + 1;
   4.541 +         startVals[ i + 1 ] = startVal;
   4.542 +       }
   4.543 +    }
   4.544 +
   4.545 +   slicingStruc->startVals = startVals;
   4.546 +   slicingStruc->numVals   = i;  //loop incr'd, so == last valid start idx+1
   4.547 +                                 // which means is num sub-matrices in dim
   4.548 +                                 // also == idx of the fake start just above
   4.549 +   return slicingStruc;
   4.550 + }
   4.551 +
   4.552 +void
   4.553 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr )
   4.554 + {
   4.555 +   SSR__free( slicingStruc->startVals, animPr );
   4.556 +   SSR__free( slicingStruc, animPr );
   4.557 + }
   4.558 +
   4.559 +
   4.560 +int inline
   4.561 +measureMatrixMultPrimitive( VirtProcr *animPr )
   4.562 + {
   4.563 +   int r, c, v, numCycles;
   4.564 +   float32 *res, *left, *right;
   4.565 +
   4.566 +      //setup inputs
   4.567 +   left  = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
   4.568 +   right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
   4.569 +   res   = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
   4.570 +
   4.571 +   for( r = 0; r < 5; r++ )
   4.572 +    {
   4.573 +      for( c = 0; c < 5; c++ )
   4.574 +       {
   4.575 +         left[  r * 5 + c ] = r;
   4.576 +         right[ r * 5 + c ] = c;
   4.577 +       }
   4.578 +    }
   4.579 +
   4.580 +      //do primitive
   4.581 +   SSR__start_primitive();  //for now, just takes time stamp
   4.582 +   for( r = 0; r < 5; r++ )
   4.583 +    {
   4.584 +      for( c = 0; c < 5; c++ )
   4.585 +       {
   4.586 +         for( v = 0; v < 5; v++ )
   4.587 +          {
   4.588 +            res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ];
   4.589 +          }
   4.590 +       }
   4.591 +    }
   4.592 +   numCycles =
   4.593 +      SSR__end_primitive_and_give_cycles();
   4.594 +
   4.595 +   SSR__free( left, animPr );
   4.596 +   SSR__free( right, animPr );
   4.597 +   SSR__free( res, animPr );
   4.598 +
   4.599 +   return numCycles;
   4.600 + }
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/src/Application/VPThread__Matrix_Mult/EntryPoint.c	Wed Nov 10 22:26:57 2010 -0800
     5.3 @@ -0,0 +1,62 @@
     5.4 +/*
     5.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     5.6 + *  Licensed under GNU General Public License version 2
     5.7 + *
     5.8 + * Author: seanhalle@yahoo.com
     5.9 + *
    5.10 + */
    5.11 +
    5.12 +#include <math.h>
    5.13 +
    5.14 +#include "SSR_Matrix_Mult.h"
    5.15 +
    5.16 +
    5.17 +
    5.18 +/*Every SSR system has an "entry point" function that creates the first
    5.19 + * processor, which starts the chain of creating more processors..
    5.20 + * eventually all of the processors will dissipate themselves, and
    5.21 + * return.
    5.22 + *
    5.23 + *This entry-point function follows the same pattern as all entry-point
    5.24 + * functions do:
    5.25 + *1) it creates the params for the seed processor, from the
    5.26 + *    parameters passed into the entry-point function
    5.27 + *2) it calls SSR__create_seed_procr_and_do_work
    5.28 + *3) it gets the return value from the params struc, frees the params struc,
    5.29 + *    and returns the value from the function
    5.30 + *
    5.31 + */
    5.32 +Matrix *
    5.33 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
    5.34 + { Matrix          *resMatrix;
    5.35 +   DividerParams   *dividerParams;
    5.36 +   int32            numResRows, numResCols;
    5.37 +
    5.38 +
    5.39 +   dividerParams              = malloc( sizeof( DividerParams ) );
    5.40 +   dividerParams->leftMatrix  = leftMatrix;
    5.41 +   dividerParams->rightMatrix = rightMatrix;
    5.42 +
    5.43 +
    5.44 +   numResRows  = leftMatrix->numRows;
    5.45 +   numResCols  = rightMatrix->numCols;
    5.46 +
    5.47 +      //VMS has its own separate internal malloc, so to get results out,
    5.48 +      // have to pass in empty array for it to fill up
    5.49 +      //The alternative is internally telling SSR make external space to use
    5.50 +   resMatrix            = malloc( sizeof(Matrix) );
    5.51 +   resMatrix->array     = malloc( numResRows * numResCols * sizeof(float32));
    5.52 +   resMatrix->numCols   = rightMatrix->numCols;
    5.53 +   resMatrix->numRows   = leftMatrix->numRows;
    5.54 +
    5.55 +
    5.56 +   dividerParams->resultMatrix   = resMatrix;
    5.57 +
    5.58 +      //create divider processor, start doing the work, and wait till done
    5.59 +      //This function is the "border crossing" between normal code and SSR
    5.60 +   SSR__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs,
    5.61 +                                       dividerParams );
    5.62 +   
    5.63 +   free( dividerParams );
    5.64 +   return resMatrix;
    5.65 + }
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/src/Application/VPThread__Matrix_Mult/Result_Pr.c	Wed Nov 10 22:26:57 2010 -0800
     6.3 @@ -0,0 +1,108 @@
     6.4 +/*
     6.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     6.6 + *  Licensed under GNU General Public License version 2
     6.7 + *
     6.8 + * Author: seanhalle@yahoo.com
     6.9 + *
    6.10 + */
    6.11 +
    6.12 +#include "SSR_Matrix_Mult.h"
    6.13 +
    6.14 +//=====================
    6.15 +void inline
    6.16 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
    6.17 +                  int32    startRow,
    6.18 +                  int32    numRows,
    6.19 +                  int32    startCol,
    6.20 +                  int32    numCols,
    6.21 +                  int32    numOrigCols );
    6.22 +
    6.23 +//===========================================================================
    6.24 +
    6.25 +/*The Result Processor gets a message from each of the vector processors,
    6.26 + * puts the result from the message in its location in the result-
    6.27 + * matrix, and increments the count of results.
    6.28 + *
    6.29 + *After the count reaches the point that all results have been received, it
    6.30 + * returns the result matrix and dissipates.
    6.31 + */
    6.32 +void gatherResults( void *_params, VirtProcr *animatingPr )
    6.33 + { VirtProcr *dividerPr;
    6.34 +   ResultsParams  *params;
    6.35 +   int             row, col, numRows, numCols, numSubMatrixPairs, count=0;
    6.36 +   float32        *resultArray;
    6.37 +   void           *msg;
    6.38 +   SMPairParams   *resParams;
    6.39 +
    6.40 +         DEBUG( dbgAppFlow, "start resultPr\n")
    6.41 +         
    6.42 +   params    = (ResultsParams *)_params;
    6.43 +   dividerPr = params->dividerPr;
    6.44 +   numSubMatrixPairs = params->numSubMatrixPairs;
    6.45 +   numRows = params->numRows;
    6.46 +   numCols = params->numCols;
    6.47 +
    6.48 +   resultArray = params->resultArray;
    6.49 +
    6.50 +
    6.51 +   while( count < numSubMatrixPairs )
    6.52 +    {
    6.53 +      msg = SSR__receive_type_to( RESULTS_MSG, animatingPr );
    6.54 +
    6.55 +      resParams = (SMPairParams *)msg;
    6.56 +      accumulateResult( resultArray, resParams->partialResultArray,
    6.57 +                        resParams->leftSubMatrix->origStartRow,
    6.58 +                        resParams->leftSubMatrix->numRows,
    6.59 +                        resParams->rightSubMatrix->origStartCol,
    6.60 +                        resParams->rightSubMatrix->numCols,
    6.61 +                        resParams->rightSubMatrix->origMatrix->numCols );
    6.62 +
    6.63 +      SSR__free( resParams->partialResultArray, animatingPr );
    6.64 +      
    6.65 +         //there is only one copy of results procr, so can update numUsesLeft
    6.66 +         // without concurrency worries.  When zero, free the sub-matrix
    6.67 +      resParams->leftSubMatrix->numUsesLeft -= 1;
    6.68 +      if( resParams->leftSubMatrix->numUsesLeft == 0 )
    6.69 +       {
    6.70 +         SSR__free( resParams->leftSubMatrix->array, animatingPr );
    6.71 +         SSR__free( resParams->leftSubMatrix, animatingPr );
    6.72 +       }
    6.73 +
    6.74 +      resParams->rightSubMatrix->numUsesLeft -= 1;
    6.75 +      if( resParams->rightSubMatrix->numUsesLeft == 0 )
    6.76 +       {
    6.77 +         SSR__free( resParams->rightSubMatrix->array, animatingPr );
    6.78 +         SSR__free( resParams->rightSubMatrix, animatingPr );
    6.79 +       }
    6.80 +
    6.81 +         //count of how many sub-matrix pairs accumulated so know when done
    6.82 +      count++;
    6.83 +    }
    6.84 +
    6.85 +      //Done -- could just dissipate -- SSR will wait for all processors to
    6.86 +      // dissipate before shutting down, and thereby making results avaial to
    6.87 +      // outside, so no need to stop the divider from dissipating, so no need
    6.88 +      // to send a hand-shake message to it -- bug makes debug easier
    6.89 +   SSR__send_from_to( NULL, animatingPr, dividerPr );
    6.90 +   SSR__dissipate_procr( animatingPr );  //frees any data owned by procr
    6.91 + }
    6.92 +
    6.93 +void inline
    6.94 +accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray,
    6.95 +                  int32    startRow,
    6.96 +                  int32    numRows,
    6.97 +                  int32    startCol,
    6.98 +                  int32    numCols,
    6.99 +                  int32    numOrigCols )
   6.100 + { int32 row, col;
   6.101 +
   6.102 +   for( row = 0; row < numRows; row++ )
   6.103 +    {
   6.104 +      for( col = 0; col < numCols; col++ )
   6.105 +       {
   6.106 +         resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] +=
   6.107 +            subMatrixPairResultArray[ row * numCols + col ];
   6.108 +       }
   6.109 +    }
   6.110 +
   6.111 + }
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h	Wed Nov 10 22:26:57 2010 -0800
     7.3 @@ -0,0 +1,95 @@
     7.4 +/*
     7.5 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     7.6 + *  Licensed under GNU General Public License version 2
     7.7 + */
     7.8 +
     7.9 +#ifndef _SSR_MATRIX_MULT_H_
    7.10 +#define _SSR_MATRIX_MULT_H_
    7.11 +
    7.12 +#include <stdio.h>
    7.13 +
    7.14 +#include "../../SSR_lib/SSR.h"
    7.15 +#include "../Matrix_Mult.h"
    7.16 +
    7.17 +
    7.18 +//===============================  Defines  ==============================
    7.19 +#define ROWS_IN_BLOCK 32
    7.20 +#define COLS_IN_BLOCK 32
    7.21 +#define VEC_IN_BLOCK  32
    7.22 +
    7.23 +#define copyMatrixSingleton 1
    7.24 +#define copyTransposeSingleton 2
    7.25 +
    7.26 +//==============================  Structures  ==============================
    7.27 +typedef struct
    7.28 + {
    7.29 +   Matrix *leftMatrix;
    7.30 +   Matrix *rightMatrix;
    7.31 +   Matrix *resultMatrix;
    7.32 + }
    7.33 +DividerParams;
    7.34 +
    7.35 +typedef struct
    7.36 + {
    7.37 +   VirtProcr *dividerPr;
    7.38 +   int numRows;
    7.39 +   int numCols;
    7.40 +   int numSubMatrixPairs;
    7.41 +   float32 *resultArray;
    7.42 + }
    7.43 +ResultsParams;
    7.44 +
    7.45 +typedef
    7.46 +struct
    7.47 + { int32    numRows;
    7.48 +   int32    numCols;
    7.49 +   Matrix  *origMatrix;
    7.50 +   int32    origStartRow;
    7.51 +   int32    origStartCol;
    7.52 +   int32    alreadyCopied;
    7.53 +   int32    numUsesLeft; //have update via message to avoid multiple writers
    7.54 +   float32 *array;  //2D, but dynamically sized, so use addr arith
    7.55 + }
    7.56 +SubMatrix;
    7.57 +
    7.58 +typedef struct
    7.59 + { VirtProcr *resultPr;
    7.60 +   SubMatrix *leftSubMatrix;
    7.61 +   SubMatrix *rightSubMatrix;
    7.62 +   float32   *partialResultArray;
    7.63 + }
    7.64 +SMPairParams;
    7.65 +
    7.66 +typedef
    7.67 +struct
    7.68 + { int32    numVals;
    7.69 +   int32   *startVals;
    7.70 + }
    7.71 +SlicingStruc;
    7.72 +
    7.73 +typedef
    7.74 +struct
    7.75 + {
    7.76 +   SlicingStruc *leftRowSlices;
    7.77 +   SlicingStruc *vecSlices;
    7.78 +   SlicingStruc *rightColSlices;
    7.79 + }
    7.80 +SlicingStrucCarrier;
    7.81 +
    7.82 +enum MMMsgType
    7.83 + {
    7.84 +   RESULTS_MSG = 1
    7.85 + };
    7.86 +
    7.87 +//============================= Processor Functions =========================
    7.88 +void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr );
    7.89 +void calcSubMatrixProduct(        void *data, VirtProcr *animatingPr );
    7.90 +void gatherResults(     void *data, VirtProcr *animatingPr );
    7.91 +
    7.92 +
    7.93 +//================================ Entry Point ==============================
    7.94 +Matrix *
    7.95 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
    7.96 +
    7.97 +
    7.98 +#endif /*_SSR_MATRIX_MULT_H_*/
     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c	Wed Nov 10 22:26:57 2010 -0800
     8.3 @@ -0,0 +1,299 @@
     8.4 +/* 
     8.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     8.6 + *  Licensed under GNU General Public License version 2
     8.7 + *
     8.8 + * Author: SeanHalle@yahoo.com
     8.9 + *
    8.10 + */
    8.11 +
    8.12 +#include <string.h>
    8.13 +
    8.14 +#include "SSR_Matrix_Mult.h"
    8.15 +
    8.16 +
    8.17 +
    8.18 +void inline
    8.19 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
    8.20 +
    8.21 +void inline
    8.22 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
    8.23 +
    8.24 +void inline
    8.25 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
    8.26 +                     float32 *resArray,
    8.27 +                     int startRow,  int endRow,
    8.28 +                     int startCol,  int endCol,
    8.29 +                     int startVec,  int endVec,
    8.30 +                     int resStride, int inpStride );
    8.31 +
    8.32 +void inline
    8.33 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols,
    8.34 +                      float32 *leftArray, float32 *rightArray,
    8.35 +                      float32 *resArray );
    8.36 +
    8.37 +
    8.38 +/*A  processor is created with an environment that holds two matrices,
    8.39 + * the row and col that it owns, and the name of a result gathering
    8.40 + * processor.
    8.41 + *It calculates the product of two sub-portions of the input matrices
    8.42 + * by using Intel's mkl library for single-core.
    8.43 + *
    8.44 + *This demonstrates using optimized single-threaded code inside scheduled
    8.45 + * work-units.
    8.46 + *
    8.47 + *When done, it sends the result to the result processor
    8.48 + */
    8.49 +void
    8.50 +calcSubMatrixProduct( void *data, VirtProcr *animatingPr )
    8.51 + { 
    8.52 +   SMPairParams   *params;
    8.53 +   VirtProcr      *resultPr;
    8.54 +   float32        *leftArray,  *rightArray, *resArray;
    8.55 +   SubMatrix      *leftSubMatrix, *rightSubMatrix;
    8.56 +
    8.57 +         DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID)
    8.58 +         int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx",
    8.59 +                                                                animatingPr);
    8.60 +         VMS__record_sched_choice_into_probe( subMatrixProbe, animatingPr );
    8.61 +         VMS__record_interval_start_in_probe( subMatrixProbe );
    8.62 +
    8.63 +   params         = (SMPairParams *)data;
    8.64 +   resultPr       = params->resultPr;
    8.65 +   leftSubMatrix  = params->leftSubMatrix;
    8.66 +   rightSubMatrix = params->rightSubMatrix;
    8.67 +
    8.68 +      //make sure the input sub-matrices have been copied out of orig
    8.69 +      //do it here, inside sub-matrix pair to hopefully gain reuse in cache
    8.70 +   copyFromOrig( leftSubMatrix, animatingPr );
    8.71 +   copyTransposeFromOrig( rightSubMatrix, animatingPr );
    8.72 +   
    8.73 +   leftArray      = leftSubMatrix->array;
    8.74 +   rightArray     = rightSubMatrix->array;
    8.75 +
    8.76 +   int32
    8.77 +   resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32);
    8.78 +   resArray = SSR__malloc_to( resSize, animatingPr );
    8.79 +   memset( resArray, 0, resSize );
    8.80 +
    8.81 +
    8.82 +   int32 numResRows, numResCols, vectLength;
    8.83 +   
    8.84 +   vectLength = leftSubMatrix->numCols;
    8.85 +   numResRows = leftSubMatrix->numRows;
    8.86 +   numResCols = rightSubMatrix->numCols;
    8.87 +
    8.88 +   multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
    8.89 +                         leftArray,  rightArray,
    8.90 +                         resArray );
    8.91 +
    8.92 +   //send result to result processor
    8.93 +   params->partialResultArray = resArray;
    8.94 +
    8.95 +         VMS__record_interval_end_in_probe( subMatrixProbe );
    8.96 +
    8.97 +   SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr );
    8.98 +   SSR__dissipate_procr( animatingPr );
    8.99 + }
   8.100 +
   8.101 +
   8.102 +
   8.103 +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into
   8.104 + * the 32KB L1 cache.
   8.105 + *Would be nice to embed this within another level that divided into
   8.106 + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
   8.107 + *
   8.108 + *Eventually want these divisions to be automatic, using DKU pattern
   8.109 + * embedded into VMS and exposed in the language, and with VMS controlling the
   8.110 + * divisions according to the cache sizes, which it knows about.
   8.111 + *Also, want VMS to work with language to split among main-mems, so a socket
   8.112 + * only cranks on data in its local segment of main mem
   8.113 + *
   8.114 + *So, outer two loops determine start and end points within the result matrix.
   8.115 + * Inside that, a loop dets the start and end points along the shared dimensions
   8.116 + * of the two input matrices.
   8.117 + */
   8.118 +void inline
   8.119 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
   8.120 +                                int32 numResCols,
   8.121 +                                float32 *leftArray, float32 *rightArray,
   8.122 +                                float32 *resArray )
   8.123 + {
   8.124 +   int resStride, inpStride;
   8.125 +   int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
   8.126 +
   8.127 +   resStride  = numResCols;
   8.128 +   inpStride  = vecLength;
   8.129 +
   8.130 +   for( resStartRow = 0; resStartRow < numResRows; )
   8.131 +    {
   8.132 +      resEndRow = resStartRow + ROWS_IN_BLOCK -1;  //start at zero, so -1
   8.133 +      if( resEndRow > numResRows ) resEndRow = numResRows -1;
   8.134 +
   8.135 +      for( resStartCol = 0; resStartCol < numResCols; )
   8.136 +       {
   8.137 +         resEndCol   = resStartCol + COLS_IN_BLOCK -1;
   8.138 +         if( resEndCol > numResCols ) resEndCol = numResCols -1;
   8.139 +
   8.140 +         for( startVec = 0; startVec < vecLength; )
   8.141 +          {
   8.142 +            endVec   = startVec + VEC_IN_BLOCK -1;
   8.143 +            if( endVec > vecLength ) endVec = vecLength -1;
   8.144 +
   8.145 +               //By having the "vector" of sub-blocks in a sub-block slice
   8.146 +               // be marched down in inner loop, are re-using the result
   8.147 +               // matrix, which stays in L1 cache and re-using the left sub-mat
   8.148 +               // which repeats for each right sub-mat -- can only re-use two of
   8.149 +               // the three, so result is the most important -- avoids writing
   8.150 +               // dirty blocks until those result-locations fully done
   8.151 +               //Row and Col is position in result matrix -- so row and vec
   8.152 +               // for left array, then vec and col for right array
   8.153 +            multiplySubBlocksTransposed( leftArray, rightArray,
   8.154 +                                         resArray,
   8.155 +                                         resStartRow,  resEndRow,
   8.156 +                                         resStartCol,  resEndCol,
   8.157 +                                         startVec,  endVec,
   8.158 +                                         resStride, inpStride );
   8.159 +            startVec = endVec +1;
   8.160 +          }
   8.161 +         resStartCol = resEndCol +1;
   8.162 +       }
   8.163 +      resStartRow = resEndRow +1;
   8.164 +    }
   8.165 + }
   8.166 +
   8.167 +
   8.168 +
   8.169 +void inline
   8.170 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
   8.171 +                     float32 *resArray,
   8.172 +                     int resStartRow,  int resEndRow,
   8.173 +                     int resStartCol,  int resEndCol,
   8.174 +                     int startVec,  int endVec,
   8.175 +                     int resStride, int inpStride )
   8.176 + {
   8.177 +   int resRow,     resCol,        vec;
   8.178 +   int leftOffset, rightOffset;
   8.179 +   float32 result;
   8.180 +
   8.181 +      //The result row is used only for the left matrix, res col for the right
   8.182 +   for( resCol = resStartCol; resCol <= resEndCol; resCol++ )
   8.183 +    {
   8.184 +      for( resRow = resStartRow; resRow <= resEndRow; resRow++ )
   8.185 +       {
   8.186 +         leftOffset  = resRow * inpStride;//left & right inp strides always same
   8.187 +         rightOffset = resCol * inpStride;// because right is transposed
   8.188 +         result = 0;
   8.189 +         for( vec = startVec; vec <= endVec; vec++ )
   8.190 +          {
   8.191 +            result +=
   8.192 +               leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
   8.193 +          }
   8.194 +
   8.195 +         resArray[ resRow * resStride + resCol ] += result;
   8.196 +       }
   8.197 +    }
   8.198 + }
   8.199 +
   8.200 +
   8.201 +
   8.202 +
   8.203 +/*Reuse this in divider when do the sequential multiply case
   8.204 + */
   8.205 +void inline
   8.206 +copyTranspose( int32 numRows, int32 numCols,
   8.207 +               int32 origStartRow, int32 origStartCol, int32 origStride,
   8.208 +               float32 *subArray, float32 *origArray )
   8.209 + { int32 stride = numRows;
   8.210 + 
   8.211 +   int row, col, origOffset;
   8.212 +   for( row = 0; row < numRows; row++ )
   8.213 +    {
   8.214 +      origOffset = (row + origStartRow) * origStride + origStartCol;
   8.215 +      for( col = 0; col < numCols; col++ )
   8.216 +       {
   8.217 +            //transpose means swap row & col -- traverse orig matrix normally
   8.218 +            // but put into reversed place in local array -- means the
   8.219 +            // stride is the numRows now, so col * numRows + row
   8.220 +         subArray[ col * stride + row ]  =  origArray[ origOffset + col ];
   8.221 +       }
   8.222 +    }
   8.223 + }
   8.224 +
   8.225 +void inline
   8.226 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   8.227 + { int numCols, numRows, origStartRow, origStartCol, origStride, stride;
   8.228 +   Matrix *origMatrix;
   8.229 +   float32 *origArray, *subArray;
   8.230 +
   8.231 +   if( subMatrix->alreadyCopied ) return;
   8.232 +   SSR__start_singleton( copyMatrixSingleton, &&EndOfTransSingleton, animPr);
   8.233 +
   8.234 +   origMatrix   = subMatrix->origMatrix;
   8.235 +   origArray    = origMatrix->array;
   8.236 +   numCols      = subMatrix->numCols;
   8.237 +   numRows      = subMatrix->numRows;
   8.238 +   origStartRow = subMatrix->origStartRow;
   8.239 +   origStartCol = subMatrix->origStartCol;
   8.240 +   origStride   = origMatrix->numCols;
   8.241 +
   8.242 +   subArray     = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr);
   8.243 +   subMatrix->array = subArray;
   8.244 +
   8.245 +      //copy values from orig matrix to local
   8.246 +   copyTranspose( numRows, numCols,
   8.247 +                  origStartRow, origStartCol, origStride,
   8.248 +                  subArray, origArray );
   8.249 +
   8.250 +   subMatrix->alreadyCopied = TRUE; //must be last thing before label
   8.251 +   EndOfTransSingleton:
   8.252 +   return;
   8.253 + }
   8.254 +
   8.255 +
   8.256 +void inline
   8.257 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   8.258 + { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
   8.259 +   Matrix *origMatrix;
   8.260 +   float32 *origArray, *subArray;
   8.261 +
   8.262 +
   8.263 +      //This lets only a single VP execute the code between start and
   8.264 +      // end -- using start and end so that work runs outside the master.
   8.265 +      //Inside, if a second VP ever executes the start, it will be returned
   8.266 +      // from the end-point.
   8.267 +      //Note, for non-GCC, can add a second SSR call at the end, and inside
   8.268 +      // that one, look at the stack at the return addr & save that in an
   8.269 +      // array indexed by singletonID
   8.270 +   if( subMatrix->alreadyCopied ) return;
   8.271 +   SSR__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton, animPr );
   8.272 +
   8.273 +
   8.274 +   origMatrix    = subMatrix->origMatrix;
   8.275 +   origArray     = origMatrix->array;
   8.276 +   numCols       = subMatrix->numCols;
   8.277 +   numRows       = subMatrix->numRows;
   8.278 +   origStartRow  = subMatrix->origStartRow;
   8.279 +   origStartCol  = subMatrix->origStartCol;
   8.280 +   origStride    = origMatrix->numCols;
   8.281 +
   8.282 +   subArray     = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr);
   8.283 +   subMatrix->array = subArray;
   8.284 +
   8.285 +      //copy values from orig matrix to local
   8.286 +   stride        = numCols;
   8.287 +
   8.288 +   int row, col, offset, origOffset;
   8.289 +   for( row = 0; row < numRows; row++ )
   8.290 +    {
   8.291 +      offset     = row * stride;
   8.292 +      origOffset = (row + origStartRow) * origStride + origStartCol;
   8.293 +      for( col = 0; col < numCols; col++ )
   8.294 +       {
   8.295 +         subArray[ offset + col ]  =  origArray[ origOffset + col ];
   8.296 +       }
   8.297 +    }
   8.298 +
   8.299 +   subMatrix->alreadyCopied = TRUE; //must be last thing before label
   8.300 +   EndOfCopySingleton:
   8.301 +   return;
   8.302 + }
     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/src/Application/main.c	Wed Nov 10 22:26:57 2010 -0800
     9.3 @@ -0,0 +1,36 @@
     9.4 +/*
     9.5 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     9.6 + *  Licensed under GNU General Public License version 2
     9.7 + *
     9.8 + * author seanhalle@yahoo.com
     9.9 + */
    9.10 +
    9.11 +#include <malloc.h>
    9.12 +#include <stdlib.h>
    9.13 +
    9.14 +#include "Matrix_Mult.h"
    9.15 +#include "SSR_Matrix_Mult/SSR_Matrix_Mult.h"
    9.16 +
    9.17 +/**
    9.18 + *Matrix multiply program written using VMS_HW piggy-back language
    9.19 + * 
    9.20 + */
    9.21 +int main( int argc, char **argv )
    9.22 + { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
    9.23 +   ParamBag    *paramBag;
    9.24 +   
    9.25 +   printf( "arguments: %s | %s\n", argv[0], argv[1] );
    9.26 +
    9.27 +   paramBag = makeParamBag();
    9.28 +   readParamFileIntoBag( argv[1], paramBag );
    9.29 +   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
    9.30 +   
    9.31 +   resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix );
    9.32 +
    9.33 +   printf("\nresult matrix: \n");
    9.34 +   printMatrix( resultMatrix );
    9.35 +   
    9.36 +//   SSR__print_stats();
    9.37 +   
    9.38 +   exit(0); //cleans up
    9.39 + }