Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > SSR > SSR__Blocked_Matrix_Mult__Bench

changeset 10:387f3084d9bb
Changed dir structure to new project structure
author: Me@portablequad
date: Tue, 07 Feb 2012 14:07:38 -0800
parents: 95c02c4ad998
children: ca572fdc9a80
files: .hgeol Matrix_Mult.c Matrix_Mult.h SSR_Matrix_Mult/Divide_Pr.c SSR_Matrix_Mult/EntryPoint.c SSR_Matrix_Mult/Result_Pr.c SSR_Matrix_Mult/SSR_Matrix_Mult.h SSR_Matrix_Mult/subMatrix_Pr.c main.c src/Application/Matrix_Mult.c src/Application/Matrix_Mult.h src/Application/SSR_Matrix_Mult/Divide_Pr.c src/Application/SSR_Matrix_Mult/EntryPoint.c src/Application/SSR_Matrix_Mult/Result_Pr.c src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h src/Application/SSR_Matrix_Mult/subMatrix_Pr.c src/Application/main.c
diffstat: 17 files changed, 1484 insertions(+), 1470 deletions(-) [+]
[-]

.hgeol 14

Matrix_Mult.c 167

Matrix_Mult.h 77

SSR_Matrix_Mult/Divide_Pr.c 603

SSR_Matrix_Mult/EntryPoint.c 62

SSR_Matrix_Mult/Result_Pr.c 108

SSR_Matrix_Mult/SSR_Matrix_Mult.h 97

SSR_Matrix_Mult/subMatrix_Pr.c 319

main.c 37

src/Application/Matrix_Mult.c 167

src/Application/Matrix_Mult.h 77

src/Application/SSR_Matrix_Mult/Divide_Pr.c 603

src/Application/SSR_Matrix_Mult/EntryPoint.c 62

src/Application/SSR_Matrix_Mult/Result_Pr.c 108

src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h 97

src/Application/SSR_Matrix_Mult/subMatrix_Pr.c 319

src/Application/main.c 37 .hgeol 14 Matrix_Mult.c 167 Matrix_Mult.h 77 SSR_Matrix_Mult/Divide_Pr.c 603 SSR_Matrix_Mult/EntryPoint.c 62 SSR_Matrix_Mult/Result_Pr.c 108 SSR_Matrix_Mult/SSR_Matrix_Mult.h 97 SSR_Matrix_Mult/subMatrix_Pr.c 319 main.c 37 src/Application/Matrix_Mult.c 167 src/Application/Matrix_Mult.h 77 src/Application/SSR_Matrix_Mult/Divide_Pr.c 603 src/Application/SSR_Matrix_Mult/EntryPoint.c 62 src/Application/SSR_Matrix_Mult/Result_Pr.c 108 src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h 97 src/Application/SSR_Matrix_Mult/subMatrix_Pr.c 319 src/Application/main.c 37
.hgeol 14
Matrix_Mult.c 167
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/.hgeol	Tue Feb 07 14:07:38 2012 -0800
     1.3 @@ -0,0 +1,14 @@
     1.4 +
     1.5 +[patterns]
     1.6 +**.py = native
     1.7 +**.txt = native
     1.8 +**.c = native
     1.9 +**.h = native
    1.10 +**.cpp = native
    1.11 +**.java = native
    1.12 +**.class = bin
    1.13 +**.jar = bin
    1.14 +**.sh = native
    1.15 +**.pl = native
    1.16 +**.jpg = bin
    1.17 +**.gif = bin

     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/Matrix_Mult.c	Tue Feb 07 14:07:38 2012 -0800
     2.3 @@ -0,0 +1,167 @@
     2.4 +/*
     2.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     2.6 + *  Licensed under GNU General Public License version 2
     2.7 + *
     2.8 + * Author: seanhalle@yahoo.com
     2.9 + *
    2.10 + * Created on November 15, 2009, 2:35 AM
    2.11 + */
    2.12 +
    2.13 +#include <malloc.h>
    2.14 +#include <stdlib.h>
    2.15 +
    2.16 +#include "Matrix_Mult.h"
    2.17 +#include "ParamHelper/Param.h"
    2.18 +
    2.19 +
    2.20 + 
    2.21 + void
    2.22 +initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
    2.23 +                               ParamBag *paramBag )
    2.24 + { char *leftMatrixFileName, *rightMatrixFileName;
    2.25 +   int   leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols;
    2.26 +   
    2.27 +      ParamStruc *param;
    2.28 +      param = getParamFromBag( "leftMatrixRows", paramBag );
    2.29 +   leftMatrixRows = param->intValue;
    2.30 +      param = getParamFromBag( "leftMatrixCols", paramBag );
    2.31 +   leftMatrixCols = param->intValue;
    2.32 +   *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols );
    2.33 +   
    2.34 +      param = getParamFromBag( "leftMatrixFileName", paramBag );
    2.35 +   leftMatrixFileName = param->strValue;  //no need to copy
    2.36 +   read_Matrix_From_File( *leftMatrix,  leftMatrixFileName );
    2.37 +   
    2.38 +      param = getParamFromBag( "rightMatrixRows", paramBag );
    2.39 +   rightMatrixRows = param->intValue;
    2.40 +      param = getParamFromBag( "rightMatrixCols", paramBag );
    2.41 +   rightMatrixCols = param->intValue;
    2.42 +   *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols );
    2.43 +   
    2.44 +      param = getParamFromBag( "rightMatrixFileName", paramBag );
    2.45 +   rightMatrixFileName = param->strValue;
    2.46 +   read_Matrix_From_File( *rightMatrix, rightMatrixFileName );
    2.47 + }
    2.48 +
    2.49 +
    2.50 +void parseLineIntoRow( char *line, float32* row );
    2.51 +
    2.52 +
    2.53 + void
    2.54 +read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName )
    2.55 + { int    row, maxRead, numRows, numCols;
    2.56 +   float32 *matrixStart;
    2.57 +   size_t lineSz = 0;
    2.58 +   FILE  *file;
    2.59 +   char  *line = NULL;
    2.60 +   
    2.61 +   lineSz = 50000; //max length of line in a matrix data file
    2.62 +   line = (char *) malloc( lineSz );
    2.63 +   if( line == NULL ) printf( "no mem for matrix line" );
    2.64 +   
    2.65 +   numRows = matrixStruc->numRows;
    2.66 +   numCols = matrixStruc->numCols;
    2.67 +   matrixStart = matrixStruc->array;
    2.68 +
    2.69 +   file = fopen( matrixFileName, "r" );
    2.70 +   if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);}
    2.71 +   fseek( file, 0, SEEK_SET );
    2.72 +   for( row = 0; row < numRows; row++ )
    2.73 +    {
    2.74 +      if( feof( file ) )  printf( "file ran out too soon" );
    2.75 +      maxRead = getline( &line, &lineSz, file );
    2.76 +      if( maxRead == -1 ) printf( "prob reading mat line");
    2.77 +      
    2.78 +      if( *line == '\n') continue; //blank line
    2.79 +      if( *line == '/' ) continue; //comment line
    2.80 +      
    2.81 +      parseLineIntoRow( line, matrixStart + row * numCols );
    2.82 +    }
    2.83 +   free( line );
    2.84 + }
    2.85 +
    2.86 +/*This function relies on each line having the proper number of cols.  It
    2.87 + * doesn't check, nor enforce, so if the file is improperly formatted it
    2.88 + * can write over unrelated memory
    2.89 + */
    2.90 + void
    2.91 +parseLineIntoRow( char *line, float32* row )
    2.92 + {
    2.93 +   char *valueStr, *searchPos;
    2.94 +   
    2.95 +      //read the float values
    2.96 +   searchPos = valueStr = line; //start
    2.97 +   
    2.98 +   for( ; *searchPos != 0; searchPos++)  //bit dangerous, should use buff len
    2.99 +    {
   2.100 +      if( *searchPos == '\n' ) //last col..  relying on well-formatted file
   2.101 +       { *searchPos = 0;
   2.102 +         *row = atof( valueStr );
   2.103 +         break;                                    //end FOR loop
   2.104 +       }
   2.105 +      if( *searchPos == ',' )
   2.106 +       { *searchPos = 0;                           //mark end of string
   2.107 +         *row = (float32) atof( valueStr );
   2.108 +         row += 1;                                 //address arith
   2.109 +            //skip any spaces before digits.. use searchPos + 1 to skip the 0
   2.110 +         for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++);
   2.111 +         valueStr = searchPos + 1;
   2.112 +       }
   2.113 +    }
   2.114 + }
   2.115 +
   2.116 + //==========================================================================
   2.117 +
   2.118 +/*In the "_Flat" version of constructor, do only malloc of the top data struc
   2.119 + * and set values in that top-level.  Don't malloc any sub-structures.
   2.120 + */
   2.121 + Matrix *
   2.122 +makeMatrix_Flat( int32 numRows, int32 numCols )
   2.123 + { Matrix * retMatrix;
   2.124 +   retMatrix = malloc( sizeof( Matrix ) );
   2.125 +   retMatrix->numRows = numRows;
   2.126 +   retMatrix->numCols = numCols;
   2.127 +
   2.128 +   return retMatrix;
   2.129 + }
   2.130 +
   2.131 + Matrix *
   2.132 +makeMatrix_WithResMat( int32 numRows, int32 numCols )
   2.133 + { Matrix * retMatrix;
   2.134 +   retMatrix = malloc( sizeof( Matrix ) );
   2.135 +   retMatrix->numRows = numRows;
   2.136 +   retMatrix->numCols = numCols;
   2.137 +   retMatrix->array  = malloc( numRows * numCols * sizeof(float32) );
   2.138 +
   2.139 +   return retMatrix;
   2.140 + }
   2.141 +
   2.142 + void
   2.143 +freeMatrix_Flat( Matrix * matrix )
   2.144 + { //( matrix );
   2.145 + }
   2.146 + void
   2.147 +freeMatrix( Matrix * matrix )
   2.148 + { free( matrix->array );
   2.149 +   free( matrix );
   2.150 + }
   2.151 +
   2.152 +void
   2.153 +printMatrix( Matrix *matrix )
   2.154 + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr;
   2.155 +   float32 *matrixArray;
   2.156 +
   2.157 +   numRows = rowsToPrint = matrix->numRows;
   2.158 +   numCols = colsToPrint = matrix->numCols;
   2.159 +   matrixArray = matrix->array;
   2.160 +
   2.161 +   rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed
   2.162 +   colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed
   2.163 +   for( r = 0; r < numRows; r += rowIncr )
   2.164 +    { for( c = 0; c < numCols; c += colIncr )
   2.165 +       { printf( "%3.1f | ", matrixArray[ r * numCols + c ] );
   2.166 +       }
   2.167 +      printf("\n");
   2.168 +    }
   2.169 + }
   2.170 +

     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/Matrix_Mult.h	Tue Feb 07 14:07:38 2012 -0800
     3.3 @@ -0,0 +1,77 @@
     3.4 +/*
     3.5 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     3.6 + *  Licensed under GNU General Public License version 2
     3.7 + */
     3.8 +
     3.9 +#ifndef MATRIX_MULT_H_
    3.10 +#define MATRIX_MULT_H_
    3.11 +
    3.12 +#include <stdio.h>
    3.13 +#include <unistd.h>
    3.14 +#include <malloc.h>
    3.15 +
    3.16 +#include "../SSR_lib/VMS/VMS_primitive_data_types.h"
    3.17 +#include "ParamHelper/Param.h"
    3.18 +
    3.19 +//==============================  Structures  ==============================
    3.20 +
    3.21 +typedef
    3.22 +struct
    3.23 + { int32 numRows;
    3.24 +   int32 numCols;
    3.25 +   float32 *array;  //2D, but dynamically sized, so use addr arith
    3.26 + }
    3.27 +Matrix;
    3.28 +
    3.29 +/* This is the "appSpecificPiece" that is carried inside a DKUPiece.
    3.30 + *  In the DKUPiece data struc it is declared to be of type "void *".  This
    3.31 + *  allows the application to define any data structure it wants and put it
    3.32 + *  into a DKUPiece.
    3.33 + * When the app specific info is used, it is in app code, so it is cast to
    3.34 + *  the correct type to tell the compiler how to access fields.
    3.35 + * This keeps all app-specific things out of the DKU directory, as per the
    3.36 + *  DKU standard. */
    3.37 +typedef
    3.38 +struct
    3.39 + { 
    3.40 +      // pointers to shared data..  the result matrix must be created when the
    3.41 +      //  left and right matrices are put into the root ancestor DKUPiece.
    3.42 +   Matrix * leftMatrix;
    3.43 +   Matrix * rightMatrix;
    3.44 +   Matrix * resultMatrix;
    3.45 +
    3.46 +      // define the starting and ending boundaries for this piece of the
    3.47 +      //  result matrix.  These are derivable from the left and right
    3.48 +      //  matrices, but included them for readability of code.
    3.49 +   int prodStartRow, prodEndRow;
    3.50 +   int prodStartCol, prodEndCol;
    3.51 +      // Start and end of the portion of the left matrix that contributes to
    3.52 +      //  this piece of the product
    3.53 +   int leftStartRow, leftEndRow;
    3.54 +   int leftStartCol, leftEndCol;
    3.55 +      // Start and end of the portion of the right matrix that contributes to
    3.56 +      //  this piece of the product
    3.57 +   int rightStartRow, rightEndRow;
    3.58 +   int rightStartCol, rightEndCol;
    3.59 + }
    3.60 +MatrixProdPiece;
    3.61 +
    3.62 +//==============================  Functions  ================================
    3.63 +void readFile();
    3.64 +
    3.65 +Matrix *makeMatrix( int32 numRows, int32 numCols );
    3.66 +Matrix *makeMatrix_Flat( int32 numRows, int32 numCols );
    3.67 +Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols );
    3.68 +void    freeMatrix_Flat( Matrix * matrix );
    3.69 +void    freeMatrix( Matrix * matrix );
    3.70 +void    printMatrix( Matrix *matrix );
    3.71 +
    3.72 +void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName );
    3.73 +
    3.74 +void
    3.75 +initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
    3.76 +                              ParamBag *paramBag );
    3.77 +
    3.78 +//===========================================================================
    3.79 +
    3.80 +#endif /*MATRIX_MULT_H_*/

     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/SSR_Matrix_Mult/Divide_Pr.c	Tue Feb 07 14:07:38 2012 -0800
     4.3 @@ -0,0 +1,603 @@
     4.4 +/*
     4.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     4.6 + *  Licensed under GNU General Public License version 2
     4.7 + *
     4.8 + * Author: seanhalle@yahoo.com
     4.9 + *
    4.10 + */
    4.11 +
    4.12 +
    4.13 +#include "SSR_Matrix_Mult.h"
    4.14 +#include <math.h>
    4.15 +#include <string.h>
    4.16 +
    4.17 +   //The time to compute this many result values should equal the time to
    4.18 +   // perform the division on a matrix that yields that many result calcs.
    4.19 +   //IE, size this so that sequential time to calc equals divide time.
    4.20 +   // Find the value by experimenting -- divide time and calc time scale the
    4.21 +   // same way, so this value should remain valid across hardware.
    4.22 +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000
    4.23 +
    4.24 +
    4.25 +//===========================================================================
    4.26 +int inline
    4.27 +measureMatrixMultPrimitive( VirtProcr *animPr );
    4.28 +
    4.29 +SlicingStrucCarrier *
    4.30 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
    4.31 +                                 VirtProcr *animPr );
    4.32 +
    4.33 +SlicingStruc *
    4.34 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
    4.35 +                  VirtProcr *animPr );
    4.36 +
    4.37 +void
    4.38 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr );
    4.39 +
    4.40 +SubMatrix **
    4.41 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    4.42 +                   int32 numUses, Matrix *origMatrix, VirtProcr *animPr );
    4.43 +
    4.44 +void
    4.45 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    4.46 +                 SubMatrix **subMatrices, VirtProcr *animPr );
    4.47 +
    4.48 +void
    4.49 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
    4.50 +                                    SubMatrix **rightSubMatrices,
    4.51 +                                    int32 numRowIdxs, int32 numColIdxs,
    4.52 +                                    int32 numVecIdxs,
    4.53 +                                    VirtProcr *resultPr,
    4.54 +                                    VirtProcr *animatingPr );
    4.55 +
    4.56 +void
    4.57 +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix,
    4.58 +            SlicingStrucCarrier *slicingStrucCarrier,
    4.59 +            VirtProcr *resultPr, VirtProcr *animatingPr );
    4.60 +
    4.61 +
    4.62 +
    4.63 +/*Divider creates one processor for every sub-matrix
    4.64 + * It hands them:
    4.65 + *  the name of the result processor that they should send their results to,
    4.66 + *  the left and right matrices, and the rows and cols they should multiply
    4.67 + * It first creates the result processor, then all the sub-matrixPair
    4.68 + *  processors,
    4.69 + *  then does a receive of a message from the result processor that gives
    4.70 + *  the divider ownership of the result matrix.
    4.71 + * Finally, the divider returns the result matrix out of the SSR system.
    4.72 + *
    4.73 + * Divider chooses the size of sub-matrices via an algorithm that tries to
    4.74 + *  keep the minimum work above a threshold.  The threshold is machine-
    4.75 + *  dependent, so ask SSR for min work-unit time to get a
    4.76 + *  given overhead
    4.77 + *
    4.78 + * Divide min work-unit cycles by measured-cycles for one matrix-cell
    4.79 + *  product -- gives the number of products need to have in min size
    4.80 + *  matrix.
    4.81 + *
    4.82 + * So then, take cubed root of this to get the size of a side of min sub-
    4.83 + *  matrix.  That is the size of the ideal square sub-matrix -- so tile
    4.84 + *  up the two input matrices into ones as close as possible to that size,
    4.85 + *  and create the pairs of sub-matrices.
    4.86 + *
    4.87 + *========================  STRATEGIC OVERVIEW  =======================
    4.88 + *
    4.89 + *This division is a bit tricky, because have to create things in advance
    4.90 + * that it's not at first obvious need to be created..
    4.91 + *
    4.92 + *First slice up each dimension -- three of them..  this is because will have
    4.93 + * to create the sub-matrix's data-structures before pairing the sub-matrices
    4.94 + * with each other -- so, have three dimensions to slice up before can
    4.95 + * create the sub-matrix data-strucs -- also, have to be certain that the
    4.96 + * cols of the left input have the exact same slicing as the rows of the
    4.97 + * left matrix, so just to be sure, do the slicing calc once, then use it
    4.98 + * for both.
    4.99 + *
   4.100 + *So, goes like this:
   4.101 + *1) calculate the start & end values of each dimension in each matrix.
   4.102 + *2) use those values to create sub-matrix structures
   4.103 + *3) combine sub-matrices into pairs, as the tasks to perform.
   4.104 + *
   4.105 + *Have to calculate separately from creating the sub-matrices because of the
   4.106 + * nature of the nesting -- would either end up creating the same sub-matrix
   4.107 + * multiple times, or else would have to put in detection of whether had
   4.108 + * made a particular one already if tried to combine steps 1 and 2.
   4.109 + *
   4.110 + *Step 3 has to be separate because of the nesting, as well -- same reason,
   4.111 + * would either create same sub-matrix multiple times, or else have to
   4.112 + * add detection of whether was already created.
   4.113 + *
   4.114 + *Another way to look at it: there's one level of loop to divide dimensions,
   4.115 + * two levels of nesting to create sub-matrices, and three levels to pair
   4.116 + * up the sub-matrices.
   4.117 + */
   4.118 +
   4.119 +void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
   4.120 +                                        VirtProcr *animPr )
   4.121 + { VirtProcr       *resultPr;
   4.122 +   DividerParams   *dividerParams;
   4.123 +   ResultsParams   *resultsParams;
   4.124 +   Matrix          *leftMatrix, *rightMatrix, *resultMatrix;
   4.125 +   void            *msg;
   4.126 +   SlicingStrucCarrier *slicingStrucCarrier;
   4.127 +   float32         *resultArray; //points to array inside result matrix
   4.128 +   
   4.129 +         DEBUG( dbgAppFlow, "start divide\n")
   4.130 +
   4.131 +         int32
   4.132 +         divideProbe = VMS__create_single_interval_probe( "divideProbe",
   4.133 +                                                          animPr );
   4.134 +         VMS__record_sched_choice_into_probe( divideProbe, animPr );
   4.135 +         VMS__record_interval_start_in_probe( divideProbe );
   4.136 +
   4.137 +   //=========== Setup -- make local copies of ptd-to-things, malloc, aso
   4.138 +   int32 numResRows, numResCols, vectLength;
   4.139 +
   4.140 +   dividerParams   = (DividerParams *)_dividerParams;
   4.141 +   
   4.142 +   leftMatrix      = dividerParams->leftMatrix;
   4.143 +   rightMatrix     = dividerParams->rightMatrix;
   4.144 +
   4.145 +   vectLength = leftMatrix->numCols;
   4.146 +   numResRows = leftMatrix->numRows;
   4.147 +   numResCols = rightMatrix->numCols;
   4.148 +   resultArray     = dividerParams->resultMatrix->array;
   4.149 +   
   4.150 +      //zero the result array
   4.151 +   memset( resultArray, 0, numResRows * numResCols * sizeof(float32) );
   4.152 +
   4.153 +   //==============  Do either sequential mult or do division ==============
   4.154 +
   4.155 +      //Check if input matrices too small -- if yes, just do sequential
   4.156 +      //Cutoff is determined by overhead of this divider -- relatively
   4.157 +      // machine-independent
   4.158 +   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
   4.159 +       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
   4.160 +    {
   4.161 +      //====== Do sequential multiply on a single core
   4.162 +            DEBUG( dbgAppFlow, "doing sequential")
   4.163 +            
   4.164 +         //transpose the right matrix
   4.165 +      float32 *
   4.166 +      transRightArray  = SSR__malloc_to( rightMatrix->numRows * 
   4.167 +                                         rightMatrix->numCols * sizeof(float32),
   4.168 +                                         animPr );
   4.169 +
   4.170 +         //copy values from orig matrix to local
   4.171 +      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
   4.172 +                     0, 0, rightMatrix->numRows,
   4.173 +                     transRightArray, rightMatrix->array );
   4.174 +      
   4.175 +      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
   4.176 +                            leftMatrix->array, transRightArray,
   4.177 +                            resultArray );
   4.178 +    }
   4.179 +   else
   4.180 +    {
   4.181 +      //====== Do parallel multiply across cores
   4.182 +
   4.183 +         //Calc the ideal size of sub-matrix and slice up the dimensions of
   4.184 +         // the two matrices.
   4.185 +         //The ideal size is the one takes the number of cycles to calculate
   4.186 +         // such that calc time is equal or greater than min work-unit size
   4.187 +      slicingStrucCarrier =
   4.188 +         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr );
   4.189 +
   4.190 +         //Make the results processor, now that know how many to wait for
   4.191 +      resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr );
   4.192 +      resultsParams->numSubMatrixPairs  =
   4.193 +         slicingStrucCarrier->leftRowSlices->numVals *
   4.194 +         slicingStrucCarrier->rightColSlices->numVals *
   4.195 +         slicingStrucCarrier->vecSlices->numVals;
   4.196 +      resultsParams->dividerPr   = animPr;
   4.197 +      resultsParams->numCols     = rightMatrix->numCols;
   4.198 +      resultsParams->numRows     = leftMatrix->numRows;
   4.199 +      resultsParams->resultArray = resultArray;
   4.200 +
   4.201 +
   4.202 +      resultPr =
   4.203 +         SSR__create_procr_with( &gatherResults, resultsParams, animPr);
   4.204 +
   4.205 +         //Make the sub-matrices, and pair them up, and make processor to
   4.206 +         // calc product of each pair.
   4.207 +      makeSubMatricesAndProcrs( leftMatrix, rightMatrix,
   4.208 +                                    slicingStrucCarrier,
   4.209 +                                    resultPr, animPr);
   4.210 + 
   4.211 +         //result array is allocated externally, so no message from resultPr
   4.212 +         // however, do have to wait before printing out stats, so wait
   4.213 +         // for an empty handshake message
   4.214 +      msg = SSR__receive_from_to( resultPr, animPr );
   4.215 +   }
   4.216 +
   4.217 +
   4.218 +   //===============  Work done -- send results back =================
   4.219 +
   4.220 +
   4.221 +         DEBUG( dbgAppFlow, "end divide\n")
   4.222 +
   4.223 +         VMS__record_interval_end_in_probe( divideProbe );
   4.224 +         VMS__print_stats_of_all_probes();
   4.225 +
   4.226 +      //nothing left to do so dissipate, SSR will wait to shutdown and hence
   4.227 +      // make results available to outside until all the processors have
   4.228 +      // dissipated -- so no need to wait for results processor
   4.229 +
   4.230 +   SSR__dissipate_procr( animPr );  //all procrs dissipate self at end
   4.231 +      //when all of the processors have dissipated, the "create seed and do
   4.232 +      // work" call in the entry point function returns
   4.233 + }
   4.234 +
   4.235 +
   4.236 +SlicingStrucCarrier *
   4.237 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
   4.238 +                                 VirtProcr *animPr )
   4.239 + {
   4.240 +   float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
   4.241 +   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   4.242 +   SlicingStrucCarrier *slicingStrucCarrier =
   4.243 +                         SSR__malloc_to(sizeof(SlicingStrucCarrier), animPr);
   4.244 +
   4.245 +   int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
   4.246 +   float64 numPrimitiveOpsInMinWorkUnit;
   4.247 +
   4.248 +
   4.249 +   //=======  Calc ideal size of min-sized sub-matrix  ========
   4.250 +
   4.251 +      //ask SSR for the number of cycles of the minimum work unit, at given
   4.252 +      // percent overhead then add a guess at overhead from this divider
   4.253 +   minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 );
   4.254 +
   4.255 +      //ask SSR for number of cycles of the "primitive" op of matrix mult
   4.256 +   primitiveCycles = measureMatrixMultPrimitive( animPr );
   4.257 +
   4.258 +   numPrimitiveOpsInMinWorkUnit =
   4.259 +      (float64)minWorkUnitCycles / (float64)primitiveCycles;
   4.260 +
   4.261 +      //take cubed root -- that's number of these in a "side" of sub-matrix
   4.262 +      // then multiply by 5 because the primitive is 5x5
   4.263 +   idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit );
   4.264 +
   4.265 +   idealNumWorkUnits = SSR__giveIdealNumWorkUnits();
   4.266 +   
   4.267 +   idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
   4.268 +   idealSizeOfSide2 *= 0.6; //finer granularity to help load balance
   4.269 +
   4.270 +   if( idealSizeOfSide1 > idealSizeOfSide2 )
   4.271 +      idealSizeOfSide = idealSizeOfSide1;
   4.272 +   else
   4.273 +      idealSizeOfSide = idealSizeOfSide2;
   4.274 +
   4.275 +      //The multiply inner loop blocks the array to fit into L1 cache
   4.276 +//   if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK;
   4.277 +
   4.278 +   //============  Slice up dimensions, now that know target size ===========
   4.279 +
   4.280 +      //Tell the slicer the target size of a side (floating pt), the start
   4.281 +      // value to start slicing at, and the end value to stop slicing at
   4.282 +      //It returns an array of start value of each chunk, plus number of them
   4.283 +   int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol;
   4.284 +   startLeftRow  = 0;
   4.285 +   endLeftRow    = leftMatrix->numRows -1;
   4.286 +   startVec      = 0;
   4.287 +   endVec        = leftMatrix->numCols -1;
   4.288 +   startRightCol = 0;
   4.289 +   endRightCol   = rightMatrix->numCols -1;
   4.290 +
   4.291 +   leftRowSlices =
   4.292 +      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow, animPr );
   4.293 +
   4.294 +   vecSlices =
   4.295 +      sliceUpDimension( idealSizeOfSide,  startVec, endVec, animPr );
   4.296 +
   4.297 +   rightColSlices =
   4.298 +      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol,animPr);
   4.299 +
   4.300 +   slicingStrucCarrier->leftRowSlices  = leftRowSlices;
   4.301 +   slicingStrucCarrier->vecSlices      = vecSlices;
   4.302 +   slicingStrucCarrier->rightColSlices = rightColSlices;
   4.303 +
   4.304 +   return slicingStrucCarrier;
   4.305 + }
   4.306 +
   4.307 +
   4.308 +void
   4.309 +makeSubMatricesAndProcrs( Matrix    *leftMatrix, Matrix    *rightMatrix,
   4.310 +            SlicingStrucCarrier *slicingStrucCarrier,
   4.311 +            VirtProcr *resultPr,   VirtProcr *animPr )
   4.312 + {
   4.313 +   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   4.314 +   
   4.315 +   leftRowSlices  = slicingStrucCarrier->leftRowSlices;
   4.316 +   vecSlices      = slicingStrucCarrier->vecSlices;
   4.317 +   rightColSlices = slicingStrucCarrier->rightColSlices;
   4.318 +   SSR__free( slicingStrucCarrier, animPr );
   4.319 +   
   4.320 +   //================  Make sub-matrices, given the slicing  ================
   4.321 +   SubMatrix **leftSubMatrices, **rightSubMatrices;
   4.322 +   leftSubMatrices =
   4.323 +      createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals,
   4.324 +                         leftMatrix, animPr );
   4.325 +   //double_check_that_always_numRows_in_right_same_as_numCols_in_left();
   4.326 +   rightSubMatrices =
   4.327 +      createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals,
   4.328 +                         rightMatrix, animPr );
   4.329 +
   4.330 +
   4.331 +   //==============  pair the sub-matrices and make processors ==============
   4.332 +   int32 numRowIdxs, numColIdxs, numVecIdxs;
   4.333 +
   4.334 +   numRowIdxs = leftRowSlices->numVals;
   4.335 +   numColIdxs = rightColSlices->numVals;
   4.336 +   numVecIdxs = vecSlices->numVals;
   4.337 +   
   4.338 +   
   4.339 +   freeSlicingStruc( leftRowSlices, animPr );
   4.340 +   freeSlicingStruc( vecSlices, animPr );
   4.341 +   freeSlicingStruc( rightColSlices, animPr );
   4.342 +   
   4.343 +   pairUpSubMatricesAndMakeProcessors( leftSubMatrices,
   4.344 +                                       rightSubMatrices,
   4.345 +                                       numRowIdxs, numColIdxs,
   4.346 +                                       numVecIdxs,
   4.347 +                                       resultPr,
   4.348 +                                       animPr );
   4.349 + }
   4.350 +
   4.351 +
   4.352 +
   4.353 +
   4.354 +void
   4.355 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
   4.356 +                                    SubMatrix **rightSubMatrices,
   4.357 +                                    int32 numRowIdxs, int32 numColIdxs,
   4.358 +                                    int32 numVecIdxs,
   4.359 +                                    VirtProcr *resultPr,
   4.360 +                                    VirtProcr *animatingPr )
   4.361 + {
   4.362 +   int32 resRowIdx, resColIdx, vecIdx;
   4.363 +   int32 numLeftColIdxs, numRightColIdxs;
   4.364 +   int32 leftRowIdxOffset;
   4.365 +   SMPairParams *subMatrixPairParams;
   4.366 +   float32 numToPutOntoEachCore, leftOverFraction;
   4.367 +   int32 numCores, coreToScheduleOnto, numVecOnCurrCore;
   4.368 +
   4.369 +   numLeftColIdxs  = numColIdxs;
   4.370 +   numRightColIdxs = numVecIdxs;
   4.371 +
   4.372 +   numCores = SSR__give_number_of_cores_to_schedule_onto();
   4.373 +
   4.374 +   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
   4.375 +   leftOverFraction = 0;
   4.376 +   numVecOnCurrCore = 0;
   4.377 +   coreToScheduleOnto = 0;
   4.378 +
   4.379 +   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
   4.380 +    {
   4.381 +      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
   4.382 +
   4.383 +      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
   4.384 +       {
   4.385 +         
   4.386 +         for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
   4.387 +          {
   4.388 +               //Make the processor for the pair of sub-matrices
   4.389 +            subMatrixPairParams  = SSR__malloc_to( sizeof(SMPairParams),
   4.390 +                                                               animatingPr);
   4.391 +            subMatrixPairParams->leftSubMatrix  =
   4.392 +               leftSubMatrices[ leftRowIdxOffset + vecIdx ];
   4.393 +
   4.394 +            subMatrixPairParams->rightSubMatrix =
   4.395 +               rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
   4.396 +
   4.397 +            subMatrixPairParams->resultPr = resultPr;
   4.398 +
   4.399 +               //put all pairs from the same vector onto same core
   4.400 +            SSR__create_procr_with_affinity( &calcSubMatrixProduct,
   4.401 +                                             subMatrixPairParams,
   4.402 +                                             animatingPr,
   4.403 +                                             coreToScheduleOnto );
   4.404 +          }
   4.405 +
   4.406 +            //Trying to distribute the subMatrix-vectors across the cores, so
   4.407 +            // that each core gets the same number of vectors, with a max
   4.408 +            // imbalance of 1 vector more on some cores than others
   4.409 +         numVecOnCurrCore += 1;
   4.410 +         if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 )
   4.411 +          {
   4.412 +               //deal with fractional part, to ensure that imbalance is 1 max
   4.413 +               // IE, core with most has only 1 more than core with least
   4.414 +            leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore;
   4.415 +            if( leftOverFraction >= 1 )
   4.416 +             { leftOverFraction -= 1;
   4.417 +               numVecOnCurrCore = -1;
   4.418 +             }
   4.419 +            else
   4.420 +             { numVecOnCurrCore = 0;
   4.421 +             }
   4.422 +               //Move to next core, max core-value to incr to is numCores -1
   4.423 +            if( coreToScheduleOnto >= numCores -1 )
   4.424 +             { coreToScheduleOnto = 0;
   4.425 +             }
   4.426 +            else
   4.427 +             { coreToScheduleOnto += 1;
   4.428 +             }
   4.429 +          }
   4.430 + 
   4.431 +       }
   4.432 +    }
   4.433 +
   4.434 + }
   4.435 +
   4.436 +
   4.437 +
   4.438 +/*Walk through the two slice-strucs, making sub-matrix strucs as go
   4.439 + */
   4.440 +SubMatrix **
   4.441 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   4.442 +                   int32 numUses, Matrix *origMatrix, VirtProcr *animPr )
   4.443 + {
   4.444 +   int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
   4.445 +   int32 startRow, endRow, startCol, endCol;
   4.446 +   int32 *rowStartVals, *colStartVals;
   4.447 +   int32 rowOffset;
   4.448 +   SubMatrix **subMatrices, *newSubMatrix;
   4.449 +
   4.450 +   numRowIdxs = rowSlices->numVals;
   4.451 +   numColIdxs = colSlices->numVals;
   4.452 +
   4.453 +   rowStartVals = rowSlices->startVals;
   4.454 +   colStartVals = colSlices->startVals;
   4.455 +
   4.456 +   subMatrices = SSR__malloc_to(numRowIdxs * numColIdxs * sizeof(SubMatrix*),
   4.457 +                                 animPr );
   4.458 +
   4.459 +   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   4.460 +    {
   4.461 +      rowOffset = rowIdx * numColIdxs;
   4.462 +      
   4.463 +      startRow  = rowStartVals[rowIdx];
   4.464 +      endRow    = rowStartVals[rowIdx + 1] -1; //"fake" start above last is
   4.465 +                                               // at last valid idx + 1 & is
   4.466 +                                               // 1 greater than end value
   4.467 +      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
   4.468 +       {
   4.469 +         startCol = colStartVals[colIdx];
   4.470 +         endCol   = colStartVals[colIdx + 1] -1;
   4.471 +
   4.472 +         newSubMatrix = SSR__malloc_to( sizeof(SubMatrix), animPr );
   4.473 +         newSubMatrix->numRows       = endRow - startRow +1;
   4.474 +         newSubMatrix->numCols       = endCol - startCol +1;
   4.475 +         newSubMatrix->origMatrix    = origMatrix;
   4.476 +         newSubMatrix->origStartRow  = startRow;
   4.477 +         newSubMatrix->origStartCol  = startCol;
   4.478 +         newSubMatrix->copySingleton = NULL;
   4.479 +         newSubMatrix->numUsesLeft   = numUses; //can free after this many
   4.480 +         //Prevent uninitialized memory
   4.481 +         newSubMatrix->copySingleton = NULL;
   4.482 +         newSubMatrix->copyTransSingleton = NULL;
   4.483 +
   4.484 +         subMatrices[ rowOffset + colIdx ] = newSubMatrix;
   4.485 +       }
   4.486 +    }
   4.487 +   return subMatrices;
   4.488 + }
   4.489 +
   4.490 +
   4.491 +void
   4.492 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   4.493 +                 SubMatrix **subMatrices, VirtProcr *animPr )
   4.494 + {
   4.495 +   int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset;
   4.496 +   SubMatrix *subMatrix;
   4.497 +
   4.498 +   numRowIdxs = rowSlices->numVals;
   4.499 +   numColIdxs = colSlices->numVals;
   4.500 +
   4.501 +   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   4.502 +    {
   4.503 +      rowOffset = rowIdx * numColIdxs;
   4.504 +      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
   4.505 +       {
   4.506 +         subMatrix = subMatrices[ rowOffset + colIdx ];
   4.507 +         if( subMatrix->alreadyCopied )
   4.508 +            SSR__free( subMatrix->array, animPr );
   4.509 +         SSR__free( subMatrix, animPr );
   4.510 +       }
   4.511 +    }
   4.512 +   SSR__free( subMatrices, animPr );
   4.513 + }
   4.514 +
   4.515 +
   4.516 +
   4.517 +SlicingStruc *
   4.518 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
   4.519 +                  VirtProcr *animPr )
   4.520 + { float32 residualAcc = 0;
   4.521 +   int     numSlices, i, *startVals, sizeOfSlice, endCondition;
   4.522 +   SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr);
   4.523 +
   4.524 +      //calc size of matrix need to hold start vals --
   4.525 +   numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
   4.526 +
   4.527 +   startVals = SSR__malloc_to( (numSlices + 1) * sizeof(int32), animPr );
   4.528 +
   4.529 +      //Calc the upper limit of start value -- when get above this, end loop
   4.530 +      // by saving highest value of the matrix dimension to access, plus 1
   4.531 +      // as the start point of the imaginary slice following the last one
   4.532 +      //Plus 1 because go up to value but not include when process last slice
   4.533 +      //The stopping condition is half-a-size less than highest value because
   4.534 +      // don't want any pieces smaller than half the ideal size -- just tack
   4.535 +      // little ones onto end of last one
   4.536 +   endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size
   4.537 +   for( i = 0; startVal <= endVal; i++ )
   4.538 +    {
   4.539 +      startVals[i] = startVal;
   4.540 +      residualAcc += idealSizeOfSide;
   4.541 +      sizeOfSlice  = (int)residualAcc;
   4.542 +      residualAcc -= (float32)sizeOfSlice;
   4.543 +      startVal    += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..
   4.544 +
   4.545 +      if( startVal > endCondition )
   4.546 +       { startVal = endVal + 1;
   4.547 +         startVals[ i + 1 ] = startVal;
   4.548 +       }
   4.549 +    }
   4.550 +
   4.551 +   slicingStruc->startVals = startVals;
   4.552 +   slicingStruc->numVals   = i;  //loop incr'd, so == last valid start idx+1
   4.553 +                                 // which means is num sub-matrices in dim
   4.554 +                                 // also == idx of the fake start just above
   4.555 +   return slicingStruc;
   4.556 + }
   4.557 +
   4.558 +void
   4.559 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr )
   4.560 + {
   4.561 +   SSR__free( slicingStruc->startVals, animPr );
   4.562 +   SSR__free( slicingStruc, animPr );
   4.563 + }
   4.564 +
   4.565 +
   4.566 +int inline
   4.567 +measureMatrixMultPrimitive( VirtProcr *animPr )
   4.568 + {
   4.569 +   int r, c, v, numCycles;
   4.570 +   float32 *res, *left, *right;
   4.571 +
   4.572 +      //setup inputs
   4.573 +   left  = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
   4.574 +   right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
   4.575 +   res   = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
   4.576 +
   4.577 +   for( r = 0; r < 5; r++ )
   4.578 +    {
   4.579 +      for( c = 0; c < 5; c++ )
   4.580 +       {
   4.581 +         left[  r * 5 + c ] = r;
   4.582 +         right[ r * 5 + c ] = c;
   4.583 +       }
   4.584 +    }
   4.585 +
   4.586 +      //do primitive
   4.587 +   SSR__start_primitive();  //for now, just takes time stamp
   4.588 +   for( r = 0; r < 5; r++ )
   4.589 +    {
   4.590 +      for( c = 0; c < 5; c++ )
   4.591 +       {
   4.592 +         for( v = 0; v < 5; v++ )
   4.593 +          {
   4.594 +            res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ];
   4.595 +          }
   4.596 +       }
   4.597 +    }
   4.598 +   numCycles =
   4.599 +      SSR__end_primitive_and_give_cycles();
   4.600 +
   4.601 +   SSR__free( left, animPr );
   4.602 +   SSR__free( right, animPr );
   4.603 +   SSR__free( res, animPr );
   4.604 +
   4.605 +   return numCycles;
   4.606 + }

     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/SSR_Matrix_Mult/EntryPoint.c	Tue Feb 07 14:07:38 2012 -0800
     5.3 @@ -0,0 +1,62 @@
     5.4 +/*
     5.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     5.6 + *  Licensed under GNU General Public License version 2
     5.7 + *
     5.8 + * Author: seanhalle@yahoo.com
     5.9 + *
    5.10 + */
    5.11 +
#include <math.h>
#include <stdlib.h>

#include "SSR_Matrix_Mult.h"
    5.15 +
    5.16 +
    5.17 +
    5.18 +/*Every SSR system has an "entry point" function that creates the first
    5.19 + * processor, which starts the chain of creating more processors..
    5.20 + * eventually all of the processors will dissipate themselves, and
    5.21 + * return.
    5.22 + *
    5.23 + *This entry-point function follows the same pattern as all entry-point
    5.24 + * functions do:
    5.25 + *1) it creates the params for the seed processor, from the
    5.26 + *    parameters passed into the entry-point function
    5.27 + *2) it calls SSR__create_seed_procr_and_do_work
    5.28 + *3) it gets the return value from the params struc, frees the params struc,
    5.29 + *    and returns the value from the function
    5.30 + *
    5.31 + */
    5.32 +Matrix *
    5.33 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
    5.34 + { Matrix          *resMatrix;
    5.35 +   DividerParams   *dividerParams;
    5.36 +   int32            numResRows, numResCols;
    5.37 +
    5.38 +
    5.39 +   dividerParams              = malloc( sizeof( DividerParams ) );
    5.40 +   dividerParams->leftMatrix  = leftMatrix;
    5.41 +   dividerParams->rightMatrix = rightMatrix;
    5.42 +
    5.43 +
    5.44 +   numResRows  = leftMatrix->numRows;
    5.45 +   numResCols  = rightMatrix->numCols;
    5.46 +
    5.47 +      //VMS has its own separate internal malloc, so to get results out,
    5.48 +      // have to pass in empty array for it to fill up
    5.49 +      //The alternative is internally telling SSR make external space to use
    5.50 +   resMatrix            = malloc( sizeof(Matrix) );
    5.51 +   resMatrix->array     = malloc( numResRows * numResCols * sizeof(float32));
    5.52 +   resMatrix->numCols   = rightMatrix->numCols;
    5.53 +   resMatrix->numRows   = leftMatrix->numRows;
    5.54 +
    5.55 +
    5.56 +   dividerParams->resultMatrix   = resMatrix;
    5.57 +
    5.58 +      //create divider processor, start doing the work, and wait till done
    5.59 +      //This function is the "border crossing" between normal code and SSR
    5.60 +   SSR__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs,
    5.61 +                                       dividerParams );
    5.62 +   
    5.63 +   free( dividerParams );
    5.64 +   return resMatrix;
    5.65 + }

     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/SSR_Matrix_Mult/Result_Pr.c	Tue Feb 07 14:07:38 2012 -0800
     6.3 @@ -0,0 +1,108 @@
     6.4 +/*
     6.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     6.6 + *  Licensed under GNU General Public License version 2
     6.7 + *
     6.8 + * Author: seanhalle@yahoo.com
     6.9 + *
    6.10 + */
    6.11 +
    6.12 +#include "SSR_Matrix_Mult.h"
    6.13 +
    6.14 +//=====================
    6.15 +void inline
    6.16 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
    6.17 +                  int32    startRow,
    6.18 +                  int32    numRows,
    6.19 +                  int32    startCol,
    6.20 +                  int32    numCols,
    6.21 +                  int32    numOrigCols );
    6.22 +
    6.23 +//===========================================================================
    6.24 +
    6.25 +/*The Result Processor gets a message from each of the vector processors,
    6.26 + * puts the result from the message in its location in the result-
    6.27 + * matrix, and increments the count of results.
    6.28 + *
    6.29 + *After the count reaches the point that all results have been received, it
    6.30 + * returns the result matrix and dissipates.
    6.31 + */
    6.32 +void gatherResults( void *_params, VirtProcr *animatingPr )
    6.33 + { VirtProcr *dividerPr;
    6.34 +   ResultsParams  *params;
    6.35 +   int             row, col, numRows, numCols, numSubMatrixPairs, count=0;
    6.36 +   float32        *resultArray;
    6.37 +   void           *msg;
    6.38 +   SMPairParams   *resParams;
    6.39 +
    6.40 +         DEBUG( dbgAppFlow, "start resultPr\n")
    6.41 +         
    6.42 +   params    = (ResultsParams *)_params;
    6.43 +   dividerPr = params->dividerPr;
    6.44 +   numSubMatrixPairs = params->numSubMatrixPairs;
    6.45 +   numRows = params->numRows;
    6.46 +   numCols = params->numCols;
    6.47 +
    6.48 +   resultArray = params->resultArray;
    6.49 +
    6.50 +
    6.51 +   while( count < numSubMatrixPairs )
    6.52 +    {
    6.53 +      msg = SSR__receive_type_to( RESULTS_MSG, animatingPr );
    6.54 +
    6.55 +      resParams = (SMPairParams *)msg;
    6.56 +      accumulateResult( resultArray, resParams->partialResultArray,
    6.57 +                        resParams->leftSubMatrix->origStartRow,
    6.58 +                        resParams->leftSubMatrix->numRows,
    6.59 +                        resParams->rightSubMatrix->origStartCol,
    6.60 +                        resParams->rightSubMatrix->numCols,
    6.61 +                        resParams->rightSubMatrix->origMatrix->numCols );
    6.62 +
    6.63 +      SSR__free( resParams->partialResultArray, animatingPr );
    6.64 +      
    6.65 +         //there is only one copy of results procr, so can update numUsesLeft
    6.66 +         // without concurrency worries.  When zero, free the sub-matrix
    6.67 +      resParams->leftSubMatrix->numUsesLeft -= 1;
    6.68 +      if( resParams->leftSubMatrix->numUsesLeft == 0 )
    6.69 +       {
    6.70 +         SSR__free( resParams->leftSubMatrix->array, animatingPr );
    6.71 +         SSR__free( resParams->leftSubMatrix, animatingPr );
    6.72 +       }
    6.73 +
    6.74 +      resParams->rightSubMatrix->numUsesLeft -= 1;
    6.75 +      if( resParams->rightSubMatrix->numUsesLeft == 0 )
    6.76 +       {
    6.77 +         SSR__free( resParams->rightSubMatrix->array, animatingPr );
    6.78 +         SSR__free( resParams->rightSubMatrix, animatingPr );
    6.79 +       }
    6.80 +
    6.81 +         //count of how many sub-matrix pairs accumulated so know when done
    6.82 +      count++;
    6.83 +    }
    6.84 +
    6.85 +      //Done -- could just dissipate -- SSR will wait for all processors to
    6.86 +      // dissipate before shutting down, and thereby making results avaial to
    6.87 +      // outside, so no need to stop the divider from dissipating, so no need
    6.88 +      // to send a hand-shake message to it -- bug makes debug easier
    6.89 +   SSR__send_from_to( NULL, animatingPr, dividerPr );
    6.90 +   SSR__dissipate_procr( animatingPr );  //frees any data owned by procr
    6.91 + }
    6.92 +
    6.93 +void inline
    6.94 +accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray,
    6.95 +                  int32    startRow,
    6.96 +                  int32    numRows,
    6.97 +                  int32    startCol,
    6.98 +                  int32    numCols,
    6.99 +                  int32    numOrigCols )
   6.100 + { int32 row, col;
   6.101 +
   6.102 +   for( row = 0; row < numRows; row++ )
   6.103 +    {
   6.104 +      for( col = 0; col < numCols; col++ )
   6.105 +       {
   6.106 +         resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] +=
   6.107 +            subMatrixPairResultArray[ row * numCols + col ];
   6.108 +       }
   6.109 +    }
   6.110 +
   6.111 + }

     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/SSR_Matrix_Mult/SSR_Matrix_Mult.h	Tue Feb 07 14:07:38 2012 -0800
     7.3 @@ -0,0 +1,97 @@
     7.4 +/*
     7.5 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     7.6 + *  Licensed under GNU General Public License version 2
     7.7 + */
     7.8 +
     7.9 +#ifndef _SSR_MATRIX_MULT_H_
    7.10 +#define _SSR_MATRIX_MULT_H_
    7.11 +
    7.12 +#include <stdio.h>
    7.13 +
    7.14 +#include "../../SSR_lib/SSR.h"
    7.15 +#include "../Matrix_Mult.h"
    7.16 +
    7.17 +
    7.18 +//===============================  Defines  ==============================
    7.19 +#define ROWS_IN_BLOCK 32
    7.20 +#define COLS_IN_BLOCK 32
    7.21 +#define VEC_IN_BLOCK  32
    7.22 +
    7.23 +#define copyMatrixSingleton 1
    7.24 +#define copyTransposeSingleton 2
    7.25 +
    7.26 +//==============================  Structures  ==============================
    7.27 +typedef struct
    7.28 + {
    7.29 +   Matrix *leftMatrix;
    7.30 +   Matrix *rightMatrix;
    7.31 +   Matrix *resultMatrix;
    7.32 + }
    7.33 +DividerParams;
    7.34 +
    7.35 +typedef struct
    7.36 + {
    7.37 +   VirtProcr *dividerPr;
    7.38 +   int numRows;
    7.39 +   int numCols;
    7.40 +   int numSubMatrixPairs;
    7.41 +   float32 *resultArray;
    7.42 + }
    7.43 +ResultsParams;
    7.44 +
    7.45 +typedef
    7.46 +struct
    7.47 + { int32    numRows;
    7.48 +   int32    numCols;
    7.49 +   Matrix  *origMatrix;
    7.50 +   int32    origStartRow;
    7.51 +   int32    origStartCol;
    7.52 +   int32    alreadyCopied;
    7.53 +   int32    numUsesLeft; //have update via message to avoid multiple writers
    7.54 +   SSRSingleton *copySingleton;
    7.55 +   SSRSingleton *copyTransSingleton;
    7.56 +   float32 *array;  //2D, but dynamically sized, so use addr arith
    7.57 + }
    7.58 +SubMatrix;
    7.59 +
    7.60 +typedef struct
    7.61 + { VirtProcr *resultPr;
    7.62 +   SubMatrix *leftSubMatrix;
    7.63 +   SubMatrix *rightSubMatrix;
    7.64 +   float32   *partialResultArray;
    7.65 + }
    7.66 +SMPairParams;
    7.67 +
    7.68 +typedef
    7.69 +struct
    7.70 + { int32    numVals;
    7.71 +   int32   *startVals;
    7.72 + }
    7.73 +SlicingStruc;
    7.74 +
    7.75 +typedef
    7.76 +struct
    7.77 + {
    7.78 +   SlicingStruc *leftRowSlices;
    7.79 +   SlicingStruc *vecSlices;
    7.80 +   SlicingStruc *rightColSlices;
    7.81 + }
    7.82 +SlicingStrucCarrier;
    7.83 +
    7.84 +enum MMMsgType
    7.85 + {
    7.86 +   RESULTS_MSG = 1
    7.87 + };
    7.88 +
    7.89 +//============================= Processor Functions =========================
    7.90 +void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr );
    7.91 +void calcSubMatrixProduct(        void *data, VirtProcr *animatingPr );
    7.92 +void gatherResults(     void *data, VirtProcr *animatingPr );
    7.93 +
    7.94 +
    7.95 +//================================ Entry Point ==============================
    7.96 +Matrix *
    7.97 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
    7.98 +
    7.99 +
   7.100 +#endif /*_SSR_MATRIX_MULT_H_*/

     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/SSR_Matrix_Mult/subMatrix_Pr.c	Tue Feb 07 14:07:38 2012 -0800
     8.3 @@ -0,0 +1,319 @@
     8.4 +/* 
     8.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     8.6 + *  Licensed under GNU General Public License version 2
     8.7 + *
     8.8 + * Author: SeanHalle@yahoo.com
     8.9 + *
    8.10 + */
    8.11 +
    8.12 +#include <string.h>
    8.13 +
    8.14 +#include "SSR_Matrix_Mult.h"
    8.15 +
    8.16 +
    8.17 +
    8.18 +void inline
    8.19 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
    8.20 +
    8.21 +void inline
    8.22 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
    8.23 +
    8.24 +void inline
    8.25 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
    8.26 +                     float32 *resArray,
    8.27 +                     int startRow,  int endRow,
    8.28 +                     int startCol,  int endCol,
    8.29 +                     int startVec,  int endVec,
    8.30 +                     int resStride, int inpStride );
    8.31 +
    8.32 +void inline
    8.33 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols,
    8.34 +                      float32 *leftArray, float32 *rightArray,
    8.35 +                      float32 *resArray );
    8.36 +
    8.37 +
    8.38 +/*A  processor is created with an environment that holds two matrices,
    8.39 + * the row and col that it owns, and the name of a result gathering
    8.40 + * processor.
    8.41 + *It calculates the product of two sub-portions of the input matrices
    8.42 + * by using Intel's mkl library for single-core.
    8.43 + *
    8.44 + *This demonstrates using optimized single-threaded code inside scheduled
    8.45 + * work-units.
    8.46 + *
    8.47 + *When done, it sends the result to the result processor
    8.48 + */
    8.49 +void
    8.50 +calcSubMatrixProduct( void *data, VirtProcr *animatingPr )
    8.51 + { 
    8.52 +   SMPairParams   *params;
    8.53 +   VirtProcr      *resultPr;
    8.54 +   float32        *leftArray,  *rightArray, *resArray;
    8.55 +   SubMatrix      *leftSubMatrix, *rightSubMatrix;
    8.56 +
    8.57 +         DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID)
    8.58 +         #ifdef TURN_ON_DEBUG_PROBES
    8.59 +         int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx",
    8.60 +                                                                animatingPr);
    8.61 +         VMS__record_sched_choice_into_probe( subMatrixProbe, animatingPr );
    8.62 +         VMS__record_interval_start_in_probe( subMatrixProbe );
    8.63 +         #endif
    8.64 +
    8.65 +   params         = (SMPairParams *)data;
    8.66 +   resultPr       = params->resultPr;
    8.67 +   leftSubMatrix  = params->leftSubMatrix;
    8.68 +   rightSubMatrix = params->rightSubMatrix;
    8.69 +
    8.70 +      //make sure the input sub-matrices have been copied out of orig
    8.71 +      //do it here, inside sub-matrix pair to hopefully gain reuse in cache
    8.72 +   copyFromOrig( leftSubMatrix, animatingPr );
    8.73 +   copyTransposeFromOrig( rightSubMatrix, animatingPr );
    8.74 +   
    8.75 +   leftArray      = leftSubMatrix->array;
    8.76 +   rightArray     = rightSubMatrix->array;
    8.77 +
    8.78 +   int32
    8.79 +   resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32);
    8.80 +   resArray = SSR__malloc_to( resSize, animatingPr );
    8.81 +   memset( resArray, 0, resSize );
    8.82 +
    8.83 +
    8.84 +   int32 numResRows, numResCols, vectLength;
    8.85 +   
    8.86 +   vectLength = leftSubMatrix->numCols;
    8.87 +   numResRows = leftSubMatrix->numRows;
    8.88 +   numResCols = rightSubMatrix->numCols;
    8.89 +
    8.90 +   multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
    8.91 +                         leftArray,  rightArray,
    8.92 +                         resArray );
    8.93 +
    8.94 +   //send result to result processor
    8.95 +   params->partialResultArray = resArray;
    8.96 +
    8.97 +         #ifdef TURN_ON_DEBUG_PROBES
    8.98 +         VMS__record_interval_end_in_probe( subMatrixProbe );
    8.99 +         #endif
   8.100 +         
   8.101 +   SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr );
   8.102 +   SSR__dissipate_procr( animatingPr );
   8.103 + }
   8.104 +
   8.105 +
   8.106 +
   8.107 +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into
   8.108 + * the 32KB L1 cache.
   8.109 + *Would be nice to embed this within another level that divided into
   8.110 + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
   8.111 + *
   8.112 + *Eventually want these divisions to be automatic, using DKU pattern
   8.113 + * embedded into VMS and exposed in the language, and with VMS controlling the
   8.114 + * divisions according to the cache sizes, which it knows about.
   8.115 + *Also, want VMS to work with language to split among main-mems, so a socket
   8.116 + * only cranks on data in its local segment of main mem
   8.117 + *
   8.118 + *So, outer two loops determine start and end points within the result matrix.
   8.119 + * Inside that, a loop dets the start and end points along the shared dimensions
   8.120 + * of the two input matrices.
   8.121 + */
   8.122 +void inline
   8.123 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
   8.124 +                                int32 numResCols,
   8.125 +                                float32 *leftArray, float32 *rightArray,
   8.126 +                                float32 *resArray )
   8.127 + {
   8.128 +   int resStride, inpStride;
   8.129 +   int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
   8.130 +
   8.131 +   resStride  = numResCols;
   8.132 +   inpStride  = vecLength;
   8.133 +
   8.134 +   for( resStartRow = 0; resStartRow < numResRows; )
   8.135 +    {
   8.136 +      resEndRow = resStartRow + ROWS_IN_BLOCK -1;  //start at zero, so -1
   8.137 +      if( resEndRow > numResRows ) resEndRow = numResRows -1;
   8.138 +
   8.139 +      for( resStartCol = 0; resStartCol < numResCols; )
   8.140 +       {
   8.141 +         resEndCol   = resStartCol + COLS_IN_BLOCK -1;
   8.142 +         if( resEndCol > numResCols ) resEndCol = numResCols -1;
   8.143 +
   8.144 +         for( startVec = 0; startVec < vecLength; )
   8.145 +          {
   8.146 +            endVec   = startVec + VEC_IN_BLOCK -1;
   8.147 +            if( endVec > vecLength ) endVec = vecLength -1;
   8.148 +
   8.149 +               //By having the "vector" of sub-blocks in a sub-block slice
   8.150 +               // be marched down in inner loop, are re-using the result
   8.151 +               // matrix, which stays in L1 cache and re-using the left sub-mat
   8.152 +               // which repeats for each right sub-mat -- can only re-use two of
   8.153 +               // the three, so result is the most important -- avoids writing
   8.154 +               // dirty blocks until those result-locations fully done
   8.155 +               //Row and Col is position in result matrix -- so row and vec
   8.156 +               // for left array, then vec and col for right array
   8.157 +            multiplySubBlocksTransposed( leftArray, rightArray,
   8.158 +                                         resArray,
   8.159 +                                         resStartRow,  resEndRow,
   8.160 +                                         resStartCol,  resEndCol,
   8.161 +                                         startVec,  endVec,
   8.162 +                                         resStride, inpStride );
   8.163 +            startVec = endVec +1;
   8.164 +          }
   8.165 +         resStartCol = resEndCol +1;
   8.166 +       }
   8.167 +      resStartRow = resEndRow +1;
   8.168 +    }
   8.169 + }
   8.170 +
   8.171 +
   8.172 +
   8.173 +void inline
   8.174 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
   8.175 +                     float32 *resArray,
   8.176 +                     int resStartRow,  int resEndRow,
   8.177 +                     int resStartCol,  int resEndCol,
   8.178 +                     int startVec,  int endVec,
   8.179 +                     int resStride, int inpStride )
   8.180 + {
   8.181 +   int resRow,     resCol,        vec;
   8.182 +   int leftOffset, rightOffset;
   8.183 +   float32 result;
   8.184 +
   8.185 +      //The result row is used only for the left matrix, res col for the right
   8.186 +   for( resCol = resStartCol; resCol <= resEndCol; resCol++ )
   8.187 +    {
   8.188 +      for( resRow = resStartRow; resRow <= resEndRow; resRow++ )
   8.189 +       {
   8.190 +         leftOffset  = resRow * inpStride;//left & right inp strides always same
   8.191 +         rightOffset = resCol * inpStride;// because right is transposed
   8.192 +         result = 0;
   8.193 +         for( vec = startVec; vec <= endVec; vec++ )
   8.194 +          {
   8.195 +            result +=
   8.196 +               leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
   8.197 +          }
   8.198 +
   8.199 +         resArray[ resRow * resStride + resCol ] += result;
   8.200 +       }
   8.201 +    }
   8.202 + }
   8.203 +
   8.204 +
   8.205 +
   8.206 +
   8.207 +/*Reuse this in divider when do the sequential multiply case
   8.208 + */
   8.209 +void inline
   8.210 +copyTranspose( int32 numRows, int32 numCols,
   8.211 +               int32 origStartRow, int32 origStartCol, int32 origStride,
   8.212 +               float32 *subArray, float32 *origArray )
   8.213 + { int32 stride = numRows;
   8.214 + 
   8.215 +   int row, col, origOffset;
   8.216 +   for( row = 0; row < numRows; row++ )
   8.217 +    {
   8.218 +      origOffset = (row + origStartRow) * origStride + origStartCol;
   8.219 +      for( col = 0; col < numCols; col++ )
   8.220 +       {
   8.221 +            //transpose means swap row & col -- traverse orig matrix normally
   8.222 +            // but put into reversed place in local array -- means the
   8.223 +            // stride is the numRows now, so col * numRows + row
   8.224 +         subArray[ col * stride + row ]  =  origArray[ origOffset + col ];
   8.225 +       }
   8.226 +    }
   8.227 + }
   8.228 +
   8.229 +void inline
   8.230 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   8.231 + { int numCols, numRows, origStartRow, origStartCol, origStride, stride;
   8.232 +   Matrix *origMatrix;
   8.233 +   float32 *origArray, *subArray;
   8.234 +
   8.235 +//   if( subMatrix->copyTransSingleton && \
   8.236 +//       subMatrix->copyTransSingleton->hasFinished ) \
   8.237 +//      return;
   8.238 +   SSR__start_data_singleton( &(subMatrix->copyTransSingleton), animPr );
   8.239 +
   8.240 +   if( subMatrix->copyTransSingleton->hasFinished )
   8.241 +    {
   8.242 +      printf("error!");
   8.243 +    }
   8.244 +
   8.245 +   origMatrix   = subMatrix->origMatrix;
   8.246 +   origArray    = origMatrix->array;
   8.247 +   numCols      = subMatrix->numCols;
   8.248 +   numRows      = subMatrix->numRows;
   8.249 +   origStartRow = subMatrix->origStartRow;
   8.250 +   origStartCol = subMatrix->origStartCol;
   8.251 +   origStride   = origMatrix->numCols;
   8.252 +
   8.253 +   subArray     = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr);
   8.254 +   subMatrix->array = subArray;
   8.255 +
   8.256 +      //copy values from orig matrix to local
   8.257 +   copyTranspose( numRows, numCols,
   8.258 +                  origStartRow, origStartCol, origStride,
   8.259 +                  subArray, origArray );
   8.260 +
   8.261 +   SSR__end_data_singleton( &(subMatrix->copyTransSingleton), animPr );
   8.262 +   
   8.263 +   return;
   8.264 + }
   8.265 +
   8.266 +
   8.267 +void inline
   8.268 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   8.269 + { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
   8.270 +   Matrix *origMatrix;
   8.271 +   float32 *origArray, *subArray;
   8.272 +
   8.273 +
   8.274 +      //This lets only a single VP execute the code between start and
   8.275 +      // end -- using start and end so that work runs outside the master.
   8.276 +      //Inside, if a second VP ever executes the start, it will be returned
   8.277 +      // from the end-point.
   8.278 +      //Note, for non-GCC, can add a second SSR call at the end, and inside
   8.279 +      // that one, look at the stack at the return addr & save that in an
   8.280 +      // array indexed by singletonID
   8.281 +//   if( subMatrix->copySingleton && subMatrix->copySingleton->hasFinished )\
   8.282 +      return;
   8.283 +   SSR__start_data_singleton( &(subMatrix->copySingleton), animPr );
   8.284 +   if( subMatrix->copySingleton->endInstrAddr )
   8.285 +    {
   8.286 +      printf("error!");
   8.287 +    }
   8.288 +
   8.289 +   if( subMatrix->copySingleton->hasFinished )
   8.290 +    {
   8.291 +      printf("error!");
   8.292 +    }
   8.293 +
   8.294 +   origMatrix    = subMatrix->origMatrix;
   8.295 +   origArray     = origMatrix->array;
   8.296 +   numCols       = subMatrix->numCols;
   8.297 +   numRows       = subMatrix->numRows;
   8.298 +   origStartRow  = subMatrix->origStartRow;
   8.299 +   origStartCol  = subMatrix->origStartCol;
   8.300 +   origStride    = origMatrix->numCols;
   8.301 +
   8.302 +   subArray     = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr);
   8.303 +   subMatrix->array = subArray;
   8.304 +
   8.305 +      //copy values from orig matrix to local
   8.306 +   stride        = numCols;
   8.307 +
   8.308 +   int row, col, offset, origOffset;
   8.309 +   for( row = 0; row < numRows; row++ )
   8.310 +    {
   8.311 +      offset     = row * stride;
   8.312 +      origOffset = (row + origStartRow) * origStride + origStartCol;
   8.313 +      for( col = 0; col < numCols; col++ )
   8.314 +       {
   8.315 +         subArray[ offset + col ]  =  origArray[ origOffset + col ];
   8.316 +       }
   8.317 +    }
   8.318 +
   8.319 +   SSR__end_data_singleton( &(subMatrix->copySingleton), animPr );
   8.320 +
   8.321 +   return;
   8.322 + }

     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/main.c	Tue Feb 07 14:07:38 2012 -0800
     9.3 @@ -0,0 +1,37 @@
     9.4 +/*
     9.5 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     9.6 + *  Licensed under GNU General Public License version 2
     9.7 + *
     9.8 + * author seanhalle@yahoo.com
     9.9 + */
    9.10 +
    9.11 +#include <malloc.h>
    9.12 +#include <stdlib.h>
    9.13 +
    9.14 +#include "Matrix_Mult.h"
    9.15 +#include "SSR_Matrix_Mult/SSR_Matrix_Mult.h"
    9.16 +
    9.17 +char __ProgrammName[] = "Blocked Matrix Multiply";
    9.18 +char __DataSet[255];
    9.19 +/**
    9.20 + * Loads params from the file named by argv[1], multiplies the matrices, prints result
    9.21 + */
    9.22 +int main( int argc, char **argv )
    9.23 + { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
    9.24 +   ParamBag    *paramBag;
    9.25 +   
    9.26 +   printf( "arguments: %s | %s\n", argv[0], argv[1] );
    9.27 +
    9.28 +   paramBag = makeParamBag();
    9.29 +   readParamFileIntoBag( argv[1], paramBag );
    9.30 +   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
    9.31 +   
    9.32 +   resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix );
    9.33 +
    9.34 +   printf("\nresult matrix: \n");
    9.35 +   printMatrix( resultMatrix );
    9.36 +//   SSR__print_stats();
    9.37 +   fflush(stdin);
    9.38 +   
    9.39 +   exit(0); //cleans up
    9.40 + }

    10.1 --- a/src/Application/Matrix_Mult.c	Wed Sep 07 13:06:25 2011 +0200
    10.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.3 @@ -1,167 +0,0 @@
    10.4 -/*
    10.5 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
    10.6 - *  Licensed under GNU General Public License version 2
    10.7 - *
    10.8 - * Author: seanhalle@yahoo.com
    10.9 - *
   10.10 - * Created on November 15, 2009, 2:35 AM
   10.11 - */
   10.12 -
   10.13 -#include <malloc.h>
   10.14 -#include <stdlib.h>
   10.15 -
   10.16 -#include "Matrix_Mult.h"
   10.17 -#include "ParamHelper/Param.h"
   10.18 -
   10.19 -
   10.20 - 
   10.21 - void
   10.22 -initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
   10.23 -                               ParamBag *paramBag )
   10.24 - { char *leftMatrixFileName, *rightMatrixFileName;
   10.25 -   int   leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols;
   10.26 -   
   10.27 -      ParamStruc *param;
   10.28 -      param = getParamFromBag( "leftMatrixRows", paramBag );
   10.29 -   leftMatrixRows = param->intValue;
   10.30 -      param = getParamFromBag( "leftMatrixCols", paramBag );
   10.31 -   leftMatrixCols = param->intValue;
   10.32 -   *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols );
   10.33 -   
   10.34 -      param = getParamFromBag( "leftMatrixFileName", paramBag );
   10.35 -   leftMatrixFileName = param->strValue;  //no need to copy
   10.36 -   read_Matrix_From_File( *leftMatrix,  leftMatrixFileName );
   10.37 -   
   10.38 -      param = getParamFromBag( "rightMatrixRows", paramBag );
   10.39 -   rightMatrixRows = param->intValue;
   10.40 -      param = getParamFromBag( "rightMatrixCols", paramBag );
   10.41 -   rightMatrixCols = param->intValue;
   10.42 -   *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols );
   10.43 -   
   10.44 -      param = getParamFromBag( "rightMatrixFileName", paramBag );
   10.45 -   rightMatrixFileName = param->strValue;
   10.46 -   read_Matrix_From_File( *rightMatrix, rightMatrixFileName );
   10.47 - }
   10.48 -
   10.49 -
   10.50 -void parseLineIntoRow( char *line, float32* row );
   10.51 -
   10.52 -
   10.53 - void
   10.54 -read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName )
   10.55 - { int    row, maxRead, numRows, numCols;
   10.56 -   float32 *matrixStart;
   10.57 -   size_t lineSz = 0;
   10.58 -   FILE  *file;
   10.59 -   char  *line = NULL;
   10.60 -   
   10.61 -   lineSz = 50000; //max length of line in a matrix data file
   10.62 -   line = (char *) malloc( lineSz );
   10.63 -   if( line == NULL ) printf( "no mem for matrix line" );
   10.64 -   
   10.65 -   numRows = matrixStruc->numRows;
   10.66 -   numCols = matrixStruc->numCols;
   10.67 -   matrixStart = matrixStruc->array;
   10.68 -
   10.69 -   file = fopen( matrixFileName, "r" );
   10.70 -   if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);}
   10.71 -   fseek( file, 0, SEEK_SET );
   10.72 -   for( row = 0; row < numRows; row++ )
   10.73 -    {
   10.74 -      if( feof( file ) )  printf( "file ran out too soon" );
   10.75 -      maxRead = getline( &line, &lineSz, file );
   10.76 -      if( maxRead == -1 ) printf( "prob reading mat line");
   10.77 -      
   10.78 -      if( *line == '\n') continue; //blank line
   10.79 -      if( *line == '/' ) continue; //comment line
   10.80 -      
   10.81 -      parseLineIntoRow( line, matrixStart + row * numCols );
   10.82 -    }
   10.83 -   free( line );
   10.84 - }
   10.85 -
   10.86 -/*This function relies on each line having the proper number of cols.  It
   10.87 - * doesn't check, nor enforce, so if the file is improperly formatted it
   10.88 - * can write over unrelated memory
   10.89 - */
   10.90 - void
   10.91 -parseLineIntoRow( char *line, float32* row )
   10.92 - {
   10.93 -   char *valueStr, *searchPos;
   10.94 -   
   10.95 -      //read the float values
   10.96 -   searchPos = valueStr = line; //start
   10.97 -   
   10.98 -   for( ; *searchPos != 0; searchPos++)  //bit dangerous, should use buff len
   10.99 -    {
  10.100 -      if( *searchPos == '\n' ) //last col..  relying on well-formatted file
  10.101 -       { *searchPos = 0;
  10.102 -         *row = atof( valueStr );
  10.103 -         break;                                    //end FOR loop
  10.104 -       }
  10.105 -      if( *searchPos == ',' )
  10.106 -       { *searchPos = 0;                           //mark end of string
  10.107 -         *row = (float32) atof( valueStr );
  10.108 -         row += 1;                                 //address arith
  10.109 -            //skip any spaces before digits.. use searchPos + 1 to skip the 0
  10.110 -         for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++);
  10.111 -         valueStr = searchPos + 1;
  10.112 -       }
  10.113 -    }
  10.114 - }
  10.115 -
  10.116 - //==========================================================================
  10.117 -
  10.118 -/*In the "_Flat" version of constructor, do only malloc of the top data struc
  10.119 - * and set values in that top-level.  Don't malloc any sub-structures.
  10.120 - */
  10.121 - Matrix *
  10.122 -makeMatrix_Flat( int32 numRows, int32 numCols )
  10.123 - { Matrix * retMatrix;
  10.124 -   retMatrix = malloc( sizeof( Matrix ) );
  10.125 -   retMatrix->numRows = numRows;
  10.126 -   retMatrix->numCols = numCols;
  10.127 -
  10.128 -   return retMatrix;
  10.129 - }
  10.130 -
  10.131 - Matrix *
  10.132 -makeMatrix_WithResMat( int32 numRows, int32 numCols )
  10.133 - { Matrix * retMatrix;
  10.134 -   retMatrix = malloc( sizeof( Matrix ) );
  10.135 -   retMatrix->numRows = numRows;
  10.136 -   retMatrix->numCols = numCols;
  10.137 -   retMatrix->array  = malloc( numRows * numCols * sizeof(float32) );
  10.138 -
  10.139 -   return retMatrix;
  10.140 - }
  10.141 -
  10.142 - void
  10.143 -freeMatrix_Flat( Matrix * matrix )
  10.144 - { //( matrix );
  10.145 - }
  10.146 - void
  10.147 -freeMatrix( Matrix * matrix )
  10.148 - { free( matrix->array );
  10.149 -   free( matrix );
  10.150 - }
  10.151 -
  10.152 -void
  10.153 -printMatrix( Matrix *matrix )
  10.154 - { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr;
  10.155 -   float32 *matrixArray;
  10.156 -
  10.157 -   numRows = rowsToPrint = matrix->numRows;
  10.158 -   numCols = colsToPrint = matrix->numCols;
  10.159 -   matrixArray = matrix->array;
  10.160 -
  10.161 -   rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed
  10.162 -   colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed
  10.163 -   for( r = 0; r < numRows; r += rowIncr )
  10.164 -    { for( c = 0; c < numCols; c += colIncr )
  10.165 -       { printf( "%3.1f | ", matrixArray[ r * numCols + c ] );
  10.166 -       }
  10.167 -      printf("\n");
  10.168 -    }
  10.169 - }
  10.170 -

    11.1 --- a/src/Application/Matrix_Mult.h	Wed Sep 07 13:06:25 2011 +0200
    11.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.3 @@ -1,77 +0,0 @@
    11.4 -/*
    11.5 - *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
    11.6 - *  Licensed under GNU General Public License version 2
    11.7 - */
    11.8 -
    11.9 -#ifndef MATRIX_MULT_H_
   11.10 -#define MATRIX_MULT_H_
   11.11 -
   11.12 -#include <stdio.h>
   11.13 -#include <unistd.h>
   11.14 -#include <malloc.h>
   11.15 -
   11.16 -#include "../SSR_lib/VMS/VMS_primitive_data_types.h"
   11.17 -#include "ParamHelper/Param.h"
   11.18 -
   11.19 -//==============================  Structures  ==============================
   11.20 -
   11.21 -typedef
   11.22 -struct
   11.23 - { int32 numRows;
   11.24 -   int32 numCols;
   11.25 -   float32 *array;  //2D, but dynamically sized, so use addr arith
   11.26 - }
   11.27 -Matrix;
   11.28 -
   11.29 -/* This is the "appSpecificPiece" that is carried inside a DKUPiece.
   11.30 - *  In the DKUPiece data struc it is declared to be of type "void *".  This
   11.31 - *  allows the application to define any data structure it wants and put it
   11.32 - *  into a DKUPiece.
   11.33 - * When the app specific info is used, it is in app code, so it is cast to
   11.34 - *  the correct type to tell the compiler how to access fields.
   11.35 - * This keeps all app-specific things out of the DKU directory, as per the
   11.36 - *  DKU standard. */
   11.37 -typedef
   11.38 -struct
   11.39 - { 
   11.40 -      // pointers to shared data..  the result matrix must be created when the
   11.41 -      //  left and right matrices are put into the root ancestor DKUPiece.
   11.42 -   Matrix * leftMatrix;
   11.43 -   Matrix * rightMatrix;
   11.44 -   Matrix * resultMatrix;
   11.45 -
   11.46 -      // define the starting and ending boundaries for this piece of the
   11.47 -      //  result matrix.  These are derivable from the left and right
   11.48 -      //  matrices, but included them for readability of code.
   11.49 -   int prodStartRow, prodEndRow;
   11.50 -   int prodStartCol, prodEndCol;
   11.51 -      // Start and end of the portion of the left matrix that contributes to
   11.52 -      //  this piece of the product
   11.53 -   int leftStartRow, leftEndRow;
   11.54 -   int leftStartCol, leftEndCol;
   11.55 -      // Start and end of the portion of the right matrix that contributes to
   11.56 -      //  this piece of the product
   11.57 -   int rightStartRow, rightEndRow;
   11.58 -   int rightStartCol, rightEndCol;
   11.59 - }
   11.60 -MatrixProdPiece;
   11.61 -
   11.62 -//==============================  Functions  ================================
   11.63 -void readFile();
   11.64 -
   11.65 -Matrix *makeMatrix( int32 numRows, int32 numCols );
   11.66 -Matrix *makeMatrix_Flat( int32 numRows, int32 numCols );
   11.67 -Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols );
   11.68 -void    freeMatrix_Flat( Matrix * matrix );
   11.69 -void    freeMatrix( Matrix * matrix );
   11.70 -void    printMatrix( Matrix *matrix );
   11.71 -
   11.72 -void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName );
   11.73 -
   11.74 -void
   11.75 -initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
   11.76 -                              ParamBag *paramBag );
   11.77 -
   11.78 -//===========================================================================
   11.79 -
   11.80 -#endif /*MATRIX_MULT_H_*/

    12.1 --- a/src/Application/SSR_Matrix_Mult/Divide_Pr.c	Wed Sep 07 13:06:25 2011 +0200
    12.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.3 @@ -1,603 +0,0 @@
    12.4 -/*
    12.5 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
    12.6 - *  Licensed under GNU General Public License version 2
    12.7 - *
    12.8 - * Author: seanhalle@yahoo.com
    12.9 - *
   12.10 - */
   12.11 -
   12.12 -
   12.13 -#include "SSR_Matrix_Mult.h"
   12.14 -#include <math.h>
   12.15 -#include <string.h>
   12.16 -
   12.17 -   //The time to compute this many result values should equal the time to
   12.18 -   // perform this division on a matrix whose size gives that many result calcs
   12.19 -   //I.e., size this so that sequential time to calc equals divide time.
   12.20 -   //Find the value by experimenting -- but divide time and calc time scale
   12.21 -   // the same way, so this value should remain valid across hardware
   12.22 -#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000
   12.23 -
   12.24 -
   12.25 -//===========================================================================
   12.26 -int inline
   12.27 -measureMatrixMultPrimitive( VirtProcr *animPr );
   12.28 -
   12.29 -SlicingStrucCarrier *
   12.30 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
   12.31 -                                 VirtProcr *animPr );
   12.32 -
   12.33 -SlicingStruc *
   12.34 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
   12.35 -                  VirtProcr *animPr );
   12.36 -
   12.37 -void
   12.38 -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr );
   12.39 -
   12.40 -SubMatrix **
   12.41 -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   12.42 -                   int32 numUses, Matrix *origMatrix, VirtProcr *animPr );
   12.43 -
   12.44 -void
   12.45 -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   12.46 -                 SubMatrix **subMatrices, VirtProcr *animPr );
   12.47 -
   12.48 -void
   12.49 -pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
   12.50 -                                    SubMatrix **rightSubMatrices,
   12.51 -                                    int32 numRowIdxs, int32 numColIdxs,
   12.52 -                                    int32 numVecIdxs,
   12.53 -                                    VirtProcr *resultPr,
   12.54 -                                    VirtProcr *animatingPr );
   12.55 -
   12.56 -void
   12.57 -makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix,
   12.58 -            SlicingStrucCarrier *slicingStrucCarrier,
   12.59 -            VirtProcr *resultPr, VirtProcr *animatingPr );
   12.60 -
   12.61 -
   12.62 -
   12.63 -/*Divider creates one processor for every sub-matrix
   12.64 - * It hands them:
   12.65 - *  the name of the result processor that they should send their results to,
   12.66 - *  the left and right matrices, and the rows and cols they should multiply
   12.67 - * It first creates the result processor, then all the sub-matrixPair
   12.68 - *  processors,
   12.69 - *  then does a receive of a message from the result processor that gives
   12.70 - *  the divider ownership of the result matrix.
   12.71 - * Finally, the divider returns the result matrix out of the SSR system.
   12.72 - *
   12.73 - * Divider chooses the size of sub-matrices via an algorithm that tries to
   12.74 - *  keep the minimum work above a threshold.  The threshold is machine-
   12.75 - *  dependent, so ask SSR for min work-unit time to get a
   12.76 - *  given overhead
   12.77 - *
   12.78 - * Divide min work-unit cycles by measured-cycles for one matrix-cell
   12.79 - *  product -- gives the number of products needed in a min-size
   12.80 - *  matrix.
   12.81 - *
   12.82 - * So then, take the cube root of this to get the size of a side of min sub-
   12.83 - *  matrix.  That is the size of the ideal square sub-matrix -- so tile
   12.84 - *  up the two input matrices into ones as close as possible to that size,
   12.85 - *  and create the pairs of sub-matrices.
   12.86 - *
   12.87 - *========================  STRATEGIC OVERVIEW  =======================
   12.88 - *
   12.89 - *This division is a bit tricky, because have to create things in advance
   12.90 - * that it's not at first obvious need to be created..
   12.91 - *
   12.92 - *First slice up each dimension -- three of them..  this is because will have
   12.93 - * to create the sub-matrix's data-structures before pairing the sub-matrices
   12.94 - * with each other -- so, have three dimensions to slice up before can
   12.95 - * create the sub-matrix data-strucs -- also, have to be certain that the
   12.96 - * cols of the left input have the exact same slicing as the rows of the
   12.97 - * right matrix, so just to be sure, do the slicing calc once, then use it
   12.98 - * for both.
   12.99 - *
  12.100 - *So, goes like this:
  12.101 - *1) calculate the start & end values of each dimension in each matrix.
  12.102 - *2) use those values to create sub-matrix structures
  12.103 - *3) combine sub-matrices into pairs, as the tasks to perform.
  12.104 - *
  12.105 - *Have to calculate separately from creating the sub-matrices because of the
  12.106 - * nature of the nesting -- would either end up creating the same sub-matrix
  12.107 - * multiple times, or else would have to put in detection of whether had
  12.108 - * made a particular one already if tried to combine steps 1 and 2.
  12.109 - *
  12.110 - *Step 3 has to be separate because of the nesting, as well -- same reason,
  12.111 - * would either create same sub-matrix multiple times, or else have to
  12.112 - * add detection of whether was already created.
  12.113 - *
  12.114 - *Another way to look at it: there's one level of loop to divide dimensions,
  12.115 - * two levels of nesting to create sub-matrices, and three levels to pair
  12.116 - * up the sub-matrices.
  12.117 - */
  12.118 -
  12.119 -void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
  12.120 -                                        VirtProcr *animPr )
  12.121 - { VirtProcr       *resultPr;
  12.122 -   DividerParams   *dividerParams;
  12.123 -   ResultsParams   *resultsParams;
  12.124 -   Matrix          *leftMatrix, *rightMatrix, *resultMatrix;
  12.125 -   void            *msg;
  12.126 -   SlicingStrucCarrier *slicingStrucCarrier;
  12.127 -   float32         *resultArray; //points to array inside result matrix
  12.128 -   
  12.129 -         DEBUG( dbgAppFlow, "start divide\n")
  12.130 -
  12.131 -         int32
  12.132 -         divideProbe = VMS__create_single_interval_probe( "divideProbe",
  12.133 -                                                          animPr );
  12.134 -         VMS__record_sched_choice_into_probe( divideProbe, animPr );
  12.135 -         VMS__record_interval_start_in_probe( divideProbe );
  12.136 -
  12.137 -   //=========== Setup -- make local copies of ptd-to-things, malloc, aso
  12.138 -   int32 numResRows, numResCols, vectLength;
  12.139 -
  12.140 -   dividerParams   = (DividerParams *)_dividerParams;
  12.141 -   
  12.142 -   leftMatrix      = dividerParams->leftMatrix;
  12.143 -   rightMatrix     = dividerParams->rightMatrix;
  12.144 -
  12.145 -   vectLength = leftMatrix->numCols;
  12.146 -   numResRows = leftMatrix->numRows;
  12.147 -   numResCols = rightMatrix->numCols;
  12.148 -   resultArray     = dividerParams->resultMatrix->array;
  12.149 -   
  12.150 -      //zero the result array
  12.151 -   memset( resultArray, 0, numResRows * numResCols * sizeof(float32) );
  12.152 -
  12.153 -   //==============  Do either sequential mult or do division ==============
  12.154 -
  12.155 -      //Check if input matrices too small -- if yes, just do sequential
  12.156 -      //Cutoff is determined by overhead of this divider -- relatively
  12.157 -      // machine-independent
  12.158 -   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
  12.159 -       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
  12.160 -    {
  12.161 -      //====== Do sequential multiply on a single core
  12.162 -            DEBUG( dbgAppFlow, "doing sequential")
  12.163 -            
  12.164 -         //transpose the right matrix
  12.165 -      float32 *
  12.166 -      transRightArray  = SSR__malloc_to( rightMatrix->numRows * 
  12.167 -                                         rightMatrix->numCols * sizeof(float32),
  12.168 -                                         animPr );
  12.169 -
  12.170 -         //copy values from orig matrix to local
  12.171 -      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
  12.172 -                     0, 0, rightMatrix->numRows,
  12.173 -                     transRightArray, rightMatrix->array );
  12.174 -      
  12.175 -      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
  12.176 -                            leftMatrix->array, transRightArray,
  12.177 -                            resultArray );
  12.178 -    }
  12.179 -   else
  12.180 -    {
  12.181 -      //====== Do parallel multiply across cores
  12.182 -
  12.183 -         //Calc the ideal size of sub-matrix and slice up the dimensions of
  12.184 -         // the two matrices.
  12.185 -         //The ideal size is the one whose product takes enough cycles to
  12.186 -         // calculate that calc time is equal to or greater than min work-unit size
  12.187 -      slicingStrucCarrier =
  12.188 -         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr );
  12.189 -
  12.190 -         //Make the results processor, now that know how many to wait for
  12.191 -      resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr );
  12.192 -      resultsParams->numSubMatrixPairs  =
  12.193 -         slicingStrucCarrier->leftRowSlices->numVals *
  12.194 -         slicingStrucCarrier->rightColSlices->numVals *
  12.195 -         slicingStrucCarrier->vecSlices->numVals;
  12.196 -      resultsParams->dividerPr   = animPr;
  12.197 -      resultsParams->numCols     = rightMatrix->numCols;
  12.198 -      resultsParams->numRows     = leftMatrix->numRows;
  12.199 -      resultsParams->resultArray = resultArray;
  12.200 -
  12.201 -
  12.202 -      resultPr =
  12.203 -         SSR__create_procr_with( &gatherResults, resultsParams, animPr);
  12.204 -
  12.205 -         //Make the sub-matrices, and pair them up, and make processor to
  12.206 -         // calc product of each pair.
  12.207 -      makeSubMatricesAndProcrs( leftMatrix, rightMatrix,
  12.208 -                                    slicingStrucCarrier,
  12.209 -                                    resultPr, animPr);
  12.210 - 
  12.211 -         //result array is allocated externally, so no message from resultPr
  12.212 -         // however, do have to wait before printing out stats, so wait
  12.213 -         // for an empty handshake message
  12.214 -      msg = SSR__receive_from_to( resultPr, animPr );
  12.215 -   }
  12.216 -
  12.217 -
  12.218 -   //===============  Work done -- send results back =================
  12.219 -
  12.220 -
  12.221 -         DEBUG( dbgAppFlow, "end divide\n")
  12.222 -
  12.223 -         VMS__record_interval_end_in_probe( divideProbe );
  12.224 -         VMS__print_stats_of_all_probes();
  12.225 -
  12.226 -      //nothing left to do so dissipate -- SSR waits to shut down, and hence to
  12.227 -      // make results available to outside, until all the processors have
  12.228 -      // dissipated -- so no need to wait for results processor
  12.229 -
  12.230 -   SSR__dissipate_procr( animPr );  //all procrs dissipate self at end
  12.231 -      //when all of the processors have dissipated, the "create seed and do
  12.232 -      // work" call in the entry point function returns
  12.233 - }
  12.234 -
  12.235 -
  12.236 -SlicingStrucCarrier *
  12.237 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
  12.238 -                                 VirtProcr *animPr )
  12.239 - {
  12.240 -   float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
  12.241 -   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
  12.242 -   SlicingStrucCarrier *slicingStrucCarrier =
  12.243 -                         SSR__malloc_to(sizeof(SlicingStrucCarrier), animPr);
  12.244 -
  12.245 -   int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
  12.246 -   float64 numPrimitiveOpsInMinWorkUnit;
  12.247 -
  12.248 -
  12.249 -   //=======  Calc ideal size of min-sized sub-matrix  ========
  12.250 -
  12.251 -      //ask SSR for the number of cycles of the minimum work unit, at given
  12.252 -      // percent overhead then add a guess at overhead from this divider
  12.253 -   minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 );
  12.254 -
  12.255 -      //ask SSR for number of cycles of the "primitive" op of matrix mult
  12.256 -   primitiveCycles = measureMatrixMultPrimitive( animPr );
  12.257 -
  12.258 -   numPrimitiveOpsInMinWorkUnit =
  12.259 -      (float64)minWorkUnitCycles / (float64)primitiveCycles;
  12.260 -
  12.261 -      //take cubed root -- that's number of these in a "side" of sub-matrix
  12.262 -      // then multiply by 5 because the primitive is 5x5
  12.263 -   idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit );
  12.264 -
  12.265 -   idealNumWorkUnits = SSR__giveIdealNumWorkUnits();
  12.266 -   
  12.267 -   idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
  12.268 -   idealSizeOfSide2 *= 0.6; //finer granularity to help load balance
  12.269 -
  12.270 -   if( idealSizeOfSide1 > idealSizeOfSide2 )
  12.271 -      idealSizeOfSide = idealSizeOfSide1;
  12.272 -   else
  12.273 -      idealSizeOfSide = idealSizeOfSide2;
  12.274 -
  12.275 -      //The multiply inner loop blocks the array to fit into L1 cache
  12.276 -//   if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK;
  12.277 -
  12.278 -   //============  Slice up dimensions, now that know target size ===========
  12.279 -
  12.280 -      //Tell the slicer the target size of a side (floating pt), the start
  12.281 -      // value to start slicing at, and the end value to stop slicing at
  12.282 -      //It returns an array of start value of each chunk, plus number of them
  12.283 -   int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol;
  12.284 -   startLeftRow  = 0;
  12.285 -   endLeftRow    = leftMatrix->numRows -1;
  12.286 -   startVec      = 0;
  12.287 -   endVec        = leftMatrix->numCols -1;
  12.288 -   startRightCol = 0;
  12.289 -   endRightCol   = rightMatrix->numCols -1;
  12.290 -
  12.291 -   leftRowSlices =
  12.292 -      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow, animPr );
  12.293 -
  12.294 -   vecSlices =
  12.295 -      sliceUpDimension( idealSizeOfSide,  startVec, endVec, animPr );
  12.296 -
  12.297 -   rightColSlices =
  12.298 -      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol,animPr);
  12.299 -
  12.300 -   slicingStrucCarrier->leftRowSlices  = leftRowSlices;
  12.301 -   slicingStrucCarrier->vecSlices      = vecSlices;
  12.302 -   slicingStrucCarrier->rightColSlices = rightColSlices;
  12.303 -
  12.304 -   return slicingStrucCarrier;
  12.305 - }
  12.306 -
  12.307 -
  12.308 -void
  12.309 -makeSubMatricesAndProcrs( Matrix    *leftMatrix, Matrix    *rightMatrix,
  12.310 -            SlicingStrucCarrier *slicingStrucCarrier,
  12.311 -            VirtProcr *resultPr,   VirtProcr *animPr )
  12.312 - {  //Builds the left and right sub-matrix grids from the slicing, then creates one processor per sub-matrix pair
  12.313 -   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
  12.314 -   
  12.315 -   leftRowSlices  = slicingStrucCarrier->leftRowSlices;
  12.316 -   vecSlices      = slicingStrucCarrier->vecSlices;
  12.317 -   rightColSlices = slicingStrucCarrier->rightColSlices;
  12.318 -   SSR__free( slicingStrucCarrier, animPr ); //carrier is consumed here; only the three slicing strucs it held are used below
  12.319 -   
  12.320 -   //================  Make sub-matrices, given the slicing  ================
  12.321 -   SubMatrix **leftSubMatrices, **rightSubMatrices;
  12.322 -   leftSubMatrices =
  12.323 -      createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals, //left grid: row slices x vec slices; numUses = number of right col slices
  12.324 -                         leftMatrix, animPr );
  12.325 -   //double_check_that_always_numRows_in_right_same_as_numCols_in_left();
  12.326 -   rightSubMatrices =
  12.327 -      createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals, //right grid: vec slices x col slices; numUses = number of left row slices
  12.328 -                         rightMatrix, animPr );
  12.329 -
  12.330 -
  12.331 -   //==============  pair the sub-matrices and make processors ==============
  12.332 -   int32 numRowIdxs, numColIdxs, numVecIdxs;
  12.333 -
  12.334 -   numRowIdxs = leftRowSlices->numVals;
  12.335 -   numColIdxs = rightColSlices->numVals;
  12.336 -   numVecIdxs = vecSlices->numVals;
  12.337 -   
  12.338 -   //the three counts were copied out above, so the slicing strucs can be released before pairing
  12.339 -   freeSlicingStruc( leftRowSlices, animPr );
  12.340 -   freeSlicingStruc( vecSlices, animPr );
  12.341 -   freeSlicingStruc( rightColSlices, animPr );
  12.342 -   //NOTE(review): the two SubMatrix* arrays are passed to the pairing call and are not freed here or inside it -- confirm they are released elsewhere
  12.343 -   pairUpSubMatricesAndMakeProcessors( leftSubMatrices,
  12.344 -                                       rightSubMatrices,
  12.345 -                                       numRowIdxs, numColIdxs,
  12.346 -                                       numVecIdxs,
  12.347 -                                       resultPr,
  12.348 -                                       animPr );
  12.349 - }
  12.350 -
  12.351 -
  12.352 -
  12.353 -
  12.354 -void
  12.355 -pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
  12.356 -                                    SubMatrix **rightSubMatrices,
  12.357 -                                    int32 numRowIdxs, int32 numColIdxs,
  12.358 -                                    int32 numVecIdxs,
  12.359 -                                    VirtProcr *resultPr,
  12.360 -                                    VirtProcr *animatingPr )
  12.361 - {  //Creates one calcSubMatrixProduct processor for every (result row, result col, vec) triple of sub-matrices
  12.362 -   int32 resRowIdx, resColIdx, vecIdx;
  12.363 -   int32 numLeftColIdxs, numRightColIdxs;
  12.364 -   int32 leftRowIdxOffset;
  12.365 -   SMPairParams *subMatrixPairParams;
  12.366 -   float32 numToPutOntoEachCore, leftOverFraction;
  12.367 -   int32 numCores, coreToScheduleOnto, numVecOnCurrCore;
  12.368 -
  12.369 -   numLeftColIdxs  = numColIdxs; //NOTE(review): used as the row stride of leftSubMatrices, but createSubMatrices lays that grid out with the vec slices as its columns -- looks like this should be numVecIdxs; confirm
  12.370 -   numRightColIdxs = numVecIdxs; //NOTE(review): used as the row stride of rightSubMatrices, whose columns are the right col slices -- looks like this should be numColIdxs; confirm
  12.371 -
  12.372 -   numCores = SSR__give_number_of_cores_to_schedule_onto();
  12.373 -
  12.374 -   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; //NOTE(review): all operands are int32, so the quotient is truncated before it is stored as float32 -- confirm whether a fractional share was intended
  12.375 -   leftOverFraction = 0;
  12.376 -   numVecOnCurrCore = 0;
  12.377 -   coreToScheduleOnto = 0;
  12.378 -
  12.379 -   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
  12.380 -    {
  12.381 -      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
  12.382 -
  12.383 -      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
  12.384 -       {
  12.385 -         //inner loop walks the shared dimension: one pair per vec slice for this result row/col
  12.386 -         for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
  12.387 -          {
  12.388 -               //Make the processor for the pair of sub-matrices
  12.389 -            subMatrixPairParams  = SSR__malloc_to( sizeof(SMPairParams),
  12.390 -                                                               animatingPr);
  12.391 -            subMatrixPairParams->leftSubMatrix  =
  12.392 -               leftSubMatrices[ leftRowIdxOffset + vecIdx ];
  12.393 -
  12.394 -            subMatrixPairParams->rightSubMatrix =
  12.395 -               rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
  12.396 -
  12.397 -            subMatrixPairParams->resultPr = resultPr;
  12.398 -
  12.399 -               //put all pairs from the same vector onto same core
  12.400 -            SSR__create_procr_with_affinity( &calcSubMatrixProduct,
  12.401 -                                             subMatrixPairParams,
  12.402 -                                             animatingPr,
  12.403 -                                             coreToScheduleOnto );
  12.404 -          }
  12.405 -
  12.406 -            //Trying to distribute the subMatrix-vectors across the cores, so
  12.407 -            // that each core gets the same number of vectors, with a max
  12.408 -            // imbalance of 1 vector more on some cores than others
  12.409 -         numVecOnCurrCore += 1; //counts result positions: one "vector" = all numVecIdxs pairs just created
  12.410 -         if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 )
  12.411 -          {
  12.412 -               //deal with fractional part, to ensure that imbalance is 1 max
  12.413 -               // IE, core with most has only 1 more than core with least
  12.414 -            leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore;
  12.415 -            if( leftOverFraction >= 1 )
  12.416 -             { leftOverFraction -= 1;
  12.417 -               numVecOnCurrCore = -1; //starting below zero means the next core takes one more vector before the threshold above is reached
  12.418 -             }
  12.419 -            else
  12.420 -             { numVecOnCurrCore = 0;
  12.421 -             }
  12.422 -               //Move to next core, max core-value to incr to is numCores -1
  12.423 -            if( coreToScheduleOnto >= numCores -1 )
  12.424 -             { coreToScheduleOnto = 0;
  12.425 -             }
  12.426 -            else
  12.427 -             { coreToScheduleOnto += 1;
  12.428 -             }
  12.429 -          }
  12.430 - 
  12.431 -       }
  12.432 -    }
  12.433 -
  12.434 - }
  12.435 -
  12.436 -
  12.437 -
  12.438 -/*Walk through the two slice-strucs, making one SubMatrix struc per (row slice, col slice) pair as go; returns the row-major array of pointers to them
  12.439 - */
  12.440 -SubMatrix **
  12.441 -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
  12.442 -                   int32 numUses, Matrix *origMatrix, VirtProcr *animPr ) //numUses is stored in every sub-matrix as numUsesLeft
  12.443 - {
  12.444 -   int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
  12.445 -   int32 startRow, endRow, startCol, endCol;
  12.446 -   int32 *rowStartVals, *colStartVals;
  12.447 -   int32 rowOffset;
  12.448 -   SubMatrix **subMatrices, *newSubMatrix;
  12.449 -
  12.450 -   numRowIdxs = rowSlices->numVals;
  12.451 -   numColIdxs = colSlices->numVals;
  12.452 -
  12.453 -   rowStartVals = rowSlices->startVals;
  12.454 -   colStartVals = colSlices->startVals;
  12.455 -
  12.456 -   subMatrices = SSR__malloc_to(numRowIdxs * numColIdxs * sizeof(SubMatrix*), //pointer array only; each SubMatrix is allocated separately below
  12.457 -                                 animPr );
  12.458 -
  12.459 -   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
  12.460 -    {
  12.461 -      rowOffset = rowIdx * numColIdxs; //row stride of the returned grid is the number of col slices
  12.462 -      
  12.463 -      startRow  = rowStartVals[rowIdx];
  12.464 -      endRow    = rowStartVals[rowIdx + 1] -1; //"fake" start above last is
  12.465 -                                               // at last valid idx + 1 & is
  12.466 -                                               // 1 greater than end value
  12.467 -      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
  12.468 -       {
  12.469 -         startCol = colStartVals[colIdx];
  12.470 -         endCol   = colStartVals[colIdx + 1] -1; //same "fake" start convention as for rows
  12.471 -
  12.472 -         newSubMatrix = SSR__malloc_to( sizeof(SubMatrix), animPr );
  12.473 -         newSubMatrix->numRows       = endRow - startRow +1;
  12.474 -         newSubMatrix->numCols       = endCol - startCol +1;
  12.475 -         newSubMatrix->origMatrix    = origMatrix;
  12.476 -         newSubMatrix->origStartRow  = startRow;
  12.477 -         newSubMatrix->origStartCol  = startCol;
  12.478 -         newSubMatrix->copySingleton = NULL;
  12.479 -         newSubMatrix->numUsesLeft   = numUses; //can free after this many
  12.480 -         //Prevent uninitialized memory (copySingleton was already cleared three lines up) -- NOTE(review): alreadyCopied and array are left unset here, yet freeSubMatrices reads alreadyCopied; confirm
  12.481 -         newSubMatrix->copySingleton = NULL;
  12.482 -         newSubMatrix->copyTransSingleton = NULL;
  12.483 -
  12.484 -         subMatrices[ rowOffset + colIdx ] = newSubMatrix;
  12.485 -       }
  12.486 -    }
  12.487 -   return subMatrices;
  12.488 - }
  12.489 -
  12.490 -
  12.491 -void
  12.492 -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
  12.493 -                 SubMatrix **subMatrices, VirtProcr *animPr )
  12.494 - {  //Frees every SubMatrix in a rowSlices x colSlices grid (and its array when flagged as copied), then the pointer array itself
  12.495 -   int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset;
  12.496 -   SubMatrix *subMatrix;
  12.497 -
  12.498 -   numRowIdxs = rowSlices->numVals;
  12.499 -   numColIdxs = colSlices->numVals;
  12.500 -
  12.501 -   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
  12.502 -    {
  12.503 -      rowOffset = rowIdx * numColIdxs; //same row-major layout that createSubMatrices fills in
  12.504 -      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
  12.505 -       {
  12.506 -         subMatrix = subMatrices[ rowOffset + colIdx ];
  12.507 -         if( subMatrix->alreadyCopied ) //NOTE(review): createSubMatrices never sets alreadyCopied, so this may read an unset field unless other code writes it -- confirm
  12.508 -            SSR__free( subMatrix->array, animPr );
  12.509 -         SSR__free( subMatrix, animPr );
  12.510 -       }
  12.511 -    }
  12.512 -   SSR__free( subMatrices, animPr );
  12.513 - }
  12.514 -
  12.515 -
  12.516 -
  12.517 -SlicingStruc *   //Splits [startVal, endVal] into slices of about idealSizeOfSide; returns the slice start values plus one end marker
  12.518 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
  12.519 -                  VirtProcr *animPr )
  12.520 - { float32 residualAcc = 0; //fractional part of the ideal size, carried from one slice to the next
  12.521 -   int     numSlices, i, *startVals, sizeOfSlice, endCondition;
  12.522 -   SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr);
  12.523 -
  12.524 -      //calc size of matrix need to hold start vals -- one entry per slice plus the fake start above the last
  12.525 -   numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
  12.526 -      //NOTE(review): the truncated count can be one less than what the loop below emits (26 values at ideal 10 writes starts 0,10,20 plus the end marker = 4 entries into 3 allocated; a range shorter than the ideal size writes 2 into 1) -- confirm
  12.527 -   startVals = SSR__malloc_to( (numSlices + 1) * sizeof(int32), animPr );
  12.528 -
  12.529 -      //Calc the upper limit of start value -- when get above this, end loop
  12.530 -      // by saving highest value of the matrix dimension to access, plus 1
  12.531 -      // as the start point of the imaginary slice following the last one
  12.532 -      //Plus 1 because go up to value but not include when process last slice
  12.533 -      //The stopping condition is half-a-size less than highest value because
  12.534 -      // don't want any pieces smaller than half the ideal size -- just tack
  12.535 -      // little ones onto end of last one
  12.536 -   endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size
  12.537 -   for( i = 0; startVal <= endVal; i++ )
  12.538 -    {
  12.539 -      startVals[i] = startVal;
  12.540 -      residualAcc += idealSizeOfSide;
  12.541 -      sizeOfSlice  = (int)residualAcc; //whole part becomes this slice's size
  12.542 -      residualAcc -= (float32)sizeOfSlice;
  12.543 -      startVal    += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..
  12.544 -
  12.545 -      if( startVal > endCondition )
  12.546 -       { startVal = endVal + 1; //forces the loop test to fail, so this is the last slice
  12.547 -         startVals[ i + 1 ] = startVal;
  12.548 -       }
  12.549 -    }
  12.550 -
  12.551 -   slicingStruc->startVals = startVals;
  12.552 -   slicingStruc->numVals   = i;  //loop incr'd, so == last valid start idx+1
  12.553 -                                 // which means is num sub-matrices in dim
  12.554 -                                 // also == idx of the fake start just above
  12.555 -   return slicingStruc;
  12.556 - }
  12.557 -
  12.558 -void
  12.559 -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr )
  12.560 - {  //Releases a slicing struc made by sliceUpDimension: the startVals array first, then the struc itself
  12.561 -   SSR__free( slicingStruc->startVals, animPr );
  12.562 -   SSR__free( slicingStruc, animPr );
  12.563 - }
  12.564 -
  12.565 -
  12.566 -int inline   //Times a fixed 5x5 triple loop between SSR__start_primitive and SSR__end_primitive_and_give_cycles; returns that cycle count
  12.567 -measureMatrixMultPrimitive( VirtProcr *animPr )
  12.568 - {
  12.569 -   int r, c, v, numCycles;
  12.570 -   float32 *res, *left, *right;
  12.571 -
  12.572 -      //setup inputs
  12.573 -   left  = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
  12.574 -   right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
  12.575 -   res   = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
  12.576 -
  12.577 -   for( r = 0; r < 5; r++ )
  12.578 -    {
  12.579 -      for( c = 0; c < 5; c++ )
  12.580 -       {
  12.581 -         left[  r * 5 + c ] = r; //every element of a left row holds its row index
  12.582 -         right[ r * 5 + c ] = c; //every element of a right column holds its column index
  12.583 -       }
  12.584 -    }
  12.585 -
  12.586 -      //do primitive
  12.587 -   SSR__start_primitive();  //for now, just takes time stamp
  12.588 -   for( r = 0; r < 5; r++ )
  12.589 -    {
  12.590 -      for( c = 0; c < 5; c++ )
  12.591 -       {
  12.592 -         for( v = 0; v < 5; v++ )
  12.593 -          {
  12.594 -            res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; //NOTE(review): assigns rather than accumulates, so res keeps only the v == 4 product -- fine if only the timing matters; confirm
  12.595 -          }
  12.596 -       }
  12.597 -    }
  12.598 -   numCycles =
  12.599 -      SSR__end_primitive_and_give_cycles();
  12.600 -
  12.601 -   SSR__free( left, animPr );
  12.602 -   SSR__free( right, animPr );
  12.603 -   SSR__free( res, animPr );
  12.604 -
  12.605 -   return numCycles;
  12.606 - }

    13.1 --- a/src/Application/SSR_Matrix_Mult/EntryPoint.c	Wed Sep 07 13:06:25 2011 +0200
    13.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.3 @@ -1,62 +0,0 @@
    13.4 -/*
    13.5 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
    13.6 - *  Licensed under GNU General Public License version 2
    13.7 - *
    13.8 - * Author: seanhalle@yahoo.com
    13.9 - *
   13.10 - */
   13.11 -
   13.12 -#include <math.h>
   13.13 -
   13.14 -#include "SSR_Matrix_Mult.h"
   13.15 -
   13.16 -
   13.17 -
   13.18 -/*Every SSR system has an "entry point" function that creates the first
   13.19 - * processor, which starts the chain of creating more processors..
   13.20 - * eventually all of the processors will dissipate themselves, and
   13.21 - * return.
   13.22 - *
   13.23 - *This entry-point function follows the same pattern as all entry-point
   13.24 - * functions do:
   13.25 - *1) it creates the params for the seed processor, from the
   13.26 - *    parameters passed into the entry-point function
   13.27 - *2) it calls SSR__create_seed_procr_and_do_work
   13.28 - *3) it frees the params struc and returns the result matrix, which it
   13.29 - *    allocated itself before the call and handed over via the params struc
   13.30 - *
   13.31 - */
   13.32 -Matrix *
   13.33 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
   13.34 - { Matrix          *resMatrix;
   13.35 -   DividerParams   *dividerParams;
   13.36 -   int32            numResRows, numResCols;
   13.37 -
   13.38 -
   13.39 -   dividerParams              = malloc( sizeof( DividerParams ) ); //NOTE(review): the malloc results in this function are used without a NULL check
   13.40 -   dividerParams->leftMatrix  = leftMatrix;
   13.41 -   dividerParams->rightMatrix = rightMatrix;
   13.42 -
   13.43 -
   13.44 -   numResRows  = leftMatrix->numRows;
   13.45 -   numResCols  = rightMatrix->numCols;
   13.46 -
   13.47 -      //VMS has its own separate internal malloc, so to get results out,
   13.48 -      // have to pass in empty array for it to fill up
   13.49 -      //The alternative is internally telling SSR make external space to use
   13.50 -   resMatrix            = malloc( sizeof(Matrix) );
   13.51 -   resMatrix->array     = malloc( numResRows * numResCols * sizeof(float32)); //NOTE(review): not zeroed here, while accumulateResult in Result_Pr.c adds into the result array with += -- confirm the divider clears it first
   13.52 -   resMatrix->numCols   = rightMatrix->numCols;
   13.53 -   resMatrix->numRows   = leftMatrix->numRows;
   13.54 -
   13.55 -
   13.56 -   dividerParams->resultMatrix   = resMatrix;
   13.57 -
   13.58 -      //create divider processor, start doing the work, and wait till done
   13.59 -      //This function is the "border crossing" between normal code and SSR
   13.60 -   SSR__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs,
   13.61 -                                       dividerParams );
   13.62 -   
   13.63 -   free( dividerParams ); //only the params struc is freed; resMatrix and its array go back to the caller
   13.64 -   return resMatrix;
   13.65 - }

    14.1 --- a/src/Application/SSR_Matrix_Mult/Result_Pr.c	Wed Sep 07 13:06:25 2011 +0200
    14.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.3 @@ -1,108 +0,0 @@
    14.4 -/*
    14.5 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
    14.6 - *  Licensed under GNU General Public License version 2
    14.7 - *
    14.8 - * Author: seanhalle@yahoo.com
    14.9 - *
   14.10 - */
   14.11 -
   14.12 -#include "SSR_Matrix_Mult.h"
   14.13 -
   14.14 -//=====================
   14.15 -void inline   //forward declaration; the definition is at the end of this file
   14.16 -accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
   14.17 -                  int32    startRow,      //first row of the target region in resultArray
   14.18 -                  int32    numRows,       //rows in the sub-matrix result
   14.19 -                  int32    startCol,      //first col of the target region in resultArray
   14.20 -                  int32    numCols,       //cols in the sub-matrix result, also its row stride
   14.21 -                  int32    numOrigCols ); //row stride used to index resultArray
   14.22 -
   14.23 -//===========================================================================
   14.24 -
   14.25 -/*The Result Processor gets a message from each of the vector processors,
   14.26 - * puts the result from the message in its location in the result-
   14.27 - * matrix, and increments the count of results.
   14.28 - *
   14.29 - *After the count reaches the point that all results have been received, it
   14.30 - * sends a hand-shake message to the divider processor and dissipates.
   14.31 - */
   14.32 -void gatherResults( void *_params, VirtProcr *animatingPr )
   14.33 - { VirtProcr *dividerPr;
   14.34 -   ResultsParams  *params;
   14.35 -   int             row, col, numRows, numCols, numSubMatrixPairs, count=0; //row and col are unused; numRows and numCols are set but not read below
   14.36 -   float32        *resultArray;
   14.37 -   void           *msg;
   14.38 -   SMPairParams   *resParams;
   14.39 -
   14.40 -         DEBUG( dbgAppFlow, "start resultPr\n")
   14.41 -         
   14.42 -   params    = (ResultsParams *)_params;
   14.43 -   dividerPr = params->dividerPr;
   14.44 -   numSubMatrixPairs = params->numSubMatrixPairs;
   14.45 -   numRows = params->numRows;
   14.46 -   numCols = params->numCols;
   14.47 -
   14.48 -   resultArray = params->resultArray;
   14.49 -
   14.50 -
   14.51 -   while( count < numSubMatrixPairs )
   14.52 -    {
   14.53 -      msg = SSR__receive_type_to( RESULTS_MSG, animatingPr ); //calcSubMatrixProduct sends its SMPairParams with this message type
   14.54 -
   14.55 -      resParams = (SMPairParams *)msg;
   14.56 -      accumulateResult( resultArray, resParams->partialResultArray,
   14.57 -                        resParams->leftSubMatrix->origStartRow,
   14.58 -                        resParams->leftSubMatrix->numRows,
   14.59 -                        resParams->rightSubMatrix->origStartCol,
   14.60 -                        resParams->rightSubMatrix->numCols,
   14.61 -                        resParams->rightSubMatrix->origMatrix->numCols ); //result row stride is taken from the right input matrix's numCols
   14.62 -
   14.63 -      SSR__free( resParams->partialResultArray, animatingPr );
   14.64 -      
   14.65 -         //there is only one copy of results procr, so can update numUsesLeft
   14.66 -         // without concurrency worries.  When zero, free the sub-matrix
   14.67 -      resParams->leftSubMatrix->numUsesLeft -= 1;
   14.68 -      if( resParams->leftSubMatrix->numUsesLeft == 0 )
   14.69 -       {
   14.70 -         SSR__free( resParams->leftSubMatrix->array, animatingPr );
   14.71 -         SSR__free( resParams->leftSubMatrix, animatingPr );
   14.72 -       }
   14.73 -
   14.74 -      resParams->rightSubMatrix->numUsesLeft -= 1;
   14.75 -      if( resParams->rightSubMatrix->numUsesLeft == 0 )
   14.76 -       {
   14.77 -         SSR__free( resParams->rightSubMatrix->array, animatingPr );
   14.78 -         SSR__free( resParams->rightSubMatrix, animatingPr );
   14.79 -       }
   14.80 -         //NOTE(review): resParams itself (the message struc) is not freed in this loop -- confirm who releases it
   14.81 -         //count of how many sub-matrix pairs accumulated so know when done
   14.82 -      count++;
   14.83 -    }
   14.84 -
   14.85 -      //Done -- could just dissipate -- SSR will wait for all processors to
   14.86 -      // dissipate before shutting down, and thereby making results available to
   14.87 -      // outside, so no need to stop the divider from dissipating, so no need
   14.88 -      // to send a hand-shake message to it -- but makes debug easier
   14.89 -   SSR__send_from_to( NULL, animatingPr, dividerPr );
   14.90 -   SSR__dissipate_procr( animatingPr );  //frees any data owned by procr
   14.91 - }
   14.92 -
   14.93 -void inline   //Adds a numRows x numCols sub-matrix result into resultArray at offset (startRow, startCol)
   14.94 -accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray,
   14.95 -                  int32    startRow,
   14.96 -                  int32    numRows,
   14.97 -                  int32    startCol,
   14.98 -                  int32    numCols,
   14.99 -                  int32    numOrigCols )
  14.100 - { int32 row, col;
  14.101 -
  14.102 -   for( row = 0; row < numRows; row++ )
  14.103 -    {
  14.104 -      for( col = 0; col < numCols; col++ )
  14.105 -       {
  14.106 -         resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] += //adds into, not overwrites, so the target must already hold valid values
  14.107 -            subMatrixPairResultArray[ row * numCols + col ];
  14.108 -       }
  14.109 -    }
  14.110 -
  14.111 - }

    15.1 --- a/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h	Wed Sep 07 13:06:25 2011 +0200
    15.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.3 @@ -1,97 +0,0 @@
    15.4 -/*
    15.5 - *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
    15.6 - *  Licensed under GNU General Public License version 2
    15.7 - */
    15.8 -
    15.9 -#ifndef _SSR_MATRIX_MULT_H_
   15.10 -#define _SSR_MATRIX_MULT_H_
   15.11 -
   15.12 -#include <stdio.h>
   15.13 -
   15.14 -#include "../../SSR_lib/SSR.h"
   15.15 -#include "../Matrix_Mult.h"
   15.16 -
   15.17 -
   15.18 -//===============================  Defines  ==============================
   15.19 -#define ROWS_IN_BLOCK 32 //tile sizes used by multiplyMatrixArraysTransposed in subMatrix_Pr.c
   15.20 -#define COLS_IN_BLOCK 32
   15.21 -#define VEC_IN_BLOCK  32
   15.22 -
   15.23 -#define copyMatrixSingleton 1
   15.24 -#define copyTransposeSingleton 2
   15.25 -
   15.26 -//==============================  Structures  ==============================
   15.27 -typedef struct   //params for the divider (seed) processor; filled in by multiplyTheseMatrices
   15.28 - {
   15.29 -   Matrix *leftMatrix;
   15.30 -   Matrix *rightMatrix;
   15.31 -   Matrix *resultMatrix; //allocated by the entry point before the SSR work starts
   15.32 - }
   15.33 -DividerParams;
   15.34 -
   15.35 -typedef struct   //params read by gatherResults
   15.36 - {
   15.37 -   VirtProcr *dividerPr; //receives the final hand-shake message
   15.38 -   int numRows;
   15.39 -   int numCols;
   15.40 -   int numSubMatrixPairs; //number of result messages gatherResults waits for
   15.41 -   float32 *resultArray;
   15.42 - }
   15.43 -ResultsParams;
   15.44 -
   15.45 -typedef
   15.46 -struct   //one rectangular piece of an input matrix
   15.47 - { int32    numRows;
   15.48 -   int32    numCols;
   15.49 -   Matrix  *origMatrix;
   15.50 -   int32    origStartRow; //position of this piece's first row in origMatrix
   15.51 -   int32    origStartCol; //position of this piece's first col in origMatrix
   15.52 -   int32    alreadyCopied; //read by freeSubMatrices; not set by createSubMatrices
   15.53 -   int32    numUsesLeft; //have update via message to avoid multiple writers
   15.54 -   SSRSingleton *copySingleton;
   15.55 -   SSRSingleton *copyTransSingleton;
   15.56 -   float32 *array;  //2D, but dynamically sized, so use addr arith
   15.57 - }
   15.58 -SubMatrix;
   15.59 -
   15.60 -typedef struct   //params of one sub-matrix pair processor; also sent on as the RESULTS_MSG payload
   15.61 - { VirtProcr *resultPr;
   15.62 -   SubMatrix *leftSubMatrix;
   15.63 -   SubMatrix *rightSubMatrix;
   15.64 -   float32   *partialResultArray; //set by calcSubMatrixProduct, freed by gatherResults
   15.65 - }
   15.66 -SMPairParams;
   15.67 -
   15.68 -typedef
   15.69 -struct   //how one matrix dimension is cut into slices
   15.70 - { int32    numVals;   //number of slices
   15.71 -   int32   *startVals; //start index of each slice, plus one "fake" start just above the last
   15.72 - }
   15.73 -SlicingStruc;
   15.74 -
   15.75 -typedef
   15.76 -struct   //bundles the three slicings for hand-over to makeSubMatricesAndProcrs
   15.77 - {
   15.78 -   SlicingStruc *leftRowSlices;
   15.79 -   SlicingStruc *vecSlices;
   15.80 -   SlicingStruc *rightColSlices;
   15.81 - }
   15.82 -SlicingStrucCarrier;
   15.83 -
   15.84 -enum MMMsgType
   15.85 - {
   15.86 -   RESULTS_MSG = 1 //type of the message a pair processor sends to the result processor
   15.87 - };
   15.88 -
   15.89 -//============================= Processor Functions =========================
   15.90 -void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr );
   15.91 -void calcSubMatrixProduct(        void *data, VirtProcr *animatingPr );
   15.92 -void gatherResults(     void *data, VirtProcr *animatingPr );
   15.93 -
   15.94 -
   15.95 -//================================ Entry Point ==============================
   15.96 -Matrix *
   15.97 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
   15.98 -
   15.99 -
  15.100 -#endif /*_SSR_MATRIX_MULT_H_*/

    16.1 --- a/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c	Wed Sep 07 13:06:25 2011 +0200
    16.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    16.3 @@ -1,319 +0,0 @@
    16.4 -/* 
    16.5 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
    16.6 - *  Licensed under GNU General Public License version 2
    16.7 - *
    16.8 - * Author: SeanHalle@yahoo.com
    16.9 - *
   16.10 - */
   16.11 -
   16.12 -#include <string.h>
   16.13 -
   16.14 -#include "SSR_Matrix_Mult.h"
   16.15 -
   16.16 -
   16.17 -
   16.18 -void inline   //forward declarations of the helpers in this file
   16.19 -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
   16.20 -
   16.21 -void inline
   16.22 -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
   16.23 -
   16.24 -void inline   //multiplies one tile; all start/end indexes are inclusive
   16.25 -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
   16.26 -                     float32 *resArray,
   16.27 -                     int startRow,  int endRow,
   16.28 -                     int startCol,  int endCol,
   16.29 -                     int startVec,  int endVec,
   16.30 -                     int resStride, int inpStride );
   16.31 -
   16.32 -void inline   //walks the tiles and calls multiplySubBlocksTransposed on each
   16.33 -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols,
   16.34 -                      float32 *leftArray, float32 *rightArray,
   16.35 -                      float32 *resArray );
   16.36 -
   16.37 -
   16.38 -/*A  processor is created with an environment that holds two matrices,
   16.39 - * the row and col that it owns, and the name of a result gathering
   16.40 - * processor.
   16.41 - *It calculates the product of two sub-portions of the input matrices
   16.42 - * by calling multiplyMatrixArraysTransposed (NOTE(review): this comment used to say Intel's mkl library; no mkl call is visible in this function).
   16.43 - *
   16.44 - *This demonstrates using optimized single-threaded code inside scheduled
   16.45 - * work-units.
   16.46 - *
   16.47 - *When done, it sends the result to the result processor
   16.48 - */
   16.49 -void
   16.50 -calcSubMatrixProduct( void *data, VirtProcr *animatingPr )
   16.51 - { 
   16.52 -   SMPairParams   *params;
   16.53 -   VirtProcr      *resultPr;
   16.54 -   float32        *leftArray,  *rightArray, *resArray;
   16.55 -   SubMatrix      *leftSubMatrix, *rightSubMatrix;
   16.56 -
   16.57 -         DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID)
   16.58 -         #ifdef TURN_ON_DEBUG_PROBES
   16.59 -         int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx",
   16.60 -                                                                animatingPr);
   16.61 -         VMS__record_sched_choice_into_probe( subMatrixProbe, animatingPr );
   16.62 -         VMS__record_interval_start_in_probe( subMatrixProbe );
   16.63 -         #endif
   16.64 -
   16.65 -   params         = (SMPairParams *)data;
   16.66 -   resultPr       = params->resultPr;
   16.67 -   leftSubMatrix  = params->leftSubMatrix;
   16.68 -   rightSubMatrix = params->rightSubMatrix;
   16.69 -
   16.70 -      //make sure the input sub-matrices have been copied out of orig
   16.71 -      //do it here, inside sub-matrix pair to hopefully gain reuse in cache
   16.72 -   copyFromOrig( leftSubMatrix, animatingPr );
   16.73 -   copyTransposeFromOrig( rightSubMatrix, animatingPr ); //right side is used in transposed form by the multiply below
   16.74 -   
   16.75 -   leftArray      = leftSubMatrix->array;
   16.76 -   rightArray     = rightSubMatrix->array;
   16.77 -
   16.78 -   int32
   16.79 -   resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); //size in bytes
   16.80 -   resArray = SSR__malloc_to( resSize, animatingPr );
   16.81 -   memset( resArray, 0, resSize ); //zeroed because the tile multiply adds into resArray with +=
   16.82 -
   16.83 -
   16.84 -   int32 numResRows, numResCols, vectLength;
   16.85 -   
   16.86 -   vectLength = leftSubMatrix->numCols; //shared dimension, taken from the left sub-matrix
   16.87 -   numResRows = leftSubMatrix->numRows;
   16.88 -   numResCols = rightSubMatrix->numCols;
   16.89 -
   16.90 -   multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
   16.91 -                         leftArray,  rightArray,
   16.92 -                         resArray );
   16.93 -
   16.94 -   //send result to result processor
   16.95 -   params->partialResultArray = resArray; //the params struc itself is the message payload
   16.96 -
   16.97 -         #ifdef TURN_ON_DEBUG_PROBES
   16.98 -         VMS__record_interval_end_in_probe( subMatrixProbe );
   16.99 -         #endif
  16.100 -         
  16.101 -   SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr );
  16.102 -   SSR__dissipate_procr( animatingPr );
  16.103 - }
  16.104 -
  16.105 -
  16.106 -
  16.107 -/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into
  16.108 - * the 32KB L1 cache.
  16.109 - *Would be nice to embed this within another level that divided into
  16.110 - * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
  16.111 - *
  16.112 - *Eventually want these divisions to be automatic, using DKU pattern
  16.113 - * embedded into VMS and exposed in the language, and with VMS controlling the
  16.114 - * divisions according to the cache sizes, which it knows about.
  16.115 - *Also, want VMS to work with language to split among main-mems, so a socket
  16.116 - * only cranks on data in its local segment of main mem
  16.117 - *
  16.118 - *So, outer two loops determine start and end points within the result matrix.
  16.119 - * Inside that, a loop dets the start and end points along the shared dimensions
  16.120 - * of the two input matrices.
  16.121 - */
  16.122 -void inline
  16.123 -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
  16.124 -                                int32 numResCols,
  16.125 -                                float32 *leftArray, float32 *rightArray,
  16.126 -                                float32 *resArray )
  16.127 - {
  16.128 -   int resStride, inpStride;
  16.129 -   int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
  16.130 -
  16.131 -   resStride  = numResCols;
  16.132 -   inpStride  = vecLength; //both inputs are indexed with this stride because the right one is transposed
  16.133 -
  16.134 -   for( resStartRow = 0; resStartRow < numResRows; )
  16.135 -    {
  16.136 -      resEndRow = resStartRow + ROWS_IN_BLOCK -1;  //start at zero, so -1
  16.137 -      if( resEndRow > numResRows ) resEndRow = numResRows -1; //NOTE(review): resEndRow == numResRows passes unclamped, which is one past the last row -- looks like >= was intended; same for the col and vec clamps below; confirm
  16.138 -
  16.139 -      for( resStartCol = 0; resStartCol < numResCols; )
  16.140 -       {
  16.141 -         resEndCol   = resStartCol + COLS_IN_BLOCK -1;
  16.142 -         if( resEndCol > numResCols ) resEndCol = numResCols -1;
  16.143 -
  16.144 -         for( startVec = 0; startVec < vecLength; )
  16.145 -          {
  16.146 -            endVec   = startVec + VEC_IN_BLOCK -1;
  16.147 -            if( endVec > vecLength ) endVec = vecLength -1;
  16.148 -
  16.149 -               //By having the "vector" of sub-blocks in a sub-block slice
  16.150 -               // be marched down in inner loop, are re-using the result
  16.151 -               // matrix, which stays in L1 cache and re-using the left sub-mat
  16.152 -               // which repeats for each right sub-mat -- can only re-use two of
  16.153 -               // the three, so result is the most important -- avoids writing
  16.154 -               // dirty blocks until those result-locations fully done
  16.155 -               //Row and Col is position in result matrix -- so row and vec
  16.156 -               // for left array, then vec and col for right array
  16.157 -            multiplySubBlocksTransposed( leftArray, rightArray,
  16.158 -                                         resArray,
  16.159 -                                         resStartRow,  resEndRow,
  16.160 -                                         resStartCol,  resEndCol,
  16.161 -                                         startVec,  endVec,
  16.162 -                                         resStride, inpStride );
  16.163 -            startVec = endVec +1; //loop headers have no increment; each loop advances by its block here
  16.164 -          }
  16.165 -         resStartCol = resEndCol +1;
  16.166 -       }
  16.167 -      resStartRow = resEndRow +1;
  16.168 -    }
  16.169 - }
  16.170 -
  16.171 -
  16.172 -
  16.173 -void inline   //Multiplies one tile: adds each row-by-row dot product over [startVec, endVec] into resArray
  16.174 -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
  16.175 -                     float32 *resArray,
  16.176 -                     int resStartRow,  int resEndRow,
  16.177 -                     int resStartCol,  int resEndCol,
  16.178 -                     int startVec,  int endVec,
  16.179 -                     int resStride, int inpStride )
  16.180 - {
  16.181 -   int resRow,     resCol,        vec;
  16.182 -   int leftOffset, rightOffset;
  16.183 -   float32 result;
  16.184 -
  16.185 -      //The result row is used only for the left matrix, res col for the right
  16.186 -   for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) //all end indexes are inclusive
  16.187 -    {
  16.188 -      for( resRow = resStartRow; resRow <= resEndRow; resRow++ )
  16.189 -       {
  16.190 -         leftOffset  = resRow * inpStride;//left & right inp strides always same
  16.191 -         rightOffset = resCol * inpStride;// because right is transposed
  16.192 -         result = 0;
  16.193 -         for( vec = startVec; vec <= endVec; vec++ )
  16.194 -          {
  16.195 -            result +=
  16.196 -               leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
  16.197 -          }
  16.198 -
  16.199 -         resArray[ resRow * resStride + resCol ] += result; //adds this vec-range's partial sum to what earlier vec blocks left there
  16.200 -       }
  16.201 -    }
  16.202 - }
  16.203 -
  16.204 -
  16.205 -
  16.206 -
  16.207 -/*Reuse this in divider when do the sequential multiply case
  16.208 - */
  16.209 -void inline
  16.210 -copyTranspose( int32 numRows, int32 numCols,
  16.211 -               int32 origStartRow, int32 origStartCol, int32 origStride,
  16.212 -               float32 *subArray, float32 *origArray )
  16.213 - { int32 stride = numRows;
  16.214 - 
  16.215 -   int row, col, origOffset;
  16.216 -   for( row = 0; row < numRows; row++ )
  16.217 -    {
  16.218 -      origOffset = (row + origStartRow) * origStride + origStartCol;
  16.219 -      for( col = 0; col < numCols; col++ )
  16.220 -       {
  16.221 -            //transpose means swap row & col -- traverse orig matrix normally
  16.222 -            // but put into reversed place in local array -- means the
  16.223 -            // stride is the numRows now, so col * numRows + row
  16.224 -         subArray[ col * stride + row ]  =  origArray[ origOffset + col ];
  16.225 -       }
  16.226 -    }
  16.227 - }
  16.228 -
  16.229 -void inline
  16.230 -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
  16.231 - { int numCols, numRows, origStartRow, origStartCol, origStride, stride;
  16.232 -   Matrix *origMatrix;
  16.233 -   float32 *origArray, *subArray;
  16.234 -
  16.235 -//   if( subMatrix->copyTransSingleton && \
  16.236 -//       subMatrix->copyTransSingleton->hasFinished ) \
  16.237 -//      return;
  16.238 -   SSR__start_data_singleton( &(subMatrix->copyTransSingleton), animPr );
  16.239 -
  16.240 -   if( subMatrix->copyTransSingleton->hasFinished )
  16.241 -    {
  16.242 -      printf("error!");
  16.243 -    }
  16.244 -
  16.245 -   origMatrix   = subMatrix->origMatrix;
  16.246 -   origArray    = origMatrix->array;
  16.247 -   numCols      = subMatrix->numCols;
  16.248 -   numRows      = subMatrix->numRows;
  16.249 -   origStartRow = subMatrix->origStartRow;
  16.250 -   origStartCol = subMatrix->origStartCol;
  16.251 -   origStride   = origMatrix->numCols;
  16.252 -
  16.253 -   subArray     = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr);
  16.254 -   subMatrix->array = subArray;
  16.255 -
  16.256 -      //copy values from orig matrix to local
  16.257 -   copyTranspose( numRows, numCols,
  16.258 -                  origStartRow, origStartCol, origStride,
  16.259 -                  subArray, origArray );
  16.260 -
  16.261 -   SSR__end_data_singleton( &(subMatrix->copyTransSingleton), animPr );
  16.262 -   
  16.263 -   return;
  16.264 - }
  16.265 -
  16.266 -
  16.267 -void inline
  16.268 -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
  16.269 - { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
  16.270 -   Matrix *origMatrix;
  16.271 -   float32 *origArray, *subArray;
  16.272 -
  16.273 -
  16.274 -      //This lets only a single VP execute the code between start and
  16.275 -      // end -- using start and end so that work runs outside the master.
  16.276 -      //Inside, if a second VP ever executes the start, it will be returned
  16.277 -      // from the end-point.
  16.278 -      //Note, for non-GCC, can add a second SSR call at the end, and inside
  16.279 -      // that one, look at the stack at the return addr & save that in an
  16.280 -      // array indexed by singletonID
  16.281 -//   if( subMatrix->copySingleton && subMatrix->copySingleton->hasFinished )\
  16.282 -      return;
  16.283 -   SSR__start_data_singleton( &(subMatrix->copySingleton), animPr );
  16.284 -   if( subMatrix->copySingleton->endInstrAddr )
  16.285 -    {
  16.286 -      printf("error!");
  16.287 -    }
  16.288 -
  16.289 -   if( subMatrix->copySingleton->hasFinished )
  16.290 -    {
  16.291 -      printf("error!");
  16.292 -    }
  16.293 -
  16.294 -   origMatrix    = subMatrix->origMatrix;
  16.295 -   origArray     = origMatrix->array;
  16.296 -   numCols       = subMatrix->numCols;
  16.297 -   numRows       = subMatrix->numRows;
  16.298 -   origStartRow  = subMatrix->origStartRow;
  16.299 -   origStartCol  = subMatrix->origStartCol;
  16.300 -   origStride    = origMatrix->numCols;
  16.301 -
  16.302 -   subArray     = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr);
  16.303 -   subMatrix->array = subArray;
  16.304 -
  16.305 -      //copy values from orig matrix to local
  16.306 -   stride        = numCols;
  16.307 -
  16.308 -   int row, col, offset, origOffset;
  16.309 -   for( row = 0; row < numRows; row++ )
  16.310 -    {
  16.311 -      offset     = row * stride;
  16.312 -      origOffset = (row + origStartRow) * origStride + origStartCol;
  16.313 -      for( col = 0; col < numCols; col++ )
  16.314 -       {
  16.315 -         subArray[ offset + col ]  =  origArray[ origOffset + col ];
  16.316 -       }
  16.317 -    }
  16.318 -
  16.319 -   SSR__end_data_singleton( &(subMatrix->copySingleton), animPr );
  16.320 -
  16.321 -   return;
  16.322 - }

    17.1 --- a/src/Application/main.c	Wed Sep 07 13:06:25 2011 +0200
    17.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    17.3 @@ -1,37 +0,0 @@
    17.4 -/*
    17.5 - *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
    17.6 - *  Licensed under GNU General Public License version 2
    17.7 - *
    17.8 - * author seanhalle@yahoo.com
    17.9 - */
   17.10 -
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>

#include "Matrix_Mult.h"
#include "SSR_Matrix_Mult/SSR_Matrix_Mult.h"
   17.16 -
   17.17 -char __ProgrammName[] = "Blocked Matrix Multiply";
   17.18 -char __DataSet[255];
   17.19 -/**
   17.20 - * 
   17.21 - */
   17.22 -int main( int argc, char **argv )
   17.23 - { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
   17.24 -   ParamBag    *paramBag;
   17.25 -   
   17.26 -   printf( "arguments: %s | %s\n", argv[0], argv[1] );
   17.27 -
   17.28 -   paramBag = makeParamBag();
   17.29 -   readParamFileIntoBag( argv[1], paramBag );
   17.30 -   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
   17.31 -   
   17.32 -   resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix );
   17.33 -
   17.34 -   printf("\nresult matrix: \n");
   17.35 -   printMatrix( resultMatrix );
   17.36 -//   SSR__print_stats();
   17.37 -   fflush(stdin);
   17.38 -   
   17.39 -   exit(0); //cleans up
   17.40 - }