changeset 15:69928a38d5af tip

updating to most recent repository structure -- not working yet
author Sean Halle <seanhalle@yahoo.com>
date Mon, 17 Sep 2012 18:19:52 -0700
parents f8d3f16e70c8
children
files .hgeol .hgignore .hgtags Matrix_Mult.c Matrix_Mult.h Vthread__Matrix_Mult/Divide_Pr.c Vthread__Matrix_Mult/EntryPoint.c Vthread__Matrix_Mult/Result_Pr.c Vthread__Matrix_Mult/Vthread__Matrix_Mult.h Vthread__Matrix_Mult/subMatrix_Pr.c __brch__default main.c src/Application/Matrix_Mult.c src/Application/Matrix_Mult.h src/Application/VPThread__Matrix_Mult/Divide_Pr.c src/Application/VPThread__Matrix_Mult/EntryPoint.c src/Application/VPThread__Matrix_Mult/Result_Pr.c src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c src/Application/main.c
diffstat 20 files changed, 2090 insertions(+), 1588 deletions(-) [+]
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/.hgeol	Mon Sep 17 18:19:52 2012 -0700
     1.3 @@ -0,0 +1,14 @@
     1.4 +
     1.5 +[patterns]
     1.6 +**.py = native
     1.7 +**.txt = native
     1.8 +**.c = native
     1.9 +**.h = native
    1.10 +**.cpp = native
    1.11 +**.java = native
    1.12 +**.class = bin
    1.13 +**.jar = bin
    1.14 +**.sh = native
    1.15 +**.pl = native
    1.16 +**.jpg = bin
    1.17 +**.gif = bin
     2.1 --- a/.hgignore	Fri Sep 16 20:12:34 2011 +0200
     2.2 +++ b/.hgignore	Mon Sep 17 18:19:52 2012 -0700
     2.3 @@ -1,7 +1,10 @@
     2.4 -nbproject
     2.5 -histograms
     2.6 -dist
     2.7 -build
     2.8 -Makefile
     2.9 -.dep.inc
    2.10 -scripts
    2.11 +syntax: glob
    2.12 +
    2.13 +histograms
    2.14 +nbproject
    2.15 +build
    2.16 +dist
    2.17 +c-ray-mt
    2.18 +*.ppm
    2.19 +*.o
    2.20 +*~
     3.1 --- a/.hgtags	Fri Sep 16 20:12:34 2011 +0200
     3.2 +++ b/.hgtags	Mon Sep 17 18:19:52 2012 -0700
     3.3 @@ -1,7 +1,1 @@
     3.4 -3e26de9d3e1e66f0556b2664e71740e6ed009eee V0
     3.5 -3e26de9d3e1e66f0556b2664e71740e6ed009eee V0
     3.6 -0000000000000000000000000000000000000000 V0
     3.7 -0000000000000000000000000000000000000000 V0
     3.8 -b6e07082767ab984194db038102e2edd7d1917fd V0
     3.9 -b6e07082767ab984194db038102e2edd7d1917fd V0
    3.10 -0000000000000000000000000000000000000000 V0
    3.11 +bef7a9083bd4f8b1fa4e171a36383e78a5f475d5 version_that_goes_into_paper
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/Matrix_Mult.c	Mon Sep 17 18:19:52 2012 -0700
     4.3 @@ -0,0 +1,167 @@
     4.4 +/*
     4.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     4.6 + *  Licensed under GNU General Public License version 2
     4.7 + *
     4.8 + * Author: seanhalle@yahoo.com
     4.9 + *
    4.10 + * Created on November 15, 2009, 2:35 AM
    4.11 + */
    4.12 +
    4.13 +#include <malloc.h>
    4.14 +#include <stdlib.h>
    4.15 +
    4.16 +#include "Matrix_Mult.h"
    4.17 +#include "C_Libraries/ParamHelper/Param.h"
    4.18 +
    4.19 +
    4.20 + 
    4.21 + void
    4.22 +initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
    4.23 +                               ParamBag *paramBag )
    4.24 + { char *leftMatrixFileName, *rightMatrixFileName;
    4.25 +   int   leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols;
    4.26 +   
    4.27 +      ParamStruc *param;
    4.28 +      param = getParamFromBag( "leftMatrixRows", paramBag );
    4.29 +   leftMatrixRows = param->intValue;
    4.30 +      param = getParamFromBag( "leftMatrixCols", paramBag );
    4.31 +   leftMatrixCols = param->intValue;
    4.32 +   *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols );
    4.33 +   
    4.34 +      param = getParamFromBag( "leftMatrixFileName", paramBag );
    4.35 +   leftMatrixFileName = param->strValue;  //no need to copy
    4.36 +   read_Matrix_From_File( *leftMatrix,  leftMatrixFileName );
    4.37 +   
    4.38 +      param = getParamFromBag( "rightMatrixRows", paramBag );
    4.39 +   rightMatrixRows = param->intValue;
    4.40 +      param = getParamFromBag( "rightMatrixCols", paramBag );
    4.41 +   rightMatrixCols = param->intValue;
    4.42 +   *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols );
    4.43 +   
    4.44 +      param = getParamFromBag( "rightMatrixFileName", paramBag );
    4.45 +   rightMatrixFileName = param->strValue;
    4.46 +   read_Matrix_From_File( *rightMatrix, rightMatrixFileName );
    4.47 + }
    4.48 +
    4.49 +
    4.50 +void parseLineIntoRow( char *line, float32* row );
    4.51 +
    4.52 +
    4.53 + void
    4.54 +read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName )
    4.55 + { int    row, maxRead, numRows, numCols;
    4.56 +   float32 *matrixStart;
    4.57 +   size_t lineSz = 0;
    4.58 +   FILE  *file;
    4.59 +   char  *line = NULL;
    4.60 +   
    4.61 +   lineSz = 50000; //max length of line in a matrix data file
    4.62 +   line = (char *) malloc( lineSz );
    4.63 +   if( line == NULL ) printf( "no mem for matrix line" );
    4.64 +   
    4.65 +   numRows = matrixStruc->numRows;
    4.66 +   numCols = matrixStruc->numCols;
    4.67 +   matrixStart = matrixStruc->array;
    4.68 +
    4.69 +   file = fopen( matrixFileName, "r" );
    4.70 +   if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);}
    4.71 +   fseek( file, 0, SEEK_SET );
    4.72 +   for( row = 0; row < numRows; row++ )
    4.73 +    {
    4.74 +      if( feof( file ) )  printf( "file ran out too soon" );
    4.75 +      maxRead = getline( &line, &lineSz, file );
    4.76 +      if( maxRead == -1 ) printf( "prob reading mat line");
    4.77 +      
    4.78 +      if( *line == '\n') continue; //blank line
    4.79 +      if( *line == '/' ) continue; //comment line
    4.80 +      
    4.81 +      parseLineIntoRow( line, matrixStart + row * numCols );
    4.82 +    }
    4.83 +   free( line );
    4.84 + }
    4.85 +
    4.86 +/*This function relies on each line having the proper number of cols.  It
    4.87 + * doesn't check, nor enforce, so if the file is improperly formatted it
    4.88 + * can write over unrelated memory
    4.89 + */
    4.90 + void
    4.91 +parseLineIntoRow( char *line, float32* row )
    4.92 + {
    4.93 +   char *valueStr, *searchPos;
    4.94 +   
    4.95 +      //read the float values
    4.96 +   searchPos = valueStr = line; //start
    4.97 +   
    4.98 +   for( ; *searchPos != 0; searchPos++)  //bit dangerous, should use buff len
    4.99 +    {
   4.100 +      if( *searchPos == '\n' ) //last col..  relying on well-formatted file
   4.101 +       { *searchPos = 0;
   4.102 +         *row = atof( valueStr );
   4.103 +         break;                                    //end FOR loop
   4.104 +       }
   4.105 +      if( *searchPos == ',' )
   4.106 +       { *searchPos = 0;                           //mark end of string
   4.107 +         *row = (float32) atof( valueStr );
   4.108 +         row += 1;                                 //address arith
   4.109 +            //skip any spaces before digits.. use searchPos + 1 to skip the 0
   4.110 +         for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++);
   4.111 +         valueStr = searchPos + 1;
   4.112 +       }
   4.113 +    }
   4.114 + }
   4.115 +
   4.116 + //==========================================================================
   4.117 +
   4.118 +/*In the "_Flat" version of constructor, do only malloc of the top data struc
   4.119 + * and set values in that top-level.  Don't malloc any sub-structures.
   4.120 + */
   4.121 + Matrix *
   4.122 +makeMatrix_Flat( int32 numRows, int32 numCols )
   4.123 + { Matrix * retMatrix;
   4.124 +   retMatrix = malloc( sizeof( Matrix ) );
   4.125 +   retMatrix->numRows = numRows;
   4.126 +   retMatrix->numCols = numCols;
   4.127 +
   4.128 +   return retMatrix;
   4.129 + }
   4.130 +
   4.131 + Matrix *
   4.132 +makeMatrix_WithResMat( int32 numRows, int32 numCols )
   4.133 + { Matrix * retMatrix;
   4.134 +   retMatrix = malloc( sizeof( Matrix ) );
   4.135 +   retMatrix->numRows = numRows;
   4.136 +   retMatrix->numCols = numCols;
   4.137 +   retMatrix->array  = malloc( numRows * numCols * sizeof(float32) );
   4.138 +
   4.139 +   return retMatrix;
   4.140 + }
   4.141 +
   4.142 + void
   4.143 +freeMatrix_Flat( Matrix * matrix )
   4.144 + { //( matrix );
   4.145 + }
   4.146 + void
   4.147 +freeMatrix( Matrix * matrix )
   4.148 + { free( matrix->array );
   4.149 +   free( matrix );
   4.150 + }
   4.151 +
   4.152 +void
   4.153 +printMatrix( Matrix *matrix )
   4.154 + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr;
   4.155 +   float32 *matrixArray;
   4.156 +
   4.157 +   numRows = rowsToPrint = matrix->numRows;
   4.158 +   numCols = colsToPrint = matrix->numCols;
   4.159 +   matrixArray = matrix->array;
   4.160 +
   4.161 +   rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed
   4.162 +   colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed
   4.163 +   for( r = 0; r < numRows; r += rowIncr )
   4.164 +    { for( c = 0; c < numCols; c += colIncr )
   4.165 +       { printf( "%3.1f | ", matrixArray[ r * numCols + c ] );
   4.166 +       }
   4.167 +      printf("\n");
   4.168 +    }
   4.169 + }
   4.170 +
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/Matrix_Mult.h	Mon Sep 17 18:19:52 2012 -0700
     5.3 @@ -0,0 +1,77 @@
     5.4 +/*
     5.5 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     5.6 + *  Licensed under GNU General Public License version 2
     5.7 + */
     5.8 +
     5.9 +#ifndef MATRIX_MULT_H_
    5.10 +#define MATRIX_MULT_H_
    5.11 +
    5.12 +#include <stdio.h>
    5.13 +#include <unistd.h>
    5.14 +#include <malloc.h>
    5.15 +
    5.16 +#include "VMS_impl/VMS_primitive_data_types.h"
    5.17 +#include "C_Libraries/ParamHelper/Param.h"
    5.18 +
    5.19 +//==============================  Structures  ==============================
    5.20 +
    5.21 +typedef
    5.22 +struct
    5.23 + { int32 numRows;
    5.24 +   int32 numCols;
    5.25 +   float32 *array;  //2D, but dynamically sized, so use addr arith
    5.26 + }
    5.27 +Matrix;
    5.28 +
    5.29 +/* This is the "appSpecificPiece" that is carried inside a DKUPiece.
    5.30 + *  In the DKUPiece data struc it is declared to be of type "void *".  This
    5.31 + *  allows the application to define any data structure it wants and put it
    5.32 + *  into a DKUPiece.
    5.33 + * When the app specific info is used, it is in app code, so it is cast to
    5.34 + *  the correct type to tell the compiler how to access fields.
    5.35 + * This keeps all app-specific things out of the DKU directory, as per the
    5.36 + *  DKU standard. */
    5.37 +typedef
    5.38 +struct
    5.39 + { 
    5.40 +      // pointers to shared data..  the result matrix must be created when the
    5.41 +      //  left and right matrices are put into the root ancestor DKUPiece.
    5.42 +   Matrix * leftMatrix;
    5.43 +   Matrix * rightMatrix;
    5.44 +   Matrix * resultMatrix;
    5.45 +
    5.46 +      // define the starting and ending boundaries for this piece of the
    5.47 +      //  result matrix.  These are derivable from the left and right
     5.48 + *  matrices, but they are included for readability of the code.
    5.49 +   int prodStartRow, prodEndRow;
    5.50 +   int prodStartCol, prodEndCol;
    5.51 +      // Start and end of the portion of the left matrix that contributes to
    5.52 +      //  this piece of the product
    5.53 +   int leftStartRow, leftEndRow;
    5.54 +   int leftStartCol, leftEndCol;
    5.55 +      // Start and end of the portion of the right matrix that contributes to
    5.56 +      //  this piece of the product
    5.57 +   int rightStartRow, rightEndRow;
    5.58 +   int rightStartCol, rightEndCol;
    5.59 + }
    5.60 +MatrixProdPiece;
    5.61 +
    5.62 +//==============================  Functions  ================================
    5.63 +void readFile();
    5.64 +
    5.65 +Matrix *makeMatrix( int32 numRows, int32 numCols );
    5.66 +Matrix *makeMatrix_Flat( int32 numRows, int32 numCols );
    5.67 +Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols );
    5.68 +void    freeMatrix_Flat( Matrix * matrix );
    5.69 +void    freeMatrix( Matrix * matrix );
    5.70 +void    printMatrix( Matrix *matrix );
    5.71 +
    5.72 +void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName );
    5.73 +
    5.74 +void
    5.75 +initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
    5.76 +                              ParamBag *paramBag );
    5.77 +
    5.78 +//===========================================================================
    5.79 +
    5.80 +#endif /*MATRIX_MULT_H_*/
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/Vthread__Matrix_Mult/Divide_Pr.c	Mon Sep 17 18:19:52 2012 -0700
     6.3 @@ -0,0 +1,646 @@
     6.4 +/*
     6.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     6.6 + *  Licensed under GNU General Public License version 2
     6.7 + *
     6.8 + * Author: seanhalle@yahoo.com
     6.9 + *
    6.10 + */
    6.11 +
    6.12 +
    6.13 +#include "Vthread__Matrix_Mult.h"
    6.14 +#include <math.h>
    6.15 +#include <string.h>
    6.16 +#include "Vthread_impl/Vthread.h"
    6.17 +
    6.18 +   //The time to compute this many result values should equal the time to
    6.19 +   // perform this division on a matrix of size gives that many result calcs
    6.20 +   //IE, size this so that sequential time to calc equals divide time
    6.21 +   // find the value by experimenting -- but divide time and calc time scale
    6.22 +   // same way, so this value should remain valid across hardware
    6.23 +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000
    6.24 +
    6.25 +
    6.26 +//===========================================================================
    6.27 +int inline
    6.28 +measureMatrixMultPrimitive( SlaveVP *animPr );
    6.29 +
    6.30 +SlicingStrucCarrier *
    6.31 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
    6.32 +                                 SlaveVP *animPr );
    6.33 +
    6.34 +SlicingStruc *
    6.35 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
    6.36 +                  SlaveVP *animPr );
    6.37 +
    6.38 +void
    6.39 +freeSlicingStruc( SlicingStruc *slicingStruc, SlaveVP *animPr );
    6.40 +
    6.41 +SubMatrix **
    6.42 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    6.43 +                   int32 numUses, Matrix *origMatrix, SlaveVP *animPr );
    6.44 +
    6.45 +void
    6.46 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    6.47 +                 SubMatrix **subMatrices, SlaveVP *animPr );
    6.48 +
    6.49 +void
    6.50 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
    6.51 +                                    SubMatrix **rightSubMatrices,
    6.52 +                                    int32 numRowIdxs, int32 numColIdxs,
    6.53 +                                    int32 numVecIdxs,
    6.54 +                                    SlaveVP *resultPr,
    6.55 +                                    SlaveVP *animatingPr );
    6.56 +
    6.57 +void
    6.58 +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix,
    6.59 +            SlicingStrucCarrier *slicingStrucCarrier,
    6.60 +            SlaveVP *resultPr, SlaveVP *animatingPr );
    6.61 +
    6.62 +void inline
    6.63 +copyTranspose( int32 numRows, int32 numCols,
    6.64 +               int32 origStartRow, int32 origStartCol, int32 origStride,
    6.65 +               float32 *subArray, float32 *origArray );
    6.66 +
    6.67 +void inline
    6.68 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
    6.69 +                                int32 numResCols,
    6.70 +                                float32 *leftArray, float32 *rightArray,
    6.71 +                                float32 *resArray );
    6.72 +
    6.73 +
    6.74 +
    6.75 +/*Divider creates one processor for every sub-matrix
    6.76 + * It hands them:
    6.77 + *  the name of the result processor that they should send their results to,
    6.78 + *  the left and right matrices, and the rows and cols they should multiply
    6.79 + * It first creates the result processor, then all the sub-matrixPair
    6.80 + *  processors,
    6.81 + *  then does a receive of a message from the result processor that gives
    6.82 + *  the divider ownership of the result matrix.
    6.83 + * Finally, the divider returns the result matrix out of the Vthread system.
    6.84 + *
    6.85 + * Divider chooses the size of sub-matrices via an algorithm that tries to
    6.86 + *  keep the minimum work above a threshold.  The threshold is machine-
    6.87 + *  dependent, so ask Vthread for min work-unit time to get a
    6.88 + *  given overhead
    6.89 + *
    6.90 + * Divide min work-unit cycles by measured-cycles for one matrix-cell
    6.91 + *  product -- gives the number of products needed in a min-size
    6.92 + *  matrix.
    6.93 + *
    6.94 + * So then, take cubed root of this to get the size of a side of min sub-
    6.95 + *  matrix.  That is the size of the ideal square sub-matrix -- so tile
    6.96 + *  up the two input matrices into ones as close as possible to that size,
    6.97 + *  and create the pairs of sub-matrices.
    6.98 + *
    6.99 + *========================  STRATEGIC OVERVIEW  =======================
   6.100 + *
   6.101 + *This division is a bit tricky, because have to create things in advance
   6.102 + * that it's not at first obvious need to be created..
   6.103 + *
   6.104 + *First slice up each dimension -- three of them..  this is because will have
   6.105 + * to create the sub-matrix's data-structures before pairing the sub-matrices
   6.106 + * with each other -- so, have three dimensions to slice up before can
   6.107 + * create the sub-matrix data-strucs -- also, have to be certain that the
   6.108 + * cols of the left input have the exact same slicing as the rows of the
   6.109 + * left matrix, so just to be sure, do the slicing calc once, then use it
   6.110 + * for both.
   6.111 + *
   6.112 + *So, goes like this:
   6.113 + *1) calculate the start & end values of each dimension in each matrix.
   6.114 + *2) use those values to create sub-matrix structures
   6.115 + *3) combine sub-matrices into pairs, as the tasks to perform.
   6.116 + *
   6.117 + *Have to calculate separately from creating the sub-matrices because of the
   6.118 + * nature of the nesting -- would either end up creating the same sub-matrix
   6.119 + * multiple times, or else would have to put in detection of whether had
   6.120 + * made a particular one already if tried to combine steps 1 and 2.
   6.121 + *
   6.122 + *Step 3 has to be separate because of the nesting, as well -- same reason,
   6.123 + * would either create same sub-matrix multiple times, or else have to
   6.124 + * add detection of whether was already created.
   6.125 + *
   6.126 + *Another way to look at it: there's one level of loop to divide dimensions,
   6.127 + * two levels of nesting to create sub-matrices, and three levels to pair
   6.128 + * up the sub-matrices.
   6.129 + */
   6.130 +
   6.131 +void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
   6.132 +                                        SlaveVP *animatingThd )
   6.133 + { SlaveVP         *resultPr;
   6.134 +   DividerParams   *dividerParams;
   6.135 +   ResultsParams   *resultsParams;
   6.136 +   Matrix          *leftMatrix, *rightMatrix;
   6.137 +   SlicingStrucCarrier *slicingStrucCarrier;
   6.138 +   float32             *resultArray; //points to array inside result matrix
   6.139 +   MatrixMultGlobals   *globals;
   6.140 +  
   6.141 +         DEBUG( dbgAppFlow, "start divide\n")
   6.142 +
   6.143 +         int32
   6.144 +         divideProbe = VMS_App__create_single_interval_probe( "divideProbe",
   6.145 +                                                          animatingThd );
   6.146 +         VMS_App__record_sched_choice_into_probe( divideProbe, animatingThd );
   6.147 +         VMS_App__record_interval_start_in_probe( divideProbe );
   6.148 +
   6.149 +   //=========== Setup -- make local copies of ptd-to-things, malloc, aso
   6.150 +   int32 numResRows, numResCols, vectLength;
   6.151 +
   6.152 +   dividerParams   = (DividerParams *)_dividerParams;
   6.153 +   
   6.154 +   leftMatrix      = dividerParams->leftMatrix;
   6.155 +   rightMatrix     = dividerParams->rightMatrix;
   6.156 +
   6.157 +   vectLength = leftMatrix->numCols;
   6.158 +   numResRows = leftMatrix->numRows;
   6.159 +   numResCols = rightMatrix->numCols;
   6.160 +   resultArray     = dividerParams->resultMatrix->array;
   6.161 +   
   6.162 +      //zero the result array
   6.163 +   memset( resultArray, 0, numResRows * numResCols * sizeof(float32) );
   6.164 +
   6.165 +   //==============  Do either sequential mult or do division ==============
   6.166 +
   6.167 +      //Check if input matrices too small -- if yes, just do sequential
   6.168 +      //Cutoff is determined by overhead of this divider -- relatively
   6.169 +      // machine-independent
   6.170 +   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
   6.171 +       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
   6.172 +    {
   6.173 +      //====== Do sequential multiply on a single core
   6.174 +            DEBUG( dbgAppFlow, "doing sequential")
   6.175 +            
   6.176 +         //transpose the right matrix
   6.177 +      float32 *
   6.178 +      transRightArray  = 
   6.179 +         Vthread__malloc( (size_t)rightMatrix->numRows * 
   6.180 +                                (size_t)rightMatrix->numCols *
   6.181 +                                sizeof(float32), animatingThd );
   6.182 +
   6.183 +         //copy values from orig matrix to local
   6.184 +      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
   6.185 +                     0, 0, rightMatrix->numRows,
   6.186 +                     transRightArray, rightMatrix->array );
   6.187 +      
   6.188 +      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
   6.189 +                            leftMatrix->array, transRightArray,
   6.190 +                            resultArray );
   6.191 +    }
   6.192 +   else
   6.193 +    {
   6.194 +      //====== Do parallel multiply across cores
   6.195 +
   6.196 +            DEBUG( dbgAppFlow, "divider: do parallel mult\n")
   6.197 +         //Calc the ideal size of sub-matrix and slice up the dimensions of
   6.198 +         // the two matrices.
   6.199 +         //The ideal size is the one takes the number of cycles to calculate
   6.200 +         // such that calc time is equal or greater than min work-unit size
   6.201 +      slicingStrucCarrier =
   6.202 +         calcIdealSizeAndSliceDimensions(leftMatrix,rightMatrix,animatingThd);
   6.203 +
   6.204 +         //Make the results processor, now that know how many to wait for
   6.205 +      resultsParams = Vthread__malloc( sizeof(ResultsParams), animatingThd );
   6.206 +      resultsParams->numSubMatrixPairs  =
   6.207 +         slicingStrucCarrier->leftRowSlices->numVals *
   6.208 +         slicingStrucCarrier->rightColSlices->numVals *
   6.209 +         slicingStrucCarrier->vecSlices->numVals;
   6.210 +      resultsParams->dividerPr   = animatingThd;
   6.211 +      resultsParams->numCols     = rightMatrix->numCols;
   6.212 +      resultsParams->numRows     = leftMatrix->numRows;
   6.213 +      resultsParams->resultArray = resultArray;
   6.214 +
   6.215 +      //==========  Set up global vars, including conds and mutexes ==========
   6.216 +      globals = VMS_App__malloc( sizeof(MatrixMultGlobals) );
   6.217 +      Vthread__set_globals_to( globals );
   6.218 +
   6.219 +      globals->results_mutex = Vthread__make_mutex( animatingThd );
   6.220 +      globals->results_cond  = Vthread__make_cond( globals->results_mutex,
   6.221 +                                                               animatingThd );
   6.222 +
   6.223 +      globals->vector_mutex = Vthread__make_mutex( animatingThd );
   6.224 +      globals->vector_cond  = Vthread__make_cond( globals->vector_mutex,
   6.225 +                                                               animatingThd );
   6.226 +
   6.227 +      globals->start_mutex = Vthread__make_mutex( animatingThd );
   6.228 +      globals->start_cond  = Vthread__make_cond( globals->start_mutex,
   6.229 +                                                               animatingThd );
   6.230 +      //======================================================================
   6.231 +
   6.232 +            DEBUG( dbgAppFlow, "divider: made mutexes and conds\n")
   6.233 +         //get results-comm lock before create results-thd, to ensure it can't
   6.234 +         // signal that results are available before this thd is waiting on cond
   6.235 +      Vthread__mutex_lock( globals->results_mutex, animatingThd );
   6.236 +
   6.237 +         //also get the start lock & use to ensure no vector threads send a
   6.238 +         // signal before the results thread is waiting on vector cond
   6.239 +      Vthread__mutex_lock( globals->start_mutex, animatingThd );
   6.240 +
   6.241 +
   6.242 +            DEBUG( dbgAppFlow, "divider: make result thread\n")
   6.243 +      Vthread__create_thread( &gatherResults, resultsParams, animatingThd );
   6.244 +
   6.245 +         //Now wait for results thd to signal that it has vector lock
   6.246 +      Vthread__cond_wait(  globals->start_cond,  animatingThd );
   6.247 +      Vthread__mutex_unlock( globals->start_mutex, animatingThd );//done w/lock
   6.248 +            DEBUG( dbgAppFlow, "divider: make sub-matrices\n")
   6.249 +   
   6.250 +         //Make the sub-matrices, and pair them up, and make processor to
   6.251 +         // calc product of each pair.
   6.252 +      makeSubMatricesAndProcrs( leftMatrix, rightMatrix,
   6.253 +                                    slicingStrucCarrier,
   6.254 +                                    resultPr, animatingThd);
   6.255 + 
   6.256 +         //Wait for results thread to say results are good
   6.257 +      Vthread__cond_wait(  globals->results_cond,  animatingThd );
   6.258 +    }
   6.259 +
   6.260 +
   6.261 +   //===============  Work done -- send results back =================
   6.262 +
   6.263 +
   6.264 +         DEBUG( dbgAppFlow, "end divide\n")
   6.265 +
   6.266 +         VMS_App__record_interval_end_in_probe( divideProbe );
   6.267 +         VMS_App__print_stats_of_all_probes();
   6.268 +
   6.269 +      //nothing left to do so dissipate, Vthread will wait to shutdown,
   6.270 +      // making results available to outside, until all the processors have
   6.271 +      // dissipated -- so actually no need to wait for results processor
   6.272 +      //However, following the pattern, so done with comm, release lock
   6.273 +   Vthread__mutex_unlock( globals->results_mutex, animatingThd );
   6.274 +
   6.275 +   Vthread__dissipate_thread( animatingThd );  //all procrs dissipate self at end
   6.276 +      //when all of the processors have dissipated, the "create seed and do
   6.277 +      // work" call in the entry point function returns
   6.278 + }
   6.279 +
   6.280 +
   6.281 +SlicingStrucCarrier *
   6.282 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
   6.283 +                                 SlaveVP *animPr )
   6.284 + {
   6.285 +   float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
   6.286 +   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   6.287 +   SlicingStrucCarrier *slicingStrucCarrier =
   6.288 +                       Vthread__malloc(sizeof(SlicingStrucCarrier), animPr);
   6.289 +
   6.290 +   int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
   6.291 +   float64 numPrimitiveOpsInMinWorkUnit;
   6.292 +
   6.293 +
   6.294 +   //=======  Calc ideal size of min-sized sub-matrix  ========
   6.295 +
   6.296 +      //ask Vthread for the number of cycles of the minimum work unit, at given
   6.297 +      // percent overhead then add a guess at overhead from this divider
   6.298 +   minWorkUnitCycles = Vthread__giveMinWorkUnitCycles( .05 );
   6.299 +
   6.300 +      //ask Vthread for number of cycles of the "primitive" op of matrix mult
   6.301 +   primitiveCycles = measureMatrixMultPrimitive( animPr );
   6.302 +
   6.303 +   numPrimitiveOpsInMinWorkUnit =
   6.304 +      (float64)minWorkUnitCycles / (float64)primitiveCycles;
   6.305 +
   6.306 +      //take cubed root -- that's number of these in a "side" of sub-matrix
   6.307 +      // then multiply by 5 because the primitive is 5x5src/Application/Vthread__Matrix_Mult/Divide_Pr.c:403: warning: implicit declaration of function ‘Vthread__give_number_of_cores_to_schedule_onto’
    6.308 +      // then multiply by 5 because the primitive is 5x5
   6.309 +
   6.310 +   idealNumWorkUnits = Vthread__giveIdealNumWorkUnits();
   6.311 +   
   6.312 +   idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
   6.313 +   idealSizeOfSide2 *= 0.6; //finer granularity to help load balance
   6.314 +
   6.315 +   if( idealSizeOfSide1 > idealSizeOfSide2 )
   6.316 +      idealSizeOfSide = idealSizeOfSide1;
   6.317 +   else
   6.318 +      idealSizeOfSide = idealSizeOfSide2;
   6.319 +
   6.320 +      //The multiply inner loop blocks the array to fit into L1 cache
   6.321 +//   if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK;
   6.322 +
   6.323 +   //============  Slice up dimensions, now that know target size ===========
   6.324 +
   6.325 +      //Tell the slicer the target size of a side (floating pt), the start
   6.326 +      // value to start slicing at, and the end value to stop slicing at
   6.327 +      //It returns an array of start value of each chunk, plus number of them
   6.328 +   int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol;
   6.329 +   startLeftRow  = 0;
   6.330 +   endLeftRow    = leftMatrix->numRows -1;
   6.331 +   startVec      = 0;
   6.332 +   endVec        = leftMatrix->numCols -1;
   6.333 +   startRightCol = 0;
   6.334 +   endRightCol   = rightMatrix->numCols -1;
   6.335 +
   6.336 +   leftRowSlices =
   6.337 +      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow, animPr );
   6.338 +
   6.339 +   vecSlices =
   6.340 +      sliceUpDimension( idealSizeOfSide,  startVec, endVec, animPr );
   6.341 +
   6.342 +   rightColSlices =
   6.343 +      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol,animPr);
   6.344 +
   6.345 +   slicingStrucCarrier->leftRowSlices  = leftRowSlices;
   6.346 +   slicingStrucCarrier->vecSlices      = vecSlices;
   6.347 +   slicingStrucCarrier->rightColSlices = rightColSlices;
   6.348 +
   6.349 +   return slicingStrucCarrier;
   6.350 + }
   6.351 +
   6.352 +
   6.353 +void
   6.354 +makeSubMatricesAndProcrs( Matrix    *leftMatrix, Matrix    *rightMatrix,
   6.355 +            SlicingStrucCarrier *slicingStrucCarrier,
   6.356 +            SlaveVP *resultPr,   SlaveVP *animPr )
   6.357 + {
   6.358 +   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   6.359 +   
   6.360 +   leftRowSlices  = slicingStrucCarrier->leftRowSlices;
   6.361 +   vecSlices      = slicingStrucCarrier->vecSlices;
   6.362 +   rightColSlices = slicingStrucCarrier->rightColSlices;
   6.363 +   Vthread__free( slicingStrucCarrier, animPr );
   6.364 +   
   6.365 +   //================  Make sub-matrices, given the slicing  ================
   6.366 +   SubMatrix **leftSubMatrices, **rightSubMatrices;
   6.367 +   leftSubMatrices =
   6.368 +      createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals,
   6.369 +                         leftMatrix, animPr );
   6.370 +   //double_check_that_always_numRows_in_right_same_as_numCols_in_left();
   6.371 +   rightSubMatrices =
   6.372 +      createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals,
   6.373 +                         rightMatrix, animPr );
   6.374 +
   6.375 +
   6.376 +   //==============  pair the sub-matrices and make processors ==============
   6.377 +   int32 numRowIdxs, numColIdxs, numVecIdxs;
   6.378 +
   6.379 +   numRowIdxs = leftRowSlices->numVals;
   6.380 +   numColIdxs = rightColSlices->numVals;
   6.381 +   numVecIdxs = vecSlices->numVals;
   6.382 +   
   6.383 +   freeSlicingStruc( leftRowSlices, animPr );
   6.384 +   freeSlicingStruc( vecSlices, animPr );
   6.385 +   freeSlicingStruc( rightColSlices, animPr );
   6.386 +   
   6.387 +   pairUpSubMatricesAndMakeProcessors( leftSubMatrices,
   6.388 +                                       rightSubMatrices,
   6.389 +                                       numRowIdxs, numColIdxs,
   6.390 +                                       numVecIdxs,
   6.391 +                                       resultPr,
   6.392 +                                       animPr );
   6.393 + }
   6.394 +
   6.395 +
   6.396 +
   6.397 +
   6.398 +void
   6.399 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
   6.400 +                                    SubMatrix **rightSubMatrices,
   6.401 +                                    int32 numRowIdxs, int32 numColIdxs,
   6.402 +                                    int32 numVecIdxs,
   6.403 +                                    SlaveVP *resultPr,
   6.404 +                                    SlaveVP *animatingPr )
   6.405 + {
   6.406 +   int32 resRowIdx, resColIdx, vecIdx;
   6.407 +   int32 numLeftColIdxs, numRightColIdxs;
   6.408 +   int32 leftRowIdxOffset;
   6.409 +   SMPairParams *subMatrixPairParams;
   6.410 +   float32 numToPutOntoEachCore, leftOverFraction;
   6.411 +   int32 numCores, coreToScheduleOnto, numVecOnCurrCore;
   6.412 +
   6.413 +   numLeftColIdxs  = numColIdxs;
   6.414 +   numRightColIdxs = numVecIdxs;
   6.415 +
   6.416 +   numCores = Vthread__give_number_of_cores_to_schedule_onto();
   6.417 +
   6.418 +   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
   6.419 +   leftOverFraction = 0;
   6.420 +   numVecOnCurrCore = 0;
   6.421 +   coreToScheduleOnto = 0;
   6.422 +
   6.423 +   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
   6.424 +    {
   6.425 +      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
   6.426 +
   6.427 +      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
   6.428 +       {
   6.429 +         
   6.430 +         for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
   6.431 +          {
   6.432 +               //Make the processor for the pair of sub-matrices
   6.433 +            subMatrixPairParams  = Vthread__malloc( sizeof(SMPairParams),
   6.434 +                                                               animatingPr);
   6.435 +            subMatrixPairParams->leftSubMatrix  =
   6.436 +               leftSubMatrices[ leftRowIdxOffset + vecIdx ];
   6.437 +
   6.438 +            subMatrixPairParams->rightSubMatrix =
   6.439 +               rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
   6.440 +
   6.441 +            //subMatrixPairParams->resultPr = resultPr;
   6.442 +
   6.443 +               //put all pairs from the same vector onto same core
   6.444 +            Vthread__create_thread_with_affinity( &calcSubMatrixProduct,
   6.445 +                                                   subMatrixPairParams,
   6.446 +                                                   animatingPr,
   6.447 +                                                   coreToScheduleOnto );
   6.448 +          }
   6.449 +
   6.450 +            //Trying to distribute the subMatrix-vectors across the cores, so
   6.451 +            // that each core gets the same number of vectors, with a max
   6.452 +            // imbalance of 1 vector more on some cores than others
   6.453 +         numVecOnCurrCore += 1;
   6.454 +         if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 )
   6.455 +          {
   6.456 +               //deal with fractional part, to ensure that imbalance is 1 max
   6.457 +               // IE, core with most has only 1 more than core with least
   6.458 +            leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore;
   6.459 +            if( leftOverFraction >= 1 )
   6.460 +             { leftOverFraction -= 1;
   6.461 +               numVecOnCurrCore = -1;
   6.462 +             }
   6.463 +            else
   6.464 +             { numVecOnCurrCore = 0;
   6.465 +             }
   6.466 +               //Move to next core, max core-value to incr to is numCores -1
   6.467 +            if( coreToScheduleOnto >= numCores -1 )
   6.468 +             { coreToScheduleOnto = 0;
   6.469 +             }
   6.470 +            else
   6.471 +             { coreToScheduleOnto += 1;
   6.472 +             }
   6.473 +          }
   6.474 +
   6.475 +       }
   6.476 +    }
   6.477 + }
   6.478 +
   6.479 +
   6.480 +
   6.481 +/*Walk through the two slice-strucs, making sub-matrix strucs as go
   6.482 + */
   6.483 +SubMatrix **
   6.484 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   6.485 +                   int32 numUses, Matrix *origMatrix, SlaveVP *animPr )
   6.486 + {
   6.487 +   int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
   6.488 +   int32 startRow, endRow, startCol, endCol;
   6.489 +   int32 *rowStartVals, *colStartVals;
   6.490 +   int32 rowOffset;
   6.491 +   SubMatrix **subMatrices, *newSubMatrix;
   6.492 +
   6.493 +   numRowIdxs = rowSlices->numVals;
   6.494 +   numColIdxs = colSlices->numVals;
   6.495 +
   6.496 +   rowStartVals = rowSlices->startVals;
   6.497 +   colStartVals = colSlices->startVals;
   6.498 +
   6.499 +   subMatrices = Vthread__malloc((size_t)numRowIdxs *
   6.500 +                            (size_t)numColIdxs * sizeof(SubMatrix*),animPr );
   6.501 +
   6.502 +   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   6.503 +    {
   6.504 +      rowOffset = rowIdx * numColIdxs;
   6.505 +      
   6.506 +      startRow  = rowStartVals[rowIdx];
   6.507 +      endRow    = rowStartVals[rowIdx + 1] -1; //"fake" start above last is
   6.508 +                                               // at last valid idx + 1 & is
   6.509 +                                               // 1 greater than end value
   6.510 +      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
   6.511 +       {
   6.512 +         startCol = colStartVals[colIdx];
   6.513 +         endCol   = colStartVals[colIdx + 1] -1;
   6.514 +
   6.515 +         newSubMatrix = Vthread__malloc( sizeof(SubMatrix), animPr );
   6.516 +         newSubMatrix->numRows       = endRow - startRow +1;
   6.517 +         newSubMatrix->numCols       = endCol - startCol +1;
   6.518 +         newSubMatrix->origMatrix    = origMatrix;
   6.519 +         newSubMatrix->origStartRow  = startRow;
   6.520 +         newSubMatrix->origStartCol  = startCol;
   6.521 +         newSubMatrix->alreadyCopied = FALSE;
   6.522 +         newSubMatrix->numUsesLeft   = numUses; //can free after this many
   6.523 +         //Prevent uninitialized memory
   6.524 +         newSubMatrix->copySingleton = NULL;
   6.525 +         newSubMatrix->copyTransSingleton = NULL;
   6.526 +
   6.527 +         subMatrices[ rowOffset + colIdx ] = newSubMatrix;
   6.528 +       }
   6.529 +    }
   6.530 +   return subMatrices;
   6.531 + }
   6.532 +
   6.533 +
   6.534 +void
   6.535 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   6.536 +                 SubMatrix **subMatrices, SlaveVP *animPr )
   6.537 + {
   6.538 +   int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset;
   6.539 +   SubMatrix *subMatrix;
   6.540 +
   6.541 +   numRowIdxs = rowSlices->numVals;
   6.542 +   numColIdxs = colSlices->numVals;
   6.543 +
   6.544 +   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   6.545 +    {
   6.546 +      rowOffset = rowIdx * numColIdxs;
   6.547 +      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
   6.548 +       {
   6.549 +         subMatrix = subMatrices[ rowOffset + colIdx ];
   6.550 +         if( subMatrix->alreadyCopied )
   6.551 +            Vthread__free( subMatrix->array, animPr );
   6.552 +         Vthread__free( subMatrix, animPr );
   6.553 +       }
   6.554 +    }
   6.555 +   Vthread__free( subMatrices, animPr );
   6.556 + }
   6.557 +
   6.558 +
   6.559 +
/*Slice one matrix dimension, index range [startVal, endVal], into chunks
 * of roughly idealSizeOfSide each (fractional sizes are carried forward
 * so chunk sizes average out to the ideal).
 *Returns a SlicingStruc holding the start index of every chunk plus one
 * "fake" start just past the last valid index, and the number of chunks.
 *Caller releases the result with freeSlicingStruc.
 */
SlicingStruc *
sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
                  SlaveVP *animPr )
 { float32 residualAcc = 0;
   int     numSlices, i, *startVals, sizeOfSlice, endCondition;
   SlicingStruc *slicingStruc = Vthread__malloc(sizeof(SlicingStruc), animPr);

      //calc size of array needed to hold the start vals (plus fake start)
   numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);

   startVals = Vthread__malloc( ((size_t)numSlices + 1) * sizeof(int32), animPr );

      //Calc the upper limit of start value -- when get above this, end loop
      // by saving highest value of the matrix dimension to access, plus 1
      // as the start point of the imaginary slice following the last one
      //Plus 1 because go up to value but not include when process last slice
      //The stopping condition is half-a-size less than highest value because
      // don't want any pieces smaller than half the ideal size -- just tack
      // little ones onto end of last one
   endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size
   for( i = 0; startVal <= endVal; i++ )
    {
      startVals[i] = startVal;
         //accumulate the ideal (fractional) size, emit an integer-sized
         // slice, and carry the fractional remainder into the next slice
      residualAcc += idealSizeOfSide;
      sizeOfSlice  = (int)residualAcc;
      residualAcc -= (float32)sizeOfSlice;
      startVal    += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..

      if( startVal > endCondition )
       { startVal = endVal + 1;          //forces the loop to exit
         startVals[ i + 1 ] = startVal;  //the "fake" start past the end
       }
    }

   slicingStruc->startVals = startVals;
   slicingStruc->numVals   = i;  //loop incr'd, so == last valid start idx+1
                                 // which means is num sub-matrices in dim
                                 // also == idx of the fake start just above
   return slicingStruc;
 }
   6.600 +
   6.601 +void
   6.602 +freeSlicingStruc( SlicingStruc *slicingStruc, SlaveVP *animPr )
   6.603 + {
   6.604 +   Vthread__free( slicingStruc->startVals, animPr );
   6.605 +   Vthread__free( slicingStruc, animPr );
   6.606 + }
   6.607 +
   6.608 +
   6.609 +int inline
   6.610 +measureMatrixMultPrimitive( SlaveVP *animPr )
   6.611 + {
   6.612 +   int r, c, v, numCycles;
   6.613 +   float32 *res, *left, *right;
   6.614 +
   6.615 +      //setup inputs
   6.616 +   left  = Vthread__malloc( 5 * 5 * sizeof( float32 ), animPr );
   6.617 +   right = Vthread__malloc( 5 * 5 * sizeof( float32 ), animPr );
   6.618 +   res   = Vthread__malloc( 5 * 5 * sizeof( float32 ), animPr );
   6.619 +
   6.620 +   for( r = 0; r < 5; r++ )
   6.621 +    {
   6.622 +      for( c = 0; c < 5; c++ )
   6.623 +       {
   6.624 +         left[  r * 5 + c ] = r;
   6.625 +         right[ r * 5 + c ] = c;
   6.626 +       }
   6.627 +    }
   6.628 +
   6.629 +      //do primitive
   6.630 +   Vthread__start_primitive();  //for now, just takes time stamp
   6.631 +   for( r = 0; r < 5; r++ )
   6.632 +    {
   6.633 +      for( c = 0; c < 5; c++ )
   6.634 +       {
   6.635 +         for( v = 0; v < 5; v++ )
   6.636 +          {
   6.637 +            res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ];
   6.638 +          }
   6.639 +       }
   6.640 +    }
   6.641 +   numCycles =
   6.642 +      Vthread__end_primitive_and_give_cycles();
   6.643 +
   6.644 +   Vthread__free( left, animPr );
   6.645 +   Vthread__free( right, animPr );
   6.646 +   Vthread__free( res, animPr );
   6.647 +
   6.648 +   return numCycles;
   6.649 + }
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/Vthread__Matrix_Mult/EntryPoint.c	Mon Sep 17 18:19:52 2012 -0700
     7.3 @@ -0,0 +1,120 @@
     7.4 +/*
     7.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     7.6 + *  Licensed under GNU General Public License version 2
     7.7 + *
     7.8 + * Author: seanhalle@yahoo.com
     7.9 + *
    7.10 + */
    7.11 +
    7.12 +#include <math.h>
    7.13 +
    7.14 +#include "Vthread__Matrix_Mult.h"
    7.15 +
    7.16 +
    7.17 +
    7.18 +/*Every program for a VMS-based language has an "entry point" function that
    7.19 + * creates the first
    7.20 + * processor, which starts the chain of creating more processors;
    7.21 + * eventually all of the processors will dissipate themselves, and
    7.22 + * return.
    7.23 + *
    7.24 + *This entry-point function follows the same pattern as all entry-point
    7.25 + * functions do:
    7.26 + *1) it creates the params for the seed processor, from the
    7.27 + *    parameters passed into the entry-point function
    7.28 + *2) it calls create_seed_slaveVP_and_do_work
    7.29 + *3) it gets the return value from the params struc, frees the params struc,
    7.30 + *    and returns the value from the function
    7.31 + *
    7.32 + */
    7.33 +Matrix *
    7.34 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
    7.35 + { Matrix          *resMatrix;
    7.36 +   DividerParams   *dividerParams;
    7.37 +   int32            numResRows, numResCols;
    7.38 +
    7.39 +
    7.40 +   dividerParams              = malloc( sizeof( DividerParams ) );
    7.41 +   dividerParams->leftMatrix  = leftMatrix;
    7.42 +   dividerParams->rightMatrix = rightMatrix;
    7.43 +
    7.44 +
    7.45 +   numResRows  = leftMatrix->numRows;
    7.46 +   numResCols  = rightMatrix->numCols;
    7.47 +
    7.48 +      //VMS has its own separate internal malloc, so to get results out,
    7.49 +      // have to pass in empty array for it to fill up
    7.50 +      //The alternative is internally telling SSR make external space to use
    7.51 +   resMatrix            = malloc( sizeof(Matrix) );
    7.52 +   resMatrix->array     = malloc( numResRows * numResCols * sizeof(float32));
    7.53 +   resMatrix->numCols   = rightMatrix->numCols;
    7.54 +   resMatrix->numRows   = leftMatrix->numRows;
    7.55 +
    7.56 +
    7.57 +   dividerParams->resultMatrix   = resMatrix;
    7.58 +
    7.59 +      //create divider processor, start doing the work, and wait till done
    7.60 +      //This function is the "border crossing" between normal code and SSR
    7.61 +   Vthread__create_seed_slaveVP_and_do_work(&divideWorkIntoSubMatrixPairProcrs,
    7.62 +                                             dividerParams );
    7.63 +   
    7.64 +   free( dividerParams );
    7.65 +   return resMatrix;
    7.66 + }
    7.67 +
    7.68 +   
    7.69 +Matrix *
    7.70 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
    7.71 + { Matrix          *resMatrix;
    7.72 +   DividerParams   *dividerParams;
    7.73 +   int32            numResRows, numResCols;
    7.74 +
    7.75 +
    7.76 +   dividerParams              = malloc( sizeof( DividerParams ) );
    7.77 +   dividerParams->leftMatrix  = leftMatrix;
    7.78 +   dividerParams->rightMatrix = rightMatrix;
    7.79 +
    7.80 +
    7.81 +   numResRows  = leftMatrix->numRows;
    7.82 +   numResCols  = rightMatrix->numCols;
    7.83 +
    7.84 +      //VMS has its own separate internal malloc, so to get results out,
    7.85 +      // have to pass in empty array for it to fill up
    7.86 +      //The alternative is internally telling SSR make external space to use
    7.87 +   resMatrix            = malloc( sizeof(Matrix) );
    7.88 +   resMatrix->array     = malloc( numResRows * numResCols * sizeof(float32));
    7.89 +   resMatrix->numCols   = rightMatrix->numCols;
    7.90 +   resMatrix->numRows   = leftMatrix->numRows;
    7.91 +
    7.92 +
    7.93 +   dividerParams->resultMatrix   = resMatrix;
    7.94 +
    7.95 +   //Vthread__start_lang_running();
    7.96 +   //VMS__start_lang_running( Vthread );
    7.97 +   
    7.98 +/*
    7.99 +   matrixMultProcessID = 
   7.100 +    Vthread__spawn_program_on_data(&divideWorkIntoSubMatrixPairProcrs,
   7.101 +                                                                 dividerParams);
   7.102 +*/
   7.103 +   VMS__start_VMS_running( );
   7.104 +   
   7.105 +   VMSProcessID matrixMultProcessID;
   7.106 +   
   7.107 +   matrixMultProcessID =
   7.108 +    VMS__spawn_program_on_data_in_Lang( &prog_seed_fn, data, Vthread_lang );
   7.109 +   
   7.110 +   resMatrix = VMS__give_results_when_done_for( matrixMultProcessID );
   7.111 +   
   7.112 +//    Vthread__give_results_when_done_for( matrixMultProcessID );
   7.113 +
   7.114 +   //VMS__shutdown_lang( Vthread );
   7.115 +   
   7.116 +   //Vthread__shutdown_lang();
   7.117 +   
   7.118 +   VMS__shutdown();
   7.119 +   
   7.120 +   free( dividerParams );
   7.121 +   
   7.122 +   return resMatrix;
   7.123 + }
     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/Vthread__Matrix_Mult/Result_Pr.c	Mon Sep 17 18:19:52 2012 -0700
     8.3 @@ -0,0 +1,142 @@
     8.4 +/*
     8.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     8.6 + *  Licensed under GNU General Public License version 2
     8.7 + *
     8.8 + * Author: seanhalle@yahoo.com
     8.9 + *
    8.10 + */
    8.11 +
    8.12 +#include "Vthread__Matrix_Mult.h"
    8.13 +
    8.14 +//=====================
    8.15 +void inline
    8.16 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
    8.17 +                  int32    startRow,
    8.18 +                  int32    numRows,
    8.19 +                  int32    startCol,
    8.20 +                  int32    numCols,
    8.21 +                  int32    numOrigCols );
    8.22 +
    8.23 +//===========================================================================
    8.24 +
    8.25 +/*The Result Processor gets a message from each of the vector processors,
    8.26 + * puts the result from the message in its location in the result-
    8.27 + * matrix, and increments the count of results.
    8.28 + *
    8.29 + *After the count reaches the point that all results have been received, it
    8.30 + * returns the result matrix and dissipates.
    8.31 + */
/*Results thread: receives one partial result per sub-matrix pair via a
 * condition-variable hand-off, accumulates each into the result matrix,
 * and signals the divider thread when all pairs have arrived.
 *_params is a ResultsParams*; animatingThd is the Vthread animating this
 * function.  Dissipates its thread before returning.
 */
void gatherResults( void *_params, SlaveVP *animatingThd )
 { SlaveVP *dividerPr;
   ResultsParams  *params;
   int            numRows, numCols, numSubMatrixPairs, count=0;
   float32        *resultArray;
   SMPairParams   *resParams;
   //====================== thread stuff =======================
   MatrixMultGlobals *globals =(MatrixMultGlobals *)Vthread__give_globals();


      //get vector-comm lock before loop, so that this thd keeps lock after
      // one wait until it enters the next wait -- forces see-saw btwn
      // waiters and signalers -- wait-signal-wait-signal-...
   Vthread__mutex_lock( globals->vector_mutex, animatingThd );

      //Tell divider that have the vector lock -- so it's sure won't miss any
      // signals from the vector-threads it's about to create
      //Don't need a signal variable -- this thd can't be created until
      // divider thd already has the start lock
   Vthread__mutex_lock( globals->start_mutex, animatingThd );//finish wait
   Vthread__cond_signal( globals->start_cond,  animatingThd );
   Vthread__mutex_unlock( globals->start_mutex, animatingThd );//finish wait
   //===========================================================

         DEBUG( dbgAppFlow, "start resultPr\n")
         
   params    = (ResultsParams *)_params;
   dividerPr = params->dividerPr;   //NOTE(review): read but unused below
   numSubMatrixPairs = params->numSubMatrixPairs;
   numRows = params->numRows;       //NOTE(review): read but unused below
   numCols = params->numCols;       //NOTE(review): read but unused below

   resultArray = params->resultArray;


      //one iteration per sub-matrix pair; each wait releases the vector
      // lock so exactly one pair-thread at a time can publish its result
   while( count < numSubMatrixPairs )
    {
         //receive a vector-result from a vector-thread
      Vthread__cond_wait(  globals->vector_cond,  animatingThd );

         //At this point, animating thread owns the vector lock, so all
         // pairs trying to signal they have a result are waiting to get that
         // lock -- only one gets it at a time, and when signal, this thd
         // gets the lock and does the body of this loop, then when does the
         // wait again, that releases the lock for next pair-thread to get it
      resParams = globals->currSMPairParams;

         //add the pair's partial product into its slot of the full result
      accumulateResult( resultArray, resParams->partialResultArray,
                        resParams->leftSubMatrix->origStartRow,
                        resParams->leftSubMatrix->numRows,
                        resParams->rightSubMatrix->origStartCol,
                        resParams->rightSubMatrix->numCols,
                        resParams->rightSubMatrix->origMatrix->numCols );

      Vthread__free( resParams->partialResultArray, animatingThd );
      
         //there is only one copy of results procr, so can update numUsesLeft
         // without concurrency worries.  When zero, free the sub-matrix
      resParams->leftSubMatrix->numUsesLeft -= 1;
      if( resParams->leftSubMatrix->numUsesLeft == 0 )
       {
         Vthread__free( resParams->leftSubMatrix->array, animatingThd );
         Vthread__free( resParams->leftSubMatrix, animatingThd );
       }

      resParams->rightSubMatrix->numUsesLeft -= 1;
      if( resParams->rightSubMatrix->numUsesLeft == 0 )
       {
         Vthread__free( resParams->rightSubMatrix->array, animatingThd );
         Vthread__free( resParams->rightSubMatrix, animatingThd );
       }

         //count of how many sub-matrix pairs accumulated so know when done
      count++;
    }

      //Done -- could just dissipate -- SSR will wait for all processors to
      // dissipate before shutting down, and thereby making results avaial to
      // outside, so no need to stop the divider from dissipating, so no need
      // to send a hand-shake message to it -- but makes debug easier
      //However, following pattern, so all comms done, release lock
   Vthread__mutex_unlock( globals->vector_mutex, animatingThd );

      //Send result to divider (seed) thread
      // note, divider thd had to hold the results-comm lock before creating
      // this thread, to be sure no race
   Vthread__mutex_lock(   globals->results_mutex, animatingThd );
   //globals->results = resultMatrixArray;
   Vthread__cond_signal(  globals->results_cond,  animatingThd );
   Vthread__mutex_unlock( globals->results_mutex, animatingThd ); //releases
   //divider thread from its wait, at point this executes

   Vthread__dissipate_thread( animatingThd );  //frees any data owned by procr
 }
   8.126 +
   8.127 +void inline
   8.128 +accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray,
   8.129 +                  int32    startRow,
   8.130 +                  int32    numRows,
   8.131 +                  int32    startCol,
   8.132 +                  int32    numCols,
   8.133 +                  int32    numOrigCols )
   8.134 + { int32 row, col;
   8.135 +
   8.136 +   for( row = 0; row < numRows; row++ )
   8.137 +    {
   8.138 +      for( col = 0; col < numCols; col++ )
   8.139 +       {
   8.140 +         resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] +=
   8.141 +            subMatrixPairResultArray[ row * numCols + col ];
   8.142 +       }
   8.143 +    }
   8.144 +
   8.145 + }
     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/Vthread__Matrix_Mult/Vthread__Matrix_Mult.h	Mon Sep 17 18:19:52 2012 -0700
     9.3 @@ -0,0 +1,120 @@
     9.4 +/*
     9.5 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     9.6 + *  Licensed under GNU General Public License version 2
     9.7 + */
     9.8 +
     9.9 +#ifndef _SSR_MATRIX_MULT_H_
    9.10 +#define _SSR_MATRIX_MULT_H_
    9.11 +
    9.12 +#include <stdio.h>
    9.13 +
    9.14 +#include "Vthread_impl/Vthread.h"
    9.15 +#include "../Matrix_Mult.h"
    9.16 +
    9.17 +
    9.18 +//===============================  Defines  ==============================
    9.19 +#define ROWS_IN_BLOCK 32
    9.20 +#define COLS_IN_BLOCK 32
    9.21 +#define VEC_IN_BLOCK  32
    9.22 +
    9.23 +#define copyMatrixSingleton 1
    9.24 +#define copyTransposeSingleton 2
    9.25 +
    9.26 +//==============================  Structures  ==============================
    9.27 +typedef struct
    9.28 + {
    9.29 +   Matrix *leftMatrix;
    9.30 +   Matrix *rightMatrix;
    9.31 +   Matrix *resultMatrix;
    9.32 + }
    9.33 +DividerParams;
    9.34 +
    9.35 +typedef struct
    9.36 + {
    9.37 +   SlaveVP *dividerPr;
    9.38 +   int numRows;
    9.39 +   int numCols;
    9.40 +   int numSubMatrixPairs;
    9.41 +   float32 *resultArray;
    9.42 + }
    9.43 +ResultsParams;
    9.44 +
    9.45 +typedef
    9.46 +struct
    9.47 + { int32    numRows;
    9.48 +   int32    numCols;
    9.49 +   Matrix  *origMatrix;
    9.50 +   int32    origStartRow;
    9.51 +   int32    origStartCol;
    9.52 +   int32    alreadyCopied;
    9.53 +   VthdSingleton *copySingleton;
    9.54 +   VthdSingleton *copyTransSingleton;
    9.55 +   int32    numUsesLeft; //have update via message to avoid multiple writers
    9.56 +   float32 *array;  //2D, but dynamically sized, so use addr arith
    9.57 + }
    9.58 +SubMatrix;
    9.59 +
    9.60 +typedef struct
    9.61 + { SlaveVP *resultPr;
    9.62 +   SubMatrix *leftSubMatrix;
    9.63 +   SubMatrix *rightSubMatrix;
    9.64 +   float32   *partialResultArray;
    9.65 + }
    9.66 +SMPairParams;
    9.67 +
    9.68 +typedef
    9.69 +struct
    9.70 + { int32    numVals;
    9.71 +   int32   *startVals;
    9.72 + }
    9.73 +SlicingStruc;
    9.74 +
    9.75 +typedef
    9.76 +struct
    9.77 + {
    9.78 +   SlicingStruc *leftRowSlices;
    9.79 +   SlicingStruc *vecSlices;
    9.80 +   SlicingStruc *rightColSlices;
    9.81 + }
    9.82 +SlicingStrucCarrier;
    9.83 +
    9.84 +enum MMMsgType
    9.85 + {
    9.86 +   RESULTS_MSG = 1
    9.87 + };
    9.88 +
    9.89 + 
    9.90 +typedef struct
    9.91 + {
    9.92 +      //for communicating sub-matrix-pair results to results Thd
    9.93 +   int32         vector_mutex;
    9.94 +   int32         vector_cond;
    9.95 +   SMPairParams *currSMPairParams;
    9.96 +
    9.97 +      //for communicating results array back to seed (divider) Thd
    9.98 +   int32         results_mutex;
    9.99 +   int32         results_cond;
   9.100 +   float32      *results;
   9.101 +
   9.102 +      //for ensuring results thd has vector lock before making vector thds
   9.103 +   int32         start_mutex;
   9.104 +   int32         start_cond;
   9.105 +
   9.106 +   Matrix *rightMatrix;
   9.107 +   Matrix *resultMatrix;
   9.108 + }
   9.109 +MatrixMultGlobals;
   9.110 +
   9.111 +
   9.112 +//============================= Processor Functions =========================
   9.113 +void divideWorkIntoSubMatrixPairProcrs( void *data, SlaveVP *animatingPr );
   9.114 +void calcSubMatrixProduct(        void *data, SlaveVP *animatingPr );
   9.115 +void gatherResults(     void *data, SlaveVP *animatingPr );
   9.116 +
   9.117 +
   9.118 +//================================ Entry Point ==============================
   9.119 +Matrix *
   9.120 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
   9.121 +
   9.122 +
   9.123 +#endif /*_SSR_MATRIX_MULT_H_*/
    10.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.2 +++ b/Vthread__Matrix_Mult/subMatrix_Pr.c	Mon Sep 17 18:19:52 2012 -0700
    10.3 @@ -0,0 +1,310 @@
    10.4 +/* 
    10.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
    10.6 + *  Licensed under GNU General Public License version 2
    10.7 + *
    10.8 + * Author: SeanHalle@yahoo.com
    10.9 + *
   10.10 + */
   10.11 +
   10.12 +#include <string.h>
   10.13 +
   10.14 +#include "Vthread__Matrix_Mult.h"
   10.15 +
   10.16 +//Declarations
   10.17 +//===================================================
   10.18 +
   10.19 +void inline
   10.20 +copyFromOrig( SubMatrix *subMatrix, SlaveVP *animPr );
   10.21 +
   10.22 +void inline
   10.23 +copyTransposeFromOrig( SubMatrix *subMatrix, SlaveVP *animPr );
   10.24 +
   10.25 +void inline
   10.26 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
   10.27 +                     float32 *resArray,
   10.28 +                     int startRow,  int endRow,
   10.29 +                     int startCol,  int endCol,
   10.30 +                     int startVec,  int endVec,
   10.31 +                     int resStride, int inpStride );
   10.32 +
   10.33 +void inline
   10.34 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols,
   10.35 +                      float32 *leftArray, float32 *rightArray,
   10.36 +                      float32 *resArray );
   10.37 +
   10.38 +//===================================================
   10.39 +
   10.40 +/*A  processor is created with an environment that holds two matrices,
   10.41 + * the row and col that it owns, and the name of a result gathering
   10.42 + * processor.
   10.43 + *It calculates the product of two sub-portions of the input matrices
   10.44 + * by using Intel's mkl library for single-core.
   10.45 + *
   10.46 + *This demonstrates using optimized single-threaded code inside scheduled
   10.47 + * work-units.
   10.48 + *
   10.49 + *When done, it sends the result to the result processor
   10.50 + */
   10.51 +void
   10.52 +calcSubMatrixProduct( void *data, SlaveVP *animatingPr )
   10.53 + { 
   10.54 +   SMPairParams   *params;
   10.55 +   SlaveVP      *resultPr;
   10.56 +   float32        *leftArray,  *rightArray, *resArray;
   10.57 +   SubMatrix      *leftSubMatrix, *rightSubMatrix;
   10.58 +   MatrixMultGlobals *globals =(MatrixMultGlobals *)Vthread__give_globals();
   10.59 +
   10.60 +         DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID)
   10.61 +         #ifdef TURN_ON_DEBUG_PROBES
   10.62 +         int32 subMatrixProbe = VMS_App__create_single_interval_probe( "subMtx",
   10.63 +                                                                animatingPr);
   10.64 +         VMS_App__record_sched_choice_into_probe( subMatrixProbe, animatingPr );
   10.65 +         VMS_App__record_interval_start_in_probe( subMatrixProbe );
   10.66 +         #endif
   10.67 +
   10.68 +   params         = (SMPairParams *)data;
   10.69 +   resultPr       = params->resultPr;
   10.70 +   leftSubMatrix  = params->leftSubMatrix;
   10.71 +   rightSubMatrix = params->rightSubMatrix;
   10.72 +
   10.73 +      //make sure the input sub-matrices have been copied out of orig
   10.74 +      //do it here, inside sub-matrix pair to hopefully gain reuse in cache
   10.75 +   copyFromOrig( leftSubMatrix, animatingPr );
   10.76 +   copyTransposeFromOrig( rightSubMatrix, animatingPr );
   10.77 +   
   10.78 +   leftArray      = leftSubMatrix->array;
   10.79 +   rightArray     = rightSubMatrix->array;
   10.80 +
   10.81 +   size_t resSize = (size_t)leftSubMatrix->numRows * 
   10.82 +                        (size_t)rightSubMatrix->numCols * sizeof(float32);
   10.83 +   resArray = Vthread__malloc( resSize, animatingPr );
   10.84 +   memset( resArray, 0, resSize );
   10.85 +
   10.86 +
   10.87 +   int32 numResRows, numResCols, vectLength;
   10.88 +   
   10.89 +   vectLength = leftSubMatrix->numCols;
   10.90 +   numResRows = leftSubMatrix->numRows;
   10.91 +   numResCols = rightSubMatrix->numCols;
   10.92 +
   10.93 +   multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
   10.94 +                         leftArray,  rightArray,
   10.95 +                         resArray );
   10.96 +
   10.97 +   //send result to result processor
   10.98 +   params->partialResultArray = resArray;
   10.99 +
  10.100 +         #ifdef TURN_ON_DEBUG_PROBES
  10.101 +         VMS_App__record_interval_end_in_probe( subMatrixProbe );
  10.102 +         #endif
  10.103 +
  10.104 +      //Send result to results thread
  10.105 +      //This pattern works 'cause only get lock when results thd inside wait
  10.106 +   Vthread__mutex_lock(   globals->vector_mutex, animatingPr );
  10.107 +   globals->currSMPairParams = params;
  10.108 +   Vthread__cond_signal(  globals->vector_cond,  animatingPr );
  10.109 +   Vthread__mutex_unlock( globals->vector_mutex, animatingPr );//release
  10.110 +   //wait-er -- cond_signal implemented such that wait-er gets lock, no other
  10.111 +
  10.112 +         DEBUG1(dbgAppFlow, "end sub-matrix mult: %d\n", animatingPr->procrID)
  10.113 +   Vthread__dissipate_thread( animatingPr );
  10.114 + }
  10.115 +
  10.116 +
  10.117 +
  10.118 +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into
  10.119 + * the 32KB L1 cache.
  10.120 + *Would be nice to embed this within another level that divided into
  10.121 + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
  10.122 + *
  10.123 + *Eventually want these divisions to be automatic, using DKU pattern
  10.124 + * embedded into VMS and exposed in the language, and with VMS controlling the
  10.125 + * divisions according to the cache sizes, which it knows about.
  10.126 + *Also, want VMS to work with language to split among main-mems, so a socket
  10.127 + * only cranks on data in its local segment of main mem
  10.128 + *
  10.129 + *So, outer two loops determine start and end points within the result matrix.
  10.130 + * Inside that, a loop dets the start and end points along the shared dimensions
  10.131 + * of the two input matrices.
  10.132 + */
  10.133 +void inline
  10.134 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
  10.135 +                                int32 numResCols,
  10.136 +                                float32 *leftArray, float32 *rightArray,
  10.137 +                                float32 *resArray )
  10.138 + {
  10.139 +   int resStride, inpStride;
  10.140 +   int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
  10.141 +
  10.142 +   resStride  = numResCols;
  10.143 +   inpStride  = vecLength;
  10.144 +
  10.145 +   for( resStartRow = 0; resStartRow < numResRows; )
  10.146 +    {
  10.147 +      resEndRow = resStartRow + ROWS_IN_BLOCK -1;  //start at zero, so -1
  10.148 +      if( resEndRow > numResRows ) resEndRow = numResRows -1;
  10.149 +
  10.150 +      for( resStartCol = 0; resStartCol < numResCols; )
  10.151 +       {
  10.152 +         resEndCol   = resStartCol + COLS_IN_BLOCK -1;
  10.153 +         if( resEndCol > numResCols ) resEndCol = numResCols -1;
  10.154 +
  10.155 +         for( startVec = 0; startVec < vecLength; )
  10.156 +          {
  10.157 +            endVec   = startVec + VEC_IN_BLOCK -1;
  10.158 +            if( endVec > vecLength ) endVec = vecLength -1;
  10.159 +
  10.160 +               //By having the "vector" of sub-blocks in a sub-block slice
  10.161 +               // be marched down in inner loop, are re-using the result
  10.162 +               // matrix, which stays in L1 cache and re-using the left sub-mat
  10.163 +               // which repeats for each right sub-mat -- can only re-use two of
  10.164 +               // the three, so result is the most important -- avoids writing
  10.165 +               // dirty blocks until those result-locations fully done
  10.166 +               //Row and Col is position in result matrix -- so row and vec
  10.167 +               // for left array, then vec and col for right array
  10.168 +            multiplySubBlocksTransposed( leftArray, rightArray,
  10.169 +                                         resArray,
  10.170 +                                         resStartRow,  resEndRow,
  10.171 +                                         resStartCol,  resEndCol,
  10.172 +                                         startVec,  endVec,
  10.173 +                                         resStride, inpStride );
  10.174 +            startVec = endVec +1;
  10.175 +          }
  10.176 +         resStartCol = resEndCol +1;
  10.177 +       }
  10.178 +      resStartRow = resEndRow +1;
  10.179 +    }
  10.180 + }
  10.181 +
  10.182 +
  10.183 +
  10.184 +void inline
  10.185 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
  10.186 +                     float32 *resArray,
  10.187 +                     int resStartRow,  int resEndRow,
  10.188 +                     int resStartCol,  int resEndCol,
  10.189 +                     int startVec,  int endVec,
  10.190 +                     int resStride, int inpStride )
  10.191 + {
  10.192 +   int resRow,     resCol,        vec;
  10.193 +   int leftOffset, rightOffset;
  10.194 +   float32 result;
  10.195 +
  10.196 +      //The result row is used only for the left matrix, res col for the right
  10.197 +   for( resCol = resStartCol; resCol <= resEndCol; resCol++ )
  10.198 +    {
  10.199 +      for( resRow = resStartRow; resRow <= resEndRow; resRow++ )
  10.200 +       {
  10.201 +         leftOffset  = resRow * inpStride;//left & right inp strides always same
  10.202 +         rightOffset = resCol * inpStride;// because right is transposed
  10.203 +         result = 0;
  10.204 +         for( vec = startVec; vec <= endVec; vec++ )
  10.205 +          {
  10.206 +            result +=
  10.207 +               leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
  10.208 +          }
  10.209 +
  10.210 +         resArray[ resRow * resStride + resCol ] += result;
  10.211 +       }
  10.212 +    }
  10.213 + }
  10.214 +
  10.215 +
  10.216 +
  10.217 +
  10.218 +/*Reuse this in divider when do the sequential multiply case
  10.219 + */
  10.220 +void inline
  10.221 +copyTranspose( int32 numRows, int32 numCols,
  10.222 +               int32 origStartRow, int32 origStartCol, int32 origStride,
  10.223 +               float32 *subArray, float32 *origArray )
  10.224 + { int32 stride = numRows;
  10.225 + 
  10.226 +   int row, col, origOffset;
  10.227 +   for( row = 0; row < numRows; row++ )
  10.228 +    {
  10.229 +      origOffset = (row + origStartRow) * origStride + origStartCol;
  10.230 +      for( col = 0; col < numCols; col++ )
  10.231 +       {
  10.232 +            //transpose means swap row & col -- traverse orig matrix normally
  10.233 +            // but put into reversed place in local array -- means the
  10.234 +            // stride is the numRows now, so col * numRows + row
  10.235 +         subArray[ col * stride + row ]  =  origArray[ origOffset + col ];
  10.236 +       }
  10.237 +    }
  10.238 + }
  10.239 +
  10.240 +void inline
  10.241 +copyTransposeFromOrig( SubMatrix *subMatrix, SlaveVP *animPr )
  10.242 + { int numCols, numRows, origStartRow, origStartCol, origStride;
  10.243 +   Matrix *origMatrix;
  10.244 +   float32 *origArray, *subArray;
  10.245 +
  10.246 +   Vthread__start_data_singleton( subMatrix->copyTransSingleton, animPr );
  10.247 +
  10.248 +   origMatrix   = subMatrix->origMatrix;
  10.249 +   origArray    = origMatrix->array;
  10.250 +   numCols      = subMatrix->numCols;
  10.251 +   numRows      = subMatrix->numRows;
  10.252 +   origStartRow = subMatrix->origStartRow;
  10.253 +   origStartCol = subMatrix->origStartCol;
  10.254 +   origStride   = origMatrix->numCols;
  10.255 +
  10.256 +   subArray    = Vthread__malloc( (size_t)numRows * 
  10.257 +                                        (size_t)numCols *sizeof(float32),animPr);
  10.258 +   subMatrix->array = subArray;
  10.259 +
  10.260 +      //copy values from orig matrix to local
  10.261 +   copyTranspose( numRows, numCols,
  10.262 +                  origStartRow, origStartCol, origStride,
  10.263 +                  subArray, origArray );
  10.264 +
  10.265 +   Vthread__end_data_singleton( subMatrix->copyTransSingleton, animPr );
  10.266 +
  10.267 + }
  10.268 +
  10.269 +
  10.270 +void inline
  10.271 +copyFromOrig( SubMatrix *subMatrix, SlaveVP *animPr )
  10.272 + { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
  10.273 +   Matrix *origMatrix;
  10.274 +   float32 *origArray, *subArray;
  10.275 +
  10.276 +
  10.277 +      //This lets only a single VP execute the code between start and
  10.278 +      // end -- using start and end so that work runs outside the master.
  10.279 +      //If a second VP ever executes the start, it will be returned
  10.280 +      // from the end-point.  If its execution starts after another but before
  10.281 +      // that other has finished, this one will remain suspended until the
  10.282 +      // other finishes, then be resumed from the end-point.
  10.283 +   Vthread__start_data_singleton( subMatrix->copySingleton, animPr );
  10.284 +
  10.285 +
  10.286 +   origMatrix    = subMatrix->origMatrix;
  10.287 +   origArray     = origMatrix->array;
  10.288 +   numCols       = subMatrix->numCols;
  10.289 +   numRows       = subMatrix->numRows;
  10.290 +   origStartRow  = subMatrix->origStartRow;
  10.291 +   origStartCol  = subMatrix->origStartCol;
  10.292 +   origStride    = origMatrix->numCols;
  10.293 +
  10.294 +   subArray    = Vthread__malloc( (size_t)numRows * 
  10.295 +                                        (size_t)numCols *sizeof(float32),animPr);
  10.296 +   subMatrix->array = subArray;
  10.297 +
  10.298 +      //copy values from orig matrix to local
  10.299 +   stride        = numCols;
  10.300 +
  10.301 +   int row, col, offset, origOffset;
  10.302 +   for( row = 0; row < numRows; row++ )
  10.303 +    {
  10.304 +      offset     = row * stride;
  10.305 +      origOffset = (row + origStartRow) * origStride + origStartCol;
  10.306 +      for( col = 0; col < numCols; col++ )
  10.307 +       {
  10.308 +         subArray[ offset + col ]  =  origArray[ origOffset + col ];
  10.309 +       }
  10.310 +    }
  10.311 +
  10.312 +   Vthread__end_data_singleton( subMatrix->copySingleton, animPr );
  10.313 + }
    11.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.2 +++ b/__brch__default	Mon Sep 17 18:19:52 2012 -0700
    11.3 @@ -0,0 +1,3 @@
    11.4 +This branch is for the project structure defined in Jan 2012; the #includes reflect this directory structure.
    11.5 +
    11.6 +The default branch of the application directory is likely to be the only branch 
    12.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.2 +++ b/main.c	Mon Sep 17 18:19:52 2012 -0700
    12.3 @@ -0,0 +1,480 @@
    12.4 +/* 
    12.5 + * 
    12.6 + */
    12.7 +#include <stdio.h>
    12.8 +#include <stdlib.h>
    12.9 +#include <string.h>
   12.10 +#include <math.h>
   12.11 +#include <ctype.h>
   12.12 +#include <errno.h>
   12.13 +#include <pthread.h>
   12.14 +#include <unistd.h>
   12.15 +#include "VMS_Implementations/Vthread_impl/VPThread.h"
   12.16 +#include "C_Libraries/Queue_impl/PrivateQueue.h"
   12.17 +
   12.18 +#include <linux/perf_event.h>
   12.19 +#include <linux/prctl.h>
   12.20 +#include <sys/syscall.h>
   12.21 +
   12.22 +#undef DEBUG
   12.23 +//#define DEBUG
   12.24 +
   12.25 +#define MEASURE_PERF
   12.26 +
   12.27 +#if !defined(unix) && !defined(__unix__)
   12.28 +#ifdef __MACH__
   12.29 +#define unix		1
   12.30 +#define __unix__	1
   12.31 +#endif	/* __MACH__ */
   12.32 +#endif	/* unix */
   12.33 +
   12.34 +/* find the appropriate way to define explicitly sized types */
   12.35 +/* for C99 or GNU libc (also mach's libc) we can use stdint.h */
   12.36 +#if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__)
   12.37 +#include <stdint.h>
   12.38 +#elif defined(unix) || defined(__unix__)	/* some UNIX systems have them in sys/types.h */
   12.39 +#include <sys/types.h>
   12.40 +#elif defined(__WIN32__) || defined(WIN32)	/* the nameless one */
   12.41 +typedef unsigned __int8 uint8_t;
   12.42 +typedef unsigned __int32 uint32_t;
   12.43 +#endif	/* sized type detection */
   12.44 +
   12.45 +/* provide a millisecond-resolution timer for each system */
   12.46 +#if defined(unix) || defined(__unix__)
   12.47 +#include <time.h>
   12.48 +#include <sys/time.h>
   12.49 +unsigned long get_msec(void) {
   12.50 +	static struct timeval timeval, first_timeval;
   12.51 +
   12.52 +	gettimeofday(&timeval, 0);
   12.53 +	if(first_timeval.tv_sec == 0) {
   12.54 +		first_timeval = timeval;
   12.55 +		return 0;
   12.56 +	}
   12.57 +	return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000;
   12.58 +}
   12.59 +#elif defined(__WIN32__) || defined(WIN32)
   12.60 +#include <windows.h>
   12.61 +unsigned long get_msec(void) {
   12.62 +	return GetTickCount();
   12.63 +}
   12.64 +#else
   12.65 +//#error "I don't know how to measure time on your platform"
   12.66 +#endif
   12.67 +
//======================== Defines =========================
/*One sample of the hardware performance counters.
 *'uint64' is a project-wide typedef -- presumably the same as uint64_t;
 * confirm in the VMS/VPThread headers. */
typedef struct perfData measurement_t;
struct perfData{
    uint64 cycles;        //CPU-cycle count read from a perf_event fd
    uint64 instructions;  //instruction count; not referenced in this file
};
   12.74 +
   12.75 +const char *usage = {
   12.76 +	"Usage: malloc_test [options]\n"
   12.77 +	"  Spwans a number of threads and allocates memory.\n\n"
   12.78 +	"Options:\n"
   12.79 +	"  -t <num>   how many threads to use (default: 1). This is internaly multiplied by the number of cores.\n"
   12.80 +	"  -o <num>   repeat workload and sync operation <m> times\n"
   12.81 +        "  -i <num>   size of workload, repeat <n> times\n"     
   12.82 +	"  -h         this help screen\n\n"
   12.83 +};
   12.84 +
/*Counting barrier built on VPThread mutex/cond handles (int32 ids).
 *The last thread to arrive also snapshots the all-cores cycle counter
 * into endBarrierCycles, so total execution time can be read afterwards. */
struct barrier_t
{
    int counter;    //number of threads arrived so far
    int nthreads;   //arrivals required before release
    int32 mutex;    //VPThread mutex handle
    int32 cond;     //VPThread condition handle, created against 'mutex'
    measurement_t endBarrierCycles;  //cycle sample taken by last arriver

};
typedef struct barrier_t barrier;
   12.95 +
/*Initializes barr for nthreads participants; must run before any waiter.
 *The condition variable is created against the barrier's own mutex. */
void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr)
 {
   barr->counter = 0;
   barr->nthreads = nthreads;
   barr->mutex   = VPThread__make_mutex(animatingPr);
   barr->cond    = VPThread__make_cond(barr->mutex, animatingPr);
 }
  12.103 +
int cycles_counter_main_fd;  //perf fd accumulating cycles across all cores

/*Counting-barrier wait: the last arriver samples the global cycle counter,
 * resets the count, and signals all waiters; earlier arrivers block on
 * the cond (which presumably releases the mutex while waiting -- VPThread
 * semantics, confirm in its docs). */
void inline barrier_wait(barrier *barr, VirtProcr *animatingPr)
 { int i;

   VPThread__mutex_lock(barr->mutex, animatingPr);
   barr->counter++;
   if(barr->counter == barr->nthreads)
    { 
#ifdef MEASURE_PERF
      //last arriver records end-of-run cycles; read() return value is
      // deliberately unchecked here -- best-effort measurement
      read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \
                sizeof(barr->endBarrierCycles.cycles));
#endif
       
      barr->counter = 0;
      //NOTE(review): signals nthreads times though only nthreads-1 threads
      // wait; presumably the extra signal is harmless in this cond impl --
      // confirm against the VPThread implementation
      for(i=0; i < barr->nthreads; i++)
         VPThread__cond_signal(barr->cond, animatingPr);
    }
   else
    { VPThread__cond_wait(barr->cond, animatingPr);
    }
   VPThread__mutex_unlock(barr->mutex, animatingPr);
 }
  12.126 +
  12.127 +
  12.128 +
/*Per-worker results, filled in by worker_TLF and read by main() after the
 * runtime shuts down.
 *NOTE(review): mixes uint64_t with the project 'uint64' typedef --
 * presumably the same type; confirm in the VMS headers. */
typedef struct
 { struct barrier_t* barrier;  //end-of-run barrier, shared with benchmark()
   uint64_t  totalWorkCycles;  //cycles from work intervals passing sanity check
   uint64_t  totalBadCycles;   //cycles from intervals failing the sanity check
   uint64_t  totalSyncCycles;     //always 0 here (sync timing commented out)
   uint64_t  totalBadSyncCycles;  //always 0 here
   uint64     numGoodSyncs;       //always 0 here
   uint64     numGoodTasks;    //count of sane work intervals
 }
WorkerParams;
  12.139 +
  12.140 +
/*Out-parameters for benchmark(): where to store the cycle-counter samples
 * taken just before workers start and when the last worker reaches the
 * end-of-run barrier. */
typedef struct
 { measurement_t *startExeCycles;
   measurement_t *endExeCycles;
 }
BenchParams;
  12.146 +
//======================== Globals =========================
char __ProgrammName[] = "overhead_test";  //NOTE(review): double-underscore names are reserved for the implementation; kept for compatibility
char __DataSet[255];

int outer_iters, inner_iters, num_threads;  //set from cmd line in main()
size_t chunk_size = 0;  //not referenced in this file

int cycles_counter_fd[NUM_CORES];  //per-core perf_event cycle-counter fds
struct perf_event_attr* hw_event;  //shared perf_event config, built in main()

WorkerParams *workerParamsArray;   //one slot per worker, malloc'd in main()
  12.158 +
  12.159 +//======================== App Code =========================
  12.160 +/*
  12.161 + * Workload
  12.162 + */
  12.163 +
  12.164 +#define saveCyclesAndInstrs(core,cycles) do{     \
  12.165 +   int cycles_fd = cycles_counter_fd[core];             \
  12.166 +   int nread;                                           \
  12.167 +                                                        \
  12.168 +   nread = read(cycles_fd,&(cycles),sizeof(cycles));    \
  12.169 +   if(nread<0){                                         \
  12.170 +       perror("Error reading cycles counter");          \
  12.171 +       cycles = 0;                                      \
  12.172 +   }                                                    \
  12.173 +} while (0) //macro magic for scoping
  12.174 +
  12.175 +
  12.176 +double
  12.177 +worker_TLF(void* _params, VirtProcr* animatingPr)
  12.178 + {
  12.179 +   int i,o;
  12.180 +   WorkerParams* params = (WorkerParams*)_params;
  12.181 +   unsigned int totalWorkCycles = 0, totalBadCycles = 0;
  12.182 +   unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0;
  12.183 +   unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0;
  12.184 +   double workspace2=0.0;
  12.185 +   int32 privateMutex = VPThread__make_mutex(animatingPr);
  12.186 +   
  12.187 +   int cpuid = sched_getcpu();
  12.188 +   
  12.189 +   measurement_t startWorkload, endWorkload, startWorkload2, endWorkload2;
  12.190 +   uint64 numCycles;
  12.191 +   for(o=0; o < outer_iters; o++)
  12.192 +    {
  12.193 +#ifdef MEASURE_PERF
  12.194 +          saveCyclesAndInstrs(cpuid,startWorkload.cycles);
  12.195 +#endif
  12.196 +       
  12.197 +      //workltask
  12.198 +      for(i=0; i < inner_iters; i++)
  12.199 +       {
  12.200 +         workspace1 += (workspace1 + 32)/2;
  12.201 +         workspace2 += (workspace2 + 23.2)/1.4;
  12.202 +       }
  12.203 +  
  12.204 +#ifdef MEASURE_PERF
  12.205 +          saveCyclesAndInstrs(cpuid,endWorkload.cycles);
  12.206 +          numCycles = endWorkload.cycles - startWorkload.cycles;
  12.207 +          //sanity check (400K is about 20K iters)
  12.208 +          if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
  12.209 +          else                     {totalBadCycles  += numCycles; }
  12.210 +#endif
  12.211 +
  12.212 +      //mutex access often causes switch to different Slave VP
  12.213 +      VPThread__mutex_lock(privateMutex, animatingPr);
  12.214 +      
  12.215 +/*
  12.216 +          saveCyclesAndInstrs(cpuid,startWorkload2.cycles);
  12.217 +      //Task
  12.218 +      for(i=0; i < inner_iters; i++)
  12.219 +       {
  12.220 +         workspace1 += (workspace1 + 32)/2;
  12.221 +         workspace2 += (workspace2 + 23.2)/1.4;
  12.222 +       }
  12.223 +      
  12.224 +          saveCyclesAndInstrs(cpuid,endWorkload2.cycles);
  12.225 +          numCycles = endWorkload2.cycles - startWorkload2.cycles;
  12.226 +          //sanity check (400K is about 20K iters)
  12.227 +          if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
  12.228 +          else                     {totalBadCycles  += numCycles; }
  12.229 +      
  12.230 +*/
  12.231 +      VPThread__mutex_unlock(privateMutex, animatingPr);
  12.232 +    }
  12.233 +
  12.234 +   params->totalWorkCycles = totalWorkCycles;
  12.235 +   params->totalBadCycles = totalBadCycles;
  12.236 +   params->numGoodTasks   = numGoodTasks;
  12.237 +   params->totalSyncCycles = totalSyncCycles;
  12.238 +   params->totalBadSyncCycles = totalBadSyncCycles;
  12.239 +   params->numGoodSyncs = numGoodSyncs;
  12.240 +/*
  12.241 +   params->totalSyncCycles = VMS__give_num_plugin_cycles();
  12.242 +   params->totalBadSyncCycles = 0;
  12.243 +   params->numGoodSyncs = VMS__give_num_plugin_animations();
  12.244 +*/
  12.245 +   
  12.246 +   
  12.247 +   //Wait for all threads to end
  12.248 +   barrier_wait(params->barrier, animatingPr);
  12.249 +   
  12.250 +   //Shutdown worker
  12.251 +   VPThread__dissipate_thread(animatingPr);
  12.252 +   
  12.253 +     //below return never reached --> there for gcc
  12.254 +   return (workspace1 + workspace2);  //to prevent gcc from optimizing work out
  12.255 + }
  12.256 +
  12.257 +
/* this is run after the VMS is set up*/
/*Seed-processor body: sets up the shared end-of-run barrier, samples the
 * all-cores cycle counter, spawns num_threads workers, then waits on the
 * barrier until every worker has finished.
 *The end-of-run cycle sample is taken by the LAST thread to arrive,
 * inside barrier_wait(), and copied out of the barrier below. */
void benchmark(void *_params, VirtProcr *animatingPr)
 {
   int i, cpuID;  //NOTE(review): cpuID is declared but unused here
   struct barrier_t  barr;
   BenchParams      *params;
   
   params = (BenchParams *)_params;

      //num_threads workers plus this seed thread all meet at the barrier
   barrier_init(&barr, num_threads+1, animatingPr);
      
   //prepare input
   for(i=0; i<num_threads; i++)
    { 
       workerParamsArray[i].barrier = &barr;
    }
     
   //save cycles before execution of threads, to get total exe cycles
   measurement_t *startExeCycles, *endExeCycles;  //NOTE(review): endExeCycles local is unused
   startExeCycles = params->startExeCycles;
   
#ifdef MEASURE_PERF
   int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles),
                sizeof(startExeCycles->cycles));
   if(nread<0) perror("Error reading cycles counter");
#endif
   
   //create (which starts running) all threads
   for(i=0; i<num_threads; i++)
    { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
    }
   //wait for all threads to finish
   barrier_wait(&barr, animatingPr);
  
#ifdef MEASURE_PERF
   //endBarrierCycles read in barrier_wait()!  Merten, email me if want to chg
   params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
#endif
   

/*
   uint64_t overallWorkCycles = 0;
   for(i=0; i<num_threads; i++){ 
       printf("WorkCycles: %lu\n",input[i].totalWorkCycles);
       overallWorkCycles += input[i].totalWorkCycles;
    }
   
   printf("Sum across threads of work cycles: %lu\n", overallWorkCycles);
   printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles);
   printf("Runtime/Workcycle Ratio %lu\n", 
   ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles);
*/

   //======================================================

   VPThread__dissipate_thread(animatingPr);
 }
  12.315 +
  12.316 +int main(int argc, char **argv)
  12.317 + {
  12.318 +   int i;
  12.319 +
  12.320 +   //set global static variables, based on cmd-line args
  12.321 +   for(i=1; i<argc; i++)
  12.322 +    {
  12.323 +      if(argv[i][0] == '-' && argv[i][2] == 0)
  12.324 +       {
  12.325 +         switch(argv[i][1])
  12.326 +          {
  12.327 +            case 't':
  12.328 +               if(!isdigit(argv[++i][0]))
  12.329 +                {
  12.330 +                  fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n");
  12.331 +                  return EXIT_FAILURE;
  12.332 +                }
  12.333 +               num_threads = atoi(argv[i]);
  12.334 +               if(!num_threads)
  12.335 +                {
  12.336 +                  fprintf(stderr, "invalid number of threads specified: %d\n", num_threads);
  12.337 +                  return EXIT_FAILURE;
  12.338 +                }
  12.339 +            break;
  12.340 +            case 'o':
  12.341 +               if(!isdigit(argv[++i][0]))
  12.342 +                {
  12.343 +                  fputs("-i must be followed by a number\n", stderr);
  12.344 +                  return EXIT_FAILURE;
  12.345 +                }
  12.346 +               outer_iters = atoi(argv[i]);
  12.347 +				break;
  12.348 +            case 'i':
  12.349 +               if(!isdigit(argv[++i][0]))
  12.350 +                {
  12.351 +                  fputs("-o must be followed by a number (workload size)\n", stderr);
  12.352 +                  return EXIT_FAILURE;
  12.353 +                }
  12.354 +               inner_iters = atoi(argv[i]);
  12.355 +				break;
  12.356 +            case 'h':
  12.357 +               fputs(usage, stdout);
  12.358 +               return 0;
  12.359 +				
  12.360 +            default:
  12.361 +               fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
  12.362 +               fputs(usage, stderr);
  12.363 +               return EXIT_FAILURE;
  12.364 +          }//switch
  12.365 +       }//if arg
  12.366 +      else
  12.367 +       {
  12.368 +			fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
  12.369 +			fputs(usage, stderr);
  12.370 +			return EXIT_FAILURE;
  12.371 +       }
  12.372 +    }//for
  12.373 +   
  12.374 +   
  12.375 +#ifdef MEASURE_PERF
  12.376 +   //setup performance counters
  12.377 +    hw_event = malloc(sizeof(struct perf_event_attr));
  12.378 +    memset(hw_event,0,sizeof(struct perf_event_attr));
  12.379 +    
  12.380 +    hw_event->type = PERF_TYPE_HARDWARE;
  12.381 +    hw_event->size = sizeof(hw_event);
  12.382 +    hw_event->disabled = 0;
  12.383 +    hw_event->freq = 0;
  12.384 +    hw_event->inherit = 1; /* children inherit it   */
  12.385 +    hw_event->pinned = 1; /* says this virt counter must always be on HW */
  12.386 +    hw_event->exclusive = 0; /* only group on PMU     */
  12.387 +    hw_event->exclude_user = 0; /* don't count user      */
  12.388 +    hw_event->exclude_kernel = 1; /* don't count kernel  */
  12.389 +    hw_event->exclude_hv = 1; /* ditto hypervisor      */
  12.390 +    hw_event->exclude_idle = 1; /* don't count when idle */
  12.391 +    hw_event->mmap = 0; /* include mmap data     */
  12.392 +    hw_event->comm = 0; /* include comm data     */
  12.393 +
  12.394 +    hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles
  12.395 +    
  12.396 +    int cpuID, retries;
  12.397 +
  12.398 +   for( cpuID = 0; cpuID < NUM_CORES; cpuID++ )
  12.399 +    { retries = 0;
  12.400 +      do
  12.401 +       { retries += 1;
  12.402 +         cycles_counter_fd[cpuID] = 
  12.403 +          syscall(__NR_perf_event_open, hw_event,
  12.404 +                  0,//pid_t: 0 is "pid of calling process" 
  12.405 +                  cpuID,//int: cpu, the value returned by "CPUID" instr(?)
  12.406 +                  -1,//int: group_fd, -1 is "leader" or independent
  12.407 +                  0//unsigned long: flags
  12.408 +                 );
  12.409 +       }
  12.410 +      while(cycles_counter_fd[cpuID]<0 && retries < 100);
  12.411 +      if(retries >= 100)
  12.412 +       {
  12.413 +         fprintf(stderr,"On core %d: ",cpuID);
  12.414 +         perror("Failed to open cycles counter");
  12.415 +       }
  12.416 +    }
  12.417 +
  12.418 +   //Set up counter to accumulate total cycles to process, across all CPUs
  12.419 +
  12.420 +   retries = 0;
  12.421 +   do
  12.422 +    { retries += 1;
  12.423 +      cycles_counter_main_fd = 
  12.424 +       syscall(__NR_perf_event_open, hw_event,
  12.425 +               0,//pid_t: 0 is "pid of calling process" 
  12.426 +               -1,//int: cpu, -1 means accumulate from all cores
  12.427 +               -1,//int: group_fd, -1 is "leader" == independent
  12.428 +               0//unsigned long: flags
  12.429 +              );
  12.430 +    }
  12.431 +   while(cycles_counter_main_fd<0 && retries < 100);
  12.432 +   if(retries >= 100)
  12.433 +    {
  12.434 +      fprintf(stderr,"in main ");
  12.435 +      perror("Failed to open cycles counter");
  12.436 +    }
  12.437 +#endif
  12.438 +   
  12.439 +   measurement_t startExeCycles, endExeCycles;
  12.440 +   BenchParams *benchParams;
  12.441 +   
  12.442 +   benchParams = malloc(sizeof(BenchParams)); 
  12.443 +   
  12.444 +   benchParams->startExeCycles = &startExeCycles;
  12.445 +   benchParams->endExeCycles   = &endExeCycles;
  12.446 +   
  12.447 +   workerParamsArray =  (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) );
  12.448 +   if(workerParamsArray == NULL ) printf("error mallocing worker params array\n");
  12.449 +   
  12.450 + 
  12.451 +   //This is the transition to the VMS runtime
  12.452 +   VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );
  12.453 +   
  12.454 +#ifdef MEASURE_PERF
  12.455 +   uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
  12.456 +   uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
  12.457 +   for(i=0; i<num_threads; i++){ 
  12.458 +       printf("WorkCycles: %lu\n",workerParamsArray[i].totalWorkCycles);
  12.459 +//       printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
  12.460 +//       printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
  12.461 +//       printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
  12.462 +       totalWorkCyclesAcrossCores += workerParamsArray[i].totalWorkCycles;
  12.463 +       totalBadCyclesAcrossCores  += workerParamsArray[i].totalBadCycles;
  12.464 +       totalSyncCyclesAcrossCores += workerParamsArray[i].totalSyncCycles;
  12.465 +       totalBadSyncCyclesAcrossCores  += workerParamsArray[i].totalBadSyncCycles;
  12.466 +    }
  12.467 +
  12.468 +   uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
  12.469 +   totalExeCycles -= totalBadCyclesAcrossCores;
  12.470 +   uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores;
  12.471 +   int32  numSyncs = outer_iters * num_threads * 2;
  12.472 +   printf("Total Execution Cycles: %lu\n", totalExeCycles);
  12.473 +   printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
  12.474 +   printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores);
  12.475 +//   printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);
  12.476 +   printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs );
  12.477 +   printf("ExeCycles/WorkCycles Ratio %f\n", 
  12.478 +          (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
  12.479 +#else
  12.480 +   printf("No measurement done!\n");
  12.481 +#endif
  12.482 +   return 0;
  12.483 + }
    13.1 --- a/src/Application/Matrix_Mult.c	Fri Sep 16 20:12:34 2011 +0200
    13.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.3 @@ -1,167 +0,0 @@
    13.4 -/*
    13.5 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
    13.6 - *  Licensed under GNU General Public License version 2
    13.7 - *
    13.8 - * Author: seanhalle@yahoo.com
    13.9 - *
   13.10 - * Created on November 15, 2009, 2:35 AM
   13.11 - */
   13.12 -
   13.13 -#include <malloc.h>
   13.14 -#include <stdlib.h>
   13.15 -
   13.16 -#include "Matrix_Mult.h"
   13.17 -#include "ParamHelper/Param.h"
   13.18 -
   13.19 -
   13.20 - 
   13.21 - void
   13.22 -initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
   13.23 -                               ParamBag *paramBag )
   13.24 - { char *leftMatrixFileName, *rightMatrixFileName;
   13.25 -   int   leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols;
   13.26 -   
   13.27 -      ParamStruc *param;
   13.28 -      param = getParamFromBag( "leftMatrixRows", paramBag );
   13.29 -   leftMatrixRows = param->intValue;
   13.30 -      param = getParamFromBag( "leftMatrixCols", paramBag );
   13.31 -   leftMatrixCols = param->intValue;
   13.32 -   *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols );
   13.33 -   
   13.34 -      param = getParamFromBag( "leftMatrixFileName", paramBag );
   13.35 -   leftMatrixFileName = param->strValue;  //no need to copy
   13.36 -   read_Matrix_From_File( *leftMatrix,  leftMatrixFileName );
   13.37 -   
   13.38 -      param = getParamFromBag( "rightMatrixRows", paramBag );
   13.39 -   rightMatrixRows = param->intValue;
   13.40 -      param = getParamFromBag( "rightMatrixCols", paramBag );
   13.41 -   rightMatrixCols = param->intValue;
   13.42 -   *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols );
   13.43 -   
   13.44 -      param = getParamFromBag( "rightMatrixFileName", paramBag );
   13.45 -   rightMatrixFileName = param->strValue;
   13.46 -   read_Matrix_From_File( *rightMatrix, rightMatrixFileName );
   13.47 - }
   13.48 -
   13.49 -
   13.50 -void parseLineIntoRow( char *line, float32* row );
   13.51 -
   13.52 -
   13.53 - void
   13.54 -read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName )
   13.55 - { int    row, maxRead, numRows, numCols;
   13.56 -   float32 *matrixStart;
   13.57 -   size_t lineSz = 0;
   13.58 -   FILE  *file;
   13.59 -   char  *line = NULL;
   13.60 -   
   13.61 -   lineSz = 50000; //max length of line in a matrix data file
   13.62 -   line = (char *) malloc( lineSz );
   13.63 -   if( line == NULL ) printf( "no mem for matrix line" );
   13.64 -   
   13.65 -   numRows = matrixStruc->numRows;
   13.66 -   numCols = matrixStruc->numCols;
   13.67 -   matrixStart = matrixStruc->array;
   13.68 -
   13.69 -   file = fopen( matrixFileName, "r" );
   13.70 -   if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);}
   13.71 -   fseek( file, 0, SEEK_SET );
   13.72 -   for( row = 0; row < numRows; row++ )
   13.73 -    {
   13.74 -      if( feof( file ) )  printf( "file ran out too soon" );
   13.75 -      maxRead = getline( &line, &lineSz, file );
   13.76 -      if( maxRead == -1 ) printf( "prob reading mat line");
   13.77 -      
   13.78 -      if( *line == '\n') continue; //blank line
   13.79 -      if( *line == '/' ) continue; //comment line
   13.80 -      
   13.81 -      parseLineIntoRow( line, matrixStart + row * numCols );
   13.82 -    }
   13.83 -   free( line );
   13.84 - }
   13.85 -
   13.86 -/*This function relies on each line having the proper number of cols.  It
   13.87 - * doesn't check, nor enforce, so if the file is improperly formatted it
   13.88 - * can write over unrelated memory
   13.89 - */
   13.90 - void
   13.91 -parseLineIntoRow( char *line, float32* row )
   13.92 - {
   13.93 -   char *valueStr, *searchPos;
   13.94 -   
   13.95 -      //read the float values
   13.96 -   searchPos = valueStr = line; //start
   13.97 -   
   13.98 -   for( ; *searchPos != 0; searchPos++)  //bit dangerous, should use buff len
   13.99 -    {
  13.100 -      if( *searchPos == '\n' ) //last col..  relying on well-formatted file
  13.101 -       { *searchPos = 0;
  13.102 -         *row = atof( valueStr );
  13.103 -         break;                                    //end FOR loop
  13.104 -       }
  13.105 -      if( *searchPos == ',' )
  13.106 -       { *searchPos = 0;                           //mark end of string
  13.107 -         *row = (float32) atof( valueStr );
  13.108 -         row += 1;                                 //address arith
  13.109 -            //skip any spaces before digits.. use searchPos + 1 to skip the 0
  13.110 -         for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++);
  13.111 -         valueStr = searchPos + 1;
  13.112 -       }
  13.113 -    }
  13.114 - }
  13.115 -
  13.116 - //==========================================================================
  13.117 -
  13.118 -/*In the "_Flat" version of constructor, do only malloc of the top data struc
  13.119 - * and set values in that top-level.  Don't malloc any sub-structures.
  13.120 - */
  13.121 - Matrix *
  13.122 -makeMatrix_Flat( int32 numRows, int32 numCols )
  13.123 - { Matrix * retMatrix;
  13.124 -   retMatrix = malloc( sizeof( Matrix ) );
  13.125 -   retMatrix->numRows = numRows;
  13.126 -   retMatrix->numCols = numCols;
  13.127 -
  13.128 -   return retMatrix;
  13.129 - }
  13.130 -
  13.131 - Matrix *
  13.132 -makeMatrix_WithResMat( int32 numRows, int32 numCols )
  13.133 - { Matrix * retMatrix;
  13.134 -   retMatrix = malloc( sizeof( Matrix ) );
  13.135 -   retMatrix->numRows = numRows;
  13.136 -   retMatrix->numCols = numCols;
  13.137 -   retMatrix->array  = malloc( numRows * numCols * sizeof(float32) );
  13.138 -
  13.139 -   return retMatrix;
  13.140 - }
  13.141 -
  13.142 - void
  13.143 -freeMatrix_Flat( Matrix * matrix )
  13.144 - { //( matrix );
  13.145 - }
  13.146 - void
  13.147 -freeMatrix( Matrix * matrix )
  13.148 - { free( matrix->array );
  13.149 -   free( matrix );
  13.150 - }
  13.151 -
  13.152 -void
  13.153 -printMatrix( Matrix *matrix )
  13.154 - { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr;
  13.155 -   float32 *matrixArray;
  13.156 -
  13.157 -   numRows = rowsToPrint = matrix->numRows;
  13.158 -   numCols = colsToPrint = matrix->numCols;
  13.159 -   matrixArray = matrix->array;
  13.160 -
  13.161 -   rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed
  13.162 -   colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed
  13.163 -   for( r = 0; r < numRows; r += rowIncr )
  13.164 -    { for( c = 0; c < numCols; c += colIncr )
  13.165 -       { printf( "%3.1f | ", matrixArray[ r * numCols + c ] );
  13.166 -       }
  13.167 -      printf("\n");
  13.168 -    }
  13.169 - }
  13.170 -
    14.1 --- a/src/Application/Matrix_Mult.h	Fri Sep 16 20:12:34 2011 +0200
    14.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.3 @@ -1,77 +0,0 @@
    14.4 -/*
    14.5 - *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
    14.6 - *  Licensed under GNU General Public License version 2
    14.7 - */
    14.8 -
    14.9 -#ifndef MATRIX_MULT_H_
   14.10 -#define MATRIX_MULT_H_
   14.11 -
   14.12 -#include <stdio.h>
   14.13 -#include <unistd.h>
   14.14 -#include <malloc.h>
   14.15 -
   14.16 -#include "../VPThread_lib/VMS/VMS_primitive_data_types.h"
   14.17 -#include "ParamHelper/Param.h"
   14.18 -
   14.19 -//==============================  Structures  ==============================
   14.20 -
   14.21 -typedef
   14.22 -struct
   14.23 - { int32 numRows;
   14.24 -   int32 numCols;
   14.25 -   float32 *array;  //2D, but dynamically sized, so use addr arith
   14.26 - }
   14.27 -Matrix;
   14.28 -
   14.29 -/* This is the "appSpecificPiece" that is carried inside a DKUPiece.
   14.30 - *  In the DKUPiece data struc it is declared to be of type "void *".  This
   14.31 - *  allows the application to define any data structure it wants and put it
   14.32 - *  into a DKUPiece.
   14.33 - * When the app specific info is used, it is in app code, so it is cast to
   14.34 - *  the correct type to tell the compiler how to access fields.
   14.35 - * This keeps all app-specific things out of the DKU directory, as per the
   14.36 - *  DKU standard. */
   14.37 -typedef
   14.38 -struct
   14.39 - { 
   14.40 -      // pointers to shared data..  the result matrix must be created when the
   14.41 -      //  left and right matrices are put into the root ancestor DKUPiece.
   14.42 -   Matrix * leftMatrix;
   14.43 -   Matrix * rightMatrix;
   14.44 -   Matrix * resultMatrix;
   14.45 -
   14.46 -      // define the starting and ending boundaries for this piece of the
   14.47 -      //  result matrix.  These are derivable from the left and right
   14.48 -      //  matrices, but included them for readability of code.
   14.49 -   int prodStartRow, prodEndRow;
   14.50 -   int prodStartCol, prodEndCol;
   14.51 -      // Start and end of the portion of the left matrix that contributes to
   14.52 -      //  this piece of the product
   14.53 -   int leftStartRow, leftEndRow;
   14.54 -   int leftStartCol, leftEndCol;
   14.55 -      // Start and end of the portion of the right matrix that contributes to
   14.56 -      //  this piece of the product
   14.57 -   int rightStartRow, rightEndRow;
   14.58 -   int rightStartCol, rightEndCol;
   14.59 - }
   14.60 -MatrixProdPiece;
   14.61 -
   14.62 -//==============================  Functions  ================================
   14.63 -void readFile();
   14.64 -
   14.65 -Matrix *makeMatrix( int32 numRows, int32 numCols );
   14.66 -Matrix *makeMatrix_Flat( int32 numRows, int32 numCols );
   14.67 -Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols );
   14.68 -void    freeMatrix_Flat( Matrix * matrix );
   14.69 -void    freeMatrix( Matrix * matrix );
   14.70 -void    printMatrix( Matrix *matrix );
   14.71 -
   14.72 -void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName );
   14.73 -
   14.74 -void
   14.75 -initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
   14.76 -                              ParamBag *paramBag );
   14.77 -
   14.78 -//===========================================================================
   14.79 -
   14.80 -#endif /*MATRIX_MULT_H_*/
    15.1 --- a/src/Application/VPThread__Matrix_Mult/Divide_Pr.c	Fri Sep 16 20:12:34 2011 +0200
    15.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.3 @@ -1,646 +0,0 @@
    15.4 -/*
    15.5 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
    15.6 - *  Licensed under GNU General Public License version 2
    15.7 - *
    15.8 - * Author: seanhalle@yahoo.com
    15.9 - *
   15.10 - */
   15.11 -
   15.12 -
   15.13 -#include "VPThread__Matrix_Mult.h"
   15.14 -#include <math.h>
   15.15 -#include <string.h>
   15.16 -#include "../../VPThread_lib/VPThread.h"
   15.17 -
   15.18 -   //The time to compute this many result values should equal the time to
   15.19 -   // perform this division on a matrix of size gives that many result calcs
   15.20 -   //IE, size this so that sequential time to calc equals divide time
   15.21 -   // find the value by experimenting -- but divide time and calc time scale
   15.22 -   // same way, so this value should remain valid across hardware
   15.23 -#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000
   15.24 -
   15.25 -
   15.26 -//===========================================================================
   15.27 -int inline
   15.28 -measureMatrixMultPrimitive( VirtProcr *animPr );
   15.29 -
   15.30 -SlicingStrucCarrier *
   15.31 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
   15.32 -                                 VirtProcr *animPr );
   15.33 -
   15.34 -SlicingStruc *
   15.35 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
   15.36 -                  VirtProcr *animPr );
   15.37 -
   15.38 -void
   15.39 -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr );
   15.40 -
   15.41 -SubMatrix **
   15.42 -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   15.43 -                   int32 numUses, Matrix *origMatrix, VirtProcr *animPr );
   15.44 -
   15.45 -void
   15.46 -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   15.47 -                 SubMatrix **subMatrices, VirtProcr *animPr );
   15.48 -
   15.49 -void
   15.50 -pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
   15.51 -                                    SubMatrix **rightSubMatrices,
   15.52 -                                    int32 numRowIdxs, int32 numColIdxs,
   15.53 -                                    int32 numVecIdxs,
   15.54 -                                    VirtProcr *resultPr,
   15.55 -                                    VirtProcr *animatingPr );
   15.56 -
   15.57 -void
   15.58 -makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix,
   15.59 -            SlicingStrucCarrier *slicingStrucCarrier,
   15.60 -            VirtProcr *resultPr, VirtProcr *animatingPr );
   15.61 -
   15.62 -void inline
   15.63 -copyTranspose( int32 numRows, int32 numCols,
   15.64 -               int32 origStartRow, int32 origStartCol, int32 origStride,
   15.65 -               float32 *subArray, float32 *origArray );
   15.66 -
   15.67 -void inline
   15.68 -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
   15.69 -                                int32 numResCols,
   15.70 -                                float32 *leftArray, float32 *rightArray,
   15.71 -                                float32 *resArray );
   15.72 -
   15.73 -
   15.74 -
   15.75 -/*Divider creates one processor for every sub-matrix
   15.76 - * It hands them:
   15.77 - *  the name of the result processor that they should send their results to,
   15.78 - *  the left and right matrices, and the rows and cols they should multiply
   15.79 - * It first creates the result processor, then all the sub-matrixPair
   15.80 - *  processors,
   15.81 - *  then does a receive of a message from the result processor that gives
   15.82 - *  the divider ownership of the result matrix.
   15.83 - * Finally, the divider returns the result matrix out of the VPThread system.
   15.84 - *
   15.85 - * Divider chooses the size of sub-matrices via an algorithm that tries to
   15.86 - *  keep the minimum work above a threshold.  The threshold is machine-
   15.87 - *  dependent, so ask VPThread for min work-unit time to get a
   15.88 - *  given overhead
   15.89 - *
   15.90 - * Divide min work-unit cycles by measured-cycles for one matrix-cell
   15.91 - *  product -- gives the number of products need to have in min size
   15.92 - *  matrix.
   15.93 - *
   15.94 - * So then, take cubed root of this to get the size of a side of min sub-
   15.95 - *  matrix.  That is the size of the ideal square sub-matrix -- so tile
   15.96 - *  up the two input matrices into ones as close as possible to that size,
   15.97 - *  and create the pairs of sub-matrices.
   15.98 - *
   15.99 - *========================  STRATEGIC OVERVIEW  =======================
  15.100 - *
  15.101 - *This division is a bit tricky, because have to create things in advance
  15.102 - * that it's not at first obvious need to be created..
  15.103 - *
  15.104 - *First slice up each dimension -- three of them..  this is because will have
  15.105 - * to create the sub-matrix's data-structures before pairing the sub-matrices
  15.106 - * with each other -- so, have three dimensions to slice up before can
  15.107 - * create the sub-matrix data-strucs -- also, have to be certain that the
  15.108 - * cols of the left input have the exact same slicing as the rows of the
  15.109 - * left matrix, so just to be sure, do the slicing calc once, then use it
  15.110 - * for both.
  15.111 - *
  15.112 - *So, goes like this:
  15.113 - *1) calculate the start & end values of each dimension in each matrix.
  15.114 - *2) use those values to create sub-matrix structures
  15.115 - *3) combine sub-matrices into pairs, as the tasks to perform.
  15.116 - *
  15.117 - *Have to calculate separately from creating the sub-matrices because of the
  15.118 - * nature of the nesting -- would either end up creating the same sub-matrix
  15.119 - * multiple times, or else would have to put in detection of whether had
  15.120 - * made a particular one already if tried to combine steps 1 and 2.
  15.121 - *
  15.122 - *Step 3 has to be separate because of the nesting, as well -- same reason,
  15.123 - * would either create same sub-matrix multiple times, or else have to
  15.124 - * add detection of whether was already created.
  15.125 - *
  15.126 - *Another way to look at it: there's one level of loop to divide dimensions,
  15.127 - * two levels of nesting to create sub-matrices, and three levels to pair
  15.128 - * up the sub-matrices.
  15.129 - */
  15.130 -
  15.131 -void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
  15.132 -                                        VirtProcr *animatingThd )
  15.133 - { VirtProcr       *resultPr;
  15.134 -   DividerParams   *dividerParams;
  15.135 -   ResultsParams   *resultsParams;
  15.136 -   Matrix          *leftMatrix, *rightMatrix;
  15.137 -   SlicingStrucCarrier *slicingStrucCarrier;
  15.138 -   float32             *resultArray; //points to array inside result matrix
  15.139 -   MatrixMultGlobals   *globals;
  15.140 -  
  15.141 -         DEBUG( dbgAppFlow, "start divide\n")
  15.142 -
  15.143 -         int32
  15.144 -         divideProbe = VMS__create_single_interval_probe( "divideProbe",
  15.145 -                                                          animatingThd );
  15.146 -         VMS__record_sched_choice_into_probe( divideProbe, animatingThd );
  15.147 -         VMS__record_interval_start_in_probe( divideProbe );
  15.148 -
  15.149 -   //=========== Setup -- make local copies of ptd-to-things, malloc, aso
  15.150 -   int32 numResRows, numResCols, vectLength;
  15.151 -
  15.152 -   dividerParams   = (DividerParams *)_dividerParams;
  15.153 -   
  15.154 -   leftMatrix      = dividerParams->leftMatrix;
  15.155 -   rightMatrix     = dividerParams->rightMatrix;
  15.156 -
  15.157 -   vectLength = leftMatrix->numCols;
  15.158 -   numResRows = leftMatrix->numRows;
  15.159 -   numResCols = rightMatrix->numCols;
  15.160 -   resultArray     = dividerParams->resultMatrix->array;
  15.161 -   
  15.162 -      //zero the result array
  15.163 -   memset( resultArray, 0, numResRows * numResCols * sizeof(float32) );
  15.164 -
  15.165 -   //==============  Do either sequential mult or do division ==============
  15.166 -
  15.167 -      //Check if input matrices too small -- if yes, just do sequential
  15.168 -      //Cutoff is determined by overhead of this divider -- relatively
  15.169 -      // machine-independent
  15.170 -   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
  15.171 -       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
  15.172 -    {
  15.173 -      //====== Do sequential multiply on a single core
  15.174 -            DEBUG( dbgAppFlow, "doing sequential")
  15.175 -            
  15.176 -         //transpose the right matrix
  15.177 -      float32 *
  15.178 -      transRightArray  = 
  15.179 -         VPThread__malloc( (size_t)rightMatrix->numRows * 
  15.180 -                                (size_t)rightMatrix->numCols *
  15.181 -                                sizeof(float32), animatingThd );
  15.182 -
  15.183 -         //copy values from orig matrix to local
  15.184 -      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
  15.185 -                     0, 0, rightMatrix->numRows,
  15.186 -                     transRightArray, rightMatrix->array );
  15.187 -      
  15.188 -      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
  15.189 -                            leftMatrix->array, transRightArray,
  15.190 -                            resultArray );
  15.191 -    }
  15.192 -   else
  15.193 -    {
  15.194 -      //====== Do parallel multiply across cores
  15.195 -
  15.196 -            DEBUG( dbgAppFlow, "divider: do parallel mult\n")
  15.197 -         //Calc the ideal size of sub-matrix and slice up the dimensions of
  15.198 -         // the two matrices.
  15.199 -         //The ideal size is the one takes the number of cycles to calculate
  15.200 -         // such that calc time is equal or greater than min work-unit size
  15.201 -      slicingStrucCarrier =
  15.202 -         calcIdealSizeAndSliceDimensions(leftMatrix,rightMatrix,animatingThd);
  15.203 -
  15.204 -         //Make the results processor, now that know how many to wait for
  15.205 -      resultsParams = VPThread__malloc( sizeof(ResultsParams), animatingThd );
  15.206 -      resultsParams->numSubMatrixPairs  =
  15.207 -         slicingStrucCarrier->leftRowSlices->numVals *
  15.208 -         slicingStrucCarrier->rightColSlices->numVals *
  15.209 -         slicingStrucCarrier->vecSlices->numVals;
  15.210 -      resultsParams->dividerPr   = animatingThd;
  15.211 -      resultsParams->numCols     = rightMatrix->numCols;
  15.212 -      resultsParams->numRows     = leftMatrix->numRows;
  15.213 -      resultsParams->resultArray = resultArray;
  15.214 -
  15.215 -      //==========  Set up global vars, including conds and mutexes ==========
  15.216 -      globals = VMS__malloc( sizeof(MatrixMultGlobals) );
  15.217 -      VPThread__set_globals_to( globals );
  15.218 -
  15.219 -      globals->results_mutex = VPThread__make_mutex( animatingThd );
  15.220 -      globals->results_cond  = VPThread__make_cond( globals->results_mutex,
  15.221 -                                                               animatingThd );
  15.222 -
  15.223 -      globals->vector_mutex = VPThread__make_mutex( animatingThd );
  15.224 -      globals->vector_cond  = VPThread__make_cond( globals->vector_mutex,
  15.225 -                                                               animatingThd );
  15.226 -
  15.227 -      globals->start_mutex = VPThread__make_mutex( animatingThd );
  15.228 -      globals->start_cond  = VPThread__make_cond( globals->start_mutex,
  15.229 -                                                               animatingThd );
  15.230 -      //======================================================================
  15.231 -
  15.232 -            DEBUG( dbgAppFlow, "divider: made mutexes and conds\n")
  15.233 -         //get results-comm lock before create results-thd, to ensure it can't
  15.234 -         // signal that results are available before this thd is waiting on cond
  15.235 -      VPThread__mutex_lock( globals->results_mutex, animatingThd );
  15.236 -
  15.237 -         //also get the start lock & use to ensure no vector threads send a
  15.238 -         // signal before the results thread is waiting on vector cond
  15.239 -      VPThread__mutex_lock( globals->start_mutex, animatingThd );
  15.240 -
  15.241 -
  15.242 -            DEBUG( dbgAppFlow, "divider: make result thread\n")
  15.243 -      VPThread__create_thread( &gatherResults, resultsParams, animatingThd );
  15.244 -
  15.245 -         //Now wait for results thd to signal that it has vector lock
  15.246 -      VPThread__cond_wait(  globals->start_cond,  animatingThd );
  15.247 -      VPThread__mutex_unlock( globals->start_mutex, animatingThd );//done w/lock
  15.248 -            DEBUG( dbgAppFlow, "divider: make sub-matrices\n")
  15.249 -   
  15.250 -         //Make the sub-matrices, and pair them up, and make processor to
  15.251 -         // calc product of each pair.
  15.252 -      makeSubMatricesAndProcrs( leftMatrix, rightMatrix,
  15.253 -                                    slicingStrucCarrier,
  15.254 -                                    resultPr, animatingThd);
  15.255 - 
  15.256 -         //Wait for results thread to say results are good
  15.257 -      VPThread__cond_wait(  globals->results_cond,  animatingThd );
  15.258 -    }
  15.259 -
  15.260 -
  15.261 -   //===============  Work done -- send results back =================
  15.262 -
  15.263 -
  15.264 -         DEBUG( dbgAppFlow, "end divide\n")
  15.265 -
  15.266 -         VMS__record_interval_end_in_probe( divideProbe );
  15.267 -         VMS__print_stats_of_all_probes();
  15.268 -
  15.269 -      //nothing left to do so dissipate, VPThread will wait to shutdown,
  15.270 -      // making results available to outside, until all the processors have
  15.271 -      // dissipated -- so actually no need to wait for results processor
  15.272 -      //However, following the pattern, so done with comm, release lock
  15.273 -   VPThread__mutex_unlock( globals->results_mutex, animatingThd );
  15.274 -
  15.275 -   VPThread__dissipate_thread( animatingThd );  //all procrs dissipate self at end
  15.276 -      //when all of the processors have dissipated, the "create seed and do
  15.277 -      // work" call in the entry point function returns
  15.278 - }
  15.279 -
  15.280 -
  15.281 -SlicingStrucCarrier *
  15.282 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
  15.283 -                                 VirtProcr *animPr )
  15.284 - {
  15.285 -   float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
  15.286 -   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
  15.287 -   SlicingStrucCarrier *slicingStrucCarrier =
  15.288 -                       VPThread__malloc(sizeof(SlicingStrucCarrier), animPr);
  15.289 -
  15.290 -   int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
  15.291 -   float64 numPrimitiveOpsInMinWorkUnit;
  15.292 -
  15.293 -
  15.294 -   //=======  Calc ideal size of min-sized sub-matrix  ========
  15.295 -
  15.296 -      //ask VPThread for the number of cycles of the minimum work unit, at given
  15.297 -      // percent overhead then add a guess at overhead from this divider
  15.298 -   minWorkUnitCycles = VPThread__giveMinWorkUnitCycles( .05 );
  15.299 -
  15.300 -      //ask VPThread for number of cycles of the "primitive" op of matrix mult
  15.301 -   primitiveCycles = measureMatrixMultPrimitive( animPr );
  15.302 -
  15.303 -   numPrimitiveOpsInMinWorkUnit =
  15.304 -      (float64)minWorkUnitCycles / (float64)primitiveCycles;
  15.305 -
  15.306 -      //take cubed root -- that's number of these in a "side" of sub-matrix
  15.307 -      // then multiply by 5 because the primitive is 5x5src/Application/VPThread__Matrix_Mult/Divide_Pr.c:403: warning: implicit declaration of function ‘VPThread__give_number_of_cores_to_schedule_onto’
  15.308 -   idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit );
  15.309 -
  15.310 -   idealNumWorkUnits = VPThread__giveIdealNumWorkUnits();
  15.311 -   
  15.312 -   idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
  15.313 -   idealSizeOfSide2 *= 0.6; //finer granularity to help load balance
  15.314 -
  15.315 -   if( idealSizeOfSide1 > idealSizeOfSide2 )
  15.316 -      idealSizeOfSide = idealSizeOfSide1;
  15.317 -   else
  15.318 -      idealSizeOfSide = idealSizeOfSide2;
  15.319 -
  15.320 -      //The multiply inner loop blocks the array to fit into L1 cache
  15.321 -//   if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK;
  15.322 -
  15.323 -   //============  Slice up dimensions, now that know target size ===========
  15.324 -
  15.325 -      //Tell the slicer the target size of a side (floating pt), the start
  15.326 -      // value to start slicing at, and the end value to stop slicing at
  15.327 -      //It returns an array of start value of each chunk, plus number of them
  15.328 -   int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol;
  15.329 -   startLeftRow  = 0;
  15.330 -   endLeftRow    = leftMatrix->numRows -1;
  15.331 -   startVec      = 0;
  15.332 -   endVec        = leftMatrix->numCols -1;
  15.333 -   startRightCol = 0;
  15.334 -   endRightCol   = rightMatrix->numCols -1;
  15.335 -
  15.336 -   leftRowSlices =
  15.337 -      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow, animPr );
  15.338 -
  15.339 -   vecSlices =
  15.340 -      sliceUpDimension( idealSizeOfSide,  startVec, endVec, animPr );
  15.341 -
  15.342 -   rightColSlices =
  15.343 -      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol,animPr);
  15.344 -
  15.345 -   slicingStrucCarrier->leftRowSlices  = leftRowSlices;
  15.346 -   slicingStrucCarrier->vecSlices      = vecSlices;
  15.347 -   slicingStrucCarrier->rightColSlices = rightColSlices;
  15.348 -
  15.349 -   return slicingStrucCarrier;
  15.350 - }
  15.351 -
  15.352 -
  15.353 -void
  15.354 -makeSubMatricesAndProcrs( Matrix    *leftMatrix, Matrix    *rightMatrix,
  15.355 -            SlicingStrucCarrier *slicingStrucCarrier,
  15.356 -            VirtProcr *resultPr,   VirtProcr *animPr )
  15.357 - {
  15.358 -   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
  15.359 -   
  15.360 -   leftRowSlices  = slicingStrucCarrier->leftRowSlices;
  15.361 -   vecSlices      = slicingStrucCarrier->vecSlices;
  15.362 -   rightColSlices = slicingStrucCarrier->rightColSlices;
  15.363 -   VPThread__free( slicingStrucCarrier, animPr );
  15.364 -   
  15.365 -   //================  Make sub-matrices, given the slicing  ================
  15.366 -   SubMatrix **leftSubMatrices, **rightSubMatrices;
  15.367 -   leftSubMatrices =
  15.368 -      createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals,
  15.369 -                         leftMatrix, animPr );
  15.370 -   //double_check_that_always_numRows_in_right_same_as_numCols_in_left();
  15.371 -   rightSubMatrices =
  15.372 -      createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals,
  15.373 -                         rightMatrix, animPr );
  15.374 -
  15.375 -
  15.376 -   //==============  pair the sub-matrices and make processors ==============
  15.377 -   int32 numRowIdxs, numColIdxs, numVecIdxs;
  15.378 -
  15.379 -   numRowIdxs = leftRowSlices->numVals;
  15.380 -   numColIdxs = rightColSlices->numVals;
  15.381 -   numVecIdxs = vecSlices->numVals;
  15.382 -   
  15.383 -   freeSlicingStruc( leftRowSlices, animPr );
  15.384 -   freeSlicingStruc( vecSlices, animPr );
  15.385 -   freeSlicingStruc( rightColSlices, animPr );
  15.386 -   
  15.387 -   pairUpSubMatricesAndMakeProcessors( leftSubMatrices,
  15.388 -                                       rightSubMatrices,
  15.389 -                                       numRowIdxs, numColIdxs,
  15.390 -                                       numVecIdxs,
  15.391 -                                       resultPr,
  15.392 -                                       animPr );
  15.393 - }
  15.394 -
  15.395 -
  15.396 -
  15.397 -
  15.398 -void
  15.399 -pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
  15.400 -                                    SubMatrix **rightSubMatrices,
  15.401 -                                    int32 numRowIdxs, int32 numColIdxs,
  15.402 -                                    int32 numVecIdxs,
  15.403 -                                    VirtProcr *resultPr,
  15.404 -                                    VirtProcr *animatingPr )
  15.405 - {
  15.406 -   int32 resRowIdx, resColIdx, vecIdx;
  15.407 -   int32 numLeftColIdxs, numRightColIdxs;
  15.408 -   int32 leftRowIdxOffset;
  15.409 -   SMPairParams *subMatrixPairParams;
  15.410 -   float32 numToPutOntoEachCore, leftOverFraction;
  15.411 -   int32 numCores, coreToScheduleOnto, numVecOnCurrCore;
  15.412 -
  15.413 -   numLeftColIdxs  = numColIdxs;
  15.414 -   numRightColIdxs = numVecIdxs;
  15.415 -
  15.416 -   numCores = VPThread__give_number_of_cores_to_schedule_onto();
  15.417 -
  15.418 -   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
  15.419 -   leftOverFraction = 0;
  15.420 -   numVecOnCurrCore = 0;
  15.421 -   coreToScheduleOnto = 0;
  15.422 -
  15.423 -   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
  15.424 -    {
  15.425 -      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
  15.426 -
  15.427 -      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
  15.428 -       {
  15.429 -         
  15.430 -         for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
  15.431 -          {
  15.432 -               //Make the processor for the pair of sub-matrices
  15.433 -            subMatrixPairParams  = VPThread__malloc( sizeof(SMPairParams),
  15.434 -                                                               animatingPr);
  15.435 -            subMatrixPairParams->leftSubMatrix  =
  15.436 -               leftSubMatrices[ leftRowIdxOffset + vecIdx ];
  15.437 -
  15.438 -            subMatrixPairParams->rightSubMatrix =
  15.439 -               rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
  15.440 -
  15.441 -            //subMatrixPairParams->resultPr = resultPr;
  15.442 -
  15.443 -               //put all pairs from the same vector onto same core
  15.444 -            VPThread__create_thread_with_affinity( &calcSubMatrixProduct,
  15.445 -                                                   subMatrixPairParams,
  15.446 -                                                   animatingPr,
  15.447 -                                                   coreToScheduleOnto );
  15.448 -          }
  15.449 -
  15.450 -            //Trying to distribute the subMatrix-vectors across the cores, so
  15.451 -            // that each core gets the same number of vectors, with a max
  15.452 -            // imbalance of 1 vector more on some cores than others
  15.453 -         numVecOnCurrCore += 1;
  15.454 -         if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 )
  15.455 -          {
  15.456 -               //deal with fractional part, to ensure that imbalance is 1 max
  15.457 -               // IE, core with most has only 1 more than core with least
  15.458 -            leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore;
  15.459 -            if( leftOverFraction >= 1 )
  15.460 -             { leftOverFraction -= 1;
  15.461 -               numVecOnCurrCore = -1;
  15.462 -             }
  15.463 -            else
  15.464 -             { numVecOnCurrCore = 0;
  15.465 -             }
  15.466 -               //Move to next core, max core-value to incr to is numCores -1
  15.467 -            if( coreToScheduleOnto >= numCores -1 )
  15.468 -             { coreToScheduleOnto = 0;
  15.469 -             }
  15.470 -            else
  15.471 -             { coreToScheduleOnto += 1;
  15.472 -             }
  15.473 -          }
  15.474 -
  15.475 -       }
  15.476 -    }
  15.477 - }
  15.478 -
  15.479 -
  15.480 -
  15.481 -/*Walk through the two slice-strucs, making sub-matrix strucs as go
  15.482 - */
  15.483 -SubMatrix **
  15.484 -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
  15.485 -                   int32 numUses, Matrix *origMatrix, VirtProcr *animPr )
  15.486 - {
  15.487 -   int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
  15.488 -   int32 startRow, endRow, startCol, endCol;
  15.489 -   int32 *rowStartVals, *colStartVals;
  15.490 -   int32 rowOffset;
  15.491 -   SubMatrix **subMatrices, *newSubMatrix;
  15.492 -
  15.493 -   numRowIdxs = rowSlices->numVals;
  15.494 -   numColIdxs = colSlices->numVals;
  15.495 -
  15.496 -   rowStartVals = rowSlices->startVals;
  15.497 -   colStartVals = colSlices->startVals;
  15.498 -
  15.499 -   subMatrices = VPThread__malloc((size_t)numRowIdxs *
  15.500 -                            (size_t)numColIdxs * sizeof(SubMatrix*),animPr );
  15.501 -
  15.502 -   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
  15.503 -    {
  15.504 -      rowOffset = rowIdx * numColIdxs;
  15.505 -      
  15.506 -      startRow  = rowStartVals[rowIdx];
  15.507 -      endRow    = rowStartVals[rowIdx + 1] -1; //"fake" start above last is
  15.508 -                                               // at last valid idx + 1 & is
  15.509 -                                               // 1 greater than end value
  15.510 -      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
  15.511 -       {
  15.512 -         startCol = colStartVals[colIdx];
  15.513 -         endCol   = colStartVals[colIdx + 1] -1;
  15.514 -
  15.515 -         newSubMatrix = VPThread__malloc( sizeof(SubMatrix), animPr );
  15.516 -         newSubMatrix->numRows       = endRow - startRow +1;
  15.517 -         newSubMatrix->numCols       = endCol - startCol +1;
  15.518 -         newSubMatrix->origMatrix    = origMatrix;
  15.519 -         newSubMatrix->origStartRow  = startRow;
  15.520 -         newSubMatrix->origStartCol  = startCol;
  15.521 -         newSubMatrix->alreadyCopied = FALSE;
  15.522 -         newSubMatrix->numUsesLeft   = numUses; //can free after this many
  15.523 -         //Prevent uninitialized memory
  15.524 -         newSubMatrix->copySingleton = NULL;
  15.525 -         newSubMatrix->copyTransSingleton = NULL;
  15.526 -
  15.527 -         subMatrices[ rowOffset + colIdx ] = newSubMatrix;
  15.528 -       }
  15.529 -    }
  15.530 -   return subMatrices;
  15.531 - }
  15.532 -
  15.533 -
  15.534 -void
  15.535 -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
  15.536 -                 SubMatrix **subMatrices, VirtProcr *animPr )
  15.537 - {
  15.538 -   int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset;
  15.539 -   SubMatrix *subMatrix;
  15.540 -
  15.541 -   numRowIdxs = rowSlices->numVals;
  15.542 -   numColIdxs = colSlices->numVals;
  15.543 -
  15.544 -   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
  15.545 -    {
  15.546 -      rowOffset = rowIdx * numColIdxs;
  15.547 -      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
  15.548 -       {
  15.549 -         subMatrix = subMatrices[ rowOffset + colIdx ];
  15.550 -         if( subMatrix->alreadyCopied )
  15.551 -            VPThread__free( subMatrix->array, animPr );
  15.552 -         VPThread__free( subMatrix, animPr );
  15.553 -       }
  15.554 -    }
  15.555 -   VPThread__free( subMatrices, animPr );
  15.556 - }
  15.557 -
  15.558 -
  15.559 -
  15.560 -SlicingStruc *
  15.561 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
  15.562 -                  VirtProcr *animPr )
  15.563 - { float32 residualAcc = 0;
  15.564 -   int     numSlices, i, *startVals, sizeOfSlice, endCondition;
  15.565 -   SlicingStruc *slicingStruc = VPThread__malloc(sizeof(SlicingStruc), animPr);
  15.566 -
  15.567 -      //calc size of matrix need to hold start vals --
  15.568 -   numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
  15.569 -
  15.570 -   startVals = VPThread__malloc( ((size_t)numSlices + 1) * sizeof(int32), animPr );
  15.571 -
  15.572 -      //Calc the upper limit of start value -- when get above this, end loop
  15.573 -      // by saving highest value of the matrix dimension to access, plus 1
  15.574 -      // as the start point of the imaginary slice following the last one
  15.575 -      //Plus 1 because go up to value but not include when process last slice
  15.576 -      //The stopping condition is half-a-size less than highest value because
  15.577 -      // don't want any pieces smaller than half the ideal size -- just tack
  15.578 -      // little ones onto end of last one
  15.579 -   endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size
  15.580 -   for( i = 0; startVal <= endVal; i++ )
  15.581 -    {
  15.582 -      startVals[i] = startVal;
  15.583 -      residualAcc += idealSizeOfSide;
  15.584 -      sizeOfSlice  = (int)residualAcc;
  15.585 -      residualAcc -= (float32)sizeOfSlice;
  15.586 -      startVal    += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..
  15.587 -
  15.588 -      if( startVal > endCondition )
  15.589 -       { startVal = endVal + 1;
  15.590 -         startVals[ i + 1 ] = startVal;
  15.591 -       }
  15.592 -    }
  15.593 -
  15.594 -   slicingStruc->startVals = startVals;
  15.595 -   slicingStruc->numVals   = i;  //loop incr'd, so == last valid start idx+1
  15.596 -                                 // which means is num sub-matrices in dim
  15.597 -                                 // also == idx of the fake start just above
  15.598 -   return slicingStruc;
  15.599 - }
  15.600 -
  15.601 -void
  15.602 -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr )
  15.603 - {
  15.604 -   VPThread__free( slicingStruc->startVals, animPr );
  15.605 -   VPThread__free( slicingStruc, animPr );
  15.606 - }
  15.607 -
  15.608 -
  15.609 -int inline
  15.610 -measureMatrixMultPrimitive( VirtProcr *animPr )
  15.611 - {
  15.612 -   int r, c, v, numCycles;
  15.613 -   float32 *res, *left, *right;
  15.614 -
  15.615 -      //setup inputs
  15.616 -   left  = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr );
  15.617 -   right = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr );
  15.618 -   res   = VPThread__malloc( 5 * 5 * sizeof( float32 ), animPr );
  15.619 -
  15.620 -   for( r = 0; r < 5; r++ )
  15.621 -    {
  15.622 -      for( c = 0; c < 5; c++ )
  15.623 -       {
  15.624 -         left[  r * 5 + c ] = r;
  15.625 -         right[ r * 5 + c ] = c;
  15.626 -       }
  15.627 -    }
  15.628 -
  15.629 -      //do primitive
  15.630 -   VPThread__start_primitive();  //for now, just takes time stamp
  15.631 -   for( r = 0; r < 5; r++ )
  15.632 -    {
  15.633 -      for( c = 0; c < 5; c++ )
  15.634 -       {
  15.635 -         for( v = 0; v < 5; v++ )
  15.636 -          {
  15.637 -            res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ];
  15.638 -          }
  15.639 -       }
  15.640 -    }
  15.641 -   numCycles =
  15.642 -      VPThread__end_primitive_and_give_cycles();
  15.643 -
  15.644 -   VPThread__free( left, animPr );
  15.645 -   VPThread__free( right, animPr );
  15.646 -   VPThread__free( res, animPr );
  15.647 -
  15.648 -   return numCycles;
  15.649 - }
    16.1 --- a/src/Application/VPThread__Matrix_Mult/EntryPoint.c	Fri Sep 16 20:12:34 2011 +0200
    16.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    16.3 @@ -1,62 +0,0 @@
    16.4 -/*
    16.5 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
    16.6 - *  Licensed under GNU General Public License version 2
    16.7 - *
    16.8 - * Author: seanhalle@yahoo.com
    16.9 - *
   16.10 - */
   16.11 -
   16.12 -#include <math.h>
   16.13 -
   16.14 -#include "VPThread__Matrix_Mult.h"
   16.15 -
   16.16 -
   16.17 -
   16.18 -/*Every SSR system has an "entry point" function that creates the first
   16.19 - * processor, which starts the chain of creating more processors..
   16.20 - * eventually all of the processors will dissipate themselves, and
   16.21 - * return.
   16.22 - *
   16.23 - *This entry-point function follows the same pattern as all entry-point
   16.24 - * functions do:
   16.25 - *1) it creates the params for the seed processor, from the
   16.26 - *    parameters passed into the entry-point function
   16.27 - *2) it calls SSR__create_seed_procr_and_do_work
   16.28 - *3) it gets the return value from the params struc, frees the params struc,
   16.29 - *    and returns the value from the function
   16.30 - *
   16.31 - */
   16.32 -Matrix *
   16.33 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
   16.34 - { Matrix          *resMatrix;
   16.35 -   DividerParams   *dividerParams;
   16.36 -   int32            numResRows, numResCols;
   16.37 -
   16.38 -
   16.39 -   dividerParams              = malloc( sizeof( DividerParams ) );
   16.40 -   dividerParams->leftMatrix  = leftMatrix;
   16.41 -   dividerParams->rightMatrix = rightMatrix;
   16.42 -
   16.43 -
   16.44 -   numResRows  = leftMatrix->numRows;
   16.45 -   numResCols  = rightMatrix->numCols;
   16.46 -
   16.47 -      //VMS has its own separate internal malloc, so to get results out,
   16.48 -      // have to pass in empty array for it to fill up
   16.49 -      //The alternative is internally telling SSR make external space to use
   16.50 -   resMatrix            = malloc( sizeof(Matrix) );
   16.51 -   resMatrix->array     = malloc( numResRows * numResCols * sizeof(float32));
   16.52 -   resMatrix->numCols   = rightMatrix->numCols;
   16.53 -   resMatrix->numRows   = leftMatrix->numRows;
   16.54 -
   16.55 -
   16.56 -   dividerParams->resultMatrix   = resMatrix;
   16.57 -
   16.58 -      //create divider processor, start doing the work, and wait till done
   16.59 -      //This function is the "border crossing" between normal code and SSR
   16.60 -   VPThread__create_seed_procr_and_do_work(&divideWorkIntoSubMatrixPairProcrs,
   16.61 -                                           dividerParams );
   16.62 -   
   16.63 -   free( dividerParams );
   16.64 -   return resMatrix;
   16.65 - }
    17.1 --- a/src/Application/VPThread__Matrix_Mult/Result_Pr.c	Fri Sep 16 20:12:34 2011 +0200
    17.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    17.3 @@ -1,142 +0,0 @@
    17.4 -/*
    17.5 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
    17.6 - *  Licensed under GNU General Public License version 2
    17.7 - *
    17.8 - * Author: seanhalle@yahoo.com
    17.9 - *
   17.10 - */
   17.11 -
   17.12 -#include "VPThread__Matrix_Mult.h"
   17.13 -
   17.14 -//=====================
   17.15 -void inline
   17.16 -accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
   17.17 -                  int32    startRow,
   17.18 -                  int32    numRows,
   17.19 -                  int32    startCol,
   17.20 -                  int32    numCols,
   17.21 -                  int32    numOrigCols );
   17.22 -
   17.23 -//===========================================================================
   17.24 -
   17.25 -/*The Result Processor gets a message from each of the vector processors,
   17.26 - * puts the result from the message in its location in the result-
   17.27 - * matrix, and increments the count of results.
   17.28 - *
   17.29 - *After the count reaches the point that all results have been received, it
   17.30 - * returns the result matrix and dissipates.
   17.31 - */
   17.32 -void gatherResults( void *_params, VirtProcr *animatingThd )
   17.33 - { VirtProcr *dividerPr;
   17.34 -   ResultsParams  *params;
   17.35 -   int            numRows, numCols, numSubMatrixPairs, count=0;
   17.36 -   float32        *resultArray;
   17.37 -   SMPairParams   *resParams;
   17.38 -   //====================== thread stuff =======================
   17.39 -   MatrixMultGlobals *globals =(MatrixMultGlobals *)VPThread__give_globals();
   17.40 -
   17.41 -
   17.42 -      //get vector-comm lock before loop, so that this thd keeps lock after
   17.43 -      // one wait until it enters the next wait -- forces see-saw btwn
   17.44 -      // waiters and signalers -- wait-signal-wait-signal-...
   17.45 -   VPThread__mutex_lock( globals->vector_mutex, animatingThd );
   17.46 -
   17.47 -      //Tell divider that have the vector lock -- so it's sure won't miss any
   17.48 -      // signals from the vector-threads it's about to create
   17.49 -      //Don't need a signal variable -- this thd can't be created until
   17.50 -      // divider thd already has the start lock
   17.51 -   VPThread__mutex_lock( globals->start_mutex, animatingThd );//finish wait
   17.52 -   VPThread__cond_signal( globals->start_cond,  animatingThd );
   17.53 -   VPThread__mutex_unlock( globals->start_mutex, animatingThd );//finish wait
   17.54 -   //===========================================================
   17.55 -
   17.56 -         DEBUG( dbgAppFlow, "start resultPr\n")
   17.57 -         
   17.58 -   params    = (ResultsParams *)_params;
   17.59 -   dividerPr = params->dividerPr;
   17.60 -   numSubMatrixPairs = params->numSubMatrixPairs;
   17.61 -   numRows = params->numRows;
   17.62 -   numCols = params->numCols;
   17.63 -
   17.64 -   resultArray = params->resultArray;
   17.65 -
   17.66 -
   17.67 -   while( count < numSubMatrixPairs )
   17.68 -    {
   17.69 -         //receive a vector-result from a vector-thread
   17.70 -      VPThread__cond_wait(  globals->vector_cond,  animatingThd );
   17.71 -
   17.72 -         //At this point, animating thread owns the vector lock, so all
   17.73 -         // pairs trying to signal they have a result are waiting to get that
   17.74 -         // lock -- only one gets it at a time, and when signal, this thd
   17.75 -         // gets the lock and does the body of this loop, then when does the
   17.76 -         // wait again, that releases the lock for next pair-thread to get it
   17.77 -      resParams = globals->currSMPairParams;
   17.78 -
   17.79 -      accumulateResult( resultArray, resParams->partialResultArray,
   17.80 -                        resParams->leftSubMatrix->origStartRow,
   17.81 -                        resParams->leftSubMatrix->numRows,
   17.82 -                        resParams->rightSubMatrix->origStartCol,
   17.83 -                        resParams->rightSubMatrix->numCols,
   17.84 -                        resParams->rightSubMatrix->origMatrix->numCols );
   17.85 -
   17.86 -      VPThread__free( resParams->partialResultArray, animatingThd );
   17.87 -      
   17.88 -         //there is only one copy of results procr, so can update numUsesLeft
   17.89 -         // without concurrency worries.  When zero, free the sub-matrix
   17.90 -      resParams->leftSubMatrix->numUsesLeft -= 1;
   17.91 -      if( resParams->leftSubMatrix->numUsesLeft == 0 )
   17.92 -       {
   17.93 -         VPThread__free( resParams->leftSubMatrix->array, animatingThd );
   17.94 -         VPThread__free( resParams->leftSubMatrix, animatingThd );
   17.95 -       }
   17.96 -
   17.97 -      resParams->rightSubMatrix->numUsesLeft -= 1;
   17.98 -      if( resParams->rightSubMatrix->numUsesLeft == 0 )
   17.99 -       {
  17.100 -         VPThread__free( resParams->rightSubMatrix->array, animatingThd );
  17.101 -         VPThread__free( resParams->rightSubMatrix, animatingThd );
  17.102 -       }
  17.103 -
  17.104 -         //count of how many sub-matrix pairs accumulated so know when done
  17.105 -      count++;
  17.106 -    }
  17.107 -
  17.108 -      //Done -- could just dissipate -- SSR will wait for all processors to
  17.109 -      // dissipate before shutting down, and thereby making results avaial to
  17.110 -      // outside, so no need to stop the divider from dissipating, so no need
  17.111 -      // to send a hand-shake message to it -- but makes debug easier
  17.112 -      //However, following pattern, so all comms done, release lock
  17.113 -   VPThread__mutex_unlock( globals->vector_mutex, animatingThd );
  17.114 -
  17.115 -      //Send result to divider (seed) thread
  17.116 -      // note, divider thd had to hold the results-comm lock before creating
  17.117 -      // this thread, to be sure no race
  17.118 -   VPThread__mutex_lock(   globals->results_mutex, animatingThd );
  17.119 -   //globals->results = resultMatrixArray;
  17.120 -   VPThread__cond_signal(  globals->results_cond,  animatingThd );
  17.121 -   VPThread__mutex_unlock( globals->results_mutex, animatingThd ); //releases
  17.122 -   //divider thread from its wait, at point this executes
  17.123 -
  17.124 -   VPThread__dissipate_thread( animatingThd );  //frees any data owned by procr
  17.125 - }
  17.126 -
  17.127 -void inline
  17.128 -accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray,
  17.129 -                  int32    startRow,
  17.130 -                  int32    numRows,
  17.131 -                  int32    startCol,
  17.132 -                  int32    numCols,
  17.133 -                  int32    numOrigCols )
  17.134 - { int32 row, col;
  17.135 -
  17.136 -   for( row = 0; row < numRows; row++ )
  17.137 -    {
  17.138 -      for( col = 0; col < numCols; col++ )
  17.139 -       {
  17.140 -         resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] +=
  17.141 -            subMatrixPairResultArray[ row * numCols + col ];
  17.142 -       }
  17.143 -    }
  17.144 -
  17.145 - }
    18.1 --- a/src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h	Fri Sep 16 20:12:34 2011 +0200
    18.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    18.3 @@ -1,120 +0,0 @@
    18.4 -/*
    18.5 - *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
    18.6 - *  Licensed under GNU General Public License version 2
    18.7 - */
    18.8 -
    18.9 -#ifndef _SSR_MATRIX_MULT_H_
   18.10 -#define _SSR_MATRIX_MULT_H_
   18.11 -
   18.12 -#include <stdio.h>
   18.13 -
   18.14 -#include "../../VPThread_lib/VPThread.h"
   18.15 -#include "../Matrix_Mult.h"
   18.16 -
   18.17 -
   18.18 -//===============================  Defines  ==============================
   18.19 -#define ROWS_IN_BLOCK 32
   18.20 -#define COLS_IN_BLOCK 32
   18.21 -#define VEC_IN_BLOCK  32
   18.22 -
   18.23 -#define copyMatrixSingleton 1
   18.24 -#define copyTransposeSingleton 2
   18.25 -
   18.26 -//==============================  Structures  ==============================
   18.27 -typedef struct
   18.28 - {
   18.29 -   Matrix *leftMatrix;
   18.30 -   Matrix *rightMatrix;
   18.31 -   Matrix *resultMatrix;
   18.32 - }
   18.33 -DividerParams;
   18.34 -
   18.35 -typedef struct
   18.36 - {
   18.37 -   VirtProcr *dividerPr;
   18.38 -   int numRows;
   18.39 -   int numCols;
   18.40 -   int numSubMatrixPairs;
   18.41 -   float32 *resultArray;
   18.42 - }
   18.43 -ResultsParams;
   18.44 -
   18.45 -typedef
   18.46 -struct
   18.47 - { int32    numRows;
   18.48 -   int32    numCols;
   18.49 -   Matrix  *origMatrix;
   18.50 -   int32    origStartRow;
   18.51 -   int32    origStartCol;
   18.52 -   int32    alreadyCopied;
   18.53 -   VPThdSingleton *copySingleton;
   18.54 -   VPThdSingleton *copyTransSingleton;
   18.55 -   int32    numUsesLeft; //have update via message to avoid multiple writers
   18.56 -   float32 *array;  //2D, but dynamically sized, so use addr arith
   18.57 - }
   18.58 -SubMatrix;
   18.59 -
   18.60 -typedef struct
   18.61 - { VirtProcr *resultPr;
   18.62 -   SubMatrix *leftSubMatrix;
   18.63 -   SubMatrix *rightSubMatrix;
   18.64 -   float32   *partialResultArray;
   18.65 - }
   18.66 -SMPairParams;
   18.67 -
   18.68 -typedef
   18.69 -struct
   18.70 - { int32    numVals;
   18.71 -   int32   *startVals;
   18.72 - }
   18.73 -SlicingStruc;
   18.74 -
   18.75 -typedef
   18.76 -struct
   18.77 - {
   18.78 -   SlicingStruc *leftRowSlices;
   18.79 -   SlicingStruc *vecSlices;
   18.80 -   SlicingStruc *rightColSlices;
   18.81 - }
   18.82 -SlicingStrucCarrier;
   18.83 -
   18.84 -enum MMMsgType
   18.85 - {
   18.86 -   RESULTS_MSG = 1
   18.87 - };
   18.88 -
   18.89 - 
   18.90 -typedef struct
   18.91 - {
   18.92 -      //for communicating sub-matrix-pair results to results Thd
   18.93 -   int32         vector_mutex;
   18.94 -   int32         vector_cond;
   18.95 -   SMPairParams *currSMPairParams;
   18.96 -
   18.97 -      //for communicating results array back to seed (divider) Thd
   18.98 -   int32         results_mutex;
   18.99 -   int32         results_cond;
  18.100 -   float32      *results;
  18.101 -
  18.102 -      //for ensuring results thd has vector lock before making vector thds
  18.103 -   int32         start_mutex;
  18.104 -   int32         start_cond;
  18.105 -
  18.106 -   Matrix *rightMatrix;
  18.107 -   Matrix *resultMatrix;
  18.108 - }
  18.109 -MatrixMultGlobals;
  18.110 -
  18.111 -
  18.112 -//============================= Processor Functions =========================
  18.113 -void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr );
  18.114 -void calcSubMatrixProduct(        void *data, VirtProcr *animatingPr );
  18.115 -void gatherResults(     void *data, VirtProcr *animatingPr );
  18.116 -
  18.117 -
  18.118 -//================================ Entry Point ==============================
  18.119 -Matrix *
  18.120 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
  18.121 -
  18.122 -
  18.123 -#endif /*_SSR_MATRIX_MULT_H_*/
    19.1 --- a/src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c	Fri Sep 16 20:12:34 2011 +0200
    19.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    19.3 @@ -1,310 +0,0 @@
    19.4 -/* 
    19.5 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
    19.6 - *  Licensed under GNU General Public License version 2
    19.7 - *
    19.8 - * Author: SeanHalle@yahoo.com
    19.9 - *
   19.10 - */
   19.11 -
   19.12 -#include <string.h>
   19.13 -
   19.14 -#include "VPThread__Matrix_Mult.h"
   19.15 -
   19.16 -//Declarations
   19.17 -//===================================================
   19.18 -
   19.19 -void inline
   19.20 -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
   19.21 -
   19.22 -void inline
   19.23 -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
   19.24 -
   19.25 -void inline
   19.26 -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
   19.27 -                     float32 *resArray,
   19.28 -                     int startRow,  int endRow,
   19.29 -                     int startCol,  int endCol,
   19.30 -                     int startVec,  int endVec,
   19.31 -                     int resStride, int inpStride );
   19.32 -
   19.33 -void inline
   19.34 -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols,
   19.35 -                      float32 *leftArray, float32 *rightArray,
   19.36 -                      float32 *resArray );
   19.37 -
   19.38 -//===================================================
   19.39 -
   19.40 -/*A  processor is created with an environment that holds two matrices,
   19.41 - * the row and col that it owns, and the name of a result gathering
   19.42 - * processor.
   19.43 - *It calculates the product of two sub-portions of the input matrices
   19.44 - * by using Intel's mkl library for single-core.
   19.45 - *
   19.46 - *This demonstrates using optimized single-threaded code inside scheduled
   19.47 - * work-units.
   19.48 - *
   19.49 - *When done, it sends the result to the result processor
   19.50 - */
   19.51 -void
   19.52 -calcSubMatrixProduct( void *data, VirtProcr *animatingPr )
   19.53 - { 
   19.54 -   SMPairParams   *params;
   19.55 -   VirtProcr      *resultPr;
   19.56 -   float32        *leftArray,  *rightArray, *resArray;
   19.57 -   SubMatrix      *leftSubMatrix, *rightSubMatrix;
   19.58 -   MatrixMultGlobals *globals =(MatrixMultGlobals *)VPThread__give_globals();
   19.59 -
   19.60 -         DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID)
   19.61 -         #ifdef TURN_ON_DEBUG_PROBES
   19.62 -         int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx",
   19.63 -                                                                animatingPr);
   19.64 -         VMS__record_sched_choice_into_probe( subMatrixProbe, animatingPr );
   19.65 -         VMS__record_interval_start_in_probe( subMatrixProbe );
   19.66 -         #endif
   19.67 -
   19.68 -   params         = (SMPairParams *)data;
   19.69 -   resultPr       = params->resultPr;
   19.70 -   leftSubMatrix  = params->leftSubMatrix;
   19.71 -   rightSubMatrix = params->rightSubMatrix;
   19.72 -
   19.73 -      //make sure the input sub-matrices have been copied out of orig
   19.74 -      //do it here, inside sub-matrix pair to hopefully gain reuse in cache
   19.75 -   copyFromOrig( leftSubMatrix, animatingPr );
   19.76 -   copyTransposeFromOrig( rightSubMatrix, animatingPr );
   19.77 -   
   19.78 -   leftArray      = leftSubMatrix->array;
   19.79 -   rightArray     = rightSubMatrix->array;
   19.80 -
   19.81 -   size_t resSize = (size_t)leftSubMatrix->numRows * 
   19.82 -                        (size_t)rightSubMatrix->numCols * sizeof(float32);
   19.83 -   resArray = VPThread__malloc( resSize, animatingPr );
   19.84 -   memset( resArray, 0, resSize );
   19.85 -
   19.86 -
   19.87 -   int32 numResRows, numResCols, vectLength;
   19.88 -   
   19.89 -   vectLength = leftSubMatrix->numCols;
   19.90 -   numResRows = leftSubMatrix->numRows;
   19.91 -   numResCols = rightSubMatrix->numCols;
   19.92 -
   19.93 -   multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
   19.94 -                         leftArray,  rightArray,
   19.95 -                         resArray );
   19.96 -
   19.97 -   //send result to result processor
   19.98 -   params->partialResultArray = resArray;
   19.99 -
  19.100 -         #ifdef TURN_ON_DEBUG_PROBES
  19.101 -         VMS__record_interval_end_in_probe( subMatrixProbe );
  19.102 -         #endif
  19.103 -
  19.104 -      //Send result to results thread
  19.105 -      //This pattern works 'cause only get lock when results thd inside wait
  19.106 -   VPThread__mutex_lock(   globals->vector_mutex, animatingPr );
  19.107 -   globals->currSMPairParams = params;
  19.108 -   VPThread__cond_signal(  globals->vector_cond,  animatingPr );
  19.109 -   VPThread__mutex_unlock( globals->vector_mutex, animatingPr );//release
  19.110 -   //wait-er -- cond_signal implemented such that wait-er gets lock, no other
  19.111 -
  19.112 -         DEBUG1(dbgAppFlow, "end sub-matrix mult: %d\n", animatingPr->procrID)
  19.113 -   VPThread__dissipate_thread( animatingPr );
  19.114 - }
  19.115 -
  19.116 -
  19.117 -
  19.118 -/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into
  19.119 - * the 32KB L1 cache.
  19.120 - *Would be nice to embed this within another level that divided into
  19.121 - * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
  19.122 - *
  19.123 - *Eventually want these divisions to be automatic, using DKU pattern
  19.124 - * embedded into VMS and exposed in the language, and with VMS controlling the
  19.125 - * divisions according to the cache sizes, which it knows about.
  19.126 - *Also, want VMS to work with language to split among main-mems, so a socket
  19.127 - * only cranks on data in its local segment of main mem
  19.128 - *
  19.129 - *So, outer two loops determine start and end points within the result matrix.
  19.130 - * Inside that, a loop dets the start and end points along the shared dimensions
  19.131 - * of the two input matrices.
  19.132 - */
  19.133 -void inline
  19.134 -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
  19.135 -                                int32 numResCols,
  19.136 -                                float32 *leftArray, float32 *rightArray,
  19.137 -                                float32 *resArray )
  19.138 - {
  19.139 -   int resStride, inpStride;
  19.140 -   int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
  19.141 -
  19.142 -   resStride  = numResCols;
  19.143 -   inpStride  = vecLength;
  19.144 -
  19.145 -   for( resStartRow = 0; resStartRow < numResRows; )
  19.146 -    {
  19.147 -      resEndRow = resStartRow + ROWS_IN_BLOCK -1;  //start at zero, so -1
  19.148 -      if( resEndRow > numResRows ) resEndRow = numResRows -1;
  19.149 -
  19.150 -      for( resStartCol = 0; resStartCol < numResCols; )
  19.151 -       {
  19.152 -         resEndCol   = resStartCol + COLS_IN_BLOCK -1;
  19.153 -         if( resEndCol > numResCols ) resEndCol = numResCols -1;
  19.154 -
  19.155 -         for( startVec = 0; startVec < vecLength; )
  19.156 -          {
  19.157 -            endVec   = startVec + VEC_IN_BLOCK -1;
  19.158 -            if( endVec > vecLength ) endVec = vecLength -1;
  19.159 -
  19.160 -               //By having the "vector" of sub-blocks in a sub-block slice
  19.161 -               // be marched down in inner loop, are re-using the result
  19.162 -               // matrix, which stays in L1 cache and re-using the left sub-mat
  19.163 -               // which repeats for each right sub-mat -- can only re-use two of
  19.164 -               // the three, so result is the most important -- avoids writing
  19.165 -               // dirty blocks until those result-locations fully done
  19.166 -               //Row and Col is position in result matrix -- so row and vec
  19.167 -               // for left array, then vec and col for right array
  19.168 -            multiplySubBlocksTransposed( leftArray, rightArray,
  19.169 -                                         resArray,
  19.170 -                                         resStartRow,  resEndRow,
  19.171 -                                         resStartCol,  resEndCol,
  19.172 -                                         startVec,  endVec,
  19.173 -                                         resStride, inpStride );
  19.174 -            startVec = endVec +1;
  19.175 -          }
  19.176 -         resStartCol = resEndCol +1;
  19.177 -       }
  19.178 -      resStartRow = resEndRow +1;
  19.179 -    }
  19.180 - }
  19.181 -
  19.182 -
  19.183 -
  19.184 -void inline
  19.185 -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
  19.186 -                     float32 *resArray,
  19.187 -                     int resStartRow,  int resEndRow,
  19.188 -                     int resStartCol,  int resEndCol,
  19.189 -                     int startVec,  int endVec,
  19.190 -                     int resStride, int inpStride )
  19.191 - {
  19.192 -   int resRow,     resCol,        vec;
  19.193 -   int leftOffset, rightOffset;
  19.194 -   float32 result;
  19.195 -
  19.196 -      //The result row is used only for the left matrix, res col for the right
  19.197 -   for( resCol = resStartCol; resCol <= resEndCol; resCol++ )
  19.198 -    {
  19.199 -      for( resRow = resStartRow; resRow <= resEndRow; resRow++ )
  19.200 -       {
  19.201 -         leftOffset  = resRow * inpStride;//left & right inp strides always same
  19.202 -         rightOffset = resCol * inpStride;// because right is transposed
  19.203 -         result = 0;
  19.204 -         for( vec = startVec; vec <= endVec; vec++ )
  19.205 -          {
  19.206 -            result +=
  19.207 -               leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
  19.208 -          }
  19.209 -
  19.210 -         resArray[ resRow * resStride + resCol ] += result;
  19.211 -       }
  19.212 -    }
  19.213 - }
  19.214 -
  19.215 -
  19.216 -
  19.217 -
  19.218 -/*Reuse this in divider when do the sequential multiply case
  19.219 - */
  19.220 -void inline
  19.221 -copyTranspose( int32 numRows, int32 numCols,
  19.222 -               int32 origStartRow, int32 origStartCol, int32 origStride,
  19.223 -               float32 *subArray, float32 *origArray )
  19.224 - { int32 stride = numRows;
  19.225 - 
  19.226 -   int row, col, origOffset;
  19.227 -   for( row = 0; row < numRows; row++ )
  19.228 -    {
  19.229 -      origOffset = (row + origStartRow) * origStride + origStartCol;
  19.230 -      for( col = 0; col < numCols; col++ )
  19.231 -       {
  19.232 -            //transpose means swap row & col -- traverse orig matrix normally
  19.233 -            // but put into reversed place in local array -- means the
  19.234 -            // stride is the numRows now, so col * numRows + row
  19.235 -         subArray[ col * stride + row ]  =  origArray[ origOffset + col ];
  19.236 -       }
  19.237 -    }
  19.238 - }
  19.239 -
  19.240 -void inline
  19.241 -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
  19.242 - { int numCols, numRows, origStartRow, origStartCol, origStride;
  19.243 -   Matrix *origMatrix;
  19.244 -   float32 *origArray, *subArray;
  19.245 -
  19.246 -   VPThread__start_data_singleton( &(subMatrix->copyTransSingleton), animPr );
  19.247 -
  19.248 -   origMatrix   = subMatrix->origMatrix;
  19.249 -   origArray    = origMatrix->array;
  19.250 -   numCols      = subMatrix->numCols;
  19.251 -   numRows      = subMatrix->numRows;
  19.252 -   origStartRow = subMatrix->origStartRow;
  19.253 -   origStartCol = subMatrix->origStartCol;
  19.254 -   origStride   = origMatrix->numCols;
  19.255 -
  19.256 -   subArray    = VPThread__malloc( (size_t)numRows * 
  19.257 -                                        (size_t)numCols *sizeof(float32),animPr);
  19.258 -   subMatrix->array = subArray;
  19.259 -
  19.260 -      //copy values from orig matrix to local
  19.261 -   copyTranspose( numRows, numCols,
  19.262 -                  origStartRow, origStartCol, origStride,
  19.263 -                  subArray, origArray );
  19.264 -
  19.265 -   VPThread__end_data_singleton( &(subMatrix->copyTransSingleton), animPr );
  19.266 -
  19.267 - }
  19.268 -
  19.269 -
  19.270 -void inline
  19.271 -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
  19.272 - { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
  19.273 -   Matrix *origMatrix;
  19.274 -   float32 *origArray, *subArray;
  19.275 -
  19.276 -
  19.277 -      //This lets only a single VP execute the code between start and
  19.278 -      // end -- using start and end so that work runs outside the master.
  19.279 -      //If a second VP ever executes the start, it will be returned
  19.280 -      // from the end-point.  If its execution starts after another but before
  19.281 -      // that other has finished, this one will remain suspended until the
  19.282 -      // other finishes, then be resumed from the end-point.
  19.283 -   VPThread__start_data_singleton( &(subMatrix->copySingleton), animPr );
  19.284 -
  19.285 -
  19.286 -   origMatrix    = subMatrix->origMatrix;
  19.287 -   origArray     = origMatrix->array;
  19.288 -   numCols       = subMatrix->numCols;
  19.289 -   numRows       = subMatrix->numRows;
  19.290 -   origStartRow  = subMatrix->origStartRow;
  19.291 -   origStartCol  = subMatrix->origStartCol;
  19.292 -   origStride    = origMatrix->numCols;
  19.293 -
  19.294 -   subArray    = VPThread__malloc( (size_t)numRows * 
  19.295 -                                        (size_t)numCols *sizeof(float32),animPr);
  19.296 -   subMatrix->array = subArray;
  19.297 -
  19.298 -      //copy values from orig matrix to local
  19.299 -   stride        = numCols;
  19.300 -
  19.301 -   int row, col, offset, origOffset;
  19.302 -   for( row = 0; row < numRows; row++ )
  19.303 -    {
  19.304 -      offset     = row * stride;
  19.305 -      origOffset = (row + origStartRow) * origStride + origStartCol;
  19.306 -      for( col = 0; col < numCols; col++ )
  19.307 -       {
  19.308 -         subArray[ offset + col ]  =  origArray[ origOffset + col ];
  19.309 -       }
  19.310 -    }
  19.311 -
  19.312 -   VPThread__end_data_singleton( &(subMatrix->copySingleton), animPr );
  19.313 - }
    20.1 --- a/src/Application/main.c	Fri Sep 16 20:12:34 2011 +0200
    20.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    20.3 @@ -1,50 +0,0 @@
    20.4 -/*
    20.5 - *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
    20.6 - *  Licensed under GNU General Public License version 2
    20.7 - *
    20.8 - * author seanhalle@yahoo.com
    20.9 - */
   20.10 -
   20.11 -#include <malloc.h>
   20.12 -#include <stdlib.h>
   20.13 -
   20.14 -#include "Matrix_Mult.h"
   20.15 -#include "VPThread__Matrix_Mult/VPThread__Matrix_Mult.h"
   20.16 -
   20.17 -char __ProgrammName[] = "Blocked Matrix Multiply";
   20.18 -char __DataSet[255];
   20.19 -
   20.20 -void printParams(ParamBag *paramBag)
   20.21 -{
   20.22 -    snprintf((char*)&__DataSet, 255,
   20.23 -                "#\tLeft Matrix %d x %d,\n#\tRight Matrix %d x %d",
   20.24 -                getParamFromBag("leftMatrixRows", paramBag)->intValue,
   20.25 -                getParamFromBag("leftMatrixCols", paramBag)->intValue,
   20.26 -                getParamFromBag("rightMatrixRows", paramBag)->intValue,
   20.27 -                getParamFromBag("rightMatrixCols", paramBag)->intValue);
   20.28 -}
   20.29 -
   20.30 -/**
   20.31 - *Matrix multiply program written using VMS_HW piggy-back language
   20.32 - * 
   20.33 - */
   20.34 -int main( int argc, char **argv )
   20.35 - { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
   20.36 -   ParamBag    *paramBag;
   20.37 -   
   20.38 -   printf( "arguments: %s | %s\n", argv[0], argv[1] );
   20.39 -
   20.40 -   paramBag = makeParamBag();
   20.41 -   readParamFileIntoBag( argv[1], paramBag );
   20.42 -   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
   20.43 -   printParams(paramBag);
   20.44 -   
   20.45 -   resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix );
   20.46 -
   20.47 -   printf("\nresult matrix: \n");
   20.48 -   printMatrix( resultMatrix );
   20.49 -   
   20.50 -//   SSR__print_stats();
   20.51 -   
   20.52 -   exit(0); //cleans up
   20.53 - }