Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VCilk > VCilk__Blocked_Matrix_Mult__Bench

changeset 5:e223756d0f0c tip
Fixed uninitialized variable and removed warnings
author: Merten Sach <msach@mailbox.tu-berlin.de>
date: Wed, 11 May 2011 15:58:04 +0200
parents: ecba4ae0be7a
files: src/Application/Matrix_Mult.c src/Application/VCilk__Matrix_Mult/Divide_Pr.c src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c src/Application/main.c
diffstat: 5 files changed, 1207 insertions(+), 1204 deletions(-) [+]
[-]

src/Application/Matrix_Mult.c 334

src/Application/VCilk__Matrix_Mult/Divide_Pr.c 1179

src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h 212

src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c 616

src/Application/main.c 70
     1.1 --- a/src/Application/Matrix_Mult.c	Wed May 11 15:40:54 2011 +0200
     1.2 +++ b/src/Application/Matrix_Mult.c	Wed May 11 15:58:04 2011 +0200
     1.3 @@ -1,167 +1,167 @@
     1.4 -/*
     1.5 
     1.6 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
     1.7 
     1.8 - *  Licensed under GNU General Public License version 2
     1.9 
    1.10 - *
    1.11 
    1.12 - * Author: seanhalle@yahoo.com
    1.13 
    1.14 - *
    1.15 
    1.16 - * Created on November 15, 2009, 2:35 AM
    1.17 
    1.18 - */
    1.19 
    1.20 -
    1.21 
    1.22 -#include <malloc.h>
    1.23 
    1.24 -#include <stdlib.h>
    1.25 
    1.26 -
    1.27 
    1.28 -#include "Matrix_Mult.h"
    1.29 
    1.30 -#include "ParamHelper/Param.h"
    1.31 
    1.32 -
    1.33 
    1.34 -
    1.35 
    1.36 - 
    1.37 
    1.38 - void
    1.39 
    1.40 -initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
    1.41 
    1.42 -                               ParamBag *paramBag )
    1.43 
    1.44 - { char *leftMatrixFileName, *rightMatrixFileName;
    1.45 
    1.46 -   int   leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols;
    1.47 
    1.48 -   
    1.49 
    1.50 -      ParamStruc *param;
    1.51 
    1.52 -      param = getParamFromBag( "leftMatrixRows", paramBag );
    1.53 
    1.54 -   leftMatrixRows = param->intValue;
    1.55 
    1.56 -      param = getParamFromBag( "leftMatrixCols", paramBag );
    1.57 
    1.58 -   leftMatrixCols = param->intValue;
    1.59 
    1.60 -   *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols );
    1.61 
    1.62 -   
    1.63 
    1.64 -      param = getParamFromBag( "leftMatrixFileName", paramBag );
    1.65 
    1.66 -   leftMatrixFileName = param->strValue;  //no need to copy
    1.67 
    1.68 -   read_Matrix_From_File( *leftMatrix,  leftMatrixFileName );
    1.69 
    1.70 -   
    1.71 
    1.72 -      param = getParamFromBag( "rightMatrixRows", paramBag );
    1.73 
    1.74 -   rightMatrixRows = param->intValue;
    1.75 
    1.76 -      param = getParamFromBag( "rightMatrixCols", paramBag );
    1.77 
    1.78 -   rightMatrixCols = param->intValue;
    1.79 
    1.80 -   *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols );
    1.81 
    1.82 -   
    1.83 
    1.84 -      param = getParamFromBag( "rightMatrixFileName", paramBag );
    1.85 
    1.86 -   rightMatrixFileName = param->strValue;
    1.87 
    1.88 -   read_Matrix_From_File( *rightMatrix, rightMatrixFileName );
    1.89 
    1.90 - }
    1.91 
    1.92 -
    1.93 
    1.94 -
    1.95 
    1.96 -void parseLineIntoRow( char *line, float32* row );
    1.97 
    1.98 -
    1.99 
   1.100 -
   1.101 
   1.102 - void
   1.103 
   1.104 -read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName )
   1.105 
   1.106 - { int    row, maxRead, numRows, numCols;
   1.107 
   1.108 -   float32 *matrixStart;
   1.109 
   1.110 -   size_t lineSz = 0;
   1.111 
   1.112 -   FILE  *file;
   1.113 
   1.114 -   char  *line = NULL;
   1.115 
   1.116 -   
   1.117 
   1.118 -   lineSz = 50000; //max length of line in a matrix data file
   1.119 
   1.120 -   line = (char *) malloc( lineSz );
   1.121 
   1.122 -   if( line == NULL ) printf( "no mem for matrix line" );
   1.123 
   1.124 -   
   1.125 
   1.126 -   numRows = matrixStruc->numRows;
   1.127 
   1.128 -   numCols = matrixStruc->numCols;
   1.129 
   1.130 -   matrixStart = matrixStruc->array;
   1.131 
   1.132 -
   1.133 
   1.134 -   file = fopen( matrixFileName, "r" );
   1.135 
   1.136 -   if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);}
   1.137 
   1.138 -   fseek( file, 0, SEEK_SET );
   1.139 
   1.140 -   for( row = 0; row < numRows; row++ )
   1.141 
   1.142 -    {
   1.143 
   1.144 -      if( feof( file ) )  printf( "file ran out too soon" );
   1.145 
   1.146 -      maxRead = getline( &line, &lineSz, file );
   1.147 
   1.148 -      if( maxRead == -1 ) printf( "prob reading mat line");
   1.149 
   1.150 -      
   1.151 
   1.152 -      if( *line == '\n') continue; //blank line
   1.153 
   1.154 -      if( *line == '/' ) continue; //comment line
   1.155 
   1.156 -      
   1.157 
   1.158 -      parseLineIntoRow( line, matrixStart + row * numCols );
   1.159 
   1.160 -    }
   1.161 
   1.162 -   free( line );
   1.163 
   1.164 - }
   1.165 
   1.166 -
   1.167 
   1.168 -/*This function relies on each line having the proper number of cols.  It
   1.169 
   1.170 - * doesn't check, nor enforce, so if the file is improperly formatted it
   1.171 
   1.172 - * can write over unrelated memory
   1.173 
   1.174 - */
   1.175 
   1.176 - void
   1.177 
   1.178 -parseLineIntoRow( char *line, float32* row )
   1.179 
   1.180 - {
   1.181 
   1.182 -   char *valueStr, *searchPos;
   1.183 
   1.184 -   
   1.185 
   1.186 -      //read the float values
   1.187 
   1.188 -   searchPos = valueStr = line; //start
   1.189 
   1.190 -   
   1.191 
   1.192 -   for( ; *searchPos != 0; searchPos++)  //bit dangerous, should use buff len
   1.193 
   1.194 -    {
   1.195 
   1.196 -      if( *searchPos == '\n' ) //last col..  relying on well-formatted file
   1.197 
   1.198 -       { *searchPos = 0;
   1.199 
   1.200 -         *row = atof( valueStr );
   1.201 
   1.202 -         break;                                    //end FOR loop
   1.203 
   1.204 -       }
   1.205 
   1.206 -      if( *searchPos == ',' )
   1.207 
   1.208 -       { *searchPos = 0;                           //mark end of string
   1.209 
   1.210 -         *row = (float32) atof( valueStr );
   1.211 
   1.212 -         row += 1;                                 //address arith
   1.213 
   1.214 -            //skip any spaces before digits.. use searchPos + 1 to skip the 0
   1.215 
   1.216 -         for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++);
   1.217 
   1.218 -         valueStr = searchPos + 1;
   1.219 
   1.220 -       }
   1.221 
   1.222 -    }
   1.223 
   1.224 - }
   1.225 
   1.226 -
   1.227 
   1.228 - //==========================================================================
   1.229 
   1.230 -
   1.231 
   1.232 -/*In the "_Flat" version of constructor, do only malloc of the top data struc
   1.233 
   1.234 - * and set values in that top-level.  Don't malloc any sub-structures.
   1.235 
   1.236 - */
   1.237 
   1.238 - Matrix *
   1.239 
   1.240 -makeMatrix_Flat( int32 numRows, int32 numCols )
   1.241 
   1.242 - { Matrix * retMatrix;
   1.243 
   1.244 -   retMatrix = malloc( sizeof( Matrix ) );
   1.245 
   1.246 -   retMatrix->numRows = numRows;
   1.247 
   1.248 -   retMatrix->numCols = numCols;
   1.249 
   1.250 -
   1.251 
   1.252 -   return retMatrix;
   1.253 
   1.254 - }
   1.255 
   1.256 -
   1.257 
   1.258 - Matrix *
   1.259 
   1.260 -makeMatrix_WithResMat( int32 numRows, int32 numCols )
   1.261 
   1.262 - { Matrix * retMatrix;
   1.263 
   1.264 -   retMatrix = malloc( sizeof( Matrix ) );
   1.265 
   1.266 -   retMatrix->numRows = numRows;
   1.267 
   1.268 -   retMatrix->numCols = numCols;
   1.269 
   1.270 -   retMatrix->array  = malloc( numRows * numCols * sizeof(float32) );
   1.271 
   1.272 -
   1.273 
   1.274 -   return retMatrix;
   1.275 
   1.276 - }
   1.277 
   1.278 -
   1.279 
   1.280 - void
   1.281 
   1.282 -freeMatrix_Flat( Matrix * matrix )
   1.283 
   1.284 - { //( matrix );
   1.285 
   1.286 - }
   1.287 
   1.288 - void
   1.289 
   1.290 -freeMatrix( Matrix * matrix )
   1.291 
   1.292 - { free( matrix->array );
   1.293 
   1.294 -   free( matrix );
   1.295 
   1.296 - }
   1.297 
   1.298 -
   1.299 
   1.300 -void
   1.301 
   1.302 -printMatrix( Matrix *matrix )
   1.303 
   1.304 - { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr;
   1.305 
   1.306 -   float32 *matrixArray;
   1.307 
   1.308 -
   1.309 
   1.310 -   numRows = rowsToPrint = matrix->numRows;
   1.311 
   1.312 -   numCols = colsToPrint = matrix->numCols;
   1.313 
   1.314 -   matrixArray = matrix->array;
   1.315 
   1.316 -
   1.317 
   1.318 -   rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed
   1.319 
   1.320 -   colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed
   1.321 
   1.322 -   for( r = 0; r < numRows; r += rowIncr )
   1.323 
   1.324 -    { for( c = 0; c < numCols; c += colIncr )
   1.325 
   1.326 -       { printf( "%3.1f | ", matrixArray[ r * numCols + c ] );
   1.327 
   1.328 -       }
   1.329 
   1.330 -      printf("\n");
   1.331 
   1.332 -    }
   1.333 
   1.334 - }
   1.335 
   1.336 -
   1.337 
   1.338 +/*
   1.339 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
   1.340 + *  Licensed under GNU General Public License version 2
   1.341 + *
   1.342 + * Author: seanhalle@yahoo.com
   1.343 + *
   1.344 + * Created on November 15, 2009, 2:35 AM
   1.345 + */
   1.346 +
   1.347 +#include <malloc.h>
   1.348 +#include <stdlib.h>
   1.349 +
   1.350 +#include "Matrix_Mult.h"
   1.351 +#include "ParamHelper/Param.h"
   1.352 +
   1.353 +
   1.354 + 
   1.355 + void
   1.356 +initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
   1.357 +                               ParamBag *paramBag )
   1.358 + { char *leftMatrixFileName, *rightMatrixFileName;
   1.359 +   int   leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols;
   1.360 +   
   1.361 +      ParamStruc *param;
   1.362 +      param = getParamFromBag( "leftMatrixRows", paramBag );
   1.363 +   leftMatrixRows = param->intValue;
   1.364 +      param = getParamFromBag( "leftMatrixCols", paramBag );
   1.365 +   leftMatrixCols = param->intValue;
   1.366 +   *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols );
   1.367 +   
   1.368 +      param = getParamFromBag( "leftMatrixFileName", paramBag );
   1.369 +   leftMatrixFileName = param->strValue;  //no need to copy
   1.370 +   read_Matrix_From_File( *leftMatrix,  leftMatrixFileName );
   1.371 +   
   1.372 +      param = getParamFromBag( "rightMatrixRows", paramBag );
   1.373 +   rightMatrixRows = param->intValue;
   1.374 +      param = getParamFromBag( "rightMatrixCols", paramBag );
   1.375 +   rightMatrixCols = param->intValue;
   1.376 +   *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols );
   1.377 +   
   1.378 +      param = getParamFromBag( "rightMatrixFileName", paramBag );
   1.379 +   rightMatrixFileName = param->strValue;
   1.380 +   read_Matrix_From_File( *rightMatrix, rightMatrixFileName );
   1.381 + }
   1.382 +
   1.383 +
   1.384 +void parseLineIntoRow( char *line, float32* row );
   1.385 +
   1.386 +
   1.387 + void
   1.388 +read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName )
   1.389 + { int    row, maxRead, numRows, numCols;
   1.390 +   float32 *matrixStart;
   1.391 +   size_t lineSz = 0;
   1.392 +   FILE  *file;
   1.393 +   char  *line = NULL;
   1.394 +   
   1.395 +   lineSz = 50000; //max length of line in a matrix data file
   1.396 +   line = (char *) malloc( lineSz );
   1.397 +   if( line == NULL ) printf( "no mem for matrix line" );
   1.398 +   
   1.399 +   numRows = matrixStruc->numRows;
   1.400 +   numCols = matrixStruc->numCols;
   1.401 +   matrixStart = matrixStruc->array;
   1.402 +
   1.403 +   file = fopen( matrixFileName, "r" );
   1.404 +   if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);}
   1.405 +   fseek( file, 0, SEEK_SET );
   1.406 +   for( row = 0; row < numRows; row++ )
   1.407 +    {
   1.408 +      if( feof( file ) )  printf( "file ran out too soon" );
   1.409 +      maxRead = getline( &line, &lineSz, file );
   1.410 +      if( maxRead == -1 ) printf( "prob reading mat line");
   1.411 +      
   1.412 +      if( *line == '\n') continue; //blank line
   1.413 +      if( *line == '/' ) continue; //comment line
   1.414 +      
   1.415 +      parseLineIntoRow( line, matrixStart + row * numCols );
   1.416 +    }
   1.417 +   free( line );
   1.418 + }
   1.419 +
   1.420 +/*This function relies on each line having the proper number of cols.  It
   1.421 + * doesn't check, nor enforce, so if the file is improperly formatted it
   1.422 + * can write over unrelated memory
   1.423 + */
   1.424 + void
   1.425 +parseLineIntoRow( char *line, float32* row )
   1.426 + {
   1.427 +   char *valueStr, *searchPos;
   1.428 +   
   1.429 +      //read the float values
   1.430 +   searchPos = valueStr = line; //start
   1.431 +   
   1.432 +   for( ; *searchPos != 0; searchPos++)  //bit dangerous, should use buff len
   1.433 +    {
   1.434 +      if( *searchPos == '\n' ) //last col..  relying on well-formatted file
   1.435 +       { *searchPos = 0;
   1.436 +         *row = atof( valueStr );
   1.437 +         break;                                    //end FOR loop
   1.438 +       }
   1.439 +      if( *searchPos == ',' )
   1.440 +       { *searchPos = 0;                           //mark end of string
   1.441 +         *row = (float32) atof( valueStr );
   1.442 +         row += 1;                                 //address arith
   1.443 +            //skip any spaces before digits.. use searchPos + 1 to skip the 0
   1.444 +         for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++);
   1.445 +         valueStr = searchPos + 1;
   1.446 +       }
   1.447 +    }
   1.448 + }
   1.449 +
   1.450 + //==========================================================================
   1.451 +
   1.452 +/*In the "_Flat" version of constructor, do only malloc of the top data struc
   1.453 + * and set values in that top-level.  Don't malloc any sub-structures.
   1.454 + */
   1.455 + Matrix *
   1.456 +makeMatrix_Flat( int32 numRows, int32 numCols )
   1.457 + { Matrix * retMatrix;
   1.458 +   retMatrix = malloc( sizeof( Matrix ) );
   1.459 +   retMatrix->numRows = numRows;
   1.460 +   retMatrix->numCols = numCols;
   1.461 +
   1.462 +   return retMatrix;
   1.463 + }
   1.464 +
   1.465 + Matrix *
   1.466 +makeMatrix_WithResMat( int32 numRows, int32 numCols )
   1.467 + { Matrix * retMatrix;
   1.468 +   retMatrix = malloc( sizeof( Matrix ) );
   1.469 +   retMatrix->numRows = numRows;
   1.470 +   retMatrix->numCols = numCols;
   1.471 +   retMatrix->array  = malloc( numRows * numCols * sizeof(float32) );
   1.472 +
   1.473 +   return retMatrix;
   1.474 + }
   1.475 +
   1.476 + void
   1.477 +freeMatrix_Flat( Matrix * matrix )
   1.478 + { //( matrix );
   1.479 + }
   1.480 + void
   1.481 +freeMatrix( Matrix * matrix )
   1.482 + { free( matrix->array );
   1.483 +   free( matrix );
   1.484 + }
   1.485 +
   1.486 +void
   1.487 +printMatrix( Matrix *matrix )
   1.488 + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr;
   1.489 +   float32 *matrixArray;
   1.490 +
   1.491 +   numRows = rowsToPrint = matrix->numRows;
   1.492 +   numCols = colsToPrint = matrix->numCols;
   1.493 +   matrixArray = matrix->array;
   1.494 +
   1.495 +   rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed
   1.496 +   colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed
   1.497 +   for( r = 0; r < numRows; r += rowIncr )
   1.498 +    { for( c = 0; c < numCols; c += colIncr )
   1.499 +       { printf( "%3.1f | ", matrixArray[ r * numCols + c ] );
   1.500 +       }
   1.501 +      printf("\n");
   1.502 +    }
   1.503 + }
   1.504 +

     2.1 --- a/src/Application/VCilk__Matrix_Mult/Divide_Pr.c	Wed May 11 15:40:54 2011 +0200
     2.2 +++ b/src/Application/VCilk__Matrix_Mult/Divide_Pr.c	Wed May 11 15:58:04 2011 +0200
     2.3 @@ -1,588 +1,591 @@
     2.4 -/*
     2.5 
     2.6 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
     2.7 
     2.8 - *  Licensed under GNU General Public License version 2
     2.9 
    2.10 - *
    2.11 
    2.12 - * Author: seanhalle@yahoo.com
    2.13 
    2.14 - *
    2.15 
    2.16 - */
    2.17 
    2.18 -
    2.19 
    2.20 -
    2.21 
    2.22 -#include "VCilk__Matrix_Mult.h"
    2.23 
    2.24 -#include <math.h>
    2.25 
    2.26 -#include <sys/time.h>
    2.27 
    2.28 -#include <string.h>
    2.29 
    2.30 -
    2.31 
    2.32 -   //The time to compute this many result values should equal the time to
    2.33 
    2.34 -   // perform this division on a matrix of size gives that many result calcs
    2.35 
    2.36 -   //IE, size this so that sequential time to calc equals divide time
    2.37 
    2.38 -   // find the value by experimenting -- but divide time and calc time scale
    2.39 
    2.40 -   // same way, so this value should remain valid across hardware
    2.41 
    2.42 -   //Divide time is about 800us on 2.4Ghz core2Quad laptop core
    2.43 
    2.44 -   //num cells is the cube of a side, when have two square matrices
    2.45 
    2.46 -#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 100000 /* about 46x46 */
    2.47 
    2.48 -
    2.49 
    2.50 -
    2.51 
    2.52 -//===========================================================================
    2.53 
    2.54 -int inline
    2.55 
    2.56 -measureMatrixMultPrimitive( VirtProcr *animPr );
    2.57 
    2.58 -
    2.59 
    2.60 -SlicingStrucCarrier *
    2.61 
    2.62 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
    2.63 
    2.64 -                                 VirtProcr *animPr );
    2.65 
    2.66 -
    2.67 
    2.68 -SlicingStruc *
    2.69 
    2.70 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
    2.71 
    2.72 -                  VirtProcr *animPr );
    2.73 
    2.74 -
    2.75 
    2.76 -void
    2.77 
    2.78 -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr );
    2.79 
    2.80 -
    2.81 
    2.82 -SubMatrix **
    2.83 
    2.84 -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    2.85 
    2.86 -                   Matrix *origMatrix, VirtProcr *animPr );
    2.87 
    2.88 -
    2.89 
    2.90 -void
    2.91 
    2.92 -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    2.93 
    2.94 -                 SubMatrix **subMatrices, VirtProcr *animPr );
    2.95 
    2.96 -
    2.97 
    2.98 -void
    2.99 
   2.100 -pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices,
   2.101 
   2.102 -                                    SubMatrix **rightSubMatrices,
   2.103 
   2.104 -                                    int32 numRowIdxs, int32 numColIdxs,
   2.105 
   2.106 -                                    int32 numVecIdxs,
   2.107 
   2.108 -                                    float32 *resultArray,
   2.109 
   2.110 -                                    VirtProcr *animatingPr );
   2.111 
   2.112 -
   2.113 
   2.114 -void
   2.115 
   2.116 -makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix,
   2.117 
   2.118 -            SlicingStrucCarrier *slicingStrucCarrier,
   2.119 
   2.120 -            float32 *resultArray, VirtProcr *animatingPr );
   2.121 
   2.122 -
   2.123 
   2.124 -//===========================================================================
   2.125 
   2.126 -
   2.127 
   2.128 -/*Divider creates one processor for every sub-matrix
   2.129 
   2.130 - * It hands them:
   2.131 
   2.132 - *  the name of the result processor that they should send their results to,
   2.133 
   2.134 - *  the left and right matrices, and the rows and cols they should multiply
   2.135 
   2.136 - * It first creates the result processor, then all the sub-matrixPair
   2.137 
   2.138 - *  processors,
   2.139 
   2.140 - *  then does a receive of a message from the result processor that gives
   2.141 
   2.142 - *  the divider ownership of the result matrix.
   2.143 
   2.144 - * Finally, the divider returns the result matrix out of the VCilk system.
   2.145 
   2.146 - *
   2.147 
   2.148 - * Divider chooses the size of sub-matrices via an algorithm that tries to
   2.149 
   2.150 - *  keep the minimum work above a threshold.  The threshold is machine-
   2.151 
   2.152 - *  dependent, so ask VCilk for min work-unit time to get a
   2.153 
   2.154 - *  given overhead
   2.155 
   2.156 - *
   2.157 
   2.158 - * Divide min work-unit cycles by measured-cycles for one matrix-cell
   2.159 
   2.160 - *  product -- gives the number of products need to have in min size
   2.161 
   2.162 - *  matrix.
   2.163 
   2.164 - *
   2.165 
   2.166 - * So then, take cubed root of this to get the size of a side of min sub-
   2.167 
   2.168 - *  matrix.  That is the size of the ideal square sub-matrix -- so tile
   2.169 
   2.170 - *  up the two input matrices into ones as close as possible to that size,
   2.171 
   2.172 - *  and create the pairs of sub-matrices.
   2.173 
   2.174 - *
   2.175 
   2.176 - *========================  STRATEGIC OVERVIEW  =======================
   2.177 
   2.178 - *
   2.179 
   2.180 - *This division is a bit tricky, because have to create things in advance
   2.181 
   2.182 - * that it's not at first obvious need to be created..
   2.183 
   2.184 - *
   2.185 
   2.186 - *First slice up each dimension -- three of them..  this is because will have
   2.187 
   2.188 - * to create the sub-matrix's data-structures before pairing the sub-matrices
   2.189 
   2.190 - * with each other -- so, have three dimensions to slice up before can
   2.191 
   2.192 - * create the sub-matrix data-strucs -- also, have to be certain that the
   2.193 
   2.194 - * cols of the left input have the exact same slicing as the rows of the
   2.195 
   2.196 - * left matrix, so just to be sure, do the slicing calc once, then use it
   2.197 
   2.198 - * for both.
   2.199 
   2.200 - *
   2.201 
   2.202 - *So, goes like this:
   2.203 
   2.204 - *1) calculate the start & end values of each dimension in each matrix.
   2.205 
   2.206 - *2) use those values to create sub-matrix structures
   2.207 
   2.208 - *3) combine sub-matrices into pairs, as the tasks to perform.
   2.209 
   2.210 - *
   2.211 
   2.212 - *Have to calculate separately from creating the sub-matrices because of the
   2.213 
   2.214 - * nature of the nesting -- would either end up creating the same sub-matrix
   2.215 
   2.216 - * multiple times, or else would have to put in detection of whether had
   2.217 
   2.218 - * made a particular one already if tried to combine steps 1 and 2.
   2.219 
   2.220 - *
   2.221 
   2.222 - *Step 3 has to be separate because of the nesting, as well -- same reason,
   2.223 
   2.224 - * would either create same sub-matrix multiple times, or else have to
   2.225 
   2.226 - * add detection of whether was already created.
   2.227 
   2.228 - *
   2.229 
   2.230 - *Another way to look at it: there's one level of loop to divide dimensions,
   2.231 
   2.232 - * two levels of nesting to create sub-matrices, and three levels to pair
   2.233 
   2.234 - * up the sub-matrices.
   2.235 
   2.236 - */
   2.237 
   2.238 -
   2.239 
   2.240 -void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
   2.241 
   2.242 -                                        VirtProcr *animPr )
   2.243 
   2.244 - { 
   2.245 
   2.246 -   DividerParams   *dividerParams;
   2.247 
   2.248 -   ResultsParams   *resultsParams;
   2.249 
   2.250 -   Matrix          *leftMatrix, *rightMatrix, *resultMatrix;
   2.251 
   2.252 -   void            *msg;
   2.253 
   2.254 -   SlicingStrucCarrier *slicingStrucCarrier;
   2.255 
   2.256 -   float32         *resultArray; //points to array to be put inside result
   2.257 
   2.258 -                                 // matrix
   2.259 
   2.260 -   
   2.261 
   2.262 -         DEBUG( dbgAppFlow, "start divide\n")
   2.263 
   2.264 -
   2.265 
   2.266 -         int32
   2.267 
   2.268 -         divideProbe = VMS__create_single_interval_probe( "divideProbe",
   2.269 
   2.270 -                                                          animPr );
   2.271 
   2.272 -         VMS__record_sched_choice_into_probe( divideProbe, animPr );
   2.273 
   2.274 -         VMS__record_interval_start_in_probe( divideProbe );
   2.275 
   2.276 -
   2.277 
   2.278 -   //=========== Setup -- make local copies of ptd-to-things, malloc, aso
   2.279 
   2.280 -   int32 numResRows, numResCols, vectLength;
   2.281 
   2.282 -
   2.283 
   2.284 -   dividerParams   = (DividerParams *)_dividerParams;
   2.285 
   2.286 -   
   2.287 
   2.288 -   leftMatrix      = dividerParams->leftMatrix;
   2.289 
   2.290 -   rightMatrix     = dividerParams->rightMatrix;
   2.291 
   2.292 -
   2.293 
   2.294 -   vectLength  = leftMatrix->numCols;
   2.295 
   2.296 -   numResRows  = leftMatrix->numRows;
   2.297 
   2.298 -   numResCols  = rightMatrix->numCols;
   2.299 
   2.300 -   resultArray = dividerParams->resultMatrix->array;
   2.301 
   2.302 -   
   2.303 
   2.304 -      //zero the result array
   2.305 
   2.306 -   memset( resultArray, 0, numResRows * numResCols * sizeof(float32) );
   2.307 
   2.308 -
   2.309 
   2.310 -   
   2.311 
   2.312 -   //==============  Do either sequential mult or do division ==============
   2.313 
   2.314 -
   2.315 
   2.316 -      //Check if input matrices too small -- if yes, just do sequential
   2.317 
   2.318 -      //Cutoff is determined by overhead of this divider -- relatively
   2.319 
   2.320 -      // machine-independent
   2.321 
   2.322 -   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
   2.323 
   2.324 -       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
   2.325 
   2.326 -    { int32 vectLength;
   2.327 
   2.328 -
   2.329 
   2.330 -      //====== Do sequential multiply on a single core
   2.331 
   2.332 -            DEBUG( dbgAppFlow, "doing sequential")
   2.333 
   2.334 -
   2.335 
   2.336 -         //transpose the right matrix
   2.337 
   2.338 -      float32 *
   2.339 
   2.340 -      transRightArray  = VCilk__malloc( rightMatrix->numRows *
   2.341 
   2.342 -                                        rightMatrix->numCols *
   2.343 
   2.344 -                                        sizeof(float32),       animPr );
   2.345 
   2.346 -
   2.347 
   2.348 -         //copy values from orig matrix to local
   2.349 
   2.350 -      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
   2.351 
   2.352 -                     0, 0, rightMatrix->numRows,
   2.353 
   2.354 -                     transRightArray, rightMatrix->array );
   2.355 
   2.356 -
   2.357 
   2.358 -      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
   2.359 
   2.360 -                            leftMatrix->array, transRightArray,
   2.361 
   2.362 -                            resultArray );
   2.363 
   2.364 -    }
   2.365 
   2.366 -   else
   2.367 
   2.368 -    {
   2.369 
   2.370 -      //====== Do parallel multiply across cores
   2.371 
   2.372 -
   2.373 
   2.374 -         //Calc the ideal size of sub-matrix and slice up the dimensions of
   2.375 
   2.376 -         // the two matrices.
   2.377 
   2.378 -         //The ideal size is the one takes the number of cycles to calculate
   2.379 
   2.380 -         // such that calc time is equal or greater than min work-unit size
   2.381 
   2.382 -      slicingStrucCarrier =
   2.383 
   2.384 -         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix,  animPr);
   2.385 
   2.386 -                                         
   2.387 
   2.388 -
   2.389 
   2.390 -
   2.391 
   2.392 -         //Make the sub-matrices, and pair them up, then spawn processors to
   2.393 
   2.394 -         // calc product of each pair.
   2.395 
   2.396 -      makeSubMatricesAndSpawnAndSync( leftMatrix, rightMatrix,
   2.397 
   2.398 -                                      slicingStrucCarrier,
   2.399 
   2.400 -                                      resultArray, animPr);
   2.401 
   2.402 -         //The result array will get filled in by the spawned children
   2.403 
   2.404 -    }
   2.405 
   2.406 -
   2.407 
   2.408 -
   2.409 
   2.410 -   //===============  Work done -- send results back =================
   2.411 
   2.412 -
   2.413 
   2.414 -
   2.415 
   2.416 -      //results have been saved into an array that was made outside the VMS
   2.417 
   2.418 -      // system, by entry-point Fn, and passed in through dividerParams.
   2.419 
   2.420 -      //So, nothing to do to send results back -- they're seen by side-effect
   2.421 
   2.422 -
   2.423 
   2.424 -         DEBUG( dbgAppFlow, "*** end divide ***\n")
   2.425 
   2.426 -
   2.427 
   2.428 -         VMS__record_interval_end_in_probe( divideProbe );
   2.429 
   2.430 -         VMS__print_stats_of_all_probes();
   2.431 
   2.432 -
   2.433 
   2.434 -   VCilk__dissipate_procr( animPr );  //all procrs dissipate self at end
   2.435 
   2.436 -      //when all of the processors have dissipated, the "create seed and do
   2.437 
   2.438 -      // work" call in the entry point function returns
   2.439 
   2.440 - }
   2.441 
   2.442 -
   2.443 
   2.444 -
   2.445 
   2.446 -SlicingStrucCarrier *
   2.447 
   2.448 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
   2.449 
   2.450 -                                 VirtProcr *animPr )
   2.451 
   2.452 -{
   2.453 
   2.454 -   float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
   2.455 
   2.456 -   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   2.457 
   2.458 -   SlicingStrucCarrier *slicingStrucCarrier =
   2.459 
   2.460 -                         VCilk__malloc(sizeof(SlicingStrucCarrier), animPr );
   2.461 
   2.462 -
   2.463 
   2.464 -   int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
   2.465 
   2.466 -   float64 numPrimitiveOpsInMinWorkUnit;
   2.467 
   2.468 -
   2.469 
   2.470 -
   2.471 
   2.472 -   //=======  Calc ideal size of min-sized sub-matrix  ========
   2.473 
   2.474 -
   2.475 
   2.476 -      //ask VCilk for the number of cycles of the minimum work unit, at given
   2.477 
   2.478 -      // percent overhead then add a guess at overhead from this divider
   2.479 
   2.480 -   minWorkUnitCycles = VCilk__giveMinWorkUnitCycles( .05 );
   2.481 
   2.482 -
   2.483 
   2.484 -      //ask VCilk for number of cycles of the "primitive" op of matrix mult
   2.485 
   2.486 -   primitiveCycles = measureMatrixMultPrimitive( animPr );
   2.487 
   2.488 -
   2.489 
   2.490 -   numPrimitiveOpsInMinWorkUnit =
   2.491 
   2.492 -      (float64)minWorkUnitCycles / (float64)primitiveCycles;
   2.493 
   2.494 -
   2.495 
   2.496 -      //take cubed root -- that's number of these in a "side" of sub-matrix
   2.497 
   2.498 -      // then multiply by 5 because the primitive is 5x5
   2.499 
   2.500 -   idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit );
   2.501 
   2.502 -
   2.503 
   2.504 -   idealNumWorkUnits = VCilk__giveIdealNumWorkUnits();
   2.505 
   2.506 -   
   2.507 
   2.508 -   idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
   2.509 
   2.510 -   idealSizeOfSide2 *= 0.8; //finer granularity to help load balance
   2.511 
   2.512 -
   2.513 
   2.514 -   if( idealSizeOfSide1 > idealSizeOfSide2 )
   2.515 
   2.516 -      idealSizeOfSide = idealSizeOfSide1;
   2.517 
   2.518 -   else
   2.519 
   2.520 -      idealSizeOfSide = idealSizeOfSide2;
   2.521 
   2.522 -
   2.523 
   2.524 -      //The multiply inner loop blocks the array to fit into L1 cache
   2.525 
   2.526 -//   if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK;
   2.527 
   2.528 -
   2.529 
   2.530 -   //============  Slice up dimensions, now that know target size ===========
   2.531 
   2.532 -
   2.533 
   2.534 -      //Tell the slicer the target size of a side (floating pt), the start
   2.535 
   2.536 -      // value to start slicing at, and the end value to stop slicing at
   2.537 
   2.538 -      //It returns an array of start value of each chunk, plus number of them
   2.539 
   2.540 -   int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol;
   2.541 
   2.542 -   startLeftRow  = 0;
   2.543 
   2.544 -   endLeftRow    = leftMatrix->numRows -1;
   2.545 
   2.546 -   startVec      = 0;
   2.547 
   2.548 -   endVec        = leftMatrix->numCols -1;
   2.549 
   2.550 -   startRightCol = 0;
   2.551 
   2.552 -   endRightCol   = rightMatrix->numCols -1;
   2.553 
   2.554 -
   2.555 
   2.556 -   leftRowSlices =
   2.557 
   2.558 -      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow, animPr );
   2.559 
   2.560 -
   2.561 
   2.562 -   vecSlices =
   2.563 
   2.564 -      sliceUpDimension( idealSizeOfSide,  startVec, endVec, animPr );
   2.565 
   2.566 -
   2.567 
   2.568 -   rightColSlices =
   2.569 
   2.570 -      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol,animPr);
   2.571 
   2.572 -
   2.573 
   2.574 -   slicingStrucCarrier->leftRowSlices  = leftRowSlices;
   2.575 
   2.576 -   slicingStrucCarrier->vecSlices      = vecSlices;
   2.577 
   2.578 -   slicingStrucCarrier->rightColSlices = rightColSlices;
   2.579 
   2.580 -
   2.581 
   2.582 -         DEBUG1( dbgAppFlow, "leftRowSlices %d | ", leftRowSlices->numVals );
   2.583 
   2.584 -         DEBUG1( dbgAppFlow, "rightColSlices %d | ",rightColSlices->numVals);
   2.585 
   2.586 -         DEBUG1( dbgAppFlow, "vecSlices %d\n", vecSlices->numVals );
   2.587 
   2.588 -   return slicingStrucCarrier;
   2.589 
   2.590 -}
   2.591 
   2.592 -
   2.593 
   2.594 -
   2.595 
   2.596 -void inline
   2.597 
   2.598 -makeSubMatricesAndSpawnAndSync( Matrix  *leftMatrix,  Matrix    *rightMatrix,
   2.599 
   2.600 -                         SlicingStrucCarrier *slicingStrucCarrier,
   2.601 
   2.602 -                         float32 *resultArray, VirtProcr *animPr )
   2.603 
   2.604 - {
   2.605 
   2.606 -   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   2.607 
   2.608 -   
   2.609 
   2.610 -   leftRowSlices  = slicingStrucCarrier->leftRowSlices;
   2.611 
   2.612 -   vecSlices      = slicingStrucCarrier->vecSlices;
   2.613 
   2.614 -   rightColSlices = slicingStrucCarrier->rightColSlices;
   2.615 
   2.616 -   VCilk__free( slicingStrucCarrier, animPr );
   2.617 
   2.618 -   
   2.619 
   2.620 -   //================  Make sub-matrices, given the slicing  ================
   2.621 
   2.622 -   SubMatrix **leftSubMatrices, **rightSubMatrices;
   2.623 
   2.624 -   leftSubMatrices =
   2.625 
   2.626 -      createSubMatrices( leftRowSlices, vecSlices,
   2.627 
   2.628 -                         leftMatrix, animPr );
   2.629 
   2.630 -   rightSubMatrices =
   2.631 
   2.632 -      createSubMatrices( vecSlices, rightColSlices,
   2.633 
   2.634 -                         rightMatrix, animPr );
   2.635 
   2.636 -
   2.637 
   2.638 -   freeSlicingStruc( leftRowSlices, animPr );
   2.639 
   2.640 -   freeSlicingStruc( vecSlices, animPr );
   2.641 
   2.642 -   freeSlicingStruc( rightColSlices, animPr );
   2.643 
   2.644 -
   2.645 
   2.646 -   //==============  pair the sub-matrices and make processors ==============
   2.647 
   2.648 -   int32 numRowIdxs, numColIdxs, numVecIdxs;
   2.649 
   2.650 -
   2.651 
   2.652 -   numRowIdxs = leftRowSlices->numVals;
   2.653 
   2.654 -   numColIdxs = rightColSlices->numVals;
   2.655 
   2.656 -   numVecIdxs = vecSlices->numVals;
   2.657 
   2.658 -   pairUpSubMatricesAndSpawnAndSync( leftSubMatrices, rightSubMatrices,
   2.659 
   2.660 -                                     numRowIdxs, numColIdxs, numVecIdxs,
   2.661 
   2.662 -                                     resultArray,
   2.663 
   2.664 -                                     animPr );
   2.665 
   2.666 -      //It syncs inside, so know all work is done now: free the sub-matrices
   2.667 
   2.668 -   freeSubMatrices( leftRowSlices, vecSlices,  leftSubMatrices, animPr );
   2.669 
   2.670 -   freeSubMatrices( vecSlices, rightColSlices, rightSubMatrices, animPr );
   2.671 
   2.672 - }
   2.673 
   2.674 -
   2.675 
   2.676 -
   2.677 
   2.678 -
   2.679 
   2.680 -
   2.681 
   2.682 -/* numRows*colsPerRow/numCores = numToPutOntoEachCore; 
   2.683 
   2.684 - * put all from a given row onto same core, until exhaust allotment for that
   2.685 
   2.686 - *  core
   2.687 
   2.688 - *
   2.689 
   2.690 - */
   2.691 
   2.692 -void inline
   2.693 
   2.694 -pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices,
   2.695 
   2.696 -                                    SubMatrix **rightSubMatrices,
   2.697 
   2.698 -                                    int32 numRowIdxs, int32 numColIdxs,
   2.699 
   2.700 -                                    int32 numVecIdxs,
   2.701 
   2.702 -                                    float32 *resultArray,
   2.703 
   2.704 -                                    VirtProcr *animatingPr )
   2.705 
   2.706 - {
   2.707 
   2.708 -   int32 resRowIdx, resColIdx;
   2.709 
   2.710 -   int32 numLeftColIdxs, numRightColIdxs;
   2.711 
   2.712 -   int32 leftRowIdxOffset;
   2.713 
   2.714 -   VecParams *vecParams;
   2.715 
   2.716 -   float32 numToPutOntoEachCore, leftOverFraction;
   2.717 
   2.718 -   int32 numCores, currCore, numOnCurrCore;
   2.719 
   2.720 -
   2.721 
   2.722 -   numLeftColIdxs  = numColIdxs;
   2.723 
   2.724 -   numRightColIdxs = numVecIdxs;
   2.725 
   2.726 -
   2.727 
   2.728 -   numCores = VCilk__give_number_of_cores_to_spawn_onto();   
   2.729 
   2.730 -
   2.731 
   2.732 -   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
   2.733 
   2.734 -   leftOverFraction = 0;
   2.735 
   2.736 -   numOnCurrCore = 0;
   2.737 
   2.738 -   currCore = 0;
   2.739 
   2.740 -
   2.741 
   2.742 -   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
   2.743 
   2.744 -    {
   2.745 
   2.746 -      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
   2.747 
   2.748 -
   2.749 
   2.750 -      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
   2.751 
   2.752 -       {
   2.753 
   2.754 -         vecParams = VCilk__malloc( sizeof(VecParams), animatingPr );
   2.755 
   2.756 -         
   2.757 
   2.758 -         vecParams->numVecIdxs       = numVecIdxs;
   2.759 
   2.760 -         vecParams->numRightColIdxs  = numRightColIdxs;
   2.761 
   2.762 -         vecParams->leftRowIdxOffset = leftRowIdxOffset;
   2.763 
   2.764 -         vecParams->resColIdx        = resColIdx;
   2.765 
   2.766 -         vecParams->leftSubMatrices  = leftSubMatrices;
   2.767 
   2.768 -         vecParams->rightSubMatrices = rightSubMatrices;
   2.769 
   2.770 -         vecParams->resultArray      = resultArray;
   2.771 
   2.772 -         vecParams->coreToRunOn      = currCore;
   2.773 
   2.774 -
   2.775 
   2.776 -         VCilk__spawn( currCore, &calcVectorOfSubMatrices, vecParams,
   2.777 
   2.778 -                       animatingPr );
   2.779 
   2.780 -
   2.781 
   2.782 -         numOnCurrCore += 1;
   2.783 
   2.784 -         if( numOnCurrCore + leftOverFraction >= numToPutOntoEachCore - 1 )
   2.785 
   2.786 -          {
   2.787 
   2.788 -               //deal with fractional part, to ensure that imbalance is 1 max
   2.789 
   2.790 -               // IE, core with most has only 1 more than core with least
   2.791 
   2.792 -            leftOverFraction += numToPutOntoEachCore - numOnCurrCore;
   2.793 
   2.794 -            if( leftOverFraction >= 1 )
   2.795 
   2.796 -             { leftOverFraction -= 1;
   2.797 
   2.798 -               numOnCurrCore = -1;
   2.799 
   2.800 -             }
   2.801 
   2.802 -            else
   2.803 
   2.804 -             { numOnCurrCore = 0;
   2.805 
   2.806 -             }
   2.807 
   2.808 -               //Move to next core, max core-value to incr to is numCores -1
   2.809 
   2.810 -            if( currCore >= numCores -1 )
   2.811 
   2.812 -             { currCore = 0;
   2.813 
   2.814 -             }
   2.815 
   2.816 -            else
   2.817 
   2.818 -             { currCore += 1;
   2.819 
   2.820 -             }
   2.821 
   2.822 -          }
   2.823 
   2.824 -       }
   2.825 
   2.826 -    }
   2.827 
   2.828 -   
   2.829 
   2.830 -   //Free Note: vector of sub-matrices does its own free-ing, even vec-params
   2.831 
   2.832 -
   2.833 
   2.834 -//TODO: timeToSpawnProbe = VMS__get_probe_by_name( "timeToSpawnProbe" );
   2.835 
   2.836 -//      VMS__end_interval_on_probe( timeToSpawnProbe );
   2.837 
   2.838 -
   2.839 
   2.840 -   VCilk__sync( animatingPr );
   2.841 
   2.842 -
   2.843 
   2.844 -   //free the sub-matrices in Fn that called this one
   2.845 
   2.846 - }
   2.847 
   2.848 -
   2.849 
   2.850 -
   2.851 
   2.852 -/*Walk through the two slice-strucs, making sub-matrix strucs as go
   2.853 
   2.854 - */
   2.855 
   2.856 -SubMatrix **
   2.857 
   2.858 -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   2.859 
   2.860 -                   Matrix *origMatrix, VirtProcr *animPr )
   2.861 
   2.862 - {
   2.863 
   2.864 -   int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
   2.865 
   2.866 -   int32 startRow, endRow, startCol, endCol;
   2.867 
   2.868 -   int32 *rowStartVals, *colStartVals;
   2.869 
   2.870 -   int32 rowOffset;
   2.871 
   2.872 -   SubMatrix **subMatrices, *newSubMatrix;
   2.873 
   2.874 -
   2.875 
   2.876 -   numRowIdxs = rowSlices->numVals;
   2.877 
   2.878 -   numColIdxs = colSlices->numVals;
   2.879 
   2.880 -
   2.881 
   2.882 -   rowStartVals = rowSlices->startVals;
   2.883 
   2.884 -   colStartVals = colSlices->startVals;
   2.885 
   2.886 -
   2.887 
   2.888 -   subMatrices = VCilk__malloc( numRowIdxs * numColIdxs *sizeof(SubMatrix *),
   2.889 
   2.890 -                                animPr );
   2.891 
   2.892 -
   2.893 
   2.894 -   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   2.895 
   2.896 -    {
   2.897 
   2.898 -      rowOffset = rowIdx * numColIdxs;
   2.899 
   2.900 -      
   2.901 
   2.902 -      startRow  = rowStartVals[rowIdx];
   2.903 
   2.904 -      endRow    = rowStartVals[rowIdx + 1] -1; //"fake" start above last is
   2.905 
   2.906 -                                               // at last valid idx + 1 & is
   2.907 
   2.908 -                                               // 1 greater than end value
   2.909 
   2.910 -      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
   2.911 
   2.912 -       {
   2.913 
   2.914 -         startCol = colStartVals[colIdx];
   2.915 
   2.916 -         endCol   = colStartVals[colIdx + 1] -1;
   2.917 
   2.918 -
   2.919 
   2.920 -         newSubMatrix = VCilk__malloc( sizeof(SubMatrix), animPr );
   2.921 
   2.922 -         newSubMatrix->numRows       = endRow - startRow +1;
   2.923 
   2.924 -         newSubMatrix->numCols       = endCol - startCol +1;
   2.925 
   2.926 -         newSubMatrix->origMatrix    = origMatrix;
   2.927 
   2.928 -         newSubMatrix->origStartRow  = startRow;
   2.929 
   2.930 -         newSubMatrix->origStartCol  = startCol;
   2.931 
   2.932 -         newSubMatrix->alreadyCopied = FALSE;
   2.933 
   2.934 -
   2.935 
   2.936 -         subMatrices[ rowOffset + colIdx ] = newSubMatrix;
   2.937 
   2.938 -       }
   2.939 
   2.940 -    }
   2.941 
   2.942 -   return subMatrices;
   2.943 
   2.944 - }
   2.945 
   2.946 -
   2.947 
   2.948 -void
   2.949 
   2.950 -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   2.951 
   2.952 -                 SubMatrix **subMatrices, VirtProcr *animPr )
   2.953 
   2.954 - {
   2.955 
   2.956 -   int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset;
   2.957 
   2.958 -   SubMatrix *subMatrix;
   2.959 
   2.960 -
   2.961 
   2.962 -   numRowIdxs = rowSlices->numVals;
   2.963 
   2.964 -   numColIdxs = colSlices->numVals;
   2.965 
   2.966 -
   2.967 
   2.968 -   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   2.969 
   2.970 -    {
   2.971 
   2.972 -      rowOffset = rowIdx * numColIdxs;
   2.973 
   2.974 -      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
   2.975 
   2.976 -       {
   2.977 
   2.978 -         subMatrix = subMatrices[ rowOffset + colIdx ];
   2.979 
   2.980 -         if( subMatrix->alreadyCopied )
   2.981 
   2.982 -            VCilk__free( subMatrix->array, animPr );
   2.983 
   2.984 -         VCilk__free( subMatrix, animPr );
   2.985 
   2.986 -       }
   2.987 
   2.988 -    }
   2.989 
   2.990 -   VCilk__free( subMatrices, animPr );
   2.991 
   2.992 - }
   2.993 
   2.994 -
   2.995 
   2.996 -
   2.997 
   2.998 -
   2.999 
  2.1000 -SlicingStruc *
  2.1001 
  2.1002 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
  2.1003 
  2.1004 -                  VirtProcr *animPr )
  2.1005 
  2.1006 - { float32 residualAcc = 0;
  2.1007 
  2.1008 -   int     numSlices, i, *startVals, sizeOfSlice, endCondition;
  2.1009 
  2.1010 -   SlicingStruc *slicingStruc = VCilk__malloc( sizeof(SlicingStruc), animPr);
  2.1011 
  2.1012 -
  2.1013 
  2.1014 -      //calc size of matrix need to hold start vals --
  2.1015 
  2.1016 -   numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
  2.1017 
  2.1018 -
  2.1019 
  2.1020 -   startVals = VCilk__malloc( (numSlices + 1) * sizeof(int32), animPr );
  2.1021 
  2.1022 -
  2.1023 
  2.1024 -      //Calc the upper limit of start value -- when get above this, end loop
  2.1025 
  2.1026 -      // by saving highest value of the matrix dimension to access, plus 1
  2.1027 
  2.1028 -      // as the start point of the imaginary slice following the last one
  2.1029 
  2.1030 -      //Plus 1 because go up to value but not include when process last slice
  2.1031 
  2.1032 -      //The stopping condition is half-a-size less than highest value because
  2.1033 
  2.1034 -      // don't want any pieces smaller than half the ideal size -- just tack
  2.1035 
  2.1036 -      // little ones onto end of last one
  2.1037 
  2.1038 -   endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size
  2.1039 
  2.1040 -   for( i = 0; startVal <= endVal; i++ )
  2.1041 
  2.1042 -    {
  2.1043 
  2.1044 -      startVals[i] = startVal;
  2.1045 
  2.1046 -      residualAcc += idealSizeOfSide;
  2.1047 
  2.1048 -      sizeOfSlice  = (int)residualAcc;
  2.1049 
  2.1050 -      residualAcc -= (float32)sizeOfSlice;
  2.1051 
  2.1052 -      startVal    += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..
  2.1053 
  2.1054 -
  2.1055 
  2.1056 -      if( startVal > endCondition )
  2.1057 
  2.1058 -       { startVal = endVal + 1;
  2.1059 
  2.1060 -         startVals[ i + 1 ] = startVal;
  2.1061 
  2.1062 -       }
  2.1063 
  2.1064 -    }
  2.1065 
  2.1066 -
  2.1067 
  2.1068 -   slicingStruc->startVals = startVals;
  2.1069 
  2.1070 -   slicingStruc->numVals   = i;  //loop incr'd, so == last valid start idx+1
  2.1071 
  2.1072 -                                 // which means is num sub-matrices in dim
  2.1073 
  2.1074 -                                 // also == idx of the fake start just above
  2.1075 
  2.1076 -   return slicingStruc;
  2.1077 
  2.1078 - }
  2.1079 
  2.1080 -
  2.1081 
  2.1082 -void
  2.1083 
  2.1084 -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr )
  2.1085 
  2.1086 - {
  2.1087 
  2.1088 -   VCilk__free( slicingStruc->startVals, animPr );
  2.1089 
  2.1090 -   VCilk__free( slicingStruc, animPr );
  2.1091 
  2.1092 - }
  2.1093 
  2.1094 -
  2.1095 
  2.1096 -
  2.1097 
  2.1098 -int inline
  2.1099 
  2.1100 -measureMatrixMultPrimitive( VirtProcr *animPr )
  2.1101 
  2.1102 - {
  2.1103 
  2.1104 -   int r, c, v, numCycles;
  2.1105 
  2.1106 -   float32 *res, *left, *right;
  2.1107 
  2.1108 -
  2.1109 
  2.1110 -      //setup inputs
  2.1111 
  2.1112 -   left  = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr );
  2.1113 
  2.1114 -   right = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr );
  2.1115 
  2.1116 -   res   = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr );
  2.1117 
  2.1118 -
  2.1119 
  2.1120 -   for( r = 0; r < 5; r++ )
  2.1121 
  2.1122 -    {
  2.1123 
  2.1124 -      for( c = 0; c < 5; c++ )
  2.1125 
  2.1126 -       {
  2.1127 
  2.1128 -         left[  r * 5 + c ] = r;
  2.1129 
  2.1130 -         right[ r * 5 + c ] = c;
  2.1131 
  2.1132 -       }
  2.1133 
  2.1134 -    }
  2.1135 
  2.1136 -
  2.1137 
  2.1138 -      //do primitive
  2.1139 
  2.1140 -   VCilk__start_primitive();  //for now, just takes time stamp
  2.1141 
  2.1142 -   for( r = 0; r < 5; r++ )
  2.1143 
  2.1144 -    {
  2.1145 
  2.1146 -      for( c = 0; c < 5; c++ )
  2.1147 
  2.1148 -       {
  2.1149 
  2.1150 -         for( v = 0; v < 5; v++ )
  2.1151 
  2.1152 -          {
  2.1153 
  2.1154 -            res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ];
  2.1155 
  2.1156 -          }
  2.1157 
  2.1158 -       }
  2.1159 
  2.1160 -    }
  2.1161 
  2.1162 -   numCycles =
  2.1163 
  2.1164 -      VCilk__end_primitive_and_give_cycles(); 
  2.1165 
  2.1166 -
  2.1167 
  2.1168 -   VCilk__free( left, animPr );
  2.1169 
  2.1170 -   VCilk__free( right, animPr );
  2.1171 
  2.1172 -   VCilk__free( res, animPr );
  2.1173 
  2.1174 -   
  2.1175 
  2.1176 -   return numCycles;
  2.1177 
  2.1178 - }
  2.1179 
  2.1180 +/*
  2.1181 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
  2.1182 + *  Licensed under GNU General Public License version 2
  2.1183 + *
  2.1184 + * Author: seanhalle@yahoo.com
  2.1185 + *
  2.1186 + */
  2.1187 +
  2.1188 +
  2.1189 +#include "VCilk__Matrix_Mult.h"
  2.1190 +#include <math.h>
  2.1191 +#include <sys/time.h>
  2.1192 +#include <string.h>
  2.1193 +
  2.1194 +   //The time to compute this many result values should equal the time to
  2.1195 +   // perform this division on a matrix whose size gives that many result calcs
  2.1196 +   //IE, size this so that sequential time to calc equals divide time
  2.1197 +   // find the value by experimenting -- but divide time and calc time scale
  2.1198 +   // same way, so this value should remain valid across hardware
  2.1199 +   //Divide time is about 800us on 2.4Ghz core2Quad laptop core
  2.1200 +   //num cells is the cube of a side, when have two square matrices
  2.1201 +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 100000 /* about 46x46 */
  2.1202 +
  2.1203 +
  2.1204 +//===========================================================================
  2.1205 +int inline
  2.1206 +measureMatrixMultPrimitive( VirtProcr *animPr );
  2.1207 +
  2.1208 +SlicingStrucCarrier *
  2.1209 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
  2.1210 +                                 VirtProcr *animPr );
  2.1211 +
  2.1212 +SlicingStruc *
  2.1213 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
  2.1214 +                  VirtProcr *animPr );
  2.1215 +
  2.1216 +void
  2.1217 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr );
  2.1218 +
  2.1219 +SubMatrix **
  2.1220 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
  2.1221 +                   Matrix *origMatrix, VirtProcr *animPr );
  2.1222 +
  2.1223 +void
  2.1224 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
  2.1225 +                 SubMatrix **subMatrices, VirtProcr *animPr );
  2.1226 +
  2.1227 +void
  2.1228 +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices,
  2.1229 +                                    SubMatrix **rightSubMatrices,
  2.1230 +                                    int32 numRowIdxs, int32 numColIdxs,
  2.1231 +                                    int32 numVecIdxs,
  2.1232 +                                    float32 *resultArray,
  2.1233 +                                    VirtProcr *animatingPr );
  2.1234 +
  2.1235 +void
  2.1236 +makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix,
  2.1237 +            SlicingStrucCarrier *slicingStrucCarrier,
  2.1238 +            float32 *resultArray, VirtProcr *animatingPr );
  2.1239 +
  2.1240 +//===========================================================================
  2.1241 +
  2.1242 +/*Divider creates one processor for every sub-matrix
  2.1243 + * It hands them:
  2.1244 + *  the name of the result processor that they should send their results to,
  2.1245 + *  the left and right matrices, and the rows and cols they should multiply
  2.1246 + * It first creates the result processor, then all the sub-matrixPair
  2.1247 + *  processors,
  2.1248 + *  then does a receive of a message from the result processor that gives
  2.1249 + *  the divider ownership of the result matrix.
  2.1250 + * Finally, the divider returns the result matrix out of the VCilk system.
  2.1251 + *
  2.1252 + * Divider chooses the size of sub-matrices via an algorithm that tries to
  2.1253 + *  keep the minimum work above a threshold.  The threshold is machine-
  2.1254 + *  dependent, so ask VCilk for min work-unit time to get a
  2.1255 + *  given overhead
  2.1256 + *
  2.1257 + * Divide min work-unit cycles by measured-cycles for one matrix-cell
  2.1258 + *  product -- gives the number of products need to have in min size
  2.1259 + *  matrix.
  2.1260 + *
  2.1261 + * So then, take cubed root of this to get the size of a side of min sub-
  2.1262 + *  matrix.  That is the size of the ideal square sub-matrix -- so tile
  2.1263 + *  up the two input matrices into ones as close as possible to that size,
  2.1264 + *  and create the pairs of sub-matrices.
  2.1265 + *
  2.1266 + *========================  STRATEGIC OVERVIEW  =======================
  2.1267 + *
  2.1268 + *This division is a bit tricky, because have to create things in advance
  2.1269 + * that it's not at first obvious need to be created..
  2.1270 + *
  2.1271 + *First slice up each dimension -- three of them..  this is because will have
  2.1272 + * to create the sub-matrix's data-structures before pairing the sub-matrices
  2.1273 + * with each other -- so, have three dimensions to slice up before can
  2.1274 + * create the sub-matrix data-strucs -- also, have to be certain that the
  2.1275 + * cols of the left input have the exact same slicing as the rows of the
  2.1276 + * right matrix, so just to be sure, do the slicing calc once, then use it
  2.1277 + * for both.
  2.1278 + *
  2.1279 + *So, goes like this:
  2.1280 + *1) calculate the start & end values of each dimension in each matrix.
  2.1281 + *2) use those values to create sub-matrix structures
  2.1282 + *3) combine sub-matrices into pairs, as the tasks to perform.
  2.1283 + *
  2.1284 + *Have to calculate separately from creating the sub-matrices because of the
  2.1285 + * nature of the nesting -- would either end up creating the same sub-matrix
  2.1286 + * multiple times, or else would have to put in detection of whether had
  2.1287 + * made a particular one already if tried to combine steps 1 and 2.
  2.1288 + *
  2.1289 + *Step 3 has to be separate because of the nesting, as well -- same reason,
  2.1290 + * would either create same sub-matrix multiple times, or else have to
  2.1291 + * add detection of whether was already created.
  2.1292 + *
  2.1293 + *Another way to look at it: there's one level of loop to divide dimensions,
  2.1294 + * two levels of nesting to create sub-matrices, and three levels to pair
  2.1295 + * up the sub-matrices.
  2.1296 + */
  2.1297 +
  2.1298 +void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
  2.1299 +                                        VirtProcr *animPr )
  2.1300 + { 
  2.1301 +   DividerParams   *dividerParams;
  2.1302 +   ResultsParams   *resultsParams;
  2.1303 +   Matrix          *leftMatrix, *rightMatrix, *resultMatrix;
  2.1304 +   void            *msg;
  2.1305 +   SlicingStrucCarrier *slicingStrucCarrier;
  2.1306 +   float32         *resultArray; //points to array to be put inside result
  2.1307 +                                 // matrix
  2.1308 +   
  2.1309 +         DEBUG( dbgAppFlow, "start divide\n")
  2.1310 +
  2.1311 +         int32
  2.1312 +         divideProbe = VMS__create_single_interval_probe( "divideProbe",
  2.1313 +                                                          animPr );
  2.1314 +         VMS__record_sched_choice_into_probe( divideProbe, animPr );
  2.1315 +         VMS__record_interval_start_in_probe( divideProbe );
  2.1316 +
  2.1317 +   //=========== Setup -- make local copies of ptd-to-things, malloc, aso
  2.1318 +   int32 numResRows, numResCols, vectLength;
  2.1319 +      //_dividerParams arrives as void* and is cast to DividerParams* here
  2.1320 +   dividerParams   = (DividerParams *)_dividerParams;
  2.1321 +   
  2.1322 +   leftMatrix      = dividerParams->leftMatrix;
  2.1323 +   rightMatrix     = dividerParams->rightMatrix;
  2.1324 +      //these three sizes are handed to the sequential multiply below
  2.1325 +   vectLength  = leftMatrix->numCols;
  2.1326 +   numResRows  = leftMatrix->numRows;
  2.1327 +   numResCols  = rightMatrix->numCols;
  2.1328 +   resultArray = dividerParams->resultMatrix->array;
  2.1329 +   
  2.1330 +      //zero the result array
  2.1331 +   memset( resultArray, 0, numResRows * numResCols * sizeof(float32) );
  2.1332 +
  2.1333 +   
  2.1334 +   //==============  Do either sequential mult or do division ==============
  2.1335 +
  2.1336 +      //Check if input matrices too small -- if yes, just do sequential
  2.1337 +      //Cutoff is determined by overhead of this divider -- relatively
  2.1338 +      // machine-independent
  2.1339 +   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
  2.1340 +       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
  2.1341 +    {  //uses vectLength set above -- re-declaring it here shadowed it, uninitialized
  2.1342 +         //NOTE(review): transRightArray is not freed on this path -- confirm
  2.1343 +      //====== Do sequential multiply on a single core
  2.1344 +            DEBUG( dbgAppFlow, "doing sequential")
  2.1345 +
  2.1346 +         //transpose the right matrix
  2.1347 +      float32 *
  2.1348 +      transRightArray  = VCilk__malloc( rightMatrix->numRows *
  2.1349 +                                        rightMatrix->numCols *
  2.1350 +                                        sizeof(float32),       animPr );
  2.1351 +
  2.1352 +         //copy values from orig matrix to local
  2.1353 +      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
  2.1354 +                     0, 0, rightMatrix->numRows,
  2.1355 +                     transRightArray, rightMatrix->array );
  2.1356 +
  2.1357 +      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
  2.1358 +                            leftMatrix->array, transRightArray,
  2.1359 +                            resultArray );
  2.1360 +    }
  2.1361 +   else
  2.1362 +    {
  2.1363 +      //====== Do parallel multiply across cores
  2.1364 +
  2.1365 +         //Calc the ideal size of sub-matrix and slice up the dimensions of
  2.1366 +         // the two matrices.
  2.1367 +         //The ideal size is the one that takes the number of cycles to calculate
  2.1368 +         // such that calc time is equal or greater than min work-unit size
  2.1369 +      slicingStrucCarrier =
  2.1370 +         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix,  animPr);
  2.1371 +                                         
  2.1372 +
  2.1373 +
  2.1374 +         //Make the sub-matrices, and pair them up, then spawn processors to
  2.1375 +         // calc product of each pair.
  2.1376 +      makeSubMatricesAndSpawnAndSync( leftMatrix, rightMatrix,
  2.1377 +                                      slicingStrucCarrier,
  2.1378 +                                      resultArray, animPr);
  2.1379 +         //The result array will get filled in by the spawned children
  2.1380 +    }
  2.1381 +
  2.1382 +
  2.1383 +   //===============  Work done -- send results back =================
  2.1384 +
  2.1385 +
  2.1386 +      //results have been saved into an array that was made outside the VMS
  2.1387 +      // system, by entry-point Fn, and passed in through dividerParams.
  2.1388 +      //So, nothing to do to send results back -- they're seen by side-effect
  2.1389 +
  2.1390 +         DEBUG( dbgAppFlow, "*** end divide ***\n")
  2.1391 +
  2.1392 +         VMS__record_interval_end_in_probe( divideProbe );
  2.1393 +         VMS__print_stats_of_all_probes();
  2.1394 +
  2.1395 +   VCilk__dissipate_procr( animPr );  //all procrs dissipate self at end
  2.1396 +      //when all of the processors have dissipated, the "create seed and do
  2.1397 +      // work" call in the entry point function returns
  2.1398 + }
  2.1399 +
  2.1400 +
  2.1401 +SlicingStrucCarrier *
  2.1402 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
  2.1403 +                                 VirtProcr *animPr )
  2.1404 +{
  2.1405 +   float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
  2.1406 +   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
  2.1407 +   SlicingStrucCarrier *slicingStrucCarrier =
  2.1408 +                         VCilk__malloc(sizeof(SlicingStrucCarrier), animPr );
  2.1409 +      //the carrier is returned at the end; it is not freed in this function
  2.1410 +   int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
  2.1411 +   float64 numPrimitiveOpsInMinWorkUnit;
  2.1412 +   //Overview: pick a target side length (the larger of two estimates below),
  2.1413 +   // slice the three dimensions to it, and return the slicings in the carrier
  2.1414 +   //=======  Calc ideal size of min-sized sub-matrix  ========
  2.1415 +
  2.1416 +      //ask VCilk for the number of cycles of the minimum work unit, at given
  2.1417 +      // percent overhead (.05 is passed; no divider-overhead guess is added)
  2.1418 +   minWorkUnitCycles = VCilk__giveMinWorkUnitCycles( .05 );
  2.1419 +
  2.1420 +      //ask VCilk for number of cycles of the "primitive" op of matrix mult
  2.1421 +   primitiveCycles = measureMatrixMultPrimitive( animPr );
  2.1422 +      //NOTE(review): no check that primitiveCycles is non-zero before dividing
  2.1423 +   numPrimitiveOpsInMinWorkUnit =
  2.1424 +      (float64)minWorkUnitCycles / (float64)primitiveCycles;
  2.1425 +
  2.1426 +      //take cubed root -- that's number of these in a "side" of sub-matrix
  2.1427 +      // then multiply by 5 because the primitive is 5x5
  2.1428 +   idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit );
  2.1429 +      //second estimate is driven by the number of work units VCilk reports
  2.1430 +   idealNumWorkUnits = VCilk__giveIdealNumWorkUnits();
  2.1431 +      //only leftMatrix->numRows is used here, not the other two dimensions
  2.1432 +   idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
  2.1433 +   idealSizeOfSide2 *= 0.8; //finer granularity to help load balance
  2.1434 +      //use the larger of the two estimates
  2.1435 +   if( idealSizeOfSide1 > idealSizeOfSide2 )
  2.1436 +      idealSizeOfSide = idealSizeOfSide1;
  2.1437 +   else
  2.1438 +      idealSizeOfSide = idealSizeOfSide2;
  2.1439 +
  2.1440 +      //The multiply inner loop blocks the array to fit into L1 cache
  2.1441 +//   if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK;
  2.1442 +
  2.1443 +   //============  Slice up dimensions, now that know target size ===========
  2.1444 +
  2.1445 +      //Tell the slicer the target size of a side (floating pt), the start
  2.1446 +      // value to start slicing at, and the end value to stop slicing at
  2.1447 +      //It returns an array of start value of each chunk, plus number of them
  2.1448 +   int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol;
  2.1449 +   startLeftRow  = 0;
  2.1450 +   endLeftRow    = leftMatrix->numRows -1;
  2.1451 +   startVec      = 0;
  2.1452 +   endVec        = leftMatrix->numCols -1;
  2.1453 +   startRightCol = 0;
  2.1454 +   endRightCol   = rightMatrix->numCols -1;
  2.1455 +      //the same target size is used for all three dimensions
  2.1456 +   leftRowSlices =
  2.1457 +      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow, animPr );
  2.1458 +
  2.1459 +   vecSlices =
  2.1460 +      sliceUpDimension( idealSizeOfSide,  startVec, endVec, animPr );
  2.1461 +
  2.1462 +   rightColSlices =
  2.1463 +      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol,animPr);
  2.1464 +      //bundle the three slicings so they can be returned together
  2.1465 +   slicingStrucCarrier->leftRowSlices  = leftRowSlices;
  2.1466 +   slicingStrucCarrier->vecSlices      = vecSlices;
  2.1467 +   slicingStrucCarrier->rightColSlices = rightColSlices;
  2.1468 +
  2.1469 +         DEBUG1( dbgAppFlow, "leftRowSlices %d | ", leftRowSlices->numVals );
  2.1470 +         DEBUG1( dbgAppFlow, "rightColSlices %d | ",rightColSlices->numVals);
  2.1471 +         DEBUG1( dbgAppFlow, "vecSlices %d\n", vecSlices->numVals );
  2.1472 +   return slicingStrucCarrier;
  2.1473 +}
  2.1474 +
  2.1475 +
  2.1476 +void inline
  2.1477 +makeSubMatricesAndSpawnAndSync( Matrix  *leftMatrix,  Matrix    *rightMatrix,
  2.1478 +                         SlicingStrucCarrier *slicingStrucCarrier,
  2.1479 +                         float32 *resultArray, VirtProcr *animPr )
  2.1480 + {
  2.1481 +   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
  2.1482 +   
  2.1483 +   leftRowSlices  = slicingStrucCarrier->leftRowSlices;
  2.1484 +   vecSlices      = slicingStrucCarrier->vecSlices;
  2.1485 +   rightColSlices = slicingStrucCarrier->rightColSlices;
  2.1486 +   VCilk__free( slicingStrucCarrier, animPr );
  2.1487 +   
  2.1488 +   //================  Make sub-matrices, given the slicing  ================
  2.1489 +   SubMatrix **leftSubMatrices, **rightSubMatrices;
  2.1490 +   leftSubMatrices =
  2.1491 +      createSubMatrices( leftRowSlices, vecSlices,
  2.1492 +                         leftMatrix, animPr );
  2.1493 +   rightSubMatrices =
  2.1494 +      createSubMatrices( vecSlices, rightColSlices,
  2.1495 +                         rightMatrix, animPr );
  2.1496 +
  2.1497 +   freeSlicingStruc( leftRowSlices, animPr );
  2.1498 +   freeSlicingStruc( vecSlices, animPr );
  2.1499 +   freeSlicingStruc( rightColSlices, animPr );
  2.1500 +
  2.1501 +   //==============  pair the sub-matrices and make processors ==============
  2.1502 +   int32 numRowIdxs, numColIdxs, numVecIdxs;
  2.1503 +
  2.1504 +   numRowIdxs = leftRowSlices->numVals;
  2.1505 +   numColIdxs = rightColSlices->numVals;
  2.1506 +   numVecIdxs = vecSlices->numVals;
  2.1507 +   pairUpSubMatricesAndSpawnAndSync( leftSubMatrices, rightSubMatrices,
  2.1508 +                                     numRowIdxs, numColIdxs, numVecIdxs,
  2.1509 +                                     resultArray,
  2.1510 +                                     animPr );
  2.1511 +      //It syncs inside, so know all work is done now: free the sub-matrices
  2.1512 +   freeSubMatrices( leftRowSlices, vecSlices,  leftSubMatrices, animPr );
  2.1513 +   freeSubMatrices( vecSlices, rightColSlices, rightSubMatrices, animPr );
  2.1514 + }
  2.1515 +
  2.1516 +
  2.1517 +
  2.1518 +
  2.1519 +/* numRows*colsPerRow/numCores = numToPutOntoEachCore; 
  2.1520 + * put all from a given row onto same core, until exhaust allotment for that
  2.1521 + *  core
  2.1522 + *
  2.1523 + */
  2.1524 +void inline
  2.1525 +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices,
  2.1526 +                                    SubMatrix **rightSubMatrices,
  2.1527 +                                    int32 numRowIdxs, int32 numColIdxs,
  2.1528 +                                    int32 numVecIdxs,
  2.1529 +                                    float32 *resultArray,
  2.1530 +                                    VirtProcr *animatingPr )
  2.1531 + {
  2.1532 +   int32 resRowIdx, resColIdx;
  2.1533 +   int32 numLeftColIdxs, numRightColIdxs;
  2.1534 +   int32 leftRowIdxOffset;
  2.1535 +   VecParams *vecParams;
  2.1536 +   float32 numToPutOntoEachCore, leftOverFraction;
  2.1537 +   int32 numCores, currCore, numOnCurrCore;
  2.1538 +
  2.1539 +   numLeftColIdxs  = numColIdxs;
  2.1540 +   numRightColIdxs = numVecIdxs;
  2.1541 +
  2.1542 +   numCores = VCilk__give_number_of_cores_to_spawn_onto();   
  2.1543 +
  2.1544 +   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
  2.1545 +   leftOverFraction = 0;
  2.1546 +   numOnCurrCore = 0;
  2.1547 +   currCore = 0;
  2.1548 +
  2.1549 +   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
  2.1550 +    {
  2.1551 +      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
  2.1552 +
  2.1553 +      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
  2.1554 +       {
  2.1555 +         vecParams = VCilk__malloc( sizeof(VecParams), animatingPr );
  2.1556 +         
  2.1557 +         vecParams->numVecIdxs       = numVecIdxs;
  2.1558 +         vecParams->numRightColIdxs  = numRightColIdxs;
  2.1559 +         vecParams->leftRowIdxOffset = leftRowIdxOffset;
  2.1560 +         vecParams->resColIdx        = resColIdx;
  2.1561 +         vecParams->leftSubMatrices  = leftSubMatrices;
  2.1562 +         vecParams->rightSubMatrices = rightSubMatrices;
  2.1563 +         vecParams->resultArray      = resultArray;
  2.1564 +         vecParams->coreToRunOn      = currCore;
  2.1565 +
  2.1566 +         VCilk__spawn( currCore, &calcVectorOfSubMatrices, vecParams,
  2.1567 +                       animatingPr );
  2.1568 +
  2.1569 +         numOnCurrCore += 1;
  2.1570 +         if( numOnCurrCore + leftOverFraction >= numToPutOntoEachCore - 1 )
  2.1571 +          {
  2.1572 +               //deal with fractional part, to ensure that imbalance is 1 max
  2.1573 +               // IE, core with most has only 1 more than core with least
  2.1574 +            leftOverFraction += numToPutOntoEachCore - numOnCurrCore;
  2.1575 +            if( leftOverFraction >= 1 )
  2.1576 +             { leftOverFraction -= 1;
  2.1577 +               numOnCurrCore = -1;
  2.1578 +             }
  2.1579 +            else
  2.1580 +             { numOnCurrCore = 0;
  2.1581 +             }
  2.1582 +               //Move to next core, max core-value to incr to is numCores -1
  2.1583 +            if( currCore >= numCores -1 )
  2.1584 +             { currCore = 0;
  2.1585 +             }
  2.1586 +            else
  2.1587 +             { currCore += 1;
  2.1588 +             }
  2.1589 +          }
  2.1590 +       }
  2.1591 +    }
  2.1592 +   
  2.1593 +   //Free Note: vector of sub-matrices does its own free-ing, even vec-params
  2.1594 +
  2.1595 +//TODO: timeToSpawnProbe = VMS__get_probe_by_name( "timeToSpawnProbe" );
  2.1596 +//      VMS__end_interval_on_probe( timeToSpawnProbe );
  2.1597 +
  2.1598 +   VCilk__sync( animatingPr );
  2.1599 +
  2.1600 +   //free the sub-matrices in Fn that called this one
  2.1601 + }
  2.1602 +
  2.1603 +
  2.1604 +/*Walk through the two slice-strucs, making sub-matrix strucs as go
  2.1605 + */
  2.1606 +SubMatrix **
  2.1607 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
  2.1608 +                   Matrix *origMatrix, VirtProcr *animPr )
  2.1609 + {
  2.1610 +   int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
  2.1611 +   int32 startRow, endRow, startCol, endCol;
  2.1612 +   int32 *rowStartVals, *colStartVals;
  2.1613 +   int32 rowOffset;
  2.1614 +   SubMatrix **subMatrices, *newSubMatrix;
  2.1615 +
  2.1616 +   numRowIdxs = rowSlices->numVals;
  2.1617 +   numColIdxs = colSlices->numVals;
  2.1618 +
  2.1619 +   rowStartVals = rowSlices->startVals;
  2.1620 +   colStartVals = colSlices->startVals;
  2.1621 +
  2.1622 +   subMatrices = VCilk__malloc( numRowIdxs * numColIdxs *sizeof(SubMatrix *),
  2.1623 +                                animPr );
  2.1624 +
  2.1625 +   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
  2.1626 +    {
  2.1627 +      rowOffset = rowIdx * numColIdxs;
  2.1628 +      
  2.1629 +      startRow  = rowStartVals[rowIdx];
  2.1630 +      endRow    = rowStartVals[rowIdx + 1] -1; //"fake" start above last is
  2.1631 +                                               // at last valid idx + 1 & is
  2.1632 +                                               // 1 greater than end value
  2.1633 +      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
  2.1634 +       {
  2.1635 +         startCol = colStartVals[colIdx];
  2.1636 +         endCol   = colStartVals[colIdx + 1] -1;
  2.1637 +
  2.1638 +         newSubMatrix = VCilk__malloc( sizeof(SubMatrix), animPr );
  2.1639 +         newSubMatrix->numRows       = endRow - startRow +1;
  2.1640 +         newSubMatrix->numCols       = endCol - startCol +1;
  2.1641 +         newSubMatrix->origMatrix    = origMatrix;
  2.1642 +         newSubMatrix->origStartRow  = startRow;
  2.1643 +         newSubMatrix->origStartCol  = startCol;
  2.1644 +         newSubMatrix->alreadyCopied = FALSE;
  2.1645 +         //Prevent uninitialized memory
  2.1646 +         newSubMatrix->copySingleton = NULL;
  2.1647 +         newSubMatrix->copyTransSingleton = NULL;
  2.1648 +
  2.1649 +         subMatrices[ rowOffset + colIdx ] = newSubMatrix;
  2.1650 +       }
  2.1651 +    }
  2.1652 +   return subMatrices;
  2.1653 + }
  2.1654 +
  2.1655 +void
  2.1656 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
  2.1657 +                 SubMatrix **subMatrices, VirtProcr *animPr )
  2.1658 + {
  2.1659 +   int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset;
  2.1660 +   SubMatrix *subMatrix;
  2.1661 +
  2.1662 +   numRowIdxs = rowSlices->numVals;
  2.1663 +   numColIdxs = colSlices->numVals;
  2.1664 +
  2.1665 +   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
  2.1666 +    {
  2.1667 +      rowOffset = rowIdx * numColIdxs;
  2.1668 +      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
  2.1669 +       {
  2.1670 +         subMatrix = subMatrices[ rowOffset + colIdx ];
  2.1671 +         if( subMatrix->alreadyCopied )
  2.1672 +            VCilk__free( subMatrix->array, animPr );
  2.1673 +         VCilk__free( subMatrix, animPr );
  2.1674 +       }
  2.1675 +    }
  2.1676 +   VCilk__free( subMatrices, animPr );
  2.1677 + }
  2.1678 +
  2.1679 +
  2.1680 +
  2.1681 +SlicingStruc *
  2.1682 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
  2.1683 +                  VirtProcr *animPr )
  2.1684 + { float32 residualAcc = 0;
  2.1685 +   int     numSlices, i, *startVals, sizeOfSlice, endCondition;
  2.1686 +   SlicingStruc *slicingStruc = VCilk__malloc( sizeof(SlicingStruc), animPr);
  2.1687 +
  2.1688 +      //calc size of matrix need to hold start vals --
  2.1689 +   numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
  2.1690 +
  2.1691 +   startVals = VCilk__malloc( (numSlices + 1) * sizeof(int32), animPr );
  2.1692 +
  2.1693 +      //Calc the upper limit of start value -- when get above this, end loop
  2.1694 +      // by saving highest value of the matrix dimension to access, plus 1
  2.1695 +      // as the start point of the imaginary slice following the last one
  2.1696 +      //Plus 1 because go up to value but not include when process last slice
  2.1697 +      //The stopping condition is half-a-size less than highest value because
  2.1698 +      // don't want any pieces smaller than half the ideal size -- just tack
  2.1699 +      // little ones onto end of last one
  2.1700 +   endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size
  2.1701 +   for( i = 0; startVal <= endVal; i++ )
  2.1702 +    {
  2.1703 +      startVals[i] = startVal;
  2.1704 +      residualAcc += idealSizeOfSide;
  2.1705 +      sizeOfSlice  = (int)residualAcc;
  2.1706 +      residualAcc -= (float32)sizeOfSlice;
  2.1707 +      startVal    += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..
  2.1708 +
  2.1709 +      if( startVal > endCondition )
  2.1710 +       { startVal = endVal + 1;
  2.1711 +         startVals[ i + 1 ] = startVal;
  2.1712 +       }
  2.1713 +    }
  2.1714 +
  2.1715 +   slicingStruc->startVals = startVals;
  2.1716 +   slicingStruc->numVals   = i;  //loop incr'd, so == last valid start idx+1
  2.1717 +                                 // which means is num sub-matrices in dim
  2.1718 +                                 // also == idx of the fake start just above
  2.1719 +   return slicingStruc;
  2.1720 + }
  2.1721 +
  2.1722 +void
  2.1723 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr )
  2.1724 + {
  2.1725 +   VCilk__free( slicingStruc->startVals, animPr );
  2.1726 +   VCilk__free( slicingStruc, animPr );
  2.1727 + }
  2.1728 +
  2.1729 +
  2.1730 +int inline
  2.1731 +measureMatrixMultPrimitive( VirtProcr *animPr )
  2.1732 + {
  2.1733 +   int r, c, v, numCycles;
  2.1734 +   float32 *res, *left, *right;
  2.1735 +
  2.1736 +      //setup inputs
  2.1737 +   left  = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr );
  2.1738 +   right = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr );
  2.1739 +   res   = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr );
  2.1740 +
  2.1741 +   for( r = 0; r < 5; r++ )
  2.1742 +    {
  2.1743 +      for( c = 0; c < 5; c++ )
  2.1744 +       {
  2.1745 +         left[  r * 5 + c ] = r;
  2.1746 +         right[ r * 5 + c ] = c;
  2.1747 +       }
  2.1748 +    }
  2.1749 +
  2.1750 +      //do primitive
  2.1751 +   VCilk__start_primitive();  //for now, just takes time stamp
  2.1752 +   for( r = 0; r < 5; r++ )
  2.1753 +    {
  2.1754 +      for( c = 0; c < 5; c++ )
  2.1755 +       {
  2.1756 +         for( v = 0; v < 5; v++ )
  2.1757 +          {
  2.1758 +            res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ];
  2.1759 +          }
  2.1760 +       }
  2.1761 +    }
  2.1762 +   numCycles =
  2.1763 +      VCilk__end_primitive_and_give_cycles(); 
  2.1764 +
  2.1765 +   VCilk__free( left, animPr );
  2.1766 +   VCilk__free( right, animPr );
  2.1767 +   VCilk__free( res, animPr );
  2.1768 +   
  2.1769 +   return numCycles;
  2.1770 + }

     3.1 --- a/src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h	Wed May 11 15:40:54 2011 +0200
     3.2 +++ b/src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h	Wed May 11 15:58:04 2011 +0200
     3.3 @@ -1,106 +1,106 @@
     3.4 -/*
     3.5 
     3.6 - *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     3.7 
     3.8 - *  Licensed under GNU General Public License version 2
     3.9 
    3.10 - */
    3.11 
    3.12 -
    3.13 
    3.14 -#ifndef _VCilk_MATRIX_MULT_H_
    3.15 
    3.16 -#define _VCilk_MATRIX_MULT_H_
    3.17 
    3.18 -
    3.19 
    3.20 -#include <stdio.h>
    3.21 
    3.22 -
    3.23 
    3.24 -#include "../../VCilk_lib/VCilk.h"
    3.25 
    3.26 -#include "../Matrix_Mult.h"
    3.27 
    3.28 -#include "../../VCilk_lib/VMS/VMS.h"
    3.29 
    3.30 -
    3.31 
    3.32 -
    3.33 
    3.34 -//===============================  Defines  ==============================
    3.35 
    3.36 -#define ROWS_IN_BLOCK 32
    3.37 
    3.38 -#define COLS_IN_BLOCK 32
    3.39 
    3.40 -#define VEC_IN_BLOCK  32
    3.41 
    3.42 -
    3.43 
    3.44 -#define copyMatrixSingleton 1
    3.45 
    3.46 -#define copyTransposeSingleton 2
    3.47 
    3.48 -
    3.49 
    3.50 -//==============================  Structures  ==============================
    3.51 
    3.52 -typedef struct
    3.53 
    3.54 - {
    3.55 
    3.56 -   Matrix *leftMatrix;
    3.57 
    3.58 -   Matrix *rightMatrix;
    3.59 
    3.60 -   Matrix *resultMatrix;
    3.61 
    3.62 -   
    3.63 
    3.64 -   TSCount numTSCsToExe;
    3.65 
    3.66 - }
    3.67 
    3.68 -DividerParams;
    3.69 
    3.70 -
    3.71 
    3.72 -typedef struct
    3.73 
    3.74 - {
    3.75 
    3.76 -   VirtProcr *dividerPr;
    3.77 
    3.78 -   int numRows;
    3.79 
    3.80 -   int numCols;
    3.81 
    3.82 -   int numSubMatrixPairs;
    3.83 
    3.84 - }
    3.85 
    3.86 -ResultsParams;
    3.87 
    3.88 -
    3.89 
    3.90 -typedef
    3.91 
    3.92 -struct
    3.93 
    3.94 - { int32    numRows;
    3.95 
    3.96 -   int32    numCols;
    3.97 
    3.98 -   Matrix  *origMatrix;
    3.99 
   3.100 -   int32    origStartRow;
   3.101 
   3.102 -   int32    origStartCol;
   3.103 
   3.104 -   int32    alreadyCopied;
   3.105 
   3.106 -   VCilkSingleton *copySingleton;
   3.107 
   3.108 -   VCilkSingleton *copyTransSingleton;
   3.109 
   3.110 -   float32 *array;  //2D, but dynamically sized, so use addr arith
   3.111 
   3.112 - }
   3.113 
   3.114 -SubMatrix;
   3.115 
   3.116 -
   3.117 
   3.118 -typedef struct
   3.119 
   3.120 - { VirtProcr *resultPr;
   3.121 
   3.122 -   SubMatrix *leftSubMatrix;
   3.123 
   3.124 -   SubMatrix *rightSubMatrix;
   3.125 
   3.126 -   float32   *partialResultArray;
   3.127 
   3.128 - }
   3.129 
   3.130 -SMPairParams;
   3.131 
   3.132 -
   3.133 
   3.134 -typedef
   3.135 
   3.136 -struct
   3.137 
   3.138 - { int32    numVals;
   3.139 
   3.140 -   int32   *startVals;
   3.141 
   3.142 - }
   3.143 
   3.144 -SlicingStruc;
   3.145 
   3.146 -
   3.147 
   3.148 -typedef
   3.149 
   3.150 -struct
   3.151 
   3.152 - {
   3.153 
   3.154 -   SlicingStruc *leftRowSlices;
   3.155 
   3.156 -   SlicingStruc *vecSlices;
   3.157 
   3.158 -   SlicingStruc *rightColSlices;
   3.159 
   3.160 - }
   3.161 
   3.162 -SlicingStrucCarrier;
   3.163 
   3.164 -
   3.165 
   3.166 -typedef struct
   3.167 
   3.168 - {
   3.169 
   3.170 -   int32 numVecIdxs;
   3.171 
   3.172 -   int32 numRightColIdxs;
   3.173 
   3.174 -   int32 leftRowIdxOffset;
   3.175 
   3.176 -   int32 resColIdx;
   3.177 
   3.178 -   SubMatrix **leftSubMatrices;
   3.179 
   3.180 -   SubMatrix **rightSubMatrices;
   3.181 
   3.182 -   float32 *resultArray;
   3.183 
   3.184 -   int32 coreToRunOn;
   3.185 
   3.186 - }
   3.187 
   3.188 -VecParams;
   3.189 
   3.190 -
   3.191 
   3.192 -//============================= Processor Functions =========================
   3.193 
   3.194 -void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr );
   3.195 
   3.196 -void calcSubMatrixProduct(              void *data, VirtProcr *animatingPr );
   3.197 
   3.198 -void calcVectorOfSubMatrices(     void *_vecParams, VirtProcr *animatingPr );
   3.199 
   3.200 -
   3.201 
   3.202 -
   3.203 
   3.204 -//================================ Entry Point ==============================
   3.205 
   3.206 -Matrix *
   3.207 
   3.208 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
   3.209 
   3.210 -
   3.211 
   3.212 -
   3.213 
   3.214 -#endif /*_VCilk_MATRIX_MULT_H_*/
   3.215 
   3.216 +/*
   3.217 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
   3.218 + *  Licensed under GNU General Public License version 2
   3.219 + */
   3.220 +
   3.221 +#ifndef _VCilk_MATRIX_MULT_H_
   3.222 +#define _VCilk_MATRIX_MULT_H_
   3.223 +
   3.224 +#include <stdio.h>
   3.225 +
   3.226 +#include "../../VCilk_lib/VCilk.h"
   3.227 +#include "../Matrix_Mult.h"
   3.228 +#include "../../VCilk_lib/VMS/VMS.h"
   3.229 +
   3.230 +
   3.231 +//===============================  Defines  ==============================
   3.232 +#define ROWS_IN_BLOCK 32
   3.233 +#define COLS_IN_BLOCK 32
   3.234 +#define VEC_IN_BLOCK  32
   3.235 +
   3.236 +#define copyMatrixSingleton 1
   3.237 +#define copyTransposeSingleton 2
   3.238 +
   3.239 +//==============================  Structures  ==============================
   3.240 +typedef struct
   3.241 + {
   3.242 +   Matrix *leftMatrix;
   3.243 +   Matrix *rightMatrix;
   3.244 +   Matrix *resultMatrix;
   3.245 +   
   3.246 +   TSCount numTSCsToExe;
   3.247 + }
   3.248 +DividerParams;
   3.249 +
   3.250 +typedef struct
   3.251 + {
   3.252 +   VirtProcr *dividerPr;
   3.253 +   int numRows;
   3.254 +   int numCols;
   3.255 +   int numSubMatrixPairs;
   3.256 + }
   3.257 +ResultsParams;
   3.258 +
   3.259 +typedef
   3.260 +struct
   3.261 + { int32    numRows;
   3.262 +   int32    numCols;
   3.263 +   Matrix  *origMatrix;
   3.264 +   int32    origStartRow;
   3.265 +   int32    origStartCol;
   3.266 +   int32    alreadyCopied;
   3.267 +   VCilkSingleton *copySingleton;
   3.268 +   VCilkSingleton *copyTransSingleton;
   3.269 +   float32 *array;  //2D, but dynamically sized, so use addr arith
   3.270 + }
   3.271 +SubMatrix;
   3.272 +
   3.273 +typedef struct
   3.274 + { VirtProcr *resultPr;
   3.275 +   SubMatrix *leftSubMatrix;
   3.276 +   SubMatrix *rightSubMatrix;
   3.277 +   float32   *partialResultArray;
   3.278 + }
   3.279 +SMPairParams;
   3.280 +
   3.281 +typedef
   3.282 +struct
   3.283 + { int32    numVals;
   3.284 +   int32   *startVals;
   3.285 + }
   3.286 +SlicingStruc;
   3.287 +
   3.288 +typedef
   3.289 +struct
   3.290 + {
   3.291 +   SlicingStruc *leftRowSlices;
   3.292 +   SlicingStruc *vecSlices;
   3.293 +   SlicingStruc *rightColSlices;
   3.294 + }
   3.295 +SlicingStrucCarrier;
   3.296 +
   3.297 +typedef struct
   3.298 + {
   3.299 +   int32 numVecIdxs;
   3.300 +   int32 numRightColIdxs;
   3.301 +   int32 leftRowIdxOffset;
   3.302 +   int32 resColIdx;
   3.303 +   SubMatrix **leftSubMatrices;
   3.304 +   SubMatrix **rightSubMatrices;
   3.305 +   float32 *resultArray;
   3.306 +   int32 coreToRunOn;
   3.307 + }
   3.308 +VecParams;
   3.309 +
   3.310 +//============================= Processor Functions =========================
   3.311 +void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr );
   3.312 +void calcSubMatrixProduct(              void *data, VirtProcr *animatingPr );
   3.313 +void calcVectorOfSubMatrices(     void *_vecParams, VirtProcr *animatingPr );
   3.314 +
   3.315 +
   3.316 +//================================ Entry Point ==============================
   3.317 +Matrix *
   3.318 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
   3.319 +
   3.320 +
   3.321 +#endif /*_VCilk_MATRIX_MULT_H_*/

     4.1 --- a/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c	Wed May 11 15:40:54 2011 +0200
     4.2 +++ b/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c	Wed May 11 15:58:04 2011 +0200
     4.3 @@ -1,308 +1,308 @@
     4.4 -/* 
     4.5 
     4.6 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
     4.7 
     4.8 - *  Licensed under GNU General Public License version 2
     4.9 
    4.10 - *
    4.11 
    4.12 - * Author: SeanHalle@yahoo.com
    4.13 
    4.14 - *
    4.15 
    4.16 - */
    4.17 
    4.18 -
    4.19 
    4.20 -#include <string.h>
    4.21 
    4.22 -
    4.23 
    4.24 -#include "VCilk__Matrix_Mult.h"
    4.25 
    4.26 -
    4.27 
    4.28 -
    4.29 
    4.30 -void inline
    4.31 
    4.32 -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
    4.33 
    4.34 -
    4.35 
    4.36 -void inline
    4.37 
    4.38 -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
    4.39 
    4.40 -
    4.41 
    4.42 -void inline
    4.43 
    4.44 -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
    4.45 
    4.46 -                     float32 *resArray,
    4.47 
    4.48 -                     int startRow,  int endRow,
    4.49 
    4.50 -                     int startCol,  int endCol,
    4.51 
    4.52 -                     int startVec,  int endVec,
    4.53 
    4.54 -                     int resStride, int inpStride );
    4.55 
    4.56 -
    4.57 
    4.58 -void inline
    4.59 
    4.60 -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols,
    4.61 
    4.62 -                      float32 *leftArray, float32 *rightArray,
    4.63 
    4.64 -                      float32 *resArray );
    4.65 
    4.66 -
    4.67 
    4.68 -
    4.69 
    4.70 -/*A  processor is created with an environment that holds two matrices,
    4.71 
    4.72 - * the row and col that it owns, and the name of a result gathering
    4.73 
    4.74 - * processor.
    4.75 
    4.76 - *It calculates the product of two sub-portions of the input matrices
    4.77 
    4.78 - * by using Intel's mkl library for single-core.
    4.79 
    4.80 - *
    4.81 
    4.82 - *This demonstrates using optimized single-threaded code inside scheduled
    4.83 
    4.84 - * work-units.
    4.85 
    4.86 - *
    4.87 
    4.88 - *When done, it sends the result to the result processor
    4.89 
    4.90 - */
    4.91 
    4.92 -void
    4.93 
    4.94 -calcSubMatrixProduct( void *data, VirtProcr *animPr )
    4.95 
    4.96 - { 
    4.97 
    4.98 -   SMPairParams   *params;
    4.99 
   4.100 -   VirtProcr      *resultPr;
   4.101 
   4.102 -   float32        *leftArray,  *rightArray, *resArray;
   4.103 
   4.104 -   SubMatrix      *leftSubMatrix, *rightSubMatrix;
   4.105 
   4.106 -
   4.107 
   4.108 -
   4.109 
   4.110 -         DEBUG1( dbgAppFlow, "start sub-matrix mult %d\n", animPr->procrID)
   4.111 
   4.112 -         #ifdef TURN_ON_DEBUG_PROBES
   4.113 
   4.114 -         int32 subMatrixProbe = 
   4.115 
   4.116 -            VMS__create_single_interval_probe( "subMtx",      animPr);
   4.117 
   4.118 -         VMS__record_sched_choice_into_probe( subMatrixProbe, animPr );
   4.119 
   4.120 -         VMS__record_interval_start_in_probe( subMatrixProbe );
   4.121 
   4.122 -         #endif
   4.123 
   4.124 -
   4.125 
   4.126 -   params         = (SMPairParams *)data;
   4.127 
   4.128 -//   resultPr       = params->resultPr;
   4.129 
   4.130 -   leftSubMatrix  = params->leftSubMatrix;
   4.131 
   4.132 -   rightSubMatrix = params->rightSubMatrix;
   4.133 
   4.134 -
   4.135 
   4.136 -      //make sure the input sub-matrices have been copied out of orig
   4.137 
   4.138 -   copyFromOrig( leftSubMatrix, animPr );
   4.139 
   4.140 -   copyTransposeFromOrig( rightSubMatrix, animPr );
   4.141 
   4.142 -   
   4.143 
   4.144 -   leftArray      = leftSubMatrix->array;
   4.145 
   4.146 -   rightArray     = rightSubMatrix->array;
   4.147 
   4.148 -
   4.149 
   4.150 -      //make this array here, on the core that computes the results
   4.151 
   4.152 -      // with Cilk's semantics, have to have separate result array for each
   4.153 
   4.154 -      // spawned processor -- unless want to change the spawn and sync
   4.155 
   4.156 -      // pattern, such that spawn one from each vector, then sync, then
   4.157 
   4.158 -      // another, and so forth -- this will cause idle time due to imbalance
   4.159 
   4.160 -      // in matrix sizes
   4.161 
   4.162 -      //This also gives chance to set affinity so all in vector run on same
   4.163 
   4.164 -      // core and re-use the accumulation array,
   4.165 
   4.166 -      //As a side-benefit, it also prevents writes from causing
   4.167 
   4.168 -      // thrashing of the cache -- as long as array big enough, the copy
   4.169 
   4.170 -      // overhead is small because each byte is reused size-of-side times
   4.171 
   4.172 -      //This is freed in the vector processor
   4.173 
   4.174 -   int32
   4.175 
   4.176 -   resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32);
   4.177 
   4.178 -   resArray = VCilk__malloc( resSize, animPr );
   4.179 
   4.180 -   memset( resArray, 0, resSize );
   4.181 
   4.182 -
   4.183 
   4.184 -
   4.185 
   4.186 -   int32 numResRows, numResCols, vectLength;
   4.187 
   4.188 -   
   4.189 
   4.190 -   vectLength = leftSubMatrix->numCols;
   4.191 
   4.192 -   numResRows = leftSubMatrix->numRows;
   4.193 
   4.194 -   numResCols = rightSubMatrix->numCols;
   4.195 
   4.196 -
   4.197 
   4.198 -   multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
   4.199 
   4.200 -                         leftArray, rightArray,
   4.201 
   4.202 -                         resArray );
   4.203 
   4.204 -
   4.205 
   4.206 -   //send result by side-effect
   4.207 
   4.208 -   params->partialResultArray = resArray;
   4.209 
   4.210 -
   4.211 
   4.212 -         #ifdef TURN_ON_DEBUG_PROBES
   4.213 
   4.214 -         VMS__record_interval_end_in_probe( subMatrixProbe );
   4.215 
   4.216 -         #endif
   4.217 
   4.218 -         
   4.219 
   4.220 -         DEBUG1( dbgAppFlow, "end sub-matrix mult %d\n", animPr->procrID)
   4.221 
   4.222 -   VCilk__dissipate_procr( animPr );
   4.223 
   4.224 - }
   4.225 
   4.226 -
   4.227 
   4.228 -
   4.229 
   4.230 -
   4.231 
   4.232 -/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into
   4.233 
   4.234 - * the 32KB L1 cache.
   4.235 
   4.236 - *Would be nice to embed this within another level that divided into
   4.237 
   4.238 - * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
   4.239 
   4.240 - *
   4.241 
   4.242 - *Eventually want these divisions to be automatic, using DKU pattern
   4.243 
   4.244 - * embedded into VMS and exposed in the language, and with VMS controlling the
   4.245 
   4.246 - * divisions according to the cache sizes, which it knows about.
   4.247 
   4.248 - *Also, want VMS to work with language to split among main-mems, so a socket
   4.249 
   4.250 - * only cranks on data in its local segment of main mem
   4.251 
   4.252 - *
   4.253 
   4.254 - *So, outer two loops determine start and end points within the result matrix.
   4.255 
   4.256 - * Inside that, a loop dets the start and end points along the shared dimensions
   4.257 
   4.258 - * of the two input matrices.
   4.259 
   4.260 - */
   4.261 
   4.262 -void inline
   4.263 
   4.264 -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
   4.265 
   4.266 -                                int32 numResCols,
   4.267 
   4.268 -                                float32 *leftArray, float32 *rightArray,
   4.269 
   4.270 -                                float32 *resArray )
   4.271 
   4.272 - {
   4.273 
   4.274 -   int resStride, inpStride;
   4.275 
   4.276 -   int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
   4.277 
   4.278 -
   4.279 
   4.280 -   resStride  = numResCols;
   4.281 
   4.282 -   inpStride  = vecLength;
   4.283 
   4.284 -
   4.285 
   4.286 -   for( resStartRow = 0; resStartRow < numResRows; )
   4.287 
   4.288 -    {
   4.289 
   4.290 -      resEndRow = resStartRow + ROWS_IN_BLOCK -1;  //start at zero, so -1
   4.291 
   4.292 -      if( resEndRow > numResRows ) resEndRow = numResRows -1;
   4.293 
   4.294 -
   4.295 
   4.296 -      for( resStartCol = 0; resStartCol < numResCols; )
   4.297 
   4.298 -       {
   4.299 
   4.300 -         resEndCol   = resStartCol + COLS_IN_BLOCK -1;
   4.301 
   4.302 -         if( resEndCol > numResCols ) resEndCol = numResCols -1;
   4.303 
   4.304 -
   4.305 
   4.306 -         for( startVec = 0; startVec < vecLength; )
   4.307 
   4.308 -          {
   4.309 
   4.310 -            endVec   = startVec + VEC_IN_BLOCK -1;
   4.311 
   4.312 -            if( endVec > vecLength ) endVec = vecLength -1;
   4.313 
   4.314 -
   4.315 
   4.316 -               //By having the "vector" of sub-blocks in a sub-block slice
   4.317 
   4.318 -               // be marched down in inner loop, are re-using the result
   4.319 
   4.320 -               // matrix, which stays in L1 cache and re-using the left sub-mat
   4.321 
   4.322 -               // which repeats for each right sub-mat -- can only re-use two of
   4.323 
   4.324 -               // the three, so result is the most important -- avoids writing
   4.325 
   4.326 -               // dirty blocks until those result-locations fully done
   4.327 
   4.328 -               //Row and Col is position in result matrix -- so row and vec
   4.329 
   4.330 -               // for left array, then vec and col for right array
   4.331 
   4.332 -            multiplySubBlocksTransposed( leftArray, rightArray,
   4.333 
   4.334 -                                         resArray,
   4.335 
   4.336 -                                         resStartRow,  resEndRow,
   4.337 
   4.338 -                                         resStartCol,  resEndCol,
   4.339 
   4.340 -                                         startVec,  endVec,
   4.341 
   4.342 -                                         resStride, inpStride );
   4.343 
   4.344 -            startVec = endVec +1;
   4.345 
   4.346 -          }
   4.347 
   4.348 -         resStartCol = resEndCol +1;
   4.349 
   4.350 -       }
   4.351 
   4.352 -      resStartRow = resEndRow +1;
   4.353 
   4.354 -    }
   4.355 
   4.356 - }
   4.357 
   4.358 -
   4.359 
   4.360 -
   4.361 
   4.362 -
   4.363 
   4.364 -void inline
   4.365 
   4.366 -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
   4.367 
   4.368 -                     float32 *resArray,
   4.369 
   4.370 -                     int resStartRow,  int resEndRow,
   4.371 
   4.372 -                     int resStartCol,  int resEndCol,
   4.373 
   4.374 -                     int startVec,  int endVec,
   4.375 
   4.376 -                     int resStride, int inpStride )
   4.377 
   4.378 - {
   4.379 
   4.380 -   int resRow,     resCol,        vec;
   4.381 
   4.382 -   int leftOffset, rightOffset;
   4.383 
   4.384 -   float32 result;
   4.385 
   4.386 -
   4.387 
   4.388 -      //The result row is used for the left matrix, res col for the right
   4.389 
   4.390 -   for( resCol = resStartCol; resCol <= resEndCol; resCol++ )
   4.391 
   4.392 -    {
   4.393 
   4.394 -      for( resRow = resStartRow; resRow <= resEndRow; resRow++ )
   4.395 
   4.396 -       {
   4.397 
   4.398 -         leftOffset  = resRow * inpStride;//left & right inp strides same
   4.399 
   4.400 -         rightOffset = resCol * inpStride;// because right is transposed
   4.401 
   4.402 -         result = 0;
   4.403 
   4.404 -         for( vec = startVec; vec <= endVec; vec++ )
   4.405 
   4.406 -          {
   4.407 
   4.408 -            result +=
   4.409 
   4.410 -               leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
   4.411 
   4.412 -          }
   4.413 
   4.414 -
   4.415 
   4.416 -         resArray[ resRow * resStride + resCol ] += result;
   4.417 
   4.418 -       }
   4.419 
   4.420 -    }
   4.421 
   4.422 - }
   4.423 
   4.424 -
   4.425 
   4.426 -
   4.427 
   4.428 -/*Reuse this in divider when do the sequential multiply case
   4.429 
   4.430 - */
   4.431 
   4.432 -void inline
   4.433 
   4.434 -copyTranspose( int32 numRows, int32 numCols,
   4.435 
   4.436 -               int32 origStartRow, int32 origStartCol, int32 origStride,
   4.437 
   4.438 -               float32 *subArray, float32 *origArray )
   4.439 
   4.440 - { int32 stride = numRows;
   4.441 
   4.442 -
   4.443 
   4.444 -   int row, col, origOffset;
   4.445 
   4.446 -   for( row = 0; row < numRows; row++ )
   4.447 
   4.448 -    {
   4.449 
   4.450 -      origOffset = (row + origStartRow) * origStride + origStartCol;
   4.451 
   4.452 -      for( col = 0; col < numCols; col++ )
   4.453 
   4.454 -       {
   4.455 
   4.456 -            //transpose means swap row & col -- traverse orig matrix normally
   4.457 
   4.458 -            // but put into reversed place in local array -- means the
   4.459 
   4.460 -            // stride is the numRows now, so col * numRows + row
   4.461 
   4.462 -         subArray[ col * stride + row ]  =  origArray[ origOffset + col ];
   4.463 
   4.464 -       }
   4.465 
   4.466 -    }
   4.467 
   4.468 - }
   4.469 
   4.470 -
   4.471 
   4.472 -void inline
   4.473 
   4.474 -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   4.475 
   4.476 - { int numCols, numRows, origStartRow, origStartCol, origStride, stride;
   4.477 
   4.478 -   Matrix *origMatrix;
   4.479 
   4.480 -   float32 *origArray, *subArray;
   4.481 
   4.482 -
   4.483 
   4.484 -   VCilk__start_data_singleton( &(subMatrix->copyTransSingleton), animPr );
   4.485 
   4.486 -
   4.487 
   4.488 -   origMatrix   = subMatrix->origMatrix;
   4.489 
   4.490 -   origArray    = origMatrix->array;
   4.491 
   4.492 -   numCols      = subMatrix->numCols;
   4.493 
   4.494 -   numRows      = subMatrix->numRows;
   4.495 
   4.496 -   origStartRow = subMatrix->origStartRow;
   4.497 
   4.498 -   origStartCol = subMatrix->origStartCol;
   4.499 
   4.500 -   origStride   = origMatrix->numCols;
   4.501 
   4.502 -
   4.503 
   4.504 -   subArray     = VCilk__malloc( numRows * numCols *sizeof(float32),animPr);
   4.505 
   4.506 -   subMatrix->array = subArray;
   4.507 
   4.508 -
   4.509 
   4.510 -      //copy values from orig matrix to local
   4.511 
   4.512 -   copyTranspose( numRows, numCols,
   4.513 
   4.514 -                  origStartRow, origStartCol, origStride,
   4.515 
   4.516 -                  subArray, origArray );
   4.517 
   4.518 -
   4.519 
   4.520 -   VCilk__end_data_singleton( &(subMatrix->copyTransSingleton), animPr );
   4.521 
   4.522 -   return;
   4.523 
   4.524 - }
   4.525 
   4.526 -
   4.527 
   4.528 -
   4.529 
   4.530 -void inline
   4.531 
   4.532 -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   4.533 
   4.534 - { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
   4.535 
   4.536 -   Matrix *origMatrix;
   4.537 
   4.538 -   float32 *origArray, *subArray;
   4.539 
   4.540 -
   4.541 
   4.542 -
   4.543 
   4.544 -      //This lets only a single VP execute the code between start and
   4.545 
   4.546 -      // end -- using start and end so that work runs outside the master.
   4.547 
   4.548 -      //Inside, if a second VP ever executes the start, it will be returned
   4.549 
   4.550 -      // from the end-point.
   4.551 
   4.552 -      //Note, for non-GCC, can add a second SSR call at the end, and inside
   4.553 
   4.554 -      // that one, look at the stack at the return addr & save that in an
   4.555 
   4.556 -      // array indexed by singletonID
   4.557 
   4.558 -   VCilk__start_data_singleton( &(subMatrix->copySingleton), animPr );
   4.559 
   4.560 -
   4.561 
   4.562 -
   4.563 
   4.564 -   origMatrix    = subMatrix->origMatrix;
   4.565 
   4.566 -   origArray     = origMatrix->array;
   4.567 
   4.568 -   numCols       = subMatrix->numCols;
   4.569 
   4.570 -   numRows       = subMatrix->numRows;
   4.571 
   4.572 -   origStartRow  = subMatrix->origStartRow;
   4.573 
   4.574 -   origStartCol  = subMatrix->origStartCol;
   4.575 
   4.576 -   origStride    = origMatrix->numCols;
   4.577 
   4.578 -
   4.579 
   4.580 -   subArray      = VCilk__malloc( numRows * numCols *sizeof(float32),animPr);
   4.581 
   4.582 -   subMatrix->array = subArray;
   4.583 
   4.584 -
   4.585 
   4.586 -      //copy values from orig matrix to local
   4.587 
   4.588 -   stride        = numCols;
   4.589 
   4.590 -
   4.591 
   4.592 -   int row, col, offset, origOffset;
   4.593 
   4.594 -   for( row = 0; row < numRows; row++ )
   4.595 
   4.596 -    {
   4.597 
   4.598 -      offset     = row * stride;
   4.599 
   4.600 -      origOffset = (row + origStartRow) * origStride + origStartCol;
   4.601 
   4.602 -      for( col = 0; col < numCols; col++ )
   4.603 
   4.604 -       {
   4.605 
   4.606 -         subArray[ offset + col ]  =  origArray[ origOffset + col ];
   4.607 
   4.608 -       }
   4.609 
   4.610 -    }
   4.611 
   4.612 -   VCilk__end_data_singleton( &(subMatrix->copySingleton), animPr );
   4.613 
   4.614 -
   4.615 
   4.616 -   return;
   4.617 
   4.618 - }
   4.619 
   4.620 +/* 
   4.621 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
   4.622 + *  Licensed under GNU General Public License version 2
   4.623 + *
   4.624 + * Author: SeanHalle@yahoo.com
   4.625 + *
   4.626 + */
   4.627 +
   4.628 +#include <string.h>
   4.629 +
   4.630 +#include "VCilk__Matrix_Mult.h"
   4.631 +
   4.632 +
   4.633 +void inline
   4.634 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
   4.635 +
   4.636 +void inline
   4.637 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
   4.638 +
   4.639 +void inline
   4.640 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
   4.641 +                     float32 *resArray,
   4.642 +                     int startRow,  int endRow,
   4.643 +                     int startCol,  int endCol,
   4.644 +                     int startVec,  int endVec,
   4.645 +                     int resStride, int inpStride );
   4.646 +
   4.647 +void inline
   4.648 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols,
   4.649 +                      float32 *leftArray, float32 *rightArray,
   4.650 +                      float32 *resArray );
   4.651 +
   4.652 +
   4.653 +/*A  processor is created with an environment that holds two matrices,
   4.654 + * the row and col that it owns, and the name of a result gathering
   4.655 + * processor.
   4.656 + *It calculates the product of two sub-portions of the input matrices
   4.657 + * by using Intel's mkl library for single-core.
   4.658 + *
   4.659 + *This demonstrates using optimized single-threaded code inside scheduled
   4.660 + * work-units.
   4.661 + *
   4.662 + *When done, it sends the result to the result processor
   4.663 + */
   4.664 +void
   4.665 +calcSubMatrixProduct( void *data, VirtProcr *animPr )
   4.666 + { 
   4.667 +   SMPairParams   *params;
   4.668 +   VirtProcr      *resultPr;
   4.669 +   float32        *leftArray,  *rightArray, *resArray;
   4.670 +   SubMatrix      *leftSubMatrix, *rightSubMatrix;
   4.671 +
   4.672 +
   4.673 +         DEBUG1( dbgAppFlow, "start sub-matrix mult %d\n", animPr->procrID)
   4.674 +         #ifdef TURN_ON_DEBUG_PROBES
   4.675 +         int32 subMatrixProbe = 
   4.676 +            VMS__create_single_interval_probe( "subMtx",      animPr);
   4.677 +         VMS__record_sched_choice_into_probe( subMatrixProbe, animPr );
   4.678 +         VMS__record_interval_start_in_probe( subMatrixProbe );
   4.679 +         #endif
   4.680 +
   4.681 +   params         = (SMPairParams *)data;
   4.682 +//   resultPr       = params->resultPr;
   4.683 +   leftSubMatrix  = params->leftSubMatrix;
   4.684 +   rightSubMatrix = params->rightSubMatrix;
   4.685 +
   4.686 +      //make sure the input sub-matrices have been copied out of orig
   4.687 +   copyFromOrig( leftSubMatrix, animPr );
   4.688 +   copyTransposeFromOrig( rightSubMatrix, animPr );
   4.689 +   
   4.690 +   leftArray      = leftSubMatrix->array;
   4.691 +   rightArray     = rightSubMatrix->array;
   4.692 +
   4.693 +      //make this array here, on the core that computes the results
   4.694 +      // with Cilk's semantics, have to have separate result array for each
   4.695 +      // spawned processor -- unless want to change the spawn and sync
   4.696 +      // pattern, such that spawn one from each vector, then sync, then
   4.697 +      // another, and so forth -- this will cause idle time due to imbalance
   4.698 +      // in matrix sizes
   4.699 +      //This also gives chance to set affinity so all in vector run on same
   4.700 +      // core and re-use the accumulation array,
   4.701 +      //As a side-benefit, it also prevents writes from causing
   4.702 +      // thrashing of the cache -- as long as array big enough, the copy
   4.703 +      // overhead is small because each byte is reused size-of-side times
   4.704 +      //This is freed in the vector processor
   4.705 +   int32
   4.706 +   resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32);
   4.707 +   resArray = VCilk__malloc( resSize, animPr );
   4.708 +   memset( resArray, 0, resSize );
   4.709 +
   4.710 +
   4.711 +   int32 numResRows, numResCols, vectLength;
   4.712 +   
   4.713 +   vectLength = leftSubMatrix->numCols;
   4.714 +   numResRows = leftSubMatrix->numRows;
   4.715 +   numResCols = rightSubMatrix->numCols;
   4.716 +
   4.717 +   multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
   4.718 +                         leftArray, rightArray,
   4.719 +                         resArray );
   4.720 +
   4.721 +   //send result by side-effect
   4.722 +   params->partialResultArray = resArray;
   4.723 +
   4.724 +         #ifdef TURN_ON_DEBUG_PROBES
   4.725 +         VMS__record_interval_end_in_probe( subMatrixProbe );
   4.726 +         #endif
   4.727 +         
   4.728 +         DEBUG1( dbgAppFlow, "end sub-matrix mult %d\n", animPr->procrID)
   4.729 +   VCilk__dissipate_procr( animPr );
   4.730 + }
   4.731 +
   4.732 +
   4.733 +
   4.734 +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into
   4.735 + * the 32KB L1 cache.
   4.736 + *Would be nice to embed this within another level that divided into
   4.737 + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
   4.738 + *
   4.739 + *Eventually want these divisions to be automatic, using DKU pattern
   4.740 + * embedded into VMS and exposed in the language, and with VMS controlling the
   4.741 + * divisions according to the cache sizes, which it knows about.
   4.742 + *Also, want VMS to work with language to split among main-mems, so a socket
   4.743 + * only cranks on data in its local segment of main mem
   4.744 + *
   4.745 + *So, outer two loops determine start and end points within the result matrix.
   4.746 + * Inside that, a loop dets the start and end points along the shared dimensions
   4.747 + * of the two input matrices.
   4.748 + */
   4.749 +void inline
   4.750 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
   4.751 +                                int32 numResCols,
   4.752 +                                float32 *leftArray, float32 *rightArray,
   4.753 +                                float32 *resArray )
   4.754 + {
   4.755 +   int resStride, inpStride;
   4.756 +   int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
   4.757 +
   4.758 +   resStride  = numResCols;
   4.759 +   inpStride  = vecLength;
   4.760 +
   4.761 +   for( resStartRow = 0; resStartRow < numResRows; )
   4.762 +    {
   4.763 +      resEndRow = resStartRow + ROWS_IN_BLOCK -1;  //start at zero, so -1
   4.764 +      if( resEndRow > numResRows ) resEndRow = numResRows -1;
   4.765 +
   4.766 +      for( resStartCol = 0; resStartCol < numResCols; )
   4.767 +       {
   4.768 +         resEndCol   = resStartCol + COLS_IN_BLOCK -1;
   4.769 +         if( resEndCol > numResCols ) resEndCol = numResCols -1;
   4.770 +
   4.771 +         for( startVec = 0; startVec < vecLength; )
   4.772 +          {
   4.773 +            endVec   = startVec + VEC_IN_BLOCK -1;
   4.774 +            if( endVec > vecLength ) endVec = vecLength -1;
   4.775 +
   4.776 +               //By having the "vector" of sub-blocks in a sub-block slice
   4.777 +               // be marched down in inner loop, are re-using the result
   4.778 +               // matrix, which stays in L1 cache and re-using the left sub-mat
   4.779 +               // which repeats for each right sub-mat -- can only re-use two of
   4.780 +               // the three, so result is the most important -- avoids writing
   4.781 +               // dirty blocks until those result-locations fully done
   4.782 +               //Row and Col is position in result matrix -- so row and vec
   4.783 +               // for left array, then vec and col for right array
   4.784 +            multiplySubBlocksTransposed( leftArray, rightArray,
   4.785 +                                         resArray,
   4.786 +                                         resStartRow,  resEndRow,
   4.787 +                                         resStartCol,  resEndCol,
   4.788 +                                         startVec,  endVec,
   4.789 +                                         resStride, inpStride );
   4.790 +            startVec = endVec +1;
   4.791 +          }
   4.792 +         resStartCol = resEndCol +1;
   4.793 +       }
   4.794 +      resStartRow = resEndRow +1;
   4.795 +    }
   4.796 + }
   4.797 +
   4.798 +
   4.799 +
   4.800 +void inline
   4.801 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
   4.802 +                     float32 *resArray,
   4.803 +                     int resStartRow,  int resEndRow,
   4.804 +                     int resStartCol,  int resEndCol,
   4.805 +                     int startVec,  int endVec,
   4.806 +                     int resStride, int inpStride )
   4.807 + {
   4.808 +   int resRow,     resCol,        vec;
   4.809 +   int leftOffset, rightOffset;
   4.810 +   float32 result;
   4.811 +
   4.812 +      //The result row is used for the left matrix, res col for the right
   4.813 +   for( resCol = resStartCol; resCol <= resEndCol; resCol++ )
   4.814 +    {
   4.815 +      for( resRow = resStartRow; resRow <= resEndRow; resRow++ )
   4.816 +       {
   4.817 +         leftOffset  = resRow * inpStride;//left & right inp strides same
   4.818 +         rightOffset = resCol * inpStride;// because right is transposed
   4.819 +         result = 0;
   4.820 +         for( vec = startVec; vec <= endVec; vec++ )
   4.821 +          {
   4.822 +            result +=
   4.823 +               leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
   4.824 +          }
   4.825 +
   4.826 +         resArray[ resRow * resStride + resCol ] += result;
   4.827 +       }
   4.828 +    }
   4.829 + }
   4.830 +
   4.831 +
   4.832 +/*Reuse this in divider when do the sequential multiply case
   4.833 + */
   4.834 +void inline
   4.835 +copyTranspose( int32 numRows, int32 numCols,
   4.836 +               int32 origStartRow, int32 origStartCol, int32 origStride,
   4.837 +               float32 *subArray, float32 *origArray )
   4.838 + { int32 stride = numRows;
   4.839 +
   4.840 +   int row, col, origOffset;
   4.841 +   for( row = 0; row < numRows; row++ )
   4.842 +    {
   4.843 +      origOffset = (row + origStartRow) * origStride + origStartCol;
   4.844 +      for( col = 0; col < numCols; col++ )
   4.845 +       {
   4.846 +            //transpose means swap row & col -- traverse orig matrix normally
   4.847 +            // but put into reversed place in local array -- means the
   4.848 +            // stride is the numRows now, so col * numRows + row
   4.849 +         subArray[ col * stride + row ]  =  origArray[ origOffset + col ];
   4.850 +       }
   4.851 +    }
   4.852 + }
   4.853 +
   4.854 +void inline
   4.855 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   4.856 + { int numCols, numRows, origStartRow, origStartCol, origStride, stride;
   4.857 +   Matrix *origMatrix;
   4.858 +   float32 *origArray, *subArray;
   4.859 +
   4.860 +   VCilk__start_data_singleton( &(subMatrix->copyTransSingleton), animPr );
   4.861 +
   4.862 +   origMatrix   = subMatrix->origMatrix;
   4.863 +   origArray    = origMatrix->array;
   4.864 +   numCols      = subMatrix->numCols;
   4.865 +   numRows      = subMatrix->numRows;
   4.866 +   origStartRow = subMatrix->origStartRow;
   4.867 +   origStartCol = subMatrix->origStartCol;
   4.868 +   origStride   = origMatrix->numCols;
   4.869 +
   4.870 +   subArray     = VCilk__malloc( numRows * numCols *sizeof(float32),animPr);
   4.871 +   subMatrix->array = subArray;
   4.872 +
   4.873 +      //copy values from orig matrix to local
   4.874 +   copyTranspose( numRows, numCols,
   4.875 +                  origStartRow, origStartCol, origStride,
   4.876 +                  subArray, origArray );
   4.877 +
   4.878 +   VCilk__end_data_singleton( &(subMatrix->copyTransSingleton), animPr );
   4.879 +   return;
   4.880 + }
   4.881 +
   4.882 +
   4.883 +void inline
   4.884 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   4.885 + { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
   4.886 +   Matrix *origMatrix;
   4.887 +   float32 *origArray, *subArray;
   4.888 +
   4.889 +
   4.890 +      //This lets only a single VP execute the code between start and
   4.891 +      // end -- using start and end so that work runs outside the master.
   4.892 +      //Inside, if a second VP ever executes the start, it will be returned
   4.893 +      // from the end-point.
   4.894 +      //Note, for non-GCC, can add a second SSR call at the end, and inside
   4.895 +      // that one, look at the stack at the return addr & save that in an
   4.896 +      // array indexed by singletonID
   4.897 +   VCilk__start_data_singleton( &(subMatrix->copySingleton), animPr );
   4.898 +
   4.899 +
   4.900 +   origMatrix    = subMatrix->origMatrix;
   4.901 +   origArray     = origMatrix->array;
   4.902 +   numCols       = subMatrix->numCols;
   4.903 +   numRows       = subMatrix->numRows;
   4.904 +   origStartRow  = subMatrix->origStartRow;
   4.905 +   origStartCol  = subMatrix->origStartCol;
   4.906 +   origStride    = origMatrix->numCols;
   4.907 +
   4.908 +   subArray      = VCilk__malloc( numRows * numCols *sizeof(float32),animPr);
   4.909 +   subMatrix->array = subArray;
   4.910 +
   4.911 +      //copy values from orig matrix to local
   4.912 +   stride        = numCols;
   4.913 +
   4.914 +   int row, col, offset, origOffset;
   4.915 +   for( row = 0; row < numRows; row++ )
   4.916 +    {
   4.917 +      offset     = row * stride;
   4.918 +      origOffset = (row + origStartRow) * origStride + origStartCol;
   4.919 +      for( col = 0; col < numCols; col++ )
   4.920 +       {
   4.921 +         subArray[ offset + col ]  =  origArray[ origOffset + col ];
   4.922 +       }
   4.923 +    }
   4.924 +   VCilk__end_data_singleton( &(subMatrix->copySingleton), animPr );
   4.925 +
   4.926 +   return;
   4.927 + }

     5.1 --- a/src/Application/main.c	Wed May 11 15:40:54 2011 +0200
     5.2 +++ b/src/Application/main.c	Wed May 11 15:58:04 2011 +0200
     5.3 @@ -1,35 +1,35 @@
     5.4 -/*
     5.5 
     5.6 - *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     5.7 
     5.8 - *  Licensed under GNU General Public License version 2
     5.9 
    5.10 - *
    5.11 
    5.12 - * author seanhalle@yahoo.com
    5.13 
    5.14 - */
    5.15 
    5.16 -
    5.17 
    5.18 -#include <malloc.h>
    5.19 
    5.20 -#include <stdlib.h>
    5.21 
    5.22 -
    5.23 
    5.24 -#include "Matrix_Mult.h"
    5.25 
    5.26 -#include "VCilk__Matrix_Mult/VCilk__Matrix_Mult.h"
    5.27 
    5.28 -
    5.29 
    5.30 -/**
    5.31 
    5.32 - *Matrix multiply program written using VMS_HW piggy-back language
    5.33 
    5.34 - * 
    5.35 
    5.36 - */
    5.37 
    5.38 -int main( int argc, char **argv )
    5.39 
    5.40 - { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
    5.41 
    5.42 -   ParamBag    *paramBag;
    5.43 
    5.44 -   
    5.45 
    5.46 -   paramBag = makeParamBag();
    5.47 
    5.48 -   printf("arguments: %s | %s | %s | %s\n", argv[0], argv[1], argv[2], argv[3] );
    5.49 
    5.50 -   readParamFileIntoBag( argv[1], paramBag );
    5.51 
    5.52 -   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
    5.53 
    5.54 -   
    5.55 
    5.56 -   resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix );
    5.57 
    5.58 -
    5.59 
    5.60 -//   printf("\nresult matrix: \n"); \
    5.61 
    5.62 -   printMatrix( resultMatrix );
    5.63 
    5.64 -   
    5.65 
    5.66 -//   VCilk__print_stats();
    5.67 
    5.68 -   fflush(stdin);
    5.69 
    5.70 -   exit(0); //cleans up
    5.71 
    5.72 - }
    5.73 
    5.74 +/*
    5.75 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
    5.76 + *  Licensed under GNU General Public License version 2
    5.77 + *
    5.78 + * author seanhalle@yahoo.com
    5.79 + */
    5.80 +
    5.81 +#include <malloc.h>
    5.82 +#include <stdlib.h>
    5.83 +
    5.84 +#include "Matrix_Mult.h"
    5.85 +#include "VCilk__Matrix_Mult/VCilk__Matrix_Mult.h"
    5.86 +
    5.87 +/**
    5.88 + *Matrix multiply program written using VMS_HW piggy-back language
    5.89 + * 
    5.90 + */
    5.91 +int main( int argc, char **argv )
    5.92 + { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
    5.93 +   ParamBag    *paramBag;
    5.94 +   
    5.95 +   paramBag = makeParamBag();
    5.96 +   printf("arguments: %s | %s | %s | %s\n", argv[0], argv[1], argv[2], argv[3] );
    5.97 +   readParamFileIntoBag( argv[1], paramBag );
    5.98 +   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
    5.99 +   
   5.100 +   resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix );
   5.101 +
   5.102 +//   printf("\nresult matrix: \n"); \
   5.103 +   printMatrix( resultMatrix );
   5.104 +   
   5.105 +//   VCilk__print_stats();
   5.106 +   fflush(stdin);
   5.107 +   exit(0); //cleans up
   5.108 + }