changeset 4:ecba4ae0be7a

Seans development tree
author Merten Sach <msach@mailbox.tu-berlin.de>
date Wed, 11 May 2011 15:40:54 +0200
parents 12bcb9728e1c
children e223756d0f0c
files src/Application/Matrix_Mult.c src/Application/Matrix_Mult.h src/Application/VCilk__Matrix_Mult/Divide_Pr.c src/Application/VCilk__Matrix_Mult/EntryPoint.c src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h src/Application/VCilk__Matrix_Mult/Vector_Pr.c src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c src/Application/main.c
diffstat 8 files changed, 1469 insertions(+), 1471 deletions(-) [+]
line diff
     1.1 --- a/src/Application/Matrix_Mult.c	Mon Nov 15 12:08:19 2010 -0800
     1.2 +++ b/src/Application/Matrix_Mult.c	Wed May 11 15:40:54 2011 +0200
     1.3 @@ -1,167 +1,167 @@
     1.4 -/*
     1.5 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
     1.6 - *  Licensed under GNU General Public License version 2
     1.7 - *
     1.8 - * Author: seanhalle@yahoo.com
     1.9 - *
    1.10 - * Created on November 15, 2009, 2:35 AM
    1.11 - */
    1.12 -
    1.13 -#include <malloc.h>
    1.14 -#include <stdlib.h>
    1.15 -
    1.16 -#include "Matrix_Mult.h"
    1.17 -#include "ParamHelper/Param.h"
    1.18 -
    1.19 -
    1.20 - 
    1.21 - void
    1.22 -initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
    1.23 -                               ParamBag *paramBag )
    1.24 - { char *leftMatrixFileName, *rightMatrixFileName;
    1.25 -   int   leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols;
    1.26 -   
    1.27 -      ParamStruc *param;
    1.28 -      param = getParamFromBag( "leftMatrixRows", paramBag );
    1.29 -   leftMatrixRows = param->intValue;
    1.30 -      param = getParamFromBag( "leftMatrixCols", paramBag );
    1.31 -   leftMatrixCols = param->intValue;
    1.32 -   *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols );
    1.33 -   
    1.34 -      param = getParamFromBag( "leftMatrixFileName", paramBag );
    1.35 -   leftMatrixFileName = param->strValue;  //no need to copy
    1.36 -   read_Matrix_From_File( *leftMatrix,  leftMatrixFileName );
    1.37 -   
    1.38 -      param = getParamFromBag( "rightMatrixRows", paramBag );
    1.39 -   rightMatrixRows = param->intValue;
    1.40 -      param = getParamFromBag( "rightMatrixCols", paramBag );
    1.41 -   rightMatrixCols = param->intValue;
    1.42 -   *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols );
    1.43 -   
    1.44 -      param = getParamFromBag( "rightMatrixFileName", paramBag );
    1.45 -   rightMatrixFileName = param->strValue;
    1.46 -   read_Matrix_From_File( *rightMatrix, rightMatrixFileName );
    1.47 - }
    1.48 -
    1.49 -
    1.50 -void parseLineIntoRow( char *line, float32* row );
    1.51 -
    1.52 -
    1.53 - void
    1.54 -read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName )
    1.55 - { int    row, maxRead, numRows, numCols;
    1.56 -   float32 *matrixStart;
    1.57 -   size_t lineSz = 0;
    1.58 -   FILE  *file;
    1.59 -   char  *line = NULL;
    1.60 -   
    1.61 -   lineSz = 50000; //max length of line in a matrix data file
    1.62 -   line = (char *) malloc( lineSz );
    1.63 -   if( line == NULL ) printf( "no mem for matrix line" );
    1.64 -   
    1.65 -   numRows = matrixStruc->numRows;
    1.66 -   numCols = matrixStruc->numCols;
    1.67 -   matrixStart = matrixStruc->array;
    1.68 -
    1.69 -   file = fopen( matrixFileName, "r" );
    1.70 -   if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);}
    1.71 -   fseek( file, 0, SEEK_SET );
    1.72 -   for( row = 0; row < numRows; row++ )
    1.73 -    {
    1.74 -      if( feof( file ) )  printf( "file ran out too soon" );
    1.75 -      maxRead = getline( &line, &lineSz, file );
    1.76 -      if( maxRead == -1 ) printf( "prob reading mat line");
    1.77 -      
    1.78 -      if( *line == '\n') continue; //blank line
    1.79 -      if( *line == '/' ) continue; //comment line
    1.80 -      
    1.81 -      parseLineIntoRow( line, matrixStart + row * numCols );
    1.82 -    }
    1.83 -   free( line );
    1.84 - }
    1.85 -
    1.86 -/*This function relies on each line having the proper number of cols.  It
    1.87 - * doesn't check, nor enforce, so if the file is improperly formatted it
    1.88 - * can write over unrelated memory
    1.89 - */
    1.90 - void
    1.91 -parseLineIntoRow( char *line, float32* row )
    1.92 - {
    1.93 -   char *valueStr, *searchPos;
    1.94 -   
    1.95 -      //read the float values
    1.96 -   searchPos = valueStr = line; //start
    1.97 -   
    1.98 -   for( ; *searchPos != 0; searchPos++)  //bit dangerous, should use buff len
    1.99 -    {
   1.100 -      if( *searchPos == '\n' ) //last col..  relying on well-formatted file
   1.101 -       { *searchPos = 0;
   1.102 -         *row = atof( valueStr );
   1.103 -         break;                                    //end FOR loop
   1.104 -       }
   1.105 -      if( *searchPos == ',' )
   1.106 -       { *searchPos = 0;                           //mark end of string
   1.107 -         *row = (float32) atof( valueStr );
   1.108 -         row += 1;                                 //address arith
   1.109 -            //skip any spaces before digits.. use searchPos + 1 to skip the 0
   1.110 -         for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++);
   1.111 -         valueStr = searchPos + 1;
   1.112 -       }
   1.113 -    }
   1.114 - }
   1.115 -
   1.116 - //==========================================================================
   1.117 -
   1.118 -/*In the "_Flat" version of constructor, do only malloc of the top data struc
   1.119 - * and set values in that top-level.  Don't malloc any sub-structures.
   1.120 - */
   1.121 - Matrix *
   1.122 -makeMatrix_Flat( int32 numRows, int32 numCols )
   1.123 - { Matrix * retMatrix;
   1.124 -   retMatrix = malloc( sizeof( Matrix ) );
   1.125 -   retMatrix->numRows = numRows;
   1.126 -   retMatrix->numCols = numCols;
   1.127 -
   1.128 -   return retMatrix;
   1.129 - }
   1.130 -
   1.131 - Matrix *
   1.132 -makeMatrix_WithResMat( int32 numRows, int32 numCols )
   1.133 - { Matrix * retMatrix;
   1.134 -   retMatrix = malloc( sizeof( Matrix ) );
   1.135 -   retMatrix->numRows = numRows;
   1.136 -   retMatrix->numCols = numCols;
   1.137 -   retMatrix->array  = malloc( numRows * numCols * sizeof(float32) );
   1.138 -
   1.139 -   return retMatrix;
   1.140 - }
   1.141 -
   1.142 - void
   1.143 -freeMatrix_Flat( Matrix * matrix )
   1.144 - { //( matrix );
   1.145 - }
   1.146 - void
   1.147 -freeMatrix( Matrix * matrix )
   1.148 - { free( matrix->array );
   1.149 -   free( matrix );
   1.150 - }
   1.151 -
   1.152 -void
   1.153 -printMatrix( Matrix *matrix )
   1.154 - { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr;
   1.155 -   float32 *matrixArray;
   1.156 -
   1.157 -   numRows = rowsToPrint = matrix->numRows;
   1.158 -   numCols = colsToPrint = matrix->numCols;
   1.159 -   matrixArray = matrix->array;
   1.160 -
   1.161 -   rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed
   1.162 -   colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed
   1.163 -   for( r = 0; r < numRows; r += rowIncr )
   1.164 -    { for( c = 0; c < numCols; c += colIncr )
   1.165 -       { printf( "%3.1f | ", matrixArray[ r * numCols + c ] );
   1.166 -       }
   1.167 -      printf("\n");
   1.168 -    }
   1.169 - }
   1.170 -
   1.171 +/*
   1.172 
   1.173 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
   1.174 
   1.175 + *  Licensed under GNU General Public License version 2
   1.176 
   1.177 + *
   1.178 
   1.179 + * Author: seanhalle@yahoo.com
   1.180 
   1.181 + *
   1.182 
   1.183 + * Created on November 15, 2009, 2:35 AM
   1.184 
   1.185 + */
   1.186 
   1.187 +
   1.188 
   1.189 +#include <malloc.h>
   1.190 
   1.191 +#include <stdlib.h>
   1.192 
   1.193 +
   1.194 
   1.195 +#include "Matrix_Mult.h"
   1.196 
   1.197 +#include "ParamHelper/Param.h"
   1.198 
   1.199 +
   1.200 
   1.201 +
   1.202 
   1.203 + 
   1.204 
   1.205 + void
   1.206 
   1.207 +initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
   1.208 
   1.209 +                               ParamBag *paramBag )
   1.210 
   1.211 + { char *leftMatrixFileName, *rightMatrixFileName;
   1.212 
   1.213 +   int   leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols;
   1.214 
   1.215 +   
   1.216 
   1.217 +      ParamStruc *param;
   1.218 
   1.219 +      param = getParamFromBag( "leftMatrixRows", paramBag );
   1.220 
   1.221 +   leftMatrixRows = param->intValue;
   1.222 
   1.223 +      param = getParamFromBag( "leftMatrixCols", paramBag );
   1.224 
   1.225 +   leftMatrixCols = param->intValue;
   1.226 
   1.227 +   *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols );
   1.228 
   1.229 +   
   1.230 
   1.231 +      param = getParamFromBag( "leftMatrixFileName", paramBag );
   1.232 
   1.233 +   leftMatrixFileName = param->strValue;  //no need to copy
   1.234 
   1.235 +   read_Matrix_From_File( *leftMatrix,  leftMatrixFileName );
   1.236 
   1.237 +   
   1.238 
   1.239 +      param = getParamFromBag( "rightMatrixRows", paramBag );
   1.240 
   1.241 +   rightMatrixRows = param->intValue;
   1.242 
   1.243 +      param = getParamFromBag( "rightMatrixCols", paramBag );
   1.244 
   1.245 +   rightMatrixCols = param->intValue;
   1.246 
   1.247 +   *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols );
   1.248 
   1.249 +   
   1.250 
   1.251 +      param = getParamFromBag( "rightMatrixFileName", paramBag );
   1.252 
   1.253 +   rightMatrixFileName = param->strValue;
   1.254 
   1.255 +   read_Matrix_From_File( *rightMatrix, rightMatrixFileName );
   1.256 
   1.257 + }
   1.258 
   1.259 +
   1.260 
   1.261 +
   1.262 
   1.263 +void parseLineIntoRow( char *line, float32* row );
   1.264 
   1.265 +
   1.266 
   1.267 +
   1.268 
   1.269 + void
   1.270 
   1.271 +read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName )
   1.272 
   1.273 + { int    row, maxRead, numRows, numCols;
   1.274 
   1.275 +   float32 *matrixStart;
   1.276 
   1.277 +   size_t lineSz = 0;
   1.278 
   1.279 +   FILE  *file;
   1.280 
   1.281 +   char  *line = NULL;
   1.282 
   1.283 +   
   1.284 
   1.285 +   lineSz = 50000; //max length of line in a matrix data file
   1.286 
   1.287 +   line = (char *) malloc( lineSz );
   1.288 
   1.289 +   if( line == NULL ) printf( "no mem for matrix line" );
   1.290 
   1.291 +   
   1.292 
   1.293 +   numRows = matrixStruc->numRows;
   1.294 
   1.295 +   numCols = matrixStruc->numCols;
   1.296 
   1.297 +   matrixStart = matrixStruc->array;
   1.298 
   1.299 +
   1.300 
   1.301 +   file = fopen( matrixFileName, "r" );
   1.302 
   1.303 +   if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);}
   1.304 
   1.305 +   fseek( file, 0, SEEK_SET );
   1.306 
   1.307 +   for( row = 0; row < numRows; row++ )
   1.308 
   1.309 +    {
   1.310 
   1.311 +      if( feof( file ) )  printf( "file ran out too soon" );
   1.312 
   1.313 +      maxRead = getline( &line, &lineSz, file );
   1.314 
   1.315 +      if( maxRead == -1 ) printf( "prob reading mat line");
   1.316 
   1.317 +      
   1.318 
   1.319 +      if( *line == '\n') continue; //blank line
   1.320 
   1.321 +      if( *line == '/' ) continue; //comment line
   1.322 
   1.323 +      
   1.324 
   1.325 +      parseLineIntoRow( line, matrixStart + row * numCols );
   1.326 
   1.327 +    }
   1.328 
   1.329 +   free( line );
   1.330 
   1.331 + }
   1.332 
   1.333 +
   1.334 
   1.335 +/*This function relies on each line having the proper number of cols.  It
   1.336 
   1.337 + * doesn't check, nor enforce, so if the file is improperly formatted it
   1.338 
   1.339 + * can write over unrelated memory
   1.340 
   1.341 + */
   1.342 
   1.343 + void
   1.344 
   1.345 +parseLineIntoRow( char *line, float32* row )
   1.346 
   1.347 + {
   1.348 
   1.349 +   char *valueStr, *searchPos;
   1.350 
   1.351 +   
   1.352 
   1.353 +      //read the float values
   1.354 
   1.355 +   searchPos = valueStr = line; //start
   1.356 
   1.357 +   
   1.358 
   1.359 +   for( ; *searchPos != 0; searchPos++)  //bit dangerous, should use buff len
   1.360 
   1.361 +    {
   1.362 
   1.363 +      if( *searchPos == '\n' ) //last col..  relying on well-formatted file
   1.364 
   1.365 +       { *searchPos = 0;
   1.366 
   1.367 +         *row = atof( valueStr );
   1.368 
   1.369 +         break;                                    //end FOR loop
   1.370 
   1.371 +       }
   1.372 
   1.373 +      if( *searchPos == ',' )
   1.374 
   1.375 +       { *searchPos = 0;                           //mark end of string
   1.376 
   1.377 +         *row = (float32) atof( valueStr );
   1.378 
   1.379 +         row += 1;                                 //address arith
   1.380 
   1.381 +            //skip any spaces before digits.. use searchPos + 1 to skip the 0
   1.382 
   1.383 +         for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++);
   1.384 
   1.385 +         valueStr = searchPos + 1;
   1.386 
   1.387 +       }
   1.388 
   1.389 +    }
   1.390 
   1.391 + }
   1.392 
   1.393 +
   1.394 
   1.395 + //==========================================================================
   1.396 
   1.397 +
   1.398 
   1.399 +/*In the "_Flat" version of constructor, do only malloc of the top data struc
   1.400 
   1.401 + * and set values in that top-level.  Don't malloc any sub-structures.
   1.402 
   1.403 + */
   1.404 
   1.405 + Matrix *
   1.406 
   1.407 +makeMatrix_Flat( int32 numRows, int32 numCols )
   1.408 
   1.409 + { Matrix * retMatrix;
   1.410 
   1.411 +   retMatrix = malloc( sizeof( Matrix ) );
   1.412 
   1.413 +   retMatrix->numRows = numRows;
   1.414 
   1.415 +   retMatrix->numCols = numCols;
   1.416 
   1.417 +
   1.418 
   1.419 +   return retMatrix;
   1.420 
   1.421 + }
   1.422 
   1.423 +
   1.424 
   1.425 + Matrix *
   1.426 
   1.427 +makeMatrix_WithResMat( int32 numRows, int32 numCols )
   1.428 
   1.429 + { Matrix * retMatrix;
   1.430 
   1.431 +   retMatrix = malloc( sizeof( Matrix ) );
   1.432 
   1.433 +   retMatrix->numRows = numRows;
   1.434 
   1.435 +   retMatrix->numCols = numCols;
   1.436 
   1.437 +   retMatrix->array  = malloc( numRows * numCols * sizeof(float32) );
   1.438 
   1.439 +
   1.440 
   1.441 +   return retMatrix;
   1.442 
   1.443 + }
   1.444 
   1.445 +
   1.446 
   1.447 + void
   1.448 
   1.449 +freeMatrix_Flat( Matrix * matrix )
   1.450 
   1.451 + { //( matrix );
   1.452 
   1.453 + }
   1.454 
   1.455 + void
   1.456 
   1.457 +freeMatrix( Matrix * matrix )
   1.458 
   1.459 + { free( matrix->array );
   1.460 
   1.461 +   free( matrix );
   1.462 
   1.463 + }
   1.464 
   1.465 +
   1.466 
   1.467 +void
   1.468 
   1.469 +printMatrix( Matrix *matrix )
   1.470 
   1.471 + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr;
   1.472 
   1.473 +   float32 *matrixArray;
   1.474 
   1.475 +
   1.476 
   1.477 +   numRows = rowsToPrint = matrix->numRows;
   1.478 
   1.479 +   numCols = colsToPrint = matrix->numCols;
   1.480 
   1.481 +   matrixArray = matrix->array;
   1.482 
   1.483 +
   1.484 
   1.485 +   rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed
   1.486 
   1.487 +   colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed
   1.488 
   1.489 +   for( r = 0; r < numRows; r += rowIncr )
   1.490 
   1.491 +    { for( c = 0; c < numCols; c += colIncr )
   1.492 
   1.493 +       { printf( "%3.1f | ", matrixArray[ r * numCols + c ] );
   1.494 
   1.495 +       }
   1.496 
   1.497 +      printf("\n");
   1.498 
   1.499 +    }
   1.500 
   1.501 + }
   1.502 
   1.503 +
   1.504 
     2.1 --- a/src/Application/Matrix_Mult.h	Mon Nov 15 12:08:19 2010 -0800
     2.2 +++ b/src/Application/Matrix_Mult.h	Wed May 11 15:40:54 2011 +0200
     2.3 @@ -1,77 +1,77 @@
     2.4 -/*
     2.5 - *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     2.6 - *  Licensed under GNU General Public License version 2
     2.7 - */
     2.8 -
     2.9 -#ifndef MATRIX_MULT_H_
    2.10 -#define MATRIX_MULT_H_
    2.11 -
    2.12 -#include <stdio.h>
    2.13 -#include <unistd.h>
    2.14 -#include <malloc.h>
    2.15 -
    2.16 -#include "../VCilk_lib/VMS/VMS_primitive_data_types.h"
    2.17 -#include "ParamHelper/Param.h"
    2.18 -
    2.19 -//==============================  Structures  ==============================
    2.20 -
    2.21 -typedef
    2.22 -struct
    2.23 - { int32 numRows;
    2.24 -   int32 numCols;
    2.25 -   float32 *array;  //2D, but dynamically sized, so use addr arith
    2.26 - }
    2.27 -Matrix;
    2.28 -
    2.29 -/* This is the "appSpecificPiece" that is carried inside a DKUPiece.
    2.30 - *  In the DKUPiece data struc it is declared to be of type "void *".  This
    2.31 - *  allows the application to define any data structure it wants and put it
    2.32 - *  into a DKUPiece.
    2.33 - * When the app specific info is used, it is in app code, so it is cast to
    2.34 - *  the correct type to tell the compiler how to access fields.
    2.35 - * This keeps all app-specific things out of the DKU directory, as per the
    2.36 - *  DKU standard. */
    2.37 -typedef
    2.38 -struct
    2.39 - { 
    2.40 -      // pointers to shared data..  the result matrix must be created when the
    2.41 -      //  left and right matrices are put into the root ancestor DKUPiece.
    2.42 -   Matrix * leftMatrix;
    2.43 -   Matrix * rightMatrix;
    2.44 -   Matrix * resultMatrix;
    2.45 -
    2.46 -      // define the starting and ending boundaries for this piece of the
    2.47 -      //  result matrix.  These are derivable from the left and right
    2.48 -      //  matrices, but included them for readability of code.
    2.49 -   int prodStartRow, prodEndRow;
    2.50 -   int prodStartCol, prodEndCol;
    2.51 -      // Start and end of the portion of the left matrix that contributes to
    2.52 -      //  this piece of the product
    2.53 -   int leftStartRow, leftEndRow;
    2.54 -   int leftStartCol, leftEndCol;
    2.55 -      // Start and end of the portion of the right matrix that contributes to
    2.56 -      //  this piece of the product
    2.57 -   int rightStartRow, rightEndRow;
    2.58 -   int rightStartCol, rightEndCol;
    2.59 - }
    2.60 -MatrixProdPiece;
    2.61 -
    2.62 -//==============================  Functions  ================================
    2.63 -void readFile();
    2.64 -
    2.65 -Matrix *makeMatrix( int32 numRows, int32 numCols );
    2.66 -Matrix *makeMatrix_Flat( int32 numRows, int32 numCols );
    2.67 -Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols );
    2.68 -void    freeMatrix_Flat( Matrix * matrix );
    2.69 -void    freeMatrix( Matrix * matrix );
    2.70 -void    printMatrix( Matrix *matrix );
    2.71 -
    2.72 -void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName );
    2.73 -
    2.74 -void
    2.75 -initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
    2.76 -                              ParamBag *paramBag );
    2.77 -
    2.78 -//===========================================================================
    2.79 -
    2.80 -#endif /*MATRIX_MULT_H_*/
    2.81 +/*
    2.82 
    2.83 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
    2.84 
    2.85 + *  Licensed under GNU General Public License version 2
    2.86 
    2.87 + */
    2.88 
    2.89 +
    2.90 
    2.91 +#ifndef MATRIX_MULT_H_
    2.92 
    2.93 +#define MATRIX_MULT_H_
    2.94 
    2.95 +
    2.96 
    2.97 +#include <stdio.h>
    2.98 
    2.99 +#include <unistd.h>
   2.100 
   2.101 +#include <malloc.h>
   2.102 
   2.103 +
   2.104 
   2.105 +#include "../VCilk_lib/VMS/VMS_primitive_data_types.h"
   2.106 
   2.107 +#include "ParamHelper/Param.h"
   2.108 
   2.109 +
   2.110 
   2.111 +//==============================  Structures  ==============================
   2.112 
   2.113 +
   2.114 
   2.115 +typedef
   2.116 
   2.117 +struct
   2.118 
   2.119 + { int32 numRows;
   2.120 
   2.121 +   int32 numCols;
   2.122 
   2.123 +   float32 *array;  //2D, but dynamically sized, so use addr arith
   2.124 
   2.125 + }
   2.126 
   2.127 +Matrix;
   2.128 
   2.129 +
   2.130 
   2.131 +/* This is the "appSpecificPiece" that is carried inside a DKUPiece.
   2.132 
   2.133 + *  In the DKUPiece data struc it is declared to be of type "void *".  This
   2.134 
   2.135 + *  allows the application to define any data structure it wants and put it
   2.136 
   2.137 + *  into a DKUPiece.
   2.138 
   2.139 + * When the app specific info is used, it is in app code, so it is cast to
   2.140 
   2.141 + *  the correct type to tell the compiler how to access fields.
   2.142 
   2.143 + * This keeps all app-specific things out of the DKU directory, as per the
   2.144 
   2.145 + *  DKU standard. */
   2.146 
   2.147 +typedef
   2.148 
   2.149 +struct
   2.150 
   2.151 + { 
   2.152 
   2.153 +      // pointers to shared data..  the result matrix must be created when the
   2.154 
   2.155 +      //  left and right matrices are put into the root ancestor DKUPiece.
   2.156 
   2.157 +   Matrix * leftMatrix;
   2.158 
   2.159 +   Matrix * rightMatrix;
   2.160 
   2.161 +   Matrix * resultMatrix;
   2.162 
   2.163 +
   2.164 
   2.165 +      // define the starting and ending boundaries for this piece of the
   2.166 
   2.167 +      //  result matrix.  These are derivable from the left and right
   2.168 
   2.169 +      //  matrices, but included them for readability of code.
   2.170 
   2.171 +   int prodStartRow, prodEndRow;
   2.172 
   2.173 +   int prodStartCol, prodEndCol;
   2.174 
   2.175 +      // Start and end of the portion of the left matrix that contributes to
   2.176 
   2.177 +      //  this piece of the product
   2.178 
   2.179 +   int leftStartRow, leftEndRow;
   2.180 
   2.181 +   int leftStartCol, leftEndCol;
   2.182 
   2.183 +      // Start and end of the portion of the right matrix that contributes to
   2.184 
   2.185 +      //  this piece of the product
   2.186 
   2.187 +   int rightStartRow, rightEndRow;
   2.188 
   2.189 +   int rightStartCol, rightEndCol;
   2.190 
   2.191 + }
   2.192 
   2.193 +MatrixProdPiece;
   2.194 
   2.195 +
   2.196 
   2.197 +//==============================  Functions  ================================
   2.198 
   2.199 +void readFile();
   2.200 
   2.201 +
   2.202 
   2.203 +Matrix *makeMatrix( int32 numRows, int32 numCols );
   2.204 
   2.205 +Matrix *makeMatrix_Flat( int32 numRows, int32 numCols );
   2.206 
   2.207 +Matrix *makeMatrix_WithResMat( int32 numRows, int32 numCols );
   2.208 
   2.209 +void    freeMatrix_Flat( Matrix * matrix );
   2.210 
   2.211 +void    freeMatrix( Matrix * matrix );
   2.212 
   2.213 +void    printMatrix( Matrix *matrix );
   2.214 
   2.215 +
   2.216 
   2.217 +void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName );
   2.218 
   2.219 +
   2.220 
   2.221 +void
   2.222 
   2.223 +initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
   2.224 
   2.225 +                              ParamBag *paramBag );
   2.226 
   2.227 +
   2.228 
   2.229 +//===========================================================================
   2.230 
   2.231 +
   2.232 
   2.233 +#endif /*MATRIX_MULT_H_*/
   2.234 
     3.1 --- a/src/Application/VCilk__Matrix_Mult/Divide_Pr.c	Mon Nov 15 12:08:19 2010 -0800
     3.2 +++ b/src/Application/VCilk__Matrix_Mult/Divide_Pr.c	Wed May 11 15:40:54 2011 +0200
     3.3 @@ -1,588 +1,588 @@
     3.4 -/*
     3.5 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
     3.6 - *  Licensed under GNU General Public License version 2
     3.7 - *
     3.8 - * Author: seanhalle@yahoo.com
     3.9 - *
    3.10 - */
    3.11 -
    3.12 -
    3.13 -#include "VCilk__Matrix_Mult.h"
    3.14 -#include <math.h>
    3.15 -#include <sys/time.h>
    3.16 -#include <string.h>
    3.17 -
    3.18 -   //The time to compute this many result values should equal the time to
    3.19 -   // perform this division on a matrix of size gives that many result calcs
    3.20 -   //IE, size this so that sequential time to calc equals divide time
    3.21 -   // find the value by experimenting -- but divide time and calc time scale
    3.22 -   // same way, so this value should remain valid across hardware
    3.23 -   //Divide time is about 800us on 2.4Ghz core2Quad laptop core
    3.24 -   //num cells is the cube of a side, when have two square matrices
    3.25 -#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 100000 /* about 46x46 */
    3.26 -
    3.27 -
    3.28 -//===========================================================================
    3.29 -int inline
    3.30 -measureMatrixMultPrimitive( VirtProcr *animPr );
    3.31 -
    3.32 -SlicingStrucCarrier *
    3.33 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
    3.34 -                                 VirtProcr *animPr );
    3.35 -
    3.36 -SlicingStruc *
    3.37 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
    3.38 -                  VirtProcr *animPr );
    3.39 -
    3.40 -void
    3.41 -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr );
    3.42 -
    3.43 -SubMatrix **
    3.44 -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    3.45 -                   Matrix *origMatrix, VirtProcr *animPr );
    3.46 -
    3.47 -void
    3.48 -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    3.49 -                 SubMatrix **subMatrices, VirtProcr *animPr );
    3.50 -
    3.51 -void
    3.52 -pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices,
    3.53 -                                    SubMatrix **rightSubMatrices,
    3.54 -                                    int32 numRowIdxs, int32 numColIdxs,
    3.55 -                                    int32 numVecIdxs,
    3.56 -                                    float32 *resultArray,
    3.57 -                                    VirtProcr *animatingPr );
    3.58 -
    3.59 -void
    3.60 -makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix,
    3.61 -            SlicingStrucCarrier *slicingStrucCarrier,
    3.62 -            float32 *resultArray, VirtProcr *animatingPr );
    3.63 -
    3.64 -//===========================================================================
    3.65 -
    3.66 -/*Divider creates one processor for every sub-matrix
    3.67 - * It hands them:
    3.68 - *  the name of the result processor that they should send their results to,
    3.69 - *  the left and right matrices, and the rows and cols they should multiply
    3.70 - * It first creates the result processor, then all the sub-matrixPair
    3.71 - *  processors,
    3.72 - *  then does a receive of a message from the result processor that gives
    3.73 - *  the divider ownership of the result matrix.
    3.74 - * Finally, the divider returns the result matrix out of the VCilk system.
    3.75 - *
    3.76 - * Divider chooses the size of sub-matrices via an algorithm that tries to
    3.77 - *  keep the minimum work above a threshold.  The threshold is machine-
    3.78 - *  dependent, so ask VCilk for min work-unit time to get a
    3.79 - *  given overhead
    3.80 - *
    3.81 - * Divide min work-unit cycles by measured-cycles for one matrix-cell
    3.82 - *  product -- gives the number of products need to have in min size
    3.83 - *  matrix.
    3.84 - *
    3.85 - * So then, take cubed root of this to get the size of a side of min sub-
    3.86 - *  matrix.  That is the size of the ideal square sub-matrix -- so tile
    3.87 - *  up the two input matrices into ones as close as possible to that size,
    3.88 - *  and create the pairs of sub-matrices.
    3.89 - *
    3.90 - *========================  STRATEGIC OVERVIEW  =======================
    3.91 - *
    3.92 - *This division is a bit tricky, because have to create things in advance
    3.93 - * that it's not at first obvious need to be created..
    3.94 - *
    3.95 - *First slice up each dimension -- three of them..  this is because will have
    3.96 - * to create the sub-matrix's data-structures before pairing the sub-matrices
    3.97 - * with each other -- so, have three dimensions to slice up before can
    3.98 - * create the sub-matrix data-strucs -- also, have to be certain that the
    3.99 - * cols of the left input have the exact same slicing as the rows of the
   3.100 - * left matrix, so just to be sure, do the slicing calc once, then use it
   3.101 - * for both.
   3.102 - *
   3.103 - *So, goes like this:
   3.104 - *1) calculate the start & end values of each dimension in each matrix.
   3.105 - *2) use those values to create sub-matrix structures
   3.106 - *3) combine sub-matrices into pairs, as the tasks to perform.
   3.107 - *
   3.108 - *Have to calculate separately from creating the sub-matrices because of the
   3.109 - * nature of the nesting -- would either end up creating the same sub-matrix
   3.110 - * multiple times, or else would have to put in detection of whether had
   3.111 - * made a particular one already if tried to combine steps 1 and 2.
   3.112 - *
   3.113 - *Step 3 has to be separate because of the nesting, as well -- same reason,
   3.114 - * would either create same sub-matrix multiple times, or else have to
   3.115 - * add detection of whether was already created.
   3.116 - *
   3.117 - *Another way to look at it: there's one level of loop to divide dimensions,
   3.118 - * two levels of nesting to create sub-matrices, and three levels to pair
   3.119 - * up the sub-matrices.
   3.120 - */
   3.121 -
   3.122 -void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
   3.123 -                                        VirtProcr *animPr )
   3.124 - { 
   3.125 -   DividerParams   *dividerParams;
   3.126 -   ResultsParams   *resultsParams;
   3.127 -   Matrix          *leftMatrix, *rightMatrix, *resultMatrix;
   3.128 -   void            *msg;
   3.129 -   SlicingStrucCarrier *slicingStrucCarrier;
   3.130 -   float32         *resultArray; //points to array to be put inside result
   3.131 -                                 // matrix
   3.132 -   
   3.133 -         DEBUG( dbgAppFlow, "start divide\n")
   3.134 -
   3.135 -         int32
   3.136 -         divideProbe = VMS__create_single_interval_probe( "divideProbe",
   3.137 -                                                          animPr );
   3.138 -         VMS__record_sched_choice_into_probe( divideProbe, animPr );
   3.139 -         VMS__record_interval_start_in_probe( divideProbe );
   3.140 -
   3.141 -   //=========== Setup -- make local copies of ptd-to-things, malloc, aso
   3.142 -   int32 numResRows, numResCols, vectLength;
   3.143 -
   3.144 -   dividerParams   = (DividerParams *)_dividerParams;
   3.145 -   
   3.146 -   leftMatrix      = dividerParams->leftMatrix;
   3.147 -   rightMatrix     = dividerParams->rightMatrix;
   3.148 -
   3.149 -   vectLength  = leftMatrix->numCols;
   3.150 -   numResRows  = leftMatrix->numRows;
   3.151 -   numResCols  = rightMatrix->numCols;
   3.152 -   resultArray = dividerParams->resultMatrix->array;
   3.153 -   
   3.154 -      //zero the result array
   3.155 -   memset( resultArray, 0, numResRows * numResCols * sizeof(float32) );
   3.156 -
   3.157 -   
   3.158 -   //==============  Do either sequential mult or do division ==============
   3.159 -
   3.160 -      //Check if input matrices too small -- if yes, just do sequential
   3.161 -      //Cutoff is determined by overhead of this divider -- relatively
   3.162 -      // machine-independent
   3.163 -   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
   3.164 -       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
   3.165 -    { int32 vectLength;
   3.166 -
   3.167 -      //====== Do sequential multiply on a single core
   3.168 -            DEBUG( dbgAppFlow, "doing sequential")
   3.169 -
   3.170 -         //transpose the right matrix
   3.171 -      float32 *
   3.172 -      transRightArray  = VCilk__malloc( rightMatrix->numRows *
   3.173 -                                        rightMatrix->numCols *
   3.174 -                                        sizeof(float32),       animPr );
   3.175 -
   3.176 -         //copy values from orig matrix to local
   3.177 -      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
   3.178 -                     0, 0, rightMatrix->numRows,
   3.179 -                     transRightArray, rightMatrix->array );
   3.180 -
   3.181 -      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
   3.182 -                            leftMatrix->array, transRightArray,
   3.183 -                            resultArray );
   3.184 -    }
   3.185 -   else
   3.186 -    {
   3.187 -      //====== Do parallel multiply across cores
   3.188 -
   3.189 -         //Calc the ideal size of sub-matrix and slice up the dimensions of
   3.190 -         // the two matrices.
   3.191 -         //The ideal size is the one takes the number of cycles to calculate
   3.192 -         // such that calc time is equal or greater than min work-unit size
   3.193 -      slicingStrucCarrier =
   3.194 -         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix,  animPr);
   3.195 -                                         
   3.196 -
   3.197 -
   3.198 -         //Make the sub-matrices, and pair them up, then spawn processors to
   3.199 -         // calc product of each pair.
   3.200 -      makeSubMatricesAndSpawnAndSync( leftMatrix, rightMatrix,
   3.201 -                                      slicingStrucCarrier,
   3.202 -                                      resultArray, animPr);
   3.203 -         //The result array will get filled in by the spawned children
   3.204 -    }
   3.205 -
   3.206 -
   3.207 -   //===============  Work done -- send results back =================
   3.208 -
   3.209 -
   3.210 -      //results have been saved into an array that was made outside the VMS
   3.211 -      // system, by entry-point Fn, and passed in through dividerParams.
   3.212 -      //So, nothing to do to send results back -- they're seen by side-effect
   3.213 -
   3.214 -         DEBUG( dbgAppFlow, "*** end divide ***\n")
   3.215 -
   3.216 -         VMS__record_interval_end_in_probe( divideProbe );
   3.217 -         VMS__print_stats_of_all_probes();
   3.218 -
   3.219 -   VCilk__dissipate_procr( animPr );  //all procrs dissipate self at end
   3.220 -      //when all of the processors have dissipated, the "create seed and do
   3.221 -      // work" call in the entry point function returns
   3.222 - }
   3.223 -
   3.224 -
   3.225 -SlicingStrucCarrier *
   3.226 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
   3.227 -                                 VirtProcr *animPr )
   3.228 -{
   3.229 -   float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
   3.230 -   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   3.231 -   SlicingStrucCarrier *slicingStrucCarrier =
   3.232 -                         VCilk__malloc(sizeof(SlicingStrucCarrier), animPr );
   3.233 -
   3.234 -   int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
   3.235 -   float64 numPrimitiveOpsInMinWorkUnit;
   3.236 -
   3.237 -
   3.238 -   //=======  Calc ideal size of min-sized sub-matrix  ========
   3.239 -
   3.240 -      //ask VCilk for the number of cycles of the minimum work unit, at given
   3.241 -      // percent overhead then add a guess at overhead from this divider
   3.242 -   minWorkUnitCycles = VCilk__giveMinWorkUnitCycles( .05 );
   3.243 -
   3.244 -      //ask VCilk for number of cycles of the "primitive" op of matrix mult
   3.245 -   primitiveCycles = measureMatrixMultPrimitive( animPr );
   3.246 -
   3.247 -   numPrimitiveOpsInMinWorkUnit =
   3.248 -      (float64)minWorkUnitCycles / (float64)primitiveCycles;
   3.249 -
   3.250 -      //take cubed root -- that's number of these in a "side" of sub-matrix
   3.251 -      // then multiply by 5 because the primitive is 5x5
   3.252 -   idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit );
   3.253 -
   3.254 -   idealNumWorkUnits = VCilk__giveIdealNumWorkUnits();
   3.255 -   
   3.256 -   idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
   3.257 -   idealSizeOfSide2 *= 0.8; //finer granularity to help load balance
   3.258 -
   3.259 -   if( idealSizeOfSide1 > idealSizeOfSide2 )
   3.260 -      idealSizeOfSide = idealSizeOfSide1;
   3.261 -   else
   3.262 -      idealSizeOfSide = idealSizeOfSide2;
   3.263 -
   3.264 -      //The multiply inner loop blocks the array to fit into L1 cache
   3.265 -//   if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK;
   3.266 -
   3.267 -   //============  Slice up dimensions, now that know target size ===========
   3.268 -
   3.269 -      //Tell the slicer the target size of a side (floating pt), the start
   3.270 -      // value to start slicing at, and the end value to stop slicing at
   3.271 -      //It returns an array of start value of each chunk, plus number of them
   3.272 -   int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol;
   3.273 -   startLeftRow  = 0;
   3.274 -   endLeftRow    = leftMatrix->numRows -1;
   3.275 -   startVec      = 0;
   3.276 -   endVec        = leftMatrix->numCols -1;
   3.277 -   startRightCol = 0;
   3.278 -   endRightCol   = rightMatrix->numCols -1;
   3.279 -
   3.280 -   leftRowSlices =
   3.281 -      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow, animPr );
   3.282 -
   3.283 -   vecSlices =
   3.284 -      sliceUpDimension( idealSizeOfSide,  startVec, endVec, animPr );
   3.285 -
   3.286 -   rightColSlices =
   3.287 -      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol,animPr);
   3.288 -
   3.289 -   slicingStrucCarrier->leftRowSlices  = leftRowSlices;
   3.290 -   slicingStrucCarrier->vecSlices      = vecSlices;
   3.291 -   slicingStrucCarrier->rightColSlices = rightColSlices;
   3.292 -
   3.293 -         DEBUG1( dbgAppFlow, "leftRowSlices %d | ", leftRowSlices->numVals );
   3.294 -         DEBUG1( dbgAppFlow, "rightColSlices %d | ",rightColSlices->numVals);
   3.295 -         DEBUG1( dbgAppFlow, "vecSlices %d\n", vecSlices->numVals );
   3.296 -   return slicingStrucCarrier;
   3.297 -}
   3.298 -
   3.299 -
   3.300 -void inline
   3.301 -makeSubMatricesAndSpawnAndSync( Matrix  *leftMatrix,  Matrix    *rightMatrix,
   3.302 -                         SlicingStrucCarrier *slicingStrucCarrier,
   3.303 -                         float32 *resultArray, VirtProcr *animPr )
   3.304 - {
   3.305 -   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   3.306 -   
   3.307 -   leftRowSlices  = slicingStrucCarrier->leftRowSlices;
   3.308 -   vecSlices      = slicingStrucCarrier->vecSlices;
   3.309 -   rightColSlices = slicingStrucCarrier->rightColSlices;
   3.310 -   VCilk__free( slicingStrucCarrier, animPr );
   3.311 -   
   3.312 -   //================  Make sub-matrices, given the slicing  ================
   3.313 -   SubMatrix **leftSubMatrices, **rightSubMatrices;
   3.314 -   leftSubMatrices =
   3.315 -      createSubMatrices( leftRowSlices, vecSlices,
   3.316 -                         leftMatrix, animPr );
   3.317 -   rightSubMatrices =
   3.318 -      createSubMatrices( vecSlices, rightColSlices,
   3.319 -                         rightMatrix, animPr );
   3.320 -
   3.321 -   freeSlicingStruc( leftRowSlices, animPr );
   3.322 -   freeSlicingStruc( vecSlices, animPr );
   3.323 -   freeSlicingStruc( rightColSlices, animPr );
   3.324 -
   3.325 -   //==============  pair the sub-matrices and make processors ==============
   3.326 -   int32 numRowIdxs, numColIdxs, numVecIdxs;
   3.327 -
   3.328 -   numRowIdxs = leftRowSlices->numVals;
   3.329 -   numColIdxs = rightColSlices->numVals;
   3.330 -   numVecIdxs = vecSlices->numVals;
   3.331 -   pairUpSubMatricesAndSpawnAndSync( leftSubMatrices, rightSubMatrices,
   3.332 -                                     numRowIdxs, numColIdxs, numVecIdxs,
   3.333 -                                     resultArray,
   3.334 -                                     animPr );
   3.335 -      //It syncs inside, so know all work is done now: free the sub-matrices
   3.336 -   freeSubMatrices( leftRowSlices, vecSlices,  leftSubMatrices, animPr );
   3.337 -   freeSubMatrices( vecSlices, rightColSlices, rightSubMatrices, animPr );
   3.338 - }
   3.339 -
   3.340 -
   3.341 -
   3.342 -
   3.343 -/* numRows*colsPerRow/numCores = numToPutOntoEachCore; 
   3.344 - * put all from a given row onto same core, until exhaust allotment for that
   3.345 - *  core
   3.346 - *
   3.347 - */
   3.348 -void inline
   3.349 -pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices,
   3.350 -                                    SubMatrix **rightSubMatrices,
   3.351 -                                    int32 numRowIdxs, int32 numColIdxs,
   3.352 -                                    int32 numVecIdxs,
   3.353 -                                    float32 *resultArray,
   3.354 -                                    VirtProcr *animatingPr )
   3.355 - {
   3.356 -   int32 resRowIdx, resColIdx;
   3.357 -   int32 numLeftColIdxs, numRightColIdxs;
   3.358 -   int32 leftRowIdxOffset;
   3.359 -   VecParams *vecParams;
   3.360 -   float32 numToPutOntoEachCore, leftOverFraction;
   3.361 -   int32 numCores, currCore, numOnCurrCore;
   3.362 -
   3.363 -   numLeftColIdxs  = numColIdxs;
   3.364 -   numRightColIdxs = numVecIdxs;
   3.365 -
   3.366 -   numCores = VCilk__give_number_of_cores_to_spawn_onto();   
   3.367 -
   3.368 -   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
   3.369 -   leftOverFraction = 0;
   3.370 -   numOnCurrCore = 0;
   3.371 -   currCore = 0;
   3.372 -
   3.373 -   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
   3.374 -    {
   3.375 -      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
   3.376 -
   3.377 -      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
   3.378 -       {
   3.379 -         vecParams = VCilk__malloc( sizeof(VecParams), animatingPr );
   3.380 -         
   3.381 -         vecParams->numVecIdxs       = numVecIdxs;
   3.382 -         vecParams->numRightColIdxs  = numRightColIdxs;
   3.383 -         vecParams->leftRowIdxOffset = leftRowIdxOffset;
   3.384 -         vecParams->resColIdx        = resColIdx;
   3.385 -         vecParams->leftSubMatrices  = leftSubMatrices;
   3.386 -         vecParams->rightSubMatrices = rightSubMatrices;
   3.387 -         vecParams->resultArray      = resultArray;
   3.388 -         vecParams->coreToRunOn      = currCore;
   3.389 -
   3.390 -         VCilk__spawn( currCore, &calcVectorOfSubMatrices, vecParams,
   3.391 -                       animatingPr );
   3.392 -
   3.393 -         numOnCurrCore += 1;
   3.394 -         if( numOnCurrCore + leftOverFraction >= numToPutOntoEachCore - 1 )
   3.395 -          {
   3.396 -               //deal with fractional part, to ensure that imbalance is 1 max
   3.397 -               // IE, core with most has only 1 more than core with least
   3.398 -            leftOverFraction += numToPutOntoEachCore - numOnCurrCore;
   3.399 -            if( leftOverFraction >= 1 )
   3.400 -             { leftOverFraction -= 1;
   3.401 -               numOnCurrCore = -1;
   3.402 -             }
   3.403 -            else
   3.404 -             { numOnCurrCore = 0;
   3.405 -             }
   3.406 -               //Move to next core, max core-value to incr to is numCores -1
   3.407 -            if( currCore >= numCores -1 )
   3.408 -             { currCore = 0;
   3.409 -             }
   3.410 -            else
   3.411 -             { currCore += 1;
   3.412 -             }
   3.413 -          }
   3.414 -       }
   3.415 -    }
   3.416 -   
   3.417 -   //Free Note: vector of sub-matrices does its own free-ing, even vec-params
   3.418 -
   3.419 -//TODO: timeToSpawnProbe = VMS__get_probe_by_name( "timeToSpawnProbe" );
   3.420 -//      VMS__end_interval_on_probe( timeToSpawnProbe );
   3.421 -
   3.422 -   VCilk__sync( animatingPr );
   3.423 -
   3.424 -   //free the sub-matrices in Fn that called this one
   3.425 - }
   3.426 -
   3.427 -
   3.428 -/*Walk through the two slice-strucs, making sub-matrix strucs as go
   3.429 - */
   3.430 -SubMatrix **
   3.431 -createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   3.432 -                   Matrix *origMatrix, VirtProcr *animPr )
   3.433 - {
   3.434 -   int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
   3.435 -   int32 startRow, endRow, startCol, endCol;
   3.436 -   int32 *rowStartVals, *colStartVals;
   3.437 -   int32 rowOffset;
   3.438 -   SubMatrix **subMatrices, *newSubMatrix;
   3.439 -
   3.440 -   numRowIdxs = rowSlices->numVals;
   3.441 -   numColIdxs = colSlices->numVals;
   3.442 -
   3.443 -   rowStartVals = rowSlices->startVals;
   3.444 -   colStartVals = colSlices->startVals;
   3.445 -
   3.446 -   subMatrices = VCilk__malloc( numRowIdxs * numColIdxs *sizeof(SubMatrix *),
   3.447 -                                animPr );
   3.448 -
   3.449 -   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   3.450 -    {
   3.451 -      rowOffset = rowIdx * numColIdxs;
   3.452 -      
   3.453 -      startRow  = rowStartVals[rowIdx];
   3.454 -      endRow    = rowStartVals[rowIdx + 1] -1; //"fake" start above last is
   3.455 -                                               // at last valid idx + 1 & is
   3.456 -                                               // 1 greater than end value
   3.457 -      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
   3.458 -       {
   3.459 -         startCol = colStartVals[colIdx];
   3.460 -         endCol   = colStartVals[colIdx + 1] -1;
   3.461 -
   3.462 -         newSubMatrix = VCilk__malloc( sizeof(SubMatrix), animPr );
   3.463 -         newSubMatrix->numRows       = endRow - startRow +1;
   3.464 -         newSubMatrix->numCols       = endCol - startCol +1;
   3.465 -         newSubMatrix->origMatrix    = origMatrix;
   3.466 -         newSubMatrix->origStartRow  = startRow;
   3.467 -         newSubMatrix->origStartCol  = startCol;
   3.468 -         newSubMatrix->alreadyCopied = FALSE;
   3.469 -
   3.470 -         subMatrices[ rowOffset + colIdx ] = newSubMatrix;
   3.471 -       }
   3.472 -    }
   3.473 -   return subMatrices;
   3.474 - }
   3.475 -
   3.476 -void
   3.477 -freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   3.478 -                 SubMatrix **subMatrices, VirtProcr *animPr )
   3.479 - {
   3.480 -   int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset;
   3.481 -   SubMatrix *subMatrix;
   3.482 -
   3.483 -   numRowIdxs = rowSlices->numVals;
   3.484 -   numColIdxs = colSlices->numVals;
   3.485 -
   3.486 -   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   3.487 -    {
   3.488 -      rowOffset = rowIdx * numColIdxs;
   3.489 -      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
   3.490 -       {
   3.491 -         subMatrix = subMatrices[ rowOffset + colIdx ];
   3.492 -         if( subMatrix->alreadyCopied )
   3.493 -            VCilk__free( subMatrix->array, animPr );
   3.494 -         VCilk__free( subMatrix, animPr );
   3.495 -       }
   3.496 -    }
   3.497 -   VCilk__free( subMatrices, animPr );
   3.498 - }
   3.499 -
   3.500 -
   3.501 -
   3.502 -SlicingStruc *
   3.503 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
   3.504 -                  VirtProcr *animPr )
   3.505 - { float32 residualAcc = 0;
   3.506 -   int     numSlices, i, *startVals, sizeOfSlice, endCondition;
   3.507 -   SlicingStruc *slicingStruc = VCilk__malloc( sizeof(SlicingStruc), animPr);
   3.508 -
   3.509 -      //calc size of matrix need to hold start vals --
   3.510 -   numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
   3.511 -
   3.512 -   startVals = VCilk__malloc( (numSlices + 1) * sizeof(int32), animPr );
   3.513 -
   3.514 -      //Calc the upper limit of start value -- when get above this, end loop
   3.515 -      // by saving highest value of the matrix dimension to access, plus 1
   3.516 -      // as the start point of the imaginary slice following the last one
   3.517 -      //Plus 1 because go up to value but not include when process last slice
   3.518 -      //The stopping condition is half-a-size less than highest value because
   3.519 -      // don't want any pieces smaller than half the ideal size -- just tack
   3.520 -      // little ones onto end of last one
   3.521 -   endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size
   3.522 -   for( i = 0; startVal <= endVal; i++ )
   3.523 -    {
   3.524 -      startVals[i] = startVal;
   3.525 -      residualAcc += idealSizeOfSide;
   3.526 -      sizeOfSlice  = (int)residualAcc;
   3.527 -      residualAcc -= (float32)sizeOfSlice;
   3.528 -      startVal    += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..
   3.529 -
   3.530 -      if( startVal > endCondition )
   3.531 -       { startVal = endVal + 1;
   3.532 -         startVals[ i + 1 ] = startVal;
   3.533 -       }
   3.534 -    }
   3.535 -
   3.536 -   slicingStruc->startVals = startVals;
   3.537 -   slicingStruc->numVals   = i;  //loop incr'd, so == last valid start idx+1
   3.538 -                                 // which means is num sub-matrices in dim
   3.539 -                                 // also == idx of the fake start just above
   3.540 -   return slicingStruc;
   3.541 - }
   3.542 -
   3.543 -void
   3.544 -freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr )
   3.545 - {
   3.546 -   VCilk__free( slicingStruc->startVals, animPr );
   3.547 -   VCilk__free( slicingStruc, animPr );
   3.548 - }
   3.549 -
   3.550 -
   3.551 -int inline
   3.552 -measureMatrixMultPrimitive( VirtProcr *animPr )
   3.553 - {
   3.554 -   int r, c, v, numCycles;
   3.555 -   float32 *res, *left, *right;
   3.556 -
   3.557 -      //setup inputs
   3.558 -   left  = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr );
   3.559 -   right = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr );
   3.560 -   res   = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr );
   3.561 -
   3.562 -   for( r = 0; r < 5; r++ )
   3.563 -    {
   3.564 -      for( c = 0; c < 5; c++ )
   3.565 -       {
   3.566 -         left[  r * 5 + c ] = r;
   3.567 -         right[ r * 5 + c ] = c;
   3.568 -       }
   3.569 -    }
   3.570 -
   3.571 -      //do primitive
   3.572 -   VCilk__start_primitive();  //for now, just takes time stamp
   3.573 -   for( r = 0; r < 5; r++ )
   3.574 -    {
   3.575 -      for( c = 0; c < 5; c++ )
   3.576 -       {
   3.577 -         for( v = 0; v < 5; v++ )
   3.578 -          {
   3.579 -            res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ];
   3.580 -          }
   3.581 -       }
   3.582 -    }
   3.583 -   numCycles =
   3.584 -      VCilk__end_primitive_and_give_cycles(); 
   3.585 -
   3.586 -   VCilk__free( left, animPr );
   3.587 -   VCilk__free( right, animPr );
   3.588 -   VCilk__free( res, animPr );
   3.589 -   
   3.590 -   return numCycles;
   3.591 - }
   3.592 +/*
   3.593 
   3.594 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
   3.595 
   3.596 + *  Licensed under GNU General Public License version 2
   3.597 
   3.598 + *
   3.599 
   3.600 + * Author: seanhalle@yahoo.com
   3.601 
   3.602 + *
   3.603 
   3.604 + */
   3.605 
   3.606 +
   3.607 
   3.608 +
   3.609 
   3.610 +#include "VCilk__Matrix_Mult.h"
   3.611 
   3.612 +#include <math.h>
   3.613 
   3.614 +#include <sys/time.h>
   3.615 
   3.616 +#include <string.h>
   3.617 
   3.618 +
   3.619 
   3.620 +   //The time to compute this many result values should equal the time to
   3.621 
   3.622 +   // perform this division on a matrix of size gives that many result calcs
   3.623 
   3.624 +   //IE, size this so that sequential time to calc equals divide time
   3.625 
   3.626 +   // find the value by experimenting -- but divide time and calc time scale
   3.627 
   3.628 +   // same way, so this value should remain valid across hardware
   3.629 
   3.630 +   //Divide time is about 800us on 2.4Ghz core2Quad laptop core
   3.631 
   3.632 +   //num cells is the cube of a side, when have two square matrices
   3.633 
   3.634 +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 100000 /* about 46x46 */
   3.635 
   3.636 +
   3.637 
   3.638 +
   3.639 
   3.640 +//===========================================================================
   3.641 
   3.642 +int inline
   3.643 
   3.644 +measureMatrixMultPrimitive( VirtProcr *animPr );
   3.645 
   3.646 +
   3.647 
   3.648 +SlicingStrucCarrier *
   3.649 
   3.650 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
   3.651 
   3.652 +                                 VirtProcr *animPr );
   3.653 
   3.654 +
   3.655 
   3.656 +SlicingStruc *
   3.657 
   3.658 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
   3.659 
   3.660 +                  VirtProcr *animPr );
   3.661 
   3.662 +
   3.663 
   3.664 +void
   3.665 
   3.666 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr );
   3.667 
   3.668 +
   3.669 
   3.670 +SubMatrix **
   3.671 
   3.672 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   3.673 
   3.674 +                   Matrix *origMatrix, VirtProcr *animPr );
   3.675 
   3.676 +
   3.677 
   3.678 +void
   3.679 
   3.680 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   3.681 
   3.682 +                 SubMatrix **subMatrices, VirtProcr *animPr );
   3.683 
   3.684 +
   3.685 
   3.686 +void
   3.687 
   3.688 +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices,
   3.689 
   3.690 +                                    SubMatrix **rightSubMatrices,
   3.691 
   3.692 +                                    int32 numRowIdxs, int32 numColIdxs,
   3.693 
   3.694 +                                    int32 numVecIdxs,
   3.695 
   3.696 +                                    float32 *resultArray,
   3.697 
   3.698 +                                    VirtProcr *animatingPr );
   3.699 
   3.700 +
   3.701 
   3.702 +void
   3.703 
   3.704 +makeSubMatricesAndSpawnAndSync( Matrix *leftMatrix, Matrix *rightMatrix,
   3.705 
   3.706 +            SlicingStrucCarrier *slicingStrucCarrier,
   3.707 
   3.708 +            float32 *resultArray, VirtProcr *animatingPr );
   3.709 
   3.710 +
   3.711 
   3.712 +//===========================================================================
   3.713 
   3.714 +
   3.715 
   3.716 +/*Divider creates one processor for every sub-matrix
   3.717 
   3.718 + * It hands them:
   3.719 
   3.720 + *  the name of the result processor that they should send their results to,
   3.721 
   3.722 + *  the left and right matrices, and the rows and cols they should multiply
   3.723 
   3.724 + * It first creates the result processor, then all the sub-matrixPair
   3.725 
   3.726 + *  processors,
   3.727 
   3.728 + *  then does a receive of a message from the result processor that gives
   3.729 
   3.730 + *  the divider ownership of the result matrix.
   3.731 
   3.732 + * Finally, the divider returns the result matrix out of the VCilk system.
   3.733 
   3.734 + *
   3.735 
   3.736 + * Divider chooses the size of sub-matrices via an algorithm that tries to
   3.737 
   3.738 + *  keep the minimum work above a threshold.  The threshold is machine-
   3.739 
   3.740 + *  dependent, so ask VCilk for min work-unit time to get a
   3.741 
   3.742 + *  given overhead
   3.743 
   3.744 + *
   3.745 
   3.746 + * Divide min work-unit cycles by measured-cycles for one matrix-cell
   3.747 
   3.748 + *  product -- gives the number of products need to have in min size
   3.749 
   3.750 + *  matrix.
   3.751 
   3.752 + *
   3.753 
   3.754 + * So then, take cubed root of this to get the size of a side of min sub-
   3.755 
   3.756 + *  matrix.  That is the size of the ideal square sub-matrix -- so tile
   3.757 
   3.758 + *  up the two input matrices into ones as close as possible to that size,
   3.759 
   3.760 + *  and create the pairs of sub-matrices.
   3.761 
   3.762 + *
   3.763 
   3.764 + *========================  STRATEGIC OVERVIEW  =======================
   3.765 
   3.766 + *
   3.767 
   3.768 + *This division is a bit tricky, because have to create things in advance
   3.769 
   3.770 + * that it's not at first obvious need to be created..
   3.771 
   3.772 + *
   3.773 
   3.774 + *First slice up each dimension -- three of them..  this is because will have
   3.775 
   3.776 + * to create the sub-matrix's data-structures before pairing the sub-matrices
   3.777 
   3.778 + * with each other -- so, have three dimensions to slice up before can
   3.779 
   3.780 + * create the sub-matrix data-strucs -- also, have to be certain that the
   3.781 
   3.782 + * cols of the left input have the exact same slicing as the rows of the
   3.783 
   3.784 + * left matrix, so just to be sure, do the slicing calc once, then use it
   3.785 
   3.786 + * for both.
   3.787 
   3.788 + *
   3.789 
   3.790 + *So, goes like this:
   3.791 
   3.792 + *1) calculate the start & end values of each dimension in each matrix.
   3.793 
   3.794 + *2) use those values to create sub-matrix structures
   3.795 
   3.796 + *3) combine sub-matrices into pairs, as the tasks to perform.
   3.797 
   3.798 + *
   3.799 
   3.800 + *Have to calculate separately from creating the sub-matrices because of the
   3.801 
   3.802 + * nature of the nesting -- would either end up creating the same sub-matrix
   3.803 
   3.804 + * multiple times, or else would have to put in detection of whether had
   3.805 
   3.806 + * made a particular one already if tried to combine steps 1 and 2.
   3.807 
   3.808 + *
   3.809 
   3.810 + *Step 3 has to be separate because of the nesting, as well -- same reason,
   3.811 
   3.812 + * would either create same sub-matrix multiple times, or else have to
   3.813 
   3.814 + * add detection of whether was already created.
   3.815 
   3.816 + *
   3.817 
   3.818 + *Another way to look at it: there's one level of loop to divide dimensions,
   3.819 
   3.820 + * two levels of nesting to create sub-matrices, and three levels to pair
   3.821 
   3.822 + * up the sub-matrices.
   3.823 
   3.824 + */
   3.825 
   3.826 +
   3.827 
   3.828 +void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
   3.829 
   3.830 +                                        VirtProcr *animPr )
   3.831 
   3.832 + { 
   3.833 
   3.834 +   DividerParams   *dividerParams;
   3.835 
   3.836 +   ResultsParams   *resultsParams;
   3.837 
   3.838 +   Matrix          *leftMatrix, *rightMatrix, *resultMatrix;
   3.839 
   3.840 +   void            *msg;
   3.841 
   3.842 +   SlicingStrucCarrier *slicingStrucCarrier;
   3.843 
   3.844 +   float32         *resultArray; //points to array to be put inside result
   3.845 
   3.846 +                                 // matrix
   3.847 
   3.848 +   
   3.849 
   3.850 +         DEBUG( dbgAppFlow, "start divide\n")
   3.851 
   3.852 +
   3.853 
   3.854 +         int32
   3.855 
   3.856 +         divideProbe = VMS__create_single_interval_probe( "divideProbe",
   3.857 
   3.858 +                                                          animPr );
   3.859 
   3.860 +         VMS__record_sched_choice_into_probe( divideProbe, animPr );
   3.861 
   3.862 +         VMS__record_interval_start_in_probe( divideProbe );
   3.863 
   3.864 +
   3.865 
   3.866 +   //=========== Setup -- make local copies of ptd-to-things, malloc, aso
   3.867 
   3.868 +   int32 numResRows, numResCols, vectLength;
   3.869 
   3.870 +
   3.871 
   3.872 +   dividerParams   = (DividerParams *)_dividerParams;
   3.873 
   3.874 +   
   3.875 
   3.876 +   leftMatrix      = dividerParams->leftMatrix;
   3.877 
   3.878 +   rightMatrix     = dividerParams->rightMatrix;
   3.879 
   3.880 +
   3.881 
   3.882 +   vectLength  = leftMatrix->numCols;
   3.883 
   3.884 +   numResRows  = leftMatrix->numRows;
   3.885 
   3.886 +   numResCols  = rightMatrix->numCols;
   3.887 
   3.888 +   resultArray = dividerParams->resultMatrix->array;
   3.889 
   3.890 +   
   3.891 
   3.892 +      //zero the result array
   3.893 
   3.894 +   memset( resultArray, 0, numResRows * numResCols * sizeof(float32) );
   3.895 
   3.896 +
   3.897 
   3.898 +   
   3.899 
   3.900 +   //==============  Do either sequential mult or do division ==============
   3.901 
   3.902 +
   3.903 
   3.904 +      //Check if input matrices too small -- if yes, just do sequential
   3.905 
   3.906 +      //Cutoff is determined by overhead of this divider -- relatively
   3.907 
   3.908 +      // machine-independent
   3.909 
   3.910 +   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
   3.911 
   3.912 +       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
   3.913 
   3.914 +    { int32 vectLength;
   3.915 
   3.916 +
   3.917 
   3.918 +      //====== Do sequential multiply on a single core
   3.919 
   3.920 +            DEBUG( dbgAppFlow, "doing sequential")
   3.921 
   3.922 +
   3.923 
   3.924 +         //transpose the right matrix
   3.925 
   3.926 +      float32 *
   3.927 
   3.928 +      transRightArray  = VCilk__malloc( rightMatrix->numRows *
   3.929 
   3.930 +                                        rightMatrix->numCols *
   3.931 
   3.932 +                                        sizeof(float32),       animPr );
   3.933 
   3.934 +
   3.935 
   3.936 +         //copy values from orig matrix to local
   3.937 
   3.938 +      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
   3.939 
   3.940 +                     0, 0, rightMatrix->numRows,
   3.941 
   3.942 +                     transRightArray, rightMatrix->array );
   3.943 
   3.944 +
   3.945 
   3.946 +      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
   3.947 
   3.948 +                            leftMatrix->array, transRightArray,
   3.949 
   3.950 +                            resultArray );
   3.951 
   3.952 +    }
   3.953 
   3.954 +   else
   3.955 
   3.956 +    {
   3.957 
   3.958 +      //====== Do parallel multiply across cores
   3.959 
   3.960 +
   3.961 
   3.962 +         //Calc the ideal size of sub-matrix and slice up the dimensions of
   3.963 
   3.964 +         // the two matrices.
   3.965 
   3.966 +         //The ideal size is the one takes the number of cycles to calculate
   3.967 
   3.968 +         // such that calc time is equal or greater than min work-unit size
   3.969 
   3.970 +      slicingStrucCarrier =
   3.971 
   3.972 +         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix,  animPr);
   3.973 
   3.974 +                                         
   3.975 
   3.976 +
   3.977 
   3.978 +
   3.979 
   3.980 +         //Make the sub-matrices, and pair them up, then spawn processors to
   3.981 
   3.982 +         // calc product of each pair.
   3.983 
   3.984 +      makeSubMatricesAndSpawnAndSync( leftMatrix, rightMatrix,
   3.985 
   3.986 +                                      slicingStrucCarrier,
   3.987 
   3.988 +                                      resultArray, animPr);
   3.989 
   3.990 +         //The result array will get filled in by the spawned children
   3.991 
   3.992 +    }
   3.993 
   3.994 +
   3.995 
   3.996 +
   3.997 
   3.998 +   //===============  Work done -- send results back =================
   3.999 
  3.1000 +
  3.1001 
  3.1002 +
  3.1003 
  3.1004 +      //results have been saved into an array that was made outside the VMS
  3.1005 
  3.1006 +      // system, by entry-point Fn, and passed in through dividerParams.
  3.1007 
  3.1008 +      //So, nothing to do to send results back -- they're seen by side-effect
  3.1009 
  3.1010 +
  3.1011 
  3.1012 +         DEBUG( dbgAppFlow, "*** end divide ***\n")
  3.1013 
  3.1014 +
  3.1015 
  3.1016 +         VMS__record_interval_end_in_probe( divideProbe );
  3.1017 
  3.1018 +         VMS__print_stats_of_all_probes();
  3.1019 
  3.1020 +
  3.1021 
  3.1022 +   VCilk__dissipate_procr( animPr );  //all procrs dissipate self at end
  3.1023 
  3.1024 +      //when all of the processors have dissipated, the "create seed and do
  3.1025 
  3.1026 +      // work" call in the entry point function returns
  3.1027 
  3.1028 + }
  3.1029 
  3.1030 +
  3.1031 
  3.1032 +
  3.1033 
  3.1034 +SlicingStrucCarrier *
  3.1035 
  3.1036 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
  3.1037 
  3.1038 +                                 VirtProcr *animPr )
  3.1039 
  3.1040 +{
  3.1041 
  3.1042 +   float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
  3.1043 
  3.1044 +   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
  3.1045 
  3.1046 +   SlicingStrucCarrier *slicingStrucCarrier =
  3.1047 
  3.1048 +                         VCilk__malloc(sizeof(SlicingStrucCarrier), animPr );
  3.1049 
  3.1050 +
  3.1051 
  3.1052 +   int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
  3.1053 
  3.1054 +   float64 numPrimitiveOpsInMinWorkUnit;
  3.1055 
  3.1056 +
  3.1057 
  3.1058 +
  3.1059 
  3.1060 +   //=======  Calc ideal size of min-sized sub-matrix  ========
  3.1061 
  3.1062 +
  3.1063 
  3.1064 +      //ask VCilk for the number of cycles of the minimum work unit, at given
  3.1065 
  3.1066 +      // percent overhead then add a guess at overhead from this divider
  3.1067 
  3.1068 +   minWorkUnitCycles = VCilk__giveMinWorkUnitCycles( .05 );
  3.1069 
  3.1070 +
  3.1071 
  3.1072 +      //ask VCilk for number of cycles of the "primitive" op of matrix mult
  3.1073 
  3.1074 +   primitiveCycles = measureMatrixMultPrimitive( animPr );
  3.1075 
  3.1076 +
  3.1077 
  3.1078 +   numPrimitiveOpsInMinWorkUnit =
  3.1079 
  3.1080 +      (float64)minWorkUnitCycles / (float64)primitiveCycles;
  3.1081 
  3.1082 +
  3.1083 
  3.1084 +      //take cubed root -- that's number of these in a "side" of sub-matrix
  3.1085 
  3.1086 +      // then multiply by 5 because the primitive is 5x5
  3.1087 
  3.1088 +   idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit );
  3.1089 
  3.1090 +
  3.1091 
  3.1092 +   idealNumWorkUnits = VCilk__giveIdealNumWorkUnits();
  3.1093 
  3.1094 +   
  3.1095 
  3.1096 +   idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
  3.1097 
  3.1098 +   idealSizeOfSide2 *= 0.8; //finer granularity to help load balance
  3.1099 
  3.1100 +
  3.1101 
  3.1102 +   if( idealSizeOfSide1 > idealSizeOfSide2 )
  3.1103 
  3.1104 +      idealSizeOfSide = idealSizeOfSide1;
  3.1105 
  3.1106 +   else
  3.1107 
  3.1108 +      idealSizeOfSide = idealSizeOfSide2;
  3.1109 
  3.1110 +
  3.1111 
  3.1112 +      //The multiply inner loop blocks the array to fit into L1 cache
  3.1113 
  3.1114 +//   if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK;
  3.1115 
  3.1116 +
  3.1117 
  3.1118 +   //============  Slice up dimensions, now that know target size ===========
  3.1119 
  3.1120 +
  3.1121 
  3.1122 +      //Tell the slicer the target size of a side (floating pt), the start
  3.1123 
  3.1124 +      // value to start slicing at, and the end value to stop slicing at
  3.1125 
  3.1126 +      //It returns an array of start value of each chunk, plus number of them
  3.1127 
  3.1128 +   int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol;
  3.1129 
  3.1130 +   startLeftRow  = 0;
  3.1131 
  3.1132 +   endLeftRow    = leftMatrix->numRows -1;
  3.1133 
  3.1134 +   startVec      = 0;
  3.1135 
  3.1136 +   endVec        = leftMatrix->numCols -1;
  3.1137 
  3.1138 +   startRightCol = 0;
  3.1139 
  3.1140 +   endRightCol   = rightMatrix->numCols -1;
  3.1141 
  3.1142 +
  3.1143 
  3.1144 +   leftRowSlices =
  3.1145 
  3.1146 +      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow, animPr );
  3.1147 
  3.1148 +
  3.1149 
  3.1150 +   vecSlices =
  3.1151 
  3.1152 +      sliceUpDimension( idealSizeOfSide,  startVec, endVec, animPr );
  3.1153 
  3.1154 +
  3.1155 
  3.1156 +   rightColSlices =
  3.1157 
  3.1158 +      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol,animPr);
  3.1159 
  3.1160 +
  3.1161 
  3.1162 +   slicingStrucCarrier->leftRowSlices  = leftRowSlices;
  3.1163 
  3.1164 +   slicingStrucCarrier->vecSlices      = vecSlices;
  3.1165 
  3.1166 +   slicingStrucCarrier->rightColSlices = rightColSlices;
  3.1167 
  3.1168 +
  3.1169 
  3.1170 +         DEBUG1( dbgAppFlow, "leftRowSlices %d | ", leftRowSlices->numVals );
  3.1171 
  3.1172 +         DEBUG1( dbgAppFlow, "rightColSlices %d | ",rightColSlices->numVals);
  3.1173 
  3.1174 +         DEBUG1( dbgAppFlow, "vecSlices %d\n", vecSlices->numVals );
  3.1175 
  3.1176 +   return slicingStrucCarrier;
  3.1177 
  3.1178 +}
  3.1179 
  3.1180 +
  3.1181 
  3.1182 +
  3.1183 
  3.1184 +void inline
  3.1185 
  3.1186 +makeSubMatricesAndSpawnAndSync( Matrix  *leftMatrix,  Matrix    *rightMatrix,
  3.1187 
  3.1188 +                         SlicingStrucCarrier *slicingStrucCarrier,
  3.1189 
  3.1190 +                         float32 *resultArray, VirtProcr *animPr )
  3.1191 
  3.1192 + {
  3.1193 
  3.1194 +   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
  3.1195 
  3.1196 +   
  3.1197 
  3.1198 +   leftRowSlices  = slicingStrucCarrier->leftRowSlices;
  3.1199 
  3.1200 +   vecSlices      = slicingStrucCarrier->vecSlices;
  3.1201 
  3.1202 +   rightColSlices = slicingStrucCarrier->rightColSlices;
  3.1203 
  3.1204 +   VCilk__free( slicingStrucCarrier, animPr );
  3.1205 
  3.1206 +   
  3.1207 
  3.1208 +   //================  Make sub-matrices, given the slicing  ================
  3.1209 
  3.1210 +   SubMatrix **leftSubMatrices, **rightSubMatrices;
  3.1211 
  3.1212 +   leftSubMatrices =
  3.1213 
  3.1214 +      createSubMatrices( leftRowSlices, vecSlices,
  3.1215 
  3.1216 +                         leftMatrix, animPr );
  3.1217 
  3.1218 +   rightSubMatrices =
  3.1219 
  3.1220 +      createSubMatrices( vecSlices, rightColSlices,
  3.1221 
  3.1222 +                         rightMatrix, animPr );
  3.1223 
  3.1224 +
  3.1225 
  3.1226 +   freeSlicingStruc( leftRowSlices, animPr );
  3.1227 
  3.1228 +   freeSlicingStruc( vecSlices, animPr );
  3.1229 
  3.1230 +   freeSlicingStruc( rightColSlices, animPr );
  3.1231 
  3.1232 +
  3.1233 
  3.1234 +   //==============  pair the sub-matrices and make processors ==============
  3.1235 
  3.1236 +   int32 numRowIdxs, numColIdxs, numVecIdxs;
  3.1237 
  3.1238 +
  3.1239 
  3.1240 +   numRowIdxs = leftRowSlices->numVals;
  3.1241 
  3.1242 +   numColIdxs = rightColSlices->numVals;
  3.1243 
  3.1244 +   numVecIdxs = vecSlices->numVals;
  3.1245 
  3.1246 +   pairUpSubMatricesAndSpawnAndSync( leftSubMatrices, rightSubMatrices,
  3.1247 
  3.1248 +                                     numRowIdxs, numColIdxs, numVecIdxs,
  3.1249 
  3.1250 +                                     resultArray,
  3.1251 
  3.1252 +                                     animPr );
  3.1253 
  3.1254 +      //It syncs inside, so know all work is done now: free the sub-matrices
  3.1255 
  3.1256 +   freeSubMatrices( leftRowSlices, vecSlices,  leftSubMatrices, animPr );
  3.1257 
  3.1258 +   freeSubMatrices( vecSlices, rightColSlices, rightSubMatrices, animPr );
  3.1259 
  3.1260 + }
  3.1261 
  3.1262 +
  3.1263 
  3.1264 +
  3.1265 
  3.1266 +
  3.1267 
  3.1268 +
  3.1269 
  3.1270 +/* numRows*colsPerRow/numCores = numToPutOntoEachCore; 
  3.1271 
  3.1272 + * put all from a given row onto same core, until exhaust allotment for that
  3.1273 
  3.1274 + *  core
  3.1275 
  3.1276 + *
  3.1277 
  3.1278 + */
  3.1279 
  3.1280 +void inline
  3.1281 
  3.1282 +pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices,
  3.1283 
  3.1284 +                                    SubMatrix **rightSubMatrices,
  3.1285 
  3.1286 +                                    int32 numRowIdxs, int32 numColIdxs,
  3.1287 
  3.1288 +                                    int32 numVecIdxs,
  3.1289 
  3.1290 +                                    float32 *resultArray,
  3.1291 
  3.1292 +                                    VirtProcr *animatingPr )
  3.1293 
  3.1294 + {
  3.1295 
  3.1296 +   int32 resRowIdx, resColIdx;
  3.1297 
  3.1298 +   int32 numLeftColIdxs, numRightColIdxs;
  3.1299 
  3.1300 +   int32 leftRowIdxOffset;
  3.1301 
  3.1302 +   VecParams *vecParams;
  3.1303 
  3.1304 +   float32 numToPutOntoEachCore, leftOverFraction;
  3.1305 
  3.1306 +   int32 numCores, currCore, numOnCurrCore;
  3.1307 
  3.1308 +
  3.1309 
  3.1310 +   numLeftColIdxs  = numColIdxs;
  3.1311 
  3.1312 +   numRightColIdxs = numVecIdxs;
  3.1313 
  3.1314 +
  3.1315 
  3.1316 +   numCores = VCilk__give_number_of_cores_to_spawn_onto();   
  3.1317 
  3.1318 +
  3.1319 
  3.1320 +   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
  3.1321 
  3.1322 +   leftOverFraction = 0;
  3.1323 
  3.1324 +   numOnCurrCore = 0;
  3.1325 
  3.1326 +   currCore = 0;
  3.1327 
  3.1328 +
  3.1329 
  3.1330 +   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
  3.1331 
  3.1332 +    {
  3.1333 
  3.1334 +      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
  3.1335 
  3.1336 +
  3.1337 
  3.1338 +      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
  3.1339 
  3.1340 +       {
  3.1341 
  3.1342 +         vecParams = VCilk__malloc( sizeof(VecParams), animatingPr );
  3.1343 
  3.1344 +         
  3.1345 
  3.1346 +         vecParams->numVecIdxs       = numVecIdxs;
  3.1347 
  3.1348 +         vecParams->numRightColIdxs  = numRightColIdxs;
  3.1349 
  3.1350 +         vecParams->leftRowIdxOffset = leftRowIdxOffset;
  3.1351 
  3.1352 +         vecParams->resColIdx        = resColIdx;
  3.1353 
  3.1354 +         vecParams->leftSubMatrices  = leftSubMatrices;
  3.1355 
  3.1356 +         vecParams->rightSubMatrices = rightSubMatrices;
  3.1357 
  3.1358 +         vecParams->resultArray      = resultArray;
  3.1359 
  3.1360 +         vecParams->coreToRunOn      = currCore;
  3.1361 
  3.1362 +
  3.1363 
  3.1364 +         VCilk__spawn( currCore, &calcVectorOfSubMatrices, vecParams,
  3.1365 
  3.1366 +                       animatingPr );
  3.1367 
  3.1368 +
  3.1369 
  3.1370 +         numOnCurrCore += 1;
  3.1371 
  3.1372 +         if( numOnCurrCore + leftOverFraction >= numToPutOntoEachCore - 1 )
  3.1373 
  3.1374 +          {
  3.1375 
  3.1376 +               //deal with fractional part, to ensure that imbalance is 1 max
  3.1377 
  3.1378 +               // IE, core with most has only 1 more than core with least
  3.1379 
  3.1380 +            leftOverFraction += numToPutOntoEachCore - numOnCurrCore;
  3.1381 
  3.1382 +            if( leftOverFraction >= 1 )
  3.1383 
  3.1384 +             { leftOverFraction -= 1;
  3.1385 
  3.1386 +               numOnCurrCore = -1;
  3.1387 
  3.1388 +             }
  3.1389 
  3.1390 +            else
  3.1391 
  3.1392 +             { numOnCurrCore = 0;
  3.1393 
  3.1394 +             }
  3.1395 
  3.1396 +               //Move to next core, max core-value to incr to is numCores -1
  3.1397 
  3.1398 +            if( currCore >= numCores -1 )
  3.1399 
  3.1400 +             { currCore = 0;
  3.1401 
  3.1402 +             }
  3.1403 
  3.1404 +            else
  3.1405 
  3.1406 +             { currCore += 1;
  3.1407 
  3.1408 +             }
  3.1409 
  3.1410 +          }
  3.1411 
  3.1412 +       }
  3.1413 
  3.1414 +    }
  3.1415 
  3.1416 +   
  3.1417 
  3.1418 +   //Free Note: vector of sub-matrices does its own free-ing, even vec-params
  3.1419 
  3.1420 +
  3.1421 
  3.1422 +//TODO: timeToSpawnProbe = VMS__get_probe_by_name( "timeToSpawnProbe" );
  3.1423 
  3.1424 +//      VMS__end_interval_on_probe( timeToSpawnProbe );
  3.1425 
  3.1426 +
  3.1427 
  3.1428 +   VCilk__sync( animatingPr );
  3.1429 
  3.1430 +
  3.1431 
  3.1432 +   //free the sub-matrices in Fn that called this one
  3.1433 
  3.1434 + }
  3.1435 
  3.1436 +
  3.1437 
  3.1438 +
  3.1439 
  3.1440 +/*Walk through the two slice-strucs, making sub-matrix strucs as go
  3.1441 
  3.1442 + */
  3.1443 
  3.1444 +SubMatrix **
  3.1445 
  3.1446 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
  3.1447 
  3.1448 +                   Matrix *origMatrix, VirtProcr *animPr )
  3.1449 
  3.1450 + {
  3.1451 
  3.1452 +   int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
  3.1453 
  3.1454 +   int32 startRow, endRow, startCol, endCol;
  3.1455 
  3.1456 +   int32 *rowStartVals, *colStartVals;
  3.1457 
  3.1458 +   int32 rowOffset;
  3.1459 
  3.1460 +   SubMatrix **subMatrices, *newSubMatrix;
  3.1461 
  3.1462 +
  3.1463 
  3.1464 +   numRowIdxs = rowSlices->numVals;
  3.1465 
  3.1466 +   numColIdxs = colSlices->numVals;
  3.1467 
  3.1468 +
  3.1469 
  3.1470 +   rowStartVals = rowSlices->startVals;
  3.1471 
  3.1472 +   colStartVals = colSlices->startVals;
  3.1473 
  3.1474 +
  3.1475 
  3.1476 +   subMatrices = VCilk__malloc( numRowIdxs * numColIdxs *sizeof(SubMatrix *),
  3.1477 
  3.1478 +                                animPr );
  3.1479 
  3.1480 +
  3.1481 
  3.1482 +   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
  3.1483 
  3.1484 +    {
  3.1485 
  3.1486 +      rowOffset = rowIdx * numColIdxs;
  3.1487 
  3.1488 +      
  3.1489 
  3.1490 +      startRow  = rowStartVals[rowIdx];
  3.1491 
  3.1492 +      endRow    = rowStartVals[rowIdx + 1] -1; //"fake" start above last is
  3.1493 
  3.1494 +                                               // at last valid idx + 1 & is
  3.1495 
  3.1496 +                                               // 1 greater than end value
  3.1497 
  3.1498 +      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
  3.1499 
  3.1500 +       {
  3.1501 
  3.1502 +         startCol = colStartVals[colIdx];
  3.1503 
  3.1504 +         endCol   = colStartVals[colIdx + 1] -1;
  3.1505 
  3.1506 +
  3.1507 
  3.1508 +         newSubMatrix = VCilk__malloc( sizeof(SubMatrix), animPr );
  3.1509 
  3.1510 +         newSubMatrix->numRows       = endRow - startRow +1;
  3.1511 
  3.1512 +         newSubMatrix->numCols       = endCol - startCol +1;
  3.1513 
  3.1514 +         newSubMatrix->origMatrix    = origMatrix;
  3.1515 
  3.1516 +         newSubMatrix->origStartRow  = startRow;
  3.1517 
  3.1518 +         newSubMatrix->origStartCol  = startCol;
  3.1519 
  3.1520 +         newSubMatrix->alreadyCopied = FALSE;
  3.1521 
  3.1522 +
  3.1523 
  3.1524 +         subMatrices[ rowOffset + colIdx ] = newSubMatrix;
  3.1525 
  3.1526 +       }
  3.1527 
  3.1528 +    }
  3.1529 
  3.1530 +   return subMatrices;
  3.1531 
  3.1532 + }
  3.1533 
  3.1534 +
  3.1535 
  3.1536 +void
  3.1537 
  3.1538 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
  3.1539 
  3.1540 +                 SubMatrix **subMatrices, VirtProcr *animPr )
  3.1541 
  3.1542 + {
  3.1543 
  3.1544 +   int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset;
  3.1545 
  3.1546 +   SubMatrix *subMatrix;
  3.1547 
  3.1548 +
  3.1549 
  3.1550 +   numRowIdxs = rowSlices->numVals;
  3.1551 
  3.1552 +   numColIdxs = colSlices->numVals;
  3.1553 
  3.1554 +
  3.1555 
  3.1556 +   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
  3.1557 
  3.1558 +    {
  3.1559 
  3.1560 +      rowOffset = rowIdx * numColIdxs;
  3.1561 
  3.1562 +      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
  3.1563 
  3.1564 +       {
  3.1565 
  3.1566 +         subMatrix = subMatrices[ rowOffset + colIdx ];
  3.1567 
  3.1568 +         if( subMatrix->alreadyCopied )
  3.1569 
  3.1570 +            VCilk__free( subMatrix->array, animPr );
  3.1571 
  3.1572 +         VCilk__free( subMatrix, animPr );
  3.1573 
  3.1574 +       }
  3.1575 
  3.1576 +    }
  3.1577 
  3.1578 +   VCilk__free( subMatrices, animPr );
  3.1579 
  3.1580 + }
  3.1581 
  3.1582 +
  3.1583 
  3.1584 +
  3.1585 
  3.1586 +
  3.1587 
  3.1588 +SlicingStruc *
  3.1589 
  3.1590 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
  3.1591 
  3.1592 +                  VirtProcr *animPr )
  3.1593 
  3.1594 + { float32 residualAcc = 0;
  3.1595 
  3.1596 +   int     numSlices, i, *startVals, sizeOfSlice, endCondition;
  3.1597 
  3.1598 +   SlicingStruc *slicingStruc = VCilk__malloc( sizeof(SlicingStruc), animPr);
  3.1599 
  3.1600 +
  3.1601 
  3.1602 +      //calc size of matrix need to hold start vals --
  3.1603 
  3.1604 +   numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
  3.1605 
  3.1606 +
  3.1607 
  3.1608 +   startVals = VCilk__malloc( (numSlices + 1) * sizeof(int32), animPr );
  3.1609 
  3.1610 +
  3.1611 
  3.1612 +      //Calc the upper limit of start value -- when get above this, end loop
  3.1613 
  3.1614 +      // by saving highest value of the matrix dimension to access, plus 1
  3.1615 
  3.1616 +      // as the start point of the imaginary slice following the last one
  3.1617 
  3.1618 +      //Plus 1 because go up to value but not include when process last slice
  3.1619 
  3.1620 +      //The stopping condition is half-a-size less than highest value because
  3.1621 
  3.1622 +      // don't want any pieces smaller than half the ideal size -- just tack
  3.1623 
  3.1624 +      // little ones onto end of last one
  3.1625 
  3.1626 +   endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size
  3.1627 
  3.1628 +   for( i = 0; startVal <= endVal; i++ )
  3.1629 
  3.1630 +    {
  3.1631 
  3.1632 +      startVals[i] = startVal;
  3.1633 
  3.1634 +      residualAcc += idealSizeOfSide;
  3.1635 
  3.1636 +      sizeOfSlice  = (int)residualAcc;
  3.1637 
  3.1638 +      residualAcc -= (float32)sizeOfSlice;
  3.1639 
  3.1640 +      startVal    += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..
  3.1641 
  3.1642 +
  3.1643 
  3.1644 +      if( startVal > endCondition )
  3.1645 
  3.1646 +       { startVal = endVal + 1;
  3.1647 
  3.1648 +         startVals[ i + 1 ] = startVal;
  3.1649 
  3.1650 +       }
  3.1651 
  3.1652 +    }
  3.1653 
  3.1654 +
  3.1655 
  3.1656 +   slicingStruc->startVals = startVals;
  3.1657 
  3.1658 +   slicingStruc->numVals   = i;  //loop incr'd, so == last valid start idx+1
  3.1659 
  3.1660 +                                 // which means is num sub-matrices in dim
  3.1661 
  3.1662 +                                 // also == idx of the fake start just above
  3.1663 
  3.1664 +   return slicingStruc;
  3.1665 
  3.1666 + }
  3.1667 
  3.1668 +
  3.1669 
  3.1670 +void
  3.1671 
  3.1672 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr )
  3.1673 
  3.1674 + {
  3.1675 
  3.1676 +   VCilk__free( slicingStruc->startVals, animPr );
  3.1677 
  3.1678 +   VCilk__free( slicingStruc, animPr );
  3.1679 
  3.1680 + }
  3.1681 
  3.1682 +
  3.1683 
  3.1684 +
  3.1685 
  3.1686 +int inline
  3.1687 
  3.1688 +measureMatrixMultPrimitive( VirtProcr *animPr )
  3.1689 
  3.1690 + {
  3.1691 
  3.1692 +   int r, c, v, numCycles;
  3.1693 
  3.1694 +   float32 *res, *left, *right;
  3.1695 
  3.1696 +
  3.1697 
  3.1698 +      //setup inputs
  3.1699 
  3.1700 +   left  = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr );
  3.1701 
  3.1702 +   right = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr );
  3.1703 
  3.1704 +   res   = VCilk__malloc( 5 * 5 * sizeof( float32 ), animPr );
  3.1705 
  3.1706 +
  3.1707 
  3.1708 +   for( r = 0; r < 5; r++ )
  3.1709 
  3.1710 +    {
  3.1711 
  3.1712 +      for( c = 0; c < 5; c++ )
  3.1713 
  3.1714 +       {
  3.1715 
  3.1716 +         left[  r * 5 + c ] = r;
  3.1717 
  3.1718 +         right[ r * 5 + c ] = c;
  3.1719 
  3.1720 +       }
  3.1721 
  3.1722 +    }
  3.1723 
  3.1724 +
  3.1725 
  3.1726 +      //do primitive
  3.1727 
  3.1728 +   VCilk__start_primitive();  //for now, just takes time stamp
  3.1729 
  3.1730 +   for( r = 0; r < 5; r++ )
  3.1731 
  3.1732 +    {
  3.1733 
  3.1734 +      for( c = 0; c < 5; c++ )
  3.1735 
  3.1736 +       {
  3.1737 
  3.1738 +         for( v = 0; v < 5; v++ )
  3.1739 
  3.1740 +          {
  3.1741 
  3.1742 +            res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ];
  3.1743 
  3.1744 +          }
  3.1745 
  3.1746 +       }
  3.1747 
  3.1748 +    }
  3.1749 
  3.1750 +   numCycles =
  3.1751 
  3.1752 +      VCilk__end_primitive_and_give_cycles(); 
  3.1753 
  3.1754 +
  3.1755 
  3.1756 +   VCilk__free( left, animPr );
  3.1757 
  3.1758 +   VCilk__free( right, animPr );
  3.1759 
  3.1760 +   VCilk__free( res, animPr );
  3.1761 
  3.1762 +   
  3.1763 
  3.1764 +   return numCycles;
  3.1765 
  3.1766 + }
  3.1767 
     4.1 --- a/src/Application/VCilk__Matrix_Mult/EntryPoint.c	Mon Nov 15 12:08:19 2010 -0800
     4.2 +++ b/src/Application/VCilk__Matrix_Mult/EntryPoint.c	Wed May 11 15:40:54 2011 +0200
     4.3 @@ -1,58 +1,58 @@
     4.4 -/*
     4.5 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
     4.6 - *  Licensed under GNU General Public License version 2
     4.7 - *
     4.8 - * Author: seanhalle@yahoo.com
     4.9 - *
    4.10 - */
    4.11 -
    4.12 -#include <math.h>
    4.13 -
    4.14 -#include "VCilk__Matrix_Mult.h"
    4.15 -
    4.16 -
    4.17 -
    4.18 -/*Every VCilk system has an "entry point" function that creates the first
    4.19 - * processor, which starts the chain of creating more processors..
    4.20 - * eventually all of the processors will dissipate themselves, and
    4.21 - * return.
    4.22 - *
    4.23 - *This entry-point function follows the same pattern as all entry-point
    4.24 - * functions do:
    4.25 - *1) it creates the params for the seed processor, from the
    4.26 - *    parameters passed into the entry-point function
    4.27 - *2) it calls VCilk__create_seed_procr_and_do_work
    4.28 - *3) it gets the return value from the params struc, frees the params struc,
    4.29 - *    and returns the value from the function
    4.30 - *
    4.31 - */
    4.32 -Matrix *
    4.33 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
    4.34 - { Matrix          *resMatrix;
    4.35 -   DividerParams   *dividerParams;
    4.36 -   int32            numResRows, numResCols;
    4.37 -
    4.38 -
    4.39 -   dividerParams              = malloc( sizeof( DividerParams ) );
    4.40 -   dividerParams->leftMatrix  = leftMatrix;
    4.41 -   dividerParams->rightMatrix = rightMatrix;
    4.42 - 
    4.43 -   numResRows  = leftMatrix->numRows;
    4.44 -   numResCols  = rightMatrix->numCols;
    4.45 -   resMatrix            = malloc( sizeof(Matrix) );
    4.46 -   resMatrix->array     = malloc( numResRows * numResCols * sizeof(float32));
    4.47 -   resMatrix->numCols   = rightMatrix->numCols;
    4.48 -   resMatrix->numRows   = leftMatrix->numRows;
    4.49 -
    4.50 -
    4.51 -   dividerParams->resultMatrix   = resMatrix;
    4.52 -
    4.53 -      //create divider processor, start doing the work, and wait till done
    4.54 -      //This function is the "border crossing" between normal code and VCilk
    4.55 -   VCilk__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs,
    4.56 -                                         dividerParams );
    4.57 -   
    4.58 -      //get result matrix and return it
    4.59 -   free( dividerParams );
    4.60 -   return resMatrix;
    4.61 - }
    4.62 +/*
    4.63 
    4.64 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
    4.65 
    4.66 + *  Licensed under GNU General Public License version 2
    4.67 
    4.68 + *
    4.69 
    4.70 + * Author: seanhalle@yahoo.com
    4.71 
    4.72 + *
    4.73 
    4.74 + */
    4.75 
    4.76 +
    4.77 
    4.78 +#include <math.h>
    4.79 
    4.80 +
    4.81 
    4.82 +#include "VCilk__Matrix_Mult.h"
    4.83 
    4.84 +
    4.85 
    4.86 +
    4.87 
    4.88 +
    4.89 
    4.90 +/*Every VCilk system has an "entry point" function that creates the first
    4.91 
    4.92 + * processor, which starts the chain of creating more processors..
    4.93 
    4.94 + * eventually all of the processors will dissipate themselves, and
    4.95 
    4.96 + * return.
    4.97 
    4.98 + *
    4.99 
   4.100 + *This entry-point function follows the same pattern as all entry-point
   4.101 
   4.102 + * functions do:
   4.103 
   4.104 + *1) it creates the params for the seed processor, from the
   4.105 
   4.106 + *    parameters passed into the entry-point function
   4.107 
   4.108 + *2) it calls VCilk__create_seed_procr_and_do_work
   4.109 
   4.110 + *3) it gets the return value from the params struc, frees the params struc,
   4.111 
   4.112 + *    and returns the value from the function
   4.113 
   4.114 + *
   4.115 
   4.116 + */
   4.117 
   4.118 +Matrix *
   4.119 
   4.120 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
   4.121 
   4.122 + { Matrix          *resMatrix;
   4.123 
   4.124 +   DividerParams   *dividerParams;
   4.125 
   4.126 +   int32            numResRows, numResCols;
   4.127 
   4.128 +
   4.129 
   4.130 +
   4.131 
   4.132 +   dividerParams              = malloc( sizeof( DividerParams ) );
   4.133 
   4.134 +   dividerParams->leftMatrix  = leftMatrix;
   4.135 
   4.136 +   dividerParams->rightMatrix = rightMatrix;
   4.137 
   4.138 + 
   4.139 
   4.140 +   numResRows  = leftMatrix->numRows;
   4.141 
   4.142 +   numResCols  = rightMatrix->numCols;
   4.143 
   4.144 +   resMatrix            = malloc( sizeof(Matrix) );
   4.145 
   4.146 +   resMatrix->array     = malloc( numResRows * numResCols * sizeof(float32));
   4.147 
   4.148 +   resMatrix->numCols   = rightMatrix->numCols;
   4.149 
   4.150 +   resMatrix->numRows   = leftMatrix->numRows;
   4.151 
   4.152 +
   4.153 
   4.154 +
   4.155 
   4.156 +   dividerParams->resultMatrix   = resMatrix;
   4.157 
   4.158 +
   4.159 
   4.160 +      //create divider processor, start doing the work, and wait till done
   4.161 
   4.162 +      //This function is the "border crossing" between normal code and VCilk
   4.163 
   4.164 +   VCilk__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs,
   4.165 
   4.166 +                                         dividerParams );
   4.167 
   4.168 +   
   4.169 
   4.170 +      //get result matrix and return it
   4.171 
   4.172 +   free( dividerParams );
   4.173 
   4.174 +   return resMatrix;
   4.175 
   4.176 + }
   4.177 
     5.1 --- a/src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h	Mon Nov 15 12:08:19 2010 -0800
     5.2 +++ b/src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h	Wed May 11 15:40:54 2011 +0200
     5.3 @@ -1,104 +1,106 @@
     5.4 -/*
     5.5 - *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     5.6 - *  Licensed under GNU General Public License version 2
     5.7 - */
     5.8 -
     5.9 -#ifndef _VCilk_MATRIX_MULT_H_
    5.10 -#define _VCilk_MATRIX_MULT_H_
    5.11 -
    5.12 -#include <stdio.h>
    5.13 -
    5.14 -#include "../../VCilk_lib/VCilk.h"
    5.15 -#include "../Matrix_Mult.h"
    5.16 -#include "../../VCilk_lib/VMS/VMS.h"
    5.17 -
    5.18 -
    5.19 -//===============================  Defines  ==============================
    5.20 -#define ROWS_IN_BLOCK 32
    5.21 -#define COLS_IN_BLOCK 32
    5.22 -#define VEC_IN_BLOCK  32
    5.23 -
    5.24 -#define copyMatrixSingleton 1
    5.25 -#define copyTransposeSingleton 2
    5.26 -
    5.27 -//==============================  Structures  ==============================
    5.28 -typedef struct
    5.29 - {
    5.30 -   Matrix *leftMatrix;
    5.31 -   Matrix *rightMatrix;
    5.32 -   Matrix *resultMatrix;
    5.33 -   
    5.34 -   TSCount numTSCsToExe;
    5.35 - }
    5.36 -DividerParams;
    5.37 -
    5.38 -typedef struct
    5.39 - {
    5.40 -   VirtProcr *dividerPr;
    5.41 -   int numRows;
    5.42 -   int numCols;
    5.43 -   int numSubMatrixPairs;
    5.44 - }
    5.45 -ResultsParams;
    5.46 -
    5.47 -typedef
    5.48 -struct
    5.49 - { int32    numRows;
    5.50 -   int32    numCols;
    5.51 -   Matrix  *origMatrix;
    5.52 -   int32    origStartRow;
    5.53 -   int32    origStartCol;
    5.54 -   int32    alreadyCopied;
    5.55 -   float32 *array;  //2D, but dynamically sized, so use addr arith
    5.56 - }
    5.57 -SubMatrix;
    5.58 -
    5.59 -typedef struct
    5.60 - { VirtProcr *resultPr;
    5.61 -   SubMatrix *leftSubMatrix;
    5.62 -   SubMatrix *rightSubMatrix;
    5.63 -   float32   *partialResultArray;
    5.64 - }
    5.65 -SMPairParams;
    5.66 -
    5.67 -typedef
    5.68 -struct
    5.69 - { int32    numVals;
    5.70 -   int32   *startVals;
    5.71 - }
    5.72 -SlicingStruc;
    5.73 -
    5.74 -typedef
    5.75 -struct
    5.76 - {
    5.77 -   SlicingStruc *leftRowSlices;
    5.78 -   SlicingStruc *vecSlices;
    5.79 -   SlicingStruc *rightColSlices;
    5.80 - }
    5.81 -SlicingStrucCarrier;
    5.82 -
    5.83 -typedef struct
    5.84 - {
    5.85 -   int32 numVecIdxs;
    5.86 -   int32 numRightColIdxs;
    5.87 -   int32 leftRowIdxOffset;
    5.88 -   int32 resColIdx;
    5.89 -   SubMatrix **leftSubMatrices;
    5.90 -   SubMatrix **rightSubMatrices;
    5.91 -   float32 *resultArray;
    5.92 -   int32 coreToRunOn;
    5.93 - }
    5.94 -VecParams;
    5.95 -
    5.96 -//============================= Processor Functions =========================
    5.97 -void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr );
    5.98 -void calcSubMatrixProduct(              void *data, VirtProcr *animatingPr );
    5.99 -void calcVectorOfSubMatrices(     void *_vecParams, VirtProcr *animatingPr );
   5.100 -
   5.101 -
   5.102 -//================================ Entry Point ==============================
   5.103 -Matrix *
   5.104 -multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
   5.105 -
   5.106 -
   5.107 -#endif /*_VCilk_MATRIX_MULT_H_*/
   5.108 +/*
   5.109 
   5.110 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
   5.111 
   5.112 + *  Licensed under GNU General Public License version 2
   5.113 
   5.114 + */
   5.115 
   5.116 +
   5.117 
   5.118 +#ifndef _VCilk_MATRIX_MULT_H_
   5.119 
   5.120 +#define _VCilk_MATRIX_MULT_H_
   5.121 
   5.122 +
   5.123 
   5.124 +#include <stdio.h>
   5.125 
   5.126 +
   5.127 
   5.128 +#include "../../VCilk_lib/VCilk.h"
   5.129 
   5.130 +#include "../Matrix_Mult.h"
   5.131 
   5.132 +#include "../../VCilk_lib/VMS/VMS.h"
   5.133 
   5.134 +
   5.135 
   5.136 +
   5.137 
   5.138 +//===============================  Defines  ==============================
   5.139 
   5.140 +#define ROWS_IN_BLOCK 32
   5.141 
   5.142 +#define COLS_IN_BLOCK 32
   5.143 
   5.144 +#define VEC_IN_BLOCK  32
   5.145 
   5.146 +
   5.147 
   5.148 +#define copyMatrixSingleton 1
   5.149 
   5.150 +#define copyTransposeSingleton 2
   5.151 
   5.152 +
   5.153 
   5.154 +//==============================  Structures  ==============================
   5.155 
   5.156 +typedef struct
   5.157 
   5.158 + {
   5.159 
   5.160 +   Matrix *leftMatrix;
   5.161 
   5.162 +   Matrix *rightMatrix;
   5.163 
   5.164 +   Matrix *resultMatrix;
   5.165 
   5.166 +   
   5.167 
   5.168 +   TSCount numTSCsToExe;
   5.169 
   5.170 + }
   5.171 
   5.172 +DividerParams;
   5.173 
   5.174 +
   5.175 
   5.176 +typedef struct
   5.177 
   5.178 + {
   5.179 
   5.180 +   VirtProcr *dividerPr;
   5.181 
   5.182 +   int numRows;
   5.183 
   5.184 +   int numCols;
   5.185 
   5.186 +   int numSubMatrixPairs;
   5.187 
   5.188 + }
   5.189 
   5.190 +ResultsParams;
   5.191 
   5.192 +
   5.193 
   5.194 +typedef
   5.195 
   5.196 +struct
   5.197 
   5.198 + { int32    numRows;
   5.199 
   5.200 +   int32    numCols;
   5.201 
   5.202 +   Matrix  *origMatrix;
   5.203 
   5.204 +   int32    origStartRow;
   5.205 
   5.206 +   int32    origStartCol;
   5.207 
   5.208 +   int32    alreadyCopied;
   5.209 
   5.210 +   VCilkSingleton *copySingleton;
   5.211 
   5.212 +   VCilkSingleton *copyTransSingleton;
   5.213 
   5.214 +   float32 *array;  //2D, but dynamically sized, so use addr arith
   5.215 
   5.216 + }
   5.217 
   5.218 +SubMatrix;
   5.219 
   5.220 +
   5.221 
   5.222 +typedef struct
   5.223 
   5.224 + { VirtProcr *resultPr;
   5.225 
   5.226 +   SubMatrix *leftSubMatrix;
   5.227 
   5.228 +   SubMatrix *rightSubMatrix;
   5.229 
   5.230 +   float32   *partialResultArray;
   5.231 
   5.232 + }
   5.233 
   5.234 +SMPairParams;
   5.235 
   5.236 +
   5.237 
   5.238 +typedef
   5.239 
   5.240 +struct
   5.241 
   5.242 + { int32    numVals;
   5.243 
   5.244 +   int32   *startVals;
   5.245 
   5.246 + }
   5.247 
   5.248 +SlicingStruc;
   5.249 
   5.250 +
   5.251 
   5.252 +typedef
   5.253 
   5.254 +struct
   5.255 
   5.256 + {
   5.257 
   5.258 +   SlicingStruc *leftRowSlices;
   5.259 
   5.260 +   SlicingStruc *vecSlices;
   5.261 
   5.262 +   SlicingStruc *rightColSlices;
   5.263 
   5.264 + }
   5.265 
   5.266 +SlicingStrucCarrier;
   5.267 
   5.268 +
   5.269 
   5.270 +typedef struct
   5.271 
   5.272 + {
   5.273 
   5.274 +   int32 numVecIdxs;
   5.275 
   5.276 +   int32 numRightColIdxs;
   5.277 
   5.278 +   int32 leftRowIdxOffset;
   5.279 
   5.280 +   int32 resColIdx;
   5.281 
   5.282 +   SubMatrix **leftSubMatrices;
   5.283 
   5.284 +   SubMatrix **rightSubMatrices;
   5.285 
   5.286 +   float32 *resultArray;
   5.287 
   5.288 +   int32 coreToRunOn;
   5.289 
   5.290 + }
   5.291 
   5.292 +VecParams;
   5.293 
   5.294 +
   5.295 
   5.296 +//============================= Processor Functions =========================
   5.297 
   5.298 +void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr );
   5.299 
   5.300 +void calcSubMatrixProduct(              void *data, VirtProcr *animatingPr );
   5.301 
   5.302 +void calcVectorOfSubMatrices(     void *_vecParams, VirtProcr *animatingPr );
   5.303 
   5.304 +
   5.305 
   5.306 +
   5.307 
   5.308 +//================================ Entry Point ==============================
   5.309 
   5.310 +Matrix *
   5.311 
   5.312 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
   5.313 
   5.314 +
   5.315 
   5.316 +
   5.317 
   5.318 +#endif /*_VCilk_MATRIX_MULT_H_*/
   5.319 
     6.1 --- a/src/Application/VCilk__Matrix_Mult/Vector_Pr.c	Mon Nov 15 12:08:19 2010 -0800
     6.2 +++ b/src/Application/VCilk__Matrix_Mult/Vector_Pr.c	Wed May 11 15:40:54 2011 +0200
     6.3 @@ -1,130 +1,130 @@
     6.4 -/*
     6.5 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
     6.6 - *  Licensed under GNU General Public License version 2
     6.7 - *
     6.8 - * Author: seanhalle@yahoo.com
     6.9 - *
    6.10 - */
    6.11 -
    6.12 -
    6.13 -#include "VCilk__Matrix_Mult.h"
    6.14 -#include <math.h>
    6.15 -#include <stdlib.h>
    6.16 -
    6.17 -
    6.18 -void inline
    6.19 -accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
    6.20 -                  int32    startRow,
    6.21 -                  int32    numRows,
    6.22 -                  int32    startCol,
    6.23 -                  int32    numCols,
    6.24 -                  int32    numOrigCols );
    6.25 -
    6.26 -
    6.27 -//===========================================================================
    6.28 -
    6.29 -void
    6.30 -calcVectorOfSubMatrices( void *_vecParams, VirtProcr *animPr )
    6.31 - { int32         numVecIdxs, leftRowIdxOffset, numRightColIdxs, resColIdx;
    6.32 -   SubMatrix   **leftSubMatrices, **rightSubMatrices;
    6.33 -   float32      *resultArray;
    6.34 -   int32         vecIdx, coreWithAffinity;
    6.35 -   SMPairParams *subMatrixPairParams, **vecOfSubMatrixParams;
    6.36 -   VecParams    *vecParams;
    6.37 -
    6.38 -   vecParams = (VecParams *)_vecParams;
    6.39 -
    6.40 -         DEBUG1( dbgAppFlow, "start vector %d\n", animPr->procrID)
    6.41 -         #ifdef TURN_ON_DEBUG_PROBES
    6.42 -         int32 subMatrixVectorProbe =
    6.43 -                   VMS__create_single_interval_probe( "subMtxVect", animPr );
    6.44 -         VMS__record_sched_choice_into_probe( subMatrixVectorProbe, animPr );
    6.45 -         VMS__record_interval_start_in_probe( subMatrixVectorProbe );
    6.46 -         #endif
    6.47 -
    6.48 -
    6.49 -   numVecIdxs       = vecParams->numVecIdxs;
    6.50 -   numRightColIdxs  = vecParams->numRightColIdxs;
    6.51 -   leftRowIdxOffset = vecParams->leftRowIdxOffset;
    6.52 -   resColIdx        = vecParams->resColIdx;
    6.53 -   leftSubMatrices  = vecParams->leftSubMatrices;
    6.54 -   rightSubMatrices = vecParams->rightSubMatrices;
    6.55 -   resultArray      = vecParams->resultArray;
    6.56 -   coreWithAffinity = vecParams->coreToRunOn;
    6.57 -         
    6.58 -   vecOfSubMatrixParams = VCilk__malloc( numVecIdxs * sizeof(SMPairParams *),
    6.59 -                                         animPr );
    6.60 -   if( vecOfSubMatrixParams == 0 ){printf("malloc error"); exit(1);}
    6.61 -
    6.62 -   for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
    6.63 -    {
    6.64 -         //Make the processor for the pair of sub-matrices
    6.65 -      subMatrixPairParams  = VCilk__malloc( sizeof(SMPairParams), animPr );
    6.66 -      subMatrixPairParams->leftSubMatrix  =
    6.67 -         leftSubMatrices[ leftRowIdxOffset + vecIdx ];
    6.68 -
    6.69 -      subMatrixPairParams->rightSubMatrix =
    6.70 -         rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
    6.71 -
    6.72 -      VCilk__spawn( coreWithAffinity,    &calcSubMatrixProduct,
    6.73 -                    subMatrixPairParams, animPr );
    6.74 -
    6.75 -      vecOfSubMatrixParams[ vecIdx ] = subMatrixPairParams;
    6.76 -    }
    6.77 -
    6.78 -         DEBUG1( dbgAppFlow, "before sync in vector %d\n", animPr->procrID)
    6.79 -   VCilk__sync( animPr );
    6.80 -         DEBUG1( dbgAppFlow, "**after sync in vector %d\n", animPr->procrID)
    6.81 -
    6.82 -      //now accumulate individual result matrices into final result matrix
    6.83 -   for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
    6.84 -    {
    6.85 -      subMatrixPairParams = vecOfSubMatrixParams[ vecIdx ];
    6.86 -
    6.87 -      accumulateResult( resultArray, subMatrixPairParams->partialResultArray,
    6.88 -                        subMatrixPairParams->leftSubMatrix->origStartRow,
    6.89 -                        subMatrixPairParams->leftSubMatrix->numRows,
    6.90 -                        subMatrixPairParams->rightSubMatrix->origStartCol,
    6.91 -                        subMatrixPairParams->rightSubMatrix->numCols,
    6.92 -                   subMatrixPairParams->rightSubMatrix->origMatrix->numCols);
    6.93 -
    6.94 -         //Note, resultArray is made on the core that produces the results
    6.95 -         // that gives chance to set affinity so all in vector run on same
    6.96 -         // core and re-use that array, and prevents writes from causing
    6.97 -         // thrashing of the cache -- as long as array big enough, the copy
    6.98 -         // overhead is miniscule vs the size-of-side reuse of each byte
    6.99 -      VCilk__free( subMatrixPairParams->partialResultArray, animPr );
   6.100 -      VCilk__free( subMatrixPairParams, animPr );
   6.101 -    }
   6.102 -   VCilk__free( vecOfSubMatrixParams, animPr );
   6.103 -   VCilk__free( vecParams, animPr );
   6.104 -
   6.105 -         #ifdef TURN_ON_DEBUG_PROBES
   6.106 -         VMS__record_interval_end_in_probe( subMatrixVectorProbe );
   6.107 -         #endif
   6.108 -         
   6.109 -         DEBUG1( dbgAppFlow, "end vector %d\n", animPr->procrID)
   6.110 -   VCilk__dissipate_procr( animPr );
   6.111 - }
   6.112 -
   6.113 -
   6.114 -
   6.115 -void inline
   6.116 -accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
   6.117 -                  int32    startRow,
   6.118 -                  int32    numRows,
   6.119 -                  int32    startCol,
   6.120 -                  int32    numCols,
   6.121 -                  int32    numOrigCols )
   6.122 - { int32 row, col;
   6.123 -
   6.124 -   for( row = 0; row < numRows; row++ )
   6.125 -    {
   6.126 -      for( col = 0; col < numCols; col++ )
   6.127 -       {
   6.128 -         resultArray[ (row + startRow) * numOrigCols + col + startCol ] +=
   6.129 -            subMatrixResultArray[ row * numCols + col ];
   6.130 -       }
   6.131 -    }
   6.132 -
   6.133 - }
   6.134 +/*
   6.135 
   6.136 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
   6.137 
   6.138 + *  Licensed under GNU General Public License version 2
   6.139 
   6.140 + *
   6.141 
   6.142 + * Author: seanhalle@yahoo.com
   6.143 
   6.144 + *
   6.145 
   6.146 + */
   6.147 
   6.148 +
   6.149 
   6.150 +
   6.151 
   6.152 +#include "VCilk__Matrix_Mult.h"
   6.153 
   6.154 +#include <math.h>
   6.155 
   6.156 +#include <stdlib.h>
   6.157 
   6.158 +
   6.159 
   6.160 +
   6.161 
   6.162 +void inline
   6.163 
   6.164 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
   6.165 
   6.166 +                  int32    startRow,
   6.167 
   6.168 +                  int32    numRows,
   6.169 
   6.170 +                  int32    startCol,
   6.171 
   6.172 +                  int32    numCols,
   6.173 
   6.174 +                  int32    numOrigCols );
   6.175 
   6.176 +
   6.177 
   6.178 +
   6.179 
   6.180 +//===========================================================================
   6.181 
   6.182 +
   6.183 
   6.184 +void
   6.185 
   6.186 +calcVectorOfSubMatrices( void *_vecParams, VirtProcr *animPr )
   6.187 
   6.188 + { int32         numVecIdxs, leftRowIdxOffset, numRightColIdxs, resColIdx;
   6.189 
   6.190 +   SubMatrix   **leftSubMatrices, **rightSubMatrices;
   6.191 
   6.192 +   float32      *resultArray;
   6.193 
   6.194 +   int32         vecIdx, coreWithAffinity;
   6.195 
   6.196 +   SMPairParams *subMatrixPairParams, **vecOfSubMatrixParams;
   6.197 
   6.198 +   VecParams    *vecParams;
   6.199 
   6.200 +
   6.201 
   6.202 +   vecParams = (VecParams *)_vecParams;
   6.203 
   6.204 +
   6.205 
   6.206 +         DEBUG1( dbgAppFlow, "start vector %d\n", animPr->procrID)
   6.207 
   6.208 +         #ifdef TURN_ON_DEBUG_PROBES
   6.209 
   6.210 +         int32 subMatrixVectorProbe =
   6.211 
   6.212 +                   VMS__create_single_interval_probe( "subMtxVect", animPr );
   6.213 
   6.214 +         VMS__record_sched_choice_into_probe( subMatrixVectorProbe, animPr );
   6.215 
   6.216 +         VMS__record_interval_start_in_probe( subMatrixVectorProbe );
   6.217 
   6.218 +         #endif
   6.219 
   6.220 +
   6.221 
   6.222 +
   6.223 
   6.224 +   numVecIdxs       = vecParams->numVecIdxs;
   6.225 
   6.226 +   numRightColIdxs  = vecParams->numRightColIdxs;
   6.227 
   6.228 +   leftRowIdxOffset = vecParams->leftRowIdxOffset;
   6.229 
   6.230 +   resColIdx        = vecParams->resColIdx;
   6.231 
   6.232 +   leftSubMatrices  = vecParams->leftSubMatrices;
   6.233 
   6.234 +   rightSubMatrices = vecParams->rightSubMatrices;
   6.235 
   6.236 +   resultArray      = vecParams->resultArray;
   6.237 
   6.238 +   coreWithAffinity = vecParams->coreToRunOn;
   6.239 
   6.240 +         
   6.241 
   6.242 +   vecOfSubMatrixParams = VCilk__malloc( numVecIdxs * sizeof(SMPairParams *),
   6.243 
   6.244 +                                         animPr );
   6.245 
   6.246 +   if( vecOfSubMatrixParams == 0 ){printf("malloc error"); exit(1);}
   6.247 
   6.248 +
   6.249 
   6.250 +   for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
   6.251 
   6.252 +    {
   6.253 
   6.254 +         //Make the processor for the pair of sub-matrices
   6.255 
   6.256 +      subMatrixPairParams  = VCilk__malloc( sizeof(SMPairParams), animPr );
   6.257 
   6.258 +      subMatrixPairParams->leftSubMatrix  =
   6.259 
   6.260 +         leftSubMatrices[ leftRowIdxOffset + vecIdx ];
   6.261 
   6.262 +
   6.263 
   6.264 +      subMatrixPairParams->rightSubMatrix =
   6.265 
   6.266 +         rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
   6.267 
   6.268 +
   6.269 
   6.270 +      VCilk__spawn( coreWithAffinity,    &calcSubMatrixProduct,
   6.271 
   6.272 +                    subMatrixPairParams, animPr );
   6.273 
   6.274 +
   6.275 
   6.276 +      vecOfSubMatrixParams[ vecIdx ] = subMatrixPairParams;
   6.277 
   6.278 +    }
   6.279 
   6.280 +
   6.281 
   6.282 +         DEBUG1( dbgAppFlow, "before sync in vector %d\n", animPr->procrID)
   6.283 
   6.284 +   VCilk__sync( animPr );
   6.285 
   6.286 +         DEBUG1( dbgAppFlow, "**after sync in vector %d\n", animPr->procrID)
   6.287 
   6.288 +
   6.289 
   6.290 +      //now accumulate individual result matrices into final result matrix
   6.291 
   6.292 +   for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
   6.293 
   6.294 +    {
   6.295 
   6.296 +      subMatrixPairParams = vecOfSubMatrixParams[ vecIdx ];
   6.297 
   6.298 +
   6.299 
   6.300 +      accumulateResult( resultArray, subMatrixPairParams->partialResultArray,
   6.301 
   6.302 +                        subMatrixPairParams->leftSubMatrix->origStartRow,
   6.303 
   6.304 +                        subMatrixPairParams->leftSubMatrix->numRows,
   6.305 
   6.306 +                        subMatrixPairParams->rightSubMatrix->origStartCol,
   6.307 
   6.308 +                        subMatrixPairParams->rightSubMatrix->numCols,
   6.309 
   6.310 +                   subMatrixPairParams->rightSubMatrix->origMatrix->numCols);
   6.311 
   6.312 +
   6.313 
   6.314 +         //Note, resultArray is made on the core that produces the results
   6.315 
   6.316 +         // that gives chance to set affinity so all in vector run on same
   6.317 
   6.318 +         // core and re-use that array, and prevents writes from causing
   6.319 
   6.320 +         // thrashing of the cache -- as long as array big enough, the copy
   6.321 
   6.322 +         // overhead is miniscule vs the size-of-side reuse of each byte
   6.323 
   6.324 +      VCilk__free( subMatrixPairParams->partialResultArray, animPr );
   6.325 
   6.326 +      VCilk__free( subMatrixPairParams, animPr );
   6.327 
   6.328 +    }
   6.329 
   6.330 +   VCilk__free( vecOfSubMatrixParams, animPr );
   6.331 
   6.332 +   VCilk__free( vecParams, animPr );
   6.333 
   6.334 +
   6.335 
   6.336 +         #ifdef TURN_ON_DEBUG_PROBES
   6.337 
   6.338 +         VMS__record_interval_end_in_probe( subMatrixVectorProbe );
   6.339 
   6.340 +         #endif
   6.341 
   6.342 +         
   6.343 
   6.344 +         DEBUG1( dbgAppFlow, "end vector %d\n", animPr->procrID)
   6.345 
   6.346 +   VCilk__dissipate_procr( animPr );
   6.347 
   6.348 + }
   6.349 
   6.350 +
   6.351 
   6.352 +
   6.353 
   6.354 +
   6.355 
   6.356 +void inline
   6.357 
   6.358 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
   6.359 
   6.360 +                  int32    startRow,
   6.361 
   6.362 +                  int32    numRows,
   6.363 
   6.364 +                  int32    startCol,
   6.365 
   6.366 +                  int32    numCols,
   6.367 
   6.368 +                  int32    numOrigCols )
   6.369 
   6.370 + { int32 row, col;
   6.371 
   6.372 +
   6.373 
   6.374 +   for( row = 0; row < numRows; row++ )
   6.375 
   6.376 +    {
   6.377 
   6.378 +      for( col = 0; col < numCols; col++ )
   6.379 
   6.380 +       {
   6.381 
   6.382 +         resultArray[ (row + startRow) * numOrigCols + col + startCol ] +=
   6.383 
   6.384 +            subMatrixResultArray[ row * numCols + col ];
   6.385 
   6.386 +       }
   6.387 
   6.388 +    }
   6.389 
   6.390 +
   6.391 
   6.392 + }
   6.393 
     7.1 --- a/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c	Mon Nov 15 12:08:19 2010 -0800
     7.2 +++ b/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c	Wed May 11 15:40:54 2011 +0200
     7.3 @@ -1,312 +1,308 @@
     7.4 -/* 
     7.5 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
     7.6 - *  Licensed under GNU General Public License version 2
     7.7 - *
     7.8 - * Author: SeanHalle@yahoo.com
     7.9 - *
    7.10 - */
    7.11 -
    7.12 -#include <string.h>
    7.13 -
    7.14 -#include "VCilk__Matrix_Mult.h"
    7.15 -
    7.16 -
    7.17 -void inline
    7.18 -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
    7.19 -
    7.20 -void inline
    7.21 -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
    7.22 -
    7.23 -void inline
    7.24 -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
    7.25 -                     float32 *resArray,
    7.26 -                     int startRow,  int endRow,
    7.27 -                     int startCol,  int endCol,
    7.28 -                     int startVec,  int endVec,
    7.29 -                     int resStride, int inpStride );
    7.30 -
    7.31 -void inline
    7.32 -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols,
    7.33 -                      float32 *leftArray, float32 *rightArray,
    7.34 -                      float32 *resArray );
    7.35 -
    7.36 -
    7.37 -/*A  processor is created with an environment that holds two matrices,
    7.38 - * the row and col that it owns, and the name of a result gathering
    7.39 - * processor.
    7.40 - *It calculates the product of two sub-portions of the input matrices
    7.41 - * by using Intel's mkl library for single-core.
    7.42 - *
    7.43 - *This demonstrates using optimized single-threaded code inside scheduled
    7.44 - * work-units.
    7.45 - *
    7.46 - *When done, it sends the result to the result processor
    7.47 - */
    7.48 -void
    7.49 -calcSubMatrixProduct( void *data, VirtProcr *animPr )
    7.50 - { 
    7.51 -   SMPairParams   *params;
    7.52 -   VirtProcr      *resultPr;
    7.53 -   float32        *leftArray,  *rightArray, *resArray;
    7.54 -   SubMatrix      *leftSubMatrix, *rightSubMatrix;
    7.55 -
    7.56 -
    7.57 -         DEBUG1( dbgAppFlow, "start sub-matrix mult %d\n", animPr->procrID)
    7.58 -         #ifdef TURN_ON_DEBUG_PROBES
    7.59 -         int32 subMatrixProbe = 
    7.60 -            VMS__create_single_interval_probe( "subMtx",      animPr);
    7.61 -         VMS__record_sched_choice_into_probe( subMatrixProbe, animPr );
    7.62 -         VMS__record_interval_start_in_probe( subMatrixProbe );
    7.63 -         #endif
    7.64 -
    7.65 -   params         = (SMPairParams *)data;
    7.66 -//   resultPr       = params->resultPr;
    7.67 -   leftSubMatrix  = params->leftSubMatrix;
    7.68 -   rightSubMatrix = params->rightSubMatrix;
    7.69 -
    7.70 -      //make sure the input sub-matrices have been copied out of orig
    7.71 -   copyFromOrig( leftSubMatrix, animPr );
    7.72 -   copyTransposeFromOrig( rightSubMatrix, animPr );
    7.73 -   
    7.74 -   leftArray      = leftSubMatrix->array;
    7.75 -   rightArray     = rightSubMatrix->array;
    7.76 -
    7.77 -      //make this array here, on the core that computes the results
    7.78 -      // with Cilk's semantics, have to have separate result array for each
    7.79 -      // spawned processor -- unless want to change the spawn and sync
    7.80 -      // pattern, such that spawn one from each vector, then sync, then
    7.81 -      // another, and so forth -- this will cause idle time due to imbalance
    7.82 -      // in matrix sizes
    7.83 -      //This also gives chance to set affinity so all in vector run on same
    7.84 -      // core and re-use the accumulation array,
    7.85 -      //As a side-benefit, it also prevents writes from causing
    7.86 -      // thrashing of the cache -- as long as array big enough, the copy
    7.87 -      // overhead is small because each byte is reused size-of-side times
    7.88 -      //This is freed in the vector processor
    7.89 -   int32
    7.90 -   resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32);
    7.91 -   resArray = VCilk__malloc( resSize, animPr );
    7.92 -   memset( resArray, 0, resSize );
    7.93 -
    7.94 -
    7.95 -   int32 numResRows, numResCols, vectLength;
    7.96 -   
    7.97 -   vectLength = leftSubMatrix->numCols;
    7.98 -   numResRows = leftSubMatrix->numRows;
    7.99 -   numResCols = rightSubMatrix->numCols;
   7.100 -
   7.101 -   multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
   7.102 -                         leftArray, rightArray,
   7.103 -                         resArray );
   7.104 -
   7.105 -   //send result by side-effect
   7.106 -   params->partialResultArray = resArray;
   7.107 -
   7.108 -         #ifdef TURN_ON_DEBUG_PROBES
   7.109 -         VMS__record_interval_end_in_probe( subMatrixProbe );
   7.110 -         #endif
   7.111 -         
   7.112 -         DEBUG1( dbgAppFlow, "end sub-matrix mult %d\n", animPr->procrID)
   7.113 -   VCilk__dissipate_procr( animPr );
   7.114 - }
   7.115 -
   7.116 -
   7.117 -
   7.118 -/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into
   7.119 - * the 32KB L1 cache.
   7.120 - *Would be nice to embed this within another level that divided into
   7.121 - * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
   7.122 - *
   7.123 - *Eventually want these divisions to be automatic, using DKU pattern
   7.124 - * embedded into VMS and exposed in the language, and with VMS controlling the
   7.125 - * divisions according to the cache sizes, which it knows about.
   7.126 - *Also, want VMS to work with language to split among main-mems, so a socket
   7.127 - * only cranks on data in its local segment of main mem
   7.128 - *
   7.129 - *So, outer two loops determine start and end points within the result matrix.
   7.130 - * Inside that, a loop dets the start and end points along the shared dimensions
   7.131 - * of the two input matrices.
   7.132 - */
   7.133 -void inline
   7.134 -multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
   7.135 -                                int32 numResCols,
   7.136 -                                float32 *leftArray, float32 *rightArray,
   7.137 -                                float32 *resArray )
   7.138 - {
   7.139 -   int resStride, inpStride;
   7.140 -   int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
   7.141 -
   7.142 -   resStride  = numResCols;
   7.143 -   inpStride  = vecLength;
   7.144 -
   7.145 -   for( resStartRow = 0; resStartRow < numResRows; )
   7.146 -    {
   7.147 -      resEndRow = resStartRow + ROWS_IN_BLOCK -1;  //start at zero, so -1
   7.148 -      if( resEndRow > numResRows ) resEndRow = numResRows -1;
   7.149 -
   7.150 -      for( resStartCol = 0; resStartCol < numResCols; )
   7.151 -       {
   7.152 -         resEndCol   = resStartCol + COLS_IN_BLOCK -1;
   7.153 -         if( resEndCol > numResCols ) resEndCol = numResCols -1;
   7.154 -
   7.155 -         for( startVec = 0; startVec < vecLength; )
   7.156 -          {
   7.157 -            endVec   = startVec + VEC_IN_BLOCK -1;
   7.158 -            if( endVec > vecLength ) endVec = vecLength -1;
   7.159 -
   7.160 -               //By having the "vector" of sub-blocks in a sub-block slice
   7.161 -               // be marched down in inner loop, are re-using the result
   7.162 -               // matrix, which stays in L1 cache and re-using the left sub-mat
   7.163 -               // which repeats for each right sub-mat -- can only re-use two of
   7.164 -               // the three, so result is the most important -- avoids writing
   7.165 -               // dirty blocks until those result-locations fully done
   7.166 -               //Row and Col is position in result matrix -- so row and vec
   7.167 -               // for left array, then vec and col for right array
   7.168 -            multiplySubBlocksTransposed( leftArray, rightArray,
   7.169 -                                         resArray,
   7.170 -                                         resStartRow,  resEndRow,
   7.171 -                                         resStartCol,  resEndCol,
   7.172 -                                         startVec,  endVec,
   7.173 -                                         resStride, inpStride );
   7.174 -            startVec = endVec +1;
   7.175 -          }
   7.176 -         resStartCol = resEndCol +1;
   7.177 -       }
   7.178 -      resStartRow = resEndRow +1;
   7.179 -    }
   7.180 - }
   7.181 -
   7.182 -
   7.183 -
   7.184 -void inline
   7.185 -multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
   7.186 -                     float32 *resArray,
   7.187 -                     int resStartRow,  int resEndRow,
   7.188 -                     int resStartCol,  int resEndCol,
   7.189 -                     int startVec,  int endVec,
   7.190 -                     int resStride, int inpStride )
   7.191 - {
   7.192 -   int resRow,     resCol,        vec;
   7.193 -   int leftOffset, rightOffset;
   7.194 -   float32 result;
   7.195 -
   7.196 -      //The result row is used for the left matrix, res col for the right
   7.197 -   for( resCol = resStartCol; resCol <= resEndCol; resCol++ )
   7.198 -    {
   7.199 -      for( resRow = resStartRow; resRow <= resEndRow; resRow++ )
   7.200 -       {
   7.201 -         leftOffset  = resRow * inpStride;//left & right inp strides same
   7.202 -         rightOffset = resCol * inpStride;// because right is transposed
   7.203 -         result = 0;
   7.204 -         for( vec = startVec; vec <= endVec; vec++ )
   7.205 -          {
   7.206 -            result +=
   7.207 -               leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
   7.208 -          }
   7.209 -
   7.210 -         resArray[ resRow * resStride + resCol ] += result;
   7.211 -       }
   7.212 -    }
   7.213 - }
   7.214 -
   7.215 -
   7.216 -/*Reuse this in divider when do the sequential multiply case
   7.217 - */
   7.218 -void inline
   7.219 -copyTranspose( int32 numRows, int32 numCols,
   7.220 -               int32 origStartRow, int32 origStartCol, int32 origStride,
   7.221 -               float32 *subArray, float32 *origArray )
   7.222 - { int32 stride = numRows;
   7.223 -
   7.224 -   int row, col, origOffset;
   7.225 -   for( row = 0; row < numRows; row++ )
   7.226 -    {
   7.227 -      origOffset = (row + origStartRow) * origStride + origStartCol;
   7.228 -      for( col = 0; col < numCols; col++ )
   7.229 -       {
   7.230 -            //transpose means swap row & col -- traverse orig matrix normally
   7.231 -            // but put into reversed place in local array -- means the
   7.232 -            // stride is the numRows now, so col * numRows + row
   7.233 -         subArray[ col * stride + row ]  =  origArray[ origOffset + col ];
   7.234 -       }
   7.235 -    }
   7.236 - }
   7.237 -
   7.238 -void inline
   7.239 -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   7.240 - { int numCols, numRows, origStartRow, origStartCol, origStride, stride;
   7.241 -   Matrix *origMatrix;
   7.242 -   float32 *origArray, *subArray;
   7.243 -
   7.244 -   if( subMatrix->alreadyCopied ) return;
   7.245 -   VCilk__start_singleton(copyMatrixSingleton,&&EndOfTranspSingleton,animPr);
   7.246 -
   7.247 -   origMatrix   = subMatrix->origMatrix;
   7.248 -   origArray    = origMatrix->array;
   7.249 -   numCols      = subMatrix->numCols;
   7.250 -   numRows      = subMatrix->numRows;
   7.251 -   origStartRow = subMatrix->origStartRow;
   7.252 -   origStartCol = subMatrix->origStartCol;
   7.253 -   origStride   = origMatrix->numCols;
   7.254 -
   7.255 -   subArray     = VCilk__malloc( numRows * numCols *sizeof(float32),animPr);
   7.256 -   subMatrix->array = subArray;
   7.257 -
   7.258 -      //copy values from orig matrix to local
   7.259 -   copyTranspose( numRows, numCols,
   7.260 -                  origStartRow, origStartCol, origStride,
   7.261 -                  subArray, origArray );
   7.262 -
   7.263 -   subMatrix->alreadyCopied = TRUE; //must be last thing before label
   7.264 -   EndOfTranspSingleton:
   7.265 -   return;
   7.266 - }
   7.267 -
   7.268 -
   7.269 -void inline
   7.270 -copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   7.271 - { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
   7.272 -   Matrix *origMatrix;
   7.273 -   float32 *origArray, *subArray;
   7.274 -
   7.275 -
   7.276 -      //This lets only a single VP execute the code between start and
   7.277 -      // end -- using start and end so that work runs outside the master.
   7.278 -      //Inside, if a second VP ever executes the start, it will be returned
   7.279 -      // from the end-point.
   7.280 -      //Note, for non-GCC, can add a second SSR call at the end, and inside
   7.281 -      // that one, look at the stack at the return addr & save that in an
   7.282 -      // array indexed by singletonID
   7.283 -   if( subMatrix->alreadyCopied ) return;
   7.284 -   VCilk__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton,animPr);
   7.285 -
   7.286 -
   7.287 -   origMatrix    = subMatrix->origMatrix;
   7.288 -   origArray     = origMatrix->array;
   7.289 -   numCols       = subMatrix->numCols;
   7.290 -   numRows       = subMatrix->numRows;
   7.291 -   origStartRow  = subMatrix->origStartRow;
   7.292 -   origStartCol  = subMatrix->origStartCol;
   7.293 -   origStride    = origMatrix->numCols;
   7.294 -
   7.295 -   subArray      = VCilk__malloc( numRows * numCols *sizeof(float32),animPr);
   7.296 -   subMatrix->array = subArray;
   7.297 -
   7.298 -      //copy values from orig matrix to local
   7.299 -   stride        = numCols;
   7.300 -
   7.301 -   int row, col, offset, origOffset;
   7.302 -   for( row = 0; row < numRows; row++ )
   7.303 -    {
   7.304 -      offset     = row * stride;
   7.305 -      origOffset = (row + origStartRow) * origStride + origStartCol;
   7.306 -      for( col = 0; col < numCols; col++ )
   7.307 -       {
   7.308 -         subArray[ offset + col ]  =  origArray[ origOffset + col ];
   7.309 -       }
   7.310 -    }
   7.311 -
   7.312 -   subMatrix->alreadyCopied = TRUE; //must be last thing before label
   7.313 -   EndOfCopySingleton:
   7.314 -   return;
   7.315 - }
   7.316 +/* 
   7.317 
   7.318 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
   7.319 
   7.320 + *  Licensed under GNU General Public License version 2
   7.321 
   7.322 + *
   7.323 
   7.324 + * Author: SeanHalle@yahoo.com
   7.325 
   7.326 + *
   7.327 
   7.328 + */
   7.329 
   7.330 +
   7.331 
   7.332 +#include <string.h>
   7.333 
   7.334 +
   7.335 
   7.336 +#include "VCilk__Matrix_Mult.h"
   7.337 
   7.338 +
   7.339 
   7.340 +
   7.341 
   7.342 +void inline
   7.343 
   7.344 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
   7.345 
   7.346 +
   7.347 
   7.348 +void inline
   7.349 
   7.350 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
   7.351 
   7.352 +
   7.353 
   7.354 +void inline
   7.355 
   7.356 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
   7.357 
   7.358 +                     float32 *resArray,
   7.359 
   7.360 +                     int startRow,  int endRow,
   7.361 
   7.362 +                     int startCol,  int endCol,
   7.363 
   7.364 +                     int startVec,  int endVec,
   7.365 
   7.366 +                     int resStride, int inpStride );
   7.367 
   7.368 +
   7.369 
   7.370 +void inline
   7.371 
   7.372 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols,
   7.373 
   7.374 +                      float32 *leftArray, float32 *rightArray,
   7.375 
   7.376 +                      float32 *resArray );
   7.377 
   7.378 +
   7.379 
   7.380 +
   7.381 
   7.382 +/*A  processor is created with an environment that holds two matrices,
   7.383 
   7.384 + * the row and col that it owns, and the name of a result gathering
   7.385 
   7.386 + * processor.
   7.387 
   7.388 + *It calculates the product of two sub-portions of the input matrices
   7.389 
   7.390 + * by using Intel's mkl library for single-core.
   7.391 
   7.392 + *
   7.393 
   7.394 + *This demonstrates using optimized single-threaded code inside scheduled
   7.395 
   7.396 + * work-units.
   7.397 
   7.398 + *
   7.399 
   7.400 + *When done, it sends the result to the result processor
   7.401 
   7.402 + */
   7.403 
   7.404 +void
   7.405 
   7.406 +calcSubMatrixProduct( void *data, VirtProcr *animPr )
   7.407 
   7.408 + { 
   7.409 
   7.410 +   SMPairParams   *params;
   7.411 
   7.412 +   VirtProcr      *resultPr;
   7.413 
   7.414 +   float32        *leftArray,  *rightArray, *resArray;
   7.415 
   7.416 +   SubMatrix      *leftSubMatrix, *rightSubMatrix;
   7.417 
   7.418 +
   7.419 
   7.420 +
   7.421 
   7.422 +         DEBUG1( dbgAppFlow, "start sub-matrix mult %d\n", animPr->procrID)
   7.423 
   7.424 +         #ifdef TURN_ON_DEBUG_PROBES
   7.425 
   7.426 +         int32 subMatrixProbe = 
   7.427 
   7.428 +            VMS__create_single_interval_probe( "subMtx",      animPr);
   7.429 
   7.430 +         VMS__record_sched_choice_into_probe( subMatrixProbe, animPr );
   7.431 
   7.432 +         VMS__record_interval_start_in_probe( subMatrixProbe );
   7.433 
   7.434 +         #endif
   7.435 
   7.436 +
   7.437 
   7.438 +   params         = (SMPairParams *)data;
   7.439 
   7.440 +//   resultPr       = params->resultPr;
   7.441 
   7.442 +   leftSubMatrix  = params->leftSubMatrix;
   7.443 
   7.444 +   rightSubMatrix = params->rightSubMatrix;
   7.445 
   7.446 +
   7.447 
   7.448 +      //make sure the input sub-matrices have been copied out of orig
   7.449 
   7.450 +   copyFromOrig( leftSubMatrix, animPr );
   7.451 
   7.452 +   copyTransposeFromOrig( rightSubMatrix, animPr );
   7.453 
   7.454 +   
   7.455 
   7.456 +   leftArray      = leftSubMatrix->array;
   7.457 
   7.458 +   rightArray     = rightSubMatrix->array;
   7.459 
   7.460 +
   7.461 
   7.462 +      //make this array here, on the core that computes the results
   7.463 
   7.464 +      // with Cilk's semantics, have to have separate result array for each
   7.465 
   7.466 +      // spawned processor -- unless want to change the spawn and sync
   7.467 
   7.468 +      // pattern, such that spawn one from each vector, then sync, then
   7.469 
   7.470 +      // another, and so forth -- this will cause idle time due to imbalance
   7.471 
   7.472 +      // in matrix sizes
   7.473 
   7.474 +      //This also gives chance to set affinity so all in vector run on same
   7.475 
   7.476 +      // core and re-use the accumulation array,
   7.477 
   7.478 +      //As a side-benefit, it also prevents writes from causing
   7.479 
   7.480 +      // thrashing of the cache -- as long as array big enough, the copy
   7.481 
   7.482 +      // overhead is small because each byte is reused size-of-side times
   7.483 
   7.484 +      //This is freed in the vector processor
   7.485 
   7.486 +   int32
   7.487 
   7.488 +   resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32);
   7.489 
   7.490 +   resArray = VCilk__malloc( resSize, animPr );
   7.491 
   7.492 +   memset( resArray, 0, resSize );
   7.493 
   7.494 +
   7.495 
   7.496 +
   7.497 
   7.498 +   int32 numResRows, numResCols, vectLength;
   7.499 
   7.500 +   
   7.501 
   7.502 +   vectLength = leftSubMatrix->numCols;
   7.503 
   7.504 +   numResRows = leftSubMatrix->numRows;
   7.505 
   7.506 +   numResCols = rightSubMatrix->numCols;
   7.507 
   7.508 +
   7.509 
   7.510 +   multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
   7.511 
   7.512 +                         leftArray, rightArray,
   7.513 
   7.514 +                         resArray );
   7.515 
   7.516 +
   7.517 
   7.518 +   //send result by side-effect
   7.519 
   7.520 +   params->partialResultArray = resArray;
   7.521 
   7.522 +
   7.523 
   7.524 +         #ifdef TURN_ON_DEBUG_PROBES
   7.525 
   7.526 +         VMS__record_interval_end_in_probe( subMatrixProbe );
   7.527 
   7.528 +         #endif
   7.529 
   7.530 +         
   7.531 
   7.532 +         DEBUG1( dbgAppFlow, "end sub-matrix mult %d\n", animPr->procrID)
   7.533 
   7.534 +   VCilk__dissipate_procr( animPr );
   7.535 
   7.536 + }
   7.537 
   7.538 +
   7.539 
   7.540 +
   7.541 
   7.542 +
   7.543 
   7.544 +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into
   7.545 
   7.546 + * the 32KB L1 cache.
   7.547 
   7.548 + *Would be nice to embed this within another level that divided into
   7.549 
   7.550 + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
   7.551 
   7.552 + *
   7.553 
   7.554 + *Eventually want these divisions to be automatic, using DKU pattern
   7.555 
   7.556 + * embedded into VMS and exposed in the language, and with VMS controlling the
   7.557 
   7.558 + * divisions according to the cache sizes, which it knows about.
   7.559 
   7.560 + *Also, want VMS to work with language to split among main-mems, so a socket
   7.561 
   7.562 + * only cranks on data in its local segment of main mem
   7.563 
   7.564 + *
   7.565 
   7.566 + *So, outer two loops determine start and end points within the result matrix.
   7.567 
   7.568 + * Inside that, a loop dets the start and end points along the shared dimensions
   7.569 
   7.570 + * of the two input matrices.
   7.571 
   7.572 + */
   7.573 
   7.574 +void inline
   7.575 
   7.576 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
   7.577 
   7.578 +                                int32 numResCols,
   7.579 
   7.580 +                                float32 *leftArray, float32 *rightArray,
   7.581 
   7.582 +                                float32 *resArray )
   7.583 
   7.584 + {
   7.585 
   7.586 +   int resStride, inpStride;
   7.587 
   7.588 +   int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
   7.589 
   7.590 +
   7.591 
   7.592 +   resStride  = numResCols;
   7.593 
   7.594 +   inpStride  = vecLength;
   7.595 
   7.596 +
   7.597 
   7.598 +   for( resStartRow = 0; resStartRow < numResRows; )
   7.599 
   7.600 +    {
   7.601 
   7.602 +      resEndRow = resStartRow + ROWS_IN_BLOCK -1;  //start at zero, so -1
   7.603 
   7.604 +      if( resEndRow > numResRows ) resEndRow = numResRows -1;
   7.605 
   7.606 +
   7.607 
   7.608 +      for( resStartCol = 0; resStartCol < numResCols; )
   7.609 
   7.610 +       {
   7.611 
   7.612 +         resEndCol   = resStartCol + COLS_IN_BLOCK -1;
   7.613 
   7.614 +         if( resEndCol > numResCols ) resEndCol = numResCols -1;
   7.615 
   7.616 +
   7.617 
   7.618 +         for( startVec = 0; startVec < vecLength; )
   7.619 
   7.620 +          {
   7.621 
   7.622 +            endVec   = startVec + VEC_IN_BLOCK -1;
   7.623 
   7.624 +            if( endVec > vecLength ) endVec = vecLength -1;
   7.625 
   7.626 +
   7.627 
   7.628 +               //By having the "vector" of sub-blocks in a sub-block slice
   7.629 
   7.630 +               // be marched down in inner loop, are re-using the result
   7.631 
   7.632 +               // matrix, which stays in L1 cache and re-using the left sub-mat
   7.633 
   7.634 +               // which repeats for each right sub-mat -- can only re-use two of
   7.635 
   7.636 +               // the three, so result is the most important -- avoids writing
   7.637 
   7.638 +               // dirty blocks until those result-locations fully done
   7.639 
   7.640 +               //Row and Col is position in result matrix -- so row and vec
   7.641 
   7.642 +               // for left array, then vec and col for right array
   7.643 
   7.644 +            multiplySubBlocksTransposed( leftArray, rightArray,
   7.645 
   7.646 +                                         resArray,
   7.647 
   7.648 +                                         resStartRow,  resEndRow,
   7.649 
   7.650 +                                         resStartCol,  resEndCol,
   7.651 
   7.652 +                                         startVec,  endVec,
   7.653 
   7.654 +                                         resStride, inpStride );
   7.655 
   7.656 +            startVec = endVec +1;
   7.657 
   7.658 +          }
   7.659 
   7.660 +         resStartCol = resEndCol +1;
   7.661 
   7.662 +       }
   7.663 
   7.664 +      resStartRow = resEndRow +1;
   7.665 
   7.666 +    }
   7.667 
   7.668 + }
   7.669 
   7.670 +
   7.671 
   7.672 +
   7.673 
   7.674 +
   7.675 
   7.676 +void inline
   7.677 
   7.678 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
   7.679 
   7.680 +                     float32 *resArray,
   7.681 
   7.682 +                     int resStartRow,  int resEndRow,
   7.683 
   7.684 +                     int resStartCol,  int resEndCol,
   7.685 
   7.686 +                     int startVec,  int endVec,
   7.687 
   7.688 +                     int resStride, int inpStride )
   7.689 
   7.690 + {
   7.691 
   7.692 +   int resRow,     resCol,        vec;
   7.693 
   7.694 +   int leftOffset, rightOffset;
   7.695 
   7.696 +   float32 result;
   7.697 
   7.698 +
   7.699 
   7.700 +      //The result row is used for the left matrix, res col for the right
   7.701 
   7.702 +   for( resCol = resStartCol; resCol <= resEndCol; resCol++ )
   7.703 
   7.704 +    {
   7.705 
   7.706 +      for( resRow = resStartRow; resRow <= resEndRow; resRow++ )
   7.707 
   7.708 +       {
   7.709 
   7.710 +         leftOffset  = resRow * inpStride;//left & right inp strides same
   7.711 
   7.712 +         rightOffset = resCol * inpStride;// because right is transposed
   7.713 
   7.714 +         result = 0;
   7.715 
   7.716 +         for( vec = startVec; vec <= endVec; vec++ )
   7.717 
   7.718 +          {
   7.719 
   7.720 +            result +=
   7.721 
   7.722 +               leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
   7.723 
   7.724 +          }
   7.725 
   7.726 +
   7.727 
   7.728 +         resArray[ resRow * resStride + resCol ] += result;
   7.729 
   7.730 +       }
   7.731 
   7.732 +    }
   7.733 
   7.734 + }
   7.735 
   7.736 +
   7.737 
   7.738 +
   7.739 
   7.740 +/*Reuse this in divider when do the sequential multiply case
   7.741 
   7.742 + */
   7.743 
   7.744 +void inline
   7.745 
   7.746 +copyTranspose( int32 numRows, int32 numCols,
   7.747 
   7.748 +               int32 origStartRow, int32 origStartCol, int32 origStride,
   7.749 
   7.750 +               float32 *subArray, float32 *origArray )
   7.751 
   7.752 + { int32 stride = numRows;
   7.753 
   7.754 +
   7.755 
   7.756 +   int row, col, origOffset;
   7.757 
   7.758 +   for( row = 0; row < numRows; row++ )
   7.759 
   7.760 +    {
   7.761 
   7.762 +      origOffset = (row + origStartRow) * origStride + origStartCol;
   7.763 
   7.764 +      for( col = 0; col < numCols; col++ )
   7.765 
   7.766 +       {
   7.767 
   7.768 +            //transpose means swap row & col -- traverse orig matrix normally
   7.769 
   7.770 +            // but put into reversed place in local array -- means the
   7.771 
   7.772 +            // stride is the numRows now, so col * numRows + row
   7.773 
   7.774 +         subArray[ col * stride + row ]  =  origArray[ origOffset + col ];
   7.775 
   7.776 +       }
   7.777 
   7.778 +    }
   7.779 
   7.780 + }
   7.781 
   7.782 +
   7.783 
   7.784 +void inline
   7.785 
   7.786 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   7.787 
   7.788 + { int numCols, numRows, origStartRow, origStartCol, origStride, stride;
   7.789 
   7.790 +   Matrix *origMatrix;
   7.791 
   7.792 +   float32 *origArray, *subArray;
   7.793 
   7.794 +
   7.795 
   7.796 +   VCilk__start_data_singleton( &(subMatrix->copyTransSingleton), animPr );
   7.797 
   7.798 +
   7.799 
   7.800 +   origMatrix   = subMatrix->origMatrix;
   7.801 
   7.802 +   origArray    = origMatrix->array;
   7.803 
   7.804 +   numCols      = subMatrix->numCols;
   7.805 
   7.806 +   numRows      = subMatrix->numRows;
   7.807 
   7.808 +   origStartRow = subMatrix->origStartRow;
   7.809 
   7.810 +   origStartCol = subMatrix->origStartCol;
   7.811 
   7.812 +   origStride   = origMatrix->numCols;
   7.813 
   7.814 +
   7.815 
   7.816 +   subArray     = VCilk__malloc( numRows * numCols *sizeof(float32),animPr);
   7.817 
   7.818 +   subMatrix->array = subArray;
   7.819 
   7.820 +
   7.821 
   7.822 +      //copy values from orig matrix to local
   7.823 
   7.824 +   copyTranspose( numRows, numCols,
   7.825 
   7.826 +                  origStartRow, origStartCol, origStride,
   7.827 
   7.828 +                  subArray, origArray );
   7.829 
   7.830 +
   7.831 
   7.832 +   VCilk__end_data_singleton( &(subMatrix->copyTransSingleton), animPr );
   7.833 
   7.834 +   return;
   7.835 
   7.836 + }
   7.837 
   7.838 +
   7.839 
   7.840 +
   7.841 
   7.842 +void inline
   7.843 
   7.844 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   7.845 
   7.846 + { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
   7.847 
   7.848 +   Matrix *origMatrix;
   7.849 
   7.850 +   float32 *origArray, *subArray;
   7.851 
   7.852 +
   7.853 
   7.854 +
   7.855 
   7.856 +      //This lets only a single VP execute the code between start and
   7.857 
   7.858 +      // end -- using start and end so that work runs outside the master.
   7.859 
   7.860 +      //Inside, if a second VP ever executes the start, it will be returned
   7.861 
   7.862 +      // from the end-point.
   7.863 
   7.864 +      //Note, for non-GCC, can add a second SSR call at the end, and inside
   7.865 
   7.866 +      // that one, look at the stack at the return addr & save that in an
   7.867 
   7.868 +      // array indexed by singletonID
   7.869 
   7.870 +   VCilk__start_data_singleton( &(subMatrix->copySingleton), animPr );
   7.871 
   7.872 +
   7.873 
   7.874 +
   7.875 
   7.876 +   origMatrix    = subMatrix->origMatrix;
   7.877 
   7.878 +   origArray     = origMatrix->array;
   7.879 
   7.880 +   numCols       = subMatrix->numCols;
   7.881 
   7.882 +   numRows       = subMatrix->numRows;
   7.883 
   7.884 +   origStartRow  = subMatrix->origStartRow;
   7.885 
   7.886 +   origStartCol  = subMatrix->origStartCol;
   7.887 
   7.888 +   origStride    = origMatrix->numCols;
   7.889 
   7.890 +
   7.891 
   7.892 +   subArray      = VCilk__malloc( numRows * numCols *sizeof(float32),animPr);
   7.893 
   7.894 +   subMatrix->array = subArray;
   7.895 
   7.896 +
   7.897 
   7.898 +      //copy values from orig matrix to local
   7.899 
   7.900 +   stride        = numCols;
   7.901 
   7.902 +
   7.903 
   7.904 +   int row, col, offset, origOffset;
   7.905 
   7.906 +   for( row = 0; row < numRows; row++ )
   7.907 
   7.908 +    {
   7.909 
   7.910 +      offset     = row * stride;
   7.911 
   7.912 +      origOffset = (row + origStartRow) * origStride + origStartCol;
   7.913 
   7.914 +      for( col = 0; col < numCols; col++ )
   7.915 
   7.916 +       {
   7.917 
   7.918 +         subArray[ offset + col ]  =  origArray[ origOffset + col ];
   7.919 
   7.920 +       }
   7.921 
   7.922 +    }
   7.923 
   7.924 +   VCilk__end_data_singleton( &(subMatrix->copySingleton), animPr );
   7.925 
   7.926 +
   7.927 
   7.928 +   return;
   7.929 
   7.930 + }
   7.931 
     8.1 --- a/src/Application/main.c	Mon Nov 15 12:08:19 2010 -0800
     8.2 +++ b/src/Application/main.c	Wed May 11 15:40:54 2011 +0200
     8.3 @@ -1,35 +1,35 @@
     8.4 -/*
     8.5 - *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     8.6 - *  Licensed under GNU General Public License version 2
     8.7 - *
     8.8 - * author seanhalle@yahoo.com
     8.9 - */
    8.10 -
    8.11 -#include <malloc.h>
    8.12 -#include <stdlib.h>
    8.13 -
    8.14 -#include "Matrix_Mult.h"
    8.15 -#include "VCilk__Matrix_Mult/VCilk__Matrix_Mult.h"
    8.16 -
    8.17 -/**
    8.18 - *Matrix multiply program written using VMS_HW piggy-back language
    8.19 - * 
    8.20 - */
    8.21 -int main( int argc, char **argv )
    8.22 - { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
    8.23 -   ParamBag    *paramBag;
    8.24 -   
    8.25 -   paramBag = makeParamBag();
    8.26 -   printf("arguments: %s | %s | %s | %s\n", argv[0], argv[1], argv[2], argv[3] );
    8.27 -   readParamFileIntoBag( argv[1], paramBag );
    8.28 -   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
    8.29 -   
    8.30 -   resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix );
    8.31 -
    8.32 -//   printf("\nresult matrix: \n"); \
    8.33 -   printMatrix( resultMatrix );
    8.34 -   
    8.35 -//   VCilk__print_stats();
    8.36 -   fflush(stdin);
    8.37 -   exit(0); //cleans up
    8.38 - }
    8.39 +/*
    8.40 
    8.41 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
    8.42 
    8.43 + *  Licensed under GNU General Public License version 2
    8.44 
    8.45 + *
    8.46 
    8.47 + * author seanhalle@yahoo.com
    8.48 
    8.49 + */
    8.50 
    8.51 +
    8.52 
    8.53 +#include <malloc.h>
    8.54 
    8.55 +#include <stdlib.h>
    8.56 
    8.57 +
    8.58 
    8.59 +#include "Matrix_Mult.h"
    8.60 
    8.61 +#include "VCilk__Matrix_Mult/VCilk__Matrix_Mult.h"
    8.62 
    8.63 +
    8.64 
    8.65 +/**
    8.66 
    8.67 + *Matrix multiply program written using VMS_HW piggy-back language
    8.68 
    8.69 + * 
    8.70 
    8.71 + */
    8.72 
    8.73 +int main( int argc, char **argv )
    8.74 
    8.75 + { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
    8.76 
    8.77 +   ParamBag    *paramBag;
    8.78 
    8.79 +   
    8.80 
    8.81 +   paramBag = makeParamBag();
    8.82 
    8.83 +   printf("arguments: %s | %s | %s | %s\n", argv[0], argv[1], argv[2], argv[3] );
    8.84 
    8.85 +   readParamFileIntoBag( argv[1], paramBag );
    8.86 
    8.87 +   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
    8.88 
    8.89 +   
    8.90 
    8.91 +   resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix );
    8.92 
    8.93 +
    8.94 
    8.95 +//   printf("\nresult matrix: \n"); \
    8.96 
    8.97 +   printMatrix( resultMatrix );
    8.98 
    8.99 +   
   8.100 
   8.101 +//   VCilk__print_stats();
   8.102 
   8.103 +   fflush(stdin);
   8.104 
   8.105 +   exit(0); //cleans up
   8.106 
   8.107 + }
   8.108