changeset 1:06c80cd3c303

Fixed bugs in multiply code -- has correct loop nest now and initializes result
author Me
date Sun, 07 Nov 2010 06:55:26 -0800
parents 56e17dcfc0c3
children 5311bd32c3cb
files src/Application/VCilk__Matrix_Mult/Divide_Pr.c src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c src/Application/main.c
diffstat 3 files changed, 46 insertions(+), 34 deletions(-) [+]
line diff
     1.1 --- a/src/Application/VCilk__Matrix_Mult/Divide_Pr.c	Sat Oct 30 20:43:49 2010 -0700
     1.2 +++ b/src/Application/VCilk__Matrix_Mult/Divide_Pr.c	Sun Nov 07 06:55:26 2010 -0800
     1.3 @@ -128,7 +128,6 @@
     1.4     
     1.5           PRINT_DEBUG("start divide\n")
     1.6  
     1.7 -//TODO:         VMS__create_block_of_probes_with_idxs( 0, 2 );
     1.8           int32
     1.9           divideProbe = VMS__create_single_interval_probe( "divideProbe",
    1.10                                                            animPr );
     2.1 --- a/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c	Sat Oct 30 20:43:49 2010 -0700
     2.2 +++ b/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c	Sun Nov 07 06:55:26 2010 -0800
     2.3 @@ -78,8 +78,10 @@
     2.4        // thrashing of the cache -- as long as array big enough, the copy
     2.5        // overhead is small because each byte is reused size-of-side times
     2.6        //This is freed in the vector processor
     2.7 -   resArray = VCilk__malloc(leftSubMatrix->numRows * rightSubMatrix->numCols*
     2.8 -                           sizeof( float32 ), animatingPr );
     2.9 +   int32
    2.10 +   resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32);
    2.11 +   resArray = VCilk__malloc( resSize, animatingPr );
    2.12 +   memset( resArray, 0, resSize );
    2.13  
    2.14  
    2.15     int32 numResRows, numResCols, vectLength;
    2.16 @@ -101,16 +103,21 @@
    2.17   }
    2.18  
    2.19  
    2.20 -/*Divides into 32x32 sub-matrices, 3 of which fit into 32KB L1 cache
    2.21 - * Would be nice to embed this within another level that divided into
    2.22 +
    2.23 +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into
    2.24 + * the 32KB L1 cache.
    2.25 + *Would be nice to embed this within another level that divided into
    2.26   * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
    2.27   *
    2.28   *Eventually want these divisions to be automatic, using DKU pattern
    2.29 - * embedded into VCilk, and with VMS controlling the divisions according to
    2.30 - * the cache sizes, which it knows about.
    2.31 - *And, want VMS to work with language to split among main-mems, so a socket
    2.32 + * embedded into VMS and exposed in the language, and with VMS controlling the
    2.33 + * divisions according to the cache sizes, which it knows about.
    2.34 + *Also, want VMS to work with language to split among main-mems, so a socket
    2.35   * only cranks on data in its local segment of main mem
    2.36   *
    2.37 + *So, outer two loops determine start and end points within the result matrix.
    2.38 + * Inside that, a loop determines the start and end points along the shared dimensions
    2.39 + * of the two input matrices.
    2.40   */
    2.41  void inline
    2.42  multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols,
    2.43 @@ -118,74 +125,79 @@
    2.44                        float32 *resArray )
    2.45   {
    2.46     int resStride, inpStride;
    2.47 -   int startRow, startCol, endRow, endCol, startVec, endVec;
    2.48 +   int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
    2.49  
    2.50     resStride  = numResCols;
    2.51     inpStride  = vecLength;
    2.52  
    2.53 -   for( startRow = 0; startRow < numResRows; )
    2.54 +   for( resStartRow = 0; resStartRow < numResRows; )
    2.55      {
    2.56 -      endRow = startRow + ROWS_IN_BLOCK;
    2.57 -      if( endRow > numResRows ) endRow = numResRows;
    2.58 +      resEndRow = resStartRow + ROWS_IN_BLOCK -1;  //start at zero, so -1
    2.59 +      if( resEndRow > numResRows ) resEndRow = numResRows;
    2.60  
    2.61 -      for( startCol = 0; startCol < numResCols; )
    2.62 +      for( resStartCol = 0; resStartCol < numResCols; )
    2.63         {
    2.64 -         endCol   = startCol + COLS_IN_BLOCK;
    2.65 -         if( endCol > numResCols ) endCol = numResCols;
    2.66 +         resEndCol   = resStartCol + COLS_IN_BLOCK -1;
    2.67 +         if( resEndCol > numResCols ) resEndCol = numResCols;
    2.68 +         resStartCol = resEndCol +1;
    2.69 +
    2.70  
    2.71           for( startVec = 0; startVec < vecLength; )
    2.72            {
    2.73 -            endVec   = startVec + VEC_IN_BLOCK;
    2.74 +            endVec   = startVec + VEC_IN_BLOCK -1;
    2.75              if( endVec > vecLength ) endVec = vecLength;
    2.76  
    2.77                 //By having the "vector" of sub-blocks in a sub-block slice
    2.78                 // be marched down in inner loop, are re-using the result
    2.79 -               // matrix, which stays in L1 cache -- can only re-use one of
    2.80 -               // the three, so this is the most important -- avoids writing
    2.81 +               // matrix, which stays in L1 cache and re-using the left sub-mat
    2.82 +               // which repeats for each right sub-mat -- can only re-use two of
    2.83 +               // the three, so result is the most important -- avoids writing
    2.84                 // dirty blocks until those result-locations fully done
    2.85                 //Row and Col is position in result matrix -- so row and vec
    2.86                 // for left array, then vec and col for right array
    2.87              multiplySubBlocksTransposed( leftArray, rightArray,
    2.88                                           resArray,
    2.89 -                                         startRow,  endRow,
    2.90 -                                         startCol,  endCol,
    2.91 +                                         resStartRow,  resEndRow,
    2.92 +                                         resStartCol,  resEndCol,
    2.93                                           startVec,  endVec,
    2.94                                           resStride, inpStride );
    2.95 -            startVec = endVec;
    2.96 +            startVec = endVec +1;
    2.97            }
    2.98 -         startCol = endCol;
    2.99 +         resStartCol = resEndCol +1;
   2.100         }
   2.101 -      startRow = endRow;
   2.102 +      resStartRow = resEndRow +1;
   2.103      }
   2.104   }
   2.105  
   2.106  
   2.107 +
   2.108  void inline
   2.109  multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
   2.110                       float32 *resArray,
   2.111 -                     int startRow,  int endRow,
   2.112 -                     int startCol,  int endCol,
   2.113 +                     int resStartRow,  int resEndRow,
   2.114 +                     int resStartCol,  int resEndCol,
   2.115                       int startVec,  int endVec,
   2.116                       int resStride, int inpStride )
   2.117   {
   2.118 -   int row,    col,        vec;
   2.119 +   int resRow,     resCol,        vec;
   2.120     int leftOffset, rightOffset;
   2.121     float32 result;
   2.122 -   
   2.123 -   for( row = startRow; row < endRow; row++ )
   2.124 +
   2.125 +      //The result row is used only for the left matrix, res col for the right
   2.126 +   for( resCol = resStartCol; resCol <= resEndCol; resCol++ )
   2.127      { 
   2.128 -      for( col = startCol; col < endCol; col++ )
   2.129 +      for( resRow = resStartRow; resRow <= resEndRow; resRow++ )
   2.130         { 
   2.131 -         leftOffset  = row * inpStride;//left & right inp strides always same
   2.132 -         rightOffset = col * inpStride;// because right is transposed
   2.133 +         leftOffset  = resRow * inpStride;//left & right inp strides always same
   2.134 +         rightOffset = resCol * inpStride;// because right is transposed
   2.135           result = 0;
   2.136 -         for( vec = startVec; vec < endVec; vec++ )
   2.137 +         for( vec = startVec; vec <= endVec; vec++ )
   2.138            {
   2.139              result +=
   2.140                 leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
   2.141            }
   2.142           
   2.143 -         resArray[ row * resStride + col ] += result;
   2.144 +         resArray[ resRow * resStride + resCol ] += result;
   2.145         }
   2.146      }
   2.147   }
   2.148 @@ -214,7 +226,7 @@
   2.149     origStride   = origMatrix->numCols;
   2.150  
   2.151        //This is freed in Divide pr after all calcs are done
   2.152 -   subArray     = VCilk__malloc( numRows * numCols * sizeof(float32),animPr);
   2.153 +   subArray     = VCilk__malloc( numRows * numCols * sizeof(float32), animPr );
   2.154     subMatrix->array = subArray;
   2.155  
   2.156        //copy values from orig matrix to local
     3.1 --- a/src/Application/main.c	Sat Oct 30 20:43:49 2010 -0700
     3.2 +++ b/src/Application/main.c	Sun Nov 07 06:55:26 2010 -0800
     3.3 @@ -20,6 +20,7 @@
     3.4     ParamBag    *paramBag;
     3.5     
     3.6     paramBag = makeParamBag();
     3.7 +   printf("arguments: %s | %s | %s | %s\n", argv[0], argv[1], argv[2], argv[3] );
     3.8     readParamFileIntoBag( argv[1], paramBag );
     3.9     initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
    3.10