# HG changeset patch
# User Me
# Date 1289141726 28800
# Node ID 06c80cd3c3039bcbe762c8c238cef488658b4d69
# Parent  56e17dcfc0c32aaa3cabf7c19208cd2345b501f1
Fixed bugs in multiply code -- has correct loop nest now and initializes result

diff -r 56e17dcfc0c3 -r 06c80cd3c303 src/Application/VCilk__Matrix_Mult/Divide_Pr.c
--- a/src/Application/VCilk__Matrix_Mult/Divide_Pr.c	Sat Oct 30 20:43:49 2010 -0700
+++ b/src/Application/VCilk__Matrix_Mult/Divide_Pr.c	Sun Nov 07 06:55:26 2010 -0800
@@ -128,7 +128,6 @@
    
          PRINT_DEBUG("start divide\n")
 
-//TODO:         VMS__create_block_of_probes_with_idxs( 0, 2 );
          int32
          divideProbe = VMS__create_single_interval_probe( "divideProbe",
                                                           animPr );
diff -r 56e17dcfc0c3 -r 06c80cd3c303 src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c
--- a/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c	Sat Oct 30 20:43:49 2010 -0700
+++ b/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c	Sun Nov 07 06:55:26 2010 -0800
@@ -78,8 +78,10 @@
       // thrashing of the cache -- as long as array big enough, the copy
       // overhead is small because each byte is reused size-of-side times
       //This is freed in the vector processor
-   resArray = VCilk__malloc(leftSubMatrix->numRows * rightSubMatrix->numCols*
-                           sizeof( float32 ), animatingPr );
+   int32
+   resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32);
+   resArray = VCilk__malloc( resSize, animatingPr );
+   memset( resArray, 0, resSize );
 
 
    int32 numResRows, numResCols, vectLength;
@@ -101,16 +103,21 @@
  }
 
 
-/*Divides into 32x32 sub-matrices, 3 of which fit into 32KB L1 cache
- * Would be nice to embed this within another level that divided into
+
+/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into
+ * the 32KB L1 cache.
+ *Would be nice to embed this within another level that divided into
  * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
  *
  *Eventually want these divisions to be automatic, using DKU pattern
- * embedded into VCilk, and with VMS controlling the divisions according to
- * the cache sizes, which it knows about.
- *And, want VMS to work with language to split among main-mems, so a socket
+ * embedded into VMS and exposed in the language, and with VMS controlling the
+ * divisions according to the cache sizes, which it knows about.
+ *Also, want VMS to work with language to split among main-mems, so a socket
  * only cranks on data in its local segment of main mem
  *
+ *So, the outer two loops determine the start and end points within the result
+ * matrix. Inside that, a loop determines the start and end points along the
+ * shared dimension of the two input matrices.
  */
 void inline
 multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols,
@@ -118,74 +125,79 @@
                       float32 *resArray )
  {
    int resStride, inpStride;
-   int startRow, startCol, endRow, endCol, startVec, endVec;
+   int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
 
    resStride  = numResCols;
    inpStride  = vecLength;
 
-   for( startRow = 0; startRow < numResRows; )
+   for( resStartRow = 0; resStartRow < numResRows; )
     {
-      endRow = startRow + ROWS_IN_BLOCK;
-      if( endRow > numResRows ) endRow = numResRows;
+      resEndRow = resStartRow + ROWS_IN_BLOCK -1;  //start at zero, so -1
+      if( resEndRow > numResRows ) resEndRow = numResRows;
 
-      for( startCol = 0; startCol < numResCols; )
+      for( resStartCol = 0; resStartCol < numResCols; )
        {
-         endCol   = startCol + COLS_IN_BLOCK;
-         if( endCol > numResCols ) endCol = numResCols;
+         resEndCol   = resStartCol + COLS_IN_BLOCK -1;
+         if( resEndCol > numResCols ) resEndCol = numResCols;
+         resStartCol = resEndCol +1;
+
 
          for( startVec = 0; startVec < vecLength; )
           {
-            endVec   = startVec + VEC_IN_BLOCK;
+            endVec   = startVec + VEC_IN_BLOCK -1;
             if( endVec > vecLength ) endVec = vecLength;
 
                //By having the "vector" of sub-blocks in a sub-block slice
                // be marched down in inner loop, are re-using the result
-               // matrix, which stays in L1 cache -- can only re-use one of
-               // the three, so this is the most important -- avoids writing
+               // matrix, which stays in L1 cache, and re-using the left sub-matrix,
+               // which repeats for each right sub-matrix -- can only re-use two of
+               // the three, so result is the most important -- avoids writing
                // dirty blocks until those result-locations fully done
                //Row and Col is position in result matrix -- so row and vec
                // for left array, then vec and col for right array
             multiplySubBlocksTransposed( leftArray, rightArray,
                                          resArray,
-                                         startRow,  endRow,
-                                         startCol,  endCol,
+                                         resStartRow,  resEndRow,
+                                         resStartCol,  resEndCol,
                                          startVec,  endVec,
                                          resStride, inpStride );
-            startVec = endVec;
+            startVec = endVec +1;
           }
-         startCol = endCol;
+         resStartCol = resEndCol +1;
        }
-      startRow = endRow;
+      resStartRow = resEndRow +1;
     }
  }
 
 
+
 void inline
 multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
                      float32 *resArray,
-                     int startRow,  int endRow,
-                     int startCol,  int endCol,
+                     int resStartRow,  int resEndRow,
+                     int resStartCol,  int resEndCol,
                      int startVec,  int endVec,
                      int resStride, int inpStride )
  {
-   int row,    col,        vec;
+   int resRow,     resCol,        vec;
    int leftOffset, rightOffset;
    float32 result;
-   
-   for( row = startRow; row < endRow; row++ )
+
+      //The result row is used only for the left matrix, res col for the right
+   for( resCol = resStartCol; resCol <= resEndCol; resCol++ )
     { 
-      for( col = startCol; col < endCol; col++ )
+      for( resRow = resStartRow; resRow <= resEndRow; resRow++ )
        { 
-         leftOffset  = row * inpStride;//left & right inp strides always same
-         rightOffset = col * inpStride;// because right is transposed
+         leftOffset  = resRow * inpStride;//left & right inp strides always same
+         rightOffset = resCol * inpStride;// because right is transposed
          result = 0;
-         for( vec = startVec; vec < endVec; vec++ )
+         for( vec = startVec; vec <= endVec; vec++ )
           {
             result +=
                leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
           }
          
-         resArray[ row * resStride + col ] += result;
+         resArray[ resRow * resStride + resCol ] += result;
        }
     }
  }
@@ -214,7 +226,7 @@
    origStride   = origMatrix->numCols;
 
       //This is free in Divide pr after all calcs are done
-   subArray     = VCilk__malloc( numRows * numCols * sizeof(float32),animPr);
+   subArray     = VCilk__malloc( numRows * numCols * sizeof(float32), animPr );
    subMatrix->array = subArray;
 
       //copy values from orig matrix to local
diff -r 56e17dcfc0c3 -r 06c80cd3c303 src/Application/main.c
--- a/src/Application/main.c	Sat Oct 30 20:43:49 2010 -0700
+++ b/src/Application/main.c	Sun Nov 07 06:55:26 2010 -0800
@@ -20,6 +20,7 @@
    ParamBag    *paramBag;
    
    paramBag = makeParamBag();
+   printf("arguments: %s | %s | %s | %s\n", argv[0], argv[1], argv[2], argv[3] );
    readParamFileIntoBag( argv[1], paramBag );
    initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );