# HG changeset patch
# User Me
# Date 1288917855 25200
# Node ID cbd8db6b8657e3e3c30f35c6552284b3b244733f
# Parent  4e14e2663af9065983d3817bdf62480829f6428b
Fixed last bugs in matrix multiply code -- gives correct answers consistently

Needed to initialize the result matrices to 0 because the multiply accumulates
into them, fixed a bug in the sequential bypass where the wrong array was
passed, and fixed a problem with end-conditions in the blocked multiply loop nest

diff -r 4e14e2663af9 -r cbd8db6b8657 src/Application/SSR_Matrix_Mult/Divide_Pr.c
--- a/src/Application/SSR_Matrix_Mult/Divide_Pr.c	Tue Nov 02 17:00:50 2010 -0700
+++ b/src/Application/SSR_Matrix_Mult/Divide_Pr.c	Thu Nov 04 17:44:15 2010 -0700
@@ -122,7 +122,7 @@
    SlicingStrucCarrier *slicingStrucCarrier;
    float32         *resultArray; //points to array inside result matrix
    
-         DEBUG("start divide\n")
+         DEBUG( dbgAppFlow, "start divide\n")
 
          int32
          divideProbe = VMS__create_single_interval_probe( "divideProbe",
@@ -152,13 +152,16 @@
        (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
     {
       //====== Do sequential multiply on a single core
-            DEBUG("doing sequential")
+            DEBUG( dbgAppFlow, "doing sequential")
 
-      //have to transpose the right matrix first
+         //zero the result array
+      memset( resultArray, 0, numResRows * numResCols * sizeof(float32) );
+            
+         //transpose the right matrix
       float32 *
-      transRightArray  = SSR__malloc_to( rightMatrix->numRows *
-                                         rightMatrix->numCols *
-                                         sizeof(float32),        animPr );
+      transRightArray  = SSR__malloc_to( rightMatrix->numRows * 
+                                         rightMatrix->numCols * sizeof(float32),
+                                         animPr );
 
          //copy values from orig matrix to local
       copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
@@ -166,7 +169,7 @@
                      transRightArray, rightMatrix->array );
       
       multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
-                            leftMatrix->array, rightMatrix->array,
+                            leftMatrix->array, transRightArray,
                             resultArray );
     }
    else
@@ -211,7 +214,7 @@
    //===============  Work done -- send results back =================
 
 
-         DEBUG_MSG( dbgAppFlow, "end divide\n")
+         DEBUG( dbgAppFlow, "end divide\n")
 
          VMS__record_interval_end_in_probe( divideProbe );
          VMS__print_stats_of_all_probes();
@@ -417,6 +420,7 @@
              { coreToScheduleOnto += 1;
              }
           }
+ 
        }
     }
 
diff -r 4e14e2663af9 -r cbd8db6b8657 src/Application/SSR_Matrix_Mult/Result_Pr.c
--- a/src/Application/SSR_Matrix_Mult/Result_Pr.c	Tue Nov 02 17:00:50 2010 -0700
+++ b/src/Application/SSR_Matrix_Mult/Result_Pr.c	Thu Nov 04 17:44:15 2010 -0700
@@ -34,7 +34,7 @@
    void           *msg;
    SMPairParams   *resParams;
 
-         DEBUG("start resultPr\n")
+         DEBUG( dbgAppFlow, "start resultPr\n")
          
    params    = (ResultsParams *)_params;
    dividerPr = params->dividerPr;
diff -r 4e14e2663af9 -r cbd8db6b8657 src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h
--- a/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h	Tue Nov 02 17:00:50 2010 -0700
+++ b/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h	Thu Nov 04 17:44:15 2010 -0700
@@ -20,8 +20,6 @@
 #define copyMatrixSingleton 1
 #define copyTransposeSingleton 2
 
-#define DEBUG(msg) //printf(msg); fflush(stdin);
-
 //==============================  Structures  ==============================
 typedef struct
  {
diff -r 4e14e2663af9 -r cbd8db6b8657 src/Application/SSR_Matrix_Mult/subMatrix_Pr.c
--- a/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c	Tue Nov 02 17:00:50 2010 -0700
+++ b/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c	Thu Nov 04 17:44:15 2010 -0700
@@ -48,7 +48,7 @@
    float32        *leftArray,  *rightArray, *resArray;
    SubMatrix      *leftSubMatrix, *rightSubMatrix;
 
-         DEBUG("start sub-matrix mult\n")
+         DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID)
 
    params         = (SMPairParams *)data;
    resultPr       = params->resultPr;
@@ -63,8 +63,10 @@
    leftArray      = leftSubMatrix->array;
    rightArray     = rightSubMatrix->array;
 
-   resArray = SSR__malloc_to(leftSubMatrix->numRows * rightSubMatrix->numCols
-                             * sizeof( float32 ), animatingPr );
+   int32
+   resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32);
+   resArray = SSR__malloc_to( resSize, animatingPr );
+   memset( resArray, 0, resSize );
 
 
    int32 numResRows, numResCols, vectLength;
@@ -84,97 +86,107 @@
  }
 
 
-/*Divides into 32x32 sub-matrices, 3 of which fit into 32KB L1 cache
- * Would be nice to embed this within another level that divided into
+
+/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into
+ * the 32KB L1 cache.
+ *Would be nice to embed this within another level that divided into
  * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
  *
  *Eventually want these divisions to be automatic, using DKU pattern
- * embedded into SSR, and with VMS controlling the divisions according to
- * the cache sizes, which it knows about.
- *And, want VMS to work with language to split among main-mems, so a socket
+ * embedded into VMS and exposed in the language, and with VMS controlling the
+ * divisions according to the cache sizes, which it knows about.
+ *Also, want VMS to work with language to split among main-mems, so a socket
  * only cranks on data in its local segment of main mem
  *
+ *So, the outer two loops determine start and end points within the result
+ * matrix.  Inside that, a third loop determines the start and end points along
+ * the shared dimension of the two input matrices.
  */
 void inline
 multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
                                 int32 numResCols,
-                      float32 *leftArray, float32 *rightArray,
-                      float32 *resArray )
+                                float32 *leftArray, float32 *rightArray,
+                                float32 *resArray )
  {
    int resStride, inpStride;
-   int startRow, startCol, endRow, endCol, startVec, endVec;
+   int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
 
    resStride  = numResCols;
    inpStride  = vecLength;
 
-   for( startRow = 0; startRow < numResRows; )
+   for( resStartRow = 0; resStartRow < numResRows; )
     {
-      endRow = startRow + ROWS_IN_BLOCK;
-      if( endRow > numResRows ) endRow = numResRows;
+      resEndRow = resStartRow + ROWS_IN_BLOCK -1;  //end index is inclusive, so -1
+      if( resEndRow >= numResRows ) resEndRow = numResRows -1; //clamp last block
 
-      for( startCol = 0; startCol < numResCols; )
+      for( resStartCol = 0; resStartCol < numResCols; )
        {
-         endCol   = startCol + COLS_IN_BLOCK;
-         if( endCol > numResCols ) endCol = numResCols;
+         resEndCol   = resStartCol + COLS_IN_BLOCK -1;  //inclusive end
+         if( resEndCol >= numResCols ) resEndCol = numResCols -1; //clamp
 
          for( startVec = 0; startVec < vecLength; )
           {
-            endVec   = startVec + VEC_IN_BLOCK;
-            if( endVec > vecLength ) endVec = vecLength;
+            endVec   = startVec + VEC_IN_BLOCK -1;  //inclusive end
+            if( endVec >= vecLength ) endVec = vecLength -1; //clamp
 
                //By having the "vector" of sub-blocks in a sub-block slice
                // be marched down in inner loop, are re-using the result
-               // matrix, which stays in L1 cache -- can only re-use one of
-               // the three, so this is the most important -- avoids writing
+               // matrix, which stays in L1 cache and re-using the left sub-mat
+               // which repeats for each right sub-mat -- can only re-use two of
+               // the three, so result is the most important -- avoids writing
                // dirty blocks until those result-locations fully done
                //Row and Col is position in result matrix -- so row and vec
                // for left array, then vec and col for right array
             multiplySubBlocksTransposed( leftArray, rightArray,
                                          resArray,
-                                         startRow,  endRow,
-                                         startCol,  endCol,
+                                         resStartRow,  resEndRow,
+                                         resStartCol,  resEndCol,
                                          startVec,  endVec,
                                          resStride, inpStride );
-            startVec = endVec;
+            startVec = endVec +1;
           }
-         startCol = endCol;
+         resStartCol = resEndCol +1;
        }
-      startRow = endRow;
+      resStartRow = resEndRow +1;
     }
  }
 
 
+
 void inline
 multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
                      float32 *resArray,
-                     int startRow,  int endRow,
-                     int startCol,  int endCol,
+                     int resStartRow,  int resEndRow,
+                     int resStartCol,  int resEndCol,
                      int startVec,  int endVec,
                      int resStride, int inpStride )
  {
-   int row,    col,        vec;
+   int resRow,     resCol,        vec;
    int leftOffset, rightOffset;
    float32 result;
-   
-   for( row = startRow; row < endRow; row++ )
-    { 
-      for( col = startCol; col < endCol; col++ )
-       { 
-         leftOffset  = row * inpStride;//left & right inp strides always same
-         rightOffset = col * inpStride;// because right is transposed
+
+      //All end indexes are inclusive; res row selects the left row, res col the right row
+   for( resCol = resStartCol; resCol <= resEndCol; resCol++ )
+    {
+      for( resRow = resStartRow; resRow <= resEndRow; resRow++ )
+       {
+         leftOffset  = resRow * inpStride;//left & right inp strides always same
+         rightOffset = resCol * inpStride;// because right is transposed
          result = 0;
-         for( vec = startVec; vec < endVec; vec++ )
+         for( vec = startVec; vec <= endVec; vec++ )
           {
             result +=
                leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
           }
-         
-         resArray[ row * resStride + col ] += result;
+
+         resArray[ resRow * resStride + resCol ] += result; //adds partial sum, so resArray must start zeroed
        }
     }
  }
 
 
+
+
 /*Reuse this in divider when do the sequential multiply case
  */
 void inline
diff -r 4e14e2663af9 -r cbd8db6b8657 src/Application/main.c
--- a/src/Application/main.c	Tue Nov 02 17:00:50 2010 -0700
+++ b/src/Application/main.c	Thu Nov 04 17:44:15 2010 -0700
@@ -19,6 +19,8 @@
  { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
    ParamBag    *paramBag;
    
+   printf( "arguments: %s | %s\n", argv[0], argv[1] );
+
    paramBag = makeParamBag();
    readParamFileIntoBag( argv[1], paramBag );
    initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );