# HG changeset patch
# User Me
# Date 1290238745 -3600
# Node ID 46ceb3dd0f0a167e4fc6fdbfcc2a5042fad511f5
# Parent  133633d1c10f32526f25affc7bac6287652070f9
Nov 20: PLDI final numbers -- add debug statements, data singletons, and #ifdef'd probes

diff -r 133633d1c10f -r 46ceb3dd0f0a src/Application/VPThread__Matrix_Mult/Divide_Pr.c
--- a/src/Application/VPThread__Matrix_Mult/Divide_Pr.c	Tue Nov 16 16:02:51 2010 +0100
+++ b/src/Application/VPThread__Matrix_Mult/Divide_Pr.c	Sat Nov 20 08:39:05 2010 +0100
@@ -178,6 +178,7 @@
     {
       //====== Do parallel multiply across cores
 
+            DEBUG( dbgAppFlow, "divider: do parallel mult\n")
          //Calc the ideal size of sub-matrix and slice up the dimensions of
          // the two matrices.
          //The ideal size is the one takes the number of cycles to calculate
@@ -213,6 +214,7 @@
                                                                animatingThd );
       //======================================================================
 
+            DEBUG( dbgAppFlow, "divider: made mutexes and conds\n")
          //get results-comm lock before create results-thd, to ensure it can't
          // signal that results are available before this thd is waiting on cond
       VPThread__mutex_lock( globals->results_mutex, animatingThd );
@@ -222,11 +224,13 @@
       VPThread__mutex_lock( globals->start_mutex, animatingThd );
 
 
+            DEBUG( dbgAppFlow, "divider: make result thread\n")
       VPThread__create_thread( &gatherResults, resultsParams, animatingThd );
 
          //Now wait for results thd to signal that it has vector lock
       VPThread__cond_wait(  globals->start_cond,  animatingThd );
       VPThread__mutex_unlock( globals->start_mutex, animatingThd );//done w/lock
+            DEBUG( dbgAppFlow, "divider: make sub-matrices\n")
    
          //Make the sub-matrices, and pair them up, and make processor to
          // calc product of each pair.
@@ -266,7 +270,7 @@
    float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
    SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
    SlicingStrucCarrier *slicingStrucCarrier =
-                         VPThread__malloc(sizeof(SlicingStrucCarrier), animPr);
+                       VPThread__malloc(sizeof(SlicingStrucCarrier), animPr);
 
    int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
    float64 numPrimitiveOpsInMinWorkUnit;
@@ -417,13 +421,13 @@
             subMatrixPairParams->rightSubMatrix =
                rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
 
-            subMatrixPairParams->resultPr = resultPr fix_this;
+            //subMatrixPairParams->resultPr = resultPr;
 
                //put all pairs from the same vector onto same core
             VPThread__create_thread_with_affinity( &calcSubMatrixProduct,
-                                             subMatrixPairParams,
-                                             animatingPr,
-                                             coreToScheduleOnto );
+                                                   subMatrixPairParams,
+                                                   animatingPr,
+                                                   coreToScheduleOnto );
           }
 
             //Trying to distribute the subMatrix-vectors across the cores, so
diff -r 133633d1c10f -r 46ceb3dd0f0a src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h
--- a/src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h	Tue Nov 16 16:02:51 2010 +0100
+++ b/src/Application/VPThread__Matrix_Mult/VPThread__Matrix_Mult.h	Sat Nov 20 08:39:05 2010 +0100
@@ -47,6 +47,8 @@
    int32    origStartRow;
    int32    origStartCol;
    int32    alreadyCopied;
+   VPThdSingleton *copySingleton;
+   VPThdSingleton *copyTransSingleton;
    int32    numUsesLeft; //have update via message to avoid multiple writers
    float32 *array;  //2D, but dynamically sized, so use addr arith
  }
diff -r 133633d1c10f -r 46ceb3dd0f0a src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c
--- a/src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c	Tue Nov 16 16:02:51 2010 +0100
+++ b/src/Application/VPThread__Matrix_Mult/subMatrix_Pr.c	Sat Nov 20 08:39:05 2010 +0100
@@ -53,10 +53,12 @@
    MatrixMultGlobals *globals =(MatrixMultGlobals *)VPThread__give_globals();
 
          DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID)
+         #ifdef TURN_ON_DEBUG_PROBES
          int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx",
                                                                 animatingPr);
          VMS__record_sched_choice_into_probe( subMatrixProbe, animatingPr );
          VMS__record_interval_start_in_probe( subMatrixProbe );
+         #endif
 
    params         = (SMPairParams *)data;
    resultPr       = params->resultPr;
@@ -90,7 +92,9 @@
    //send result to result processor
    params->partialResultArray = resArray;
 
+         #ifdef TURN_ON_DEBUG_PROBES
          VMS__record_interval_end_in_probe( subMatrixProbe );
+         #endif
 
       //Send result to results thread
       //This pattern works 'cause only get lock when results thd inside wait
@@ -100,6 +104,7 @@
    VPThread__mutex_unlock( globals->vector_mutex, animatingPr );//release
    //wait-er -- cond_signal implemented such that wait-er gets lock, no other
 
+         DEBUG1(dbgAppFlow, "end sub-matrix mult: %d\n", animatingPr->procrID)
    VPThread__dissipate_thread( animatingPr );
  }
 
@@ -233,8 +238,7 @@
    Matrix *origMatrix;
    float32 *origArray, *subArray;
 
-   if( subMatrix->alreadyCopied ) return;
-   VPThread__start_singleton( copyTransposeSingleton, animPr);
+   VPThread__start_data_singleton( &(subMatrix->copyTransSingleton), animPr );
 
    origMatrix   = subMatrix->origMatrix;
    origArray    = origMatrix->array;
@@ -252,8 +256,8 @@
                   origStartRow, origStartCol, origStride,
                   subArray, origArray );
 
-   VPThread__end_singleton( copyTransposeSingleton, animPr);
-   subMatrix->alreadyCopied = TRUE; //anywhere after singleton work finished
+   VPThread__end_data_singleton( &(subMatrix->copyTransSingleton), animPr );
+
  }
 
 
@@ -267,11 +271,10 @@
       //This lets only a single VP execute the code between start and
       // end -- using start and end so that work runs outside the master.
       //If a second VP ever executes the start, it will be returned
-      // from the end-point.  If it executions start after another but before
+      // from the end-point.  If its execution starts after another but before
       // that other has finished, this one will remain suspended until the
       // other finishes, then be resumed from the end-point.
-   if( subMatrix->alreadyCopied ) return; //an optimization -- set below
-   VPThread__start_singleton( copyMatrixSingleton, animPr );
+   VPThread__start_data_singleton( &(subMatrix->copySingleton), animPr );
 
 
    origMatrix    = subMatrix->origMatrix;
@@ -299,6 +302,5 @@
        }
     }
 
-   subMatrix->alreadyCopied = TRUE; //must be after singleton work finished
-   VPThread__end_singleton( copyMatrixSingleton, animPr );
+   VPThread__end_data_singleton( &(subMatrix->copySingleton), animPr );
  }
diff -r 133633d1c10f -r 46ceb3dd0f0a src/Application/main.c
--- a/src/Application/main.c	Tue Nov 16 16:02:51 2010 +0100
+++ b/src/Application/main.c	Sat Nov 20 08:39:05 2010 +0100
@@ -9,7 +9,7 @@
 #include <stdlib.h>
 
 #include "Matrix_Mult.h"
-#include "SSR_Matrix_Mult/SSR_Matrix_Mult.h"
+#include "VPThread__Matrix_Mult/VPThread__Matrix_Mult.h"
 
 /**
  *Matrix multiply program written using VMS_HW piggy-back language