changeset 2:5311bd32c3cb

Bug-free blocked Matrix Mult in VCilk -- uses Nov 8 version of VMS
author Me
date Mon, 08 Nov 2010 03:59:51 -0800
parents 06c80cd3c303
children 12bcb9728e1c
files src/Application/VCilk__Matrix_Mult/Divide_Pr.c src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h src/Application/VCilk__Matrix_Mult/Vector_Pr.c src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c
diffstat 4 files changed, 122 insertions(+), 84 deletions(-) [+]
line diff
     1.1 --- a/src/Application/VCilk__Matrix_Mult/Divide_Pr.c	Sun Nov 07 06:55:26 2010 -0800
     1.2 +++ b/src/Application/VCilk__Matrix_Mult/Divide_Pr.c	Mon Nov 08 03:59:51 2010 -0800
     1.3 @@ -10,6 +10,7 @@
     1.4  #include "VCilk__Matrix_Mult.h"
     1.5  #include <math.h>
     1.6  #include <sys/time.h>
     1.7 +#include <string.h>
     1.8  
     1.9     //The time to compute this many result values should equal the time to
    1.10     // perform this division on a matrix of size gives that many result calcs
    1.11 @@ -126,25 +127,29 @@
    1.12     float32         *resultArray; //points to array to be put inside result
    1.13                                   // matrix
    1.14     
    1.15 -         PRINT_DEBUG("start divide\n")
    1.16 +         DEBUG( dbgAppFlow, "start divide\n")
    1.17  
    1.18           int32
    1.19           divideProbe = VMS__create_single_interval_probe( "divideProbe",
    1.20                                                            animPr );
    1.21           VMS__record_sched_choice_into_probe( divideProbe, animPr );
    1.22           VMS__record_interval_start_in_probe( divideProbe );
    1.23 -         
    1.24 +
    1.25     //=========== Setup -- make local copies of ptd-to-things, malloc, aso
    1.26 -   int32 numResRows, numResCols;
    1.27 +   int32 numResRows, numResCols, vectLength;
    1.28  
    1.29     dividerParams   = (DividerParams *)_dividerParams;
    1.30     
    1.31     leftMatrix      = dividerParams->leftMatrix;
    1.32     rightMatrix     = dividerParams->rightMatrix;
    1.33  
    1.34 +   vectLength  = leftMatrix->numCols;
    1.35     numResRows  = leftMatrix->numRows;
    1.36     numResCols  = rightMatrix->numCols;
    1.37     resultArray = dividerParams->resultMatrix->array;
    1.38 +   
    1.39 +      //zero the result array
    1.40 +   memset( resultArray, 0, numResRows * numResCols * sizeof(float32) );
    1.41  
    1.42     
    1.43     //==============  Do either sequential mult or do division ==============
    1.44 @@ -157,11 +162,21 @@
    1.45      { int32 vectLength;
    1.46  
    1.47        //====== Do sequential multiply on a single core
    1.48 -            PRINT_DEBUG("doing sequential")
    1.49 -      vectLength = leftMatrix->numCols;
    1.50 +            DEBUG( dbgAppFlow, "doing sequential")
    1.51  
    1.52 -      multiplyMatrixArrays( vectLength, numResRows, numResCols,
    1.53 -                            leftMatrix->array, rightMatrix->array,
    1.54 +         //transpose the right matrix
    1.55 +      float32 *
    1.56 +      transRightArray  = VCilk__malloc( rightMatrix->numRows *
    1.57 +                                        rightMatrix->numCols *
    1.58 +                                        sizeof(float32),       animPr );
    1.59 +
    1.60 +         //copy values from orig matrix to local
    1.61 +      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
    1.62 +                     0, 0, rightMatrix->numRows,
    1.63 +                     transRightArray, rightMatrix->array );
    1.64 +
    1.65 +      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
    1.66 +                            leftMatrix->array, transRightArray,
    1.67                              resultArray );
    1.68      }
    1.69     else
    1.70 @@ -176,17 +191,6 @@
    1.71           calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix,  animPr);
    1.72                                           
    1.73  
    1.74 -         //Make the results processor, now that know how many to wait for
    1.75 -/*
    1.76 -      resultsParams = VCilk__malloc( sizeof(ResultsParams) );
    1.77 -      resultsParams->dividerPr = animatingPr;
    1.78 -      resultsParams->numSubMatrixPairs  =
    1.79 -         slicingStrucCarrier->leftRowSlices->numVals *
    1.80 -         slicingStrucCarrier->rightColSlices->numVals *
    1.81 -         slicingStrucCarrier->vecSlices->numVals;
    1.82 -      resultsParams->numCols   = rightMatrix->numCols;
    1.83 -      resultsParams->numRows   = leftMatrix->numRows;
    1.84 -*/
    1.85  
    1.86           //Make the sub-matrices, and pair them up, then spawn processors to
    1.87           // calc product of each pair.
    1.88 @@ -204,10 +208,10 @@
    1.89        // system, by entry-point Fn, and passed in through dividerParams.
    1.90        //So, nothing to do to send results back -- they're seen by side-effect
    1.91  
    1.92 -         PRINT_DEBUG("end divide\n")
    1.93 +         DEBUG( dbgAppFlow, "end divide\n")
    1.94  
    1.95           VMS__record_interval_end_in_probe( divideProbe );
    1.96 -         VMS__print_stats_of_all_probes;
    1.97 +         VMS__print_stats_of_all_probes();
    1.98  
    1.99     VCilk__dissipate_procr( animPr );  //all procrs dissipate self at end
   1.100        //when all of the processors have dissipated, the "create seed and do
   1.101 @@ -247,7 +251,7 @@
   1.102     idealNumWorkUnits = VCilk__giveIdealNumWorkUnits();
   1.103     
   1.104     idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
   1.105 -   idealSizeOfSide2 *= 0.6; //finer granularity to help load balance
   1.106 +   idealSizeOfSide2 *= 0.8; //finer granularity to help load balance
   1.107  
   1.108     if( idealSizeOfSide1 > idealSizeOfSide2 )
   1.109        idealSizeOfSide = idealSizeOfSide1;
   1.110 @@ -287,7 +291,7 @@
   1.111  }
   1.112  
   1.113  
   1.114 -void
   1.115 +void inline
   1.116  makeSubMatricesAndSpawnAndSync( Matrix  *leftMatrix,  Matrix    *rightMatrix,
   1.117                           SlicingStrucCarrier *slicingStrucCarrier,
   1.118                           float32 *resultArray, VirtProcr *animPr )
   1.119 @@ -308,6 +312,10 @@
   1.120        createSubMatrices( vecSlices, rightColSlices,
   1.121                           rightMatrix, animPr );
   1.122  
   1.123 +   freeSlicingStruc( leftRowSlices, animPr );
   1.124 +   freeSlicingStruc( vecSlices, animPr );
   1.125 +   freeSlicingStruc( rightColSlices, animPr );
   1.126 +
   1.127     //==============  pair the sub-matrices and make processors ==============
   1.128     int32 numRowIdxs, numColIdxs, numVecIdxs;
   1.129  
   1.130 @@ -321,11 +329,6 @@
   1.131        //It syncs inside, so know all work is done now: free the sub-matrices
   1.132     freeSubMatrices( leftRowSlices, vecSlices,  leftSubMatrices, animPr );
   1.133     freeSubMatrices( vecSlices, rightColSlices, rightSubMatrices, animPr );
   1.134 -
   1.135 -
   1.136 -   freeSlicingStruc( leftRowSlices, animPr );
   1.137 -   freeSlicingStruc( vecSlices, animPr );
   1.138 -   freeSlicingStruc( rightColSlices, animPr );
   1.139   }
   1.140  
   1.141  
   1.142 @@ -336,7 +339,7 @@
   1.143   *  core
   1.144   *
   1.145   */
   1.146 -void
   1.147 +void inline
   1.148  pairUpSubMatricesAndSpawnAndSync( SubMatrix **leftSubMatrices,
   1.149                                      SubMatrix **rightSubMatrices,
   1.150                                      int32 numRowIdxs, int32 numColIdxs,
     2.1 --- a/src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h	Sun Nov 07 06:55:26 2010 -0800
     2.2 +++ b/src/Application/VCilk__Matrix_Mult/VCilk__Matrix_Mult.h	Mon Nov 08 03:59:51 2010 -0800
     2.3 @@ -10,6 +10,7 @@
     2.4  
     2.5  #include "../../VCilk_lib/VCilk.h"
     2.6  #include "../Matrix_Mult.h"
     2.7 +#include "../../VCilk_lib/VMS/VMS.h"
     2.8  
     2.9  
    2.10  //===============================  Defines  ==============================
    2.11 @@ -17,8 +18,8 @@
    2.12  #define COLS_IN_BLOCK 32
    2.13  #define VEC_IN_BLOCK  32
    2.14  
    2.15 -
    2.16 -//#define PRINT_DEBUG(msg) //printf(msg); fflush(stdin);
    2.17 +#define copyMatrixSingleton 1
    2.18 +#define copyTransposeSingleton 2
    2.19  
    2.20  //==============================  Structures  ==============================
    2.21  typedef struct
    2.22 @@ -56,7 +57,7 @@
    2.23   { VirtProcr *resultPr;
    2.24     SubMatrix *leftSubMatrix;
    2.25     SubMatrix *rightSubMatrix;
    2.26 -   float32   *resultArray;
    2.27 +   float32   *partialResultArray;
    2.28   }
    2.29  SMPairParams;
    2.30  
     3.1 --- a/src/Application/VCilk__Matrix_Mult/Vector_Pr.c	Sun Nov 07 06:55:26 2010 -0800
     3.2 +++ b/src/Application/VCilk__Matrix_Mult/Vector_Pr.c	Mon Nov 08 03:59:51 2010 -0800
     3.3 @@ -33,10 +33,12 @@
     3.4  
     3.5     vecParams = (VecParams *)_vecParams;
     3.6  
     3.7 +         #ifdef TURN_ON_DEBUG_PROBES
     3.8           int32 subMatrixVectorProbe =
     3.9                     VMS__create_single_interval_probe( "subMtxVect", animPr );
    3.10           VMS__record_sched_choice_into_probe( subMatrixVectorProbe, animPr );
    3.11           VMS__record_interval_start_in_probe( subMatrixVectorProbe );
    3.12 +         #endif
    3.13  
    3.14  
    3.15     numVecIdxs       = vecParams->numVecIdxs;
    3.16 @@ -55,7 +57,7 @@
    3.17     for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
    3.18      {
    3.19           //Make the processor for the pair of sub-matrices
    3.20 -      subMatrixPairParams  = VCilk__malloc(sizeof(SMPairParams),animPr);
    3.21 +      subMatrixPairParams  = VCilk__malloc( sizeof(SMPairParams), animPr );
    3.22        subMatrixPairParams->leftSubMatrix  =
    3.23           leftSubMatrices[ leftRowIdxOffset + vecIdx ];
    3.24  
    3.25 @@ -75,7 +77,7 @@
    3.26      {
    3.27        subMatrixPairParams = vecOfSubMatrixParams[ vecIdx ];
    3.28  
    3.29 -      accumulateResult( resultArray, subMatrixPairParams->resultArray,
    3.30 +      accumulateResult( resultArray, subMatrixPairParams->partialResultArray,
    3.31                          subMatrixPairParams->leftSubMatrix->origStartRow,
    3.32                          subMatrixPairParams->leftSubMatrix->numRows,
    3.33                          subMatrixPairParams->rightSubMatrix->origStartCol,
    3.34 @@ -87,14 +89,16 @@
    3.35           // core and re-use that array, and prevents writes from causing
    3.36           // thrashing of the cache -- as long as array big enough, the copy
    3.37           // overhead is miniscule vs the size-of-side reuse of each byte
    3.38 -      VCilk__free( subMatrixPairParams->resultArray, animPr );
    3.39 +      VCilk__free( subMatrixPairParams->partialResultArray, animPr );
    3.40        VCilk__free( subMatrixPairParams, animPr );
    3.41      }
    3.42     VCilk__free( vecOfSubMatrixParams, animPr );
    3.43     VCilk__free( vecParams, animPr );
    3.44  
    3.45 +         #ifdef TURN_ON_DEBUG_PROBES
    3.46           VMS__record_interval_end_in_probe( subMatrixVectorProbe );
    3.47 -
    3.48 +         #endif
    3.49 +         
    3.50     VCilk__dissipate_procr( animPr );
    3.51   }
    3.52  
     4.1 --- a/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c	Sun Nov 07 06:55:26 2010 -0800
     4.2 +++ b/src/Application/VCilk__Matrix_Mult/subMatrix_Pr.c	Mon Nov 08 03:59:51 2010 -0800
     4.3 @@ -6,6 +6,8 @@
     4.4   *
     4.5   */
     4.6  
     4.7 +#include <string.h>
     4.8 +
     4.9  #include "VCilk__Matrix_Mult.h"
    4.10  
    4.11  
    4.12 @@ -24,7 +26,7 @@
    4.13                       int resStride, int inpStride );
    4.14  
    4.15  void inline
    4.16 -multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols,
    4.17 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols,
    4.18                        float32 *leftArray, float32 *rightArray,
    4.19                        float32 *resArray );
    4.20  
    4.21 @@ -48,11 +50,14 @@
    4.22     float32        *leftArray,  *rightArray, *resArray;
    4.23     SubMatrix      *leftSubMatrix, *rightSubMatrix;
    4.24  
    4.25 -         PRINT_DEBUG("start sub-matrix mult\n")
    4.26 -         int32 subMatrixProbe = VMS__create_single_interval_probe( "subMtx",
    4.27 -                                                                animatingPr);
    4.28 +
    4.29 +         DEBUG( dbgAppFlow, "start sub-matrix mult\n")
    4.30 +         #ifdef TURN_ON_DEBUG_PROBES
    4.31 +         int32 subMatrixProbe = 
    4.32 +            VMS__create_single_interval_probe( "subMtx",      animatingPr);
    4.33           VMS__record_sched_choice_into_probe( subMatrixProbe, animatingPr );
    4.34           VMS__record_interval_start_in_probe( subMatrixProbe );
    4.35 +         #endif
    4.36  
    4.37     params         = (SMPairParams *)data;
    4.38  //   resultPr       = params->resultPr;
    4.39 @@ -90,14 +95,16 @@
    4.40     numResRows = leftSubMatrix->numRows;
    4.41     numResCols = rightSubMatrix->numCols;
    4.42  
    4.43 -   multiplyMatrixArrays( vectLength, numResRows, numResCols,
    4.44 +   multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
    4.45                           leftArray, rightArray,
    4.46                           resArray );
    4.47  
    4.48     //send result by side-effect
    4.49 -   params->resultArray = resArray;
    4.50 +   params->partialResultArray = resArray;
    4.51  
    4.52 +         #ifdef TURN_ON_DEBUG_PROBES
    4.53           VMS__record_interval_end_in_probe( subMatrixProbe );
    4.54 +         #endif
    4.55           
    4.56     VCilk__dissipate_procr( animatingPr );
    4.57   }
    4.58 @@ -120,9 +127,10 @@
    4.59   * of the two input matrices.
    4.60   */
    4.61  void inline
    4.62 -multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols,
    4.63 -                      float32 *leftArray, float32 *rightArray,
    4.64 -                      float32 *resArray )
    4.65 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
    4.66 +                                int32 numResCols,
    4.67 +                                float32 *leftArray, float32 *rightArray,
    4.68 +                                float32 *resArray )
    4.69   {
    4.70     int resStride, inpStride;
    4.71     int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
    4.72 @@ -133,19 +141,17 @@
    4.73     for( resStartRow = 0; resStartRow < numResRows; )
    4.74      {
    4.75        resEndRow = resStartRow + ROWS_IN_BLOCK -1;  //start at zero, so -1
    4.76 -      if( resEndRow > numResRows ) resEndRow = numResRows;
    4.77 +      if( resEndRow > numResRows ) resEndRow = numResRows -1;
    4.78  
    4.79        for( resStartCol = 0; resStartCol < numResCols; )
    4.80         {
    4.81           resEndCol   = resStartCol + COLS_IN_BLOCK -1;
    4.82 -         if( resEndCol > numResCols ) resEndCol = numResCols;
    4.83 -         resStartCol = resEndCol +1;
    4.84 -
    4.85 +         if( resEndCol > numResCols ) resEndCol = numResCols -1;
    4.86  
    4.87           for( startVec = 0; startVec < vecLength; )
    4.88            {
    4.89              endVec   = startVec + VEC_IN_BLOCK -1;
    4.90 -            if( endVec > vecLength ) endVec = vecLength;
    4.91 +            if( endVec > vecLength ) endVec = vecLength -1;
    4.92  
    4.93                 //By having the "vector" of sub-blocks in a sub-block slice
    4.94                 // be marched down in inner loop, are re-using the result
    4.95 @@ -183,12 +189,12 @@
    4.96     int leftOffset, rightOffset;
    4.97     float32 result;
    4.98  
    4.99 -      //The result row is used only for the left matrix, res col for the right
   4.100 +      //The result row is used for the left matrix, res col for the right
   4.101     for( resCol = resStartCol; resCol <= resEndCol; resCol++ )
   4.102 -    { 
   4.103 +    {
   4.104        for( resRow = resStartRow; resRow <= resEndRow; resRow++ )
   4.105 -       { 
   4.106 -         leftOffset  = resRow * inpStride;//left & right inp strides always same
   4.107 +       {
   4.108 +         leftOffset  = resRow * inpStride;//left & right inp strides same
   4.109           rightOffset = resCol * inpStride;// because right is transposed
   4.110           result = 0;
   4.111           for( vec = startVec; vec <= endVec; vec++ )
   4.112 @@ -196,40 +202,21 @@
   4.113              result +=
   4.114                 leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
   4.115            }
   4.116 -         
   4.117 +
   4.118           resArray[ resRow * resStride + resCol ] += result;
   4.119         }
   4.120      }
   4.121   }
   4.122  
   4.123 -/*Note: don't do the copy when create, because create on a different core --
   4.124 - * do the copy on the core that will do the calcs, then try to keep work on
   4.125 - * that core that reuses the array data
   4.126 +
   4.127 +/*Reuse this in divider when do the sequential multiply case
   4.128   */
   4.129  void inline
   4.130 -copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   4.131 - { int numCols, numRows, origStartRow, origStartCol, origStride, stride;
   4.132 -   Matrix *origMatrix;
   4.133 -   float32 *origArray, *subArray;
   4.134 +copyTranspose( int32 numRows, int32 numCols,
   4.135 +               int32 origStartRow, int32 origStartCol, int32 origStride,
   4.136 +               float32 *subArray, float32 *origArray )
   4.137 + { int32 stride = numRows;
   4.138  
   4.139 -   if( subMatrix->alreadyCopied ) return;
   4.140 -
   4.141 -   subMatrix->alreadyCopied = TRUE;
   4.142 -
   4.143 -   origMatrix   = subMatrix->origMatrix;
   4.144 -   origArray     = origMatrix->array;
   4.145 -   numCols      = subMatrix->numCols;
   4.146 -   numRows      = subMatrix->numRows;
   4.147 -   stride       = numRows;
   4.148 -   origStartRow = subMatrix->origStartRow;
   4.149 -   origStartCol = subMatrix->origStartCol;
   4.150 -   origStride   = origMatrix->numCols;
   4.151 -
   4.152 -      //This is free in Divide pr after all calcs are done
   4.153 -   subArray     = VCilk__malloc( numRows * numCols * sizeof(float32), animPr );
   4.154 -   subMatrix->array = subArray;
   4.155 -
   4.156 -      //copy values from orig matrix to local
   4.157     int row, col, origOffset;
   4.158     for( row = 0; row < numRows; row++ )
   4.159      {
   4.160 @@ -238,21 +225,60 @@
   4.161         {
   4.162              //transpose means swap row & col -- traverse orig matrix normally
   4.163              // but put into reversed place in local array -- means the
   4.164 -            // stride is the num rows now, so col * numRows + row
   4.165 +            // stride is the numRows now, so col * numRows + row
   4.166           subArray[ col * stride + row ]  =  origArray[ origOffset + col ];
   4.167 -       }      
   4.168 +       }
   4.169      }
   4.170   }
   4.171  
   4.172  void inline
   4.173 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   4.174 + { int numCols, numRows, origStartRow, origStartCol, origStride, stride;
   4.175 +   Matrix *origMatrix;
   4.176 +   float32 *origArray, *subArray;
   4.177 +
   4.178 +   if( subMatrix->alreadyCopied ) return;
   4.179 +   VCilk__start_singleton(copyMatrixSingleton,&&EndOfTranspSingleton,animPr);
   4.180 +
   4.181 +   origMatrix   = subMatrix->origMatrix;
   4.182 +   origArray    = origMatrix->array;
   4.183 +   numCols      = subMatrix->numCols;
   4.184 +   numRows      = subMatrix->numRows;
   4.185 +   origStartRow = subMatrix->origStartRow;
   4.186 +   origStartCol = subMatrix->origStartCol;
   4.187 +   origStride   = origMatrix->numCols;
   4.188 +
   4.189 +   subArray     = VCilk__malloc( numRows * numCols *sizeof(float32),animPr);
   4.190 +   subMatrix->array = subArray;
   4.191 +
   4.192 +      //copy values from orig matrix to local
   4.193 +   copyTranspose( numRows, numCols,
   4.194 +                  origStartRow, origStartCol, origStride,
   4.195 +                  subArray, origArray );
   4.196 +
   4.197 +   subMatrix->alreadyCopied = TRUE; //must be last thing before label
   4.198 +   EndOfTranspSingleton:
   4.199 +   return;
   4.200 + }
   4.201 +
   4.202 +
   4.203 +void inline
   4.204  copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   4.205   { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
   4.206     Matrix *origMatrix;
   4.207     float32 *origArray, *subArray;
   4.208  
   4.209 +
   4.210 +      //This lets only a single VP execute the code between start and
   4.211 +      // end -- using start and end so that work runs outside the master.
   4.212 +      //Inside, if a second VP ever executes the start, it will be returned
   4.213 +      // from the end-point.
   4.214 +      //Note, for non-GCC, can add a second SSR call at the end, and inside
   4.215 +      // that one, look at the stack at the return addr & save that in an
   4.216 +      // array indexed by singletonID
   4.217     if( subMatrix->alreadyCopied ) return;
   4.218 +   VCilk__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton,animPr);
   4.219  
   4.220 -   subMatrix->alreadyCopied = TRUE;
   4.221  
   4.222     origMatrix    = subMatrix->origMatrix;
   4.223     origArray     = origMatrix->array;
   4.224 @@ -260,14 +286,14 @@
   4.225     numRows       = subMatrix->numRows;
   4.226     origStartRow  = subMatrix->origStartRow;
   4.227     origStartCol  = subMatrix->origStartCol;
   4.228 -   stride        = numCols;
   4.229     origStride    = origMatrix->numCols;
   4.230  
   4.231 -      //This is freed in Divide pr after all calcs are done
   4.232     subArray      = VCilk__malloc( numRows * numCols *sizeof(float32),animPr);
   4.233     subMatrix->array = subArray;
   4.234  
   4.235        //copy values from orig matrix to local
   4.236 +   stride        = numCols;
   4.237 +
   4.238     int row, col, offset, origOffset;
   4.239     for( row = 0; row < numRows; row++ )
   4.240      {
   4.241 @@ -278,4 +304,8 @@
   4.242           subArray[ offset + col ]  =  origArray[ origOffset + col ];
   4.243         }
   4.244      }
   4.245 +
   4.246 +   subMatrix->alreadyCopied = TRUE; //must be last thing before label
   4.247 +   EndOfCopySingleton:
   4.248 +   return;
   4.249   }