Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > DKU > DKU__Matrix_Mult__Bench

changeset 0:d138e0acf9a0 tip
Initial add of standard DKU matrix mult code -- to be modified
author: Sean Halle <seanhalle@yahoo.com>
date: Sun, 26 Aug 2012 03:04:50 -0700
files: BLIS_CONSTANTS.h DKU_INST_MM/Bundling_Quad.c DKU_INST_MM/Communicators.c DKU_INST_MM/DKU_INST_MM.h DKU_INST_MM/DKU_INST_MM_init.c DKU_INST_MM/Divide.c DKU_INST_MM/Kernel.c DKU_INST_MM/MakeRootDKUPieces.c DKU_INST_MM/Maker_and_Freer.c DKU_INST_MM/SerialKernel.c DKU_INST_MM/Undivide.c Matrix_Mult.c Matrix_Mult.h Read_Input_Matrix.c main.c
diffstat: 15 files changed, 2422 insertions(+), 0 deletions(-) [+]
[-]

BLIS_CONSTANTS.h 20

DKU_INST_MM/Bundling_Quad.c 349

DKU_INST_MM/Communicators.c 435

DKU_INST_MM/DKU_INST_MM.h 54

DKU_INST_MM/DKU_INST_MM_init.c 58

DKU_INST_MM/Divide.c 295

DKU_INST_MM/Kernel.c 80

DKU_INST_MM/MakeRootDKUPieces.c 53

DKU_INST_MM/Maker_and_Freer.c 73

DKU_INST_MM/SerialKernel.c 21

DKU_INST_MM/Undivide.c 34

Matrix_Mult.c 221

Matrix_Mult.h 81

Read_Input_Matrix.c 604

main.c 44 BLIS_CONSTANTS.h 20 DKU_INST_MM/Bundling_Quad.c 349 DKU_INST_MM/Communicators.c 435 DKU_INST_MM/DKU_INST_MM.h 54 DKU_INST_MM/DKU_INST_MM_init.c 58 DKU_INST_MM/Divide.c 295 DKU_INST_MM/Kernel.c 80 DKU_INST_MM/MakeRootDKUPieces.c 53 DKU_INST_MM/Maker_and_Freer.c 73 DKU_INST_MM/SerialKernel.c 21 DKU_INST_MM/Undivide.c 34 Matrix_Mult.c 221 Matrix_Mult.h 81 Read_Input_Matrix.c 604 main.c 44
BLIS_CONSTANTS.h 20
DKU_INST_MM/Bundling_Quad.c 349
DKU_INST_MM/Communicators.c 435
DKU_INST_MM/DKU_INST_MM.h 54
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/BLIS_CONSTANTS.h	Sun Aug 26 03:04:50 2012 -0700
     1.3 @@ -0,0 +1,20 @@
     1.4 +/* 
     1.5 + * File:   BLIS_CONSTANTS.h
     1.6 + * Author: SeanHalle@yahoo.com
     1.7 + *
     1.8 + * Created on October 27, 2009, 6:19 AM
     1.9 + */
    1.10 +
    1.11 +#ifndef _BLIS_CONSTANTS_H
    1.12 +#define _BLIS_CONSTANTS_H
    1.13 +
    1.14 +   //DKU Instance ID enum.  Must start at 1.
    1.15 +   //The directory, header, and init file for each instance of the DKU
    1.16 +   // pattern is named the same as the enum.
    1.17 +enum
    1.18 + { DKU_INST_MM = 1   //ID of the matrix-multiply (MM) DKU instance
    1.19 + };
    1.20 +
    1.21 +
    1.22 +#endif	/* _BLIS_CONSTANTS_H */
    1.23 +

     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/DKU_INST_MM/Bundling_Quad.c	Sun Aug 26 03:04:50 2012 -0700
     2.3 @@ -0,0 +1,349 @@
     2.4 +/*
     2.5 + *  Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org
     2.6 + *  Licensed under GNU General Public License version 2
     2.7 + *
     2.8 + *
     2.9 + * Author: SeanHalle@yahoo.com
    2.10 + *
    2.11 + */
    2.12 +
    2.13 +#include "malloc.h"
    2.14 +#include "DKU_INST_MM.h"
    2.15 +#include "../../BLIS/DKU/DKU_common/DKU.h"
    2.16 +
    2.17 +//Positions in the bundle
    2.18 +enum
    2.19 + { szPos = 0,             //int32 slot holding total bundle size, in bytes
    2.20 +   numLRPos,              //slot holding number of rows in left sub-block
    2.21 +   numLCPos,              //slot holding number of cols in left sub-block
    2.22 +   numRRPos,              //slot holding number of rows in right sub-block
    2.23 +   numRCPos,              //slot holding number of cols in right sub-block
    2.24 +   LBMatrixPos,           //slot where the left matrix data begins
    2.25 +   numPos = LBMatrixPos   //count of int32 header slots before the data
    2.26 + };
    2.27 +
    2.28 +//============================ Bundling Quad ===============================
    2.29 +
    2.30 +/* This is the set of four bundling functions:
    2.31 + *
    2.32 + * bundleInputs     -- takes a DKU piece and returns an array of data that
    2.33 + *                     contains all the information the Kernel will need to
    2.34 + *                     process that DKU piece
    2.35 + *
    2.36 + * unbundleInputs   -- takes the array returned by bundleInputs and turns it
    2.37 + *                     into a DKU piece that can be given to the Kernel
    2.38 + *
    2.39 + * bundleResults    -- takes a DKU piece that has finished going through the
    2.40 + *                     Kernel and places all result information into an array
    2.41 + *
    2.42 + * unbundleResults  -- takes the output from bundleResults plus the original
    2.43 + *                     DKU piece whose inputs were bundled to produce the
    2.44 + *                     results, and modifies the state of the original DKU
    2.45 + *                     piece to be as if it had gone through the kernel.
    2.46 + *
    2.47 + * The bundling quad ("quad" because there are four bundling functions).. the
    2.48 + *  bundling quad is only used for distributed memory hardware.  When used,
    2.49 + *  they can be thought of as operating in two separate memories.
    2.50 + * The bundleInputs and unbundleResults operate in one memory, where the full
    2.51 + *  original data structure is.
    2.52 + * The unbundleInputs and bundleResults operate in the second memory, where
    2.53 + *  the data for one piece gets sent to.
    2.54 + *
    2.55 + *
    2.56 + * Call sequence:
    2.57 + *
    2.58 + * Call bundleInputs in memory space of original data, giving it a DKU piece.
    2.59 + * It returns a pointer to a byte array.  (First int32 in array is its size)
    2.60 + * The byte array is sent to remote memory to be processed.
    2.61 + * In the remote memory, the byte array is received.
    2.62 + * The unbundleInputs function is called on it, which creates a new DKUPiece
    2.63 + *  in the heap of the remote memory, and creates on the heap of the remote
    2.64 + *  memory any data structures embedded within the DKUPiece.
    2.65 + * The unbundleInputs function may re-use portions of the byte array it is
    2.66 + *  given, so the run-time in the remote memory must perform buffer
    2.67 + *  management for its communications appropriately.
    2.68 + * The byte array that the unbundleInputs function is given must have been
    2.69 + *  allocated on the heap within the same malloc state as the unbundleInputs
    2.70 + *  and bundleResults functions are linked to.
    2.71 + * The returned DKUPiece is given to the Kernel function to process.
    2.72 + * The DKUPiece is then given to the bundleResults function, which returns
    2.73 + *  a pointer to a byte array.  The bundleResults may simply return the
    2.74 + *  pointer to the same received inputBundle, so the communication and memory
    2.75 + *  management in the remote memory must behave accordingly.  The
    2.76 + *  bundleResults function will free any memory allocated on the heap by the
    2.77 + *  unbundleInputs function.  The only memory remaining on the heap when the
    2.78 + *  bundleResults function completes is the byte array returned by it.  This
    2.79 + *  is the reason that the original inputBundle given to unbundleInputs must
    2.80 + *  be allocated on the same heap that bundleResults frees from:
    2.81 + *  bundleResults might call free on that inputBundle, so that call must
    2.82 + *  modify the correct malloc state.
    2.83 + * The remote memory sends the resultBundle to the original memory, then
    2.84 + *  frees the resultBundle.
    2.85 + * The original memory pairs the result bundle with the DKUPiece that the
    2.86 + *  corresponding inputBundle was made from.
    2.87 + * The original memory calls unbundleResults, giving it the resultsBundle
    2.88 + *  and the corresponding DKUPiece that remained in the original memory.
    2.89 + * The unbundleResults modifies the original memory such that it is identical
    2.90 + *  to the state it would be in if the Kernel were called on the DKUPiece
    2.91 + *  in the original memory.
    2.92 + * The original memory is responsible for freeing the inputBundle that was
    2.93 + *  made by bundleInputs, but unbundleResults will free the resultsBundle
    2.94 + *  when it is done with it.  This allows unbundleResults to simply assign
    2.95 + *  pointers to portions of the resultsBundle, rather than copying, if that
    2.96 + *  is appropriate for the Kernel.
    2.97 + */
    2.98 +//===========================================================================
    2.99 +
   2.100 +/* Layout:
   2.101 + * sizeOfBundle
   2.102 + * numLeftRows
   2.103 + * numLeftCols
   2.104 + * numRightRows
   2.105 + * numRightCols
   2.106 + * <left matrix data>
   2.107 + * <right matrix data>
   2.108 + *
   2.109 + * calculate the sizes from the numbers of rows and cols of each, use the
   2.110 + * size of the left matrix to calc start addr of right matrix..  in the
   2.111 + * remote memory, just set pointers to the matrix locations in the bundles
   2.112 + */
   2.113 +//TODO: carefully step through bundling quad -- check sizes and addr on 9x9
   2.114 +void* bundleInputs_MM( DKUPiece* piece )
   2.115 + {    //Builds the input bundle for one piece: a header of numPos int32
   2.116 +      // values (bundle size in bytes, then the row and col counts of the
   2.117 +      // left and right sub-blocks) followed by the left sub-block's data
   2.118 +      // and then the right sub-block's data, each packed row by row.
   2.119 +      //Only the sub-blocks named by the piece's start/end indexes are
   2.120 +      // copied, not the whole matrices.
   2.121 +      //Returns the bundle obtained from BLIS_DKU__makeInputBundle.
   2.122 +   MatrixProdPiece *mmPiece = (MatrixProdPiece *)piece->appSpecificPiece;
   2.123 +   float32 *srcLeft  = mmPiece->leftMatrix->matrix;
   2.124 +   float32 *srcRight = mmPiece->rightMatrix->matrix;
   2.125 +
   2.126 +      //Where each sub-block starts inside its full matrix
   2.127 +   int32 lRow0 = mmPiece->leftStartRow;
   2.128 +   int32 lCol0 = mmPiece->leftStartCol;
   2.129 +   int32 rRow0 = mmPiece->rightStartRow;
   2.130 +   int32 rCol0 = mmPiece->rightStartCol;
   2.131 +
   2.132 +      //End indexes are inclusive, hence the "+ 1" to get a count
   2.133 +   int32 lRows = mmPiece->leftEndRow  - lRow0 + 1;
   2.134 +   int32 lCols = mmPiece->leftEndCol  - lCol0 + 1;
   2.135 +   int32 rRows = mmPiece->rightEndRow - rRow0 + 1;
   2.136 +   int32 rCols = mmPiece->rightEndCol - rCol0 + 1;
   2.137 +
   2.138 +      //Sizes in bytes: each sub-block, then header plus both sub-blocks
   2.139 +   int32 leftBytes   = sizeof( float32 ) * lRows * lCols;
   2.140 +   int32 rightBytes  = sizeof( float32 ) * rRows * rCols;
   2.141 +   int32 bundleBytes = numPos * sizeof( int32 ) + leftBytes + rightBytes;
   2.142 +
   2.143 +      //Get the bundle, then record the size and the counts in its header
   2.144 +   void  *bundle = BLIS_DKU__makeInputBundle( bundleBytes );
   2.145 +   int32 *header = (int32 *)bundle;
   2.146 +
   2.147 +   header[ szPos ]    = bundleBytes;
   2.148 +   header[ numLRPos ] = lRows;
   2.149 +   header[ numLCPos ] = lCols;
   2.150 +   header[ numRRPos ] = rRows;
   2.151 +   header[ numRCPos ] = rCols;
   2.152 +
   2.153 +      //Start and end indexes are not sent: the remote side sets them
   2.154 +      // from the counts alone.
   2.155 +
   2.156 +      //Matrix data follows the header, left sub-block first
   2.157 +   float32 *dstLeft  = (float32 *)( header + LBMatrixPos );
   2.158 +   float32 *dstRight = dstLeft + leftBytes/sizeof(float32);
   2.159 +
   2.160 +      //Copy the left sub-block, packing its rows contiguously.
   2.161 +      // Offsets are element counts; pointer arithmetic scales them.
   2.162 +   int32 row, col;
   2.163 +   int32 lStride = mmPiece->leftMatrix->numCols;
   2.164 +   float32 *dst  = dstLeft;
   2.165 +   for( row = 0; row < lRows; row++ )
   2.166 +    { float32 *srcRow = srcLeft + (lRow0 + row) * lStride + lCol0;
   2.167 +      for( col = 0; col < lCols; col++ )
   2.168 +       { *(dst++) = srcRow[ col ];
   2.169 +       }
   2.170 +    }
   2.171 +
   2.172 +      //Separate loop for the right sub-block because its shape may differ
   2.173 +   int32 rStride = mmPiece->rightMatrix->numCols;
   2.174 +   dst = dstRight;
   2.175 +   for( row = 0; row < rRows; row++ )
   2.176 +    { float32 *srcRow = srcRight + (rRow0 + row) * rStride + rCol0;
   2.177 +      for( col = 0; col < rCols; col++ )
   2.178 +       { *(dst++) = srcRow[ col ];
   2.179 +       }
   2.180 +    }
   2.181 +   return bundle;
   2.182 + }
   2.183 +
   2.184 +/*Leave all the data in bundle, just assign pointers to it.
   2.185 + * Create a DKUPiece data structure, then fill in the sizes and pointers.
   2.186 + *
   2.187 + *This is app code, but need to make it easy for specialization.
   2.188 + *On machines like the Cell, the code for this function will be copied
   2.189 + * over to a separate file, along with any other DKU functions needed in
   2.190 + * the remote memory.
   2.191 + *
   2.192 + *The scheduler in remote memory is responsible for making space for the
   2.193 + * input bundle, and for freeing it (if needed) after the result bundle has
   2.194 + * been sent back.
   2.195 + *
   2.196 + *Model is that use an override of malloc that puts everything malloc'd from
   2.197 + * unbundleInputs calls and from bundleResults call into a buffer in remote
   2.198 + * mem.  This entire buffer is freed after the return of the result bundle is
   2.199 + * complete.
   2.200 + */
   2.201 + void
   2.202 +unbundleInputs_MM( void *bundle, DKUPiece *piece )
   2.203 + {    //bundle: input bundle made by bundleInputs_MM (int32 header + data)
   2.204 +      //piece:  DKUPiece to fill in; it gets a new MatrixProdPiece whose
   2.205 +      //        left and right matrices point into bundle (nothing copied)
   2.206 +   int32 numLeftRows, numLeftCols, numRightRows, numRightCols;
   2.207 +   int32 sizeOfLeftMatrix, sizeOfResultMatrix;
   2.208 +   float32 *leftBundleMatrix, *rightBundleMatrix;
   2.209 +   MatrixProdPiece *prodPiece;
   2.210 +
   2.211 +   numLeftRows  = *((int32 *)bundle + numLRPos);
   2.212 +   numLeftCols  = *((int32 *)bundle + numLCPos);
   2.213 +   numRightRows = *((int32 *)bundle + numRRPos);
   2.214 +   numRightCols = *((int32 *)bundle + numRCPos);
   2.215 +
   2.216 +   sizeOfLeftMatrix   = sizeof( float32 ) * numLeftRows  * numLeftCols;
   2.217 +   sizeOfResultMatrix = sizeof( float32 ) * numLeftRows  * numRightCols;
   2.218 +
   2.219 +      //matrix data starts right after the int32 header, left matrix first
   2.220 +   leftBundleMatrix  = (float32 *)  ((int32 *)bundle + LBMatrixPos);
   2.221 +   rightBundleMatrix = leftBundleMatrix + sizeOfLeftMatrix/sizeof(float32);
   2.222 +
   2.223 +//ARCH: check, for Cell, what's involved with re-defining malloc that appears
   2.224 +// inside DKUPiece maker and app spec piece maker.. can make it buffer-alloc?
   2.225 +   
   2.226 +      //that indicate how much stuff is created automatically inside
   2.227 +      //IE, does this make produce the sched data, in bundling quad?
   2.228 +   prodPiece = DKU__makeMatrixProdPiece_Flat( piece );
   2.229 +   piece->appSpecificPiece = prodPiece;
   2.230 +   
   2.231 +   prodPiece->leftMatrix   = 
   2.232 +            DKU__makeMatrix_Flat( numLeftRows,  numLeftCols,  piece );
   2.233 +   prodPiece->leftMatrix->matrix   = leftBundleMatrix;
   2.234 +   
   2.235 +   prodPiece->rightMatrix  =
   2.236 +            DKU__makeMatrix_Flat( numRightRows, numRightCols, piece );
   2.237 +   prodPiece->rightMatrix->matrix  = rightBundleMatrix;
   2.238 +   
   2.239 +   prodPiece->resultMatrix = 
   2.240 +            DKU__makeMatrix_Flat( numLeftRows,  numRightCols, piece );
   2.241 +      //The result matrix is malloc'd, so it's not inside the input bundle,
   2.242 +      // so, to avoid copies when make the result bundle, make it now, then
   2.243 +      // put into the DKUPiece the pos of result matrix in result bundle.
   2.244 +   void *resultBundle =
   2.245 +        BLIS_DKU__malloc_toPiece( sizeOfResultMatrix + sizeof(int32), piece);
   2.246 +   *((int32 *)resultBundle) = sizeOfResultMatrix + sizeof(int32);
   2.247 +      //skip over the "size" at the start of the result bundle
   2.248 +   prodPiece->resultMatrix->matrix = (float32 *)((int32 *)resultBundle + 1);
   2.249 +   
   2.250 +      //now, fill in the iteration bounds so that the kernel processes
   2.251 +      // the entirety of both matrices.
   2.252 +   prodPiece->leftStartRow  = 0;
   2.253 +   prodPiece->leftEndRow    = numLeftRows  - 1; // "- 1" 'cause start at 0
   2.254 +   prodPiece->leftStartCol  = 0;
   2.255 +   prodPiece->leftEndCol    = numLeftCols  - 1;
   2.256 +   
   2.257 +   prodPiece->rightStartRow = 0;
   2.258 +   prodPiece->rightEndRow   = numRightRows - 1;
   2.259 +   prodPiece->rightStartCol = 0;
   2.260 +   prodPiece->rightEndCol   = numRightCols - 1;
   2.261 +   
   2.262 +   prodPiece->prodStartRow  = 0;
   2.263 +   prodPiece->prodEndRow    = numLeftRows  - 1;
   2.264 +   prodPiece->prodStartCol  = 0;
   2.265 +   prodPiece->prodEndCol    = numRightCols - 1;
   2.266 + }
   2.267 +
   2.268 +/*
   2.269 + *Model is that use an override of malloc in remote mem that puts everything
   2.270 + * malloc'd from unbundleInputs calls and from bundleResults call into a
   2.271 + * buffer in remote mem.  The entire buffer is freed after the send of the
   2.272 + * return result bundle is complete.
   2.273 + *
   2.274 + *The application only has to know that it does not perform free on any of
   2.275 + * the inputBundles, nor on any of the resultBundles, in either local or
   2.276 + * remote memory.
   2.277 + *The application also must create new DKUPiece s in the bundling quad plus
   2.278 + * in the Kernel (and all calls rooted at the Kernel) by using
   2.279 + * BLIS_DKU__makeDKUPiece
   2.280 + *Finally, the application must create app-specific pieces
   2.281 + *So anything malloc'd inside bundleResults is still inside the same buffer
   2.282 + * used by unbundleInputs.
   2.283 + */
   2.284 +//ARCH: what about just give unbundleInputs and bundleResults an  "align"
   2.285 +//  operator that's HW-supplied.
   2.286 +// First element of bundle is size, then "0" terminated list of offsets to
   2.287 +// alignable-chunks, then the alignable chunks start.  Alignment happens
   2.288 +// during bundling.  HW also supplies a checker to see if aligned bundle is
   2.289 +// too big. (add a "revert divide" so can do a new divide to get smaller
   2.290 +// pieces, or something.. )
   2.291 + void *
   2.292 +bundleResults_MM( DKUPiece *piece, void *inputBundle )
   2.293 + {    //The results bundle already exists: unbundleInputs_MM allocated it
   2.294 +      // and pointed the result matrix's array just past the bundle's
   2.295 +      // one-int32 size field.  So inputBundle is not touched here; the
   2.296 +      // bundle is found by stepping back from the result array.
   2.297 +   MatrixProdPiece *mmPiece = (MatrixProdPiece *)piece->appSpecificPiece;
   2.298 +   int8 *resultArrBytes     = (int8 *)mmPiece->resultMatrix->matrix;
   2.299 +
   2.300 +      //TODO: figure out soln for alignment of result matrix when it's inside
   2.301 +      // input-bundle
   2.302 +
   2.303 +      //Bundle start is sizeof(int32) bytes before the result array
   2.304 +   void *bundleStart = resultArrBytes - sizeof(int32);
   2.305 +
   2.306 +   return bundleStart;
   2.307 + }
   2.308 +
   2.309 +/*  The DKU standard says that the scheduler guarantees that the same
   2.310 + *   DKUPiece that created an input bundle will be called to unbundle
   2.311 + *   the corresponding results bundle.
   2.312 + *  This means that the unbundleResults method is called on the original
   2.313 + *   piece, that still has the position within the result matrix where
   2.314 + *   this piece's results should go.
   2.315 + *  So, copy the values in the incoming result matrix to the correct
   2.316 + *   sub-block of the "original" result matrix
   2.317 + */
   2.318 + void
   2.319 +unbundleResults_MM( void * resultBundle, DKUPiece *origPiece )
   2.320 + {    //resultBundle: int32 size field followed by the packed result block
   2.321 +      //origPiece:    piece the input bundle was made from; its prod
   2.322 +      //              start/end indexes say where the block is copied to
   2.323 +   float32 *bundMatrixArr, *resMatrixArr, *bundleReadPt, *resultInsertPt;
   2.324 +   MatrixProdPiece *matProd;
   2.325 +   Matrix *resultMatrix;
   2.326 +   int32 resMatNumCols;
   2.327 +   int32 prodStartRow, prodEndRow, prodStartCol, prodEndCol, r, c;
   2.328 +
   2.329 +   bundMatrixArr = (float32 *) ((int32 *)resultBundle + 1);
   2.330 +
   2.331 +   matProd = (MatrixProdPiece *) origPiece->appSpecificPiece;
   2.332 +   resultMatrix  = matProd->resultMatrix;
   2.333 +   resMatrixArr  = resultMatrix->matrix;
   2.334 +   resMatNumCols = resultMatrix->numCols;
   2.335 +
   2.336 +   prodStartRow  = matProd->prodStartRow;
   2.337 +   prodEndRow    = matProd->prodEndRow;
   2.338 +   prodStartCol  = matProd->prodStartCol;
   2.339 +   prodEndCol    = matProd->prodEndCol;
   2.340 +
   2.341 +      //copy the bundle's block into the full result matrix.  End indexes
   2.342 +      // are inclusive, so rows use "<=" like cols, else last row is lost
   2.343 +   bundleReadPt = bundMatrixArr;
   2.344 +   for( r = prodStartRow; r <= prodEndRow; r++ )
   2.345 +    { resultInsertPt = resMatrixArr + r * resMatNumCols + prodStartCol;
   2.346 +      for( c = prodStartCol; c <= prodEndCol; c++ )
   2.347 +       { *(resultInsertPt++) = *(bundleReadPt++);
   2.348 +       }
   2.349 +    }
   2.350 + }
   2.351 +
   2.352 +

     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/DKU_INST_MM/Communicators.c	Sun Aug 26 03:04:50 2012 -0700
     3.3 @@ -0,0 +1,435 @@
     3.4 +/*
     3.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     3.6 + *  Licensed under GNU General Public License version 2
     3.7 + *
     3.8 + * Author: seanhalle@yahoo.com
     3.9 + */
    3.10 +
    3.11 +#include "DKU_INST_MM.h"
    3.12 +
    3.13 +/*Communicators are used by the scheduler to send data from one piece to
    3.14 + * another.  The kernel specifies when the communication is to take place
    3.15 + * by calling either a send communicator or a receive communicator.
    3.16 + *The communicators know what data to send and where to send it by reading
    3.17 + * information out of the DKUPiece structure.  The information about which
    3.18 + * other piece to send to and what data is placed into the piece by the
    3.19 + * DKUPieceMaker and the Divider.
    3.20 + *This is how dependencies among data pieces are encoded and how the
    3.21 + * scheduler is informed of them.
    3.22 + */
    3.23 +
    3.24 +/*In Deblocking, the dependency pattern is a 45 degree diagonal.  A given
    3.25 + * macro block receives information from the macro block above it and the
    3.26 + * macro block to its left.
    3.27 + *This must be told to the scheduler, so that it can order the execution of
    3.28 + * pieces appropriately.
    3.29 + *The application programmer has chosen to divide each
    3.30 + * screen frame into diagonals.
    3.31 + */
    3.32 +
    3.33 +/*Okay, so going with the plan that match the scheduler form to the
    3.34 + * application form.  Expose explicitly that different applications have
    3.35 + * characteristics that can be taken advantage of by scheduler impl to make
    3.36 + * more efficient scheduler.
    3.37 + *So, take advantage of flexibility of BLIS interface philosophy..  make an
    3.38 + * interface that is tuned to particular characteristic of application to
    3.39 + * allow efficient scheduler impl that takes advantage.
    3.40 + *In this case, it is fact that have a pattern of dependencies.  Make the
    3.41 + * dependencies first-class entities in the interface.  Make a primitive for
    3.42 + * expressing the dependencies, and one for embodying the dependencies.
    3.43 + *
    3.44 + *The embodiment will be an array.  The DKUPieceMaker will create the array
    3.45 + * and populate it for the initial pieces it makes..  something like that..
    3.46 + * The DKUPieceMaker also returns a pointer to the initially-free piece.
    3.47 + * (Note that there's only ever one initially free piece.  If there were two
    3.48 + *  then they wouldn't depend on each other, and so could be combined into
    3.49 + *  a single piece..  or?  In some cases would require separate
    3.50 + *  "combination" data-struc..  so, maybe an array of initially free)
    3.51 + *Each division places an array in the parent, with one position for each
    3.52 + * sub-piece.  The position holds the count, of the number of propendents,
    3.53 + * for the sub-piece in the corresponding position in the sub-piece array.
    3.54 + *
    3.55 + *The fellow sub-pieces of a parent typically won't have any dependencies
    3.56 + * on each other..  the dependencies are to sub-pieces of other parents.
    3.57 + *So the DKUPieceMaker will create the largest pieces that can be sub-divided
    3.58 + * freely, and also create an array with the propendent count for each of
    3.59 + * those pieces.
    3.60 + *The undivider is where propendent counts are updated.  The Undivider is
    3.61 + * what frees DKUPieces to be scheduled.  That process is inside the
    3.62 + * scheduler implementation.
    3.63 + *The divider is written to know about the arrays of propendent counts of the
    3.64 + * other DKUPieces.  So is the Undivider.  The divider looks at the arrays
    3.65 + * that already exist and populates the new array for the currently being
    3.66 + * divided parent accordingly.  It also puts in information for the Undivider
    3.67 + * to properly update the counts in the arrays when a piece finishes.
    3.68 + *Big question: can Albert perform a static analysis that can understand
    3.69 + * general code in the divider and undivider, given that it knows explicitly
    3.70 + * what the arrays are used for, and just has to look at the code that
    3.71 + * creates, populates, and updates the arrays..  The idea being that the
    3.72 + * polyhedral model can learn the dependencies this way.
    3.73 + *
    3.74 + *Each dependency has an associated communicator.  A communicator is what
    3.75 + * performs transport of the propendent-generated-state to the receiving
    3.76 + * dependent.  A communicator and a dependency arrow are the same thing.
    3.77 + *In shared memory, the communicator is normally not used.  Only in distr
    3.78 + * memory is it invoked to bundle up the propendent data and carry it to the
    3.79 + * dependent.
    3.80 + *The Undivider tells the scheduler each time a completed propendent has
    3.81 + * updated the count of a given dependent.  This will trigger the
    3.82 + * scheduler to fire the communicator, if it has been implemented that way.
    3.83 + *This lets the scheduler, for example, send data around before the 
    3.84 + * propendent data is available.  So, it gets better overlap of communication
    3.85 + * and computation.
    3.86 + *
    3.87 + *So, this scheme so far covers the case of "2D" parallelism in Deblocking.
    3.88 + * But it doesn't yet cover simulation where the communication happens in
    3.89 + * the middle of a loop nest.
    3.90 + *For that, still have the same communicator, which is still one-to-one with
    3.91 + * a dependency in the data.
    3.92 + *But now, the Kernel invokes the communicator, when the propendent has
    3.93 + * finished producing data..  By only calling in the propendent, causality is
    3.94 + * always enforced, with no extra mechanism required in the scheduler.
    3.95 + *The Kernel also calls the communicator in the dependent, in the position
    3.96 + * that data must be received before continuing on.
    3.97 + *This way, the scheduler is free to implement the timing of communication
    3.98 + * in many different ways.  The communicator explicitly copies data to a
    3.99 + * separate communication-area that is made in the propendent DKUPiece
   3.100 + * by the Divider or DKUPieceMaker during creation.
   3.101 + *It returns when the copy is done, handing the scheduler a pointer to the
   3.102 + * data.  The scheduler then handles moving the data to the dependent piece
   3.103 + * (or not, on shared memory).  The receiving end of the communicator
   3.104 + * accepts a pointer to the data.  The Kernel is written to access data
   3.105 + * through that pointer.
   3.106 + *
   3.107 + *Wanting to allow shared memory to NOT perform the copy, just pass a pointer
   3.108 + * to the area of data that the dependent needs, and have the Kernel access
   3.109 + * the data in the appropriate way (according to data in the DKUPiece pointed
   3.110 + * to)..  thing is, still need something to perform the copy in the distr.
   3.111 + * memory case.
   3.112 + *So, seeing two different sending communicators, and two different receiving
   3.113 + * communicators.  For distr mem, the sender does a copy into area in
   3.114 + * sender's DKUPiece, then returns pointer to that copy (it's a bundle, with
   3.115 + * first location being a uint32 with size of bundle)..  or maybe have a
   3.116 + * pre-defined element in the DKUPiece.  The receiver is implemented as part
   3.117 + * of the scheduler..  it returns a pointer to the Dependent calling Kernel.
   3.118 + *Shared mem communicators, the sender is implemented by the scheduler, as
   3.119 + * well as the receiver.  The propendent Kernel simply passes the DKUPiece
   3.120 + * to the sender.
   3.121 + *
   3.122 + *Right.. so, three out of the four are implemented by the scheduler.. so
   3.123 + * just make the fourth be an optional do-hickie.  The scheduler implements
   3.124 + * both the send and receive calls, and the application provides a
   3.125 + * communication bundler.  : D
   3.126 + *The receiver still has two different cases: in one it gets a bundle, in
   3.127 + * the other it gets a pointer to a DKUPiece.  No point in unbundling, just
   3.128 + * so the Kernel can do the gather operation again.  So, the Kernel will have
   3.129 + * to have a different version, one for shared, second for distr.. yuck.
   3.130 + *
   3.131 + *Q: starting to have different "kinds" of DKUPiece now..  how going to
   3.132 + * handle backwards-and-forwards compatibility?
   3.133 + *
   3.134 + *For shared mem,
   3.135 + */
   3.136 +
   3.137 +void * bundleComm_BarnesHut_110( DKUPiece piece )
   3.138 + {
   3.139 +   //this copies data out of the piece for the "110" going direction
   3.140 +   //All communication bundlers have the same prototype: void * foo(DKUPiece)
   3.141 +   //The bundlers are registered with the scheduler for this DKU instance
   3.142 +   // via the BLIS_DKU__set_commBundler_ForID( &foo, commTypeID, DKU_Inst_ID)
   3.143 +   //Which two pieces communicate is set by the propendent and dependent
   3.144 +   // calls to communicate, which happen in the Kernel.  The PieceMaker and
   3.145 +   // Divider know which pieces communicate to which at what points in the
   3.146 +   // Kernel.  So, they insert into the DKUPiece data structure the ID of
   3.147 +   // the piece they send to/receive from.  When the Kernel reaches the 
   3.148 +   // communication point, it takes the propendent/dependent pieceID from 
   3.149 +   // the DKUPiece and passes that to the communicate() call (along with the
   3.150 +   // commTypeID, which is fixed for each call that appears in the Kernel).
   3.151 +   //In other words, the Kernel, Divider, and PieceMaker are the only ones
   3.152 +   // that agree among themselves on what a particular commTypeID means.  For
   3.153 +   // example, for graphs, there is only one type ID, because the process of
   3.154 +   // copying is the same no matter which direction..  but for a mesh, the
   3.155 +   // copy is different for the top, bottom, and two edges..
   3.156 +
   3.157 +   /*There is the basic issue of exposing in the Kernel code whether running
   3.158 +    * on shared memory or distr memory.
   3.159 +    *Could either have a fixed Kernel that adapts, or have two Kernels that
   3.160 +    * are chosen among by the scheduler.
   3.161 +    *If the Kernel is fixed, have only a few choices:
   3.162 +    * Kernel always thinks it's taking data from a DKUPiece, or
   3.163 +    * Kernel always takes data via a fixed interface-call.
   3.164 +    *Not sure how to hide if Kernel always takes from a DKUPiece.. and extra
   3.165 +    * work in remote memory to re-create DKUPiece structure, for only reason
   3.166 +    * to make interface nice (for programmer).
   3.167 +    *With the fixed interface, could provide two adaptors: one for shared
   3.168 +    * mem that gathers from a DKUPiece, other for a bundle.  Then scheduler
   3.169 +    * picks, behind the interface, which adaptor.
   3.170 +    *How would that look to the Kernel?  Something like "update", and the
   3.171 +    * local memory changes..  but that only works if there's a fixed data-
   3.172 +    * struc in local mem to take from as the Kernel computation progresses.
   3.173 +    *IE, have an outer loop that the communication is in, and inner loops
   3.174 +    * that use the result of communication.  For shared mem, want that data-
   3.175 +    * struc to be the appSpecificPiece of another DKUPiece, and locations in
   3.176 +    * it will be read (or written) inside the inner loops.  This means the
   3.177 +    * scheduler has to figure out an ordering to run the Kernels that
   3.178 +    * respects the dependencies.  This is aided by the fact that the Kernel
   3.179 +    * has a separate propendent and dependent call that states the order.
   3.180 +    *There will actually be some Kernels that can't be run this way: the
   3.181 +    * pattern of dependencies has no sequential soln, a copy must be done.
   3.182 +    *Starting to think perhaps it's best to just always do a copy..
   3.183 +    *
   3.184 +    *Other choice is two different versions of the Kernel.. one that reads
   3.185 +    * from a DKUPiece, the other that reads from a commBundle.
   3.186 +    *Or a hybrid Kernel that can read from both, using a flag
   3.187 +    * in the DKUPiece that tells the Kernel whether to receive a
   3.188 +    * commBundle or whether to take from the normal data in the propendent.
   3.189 +    *The two Kernel version is the most run-time efficient. The main drawback
   3.190 +    * is that the app developer has to make identical changes in two
   3.191 +    * different places.. any time they change one of the Kernel-copies they
   3.192 +    * have to change the other too..  also, it feels weird having two
   3.193 +    * different Kernels..  don't like it..
   3.194 +    *Kinda like the hybrid approach, it has the second least run-time
   3.195 +    * overhead, just an IF statement that will be well predicted each time
   3.196 +    * it accesses data during calculations..  Could even use a #define here,
   3.197 +    * so the IF is known at compile time to always go one direction.  The
   3.198 +    * specialization module would change the #define to: SharedMem or to
   3.199 +    * DistrMem.  Or, make the if a #ifdef in the source..  but that's ugly
   3.200 +    * to read.
   3.201 +    *Something to percolate..  whatever is chosen, the problem is solved,
   3.202 +    * just a matter of tradeoffs at this point..
   3.203 +    */
   3.204 + }
   3.205 +
   3.206 +void sampleKernel( DKUPiece *piece )
   3.207 + {    //Sketch only: outer, inner, N and fromProp are not declared here
   3.208 +   initializePropendents(); //time-zero state for first getFromPropendent
   3.209 +   for( outer = 0; outer < N; outer++ )
   3.210 +    { fromProp = getFromPropendent( piece->propPieceIDs[ NORTH_PROPENDENT ]);
   3.211 +      for( inner = 0; inner < N; inner++ )
   3.212 +       {    //This Kernel knows that data is an array because it is written
   3.213 +            // by the app-programmer.  It knows that fromProp is also an
   3.214 +            // array because the app-programmer wrote the commBundler.
   3.215 +         piece->appSpecPiece->data[ inner ] += fromProp[ inner ];
   3.216 +       }
   3.217 +      sendToDependent( piece->depPieceIDs[ SOUTH_DEPENDENT ] );
   3.218 +    }
   3.219 +   finalizeDependents(); //handles data from the last sendToDependent call
   3.220 + }
   3.221 +
   3.222 +/*In the divider, one has a single piece, which has communicators at its
   3.223 + * boundaries already.
   3.224 + *The divider cuts up the piece, and so knows which sub-pieces talk to 
   3.225 + * which others, because it just made them all from the same parent piece.
   3.226 + *The tricky part is connecting the hierarchy.
   3.227 + *The pieces the parent communicated with are on the order of the size of
   3.228 + * the parent.  Those pieces will likely have been divided as well..  now
   3.229 + * the question is how to hand-off from the parent to the appropriate
   3.230 + * sub-pieces.
   3.231 + *One way to do it is to have a Kernel running for each piece in the
   3.232 + * hierarchy, but of two kinds: hierarchy-Kernels and "normal" Kernels.  The
   3.233 + * hierarchy Kernels only do communication: they break an in-coming request
   3.234 + * among their sub-pieces, then gather the responses back together.
   3.235 + *This is inefficient in one sense: direct communication between sub-pieces
   3.236 + * of different Kernels would be optimal.
   3.237 + *However, such a hierarchy will
   3.238 + * only exist when a physical hierarchy exists among machines.  In that
   3.239 + * case, the extra scatter-gather work done by the hierarchy pieces might
   3.240 + * even be more efficient because it makes fewer, larger, communications
   3.241 + * between the larger physical entities.  This is more likely to ameliorate
   3.242 + * the loss from the larger latency in communication at the larger physical
   3.243 + * division.
   3.244 + *Okay, so going with that, for now.  When get details, may see some patterns
   3.245 + * for how to do direct communication among sub-pieces..  (but not holding
   3.246 + * my breath because each of two parents can be divided into a different
   3.247 + * number of sub-pieces.. so there is no one-to-one between sub-pieces on
   3.248 + * the edge of one parent and sub-pieces on a communicating edge of another
   3.249 + * parent.)
   3.250 + *
   3.251 + *This sample is for a big linked list.  Each piece is just a number of nodes
   3.252 + * of the list.
   3.253 + *When the Divider makes sub-pieces, it knows which ones communicate across
   3.254 + * the boundaries of the parent (because the application programmer wrote
   3.255 + * the Divider and placed the code in it that handles the boundaries of the
   3.256 + * parent).
   3.257 + *Patterns for how to do this part:
   3.258 + *Could have the Divider create some new structure that it places in the
   3.259 + * parent that holds the state for the CommKernel.  Put into that structure
   3.260 + * all the sub-piece-IDs that will communicate with it.  Into the sub-pieces
   3.261 + * put a commID not of the parent piece, but of the parent's CommKernel.  So
   3.262 + * a piece gains a second ID when it is divided.  It keeps its original
   3.263 + * commID and uses that to communicate with siblings, while it uses the
   3.264 + * subCommID to communicate with sub-pieces.
   3.265 + *Just going with that one idea for the moment..
   3.266 + *So, for the Linked List example, the Divider will set all the sub-pieces
   3.267 + * to talk to each other, and set the end-pieces to talk to the subCommID of
   3.268 + * the parent.
   3.269 + *The parent, meanwhile, will have a normal commID with which it talks to
   3.270 + * its siblings.
   3.271 + *When the normal Kernel modifies the linkings in the list, it has to check
   3.272 + * if one of the elements modified is a boundary element, and if so if the
   3.273 + * new arrangement has changed which element is the boundary.
   3.274 + *For example, an element is added to one end of a sub-piece.  The added
   3.275 + * element has its link-ptr set to a value that indicates it's a boundary
   3.276 + * element, for example NULL or -1.  The old boundary element's pointer is
   3.277 + * set to the new boundary element.  The commID is taken from the old
   3.278 + * boundary element and put into the new boundary element.  Done.
   3.279 + *The CommKernel will take advantage of the fact that it knows it's in a
   3.280 + * hierarchy..  it will keep an array of the values at the boundaries of its
   3.281 + * siblings.  (Note that one DKUPiece is disallowed from holding a pointer
   3.282 + * to another DKUPiece)..  the values may get out-of-sync, so they will be
   3.283 + * fixed-up when detected.
   3.284 + *Here's the pattern: when CommKernel gets comm from sub-piece, it looks to
   3.285 + * see what kind of comm it is.. if it's a "here's a value to insert", then
   3.286 + * it checks the end-values of its siblings and picks the sibling it belongs
   3.287 + * on.  It sends to that sibling (requires unsolicited reception mechanism,
   3.288 + * such as the signals method for re-divide)..  includes its own boundary
   3.289 + * values (free to piggy-back)..  that updates the receiver's view of the
   3.290 + * sibling-piece's boundary values.
   3.291 + *
   3.292 + *When CommKernel receives a value-to-insert,
   3.293 + * it checks the values at its two boundary elements.  If the value is
   3.294 + * between, then it accepts it.  If not, it responds to the sender, telling
   3.295 + * the sender what the receiver's actual boundary values are.  It then sends
   3.296 + * the value to the piece it believes it should go to (received the sender's
   3.297 + * boundary values along with the insert value, so it's certain it won't
   3.298 + * send back to where it came from..)  Eventually the value will land at
   3.299 + * the correct CommKernel.
   3.300 + *Seeing pieces given to the CommKernel of the top-level parent.  It then
   3.301 + * hands them out among its children, and from there to next level of 
   3.302 + * children, and so on.  Notice that siblings talk directly to each other,
   3.303 + * they don't go up to the parent then back down.
   3.304 + *The value of this abstract data type will be handling an enormous number
   3.305 + * of inserts, deletes, and lookups.
   3.306 + *
   3.307 + */
   3.308 +void sampleDivider( DKUPiece *piece, int numPieces )
   3.309 + { DKUPiece * newPiece; //Sketch: subPieces, pieceIdx, someValues undeclared
   3.310 +      //First sub-piece, so it is a boundary of the parent
   3.311 +      //Figure out if parent has a sibling, or if it is natural boundary
   3.312 +   if(numPieces < 2) return; //leave sub-pieces empty if only 1 sub-piece
   3.313 +
   3.314 +   newPiece = makeASubPiece( someValues );
   3.315 +   if( BLIS_DKU__isNaturalBoundary( piece->propPieceIDs[ LEFT_PROPENDENT ]) )
   3.316 +    { newPiece->propPieceIDs[ LEFT_PROPENDENT ] = 
   3.317 +       BLIS_DKU__makeNaturalBoundaryPiece( piece, newPiece, DKU_INST_ID );
   3.318 +    }
   3.319 +   else //parent has a sibling, so communicate with parent's CommKernel
   3.320 +    { newPiece->propPieceIDs[ LEFT_PROPENDENT ] =
   3.321 +       BLIS_DKU__giveCommKernelAsPropendent( piece ); //scheduler returns the
   3.322 +       //thing that it has implemented as "addr" of CommKernel of piece
   3.323 +    }
   3.324 +   for( pieceIdx = 1; pieceIdx < numPieces - 1; pieceIdx++ )
   3.325 +    { newPiece = makeASubPiece( someValues ); //interior piece: link both ways
   3.326 +      newPiece->propPieceIDs[ LEFT_PROPENDENT ] = (subPieces[pieceIdx - 1]);
   3.327 +      subPieces[pieceIdx-1]->propPieceIDs[ RIGHT_PROPENDENT ] = (newPiece);
   3.328 +    }
   3.329 +   newPiece = makeASubPiece( someValues ); //last sub-piece, idx numPieces-1
   3.330 +   newPiece->propPieceIDs[ LEFT_PROPENDENT ] = (subPieces[numPieces - 2]);
   3.331 +   if( BLIS_DKU__isNaturalBoundary( piece->propPieceIDs[ RIGHT_PROPENDENT ]))
   3.332 +    { newPiece->propPieceIDs[ RIGHT_PROPENDENT ] =
   3.333 +       BLIS_DKU__makeNaturalBoundaryPiece( piece, newPiece, DKU_INST_ID );
   3.334 +    }
   3.335 +   else //parent has a sibling, so communicate with parent's CommKernel
   3.336 +    { newPiece->propPieceIDs[ RIGHT_PROPENDENT ] =
   3.337 +       BLIS_DKU__giveCommKernelAsPropendent( piece ); //scheduler returns the
   3.338 +       //thing that it has implemented as "addr" of CommKernel of piece
   3.339 +      //Might have to make left propendent and right propendent be different
   3.340 +      // addresses.. in which case include a directionID in the call to
   3.341 +      // the scheduler asking for the address.
   3.342 +      //Might be something about matching dependents with propendents.. not
   3.343 +      // sure how that's going to play out..
   3.344 +      //There's "pull from propendent" and "push to dependent" which are
   3.345 +      // interrupt-model..  then there's propendent sends and dependent
   3.346 +      // receives.
   3.347 +
   3.348 +    }
   3.349 + }
   3.350 +/*Some ill-fits in here..  need to do real app, with real dependencies and
   3.351 + * real comm in it..
   3.352 + *
   3.353 + *Thinking perhaps give each piece a "name" struc..  the makeDKUPiece()
   3.354 + * creates for DKUInstances that have a communicator registered..  (will
   3.355 + * add some overhead to makeDKUPiece if have to do DKU-instance lookup)
   3.356 + *WANT DKU-INSTANCE LOOKUP TO BE TRANSFORMED BY SPECIALIZATION
   3.357 + * in practice, want the lookup to be performed statically, to eliminate the
   3.358 + * dynamic overhead..  can do this if define semantics of DKUInstance to
   3.359 + * be one-time and one-time only setting of functions to an instance..  have
   3.360 + * to check this statically in the BLIS-rule-checker that's run in the
   3.361 + * makefile in the sequential development environment.
   3.362 + *
   3.363 + *Then, the Divider simply copies the name-struc out of target pieces and
   3.364 + * puts it into source pieces.
   3.365 + *This allows, for example, making commKernels that have the name-strucs of
   3.366 + * all the siblings..  so, for example, the commKernel can calculate which
   3.367 + * of the siblings it should send to..  in fact, each of the Kernels can
   3.368 + * calculate which of the siblings it should send to..  re-use the same
   3.369 + * CommKernel in the leaf-kernels as well as in the parent..
   3.370 + *The only complication to re-using CommKernel on all levels is the
   3.371 + * boundaries..  just make the boundary be the CommKernel in the parent,
   3.372 + * or a natural boundary..
   3.373 + */
   3.374 +
   3.375 +
   3.376 +
   3.377 +/*The purpose of a CommKernel is to gather communications from all the sub-
   3.378 + * pieces, put them together, and turn them into communications from the
   3.379 + * parent piece.  And vice-versa: receive comm, break it up, hand a portion
   3.380 + * to each sub-piece.
   3.381 + *This sample is for a big linked list.
   3.382 + */
   3.383 +void sampleCommKernel( DKUPiece *piece )
   3.384 + {
   3.385 +      //Empty body: no CommKernel behavior is implemented in this sample yet
   3.386 + }
   3.387 +
   3.388 +/*Q to Albert: just how bad is this?  Will a static tool be able to
   3.389 + * understand the linkages, given that they are a fixed pattern of linkages
   3.390 + * that are signalled by the fixed function-names..?
   3.391 + *For example, could a static tool understand that
   3.392 + * piece->propPieceIDs[ NORTH_PROPENDENT ] means a DKUPiece, which is set
   3.393 + * in the Divider..  and understand that
   3.394 + * piece->appSpecPiece->data[ inner ] = fromProp[ inner ];  means the
   3.395 + * actual use of the data gotten from the DKUPiece..  then use that
   3.396 + * understanding to replace the calls with a schedule plus direct access
   3.397 + * to the propendent DKUPiece instead of the intermediate access first to
   3.398 + * the commBundle.
   3.399 + *Thinking this is how I want to go: use a commBundle on both shared and
   3.400 + * distr memory.  Reason is that it makes scheduling simple.  And, there's
   3.401 + * enough semantic information provided by the function calls that a static
   3.402 + * tool should be able to perform a transform that does direct access to the
   3.403 + * DKUPiece by migrating the gather code into the Kernel innards.  The
   3.404 + * index into the commBundle is what identifies the gather statement that
   3.405 + * put the data there.  That gather statement is inserted in place of the
   3.406 + * access to the array.
   3.407 + */
   3.408 +
   3.409 +/*Have to provide boundary-propendents, so pieces that are on the edge of
   3.410 + * a mesh, for example, access the boundary-propendent, with whatever
   3.411 + * boundary data.
   3.412 + *Also, have to provide a "time zero" something or other..  The thing it
   3.413 + * solves is that one Kernel produces data at the end of the inner loop,
   3.414 + * with a sendToDependent() call..  that data is received BEFORE the inner
   3.415 + * loop with a getFromPropendent call.  This is normally fine, as it encodes
   3.416 + * the time-skew, implicitly pipelining.  The only problems are at time
   3.417 + * zero and possibly the very last time-step.
   3.418 + *So, the initializePropendents() sets some default time-zero state that
   3.419 + * will be gotten by the first getFromPropendent() call.
   3.420 + *When the Kernel is done, the finalizeDependents() call handles whatever
   3.421 + * might need to be done with the data from the last sendToDependents()
   3.422 + * call that was performed in the Kernel.
   3.423 + */
   3.424 +
   3.425 +/*Q: How implement comm inside scheduler?
   3.426 + * syntax proposal for performing comm from inside the Kernel is:
   3.427 + * getFromPropendent( piece->propPieceIDs[ NORTH_PROPENDENT ]);
   3.428 + *
   3.429 + * So.. what is stored in that array?  That is the thing that tells the
   3.430 + *  scheduler how to perform the communication..
   3.431 + *Thinking leave it opaque..  it's a void *..  The scheduler fills it in
   3.432 + * itself inside of BLIS_DKU__makeDKUPiece()..  the divider gets this thing
   3.433 + * out of the piece and places it into the propPieceIDs array.
   3.434 + *
   3.435 + *This means that ask the scheduler to create a CommKernel, and ask it to
   3.436 + * give the thingie need to communicate with it.
   3.437 + *Also ask the scheduler to create a natural-boundary piece, and
   3.438 + */

     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/DKU_INST_MM/DKU_INST_MM.h	Sun Aug 26 03:04:50 2012 -0700
     4.3 @@ -0,0 +1,54 @@
     4.4 +/*
     4.5 + *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
     4.6 + *  Licensed under GNU General Public License version 2
     4.7 + *
     4.8 + * Author: seanhalle@yahoo.com
     4.9 + *
    4.10 + */
    4.11 +
    4.12 +#include "../../BLIS/DKU/DKU_common/DKU.h"
    4.13 +
    4.14 +#include "../BLIS_CONSTANTS.h"
    4.15 +
    4.16 +#ifndef _DKU_INST_MM_H
    4.17 +#define	_DKU_INST_MM_H
    4.18 +
    4.19 +
    4.20 +//===========================================================================
    4.21 +//              Declarations of the Standard DKU functions
    4.22 +
    4.23 +void DKU_INST_MM_Init(); //tells BLIS the pointers to the DKU functions
    4.24 +   //The names below are the ones DKU_INST_MM_Init() registers with BLIS
    4.25 +Divide   divide_MM;
    4.26 +Kernel   kernel_MM;
    4.27 +Undivide unDividePiece_MM;
    4.28 +
    4.29 +MakeRootDKUPieces makeRootDKUPieces_MM;
    4.30 +
    4.31 +SerialKernel      serialKernel_MM;
    4.32 +
    4.33 +BundleInputs      bundleInputs_MM;
    4.34 +UnbundleInputs    unbundleInputs_MM;
    4.35 +BundleResults     bundleResults_MM;
    4.36 +UnbundleResults   unbundleResults_MM;
    4.37 +
    4.38 +//===========================================================================
    4.39 +//   Declarations using Matrix / MatrixProdPiece (presumably Matrix_Mult.h)
    4.40 +#include "../Matrix_Mult.h"
    4.41 +
    4.42 + void
    4.43 +inner_Kernel( MatrixProdPiece *matrixProdPiece );
    4.44 +
    4.45 + Matrix *
    4.46 +DKU__makeMatrix_Flat( int32 numRows, int32 numCols, DKUPiece *owner );
    4.47 +
    4.48 + MatrixProdPiece *
    4.49 +DKU__makeMatrixProdPiece_Flat( DKUPiece *owner );
    4.50 +
    4.51 + MatrixProdPiece *
    4.52 +DKU__makeMatrixProdPiece_FromMatrixProdPiece
    4.53 +                           ( MatrixProdPiece *parentPiece, DKUPiece *owner );
    4.54 +
    4.55 +
    4.56 +#endif	/* _DKU_INST_MM_H */
    4.57 +

     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/DKU_INST_MM/DKU_INST_MM_init.c	Sun Aug 26 03:04:50 2012 -0700
     5.3 @@ -0,0 +1,58 @@
     5.4 +/*
     5.5 + *  Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org
     5.6 + *  Licensed under GNU General Public License version 2
     5.7 + *
     5.8 + *
     5.9 + * Author: SeanHalle@yahoo.com
    5.10 + */
    5.11 +
    5.12 +#include "DKU_INST_MM.h"
    5.13 +
    5.14 +
    5.15 +/* 
    5.16 + * Part of the BLIS DKU standard.  Each DKU instance is placed in its own
    5.17 + *  directory, a child of the Application directory.  The directory is named
    5.18 + *  the same as the "#define" constant used to identify the DKU instance.
    5.19 + * The directory has two standard files: "#def const".h and
    5.20 + *  "#def const"_init.c.
    5.21 + *
    5.22 + * This is the "#def const"_init.c file..
    5.23 + * It initializes the scheduler for this DKU instance.
    5.24 + * And it tells that scheduler the pointers to all of the DKU functions for
    5.25 + *  this instance.
    5.26 + *
    5.27 + */
    5.28 +void DKU_INST_MM_Init( )
    5.29 + {    // always start init of a DKU instance with this function
    5.30 +      // this isn't modal, so no worries about order of calling these
    5.31 +      // functions.  Can intertwine init of several instances without harm.
    5.32 +   BLIS_DKU__start_DKU_Instance_Init( DKU_INST_MM );
    5.33 +      // register the divide, kernel, and undivide functions
    5.34 +   BLIS_DKU__set_Divide_To_ForID( &divide_MM, DKU_INST_MM );
    5.35 +   BLIS_DKU__set_Kernel_To_ForID( &kernel_MM, DKU_INST_MM );
    5.36 +   BLIS_DKU__set_Undivide_To_ForID( &unDividePiece_MM, DKU_INST_MM );
    5.37 +
    5.38 +   //TODO: figure out make and free -- right depth?  Where used?
    5.39 +   //  make sure don't accidentally free the shared Matrix strucs..
    5.40 +   //  trace where used, see if can find an easy-to-see pattern for how
    5.41 +   //  to do make and free -- DKUPiece automated stuff and whatnot..
    5.42 +   //  maybe just let the App see DKUPiece, and make make and free be done
    5.43 +   //  explicitly in the pieceMaker, divider, undivider, etc.. no automation
    5.44 +   BLIS_DKU__set_MakeRootDKUPieces_To_ForID(&makeRootDKUPieces_MM,DKU_INST_MM);
    5.45 +//   BLIS_DKU__set_FreeAppSpecSubPiece_To_ForID(&freeMatrixProdPiece_Flat,DKU_INST_MM);
    5.46 +//   BLIS_DKU__set_FreeAppSpecRootPiece_To_ForID(&freeMatrixProdPiece_Flat,DKU_INST_MM);
    5.47 +//   BLIS_DKU__set_MakeAppSpecPiece_To_ForID(&makeMatrixProdPiece_Using,DKU_INST_MM);
    5.48 +   BLIS_DKU__set_SerialKernel_To_ForID( &serialKernel_MM, DKU_INST_MM );
    5.49 +      // register the bundlers and unbundlers for inputs and for results
    5.50 +   BLIS_DKU__set_BundleInputs_To_ForID( &bundleInputs_MM, DKU_INST_MM );
    5.51 +   BLIS_DKU__set_UnbundleInputs_To_ForID(&unbundleInputs_MM,DKU_INST_MM);
    5.52 +   BLIS_DKU__set_BundleResults_To_ForID( &bundleResults_MM, DKU_INST_MM);
    5.53 +   BLIS_DKU__set_UnbundleResults_To_ForID( &unbundleResults_MM,DKU_INST_MM );
    5.54 +   
    5.55 +      //Now that have generic, provide HW-specific overrides
    5.56 +/*   BLIS_DKU__set_A_Divide_Override_To_ForID()
    5.57 +   BLIS_DKU__set_A_DKU_Override_To_ForID( )
    5.58 +*/
    5.59 +      // always end init of a DKU instance with this function
    5.60 +   BLIS_DKU__end_DKU_Instance_Init( DKU_INST_MM );
    5.61 + }

     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/DKU_INST_MM/Divide.c	Sun Aug 26 03:04:50 2012 -0700
     6.3 @@ -0,0 +1,295 @@
     6.4 +/*
     6.5 + *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
     6.6 + *  Licensed under GNU General Public License version 2
     6.7 + *
     6.8 + * Author: seanhalle@yahoo.com
     6.9 + *
    6.10 + */
    6.11 +
    6.12 +#include <math.h>
    6.13 +
    6.14 +#include "DKU_INST_MM.h"
    6.15 +
    6.16 +
    6.17 +typedef
    6.18 +struct
    6.19 + {
    6.20 +   //inputs (set by divide_MM before the slicing function is called)
    6.21 +   int numLeftRows;        //rows in parent's left range: end - start + 1
    6.22 +   int numRightCols;       //cols in parent's right range: end - start + 1
    6.23 +   int numSubPiecesToMake; //requested number of sub-pieces
    6.24 +   DKUPiece *parPiece;     //the DKUPiece being divided
    6.25 +   
    6.26 +   //outputs
    6.27 +   int numLeftSlices;      //presumably num entries used in leftSliceStartRows
    6.28 +   int numRightSlices;     //presumably num entries used in rightSliceStartCols
    6.29 +   int finalNumSubPieces;  //divide_MM copies this into parPiece->numSubPieces
    6.30 +
    6.31 +   //outputs then inputs (filled by the slicing function; presumably read
    6.32 +   int *leftSliceStartRows;  // back by pairSlicesAndMakeProdPieces -- confirm)
    6.33 +   int *rightSliceStartCols; //both arrays are allocated in SLICE_SCOPE
    6.34 + }
    6.35 +SliceStruc;
    6.36 +
    6.37 +void updateSlicingStrucWithSlicingOfInputMatrices(SliceStruc *slicingStruc);
    6.38 +
    6.39 +void pairSlicesAndMakeProdPieces(SliceStruc *slicingStruc, DKUPiece *oPiece);
    6.40 +MatrixProdPiece* makeChildMatrixProdPieceFrom( MatrixProdPiece *parentPiece);
    6.41 +
    6.42 +
    6.43 +/* The Divider
    6.44 + * Divides the iteration space..
    6.45 + * Matrix Product Piece is a piece of iteration
    6.46 + *  space..  it is the iterations in which one piece of the left
    6.47 + *  matrix is multiplied by one piece of the right matrix.
    6.48 + * Thus, to make product pieces, both the left and right matrices
    6.49 + *  have to be sliced, then all pairings of those slices taken.
    6.50 + *  Each pairing of matrix slices is one product piece.
    6.51 + *
    6.52 + *  So, for example, dividing a product-piece into, say, 4 pieces means
    6.53 + *   dividing the iteration space of the parent piece into 4,
    6.54 + *   which means slicing the left matrix by 2, and the right matrix by 2,
    6.55 + *   and taking all pairings of a left slice times a right slice.
    6.56 + *
    6.57 + * Do division this way:
    6.58 + *  count total number of result cells to be produced.  Divide that by the
    6.59 + *  number of sub-pieces to make.  That gives the target number of result
    6.60 + *  cells in each sub-piece.
    6.61 + * Take the square root of the number of target cells, that's the target
    6.62 + *  number of cols, and target num rows in the result of each sub-piece.
    6.63 + * See how many sqroots fit horizontally, and how many fit vertically
    6.64 + *  take the ceiling of the larger, floor of the smaller.
    6.65 + * That is the number of rows, and number of cols, in each sub-piece.
    6.66 + * Multiply the two to find the number of sub-pieces that will be made.
    6.67 + * If not larger than requested number of sub-pieces, take the ceiling of
    6.68 + *  both num rows and num cols (also covers case of floor is zero).
    6.69 + * Divide num rows in left matrix by target num rows, then use residuals
    6.70 + *  alg. to assign ranges of left matrix rows to each slice.
    6.71 + * Divide num cols in right by target num cols, then use residuals alg.
    6.72 + *  to assign ranges of right matrix cols to each slice.
    6.73 + * Then take all pairings of left-slices with right-slices and make a
    6.74 + *  sub-piece from each pairing
    6.75 + *
    6.76 + * @param numSubPiecesToMake requested number of sub-pieces
    6.77 + */
    6.78 +#define SLICE_SCOPE 1
    6.79 +void divide_MM( DKUPiece* oPiece, int numSubPiecesToMake )
    6.80 + { MatrixProdPiece *matrixProdPiece; //app-specific payload of oPiece
    6.81 +   Matrix *leftMatrix, *rightMatrix;
    6.82 +
    6.83 +   matrixProdPiece = (MatrixProdPiece *) oPiece->appSpecificPiece;
    6.84 +
    6.85 +   int leftStartRow, leftEndRow, rightStartCol, rightEndCol;
    6.86 +   leftStartRow  = matrixProdPiece->leftStartRow;
    6.87 +   leftEndRow    = matrixProdPiece->leftEndRow;
    6.88 +   rightStartCol = matrixProdPiece->rightStartCol;
    6.89 +   rightEndCol   = matrixProdPiece->rightEndCol;
    6.90 +   leftMatrix    = matrixProdPiece->leftMatrix;
    6.91 +   rightMatrix   = matrixProdPiece->rightMatrix;
    6.92 +
    6.93 +   int numLeftRows, numRightCols;
    6.94 +   numLeftRows  = leftEndRow  - leftStartRow  + 1; //+1: end row is inclusive
    6.95 +   numRightCols = rightEndCol - rightStartCol + 1; //+1: end col is inclusive
    6.96 +//==============================================================
    6.97 +
    6.98 +      //need numPieces to be divided into two integers
    6.99 +      // if each result piece is a square, that gives the best surface
   6.100 +      // area to volume == the least communication in distr memory, and
   6.101 +      // the least multiple access by different threads in shared memory.
   6.102 +      //So, just have to figure out the sizes of the horizontal and the
   6.103 +      // vertical.
   6.104 +
   6.105 +      // make sure it's possible to make more than 1 sub-piece
   6.106 +   if( numSubPiecesToMake < 2 || ( numLeftRows < 2 && numRightCols < 2 ) )
   6.107 +    { oPiece->numSubPieces = 0;  //scheduler must check for case of 0 pieces
   6.108 +      return;                    //nothing allocated in SLICE_SCOPE yet
   6.109 +    }
   6.110 +
   6.111 +   //TODO: scope is the func called within (divide_MM)
   6.112 +   SliceStruc *slicingStruc = 
   6.113 +                   BLIS_DKU__malloc_scope( sizeof(SliceStruc), SLICE_SCOPE );
   6.114 +   slicingStruc->numLeftRows        = numLeftRows;
   6.115 +   slicingStruc->numRightCols       = numRightCols;
   6.116 +   slicingStruc->numSubPiecesToMake = numSubPiecesToMake;
   6.117 +   slicingStruc->parPiece           = oPiece;
   6.118 +      // fills in the slice counts and the slice start arrays
   6.119 +   updateSlicingStrucWithSlicingOfInputMatrices( slicingStruc );
   6.120 +   
   6.121 +   oPiece->numSubPieces = slicingStruc->finalNumSubPieces;
   6.122 +
   6.123 +      // pair up the slices and make the final DKUPieces (skip if none)
   6.124 +   if( oPiece->numSubPieces != 0 )
   6.125 +      pairSlicesAndMakeProdPieces( slicingStruc, oPiece );
   6.126 +      // always release the scope, including when no sub-pieces resulted
   6.127 +   BLIS_DKU__free_scope( SLICE_SCOPE );
   6.128 +   return;
   6.129 + }
   6.130 +
   6.131 +void updateSlicingStrucWithSlicingOfInputMatrices(SliceStruc *slicingStruc)
   6.132 + {
   6.133 +//=======================  Setup  ========================
   6.134 +   int   numLeftRows        = slicingStruc->numLeftRows;
   6.135 +   int   numRightCols       = slicingStruc->numRightCols;
   6.136 +   int   numSubPiecesToMake = slicingStruc->numSubPiecesToMake;
   6.137 +   
   6.138 +   int   numResultCells;
   6.139 +   float targetNumCellsPerPiece, targetDimOfResultPiece;
   6.140 +   float idealNumLeftSlices,     targetNumLeftSlices;
   6.141 +   float idealNumRightSlices,    targetNumRightSlices;
   6.142 +   float targetRowsPerLeftSlice, targetColsPerRightSlice;
   6.143 +   
   6.144 +   float rowAccumulator, colAccumulator;
   6.145 +   int   sliceIdx,       rowIncrement, colIncrement;
   6.146 +   int   numLeftSlices,  numRightSlices;
   6.147 +   
   6.148 +   int leftStartRow, leftEndRow, rightStartCol, rightEndCol;
   6.149 +   MatrixProdPiece *parentProdPiece=slicingStruc->parPiece->appSpecificPiece;
   6.150 +   leftStartRow   = parentProdPiece->leftStartRow;
   6.151 +   leftEndRow     = parentProdPiece->leftEndRow;
   6.152 +   rightStartCol  = parentProdPiece->rightStartCol;
   6.153 +   rightEndCol    = parentProdPiece->rightEndCol;
   6.154 +   
   6.155 +//=======================  Calc num Slices  ========================
   6.156 +      //Calc the closest can reasonably get to square
   6.157 +      //TODO: check that math works right: dividing int by float, need cast?
   6.158 +   numResultCells          = numLeftRows * numRightCols;
   6.159 +   targetNumCellsPerPiece  = numResultCells / numSubPiecesToMake;
   6.160 +   targetDimOfResultPiece  = sqrt( targetNumCellsPerPiece );
   6.161 +   idealNumLeftSlices      = numLeftRows  / targetDimOfResultPiece;
   6.162 +   idealNumRightSlices     = numRightCols / targetDimOfResultPiece;
   6.163 +   
   6.164 +      //Now, product of rows * cols should stay close to numPieces, but
   6.165 +      // have to make num rows an int, and num cols an int
   6.166 +      // means will drop fractional part from larger, then add the number
   6.167 +      // of pieces that fractional part represents back on to the smaller
   6.168 +      // then round to the nearest integer.  The resulting product should
   6.169 +      // still be close to numPieces.
   6.170 +   if( idealNumRightSlices > idealNumLeftSlices )
   6.171 +    { float diff, numPiecesCut;
   6.172 +      //TODO: find floor and "closest int" in C math library.. how use?
   6.173 +      targetNumRightSlices = floor( idealNumRightSlices );
   6.174 +      diff                 = idealNumRightSlices - targetNumRightSlices;
   6.175 +      numPiecesCut         = diff * idealNumLeftSlices;
   6.176 +      idealNumLeftSlices  += numPiecesCut / targetNumRightSlices;
   6.177 +      targetNumLeftSlices  = rint( idealNumLeftSlices );
   6.178 +    }
   6.179 +   else
   6.180 +    { float diff, numPiecesCut;
   6.181 +      targetNumLeftSlices  = floor( idealNumLeftSlices );
   6.182 +      diff                 = idealNumLeftSlices - targetNumLeftSlices;
   6.183 +      numPiecesCut         = diff * idealNumRightSlices;
   6.184 +      idealNumRightSlices += numPiecesCut / targetNumLeftSlices;
   6.185 +      targetNumRightSlices = rint( idealNumRightSlices );
   6.186 +    }
   6.187 +   targetRowsPerLeftSlice  = numLeftRows  / targetNumLeftSlices;
   6.188 +   targetColsPerRightSlice = numRightCols / targetNumRightSlices;
   6.189 +
   6.190 +      //allocate size of worst case + safety
   6.191 +   int size = sizeof(int) * (numSubPiecesToMake + 2);
   6.192 +   int *leftSliceStartRows, *rightSliceStartCols;
   6.193 +   //TODO: "FUNC" is not quite the right scope..  slicingStruc is right scope
   6.194 +   leftSliceStartRows  =  BLIS_DKU__malloc_scope( size, SLICE_SCOPE );
   6.195 +   rightSliceStartCols =  BLIS_DKU__malloc_scope( size, SLICE_SCOPE );
   6.196 +   slicingStruc->leftSliceStartRows  = leftSliceStartRows;
   6.197 +   slicingStruc->rightSliceStartCols = rightSliceStartCols;
   6.198 +
   6.199 +//=======================  Slice Left Matrix  ========================
   6.200 +      //fix for case only 1 row, when leftStartRow == leftEndRow
   6.201 +   leftSliceStartRows[ 0 ] = leftStartRow;
   6.202 +   sliceIdx = 0;
   6.203 +   rowAccumulator = 0;
   6.204 +   int row;
   6.205 +   for( row = leftStartRow; row < leftEndRow; row += rowIncrement )
   6.206 +    {
   6.207 +      leftSliceStartRows[ sliceIdx ] = row;
   6.208 +
   6.209 +      rowAccumulator += targetRowsPerLeftSlice;
   6.210 +      rowIncrement = (int) rowAccumulator;
   6.211 +      if( rowIncrement == 0 ) rowIncrement = 1;//apply at end curr iter
   6.212 +      rowAccumulator -= rowIncrement;
   6.213 +      if( rowAccumulator < 0 ) rowAccumulator = 0;
   6.214 +      sliceIdx += 1;
   6.215 +    }
   6.216 +   if( sliceIdx == 0 ) sliceIdx = 1; //case when only 1 row
   6.217 +   numLeftSlices     = sliceIdx;
   6.218 +   leftSliceStartRows[ sliceIdx ] = leftEndRow + 1;  //use extra slot
   6.219 +
   6.220 +   sliceIdx = 0; colAccumulator = 0; colIncrement = 0;
   6.221 +
   6.222 +//=======================  Slice Right Matrix  ========================
   6.223 +   rightSliceStartCols[ 0 ] = rightStartCol; //in case only 1 col
   6.224 +   int col;
   6.225 +   for( col = rightStartCol; col < rightEndCol; col += colIncrement )
   6.226 +    {
   6.227 +      rightSliceStartCols[ sliceIdx ] = col;
   6.228 +
   6.229 +      colAccumulator += targetColsPerRightSlice;
   6.230 +      colIncrement = (int) colAccumulator;
   6.231 +      if( colIncrement == 0 ) colIncrement = 1;//apply at end curr iter
   6.232 +      colAccumulator -= colIncrement;
   6.233 +      if( colAccumulator < 0 ) colAccumulator = 0;
   6.234 +      sliceIdx += 1;
   6.235 +    }
   6.236 +   if( sliceIdx == 0 ) sliceIdx = 1; //case when only 1 col
   6.237 +   numRightSlices = sliceIdx;
   6.238 +   rightSliceStartCols[ sliceIdx ] = rightEndCol + 1;
   6.239 +
   6.240 +   slicingStruc->numLeftSlices     = numLeftSlices;
   6.241 +   slicingStruc->numRightSlices    = numRightSlices;
   6.242 +   slicingStruc->finalNumSubPieces = numLeftSlices * numRightSlices;
   6.243 +   return;
   6.244 + }
   6.245 +
   6.246 +
   6.247 +void pairSlicesAndMakeProdPieces(SliceStruc *slicingStruc, DKUPiece *oPiece)
   6.248 + { DKUPiece* *subPiecesArray;
   6.249 +   MatrixProdPiece *newProdPiece;
   6.250 +   MatrixProdPiece *parentMatPiece =slicingStruc->parPiece->appSpecificPiece;
   6.251 +   int newPiecePos = 0;
   6.252 +   int leftSliceStartRow, leftSliceEndRow;
   6.253 +   int rightSliceStartCol, rightSliceEndCol;
   6.254 +   int numLeftSlices  = slicingStruc->numLeftSlices;
   6.255 +   int numRightSlices = slicingStruc->numRightSlices;
   6.256 +   int *leftSliceStartRows  = slicingStruc->leftSliceStartRows;
   6.257 +   int *rightSliceStartCols = slicingStruc->rightSliceStartCols;
   6.258 +
   6.259 +   int size = slicingStruc->finalNumSubPieces * sizeof(MatrixProdPiece *);
   6.260 +   subPiecesArray = BLIS_DKU__malloc_toPiece( size, oPiece );
   6.261 +   oPiece->subPiecesArray = subPiecesArray;
   6.262 +
   6.263 +   int rightSliceNum, leftSliceNum;
   6.264 +   for( rightSliceNum = 0; rightSliceNum < numRightSlices; rightSliceNum++ )
   6.265 +    { DKUPiece *newDKUPiece;
   6.266 +      rightSliceStartCol = rightSliceStartCols[ rightSliceNum ];
   6.267 +      rightSliceEndCol   = rightSliceStartCols[ rightSliceNum + 1 ] - 1;
   6.268 +      for( leftSliceNum = 0; leftSliceNum < numLeftSlices; leftSliceNum++ )
   6.269 +       { newDKUPiece  = BLIS_DKU__makeDKUPiece_FromDivider( DKU_INST_MM );
   6.270 +         newProdPiece = DKU__makeMatrixProdPiece_FromMatrixProdPiece
   6.271 +                                             ( parentMatPiece, newDKUPiece );
   6.272 +         newDKUPiece->appSpecificPiece = newProdPiece;
   6.273 +         
   6.274 +         leftSliceStartRow = leftSliceStartRows[ leftSliceNum ];
   6.275 +         leftSliceEndRow   = leftSliceStartRows[ leftSliceNum + 1 ] - 1;
   6.276 +         
   6.277 +         newProdPiece->leftStartRow  = leftSliceStartRow;
   6.278 +         newProdPiece->leftEndRow    = leftSliceEndRow;
   6.279 +         newProdPiece->leftStartCol  = parentMatPiece->leftStartCol;
   6.280 +         newProdPiece->leftEndCol    = parentMatPiece->leftEndCol;
   6.281 +         
   6.282 +         newProdPiece->rightStartRow = parentMatPiece->rightStartRow;
   6.283 +         newProdPiece->rightEndRow   = parentMatPiece->rightEndRow;
   6.284 +         newProdPiece->rightStartCol = rightSliceStartCol;
   6.285 +         newProdPiece->rightEndCol   = rightSliceEndCol;
   6.286 +         
   6.287 +         newProdPiece->prodStartRow  = leftSliceStartRow;
   6.288 +         newProdPiece->prodEndRow    = leftSliceEndRow;
   6.289 +         newProdPiece->prodStartCol  = rightSliceStartCol;
   6.290 +         newProdPiece->prodEndCol    = rightSliceEndCol;
   6.291 +         
   6.292 +         subPiecesArray[ newPiecePos ] = newDKUPiece;
   6.293 +         newPiecePos += 1;
   6.294 +       }
   6.295 +    }//for(int rightSliceNum = 0; rightSliceNum <
   6.296 + }
   6.297 +
   6.298 +//===========================================================================

     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/DKU_INST_MM/Kernel.c	Sun Aug 26 03:04:50 2012 -0700
     7.3 @@ -0,0 +1,80 @@
     7.4 +/* 
     7.5 + *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
     7.6 + *  Licensed under GNU General Public License version 2
     7.7 + *
     7.8 + * Author: SeanHalle@yahoo.com
     7.9 + *
    7.10 + */
    7.11 +
    7.12 +#include "DKU_INST_MM.h"
    7.13 +
    7.14 +
    7.15 +/* Kernel
    7.16 + *
    7.17 + * Computes the passed-in piece's portion of the iteration space.
    7.18 + * A DKUPiece struc is handed to it.
    7.19 + * The DKUPiece struct has a pointer to "app specific info".  This
    7.20 + *  is some application-specific structure that the Scheduler doesn't
    7.21 + *  know the details of.  The Scheduler can pass the app-specific
    7.22 + *  data around, and that's all the Scheduler needs to do with it.
    7.23 + * This kernel, however, is written by the app programmer, and does
    7.24 + *  know the internals of "app specific info".
    7.25 + *
    7.26 + * DKU Std:  data that is accessed inside a Kernel is passed inside
    7.27 + *  a DKUPiece.  Kernel's not allowed to access data via the language's
    7.28 + *  native scoping rules, not even constants.  Thus, pointers to
    7.29 + *  arrays or to a tree's root nodes, or so forth are carried inside
    7.30 + *  the DKUPiece.
    7.31 + * Note, kernel code is re-entrant, so the rules of re-entrant code apply
    7.32 + *
    7.33 + *
    7.34 + * This is the standard Matrix Multiply loop nest.  The only modification
    7.35 + *  for DKU is that it takes loop bounds out of the DKUPiece.
    7.36 + */
    7.37 +void kernel_MM( DKUPiece *pieceToProcess )
    7.38 + { inner_Kernel( (MatrixProdPiece *) pieceToProcess->appSpecificPiece );
    7.39 + }
    7.40 +
    7.41 +/* Separated out the calculations of the Kernel so could re-use as the
    7.42 + *  serial kernel.
    7.43 + */
    7.44 +void inner_Kernel( MatrixProdPiece *matrixProdPiece )
    7.45 + { int32 leftStartRow, leftEndRow, rightStartCol, rightEndCol;
    7.46 +   int32 leftStartCol, rightStartRow;
    7.47 +   int32 numLeftCols, numRightCols, numResMatCols;
    7.48 +   int32 row, col, vectorSize, i;
    7.49 +   float32 *leftMatrix, *rightMatrix, *resultMatrix;
    7.50 +   float32 *leftStartPt, *leftReadPt, *rightStartPt, *rightReadPt;
    7.51 +   
    7.52 +   leftMatrix    = matrixProdPiece->leftMatrix->matrix;
    7.53 +   rightMatrix   = matrixProdPiece->rightMatrix->matrix;
    7.54 +   resultMatrix  = matrixProdPiece->resultMatrix->matrix;
    7.55 +   
    7.56 +   leftStartRow  = matrixProdPiece->leftStartRow;
    7.57 +   leftEndRow    = matrixProdPiece->leftEndRow;
    7.58 +   rightStartCol = matrixProdPiece->rightStartCol;
    7.59 +   rightEndCol   = matrixProdPiece->rightEndCol;
    7.60 +   numResMatCols = matrixProdPiece->resultMatrix->numCols;
    7.61 +   rightStartRow = matrixProdPiece->rightStartRow;
    7.62 +   leftStartCol  = matrixProdPiece->leftStartCol;
    7.63 +   numRightCols  = matrixProdPiece->rightMatrix->numCols;
    7.64 +   numLeftCols   = matrixProdPiece->leftMatrix->numCols;
    7.65 +    
    7.66 +   vectorSize =matrixProdPiece->leftEndCol - matrixProdPiece->leftStartCol+1;
    7.67 +   for( row = leftStartRow; row <= leftEndRow; row++ )
    7.68 +    { leftStartPt  = leftMatrix + row * numLeftCols + leftStartCol;
    7.69 +      for( col = rightStartCol; col <= rightEndCol; col++ )
    7.70 +       { float32 sum = 0;
    7.71 +         
    7.72 +         rightStartPt = rightMatrix + rightStartRow * numRightCols + col;
    7.73 +         
    7.74 +         leftReadPt = leftStartPt;
    7.75 +         rightReadPt = rightStartPt;
    7.76 +         for( i = 0; i < vectorSize; i++)
    7.77 +          { sum += *(leftReadPt++) * *rightReadPt;
    7.78 +            rightReadPt += numRightCols;
    7.79 +          }
    7.80 +         *(resultMatrix + row * numResMatCols + col) = sum;
    7.81 +       }
    7.82 +    }
    7.83 + }

     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/DKU_INST_MM/MakeRootDKUPieces.c	Sun Aug 26 03:04:50 2012 -0700
     8.3 @@ -0,0 +1,53 @@
     8.4 +/*
     8.5 + *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
     8.6 + *  Licensed under GNU General Public License version 2
     8.7 + *
     8.8 + * Author: seanhalle@yahoo.com
     8.9 + */
    8.10 +
    8.11 +#include "DKU_INST_MM.h"
    8.12 +
    8.13 +/* This is what wraps application data inside a DKUPiece.
    8.14 + * It also encodes the dependencies within the data, by making several
    8.15 + *  DKUPieces, which must be executed in sequence, in order to respect the
    8.16 + *  dependencies.
    8.17 + * It tries to make the pieces as large as possible, to maximize the
    8.18 + *  available parallelism.
    8.19 + *
    8.20 + * It allocates space for and fills an array of pointers to the pieces,
    8.21 + *  then returns how many pieces it put into the array.
    8.22 + *
    8.23 + * This function allocates the array of pointers to DKUPieces, plus the
    8.24 + *  DKUPieces, plus all sub-structures inside a DKUPiece.  This is an
    8.25 + *  application-supplied function, so it knows all the app-specific sub-
    8.26 + *  structures inside a DKUPiece.  However, it is called by the scheduler
    8.27 + *  so it has to have a fixed prototype.
    8.28 + * The scheduler calls this function, so it is up to the scheduler to free
    8.29 + *  all the structures this function has allocated.  To do this, a second
    8.30 + *  function is supplied that performs the free s.
    8.31 + *
    8.32 + * DKUPieceMaker returns a RootDKUPieces data structure, which contains the
    8.33 + *  array of DKUPieces that have to be executed in sequence, to preserve
    8.34 + *  the dependencies, and contains the number of such DKUPieces.
    8.35 + * Each call to makeDKUPieces mallocs space for the pieces, plus the
    8.36 + *  root pieces array and the root pieces struc.  This memory is freed by
    8.37 + *  the standard function BLIS_DKU__cleanupRootPieces, which is called by the
    8.38 + *  scheduler.
    8.39 + *
    8.40 + * For Matrix Multiply, the original data is a MatrixProdPair, which is
    8.41 + *  perfectly dividable, so just have to wrap that inside a DKUPiece, and
    8.42 + *  return it inside a RootDKUPieces data structure.
    8.43 + */
    8.44 +RootDKUPieces * makeRootDKUPieces_MM( void *origData )
    8.45 + { RootDKUPieces * rootDKUPieces;
    8.46 +   DKUPiece        *piece;
    8.47 +   
    8.48 +   piece = BLIS_DKU__makeDKUPiece_FromMaker( DKU_INST_MM );
    8.49 +   piece->appSpecificPiece = origData;
    8.50 +   
    8.51 +   int numRootPieces = 1;
    8.52 +   rootDKUPieces = BLIS_DKU__makeRootDKUPiecesStruc( numRootPieces );
    8.53 +   rootDKUPieces->rootPiecesArray[0] = piece;
    8.54 +   
    8.55 +   return rootDKUPieces;
    8.56 + }

     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/DKU_INST_MM/Maker_and_Freer.c	Sun Aug 26 03:04:50 2012 -0700
     9.3 @@ -0,0 +1,73 @@
     9.4 +/*
     9.5 + *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
     9.6 + *  Licensed under GNU General Public License version 2
     9.7 + *
     9.8 + * Author: seanhalle@yahoo.com
     9.9 + *
    9.10 + * Created on November 15, 2009, 2:35 AM
    9.11 + */
    9.12 +
    9.13 +#include <malloc.h>
    9.14 +
    9.15 +#include "DKU_INST_MM.h"
    9.16 +
    9.17 +
    9.18 +//========== Makers and Free-ers for use ONLY within DKU_INST_MM  ===========
    9.19 +//
    9.20 +
    9.21 +/*In the "_Flat" version of constructor, do only malloc of the top data struc
    9.22 + * and set values in that top-level.  Don't malloc any sub-structures.
    9.23 + *
    9.24 + *Used in BundlingQuad in unbundleInputs
    9.25 + */
    9.26 + Matrix *
    9.27 +DKU__makeMatrix_Flat( int32 numRows, int32 numCols, DKUPiece *owner )
    9.28 + { Matrix * retMatrix;
    9.29 +   retMatrix = BLIS_DKU__malloc_toPiece( sizeof( Matrix ), owner );
    9.30 +   retMatrix->numRows = numRows;
    9.31 +   retMatrix->numCols = numCols;
    9.32 +
    9.33 +   return retMatrix;
    9.34 + }
    9.35 +
    9.36 +/* Used In BundlingQuad in unbundleInputs */
    9.37 + MatrixProdPiece *
    9.38 +DKU__makeMatrixProdPiece_Flat( DKUPiece *owner )
    9.39 + { return BLIS_DKU__malloc_toPiece( sizeof(MatrixProdPiece), owner );
    9.40 + }
    9.41 +
    9.42 +
    9.43 +
    9.44 +/* Used in Divider */
    9.45 + MatrixProdPiece *
    9.46 +DKU__makeMatrixProdPiece_FromMatrixProdPiece
    9.47 +                            ( MatrixProdPiece *parentPiece, DKUPiece *owner )
    9.48 + { MatrixProdPiece *newPiece;
    9.49 +   Matrix *leftMatrix      = parentPiece->leftMatrix;
    9.50 +   Matrix *rightMatrix     = parentPiece->rightMatrix;
    9.51 +
    9.52 +   newPiece                = DKU__makeMatrixProdPiece_Flat( owner );
    9.53 +
    9.54 +   newPiece->leftMatrix    = leftMatrix;
    9.55 +   newPiece->rightMatrix   = rightMatrix;
    9.56 +
    9.57 +   newPiece->leftStartRow  = 0;
    9.58 +   newPiece->leftEndRow    = leftMatrix->numRows - 1;
    9.59 +   newPiece->leftStartCol  = 0;
    9.60 +   newPiece->leftEndCol    = leftMatrix->numCols - 1;
    9.61 +
    9.62 +   newPiece->rightStartRow = 0;
    9.63 +   newPiece->rightEndRow   = rightMatrix->numRows - 1;
    9.64 +   newPiece->rightStartCol = 0;
    9.65 +   newPiece->rightEndCol   = rightMatrix->numCols - 1;
    9.66 +
    9.67 +   newPiece->prodStartRow  = newPiece->leftStartRow;
    9.68 +   newPiece->prodEndRow    = newPiece->leftEndRow;
    9.69 +   newPiece->prodStartCol  = newPiece->rightStartCol;
    9.70 +   newPiece->prodEndCol    = newPiece->rightEndCol;
    9.71 +
    9.72 +   newPiece->resultMatrix  = parentPiece->resultMatrix;
    9.73 +
    9.74 +   return newPiece;
    9.75 + }
    9.76 +

    10.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.2 +++ b/DKU_INST_MM/SerialKernel.c	Sun Aug 26 03:04:50 2012 -0700
    10.3 @@ -0,0 +1,21 @@
    10.4 +/*
    10.5 + *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
    10.6 + *  Licensed under GNU General Public License version 2
    10.7 + *
    10.8 + * Author: seanhalle@yahoo.com
    10.9 + *
   10.10 + */
   10.11 +
   10.12 +#include "DKU_INST_MM.h"
   10.13 +
   10.14 +/*The scheduler calls this when there is no benefit from
   10.15 + * parallel execution of the orig data.
   10.16 + *
   10.17 + *For MM, the original data is already in the data structure that's inside
   10.18 + * a DKUPiece, so just cast the oridData and call the Kernel on it
   10.19 + */
   10.20 +void serialKernel_MM( void * origData )
   10.21 + {
   10.22 +   inner_Kernel( (MatrixProdPiece *) origData );
   10.23 + }
   10.24 +

    11.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.2 +++ b/DKU_INST_MM/Undivide.c	Sun Aug 26 03:04:50 2012 -0700
    11.3 @@ -0,0 +1,34 @@
    11.4 +/*
    11.5 + *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
    11.6 + *  Licensed under GNU General Public License version 2
    11.7 + *
    11.8 + * Author: seanhalle@yahoo.com
    11.9 + *
   11.10 + */
   11.11 +
   11.12 +#include "DKU_INST_MM.h"
   11.13 +#include <malloc.h>
   11.14 +
   11.15 +
   11.16 +/* unDivider
   11.17 + *
   11.18 + * Only counts to make sure all the pieces are accounted for, and frees
   11.19 + *  the memory allocated to the finished DKUPiece structs and all sub-
   11.20 + *  structures.
   11.21 + */
   11.22 +void unDividePiece_MM( DKUPiece *parentPiece, DKUPiece *piece )
   11.23 + {
   11.24 +   parentPiece->numSubPiecesUndivided += 1;
   11.25 +   
   11.26 +      //free mem allocated to no-longer needed subPiece
   11.27 +   BLIS_DKU__freeDKUPiece( piece );
   11.28 + }
   11.29 +
   11.30 +//TODO: figure out standard for doing this.. add pointer to func to inits,
   11.31 +//       so fixed freePiece func can have generic code to free app-specific
   11.32 +//       piece, and DKUInstance provides function to free that piece
   11.33 +//       maybe put the pointer into schedData or something..  then this
   11.34 +//       one call to freeDKUPiece can be generic, with HW-specific overlay
   11.35 +// Issue is slowness of indirections into table and dereferencing Fn pointer
   11.36 +
   11.37 +

    12.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.2 +++ b/Matrix_Mult.c	Sun Aug 26 03:04:50 2012 -0700
    12.3 @@ -0,0 +1,221 @@
    12.4 +/*
    12.5 + *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
    12.6 + *  Licensed under GNU General Public License version 2
    12.7 + *
    12.8 + * Author: seanhalle@yahoo.com
    12.9 + *
   12.10 + * Created on November 15, 2009, 2:35 AM
   12.11 + */
   12.12 +
   12.13 +#include <malloc.h>
   12.14 +#include <stdlib.h>
   12.15 +
   12.16 +#include "Matrix_Mult.h"
   12.17 +#include "../BLIS/DKU/DKU_common/DKU.h"
   12.18 +
   12.19 +//======================== For Use OUTSIDE DKU instance =====================
   12.20 +/*
   12.21 + *The DKU code-instance in DKU_INST_MM has its own set of makers and free-ers
   12.22 + * that use BLIS_DKU__malloc.  These are for use in application-code
   12.23 + * outside DKU_INST_MM DKU-code-instance.
   12.24 + */
   12.25 +
   12.26 +
   12.27 +/*In the "_Flat" version of constructor, do only malloc of the top data struc
   12.28 + * and set values in that top-level.  Don't malloc any sub-structures.
   12.29 + */
   12.30 + Matrix *
   12.31 +makeMatrix_Flat( int32 numRows, int32 numCols )
   12.32 + { Matrix * retMatrix;
   12.33 +   retMatrix = malloc( sizeof( Matrix ) );
   12.34 +   retMatrix->numRows = numRows;
   12.35 +   retMatrix->numCols = numCols;
   12.36 +
   12.37 +   return retMatrix;
   12.38 + }
   12.39 +
   12.40 + Matrix *
   12.41 +makeMatrix_WithResMat( int32 numRows, int32 numCols )
   12.42 + { Matrix * retMatrix;
   12.43 +   retMatrix = malloc( sizeof( Matrix ) );
   12.44 +   retMatrix->numRows = numRows;
   12.45 +   retMatrix->numCols = numCols;
   12.46 +   retMatrix->matrix  = malloc( numRows * numCols * sizeof(float32) );
   12.47 +   
   12.48 +   return retMatrix;
   12.49 + }
   12.50 +
   12.51 + void
   12.52 +freeMatrix_Flat( Matrix * matrix )
   12.53 + { //( matrix );
   12.54 + }
   12.55 + void
   12.56 +freeMatrix( Matrix * matrix )
   12.57 + { free( matrix->matrix );
   12.58 +   free( matrix );
   12.59 + }
   12.60 +
   12.61 + MatrixProdPiece *
   12.62 +makeMatrixProdPiece_Empty()
   12.63 + { return malloc( sizeof(MatrixProdPiece) );
   12.64 + }
   12.65 +
   12.66 +
   12.67 + MatrixProdPiece *
   12.68 +makeMatrixProdPiece_Helper( Matrix *leftMatrix, Matrix *rightMatrix )
   12.69 + { MatrixProdPiece *newPiece;
   12.70 +
   12.71 +   newPiece                = makeMatrixProdPiece_Empty( );
   12.72 +
   12.73 +   newPiece->leftMatrix    = leftMatrix;
   12.74 +   newPiece->rightMatrix   = rightMatrix;
   12.75 +
   12.76 +   newPiece->leftStartRow  = 0;
   12.77 +   newPiece->leftEndRow    = leftMatrix->numRows - 1;
   12.78 +   newPiece->leftStartCol  = 0;
   12.79 +   newPiece->leftEndCol    = leftMatrix->numCols - 1;
   12.80 +
   12.81 +   newPiece->rightStartRow = 0;
   12.82 +   newPiece->rightEndRow   = rightMatrix->numRows - 1;
   12.83 +   newPiece->rightStartCol = 0;
   12.84 +   newPiece->rightEndCol   = rightMatrix->numCols - 1;
   12.85 +
   12.86 +   newPiece->prodStartRow  = newPiece->leftStartRow;
   12.87 +   newPiece->prodEndRow    = newPiece->leftEndRow;
   12.88 +   newPiece->prodStartCol  = newPiece->rightStartCol;
   12.89 +   newPiece->prodEndCol    = newPiece->rightEndCol;
   12.90 +
   12.91 +   return newPiece;
   12.92 + }
   12.93 +
   12.94 + MatrixProdPiece *
   12.95 +makeMatrixProdPiece_FromMatrixProdPiece( MatrixProdPiece *parentPiece )
   12.96 + { MatrixProdPiece *newPiece;
   12.97 +   Matrix *leftMatrix      = parentPiece->leftMatrix;
   12.98 +   Matrix *rightMatrix     = parentPiece->rightMatrix;
   12.99 +
  12.100 +   newPiece = makeMatrixProdPiece_Helper( leftMatrix, rightMatrix );
  12.101 +   newPiece->resultMatrix  = parentPiece->resultMatrix;
  12.102 +
  12.103 +   return newPiece;
  12.104 + }
  12.105 +
  12.106 + MatrixProdPiece *
  12.107 +makeMatrixProdPiece_FromMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
  12.108 + { MatrixProdPiece *newPiece;
  12.109 +
  12.110 +   newPiece = makeMatrixProdPiece_Helper( leftMatrix, rightMatrix );
  12.111 +   newPiece->resultMatrix  =
  12.112 +    makeMatrix_WithResMat( leftMatrix->numRows, rightMatrix->numCols );
  12.113 +
  12.114 +   return newPiece;
  12.115 + }
  12.116 +
  12.117 + void
  12.118 +freeMatrixProdPiece_Flat( MatrixProdPiece * piece )
  12.119 + { free( piece );
  12.120 + }
  12.121 +
  12.122 + void
  12.123 +freeMatrixProdPiece( MatrixProdPiece * piece )
  12.124 + { //( piece->leftMatrix );
  12.125 +   freeMatrix( piece->rightMatrix );
  12.126 +   freeMatrix( piece->resultMatrix );
  12.127 +   free( piece );
  12.128 + }
  12.129 +
  12.130 + 
  12.131 + void
  12.132 +initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
  12.133 +                               ParamBag *paramBag )
  12.134 + { char *leftMatrixFileName, *rightMatrixFileName;
  12.135 +   int   leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols;
  12.136 +   
  12.137 +      ParamStruc *param;
  12.138 +      param = getParamFromBag( "leftMatrixRows", paramBag );
  12.139 +   leftMatrixRows = param->intValue;
  12.140 +      param = getParamFromBag( "leftMatrixCols", paramBag );
  12.141 +   leftMatrixCols = param->intValue;
  12.142 +   *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols );
  12.143 +   
  12.144 +      param = getParamFromBag( "leftMatrixFileName", paramBag );
  12.145 +   leftMatrixFileName = param->strValue;  //no need to copy
  12.146 +   read_Matrix_From_File( *leftMatrix,  leftMatrixFileName );
  12.147 +   
  12.148 +      param = getParamFromBag( "rightMatrixRows", paramBag );
  12.149 +   rightMatrixRows = param->intValue;
  12.150 +      param = getParamFromBag( "rightMatrixCols", paramBag );
  12.151 +   rightMatrixCols = param->intValue;
  12.152 +   *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols );
  12.153 +   
  12.154 +      param = getParamFromBag( "rightMatrixFileName", paramBag );
  12.155 +   rightMatrixFileName = param->strValue;
  12.156 +   read_Matrix_From_File( *rightMatrix, rightMatrixFileName );
  12.157 + }
  12.158 +
  12.159 +
  12.160 +void parseLineIntoRow( char *line, float32* row );
  12.161 +
  12.162 +
  12.163 + void
  12.164 +read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName )
  12.165 + { int    row, maxRead, numRows, numCols;
  12.166 +   float32 *matrixStart;
  12.167 +   size_t lineSz = 0;
  12.168 +   FILE  *file;
  12.169 +   char  *line = NULL;
  12.170 +   
  12.171 +   lineSz = 50000; //max length of line in a matrix data file
  12.172 +   line = (char *) malloc( lineSz );
  12.173 +   if( line == NULL ) BLIS_DKU__throwError( "no mem for matrix line" );
  12.174 +   
  12.175 +   numRows = matrixStruc->numRows;
  12.176 +   numCols = matrixStruc->numCols;
  12.177 +   matrixStart = matrixStruc->matrix;
  12.178 +
  12.179 +   printf("Matrix File Path: %s\n", matrixFileName);fflush(stdout);
  12.180 +   file = fopen( matrixFileName, "r" ); if(!file){printf("not open! %d\n",__LINE__); fflush(stdin);}
  12.181 +   fseek( file, 0, SEEK_SET );
  12.182 +   for( row = 0; row < numRows; row++ )
  12.183 +    {
  12.184 +      if( feof( file ) )  BLIS_DKU__throwError( "file ran out too soon" );
  12.185 +      maxRead = getline( &line, &lineSz, file );
  12.186 +      if( maxRead == -1 ) BLIS_DKU__throwError( "prob reading mat line");
  12.187 +      
  12.188 +      if( *line == '\n') continue; //blank line
  12.189 +      if( *line == '/' ) continue; //comment line
  12.190 +      
  12.191 +      parseLineIntoRow( line, matrixStart + row * numCols );
  12.192 +    }
  12.193 +   free( line );
  12.194 + }
  12.195 +
  12.196 +/*This function relies on each line having the proper number of cols.  It
  12.197 + * doesn't check, nor enforce, so if the file is improperly formatted it
  12.198 + * can write over unrelated memory
  12.199 + */
  12.200 + void
  12.201 +parseLineIntoRow( char *line, float32* row )
  12.202 + {
  12.203 +   char *valueStr, *searchPos;
  12.204 +   
  12.205 +      //read the float values
  12.206 +   searchPos = valueStr = line; //start
  12.207 +   
  12.208 +   for( ; *searchPos != 0; searchPos++)  //bit dangerous, should use buff len
  12.209 +    {
  12.210 +      if( *searchPos == '\n' ) //last col..  relying on well-formatted file
  12.211 +       { *searchPos = 0;
  12.212 +         *row = atof( valueStr );
  12.213 +         break;                                    //end FOR loop
  12.214 +       }
  12.215 +      if( *searchPos == ',' )
  12.216 +       { *searchPos = 0;                           //mark end of string
  12.217 +         *row = (float32) atof( valueStr );
  12.218 +         row += 1;                                 //address arith
  12.219 +            //skip any spaces before digits.. use searchPos + 1 to skip the 0
  12.220 +         for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++);
  12.221 +         valueStr = searchPos + 1;
  12.222 +       }
  12.223 +    }
  12.224 + }

    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/Matrix_Mult.h	Sun Aug 26 03:04:50 2012 -0700
    13.3 @@ -0,0 +1,81 @@
    13.4 +/*
    13.5 + *  Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org
    13.6 + *  Licensed under GNU General Public License version 2
    13.7 + */
    13.8 +
    13.9 +#ifndef MATRIX_MULT_H_
   13.10 +#define MATRIX_MULT_H_
   13.11 +
   13.12 +#include <stdio.h>
   13.13 +
   13.14 +#include "../BLIS/BLIS_primitive_data_types.h"
   13.15 +
   13.16 +#include "ParamHelper/Param.h"
   13.17 +
   13.18 +//==============================  Structures  ==============================
   13.19 +
   13.20 +typedef
   13.21 +struct
   13.22 + { int32 numRows;
   13.23 +   int32 numCols;
   13.24 +   float32 *matrix;  //2D, but dynamically sized, so use addr arith
   13.25 + }
   13.26 +Matrix;
   13.27 +
   13.28 +/* This is the "appSpecificPiece" that is carried inside a DKUPiece.
   13.29 + *  In the DKUPiece data struc it is declared to be of type "void *".  This
   13.30 + *  allows the application to define any data structure it wants and put it
   13.31 + *  into a DKUPiece.
   13.32 + * When the app specific info is used, it is in app code, so it is cast to
   13.33 + *  the correct type to tell the compiler how to access fields.
   13.34 + * This keeps all app-specific things out of the DKU directory, as per the
   13.35 + *  DKU standard. */
   13.36 +typedef
   13.37 +struct
   13.38 + { 
   13.39 +      // pointers to shared data..  the result matrix must be created when the
   13.40 +      //  left and right matrices are put into the root ancestor DKUPiece.
   13.41 +   Matrix * leftMatrix;
   13.42 +   Matrix * rightMatrix;
   13.43 +   Matrix * resultMatrix;
   13.44 +
   13.45 +      // define the starting and ending boundaries for this piece of the
   13.46 +      //  result matrix.  These are derivable from the left and right
   13.47 +      //  matrices, but included them for readability of code.
   13.48 +   int prodStartRow, prodEndRow;
   13.49 +   int prodStartCol, prodEndCol;
   13.50 +      // Start and end of the portion of the left matrix that contributes to
   13.51 +      //  this piece of the product
   13.52 +   int leftStartRow, leftEndRow;
   13.53 +   int leftStartCol, leftEndCol;
   13.54 +      // Start and end of the portion of the right matrix that contributes to
   13.55 +      //  this piece of the product
   13.56 +   int rightStartRow, rightEndRow;
   13.57 +   int rightStartCol, rightEndCol;
   13.58 + }
   13.59 +MatrixProdPiece;
   13.60 +
   13.61 +//==============================  Functions  ================================
   13.62 +void readFile();
   13.63 +
   13.64 +Matrix *makeMatrix( int32 numRows, int32 numCols );
   13.65 +Matrix *makeMatrix_Flat( int32 numRows, int32 numCols );
   13.66 +void    freeMatrix_Flat( Matrix * matrix );
   13.67 +void    freeMatrix( Matrix * matrix );
   13.68 +
   13.69 + MatrixProdPiece *
   13.70 +makeMatrixProdPiece_Empty();
   13.71 + MatrixProdPiece *
   13.72 +makeMatrixProdPiece_FromMatrixProdPiece( MatrixProdPiece * piece );
   13.73 + MatrixProdPiece *
   13.74 +makeMatrixProdPiece_FromMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
   13.75 +
   13.76 +void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName );
   13.77 +
   13.78 +void freeMatrixProdPiece_Flat( MatrixProdPiece * piece );
   13.79 +void freeMatrixProdPiece( MatrixProdPiece * piece );
   13.80 +
   13.81 +
   13.82 +//===========================================================================
   13.83 +
   13.84 +#endif /*MATRIX_MULT_H_*/

    14.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.2 +++ b/Read_Input_Matrix.c	Sun Aug 26 03:04:50 2012 -0700
    14.3 @@ -0,0 +1,604 @@
    14.4 +/* 
    14.5 + * File:   Read_Input.c
    14.6 + * Author: SeanHalle@yahoo.com
    14.7 + *
    14.8 + * Created on June 15, 2009, 10:12 AM
    14.9 + */
   14.10 +
   14.11 +#include <stdio.h>
   14.12 +
   14.13 +//========================
   14.14 +scanf("%[^\t]",a);  //matches everything except tab character
   14.15 +
   14.16 +//======================= Write a structure into a file  ====================
   14.17 +
   14.18 +#include <stdio.h>
   14.19 +#include <string.h>
   14.20 +#include <stdlib.h>
   14.21 +
   14.22 +#define MAX 50
   14.23 +
   14.24 +
   14.25 +typedef struct {
   14.26 +    char  name[10];
   14.27 +    int key;
   14.28 +} file_record;
   14.29 +
   14.30 +/* this function adds the relatiuve addres to the index for a key */
   14.31 +void create_index(long index[], int key, long rel_add ) {
   14.32 +    index[key] = rel_add;
   14.33 +}
   14.34 +
   14.35 +/* this function writes a record to the file */
   14.36 +void write_rec(FILE *fp, file_record rec) {
   14.37 +   fwrite(&rec,sizeof(rec),1,fp);
   14.38 +}
   14.39 +
   14.40 +void main() {
   14.41 +    long rel_add;
   14.42 +    int key;
   14.43 +    file_record frec;
   14.44 +    long index[MAX];/* an index list*/
   14.45 +    int n,i;
   14.46 +
   14.47 +    FILE *recfile=NULL,*ifile=NULL;
   14.48 +    /* this initializes the index list to all ? */
   14.49 +    for(i=0; i< MAX; i++)
   14.50 +        index[i]= (-1);
   14.51 +
   14.52 +    recfile=fopen("mfile","w");
   14.53 +    if(recfile == NULL) {
   14.54 +        printf("Error in openeing file mfile\n");
   14.55 +        exit(0);
   14.56 +    }
   14.57 +    rel_add = 0 ;
   14.58 +    do {
   14.59 +        printf(" Enter the data vlue and the key of the record to be added to file mfile\n");
   14.60 +        scanf("%s %d",frec.name,&frec.key);
   14.61 +        while(index[frec.key] != (-1)) {
   14.62 +            printf(" A record with this key value already exist in a file enter record key value\n");
   14.63 +            scanf("%s %d",frec.name,&frec.key);
   14.64 +        }
   14.65 +        create_index(index,frec.key,rel_add);
   14.66 +         write_rec(recfile,frec);
   14.67 +        rel_add =  ftell(recfile);
   14.68 +        /* this sets the relative address for the next record to be
   14.69 +      the value of current file position pointer in bytes from
   14.70 +      the beginning of the file */
   14.71 +         printf("Enter 1 to continue adding records to the file\n");
   14.72 +         scanf("%d",&n);
   14.73 +    }while(n == 1);
   14.74 +
   14.75 +    ifile=fopen("index_file","w");
   14.76 +
   14.77 +    if(ifile == NULL) {
   14.78 +       printf("Error in openeing file index_file\n");
   14.79 +         exit(0);
   14.80 +    }
   14.81 +
   14.82 +    fwrite(index,sizeof(index),1,ifile);/*writes the complete index into the index_file */
   14.83 +    fclose(recfile);
   14.84 +    fclose(ifile);
   14.85 +    printf("Enter 1 if you want to retrieve a record\n");
   14.86 +    scanf("%d",&n);
   14.87 +
   14.88 +    if( n == 1) {
   14.89 +       ifile=fopen("index_file","r");
   14.90 +       if(ifile == NULL) {
   14.91 +           printf("Error in openeing file index_file\n");
   14.92 +           exit(0);
   14.93 +       }
   14.94 +       fread(index,sizeof(index),1,ifile);
   14.95 +
   14.96 +       /* reads the complete index into the index list from the index_file*/
   14.97 +       fclose(ifile);
   14.98 +       recfile=fopen("mfile","r");
   14.99 +
  14.100 +       if(recfile == NULL) {
  14.101 +           printf("Error in openeing file mfile\n");
  14.102 +           exit(0);
  14.103 +       }
  14.104 +    }
  14.105 +    printf("THE CONTENTS OF FILE IS \n");
  14.106 +
  14.107 +    while( (fread(&frec,sizeof(frec),1,recfile)) != 0)
  14.108 +         printf("%s %d\n",frec.name,frec.key);
  14.109 +
  14.110 +    do {
  14.111 +        printf("Enter the key of the record to be retrieved\n");
  14.112 +        scanf("%d",&key);
  14.113 +        rel_add = index[key]; /*gets the relative address of the record from index list */
  14.114 +        if( (fseek(recfile,rel_add,SEEK_SET))!= 0) {
  14.115 +             printf("Error\n");
  14.116 +             exit(0);
  14.117 +        }
  14.118 +        fread(&frec,sizeof(frec),1,recfile);
  14.119 +        printf("The data value of the retrieved record is %s\n",frec.name);
  14.120 +        printf("Enter 1 if you want to retrieve a record\n");
  14.121 +        scanf("%d",&n);
  14.122 +    } while(n == 1);
  14.123 +
  14.124 +    fclose(recfile);
  14.125 +}
  14.126 +
  14.127 +
  14.128 +
  14.129 +
  14.130 +//========================== Read words in file demo  =======================
  14.131 +
  14.132 +#include <stdio.h>
  14.133 +#include <ctype.h>
  14.134 +#include <string.h>
  14.135 +#include <stdlib.h>
  14.136 +
  14.137 +struct node {
  14.138 +    struct node    *left;       /* tree to the left */
  14.139 +    struct node    *right;      /* tree to the right */
  14.140 +    char           *word;       /* word for this tree */
  14.141 +};
  14.142 +
  14.143 +/* the top of the tree */
  14.144 +static struct node *root = NULL;
  14.145 +
  14.146 +/*
  14.147 + * memory_error -- write error and die                  *
  14.148 + */
  14.149 +void memory_error(void)
  14.150 +{
  14.151 +    fprintf(stderr, "Error:Out of memory\n");
  14.152 +    exit(8);
  14.153 +}
  14.154 +
  14.155 +/*
  14.156 + * save_string -- save a string on the heap             *
  14.157 + *                                                      *
  14.158 + * Parameters                                           *
  14.159 + *      string -- string to save                        *
  14.160 + *                                                      *
  14.161 + * Returns                                              *
  14.162 + *      pointer to malloc-ed section of memory with     *
  14.163 + *      the string copied into it.                      *
  14.164 + */
  14.165 +char *save_string(char *string)
  14.166 +{
  14.167 +    char *new_string;   /* where we are going to put string */
  14.168 +
  14.169 +    new_string = malloc((unsigned) (strlen(string) + 1));
  14.170 +
  14.171 +    if (new_string == NULL)
  14.172 +        memory_error();
  14.173 +
  14.174 +    strcpy(new_string, string);
  14.175 +    return (new_string);
  14.176 +}
  14.177 +/*
  14.178 + * enter -- enter a word into the tree                  *
  14.179 + *                                                      *
  14.180 + * Parameters                                           *
  14.181 + *      node -- current node we are looking at          *
  14.182 + *      word -- word to enter                           *
  14.183 + */
  14.184 +void enter(struct node **node, char *word)
  14.185 +{
  14.186 +    int  result;        /* result of strcmp */
  14.187 +
  14.188 +    char *save_string(char *);  /* save a string on the heap */
  14.189 +
  14.190 +    /*
  14.191 +     * If the current node is null, we have reached the bottom
  14.192 +     * of the tree and must create a new node.
  14.193 +     */
  14.194 +    if ((*node) == NULL) {
  14.195 +
  14.196 +  /* Allocate memory for a new node */
  14.197 +        (*node) = malloc(sizeof(struct node));
  14.198 +        if ((*node) == NULL)
  14.199 +            memory_error();
  14.200 +
  14.201 +  /* Initialize the new node */
  14.202 +        (*node)->left = NULL;
  14.203 +        (*node)->right = NULL;
  14.204 +        (*node)->word = save_string(word);
  14.205 +  return;
  14.206 +    }
  14.207 +    /* Check to see where the word goes */
  14.208 +    result = strcmp((*node)->word, word);
  14.209 +
  14.210 +    /* The current node already contains the word, no entry necessary */
  14.211 +    if (result == 0)
  14.212 +        return;
  14.213 +
  14.214 +    /* The word must be entered in the left or right sub-tree */
  14.215 +    if (result < 0)
  14.216 +        enter(&(*node)->right, word);
  14.217 +    else
  14.218 +        enter(&(*node)->left, word);
  14.219 +}
  14.220 +/*
  14.221 + * scan -- scan the file for words                      *
  14.222 + *                                                      *
  14.223 + * Parameters                                           *
  14.224 + *      name -- name of the file to scan                *
  14.225 + */
  14.226 +void scan(char *name)
  14.227 +{
  14.228 +    char word[100];     /* word we are working on */
  14.229 +    int  index;         /* index into the word */
  14.230 +    int  ch;            /* current character */
  14.231 +    FILE *in_file;      /* input file */
  14.232 +
  14.233 +    in_file = fopen(name, "r");
  14.234 +    if (in_file == NULL) {
  14.235 +        fprintf(stderr, "Error:Unable to open %s\n", name);
  14.236 +        exit(8);
  14.237 +    }
  14.238 +    while (1) {
  14.239 +        /* scan past the whitespace */
  14.240 +        while (1) {
  14.241 +            ch = fgetc(in_file);
  14.242 +
  14.243 +            if (isalpha(ch) || (ch == EOF))
  14.244 +                break;
  14.245 +        }
  14.246 +
  14.247 +        if (ch == EOF)
  14.248 +            break;
  14.249 +
  14.250 +        word[0] = ch;
  14.251 +        for (index = 1; index < sizeof(word); ++index) {
  14.252 +            ch = fgetc(in_file);
  14.253 +            if (!isalpha(ch))
  14.254 +                break;
  14.255 +            word[index] = ch;
  14.256 +        }
  14.257 +        /* put a null on the end */
  14.258 +        word[index] = '\0';
  14.259 +
  14.260 +        enter(&root, word);
  14.261 +    }
  14.262 +    fclose(in_file);
  14.263 +}
  14.264 +/*
  14.265 + * print_tree -- print out the words in a tree          *
  14.266 + *                                                      *
  14.267 + * Parameters                                           *
  14.268 + *      top -- the root of the tree to print            *
  14.269 + */
  14.270 +void print_tree(struct node *top)
  14.271 +{
  14.272 +    if (top == NULL)
  14.273 +        return;                 /* short tree */
  14.274 +
  14.275 +    print_tree(top->left);
  14.276 +    printf("%s\n", top->word);
  14.277 +    print_tree(top->right);
  14.278 +}
  14.279 +
  14.280 +int main(int argc, char *argv[])
  14.281 +{
  14.282 +    if (argc != 2) {
  14.283 +        fprintf(stderr, "Error:Wrong number of parameters\n");
  14.284 +        fprintf(stderr, "      on the command line\n");
  14.285 +        fprintf(stderr, "Usage is:\n");
  14.286 +        fprintf(stderr, "    words 'file'\n");
  14.287 +        exit(8);
  14.288 +    }
  14.289 +    scan(argv[1]);
  14.290 +    print_tree(root);
  14.291 +    return (0);
  14.292 +}
  14.293 +
  14.294 +
  14.295 +
  14.296 +
  14.297 +//==================  Get line demo =========================
  14.298 +#include <stdio.h>
  14.299 +#include <stdlib.h>
  14.300 +#include <string.h>
  14.301 +
  14.302 +#define _GNU_SOURCE
  14.303 +
  14.304 +int main(int argc, char* argv[]) {
  14.305 +
  14.306 +size_t lsize = 0;
  14.307 +ssize_t read;
  14.308 +FILE* conf_file;
  14.309 +char* line = NULL;
  14.310 +
  14.311 +if (argc == 1) {
  14.312 +printf("\nCommand syntax:\n");
  14.313 +printf("\n\tprogramd [ start | stop ]\n");
  14.314 +printf("\t\tstart: start daemon\n");
  14.315 +printf("\t\tstop: stop daemon\n");
  14.316 +}
  14.317 +else if (strcmp(argv[1], "start") == 0) {
  14.318 +conf_file = fopen("/etc/program/program.conf", "r");
  14.319 +fseek(conf_file, 0, SEEK_SET);
  14.320 +while (!feof(conf_file)) {
  14.321 +while (getline(&line, &lsize, conf_file) != -1) {
  14.322 +printf("%s", line);
  14.323 +}
  14.324 +}
  14.325 +}
  14.326 +
  14.327 +//======================
  14.328 +fopen
  14.329 +fread
  14.330 +fscanf
  14.331 +getline
  14.332 +
  14.333 +
  14.334 +//================== printf demo ============================
  14.335 +/* Q: need this for the GCC atomic operations?
  14.336 + */
  14.337 +#define _GNU_SOURCE
  14.338 +#include <stdio.h>
  14.339 +
  14.340 +void main (void) {
  14.341 +
  14.342 +  /* We will use one floating-point and one integer variable. */
  14.343 +
  14.344 +  double x = .00000123456789;
  14.345 +  int n = 12345;
  14.346 +
  14.347 +
  14.348 +  /* Display plain text. */
  14.349 +
  14.350 +  printf("This is a test\n");
  14.351 +  printf("This\tis\nanother\ttest\n\n");
  14.352 +
  14.353 +
  14.354 +  /* Display an integer. */
  14.355 +
  14.356 +  printf("Here is n: %d\n\n", n);
  14.357 +
  14.358 +
  14.359 +  /* Display a double three different ways. */
  14.360 +
  14.361 +  printf("Here is x: %g\n", x);
  14.362 +  printf("Here is x: %f\n", x);
  14.363 +  printf("Here is x: %e\n\n", x);
  14.364 +
  14.365 +
  14.366 +  /* Display two numbers. */
  14.367 +
  14.368 +  printf("Here are n (%d) and x (%g)\n", n, x);
  14.369 +
  14.370 +}
  14.371 +
  14.372 +//===========================================================
  14.373 +
  14.374 +void read_One_MB(int f, int MB_y, int MB_x, MB_Info *curMB, FILE *inputFH);
  14.375 +
  14.376 +//=================================================================
  14.377 +       { dataFile    = new File( fileName );
  14.378 +         paramScanner = new Scanner( dataFile  );
  14.379 +       }
  14.380 +      catch( Exception e )
  14.381 +       { dataFile    = null;
  14.382 +         paramScanner = null;
  14.383 +         System.err.println( "couldn't open file: " + fileName );
  14.384 +       }
  14.385 +
  14.386 +      paramScanner.useDelimiter(",\\s*|\n|\r\n");
  14.387 +
  14.388 +   MatrixInRowMajor ( int _numRows, int _numCols )
  14.389 +    {
  14.390 +      numRows = _numRows;  // lives in super class
  14.391 +      numCols = _numCols;
  14.392 +
  14.393 +      rows = new float[numRows][];
  14.394 +
  14.395 +      for( int i = 0; i < numRows; i++ )
  14.396 +       {
  14.397 +         rows[ i ] = new float[ numCols ];
  14.398 +       }
  14.399 +    }
  14.400 +  public void fillSelfFromFile( String fileName )
  14.401 +    { float floatValue = 0;
  14.402 +      String floatString;
  14.403 +
  14.404 +      super.setUpScanner( fileName );
  14.405 +
  14.406 +      for( int r = 0; r < numRows; r += 1 )
  14.407 +       { for( int c = 0; c < numCols; c += 1 )
  14.408 +          { floatString = paramScanner.next();
  14.409 +            floatValue = Float.parseFloat( floatString );
  14.410 +            rows[ r ][ c ] = floatValue;
  14.411 +          }
  14.412 +       }
  14.413 +    }
  14.414 +
  14.415 +//================================================================
  14.416 +
  14.417 +
  14.418 +/* Get the data strucs implicitly from the header file
  14.419 + */
  14.420 +void read_All_Frames( FILE *inputFH )
  14.421 + { int MB_x, MB_y, f;
  14.422 +   
  14.423 +   for( f = 1; f <= numFrames; f++ ) //allocated 10 frames of mem, use 8
  14.424 +    {    //PixInFrame, PixInLine, MBsInFrame, frameWidthInMB, etc in header
  14.425 +      uint8_t * startOfFrame_L  = &(input_img_y[f][0]);
  14.426 +      uint8_t * startOfFrame_CR = &(input_img_cr[f][0]);
  14.427 +      uint8_t * startOfFrame_CB = &(input_img_cb[f][0]);
  14.428 +      
  14.429 +      // Reads in one frame
  14.430 +      for( MB_y = 0; MB_y < frameHeightInMB; MB_y++ )
  14.431 +       {
  14.432 +         for( MB_x = 0; MB_x < frameWidthInMB; MB_x++ )
  14.433 +          {    //DEBUG: addr arith checks out (size of MB_Info is 240)
  14.434 +            MB_Info *MBInfo = &(input_MBs[f][0]) +
  14.435 +                                              (MB_y * frameWidthInMB + MB_x);
  14.436 +            // Read Macroblock Parameters and pixel data
  14.437 +            read_One_MB( f, MB_y, MB_x, MBInfo, inputFH );
  14.438 +            
  14.439 +            MBInfo->PixInLine_L  = oPixInLine_L;
  14.440 +            MBInfo->PixInLine_C  = oPixInLine_C;
  14.441 +            
  14.442 +            int offsetToMBsPix_L = 
  14.443 +                       MB_y * (oPixInLine_L * MBHeight_L) + MB_x * MBWidth_L;
  14.444 +            int offsetToMBsPix_C = 
  14.445 +                       MB_y * (oPixInLine_C * MBHeight_C) + MB_x * MBWidth_C;
  14.446 +               //DEBUG: addr arith checks out
  14.447 +            MBInfo->startOfMBsPix_L  = startOfFrame_L  + offsetToMBsPix_L;
  14.448 +            MBInfo->startOfMBsPix_CR = startOfFrame_CR + offsetToMBsPix_C;
  14.449 +            MBInfo->startOfMBsPix_CB = startOfFrame_CB + offsetToMBsPix_C;
  14.450 +          }
  14.451 +       }
  14.452 +    }
  14.453 + }
  14.454 +
  14.455 +/* Reads the parameters of macro block, then reads pixel data of MB.
  14.456 + * Give it frame num, y and x of macro block.  It gets addresses of the
  14.457 + *  arrays by including the header.
  14.458 + */
  14.459 +void read_One_MB(int f, int MB_y, int MB_x, MB_Info *curMB, FILE *inputFH)
  14.460 + {
  14.461 +   int dir, line, i, tmp, x, y, pixelIndex;
  14.462 +   char strTemp[90];
  14.463 +   MB_Info_throwAway throwAway;
  14.464 +   
  14.465 +   fscanf(inputFH, "%s",&(strTemp[0]));  //get rid of unused preamble string
  14.466 +   
  14.467 +   //(*curMB).MB_x = MB_x;
  14.468 +   //(*curMB).MB_y = MB_y;
  14.469 +   
  14.470 +   
  14.471 +   
  14.472 +   /************  Read parameters ******************/
  14.473 +   
  14.474 +      // first, get rid of unused data in the input stream.
  14.475 +   fscanf(inputFH, "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d",
  14.476 +         &(throwAway).MB_X,
  14.477 +         &(throwAway).MB_Y,
  14.478 +         &(throwAway).mb_stride,
  14.479 +         &(throwAway).deblocking_filter,
  14.480 +         &(throwAway).picture_structure,
  14.481 +         &(throwAway).slice_alpha_c0_offset,
  14.482 +         &(throwAway).slice_type,
  14.483 +         &(throwAway).chroma_qp_index_offset[0],
  14.484 +         &(throwAway).chroma_qp_index_offset[1],
  14.485 +         &(throwAway).mb_xy_type,
  14.486 +         &(throwAway).mb_xy_type_m1,
  14.487 +         &(throwAway).mb_xy_type_top,
  14.488 +         &(throwAway).qscale_mb_xy,
  14.489 +         &(throwAway).qscale_mb_xy_m1,
  14.490 +         &(throwAway).qscale_mb_xy_top,
  14.491 +         &(throwAway).slice_table_mb_xy,
  14.492 +         &(throwAway).slice_table_mb_xy_m1,
  14.493 +         &(throwAway).slice_table_mb_xy_top
  14.494 +        );
  14.495 +   
  14.496 +   for(dir = 0; dir < 2; dir++)
  14.497 +      for(line=0; line < 5*8; line++)
  14.498 +         fscanf(inputFH, "%d ", &(throwAway).ref_cache[dir][line]);
  14.499 +   
  14.500 +   for(dir = 0; dir < 2; dir++)
  14.501 +      for(line=0; line < 5*8; line++)
  14.502 +         fscanf(inputFH, "%d %d ", &(throwAway).mv_cache[dir][line][0],
  14.503 +                                   &(throwAway).mv_cache[dir][line][1]);
  14.504 +   
  14.505 +   for(line=0; line < 6*8; line++)
  14.506 +      fscanf(inputFH, "%d ", &(throwAway).non_zero_count_cache[line]);
  14.507 +   
  14.508 +   
  14.509 +      //now, get data will use in deblocking
  14.510 +   
  14.511 +   fscanf(inputFH, "%i %i", &(*curMB).endSubBlk[0], &(*curMB).endSubBlk[1]);
  14.512 +   fscanf(inputFH, "%d",&(*curMB).startSubBlk[0]);
  14.513 +   fscanf(inputFH, "%d",&(*curMB).startSubBlk[1]);
  14.514 +   
  14.515 +   //read bS
  14.516 +   for(dir = 0; dir < 2; dir++)
  14.517 +      for(line=0; line < 4; line++)
  14.518 +         for (i=0; i < 4; i++)
  14.519 +          {
  14.520 +            fscanf(inputFH, "%d",&tmp);
  14.521 +            (*curMB).bS[dir][line][i] = (int) tmp;
  14.522 +          }
  14.523 +   
  14.524 +   //read luma_qp
  14.525 +   for(dir = 0; dir < 2; dir++)
  14.526 +      for(line=0; line < 4; line++)
  14.527 +         fscanf(inputFH, "%d",&(*curMB).luma_qp[dir][line]);
  14.528 +   
  14.529 +   //read chroma_qp
  14.530 +   for(dir = 0; dir < 2; dir++)
  14.531 +      for(line=0; line < 4; line++)
  14.532 +         fscanf(inputFH, "%d",&(*curMB).chroma_qp[dir][line]);
  14.533 +    
  14.534 +    
  14.535 +    
  14.536 +   /********* Have MB params, now read pixel data of MB *************/
  14.537 +   
  14.538 +      /* The MB pixel data is read in one MB at a time.  All pixel data
  14.539 +       *  goes into a single array.  The pixel data is layed out in the array
  14.540 +       *  the same as the pixels appear on the screen, in screen-row major
  14.541 +       *  order.  So, all the pixels in the 0th line at the top of the frame
  14.542 +       *  are next to each other, starting at the beginning of the array.
  14.543 +       *  Then, the second line begins at arrayAddr + frame_width_in_pixels,
  14.544 +       *  and so on.
  14.545 +       * Get the pixels for one MB, so have to map the MB location onto the
  14.546 +       *  array location.
  14.547 +       * The position of the MB's 0,0 pixel is offset by the data of all  the
  14.548 +       *  MBs in MB-lines above, and by all the pixels in MBs to the left.
  14.549 +       * So, the number of pixels in a MB-line is the number of lines in a MB
  14.550 +       *  times the number of pixels in a frame-line.  Multiply that by the
  14.551 +       *  number of MB-lines above the current MB.
  14.552 +       * Next, add the offset within the current MB-line, which is the
  14.553 +       *  number of MBs to the left times the width, in pixels, of one MB.
  14.554 +       * Then, to get the offset of a particular pixel in the MB from the
  14.555 +       *  start of the MB, take the number of lines in the MB above the
  14.556 +       *  current pixel times the pixels-per-frame-line, then add the number
  14.557 +       *  of pixels to the left within the MB.
  14.558 +       *
  14.559 +       *(MB_y * MB_height * Frm_width + MB_x * MB_width) + (y * Frm_width + x)
  14.560 +       */
  14.561 +      
  14.562 +      /*  read the pre-deblocking MB pixels, then the correct final pixels.
  14.563 +       * Start of a macro block is num MB rows above * pixel lines in height
  14.564 +       *  of a MB * pixels in a line of frame, plus the numMB to left * pixel
  14.565 +       *  width of a MB.. MB_y and MB_x start at 0, so they are num above
  14.566 +       *  and num to left, repectively.
  14.567 +       * Many of the width and height values are defined in header.*/
  14.568 +      //DEBUG: addr arith checks out
  14.569 +   int offsetToMBsPix_L = MB_y * (oPixInLine_L * MBHeight_L) + MB_x * MBWidth_L;
  14.570 +   for(y=0; y < MBHeight_L; y++)
  14.571 +    { // first read all input Y, then all correct output Y
  14.572 +      int offsetToLineInMB_L = offsetToMBsPix_L + y * oPixInLine_L;
  14.573 +      for(x=0; x < MBWidth_L; x++)
  14.574 +       { pixelIndex = offsetToLineInMB_L + x;
  14.575 +         fscanf(inputFH, "%i",&tmp);
  14.576 +         *(&input_img_y[f][0] + pixelIndex) = (char) tmp;
  14.577 +       }
  14.578 +      for(x=0; x < MBWidth_L; x++)
  14.579 +       { pixelIndex = offsetToLineInMB_L + x;
  14.580 +         fscanf(inputFH, "%i",&tmp);
  14.581 +         *(&correct_img_y[f][0] + pixelIndex) = (char) tmp;
  14.582 +       }
  14.583 +    }
  14.584 +   
  14.585 +      //read croma b and r.. in all rest of code, goes "R before B" but here,
  14.586 +      // input stream has the blue before the red..  FYI
  14.587 +      //DEBUG: addr arith appears to check out (assuming have right model)
  14.588 +   int offsetToMBsPix_C = MB_y * (oPixInLine_C * MBHeight_C) + MB_x * MBWidth_C;
  14.589 +   for (y=0; y < MBHeight_C; y++)
  14.590 +    { // first read all input CB & CR, then all correct output CB & CR
  14.591 +      int offsetToLineInMB_C = offsetToMBsPix_C + y * oPixInLine_C;
  14.592 +      for(x=0; x < MBWidth_C; x++)
  14.593 +       { int pixelIndex_C = offsetToLineInMB_C + x;
  14.594 +         fscanf(inputFH, "%d",&tmp);
  14.595 +         *(&input_img_cb[f][0] + pixelIndex_C) = (char) tmp;
  14.596 +         fscanf(inputFH, "%d",&tmp);
  14.597 +         *(&input_img_cr[f][0] + pixelIndex_C) = (char) tmp;
  14.598 +       }
  14.599 +      for(x=0; x < MBWidth_C; x++)
  14.600 +       { int pixelIndex_C = offsetToLineInMB_C + x;
  14.601 +         fscanf(inputFH, "%d",&tmp);
  14.602 +         *(&correct_img_cb[f][0] + pixelIndex_C) = (char) tmp;
  14.603 +         fscanf(inputFH, "%d",&tmp);
  14.604 +         *(&correct_img_cr[f][0] + pixelIndex_C) = (char) tmp;
  14.605 +       }
  14.606 +    }
  14.607 + }

    15.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.2 +++ b/main.c	Sun Aug 26 03:04:50 2012 -0700
    15.3 @@ -0,0 +1,44 @@
    15.4 +/*
    15.5 + *  Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org
    15.6 + *  Licensed under GNU General Public License version 2
    15.7 + *
    15.8 + * author seanhalle@yahoo.com
    15.9 + */
   15.10 +
   15.11 +
   15.12 +#include <stdio.h>
   15.13 +#include <time.h>
   15.14 +#include <math.h>
   15.15 +#include <float.h>
   15.16 +#include <limits.h>
   15.17 +#include <sys/time.h>
   15.18 +#include <malloc.h>
   15.19 +
   15.20 +#include "BLIS_CONSTANTS.h"
   15.21 +#include "../BLIS/BLIS.h"
   15.22 +#include "../BLIS/DKU/DKU_common/DKU.h"
   15.23 +
   15.24 +#include "Matrix_Mult.h"
   15.25 +
   15.26 +/**
   15.27 + * This is the DKU version of  Matrix Multiply sample application
   15.28 + * 
   15.29 + */
   15.30 +int main( int argc, char **argv )
   15.31 + { Matrix          *leftMatrix, *rightMatrix;
   15.32 +   ParamBag *paramBag;
   15.33 +
   15.34 +   paramBag = makeParamBag();
   15.35 +   readParamFileIntoBag( argv[1], paramBag );
   15.36 +   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
   15.37 +   
   15.38 +   resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix );
   15.39 +
   15.40 +   printf("\nresult matrix: \n");
   15.41 +   printMatrix( resultMatrix );
   15.42 +   
   15.43 +//   BLIS_DKU__print_Stats_forInst( DKU_INST_MM );
   15.44 +   
   15.45 +   exit(0); //cleans up
   15.46 + }
   15.47 +