# HG changeset patch
# User Sean Halle <seanhalle@yahoo.com>
# Date 1345975490 25200
# Node ID d138e0acf9a06d2989ff63a9481829e1329eb60b

Initial add of standard DKU matrix mult code -- to be modified

diff -r 000000000000 -r d138e0acf9a0 BLIS_CONSTANTS.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BLIS_CONSTANTS.h	Sun Aug 26 03:04:50 2012 -0700
@@ -0,0 +1,20 @@
+/* 
+ * File:   BLIS_CONSTANTS.h
+ * Author: SeanHalle@yahoo.com
+ *
+ * Created on October 27, 2009, 6:19 AM
+ */
+
+#ifndef _BLIS_CONSTANTS_H
+#define _BLIS_CONSTANTS_H
+
+   //DKU Instance ID enum.  Must start at 1.
+   //The directory, header, and init file for each instance of the DKU
+   // pattern is named the same as the enum.
+enum
+ { DKU_INST_MM = 1
+ };
+
+
+#endif	/* _BLIS_CONSTANTS_H */
+
diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/Bundling_Quad.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/DKU_INST_MM/Bundling_Quad.c	Sun Aug 26 03:04:50 2012 -0700
@@ -0,0 +1,349 @@
+/*
+ *  Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ *
+ * Author: SeanHalle@yahoo.com
+ *
+ */
+
+#include "malloc.h"
+#include "DKU_INST_MM.h"
+#include "../../BLIS/DKU/DKU_common/DKU.h"
+
+//Positions in the bundle
+enum
+ { szPos = 0,
+   numLRPos,
+   numLCPos,
+   numRRPos,
+   numRCPos,
+   LBMatrixPos,
+   numPos = LBMatrixPos
+ };
+
+//============================ Bundling Quad ===============================
+
+/* This is the set of four bundling functions:
+ *
+ * bundleInputs     -- takes a DKU piece and returns an array of data that
+ *                     contains all the information the Kernel will need to
+ *                     process that DKU piece
+ *
+ * unbundleInputs   -- takes the array returned by bundleInputs and turns it
+ *                     into a DKU piece that can be given to the Kernel
+ *
+ * bundleResults    -- takes a DKU piece that has finished going through the
+ *                     Kernel and places all result information into an array
+ *
+ * unbundleResults  -- takes the output from bundleResults plus the original
+ *                     DKU piece whose inputs were bundled to produce the
+ *                     results, and modifies the state of the original DKU
+ *                     piece to be as if it had gone through the kernel.
+ *
+ * The bundling quad ("quad" because there are four bundling functions).. the
+ *  bundling quad is only used for distributed memory hardware.  When used,
+ *  they can be thought of as operating in two separate memories.
+ * The bundleInputs and unbundleResults operate in one memory, where the full
+ *  original data structure is.
+ * The unbundleInputs and bundleResults operate in the second memory, where
+ *  the data for one piece gets sent to.
+ *
+ *
+ * Call sequence:
+ *
+ * call bundlInputs in memory space of original data, giving it a DKU piece.
+ * It returns a pointer to a byte array.  (First int32 in array is its size)
+ * The byte array is sent to remote memory to be processed.
+ * In the remote memory, the byte array is received.
+ * The unbundleInputs function is called on it, which creates a new DKUPiece
+ *  in the heap of the remote memory, and creates on the heap of the remote
+ *  memory any data structures embedded within the DKUPiece.
+ * The unbundleInputs function may re-use portions of the byte array it is
+ *  given, so the run-time in the remote memory must perform buffer
+ *  management for its communications appropriately.
+ * The byte array that the unbundleInputs function is given must have been
+ *  allocated on the heap within the same malloc state as the unbundleInputs
+ *  and bundleResults functions are linked to.
+ * The returned DKUPiece is given to the Kernel function to process.
+ * The DKUPiece is then given to the bundleResults function, which returns
+ *  a pointer to a byte array.  The unbundleResults may simply return the
+ *  pointer to the same received inputBundle, so the communication and memory
+ *  management in the remote memory must behave accordingly.  The
+ *  bundleResults function will free any memory allocated on the heap by the
+ *  unbundleInputs function.  The only memory remaining on the heap when the
+ *  bundleResults function completes is the byte array returned by it.  This
+ *  is the reason that the original inputBundle given to unbundleInputs must
+ *  be allocated on the same heap that bundleResults frees from:
+ *  bundleResults might call free on that inputBundle, so that call must
+ *  modify the correct malloc state.
+ * The remote memory sends the resultBundle to the original memory, then
+ *  frees the resultBundle.
+ * The original memory pairs the result bundle with the DKUPiece that the
+ *  corresponding inputBundle was made from.
+ * The original memory calls unbundleResults, giving it the resultsBundle
+ *  and the corresponding DKUPiece that remained in the original memory.
+ * The unbundleResults modifies the original memory such that it is identical
+ *  to the state it would be in if the Kernel were called on the DKUPiece
+ *  in the original memory.
+ * The original memory is responsible for freeing the inputBundle that was
+ *  made by bundleInputs, but unbundleResults will free the resultsBundle
+ *  when it is done with it.  This allows unbundleResults to simply assign
+ *  pointers to portions of the resultsBundle, rather than copying, if that
+ *  is appropriate for the Kernel.
+ */
+//===========================================================================
+
+/* Layout:
+ * sizeOfBundle
+ * numLeftRows
+ * numLeftCols
+ * numRightRows
+ * numRightCols
+ * <left matrix data>
+ * <right matrix data>
+ *
+ * calculate the sizes from the numbers of rows and cols of each, use the
+ * size of the left matrix to calc start addr of right matrix..  in the
+ * remote memory, just set pointers to the matrix locations in the bundles
+ */
+//TODO: carefully step through bundling quad -- check sizes and addr on 9x9
+void* bundleInputs_MM( DKUPiece* piece )
+ { void *bundle;
+   int32  sizeOfLeftMatrix, sizeOfRightMatrix, sizeOfBundle;
+   float32 *leftBundleMatrix,*leftMatrix,*rightBundleMatrix,*rightMatrix;
+   float32 *leftBundleInsertPt, *rightBundleInsertPt;
+   float32 *leftMatrixReadPt, *rightMatrixReadPt;
+
+   MatrixProdPiece *prodPiece = (MatrixProdPiece *)piece->appSpecificPiece;
+   leftMatrix  = prodPiece->leftMatrix->matrix;
+   rightMatrix = prodPiece->rightMatrix->matrix;
+   
+   int32 leftStartRow  = prodPiece->leftStartRow;
+   int32 leftStartCol  = prodPiece->leftStartCol;
+   int32 rightStartRow = prodPiece->rightStartRow;
+   int32 rightStartCol = prodPiece->rightStartCol;
+   int32 numLeftRows, numLeftCols, numRightRows, numRightCols;
+   
+   numLeftRows  = prodPiece->leftEndRow  - leftStartRow  + 1;
+   numLeftCols  = prodPiece->leftEndCol  - leftStartCol  + 1;
+   numRightRows = prodPiece->rightEndRow - rightStartRow + 1;
+   numRightCols = prodPiece->rightEndCol - rightStartCol + 1;
+   
+   sizeOfLeftMatrix  = sizeof( float32 ) * numLeftRows  * numLeftCols;
+   sizeOfRightMatrix = sizeof( float32 ) * numRightRows * numRightCols;
+   sizeOfBundle = numPos * sizeof( int32 ) +
+                  sizeOfLeftMatrix + sizeOfRightMatrix;
+   
+   bundle = BLIS_DKU__makeInputBundle( sizeOfBundle );
+
+   *((int32 *)bundle + szPos)    = sizeOfBundle;
+   *((int32 *)bundle + numLRPos) = numLeftRows;
+   *((int32 *)bundle + numLCPos) = numLeftCols;
+   *((int32 *)bundle + numRRPos) = numRightRows;
+   *((int32 *)bundle + numRCPos) = numRightCols;
+
+   //NOTE: Don't need to know start and end.. they will be set in remote
+   // memory according to the size (number) alone
+  
+   leftBundleMatrix  = (float32 *)  ((int32 *)bundle + LBMatrixPos);
+   rightBundleMatrix = leftBundleMatrix + sizeOfLeftMatrix/sizeof(float32);
+   
+   int32 r, c, numColsInLeftMatrix, numColsInRightMatrix;
+   numColsInLeftMatrix  = prodPiece->leftMatrix->numCols;
+   leftBundleInsertPt = leftBundleMatrix;
+   for( r = 0; r < numLeftRows; r++ )
+    { leftMatrixReadPt = leftMatrix + 
+                        (leftStartRow + r) * numColsInLeftMatrix +
+                         leftStartCol; //these are counts, compiler does *4
+      for( c = 0; c < numLeftCols; c++ )
+       { 
+         *(leftBundleInsertPt++) = *(leftMatrixReadPt++);
+       }
+    }
+   
+   // Have to do separate loops for left and right because may be diff shapes
+   numColsInRightMatrix = prodPiece->rightMatrix->numCols;
+   rightBundleInsertPt = rightBundleMatrix;
+   for( r = 0; r < numRightRows; r++ )
+    { rightMatrixReadPt = rightMatrix + 
+                         (rightStartRow + r) * numColsInRightMatrix +
+                          rightStartCol;
+      for( c = 0; c < numRightCols; c++ )
+       {
+         *(rightBundleInsertPt++) = *(rightMatrixReadPt++);
+       }
+    }
+
+   return bundle;
+ }
+
+/*Leave all the data in bundle, just assign pointers to it.
+ * Create a DKUPiece data structure, then fill in the sizes and pointers.
+ *
+ *This is app code, but need to make it easy for specialization.
+ *On machines like the Cell, the code for this function will be copied
+ * over to a separate file, along with any other DKU functions needed in
+ * the remote memory.
+ *
+ *The scheduler in remote memory is responsible for making space for the
+ * input bundle, and for freeing it (if needed) after the result bundle has
+ * been sent back.
+ *
+ *Model is that use an override of malloc that puts everything malloc'd from
+ * unbundleInputs calls and from bundleResults call into a buffer in remote
+ * mem.  This entire buffer is freed after the return of the result bundle is
+ * complete.
+ */
+ void
+unbundleInputs_MM( void *bundle, DKUPiece *piece )
+ { int32 sizeOfBundle, numLeftRows, numLeftCols, numRightRows, numRightCols;
+   int32 sizeOfLeftMatrix, sizeOfRightMatrix, sizeOfResultMatrix;
+   float32 *leftBundleMatrix, *rightBundleMatrix;
+   MatrixProdPiece *prodPiece;
+   
+   sizeOfBundle = *((int32 *)bundle + szPos);
+   
+   numLeftRows  = *((int32 *)bundle + numLRPos);
+   numLeftCols  = *((int32 *)bundle + numLCPos);
+   numRightRows = *((int32 *)bundle + numRRPos);
+   numRightCols = *((int32 *)bundle + numRCPos);
+   
+      
+   sizeOfLeftMatrix   = sizeof( float32 ) * numLeftRows  * numLeftCols;
+   sizeOfRightMatrix  = sizeof( float32 ) * numRightRows * numRightCols;
+   sizeOfResultMatrix = sizeof( float32 ) * numLeftRows  * numRightCols;
+   
+   leftBundleMatrix  = (float32 *)  ((int32 *)bundle + LBMatrixPos);
+   rightBundleMatrix = leftBundleMatrix + sizeOfLeftMatrix/sizeof(float32);
+   
+//ARCH: check, for Cell, what's involved with re-defining malloc that appears
+// inside DKUPiece maker and app spec piece maker.. can make it buffer-alloc?
+   
+      //that indicate how much stuff is created automatically inside
+      //IE, does this make produce the sched data, in bundling quad?
+   prodPiece = DKU__makeMatrixProdPiece_Flat( piece );
+   piece->appSpecificPiece = prodPiece;
+   
+   prodPiece->leftMatrix   = 
+            DKU__makeMatrix_Flat( numLeftRows,  numLeftCols,  piece );
+   prodPiece->leftMatrix->matrix   = leftBundleMatrix;
+   
+   prodPiece->rightMatrix  =
+            DKU__makeMatrix_Flat( numRightRows, numRightCols, piece );
+   prodPiece->rightMatrix->matrix  = rightBundleMatrix;
+   
+   prodPiece->resultMatrix = 
+            DKU__makeMatrix_Flat( numLeftRows,  numRightCols, piece );
+      //The result matrix is malloc'd, so it's not inside the input bundle,
+      // so, to avoid copies when make the result bundle, make it now, then
+      // put into the DKUPiece the pos of result matrix in result bundle.
+   void *resultBundle =
+        BLIS_DKU__malloc_toPiece( sizeOfResultMatrix + sizeof(int32), piece);
+   *((int32 *)resultBundle) = sizeOfResultMatrix + sizeof(int32);
+      //skip over the "size" at the start of the result bundle
+   prodPiece->resultMatrix->matrix = (float32 *)((int32 *)resultBundle + 1);
+   
+      //now, fill in the iteration bounds so that the kernel processes
+      // the entirety of both matrices.
+   prodPiece->leftStartRow  = 0;
+   prodPiece->leftEndRow    = numLeftRows  - 1; // "- 1" 'cause start at 0
+   prodPiece->leftStartCol  = 0;
+   prodPiece->leftEndCol    = numLeftCols  - 1;
+   
+   prodPiece->rightStartRow = 0;
+   prodPiece->rightEndRow   = numRightRows - 1;
+   prodPiece->rightStartCol = 0;
+   prodPiece->rightEndCol   = numRightCols - 1;
+   
+   prodPiece->prodStartRow  = 0;
+   prodPiece->prodEndRow    = numLeftRows  - 1;
+   prodPiece->prodStartCol  = 0;
+   prodPiece->prodEndCol    = numRightCols - 1;
+ }
+
+/*
+ *Model is that use an override of malloc in remote mem that puts everything
+ * malloc'd from unbundleInputs calls and from bundleResults call into a
+ * buffer in remote mem.  The entire buffer is freed after the send of the
+ * return result bundle is complete.
+ *
+ *The application only has to know that it does not perform free on any of
+ * the inputBundles, nor on any of the resultBundles, in either local or
+ * remote memory.
+ *The application also must create new DKUPiece s in the bundling quad plus
+ * in the Kernel (and all calls rooted at the Kernel) by using
+ * BLIS_DKU__makeDKUPiece
+ *Finally, the application must create app-specific pieces
+ *So anything malloc'd inside bundleResults is still inside the same buffer
+ * used by unbundleInputs.
+ */
+//ARCH: what about just give unbundleInputs and bundleResults an  "align"
+//  operator that's HW-supplied.
+// First element of bundle is size, then "0" terminated list of offsets to
+// alignable-chunks, then the alignable chunks start.  Alignment happens
+// during bundling.  HW also supplies a checker to see if aligned bundle is
+// too big. (add a "revert divide" so can do a new divide to get smaller
+// pieces, or something.. )
+ void *
+bundleResults_MM( DKUPiece *piece, void *inputBundle )
+ { MatrixProdPiece *matProd;
+   float32 *matProdArr;
+   
+   matProd = (MatrixProdPiece *)piece->appSpecificPiece;
+   matProdArr = matProd->resultMatrix->matrix;
+   
+      //TODO: figure out soln for alignment or result matrix when it's inside
+      // input-bundle
+
+      //results bundle already made (inside unbundleInputs fn), resultsBundle
+      // addr is one int32 before addr of result matrix array.
+   void *resultsBundle = ((int8 *)matProdArr - sizeof(int32));
+   
+   return resultsBundle;
+ }
+
+/*  The DKU standard says that the scheduler guarantees that the same
+ *   DKUPiece that created an input bundle will be called to unbundle
+ *   the corresponding results bundle.
+ *  This means that the unbundleResults method is called on the original
+ *   piece, that still has the position within the result matrix where
+ *   this piece's results should go.
+ *  So, copy the values in the incoming result matrix to the correct
+ *   sub-block of the "original" result matrix
+ */
+ void
+unbundleResults_MM( void * resultBundle, DKUPiece *origPiece )
+ { float32 *bundMatrixArr, *resMatrixArr, *bundleReadPt, *resultInsertPt;
+   MatrixProdPiece *matProd;
+   Matrix *resultMatrix;
+   
+   int32 resMatNumRows, resMatNumCols;
+   int32 prodStartRow, prodEndRow, prodStartCol, prodEndCol, r, c;
+   
+   bundMatrixArr = (float32 *) ((int32 *)resultBundle + 1);
+   
+   matProd = (MatrixProdPiece *) origPiece->appSpecificPiece;
+   resultMatrix  = matProd->resultMatrix;
+   resMatrixArr  = resultMatrix->matrix;
+   
+   resMatNumRows = resultMatrix->numRows;
+   resMatNumCols = resultMatrix->numCols;
+   
+   prodStartRow  = matProd->prodStartRow;
+   prodEndRow    = matProd->prodEndRow;
+   prodStartCol  = matProd->prodStartCol;
+   prodEndCol    = matProd->prodEndCol;
+   
+      //copy the results from the matrix in the bundle to
+      // the full result matrix.
+   bundleReadPt = bundMatrixArr;
+   for( r = prodStartRow; r < prodEndRow; r++ )
+    { resultInsertPt = resMatrixArr + r * resMatNumCols + prodStartCol;
+      for( c = prodStartCol; c <= prodEndCol; c++ )
+       { *(resultInsertPt++) = *(bundleReadPt++);
+       }
+    }
+ }
+
+
diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/Communicators.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/DKU_INST_MM/Communicators.c	Sun Aug 26 03:04:50 2012 -0700
@@ -0,0 +1,435 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ */
+
+#include "DKU_INST_MM.h"
+
+/*Communicators are used by the scheduler to send data from one piece to
+ * another.  The kernel specifies when the communication is to take place
+ * by calling either a send communicator or a receive communicator.
+ *The communicators know what data to send and where to send it by reading
+ * information out of the DKUPiece structure.  The information about which
+ * other piece to send to and what data is placed into the piece by the
+ * DKUPieceMaker and the Divider.
+ *This is how dependencies among data pieces are encoded and how the
+ * scheduler is informed of them.
+ */
+
+/*In Deblocking, the dependency pattern is a 45 degree diagonal.  A given
+ * macro block receives information from the macro block above it and the
+ * macro block to its left.
+ *This must be told to the scheduler, so that it can order the execution of
+ * pieces appropriately.
+ * application programmer has chosen to divide each
+ * screen frame into diagonals
+ */
+
+/*Okay, so going with the plan that match the scheduler form to the
+ * application form.  Expose explicitly that different applications have
+ * characteristics that can be taken advantage of by scheduler impl to make
+ * more efficient scheduler.
+ *So, take advantage of flexibility of BLIS interface philosophy..  make an
+ * interface that is tuned to particular characteristic of application to
+ * allow efficient scheduler impl that takes advantage.
+ *In this case, it is fact that have a pattern of dependencies.  Make the
+ * dependencies first-class entities in the interface.  Make a primitive for
+ * expressing the dependencies, and one for embodying the dependencies.
+ *
+ *The embodiment will be an array.  The DKUPieceMaker will create the array
+ * and populate it for the initial pieces it makes..  something like that..
+ * The DKUPieceMaker also returns a pointer to the initially-free piece.
+ * (Note that there's only ever one initially free piece.  If there were two
+ *  then they wouldn't depend on each other, and so could be combined into
+ *  a single piece..  or?  In some cases would require separate
+ *  "combination" data-struc..  so, maybe an array of initially free)
+ *Each division places an array in the parent, with one position for each
+ * sub-piece.  The position holds the count, of the number of propendents,
+ * for the sub-piece in the corresponding position in the sub-piece array.
+ *
+ *The fellow sub-pieces of a parent typically won't have any dependencies
+ * on each other..  the dependencies are to sub-pieces of other parents.
+ *So the DKUPieceMaker will create the largest pieces that can be sub-divided
+ * freely, and also create an array with the propendent count for each of
+ * those pieces.
+ *The undivider is where propendent counts are updated.  The Undivider is
+ * what frees DKUPieces to be scheduled.  That process is inside the
+ * scheduler implementation.
+ *The divider is written to know about the arrays of propendent counts of the
+ * other DKUPieces.  So is the Undivider.  The divider looks at the arrays
+ * that already exist and populates the new array for the currently being
+ * divided parent accordingly.  It also puts in information for the Undivider
+ * to properly update the counts in the arrays when a piece finishes.
+ *Big question: can Albert perform a static analysis that can understand
+ * general code in the divider and undivider, given that it knows explicitly
+ * what the arrays are used for, and just has to look at the code that
+ * creates, populates, and updates the arrays..  The idea being that the
+ * polyhedral model can learn the dependencies this way.
+ *
+ *Each dependency has an associated communicator.  A communicator is what
+ * performs transport of the propendent-generated-state to the receiving
+ * dependent.  A communicator and a dependency arrow are the same thing.
+ *In shared memory, the communicator is normally not used.  Only in distr
+ * memory is it invoked to bundle up the propendent data and carry it to the
+ * dependent.
+ *The Undivider tells the scheduler each time a completed propendent has
+ * updated the count of a given dependent.  This will trigger the
+ * scheduler to fire the communicator, if it has been implemented that way.
+ *This lets the scheduler, for example, send data around before the 
+ * propendent data is available.  So, it gets better overlap of communication
+ * and computation.
+ *
+ *So, this scheme so far covers the case of "2D" parallelism in Deblocking.
+ * But it doesn't yet cover simulation where the communication happens in
+ * the middle of a loop nest.
+ *For that, still have the same communicator, which is still one-to-one with
+ * a dependency in the data.
+ *But now, the Kernel invokes the communicator, when the propendent has
+ * finished producing data..  By only calling in the propendent, causality is
+ * always enforced, with no extra mechanism required in the scheduler.
+ *The Kernel also calls the communicator in the dependent, in the position
+ * that data must be received before continuing on.
+ *This way, the scheduler is free to implement the timing of communication
+ * in many different ways.  The communicator explicitly copies data to a
+ * separate communication-area that is made in the propendent DKUPiece
+ * by the Divider or DKUPieceMaker during creation.
+ *It returns when the copy is done, handing the scheduler a pointer to the
+ * data.  The scheduler then handles moving the data to the dependent piece
+ * (or not, on shared memory).  The receiving end of the communicator
+ * accepts a pointer to the data.  The Kernel is written to access data
+ * through that pointer.
+ *
+ *Wanting to allow shared memory to NOT perform the copy, just pass a pointer
+ * to the area of data that the dependent needs, and have the Kernel access
+ * the data in the appropriate way (according to data in the DKUPiece pointed
+ * to)..  thing is, still need something to perform the copy in the distr.
+ * memory case.
+ *So, seeing two different sending communicators, and two different receiving
+ * communicators.  For distr mem, the sender does a copy into area in
+ * sender's DKUPiece, then returns pointer to that copy (it's a bundle, with
+ * first location being a uint32 with size of bundle)..  or maybe have a
+ * pre-defined element in the DKUPiece.  The receiver is implemented as part
+ * of the scheduler..  it returns a pointer to the Dependent calling Kernel.
+ *Shared mem communicators, the sender is implemented by the scheduler, as
+ * well as the receiver.  The propendent Kernel simply passes the DKUPiece
+ * to the sender.
+ *
+ *Right.. so, three out of the four are implemented by the scheduler.. so
+ * just make the fourth be an optional do-hickie.  The scheduler implements
+ * both the send and receive calls, and the application provides a
+ * communication bundler.  : D
+ *The receiver still has two different cases: in one it gets a bundle, in
+ * the other it gets a pointer to a DKUPiece.  No point in unbundling, just
+ * so the Kernel can do the gather operation again.  So, the Kernel will have
+ * to have a different version, one for shared, second for distr.. yuck.
+ *
+ *Q: starting to have different "kinds" of DKUPiece now..  how going to
+ * handle backwards-and-forwards compatibility?
+ *
+ *For shared mem,
+ */
+
+void * bundleComm_BarnesHut_110( DKUPiece piece )
+ {
+   //this copies data out of the piece for the "110" going direction
+   //All communication bundlers have the same prototype: void * foo(DKUPiece)
+   //The bundlers are registered with the scheduler for this DKU instance
+   // via the BLIS_DKU__set_commBundler_ForID( &foo, commTypeID, DKU_Inst_ID)
+   //Which two pieces communicate is set by the propendent and dependent
+   // calls to communicate, which happen in the Kernel.  The PieceMaker and
+   // Divider know which pieces communicate to which at what points in the
+   // Kernel.  So, they insert into the DKUPiece data structure the ID of
+   // the piece they send to/receive from.  When the Kernel reaches the 
+   // communication point, it takes the propendent/dependent pieceID from 
+   // the DKUPiece and passes that to the communicate() call (along with the
+   // commTypeID, which is fixed for each call that appears in the Kernel).
+   //In other words, the Kernel, Divider, and PieceMaker are the only ones
+   // that agree among themselves on what a particular commTypeID means.  For
+   // example, for graphs, there is only one type ID, because the process of
+   // copying is the same no matter which direction..  but for a mesh, the
+   // copy is different for the top, bottom, and two edges..
+
+   /*There is the basic issue of exposing in the Kernel code whether running
+    * on shared memory or distr memory.
+    *Could either have a fixed Kernel that adapts, or have two Kernels that
+    * are chosen among by the scheduler.
+    *If the Kernel is fixed, have only a few choices:
+    * Kernel always thinks it's taking data from a DKUPiece, or
+    * Kernel always takes data via a fixed interface-call.
+    *Not sure how to hide if Kernel always takes from a DKUPiece.. and extra
+    * work in remote memory to re-create DKUPiece structure, for only reason
+    * to make interface nice (for programmer).
+    *With the fixed interface, could provide two adaptors: one for shared
+    * mem that gathers from a DKUPiece, other for a bundle.  Then scheduler
+    * picks, behind the interface, which adaptor.
+    *How would that look to the Kernel?  Something like "update", and the
+    * local memory changes..  but that only works if there's a fixed data-
+    * struc in local mem to take from as the Kernel computation progresses.
+    *IE, have an outer loop that the communication is in, and inner loops
+    * that use the result of communication.  For shared mem, want that data-
+    * struc to be the appSpecificPiece of another DKUPiece, and locations in
+    * it will be read (or written) inside the inner loops.  This means the
+    * scheduler has to figure out an ordering to run the Kernels that
+    * respects the dependencies.  This is aided by the fact that the Kernel
+    * has a separate propendent and dependent call that states the order.
+    *There will actually be some Kernels that can't be run this way: the
+    * pattern of dependencies has no sequential soln, a copy must be done.
+    *Starting to think perhaps it's best to just always do a copy..
+    *
+    *Other choice is two different versions of the Kernel.. one that reads
+    * from a DKUPiece, the other that reads from a commBundle.
+    *Or a hybrid Kernel that can read from both, using a flag
+    * in the DKUPiece that tells the Kernel whether to receive a
+    * commBbundle or whether to take from the normal data in the propendent.
+    *The two Kernel version is the most run-time efficient. The main drawback
+    * is that the app developer has to make identical changes in two
+    * different places.. any time they change one of the Kernel-copies they
+    * have to change the other too..  also, it feels weird having two
+    * different Kernels..  don't like it..
+    *Kinda like the hybrid approach, it has the second least run-time
+    * overhead, just an IF statement that will be well predicted each time
+    * it accesses data during calculations..  Could even use a #define here,
+    * so the IF is known at compile time to always go one direction.  The
+    * specialization module would change the #define to: SharedMem or to
+    * DistrMem.  Or, make the if a #ifdef in the source..  but that's ugly
+    * to read.
+    *Something to percolate..  whatever is chosen, the problem is solved,
+    * just a matter of tradeoffs at this point..
+    */
+ }
+
+void sampleKernel( DKUPiece *piece )
+ {
+   initializePropendents();
+   for( outer = 0; outer < N; outer++ )
+    { fromProp = getFromPropendent( piece->propPieceIDs[ NORTH_PROPENDENT ]);
+      for( inner = 0; inner < N; inner++ )
+       {    //This Kernel knows that data is an array because it's written
+            // by the app-programmer.  It knows that fromProp is also an
+            // array because the app progr wrote the commBundler.
+         piece->appSpecPiece->data[ inner ] += fromProp[ inner ];
+       }
+      sendToDependent( piece->depPieceIDs[ SOUTH_DEPENDENT ] );
+    }
+   finalizeDependents();
+ }
+
+/*In the divider, one has a single piece, which has communicators at its
+ * boundaries already.
+ *The divider cuts up the piece, and so knows which sub-pieces talk to 
+ * which others, because it just made them all from the same parent piece.
+ *The tricky part is connecting the hierarchy.
+ *The pieces the parent communicated with are on the order of the size of
+ * the parent.  Those pieces will likely have been divided as well..  now
+ * the question is how to hand-off from the parent to the appropriate
+ * sub-pieces.
+ *One way to do it is to have a Kernel running for each piece in the
+ * hierarchy, but of two kinds: hierachy-Kernels and "normal" Kernels.  The
+ * hierarchy Kernels only do communication: they break an in-coming request
+ * among their sub-pieces, then gather the responses back together.
+ *This is inefficienct in one sense: direct communication between sub-pieces
+ * of different Kernels would be optimal.
+ *However, such a hierarchy will
+ * only exist when a physical hierarchy exists among machines.  In that
+ * case, the extra scatter-gather work done by the hierarchy pieces might
+ * even be more efficient because it makes fewer, larger, communications
+ * between the larger physical entities.  This is more likely to ameliorate
+ * the loss from the larger latency in communication at the larger physical
+ * division.
+ *Okay, so going with that, for now.  When get details, may see some patterns
+ * for how to do direct communication among sub-pieces..  (but not holding
+ * my breath because each of two parents can be divided into a different
+ * number of sub-pieces.. so there is no one-to-one between sub-pieces on
+ * the edge of one parent and sub-pieces on a communicating edge of another
+ * parent.)
+ *
+ *This sample is for a big linked list.  Each piece is just a number of nodes
+ * of the list.
+ *When the Divider makes sub-pieces, it knows which ones communicate across
+ * the boundaries of the parent (because the application programmer wrote
+ * the Divider and placed the code in it that handles the boundaries of the
+ * parent).
+ *Patterns for how to do this part:
+ *Could have the Divider create some new structure that it places in the
+ * parent that holds the state for the CommKernel.  Put into that structure
+ * all the sub-piece-IDs that will communicate with it.  Into the sub-pieces
+ * put a commID not of the parent piece, but of the parent's CommKernel.  So
+ * a piece gains a second ID when it is divided.  It keeps its original
+ * commID and uses that to communicate with siblings, while it uses the
+ * subCommID to communicate with sub-pieces.
+ *Just going with that one idea for the moment..
+ *So, for the Linked List example, the Divider will set all the sub-pieces
+ * to talk to each other, and set the end-pieces to talk to the subCommID of
+ * the parent.
+ *The parent, meanwhile, will have a normal commID with which it talks to
+ * its siblings.
+ *When the normal Kernel modifies the linkings in the list, it has to check
+ * if one of the elements modified is a boundary element, and if so if the
+ * new arrangement has changed which element is the boundary.
+ *For example, an element is added to one end of a sub-piece.  The added
+ * element has its link-ptr set to a value that indicates it's a boundary
+ * element, for example NULL or -1.  The old boundary element's pointer is
+ * set to the new boundary element.  The commID is taken from the old
+ * boundary element and put into the new boundary element.  Done.
+ *The CommKernel will take advantage of the fact that it knows it's in a
+ * hierarchy..  it will keep an array of the values at the boundaries of its
+ * siblings.  (Note that one DKUPiece is disallowed from holding a pointer
+ * to another DKUPiece)..  the values may get out-of-sync, so they will be
+ * fixed-up when detected.
+ *Here's the pattern: when CommKernel gets comm from sub-piece, it looks to
+ * see what kind of comm it is.. if it's a "here's a value to insert", then
+ * it checks the end-values of its siblings and picks the sibling it belongs
+ * on.  It sends to that sibling (requires unsolicited reception mechanism,
+ * such as the signals method for re-divide)..  includes its own boundary
+ * values (free to piggy-back)..  that updates the receiver's view of the
+ * sibling-piece's boundary values.
+ *
+ *When CommKernel receives a value-to-insert,
+ * it checks the values at its two boundary elements.  If the value is
+ * between, then it accepts it.  If not, it responds to the sender, telling
+ * the sender what the receiver's actual boundary values are.  It then sends
+ * the value to the piece it believes it should go to (received the sender's
+ * boundary values along with the insert value, so it's certain it won't
+ * send back to where it came from..)  Eventually the value will land at
+ * the correct CommKernel.
+ *Seeing pieces given to the CommKernel of the top-level parent.  It then
+ * hands them out among its children, and from there to next level of 
+ * children, and so on.  Notice that siblings talk directly to each other,
+ * they don't go up to the parent then back down.
+ *The value of this abstract data type will be handling an enormous number
+ * of inserts, deletes, and lookups.
+ *
+ */
+void sampleDivider( DKUPiece *piece, int numPieces )
+ { DKUPiece * newPiece;
+      //First sub-piece, so it is a boundary of the parent
+      //Figure out if parent has a sibling, or if it is natural boundary
+   if(numPieces < 2) return; //leave sub-pieces empty if only 1 sub-piece
+
+   newPiece = makeASubPiece( someValues );
+   if( BLIS_DKU__isNaturalBoundary( piece->propPieceIDs[ LEFT_PROPENDENT ]) )
+    { newPiece->propPieceIDs[ LEFT_PROPENDENT ] = 
+       BLIS_DKU__makeNaturalBoundaryPiece( piece, newPiece, DKU_INST_ID );
+    }
+   else //parent has a sibling, so communicate with parent's CommKernel
+    { newPiece->propPieceIDs[ LEFT_PROPENDENT ] =
+       BLIS_DKU__giveCommKernelAsPropendent( piece ); //scheduler returns the
+       //thing that it has implemented as "addr" of CommKernel of piece
+    }
+   for( pieceIdx = 1; pieceIdx < numPieces - 1; pieceIdx++ )
+    { newPiece = makeASubPiece( someValues );
+      newPiece->propPieceIDs[ LEFT_PROPENDENT ] = (subPieces[pieceIdx - 1]);
+      subPieces[pieceIdx-1]->propPieceIDs[ RIGHT_PROPENDENT ] = (newPiece);
+    }
+   newPiece = makeASubPiece( someValues );
+   newPiece->propPieceIDs[ LEFT_PROPENDENT ] = (subPieces[numPieces - 1]);
+   if( BLIS_DKU__isNaturalBoundary( piece->propPieceIDs[ RIGHT_PROPENDENT ]))
+    { newPiece->propPieceIDs[ RIGHT_PROPENDENT ] =
+       BLIS_DKU__makeNaturalBoundaryPiece( piece, newPiece, DKU_INST_ID );
+    }
+   else //parent has a sibling, so communicate with parent's CommKernel
+    { newPiece->propPieceIDs[ RIGHT_PROPENDENT ] =
+       BLIS_DKU__giveCommKernelAsPropendent( piece ); //scheduler returns the
+       //thing that it has implemented as "addr" of CommKernel of piece
+      //Might have to make left propendent and right propendent be different
+      // addresses.. in which case include a directionID in the call to
+      // the scheduler asking for the address.
+      //Might be something about matching dependents with propendents.. not
+      // sure how that's going to play out..
+      //There's "pull from propendent" and "push to dependent" which are
+      // interrupt-model..  then there's propendent sends and dependent
+      // receives.
+
+    }
+ }
+/*Some ill-fits in here..  need to do real app, with real dependencies and
+ * real comm in it..
+ *
+ *Thinking perhaps give each piece a "name" struc..  the makeDKUPiece()
+ * creates for DKUInstances that have a communicator registered..  (will
+ * add some overhead to makeDKUPiece if have to do DKU-instance lookup)
+ *WANT DKU-INSTANCE LOOKUP TO BE TRANSFORMED BY SPECIALIZATION
+ * in practice, want the lookup to be performed statically, to eliminate the
+ * dynamic overhead..  can do this if define semantics of DKUInstance to
+ * be one-time and one-time only setting of functions to an instance..  have
+ * to check this statically in the BLIS-rule-checker that's run in the
+ * makefile in the sequential development environment.
+ *
+ *Then, the Divider simply copies the name-struc out of target pieces and
+ * puts it into source pieces.
+ *This allows, for example, making commKernels that have the name-strucs of
+ * all the siblings..  so, for example, the commKernel can calculate which
+ * of the siblings it should send to..  in fact, each of the Kernels can
+ * calculate which of the siblings it should sent to..  re-use the same
+ * CommKernel in the leaf-kernels as well as in the parent..
+ *The only complication to re-using CommKernel on all levels is the
+ * boundaries..  just make the boundary be the CommKernel in the parent,
+ * or a natural boundary..
+ */
+
+
+
+/*The purpose of a CommKernel is to gather communications from all the sub-
+ * pieces, put them together, and turn them into communications from the
+ * parent piece.  And vice-versa: receive comm, break it up, hand a portion
+ * to each sub-piece.
+ *This sample is for a big linked list.
+ */
+void sampleCommKernel( DKUPiece *piece )
+ {
+
+ }
+
+/*Q to Albert: just how bad is this?  Will a static tool be able to
+ * understand the linkages, given that they are a fixed pattern of linkages
+ * that are signalled by the fixed function-names..?
+ *For example, could a static tool understand that
+ * piece->propPieceIDs[ NORTH_PROPENDENT ] means a DKUPiece, which is set
+ * in the Divider..  and uderstand that
+ * piece->appSpecPiece->data[ inner ] = fromProp[ inner ];  means the
+ * actual use of the data gotten from the DKUPiece..  then use that
+ * understanding to replace the calls with a schedule plus direct access
+ * to the propendent DKUPiece instead of the intermediate access first to
+ * the commBundle.
+ *Thinking this is how I want to go: use a commBundle on both shared and
+ * distr memory.  Reason is that it makes scheduling simple.  And, there's
+ * enough semantic information provided by the function calls that a static
+ * tool should be able to perform a transform that does direct access to the
+ * DKUPiece by migrating the gather code into the Kernel innards.  The
+ * index into the commBundle is what identifies the gather statement that
+ * put the data there.  That gather statement is inserted in place of the
+ * access to the array.
+ */
+
+/*Have to provide boundary-propendents, so pieces that are on the edge of
+ * a mesh, for example, access the boundary-propendent, with whatever
+ * boundary data.
+ *Also, have to provide a "time zero" something or other..  The thing it
+ * solves is that one Kernel produces data at the end of the inner loop,
+ * with a sendToDependent() call..  that data is received BEFORE the inner
+ * loop with a getFromPropendent call.  This is normally fine, as it encodes
+ * the time-skew, implicitly pipelining.  The only problems are at time
+ * zero and possibly the very last time-step.
+ *So, the initializePropendents() sets some default time-zero state that
+ * will be gotten by the first getFromPropendent() call.
+ *When the Kernel is done, the finalizeDependents() call handles whatever
+ * might need to be done with the data from the last sendToDependents()
+ * call that was performed in the Kernel.
+ */
+
+/*Q: How implement comm inside scheduler?
+ * syntax proposal for performing comm from inside the Kernel is:
+ * getFromPropendent( piece->propPieceIDs[ NORTH_PROPENDENT ]);
+ *
+ * So.. what is stored in that array?  That is the thing that tells the
+ *  scheduler how to perform the communication..
+ *Thinking leave it opaque..  it's a void *..  The scheduler fills it in
+ * itself inside of BLIS_DKU__makeDKUPiece()..  the divider gets this thing
+ * out of the piece and places it into the propPieceIDs array.
+ *
+ *This means that ask the scheduler to create a CommKernel, and ask it to
+ * give the thingie need to communicate with it.
+ *Also ask the scheduler to create a natural-boundary piece, and
+ */
diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/DKU_INST_MM.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/DKU_INST_MM/DKU_INST_MM.h	Sun Aug 26 03:04:50 2012 -0700
@@ -0,0 +1,54 @@
+/*
+ *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ */
+
+#include "../../BLIS/DKU/DKU_common/DKU.h"
+
+#include "../BLIS_CONSTANTS.h"
+
+#ifndef _DKU_INST_MM_H
+#define	_DKU_INST_MM_H
+
+
+//===========================================================================
+//              Declarations of the Standard DKU functions
+
+void DKU_INST_MM_Init(); //tells BLIS the pointers to the DKU functions
+
+Divide   divide_MM;
+Kernel   kernel_MM;
+Undivide unDividePiece_MM;
+
+MakeRootDKUPieces makeRootDKUPieces_MM;
+
+SerialKernel      serialKernel_MM;
+
+BundleInputs      bundleInputs_MM;
+UnbundleInputs    unbundleInputs_MM;
+BundleResults     bundleResults_MM;
+UnbundleResults   unbundleResults_MM;
+
+//===========================================================================
+//
+#include "../Matrix_Mult.h"
+
+ void
+inner_Kernel( MatrixProdPiece *matrixProdPiece );
+
+ Matrix *
+DKU__makeMatrix_Flat( int32 numRows, int32 numCols, DKUPiece *owner );
+
+ MatrixProdPiece *
+DKU__makeMatrixProdPiece_Flat( DKUPiece *owner );
+
+ MatrixProdPiece *
+DKU__makeMatrixProdPiece_FromMatrixProdPiece
+                           ( MatrixProdPiece *parentPiece, DKUPiece *owner );
+
+
+#endif	/* _DKU_INST_MM_H */
+
diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/DKU_INST_MM_init.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/DKU_INST_MM/DKU_INST_MM_init.c	Sun Aug 26 03:04:50 2012 -0700
@@ -0,0 +1,58 @@
+/*
+ *  Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ *
+ * Author: SeanHalle@yahoo.com
+ */
+
+#include "DKU_INST_MM.h"
+
+
+/* 
+ * Part of the BLIS DKU standard.  Each DKU instance is placed in its own
+ *  directory, a child of the Application directory.  The directory is named
+ *  the same as the "#define" constant used to identify the DKU instance.
+ * The directory has two standard files: "#def const".h and
+ *  "#def const"_init.c.
+ *
+ * This is the "#def const"_init.c file..
+ * It initializes the scheduler for this DKU instance.
+ * And it tells that scheduler the pointers to all of the DKU functions for
+ *  this instance.
+ *
+ */
+void DKU_INST_MM_Init( )
+ {    // always start init of a DKU instance with this function
+      // this isn't modal, so no worries about order of calling these fn
+      // Can intertwine init of several instances without harm.
+   BLIS_DKU__start_DKU_Instance_Init( DKU_INST_MM );
+   
+   BLIS_DKU__set_Divide_To_ForID( &divide_MM, DKU_INST_MM );
+   BLIS_DKU__set_Kernel_To_ForID( &kernel_MM, DKU_INST_MM );
+   BLIS_DKU__set_Undivide_To_ForID( &unDividePiece_MM, DKU_INST_MM );
+
+   //TODO: figure out make and free -- right depth?  Where used?
+   //  make sure don't accidentally free the shared Matrix strucs..
+   //  trace where used, see if can find easy-to-see-pattern pattern for how
+   //  to do make and free -- DKUPiece automated stuff and whatnot..
+   //  maybe just let the App see DKUPiece, and make make and free be done
+   //  explicitly in the pieceMaker, divider, undivider, etc.. no automation
+   BLIS_DKU__set_MakeRootDKUPieces_To_ForID(&makeRootDKUPieces_MM,DKU_INST_MM);
+//   BLIS_DKU__set_FreeAppSpecSubPiece_To_ForID(&freeMatrixProdPiece_Flat,DKU_INST_MM);
+//   BLIS_DKU__set_FreeAppSpecRootPiece_To_ForID(&freeMatrixProdPiece_Flat,DKU_INST_MM);
+//   BLIS_DKU__set_MakeAppSpecPiece_To_ForID(&makeMatrixProdPiece_Using,DKU_INST_MM);
+   BLIS_DKU__set_SerialKernel_To_ForID( &serialKernel_MM, DKU_INST_MM );
+   
+   BLIS_DKU__set_BundleInputs_To_ForID( &bundleInputs_MM, DKU_INST_MM );
+   BLIS_DKU__set_UnbundleInputs_To_ForID(&unbundleInputs_MM,DKU_INST_MM);
+   BLIS_DKU__set_BundleResults_To_ForID( &bundleResults_MM, DKU_INST_MM);
+   BLIS_DKU__set_UnbundleResults_To_ForID( &unbundleResults_MM,DKU_INST_MM );
+   
+      //Now that have generic, provide HW-specific overrides
+/*   BLIS_DKU__set_A_Divide_Override_To_ForID()
+   BLIS_DKU__set_A_DKU_Override_To_ForID( )
+*/
+      // always end init of a DKU instance with this function
+   BLIS_DKU__end_DKU_Instance_Init( DKU_INST_MM );
+ }
diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/Divide.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/DKU_INST_MM/Divide.c	Sun Aug 26 03:04:50 2012 -0700
@@ -0,0 +1,295 @@
+/*
+ *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ */
+
+#include <math.h>
+
+#include "DKU_INST_MM.h"
+
+
+typedef
+struct
+ {
+   //inputs
+   int numLeftRows;
+   int numRightCols;
+   int numSubPiecesToMake;
+   DKUPiece *parPiece;
+   
+   //outputs
+   int numLeftSlices;
+   int numRightSlices;
+   int finalNumSubPieces;
+
+   //outputs then inputs
+   int *leftSliceStartRows;
+   int *rightSliceStartCols;
+ }
+SliceStruc;
+
+void updateSlicingStrucWithSlicingOfInputMatrices(SliceStruc *slicingStruc);
+
+void pairSlicesAndMakeProdPieces(SliceStruc *slicingStruc, DKUPiece *oPiece);
+MatrixProdPiece* makeChildMatrixProdPieceFrom( MatrixProdPiece *parentPiece);
+
+
+/* The Divider
+ * Divides the iteration space..
+ * Matrix Product Piece is a piece of iteration
+ *  space..  it is the iterations in which one piece of the left
+ *  matrix is multiplied by one piece of the right matrix.
+ * Thus, to make product pieces, both the left and right matrices
+ *  have to be sliced, then all pairings of those slices taken.
+ *  Each pairing on matrix slices is one product piece.
+ *
+ *  So, for example, dividing a product-piece into, say, 4 pieces means
+ *   dividing the iteration space of the parent piece into 4,
+ *   which means slicing the left matrix by 2, and the right matrix by 2,
+ *   and talking all pairings of a left slice times a right slice.
+ *
+ * Do division this way:
+ *  count total number of result cells to be produced.  Divide that by the
+ *  number of sub-pieces to make.  That gives the target number of result
+ *  cells in each sub-piece.
+ * Take the square root of the number of target cells, that's the target
+ *  number of cols, and target num rows in the result of each sub-piece.
+ * See how many sqroots fit horizontally, and how many fit vertically
+ *  take the ceiling of the larger, floor of the smaller.
+ * That is the number of rows, and number of cols, in each sub-piece.
+ * Multiply the two to find the number of sub-pieces that will be made.
+ * If not larger than requested number of sub-pieces, take the ceiling of
+ *  both num rows and num cols (also covers case of floor is zero).
+ * Divide num rows in left matrix by target num rows, then use residuals
+ *  alg. to assign ranges of left matrix rows to each slice.
+ * Divide num cols in right by target num cols, then use residuals alg.
+ *  to assign ranges of right matrix cols to each slice.
+ * Then take all pairings of left-slices with right-slices and make a
+ *  sub-piece from each pairing
+ *
+ * @param numPieces
+ */
+#define SLICE_SCOPE 1
+void divide_MM( DKUPiece* oPiece, int numSubPiecesToMake )
+ { MatrixProdPiece *matrixProdPiece;
+   Matrix *leftMatrix, *rightMatrix;
+
+   matrixProdPiece = (MatrixProdPiece *) oPiece->appSpecificPiece;
+
+   int leftStartRow, leftEndRow, rightStartCol, rightEndCol;
+   leftStartRow  = matrixProdPiece->leftStartRow;
+   leftEndRow    = matrixProdPiece->leftEndRow;
+   rightStartCol = matrixProdPiece->rightStartCol;
+   rightEndCol   = matrixProdPiece->rightEndCol;
+   leftMatrix    = matrixProdPiece->leftMatrix;
+   rightMatrix   = matrixProdPiece->rightMatrix;
+
+   int numLeftRows, numRightCols;
+   numLeftRows  = leftEndRow  - leftStartRow  + 1; //+1 cause starts at zero
+   numRightCols = rightEndCol - rightStartCol + 1;
+//==============================================================
+
+      //need numPieces to be divided into two integers
+      // if each result piece is a square, that gives the best surface
+      // area to volume == the least communication in distr memory, and
+      // the least multiple access by different threads in shared memory.
+      //So, just have to figure out the sizes of the horizontal and the
+      // vertical.
+
+      // make sure its possible to make more than 1 sub-piece
+   if( numSubPiecesToMake < 2 || ( numLeftRows < 2 && numRightCols < 2 ) )
+    { oPiece->numSubPieces = 0;  //scheduler must check for case of 0 pieces
+      return;
+    }
+
+   //TODO: scope is the func called within (divide_MM)
+   SliceStruc *slicingStruc = 
+                   BLIS_DKU__malloc_scope( sizeof(SliceStruc), SLICE_SCOPE );
+   slicingStruc->numLeftRows        = numLeftRows;
+   slicingStruc->numRightCols       = numRightCols;
+   slicingStruc->numSubPiecesToMake = numSubPiecesToMake;
+   slicingStruc->parPiece           = oPiece;
+   
+   updateSlicingStrucWithSlicingOfInputMatrices( slicingStruc );
+   
+   oPiece->numSubPieces = slicingStruc->finalNumSubPieces;
+   if( oPiece->numSubPieces == 0 ) return;
+   
+      // pair up the slices and make the final DKUPieces
+   pairSlicesAndMakeProdPieces( slicingStruc, oPiece );
+
+   BLIS_DKU__free_scope( SLICE_SCOPE );
+   return;
+ }
+
+void updateSlicingStrucWithSlicingOfInputMatrices(SliceStruc *slicingStruc)
+ {
+//=======================  Setup  ========================
+   int   numLeftRows        = slicingStruc->numLeftRows;
+   int   numRightCols       = slicingStruc->numRightCols;
+   int   numSubPiecesToMake = slicingStruc->numSubPiecesToMake;
+   
+   int   numResultCells;
+   float targetNumCellsPerPiece, targetDimOfResultPiece;
+   float idealNumLeftSlices,     targetNumLeftSlices;
+   float idealNumRightSlices,    targetNumRightSlices;
+   float targetRowsPerLeftSlice, targetColsPerRightSlice;
+   
+   float rowAccumulator, colAccumulator;
+   int   sliceIdx,       rowIncrement, colIncrement;
+   int   numLeftSlices,  numRightSlices;
+   
+   int leftStartRow, leftEndRow, rightStartCol, rightEndCol;
+   MatrixProdPiece *parentProdPiece=slicingStruc->parPiece->appSpecificPiece;
+   leftStartRow   = parentProdPiece->leftStartRow;
+   leftEndRow     = parentProdPiece->leftEndRow;
+   rightStartCol  = parentProdPiece->rightStartCol;
+   rightEndCol    = parentProdPiece->rightEndCol;
+   
+//=======================  Calc num Slices  ========================
+      //Calc the closest can reasonably get to square
+      //TODO: check that math works right: dividing int by float, need cast?
+   numResultCells          = numLeftRows * numRightCols;
+   targetNumCellsPerPiece  = numResultCells / numSubPiecesToMake;
+   targetDimOfResultPiece  = sqrt( targetNumCellsPerPiece );
+   idealNumLeftSlices      = numLeftRows  / targetDimOfResultPiece;
+   idealNumRightSlices     = numRightCols / targetDimOfResultPiece;
+   
+      //Now, product of rows * cols should stay close to numPieces, but
+      // have to make num rows an int, and num cols an int
+      // means will drop fractional part from larger, then add the number
+      // of pieces that fractional part represents back on to the smaller
+      // then round to the nearest integer.  The resulting product should
+      // still be close to numPieces.
+   if( idealNumRightSlices > idealNumLeftSlices )
+    { float diff, numPiecesCut;
+      //TODO: find floor and "closest int" in C math library.. how use?
+      targetNumRightSlices = floor( idealNumRightSlices );
+      diff                 = idealNumRightSlices - targetNumRightSlices;
+      numPiecesCut         = diff * idealNumLeftSlices;
+      idealNumLeftSlices  += numPiecesCut / targetNumRightSlices;
+      targetNumLeftSlices  = rint( idealNumLeftSlices );
+    }
+   else
+    { float diff, numPiecesCut;
+      targetNumLeftSlices  = floor( idealNumLeftSlices );
+      diff                 = idealNumLeftSlices - targetNumLeftSlices;
+      numPiecesCut         = diff * idealNumRightSlices;
+      idealNumRightSlices += numPiecesCut / targetNumLeftSlices;
+      targetNumRightSlices = rint( idealNumRightSlices );
+    }
+   targetRowsPerLeftSlice  = numLeftRows  / targetNumLeftSlices;
+   targetColsPerRightSlice = numRightCols / targetNumRightSlices;
+
+      //allocate size of worst case + safety
+   int size = sizeof(int) * (numSubPiecesToMake + 2);
+   int *leftSliceStartRows, *rightSliceStartCols;
+   //TODO: "FUNC" is not quite the right scope..  slicingStruc is right scope
+   leftSliceStartRows  =  BLIS_DKU__malloc_scope( size, SLICE_SCOPE );
+   rightSliceStartCols =  BLIS_DKU__malloc_scope( size, SLICE_SCOPE );
+   slicingStruc->leftSliceStartRows  = leftSliceStartRows;
+   slicingStruc->rightSliceStartCols = rightSliceStartCols;
+
+//=======================  Slice Left Matrix  ========================
+      //fix for case only 1 row, when leftStartRow == leftEndRow
+   leftSliceStartRows[ 0 ] = leftStartRow;
+   sliceIdx = 0;
+   rowAccumulator = 0;
+   int row;
+   for( row = leftStartRow; row < leftEndRow; row += rowIncrement )
+    {
+      leftSliceStartRows[ sliceIdx ] = row;
+
+      rowAccumulator += targetRowsPerLeftSlice;
+      rowIncrement = (int) rowAccumulator;
+      if( rowIncrement == 0 ) rowIncrement = 1;//apply at end curr iter
+      rowAccumulator -= rowIncrement;
+      if( rowAccumulator < 0 ) rowAccumulator = 0;
+      sliceIdx += 1;
+    }
+   if( sliceIdx == 0 ) sliceIdx = 1; //case when only 1 row
+   numLeftSlices     = sliceIdx;
+   leftSliceStartRows[ sliceIdx ] = leftEndRow + 1;  //use extra slot
+
+   sliceIdx = 0; colAccumulator = 0; colIncrement = 0;
+
+//=======================  Slice Right Matrix  ========================
+   rightSliceStartCols[ 0 ] = rightStartCol; //in case only 1 col
+   int col;
+   for( col = rightStartCol; col < rightEndCol; col += colIncrement )
+    {
+      rightSliceStartCols[ sliceIdx ] = col;
+
+      colAccumulator += targetColsPerRightSlice;
+      colIncrement = (int) colAccumulator;
+      if( colIncrement == 0 ) colIncrement = 1;//apply at end curr iter
+      colAccumulator -= colIncrement;
+      if( colAccumulator < 0 ) colAccumulator = 0;
+      sliceIdx += 1;
+    }
+   if( sliceIdx == 0 ) sliceIdx = 1; //case when only 1 col
+   numRightSlices = sliceIdx;
+   rightSliceStartCols[ sliceIdx ] = rightEndCol + 1;
+
+   slicingStruc->numLeftSlices     = numLeftSlices;
+   slicingStruc->numRightSlices    = numRightSlices;
+   slicingStruc->finalNumSubPieces = numLeftSlices * numRightSlices;
+   return;
+ }
+
+
+void pairSlicesAndMakeProdPieces(SliceStruc *slicingStruc, DKUPiece *oPiece)
+ { DKUPiece* *subPiecesArray;
+   MatrixProdPiece *newProdPiece;
+   MatrixProdPiece *parentMatPiece =slicingStruc->parPiece->appSpecificPiece;
+   int newPiecePos = 0;
+   int leftSliceStartRow, leftSliceEndRow;
+   int rightSliceStartCol, rightSliceEndCol;
+   int numLeftSlices  = slicingStruc->numLeftSlices;
+   int numRightSlices = slicingStruc->numRightSlices;
+   int *leftSliceStartRows  = slicingStruc->leftSliceStartRows;
+   int *rightSliceStartCols = slicingStruc->rightSliceStartCols;
+
+   int size = slicingStruc->finalNumSubPieces * sizeof(MatrixProdPiece *);
+   subPiecesArray = BLIS_DKU__malloc_toPiece( size, oPiece );
+   oPiece->subPiecesArray = subPiecesArray;
+
+   int rightSliceNum, leftSliceNum;
+   for( rightSliceNum = 0; rightSliceNum < numRightSlices; rightSliceNum++ )
+    { DKUPiece *newDKUPiece;
+      rightSliceStartCol = rightSliceStartCols[ rightSliceNum ];
+      rightSliceEndCol   = rightSliceStartCols[ rightSliceNum + 1 ] - 1;
+      for( leftSliceNum = 0; leftSliceNum < numLeftSlices; leftSliceNum++ )
+       { newDKUPiece  = BLIS_DKU__makeDKUPiece_FromDivider( DKU_INST_MM );
+         newProdPiece = DKU__makeMatrixProdPiece_FromMatrixProdPiece
+                                             ( parentMatPiece, newDKUPiece );
+         newDKUPiece->appSpecificPiece = newProdPiece;
+         
+         leftSliceStartRow = leftSliceStartRows[ leftSliceNum ];
+         leftSliceEndRow   = leftSliceStartRows[ leftSliceNum + 1 ] - 1;
+         
+         newProdPiece->leftStartRow  = leftSliceStartRow;
+         newProdPiece->leftEndRow    = leftSliceEndRow;
+         newProdPiece->leftStartCol  = parentMatPiece->leftStartCol;
+         newProdPiece->leftEndCol    = parentMatPiece->leftEndCol;
+         
+         newProdPiece->rightStartRow = parentMatPiece->rightStartRow;
+         newProdPiece->rightEndRow   = parentMatPiece->rightEndRow;
+         newProdPiece->rightStartCol = rightSliceStartCol;
+         newProdPiece->rightEndCol   = rightSliceEndCol;
+         
+         newProdPiece->prodStartRow  = leftSliceStartRow;
+         newProdPiece->prodEndRow    = leftSliceEndRow;
+         newProdPiece->prodStartCol  = rightSliceStartCol;
+         newProdPiece->prodEndCol    = rightSliceEndCol;
+         
+         subPiecesArray[ newPiecePos ] = newDKUPiece;
+         newPiecePos += 1;
+       }
+    }//for(int rightSliceNum = 0; rightSliceNum <
+ }
+
+//===========================================================================
diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/Kernel.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/DKU_INST_MM/Kernel.c	Sun Aug 26 03:04:50 2012 -0700
@@ -0,0 +1,80 @@
+/* 
+ *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: SeanHalle@yahoo.com
+ *
+ */
+
+#include "DKU_INST_MM.h"
+
+
+/* Kernel
+ *
+ * Computes the passed-in piece's portion of the iteration space.
+ * A DKUPiece struc is handed to it.
+ * The DKUPiece struct has a pointer to "app specific info".  This
+ *  is some application-specific structure that the Scheduler doesn't
+ *  know the details of.  The Scheduler can pass the app-specific
+ *  data around, and that's all the Scheduler needs to do with it.
+ * This kernel, however, is written by the app programmer, and does
+ *  know the internals of "app specific info".
+ *
+ * DKU Std:  data that is accessed inside a Kernel is passed inside
+ *  a DKUPiece.  Kernel's not allowed to access data via the language's
+ *  native scoping rules, not even constants.  Thus, pointers to
+ *  arrays or to a tree's root nodes, or so forth are carried inside
+ *  the DKUPiece.
+ * Note, kernel code is re-entrant, so the rules of re-entrant code apply
+ *
+ *
+ * This is the standard Matrix Multiply loop nest.  The only modification
+ *  for DKU is that it takes loop bounds out of the DKUPiece.
+ */
+void kernel_MM( DKUPiece *pieceToProcess )
+ { inner_Kernel( (MatrixProdPiece *) pieceToProcess->appSpecificPiece );
+ }
+
+/* Separated out the calculations of the Kernel so could re-use as the
+ *  serial kernel.
+ */
+void inner_Kernel( MatrixProdPiece *matrixProdPiece )
+ { int32 leftStartRow, leftEndRow, rightStartCol, rightEndCol;
+   int32 leftStartCol, rightStartRow;
+   int32 numLeftCols, numRightCols, numResMatCols;
+   int32 row, col, vectorSize, i;
+   float32 *leftMatrix, *rightMatrix, *resultMatrix;
+   float32 *leftStartPt, *leftReadPt, *rightStartPt, *rightReadPt;
+   
+   leftMatrix    = matrixProdPiece->leftMatrix->matrix;
+   rightMatrix   = matrixProdPiece->rightMatrix->matrix;
+   resultMatrix  = matrixProdPiece->resultMatrix->matrix;
+   
+   leftStartRow  = matrixProdPiece->leftStartRow;
+   leftEndRow    = matrixProdPiece->leftEndRow;
+   rightStartCol = matrixProdPiece->rightStartCol;
+   rightEndCol   = matrixProdPiece->rightEndCol;
+   numResMatCols = matrixProdPiece->resultMatrix->numCols;
+   rightStartRow = matrixProdPiece->rightStartRow;
+   leftStartCol  = matrixProdPiece->leftStartCol;
+   numRightCols  = matrixProdPiece->rightMatrix->numCols;
+   numLeftCols   = matrixProdPiece->leftMatrix->numCols;
+    
+   vectorSize =matrixProdPiece->leftEndCol - matrixProdPiece->leftStartCol+1;
+   for( row = leftStartRow; row <= leftEndRow; row++ )
+    { leftStartPt  = leftMatrix + row * numLeftCols + leftStartCol;
+      for( col = rightStartCol; col <= rightEndCol; col++ )
+       { float32 sum = 0;
+         
+         rightStartPt = rightMatrix + rightStartRow * numRightCols + col;
+         
+         leftReadPt = leftStartPt;
+         rightReadPt = rightStartPt;
+         for( i = 0; i < vectorSize; i++)
+          { sum += *(leftReadPt++) * *rightReadPt;
+            rightReadPt += numRightCols;
+          }
+         *(resultMatrix + row * numResMatCols + col) = sum;
+       }
+    }
+ }
diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/MakeRootDKUPieces.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/DKU_INST_MM/MakeRootDKUPieces.c	Sun Aug 26 03:04:50 2012 -0700
@@ -0,0 +1,53 @@
+/*
+ *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ */
+
+#include "DKU_INST_MM.h"
+
+/* This is what wraps application data inside a DKUPiece.
+ * It also encodes the dependencies within the data, by making several
+ *  DKUPieces, which must be executed in sequence, in order to respect the
+ *  dependencies.
+ * It tries to make the pieces as large as possible, to maximize the
+ *  available parallelism.
+ *
+ * It allocates space for and fills an array of pointers to the pieces,
+ *  then returns how many pieces it put into the array.
+ *
+ * This function allocates the array of pointers to DKUPieces, plus the
+ *  DKUPieces, plus all sub-structures inside a DKUPiece.  This is an
+ *  application-supplied function, so it knows all the app-specific sub-
+ *  structures inside a DKUPiece.  However, it is called by the scheduler
+ *  so it has to have a fixed prototype.
+ * The scheduler calls this function, so it is up to the scheduler to free
+ *  all the structures this function has allocated.  To do this, a second
+ *  function is supplied that performs the free s.
+ *
+ * DKUPieceMaker returns a RootDKUPieces data structure, which contains the
+ *  array of DKUPieces that have to be executed in sequence, to preserve
+ *  the dependencies, and contains the number of such DKUPieces.
+ * Each call to makeDKUPieces mallocs space for the pieces, plus the
+ *  root pieces array and the root pieces struc.  This memory is freed by
+ *  the standard function BLIS_DKU__cleanupRootPieces, which is called by the
+ *  scheduler.
+ *
+ * For Matrix Multiply, the original data is a MatrixProdPair, which is
+ *  perfectly dividable, so just have to wrap that inside a DKUPiece, and
+ *  return it inside a RootDKUPieces data structure.
+ */
+RootDKUPieces * makeRootDKUPieces_MM( void *origData )
+ { RootDKUPieces * rootDKUPieces;
+   DKUPiece        *piece;
+   
+   piece = BLIS_DKU__makeDKUPiece_FromMaker( DKU_INST_MM );
+   piece->appSpecificPiece = origData;
+   
+   int numRootPieces = 1;
+   rootDKUPieces = BLIS_DKU__makeRootDKUPiecesStruc( numRootPieces );
+   rootDKUPieces->rootPiecesArray[0] = piece;
+   
+   return rootDKUPieces;
+ }
diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/Maker_and_Freer.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/DKU_INST_MM/Maker_and_Freer.c	Sun Aug 26 03:04:50 2012 -0700
@@ -0,0 +1,73 @@
+/*
+ *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ * Created on November 15, 2009, 2:35 AM
+ */
+
+#include <malloc.h>
+
+#include "DKU_INST_MM.h"
+
+
+//========== Makers and Free-ers for use ONLY within DKU_INST_MM  ===========
+//
+
+/*In the "_Flat" version of constructor, do only malloc of the top data struc
+ * and set values in that top-level.  Don't malloc any sub-structures.
+ *
+ *Used in BundlingQuad in unbundleInputs
+ */
+ Matrix *
+DKU__makeMatrix_Flat( int32 numRows, int32 numCols, DKUPiece *owner )
+ { Matrix * retMatrix;
+   retMatrix = BLIS_DKU__malloc_toPiece( sizeof( Matrix ), owner );
+   retMatrix->numRows = numRows;
+   retMatrix->numCols = numCols;
+
+   return retMatrix;
+ }
+
+/* Used In BundlingQuad in unbundleInputs */
+ MatrixProdPiece *
+DKU__makeMatrixProdPiece_Flat( DKUPiece *owner )
+ { return BLIS_DKU__malloc_toPiece( sizeof(MatrixProdPiece), owner );
+ }
+
+
+
+/* Used in Divider */
+ MatrixProdPiece *
+DKU__makeMatrixProdPiece_FromMatrixProdPiece
+                            ( MatrixProdPiece *parentPiece, DKUPiece *owner )
+ { MatrixProdPiece *newPiece;
+   Matrix *leftMatrix      = parentPiece->leftMatrix;
+   Matrix *rightMatrix     = parentPiece->rightMatrix;
+
+   newPiece                = DKU__makeMatrixProdPiece_Flat( owner );
+
+   newPiece->leftMatrix    = leftMatrix;
+   newPiece->rightMatrix   = rightMatrix;
+
+   newPiece->leftStartRow  = 0;
+   newPiece->leftEndRow    = leftMatrix->numRows - 1;
+   newPiece->leftStartCol  = 0;
+   newPiece->leftEndCol    = leftMatrix->numCols - 1;
+
+   newPiece->rightStartRow = 0;
+   newPiece->rightEndRow   = rightMatrix->numRows - 1;
+   newPiece->rightStartCol = 0;
+   newPiece->rightEndCol   = rightMatrix->numCols - 1;
+
+   newPiece->prodStartRow  = newPiece->leftStartRow;
+   newPiece->prodEndRow    = newPiece->leftEndRow;
+   newPiece->prodStartCol  = newPiece->rightStartCol;
+   newPiece->prodEndCol    = newPiece->rightEndCol;
+
+   newPiece->resultMatrix  = parentPiece->resultMatrix;
+
+   return newPiece;
+ }
+
diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/SerialKernel.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/DKU_INST_MM/SerialKernel.c	Sun Aug 26 03:04:50 2012 -0700
@@ -0,0 +1,21 @@
+/*
+ *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ */
+
+#include "DKU_INST_MM.h"
+
+/*The scheduler calls this when there is no benefit from
+ * parallel execution of the orig data.
+ *
+ *For MM, the original data is already in the data structure that's inside
+ * a DKUPiece, so just cast the oridData and call the Kernel on it
+ */
+void serialKernel_MM( void * origData )
+ {
+   inner_Kernel( (MatrixProdPiece *) origData );
+ }
+
diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/Undivide.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/DKU_INST_MM/Undivide.c	Sun Aug 26 03:04:50 2012 -0700
@@ -0,0 +1,34 @@
+/*
+ *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ */
+
+#include "DKU_INST_MM.h"
+#include <malloc.h>
+
+
+/* unDivider
+ *
+ * Only counts to make sure all the pieces are accounted for, and frees
+ *  the memory allocated to the finished DKUPiece structs and all sub-
+ *  structures.
+ */
+void unDividePiece_MM( DKUPiece *parentPiece, DKUPiece *piece )
+ {
+   parentPiece->numSubPiecesUndivided += 1;
+   
+      //free mem allocated to no-longer needed subPiece
+   BLIS_DKU__freeDKUPiece( piece );
+ }
+
+//TODO: figure out standard for doing this.. add pointer to func to inits,
+//       so fixed freePiece func can have generic code to free app-specific
+//       piece, and DKUInstance provides function to free that piece
+//       maybe put the pointer into schedData or something..  then this
+//       one call to freeDKUPiece can be generic, with HW-specific overlay
+// Issue is slowness of indirections into table and dereferencing Fn pointer
+
+
diff -r 000000000000 -r d138e0acf9a0 Matrix_Mult.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Matrix_Mult.c	Sun Aug 26 03:04:50 2012 -0700
@@ -0,0 +1,221 @@
+/*
+ *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ * Created on November 15, 2009, 2:35 AM
+ */
+
+#include <malloc.h>
+#include <stdlib.h>
+
+#include "Matrix_Mult.h"
+#include "../BLIS/DKU/DKU_common/DKU.h"
+
+//======================== For Use OUTSIDE DKU instance =====================
+/*
+ *The DKU code-instance in DKU_INST_MM has its own set of makers and free-ers
+ * that use BLIS_DKU__malloc.  These are for use in application-code
+ * outside DKU_INST_MM DKU-code-instance.
+ */
+
+
+/*In the "_Flat" version of constructor, do only malloc of the top data struc
+ * and set values in that top-level.  Don't malloc any sub-structures.
+ */
+ Matrix *
+makeMatrix_Flat( int32 numRows, int32 numCols )
+ { Matrix * retMatrix;
+   retMatrix = malloc( sizeof( Matrix ) );
+   retMatrix->numRows = numRows;
+   retMatrix->numCols = numCols;
+
+   return retMatrix;
+ }
+
+ Matrix *
+makeMatrix_WithResMat( int32 numRows, int32 numCols )
+ { Matrix * retMatrix;
+   retMatrix = malloc( sizeof( Matrix ) );
+   retMatrix->numRows = numRows;
+   retMatrix->numCols = numCols;
+   retMatrix->matrix  = malloc( numRows * numCols * sizeof(float32) );
+   
+   return retMatrix;
+ }
+
+ void
+freeMatrix_Flat( Matrix * matrix )
+ { //( matrix );
+ }
+ void
+freeMatrix( Matrix * matrix )
+ { free( matrix->matrix );
+   free( matrix );
+ }
+
+ MatrixProdPiece *
+makeMatrixProdPiece_Empty()
+ { return malloc( sizeof(MatrixProdPiece) );
+ }
+
+
+ MatrixProdPiece *
+makeMatrixProdPiece_Helper( Matrix *leftMatrix, Matrix *rightMatrix )
+ { MatrixProdPiece *newPiece;
+
+   newPiece                = makeMatrixProdPiece_Empty( );
+
+   newPiece->leftMatrix    = leftMatrix;
+   newPiece->rightMatrix   = rightMatrix;
+
+   newPiece->leftStartRow  = 0;
+   newPiece->leftEndRow    = leftMatrix->numRows - 1;
+   newPiece->leftStartCol  = 0;
+   newPiece->leftEndCol    = leftMatrix->numCols - 1;
+
+   newPiece->rightStartRow = 0;
+   newPiece->rightEndRow   = rightMatrix->numRows - 1;
+   newPiece->rightStartCol = 0;
+   newPiece->rightEndCol   = rightMatrix->numCols - 1;
+
+   newPiece->prodStartRow  = newPiece->leftStartRow;
+   newPiece->prodEndRow    = newPiece->leftEndRow;
+   newPiece->prodStartCol  = newPiece->rightStartCol;
+   newPiece->prodEndCol    = newPiece->rightEndCol;
+
+   return newPiece;
+ }
+
+ MatrixProdPiece *
+makeMatrixProdPiece_FromMatrixProdPiece( MatrixProdPiece *parentPiece )
+ { MatrixProdPiece *newPiece;
+   Matrix *leftMatrix      = parentPiece->leftMatrix;
+   Matrix *rightMatrix     = parentPiece->rightMatrix;
+
+   newPiece = makeMatrixProdPiece_Helper( leftMatrix, rightMatrix );
+   newPiece->resultMatrix  = parentPiece->resultMatrix;
+
+   return newPiece;
+ }
+
+ MatrixProdPiece *
+makeMatrixProdPiece_FromMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
+ { MatrixProdPiece *newPiece;
+
+   newPiece = makeMatrixProdPiece_Helper( leftMatrix, rightMatrix );
+   newPiece->resultMatrix  =
+    makeMatrix_WithResMat( leftMatrix->numRows, rightMatrix->numCols );
+
+   return newPiece;
+ }
+
+ void
+freeMatrixProdPiece_Flat( MatrixProdPiece * piece )
+ { free( piece );
+ }
+
+ void
+freeMatrixProdPiece( MatrixProdPiece * piece )
+ { //( piece->leftMatrix );
+   freeMatrix( piece->rightMatrix );
+   freeMatrix( piece->resultMatrix );
+   free( piece );
+ }
+
+ 
+ void
+initialize_Input_Matrices_Via( Matrix  **leftMatrix, Matrix **rightMatrix,
+                               ParamBag *paramBag )
+ { char *leftMatrixFileName, *rightMatrixFileName;
+   int   leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols;
+   
+      ParamStruc *param;
+      param = getParamFromBag( "leftMatrixRows", paramBag );
+   leftMatrixRows = param->intValue;
+      param = getParamFromBag( "leftMatrixCols", paramBag );
+   leftMatrixCols = param->intValue;
+   *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols );
+   
+      param = getParamFromBag( "leftMatrixFileName", paramBag );
+   leftMatrixFileName = param->strValue;  //no need to copy
+   read_Matrix_From_File( *leftMatrix,  leftMatrixFileName );
+   
+      param = getParamFromBag( "rightMatrixRows", paramBag );
+   rightMatrixRows = param->intValue;
+      param = getParamFromBag( "rightMatrixCols", paramBag );
+   rightMatrixCols = param->intValue;
+   *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols );
+   
+      param = getParamFromBag( "rightMatrixFileName", paramBag );
+   rightMatrixFileName = param->strValue;
+   read_Matrix_From_File( *rightMatrix, rightMatrixFileName );
+ }
+
+
+void parseLineIntoRow( char *line, float32* row );
+
+
+ void
+read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName )
+ { int    row, maxRead, numRows, numCols;
+   float32 *matrixStart;
+   size_t lineSz = 0;
+   FILE  *file;
+   char  *line = NULL;
+   
+   lineSz = 50000; //max length of line in a matrix data file
+   line = (char *) malloc( lineSz );
+   if( line == NULL ) BLIS_DKU__throwError( "no mem for matrix line" );
+   
+   numRows = matrixStruc->numRows;
+   numCols = matrixStruc->numCols;
+   matrixStart = matrixStruc->matrix;
+
+   printf("Matrix File Path: %s\n", matrixFileName);fflush(stdout);
+   file = fopen( matrixFileName, "r" ); if(!file){printf("not open! %d\n",__LINE__); fflush(stdin);}
+   fseek( file, 0, SEEK_SET );
+   for( row = 0; row < numRows; row++ )
+    {
+      if( feof( file ) )  BLIS_DKU__throwError( "file ran out too soon" );
+      maxRead = getline( &line, &lineSz, file );
+      if( maxRead == -1 ) BLIS_DKU__throwError( "prob reading mat line");
+      
+      if( *line == '\n') continue; //blank line
+      if( *line == '/' ) continue; //comment line
+      
+      parseLineIntoRow( line, matrixStart + row * numCols );
+    }
+   free( line );
+ }
+
+/*This function relies on each line having the proper number of cols.  It
+ * doesn't check, nor enforce, so if the file is improperly formatted it
+ * can write over unrelated memory
+ */
+ void
+parseLineIntoRow( char *line, float32* row )
+ {
+   char *valueStr, *searchPos;
+   
+      //read the float values
+   searchPos = valueStr = line; //start
+   
+   for( ; *searchPos != 0; searchPos++)  //bit dangerous, should use buff len
+    {
+      if( *searchPos == '\n' ) //last col..  relying on well-formatted file
+       { *searchPos = 0;
+         *row = atof( valueStr );
+         break;                                    //end FOR loop
+       }
+      if( *searchPos == ',' )
+       { *searchPos = 0;                           //mark end of string
+         *row = (float32) atof( valueStr );
+         row += 1;                                 //address arith
+            //skip any spaces before digits.. use searchPos + 1 to skip the 0
+         for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++);
+         valueStr = searchPos + 1;
+       }
+    }
+ }
diff -r 000000000000 -r d138e0acf9a0 Matrix_Mult.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Matrix_Mult.h	Sun Aug 26 03:04:50 2012 -0700
@@ -0,0 +1,81 @@
+/*
+ *  Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ */
+
+#ifndef MATRIX_MULT_H_
+#define MATRIX_MULT_H_
+
+#include <stdio.h>
+
+#include "../BLIS/BLIS_primitive_data_types.h"
+
+#include "ParamHelper/Param.h"
+
+//==============================  Structures  ==============================
+
+typedef
+struct
+ { int32 numRows;
+   int32 numCols;
+   float32 *matrix;  //2D, but dynamically sized, so use addr arith
+ }
+Matrix;
+
+/* This is the "appSpecificPiece" that is carried inside a DKUPiece.
+ *  In the DKUPiece data struc it is declared to be of type "void *".  This
+ *  allows the application to define any data structure it wants and put it
+ *  into a DKUPiece.
+ * When the app specific info is used, it is in app code, so it is cast to
+ *  the correct type to tell the compiler how to access fields.
+ * This keeps all app-specific things out of the DKU directory, as per the
+ *  DKU standard. */
+typedef
+struct
+ { 
+      // pointers to shared data..  the result matrix must be created when the
+      //  left and right matrices are put into the root ancestor DKUPiece.
+   Matrix * leftMatrix;
+   Matrix * rightMatrix;
+   Matrix * resultMatrix;
+
+      // define the starting and ending boundaries for this piece of the
+      //  result matrix.  These are derivable from the left and right
+      //  matrices, but included them for readability of code.
+   int prodStartRow, prodEndRow;
+   int prodStartCol, prodEndCol;
+      // Start and end of the portion of the left matrix that contributes to
+      //  this piece of the product
+   int leftStartRow, leftEndRow;
+   int leftStartCol, leftEndCol;
+      // Start and end of the portion of the right matrix that contributes to
+      //  this piece of the product
+   int rightStartRow, rightEndRow;
+   int rightStartCol, rightEndCol;
+ }
+MatrixProdPiece;
+
+//==============================  Functions  ================================
+void readFile();
+
+Matrix *makeMatrix( int32 numRows, int32 numCols );
+Matrix *makeMatrix_Flat( int32 numRows, int32 numCols );
+void    freeMatrix_Flat( Matrix * matrix );
+void    freeMatrix( Matrix * matrix );
+
+ MatrixProdPiece *
+makeMatrixProdPiece_Empty();
+ MatrixProdPiece *
+makeMatrixProdPiece_FromMatrixProdPiece( MatrixProdPiece * piece );
+ MatrixProdPiece *
+makeMatrixProdPiece_FromMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
+
+void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName );
+
+void freeMatrixProdPiece_Flat( MatrixProdPiece * piece );
+void freeMatrixProdPiece( MatrixProdPiece * piece );
+
+
+//===========================================================================
+
+#endif /*MATRIX_MULT_H_*/
diff -r 000000000000 -r d138e0acf9a0 Read_Input_Matrix.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Read_Input_Matrix.c	Sun Aug 26 03:04:50 2012 -0700
@@ -0,0 +1,604 @@
+/* 
+ * File:   Read_Input.c
+ * Author: SeanHalle@yahoo.com
+ *
+ * Created on June 15, 2009, 10:12 AM
+ */
+
+#include <stdio.h>
+
+//========================
+scanf("%[^\t]",a);  //matches everything except tab character
+
+//======================= Write a structure into a file  ====================
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define MAX 50
+
+
+typedef struct {
+    char  name[10];
+    int key;
+} file_record;
+
+/* this function adds the relatiuve addres to the index for a key */
+void create_index(long index[], int key, long rel_add ) {
+    index[key] = rel_add;
+}
+
+/* this function writes a record to the file */
+void write_rec(FILE *fp, file_record rec) {
+   fwrite(&rec,sizeof(rec),1,fp);
+}
+
+void main() {
+    long rel_add;
+    int key;
+    file_record frec;
+    long index[MAX];/* an index list*/
+    int n,i;
+
+    FILE *recfile=NULL,*ifile=NULL;
+    /* this initializes the index list to all ? */
+    for(i=0; i< MAX; i++)
+        index[i]= (-1);
+
+    recfile=fopen("mfile","w");
+    if(recfile == NULL) {
+        printf("Error in openeing file mfile\n");
+        exit(0);
+    }
+    rel_add = 0 ;
+    do {
+        printf(" Enter the data vlue and the key of the record to be added to file mfile\n");
+        scanf("%s %d",frec.name,&frec.key);
+        while(index[frec.key] != (-1)) {
+            printf(" A record with this key value already exist in a file enter record key value\n");
+            scanf("%s %d",frec.name,&frec.key);
+        }
+        create_index(index,frec.key,rel_add);
+         write_rec(recfile,frec);
+        rel_add =  ftell(recfile);
+        /* this sets the relative address for the next record to be
+      the value of current file position pointer in bytes from
+      the beginning of the file */
+         printf("Enter 1 to continue adding records to the file\n");
+         scanf("%d",&n);
+    }while(n == 1);
+
+    ifile=fopen("index_file","w");
+
+    if(ifile == NULL) {
+       printf("Error in openeing file index_file\n");
+         exit(0);
+    }
+
+    fwrite(index,sizeof(index),1,ifile);/*writes the complete index into the index_file */
+    fclose(recfile);
+    fclose(ifile);
+    printf("Enter 1 if you want to retrieve a record\n");
+    scanf("%d",&n);
+
+    if( n == 1) {
+       ifile=fopen("index_file","r");
+       if(ifile == NULL) {
+           printf("Error in openeing file index_file\n");
+           exit(0);
+       }
+       fread(index,sizeof(index),1,ifile);
+
+       /* reads the complete index into the index list from the index_file*/
+       fclose(ifile);
+       recfile=fopen("mfile","r");
+
+       if(recfile == NULL) {
+           printf("Error in openeing file mfile\n");
+           exit(0);
+       }
+    }
+    printf("THE CONTENTS OF FILE IS \n");
+
+    while( (fread(&frec,sizeof(frec),1,recfile)) != 0)
+         printf("%s %d\n",frec.name,frec.key);
+
+    do {
+        printf("Enter the key of the record to be retrieved\n");
+        scanf("%d",&key);
+        rel_add = index[key]; /*gets the relative address of the record from index list */
+        if( (fseek(recfile,rel_add,SEEK_SET))!= 0) {
+             printf("Error\n");
+             exit(0);
+        }
+        fread(&frec,sizeof(frec),1,recfile);
+        printf("The data value of the retrieved record is %s\n",frec.name);
+        printf("Enter 1 if you want to retrieve a record\n");
+        scanf("%d",&n);
+    } while(n == 1);
+
+    fclose(recfile);
+}
+
+
+
+
+//========================== Read words in file demo  =======================
+
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+struct node {
+    struct node    *left;       /* tree to the left */
+    struct node    *right;      /* tree to the right */
+    char           *word;       /* word for this tree */
+};
+
+/* the top of the tree */
+static struct node *root = NULL;
+
+/*
+ * memory_error -- write error and die                  *
+ */
+void memory_error(void)
+{
+    fprintf(stderr, "Error:Out of memory\n");
+    exit(8);
+}
+
+/*
+ * save_string -- save a string on the heap             *
+ *                                                      *
+ * Parameters                                           *
+ *      string -- string to save                        *
+ *                                                      *
+ * Returns                                              *
+ *      pointer to malloc-ed section of memory with     *
+ *      the string copied into it.                      *
+ */
+char *save_string(char *string)
+{
+    char *new_string;   /* where we are going to put string */
+
+    new_string = malloc((unsigned) (strlen(string) + 1));
+
+    if (new_string == NULL)
+        memory_error();
+
+    strcpy(new_string, string);
+    return (new_string);
+}
+/*
+ * enter -- enter a word into the tree                  *
+ *                                                      *
+ * Parameters                                           *
+ *      node -- current node we are looking at          *
+ *      word -- word to enter                           *
+ */
+void enter(struct node **node, char *word)
+{
+    int  result;        /* result of strcmp */
+
+    char *save_string(char *);  /* save a string on the heap */
+
+    /*
+     * If the current node is null, we have reached the bottom
+     * of the tree and must create a new node.
+     */
+    if ((*node) == NULL) {
+
+  /* Allocate memory for a new node */
+        (*node) = malloc(sizeof(struct node));
+        if ((*node) == NULL)
+            memory_error();
+
+  /* Initialize the new node */
+        (*node)->left = NULL;
+        (*node)->right = NULL;
+        (*node)->word = save_string(word);
+  return;
+    }
+    /* Check to see where the word goes */
+    result = strcmp((*node)->word, word);
+
+    /* The current node already contains the word, no entry necessary */
+    if (result == 0)
+        return;
+
+    /* The word must be entered in the left or right sub-tree */
+    if (result < 0)
+        enter(&(*node)->right, word);
+    else
+        enter(&(*node)->left, word);
+}
+/*
+ * scan -- scan the file for words                      *
+ *                                                      *
+ * Parameters                                           *
+ *      name -- name of the file to scan                *
+ */
+void scan(char *name)
+{
+    char word[100];     /* word we are working on */
+    int  index;         /* index into the word */
+    int  ch;            /* current character */
+    FILE *in_file;      /* input file */
+
+    in_file = fopen(name, "r");
+    if (in_file == NULL) {
+        fprintf(stderr, "Error:Unable to open %s\n", name);
+        exit(8);
+    }
+    while (1) {
+        /* scan past the whitespace */
+        while (1) {
+            ch = fgetc(in_file);
+
+            if (isalpha(ch) || (ch == EOF))
+                break;
+        }
+
+        if (ch == EOF)
+            break;
+
+        word[0] = ch;
+        for (index = 1; index < sizeof(word); ++index) {
+            ch = fgetc(in_file);
+            if (!isalpha(ch))
+                break;
+            word[index] = ch;
+        }
+        /* put a null on the end */
+        word[index] = '\0';
+
+        enter(&root, word);
+    }
+    fclose(in_file);
+}
+/*
+ * print_tree -- print out the words in a tree          *
+ *                                                      *
+ * Parameters                                           *
+ *      top -- the root of the tree to print            *
+ */
+void print_tree(struct node *top)
+{
+    if (top == NULL)
+        return;                 /* short tree */
+
+    print_tree(top->left);
+    printf("%s\n", top->word);
+    print_tree(top->right);
+}
+
+int main(int argc, char *argv[])
+{
+    if (argc != 2) {
+        fprintf(stderr, "Error:Wrong number of parameters\n");
+        fprintf(stderr, "      on the command line\n");
+        fprintf(stderr, "Usage is:\n");
+        fprintf(stderr, "    words 'file'\n");
+        exit(8);
+    }
+    scan(argv[1]);
+    print_tree(root);
+    return (0);
+}
+
+
+
+
+//==================  Get line demo =========================
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define _GNU_SOURCE
+
+int main(int argc, char* argv[]) {
+
+size_t lsize = 0;
+ssize_t read;
+FILE* conf_file;
+char* line = NULL;
+
+if (argc == 1) {
+printf("\nCommand syntax:\n");
+printf("\n\tprogramd [ start | stop ]\n");
+printf("\t\tstart: start daemon\n");
+printf("\t\tstop: stop daemon\n");
+}
+else if (strcmp(argv[1], "start") == 0) {
+conf_file = fopen("/etc/program/program.conf", "r");
+fseek(conf_file, 0, SEEK_SET);
+while (!feof(conf_file)) {
+while (getline(&line, &lsize, conf_file) != -1) {
+printf("%s", line);
+}
+}
+}
+
+//======================
+fopen
+fread
+fscanf
+getline
+
+
+//================== scanf demo =============================
+/* Q: need this for the GCC atomic operations?
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+
+void main (void) {
+
+  /* We will use one floating-point and one integer variable. */
+
+  double x = .00000123456789;
+  int n = 12345;
+
+
+  /* Display plain text. */
+
+  printf("This is a test\n");
+  printf("This\tis\nanother\ttest\n\n");
+
+
+  /* Display an integer. */
+
+  printf("Here is n: %d\n\n", n);
+
+
+  /* Display a double three different ways. */
+
+  printf("Here is x: %g\n", x);
+  printf("Here is x: %f\n", x);
+  printf("Here is x: %e\n\n", x);
+
+
+  /* Display two numbers. */
+
+  printf("Here are n (%d) and x (%g)\n", n, x);
+
+}
+
+//===========================================================
+
+void read_One_MB(int f, int MB_y, int MB_x, MB_Info *curMB, FILE *inputFH);
+
+//=================================================================
+       { dataFile    = new File( fileName );
+         paramScanner = new Scanner( dataFile  );
+       }
+      catch( Exception e )
+       { dataFile    = null;
+         paramScanner = null;
+         System.err.println( "couldn't open file: " + fileName );
+       }
+
+      paramScanner.useDelimiter(",\\s*|\n|\r\n");
+
+   MatrixInRowMajor ( int _numRows, int _numCols )
+    {
+      numRows = _numRows;  // lives in super class
+      numCols = _numCols;
+
+      rows = new float[numRows][];
+
+      for( int i = 0; i < numRows; i++ )
+       {
+         rows[ i ] = new float[ numCols ];
+       }
+    }
+  public void fillSelfFromFile( String fileName )
+    { float floatValue = 0;
+      String floatString;
+
+      super.setUpScanner( fileName );
+
+      for( int r = 0; r < numRows; r += 1 )
+       { for( int c = 0; c < numCols; c += 1 )
+          { floatString = paramScanner.next();
+            floatValue = Float.parseFloat( floatString );
+            rows[ r ][ c ] = floatValue;
+          }
+       }
+    }
+
+//================================================================
+
+
+/* Get the data strucs implicitly from the header file
+ */
+void read_All_Frames( FILE *inputFH )
+ { int MB_x, MB_y, f;
+   
+   for( f = 1; f <= numFrames; f++ ) //allocated 10 frames of mem, use 8
+    {    //PixInFrame, PixInLine, MBsInFrame, frameWidthInMB, etc in header
+      uint8_t * startOfFrame_L  = &(input_img_y[f][0]);
+      uint8_t * startOfFrame_CR = &(input_img_cr[f][0]);
+      uint8_t * startOfFrame_CB = &(input_img_cb[f][0]);
+      
+      // Reads in one frame
+      for( MB_y = 0; MB_y < frameHeightInMB; MB_y++ )
+       {
+         for( MB_x = 0; MB_x < frameWidthInMB; MB_x++ )
+          {    //DEBUG: addr arith checks out (size of MB_Info is 240)
+            MB_Info *MBInfo = &(input_MBs[f][0]) +
+                                              (MB_y * frameWidthInMB + MB_x);
+            // Read Macroblock Parameters and pixel data
+            read_One_MB( f, MB_y, MB_x, MBInfo, inputFH );
+            
+            MBInfo->PixInLine_L  = oPixInLine_L;
+            MBInfo->PixInLine_C  = oPixInLine_C;
+            
+            int offsetToMBsPix_L = 
+                       MB_y * (oPixInLine_L * MBHeight_L) + MB_x * MBWidth_L;
+            int offsetToMBsPix_C = 
+                       MB_y * (oPixInLine_C * MBHeight_C) + MB_x * MBWidth_C;
+               //DEBUG: addr arith checks out
+            MBInfo->startOfMBsPix_L  = startOfFrame_L  + offsetToMBsPix_L;
+            MBInfo->startOfMBsPix_CR = startOfFrame_CR + offsetToMBsPix_C;
+            MBInfo->startOfMBsPix_CB = startOfFrame_CB + offsetToMBsPix_C;
+          }
+       }
+    }
+ }
+
+/* Reads the parameters of macro block, then reads pixel data of MB.
+ * Give it frame num, y and x of macro block.  It gets addresses of the
+ *  arrays by including the header.
+ */
+void read_One_MB(int f, int MB_y, int MB_x, MB_Info *curMB, FILE *inputFH)
+ {
+   int dir, line, i, tmp, x, y, pixelIndex;
+   char strTemp[90];
+   MB_Info_throwAway throwAway;
+   
+   fscanf(inputFH, "%s",&(strTemp[0]));  //get rid of unused preamble string
+   
+   //(*curMB).MB_x = MB_x;
+   //(*curMB).MB_y = MB_y;
+   
+   
+   
+   /************  Read parameters ******************/
+   
+      // first, get rid of unused data in the input stream.
+   fscanf(inputFH, "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d",
+         &(throwAway).MB_X,
+         &(throwAway).MB_Y,
+         &(throwAway).mb_stride,
+         &(throwAway).deblocking_filter,
+         &(throwAway).picture_structure,
+         &(throwAway).slice_alpha_c0_offset,
+         &(throwAway).slice_type,
+         &(throwAway).chroma_qp_index_offset[0],
+         &(throwAway).chroma_qp_index_offset[1],
+         &(throwAway).mb_xy_type,
+         &(throwAway).mb_xy_type_m1,
+         &(throwAway).mb_xy_type_top,
+         &(throwAway).qscale_mb_xy,
+         &(throwAway).qscale_mb_xy_m1,
+         &(throwAway).qscale_mb_xy_top,
+         &(throwAway).slice_table_mb_xy,
+         &(throwAway).slice_table_mb_xy_m1,
+         &(throwAway).slice_table_mb_xy_top
+        );
+   
+   for(dir = 0; dir < 2; dir++)
+      for(line=0; line < 5*8; line++)
+         fscanf(inputFH, "%d ", &(throwAway).ref_cache[dir][line]);
+   
+   for(dir = 0; dir < 2; dir++)
+      for(line=0; line < 5*8; line++)
+         fscanf(inputFH, "%d %d ", &(throwAway).mv_cache[dir][line][0],
+                                   &(throwAway).mv_cache[dir][line][1]);
+   
+   for(line=0; line < 6*8; line++)
+      fscanf(inputFH, "%d ", &(throwAway).non_zero_count_cache[line]);
+   
+   
+      //now, get data will use in deblocking
+   
+   fscanf(inputFH, "%i %i", &(*curMB).endSubBlk[0], &(*curMB).endSubBlk[1]);
+   fscanf(inputFH, "%d",&(*curMB).startSubBlk[0]);
+   fscanf(inputFH, "%d",&(*curMB).startSubBlk[1]);
+   
+   //read bS
+   for(dir = 0; dir < 2; dir++)
+      for(line=0; line < 4; line++)
+         for (i=0; i < 4; i++)
+          {
+            fscanf(inputFH, "%d",&tmp);
+            (*curMB).bS[dir][line][i] = (int) tmp;
+          }
+   
+   //read luma_qp
+   for(dir = 0; dir < 2; dir++)
+      for(line=0; line < 4; line++)
+         fscanf(inputFH, "%d",&(*curMB).luma_qp[dir][line]);
+   
+   //read chroma_qp
+   for(dir = 0; dir < 2; dir++)
+      for(line=0; line < 4; line++)
+         fscanf(inputFH, "%d",&(*curMB).chroma_qp[dir][line]);
+    
+    
+    
+   /********* Have MB params, now read pixel data of MB *************/
+   
+      /* The MB pixel data is read in one MB at a time.  All pixel data
+       *  goes into a single array.  The pixel data is layed out in the array
+       *  the same as the pixels appear on the screen, in screen-row major
+       *  order.  So, all the pixels in the 0th line at the top of the frame
+       *  are next to each other, starting at the beginning of the array.
+       *  Then, the second line begins at arrayAddr + frame_width_in_pixels,
+       *  and so on.
+       * Get the pixels for one MB, so have to map the MB location onto the
+       *  array location.
+       * The position of the MB's 0,0 pixel is offset by the data of all  the
+       *  MBs in MB-lines above, and by all the pixels in MBs to the left.
+       * So, the number of pixels in a MB-line is the number of lines in a MB
+       *  times the number of pixels in a frame-line.  Multiply that by the
+       *  number of MB-lines above the current MB.
+       * Next, add the offset within the current MB-line, which is the
+       *  number of MBs to the left times the width, in pixels, of one MB.
+       * Then, to get the offset of a particular pixel in the MB from the
+       *  start of the MB, take the number of lines in the MB above the
+       *  current pixel times the pixels-per-frame-line, then add the number
+       *  of pixels to the left within the MB.
+       *
+       *(MB_y * MB_height * Frm_width + MB_x * MB_width) + (y * Frm_width + x)
+       */
+      
+      /*  read the pre-deblocking MB pixels, then the correct final pixels.
+       * Start of a macro block is num MB rows above * pixel lines in height
+       *  of a MB * pixels in a line of frame, plus the numMB to left * pixel
+       *  width of a MB.. MB_y and MB_x start at 0, so they are num above
+       *  and num to left, repectively.
+       * Many of the width and height values are defined in header.*/
+      //DEBUG: addr arith checks out
+   int offsetToMBsPix_L = MB_y * (oPixInLine_L * MBHeight_L) + MB_x * MBWidth_L;
+   for(y=0; y < MBHeight_L; y++)
+    { // first read all input Y, then all correct output Y
+      int offsetToLineInMB_L = offsetToMBsPix_L + y * oPixInLine_L;
+      for(x=0; x < MBWidth_L; x++)
+       { pixelIndex = offsetToLineInMB_L + x;
+         fscanf(inputFH, "%i",&tmp);
+         *(&input_img_y[f][0] + pixelIndex) = (char) tmp;
+       }
+      for(x=0; x < MBWidth_L; x++)
+       { pixelIndex = offsetToLineInMB_L + x;
+         fscanf(inputFH, "%i",&tmp);
+         *(&correct_img_y[f][0] + pixelIndex) = (char) tmp;
+       }
+    }
+   
+      //read croma b and r.. in all rest of code, goes "R before B" but here,
+      // input stream has the blue before the red..  FYI
+      //DEBUG: addr arith appears to check out (assuming have right model)
+   int offsetToMBsPix_C = MB_y * (oPixInLine_C * MBHeight_C) + MB_x * MBWidth_C;
+   for (y=0; y < MBHeight_C; y++)
+    { // first read all input CB & CR, then all correct output CB & CR
+      int offsetToLineInMB_C = offsetToMBsPix_C + y * oPixInLine_C;
+      for(x=0; x < MBWidth_C; x++)
+       { int pixelIndex_C = offsetToLineInMB_C + x;
+         fscanf(inputFH, "%d",&tmp);
+         *(&input_img_cb[f][0] + pixelIndex_C) = (char) tmp;
+         fscanf(inputFH, "%d",&tmp);
+         *(&input_img_cr[f][0] + pixelIndex_C) = (char) tmp;
+       }
+      for(x=0; x < MBWidth_C; x++)
+       { int pixelIndex_C = offsetToLineInMB_C + x;
+         fscanf(inputFH, "%d",&tmp);
+         *(&correct_img_cb[f][0] + pixelIndex_C) = (char) tmp;
+         fscanf(inputFH, "%d",&tmp);
+         *(&correct_img_cr[f][0] + pixelIndex_C) = (char) tmp;
+       }
+    }
+ }
diff -r 000000000000 -r d138e0acf9a0 main.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/main.c	Sun Aug 26 03:04:50 2012 -0700
@@ -0,0 +1,44 @@
+/*
+ *  Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * author seanhalle@yahoo.com
+ */
+
+
+#include <stdio.h>
+#include <time.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+#include <sys/time.h>
+#include <malloc.h>
+
+#include "BLIS_CONSTANTS.h"
+#include "../BLIS/BLIS.h"
+#include "../BLIS/DKU/DKU_common/DKU.h"
+
+#include "Matrix_Mult.h"
+
+/**
+ * This is the DKU version of  Matrix Multiply sample application
+ * 
+ */
+int main( int argc, char **argv )
+ { Matrix          *leftMatrix, *rightMatrix;
+   ParamBag *paramBag;
+
+   paramBag = makeParamBag();
+   readParamFileIntoBag( argv[1], paramBag );
+   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
+   
+   resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix );
+
+   printf("\nresult matrix: \n");
+   printMatrix( resultMatrix );
+   
+//   BLIS_DKU__print_Stats_forInst( DKU_INST_MM );
+   
+   exit(0); //cleans up
+ }
+