# HG changeset patch # User Sean Halle # Date 1345975490 25200 # Node ID d138e0acf9a06d2989ff63a9481829e1329eb60b Initial add of standard DKU matrix mult code -- to be modified diff -r 000000000000 -r d138e0acf9a0 BLIS_CONSTANTS.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BLIS_CONSTANTS.h Sun Aug 26 03:04:50 2012 -0700 @@ -0,0 +1,20 @@ +/* + * File: BLIS_CONSTANTS.h + * Author: SeanHalle@yahoo.com + * + * Created on October 27, 2009, 6:19 AM + */ + +#ifndef _BLIS_CONSTANTS_H +#define _BLIS_CONSTANTS_H + + //DKU Instance ID enum. Must start at 1. + //The directory, header, and init file for each instance of the DKU + // pattern is named the same as the enum. +enum + { DKU_INST_MM = 1 + }; + + +#endif /* _BLIS_CONSTANTS_H */ + diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/Bundling_Quad.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DKU_INST_MM/Bundling_Quad.c Sun Aug 26 03:04:50 2012 -0700 @@ -0,0 +1,349 @@ +/* + * Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * + * Author: SeanHalle@yahoo.com + * + */ + +#include "malloc.h" +#include "DKU_INST_MM.h" +#include "../../BLIS/DKU/DKU_common/DKU.h" + +//Positions in the bundle +enum + { szPos = 0, + numLRPos, + numLCPos, + numRRPos, + numRCPos, + LBMatrixPos, + numPos = LBMatrixPos + }; + +//============================ Bundling Quad =============================== + +/* This is the set of four bundling functions: + * + * bundleInputs -- takes a DKU piece and returns an array of data that + * contains all the information the Kernel will need to + * process that DKU piece + * + * unbundleInputs -- takes the array returned by bundleInputs and turns it + * into a DKU piece that can be given to the Kernel + * + * bundleResults -- takes a DKU piece that has finished going through the + * Kernel and places all result information into an array + * + * unbundleResults -- takes the output from bundleResults plus the original + * DKU piece whose inputs were bundled to produce the + * results, and modifies the state of the original DKU + * piece to be as if it had gone through the kernel. + * + * The bundling quad ("quad" because there are four bundling functions).. the + * bundling quad is only used for distributed memory hardware. When used, + * they can be thought of as operating in two separate memories. + * The bundleInputs and unbundleResults operate in one memory, where the full + * original data structure is. + * The unbundleInputs and bundleResults operate in the second memory, where + * the data for one piece gets sent to. + * + * + * Call sequence: + * + * call bundlInputs in memory space of original data, giving it a DKU piece. + * It returns a pointer to a byte array. (First int32 in array is its size) + * The byte array is sent to remote memory to be processed. + * In the remote memory, the byte array is received. + * The unbundleInputs function is called on it, which creates a new DKUPiece + * in the heap of the remote memory, and creates on the heap of the remote + * memory any data structures embedded within the DKUPiece. + * The unbundleInputs function may re-use portions of the byte array it is + * given, so the run-time in the remote memory must perform buffer + * management for its communications appropriately. + * The byte array that the unbundleInputs function is given must have been + * allocated on the heap within the same malloc state as the unbundleInputs + * and bundleResults functions are linked to. + * The returned DKUPiece is given to the Kernel function to process. + * The DKUPiece is then given to the bundleResults function, which returns + * a pointer to a byte array. The unbundleResults may simply return the + * pointer to the same received inputBundle, so the communication and memory + * management in the remote memory must behave accordingly. The + * bundleResults function will free any memory allocated on the heap by the + * unbundleInputs function. The only memory remaining on the heap when the + * bundleResults function completes is the byte array returned by it. This + * is the reason that the original inputBundle given to unbundleInputs must + * be allocated on the same heap that bundleResults frees from: + * bundleResults might call free on that inputBundle, so that call must + * modify the correct malloc state. + * The remote memory sends the resultBundle to the original memory, then + * frees the resultBundle. + * The original memory pairs the result bundle with the DKUPiece that the + * corresponding inputBundle was made from. + * The original memory calls unbundleResults, giving it the resultsBundle + * and the corresponding DKUPiece that remained in the original memory. + * The unbundleResults modifies the original memory such that it is identical + * to the state it would be in if the Kernel were called on the DKUPiece + * in the original memory. + * The original memory is responsible for freeing the inputBundle that was + * made by bundleInputs, but unbundleResults will free the resultsBundle + * when it is done with it. This allows unbundleResults to simply assign + * pointers to portions of the resultsBundle, rather than copying, if that + * is appropriate for the Kernel. + */ +//=========================================================================== + +/* Layout: + * sizeOfBundle + * numLeftRows + * numLeftCols + * numRightRows + * numRightCols + * + * + * + * calculate the sizes from the numbers of rows and cols of each, use the + * size of the left matrix to calc start addr of right matrix.. in the + * remote memory, just set pointers to the matrix locations in the bundles + */ +//TODO: carefully step through bundling quad -- check sizes and addr on 9x9 +void* bundleInputs_MM( DKUPiece* piece ) + { void *bundle; + int32 sizeOfLeftMatrix, sizeOfRightMatrix, sizeOfBundle; + float32 *leftBundleMatrix,*leftMatrix,*rightBundleMatrix,*rightMatrix; + float32 *leftBundleInsertPt, *rightBundleInsertPt; + float32 *leftMatrixReadPt, *rightMatrixReadPt; + + MatrixProdPiece *prodPiece = (MatrixProdPiece *)piece->appSpecificPiece; + leftMatrix = prodPiece->leftMatrix->matrix; + rightMatrix = prodPiece->rightMatrix->matrix; + + int32 leftStartRow = prodPiece->leftStartRow; + int32 leftStartCol = prodPiece->leftStartCol; + int32 rightStartRow = prodPiece->rightStartRow; + int32 rightStartCol = prodPiece->rightStartCol; + int32 numLeftRows, numLeftCols, numRightRows, numRightCols; + + numLeftRows = prodPiece->leftEndRow - leftStartRow + 1; + numLeftCols = prodPiece->leftEndCol - leftStartCol + 1; + numRightRows = prodPiece->rightEndRow - rightStartRow + 1; + numRightCols = prodPiece->rightEndCol - rightStartCol + 1; + + sizeOfLeftMatrix = sizeof( float32 ) * numLeftRows * numLeftCols; + sizeOfRightMatrix = sizeof( float32 ) * numRightRows * numRightCols; + sizeOfBundle = numPos * sizeof( int32 ) + + sizeOfLeftMatrix + sizeOfRightMatrix; + + bundle = BLIS_DKU__makeInputBundle( sizeOfBundle ); + + *((int32 *)bundle + szPos) = sizeOfBundle; + *((int32 *)bundle + numLRPos) = numLeftRows; + *((int32 *)bundle + numLCPos) = numLeftCols; + *((int32 *)bundle + numRRPos) = numRightRows; + *((int32 *)bundle + numRCPos) = numRightCols; + + //NOTE: Don't need to know start and end.. they will be set in remote + // memory according to the size (number) alone + + leftBundleMatrix = (float32 *) ((int32 *)bundle + LBMatrixPos); + rightBundleMatrix = leftBundleMatrix + sizeOfLeftMatrix/sizeof(float32); + + int32 r, c, numColsInLeftMatrix, numColsInRightMatrix; + numColsInLeftMatrix = prodPiece->leftMatrix->numCols; + leftBundleInsertPt = leftBundleMatrix; + for( r = 0; r < numLeftRows; r++ ) + { leftMatrixReadPt = leftMatrix + + (leftStartRow + r) * numColsInLeftMatrix + + leftStartCol; //these are counts, compiler does *4 + for( c = 0; c < numLeftCols; c++ ) + { + *(leftBundleInsertPt++) = *(leftMatrixReadPt++); + } + } + + // Have to do separate loops for left and right because may be diff shapes + numColsInRightMatrix = prodPiece->rightMatrix->numCols; + rightBundleInsertPt = rightBundleMatrix; + for( r = 0; r < numRightRows; r++ ) + { rightMatrixReadPt = rightMatrix + + (rightStartRow + r) * numColsInRightMatrix + + rightStartCol; + for( c = 0; c < numRightCols; c++ ) + { + *(rightBundleInsertPt++) = *(rightMatrixReadPt++); + } + } + + return bundle; + } + +/*Leave all the data in bundle, just assign pointers to it. + * Create a DKUPiece data structure, then fill in the sizes and pointers. + * + *This is app code, but need to make it easy for specialization. + *On machines like the Cell, the code for this function will be copied + * over to a separate file, along with any other DKU functions needed in + * the remote memory. + * + *The scheduler in remote memory is responsible for making space for the + * input bundle, and for freeing it (if needed) after the result bundle has + * been sent back. + * + *Model is that use an override of malloc that puts everything malloc'd from + * unbundleInputs calls and from bundleResults call into a buffer in remote + * mem. This entire buffer is freed after the return of the result bundle is + * complete. + */ + void +unbundleInputs_MM( void *bundle, DKUPiece *piece ) + { int32 sizeOfBundle, numLeftRows, numLeftCols, numRightRows, numRightCols; + int32 sizeOfLeftMatrix, sizeOfRightMatrix, sizeOfResultMatrix; + float32 *leftBundleMatrix, *rightBundleMatrix; + MatrixProdPiece *prodPiece; + + sizeOfBundle = *((int32 *)bundle + szPos); + + numLeftRows = *((int32 *)bundle + numLRPos); + numLeftCols = *((int32 *)bundle + numLCPos); + numRightRows = *((int32 *)bundle + numRRPos); + numRightCols = *((int32 *)bundle + numRCPos); + + + sizeOfLeftMatrix = sizeof( float32 ) * numLeftRows * numLeftCols; + sizeOfRightMatrix = sizeof( float32 ) * numRightRows * numRightCols; + sizeOfResultMatrix = sizeof( float32 ) * numLeftRows * numRightCols; + + leftBundleMatrix = (float32 *) ((int32 *)bundle + LBMatrixPos); + rightBundleMatrix = leftBundleMatrix + sizeOfLeftMatrix/sizeof(float32); + +//ARCH: check, for Cell, what's involved with re-defining malloc that appears +// inside DKUPiece maker and app spec piece maker.. can make it buffer-alloc? + + //that indicate how much stuff is created automatically inside + //IE, does this make produce the sched data, in bundling quad? + prodPiece = DKU__makeMatrixProdPiece_Flat( piece ); + piece->appSpecificPiece = prodPiece; + + prodPiece->leftMatrix = + DKU__makeMatrix_Flat( numLeftRows, numLeftCols, piece ); + prodPiece->leftMatrix->matrix = leftBundleMatrix; + + prodPiece->rightMatrix = + DKU__makeMatrix_Flat( numRightRows, numRightCols, piece ); + prodPiece->rightMatrix->matrix = rightBundleMatrix; + + prodPiece->resultMatrix = + DKU__makeMatrix_Flat( numLeftRows, numRightCols, piece ); + //The result matrix is malloc'd, so it's not inside the input bundle, + // so, to avoid copies when make the result bundle, make it now, then + // put into the DKUPiece the pos of result matrix in result bundle. + void *resultBundle = + BLIS_DKU__malloc_toPiece( sizeOfResultMatrix + sizeof(int32), piece); + *((int32 *)resultBundle) = sizeOfResultMatrix + sizeof(int32); + //skip over the "size" at the start of the result bundle + prodPiece->resultMatrix->matrix = (float32 *)((int32 *)resultBundle + 1); + + //now, fill in the iteration bounds so that the kernel processes + // the entirety of both matrices. + prodPiece->leftStartRow = 0; + prodPiece->leftEndRow = numLeftRows - 1; // "- 1" 'cause start at 0 + prodPiece->leftStartCol = 0; + prodPiece->leftEndCol = numLeftCols - 1; + + prodPiece->rightStartRow = 0; + prodPiece->rightEndRow = numRightRows - 1; + prodPiece->rightStartCol = 0; + prodPiece->rightEndCol = numRightCols - 1; + + prodPiece->prodStartRow = 0; + prodPiece->prodEndRow = numLeftRows - 1; + prodPiece->prodStartCol = 0; + prodPiece->prodEndCol = numRightCols - 1; + } + +/* + *Model is that use an override of malloc in remote mem that puts everything + * malloc'd from unbundleInputs calls and from bundleResults call into a + * buffer in remote mem. The entire buffer is freed after the send of the + * return result bundle is complete. + * + *The application only has to know that it does not perform free on any of + * the inputBundles, nor on any of the resultBundles, in either local or + * remote memory. + *The application also must create new DKUPiece s in the bundling quad plus + * in the Kernel (and all calls rooted at the Kernel) by using + * BLIS_DKU__makeDKUPiece + *Finally, the application must create app-specific pieces + *So anything malloc'd inside bundleResults is still inside the same buffer + * used by unbundleInputs. + */ +//ARCH: what about just give unbundleInputs and bundleResults an "align" +// operator that's HW-supplied. +// First element of bundle is size, then "0" terminated list of offsets to +// alignable-chunks, then the alignable chunks start. Alignment happens +// during bundling. HW also supplies a checker to see if aligned bundle is +// too big. (add a "revert divide" so can do a new divide to get smaller +// pieces, or something.. ) + void * +bundleResults_MM( DKUPiece *piece, void *inputBundle ) + { MatrixProdPiece *matProd; + float32 *matProdArr; + + matProd = (MatrixProdPiece *)piece->appSpecificPiece; + matProdArr = matProd->resultMatrix->matrix; + + //TODO: figure out soln for alignment or result matrix when it's inside + // input-bundle + + //results bundle already made (inside unbundleInputs fn), resultsBundle + // addr is one int32 before addr of result matrix array. + void *resultsBundle = ((int8 *)matProdArr - sizeof(int32)); + + return resultsBundle; + } + +/* The DKU standard says that the scheduler guarantees that the same + * DKUPiece that created an input bundle will be called to unbundle + * the corresponding results bundle. + * This means that the unbundleResults method is called on the original + * piece, that still has the position within the result matrix where + * this piece's results should go. + * So, copy the values in the incoming result matrix to the correct + * sub-block of the "original" result matrix + */ + void +unbundleResults_MM( void * resultBundle, DKUPiece *origPiece ) + { float32 *bundMatrixArr, *resMatrixArr, *bundleReadPt, *resultInsertPt; + MatrixProdPiece *matProd; + Matrix *resultMatrix; + + int32 resMatNumRows, resMatNumCols; + int32 prodStartRow, prodEndRow, prodStartCol, prodEndCol, r, c; + + bundMatrixArr = (float32 *) ((int32 *)resultBundle + 1); + + matProd = (MatrixProdPiece *) origPiece->appSpecificPiece; + resultMatrix = matProd->resultMatrix; + resMatrixArr = resultMatrix->matrix; + + resMatNumRows = resultMatrix->numRows; + resMatNumCols = resultMatrix->numCols; + + prodStartRow = matProd->prodStartRow; + prodEndRow = matProd->prodEndRow; + prodStartCol = matProd->prodStartCol; + prodEndCol = matProd->prodEndCol; + + //copy the results from the matrix in the bundle to + // the full result matrix. + bundleReadPt = bundMatrixArr; + for( r = prodStartRow; r < prodEndRow; r++ ) + { resultInsertPt = resMatrixArr + r * resMatNumCols + prodStartCol; + for( c = prodStartCol; c <= prodEndCol; c++ ) + { *(resultInsertPt++) = *(bundleReadPt++); + } + } + } + + diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/Communicators.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DKU_INST_MM/Communicators.c Sun Aug 26 03:04:50 2012 -0700 @@ -0,0 +1,435 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + */ + +#include "DKU_INST_MM.h" + +/*Communicators are used by the scheduler to send data from one piece to + * another. The kernel specifies when the communication is to take place + * by calling either a send communicator or a receive communicator. + *The communicators know what data to send and where to send it by reading + * information out of the DKUPiece structure. The information about which + * other piece to send to and what data is placed into the piece by the + * DKUPieceMaker and the Divider. + *This is how dependencies among data pieces are encoded and how the + * scheduler is informed of them. + */ + +/*In Deblocking, the dependency pattern is a 45 degree diagonal. A given + * macro block receives information from the macro block above it and the + * macro block to its left. + *This must be told to the scheduler, so that it can order the execution of + * pieces appropriately. + * application programmer has chosen to divide each + * screen frame into diagonals + */ + +/*Okay, so going with the plan that match the scheduler form to the + * application form. Expose explicitly that different applications have + * characteristics that can be taken advantage of by scheduler impl to make + * more efficient scheduler. + *So, take advantage of flexibility of BLIS interface philosophy.. make an + * interface that is tuned to particular characteristic of application to + * allow efficient scheduler impl that takes advantage. + *In this case, it is fact that have a pattern of dependencies. Make the + * dependencies first-class entities in the interface. Make a primitive for + * expressing the dependencies, and one for embodying the dependencies. + * + *The embodiment will be an array. The DKUPieceMaker will create the array + * and populate it for the initial pieces it makes.. something like that.. + * The DKUPieceMaker also returns a pointer to the initially-free piece. + * (Note that there's only ever one initially free piece. If there were two + * then they wouldn't depend on each other, and so could be combined into + * a single piece.. or? In some cases would require separate + * "combination" data-struc.. so, maybe an array of initially free) + *Each division places an array in the parent, with one position for each + * sub-piece. The position holds the count, of the number of propendents, + * for the sub-piece in the corresponding position in the sub-piece array. + * + *The fellow sub-pieces of a parent typically won't have any dependencies + * on each other.. the dependencies are to sub-pieces of other parents. + *So the DKUPieceMaker will create the largest pieces that can be sub-divided + * freely, and also create an array with the propendent count for each of + * those pieces. + *The undivider is where propendent counts are updated. The Undivider is + * what frees DKUPieces to be scheduled. That process is inside the + * scheduler implementation. + *The divider is written to know about the arrays of propendent counts of the + * other DKUPieces. So is the Undivider. The divider looks at the arrays + * that already exist and populates the new array for the currently being + * divided parent accordingly. It also puts in information for the Undivider + * to properly update the counts in the arrays when a piece finishes. + *Big question: can Albert perform a static analysis that can understand + * general code in the divider and undivider, given that it knows explicitly + * what the arrays are used for, and just has to look at the code that + * creates, populates, and updates the arrays.. The idea being that the + * polyhedral model can learn the dependencies this way. + * + *Each dependency has an associated communicator. A communicator is what + * performs transport of the propendent-generated-state to the receiving + * dependent. A communicator and a dependency arrow are the same thing. + *In shared memory, the communicator is normally not used. Only in distr + * memory is it invoked to bundle up the propendent data and carry it to the + * dependent. + *The Undivider tells the scheduler each time a completed propendent has + * updated the count of a given dependent. This will trigger the + * scheduler to fire the communicator, if it has been implemented that way. + *This lets the scheduler, for example, send data around before the + * propendent data is available. So, it gets better overlap of communication + * and computation. + * + *So, this scheme so far covers the case of "2D" parallelism in Deblocking. + * But it doesn't yet cover simulation where the communication happens in + * the middle of a loop nest. + *For that, still have the same communicator, which is still one-to-one with + * a dependency in the data. + *But now, the Kernel invokes the communicator, when the propendent has + * finished producing data.. By only calling in the propendent, causality is + * always enforced, with no extra mechanism required in the scheduler. + *The Kernel also calls the communicator in the dependent, in the position + * that data must be received before continuing on. + *This way, the scheduler is free to implement the timing of communication + * in many different ways. The communicator explicitly copies data to a + * separate communication-area that is made in the propendent DKUPiece + * by the Divider or DKUPieceMaker during creation. + *It returns when the copy is done, handing the scheduler a pointer to the + * data. The scheduler then handles moving the data to the dependent piece + * (or not, on shared memory). The receiving end of the communicator + * accepts a pointer to the data. The Kernel is written to access data + * through that pointer. + * + *Wanting to allow shared memory to NOT perform the copy, just pass a pointer + * to the area of data that the dependent needs, and have the Kernel access + * the data in the appropriate way (according to data in the DKUPiece pointed + * to).. thing is, still need something to perform the copy in the distr. + * memory case. + *So, seeing two different sending communicators, and two different receiving + * communicators. For distr mem, the sender does a copy into area in + * sender's DKUPiece, then returns pointer to that copy (it's a bundle, with + * first location being a uint32 with size of bundle).. or maybe have a + * pre-defined element in the DKUPiece. The receiver is implemented as part + * of the scheduler.. it returns a pointer to the Dependent calling Kernel. + *Shared mem communicators, the sender is implemented by the scheduler, as + * well as the receiver. The propendent Kernel simply passes the DKUPiece + * to the sender. + * + *Right.. so, three out of the four are implemented by the scheduler.. so + * just make the fourth be an optional do-hickie. The scheduler implements + * both the send and receive calls, and the application provides a + * communication bundler. : D + *The receiver still has two different cases: in one it gets a bundle, in + * the other it gets a pointer to a DKUPiece. No point in unbundling, just + * so the Kernel can do the gather operation again. So, the Kernel will have + * to have a different version, one for shared, second for distr.. yuck. + * + *Q: starting to have different "kinds" of DKUPiece now.. how going to + * handle backwards-and-forwards compatibility? + * + *For shared mem, + */ + +void * bundleComm_BarnesHut_110( DKUPiece piece ) + { + //this copies data out of the piece for the "110" going direction + //All communication bundlers have the same prototype: void * foo(DKUPiece) + //The bundlers are registered with the scheduler for this DKU instance + // via the BLIS_DKU__set_commBundler_ForID( &foo, commTypeID, DKU_Inst_ID) + //Which two pieces communicate is set by the propendent and dependent + // calls to communicate, which happen in the Kernel. The PieceMaker and + // Divider know which pieces communicate to which at what points in the + // Kernel. So, they insert into the DKUPiece data structure the ID of + // the piece they send to/receive from. When the Kernel reaches the + // communication point, it takes the propendent/dependent pieceID from + // the DKUPiece and passes that to the communicate() call (along with the + // commTypeID, which is fixed for each call that appears in the Kernel). + //In other words, the Kernel, Divider, and PieceMaker are the only ones + // that agree among themselves on what a particular commTypeID means. For + // example, for graphs, there is only one type ID, because the process of + // copying is the same no matter which direction.. but for a mesh, the + // copy is different for the top, bottom, and two edges.. + + /*There is the basic issue of exposing in the Kernel code whether running + * on shared memory or distr memory. + *Could either have a fixed Kernel that adapts, or have two Kernels that + * are chosen among by the scheduler. + *If the Kernel is fixed, have only a few choices: + * Kernel always thinks it's taking data from a DKUPiece, or + * Kernel always takes data via a fixed interface-call. + *Not sure how to hide if Kernel always takes from a DKUPiece.. and extra + * work in remote memory to re-create DKUPiece structure, for only reason + * to make interface nice (for programmer). + *With the fixed interface, could provide two adaptors: one for shared + * mem that gathers from a DKUPiece, other for a bundle. Then scheduler + * picks, behind the interface, which adaptor. + *How would that look to the Kernel? Something like "update", and the + * local memory changes.. but that only works if there's a fixed data- + * struc in local mem to take from as the Kernel computation progresses. + *IE, have an outer loop that the communication is in, and inner loops + * that use the result of communication. For shared mem, want that data- + * struc to be the appSpecificPiece of another DKUPiece, and locations in + * it will be read (or written) inside the inner loops. This means the + * scheduler has to figure out an ordering to run the Kernels that + * respects the dependencies. This is aided by the fact that the Kernel + * has a separate propendent and dependent call that states the order. + *There will actually be some Kernels that can't be run this way: the + * pattern of dependencies has no sequential soln, a copy must be done. + *Starting to think perhaps it's best to just always do a copy.. + * + *Other choice is two different versions of the Kernel.. one that reads + * from a DKUPiece, the other that reads from a commBundle. + *Or a hybrid Kernel that can read from both, using a flag + * in the DKUPiece that tells the Kernel whether to receive a + * commBbundle or whether to take from the normal data in the propendent. + *The two Kernel version is the most run-time efficient. The main drawback + * is that the app developer has to make identical changes in two + * different places.. any time they change one of the Kernel-copies they + * have to change the other too.. also, it feels weird having two + * different Kernels.. don't like it.. + *Kinda like the hybrid approach, it has the second least run-time + * overhead, just an IF statement that will be well predicted each time + * it accesses data during calculations.. Could even use a #define here, + * so the IF is known at compile time to always go one direction. The + * specialization module would change the #define to: SharedMem or to + * DistrMem. Or, make the if a #ifdef in the source.. but that's ugly + * to read. + *Something to percolate.. whatever is chosen, the problem is solved, + * just a matter of tradeoffs at this point.. + */ + } + +void sampleKernel( DKUPiece *piece ) + { + initializePropendents(); + for( outer = 0; outer < N; outer++ ) + { fromProp = getFromPropendent( piece->propPieceIDs[ NORTH_PROPENDENT ]); + for( inner = 0; inner < N; inner++ ) + { //This Kernel knows that data is an array because it's written + // by the app-programmer. It knows that fromProp is also an + // array because the app progr wrote the commBundler. + piece->appSpecPiece->data[ inner ] += fromProp[ inner ]; + } + sendToDependent( piece->depPieceIDs[ SOUTH_DEPENDENT ] ); + } + finalizeDependents(); + } + +/*In the divider, one has a single piece, which has communicators at its + * boundaries already. + *The divider cuts up the piece, and so knows which sub-pieces talk to + * which others, because it just made them all from the same parent piece. + *The tricky part is connecting the hierarchy. + *The pieces the parent communicated with are on the order of the size of + * the parent. Those pieces will likely have been divided as well.. now + * the question is how to hand-off from the parent to the appropriate + * sub-pieces. + *One way to do it is to have a Kernel running for each piece in the + * hierarchy, but of two kinds: hierachy-Kernels and "normal" Kernels. The + * hierarchy Kernels only do communication: they break an in-coming request + * among their sub-pieces, then gather the responses back together. + *This is inefficienct in one sense: direct communication between sub-pieces + * of different Kernels would be optimal. + *However, such a hierarchy will + * only exist when a physical hierarchy exists among machines. In that + * case, the extra scatter-gather work done by the hierarchy pieces might + * even be more efficient because it makes fewer, larger, communications + * between the larger physical entities. This is more likely to ameliorate + * the loss from the larger latency in communication at the larger physical + * division. + *Okay, so going with that, for now. When get details, may see some patterns + * for how to do direct communication among sub-pieces.. (but not holding + * my breath because each of two parents can be divided into a different + * number of sub-pieces.. so there is no one-to-one between sub-pieces on + * the edge of one parent and sub-pieces on a communicating edge of another + * parent.) + * + *This sample is for a big linked list. Each piece is just a number of nodes + * of the list. + *When the Divider makes sub-pieces, it knows which ones communicate across + * the boundaries of the parent (because the application programmer wrote + * the Divider and placed the code in it that handles the boundaries of the + * parent). + *Patterns for how to do this part: + *Could have the Divider create some new structure that it places in the + * parent that holds the state for the CommKernel. Put into that structure + * all the sub-piece-IDs that will communicate with it. Into the sub-pieces + * put a commID not of the parent piece, but of the parent's CommKernel. So + * a piece gains a second ID when it is divided. It keeps its original + * commID and uses that to communicate with siblings, while it uses the + * subCommID to communicate with sub-pieces. + *Just going with that one idea for the moment.. + *So, for the Linked List example, the Divider will set all the sub-pieces + * to talk to each other, and set the end-pieces to talk to the subCommID of + * the parent. + *The parent, meanwhile, will have a normal commID with which it talks to + * its siblings. + *When the normal Kernel modifies the linkings in the list, it has to check + * if one of the elements modified is a boundary element, and if so if the + * new arrangement has changed which element is the boundary. + *For example, an element is added to one end of a sub-piece. The added + * element has its link-ptr set to a value that indicates it's a boundary + * element, for example NULL or -1. The old boundary element's pointer is + * set to the new boundary element. The commID is taken from the old + * boundary element and put into the new boundary element. Done. + *The CommKernel will take advantage of the fact that it knows it's in a + * hierarchy.. it will keep an array of the values at the boundaries of its + * siblings. (Note that one DKUPiece is disallowed from holding a pointer + * to another DKUPiece).. the values may get out-of-sync, so they will be + * fixed-up when detected. + *Here's the pattern: when CommKernel gets comm from sub-piece, it looks to + * see what kind of comm it is.. if it's a "here's a value to insert", then + * it checks the end-values of its siblings and picks the sibling it belongs + * on. It sends to that sibling (requires unsolicited reception mechanism, + * such as the signals method for re-divide).. includes its own boundary + * values (free to piggy-back).. that updates the receiver's view of the + * sibling-piece's boundary values. + * + *When CommKernel receives a value-to-insert, + * it checks the values at its two boundary elements. If the value is + * between, then it accepts it. If not, it responds to the sender, telling + * the sender what the receiver's actual boundary values are. It then sends + * the value to the piece it believes it should go to (received the sender's + * boundary values along with the insert value, so it's certain it won't + * send back to where it came from..) Eventually the value will land at + * the correct CommKernel. + *Seeing pieces given to the CommKernel of the top-level parent. It then + * hands them out among its children, and from there to next level of + * children, and so on. Notice that siblings talk directly to each other, + * they don't go up to the parent then back down. + *The value of this abstract data type will be handling an enormous number + * of inserts, deletes, and lookups. + * + */ +void sampleDivider( DKUPiece *piece, int numPieces ) + { DKUPiece * newPiece; + //First sub-piece, so it is a boundary of the parent + //Figure out if parent has a sibling, or if it is natural boundary + if(numPieces < 2) return; //leave sub-pieces empty if only 1 sub-piece + + newPiece = makeASubPiece( someValues ); + if( BLIS_DKU__isNaturalBoundary( piece->propPieceIDs[ LEFT_PROPENDENT ]) ) + { newPiece->propPieceIDs[ LEFT_PROPENDENT ] = + BLIS_DKU__makeNaturalBoundaryPiece( piece, newPiece, DKU_INST_ID ); + } + else //parent has a sibling, so communicate with parent's CommKernel + { newPiece->propPieceIDs[ LEFT_PROPENDENT ] = + BLIS_DKU__giveCommKernelAsPropendent( piece ); //scheduler returns the + //thing that it has implemented as "addr" of CommKernel of piece + } + for( pieceIdx = 1; pieceIdx < numPieces - 1; pieceIdx++ ) + { newPiece = makeASubPiece( someValues ); + newPiece->propPieceIDs[ LEFT_PROPENDENT ] = (subPieces[pieceIdx - 1]); + subPieces[pieceIdx-1]->propPieceIDs[ RIGHT_PROPENDENT ] = (newPiece); + } + newPiece = makeASubPiece( someValues ); + newPiece->propPieceIDs[ LEFT_PROPENDENT ] = (subPieces[numPieces - 1]); + if( BLIS_DKU__isNaturalBoundary( piece->propPieceIDs[ RIGHT_PROPENDENT ])) + { newPiece->propPieceIDs[ RIGHT_PROPENDENT ] = + BLIS_DKU__makeNaturalBoundaryPiece( piece, newPiece, DKU_INST_ID ); + } + else //parent has a sibling, so communicate with parent's CommKernel + { newPiece->propPieceIDs[ RIGHT_PROPENDENT ] = + BLIS_DKU__giveCommKernelAsPropendent( piece ); //scheduler returns the + //thing that it has implemented as "addr" of CommKernel of piece + //Might have to make left propendent and right propendent be different + // addresses.. in which case include a directionID in the call to + // the scheduler asking for the address. + //Might be something about matching dependents with propendents.. not + // sure how that's going to play out.. + //There's "pull from propendent" and "push to dependent" which are + // interrupt-model.. then there's propendent sends and dependent + // receives. + + } + } +/*Some ill-fits in here.. need to do real app, with real dependencies and + * real comm in it.. + * + *Thinking perhaps give each piece a "name" struc.. the makeDKUPiece() + * creates for DKUInstances that have a communicator registered.. (will + * add some overhead to makeDKUPiece if have to do DKU-instance lookup) + *WANT DKU-INSTANCE LOOKUP TO BE TRANSFORMED BY SPECIALIZATION + * in practice, want the lookup to be performed statically, to eliminate the + * dynamic overhead.. can do this if define semantics of DKUInstance to + * be one-time and one-time only setting of functions to an instance.. have + * to check this statically in the BLIS-rule-checker that's run in the + * makefile in the sequential development environment. + * + *Then, the Divider simply copies the name-struc out of target pieces and + * puts it into source pieces. + *This allows, for example, making commKernels that have the name-strucs of + * all the siblings.. so, for example, the commKernel can calculate which + * of the siblings it should send to.. in fact, each of the Kernels can + * calculate which of the siblings it should sent to.. re-use the same + * CommKernel in the leaf-kernels as well as in the parent.. + *The only complication to re-using CommKernel on all levels is the + * boundaries.. just make the boundary be the CommKernel in the parent, + * or a natural boundary.. + */ + + + +/*The purpose of a CommKernel is to gather communications from all the sub- + * pieces, put them together, and turn them into communications from the + * parent piece. And vice-versa: receive comm, break it up, hand a portion + * to each sub-piece. + *This sample is for a big linked list. + */ +void sampleCommKernel( DKUPiece *piece ) + { + + } + +/*Q to Albert: just how bad is this? Will a static tool be able to + * understand the linkages, given that they are a fixed pattern of linkages + * that are signalled by the fixed function-names..? + *For example, could a static tool understand that + * piece->propPieceIDs[ NORTH_PROPENDENT ] means a DKUPiece, which is set + * in the Divider.. and uderstand that + * piece->appSpecPiece->data[ inner ] = fromProp[ inner ]; means the + * actual use of the data gotten from the DKUPiece.. then use that + * understanding to replace the calls with a schedule plus direct access + * to the propendent DKUPiece instead of the intermediate access first to + * the commBundle. + *Thinking this is how I want to go: use a commBundle on both shared and + * distr memory. Reason is that it makes scheduling simple. And, there's + * enough semantic information provided by the function calls that a static + * tool should be able to perform a transform that does direct access to the + * DKUPiece by migrating the gather code into the Kernel innards. The + * index into the commBundle is what identifies the gather statement that + * put the data there. That gather statement is inserted in place of the + * access to the array. + */ + +/*Have to provide boundary-propendents, so pieces that are on the edge of + * a mesh, for example, access the boundary-propendent, with whatever + * boundary data. + *Also, have to provide a "time zero" something or other.. The thing it + * solves is that one Kernel produces data at the end of the inner loop, + * with a sendToDependent() call.. that data is received BEFORE the inner + * loop with a getFromPropendent call. This is normally fine, as it encodes + * the time-skew, implicitly pipelining. The only problems are at time + * zero and possibly the very last time-step. + *So, the initializePropendents() sets some default time-zero state that + * will be gotten by the first getFromPropendent() call. + *When the Kernel is done, the finalizeDependents() call handles whatever + * might need to be done with the data from the last sendToDependents() + * call that was performed in the Kernel. + */ + +/*Q: How implement comm inside scheduler? + * syntax proposal for performing comm from inside the Kernel is: + * getFromPropendent( piece->propPieceIDs[ NORTH_PROPENDENT ]); + * + * So.. what is stored in that array? That is the thing that tells the + * scheduler how to perform the communication.. + *Thinking leave it opaque.. it's a void *.. The scheduler fills it in + * itself inside of BLIS_DKU__makeDKUPiece().. the divider gets this thing + * out of the piece and places it into the propPieceIDs array. + * + *This means that ask the scheduler to create a CommKernel, and ask it to + * give the thingie need to communicate with it. + *Also ask the scheduler to create a natural-boundary piece, and + */ diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/DKU_INST_MM.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DKU_INST_MM/DKU_INST_MM.h Sun Aug 26 03:04:50 2012 -0700 @@ -0,0 +1,54 @@ +/* + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#include "../../BLIS/DKU/DKU_common/DKU.h" + +#include "../BLIS_CONSTANTS.h" + +#ifndef _DKU_INST_MM_H +#define _DKU_INST_MM_H + + +//=========================================================================== +// Declarations of the Standard DKU functions + +void DKU_INST_MM_Init(); //tells BLIS the pointers to the DKU functions + +Divide divide_MM; +Kernel kernel_MM; +Undivide unDividePiece_MM; + +MakeRootDKUPieces makeRootDKUPieces_MM; + +SerialKernel serialKernel_MM; + +BundleInputs bundleInputs_MM; +UnbundleInputs unbundleInputs_MM; +BundleResults bundleResults_MM; +UnbundleResults unbundleResults_MM; + +//=========================================================================== +// +#include "../Matrix_Mult.h" + + void +inner_Kernel( MatrixProdPiece *matrixProdPiece ); + + Matrix * +DKU__makeMatrix_Flat( int32 numRows, int32 numCols, DKUPiece *owner ); + + MatrixProdPiece * +DKU__makeMatrixProdPiece_Flat( DKUPiece *owner ); + + MatrixProdPiece * +DKU__makeMatrixProdPiece_FromMatrixProdPiece + ( MatrixProdPiece *parentPiece, DKUPiece *owner ); + + +#endif /* _DKU_INST_MM_H */ + diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/DKU_INST_MM_init.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DKU_INST_MM/DKU_INST_MM_init.c Sun Aug 26 03:04:50 2012 -0700 @@ -0,0 +1,58 @@ +/* + * Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * + * Author: SeanHalle@yahoo.com + */ + +#include "DKU_INST_MM.h" + + +/* + * Part of the BLIS DKU standard. Each DKU instance is placed in its own + * directory, a child of the Application directory. The directory is named + * the same as the "#define" constant used to identify the DKU instance. + * The directory has two standard files: "#def const".h and + * "#def const"_init.c. + * + * This is the "#def const"_init.c file.. + * It initializes the scheduler for this DKU instance. + * And it tells that scheduler the pointers to all of the DKU functions for + * this instance. + * + */ +void DKU_INST_MM_Init( ) + { // always start init of a DKU instance with this function + // this isn't modal, so no worries about order of calling these fn + // Can intertwine init of several instances without harm. + BLIS_DKU__start_DKU_Instance_Init( DKU_INST_MM ); + + BLIS_DKU__set_Divide_To_ForID( ÷_MM, DKU_INST_MM ); + BLIS_DKU__set_Kernel_To_ForID( &kernel_MM, DKU_INST_MM ); + BLIS_DKU__set_Undivide_To_ForID( &unDividePiece_MM, DKU_INST_MM ); + + //TODO: figure out make and free -- right depth? Where used? + // make sure don't accidentally free the shared Matrix strucs.. + // trace where used, see if can find easy-to-see-pattern pattern for how + // to do make and free -- DKUPiece automated stuff and whatnot.. + // maybe just let the App see DKUPiece, and make make and free be done + // explicitly in the pieceMaker, divider, undivider, etc.. no automation + BLIS_DKU__set_MakeRootDKUPieces_To_ForID(&makeRootDKUPieces_MM,DKU_INST_MM); +// BLIS_DKU__set_FreeAppSpecSubPiece_To_ForID(&freeMatrixProdPiece_Flat,DKU_INST_MM); +// BLIS_DKU__set_FreeAppSpecRootPiece_To_ForID(&freeMatrixProdPiece_Flat,DKU_INST_MM); +// BLIS_DKU__set_MakeAppSpecPiece_To_ForID(&makeMatrixProdPiece_Using,DKU_INST_MM); + BLIS_DKU__set_SerialKernel_To_ForID( &serialKernel_MM, DKU_INST_MM ); + + BLIS_DKU__set_BundleInputs_To_ForID( &bundleInputs_MM, DKU_INST_MM ); + BLIS_DKU__set_UnbundleInputs_To_ForID(&unbundleInputs_MM,DKU_INST_MM); + BLIS_DKU__set_BundleResults_To_ForID( &bundleResults_MM, DKU_INST_MM); + BLIS_DKU__set_UnbundleResults_To_ForID( &unbundleResults_MM,DKU_INST_MM ); + + //Now that have generic, provide HW-specific overrides +/* BLIS_DKU__set_A_Divide_Override_To_ForID() + BLIS_DKU__set_A_DKU_Override_To_ForID( ) +*/ + // always end init of a DKU instance with this function + BLIS_DKU__end_DKU_Instance_Init( DKU_INST_MM ); + } diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/Divide.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DKU_INST_MM/Divide.c Sun Aug 26 03:04:50 2012 -0700 @@ -0,0 +1,295 @@ +/* + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#include + +#include "DKU_INST_MM.h" + + +typedef +struct + { + //inputs + int numLeftRows; + int numRightCols; + int numSubPiecesToMake; + DKUPiece *parPiece; + + //outputs + int numLeftSlices; + int numRightSlices; + int finalNumSubPieces; + + //outputs then inputs + int *leftSliceStartRows; + int *rightSliceStartCols; + } +SliceStruc; + +void updateSlicingStrucWithSlicingOfInputMatrices(SliceStruc *slicingStruc); + +void pairSlicesAndMakeProdPieces(SliceStruc *slicingStruc, DKUPiece *oPiece); +MatrixProdPiece* makeChildMatrixProdPieceFrom( MatrixProdPiece *parentPiece); + + +/* The Divider + * Divides the iteration space.. + * Matrix Product Piece is a piece of iteration + * space.. it is the iterations in which one piece of the left + * matrix is multiplied by one piece of the right matrix. + * Thus, to make product pieces, both the left and right matrices + * have to be sliced, then all pairings of those slices taken. + * Each pairing on matrix slices is one product piece. + * + * So, for example, dividing a product-piece into, say, 4 pieces means + * dividing the iteration space of the parent piece into 4, + * which means slicing the left matrix by 2, and the right matrix by 2, + * and talking all pairings of a left slice times a right slice. + * + * Do division this way: + * count total number of result cells to be produced. Divide that by the + * number of sub-pieces to make. That gives the target number of result + * cells in each sub-piece. + * Take the square root of the number of target cells, that's the target + * number of cols, and target num rows in the result of each sub-piece. + * See how many sqroots fit horizontally, and how many fit vertically + * take the ceiling of the larger, floor of the smaller. + * That is the number of rows, and number of cols, in each sub-piece. + * Multiply the two to find the number of sub-pieces that will be made. + * If not larger than requested number of sub-pieces, take the ceiling of + * both num rows and num cols (also covers case of floor is zero). + * Divide num rows in left matrix by target num rows, then use residuals + * alg. to assign ranges of left matrix rows to each slice. + * Divide num cols in right by target num cols, then use residuals alg. + * to assign ranges of right matrix cols to each slice. + * Then take all pairings of left-slices with right-slices and make a + * sub-piece from each pairing + * + * @param numPieces + */ +#define SLICE_SCOPE 1 +void divide_MM( DKUPiece* oPiece, int numSubPiecesToMake ) + { MatrixProdPiece *matrixProdPiece; + Matrix *leftMatrix, *rightMatrix; + + matrixProdPiece = (MatrixProdPiece *) oPiece->appSpecificPiece; + + int leftStartRow, leftEndRow, rightStartCol, rightEndCol; + leftStartRow = matrixProdPiece->leftStartRow; + leftEndRow = matrixProdPiece->leftEndRow; + rightStartCol = matrixProdPiece->rightStartCol; + rightEndCol = matrixProdPiece->rightEndCol; + leftMatrix = matrixProdPiece->leftMatrix; + rightMatrix = matrixProdPiece->rightMatrix; + + int numLeftRows, numRightCols; + numLeftRows = leftEndRow - leftStartRow + 1; //+1 cause starts at zero + numRightCols = rightEndCol - rightStartCol + 1; +//============================================================== + + //need numPieces to be divided into two integers + // if each result piece is a square, that gives the best surface + // area to volume == the least communication in distr memory, and + // the least multiple access by different threads in shared memory. + //So, just have to figure out the sizes of the horizontal and the + // vertical. + + // make sure its possible to make more than 1 sub-piece + if( numSubPiecesToMake < 2 || ( numLeftRows < 2 && numRightCols < 2 ) ) + { oPiece->numSubPieces = 0; //scheduler must check for case of 0 pieces + return; + } + + //TODO: scope is the func called within (divide_MM) + SliceStruc *slicingStruc = + BLIS_DKU__malloc_scope( sizeof(SliceStruc), SLICE_SCOPE ); + slicingStruc->numLeftRows = numLeftRows; + slicingStruc->numRightCols = numRightCols; + slicingStruc->numSubPiecesToMake = numSubPiecesToMake; + slicingStruc->parPiece = oPiece; + + updateSlicingStrucWithSlicingOfInputMatrices( slicingStruc ); + + oPiece->numSubPieces = slicingStruc->finalNumSubPieces; + if( oPiece->numSubPieces == 0 ) return; + + // pair up the slices and make the final DKUPieces + pairSlicesAndMakeProdPieces( slicingStruc, oPiece ); + + BLIS_DKU__free_scope( SLICE_SCOPE ); + return; + } + +void updateSlicingStrucWithSlicingOfInputMatrices(SliceStruc *slicingStruc) + { +//======================= Setup ======================== + int numLeftRows = slicingStruc->numLeftRows; + int numRightCols = slicingStruc->numRightCols; + int numSubPiecesToMake = slicingStruc->numSubPiecesToMake; + + int numResultCells; + float targetNumCellsPerPiece, targetDimOfResultPiece; + float idealNumLeftSlices, targetNumLeftSlices; + float idealNumRightSlices, targetNumRightSlices; + float targetRowsPerLeftSlice, targetColsPerRightSlice; + + float rowAccumulator, colAccumulator; + int sliceIdx, rowIncrement, colIncrement; + int numLeftSlices, numRightSlices; + + int leftStartRow, leftEndRow, rightStartCol, rightEndCol; + MatrixProdPiece *parentProdPiece=slicingStruc->parPiece->appSpecificPiece; + leftStartRow = parentProdPiece->leftStartRow; + leftEndRow = parentProdPiece->leftEndRow; + rightStartCol = parentProdPiece->rightStartCol; + rightEndCol = parentProdPiece->rightEndCol; + +//======================= Calc num Slices ======================== + //Calc the closest can reasonably get to square + //TODO: check that math works right: dividing int by float, need cast? + numResultCells = numLeftRows * numRightCols; + targetNumCellsPerPiece = numResultCells / numSubPiecesToMake; + targetDimOfResultPiece = sqrt( targetNumCellsPerPiece ); + idealNumLeftSlices = numLeftRows / targetDimOfResultPiece; + idealNumRightSlices = numRightCols / targetDimOfResultPiece; + + //Now, product of rows * cols should stay close to numPieces, but + // have to make num rows an int, and num cols an int + // means will drop fractional part from larger, then add the number + // of pieces that fractional part represents back on to the smaller + // then round to the nearest integer. The resulting product should + // still be close to numPieces. + if( idealNumRightSlices > idealNumLeftSlices ) + { float diff, numPiecesCut; + //TODO: find floor and "closest int" in C math library.. how use? + targetNumRightSlices = floor( idealNumRightSlices ); + diff = idealNumRightSlices - targetNumRightSlices; + numPiecesCut = diff * idealNumLeftSlices; + idealNumLeftSlices += numPiecesCut / targetNumRightSlices; + targetNumLeftSlices = rint( idealNumLeftSlices ); + } + else + { float diff, numPiecesCut; + targetNumLeftSlices = floor( idealNumLeftSlices ); + diff = idealNumLeftSlices - targetNumLeftSlices; + numPiecesCut = diff * idealNumRightSlices; + idealNumRightSlices += numPiecesCut / targetNumLeftSlices; + targetNumRightSlices = rint( idealNumRightSlices ); + } + targetRowsPerLeftSlice = numLeftRows / targetNumLeftSlices; + targetColsPerRightSlice = numRightCols / targetNumRightSlices; + + //allocate size of worst case + safety + int size = sizeof(int) * (numSubPiecesToMake + 2); + int *leftSliceStartRows, *rightSliceStartCols; + //TODO: "FUNC" is not quite the right scope.. slicingStruc is right scope + leftSliceStartRows = BLIS_DKU__malloc_scope( size, SLICE_SCOPE ); + rightSliceStartCols = BLIS_DKU__malloc_scope( size, SLICE_SCOPE ); + slicingStruc->leftSliceStartRows = leftSliceStartRows; + slicingStruc->rightSliceStartCols = rightSliceStartCols; + +//======================= Slice Left Matrix ======================== + //fix for case only 1 row, when leftStartRow == leftEndRow + leftSliceStartRows[ 0 ] = leftStartRow; + sliceIdx = 0; + rowAccumulator = 0; + int row; + for( row = leftStartRow; row < leftEndRow; row += rowIncrement ) + { + leftSliceStartRows[ sliceIdx ] = row; + + rowAccumulator += targetRowsPerLeftSlice; + rowIncrement = (int) rowAccumulator; + if( rowIncrement == 0 ) rowIncrement = 1;//apply at end curr iter + rowAccumulator -= rowIncrement; + if( rowAccumulator < 0 ) rowAccumulator = 0; + sliceIdx += 1; + } + if( sliceIdx == 0 ) sliceIdx = 1; //case when only 1 row + numLeftSlices = sliceIdx; + leftSliceStartRows[ sliceIdx ] = leftEndRow + 1; //use extra slot + + sliceIdx = 0; colAccumulator = 0; colIncrement = 0; + +//======================= Slice Right Matrix ======================== + rightSliceStartCols[ 0 ] = rightStartCol; //in case only 1 col + int col; + for( col = rightStartCol; col < rightEndCol; col += colIncrement ) + { + rightSliceStartCols[ sliceIdx ] = col; + + colAccumulator += targetColsPerRightSlice; + colIncrement = (int) colAccumulator; + if( colIncrement == 0 ) colIncrement = 1;//apply at end curr iter + colAccumulator -= colIncrement; + if( colAccumulator < 0 ) colAccumulator = 0; + sliceIdx += 1; + } + if( sliceIdx == 0 ) sliceIdx = 1; //case when only 1 col + numRightSlices = sliceIdx; + rightSliceStartCols[ sliceIdx ] = rightEndCol + 1; + + slicingStruc->numLeftSlices = numLeftSlices; + slicingStruc->numRightSlices = numRightSlices; + slicingStruc->finalNumSubPieces = numLeftSlices * numRightSlices; + return; + } + + +void pairSlicesAndMakeProdPieces(SliceStruc *slicingStruc, DKUPiece *oPiece) + { DKUPiece* *subPiecesArray; + MatrixProdPiece *newProdPiece; + MatrixProdPiece *parentMatPiece =slicingStruc->parPiece->appSpecificPiece; + int newPiecePos = 0; + int leftSliceStartRow, leftSliceEndRow; + int rightSliceStartCol, rightSliceEndCol; + int numLeftSlices = slicingStruc->numLeftSlices; + int numRightSlices = slicingStruc->numRightSlices; + int *leftSliceStartRows = slicingStruc->leftSliceStartRows; + int *rightSliceStartCols = slicingStruc->rightSliceStartCols; + + int size = slicingStruc->finalNumSubPieces * sizeof(MatrixProdPiece *); + subPiecesArray = BLIS_DKU__malloc_toPiece( size, oPiece ); + oPiece->subPiecesArray = subPiecesArray; + + int rightSliceNum, leftSliceNum; + for( rightSliceNum = 0; rightSliceNum < numRightSlices; rightSliceNum++ ) + { DKUPiece *newDKUPiece; + rightSliceStartCol = rightSliceStartCols[ rightSliceNum ]; + rightSliceEndCol = rightSliceStartCols[ rightSliceNum + 1 ] - 1; + for( leftSliceNum = 0; leftSliceNum < numLeftSlices; leftSliceNum++ ) + { newDKUPiece = BLIS_DKU__makeDKUPiece_FromDivider( DKU_INST_MM ); + newProdPiece = DKU__makeMatrixProdPiece_FromMatrixProdPiece + ( parentMatPiece, newDKUPiece ); + newDKUPiece->appSpecificPiece = newProdPiece; + + leftSliceStartRow = leftSliceStartRows[ leftSliceNum ]; + leftSliceEndRow = leftSliceStartRows[ leftSliceNum + 1 ] - 1; + + newProdPiece->leftStartRow = leftSliceStartRow; + newProdPiece->leftEndRow = leftSliceEndRow; + newProdPiece->leftStartCol = parentMatPiece->leftStartCol; + newProdPiece->leftEndCol = parentMatPiece->leftEndCol; + + newProdPiece->rightStartRow = parentMatPiece->rightStartRow; + newProdPiece->rightEndRow = parentMatPiece->rightEndRow; + newProdPiece->rightStartCol = rightSliceStartCol; + newProdPiece->rightEndCol = rightSliceEndCol; + + newProdPiece->prodStartRow = leftSliceStartRow; + newProdPiece->prodEndRow = leftSliceEndRow; + newProdPiece->prodStartCol = rightSliceStartCol; + newProdPiece->prodEndCol = rightSliceEndCol; + + subPiecesArray[ newPiecePos ] = newDKUPiece; + newPiecePos += 1; + } + }//for(int rightSliceNum = 0; rightSliceNum < + } + +//=========================================================================== diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/Kernel.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DKU_INST_MM/Kernel.c Sun Aug 26 03:04:50 2012 -0700 @@ -0,0 +1,80 @@ +/* + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: SeanHalle@yahoo.com + * + */ + +#include "DKU_INST_MM.h" + + +/* Kernel + * + * Computes the passed-in piece's portion of the iteration space. + * A DKUPiece struc is handed to it. + * The DKUPiece struct has a pointer to "app specific info". This + * is some application-specific structure that the Scheduler doesn't + * know the details of. The Scheduler can pass the app-specific + * data around, and that's all the Scheduler needs to do with it. + * This kernel, however, is written by the app programmer, and does + * know the internals of "app specific info". + * + * DKU Std: data that is accessed inside a Kernel is passed inside + * a DKUPiece. Kernel's not allowed to access data via the language's + * native scoping rules, not even constants. Thus, pointers to + * arrays or to a tree's root nodes, or so forth are carried inside + * the DKUPiece. + * Note, kernel code is re-entrant, so the rules of re-entrant code apply + * + * + * This is the standard Matrix Multiply loop nest. The only modification + * for DKU is that it takes loop bounds out of the DKUPiece. + */ +void kernel_MM( DKUPiece *pieceToProcess ) + { inner_Kernel( (MatrixProdPiece *) pieceToProcess->appSpecificPiece ); + } + +/* Separated out the calculations of the Kernel so could re-use as the + * serial kernel. + */ +void inner_Kernel( MatrixProdPiece *matrixProdPiece ) + { int32 leftStartRow, leftEndRow, rightStartCol, rightEndCol; + int32 leftStartCol, rightStartRow; + int32 numLeftCols, numRightCols, numResMatCols; + int32 row, col, vectorSize, i; + float32 *leftMatrix, *rightMatrix, *resultMatrix; + float32 *leftStartPt, *leftReadPt, *rightStartPt, *rightReadPt; + + leftMatrix = matrixProdPiece->leftMatrix->matrix; + rightMatrix = matrixProdPiece->rightMatrix->matrix; + resultMatrix = matrixProdPiece->resultMatrix->matrix; + + leftStartRow = matrixProdPiece->leftStartRow; + leftEndRow = matrixProdPiece->leftEndRow; + rightStartCol = matrixProdPiece->rightStartCol; + rightEndCol = matrixProdPiece->rightEndCol; + numResMatCols = matrixProdPiece->resultMatrix->numCols; + rightStartRow = matrixProdPiece->rightStartRow; + leftStartCol = matrixProdPiece->leftStartCol; + numRightCols = matrixProdPiece->rightMatrix->numCols; + numLeftCols = matrixProdPiece->leftMatrix->numCols; + + vectorSize =matrixProdPiece->leftEndCol - matrixProdPiece->leftStartCol+1; + for( row = leftStartRow; row <= leftEndRow; row++ ) + { leftStartPt = leftMatrix + row * numLeftCols + leftStartCol; + for( col = rightStartCol; col <= rightEndCol; col++ ) + { float32 sum = 0; + + rightStartPt = rightMatrix + rightStartRow * numRightCols + col; + + leftReadPt = leftStartPt; + rightReadPt = rightStartPt; + for( i = 0; i < vectorSize; i++) + { sum += *(leftReadPt++) * *rightReadPt; + rightReadPt += numRightCols; + } + *(resultMatrix + row * numResMatCols + col) = sum; + } + } + } diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/MakeRootDKUPieces.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DKU_INST_MM/MakeRootDKUPieces.c Sun Aug 26 03:04:50 2012 -0700 @@ -0,0 +1,53 @@ +/* + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + */ + +#include "DKU_INST_MM.h" + +/* This is what wraps application data inside a DKUPiece. + * It also encodes the dependencies within the data, by making several + * DKUPieces, which must be executed in sequence, in order to respect the + * dependencies. + * It tries to make the pieces as large as possible, to maximize the + * available parallelism. + * + * It allocates space for and fills an array of pointers to the pieces, + * then returns how many pieces it put into the array. + * + * This function allocates the array of pointers to DKUPieces, plus the + * DKUPieces, plus all sub-structures inside a DKUPiece. This is an + * application-supplied function, so it knows all the app-specific sub- + * structures inside a DKUPiece. However, it is called by the scheduler + * so it has to have a fixed prototype. + * The scheduler calls this function, so it is up to the scheduler to free + * all the structures this function has allocated. To do this, a second + * function is supplied that performs the free s. + * + * DKUPieceMaker returns a RootDKUPieces data structure, which contains the + * array of DKUPieces that have to be executed in sequence, to preserve + * the dependencies, and contains the number of such DKUPieces. + * Each call to makeDKUPieces mallocs space for the pieces, plus the + * root pieces array and the root pieces struc. This memory is freed by + * the standard function BLIS_DKU__cleanupRootPieces, which is called by the + * scheduler. + * + * For Matrix Multiply, the original data is a MatrixProdPair, which is + * perfectly dividable, so just have to wrap that inside a DKUPiece, and + * return it inside a RootDKUPieces data structure. + */ +RootDKUPieces * makeRootDKUPieces_MM( void *origData ) + { RootDKUPieces * rootDKUPieces; + DKUPiece *piece; + + piece = BLIS_DKU__makeDKUPiece_FromMaker( DKU_INST_MM ); + piece->appSpecificPiece = origData; + + int numRootPieces = 1; + rootDKUPieces = BLIS_DKU__makeRootDKUPiecesStruc( numRootPieces ); + rootDKUPieces->rootPiecesArray[0] = piece; + + return rootDKUPieces; + } diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/Maker_and_Freer.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DKU_INST_MM/Maker_and_Freer.c Sun Aug 26 03:04:50 2012 -0700 @@ -0,0 +1,73 @@ +/* + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + * Created on November 15, 2009, 2:35 AM + */ + +#include + +#include "DKU_INST_MM.h" + + +//========== Makers and Free-ers for use ONLY within DKU_INST_MM =========== +// + +/*In the "_Flat" version of constructor, do only malloc of the top data struc + * and set values in that top-level. Don't malloc any sub-structures. + * + *Used in BundlingQuad in unbundleInputs + */ + Matrix * +DKU__makeMatrix_Flat( int32 numRows, int32 numCols, DKUPiece *owner ) + { Matrix * retMatrix; + retMatrix = BLIS_DKU__malloc_toPiece( sizeof( Matrix ), owner ); + retMatrix->numRows = numRows; + retMatrix->numCols = numCols; + + return retMatrix; + } + +/* Used In BundlingQuad in unbundleInputs */ + MatrixProdPiece * +DKU__makeMatrixProdPiece_Flat( DKUPiece *owner ) + { return BLIS_DKU__malloc_toPiece( sizeof(MatrixProdPiece), owner ); + } + + + +/* Used in Divider */ + MatrixProdPiece * +DKU__makeMatrixProdPiece_FromMatrixProdPiece + ( MatrixProdPiece *parentPiece, DKUPiece *owner ) + { MatrixProdPiece *newPiece; + Matrix *leftMatrix = parentPiece->leftMatrix; + Matrix *rightMatrix = parentPiece->rightMatrix; + + newPiece = DKU__makeMatrixProdPiece_Flat( owner ); + + newPiece->leftMatrix = leftMatrix; + newPiece->rightMatrix = rightMatrix; + + newPiece->leftStartRow = 0; + newPiece->leftEndRow = leftMatrix->numRows - 1; + newPiece->leftStartCol = 0; + newPiece->leftEndCol = leftMatrix->numCols - 1; + + newPiece->rightStartRow = 0; + newPiece->rightEndRow = rightMatrix->numRows - 1; + newPiece->rightStartCol = 0; + newPiece->rightEndCol = rightMatrix->numCols - 1; + + newPiece->prodStartRow = newPiece->leftStartRow; + newPiece->prodEndRow = newPiece->leftEndRow; + newPiece->prodStartCol = newPiece->rightStartCol; + newPiece->prodEndCol = newPiece->rightEndCol; + + newPiece->resultMatrix = parentPiece->resultMatrix; + + return newPiece; + } + diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/SerialKernel.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DKU_INST_MM/SerialKernel.c Sun Aug 26 03:04:50 2012 -0700 @@ -0,0 +1,21 @@ +/* + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#include "DKU_INST_MM.h" + +/*The scheduler calls this when there is no benefit from + * parallel execution of the orig data. + * + *For MM, the original data is already in the data structure that's inside + * a DKUPiece, so just cast the oridData and call the Kernel on it + */ +void serialKernel_MM( void * origData ) + { + inner_Kernel( (MatrixProdPiece *) origData ); + } + diff -r 000000000000 -r d138e0acf9a0 DKU_INST_MM/Undivide.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DKU_INST_MM/Undivide.c Sun Aug 26 03:04:50 2012 -0700 @@ -0,0 +1,34 @@ +/* + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#include "DKU_INST_MM.h" +#include + + +/* unDivider + * + * Only counts to make sure all the pieces are accounted for, and frees + * the memory allocated to the finished DKUPiece structs and all sub- + * structures. + */ +void unDividePiece_MM( DKUPiece *parentPiece, DKUPiece *piece ) + { + parentPiece->numSubPiecesUndivided += 1; + + //free mem allocated to no-longer needed subPiece + BLIS_DKU__freeDKUPiece( piece ); + } + +//TODO: figure out standard for doing this.. add pointer to func to inits, +// so fixed freePiece func can have generic code to free app-specific +// piece, and DKUInstance provides function to free that piece +// maybe put the pointer into schedData or something.. then this +// one call to freeDKUPiece can be generic, with HW-specific overlay +// Issue is slowness of indirections into table and dereferencing Fn pointer + + diff -r 000000000000 -r d138e0acf9a0 Matrix_Mult.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Matrix_Mult.c Sun Aug 26 03:04:50 2012 -0700 @@ -0,0 +1,221 @@ +/* + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + * Created on November 15, 2009, 2:35 AM + */ + +#include +#include + +#include "Matrix_Mult.h" +#include "../BLIS/DKU/DKU_common/DKU.h" + +//======================== For Use OUTSIDE DKU instance ===================== +/* + *The DKU code-instance in DKU_INST_MM has its own set of makers and free-ers + * that use BLIS_DKU__malloc. These are for use in application-code + * outside DKU_INST_MM DKU-code-instance. + */ + + +/*In the "_Flat" version of constructor, do only malloc of the top data struc + * and set values in that top-level. Don't malloc any sub-structures. + */ + Matrix * +makeMatrix_Flat( int32 numRows, int32 numCols ) + { Matrix * retMatrix; + retMatrix = malloc( sizeof( Matrix ) ); + retMatrix->numRows = numRows; + retMatrix->numCols = numCols; + + return retMatrix; + } + + Matrix * +makeMatrix_WithResMat( int32 numRows, int32 numCols ) + { Matrix * retMatrix; + retMatrix = malloc( sizeof( Matrix ) ); + retMatrix->numRows = numRows; + retMatrix->numCols = numCols; + retMatrix->matrix = malloc( numRows * numCols * sizeof(float32) ); + + return retMatrix; + } + + void +freeMatrix_Flat( Matrix * matrix ) + { //( matrix ); + } + void +freeMatrix( Matrix * matrix ) + { free( matrix->matrix ); + free( matrix ); + } + + MatrixProdPiece * +makeMatrixProdPiece_Empty() + { return malloc( sizeof(MatrixProdPiece) ); + } + + + MatrixProdPiece * +makeMatrixProdPiece_Helper( Matrix *leftMatrix, Matrix *rightMatrix ) + { MatrixProdPiece *newPiece; + + newPiece = makeMatrixProdPiece_Empty( ); + + newPiece->leftMatrix = leftMatrix; + newPiece->rightMatrix = rightMatrix; + + newPiece->leftStartRow = 0; + newPiece->leftEndRow = leftMatrix->numRows - 1; + newPiece->leftStartCol = 0; + newPiece->leftEndCol = leftMatrix->numCols - 1; + + newPiece->rightStartRow = 0; + newPiece->rightEndRow = rightMatrix->numRows - 1; + newPiece->rightStartCol = 0; + newPiece->rightEndCol = rightMatrix->numCols - 1; + + newPiece->prodStartRow = newPiece->leftStartRow; + newPiece->prodEndRow = newPiece->leftEndRow; + newPiece->prodStartCol = newPiece->rightStartCol; + newPiece->prodEndCol = newPiece->rightEndCol; + + return newPiece; + } + + MatrixProdPiece * +makeMatrixProdPiece_FromMatrixProdPiece( MatrixProdPiece *parentPiece ) + { MatrixProdPiece *newPiece; + Matrix *leftMatrix = parentPiece->leftMatrix; + Matrix *rightMatrix = parentPiece->rightMatrix; + + newPiece = makeMatrixProdPiece_Helper( leftMatrix, rightMatrix ); + newPiece->resultMatrix = parentPiece->resultMatrix; + + return newPiece; + } + + MatrixProdPiece * +makeMatrixProdPiece_FromMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) + { MatrixProdPiece *newPiece; + + newPiece = makeMatrixProdPiece_Helper( leftMatrix, rightMatrix ); + newPiece->resultMatrix = + makeMatrix_WithResMat( leftMatrix->numRows, rightMatrix->numCols ); + + return newPiece; + } + + void +freeMatrixProdPiece_Flat( MatrixProdPiece * piece ) + { free( piece ); + } + + void +freeMatrixProdPiece( MatrixProdPiece * piece ) + { //( piece->leftMatrix ); + freeMatrix( piece->rightMatrix ); + freeMatrix( piece->resultMatrix ); + free( piece ); + } + + + void +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, + ParamBag *paramBag ) + { char *leftMatrixFileName, *rightMatrixFileName; + int leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols; + + ParamStruc *param; + param = getParamFromBag( "leftMatrixRows", paramBag ); + leftMatrixRows = param->intValue; + param = getParamFromBag( "leftMatrixCols", paramBag ); + leftMatrixCols = param->intValue; + *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols ); + + param = getParamFromBag( "leftMatrixFileName", paramBag ); + leftMatrixFileName = param->strValue; //no need to copy + read_Matrix_From_File( *leftMatrix, leftMatrixFileName ); + + param = getParamFromBag( "rightMatrixRows", paramBag ); + rightMatrixRows = param->intValue; + param = getParamFromBag( "rightMatrixCols", paramBag ); + rightMatrixCols = param->intValue; + *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols ); + + param = getParamFromBag( "rightMatrixFileName", paramBag ); + rightMatrixFileName = param->strValue; + read_Matrix_From_File( *rightMatrix, rightMatrixFileName ); + } + + +void parseLineIntoRow( char *line, float32* row ); + + + void +read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ) + { int row, maxRead, numRows, numCols; + float32 *matrixStart; + size_t lineSz = 0; + FILE *file; + char *line = NULL; + + lineSz = 50000; //max length of line in a matrix data file + line = (char *) malloc( lineSz ); + if( line == NULL ) BLIS_DKU__throwError( "no mem for matrix line" ); + + numRows = matrixStruc->numRows; + numCols = matrixStruc->numCols; + matrixStart = matrixStruc->matrix; + + printf("Matrix File Path: %s\n", matrixFileName);fflush(stdout); + file = fopen( matrixFileName, "r" ); if(!file){printf("not open! %d\n",__LINE__); fflush(stdin);} + fseek( file, 0, SEEK_SET ); + for( row = 0; row < numRows; row++ ) + { + if( feof( file ) ) BLIS_DKU__throwError( "file ran out too soon" ); + maxRead = getline( &line, &lineSz, file ); + if( maxRead == -1 ) BLIS_DKU__throwError( "prob reading mat line"); + + if( *line == '\n') continue; //blank line + if( *line == '/' ) continue; //comment line + + parseLineIntoRow( line, matrixStart + row * numCols ); + } + free( line ); + } + +/*This function relies on each line having the proper number of cols. It + * doesn't check, nor enforce, so if the file is improperly formatted it + * can write over unrelated memory + */ + void +parseLineIntoRow( char *line, float32* row ) + { + char *valueStr, *searchPos; + + //read the float values + searchPos = valueStr = line; //start + + for( ; *searchPos != 0; searchPos++) //bit dangerous, should use buff len + { + if( *searchPos == '\n' ) //last col.. relying on well-formatted file + { *searchPos = 0; + *row = atof( valueStr ); + break; //end FOR loop + } + if( *searchPos == ',' ) + { *searchPos = 0; //mark end of string + *row = (float32) atof( valueStr ); + row += 1; //address arith + //skip any spaces before digits.. use searchPos + 1 to skip the 0 + for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++); + valueStr = searchPos + 1; + } + } + } diff -r 000000000000 -r d138e0acf9a0 Matrix_Mult.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Matrix_Mult.h Sun Aug 26 03:04:50 2012 -0700 @@ -0,0 +1,81 @@ +/* + * Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + */ + +#ifndef MATRIX_MULT_H_ +#define MATRIX_MULT_H_ + +#include + +#include "../BLIS/BLIS_primitive_data_types.h" + +#include "ParamHelper/Param.h" + +//============================== Structures ============================== + +typedef +struct + { int32 numRows; + int32 numCols; + float32 *matrix; //2D, but dynamically sized, so use addr arith + } +Matrix; + +/* This is the "appSpecificPiece" that is carried inside a DKUPiece. + * In the DKUPiece data struc it is declared to be of type "void *". This + * allows the application to define any data structure it wants and put it + * into a DKUPiece. + * When the app specific info is used, it is in app code, so it is cast to + * the correct type to tell the compiler how to access fields. + * This keeps all app-specific things out of the DKU directory, as per the + * DKU standard. */ +typedef +struct + { + // pointers to shared data.. the result matrix must be created when the + // left and right matrices are put into the root ancestor DKUPiece. + Matrix * leftMatrix; + Matrix * rightMatrix; + Matrix * resultMatrix; + + // define the starting and ending boundaries for this piece of the + // result matrix. These are derivable from the left and right + // matrices, but included them for readability of code. + int prodStartRow, prodEndRow; + int prodStartCol, prodEndCol; + // Start and end of the portion of the left matrix that contributes to + // this piece of the product + int leftStartRow, leftEndRow; + int leftStartCol, leftEndCol; + // Start and end of the portion of the right matrix that contributes to + // this piece of the product + int rightStartRow, rightEndRow; + int rightStartCol, rightEndCol; + } +MatrixProdPiece; + +//============================== Functions ================================ +void readFile(); + +Matrix *makeMatrix( int32 numRows, int32 numCols ); +Matrix *makeMatrix_Flat( int32 numRows, int32 numCols ); +void freeMatrix_Flat( Matrix * matrix ); +void freeMatrix( Matrix * matrix ); + + MatrixProdPiece * +makeMatrixProdPiece_Empty(); + MatrixProdPiece * +makeMatrixProdPiece_FromMatrixProdPiece( MatrixProdPiece * piece ); + MatrixProdPiece * +makeMatrixProdPiece_FromMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); + +void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ); + +void freeMatrixProdPiece_Flat( MatrixProdPiece * piece ); +void freeMatrixProdPiece( MatrixProdPiece * piece ); + + +//=========================================================================== + +#endif /*MATRIX_MULT_H_*/ diff -r 000000000000 -r d138e0acf9a0 Read_Input_Matrix.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Read_Input_Matrix.c Sun Aug 26 03:04:50 2012 -0700 @@ -0,0 +1,604 @@ +/* + * File: Read_Input.c + * Author: SeanHalle@yahoo.com + * + * Created on June 15, 2009, 10:12 AM + */ + +#include + +//======================== +scanf("%[^\t]",a); //matches everything except tab character + +//======================= Write a structure into a file ==================== + +#include +#include +#include + +#define MAX 50 + + +typedef struct { + char name[10]; + int key; +} file_record; + +/* this function adds the relatiuve addres to the index for a key */ +void create_index(long index[], int key, long rel_add ) { + index[key] = rel_add; +} + +/* this function writes a record to the file */ +void write_rec(FILE *fp, file_record rec) { + fwrite(&rec,sizeof(rec),1,fp); +} + +void main() { + long rel_add; + int key; + file_record frec; + long index[MAX];/* an index list*/ + int n,i; + + FILE *recfile=NULL,*ifile=NULL; + /* this initializes the index list to all ? */ + for(i=0; i< MAX; i++) + index[i]= (-1); + + recfile=fopen("mfile","w"); + if(recfile == NULL) { + printf("Error in openeing file mfile\n"); + exit(0); + } + rel_add = 0 ; + do { + printf(" Enter the data vlue and the key of the record to be added to file mfile\n"); + scanf("%s %d",frec.name,&frec.key); + while(index[frec.key] != (-1)) { + printf(" A record with this key value already exist in a file enter record key value\n"); + scanf("%s %d",frec.name,&frec.key); + } + create_index(index,frec.key,rel_add); + write_rec(recfile,frec); + rel_add = ftell(recfile); + /* this sets the relative address for the next record to be + the value of current file position pointer in bytes from + the beginning of the file */ + printf("Enter 1 to continue adding records to the file\n"); + scanf("%d",&n); + }while(n == 1); + + ifile=fopen("index_file","w"); + + if(ifile == NULL) { + printf("Error in openeing file index_file\n"); + exit(0); + } + + fwrite(index,sizeof(index),1,ifile);/*writes the complete index into the index_file */ + fclose(recfile); + fclose(ifile); + printf("Enter 1 if you want to retrieve a record\n"); + scanf("%d",&n); + + if( n == 1) { + ifile=fopen("index_file","r"); + if(ifile == NULL) { + printf("Error in openeing file index_file\n"); + exit(0); + } + fread(index,sizeof(index),1,ifile); + + /* reads the complete index into the index list from the index_file*/ + fclose(ifile); + recfile=fopen("mfile","r"); + + if(recfile == NULL) { + printf("Error in openeing file mfile\n"); + exit(0); + } + } + printf("THE CONTENTS OF FILE IS \n"); + + while( (fread(&frec,sizeof(frec),1,recfile)) != 0) + printf("%s %d\n",frec.name,frec.key); + + do { + printf("Enter the key of the record to be retrieved\n"); + scanf("%d",&key); + rel_add = index[key]; /*gets the relative address of the record from index list */ + if( (fseek(recfile,rel_add,SEEK_SET))!= 0) { + printf("Error\n"); + exit(0); + } + fread(&frec,sizeof(frec),1,recfile); + printf("The data value of the retrieved record is %s\n",frec.name); + printf("Enter 1 if you want to retrieve a record\n"); + scanf("%d",&n); + } while(n == 1); + + fclose(recfile); +} + + + + +//========================== Read words in file demo ======================= + +#include +#include +#include +#include + +struct node { + struct node *left; /* tree to the left */ + struct node *right; /* tree to the right */ + char *word; /* word for this tree */ +}; + +/* the top of the tree */ +static struct node *root = NULL; + +/* + * memory_error -- write error and die * + */ +void memory_error(void) +{ + fprintf(stderr, "Error:Out of memory\n"); + exit(8); +} + +/* + * save_string -- save a string on the heap * + * * + * Parameters * + * string -- string to save * + * * + * Returns * + * pointer to malloc-ed section of memory with * + * the string copied into it. * + */ +char *save_string(char *string) +{ + char *new_string; /* where we are going to put string */ + + new_string = malloc((unsigned) (strlen(string) + 1)); + + if (new_string == NULL) + memory_error(); + + strcpy(new_string, string); + return (new_string); +} +/* + * enter -- enter a word into the tree * + * * + * Parameters * + * node -- current node we are looking at * + * word -- word to enter * + */ +void enter(struct node **node, char *word) +{ + int result; /* result of strcmp */ + + char *save_string(char *); /* save a string on the heap */ + + /* + * If the current node is null, we have reached the bottom + * of the tree and must create a new node. + */ + if ((*node) == NULL) { + + /* Allocate memory for a new node */ + (*node) = malloc(sizeof(struct node)); + if ((*node) == NULL) + memory_error(); + + /* Initialize the new node */ + (*node)->left = NULL; + (*node)->right = NULL; + (*node)->word = save_string(word); + return; + } + /* Check to see where the word goes */ + result = strcmp((*node)->word, word); + + /* The current node already contains the word, no entry necessary */ + if (result == 0) + return; + + /* The word must be entered in the left or right sub-tree */ + if (result < 0) + enter(&(*node)->right, word); + else + enter(&(*node)->left, word); +} +/* + * scan -- scan the file for words * + * * + * Parameters * + * name -- name of the file to scan * + */ +void scan(char *name) +{ + char word[100]; /* word we are working on */ + int index; /* index into the word */ + int ch; /* current character */ + FILE *in_file; /* input file */ + + in_file = fopen(name, "r"); + if (in_file == NULL) { + fprintf(stderr, "Error:Unable to open %s\n", name); + exit(8); + } + while (1) { + /* scan past the whitespace */ + while (1) { + ch = fgetc(in_file); + + if (isalpha(ch) || (ch == EOF)) + break; + } + + if (ch == EOF) + break; + + word[0] = ch; + for (index = 1; index < sizeof(word); ++index) { + ch = fgetc(in_file); + if (!isalpha(ch)) + break; + word[index] = ch; + } + /* put a null on the end */ + word[index] = '\0'; + + enter(&root, word); + } + fclose(in_file); +} +/* + * print_tree -- print out the words in a tree * + * * + * Parameters * + * top -- the root of the tree to print * + */ +void print_tree(struct node *top) +{ + if (top == NULL) + return; /* short tree */ + + print_tree(top->left); + printf("%s\n", top->word); + print_tree(top->right); +} + +int main(int argc, char *argv[]) +{ + if (argc != 2) { + fprintf(stderr, "Error:Wrong number of parameters\n"); + fprintf(stderr, " on the command line\n"); + fprintf(stderr, "Usage is:\n"); + fprintf(stderr, " words 'file'\n"); + exit(8); + } + scan(argv[1]); + print_tree(root); + return (0); +} + + + + +//================== Get line demo ========================= +#include +#include +#include + +#define _GNU_SOURCE + +int main(int argc, char* argv[]) { + +size_t lsize = 0; +ssize_t read; +FILE* conf_file; +char* line = NULL; + +if (argc == 1) { +printf("\nCommand syntax:\n"); +printf("\n\tprogramd [ start | stop ]\n"); +printf("\t\tstart: start daemon\n"); +printf("\t\tstop: stop daemon\n"); +} +else if (strcmp(argv[1], "start") == 0) { +conf_file = fopen("/etc/program/program.conf", "r"); +fseek(conf_file, 0, SEEK_SET); +while (!feof(conf_file)) { +while (getline(&line, &lsize, conf_file) != -1) { +printf("%s", line); +} +} +} + +//====================== +fopen +fread +fscanf +getline + + +//================== scanf demo ============================= +/* Q: need this for the GCC atomic operations? + */ +#define _GNU_SOURCE +#include + +void main (void) { + + /* We will use one floating-point and one integer variable. */ + + double x = .00000123456789; + int n = 12345; + + + /* Display plain text. */ + + printf("This is a test\n"); + printf("This\tis\nanother\ttest\n\n"); + + + /* Display an integer. */ + + printf("Here is n: %d\n\n", n); + + + /* Display a double three different ways. */ + + printf("Here is x: %g\n", x); + printf("Here is x: %f\n", x); + printf("Here is x: %e\n\n", x); + + + /* Display two numbers. */ + + printf("Here are n (%d) and x (%g)\n", n, x); + +} + +//=========================================================== + +void read_One_MB(int f, int MB_y, int MB_x, MB_Info *curMB, FILE *inputFH); + +//================================================================= + { dataFile = new File( fileName ); + paramScanner = new Scanner( dataFile ); + } + catch( Exception e ) + { dataFile = null; + paramScanner = null; + System.err.println( "couldn't open file: " + fileName ); + } + + paramScanner.useDelimiter(",\\s*|\n|\r\n"); + + MatrixInRowMajor ( int _numRows, int _numCols ) + { + numRows = _numRows; // lives in super class + numCols = _numCols; + + rows = new float[numRows][]; + + for( int i = 0; i < numRows; i++ ) + { + rows[ i ] = new float[ numCols ]; + } + } + public void fillSelfFromFile( String fileName ) + { float floatValue = 0; + String floatString; + + super.setUpScanner( fileName ); + + for( int r = 0; r < numRows; r += 1 ) + { for( int c = 0; c < numCols; c += 1 ) + { floatString = paramScanner.next(); + floatValue = Float.parseFloat( floatString ); + rows[ r ][ c ] = floatValue; + } + } + } + +//================================================================ + + +/* Get the data strucs implicitly from the header file + */ +void read_All_Frames( FILE *inputFH ) + { int MB_x, MB_y, f; + + for( f = 1; f <= numFrames; f++ ) //allocated 10 frames of mem, use 8 + { //PixInFrame, PixInLine, MBsInFrame, frameWidthInMB, etc in header + uint8_t * startOfFrame_L = &(input_img_y[f][0]); + uint8_t * startOfFrame_CR = &(input_img_cr[f][0]); + uint8_t * startOfFrame_CB = &(input_img_cb[f][0]); + + // Reads in one frame + for( MB_y = 0; MB_y < frameHeightInMB; MB_y++ ) + { + for( MB_x = 0; MB_x < frameWidthInMB; MB_x++ ) + { //DEBUG: addr arith checks out (size of MB_Info is 240) + MB_Info *MBInfo = &(input_MBs[f][0]) + + (MB_y * frameWidthInMB + MB_x); + // Read Macroblock Parameters and pixel data + read_One_MB( f, MB_y, MB_x, MBInfo, inputFH ); + + MBInfo->PixInLine_L = oPixInLine_L; + MBInfo->PixInLine_C = oPixInLine_C; + + int offsetToMBsPix_L = + MB_y * (oPixInLine_L * MBHeight_L) + MB_x * MBWidth_L; + int offsetToMBsPix_C = + MB_y * (oPixInLine_C * MBHeight_C) + MB_x * MBWidth_C; + //DEBUG: addr arith checks out + MBInfo->startOfMBsPix_L = startOfFrame_L + offsetToMBsPix_L; + MBInfo->startOfMBsPix_CR = startOfFrame_CR + offsetToMBsPix_C; + MBInfo->startOfMBsPix_CB = startOfFrame_CB + offsetToMBsPix_C; + } + } + } + } + +/* Reads the parameters of macro block, then reads pixel data of MB. + * Give it frame num, y and x of macro block. It gets addresses of the + * arrays by including the header. + */ +void read_One_MB(int f, int MB_y, int MB_x, MB_Info *curMB, FILE *inputFH) + { + int dir, line, i, tmp, x, y, pixelIndex; + char strTemp[90]; + MB_Info_throwAway throwAway; + + fscanf(inputFH, "%s",&(strTemp[0])); //get rid of unused preamble string + + //(*curMB).MB_x = MB_x; + //(*curMB).MB_y = MB_y; + + + + /************ Read parameters ******************/ + + // first, get rid of unused data in the input stream. + fscanf(inputFH, "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d", + &(throwAway).MB_X, + &(throwAway).MB_Y, + &(throwAway).mb_stride, + &(throwAway).deblocking_filter, + &(throwAway).picture_structure, + &(throwAway).slice_alpha_c0_offset, + &(throwAway).slice_type, + &(throwAway).chroma_qp_index_offset[0], + &(throwAway).chroma_qp_index_offset[1], + &(throwAway).mb_xy_type, + &(throwAway).mb_xy_type_m1, + &(throwAway).mb_xy_type_top, + &(throwAway).qscale_mb_xy, + &(throwAway).qscale_mb_xy_m1, + &(throwAway).qscale_mb_xy_top, + &(throwAway).slice_table_mb_xy, + &(throwAway).slice_table_mb_xy_m1, + &(throwAway).slice_table_mb_xy_top + ); + + for(dir = 0; dir < 2; dir++) + for(line=0; line < 5*8; line++) + fscanf(inputFH, "%d ", &(throwAway).ref_cache[dir][line]); + + for(dir = 0; dir < 2; dir++) + for(line=0; line < 5*8; line++) + fscanf(inputFH, "%d %d ", &(throwAway).mv_cache[dir][line][0], + &(throwAway).mv_cache[dir][line][1]); + + for(line=0; line < 6*8; line++) + fscanf(inputFH, "%d ", &(throwAway).non_zero_count_cache[line]); + + + //now, get data will use in deblocking + + fscanf(inputFH, "%i %i", &(*curMB).endSubBlk[0], &(*curMB).endSubBlk[1]); + fscanf(inputFH, "%d",&(*curMB).startSubBlk[0]); + fscanf(inputFH, "%d",&(*curMB).startSubBlk[1]); + + //read bS + for(dir = 0; dir < 2; dir++) + for(line=0; line < 4; line++) + for (i=0; i < 4; i++) + { + fscanf(inputFH, "%d",&tmp); + (*curMB).bS[dir][line][i] = (int) tmp; + } + + //read luma_qp + for(dir = 0; dir < 2; dir++) + for(line=0; line < 4; line++) + fscanf(inputFH, "%d",&(*curMB).luma_qp[dir][line]); + + //read chroma_qp + for(dir = 0; dir < 2; dir++) + for(line=0; line < 4; line++) + fscanf(inputFH, "%d",&(*curMB).chroma_qp[dir][line]); + + + + /********* Have MB params, now read pixel data of MB *************/ + + /* The MB pixel data is read in one MB at a time. All pixel data + * goes into a single array. The pixel data is layed out in the array + * the same as the pixels appear on the screen, in screen-row major + * order. So, all the pixels in the 0th line at the top of the frame + * are next to each other, starting at the beginning of the array. + * Then, the second line begins at arrayAddr + frame_width_in_pixels, + * and so on. + * Get the pixels for one MB, so have to map the MB location onto the + * array location. + * The position of the MB's 0,0 pixel is offset by the data of all the + * MBs in MB-lines above, and by all the pixels in MBs to the left. + * So, the number of pixels in a MB-line is the number of lines in a MB + * times the number of pixels in a frame-line. Multiply that by the + * number of MB-lines above the current MB. + * Next, add the offset within the current MB-line, which is the + * number of MBs to the left times the width, in pixels, of one MB. + * Then, to get the offset of a particular pixel in the MB from the + * start of the MB, take the number of lines in the MB above the + * current pixel times the pixels-per-frame-line, then add the number + * of pixels to the left within the MB. + * + *(MB_y * MB_height * Frm_width + MB_x * MB_width) + (y * Frm_width + x) + */ + + /* read the pre-deblocking MB pixels, then the correct final pixels. + * Start of a macro block is num MB rows above * pixel lines in height + * of a MB * pixels in a line of frame, plus the numMB to left * pixel + * width of a MB.. MB_y and MB_x start at 0, so they are num above + * and num to left, repectively. + * Many of the width and height values are defined in header.*/ + //DEBUG: addr arith checks out + int offsetToMBsPix_L = MB_y * (oPixInLine_L * MBHeight_L) + MB_x * MBWidth_L; + for(y=0; y < MBHeight_L; y++) + { // first read all input Y, then all correct output Y + int offsetToLineInMB_L = offsetToMBsPix_L + y * oPixInLine_L; + for(x=0; x < MBWidth_L; x++) + { pixelIndex = offsetToLineInMB_L + x; + fscanf(inputFH, "%i",&tmp); + *(&input_img_y[f][0] + pixelIndex) = (char) tmp; + } + for(x=0; x < MBWidth_L; x++) + { pixelIndex = offsetToLineInMB_L + x; + fscanf(inputFH, "%i",&tmp); + *(&correct_img_y[f][0] + pixelIndex) = (char) tmp; + } + } + + //read croma b and r.. in all rest of code, goes "R before B" but here, + // input stream has the blue before the red.. FYI + //DEBUG: addr arith appears to check out (assuming have right model) + int offsetToMBsPix_C = MB_y * (oPixInLine_C * MBHeight_C) + MB_x * MBWidth_C; + for (y=0; y < MBHeight_C; y++) + { // first read all input CB & CR, then all correct output CB & CR + int offsetToLineInMB_C = offsetToMBsPix_C + y * oPixInLine_C; + for(x=0; x < MBWidth_C; x++) + { int pixelIndex_C = offsetToLineInMB_C + x; + fscanf(inputFH, "%d",&tmp); + *(&input_img_cb[f][0] + pixelIndex_C) = (char) tmp; + fscanf(inputFH, "%d",&tmp); + *(&input_img_cr[f][0] + pixelIndex_C) = (char) tmp; + } + for(x=0; x < MBWidth_C; x++) + { int pixelIndex_C = offsetToLineInMB_C + x; + fscanf(inputFH, "%d",&tmp); + *(&correct_img_cb[f][0] + pixelIndex_C) = (char) tmp; + fscanf(inputFH, "%d",&tmp); + *(&correct_img_cr[f][0] + pixelIndex_C) = (char) tmp; + } + } + } diff -r 000000000000 -r d138e0acf9a0 main.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/main.c Sun Aug 26 03:04:50 2012 -0700 @@ -0,0 +1,44 @@ +/* + * Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * author seanhalle@yahoo.com + */ + + +#include +#include +#include +#include +#include +#include +#include + +#include "BLIS_CONSTANTS.h" +#include "../BLIS/BLIS.h" +#include "../BLIS/DKU/DKU_common/DKU.h" + +#include "Matrix_Mult.h" + +/** + * This is the DKU version of Matrix Multiply sample application + * + */ +int main( int argc, char **argv ) + { Matrix *leftMatrix, *rightMatrix; + ParamBag *paramBag; + + paramBag = makeParamBag(); + readParamFileIntoBag( argv[1], paramBag ); + initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); + + resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix ); + + printf("\nresult matrix: \n"); + printMatrix( resultMatrix ); + +// BLIS_DKU__print_Stats_forInst( DKU_INST_MM ); + + exit(0); //cleans up + } +