Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > DKU > DKU__Matrix_Mult__Bench
changeset 0:d138e0acf9a0 tip
Initial add of standard DKU matrix mult code -- to be modified
| author | Sean Halle <seanhalle@yahoo.com> |
|---|---|
| date | Sun, 26 Aug 2012 03:04:50 -0700 |
| parents | |
| children | |
| files | BLIS_CONSTANTS.h DKU_INST_MM/Bundling_Quad.c DKU_INST_MM/Communicators.c DKU_INST_MM/DKU_INST_MM.h DKU_INST_MM/DKU_INST_MM_init.c DKU_INST_MM/Divide.c DKU_INST_MM/Kernel.c DKU_INST_MM/MakeRootDKUPieces.c DKU_INST_MM/Maker_and_Freer.c DKU_INST_MM/SerialKernel.c DKU_INST_MM/Undivide.c Matrix_Mult.c Matrix_Mult.h Read_Input_Matrix.c main.c |
| diffstat | 15 files changed, 2422 insertions(+), 0 deletions(-) [+] |
line diff
/*
 * File:   BLIS_CONSTANTS.h
 * Author: SeanHalle@yahoo.com
 *
 * Created on October 27, 2009, 6:19 AM
 */

#ifndef _BLIS_CONSTANTS_H
#define _BLIS_CONSTANTS_H

/* IDs of the DKU pattern instances in this application.
 * Numbering must begin at 1.
 * Each instance's directory, header, and init file carry the same name as
 *  its enumerator below.
 */
enum
{
    DKU_INST_MM = 1
};

#endif /* _BLIS_CONSTANTS_H */
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 2.2 +++ b/DKU_INST_MM/Bundling_Quad.c Sun Aug 26 03:04:50 2012 -0700 2.3 @@ -0,0 +1,349 @@ 2.4 +/* 2.5 + * Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org 2.6 + * Licensed under GNU General Public License version 2 2.7 + * 2.8 + * 2.9 + * Author: SeanHalle@yahoo.com 2.10 + * 2.11 + */ 2.12 + 2.13 +#include "malloc.h" 2.14 +#include "DKU_INST_MM.h" 2.15 +#include "../../BLIS/DKU/DKU_common/DKU.h" 2.16 + 2.17 +//Positions in the bundle 2.18 +enum 2.19 + { szPos = 0, 2.20 + numLRPos, 2.21 + numLCPos, 2.22 + numRRPos, 2.23 + numRCPos, 2.24 + LBMatrixPos, 2.25 + numPos = LBMatrixPos 2.26 + }; 2.27 + 2.28 +//============================ Bundling Quad =============================== 2.29 + 2.30 +/* This is the set of four bundling functions: 2.31 + * 2.32 + * bundleInputs -- takes a DKU piece and returns an array of data that 2.33 + * contains all the information the Kernel will need to 2.34 + * process that DKU piece 2.35 + * 2.36 + * unbundleInputs -- takes the array returned by bundleInputs and turns it 2.37 + * into a DKU piece that can be given to the Kernel 2.38 + * 2.39 + * bundleResults -- takes a DKU piece that has finished going through the 2.40 + * Kernel and places all result information into an array 2.41 + * 2.42 + * unbundleResults -- takes the output from bundleResults plus the original 2.43 + * DKU piece whose inputs were bundled to produce the 2.44 + * results, and modifies the state of the original DKU 2.45 + * piece to be as if it had gone through the kernel. 2.46 + * 2.47 + * The bundling quad ("quad" because there are four bundling functions).. the 2.48 + * bundling quad is only used for distributed memory hardware. When used, 2.49 + * they can be thought of as operating in two separate memories. 2.50 + * The bundleInputs and unbundleResults operate in one memory, where the full 2.51 + * original data structure is. 
2.52 + * The unbundleInputs and bundleResults operate in the second memory, where 2.53 + * the data for one piece gets sent to. 2.54 + * 2.55 + * 2.56 + * Call sequence: 2.57 + * 2.58 + * call bundlInputs in memory space of original data, giving it a DKU piece. 2.59 + * It returns a pointer to a byte array. (First int32 in array is its size) 2.60 + * The byte array is sent to remote memory to be processed. 2.61 + * In the remote memory, the byte array is received. 2.62 + * The unbundleInputs function is called on it, which creates a new DKUPiece 2.63 + * in the heap of the remote memory, and creates on the heap of the remote 2.64 + * memory any data structures embedded within the DKUPiece. 2.65 + * The unbundleInputs function may re-use portions of the byte array it is 2.66 + * given, so the run-time in the remote memory must perform buffer 2.67 + * management for its communications appropriately. 2.68 + * The byte array that the unbundleInputs function is given must have been 2.69 + * allocated on the heap within the same malloc state as the unbundleInputs 2.70 + * and bundleResults functions are linked to. 2.71 + * The returned DKUPiece is given to the Kernel function to process. 2.72 + * The DKUPiece is then given to the bundleResults function, which returns 2.73 + * a pointer to a byte array. The unbundleResults may simply return the 2.74 + * pointer to the same received inputBundle, so the communication and memory 2.75 + * management in the remote memory must behave accordingly. The 2.76 + * bundleResults function will free any memory allocated on the heap by the 2.77 + * unbundleInputs function. The only memory remaining on the heap when the 2.78 + * bundleResults function completes is the byte array returned by it. 
This 2.79 + * is the reason that the original inputBundle given to unbundleInputs must 2.80 + * be allocated on the same heap that bundleResults frees from: 2.81 + * bundleResults might call free on that inputBundle, so that call must 2.82 + * modify the correct malloc state. 2.83 + * The remote memory sends the resultBundle to the original memory, then 2.84 + * frees the resultBundle. 2.85 + * The original memory pairs the result bundle with the DKUPiece that the 2.86 + * corresponding inputBundle was made from. 2.87 + * The original memory calls unbundleResults, giving it the resultsBundle 2.88 + * and the corresponding DKUPiece that remained in the original memory. 2.89 + * The unbundleResults modifies the original memory such that it is identical 2.90 + * to the state it would be in if the Kernel were called on the DKUPiece 2.91 + * in the original memory. 2.92 + * The original memory is responsible for freeing the inputBundle that was 2.93 + * made by bundleInputs, but unbundleResults will free the resultsBundle 2.94 + * when it is done with it. This allows unbundleResults to simply assign 2.95 + * pointers to portions of the resultsBundle, rather than copying, if that 2.96 + * is appropriate for the Kernel. 2.97 + */ 2.98 +//=========================================================================== 2.99 + 2.100 +/* Layout: 2.101 + * sizeOfBundle 2.102 + * numLeftRows 2.103 + * numLeftCols 2.104 + * numRightRows 2.105 + * numRightCols 2.106 + * <left matrix data> 2.107 + * <right matrix data> 2.108 + * 2.109 + * calculate the sizes from the numbers of rows and cols of each, use the 2.110 + * size of the left matrix to calc start addr of right matrix.. 
in the 2.111 + * remote memory, just set pointers to the matrix locations in the bundles 2.112 + */ 2.113 +//TODO: carefully step through bundling quad -- check sizes and addr on 9x9 2.114 +void* bundleInputs_MM( DKUPiece* piece ) 2.115 + { void *bundle; 2.116 + int32 sizeOfLeftMatrix, sizeOfRightMatrix, sizeOfBundle; 2.117 + float32 *leftBundleMatrix,*leftMatrix,*rightBundleMatrix,*rightMatrix; 2.118 + float32 *leftBundleInsertPt, *rightBundleInsertPt; 2.119 + float32 *leftMatrixReadPt, *rightMatrixReadPt; 2.120 + 2.121 + MatrixProdPiece *prodPiece = (MatrixProdPiece *)piece->appSpecificPiece; 2.122 + leftMatrix = prodPiece->leftMatrix->matrix; 2.123 + rightMatrix = prodPiece->rightMatrix->matrix; 2.124 + 2.125 + int32 leftStartRow = prodPiece->leftStartRow; 2.126 + int32 leftStartCol = prodPiece->leftStartCol; 2.127 + int32 rightStartRow = prodPiece->rightStartRow; 2.128 + int32 rightStartCol = prodPiece->rightStartCol; 2.129 + int32 numLeftRows, numLeftCols, numRightRows, numRightCols; 2.130 + 2.131 + numLeftRows = prodPiece->leftEndRow - leftStartRow + 1; 2.132 + numLeftCols = prodPiece->leftEndCol - leftStartCol + 1; 2.133 + numRightRows = prodPiece->rightEndRow - rightStartRow + 1; 2.134 + numRightCols = prodPiece->rightEndCol - rightStartCol + 1; 2.135 + 2.136 + sizeOfLeftMatrix = sizeof( float32 ) * numLeftRows * numLeftCols; 2.137 + sizeOfRightMatrix = sizeof( float32 ) * numRightRows * numRightCols; 2.138 + sizeOfBundle = numPos * sizeof( int32 ) + 2.139 + sizeOfLeftMatrix + sizeOfRightMatrix; 2.140 + 2.141 + bundle = BLIS_DKU__makeInputBundle( sizeOfBundle ); 2.142 + 2.143 + *((int32 *)bundle + szPos) = sizeOfBundle; 2.144 + *((int32 *)bundle + numLRPos) = numLeftRows; 2.145 + *((int32 *)bundle + numLCPos) = numLeftCols; 2.146 + *((int32 *)bundle + numRRPos) = numRightRows; 2.147 + *((int32 *)bundle + numRCPos) = numRightCols; 2.148 + 2.149 + //NOTE: Don't need to know start and end.. 
they will be set in remote 2.150 + // memory according to the size (number) alone 2.151 + 2.152 + leftBundleMatrix = (float32 *) ((int32 *)bundle + LBMatrixPos); 2.153 + rightBundleMatrix = leftBundleMatrix + sizeOfLeftMatrix/sizeof(float32); 2.154 + 2.155 + int32 r, c, numColsInLeftMatrix, numColsInRightMatrix; 2.156 + numColsInLeftMatrix = prodPiece->leftMatrix->numCols; 2.157 + leftBundleInsertPt = leftBundleMatrix; 2.158 + for( r = 0; r < numLeftRows; r++ ) 2.159 + { leftMatrixReadPt = leftMatrix + 2.160 + (leftStartRow + r) * numColsInLeftMatrix + 2.161 + leftStartCol; //these are counts, compiler does *4 2.162 + for( c = 0; c < numLeftCols; c++ ) 2.163 + { 2.164 + *(leftBundleInsertPt++) = *(leftMatrixReadPt++); 2.165 + } 2.166 + } 2.167 + 2.168 + // Have to do separate loops for left and right because may be diff shapes 2.169 + numColsInRightMatrix = prodPiece->rightMatrix->numCols; 2.170 + rightBundleInsertPt = rightBundleMatrix; 2.171 + for( r = 0; r < numRightRows; r++ ) 2.172 + { rightMatrixReadPt = rightMatrix + 2.173 + (rightStartRow + r) * numColsInRightMatrix + 2.174 + rightStartCol; 2.175 + for( c = 0; c < numRightCols; c++ ) 2.176 + { 2.177 + *(rightBundleInsertPt++) = *(rightMatrixReadPt++); 2.178 + } 2.179 + } 2.180 + 2.181 + return bundle; 2.182 + } 2.183 + 2.184 +/*Leave all the data in bundle, just assign pointers to it. 2.185 + * Create a DKUPiece data structure, then fill in the sizes and pointers. 2.186 + * 2.187 + *This is app code, but need to make it easy for specialization. 2.188 + *On machines like the Cell, the code for this function will be copied 2.189 + * over to a separate file, along with any other DKU functions needed in 2.190 + * the remote memory. 2.191 + * 2.192 + *The scheduler in remote memory is responsible for making space for the 2.193 + * input bundle, and for freeing it (if needed) after the result bundle has 2.194 + * been sent back. 
2.195 + * 2.196 + *Model is that use an override of malloc that puts everything malloc'd from 2.197 + * unbundleInputs calls and from bundleResults call into a buffer in remote 2.198 + * mem. This entire buffer is freed after the return of the result bundle is 2.199 + * complete. 2.200 + */ 2.201 + void 2.202 +unbundleInputs_MM( void *bundle, DKUPiece *piece ) 2.203 + { int32 sizeOfBundle, numLeftRows, numLeftCols, numRightRows, numRightCols; 2.204 + int32 sizeOfLeftMatrix, sizeOfRightMatrix, sizeOfResultMatrix; 2.205 + float32 *leftBundleMatrix, *rightBundleMatrix; 2.206 + MatrixProdPiece *prodPiece; 2.207 + 2.208 + sizeOfBundle = *((int32 *)bundle + szPos); 2.209 + 2.210 + numLeftRows = *((int32 *)bundle + numLRPos); 2.211 + numLeftCols = *((int32 *)bundle + numLCPos); 2.212 + numRightRows = *((int32 *)bundle + numRRPos); 2.213 + numRightCols = *((int32 *)bundle + numRCPos); 2.214 + 2.215 + 2.216 + sizeOfLeftMatrix = sizeof( float32 ) * numLeftRows * numLeftCols; 2.217 + sizeOfRightMatrix = sizeof( float32 ) * numRightRows * numRightCols; 2.218 + sizeOfResultMatrix = sizeof( float32 ) * numLeftRows * numRightCols; 2.219 + 2.220 + leftBundleMatrix = (float32 *) ((int32 *)bundle + LBMatrixPos); 2.221 + rightBundleMatrix = leftBundleMatrix + sizeOfLeftMatrix/sizeof(float32); 2.222 + 2.223 +//ARCH: check, for Cell, what's involved with re-defining malloc that appears 2.224 +// inside DKUPiece maker and app spec piece maker.. can make it buffer-alloc? 2.225 + 2.226 + //that indicate how much stuff is created automatically inside 2.227 + //IE, does this make produce the sched data, in bundling quad? 
2.228 + prodPiece = DKU__makeMatrixProdPiece_Flat( piece ); 2.229 + piece->appSpecificPiece = prodPiece; 2.230 + 2.231 + prodPiece->leftMatrix = 2.232 + DKU__makeMatrix_Flat( numLeftRows, numLeftCols, piece ); 2.233 + prodPiece->leftMatrix->matrix = leftBundleMatrix; 2.234 + 2.235 + prodPiece->rightMatrix = 2.236 + DKU__makeMatrix_Flat( numRightRows, numRightCols, piece ); 2.237 + prodPiece->rightMatrix->matrix = rightBundleMatrix; 2.238 + 2.239 + prodPiece->resultMatrix = 2.240 + DKU__makeMatrix_Flat( numLeftRows, numRightCols, piece ); 2.241 + //The result matrix is malloc'd, so it's not inside the input bundle, 2.242 + // so, to avoid copies when make the result bundle, make it now, then 2.243 + // put into the DKUPiece the pos of result matrix in result bundle. 2.244 + void *resultBundle = 2.245 + BLIS_DKU__malloc_toPiece( sizeOfResultMatrix + sizeof(int32), piece); 2.246 + *((int32 *)resultBundle) = sizeOfResultMatrix + sizeof(int32); 2.247 + //skip over the "size" at the start of the result bundle 2.248 + prodPiece->resultMatrix->matrix = (float32 *)((int32 *)resultBundle + 1); 2.249 + 2.250 + //now, fill in the iteration bounds so that the kernel processes 2.251 + // the entirety of both matrices. 
2.252 + prodPiece->leftStartRow = 0; 2.253 + prodPiece->leftEndRow = numLeftRows - 1; // "- 1" 'cause start at 0 2.254 + prodPiece->leftStartCol = 0; 2.255 + prodPiece->leftEndCol = numLeftCols - 1; 2.256 + 2.257 + prodPiece->rightStartRow = 0; 2.258 + prodPiece->rightEndRow = numRightRows - 1; 2.259 + prodPiece->rightStartCol = 0; 2.260 + prodPiece->rightEndCol = numRightCols - 1; 2.261 + 2.262 + prodPiece->prodStartRow = 0; 2.263 + prodPiece->prodEndRow = numLeftRows - 1; 2.264 + prodPiece->prodStartCol = 0; 2.265 + prodPiece->prodEndCol = numRightCols - 1; 2.266 + } 2.267 + 2.268 +/* 2.269 + *Model is that use an override of malloc in remote mem that puts everything 2.270 + * malloc'd from unbundleInputs calls and from bundleResults call into a 2.271 + * buffer in remote mem. The entire buffer is freed after the send of the 2.272 + * return result bundle is complete. 2.273 + * 2.274 + *The application only has to know that it does not perform free on any of 2.275 + * the inputBundles, nor on any of the resultBundles, in either local or 2.276 + * remote memory. 2.277 + *The application also must create new DKUPiece s in the bundling quad plus 2.278 + * in the Kernel (and all calls rooted at the Kernel) by using 2.279 + * BLIS_DKU__makeDKUPiece 2.280 + *Finally, the application must create app-specific pieces 2.281 + *So anything malloc'd inside bundleResults is still inside the same buffer 2.282 + * used by unbundleInputs. 2.283 + */ 2.284 +//ARCH: what about just give unbundleInputs and bundleResults an "align" 2.285 +// operator that's HW-supplied. 2.286 +// First element of bundle is size, then "0" terminated list of offsets to 2.287 +// alignable-chunks, then the alignable chunks start. Alignment happens 2.288 +// during bundling. HW also supplies a checker to see if aligned bundle is 2.289 +// too big. (add a "revert divide" so can do a new divide to get smaller 2.290 +// pieces, or something.. 
) 2.291 + void * 2.292 +bundleResults_MM( DKUPiece *piece, void *inputBundle ) 2.293 + { MatrixProdPiece *matProd; 2.294 + float32 *matProdArr; 2.295 + 2.296 + matProd = (MatrixProdPiece *)piece->appSpecificPiece; 2.297 + matProdArr = matProd->resultMatrix->matrix; 2.298 + 2.299 + //TODO: figure out soln for alignment or result matrix when it's inside 2.300 + // input-bundle 2.301 + 2.302 + //results bundle already made (inside unbundleInputs fn), resultsBundle 2.303 + // addr is one int32 before addr of result matrix array. 2.304 + void *resultsBundle = ((int8 *)matProdArr - sizeof(int32)); 2.305 + 2.306 + return resultsBundle; 2.307 + } 2.308 + 2.309 +/* The DKU standard says that the scheduler guarantees that the same 2.310 + * DKUPiece that created an input bundle will be called to unbundle 2.311 + * the corresponding results bundle. 2.312 + * This means that the unbundleResults method is called on the original 2.313 + * piece, that still has the position within the result matrix where 2.314 + * this piece's results should go. 
2.315 + * So, copy the values in the incoming result matrix to the correct 2.316 + * sub-block of the "original" result matrix 2.317 + */ 2.318 + void 2.319 +unbundleResults_MM( void * resultBundle, DKUPiece *origPiece ) 2.320 + { float32 *bundMatrixArr, *resMatrixArr, *bundleReadPt, *resultInsertPt; 2.321 + MatrixProdPiece *matProd; 2.322 + Matrix *resultMatrix; 2.323 + 2.324 + int32 resMatNumRows, resMatNumCols; 2.325 + int32 prodStartRow, prodEndRow, prodStartCol, prodEndCol, r, c; 2.326 + 2.327 + bundMatrixArr = (float32 *) ((int32 *)resultBundle + 1); 2.328 + 2.329 + matProd = (MatrixProdPiece *) origPiece->appSpecificPiece; 2.330 + resultMatrix = matProd->resultMatrix; 2.331 + resMatrixArr = resultMatrix->matrix; 2.332 + 2.333 + resMatNumRows = resultMatrix->numRows; 2.334 + resMatNumCols = resultMatrix->numCols; 2.335 + 2.336 + prodStartRow = matProd->prodStartRow; 2.337 + prodEndRow = matProd->prodEndRow; 2.338 + prodStartCol = matProd->prodStartCol; 2.339 + prodEndCol = matProd->prodEndCol; 2.340 + 2.341 + //copy the results from the matrix in the bundle to 2.342 + // the full result matrix. 2.343 + bundleReadPt = bundMatrixArr; 2.344 + for( r = prodStartRow; r < prodEndRow; r++ ) 2.345 + { resultInsertPt = resMatrixArr + r * resMatNumCols + prodStartCol; 2.346 + for( c = prodStartCol; c <= prodEndCol; c++ ) 2.347 + { *(resultInsertPt++) = *(bundleReadPt++); 2.348 + } 2.349 + } 2.350 + } 2.351 + 2.352 +
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 3.2 +++ b/DKU_INST_MM/Communicators.c Sun Aug 26 03:04:50 2012 -0700 3.3 @@ -0,0 +1,435 @@ 3.4 +/* 3.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 3.6 + * Licensed under GNU General Public License version 2 3.7 + * 3.8 + * Author: seanhalle@yahoo.com 3.9 + */ 3.10 + 3.11 +#include "DKU_INST_MM.h" 3.12 + 3.13 +/*Communicators are used by the scheduler to send data from one piece to 3.14 + * another. The kernel specifies when the communication is to take place 3.15 + * by calling either a send communicator or a receive communicator. 3.16 + *The communicators know what data to send and where to send it by reading 3.17 + * information out of the DKUPiece structure. The information about which 3.18 + * other piece to send to and what data is placed into the piece by the 3.19 + * DKUPieceMaker and the Divider. 3.20 + *This is how dependencies among data pieces are encoded and how the 3.21 + * scheduler is informed of them. 3.22 + */ 3.23 + 3.24 +/*In Deblocking, the dependency pattern is a 45 degree diagonal. A given 3.25 + * macro block receives information from the macro block above it and the 3.26 + * macro block to its left. 3.27 + *This must be told to the scheduler, so that it can order the execution of 3.28 + * pieces appropriately. 3.29 + * application programmer has chosen to divide each 3.30 + * screen frame into diagonals 3.31 + */ 3.32 + 3.33 +/*Okay, so going with the plan that match the scheduler form to the 3.34 + * application form. Expose explicitly that different applications have 3.35 + * characteristics that can be taken advantage of by scheduler impl to make 3.36 + * more efficient scheduler. 3.37 + *So, take advantage of flexibility of BLIS interface philosophy.. make an 3.38 + * interface that is tuned to particular characteristic of application to 3.39 + * allow efficient scheduler impl that takes advantage. 3.40 + *In this case, it is fact that have a pattern of dependencies. 
Make the 3.41 + * dependencies first-class entities in the interface. Make a primitive for 3.42 + * expressing the dependencies, and one for embodying the dependencies. 3.43 + * 3.44 + *The embodiment will be an array. The DKUPieceMaker will create the array 3.45 + * and populate it for the initial pieces it makes.. something like that.. 3.46 + * The DKUPieceMaker also returns a pointer to the initially-free piece. 3.47 + * (Note that there's only ever one initially free piece. If there were two 3.48 + * then they wouldn't depend on each other, and so could be combined into 3.49 + * a single piece.. or? In some cases would require separate 3.50 + * "combination" data-struc.. so, maybe an array of initially free) 3.51 + *Each division places an array in the parent, with one position for each 3.52 + * sub-piece. The position holds the count, of the number of propendents, 3.53 + * for the sub-piece in the corresponding position in the sub-piece array. 3.54 + * 3.55 + *The fellow sub-pieces of a parent typically won't have any dependencies 3.56 + * on each other.. the dependencies are to sub-pieces of other parents. 3.57 + *So the DKUPieceMaker will create the largest pieces that can be sub-divided 3.58 + * freely, and also create an array with the propendent count for each of 3.59 + * those pieces. 3.60 + *The undivider is where propendent counts are updated. The Undivider is 3.61 + * what frees DKUPieces to be scheduled. That process is inside the 3.62 + * scheduler implementation. 3.63 + *The divider is written to know about the arrays of propendent counts of the 3.64 + * other DKUPieces. So is the Undivider. The divider looks at the arrays 3.65 + * that already exist and populates the new array for the currently being 3.66 + * divided parent accordingly. It also puts in information for the Undivider 3.67 + * to properly update the counts in the arrays when a piece finishes. 
3.68 + *Big question: can Albert perform a static analysis that can understand 3.69 + * general code in the divider and undivider, given that it knows explicitly 3.70 + * what the arrays are used for, and just has to look at the code that 3.71 + * creates, populates, and updates the arrays.. The idea being that the 3.72 + * polyhedral model can learn the dependencies this way. 3.73 + * 3.74 + *Each dependency has an associated communicator. A communicator is what 3.75 + * performs transport of the propendent-generated-state to the receiving 3.76 + * dependent. A communicator and a dependency arrow are the same thing. 3.77 + *In shared memory, the communicator is normally not used. Only in distr 3.78 + * memory is it invoked to bundle up the propendent data and carry it to the 3.79 + * dependent. 3.80 + *The Undivider tells the scheduler each time a completed propendent has 3.81 + * updated the count of a given dependent. This will trigger the 3.82 + * scheduler to fire the communicator, if it has been implemented that way. 3.83 + *This lets the scheduler, for example, send data around before the 3.84 + * propendent data is available. So, it gets better overlap of communication 3.85 + * and computation. 3.86 + * 3.87 + *So, this scheme so far covers the case of "2D" parallelism in Deblocking. 3.88 + * But it doesn't yet cover simulation where the communication happens in 3.89 + * the middle of a loop nest. 3.90 + *For that, still have the same communicator, which is still one-to-one with 3.91 + * a dependency in the data. 3.92 + *But now, the Kernel invokes the communicator, when the propendent has 3.93 + * finished producing data.. By only calling in the propendent, causality is 3.94 + * always enforced, with no extra mechanism required in the scheduler. 3.95 + *The Kernel also calls the communicator in the dependent, in the position 3.96 + * that data must be received before continuing on. 
3.97 + *This way, the scheduler is free to implement the timing of communication 3.98 + * in many different ways. The communicator explicitly copies data to a 3.99 + * separate communication-area that is made in the propendent DKUPiece 3.100 + * by the Divider or DKUPieceMaker during creation. 3.101 + *It returns when the copy is done, handing the scheduler a pointer to the 3.102 + * data. The scheduler then handles moving the data to the dependent piece 3.103 + * (or not, on shared memory). The receiving end of the communicator 3.104 + * accepts a pointer to the data. The Kernel is written to access data 3.105 + * through that pointer. 3.106 + * 3.107 + *Wanting to allow shared memory to NOT perform the copy, just pass a pointer 3.108 + * to the area of data that the dependent needs, and have the Kernel access 3.109 + * the data in the appropriate way (according to data in the DKUPiece pointed 3.110 + * to).. thing is, still need something to perform the copy in the distr. 3.111 + * memory case. 3.112 + *So, seeing two different sending communicators, and two different receiving 3.113 + * communicators. For distr mem, the sender does a copy into area in 3.114 + * sender's DKUPiece, then returns pointer to that copy (it's a bundle, with 3.115 + * first location being a uint32 with size of bundle).. or maybe have a 3.116 + * pre-defined element in the DKUPiece. The receiver is implemented as part 3.117 + * of the scheduler.. it returns a pointer to the Dependent calling Kernel. 3.118 + *Shared mem communicators, the sender is implemented by the scheduler, as 3.119 + * well as the receiver. The propendent Kernel simply passes the DKUPiece 3.120 + * to the sender. 3.121 + * 3.122 + *Right.. so, three out of the four are implemented by the scheduler.. so 3.123 + * just make the fourth be an optional do-hickie. The scheduler implements 3.124 + * both the send and receive calls, and the application provides a 3.125 + * communication bundler. 
: D 3.126 + *The receiver still has two different cases: in one it gets a bundle, in 3.127 + * the other it gets a pointer to a DKUPiece. No point in unbundling, just 3.128 + * so the Kernel can do the gather operation again. So, the Kernel will have 3.129 + * to have a different version, one for shared, second for distr.. yuck. 3.130 + * 3.131 + *Q: starting to have different "kinds" of DKUPiece now.. how going to 3.132 + * handle backwards-and-forwards compatibility? 3.133 + * 3.134 + *For shared mem, 3.135 + */ 3.136 + 3.137 +void * bundleComm_BarnesHut_110( DKUPiece piece ) 3.138 + { 3.139 + //this copies data out of the piece for the "110" going direction 3.140 + //All communication bundlers have the same prototype: void * foo(DKUPiece) 3.141 + //The bundlers are registered with the scheduler for this DKU instance 3.142 + // via the BLIS_DKU__set_commBundler_ForID( &foo, commTypeID, DKU_Inst_ID) 3.143 + //Which two pieces communicate is set by the propendent and dependent 3.144 + // calls to communicate, which happen in the Kernel. The PieceMaker and 3.145 + // Divider know which pieces communicate to which at what points in the 3.146 + // Kernel. So, they insert into the DKUPiece data structure the ID of 3.147 + // the piece they send to/receive from. When the Kernel reaches the 3.148 + // communication point, it takes the propendent/dependent pieceID from 3.149 + // the DKUPiece and passes that to the communicate() call (along with the 3.150 + // commTypeID, which is fixed for each call that appears in the Kernel). 3.151 + //In other words, the Kernel, Divider, and PieceMaker are the only ones 3.152 + // that agree among themselves on what a particular commTypeID means. For 3.153 + // example, for graphs, there is only one type ID, because the process of 3.154 + // copying is the same no matter which direction.. but for a mesh, the 3.155 + // copy is different for the top, bottom, and two edges.. 
3.156 + 3.157 + /*There is the basic issue of exposing in the Kernel code whether running 3.158 + * on shared memory or distr memory. 3.159 + *Could either have a fixed Kernel that adapts, or have two Kernels that 3.160 + * are chosen among by the scheduler. 3.161 + *If the Kernel is fixed, have only a few choices: 3.162 + * Kernel always thinks it's taking data from a DKUPiece, or 3.163 + * Kernel always takes data via a fixed interface-call. 3.164 + *Not sure how to hide if Kernel always takes from a DKUPiece.. and extra 3.165 + * work in remote memory to re-create DKUPiece structure, for only reason 3.166 + * to make interface nice (for programmer). 3.167 + *With the fixed interface, could provide two adaptors: one for shared 3.168 + * mem that gathers from a DKUPiece, other for a bundle. Then scheduler 3.169 + * picks, behind the interface, which adaptor. 3.170 + *How would that look to the Kernel? Something like "update", and the 3.171 + * local memory changes.. but that only works if there's a fixed data- 3.172 + * struc in local mem to take from as the Kernel computation progresses. 3.173 + *IE, have an outer loop that the communication is in, and inner loops 3.174 + * that use the result of communication. For shared mem, want that data- 3.175 + * struc to be the appSpecificPiece of another DKUPiece, and locations in 3.176 + * it will be read (or written) inside the inner loops. This means the 3.177 + * scheduler has to figure out an ordering to run the Kernels that 3.178 + * respects the dependencies. This is aided by the fact that the Kernel 3.179 + * has a separate propendent and dependent call that states the order. 3.180 + *There will actually be some Kernels that can't be run this way: the 3.181 + * pattern of dependencies has no sequential soln, a copy must be done. 3.182 + *Starting to think perhaps it's best to just always do a copy.. 3.183 + * 3.184 + *Other choice is two different versions of the Kernel.. 
one that reads 3.185 + * from a DKUPiece, the other that reads from a commBundle. 3.186 + *Or a hybrid Kernel that can read from both, using a flag 3.187 + * in the DKUPiece that tells the Kernel whether to receive a 3.188 + * commBundle or whether to take from the normal data in the propendent. 3.189 + *The two Kernel version is the most run-time efficient. The main drawback 3.190 + * is that the app developer has to make identical changes in two 3.191 + * different places.. any time they change one of the Kernel-copies they 3.192 + * have to change the other too.. also, it feels weird having two 3.193 + * different Kernels.. don't like it.. 3.194 + *Kinda like the hybrid approach, it has the second least run-time 3.195 + * overhead, just an IF statement that will be well predicted each time 3.196 + * it accesses data during calculations.. Could even use a #define here, 3.197 + * so the IF is known at compile time to always go one direction. The 3.198 + * specialization module would change the #define to: SharedMem or to 3.199 + * DistrMem. Or, make the if a #ifdef in the source.. but that's ugly 3.200 + * to read. 3.201 + *Something to percolate.. whatever is chosen, the problem is solved, 3.202 + * just a matter of tradeoffs at this point.. 3.203 + */ 3.204 + } 3.205 + 3.206 +void sampleKernel( DKUPiece *piece ) 3.207 + { 3.208 + initializePropendents(); 3.209 + for( outer = 0; outer < N; outer++ ) 3.210 + { fromProp = getFromPropendent( piece->propPieceIDs[ NORTH_PROPENDENT ]); 3.211 + for( inner = 0; inner < N; inner++ ) 3.212 + { //This Kernel knows that data is an array because it's written 3.213 + // by the app-programmer. It knows that fromProp is also an 3.214 + // array because the app progr wrote the commBundler. 
3.215 + piece->appSpecPiece->data[ inner ] += fromProp[ inner ]; 3.216 + } 3.217 + sendToDependent( piece->depPieceIDs[ SOUTH_DEPENDENT ] ); 3.218 + } 3.219 + finalizeDependents(); 3.220 + } 3.221 + 3.222 +/*In the divider, one has a single piece, which has communicators at its 3.223 + * boundaries already. 3.224 + *The divider cuts up the piece, and so knows which sub-pieces talk to 3.225 + * which others, because it just made them all from the same parent piece. 3.226 + *The tricky part is connecting the hierarchy. 3.227 + *The pieces the parent communicated with are on the order of the size of 3.228 + * the parent. Those pieces will likely have been divided as well.. now 3.229 + * the question is how to hand-off from the parent to the appropriate 3.230 + * sub-pieces. 3.231 + *One way to do it is to have a Kernel running for each piece in the 3.232 + * hierarchy, but of two kinds: hierarchy-Kernels and "normal" Kernels. The 3.233 + * hierarchy Kernels only do communication: they break an in-coming request 3.234 + * among their sub-pieces, then gather the responses back together. 3.235 + *This is inefficient in one sense: direct communication between sub-pieces 3.236 + * of different Kernels would be optimal. 3.237 + *However, such a hierarchy will 3.238 + * only exist when a physical hierarchy exists among machines. In that 3.239 + * case, the extra scatter-gather work done by the hierarchy pieces might 3.240 + * even be more efficient because it makes fewer, larger, communications 3.241 + * between the larger physical entities. This is more likely to ameliorate 3.242 + * the loss from the larger latency in communication at the larger physical 3.243 + * division. 3.244 + *Okay, so going with that, for now. When get details, may see some patterns 3.245 + * for how to do direct communication among sub-pieces.. (but not holding 3.246 + * my breath because each of two parents can be divided into a different 3.247 + * number of sub-pieces.. 
so there is no one-to-one between sub-pieces on 3.248 + * the edge of one parent and sub-pieces on a communicating edge of another 3.249 + * parent.) 3.250 + * 3.251 + *This sample is for a big linked list. Each piece is just a number of nodes 3.252 + * of the list. 3.253 + *When the Divider makes sub-pieces, it knows which ones communicate across 3.254 + * the boundaries of the parent (because the application programmer wrote 3.255 + * the Divider and placed the code in it that handles the boundaries of the 3.256 + * parent). 3.257 + *Patterns for how to do this part: 3.258 + *Could have the Divider create some new structure that it places in the 3.259 + * parent that holds the state for the CommKernel. Put into that structure 3.260 + * all the sub-piece-IDs that will communicate with it. Into the sub-pieces 3.261 + * put a commID not of the parent piece, but of the parent's CommKernel. So 3.262 + * a piece gains a second ID when it is divided. It keeps its original 3.263 + * commID and uses that to communicate with siblings, while it uses the 3.264 + * subCommID to communicate with sub-pieces. 3.265 + *Just going with that one idea for the moment.. 3.266 + *So, for the Linked List example, the Divider will set all the sub-pieces 3.267 + * to talk to each other, and set the end-pieces to talk to the subCommID of 3.268 + * the parent. 3.269 + *The parent, meanwhile, will have a normal commID with which it talks to 3.270 + * its siblings. 3.271 + *When the normal Kernel modifies the linkings in the list, it has to check 3.272 + * if one of the elements modified is a boundary element, and if so if the 3.273 + * new arrangement has changed which element is the boundary. 3.274 + *For example, an element is added to one end of a sub-piece. The added 3.275 + * element has its link-ptr set to a value that indicates it's a boundary 3.276 + * element, for example NULL or -1. The old boundary element's pointer is 3.277 + * set to the new boundary element. 
The commID is taken from the old 3.278 + * boundary element and put into the new boundary element. Done. 3.279 + *The CommKernel will take advantage of the fact that it knows it's in a 3.280 + * hierarchy.. it will keep an array of the values at the boundaries of its 3.281 + * siblings. (Note that one DKUPiece is disallowed from holding a pointer 3.282 + * to another DKUPiece).. the values may get out-of-sync, so they will be 3.283 + * fixed-up when detected. 3.284 + *Here's the pattern: when CommKernel gets comm from sub-piece, it looks to 3.285 + * see what kind of comm it is.. if it's a "here's a value to insert", then 3.286 + * it checks the end-values of its siblings and picks the sibling it belongs 3.287 + * on. It sends to that sibling (requires unsolicited reception mechanism, 3.288 + * such as the signals method for re-divide).. includes its own boundary 3.289 + * values (free to piggy-back).. that updates the receiver's view of the 3.290 + * sibling-piece's boundary values. 3.291 + * 3.292 + *When CommKernel receives a value-to-insert, 3.293 + * it checks the values at its two boundary elements. If the value is 3.294 + * between, then it accepts it. If not, it responds to the sender, telling 3.295 + * the sender what the receiver's actual boundary values are. It then sends 3.296 + * the value to the piece it believes it should go to (received the sender's 3.297 + * boundary values along with the insert value, so it's certain it won't 3.298 + * send back to where it came from..) Eventually the value will land at 3.299 + * the correct CommKernel. 3.300 + *Seeing pieces given to the CommKernel of the top-level parent. It then 3.301 + * hands them out among its children, and from there to next level of 3.302 + * children, and so on. Notice that siblings talk directly to each other, 3.303 + * they don't go up to the parent then back down. 
3.304 + *The value of this abstract data type will be handling an enormous number 3.305 + * of inserts, deletes, and lookups. 3.306 + * 3.307 + */ 3.308 +void sampleDivider( DKUPiece *piece, int numPieces ) 3.309 + { DKUPiece * newPiece; 3.310 + //First sub-piece, so it is a boundary of the parent 3.311 + //Figure out if parent has a sibling, or if it is natural boundary 3.312 + if(numPieces < 2) return; //leave sub-pieces empty if only 1 sub-piece 3.313 + 3.314 + newPiece = makeASubPiece( someValues ); 3.315 + if( BLIS_DKU__isNaturalBoundary( piece->propPieceIDs[ LEFT_PROPENDENT ]) ) 3.316 + { newPiece->propPieceIDs[ LEFT_PROPENDENT ] = 3.317 + BLIS_DKU__makeNaturalBoundaryPiece( piece, newPiece, DKU_INST_ID ); 3.318 + } 3.319 + else //parent has a sibling, so communicate with parent's CommKernel 3.320 + { newPiece->propPieceIDs[ LEFT_PROPENDENT ] = 3.321 + BLIS_DKU__giveCommKernelAsPropendent( piece ); //scheduler returns the 3.322 + //thing that it has implemented as "addr" of CommKernel of piece 3.323 + } 3.324 + for( pieceIdx = 1; pieceIdx < numPieces - 1; pieceIdx++ ) 3.325 + { newPiece = makeASubPiece( someValues ); 3.326 + newPiece->propPieceIDs[ LEFT_PROPENDENT ] = (subPieces[pieceIdx - 1]); 3.327 + subPieces[pieceIdx-1]->propPieceIDs[ RIGHT_PROPENDENT ] = (newPiece); 3.328 + } 3.329 + newPiece = makeASubPiece( someValues ); 3.330 + newPiece->propPieceIDs[ LEFT_PROPENDENT ] = (subPieces[numPieces - 1]); 3.331 + if( BLIS_DKU__isNaturalBoundary( piece->propPieceIDs[ RIGHT_PROPENDENT ])) 3.332 + { newPiece->propPieceIDs[ RIGHT_PROPENDENT ] = 3.333 + BLIS_DKU__makeNaturalBoundaryPiece( piece, newPiece, DKU_INST_ID ); 3.334 + } 3.335 + else //parent has a sibling, so communicate with parent's CommKernel 3.336 + { newPiece->propPieceIDs[ RIGHT_PROPENDENT ] = 3.337 + BLIS_DKU__giveCommKernelAsPropendent( piece ); //scheduler returns the 3.338 + //thing that it has implemented as "addr" of CommKernel of piece 3.339 + //Might have to make left propendent and right 
propendent be different 3.340 + // addresses.. in which case include a directionID in the call to 3.341 + // the scheduler asking for the address. 3.342 + //Might be something about matching dependents with propendents.. not 3.343 + // sure how that's going to play out.. 3.344 + //There's "pull from propendent" and "push to dependent" which are 3.345 + // interrupt-model.. then there's propendent sends and dependent 3.346 + // receives. 3.347 + 3.348 + } 3.349 + } 3.350 +/*Some ill-fits in here.. need to do real app, with real dependencies and 3.351 + * real comm in it.. 3.352 + * 3.353 + *Thinking perhaps give each piece a "name" struc.. the makeDKUPiece() 3.354 + * creates for DKUInstances that have a communicator registered.. (will 3.355 + * add some overhead to makeDKUPiece if have to do DKU-instance lookup) 3.356 + *WANT DKU-INSTANCE LOOKUP TO BE TRANSFORMED BY SPECIALIZATION 3.357 + * in practice, want the lookup to be performed statically, to eliminate the 3.358 + * dynamic overhead.. can do this if define semantics of DKUInstance to 3.359 + * be one-time and one-time only setting of functions to an instance.. have 3.360 + * to check this statically in the BLIS-rule-checker that's run in the 3.361 + * makefile in the sequential development environment. 3.362 + * 3.363 + *Then, the Divider simply copies the name-struc out of target pieces and 3.364 + * puts it into source pieces. 3.365 + *This allows, for example, making commKernels that have the name-strucs of 3.366 + * all the siblings.. so, for example, the commKernel can calculate which 3.367 + * of the siblings it should send to.. in fact, each of the Kernels can 3.368 + * calculate which of the siblings it should send to.. re-use the same 3.369 + * CommKernel in the leaf-kernels as well as in the parent.. 3.370 + *The only complication to re-using CommKernel on all levels is the 3.371 + * boundaries.. just make the boundary be the CommKernel in the parent, 3.372 + * or a natural boundary..
3.373 + */ 3.374 + 3.375 + 3.376 + 3.377 +/*The purpose of a CommKernel is to gather communications from all the sub- 3.378 + * pieces, put them together, and turn them into communications from the 3.379 + * parent piece. And vice-versa: receive comm, break it up, hand a portion 3.380 + * to each sub-piece. 3.381 + *This sample is for a big linked list. 3.382 + */ 3.383 +void sampleCommKernel( DKUPiece *piece ) 3.384 + { 3.385 + 3.386 + } 3.387 + 3.388 +/*Q to Albert: just how bad is this? Will a static tool be able to 3.389 + * understand the linkages, given that they are a fixed pattern of linkages 3.390 + * that are signalled by the fixed function-names..? 3.391 + *For example, could a static tool understand that 3.392 + * piece->propPieceIDs[ NORTH_PROPENDENT ] means a DKUPiece, which is set 3.393 + * in the Divider.. and understand that 3.394 + * piece->appSpecPiece->data[ inner ] += fromProp[ inner ]; means the 3.395 + * actual use of the data gotten from the DKUPiece.. then use that 3.396 + * understanding to replace the calls with a schedule plus direct access 3.397 + * to the propendent DKUPiece instead of the intermediate access first to 3.398 + * the commBundle. 3.399 + *Thinking this is how I want to go: use a commBundle on both shared and 3.400 + * distr memory. Reason is that it makes scheduling simple. And, there's 3.401 + * enough semantic information provided by the function calls that a static 3.402 + * tool should be able to perform a transform that does direct access to the 3.403 + * DKUPiece by migrating the gather code into the Kernel innards. The 3.404 + * index into the commBundle is what identifies the gather statement that 3.405 + * put the data there. That gather statement is inserted in place of the 3.406 + * access to the array. 3.407 + */ 3.408 + 3.409 +/*Have to provide boundary-propendents, so pieces that are on the edge of 3.410 + * a mesh, for example, access the boundary-propendent, with whatever 3.411 + * boundary data.
3.412 + *Also, have to provide a "time zero" something or other.. The thing it 3.413 + * solves is that one Kernel produces data at the end of the inner loop, 3.414 + * with a sendToDependent() call.. that data is received BEFORE the inner 3.415 + * loop with a getFromPropendent call. This is normally fine, as it encodes 3.416 + * the time-skew, implicitly pipelining. The only problems are at time 3.417 + * zero and possibly the very last time-step. 3.418 + *So, the initializePropendents() sets some default time-zero state that 3.419 + * will be gotten by the first getFromPropendent() call. 3.420 + *When the Kernel is done, the finalizeDependents() call handles whatever 3.421 + * might need to be done with the data from the last sendToDependents() 3.422 + * call that was performed in the Kernel. 3.423 + */ 3.424 + 3.425 +/*Q: How implement comm inside scheduler? 3.426 + * syntax proposal for performing comm from inside the Kernel is: 3.427 + * getFromPropendent( piece->propPieceIDs[ NORTH_PROPENDENT ]); 3.428 + * 3.429 + * So.. what is stored in that array? That is the thing that tells the 3.430 + * scheduler how to perform the communication.. 3.431 + *Thinking leave it opaque.. it's a void *.. The scheduler fills it in 3.432 + * itself inside of BLIS_DKU__makeDKUPiece().. the divider gets this thing 3.433 + * out of the piece and places it into the propPieceIDs array. 3.434 + * 3.435 + *This means that ask the scheduler to create a CommKernel, and ask it to 3.436 + * give the thingie need to communicate with it. 3.437 + *Also ask the scheduler to create a natural-boundary piece, and 3.438 + */
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 4.2 +++ b/DKU_INST_MM/DKU_INST_MM.h Sun Aug 26 03:04:50 2012 -0700 4.3 @@ -0,0 +1,54 @@ 4.4 +/* 4.5 + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org 4.6 + * Licensed under GNU General Public License version 2 4.7 + * 4.8 + * Author: seanhalle@yahoo.com 4.9 + * 4.10 + */ 4.11 + 4.12 +#include "../../BLIS/DKU/DKU_common/DKU.h" 4.13 + 4.14 +#include "../BLIS_CONSTANTS.h" 4.15 + 4.16 +#ifndef _DKU_INST_MM_H 4.17 +#define _DKU_INST_MM_H 4.18 + 4.19 + 4.20 +//=========================================================================== 4.21 +// Declarations of the Standard DKU functions 4.22 + 4.23 +void DKU_INST_MM_Init(); //tells BLIS the pointers to the DKU functions 4.24 + 4.25 +Divide divide_MM; 4.26 +Kernel kernel_MM; 4.27 +Undivide unDividePiece_MM; 4.28 + 4.29 +MakeRootDKUPieces makeRootDKUPieces_MM; 4.30 + 4.31 +SerialKernel serialKernel_MM; 4.32 + 4.33 +BundleInputs bundleInputs_MM; 4.34 +UnbundleInputs unbundleInputs_MM; 4.35 +BundleResults bundleResults_MM; 4.36 +UnbundleResults unbundleResults_MM; 4.37 + 4.38 +//=========================================================================== 4.39 +// 4.40 +#include "../Matrix_Mult.h" 4.41 + 4.42 + void 4.43 +inner_Kernel( MatrixProdPiece *matrixProdPiece ); 4.44 + 4.45 + Matrix * 4.46 +DKU__makeMatrix_Flat( int32 numRows, int32 numCols, DKUPiece *owner ); 4.47 + 4.48 + MatrixProdPiece * 4.49 +DKU__makeMatrixProdPiece_Flat( DKUPiece *owner ); 4.50 + 4.51 + MatrixProdPiece * 4.52 +DKU__makeMatrixProdPiece_FromMatrixProdPiece 4.53 + ( MatrixProdPiece *parentPiece, DKUPiece *owner ); 4.54 + 4.55 + 4.56 +#endif /* _DKU_INST_MM_H */ 4.57 +
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 5.2 +++ b/DKU_INST_MM/DKU_INST_MM_init.c Sun Aug 26 03:04:50 2012 -0700 5.3 @@ -0,0 +1,58 @@ 5.4 +/* 5.5 + * Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org 5.6 + * Licensed under GNU General Public License version 2 5.7 + * 5.8 + * 5.9 + * Author: SeanHalle@yahoo.com 5.10 + */ 5.11 + 5.12 +#include "DKU_INST_MM.h" 5.13 + 5.14 + 5.15 +/* 5.16 + * Part of the BLIS DKU standard. Each DKU instance is placed in its own 5.17 + * directory, a child of the Application directory. The directory is named 5.18 + * the same as the "#define" constant used to identify the DKU instance. 5.19 + * The directory has two standard files: "#def const".h and 5.20 + * "#def const"_init.c. 5.21 + * 5.22 + * This is the "#def const"_init.c file.. 5.23 + * It initializes the scheduler for this DKU instance. 5.24 + * And it tells that scheduler the pointers to all of the DKU functions for 5.25 + * this instance. 5.26 + * 5.27 + */ 5.28 +void DKU_INST_MM_Init( ) 5.29 + { // always start init of a DKU instance with this function 5.30 + // this isn't modal, so no worries about order of calling these fn 5.31 + // Can intertwine init of several instances without harm. 5.32 + BLIS_DKU__start_DKU_Instance_Init( DKU_INST_MM ); 5.33 + 5.34 + BLIS_DKU__set_Divide_To_ForID( ÷_MM, DKU_INST_MM ); 5.35 + BLIS_DKU__set_Kernel_To_ForID( &kernel_MM, DKU_INST_MM ); 5.36 + BLIS_DKU__set_Undivide_To_ForID( &unDividePiece_MM, DKU_INST_MM ); 5.37 + 5.38 + //TODO: figure out make and free -- right depth? Where used? 5.39 + // make sure don't accidentally free the shared Matrix strucs.. 5.40 + // trace where used, see if can find easy-to-see-pattern pattern for how 5.41 + // to do make and free -- DKUPiece automated stuff and whatnot.. 5.42 + // maybe just let the App see DKUPiece, and make make and free be done 5.43 + // explicitly in the pieceMaker, divider, undivider, etc.. 
no automation 5.44 + BLIS_DKU__set_MakeRootDKUPieces_To_ForID(&makeRootDKUPieces_MM,DKU_INST_MM); 5.45 +// BLIS_DKU__set_FreeAppSpecSubPiece_To_ForID(&freeMatrixProdPiece_Flat,DKU_INST_MM); 5.46 +// BLIS_DKU__set_FreeAppSpecRootPiece_To_ForID(&freeMatrixProdPiece_Flat,DKU_INST_MM); 5.47 +// BLIS_DKU__set_MakeAppSpecPiece_To_ForID(&makeMatrixProdPiece_Using,DKU_INST_MM); 5.48 + BLIS_DKU__set_SerialKernel_To_ForID( &serialKernel_MM, DKU_INST_MM ); 5.49 + 5.50 + BLIS_DKU__set_BundleInputs_To_ForID( &bundleInputs_MM, DKU_INST_MM ); 5.51 + BLIS_DKU__set_UnbundleInputs_To_ForID(&unbundleInputs_MM,DKU_INST_MM); 5.52 + BLIS_DKU__set_BundleResults_To_ForID( &bundleResults_MM, DKU_INST_MM); 5.53 + BLIS_DKU__set_UnbundleResults_To_ForID( &unbundleResults_MM,DKU_INST_MM ); 5.54 + 5.55 + //Now that have generic, provide HW-specific overrides 5.56 +/* BLIS_DKU__set_A_Divide_Override_To_ForID() 5.57 + BLIS_DKU__set_A_DKU_Override_To_ForID( ) 5.58 +*/ 5.59 + // always end init of a DKU instance with this function 5.60 + BLIS_DKU__end_DKU_Instance_Init( DKU_INST_MM ); 5.61 + }
6.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 6.2 +++ b/DKU_INST_MM/Divide.c Sun Aug 26 03:04:50 2012 -0700 6.3 @@ -0,0 +1,295 @@ 6.4 +/* 6.5 + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org 6.6 + * Licensed under GNU General Public License version 2 6.7 + * 6.8 + * Author: seanhalle@yahoo.com 6.9 + * 6.10 + */ 6.11 + 6.12 +#include <math.h> 6.13 + 6.14 +#include "DKU_INST_MM.h" 6.15 + 6.16 + 6.17 +typedef 6.18 +struct 6.19 + { 6.20 + //inputs 6.21 + int numLeftRows; 6.22 + int numRightCols; 6.23 + int numSubPiecesToMake; 6.24 + DKUPiece *parPiece; 6.25 + 6.26 + //outputs 6.27 + int numLeftSlices; 6.28 + int numRightSlices; 6.29 + int finalNumSubPieces; 6.30 + 6.31 + //outputs then inputs 6.32 + int *leftSliceStartRows; 6.33 + int *rightSliceStartCols; 6.34 + } 6.35 +SliceStruc; 6.36 + 6.37 +void updateSlicingStrucWithSlicingOfInputMatrices(SliceStruc *slicingStruc); 6.38 + 6.39 +void pairSlicesAndMakeProdPieces(SliceStruc *slicingStruc, DKUPiece *oPiece); 6.40 +MatrixProdPiece* makeChildMatrixProdPieceFrom( MatrixProdPiece *parentPiece); 6.41 + 6.42 + 6.43 +/* The Divider 6.44 + * Divides the iteration space.. 6.45 + * Matrix Product Piece is a piece of iteration 6.46 + * space.. it is the iterations in which one piece of the left 6.47 + * matrix is multiplied by one piece of the right matrix. 6.48 + * Thus, to make product pieces, both the left and right matrices 6.49 + * have to be sliced, then all pairings of those slices taken. 6.50 + * Each pairing on matrix slices is one product piece. 6.51 + * 6.52 + * So, for example, dividing a product-piece into, say, 4 pieces means 6.53 + * dividing the iteration space of the parent piece into 4, 6.54 + * which means slicing the left matrix by 2, and the right matrix by 2, 6.55 + * and talking all pairings of a left slice times a right slice. 6.56 + * 6.57 + * Do division this way: 6.58 + * count total number of result cells to be produced. Divide that by the 6.59 + * number of sub-pieces to make. 
That gives the target number of result 6.60 + * cells in each sub-piece. 6.61 + * Take the square root of the number of target cells, that's the target 6.62 + * number of cols, and target num rows in the result of each sub-piece. 6.63 + * See how many sqroots fit horizontally, and how many fit vertically 6.64 + * take the ceiling of the larger, floor of the smaller. 6.65 + * That is the number of rows, and number of cols, in each sub-piece. 6.66 + * Multiply the two to find the number of sub-pieces that will be made. 6.67 + * If not larger than requested number of sub-pieces, take the ceiling of 6.68 + * both num rows and num cols (also covers case of floor is zero). 6.69 + * Divide num rows in left matrix by target num rows, then use residuals 6.70 + * alg. to assign ranges of left matrix rows to each slice. 6.71 + * Divide num cols in right by target num cols, then use residuals alg. 6.72 + * to assign ranges of right matrix cols to each slice. 6.73 + * Then take all pairings of left-slices with right-slices and make a 6.74 + * sub-piece from each pairing 6.75 + * 6.76 + * @param numPieces 6.77 + */ 6.78 +#define SLICE_SCOPE 1 6.79 +void divide_MM( DKUPiece* oPiece, int numSubPiecesToMake ) 6.80 + { MatrixProdPiece *matrixProdPiece; 6.81 + Matrix *leftMatrix, *rightMatrix; 6.82 + 6.83 + matrixProdPiece = (MatrixProdPiece *) oPiece->appSpecificPiece; 6.84 + 6.85 + int leftStartRow, leftEndRow, rightStartCol, rightEndCol; 6.86 + leftStartRow = matrixProdPiece->leftStartRow; 6.87 + leftEndRow = matrixProdPiece->leftEndRow; 6.88 + rightStartCol = matrixProdPiece->rightStartCol; 6.89 + rightEndCol = matrixProdPiece->rightEndCol; 6.90 + leftMatrix = matrixProdPiece->leftMatrix; 6.91 + rightMatrix = matrixProdPiece->rightMatrix; 6.92 + 6.93 + int numLeftRows, numRightCols; 6.94 + numLeftRows = leftEndRow - leftStartRow + 1; //+1 cause starts at zero 6.95 + numRightCols = rightEndCol - rightStartCol + 1; 6.96 
+//============================================================== 6.97 + 6.98 + //need numPieces to be divided into two integers 6.99 + // if each result piece is a square, that gives the best surface 6.100 + // area to volume == the least communication in distr memory, and 6.101 + // the least multiple access by different threads in shared memory. 6.102 + //So, just have to figure out the sizes of the horizontal and the 6.103 + // vertical. 6.104 + 6.105 + // make sure its possible to make more than 1 sub-piece 6.106 + if( numSubPiecesToMake < 2 || ( numLeftRows < 2 && numRightCols < 2 ) ) 6.107 + { oPiece->numSubPieces = 0; //scheduler must check for case of 0 pieces 6.108 + return; 6.109 + } 6.110 + 6.111 + //TODO: scope is the func called within (divide_MM) 6.112 + SliceStruc *slicingStruc = 6.113 + BLIS_DKU__malloc_scope( sizeof(SliceStruc), SLICE_SCOPE ); 6.114 + slicingStruc->numLeftRows = numLeftRows; 6.115 + slicingStruc->numRightCols = numRightCols; 6.116 + slicingStruc->numSubPiecesToMake = numSubPiecesToMake; 6.117 + slicingStruc->parPiece = oPiece; 6.118 + 6.119 + updateSlicingStrucWithSlicingOfInputMatrices( slicingStruc ); 6.120 + 6.121 + oPiece->numSubPieces = slicingStruc->finalNumSubPieces; 6.122 + if( oPiece->numSubPieces == 0 ) return; 6.123 + 6.124 + // pair up the slices and make the final DKUPieces 6.125 + pairSlicesAndMakeProdPieces( slicingStruc, oPiece ); 6.126 + 6.127 + BLIS_DKU__free_scope( SLICE_SCOPE ); 6.128 + return; 6.129 + } 6.130 + 6.131 +void updateSlicingStrucWithSlicingOfInputMatrices(SliceStruc *slicingStruc) 6.132 + { 6.133 +//======================= Setup ======================== 6.134 + int numLeftRows = slicingStruc->numLeftRows; 6.135 + int numRightCols = slicingStruc->numRightCols; 6.136 + int numSubPiecesToMake = slicingStruc->numSubPiecesToMake; 6.137 + 6.138 + int numResultCells; 6.139 + float targetNumCellsPerPiece, targetDimOfResultPiece; 6.140 + float idealNumLeftSlices, targetNumLeftSlices; 6.141 + float 
idealNumRightSlices, targetNumRightSlices; 6.142 + float targetRowsPerLeftSlice, targetColsPerRightSlice; 6.143 + 6.144 + float rowAccumulator, colAccumulator; 6.145 + int sliceIdx, rowIncrement, colIncrement; 6.146 + int numLeftSlices, numRightSlices; 6.147 + 6.148 + int leftStartRow, leftEndRow, rightStartCol, rightEndCol; 6.149 + MatrixProdPiece *parentProdPiece=slicingStruc->parPiece->appSpecificPiece; 6.150 + leftStartRow = parentProdPiece->leftStartRow; 6.151 + leftEndRow = parentProdPiece->leftEndRow; 6.152 + rightStartCol = parentProdPiece->rightStartCol; 6.153 + rightEndCol = parentProdPiece->rightEndCol; 6.154 + 6.155 +//======================= Calc num Slices ======================== 6.156 + //Calc the closest can reasonably get to square 6.157 + //TODO: check that math works right: dividing int by float, need cast? 6.158 + numResultCells = numLeftRows * numRightCols; 6.159 + targetNumCellsPerPiece = numResultCells / numSubPiecesToMake; 6.160 + targetDimOfResultPiece = sqrt( targetNumCellsPerPiece ); 6.161 + idealNumLeftSlices = numLeftRows / targetDimOfResultPiece; 6.162 + idealNumRightSlices = numRightCols / targetDimOfResultPiece; 6.163 + 6.164 + //Now, product of rows * cols should stay close to numPieces, but 6.165 + // have to make num rows an int, and num cols an int 6.166 + // means will drop fractional part from larger, then add the number 6.167 + // of pieces that fractional part represents back on to the smaller 6.168 + // then round to the nearest integer. The resulting product should 6.169 + // still be close to numPieces. 6.170 + if( idealNumRightSlices > idealNumLeftSlices ) 6.171 + { float diff, numPiecesCut; 6.172 + //TODO: find floor and "closest int" in C math library.. how use? 
6.173 + targetNumRightSlices = floor( idealNumRightSlices ); 6.174 + diff = idealNumRightSlices - targetNumRightSlices; 6.175 + numPiecesCut = diff * idealNumLeftSlices; 6.176 + idealNumLeftSlices += numPiecesCut / targetNumRightSlices; 6.177 + targetNumLeftSlices = rint( idealNumLeftSlices ); 6.178 + } 6.179 + else 6.180 + { float diff, numPiecesCut; 6.181 + targetNumLeftSlices = floor( idealNumLeftSlices ); 6.182 + diff = idealNumLeftSlices - targetNumLeftSlices; 6.183 + numPiecesCut = diff * idealNumRightSlices; 6.184 + idealNumRightSlices += numPiecesCut / targetNumLeftSlices; 6.185 + targetNumRightSlices = rint( idealNumRightSlices ); 6.186 + } 6.187 + targetRowsPerLeftSlice = numLeftRows / targetNumLeftSlices; 6.188 + targetColsPerRightSlice = numRightCols / targetNumRightSlices; 6.189 + 6.190 + //allocate size of worst case + safety 6.191 + int size = sizeof(int) * (numSubPiecesToMake + 2); 6.192 + int *leftSliceStartRows, *rightSliceStartCols; 6.193 + //TODO: "FUNC" is not quite the right scope.. 
slicingStruc is right scope 6.194 + leftSliceStartRows = BLIS_DKU__malloc_scope( size, SLICE_SCOPE ); 6.195 + rightSliceStartCols = BLIS_DKU__malloc_scope( size, SLICE_SCOPE ); 6.196 + slicingStruc->leftSliceStartRows = leftSliceStartRows; 6.197 + slicingStruc->rightSliceStartCols = rightSliceStartCols; 6.198 + 6.199 +//======================= Slice Left Matrix ======================== 6.200 + //fix for case only 1 row, when leftStartRow == leftEndRow 6.201 + leftSliceStartRows[ 0 ] = leftStartRow; 6.202 + sliceIdx = 0; 6.203 + rowAccumulator = 0; 6.204 + int row; 6.205 + for( row = leftStartRow; row < leftEndRow; row += rowIncrement ) 6.206 + { 6.207 + leftSliceStartRows[ sliceIdx ] = row; 6.208 + 6.209 + rowAccumulator += targetRowsPerLeftSlice; 6.210 + rowIncrement = (int) rowAccumulator; 6.211 + if( rowIncrement == 0 ) rowIncrement = 1;//apply at end curr iter 6.212 + rowAccumulator -= rowIncrement; 6.213 + if( rowAccumulator < 0 ) rowAccumulator = 0; 6.214 + sliceIdx += 1; 6.215 + } 6.216 + if( sliceIdx == 0 ) sliceIdx = 1; //case when only 1 row 6.217 + numLeftSlices = sliceIdx; 6.218 + leftSliceStartRows[ sliceIdx ] = leftEndRow + 1; //use extra slot 6.219 + 6.220 + sliceIdx = 0; colAccumulator = 0; colIncrement = 0; 6.221 + 6.222 +//======================= Slice Right Matrix ======================== 6.223 + rightSliceStartCols[ 0 ] = rightStartCol; //in case only 1 col 6.224 + int col; 6.225 + for( col = rightStartCol; col < rightEndCol; col += colIncrement ) 6.226 + { 6.227 + rightSliceStartCols[ sliceIdx ] = col; 6.228 + 6.229 + colAccumulator += targetColsPerRightSlice; 6.230 + colIncrement = (int) colAccumulator; 6.231 + if( colIncrement == 0 ) colIncrement = 1;//apply at end curr iter 6.232 + colAccumulator -= colIncrement; 6.233 + if( colAccumulator < 0 ) colAccumulator = 0; 6.234 + sliceIdx += 1; 6.235 + } 6.236 + if( sliceIdx == 0 ) sliceIdx = 1; //case when only 1 col 6.237 + numRightSlices = sliceIdx; 6.238 + rightSliceStartCols[ sliceIdx ] = 
rightEndCol + 1; 6.239 + 6.240 + slicingStruc->numLeftSlices = numLeftSlices; 6.241 + slicingStruc->numRightSlices = numRightSlices; 6.242 + slicingStruc->finalNumSubPieces = numLeftSlices * numRightSlices; 6.243 + return; 6.244 + } 6.245 + 6.246 + 6.247 +void pairSlicesAndMakeProdPieces(SliceStruc *slicingStruc, DKUPiece *oPiece) 6.248 + { DKUPiece* *subPiecesArray; 6.249 + MatrixProdPiece *newProdPiece; 6.250 + MatrixProdPiece *parentMatPiece =slicingStruc->parPiece->appSpecificPiece; 6.251 + int newPiecePos = 0; 6.252 + int leftSliceStartRow, leftSliceEndRow; 6.253 + int rightSliceStartCol, rightSliceEndCol; 6.254 + int numLeftSlices = slicingStruc->numLeftSlices; 6.255 + int numRightSlices = slicingStruc->numRightSlices; 6.256 + int *leftSliceStartRows = slicingStruc->leftSliceStartRows; 6.257 + int *rightSliceStartCols = slicingStruc->rightSliceStartCols; 6.258 + 6.259 + int size = slicingStruc->finalNumSubPieces * sizeof(MatrixProdPiece *); 6.260 + subPiecesArray = BLIS_DKU__malloc_toPiece( size, oPiece ); 6.261 + oPiece->subPiecesArray = subPiecesArray; 6.262 + 6.263 + int rightSliceNum, leftSliceNum; 6.264 + for( rightSliceNum = 0; rightSliceNum < numRightSlices; rightSliceNum++ ) 6.265 + { DKUPiece *newDKUPiece; 6.266 + rightSliceStartCol = rightSliceStartCols[ rightSliceNum ]; 6.267 + rightSliceEndCol = rightSliceStartCols[ rightSliceNum + 1 ] - 1; 6.268 + for( leftSliceNum = 0; leftSliceNum < numLeftSlices; leftSliceNum++ ) 6.269 + { newDKUPiece = BLIS_DKU__makeDKUPiece_FromDivider( DKU_INST_MM ); 6.270 + newProdPiece = DKU__makeMatrixProdPiece_FromMatrixProdPiece 6.271 + ( parentMatPiece, newDKUPiece ); 6.272 + newDKUPiece->appSpecificPiece = newProdPiece; 6.273 + 6.274 + leftSliceStartRow = leftSliceStartRows[ leftSliceNum ]; 6.275 + leftSliceEndRow = leftSliceStartRows[ leftSliceNum + 1 ] - 1; 6.276 + 6.277 + newProdPiece->leftStartRow = leftSliceStartRow; 6.278 + newProdPiece->leftEndRow = leftSliceEndRow; 6.279 + newProdPiece->leftStartCol = 
parentMatPiece->leftStartCol; 6.280 + newProdPiece->leftEndCol = parentMatPiece->leftEndCol; 6.281 + 6.282 + newProdPiece->rightStartRow = parentMatPiece->rightStartRow; 6.283 + newProdPiece->rightEndRow = parentMatPiece->rightEndRow; 6.284 + newProdPiece->rightStartCol = rightSliceStartCol; 6.285 + newProdPiece->rightEndCol = rightSliceEndCol; 6.286 + 6.287 + newProdPiece->prodStartRow = leftSliceStartRow; 6.288 + newProdPiece->prodEndRow = leftSliceEndRow; 6.289 + newProdPiece->prodStartCol = rightSliceStartCol; 6.290 + newProdPiece->prodEndCol = rightSliceEndCol; 6.291 + 6.292 + subPiecesArray[ newPiecePos ] = newDKUPiece; 6.293 + newPiecePos += 1; 6.294 + } 6.295 + }//for(int rightSliceNum = 0; rightSliceNum < 6.296 + } 6.297 + 6.298 +//===========================================================================
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 7.2 +++ b/DKU_INST_MM/Kernel.c Sun Aug 26 03:04:50 2012 -0700 7.3 @@ -0,0 +1,80 @@ 7.4 +/* 7.5 + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org 7.6 + * Licensed under GNU General Public License version 2 7.7 + * 7.8 + * Author: SeanHalle@yahoo.com 7.9 + * 7.10 + */ 7.11 + 7.12 +#include "DKU_INST_MM.h" 7.13 + 7.14 + 7.15 +/* Kernel 7.16 + * 7.17 + * Computes the passed-in piece's portion of the iteration space. 7.18 + * A DKUPiece struc is handed to it. 7.19 + * The DKUPiece struct has a pointer to "app specific info". This 7.20 + * is some application-specific structure that the Scheduler doesn't 7.21 + * know the details of. The Scheduler can pass the app-specific 7.22 + * data around, and that's all the Scheduler needs to do with it. 7.23 + * This kernel, however, is written by the app programmer, and does 7.24 + * know the internals of "app specific info". 7.25 + * 7.26 + * DKU Std: data that is accessed inside a Kernel is passed inside 7.27 + * a DKUPiece. Kernel's not allowed to access data via the language's 7.28 + * native scoping rules, not even constants. Thus, pointers to 7.29 + * arrays or to a tree's root nodes, or so forth are carried inside 7.30 + * the DKUPiece. 7.31 + * Note, kernel code is re-entrant, so the rules of re-entrant code apply 7.32 + * 7.33 + * 7.34 + * This is the standard Matrix Multiply loop nest. The only modification 7.35 + * for DKU is that it takes loop bounds out of the DKUPiece. 7.36 + */ 7.37 +void kernel_MM( DKUPiece *pieceToProcess ) 7.38 + { inner_Kernel( (MatrixProdPiece *) pieceToProcess->appSpecificPiece ); 7.39 + } 7.40 + 7.41 +/* Separated out the calculations of the Kernel so could re-use as the 7.42 + * serial kernel. 
7.43 + */ 7.44 +void inner_Kernel( MatrixProdPiece *matrixProdPiece ) 7.45 + { int32 leftStartRow, leftEndRow, rightStartCol, rightEndCol; 7.46 + int32 leftStartCol, rightStartRow; 7.47 + int32 numLeftCols, numRightCols, numResMatCols; 7.48 + int32 row, col, vectorSize, i; 7.49 + float32 *leftMatrix, *rightMatrix, *resultMatrix; 7.50 + float32 *leftStartPt, *leftReadPt, *rightStartPt, *rightReadPt; 7.51 + 7.52 + leftMatrix = matrixProdPiece->leftMatrix->matrix; 7.53 + rightMatrix = matrixProdPiece->rightMatrix->matrix; 7.54 + resultMatrix = matrixProdPiece->resultMatrix->matrix; 7.55 + 7.56 + leftStartRow = matrixProdPiece->leftStartRow; 7.57 + leftEndRow = matrixProdPiece->leftEndRow; 7.58 + rightStartCol = matrixProdPiece->rightStartCol; 7.59 + rightEndCol = matrixProdPiece->rightEndCol; 7.60 + numResMatCols = matrixProdPiece->resultMatrix->numCols; 7.61 + rightStartRow = matrixProdPiece->rightStartRow; 7.62 + leftStartCol = matrixProdPiece->leftStartCol; 7.63 + numRightCols = matrixProdPiece->rightMatrix->numCols; 7.64 + numLeftCols = matrixProdPiece->leftMatrix->numCols; 7.65 + 7.66 + vectorSize =matrixProdPiece->leftEndCol - matrixProdPiece->leftStartCol+1; 7.67 + for( row = leftStartRow; row <= leftEndRow; row++ ) 7.68 + { leftStartPt = leftMatrix + row * numLeftCols + leftStartCol; 7.69 + for( col = rightStartCol; col <= rightEndCol; col++ ) 7.70 + { float32 sum = 0; 7.71 + 7.72 + rightStartPt = rightMatrix + rightStartRow * numRightCols + col; 7.73 + 7.74 + leftReadPt = leftStartPt; 7.75 + rightReadPt = rightStartPt; 7.76 + for( i = 0; i < vectorSize; i++) 7.77 + { sum += *(leftReadPt++) * *rightReadPt; 7.78 + rightReadPt += numRightCols; 7.79 + } 7.80 + *(resultMatrix + row * numResMatCols + col) = sum; 7.81 + } 7.82 + } 7.83 + }
8.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 8.2 +++ b/DKU_INST_MM/MakeRootDKUPieces.c Sun Aug 26 03:04:50 2012 -0700 8.3 @@ -0,0 +1,53 @@ 8.4 +/* 8.5 + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org 8.6 + * Licensed under GNU General Public License version 2 8.7 + * 8.8 + * Author: seanhalle@yahoo.com 8.9 + */ 8.10 + 8.11 +#include "DKU_INST_MM.h" 8.12 + 8.13 +/* This is what wraps application data inside a DKUPiece. 8.14 + * It also encodes the dependencies within the data, by making several 8.15 + * DKUPieces, which must be executed in sequence, in order to respect the 8.16 + * dependencies. 8.17 + * It tries to make the pieces as large as possible, to maximize the 8.18 + * available parallelism. 8.19 + * 8.20 + * It allocates space for and fills an array of pointers to the pieces, 8.21 + * then returns how many pieces it put into the array. 8.22 + * 8.23 + * This function allocates the array of pointers to DKUPieces, plus the 8.24 + * DKUPieces, plus all sub-structures inside a DKUPiece. This is an 8.25 + * application-supplied function, so it knows all the app-specific sub- 8.26 + * structures inside a DKUPiece. However, it is called by the scheduler 8.27 + * so it has to have a fixed prototype. 8.28 + * The scheduler calls this function, so it is up to the scheduler to free 8.29 + * all the structures this function has allocated. To do this, a second 8.30 + * function is supplied that performs the free s. 8.31 + * 8.32 + * DKUPieceMaker returns a RootDKUPieces data structure, which contains the 8.33 + * array of DKUPieces that have to be executed in sequence, to preserve 8.34 + * the dependencies, and contains the number of such DKUPieces. 8.35 + * Each call to makeDKUPieces mallocs space for the pieces, plus the 8.36 + * root pieces array and the root pieces struc. This memory is freed by 8.37 + * the standard function BLIS_DKU__cleanupRootPieces, which is called by the 8.38 + * scheduler. 
8.39 + * 8.40 + * For Matrix Multiply, the original data is a MatrixProdPair, which is 8.41 + * perfectly dividable, so just have to wrap that inside a DKUPiece, and 8.42 + * return it inside a RootDKUPieces data structure. 8.43 + */ 8.44 +RootDKUPieces * makeRootDKUPieces_MM( void *origData ) 8.45 + { RootDKUPieces * rootDKUPieces; 8.46 + DKUPiece *piece; 8.47 + 8.48 + piece = BLIS_DKU__makeDKUPiece_FromMaker( DKU_INST_MM ); 8.49 + piece->appSpecificPiece = origData; 8.50 + 8.51 + int numRootPieces = 1; 8.52 + rootDKUPieces = BLIS_DKU__makeRootDKUPiecesStruc( numRootPieces ); 8.53 + rootDKUPieces->rootPiecesArray[0] = piece; 8.54 + 8.55 + return rootDKUPieces; 8.56 + }
9.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 9.2 +++ b/DKU_INST_MM/Maker_and_Freer.c Sun Aug 26 03:04:50 2012 -0700 9.3 @@ -0,0 +1,73 @@ 9.4 +/* 9.5 + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org 9.6 + * Licensed under GNU General Public License version 2 9.7 + * 9.8 + * Author: seanhalle@yahoo.com 9.9 + * 9.10 + * Created on November 15, 2009, 2:35 AM 9.11 + */ 9.12 + 9.13 +#include <malloc.h> 9.14 + 9.15 +#include "DKU_INST_MM.h" 9.16 + 9.17 + 9.18 +//========== Makers and Free-ers for use ONLY within DKU_INST_MM =========== 9.19 +// 9.20 + 9.21 +/*In the "_Flat" version of constructor, do only malloc of the top data struc 9.22 + * and set values in that top-level. Don't malloc any sub-structures. 9.23 + * 9.24 + *Used in BundlingQuad in unbundleInputs 9.25 + */ 9.26 + Matrix * 9.27 +DKU__makeMatrix_Flat( int32 numRows, int32 numCols, DKUPiece *owner ) 9.28 + { Matrix * retMatrix; 9.29 + retMatrix = BLIS_DKU__malloc_toPiece( sizeof( Matrix ), owner ); 9.30 + retMatrix->numRows = numRows; 9.31 + retMatrix->numCols = numCols; 9.32 + 9.33 + return retMatrix; 9.34 + } 9.35 + 9.36 +/* Used In BundlingQuad in unbundleInputs */ 9.37 + MatrixProdPiece * 9.38 +DKU__makeMatrixProdPiece_Flat( DKUPiece *owner ) 9.39 + { return BLIS_DKU__malloc_toPiece( sizeof(MatrixProdPiece), owner ); 9.40 + } 9.41 + 9.42 + 9.43 + 9.44 +/* Used in Divider */ 9.45 + MatrixProdPiece * 9.46 +DKU__makeMatrixProdPiece_FromMatrixProdPiece 9.47 + ( MatrixProdPiece *parentPiece, DKUPiece *owner ) 9.48 + { MatrixProdPiece *newPiece; 9.49 + Matrix *leftMatrix = parentPiece->leftMatrix; 9.50 + Matrix *rightMatrix = parentPiece->rightMatrix; 9.51 + 9.52 + newPiece = DKU__makeMatrixProdPiece_Flat( owner ); 9.53 + 9.54 + newPiece->leftMatrix = leftMatrix; 9.55 + newPiece->rightMatrix = rightMatrix; 9.56 + 9.57 + newPiece->leftStartRow = 0; 9.58 + newPiece->leftEndRow = leftMatrix->numRows - 1; 9.59 + newPiece->leftStartCol = 0; 9.60 + newPiece->leftEndCol = leftMatrix->numCols - 1; 9.61 + 
9.62 + newPiece->rightStartRow = 0; 9.63 + newPiece->rightEndRow = rightMatrix->numRows - 1; 9.64 + newPiece->rightStartCol = 0; 9.65 + newPiece->rightEndCol = rightMatrix->numCols - 1; 9.66 + 9.67 + newPiece->prodStartRow = newPiece->leftStartRow; 9.68 + newPiece->prodEndRow = newPiece->leftEndRow; 9.69 + newPiece->prodStartCol = newPiece->rightStartCol; 9.70 + newPiece->prodEndCol = newPiece->rightEndCol; 9.71 + 9.72 + newPiece->resultMatrix = parentPiece->resultMatrix; 9.73 + 9.74 + return newPiece; 9.75 + } 9.76 +
10.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 10.2 +++ b/DKU_INST_MM/SerialKernel.c Sun Aug 26 03:04:50 2012 -0700 10.3 @@ -0,0 +1,21 @@ 10.4 +/* 10.5 + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org 10.6 + * Licensed under GNU General Public License version 2 10.7 + * 10.8 + * Author: seanhalle@yahoo.com 10.9 + * 10.10 + */ 10.11 + 10.12 +#include "DKU_INST_MM.h" 10.13 + 10.14 +/*The scheduler calls this when there is no benefit from 10.15 + * parallel execution of the orig data. 10.16 + * 10.17 + *For MM, the original data is already in the data structure that's inside 10.18 + * a DKUPiece, so just cast the oridData and call the Kernel on it 10.19 + */ 10.20 +void serialKernel_MM( void * origData ) 10.21 + { 10.22 + inner_Kernel( (MatrixProdPiece *) origData ); 10.23 + } 10.24 +
11.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 11.2 +++ b/DKU_INST_MM/Undivide.c Sun Aug 26 03:04:50 2012 -0700 11.3 @@ -0,0 +1,34 @@ 11.4 +/* 11.5 + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org 11.6 + * Licensed under GNU General Public License version 2 11.7 + * 11.8 + * Author: seanhalle@yahoo.com 11.9 + * 11.10 + */ 11.11 + 11.12 +#include "DKU_INST_MM.h" 11.13 +#include <malloc.h> 11.14 + 11.15 + 11.16 +/* unDivider 11.17 + * 11.18 + * Only counts to make sure all the pieces are accounted for, and frees 11.19 + * the memory allocated to the finished DKUPiece structs and all sub- 11.20 + * structures. 11.21 + */ 11.22 +void unDividePiece_MM( DKUPiece *parentPiece, DKUPiece *piece ) 11.23 + { 11.24 + parentPiece->numSubPiecesUndivided += 1; 11.25 + 11.26 + //free mem allocated to no-longer needed subPiece 11.27 + BLIS_DKU__freeDKUPiece( piece ); 11.28 + } 11.29 + 11.30 +//TODO: figure out standard for doing this.. add pointer to func to inits, 11.31 +// so fixed freePiece func can have generic code to free app-specific 11.32 +// piece, and DKUInstance provides function to free that piece 11.33 +// maybe put the pointer into schedData or something.. then this 11.34 +// one call to freeDKUPiece can be generic, with HW-specific overlay 11.35 +// Issue is slowness of indirections into table and dereferencing Fn pointer 11.36 + 11.37 +
12.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 12.2 +++ b/Matrix_Mult.c Sun Aug 26 03:04:50 2012 -0700 12.3 @@ -0,0 +1,221 @@ 12.4 +/* 12.5 + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org 12.6 + * Licensed under GNU General Public License version 2 12.7 + * 12.8 + * Author: seanhalle@yahoo.com 12.9 + * 12.10 + * Created on November 15, 2009, 2:35 AM 12.11 + */ 12.12 + 12.13 +#include <malloc.h> 12.14 +#include <stdlib.h> 12.15 + 12.16 +#include "Matrix_Mult.h" 12.17 +#include "../BLIS/DKU/DKU_common/DKU.h" 12.18 + 12.19 +//======================== For Use OUTSIDE DKU instance ===================== 12.20 +/* 12.21 + *The DKU code-instance in DKU_INST_MM has its own set of makers and free-ers 12.22 + * that use BLIS_DKU__malloc. These are for use in application-code 12.23 + * outside DKU_INST_MM DKU-code-instance. 12.24 + */ 12.25 + 12.26 + 12.27 +/*In the "_Flat" version of constructor, do only malloc of the top data struc 12.28 + * and set values in that top-level. Don't malloc any sub-structures. 
12.29 + */ 12.30 + Matrix * 12.31 +makeMatrix_Flat( int32 numRows, int32 numCols ) 12.32 + { Matrix * retMatrix; 12.33 + retMatrix = malloc( sizeof( Matrix ) ); 12.34 + retMatrix->numRows = numRows; 12.35 + retMatrix->numCols = numCols; 12.36 + 12.37 + return retMatrix; 12.38 + } 12.39 + 12.40 + Matrix * 12.41 +makeMatrix_WithResMat( int32 numRows, int32 numCols ) 12.42 + { Matrix * retMatrix; 12.43 + retMatrix = malloc( sizeof( Matrix ) ); 12.44 + retMatrix->numRows = numRows; 12.45 + retMatrix->numCols = numCols; 12.46 + retMatrix->matrix = malloc( numRows * numCols * sizeof(float32) ); 12.47 + 12.48 + return retMatrix; 12.49 + } 12.50 + 12.51 + void 12.52 +freeMatrix_Flat( Matrix * matrix ) 12.53 + { //( matrix ); 12.54 + } 12.55 + void 12.56 +freeMatrix( Matrix * matrix ) 12.57 + { free( matrix->matrix ); 12.58 + free( matrix ); 12.59 + } 12.60 + 12.61 + MatrixProdPiece * 12.62 +makeMatrixProdPiece_Empty() 12.63 + { return malloc( sizeof(MatrixProdPiece) ); 12.64 + } 12.65 + 12.66 + 12.67 + MatrixProdPiece * 12.68 +makeMatrixProdPiece_Helper( Matrix *leftMatrix, Matrix *rightMatrix ) 12.69 + { MatrixProdPiece *newPiece; 12.70 + 12.71 + newPiece = makeMatrixProdPiece_Empty( ); 12.72 + 12.73 + newPiece->leftMatrix = leftMatrix; 12.74 + newPiece->rightMatrix = rightMatrix; 12.75 + 12.76 + newPiece->leftStartRow = 0; 12.77 + newPiece->leftEndRow = leftMatrix->numRows - 1; 12.78 + newPiece->leftStartCol = 0; 12.79 + newPiece->leftEndCol = leftMatrix->numCols - 1; 12.80 + 12.81 + newPiece->rightStartRow = 0; 12.82 + newPiece->rightEndRow = rightMatrix->numRows - 1; 12.83 + newPiece->rightStartCol = 0; 12.84 + newPiece->rightEndCol = rightMatrix->numCols - 1; 12.85 + 12.86 + newPiece->prodStartRow = newPiece->leftStartRow; 12.87 + newPiece->prodEndRow = newPiece->leftEndRow; 12.88 + newPiece->prodStartCol = newPiece->rightStartCol; 12.89 + newPiece->prodEndCol = newPiece->rightEndCol; 12.90 + 12.91 + return newPiece; 12.92 + } 12.93 + 12.94 + MatrixProdPiece * 12.95 
+makeMatrixProdPiece_FromMatrixProdPiece( MatrixProdPiece *parentPiece ) 12.96 + { MatrixProdPiece *newPiece; 12.97 + Matrix *leftMatrix = parentPiece->leftMatrix; 12.98 + Matrix *rightMatrix = parentPiece->rightMatrix; 12.99 + 12.100 + newPiece = makeMatrixProdPiece_Helper( leftMatrix, rightMatrix ); 12.101 + newPiece->resultMatrix = parentPiece->resultMatrix; 12.102 + 12.103 + return newPiece; 12.104 + } 12.105 + 12.106 + MatrixProdPiece * 12.107 +makeMatrixProdPiece_FromMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) 12.108 + { MatrixProdPiece *newPiece; 12.109 + 12.110 + newPiece = makeMatrixProdPiece_Helper( leftMatrix, rightMatrix ); 12.111 + newPiece->resultMatrix = 12.112 + makeMatrix_WithResMat( leftMatrix->numRows, rightMatrix->numCols ); 12.113 + 12.114 + return newPiece; 12.115 + } 12.116 + 12.117 + void 12.118 +freeMatrixProdPiece_Flat( MatrixProdPiece * piece ) 12.119 + { free( piece ); 12.120 + } 12.121 + 12.122 + void 12.123 +freeMatrixProdPiece( MatrixProdPiece * piece ) 12.124 + { //( piece->leftMatrix ); 12.125 + freeMatrix( piece->rightMatrix ); 12.126 + freeMatrix( piece->resultMatrix ); 12.127 + free( piece ); 12.128 + } 12.129 + 12.130 + 12.131 + void 12.132 +initialize_Input_Matrices_Via( Matrix **leftMatrix, Matrix **rightMatrix, 12.133 + ParamBag *paramBag ) 12.134 + { char *leftMatrixFileName, *rightMatrixFileName; 12.135 + int leftMatrixRows, leftMatrixCols, rightMatrixRows, rightMatrixCols; 12.136 + 12.137 + ParamStruc *param; 12.138 + param = getParamFromBag( "leftMatrixRows", paramBag ); 12.139 + leftMatrixRows = param->intValue; 12.140 + param = getParamFromBag( "leftMatrixCols", paramBag ); 12.141 + leftMatrixCols = param->intValue; 12.142 + *leftMatrix = makeMatrix_WithResMat( leftMatrixRows, leftMatrixCols ); 12.143 + 12.144 + param = getParamFromBag( "leftMatrixFileName", paramBag ); 12.145 + leftMatrixFileName = param->strValue; //no need to copy 12.146 + read_Matrix_From_File( *leftMatrix, leftMatrixFileName ); 12.147 + 
12.148 + param = getParamFromBag( "rightMatrixRows", paramBag ); 12.149 + rightMatrixRows = param->intValue; 12.150 + param = getParamFromBag( "rightMatrixCols", paramBag ); 12.151 + rightMatrixCols = param->intValue; 12.152 + *rightMatrix = makeMatrix_WithResMat( rightMatrixRows, rightMatrixCols ); 12.153 + 12.154 + param = getParamFromBag( "rightMatrixFileName", paramBag ); 12.155 + rightMatrixFileName = param->strValue; 12.156 + read_Matrix_From_File( *rightMatrix, rightMatrixFileName ); 12.157 + } 12.158 + 12.159 + 12.160 +void parseLineIntoRow( char *line, float32* row ); 12.161 + 12.162 + 12.163 + void 12.164 +read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ) 12.165 + { int row, maxRead, numRows, numCols; 12.166 + float32 *matrixStart; 12.167 + size_t lineSz = 0; 12.168 + FILE *file; 12.169 + char *line = NULL; 12.170 + 12.171 + lineSz = 50000; //max length of line in a matrix data file 12.172 + line = (char *) malloc( lineSz ); 12.173 + if( line == NULL ) BLIS_DKU__throwError( "no mem for matrix line" ); 12.174 + 12.175 + numRows = matrixStruc->numRows; 12.176 + numCols = matrixStruc->numCols; 12.177 + matrixStart = matrixStruc->matrix; 12.178 + 12.179 + printf("Matrix File Path: %s\n", matrixFileName);fflush(stdout); 12.180 + file = fopen( matrixFileName, "r" ); if(!file){printf("not open! %d\n",__LINE__); fflush(stdin);} 12.181 + fseek( file, 0, SEEK_SET ); 12.182 + for( row = 0; row < numRows; row++ ) 12.183 + { 12.184 + if( feof( file ) ) BLIS_DKU__throwError( "file ran out too soon" ); 12.185 + maxRead = getline( &line, &lineSz, file ); 12.186 + if( maxRead == -1 ) BLIS_DKU__throwError( "prob reading mat line"); 12.187 + 12.188 + if( *line == '\n') continue; //blank line 12.189 + if( *line == '/' ) continue; //comment line 12.190 + 12.191 + parseLineIntoRow( line, matrixStart + row * numCols ); 12.192 + } 12.193 + free( line ); 12.194 + } 12.195 + 12.196 +/*This function relies on each line having the proper number of cols. 
It 12.197 + * doesn't check, nor enforce, so if the file is improperly formatted it 12.198 + * can write over unrelated memory 12.199 + */ 12.200 + void 12.201 +parseLineIntoRow( char *line, float32* row ) 12.202 + { 12.203 + char *valueStr, *searchPos; 12.204 + 12.205 + //read the float values 12.206 + searchPos = valueStr = line; //start 12.207 + 12.208 + for( ; *searchPos != 0; searchPos++) //bit dangerous, should use buff len 12.209 + { 12.210 + if( *searchPos == '\n' ) //last col.. relying on well-formatted file 12.211 + { *searchPos = 0; 12.212 + *row = atof( valueStr ); 12.213 + break; //end FOR loop 12.214 + } 12.215 + if( *searchPos == ',' ) 12.216 + { *searchPos = 0; //mark end of string 12.217 + *row = (float32) atof( valueStr ); 12.218 + row += 1; //address arith 12.219 + //skip any spaces before digits.. use searchPos + 1 to skip the 0 12.220 + for( ; *(searchPos + 1)== ' ' && *(searchPos + 1) !=0; searchPos++); 12.221 + valueStr = searchPos + 1; 12.222 + } 12.223 + } 12.224 + }
13.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 13.2 +++ b/Matrix_Mult.h Sun Aug 26 03:04:50 2012 -0700 13.3 @@ -0,0 +1,81 @@ 13.4 +/* 13.5 + * Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org 13.6 + * Licensed under GNU General Public License version 2 13.7 + */ 13.8 + 13.9 +#ifndef MATRIX_MULT_H_ 13.10 +#define MATRIX_MULT_H_ 13.11 + 13.12 +#include <stdio.h> 13.13 + 13.14 +#include "../BLIS/BLIS_primitive_data_types.h" 13.15 + 13.16 +#include "ParamHelper/Param.h" 13.17 + 13.18 +//============================== Structures ============================== 13.19 + 13.20 +typedef 13.21 +struct 13.22 + { int32 numRows; 13.23 + int32 numCols; 13.24 + float32 *matrix; //2D, but dynamically sized, so use addr arith 13.25 + } 13.26 +Matrix; 13.27 + 13.28 +/* This is the "appSpecificPiece" that is carried inside a DKUPiece. 13.29 + * In the DKUPiece data struc it is declared to be of type "void *". This 13.30 + * allows the application to define any data structure it wants and put it 13.31 + * into a DKUPiece. 13.32 + * When the app specific info is used, it is in app code, so it is cast to 13.33 + * the correct type to tell the compiler how to access fields. 13.34 + * This keeps all app-specific things out of the DKU directory, as per the 13.35 + * DKU standard. */ 13.36 +typedef 13.37 +struct 13.38 + { 13.39 + // pointers to shared data.. the result matrix must be created when the 13.40 + // left and right matrices are put into the root ancestor DKUPiece. 13.41 + Matrix * leftMatrix; 13.42 + Matrix * rightMatrix; 13.43 + Matrix * resultMatrix; 13.44 + 13.45 + // define the starting and ending boundaries for this piece of the 13.46 + // result matrix. These are derivable from the left and right 13.47 + // matrices, but included them for readability of code. 
13.48 + int prodStartRow, prodEndRow; 13.49 + int prodStartCol, prodEndCol; 13.50 + // Start and end of the portion of the left matrix that contributes to 13.51 + // this piece of the product 13.52 + int leftStartRow, leftEndRow; 13.53 + int leftStartCol, leftEndCol; 13.54 + // Start and end of the portion of the right matrix that contributes to 13.55 + // this piece of the product 13.56 + int rightStartRow, rightEndRow; 13.57 + int rightStartCol, rightEndCol; 13.58 + } 13.59 +MatrixProdPiece; 13.60 + 13.61 +//============================== Functions ================================ 13.62 +void readFile(); 13.63 + 13.64 +Matrix *makeMatrix( int32 numRows, int32 numCols ); 13.65 +Matrix *makeMatrix_Flat( int32 numRows, int32 numCols ); 13.66 +void freeMatrix_Flat( Matrix * matrix ); 13.67 +void freeMatrix( Matrix * matrix ); 13.68 + 13.69 + MatrixProdPiece * 13.70 +makeMatrixProdPiece_Empty(); 13.71 + MatrixProdPiece * 13.72 +makeMatrixProdPiece_FromMatrixProdPiece( MatrixProdPiece * piece ); 13.73 + MatrixProdPiece * 13.74 +makeMatrixProdPiece_FromMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); 13.75 + 13.76 +void read_Matrix_From_File( Matrix *matrixStruc, char *matrixFileName ); 13.77 + 13.78 +void freeMatrixProdPiece_Flat( MatrixProdPiece * piece ); 13.79 +void freeMatrixProdPiece( MatrixProdPiece * piece ); 13.80 + 13.81 + 13.82 +//=========================================================================== 13.83 + 13.84 +#endif /*MATRIX_MULT_H_*/
14.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 14.2 +++ b/Read_Input_Matrix.c Sun Aug 26 03:04:50 2012 -0700 14.3 @@ -0,0 +1,604 @@ 14.4 +/* 14.5 + * File: Read_Input.c 14.6 + * Author: SeanHalle@yahoo.com 14.7 + * 14.8 + * Created on June 15, 2009, 10:12 AM 14.9 + */ 14.10 + 14.11 +#include <stdio.h> 14.12 + 14.13 +//======================== 14.14 +scanf("%[^\t]",a); //matches everything except tab character 14.15 + 14.16 +//======================= Write a structure into a file ==================== 14.17 + 14.18 +#include <stdio.h> 14.19 +#include <string.h> 14.20 +#include <stdlib.h> 14.21 + 14.22 +#define MAX 50 14.23 + 14.24 + 14.25 +typedef struct { 14.26 + char name[10]; 14.27 + int key; 14.28 +} file_record; 14.29 + 14.30 +/* this function adds the relatiuve addres to the index for a key */ 14.31 +void create_index(long index[], int key, long rel_add ) { 14.32 + index[key] = rel_add; 14.33 +} 14.34 + 14.35 +/* this function writes a record to the file */ 14.36 +void write_rec(FILE *fp, file_record rec) { 14.37 + fwrite(&rec,sizeof(rec),1,fp); 14.38 +} 14.39 + 14.40 +void main() { 14.41 + long rel_add; 14.42 + int key; 14.43 + file_record frec; 14.44 + long index[MAX];/* an index list*/ 14.45 + int n,i; 14.46 + 14.47 + FILE *recfile=NULL,*ifile=NULL; 14.48 + /* this initializes the index list to all ? 
*/ 14.49 + for(i=0; i< MAX; i++) 14.50 + index[i]= (-1); 14.51 + 14.52 + recfile=fopen("mfile","w"); 14.53 + if(recfile == NULL) { 14.54 + printf("Error in openeing file mfile\n"); 14.55 + exit(0); 14.56 + } 14.57 + rel_add = 0 ; 14.58 + do { 14.59 + printf(" Enter the data vlue and the key of the record to be added to file mfile\n"); 14.60 + scanf("%s %d",frec.name,&frec.key); 14.61 + while(index[frec.key] != (-1)) { 14.62 + printf(" A record with this key value already exist in a file enter record key value\n"); 14.63 + scanf("%s %d",frec.name,&frec.key); 14.64 + } 14.65 + create_index(index,frec.key,rel_add); 14.66 + write_rec(recfile,frec); 14.67 + rel_add = ftell(recfile); 14.68 + /* this sets the relative address for the next record to be 14.69 + the value of current file position pointer in bytes from 14.70 + the beginning of the file */ 14.71 + printf("Enter 1 to continue adding records to the file\n"); 14.72 + scanf("%d",&n); 14.73 + }while(n == 1); 14.74 + 14.75 + ifile=fopen("index_file","w"); 14.76 + 14.77 + if(ifile == NULL) { 14.78 + printf("Error in openeing file index_file\n"); 14.79 + exit(0); 14.80 + } 14.81 + 14.82 + fwrite(index,sizeof(index),1,ifile);/*writes the complete index into the index_file */ 14.83 + fclose(recfile); 14.84 + fclose(ifile); 14.85 + printf("Enter 1 if you want to retrieve a record\n"); 14.86 + scanf("%d",&n); 14.87 + 14.88 + if( n == 1) { 14.89 + ifile=fopen("index_file","r"); 14.90 + if(ifile == NULL) { 14.91 + printf("Error in openeing file index_file\n"); 14.92 + exit(0); 14.93 + } 14.94 + fread(index,sizeof(index),1,ifile); 14.95 + 14.96 + /* reads the complete index into the index list from the index_file*/ 14.97 + fclose(ifile); 14.98 + recfile=fopen("mfile","r"); 14.99 + 14.100 + if(recfile == NULL) { 14.101 + printf("Error in openeing file mfile\n"); 14.102 + exit(0); 14.103 + } 14.104 + } 14.105 + printf("THE CONTENTS OF FILE IS \n"); 14.106 + 14.107 + while( (fread(&frec,sizeof(frec),1,recfile)) != 0) 14.108 + 
printf("%s %d\n",frec.name,frec.key); 14.109 + 14.110 + do { 14.111 + printf("Enter the key of the record to be retrieved\n"); 14.112 + scanf("%d",&key); 14.113 + rel_add = index[key]; /*gets the relative address of the record from index list */ 14.114 + if( (fseek(recfile,rel_add,SEEK_SET))!= 0) { 14.115 + printf("Error\n"); 14.116 + exit(0); 14.117 + } 14.118 + fread(&frec,sizeof(frec),1,recfile); 14.119 + printf("The data value of the retrieved record is %s\n",frec.name); 14.120 + printf("Enter 1 if you want to retrieve a record\n"); 14.121 + scanf("%d",&n); 14.122 + } while(n == 1); 14.123 + 14.124 + fclose(recfile); 14.125 +} 14.126 + 14.127 + 14.128 + 14.129 + 14.130 +//========================== Read words in file demo ======================= 14.131 + 14.132 +#include <stdio.h> 14.133 +#include <ctype.h> 14.134 +#include <string.h> 14.135 +#include <stdlib.h> 14.136 + 14.137 +struct node { 14.138 + struct node *left; /* tree to the left */ 14.139 + struct node *right; /* tree to the right */ 14.140 + char *word; /* word for this tree */ 14.141 +}; 14.142 + 14.143 +/* the top of the tree */ 14.144 +static struct node *root = NULL; 14.145 + 14.146 +/* 14.147 + * memory_error -- write error and die * 14.148 + */ 14.149 +void memory_error(void) 14.150 +{ 14.151 + fprintf(stderr, "Error:Out of memory\n"); 14.152 + exit(8); 14.153 +} 14.154 + 14.155 +/* 14.156 + * save_string -- save a string on the heap * 14.157 + * * 14.158 + * Parameters * 14.159 + * string -- string to save * 14.160 + * * 14.161 + * Returns * 14.162 + * pointer to malloc-ed section of memory with * 14.163 + * the string copied into it. 
* 14.164 + */ 14.165 +char *save_string(char *string) 14.166 +{ 14.167 + char *new_string; /* where we are going to put string */ 14.168 + 14.169 + new_string = malloc((unsigned) (strlen(string) + 1)); 14.170 + 14.171 + if (new_string == NULL) 14.172 + memory_error(); 14.173 + 14.174 + strcpy(new_string, string); 14.175 + return (new_string); 14.176 +} 14.177 +/* 14.178 + * enter -- enter a word into the tree * 14.179 + * * 14.180 + * Parameters * 14.181 + * node -- current node we are looking at * 14.182 + * word -- word to enter * 14.183 + */ 14.184 +void enter(struct node **node, char *word) 14.185 +{ 14.186 + int result; /* result of strcmp */ 14.187 + 14.188 + char *save_string(char *); /* save a string on the heap */ 14.189 + 14.190 + /* 14.191 + * If the current node is null, we have reached the bottom 14.192 + * of the tree and must create a new node. 14.193 + */ 14.194 + if ((*node) == NULL) { 14.195 + 14.196 + /* Allocate memory for a new node */ 14.197 + (*node) = malloc(sizeof(struct node)); 14.198 + if ((*node) == NULL) 14.199 + memory_error(); 14.200 + 14.201 + /* Initialize the new node */ 14.202 + (*node)->left = NULL; 14.203 + (*node)->right = NULL; 14.204 + (*node)->word = save_string(word); 14.205 + return; 14.206 + } 14.207 + /* Check to see where the word goes */ 14.208 + result = strcmp((*node)->word, word); 14.209 + 14.210 + /* The current node already contains the word, no entry necessary */ 14.211 + if (result == 0) 14.212 + return; 14.213 + 14.214 + /* The word must be entered in the left or right sub-tree */ 14.215 + if (result < 0) 14.216 + enter(&(*node)->right, word); 14.217 + else 14.218 + enter(&(*node)->left, word); 14.219 +} 14.220 +/* 14.221 + * scan -- scan the file for words * 14.222 + * * 14.223 + * Parameters * 14.224 + * name -- name of the file to scan * 14.225 + */ 14.226 +void scan(char *name) 14.227 +{ 14.228 + char word[100]; /* word we are working on */ 14.229 + int index; /* index into the word */ 14.230 + int ch; /* 
current character */ 14.231 + FILE *in_file; /* input file */ 14.232 + 14.233 + in_file = fopen(name, "r"); 14.234 + if (in_file == NULL) { 14.235 + fprintf(stderr, "Error:Unable to open %s\n", name); 14.236 + exit(8); 14.237 + } 14.238 + while (1) { 14.239 + /* scan past the whitespace */ 14.240 + while (1) { 14.241 + ch = fgetc(in_file); 14.242 + 14.243 + if (isalpha(ch) || (ch == EOF)) 14.244 + break; 14.245 + } 14.246 + 14.247 + if (ch == EOF) 14.248 + break; 14.249 + 14.250 + word[0] = ch; 14.251 + for (index = 1; index < sizeof(word); ++index) { 14.252 + ch = fgetc(in_file); 14.253 + if (!isalpha(ch)) 14.254 + break; 14.255 + word[index] = ch; 14.256 + } 14.257 + /* put a null on the end */ 14.258 + word[index] = '\0'; 14.259 + 14.260 + enter(&root, word); 14.261 + } 14.262 + fclose(in_file); 14.263 +} 14.264 +/* 14.265 + * print_tree -- print out the words in a tree * 14.266 + * * 14.267 + * Parameters * 14.268 + * top -- the root of the tree to print * 14.269 + */ 14.270 +void print_tree(struct node *top) 14.271 +{ 14.272 + if (top == NULL) 14.273 + return; /* short tree */ 14.274 + 14.275 + print_tree(top->left); 14.276 + printf("%s\n", top->word); 14.277 + print_tree(top->right); 14.278 +} 14.279 + 14.280 +int main(int argc, char *argv[]) 14.281 +{ 14.282 + if (argc != 2) { 14.283 + fprintf(stderr, "Error:Wrong number of parameters\n"); 14.284 + fprintf(stderr, " on the command line\n"); 14.285 + fprintf(stderr, "Usage is:\n"); 14.286 + fprintf(stderr, " words 'file'\n"); 14.287 + exit(8); 14.288 + } 14.289 + scan(argv[1]); 14.290 + print_tree(root); 14.291 + return (0); 14.292 +} 14.293 + 14.294 + 14.295 + 14.296 + 14.297 +//================== Get line demo ========================= 14.298 +#include <stdio.h> 14.299 +#include <stdlib.h> 14.300 +#include <string.h> 14.301 + 14.302 +#define _GNU_SOURCE 14.303 + 14.304 +int main(int argc, char* argv[]) { 14.305 + 14.306 +size_t lsize = 0; 14.307 +ssize_t read; 14.308 +FILE* conf_file; 14.309 +char* line = 
NULL; 14.310 + 14.311 +if (argc == 1) { 14.312 +printf("\nCommand syntax:\n"); 14.313 +printf("\n\tprogramd [ start | stop ]\n"); 14.314 +printf("\t\tstart: start daemon\n"); 14.315 +printf("\t\tstop: stop daemon\n"); 14.316 +} 14.317 +else if (strcmp(argv[1], "start") == 0) { 14.318 +conf_file = fopen("/etc/program/program.conf", "r"); 14.319 +fseek(conf_file, 0, SEEK_SET); 14.320 +while (!feof(conf_file)) { 14.321 +while (getline(&line, &lsize, conf_file) != -1) { 14.322 +printf("%s", line); 14.323 +} 14.324 +} 14.325 +} 14.326 + 14.327 +//====================== 14.328 +fopen 14.329 +fread 14.330 +fscanf 14.331 +getline 14.332 + 14.333 + 14.334 +//================== scanf demo ============================= 14.335 +/* Q: need this for the GCC atomic operations? 14.336 + */ 14.337 +#define _GNU_SOURCE 14.338 +#include <stdio.h> 14.339 + 14.340 +void main (void) { 14.341 + 14.342 + /* We will use one floating-point and one integer variable. */ 14.343 + 14.344 + double x = .00000123456789; 14.345 + int n = 12345; 14.346 + 14.347 + 14.348 + /* Display plain text. */ 14.349 + 14.350 + printf("This is a test\n"); 14.351 + printf("This\tis\nanother\ttest\n\n"); 14.352 + 14.353 + 14.354 + /* Display an integer. */ 14.355 + 14.356 + printf("Here is n: %d\n\n", n); 14.357 + 14.358 + 14.359 + /* Display a double three different ways. */ 14.360 + 14.361 + printf("Here is x: %g\n", x); 14.362 + printf("Here is x: %f\n", x); 14.363 + printf("Here is x: %e\n\n", x); 14.364 + 14.365 + 14.366 + /* Display two numbers. 
*/ 14.367 + 14.368 + printf("Here are n (%d) and x (%g)\n", n, x); 14.369 + 14.370 +} 14.371 + 14.372 +//=========================================================== 14.373 + 14.374 +void read_One_MB(int f, int MB_y, int MB_x, MB_Info *curMB, FILE *inputFH); 14.375 + 14.376 +//================================================================= 14.377 + { dataFile = new File( fileName ); 14.378 + paramScanner = new Scanner( dataFile ); 14.379 + } 14.380 + catch( Exception e ) 14.381 + { dataFile = null; 14.382 + paramScanner = null; 14.383 + System.err.println( "couldn't open file: " + fileName ); 14.384 + } 14.385 + 14.386 + paramScanner.useDelimiter(",\\s*|\n|\r\n"); 14.387 + 14.388 + MatrixInRowMajor ( int _numRows, int _numCols ) 14.389 + { 14.390 + numRows = _numRows; // lives in super class 14.391 + numCols = _numCols; 14.392 + 14.393 + rows = new float[numRows][]; 14.394 + 14.395 + for( int i = 0; i < numRows; i++ ) 14.396 + { 14.397 + rows[ i ] = new float[ numCols ]; 14.398 + } 14.399 + } 14.400 + public void fillSelfFromFile( String fileName ) 14.401 + { float floatValue = 0; 14.402 + String floatString; 14.403 + 14.404 + super.setUpScanner( fileName ); 14.405 + 14.406 + for( int r = 0; r < numRows; r += 1 ) 14.407 + { for( int c = 0; c < numCols; c += 1 ) 14.408 + { floatString = paramScanner.next(); 14.409 + floatValue = Float.parseFloat( floatString ); 14.410 + rows[ r ][ c ] = floatValue; 14.411 + } 14.412 + } 14.413 + } 14.414 + 14.415 +//================================================================ 14.416 + 14.417 + 14.418 +/* Get the data strucs implicitly from the header file 14.419 + */ 14.420 +void read_All_Frames( FILE *inputFH ) 14.421 + { int MB_x, MB_y, f; 14.422 + 14.423 + for( f = 1; f <= numFrames; f++ ) //allocated 10 frames of mem, use 8 14.424 + { //PixInFrame, PixInLine, MBsInFrame, frameWidthInMB, etc in header 14.425 + uint8_t * startOfFrame_L = &(input_img_y[f][0]); 14.426 + uint8_t * startOfFrame_CR = &(input_img_cr[f][0]); 14.427 
+ uint8_t * startOfFrame_CB = &(input_img_cb[f][0]); 14.428 + 14.429 + // Reads in one frame 14.430 + for( MB_y = 0; MB_y < frameHeightInMB; MB_y++ ) 14.431 + { 14.432 + for( MB_x = 0; MB_x < frameWidthInMB; MB_x++ ) 14.433 + { //DEBUG: addr arith checks out (size of MB_Info is 240) 14.434 + MB_Info *MBInfo = &(input_MBs[f][0]) + 14.435 + (MB_y * frameWidthInMB + MB_x); 14.436 + // Read Macroblock Parameters and pixel data 14.437 + read_One_MB( f, MB_y, MB_x, MBInfo, inputFH ); 14.438 + 14.439 + MBInfo->PixInLine_L = oPixInLine_L; 14.440 + MBInfo->PixInLine_C = oPixInLine_C; 14.441 + 14.442 + int offsetToMBsPix_L = 14.443 + MB_y * (oPixInLine_L * MBHeight_L) + MB_x * MBWidth_L; 14.444 + int offsetToMBsPix_C = 14.445 + MB_y * (oPixInLine_C * MBHeight_C) + MB_x * MBWidth_C; 14.446 + //DEBUG: addr arith checks out 14.447 + MBInfo->startOfMBsPix_L = startOfFrame_L + offsetToMBsPix_L; 14.448 + MBInfo->startOfMBsPix_CR = startOfFrame_CR + offsetToMBsPix_C; 14.449 + MBInfo->startOfMBsPix_CB = startOfFrame_CB + offsetToMBsPix_C; 14.450 + } 14.451 + } 14.452 + } 14.453 + } 14.454 + 14.455 +/* Reads the parameters of macro block, then reads pixel data of MB. 14.456 + * Give it frame num, y and x of macro block. It gets addresses of the 14.457 + * arrays by including the header. 14.458 + */ 14.459 +void read_One_MB(int f, int MB_y, int MB_x, MB_Info *curMB, FILE *inputFH) 14.460 + { 14.461 + int dir, line, i, tmp, x, y, pixelIndex; 14.462 + char strTemp[90]; 14.463 + MB_Info_throwAway throwAway; 14.464 + 14.465 + fscanf(inputFH, "%s",&(strTemp[0])); //get rid of unused preamble string 14.466 + 14.467 + //(*curMB).MB_x = MB_x; 14.468 + //(*curMB).MB_y = MB_y; 14.469 + 14.470 + 14.471 + 14.472 + /************ Read parameters ******************/ 14.473 + 14.474 + // first, get rid of unused data in the input stream. 
14.475 + fscanf(inputFH, "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d", 14.476 + &(throwAway).MB_X, 14.477 + &(throwAway).MB_Y, 14.478 + &(throwAway).mb_stride, 14.479 + &(throwAway).deblocking_filter, 14.480 + &(throwAway).picture_structure, 14.481 + &(throwAway).slice_alpha_c0_offset, 14.482 + &(throwAway).slice_type, 14.483 + &(throwAway).chroma_qp_index_offset[0], 14.484 + &(throwAway).chroma_qp_index_offset[1], 14.485 + &(throwAway).mb_xy_type, 14.486 + &(throwAway).mb_xy_type_m1, 14.487 + &(throwAway).mb_xy_type_top, 14.488 + &(throwAway).qscale_mb_xy, 14.489 + &(throwAway).qscale_mb_xy_m1, 14.490 + &(throwAway).qscale_mb_xy_top, 14.491 + &(throwAway).slice_table_mb_xy, 14.492 + &(throwAway).slice_table_mb_xy_m1, 14.493 + &(throwAway).slice_table_mb_xy_top 14.494 + ); 14.495 + 14.496 + for(dir = 0; dir < 2; dir++) 14.497 + for(line=0; line < 5*8; line++) 14.498 + fscanf(inputFH, "%d ", &(throwAway).ref_cache[dir][line]); 14.499 + 14.500 + for(dir = 0; dir < 2; dir++) 14.501 + for(line=0; line < 5*8; line++) 14.502 + fscanf(inputFH, "%d %d ", &(throwAway).mv_cache[dir][line][0], 14.503 + &(throwAway).mv_cache[dir][line][1]); 14.504 + 14.505 + for(line=0; line < 6*8; line++) 14.506 + fscanf(inputFH, "%d ", &(throwAway).non_zero_count_cache[line]); 14.507 + 14.508 + 14.509 + //now, get data will use in deblocking 14.510 + 14.511 + fscanf(inputFH, "%i %i", &(*curMB).endSubBlk[0], &(*curMB).endSubBlk[1]); 14.512 + fscanf(inputFH, "%d",&(*curMB).startSubBlk[0]); 14.513 + fscanf(inputFH, "%d",&(*curMB).startSubBlk[1]); 14.514 + 14.515 + //read bS 14.516 + for(dir = 0; dir < 2; dir++) 14.517 + for(line=0; line < 4; line++) 14.518 + for (i=0; i < 4; i++) 14.519 + { 14.520 + fscanf(inputFH, "%d",&tmp); 14.521 + (*curMB).bS[dir][line][i] = (int) tmp; 14.522 + } 14.523 + 14.524 + //read luma_qp 14.525 + for(dir = 0; dir < 2; dir++) 14.526 + for(line=0; line < 4; line++) 14.527 + fscanf(inputFH, "%d",&(*curMB).luma_qp[dir][line]); 14.528 + 14.529 + //read 
chroma_qp 14.530 + for(dir = 0; dir < 2; dir++) 14.531 + for(line=0; line < 4; line++) 14.532 + fscanf(inputFH, "%d",&(*curMB).chroma_qp[dir][line]); 14.533 + 14.534 + 14.535 + 14.536 + /********* Have MB params, now read pixel data of MB *************/ 14.537 + 14.538 + /* The MB pixel data is read in one MB at a time. All pixel data 14.539 + * goes into a single array. The pixel data is layed out in the array 14.540 + * the same as the pixels appear on the screen, in screen-row major 14.541 + * order. So, all the pixels in the 0th line at the top of the frame 14.542 + * are next to each other, starting at the beginning of the array. 14.543 + * Then, the second line begins at arrayAddr + frame_width_in_pixels, 14.544 + * and so on. 14.545 + * Get the pixels for one MB, so have to map the MB location onto the 14.546 + * array location. 14.547 + * The position of the MB's 0,0 pixel is offset by the data of all the 14.548 + * MBs in MB-lines above, and by all the pixels in MBs to the left. 14.549 + * So, the number of pixels in a MB-line is the number of lines in a MB 14.550 + * times the number of pixels in a frame-line. Multiply that by the 14.551 + * number of MB-lines above the current MB. 14.552 + * Next, add the offset within the current MB-line, which is the 14.553 + * number of MBs to the left times the width, in pixels, of one MB. 14.554 + * Then, to get the offset of a particular pixel in the MB from the 14.555 + * start of the MB, take the number of lines in the MB above the 14.556 + * current pixel times the pixels-per-frame-line, then add the number 14.557 + * of pixels to the left within the MB. 14.558 + * 14.559 + *(MB_y * MB_height * Frm_width + MB_x * MB_width) + (y * Frm_width + x) 14.560 + */ 14.561 + 14.562 + /* read the pre-deblocking MB pixels, then the correct final pixels. 
14.563 + * Start of a macro block is num MB rows above * pixel lines in height 14.564 + * of a MB * pixels in a line of frame, plus the numMB to left * pixel 14.565 + * width of a MB.. MB_y and MB_x start at 0, so they are num above 14.566 + * and num to left, repectively. 14.567 + * Many of the width and height values are defined in header.*/ 14.568 + //DEBUG: addr arith checks out 14.569 + int offsetToMBsPix_L = MB_y * (oPixInLine_L * MBHeight_L) + MB_x * MBWidth_L; 14.570 + for(y=0; y < MBHeight_L; y++) 14.571 + { // first read all input Y, then all correct output Y 14.572 + int offsetToLineInMB_L = offsetToMBsPix_L + y * oPixInLine_L; 14.573 + for(x=0; x < MBWidth_L; x++) 14.574 + { pixelIndex = offsetToLineInMB_L + x; 14.575 + fscanf(inputFH, "%i",&tmp); 14.576 + *(&input_img_y[f][0] + pixelIndex) = (char) tmp; 14.577 + } 14.578 + for(x=0; x < MBWidth_L; x++) 14.579 + { pixelIndex = offsetToLineInMB_L + x; 14.580 + fscanf(inputFH, "%i",&tmp); 14.581 + *(&correct_img_y[f][0] + pixelIndex) = (char) tmp; 14.582 + } 14.583 + } 14.584 + 14.585 + //read croma b and r.. in all rest of code, goes "R before B" but here, 14.586 + // input stream has the blue before the red.. 
FYI 14.587 + //DEBUG: addr arith appears to check out (assuming have right model) 14.588 + int offsetToMBsPix_C = MB_y * (oPixInLine_C * MBHeight_C) + MB_x * MBWidth_C; 14.589 + for (y=0; y < MBHeight_C; y++) 14.590 + { // first read all input CB & CR, then all correct output CB & CR 14.591 + int offsetToLineInMB_C = offsetToMBsPix_C + y * oPixInLine_C; 14.592 + for(x=0; x < MBWidth_C; x++) 14.593 + { int pixelIndex_C = offsetToLineInMB_C + x; 14.594 + fscanf(inputFH, "%d",&tmp); 14.595 + *(&input_img_cb[f][0] + pixelIndex_C) = (char) tmp; 14.596 + fscanf(inputFH, "%d",&tmp); 14.597 + *(&input_img_cr[f][0] + pixelIndex_C) = (char) tmp; 14.598 + } 14.599 + for(x=0; x < MBWidth_C; x++) 14.600 + { int pixelIndex_C = offsetToLineInMB_C + x; 14.601 + fscanf(inputFH, "%d",&tmp); 14.602 + *(&correct_img_cb[f][0] + pixelIndex_C) = (char) tmp; 14.603 + fscanf(inputFH, "%d",&tmp); 14.604 + *(&correct_img_cr[f][0] + pixelIndex_C) = (char) tmp; 14.605 + } 14.606 + } 14.607 + }
15.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 15.2 +++ b/main.c Sun Aug 26 03:04:50 2012 -0700 15.3 @@ -0,0 +1,44 @@ 15.4 +/* 15.5 + * Copyright Oct 24, 2009 OpenSourceCodeStewardshipFoundation.org 15.6 + * Licensed under GNU General Public License version 2 15.7 + * 15.8 + * author seanhalle@yahoo.com 15.9 + */ 15.10 + 15.11 + 15.12 +#include <stdio.h> 15.13 +#include <time.h> 15.14 +#include <math.h> 15.15 +#include <float.h> 15.16 +#include <limits.h> 15.17 +#include <sys/time.h> 15.18 +#include <malloc.h> 15.19 + 15.20 +#include "BLIS_CONSTANTS.h" 15.21 +#include "../BLIS/BLIS.h" 15.22 +#include "../BLIS/DKU/DKU_common/DKU.h" 15.23 + 15.24 +#include "Matrix_Mult.h" 15.25 + 15.26 +/** 15.27 + * This is the DKU version of Matrix Multiply sample application 15.28 + * 15.29 + */ 15.30 +int main( int argc, char **argv ) 15.31 + { Matrix *leftMatrix, *rightMatrix; 15.32 + ParamBag *paramBag; 15.33 + 15.34 + paramBag = makeParamBag(); 15.35 + readParamFileIntoBag( argv[1], paramBag ); 15.36 + initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); 15.37 + 15.38 + resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix ); 15.39 + 15.40 + printf("\nresult matrix: \n"); 15.41 + printMatrix( resultMatrix ); 15.42 + 15.43 +// BLIS_DKU__print_Stats_forInst( DKU_INST_MM ); 15.44 + 15.45 + exit(0); //cleans up 15.46 + } 15.47 +
