# HG changeset patch # User Sean Halle # Date 1357845402 28800 # Node ID ce07f1a42ddfebf3ea71d2e7db34f955e6ba328b # Parent a4bc9d8a8f111cbf19f68b94929b520e6fa185ce Added skeleton for data tracking assigner + design notes diff -r a4bc9d8a8f11 -r ce07f1a42ddf DataTrackingAssigner/Assigner.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DataTrackingAssigner/Assigner.c Thu Jan 10 11:16:42 2013 -0800 @@ -0,0 +1,214 @@ +/* + * Copyright 2010 OpenSourceCodeStewardshipFoundation + * + * Licensed under BSD + */ + +#include +#include + +#include "Queue_impl/PrivateQueue.h" +#include "SSR.h" +#include "SSR_Request_Handlers.h" + +//=========================== Local Fn Prototypes =========================== + +typedef _SSRCache SSRCache; +typedef _SSRCommPort SSRCommPort; +typedef _CircBuff CircBuff; + + +struct _SSRCache + { int32 size; + CircBuff *dataChunks; + SSRCommPort *ports; + }; + +//says the BW and latency to a cache directly connected to this one +struct _SSRCommPort + { + int32 BandWidth; + int32 Latency; + SSRCache *connectedCache; + }; + +//Data chunk -- probably need more things than this.. +typedef struct + { int32 size; + int32 ID; //the "mostly" unique identifier -- maybe a hash of addr + } +SSRDataChunk; + + + +void +StepOnCache( SSRCache *cache, SSRDataChunk *chunk ); + +//============================== Assigner ================================== +// +/*This evaluates the slave VPs available, and picks the one with the + * least total data travel time. + * + *Will need balancer for time spent calculating vs time-width of work +unit.. given the number of available work-units.. IE, if thousands +of work units available, and they're all small, then counter-productive +to calculate a score for every work-unit.. need a "take best so far" +threshold, after which stop looking.. that threshold is based on +work-unit sizes -- either set by latency/BW at that level of hierarchy +or set by averaging predicted work-content. + + */ +SlaveVP * +pickASlave( SSRSemEnv *semEnv, AnimSlot *slot ) + { SlaveVP *candidateSlave, *readySlaves, *bestSlave; + int32 coreNum, slotNum; + SSRCache *cacheHier; + + float32 timeToGetInputChunks, lowestTime = 0; + + coreNum = slot->coreSlotIsOn; + slotNum = slot->slotIdx; + + //Now, have some choices about how to organize -- if the number of + // cores is small, or are keeping work separated geographically, + // then may make sense to calculate distance to every core each + // time a slave is made ready. Then, have a separate priority queue + // for each slave, and just pick the top when assign (and remove + // that slave from all the other priority Qs) + // wait -- those values will degrade as caches get stepped on, + // so, have to keep a "freshness" count.. and gets complex.. + //Else, if too many cores, then keep an array with each slave, which + // has one position for each core, which holds the time to gather + // the input chunks for that slave to that core plus a freshness + // count. Then, when search for the best slave, first look in the + // array to see if have previously calculated for that slave, and + // to see how fresh that calculation is. If none, or old, then + // calculate the time and save it into the array, and remember the + // slave with the lowest time -- pick that one. + + //Here, just doing brute force -- calc time for all slaves for this + // core, then pick the slave with lowest time + + //get all the candidate work + readySlaves = semEnv->readySlaves; //linked list? + + //calc time for each, keeping lowest + cacheHier = semEnv->cacheHier; //create hier during init + candidateSlave = readySlaves; + while( candidateSlave != NULL ) + { + timeToGetInputChunks = + calcTimeToGetInputChunks( cacheHier, candidateSlave, endCore ); + if(timeToGetInputChunks < lowestTime ) + { lowestTime = timeToGetInputChunks; + bestSlave = candidateSlave; + } + candidateSlave = ;//next slave in list + } + + //return slave with lowest expected time + return bestSlave; //NULL is okay value to return + } + +/*Calculate "goodness" of a work-unit based on + * expected time to move consumed data to the given core. + */ +float32 +calcTimeToGetInputChunks( SSRCache *cacheHier, SlaveVP *candidateSlave, + int32 endCore ) + { SSRDataChunk *inputChunks, *chunk; + SSRSemData semData; + int32 totalTime, chunkPos; + + //Collect the input chunks required by the work + semData = (SSRSemData)candidateSlave->semanticData; + + inputChunks = semData->inputChunks; //Todo add to SSRSemData + numChunks = semData->numInputChunks; + + //calc time to get input chunks + totalTime = 0; + for( chunkPos = 0; chunkPos < numChunks; chunkPos++ ) + { chunk = inputChunks[chunkPos]; + totalTime += calcTimeToGetChunk( cacheHier, chunk, endCore ); + } + return totalTime; + } + +/*Walk the hierarchy, looking for caches that have a portion of the + * data chunk in them. + *For example, could start at the destination core, and work backwards + * to each of the caches in the path to main memory. In each of those, + * look for the data chunk, get the percent expected in the core, then + * calculate the time to the next level up for the remaining + * percentage. + *Figure out some heuristic for resolving when a higher level cache + * gets part of what's in a lower level cache trampled on.. IE, a data + * chunk goes through the L2 cache, into the L1 cache. Say 100% is in + * the L2, but only 25% fit in the L1. Now, the L2 has all of what's + * in L1, plus the rest of the chunk. Next, a + * different core gets a chunk that goes through the L2 cache and + * steps on that first chunk. So now, of the how much of the 75% + * that used to be in L2 that was different from what's in L1 was + * what got stepped on, vs the 25% that was in both.. + * + */ +calcTimeToGetChunk( SSRCache *cacheHier, SSRDataChunk *chunk, int32 endCore ) + { + + } + +/*A named data-chunk travels through the cache hierarchy, and + * "steps on" each cache in its path. + *Step-on means add data-chunk to top of circular buffer, expelling + * bottom, and causing re-calculation of percent left for all chunks + * in buffer + */ +void +stepOnCache( SSRCache *cache, SSRDataChunk *chunk ) + { + + } + +/*This function is called after the assigner has chosen a work-unit + * to assign to the offered core. It is given a data-chunk, what + * percent of it is travelling, the + * starting mem or cache that chunk is in, and the ending cache that + * chunk travels to. + * + *This function then walks the hierarchy of cache structures, from the + * location data starts, to the destination. At each cache along the + * way, it calls "step on", handing it the data chunk. + * + *The caller should use a heuristic to simplify issues of "when" + * does the data travel, and "how much" travels. + * In other words, if an L1 source + * cache is calculated to have 50% of a data-chunk, then only 50% + * travels from it, and the other 50% from main-memory. + * This may complicate, for example, an L2 that both halves travel + * through, where the two halves should be merged. + *Use micro-benchmarks to measure behaviors and make heuristic. + */ +void +travelTheHierarchy( SSRCache *sourceCache, SSRCache *endCache ) + { + // + } + +/*The assigner has to create the data structures representing the + * caches, each of which holds a circular buffer of the data-chunks + * that have stepped on the cache. + *Each cache structure has ports representing the connections to the + * other caches on the chip. + */ +void +initAssigner( SSRSemEnv *semEnv ) + { + VMSCommNode * + systemNode = VMS_SS__give_comm_hierarchy(); //this is read only!! + + //copy the cache hierarchy from systemNode into Cache structs, + // connected by ports. + } + + diff -r a4bc9d8a8f11 -r ce07f1a42ddf Design_Notes__data_tracking_assigner.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Design_Notes__data_tracking_assigner.txt Thu Jan 10 11:16:42 2013 -0800 @@ -0,0 +1,105 @@ + +The approach of proto-runtime to assignment: it is +hardware-availability driven. So, the assigner is triggered at the +point hardware becomes available. It then looks at the work available + and picks what work to assign to that hardware. + +This means that the assigner is written so that it receives a hardware + opportunity as input, and calculates a "goodness" measure for each +work unit available. It then assigns the best work-unit to that given + hardware. + +Track data movement + through caches, and calculate "goodness" of a work-unit based on + expected time to move consumed data to the given core. + +-] Struct for each cache +-] Has a port, with time, to each connected cache +-] Has a circular buffer of data-names, with sizes, and percent left +-] Has a total size + +Will need balancer for time spent calculating vs time-width of work +unit.. given the number of available work-units.. IE, if thousands +of work units available, and they're all small, then counter-productive +to calculate a score for every work-unit.. need a "take best so far" +threshold, after which stop looking.. that threshold is based on +work-unit sizes -- either set by latency/BW at that level of hierarchy +or set by averaging predicted work-content. + +A named data-chunk travels through the cache + hierarchy, and "steps on" each cache in its path, starting at core + that created the data, going to core that consumes. + +-] add data-chunk to top of circular buffer, expelling bottom, and + causing re-calculation of percent left of itself and all below + +Once the assigner has chosen a work-unit to assign to the given core, + record each cache that input data travels through, starting at the + place the data travels from, ending at the given core. Make a + heuristic to simplify issues of "when" does the data travel, and + "how much" travels (IE, if L1 cache is calculated to have 50% of + data-chunk, then only 50% travels through L2 from main-memory -- + this is at least the second time data from that chunk has traveled + through L2, so what portion is expected to be in L2? Use + micro-benchmarks to measure behaviors and make heuristic). + +-] Upon choice, walk the data-chunk through the cache hierarchy, + calling "step on" for each one. + +One idea for managing is to keep a list, for each cache, of the + data-chunks that have landed on it, in the order they landed. At + the point the assigner does a recording of data landing on a cache, + add the landing-event to a list. The list keeps the order of + data-chunks landing on a cache, with the size of each. + +At the point want to calculate the percentage of a given data chunk + that resides in a given cache, find that chunk-name in the list, and + look at all the chunks that have stepped on the cache after it. + Create a heuristic to estimate percent of target chunk still in the + cache, given the cache size, cache type (direct, vs number of + set-ways), and the chunks that have stepped on the cache after the + target chunk passed through. + +-] calculate percent left for each chunk in cache, at point cache gets + "step on" with a new chunk. +-] To find a given chunk in the caches, maybe keep a separate hash + table whose entry is a linked list of cache entries. -- IE, each + time a chunk steps on a cache, create an entry in the cache's + circular buffer, and also create a link to that entry in a linked + list. The list has one link for each cache the chunk steps on. + The list is placed into a hash table, with the chunk's name as the + key. When a cache entry's percent drops to zero, then remove that + link. When last link is removed, remove the hash entry. +-] So, for a given work-unit in the readyQ, for each input chunk, + look in hash table to get entry for that chunk. If none, cost is + a main-memory fetch. Else, go down linked list for the chunk, and + calculate a percent divided by a weighted distance for each (the larger the + distance, the less the percent matters -- a small percent close + beats a large percent far away). Take the best percent/dist as the + score for that chunk in that work-unit. Then, sum up over all + input chunks, and divide by number of chunks (or appropriate way + to combine).. that's the score for that work-unit.. the largest + score is the one chosen. + +Calculate the "goodness" of a candidate work-unit. For each data-chunk + that unit consumes, calculate expected percent of the chunk that + resides in each cache. Given the expected percent, use standard + cache eqns to calculate expected time spent waiting for data transfer + (percent missing times chunk-size divided by line-size times time to + get a line). + +Sum up expected wait-time for each work-unit candidate. That gives a + measure of "goodness" of choosing that work-unit for the current + hardware-opportunity. This is where the interesting work comes in, + of defining an assignment-choosing algorithm that will deliver the + best performance.. + +(Shoaib mentioned defining several alternatives, and perhaps using + measurements of the application to choose which is best..) + + + + + + + diff -r a4bc9d8a8f11 -r ce07f1a42ddf SSR.c --- a/SSR.c Fri Dec 28 09:38:49 2012 -0800 +++ b/SSR.c Thu Jan 10 11:16:42 2013 -0800 @@ -250,21 +250,8 @@ semanticEnv->fnSingletons[i].waitQ = makeVMSQ(); semanticEnv->transactionStrucs[i].waitingVPQ = makeVMSQ(); } - - VMSCommNode * - systemNode = VMS_SS__give_comm_hierarchy(); //this is read only!! - - //Do something with the comm system here.. make own, faster, data - // structure that is used by assigner -- it can include info about - // measured miss rates, and structures that track the data.. - //Next step would be to take a shot at a function call to put into the - // application that gives a "name" to collection of data consumed by - // a work-unit, and a name to the data produced.. along with the size - // each of those named collections. - //Then, can come up with a way to represent how much of each - // named data collection that resides in each of the caches. Keep that - // representation in the data structure that build from parsing the - // comm system returned from VMS. + + initAssigner( semanticEnv ); } diff -r a4bc9d8a8f11 -r ce07f1a42ddf SSR_PluginFns.c --- a/SSR_PluginFns.c Fri Dec 28 09:38:49 2012 -0800 +++ b/SSR_PluginFns.c Thu Jan 10 11:16:42 2013 -0800 @@ -27,11 +27,7 @@ //============================== Assigner ================================== // -/*For SSR, assigning a slave simply takes the next work-unit off the - * ready-to-go work-unit queue and assigns it to the offered slot. - *If the ready-to-go work-unit queue is empty, then nothing to assign - * to the animation slot -- return FALSE to let Master loop know assigning - * that slot failed. +/* */ SlaveVP * SSR__assign_slaveVP_to_slot( void *_semEnv, AnimSlot *slot ) @@ -44,7 +40,9 @@ semEnv = (SSRSemEnv *)_semEnv; - assignPr = readPrivQ( semEnv->readyVPQs[coreNum] ); +// assignPr = readPrivQ( semEnv->readyVPQs[coreNum] ); + assignPr = pickASlave( slot, coreNum ); + //Note, using a non-blocking queue -- it returns NULL if queue empty #ifdef IDLE_SLAVES if(!assignPr){ @@ -321,3 +319,5 @@ #endif writePrivQ( procr, semEnv->readyVPQs[ procr->coreAnimatedBy] ); } + +