Mercurial > cgi-bin > hgwebdir.cgi > VMS > VMS_Implementations > SSR_impls > SSR__MC_shared_impl
changeset 84:ce07f1a42ddf Dev__data_tracking_assign tip
Added skeleton for data tracking assigner + design notes
| author | Sean Halle <seanhalle@yahoo.com> |
|---|---|
| date | Thu, 10 Jan 2013 11:16:42 -0800 |
| parents | a4bc9d8a8f11 |
| children | |
| files | DataTrackingAssigner/Assigner.c Design_Notes__data_tracking_assigner.txt SSR.c SSR_PluginFns.c |
| diffstat | 4 files changed, 327 insertions(+), 21 deletions(-) |
line diff
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/DataTrackingAssigner/Assigner.c  Thu Jan 10 11:16:42 2013 -0800
@@ -0,0 +1,214 @@
+/*
+ * Copyright 2010 OpenSourceCodeStewardshipFoundation
+ *
+ * Licensed under BSD
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "Queue_impl/PrivateQueue.h"
+#include "SSR.h"
+#include "SSR_Request_Handlers.h"
+
+//=========================== Local Fn Prototypes ===========================
+
+typedef struct _SSRCache    SSRCache;
+typedef struct _SSRCommPort SSRCommPort;
+typedef struct _CircBuff    CircBuff;
+
+
+struct _SSRCache
+ { int32        size;
+   CircBuff    *dataChunks;
+   SSRCommPort *ports;
+ };
+
+//gives the BW and latency to a cache directly connected to this one
+struct _SSRCommPort
+ {
+   int32     BandWidth;
+   int32     Latency;
+   SSRCache *connectedCache;
+ };
+
+//Data chunk -- probably need more things than this..
+typedef struct
+ { int32 size;
+   int32 ID; //the "mostly" unique identifier -- maybe a hash of addr
+ }
+SSRDataChunk;
+
+
+void
+stepOnCache( SSRCache *cache, SSRDataChunk *chunk );
+
+//============================== Assigner ==================================
+//
+/*This evaluates the slave VPs available, and picks the one with the
+ * least total data travel time.
+ *
+ *Will need a balancer for time spent calculating vs time-width of a work
+ * unit.. given the number of available work-units.. IE, if thousands
+ * of work-units are available, and they're all small, then it's counter-productive
+ * to calculate a score for every work-unit.. need a "take best so far"
+ * threshold, after which stop looking.. that threshold is based on
+ * work-unit sizes -- either set by latency/BW at that level of hierarchy
+ * or set by averaging predicted work-content.
+ */
+SlaveVP *
+pickASlave( SSRSemEnv *semEnv, AnimSlot *slot )
+ { SlaveVP  *candidateSlave, *readySlaves, *bestSlave = NULL;
+   int32     coreNum, slotNum;
+   SSRCache *cacheHier;
+
+   float32 timeToGetInputChunks, lowestTime = 0;
+
+   coreNum = slot->coreSlotIsOn;
+   slotNum = slot->slotIdx;
+
+   //Now, have some choices about how to organize -- if the number of
+   // cores is small, or are keeping work separated geographically,
+   // then may make sense to calculate distance to every core each
+   // time a slave is made ready.  Then, have a separate priority queue
+   // for each slave, and just pick the top when assign (and remove
+   // that slave from all the other priority Qs)
+   // wait -- those values will degrade as caches get stepped on,
+   // so, have to keep a "freshness" count.. and gets complex..
+   //Else, if too many cores, then keep an array with each slave, which
+   // has one position for each core, which holds the time to gather
+   // the input chunks for that slave to that core plus a freshness
+   // count.  Then, when search for the best slave, first look in the
+   // array to see if have previously calculated for that slave, and
+   // to see how fresh that calculation is.  If none, or old, then
+   // calculate the time and save it into the array, and remember the
+   // slave with the lowest time -- pick that one.
+
+   //Here, just doing brute force -- calc time for all slaves for this
+   // core, then pick the slave with lowest time
+
+   //get all the candidate work
+   readySlaves = semEnv->readySlaves; //linked list?
+
+   //calc time for each, keeping lowest
+   cacheHier = semEnv->cacheHier; //create hier during init
+   candidateSlave = readySlaves;
+   while( candidateSlave != NULL )
+    {
+      timeToGetInputChunks =
+         calcTimeToGetInputChunks( cacheHier, candidateSlave, coreNum );
+      if( bestSlave == NULL || timeToGetInputChunks < lowestTime )
+       { lowestTime = timeToGetInputChunks;
+         bestSlave  = candidateSlave;
+       }
+      candidateSlave = candidateSlave->next; //next slave in list ('next' link still TBD)
+    }
+
+   //return slave with lowest expected time
+   return bestSlave; //NULL is okay value to return
+ }
+
+/*Calculate "goodness" of a work-unit based on
+ * expected time to move consumed data to the given core.
+ */
+float32
+calcTimeToGetInputChunks( SSRCache *cacheHier, SlaveVP *candidateSlave,
+                          int32 endCore )
+ { SSRDataChunk *inputChunks, *chunk;
+   SSRSemData   *semData;
+   float32       totalTime;
+   int32         numChunks, chunkPos;
+
+   //Collect the input chunks required by the work
+   semData = (SSRSemData *)candidateSlave->semanticData;
+
+   inputChunks = semData->inputChunks;    //Todo: add to SSRSemData
+   numChunks   = semData->numInputChunks; //Todo: add to SSRSemData
+
+   //calc time to get input chunks
+   totalTime = 0;
+   for( chunkPos = 0; chunkPos < numChunks; chunkPos++ )
+    { chunk = &inputChunks[chunkPos];
+      totalTime += calcTimeToGetChunk( cacheHier, chunk, endCore );
+    }
+   return totalTime;
+ }
+
+/*Walk the hierarchy, looking for caches that have a portion of the
+ * data chunk in them.
+ *For example, could start at the destination core, and work backwards
+ * to each of the caches in the path to main memory.  In each of those,
+ * look for the data chunk, get the percent expected in the cache, then
+ * calculate the time to the next level up for the remaining
+ * percentage.
+ *Figure out some heuristic for resolving when a higher level cache
+ * gets part of what's in a lower level cache trampled on.. IE, a data
+ * chunk goes through the L2 cache, into the L1 cache.  Say 100% is in
+ * the L2, but only 25% fits in the L1.  Now, the L2 has all of what's
+ * in L1, plus the rest of the chunk.  Next, a
+ * different core gets a chunk that goes through the L2 cache and
+ * steps on that first chunk.  So now, how much of the 75%
+ * that used to be in L2 that was different from what's in L1 is
+ * what got stepped on, vs the 25% that was in both..
+ */
+float32
+calcTimeToGetChunk( SSRCache *cacheHier, SSRDataChunk *chunk, int32 endCore )
+ {
+   return 0; //TODO: walk from endCore's cache toward memory, summing transfer times
+ }
+
+/*A named data-chunk travels through the cache hierarchy, and
+ * "steps on" each cache in its path.
+ *Step-on means add the data-chunk to the top of the circular buffer, expelling the
+ * bottom, and causing re-calculation of percent left for all chunks
+ * in the buffer
+ */
+void
+stepOnCache( SSRCache *cache, SSRDataChunk *chunk )
+ {
+
+ }
+
+/*This function is called after the assigner has chosen a work-unit
+ * to assign to the offered core.  It is given a data-chunk, what
+ * percent of it is travelling, the
+ * starting mem or cache that chunk is in, and the ending cache that
+ * chunk travels to.
+ *
+ *This function then walks the hierarchy of cache structures, from the
+ * location the data starts, to the destination.  At each cache along the
+ * way, it calls "step on", handing it the data chunk.
+ *
+ *The caller should use a heuristic to simplify issues of "when"
+ * does the data travel, and "how much" travels.
+ * In other words, if an L1 source
+ * cache is calculated to have 50% of a data-chunk, then only 50%
+ * travels from it, and the other 50% from main-memory.
+ * This may complicate, for example, an L2 that both halves travel
+ * through, where the two halves should be merged.
+ *Use micro-benchmarks to measure behaviors and make the heuristic.
+ */
+void
+travelTheHierarchy( SSRCache *sourceCache, SSRCache *endCache )
+ {
+   //
+ }
+
+/*The assigner has to create the data structures representing the
+ * caches, each of which holds a circular buffer of the data-chunks
+ * that have stepped on the cache.
+ *Each cache structure has ports representing the connections to the
+ * other caches on the chip.
+ */
+void
+initAssigner( SSRSemEnv *semEnv )
+ {
+   VMSCommNode *
+    systemNode = VMS_SS__give_comm_hierarchy(); //this is read only!!
+
+   //copy the cache hierarchy from systemNode into Cache structs,
+   // connected by ports.
+ }
+
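calcTimeToGetChunk() is left as a stub in this changeset. Below is a minimal, self-contained sketch (not from the repository) of the walk its comment describes; the CacheNode struct, the percentOfChunkInCache() placeholder, and the per-port charging model are assumptions made for illustration, not the SSRCache/SSRCommPort skeletons declared above.

```c
#include <stdio.h>
#include <stdint.h>

typedef struct CacheNode CacheNode;
struct CacheNode
 { int32_t    latency;    /* cycles to cross the port to the next level up  */
   int32_t    bandWidth;  /* bytes per cycle across that port               */
   CacheNode *nextLevel;  /* toward main memory; NULL at main memory itself */
 };

/* Placeholder for the circular-buffer lookup the design notes describe:
 * fraction of the chunk expected to still be resident in this cache.    */
static float
percentOfChunkInCache( CacheNode *cache, int32_t chunkID )
 { (void)cache; (void)chunkID;
   return 0.0f;              /* pretend nothing is cached yet */
 }

/* Start at the destination core's L1 and work toward memory.  Whatever
 * fraction of the chunk has not been found yet must cross the current
 * level's port, so each port is charged only for the still-missing part. */
static float
calcTimeToGetChunkSketch( CacheNode *destL1, int32_t chunkID, int32_t chunkSize )
 { CacheNode *level;
   float      missing, time = 0.0f;

   missing = 1.0f - percentOfChunkInCache( destL1, chunkID ); /* L1 part is free */

   for( level = destL1;  level->nextLevel != NULL && missing > 0.0f;
        level = level->nextLevel )
    { float foundAbove;

      time += level->latency + ( missing * chunkSize ) / level->bandWidth;

      /* whatever the next level up already holds stops travelling there */
      foundAbove = percentOfChunkInCache( level->nextLevel, chunkID );
      missing   -= ( foundAbove < missing ? foundAbove : missing );
    }
   return time;
 }

int
main( void )
 { CacheNode mem = {  0,  0, NULL };   /* main memory: end of the walk */
   CacheNode l2  = { 40,  8, &mem };   /* port between L2 and memory   */
   CacheNode l1  = { 10, 16, &l2  };   /* port between L1 and L2       */

   printf( "est. gather time: %.0f cycles\n",
           calcTimeToGetChunkSketch( &l1, 42, 4096 ) );
   return 0;
 }
```

The idea matches the stub's comment: the fraction not already resident at or below a level must cross that level's port, so ports closer to the core are charged for larger fractions than ports near memory.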
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/Design_Notes__data_tracking_assigner.txt  Thu Jan 10 11:16:42 2013 -0800
@@ -0,0 +1,105 @@
+
+The approach of proto-runtime to assignment: it is
+hardware-availability driven.  So, the assigner is triggered at the
+point hardware becomes available.  It then looks at the work available
+and picks what work to assign to that hardware.
+
+This means that the assigner is written so that it receives a hardware
+opportunity as input, and calculates a "goodness" measure for each
+work-unit available.  It then assigns the best work-unit to that given
+hardware.
+
+Track data movement
+ through caches, and calculate "goodness" of a work-unit based on
+ expected time to move consumed data to the given core.
+
+-] Struct for each cache
+-] Has a port, with time, to each connected cache
+-] Has a circular buffer of data-names, with sizes, and percent left
+-] Has a total size
+
+Will need a balancer for time spent calculating vs time-width of a work
+unit.. given the number of available work-units.. IE, if thousands
+of work-units are available, and they're all small, then it's counter-productive
+to calculate a score for every work-unit.. need a "take best so far"
+threshold, after which stop looking.. that threshold is based on
+work-unit sizes -- either set by latency/BW at that level of hierarchy
+or set by averaging predicted work-content.
+
+A named data-chunk travels through the cache
+ hierarchy, and "steps on" each cache in its path, starting at the core
+ that created the data, going to the core that consumes it.
+
+-] add data-chunk to top of circular buffer, expelling the bottom, and
+   causing re-calculation of percent left of itself and all below
+
+Once the assigner has chosen a work-unit to assign to the given core,
+ record each cache that input data travels through, starting at the
+ place the data travels from, ending at the given core.  Make a
+ heuristic to simplify issues of "when" does the data travel, and
+ "how much" travels (IE, if the L1 cache is calculated to have 50% of a
+ data-chunk, then only 50% travels through L2 from main-memory --
+ this is at least the second time data from that chunk has traveled
+ through L2, so what portion is expected to be in L2?  Use
+ micro-benchmarks to measure behaviors and make a heuristic).
+
+-] Upon choice, walk the data-chunk through the cache hierarchy,
+   calling "step on" for each one.
+
+One idea for managing this is to keep a list, for each cache, of the
+ data-chunks that have landed on it, in the order they landed.  At
+ the point the assigner records data landing on a cache,
+ add the landing-event to a list.  The list keeps the order of
+ data-chunks landing on a cache, with the size of each.
+
+At the point we want to calculate the percentage of a given data chunk
+ that resides in a given cache, find that chunk-name in the list, and
+ look at all the chunks that have stepped on the cache after it.
+ Create a heuristic to estimate the percent of the target chunk still in the
+ cache, given the cache size, cache type (direct, vs number of
+ set-ways), and the chunks that have stepped on the cache after the
+ target chunk passed through.
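The percent-left estimate in the paragraph above maps to a small function. A minimal sketch under the simplest assumed model (later arrivals claim capacity first, fully-associative behavior); the names are made up for illustration:

```c
#include <stdio.h>
#include <stdint.h>

/* Fraction of a target chunk still expected in a cache, given the cache
 * capacity and the total bytes of chunks that stepped on the cache after
 * the target did.  Simplest model: later arrivals claim capacity first,
 * and the target keeps whatever room is left.  A real heuristic would
 * also fold in the cache type (direct-mapped vs number of set-ways).   */
static float
estimatePercentLeft( int32_t cacheSize, int32_t targetChunkSize,
                     int32_t bytesLandedAfter )
 { int32_t roomLeft = cacheSize - bytesLandedAfter;

   if( roomLeft <= 0 )               return 0.0f;   /* fully displaced */
   if( roomLeft >= targetChunkSize ) return 1.0f;   /* fully resident  */
   return (float)roomLeft / (float)targetChunkSize;
 }

int
main( void )
 { /* 32KB cache, 16KB target chunk, 24KB landed after it -> 50% resident */
   printf( "%.2f\n", estimatePercentLeft( 32*1024, 16*1024, 24*1024 ) );
   return 0;
 }
```

For example, a 16KB target chunk in a 32KB cache with 24KB of later traffic comes out at 8KB/16KB = 50% resident; replacing the linear room-left model with one that accounts for set-associativity is exactly the heuristic the notes call for.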
+
+-] calculate percent left for each chunk in the cache, at the point the cache
+   gets "stepped on" by a new chunk.
+-] To find a given chunk in the caches, maybe keep a separate hash
+   table whose entry is a linked list of cache entries -- IE, each
+   time a chunk steps on a cache, create an entry in the cache's
+   circular buffer, and also create a link to that entry in a linked
+   list.  The list has one link for each cache the chunk steps on.
+   The list is placed into a hash table, with the chunk's name as the
+   key.  When a cache entry's percent drops to zero, then remove that
+   link.  When the last link is removed, remove the hash entry.
+-] So, for a given work-unit in the readyQ, for each input chunk,
+   look in the hash table to get the entry for that chunk.  If none, the cost is
+   a main-memory fetch.  Else, go down the linked list for the chunk, and
+   calculate a percent divided by a weighted distance for each (the larger the
+   distance, the less the percent matters -- a small percent close
+   beats a large percent far away).  Take the best percent/dist as the
+   score for that chunk in that work-unit.  Then, sum up over all
+   input chunks, and divide by the number of chunks (or an appropriate way
+   to combine).. that's the score for that work-unit.. the largest
+   score is the one chosen.
+
+Calculate the "goodness" of a candidate work-unit.  For each data-chunk
+ that unit consumes, calculate the expected percent of the chunk that
+ resides in each cache.  Given the expected percent, use standard
+ cache eqns to calculate the expected time spent waiting for data transfer
+ (percent missing, times chunk-size divided by line-size, times time to
+ get a line).
+
+Sum up the expected wait-time for each work-unit candidate.  That gives a
+ measure of "goodness" of choosing that work-unit for the current
+ hardware-opportunity.  This is where the interesting work comes in,
+ of defining an assignment-choosing algorithm that will deliver the
+ best performance..
+
+(Shoaib mentioned defining several alternatives, and perhaps using
+ measurements of the application to choose which is best..)
+
+
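The "standard cache eqns" paragraph in the notes above corresponds to a short scoring routine. A hedged sketch with made-up names: percentMissing would come from the tracking structures described earlier, and lineSize / timePerLineFetch from the micro-benchmarks the notes mention.

```c
#include <stdio.h>
#include <stdint.h>

/* One data-chunk consumed by a candidate work-unit */
typedef struct
 { int32_t size;            /* bytes                                       */
   float   percentMissing;  /* fraction not already near the offered core */
 }
ChunkCost;

/* Cache equation from the notes:
 *   wait += percentMissing * (chunkSize / lineSize) * timePerLineFetch
 * summed over the chunks the work-unit consumes.  Lower total is better. */
static float
expectedWaitTime( ChunkCost *chunks, int32_t numChunks,
                  int32_t lineSize, float timePerLineFetch )
 { float   total = 0.0f;
   int32_t i;

   for( i = 0; i < numChunks; i++ )
      total += chunks[i].percentMissing *
               ( (float)chunks[i].size / (float)lineSize ) * timePerLineFetch;
   return total;
 }

int
main( void )
 { /* two input chunks, 64-byte lines, ~200 cycles per missed line */
   ChunkCost in[2] = { { 8192, 0.25f }, { 4096, 1.0f } };

   printf( "expected wait: %.0f cycles\n",
           expectedWaitTime( in, 2, 64, 200.0f ) );
   return 0;
 }
```

The candidate with the lowest total expected wait is the one pickASlave() would hand back, which is the same thing as the highest "goodness" score described in the notes.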
--- a/SSR.c  Fri Dec 28 09:38:49 2012 -0800
+++ b/SSR.c  Thu Jan 10 11:16:42 2013 -0800
@@ -250,21 +250,8 @@
       semanticEnv->fnSingletons[i].waitQ           = makeVMSQ();
       semanticEnv->transactionStrucs[i].waitingVPQ = makeVMSQ();
     }
-
-   VMSCommNode *
-    systemNode = VMS_SS__give_comm_hierarchy(); //this is read only!!
-
-   //Do something with the comm system here.. make own, faster, data
-   // structure that is used by assigner -- it can include info about
-   // measured miss rates, and structures that track the data..
-   //Next step would be to take a shot at a function call to put into the
-   // application that gives a "name" to collection of data consumed by
-   // a work-unit, and a name to the data produced.. along with the size
-   // each of those named collections.
-   //Then, can come up with a way to represent how much of each
-   // named data collection that resides in each of the caches.  Keep that
-   // representation in the data structure that build from parsing the
-   // comm system returned from VMS.
+
+   initAssigner( semanticEnv );
 }
 
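initAssigner(), now called from the SSR init code above, is still a stub in this changeset, and the shape of the node returned by VMS_SS__give_comm_hierarchy() is not shown anywhere in the diff. The sketch below assumes hypothetical fields (size, latency, bandWidth, numChildren, children) purely to illustrate the copy into writable assigner-side structures; it is not the VMS API.

```c
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Assumed (hypothetical) shape of the read-only comm-hierarchy node */
typedef struct CommNodeSketch CommNodeSketch;
struct CommNodeSketch
 { int32_t          size;        /* bytes of cache at this node */
   int32_t          latency;     /* port to the parent          */
   int32_t          bandWidth;   /* port to the parent          */
   int32_t          numChildren;
   CommNodeSketch **children;
 };

/* Writable mirror the assigner owns -- stands in for SSRCache + SSRCommPort,
 * which would also carry the circular buffer and measured miss rates.      */
typedef struct CacheCopy CacheCopy;
struct CacheCopy
 { int32_t     size, latency, bandWidth;
   int32_t     numChildren;
   CacheCopy **children;
   CacheCopy  *parent;
 };

/* Recursively copy the read-only hierarchy into assigner-side structures
 * (error checks omitted to keep the sketch short). */
static CacheCopy *
copyCommHierarchy( CommNodeSketch *node, CacheCopy *parent )
 { int32_t    i;
   CacheCopy *copy = malloc( sizeof(CacheCopy) );

   copy->size        = node->size;
   copy->latency     = node->latency;
   copy->bandWidth   = node->bandWidth;
   copy->parent      = parent;
   copy->numChildren = node->numChildren;
   copy->children    = malloc( node->numChildren * sizeof(CacheCopy *) );
   for( i = 0; i < node->numChildren; i++ )
      copy->children[i] = copyCommHierarchy( node->children[i], copy );
   return copy;
 }

int
main( void )
 { CommNodeSketch  l1a     = {  32*1024, 10, 16, 0, NULL };
   CommNodeSketch  l1b     = {  32*1024, 10, 16, 0, NULL };
   CommNodeSketch *kids[2] = { &l1a, &l1b };
   CommNodeSketch  l2      = { 256*1024, 40,  8, 2, kids };

   CacheCopy *root = copyCommHierarchy( &l2, NULL );
   printf( "copied hierarchy: root has %d children\n", (int)root->numChildren );
   return 0;
 }
```

The writable copy is where the circular buffers, freshness counts, and measured miss rates from the design notes would hang, which is why the read-only structure from VMS is mirrored rather than used directly.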
--- a/SSR_PluginFns.c  Fri Dec 28 09:38:49 2012 -0800
+++ b/SSR_PluginFns.c  Thu Jan 10 11:16:42 2013 -0800
@@ -27,11 +27,7 @@
 
 //============================== Assigner ==================================
 //
-/*For SSR, assigning a slave simply takes the next work-unit off the
- * ready-to-go work-unit queue and assigns it to the offered slot.
- *If the ready-to-go work-unit queue is empty, then nothing to assign
- * to the animation slot -- return FALSE to let Master loop know assigning
- * that slot failed.
+/*Pick the ready slave with the least expected data-gather time (see Assigner.c).
  */
 SlaveVP *
 SSR__assign_slaveVP_to_slot( void *_semEnv, AnimSlot *slot )
@@ -44,7 +40,9 @@
 
    semEnv = (SSRSemEnv *)_semEnv;
 
-   assignPr = readPrivQ( semEnv->readyVPQs[coreNum] );
+// assignPr = readPrivQ( semEnv->readyVPQs[coreNum] );
+   assignPr = pickASlave( semEnv, slot );
+
    //Note, using a non-blocking queue -- it returns NULL if queue empty
 #ifdef IDLE_SLAVES
    if(!assignPr){
@@ -321,3 +319,5 @@
 #endif
    writePrivQ( procr, semEnv->readyVPQs[ procr->coreAnimatedBy] );
 }
+
+
