VMS/VMS_Implementations/SSR_impls/SSR__MC_shared_impl

changeset 84:ce07f1a42ddf Dev__data_tracking_assign tip

Added skeleton for data tracking assigner + design notes
author Sean Halle <seanhalle@yahoo.com>
date Thu, 10 Jan 2013 11:16:42 -0800
parents a4bc9d8a8f11
children
files DataTrackingAssigner/Assigner.c Design_Notes__data_tracking_assigner.txt SSR.c SSR_PluginFns.c
diffstat 4 files changed, 327 insertions(+), 21 deletions(-) [+]
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/DataTrackingAssigner/Assigner.c	Thu Jan 10 11:16:42 2013 -0800
     1.3 @@ -0,0 +1,214 @@
     1.4 +/*
     1.5 + * Copyright 2010  OpenSourceCodeStewardshipFoundation
     1.6 + *
     1.7 + * Licensed under BSD
     1.8 + */
     1.9 +
    1.10 +#include <stdio.h>
    1.11 +#include <stdlib.h>
    1.12 +
    1.13 +#include "Queue_impl/PrivateQueue.h"
    1.14 +#include "SSR.h"
    1.15 +#include "SSR_Request_Handlers.h"
    1.16 +
    1.17 +//=========================== Local Fn Prototypes ===========================
    1.18 +
     1.19 +typedef struct _SSRCache    SSRCache;
     1.20 +typedef struct _SSRCommPort SSRCommPort;
     1.21 +typedef struct _CircBuff    CircBuff;
    1.22 +
    1.23 +
    1.24 +struct _SSRCache
    1.25 + { int32     size;
    1.26 +   CircBuff *dataChunks;
    1.27 +   SSRCommPort *ports;
    1.28 + };
    1.29 +
     1.30 +//gives the bandwidth and latency to a cache directly connected to this one
    1.31 +struct _SSRCommPort
    1.32 + {
    1.33 +   int32  BandWidth;
    1.34 +   int32  Latency;
    1.35 +   SSRCache *connectedCache;
    1.36 + };
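
/*A minimal sketch of how a port's BandWidth and Latency might combine
 * into a transfer time.  The units (bytes, cycles, bytes-per-cycle)
 * are assumptions -- nothing in this changeset fixes them.
 */
float32
timeAcrossPort( SSRCommPort *port, int32 numBytes )
 {    //fixed latency plus size-proportional transfer time
   return (float32)port->Latency +
          (float32)numBytes / (float32)port->BandWidth;
 }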
    1.37 +
    1.38 +//Data chunk -- probably need more things than this..
    1.39 +typedef struct
    1.40 + { int32  size;
    1.41 +   int32  ID;   //the "mostly" unique identifier -- maybe a hash of addr
    1.42 + }
    1.43 +SSRDataChunk;
    1.44 + 
    1.45 +
    1.46 + 
     1.47 +void
     1.48 +stepOnCache( SSRCache *cache, SSRDataChunk *chunk );
          +
          +float32
          +calcTimeToGetInputChunks( SSRCache *cacheHier, SlaveVP *candidateSlave,
          +                          int32 endCore );
          +float32
          +calcTimeToGetChunk( SSRCache *cacheHier, SSRDataChunk *chunk,
          +                    int32 endCore );
    1.49 +
    1.50 +//============================== Assigner ==================================
    1.51 +//
     1.52 +/*This evaluates the slave VPs available, and picks the one with the
     1.53 + * least total data travel time.
     1.54 + * 
     1.55 + *Will need a balancer for time spent calculating vs the time-width of
     1.56 +a work unit, given the number of available work-units..  IE, if
     1.57 +thousands of work units are available, and they're all small, then it's
     1.58 +counter-productive to calculate a score for every work-unit..  need a
     1.59 +"take best so far" threshold, after which stop looking.. that
     1.60 +threshold is based on work-unit sizes -- either set by latency/BW at
     1.61 +that level of the hierarchy or set by averaging predicted work-content.
     1.62 +
     1.63 + */
    1.64 +SlaveVP *
    1.65 +pickASlave( SSRSemEnv *semEnv, AnimSlot *slot )
     1.66 + { SlaveVP   *candidateSlave, *readySlaves, *bestSlave = NULL;
    1.67 +   int32      coreNum, slotNum;
    1.68 +   SSRCache  *cacheHier;
    1.69 +   
     1.70 +   float32    timeToGetInputChunks, lowestTime = 1e30; //start above any real time
    1.71 +   
    1.72 +   coreNum = slot->coreSlotIsOn;
    1.73 +   slotNum = slot->slotIdx;
    1.74 +   
    1.75 +   //Now, have some choices about how to organize -- if the number of
    1.76 +   // cores is small, or are keeping work separated geographically, 
    1.77 +   // then may make sense to calculate distance to every core each
    1.78 +   // time a slave is made ready. Then, have a separate priority queue
    1.79 +   // for each slave, and just pick the top when assign (and remove
    1.80 +   // that slave from all the other priority Qs)
    1.81 +   //     wait -- those values will degrade as caches get stepped on,
    1.82 +   //     so, have to keep a "freshness" count.. and gets complex..
    1.83 +   //Else, if too many cores, then keep an array with each slave, which
    1.84 +   // has one position for each core, which holds the time to gather 
    1.85 +   // the input chunks for that slave to that core plus a freshness
    1.86 +   // count.  Then, when search for the best slave, first look in the
    1.87 +   // array to see if have previously calculated for that slave, and
    1.88 +   // to see how fresh that calculation is.  If none, or old, then 
    1.89 +   // calculate the time and save it into the array, and remember the
    1.90 +   // slave with the lowest time -- pick that one.
    1.91 +
    1.92 +   //Here, just doing brute force -- calc time for all slaves for this
    1.93 +   // core, then pick the slave with lowest time
    1.94 +   
    1.95 +   //get all the candidate work
    1.96 +   readySlaves = semEnv->readySlaves; //linked list?
    1.97 +   
    1.98 +   //calc time for each, keeping lowest
    1.99 +   cacheHier = semEnv->cacheHier; //create hier during init
   1.100 +   candidateSlave = readySlaves;
   1.101 +   while( candidateSlave != NULL )
   1.102 +    { 
    1.103 +      timeToGetInputChunks =
    1.104 +         calcTimeToGetInputChunks( cacheHier, candidateSlave, coreNum );
    1.105 +      if( timeToGetInputChunks < lowestTime )
    1.106 +       { lowestTime = timeToGetInputChunks;
    1.107 +         bestSlave  = candidateSlave;
    1.108 +       }
    1.109 +      candidateSlave = candidateSlave->next; //next slave in list (assumes a link field in SlaveVP)
   1.110 +    }
   1.111 +   
   1.112 +   //return slave with lowest expected time
   1.113 +   return bestSlave;  //NULL is okay value to return
   1.114 + }
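
/*A minimal sketch of the per-slave array scheme described in the
 * comment above: one entry per core, stamped with a global step-on
 * counter so staleness can be detected.  All names here (the entry
 * type, stepOnCount, maxAge) are assumptions, not existing structures,
 * and entries would need to be initialized with calcStamp well in the
 * past.
 */
typedef struct
 { float32 time;      //time to gather this slave's inputs to that core
   int32   calcStamp; //value of a global stepOnCount when calculated
 }
GatherTimeEntry;

float32
freshGatherTime( GatherTimeEntry *entries, int32 coreNum,
                 int32 stepOnCount, int32 maxAge,
                 SSRCache *cacheHier, SlaveVP *slave )
 { GatherTimeEntry *e = &entries[coreNum];
   if( stepOnCount - e->calcStamp > maxAge ) //none yet, or gone stale
    { e->time      = calcTimeToGetInputChunks( cacheHier, slave, coreNum );
      e->calcStamp = stepOnCount;
    }
   return e->time;
 }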
   1.115 +
   1.116 +/*Calculate "goodness" of a work-unit based on
   1.117 + * expected time to move consumed data to the given core.
   1.118 + */
   1.119 +float32
   1.120 +calcTimeToGetInputChunks( SSRCache *cacheHier, SlaveVP *candidateSlave,
   1.121 +                          int32 endCore )
    1.122 + { SSRDataChunk *inputChunks, *chunk;
    1.123 +   SSRSemData   *semData;
    1.124 +   float32       totalTime;
    1.125 +   int32         numChunks, chunkPos;
    1.126 +   
    1.127 +   //Collect the input chunks required by the work
    1.128 +   semData = (SSRSemData *)candidateSlave->semanticData;
    1.129 + 
    1.130 +   inputChunks = semData->inputChunks; //Todo add to SSRSemData
    1.131 +   numChunks   = semData->numInputChunks;
    1.132 +   
    1.133 +   //calc time to get input chunks
    1.134 +   totalTime = 0;
    1.135 +   for( chunkPos = 0; chunkPos < numChunks; chunkPos++ )
          +    { chunk = &inputChunks[chunkPos]; //inputChunks is an array of chunks
   1.136 +      totalTime += calcTimeToGetChunk( cacheHier, chunk, endCore );
   1.137 +    }
   1.138 +   return totalTime;
   1.139 + }
   1.140 +
   1.141 +/*Walk the hierarchy, looking for caches that have a portion of the
   1.142 + * data chunk in them.
   1.143 + *For example, could start at the destination core, and work backwards
   1.144 + * to each of the caches in the path to main memory. In each of those,
   1.145 + * look for the data chunk, get the percent expected in the core, then
   1.146 + * calculate the time to the next level up for the remaining 
   1.147 + * percentage.
   1.148 + *Figure out some heuristic for resolving when a higher level cache
   1.149 + * gets part of what's in a lower level cache trampled on..  IE, a data
   1.150 + * chunk goes through the L2 cache, into the L1 cache.  Say 100% is in
   1.151 + * the L2, but only 25% fit in the L1.  Now, the L2 has all of what's
   1.152 + * in L1, plus the rest of the chunk.  Next, a
   1.153 + * different core gets a chunk that goes through the L2 cache and
    1.154 + * steps on that first chunk.  So now the question is: how much of
    1.155 + * what got stepped on came out of the 75% that was only in L2, vs
    1.156 + * the 25% that was in both L1 and L2..
   1.157 + * 
   1.158 + */
          +float32
    1.159 +calcTimeToGetChunk( SSRCache *cacheHier, SSRDataChunk *chunk, int32 endCore )
    1.160 + {
    1.161 +   return 0; //TODO: walk the hierarchy as described above
    1.162 + }
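
/*A minimal sketch of the walk described above, under stated
 * assumptions: cacheForCore() and percentOfChunkInCache() are
 * hypothetical helpers, ports[0] is taken to be the port toward main
 * memory (with connectedCache == NULL at the top), and timeAcrossPort()
 * is the sketch given earlier in this file.
 */
float32
calcTimeToGetChunk_sketch( SSRCache *cacheHier, SSRDataChunk *chunk,
                           int32 endCore )
 { SSRCache *cache;
   float32   time = 0, percentStillMissing = 1.0;

   cache = cacheForCore( cacheHier, endCore ); //hypothetical: dest core's L1
   while( cache != NULL )
    {    //whatever is found at this level doesn't travel from higher up
      percentStillMissing -= percentOfChunkInCache( cache, chunk );
      if( percentStillMissing <= 0 ) break;
         //the rest must cross the port toward the next level up
      time += timeAcrossPort( &cache->ports[0],
                 (int32)(percentStillMissing * chunk->size) );
      cache = cache->ports[0].connectedCache;
    }
   return time;
 }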
   1.163 +
   1.164 +/*A named data-chunk travels through the cache hierarchy, and
   1.165 + * "steps on" each cache in its path.
   1.166 + *Step-on means add data-chunk to top of circular buffer, expelling
   1.167 + * bottom, and causing re-calculation of percent left for all chunks
   1.168 + * in buffer
   1.169 + */
   1.170 +void
   1.171 +stepOnCache( SSRCache *cache, SSRDataChunk *chunk )
   1.172 + { 
   1.173 +   
   1.174 + }
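
/*A minimal sketch of step-on.  CircBuff is opaque in this changeset,
 * so the entry layout, the entries/numEntries fields, and the
 * insertAtTop() helper are all assumptions.  The eviction rule is the
 * crude fully-associative approximation from the design notes: a new
 * chunk tramples a proportional share of everything already present.
 */
typedef struct
 { int32   chunkID;
   int32   size;
   float32 percentLeft;
 }
CircBuffEntry;  //assumed layout of one circular-buffer slot

void
stepOnCache_sketch( SSRCache *cache, SSRDataChunk *chunk )
 { int32   i;
   float32 evictedFraction;

      //fraction of this cache the incoming chunk occupies, capped at 1
   evictedFraction = (float32)chunk->size / (float32)cache->size;
   if( evictedFraction > 1.0 ) evictedFraction = 1.0;

   for( i = 0; i < cache->dataChunks->numEntries; i++ )  //assumed fields
    { CircBuffEntry *entry = &cache->dataChunks->entries[i];
      entry->percentLeft *= (1.0 - evictedFraction); //older data trampled
    }
      //hypothetical helper: push on top, expelling the bottom when full
   insertAtTop( cache->dataChunks, chunk->ID, chunk->size, 1.0 );
 }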
   1.175 +
   1.176 +/*This function is called after the assigner has chosen a work-unit
   1.177 + * to assign to the offered core.  It is given a data-chunk, what
   1.178 + * percent of it is travelling, the
   1.179 + * starting mem or cache that chunk is in, and the ending cache that
   1.180 + * chunk travels to.
   1.181 + * 
   1.182 + *This function then walks the hierarchy of cache structures, from the
   1.183 + * location data starts, to the destination.  At each cache along the
   1.184 + * way, it calls "step on", handing it the data chunk.
   1.185 + * 
   1.186 + *The caller should use a heuristic to simplify issues of "when"
   1.187 + * does the data travel, and "how much" travels.
   1.188 + * In other words, if an L1 source
   1.189 + * cache is calculated to have 50% of a data-chunk, then only 50%
   1.190 + * travels from it, and the other 50% from main-memory.
    1.191 + * This may complicate handling of, for example, an L2 that both
    1.192 + * halves travel through, where the two halves should be merged.
   1.193 + *Use micro-benchmarks to measure behaviors and make heuristic.  
   1.194 + */
    1.195 +void
    1.196 +travelTheHierarchy( SSRCache *sourceCache, SSRCache *endCache,
          +                    SSRDataChunk *chunk, float32 percentTravelling )
    1.197 + {
    1.198 +   //  
    1.199 + }
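
/*A minimal sketch of the walk, under stated assumptions: the path
 * helper nextCacheToward() is hypothetical (with only the structures
 * in this file, a real version would climb from both ends toward a
 * common ancestor), and a fuller version would scale chunk->size by
 * the travelling percent before stepping on each cache.
 */
void
travelTheHierarchy_sketch( SSRCache *sourceCache, SSRCache *endCache,
                           SSRDataChunk *chunk )
 { SSRCache *cache = sourceCache;
   while( cache != NULL )
    { stepOnCache( cache, chunk ); //every cache on the path gets stepped on
      if( cache == endCache ) break;
      cache = nextCacheToward( cache, endCache ); //hypothetical helper
    }
 }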
   1.200 +
   1.201 +/*The assigner has to create the data structures representing the 
   1.202 + * caches, each of which holds a circular buffer of the data-chunks
   1.203 + * that have stepped on the cache.
   1.204 + *Each cache structure has ports representing the connections to the
   1.205 + * other caches on the chip.
   1.206 + */
   1.207 +void
   1.208 +initAssigner( SSRSemEnv *semEnv )
   1.209 + {
   1.210 +   VMSCommNode *
   1.211 +   systemNode = VMS_SS__give_comm_hierarchy(); //this is read only!!
   1.212 +   
   1.213 +   //copy the cache hierarchy from systemNode into Cache structs,
   1.214 +   // connected by ports.
   1.215 + }
   1.216 +
   1.217 +
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/Design_Notes__data_tracking_assigner.txt	Thu Jan 10 11:16:42 2013 -0800
     2.3 @@ -0,0 +1,105 @@
     2.4 +
     2.5 +The approach of proto-runtime to assignment: it is 
     2.6 +hardware-availability driven.  So, the assigner is triggered at the 
     2.7 +point hardware becomes available.  It then looks at the work available
     2.8 + and picks what work to assign to that hardware.
     2.9 +
    2.10 +This means that the assigner is written so that it receives a hardware
    2.11 + opportunity as input, and calculates a "goodness" measure for each 
    2.12 +work unit available.  It then assigns the best work-unit to that given
    2.13 + hardware.
    2.14 +
    2.15 +Track data movement
    2.16 + through caches, and calculate "goodness" of a work-unit based on
    2.17 + expected time to move consumed data to the given core.
    2.18 +
    2.19 +-] Struct for each cache
    2.20 +-] Has a port, with time, to each connected cache
    2.21 +-] Has a circular buffer of data-names, with sizes, and percent left
    2.22 +-] Has a total size
    2.23 +
     2.24 +Will need a balancer for time spent calculating vs the time-width of
     2.25 +a work unit, given the number of available work-units..  IE, if
     2.26 +thousands of work units are available, and they're all small, then it's
     2.27 +counter-productive to calculate a score for every work-unit..  need a
     2.28 +"take best so far" threshold, after which stop looking (sketched just
     2.29 +below).. that threshold is based on work-unit sizes -- either set by
     2.30 +latency/BW at that level of the hierarchy or set by averaging
          +predicted work-content.
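
A minimal sketch of that cut-off, assuming a 'next' link field in
SlaveVP and leaving derivation of the goodEnough threshold to the
caller; calcTimeToGetInputChunks() is the function from Assigner.c:

   SlaveVP *
   pickFirstGoodEnough( SlaveVP *readySlaves, float32 goodEnough,
                        SSRCache *cacheHier, int32 endCore )
    { SlaveVP *best = NULL, *cand;
      float32  bestTime = 1e30, t; //start above any real time
      for( cand = readySlaves; cand != NULL; cand = cand->next )
       { t = calcTimeToGetInputChunks( cacheHier, cand, endCore );
         if( t < bestTime ) { bestTime = t; best = cand; }
         if( bestTime <= goodEnough ) break; //take best so far, stop looking
       }
      return best;
    }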
    2.31 +
     2.32 +A named data-chunk travels through the cache
     2.33 + hierarchy, and "steps on" each cache in its path, starting at the
     2.34 + core that created the data, going to the core that consumes it.  
    2.35 +
    2.36 +-] add data-chunk to top of circular buffer, expelling bottom, and
    2.37 +   causing re-calculation of percent left of itself and all below
    2.38 +
     2.39 +Once the assigner has chosen a work-unit to assign to the given core,
     2.40 + record each cache that input data travels through, starting at the
     2.41 + place the data travels from, ending at the given core.  Make a
     2.42 + heuristic to simplify the issues of "when" the data travels, and
     2.43 + "how much" travels (IE, if the L1 cache is calculated to have 50% of
     2.44 + a data-chunk, then only 50% travels through L2 from main-memory --
     2.45 + this is at least the second time data from that chunk has traveled
     2.46 + through L2, so what portion is expected to be in L2? Use
     2.47 + micro-benchmarks to measure behaviors and make the heuristic).  
    2.48 +
    2.49 +-] Upon choice, walk the data-chunk through the cache hierarchy, 
    2.50 +   calling "step on" for each one.
    2.51 +
    2.52 +One idea for managing is to keep a list, for each cache, of the
    2.53 + data-chunks that have landed on it, in the order they landed.  At
    2.54 + the point the assigner does a recording of data landing on a cache,
    2.55 + add the landing-event to a list.  The list keeps the order of
    2.56 + data-chunks landing on a cache, with the size of each.  
    2.57 +
     2.58 +At the point we want to calculate the percentage of a given data chunk
     2.59 + that resides in a given cache, find that chunk-name in the list, and
     2.60 + look at all the chunks that have stepped on the cache after it.
     2.61 + Create a heuristic to estimate the percent of the target chunk still
     2.62 + in the cache, given the cache size, cache type (direct, vs number of
     2.63 + set-ways), and the chunks that have stepped on the cache after the
     2.64 + target chunk passed through (one such heuristic is sketched below).
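
A minimal sketch of one such heuristic, a fully-associative
approximation (a set-associative version would also account for the
number of ways); the names are placeholders:

   //every byte that stepped on the cache after the target chunk
   // tramples a proportional share of it
   float32
   estimatePercentLeft( int32 cacheSize, int32 bytesSteppedOnAfter )
    { float32 fractionTrampled =
         (float32)bytesSteppedOnAfter / (float32)cacheSize;
      if( fractionTrampled >= 1.0 ) return 0.0;
      return 1.0 - fractionTrampled;
    }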
    2.65 +
     2.66 +-] calculate percent left for each chunk in the cache, at the point
     2.67 +   the cache gets "stepped on" with a new chunk.
    2.68 +-] To find a given chunk in the caches, maybe keep a separate hash
    2.69 +   table whose entry is a linked list of cache entries. -- IE, each
    2.70 +   time a chunk steps on a cache, create an entry in the cache's 
    2.71 +   circular buffer, and also create a link to that entry in a linked
    2.72 +   list.  The list has one link for each cache the chunk steps on.
    2.73 +   The list is placed into a hash table, with the chunk's name as the
    2.74 +   key.  When a cache entry's percent drops to zero, then remove that
    2.75 +   link. When last link is removed, remove the hash entry. 
     2.76 +-] So, for a given work-unit in the readyQ, for each input chunk,
     2.77 +   look in the hash table to get the entry for that chunk.  If none,
     2.78 +   the cost is a main-memory fetch.  Else, go down the linked list for
     2.79 +   the chunk, and calculate a percent divided by a weighted distance
     2.80 +   for each (the larger the distance, the less the percent matters --
     2.81 +   a small percent close beats a large percent far away).  Take the
     2.82 +   best percent/dist as the score for that chunk in that work-unit.
     2.83 +   Then, sum up over all input chunks, and divide by the number of
     2.84 +   chunks (or an appropriate way to combine)..  that's the score for
     2.85 +   that work-unit..  the largest score is the one chosen (see the
          +   sketch after this list).
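
A minimal sketch of that scoring pass.  The ChunkResidence list type,
the findChunkResidences() hash lookup, weightedDist(), and the
MAIN_MEM_SCORE placeholder are all assumptions, not existing API:

   typedef struct _ChunkResidence ChunkResidence;
   struct _ChunkResidence          //one link per cache the chunk is in
    { SSRCache       *cache;
      float32         percentLeft;
      ChunkResidence *next;
    };

   float32
   scoreWorkUnit( SSRDataChunk *chunks, int32 numChunks, int32 endCore )
    { float32 total = 0;
      int32   i;
      for( i = 0; i < numChunks; i++ )
       { ChunkResidence *res = findChunkResidences( chunks[i].ID );
         float32 best = MAIN_MEM_SCORE; //no cached copy: memory-fetch score
         for( ; res != NULL; res = res->next )
          { float32 score = res->percentLeft
                            / weightedDist( res->cache, endCore );
            if( score > best ) best = score;
          }
         total += best;
       }
      return total / numChunks;  //combined score (assumes numChunks > 0)
    }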
    2.86 +
    2.87 +Calculate the "goodness" of a candidate work-unit. For each data-chunk
    2.88 + that unit consumes, calculate expected percent of the chunk that
    2.89 + resides in each cache.  Given the expected percent, use standard
    2.90 + cache eqns to calculate expected time spent waiting for data transfer
    2.91 + (percent missing times chunk-size divided by line-size times time to
    2.92 + get a line).
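
That equation, written out as a minimal sketch (the names are
placeholders):

   float32
   expectedWaitTime( float32 percentMissing, int32 chunkSize,
                     int32 lineSize, float32 timePerLine )
    { float32 missingBytes = percentMissing * (float32)chunkSize;
      return ( missingBytes / (float32)lineSize ) * timePerLine;
    }

For example, 50% missing of a 64KB chunk with 64B lines at 100 cycles
per line gives 0.5 * 65536 / 64 * 100 = 51200 cycles of expected wait.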
    2.93 +
    2.94 +Sum up expected wait-time for each work-unit candidate.  That gives a
    2.95 + measure of "goodness" of choosing that work-unit for the current
    2.96 + hardware-opportunity.  This is where the interesting work comes in,
    2.97 + of defining an assignment-choosing algorithm that will deliver the
    2.98 + best performance..
    2.99 +
   2.100 +(Shoaib mentioned defining several alternatives, and perhaps using
   2.101 + measurements of the application to choose which is best..)
   2.102 +
   2.103 +
   2.104 + 
   2.105 +
   2.106 +
   2.107 +
   2.108 +
     3.1 --- a/SSR.c	Fri Dec 28 09:38:49 2012 -0800
     3.2 +++ b/SSR.c	Thu Jan 10 11:16:42 2013 -0800
     3.3 @@ -250,21 +250,8 @@
     3.4        semanticEnv->fnSingletons[i].waitQ             = makeVMSQ();
     3.5        semanticEnv->transactionStrucs[i].waitingVPQ   = makeVMSQ();
     3.6      }
     3.7 -   
     3.8 -   VMSCommNode *
     3.9 -   systemNode = VMS_SS__give_comm_hierarchy(); //this is read only!!
    3.10 -   
    3.11 -   //Do something with the comm system here.. make own, faster, data
    3.12 -   // structure that is used by assigner -- it can include info about
    3.13 -   // measured miss rates, and structures that track the data..
    3.14 -   //Next step would be to take a shot at a function call to put into the
    3.15 -   // application that gives a "name" to collection of data consumed by
    3.16 -   // a work-unit, and a name to the data produced..  along with the size
    3.17 -   // each of those named collections.
    3.18 -   //Then, can come up with a way to represent how much of each 
    3.19 -   // named data collection that resides in each of the caches.  Keep that
    3.20 -   // representation in the data structure that build from parsing the
    3.21 -   // comm system returned from VMS.
    3.22 + 
    3.23 +   initAssigner( semanticEnv );
    3.24   }
    3.25  
    3.26  
     4.1 --- a/SSR_PluginFns.c	Fri Dec 28 09:38:49 2012 -0800
     4.2 +++ b/SSR_PluginFns.c	Thu Jan 10 11:16:42 2013 -0800
     4.3 @@ -27,11 +27,7 @@
     4.4  
     4.5  //============================== Assigner ==================================
     4.6  //
     4.7 -/*For SSR, assigning a slave simply takes the next work-unit off the
     4.8 - * ready-to-go work-unit queue and assigns it to the offered slot.
     4.9 - *If the ready-to-go work-unit queue is empty, then nothing to assign
    4.10 - * to the animation slot -- return FALSE to let Master loop know assigning
    4.11 - * that slot failed.
     4.12 +/*For SSR with data tracking, assigning a slave calls pickASlave(),
          + * which scores the ready slaves by expected time to move their input
          + * data to the offered core, and returns the best one (NULL when no
          + * ready slave exists, letting the Master loop know the slot is idle).
    4.13   */
    4.14  SlaveVP *
    4.15  SSR__assign_slaveVP_to_slot( void *_semEnv, AnimSlot *slot )
    4.16 @@ -44,7 +40,9 @@
    4.17     
    4.18     semEnv  = (SSRSemEnv *)_semEnv;
    4.19  
    4.20 -   assignPr = readPrivQ( semEnv->readyVPQs[coreNum] );
    4.21 +//   assignPr = readPrivQ( semEnv->readyVPQs[coreNum] );
     4.22 +   assignPr = pickASlave( semEnv, slot );
    4.23 +   
    4.24        //Note, using a non-blocking queue -- it returns NULL if queue empty
    4.25     #ifdef IDLE_SLAVES
    4.26     if(!assignPr){
    4.27 @@ -321,3 +319,5 @@
    4.28     #endif
    4.29     writePrivQ( procr, semEnv->readyVPQs[ procr->coreAnimatedBy] );
    4.30   }
    4.31 +
    4.32 +