# HG changeset patch
# User Sean Halle <seanhalle@yahoo.com>
# Date 1357845402 28800
# Node ID ce07f1a42ddfebf3ea71d2e7db34f955e6ba328b
# Parent  a4bc9d8a8f111cbf19f68b94929b520e6fa185ce
Added skeleton for data tracking assigner + design notes

diff -r a4bc9d8a8f11 -r ce07f1a42ddf DataTrackingAssigner/Assigner.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/DataTrackingAssigner/Assigner.c	Thu Jan 10 11:16:42 2013 -0800
@@ -0,0 +1,214 @@
+/*
+ * Copyright 2010  OpenSourceCodeStewardshipFoundation
+ *
+ * Licensed under BSD
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "Queue_impl/PrivateQueue.h"
+#include "SSR.h"
+#include "SSR_Request_Handlers.h"
+
+//=========================== Local Fn Prototypes ===========================
+
+typedef _SSRCache    SSRCache;
+typedef _SSRCommPort SSRCommPort;
+typedef _CircBuff CircBuff;
+
+
+struct _SSRCache
+ { int32     size;
+   CircBuff *dataChunks;
+   SSRCommPort *ports;
+ };
+
+//says the BW and latency to a cache directly connected to this one
+struct _SSRCommPort
+ {
+   int32  BandWidth;
+   int32  Latency;
+   SSRCache *connectedCache;
+ };
+
+//Data chunk -- probably need more things than this..
+typedef struct
+ { int32  size;
+   int32  ID;   //the "mostly" unique identifier -- maybe a hash of addr
+ }
+SSRDataChunk;
+ 
+
+ 
+void
+StepOnCache( SSRCache *cache, SSRDataChunk *chunk );
+
+//============================== Assigner ==================================
+//
+/*This evaluates the slave VPs available, and picks the one with the
+ * least total data travel time.
+ * 
+ *Will need balancer for time spent calculating vs time-width of work
+unit..  given the number of available work-units..  IE, if thousands
+of work units available, and they're all small, then counter-productive
+to calculate a score for every work-unit..  need a "take best so far"
+threshold, after which stop looking.. that threshold is based on
+work-unit sizes -- either set by latency/BW at that level of hierarchy
+or set by averaging predicted work-content.
+
+ */
+SlaveVP *
+pickASlave( SSRSemEnv *semEnv, AnimSlot *slot )
+ { SlaveVP   *candidateSlave, *readySlaves, *bestSlave;
+   int32      coreNum, slotNum;
+   SSRCache  *cacheHier;
+   
+   float32    timeToGetInputChunks, lowestTime = 0;
+   
+   coreNum = slot->coreSlotIsOn;
+   slotNum = slot->slotIdx;
+   
+   //Now, have some choices about how to organize -- if the number of
+   // cores is small, or are keeping work separated geographically, 
+   // then may make sense to calculate distance to every core each
+   // time a slave is made ready. Then, have a separate priority queue
+   // for each slave, and just pick the top when assign (and remove
+   // that slave from all the other priority Qs)
+   //     wait -- those values will degrade as caches get stepped on,
+   //     so, have to keep a "freshness" count.. and gets complex..
+   //Else, if too many cores, then keep an array with each slave, which
+   // has one position for each core, which holds the time to gather 
+   // the input chunks for that slave to that core plus a freshness
+   // count.  Then, when search for the best slave, first look in the
+   // array to see if have previously calculated for that slave, and
+   // to see how fresh that calculation is.  If none, or old, then 
+   // calculate the time and save it into the array, and remember the
+   // slave with the lowest time -- pick that one.
+
+   //Here, just doing brute force -- calc time for all slaves for this
+   // core, then pick the slave with lowest time
+   
+   //get all the candidate work
+   readySlaves = semEnv->readySlaves; //linked list?
+   
+   //calc time for each, keeping lowest
+   cacheHier = semEnv->cacheHier; //create hier during init
+   candidateSlave = readySlaves;
+   while( candidateSlave != NULL )
+    { 
+      timeToGetInputChunks =
+         calcTimeToGetInputChunks( cacheHier, candidateSlave, endCore );
+      if(timeToGetInputChunks < lowestTime )
+       { lowestTime = timeToGetInputChunks;
+         bestSlave  = candidateSlave;
+       }
+      candidateSlave = ;//next slave in list
+    }
+   
+   //return slave with lowest expected time
+   return bestSlave;  //NULL is okay value to return
+ }
+
+/*Calculate "goodness" of a work-unit based on
+ * expected time to move consumed data to the given core.
+ */
+float32
+calcTimeToGetInputChunks( SSRCache *cacheHier, SlaveVP *candidateSlave,
+                          int32 endCore )
+ { SSRDataChunk *inputChunks, *chunk;
+   SSRSemData semData;
+   int32 totalTime, chunkPos;
+   
+   //Collect the input chunks required by the work
+   semData = (SSRSemData)candidateSlave->semanticData;
+ 
+   inputChunks = semData->inputChunks; //Todo add to SSRSemData
+   numChunks = semData->numInputChunks;
+   
+   //calc time to get input chunks
+   totalTime = 0;
+   for( chunkPos = 0; chunkPos < numChunks; chunkPos++ )
+    { chunk = inputChunks[chunkPos];
+      totalTime += calcTimeToGetChunk( cacheHier, chunk, endCore );
+    }
+   return totalTime;
+ }
+
+/*Walk the hierarchy, looking for caches that have a portion of the
+ * data chunk in them.
+ *For example, could start at the destination core, and work backwards
+ * to each of the caches in the path to main memory. In each of those,
+ * look for the data chunk, get the percent expected in the core, then
+ * calculate the time to the next level up for the remaining 
+ * percentage.
+ *Figure out some heuristic for resolving when a higher level cache
+ * gets part of what's in a lower level cache trampled on..  IE, a data
+ * chunk goes through the L2 cache, into the L1 cache.  Say 100% is in
+ * the L2, but only 25% fit in the L1.  Now, the L2 has all of what's
+ * in L1, plus the rest of the chunk.  Next, a
+ * different core gets a chunk that goes through the L2 cache and
+ * steps on that first chunk.  So now, of the how much of the 75% 
+ * that used to be in L2 that was different from what's in L1 was
+ * what got stepped on, vs the 25% that was in both..
+ * 
+ */
+calcTimeToGetChunk( SSRCache *cacheHier, SSRDataChunk *chunk, int32 endCore )
+ {
+   
+ }
+
+/*A named data-chunk travels through the cache hierarchy, and
+ * "steps on" each cache in its path.
+ *Step-on means add data-chunk to top of circular buffer, expelling
+ * bottom, and causing re-calculation of percent left for all chunks
+ * in buffer
+ */
+void
+stepOnCache( SSRCache *cache, SSRDataChunk *chunk )
+ { 
+   
+ }
+
+/*This function is called after the assigner has chosen a work-unit
+ * to assign to the offered core.  It is given a data-chunk, what
+ * percent of it is travelling, the
+ * starting mem or cache that chunk is in, and the ending cache that
+ * chunk travels to.
+ * 
+ *This function then walks the hierarchy of cache structures, from the
+ * location data starts, to the destination.  At each cache along the
+ * way, it calls "step on", handing it the data chunk.
+ * 
+ *The caller should use a heuristic to simplify issues of "when"
+ * does the data travel, and "how much" travels.
+ * In other words, if an L1 source
+ * cache is calculated to have 50% of a data-chunk, then only 50%
+ * travels from it, and the other 50% from main-memory.
+ * This may complicate, for example, an L2 that both halves travel 
+ * through, where the two halves should be merged.
+ *Use micro-benchmarks to measure behaviors and make heuristic.  
+ */
+void
+travelTheHierarchy( SSRCache *sourceCache, SSRCache *endCache )
+ {
+   //  
+ }
+
+/*The assigner has to create the data structures representing the 
+ * caches, each of which holds a circular buffer of the data-chunks
+ * that have stepped on the cache.
+ *Each cache structure has ports representing the connections to the
+ * other caches on the chip.
+ */
+void
+initAssigner( SSRSemEnv *semEnv )
+ {
+   VMSCommNode *
+   systemNode = VMS_SS__give_comm_hierarchy(); //this is read only!!
+   
+   //copy the cache hierarchy from systemNode into Cache structs,
+   // connected by ports.
+ }
+
+
diff -r a4bc9d8a8f11 -r ce07f1a42ddf Design_Notes__data_tracking_assigner.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Design_Notes__data_tracking_assigner.txt	Thu Jan 10 11:16:42 2013 -0800
@@ -0,0 +1,105 @@
+
+The approach of proto-runtime to assignment: it is 
+hardware-availability driven.  So, the assigner is triggered at the 
+point hardware becomes available.  It then looks at the work available
+ and picks what work to assign to that hardware.
+
+This means that the assigner is written so that it receives a hardware
+ opportunity as input, and calculates a "goodness" measure for each 
+work unit available.  It then assigns the best work-unit to that given
+ hardware.
+
+Track data movement
+ through caches, and calculate "goodness" of a work-unit based on
+ expected time to move consumed data to the given core.
+
+-] Struct for each cache
+-] Has a port, with time, to each connected cache
+-] Has a circular buffer of data-names, with sizes, and percent left
+-] Has a total size
+
+Will need balancer for time spent calculating vs time-width of work
+unit..  given the number of available work-units..  IE, if thousands
+of work units available, and they're all small, then counter-productive
+to calculate a score for every work-unit..  need a "take best so far"
+threshold, after which stop looking.. that threshold is based on
+work-unit sizes -- either set by latency/BW at that level of hierarchy
+or set by averaging predicted work-content.
+
+A named data-chunk travels through the cache
+ hierarchy, and "steps on" each cache in its path, starting at core
+ that created the data, going to core that consumes.  
+
+-] add data-chunk to top of circular buffer, expelling bottom, and
+   causing re-calculation of percent left of itself and all below
+
+Once the assigner has chosen a work-unit to assign to the given core,
+ record each cache that input data travels through, starting at the
+ place the data travels from, ending at the given core.  Make a
+ heuristic to simplify issues of "when" does the data travel, and
+ "how much" travels (IE, if L1 cache is calculated to have 50% of
+ data-chunk, then only 50% travels through L2 from main-memory --
+ this is at least the second time data from that chunk has traveled
+ through L2, so what portion is expected to be in L2? Use
+ micro-benchmarks to measure behaviors and make heuristic).  
+
+-] Upon choice, walk the data-chunk through the cache hierarchy, 
+   calling "step on" for each one.
+
+One idea for managing is to keep a list, for each cache, of the
+ data-chunks that have landed on it, in the order they landed.  At
+ the point the assigner does a recording of data landing on a cache,
+ add the landing-event to a list.  The list keeps the order of
+ data-chunks landing on a cache, with the size of each.  
+
+At the point want to calculate the percentage of a given data chunk
+ that resides in a given cache, find that chunk-name in the list, and
+ look at all the chunks that have stepped on the cache after it.
+  Create a heuristic to estimate percent of target chunk still in the
+ cache, given the cache size, cache type (direct, vs number of
+ set-ways), and the chunks that have stepped on the cache after the
+ target chunk passed through.
+
+-] calculate percent left for each chunk in cache, at point cache gets
+   "step on" with a new chunk.
+-] To find a given chunk in the caches, maybe keep a separate hash
+   table whose entry is a linked list of cache entries. -- IE, each
+   time a chunk steps on a cache, create an entry in the cache's 
+   circular buffer, and also create a link to that entry in a linked
+   list.  The list has one link for each cache the chunk steps on.
+   The list is placed into a hash table, with the chunk's name as the
+   key.  When a cache entry's percent drops to zero, then remove that
+   link. When last link is removed, remove the hash entry. 
+-] So, for a given work-unit in the readyQ, for each input chunk,
+   look in hash table to get entry for that chunk.  If none, cost is
+   a main-memory fetch.  Else, go down linked list for the chunk, and
+   calculate a percent divided by a weighted distance for each (the larger the
+   distance, the less the percent matters -- a small percent close
+   beats a large percent far away).  Take the best percent/dist as the
+   score for that chunk in that work-unit.  Then, sum up over all
+   input chunks, and divide by number of chunks (or appropriate way
+   to combine)..  that's the score for that work-unit..  the largest
+   score is the one chosen.
+
+Calculate the "goodness" of a candidate work-unit. For each data-chunk
+ that unit consumes, calculate expected percent of the chunk that
+ resides in each cache.  Given the expected percent, use standard
+ cache eqns to calculate expected time spent waiting for data transfer
+ (percent missing times chunk-size divided by line-size times time to
+ get a line).
+
+Sum up expected wait-time for each work-unit candidate.  That gives a
+ measure of "goodness" of choosing that work-unit for the current
+ hardware-opportunity.  This is where the interesting work comes in,
+ of defining an assignment-choosing algorithm that will deliver the
+ best performance..
+
+(Shoaib mentioned defining several alternatives, and perhaps using
+ measurements of the application to choose which is best..)
+
+
+ 
+
+
+
+
diff -r a4bc9d8a8f11 -r ce07f1a42ddf SSR.c
--- a/SSR.c	Fri Dec 28 09:38:49 2012 -0800
+++ b/SSR.c	Thu Jan 10 11:16:42 2013 -0800
@@ -250,21 +250,8 @@
       semanticEnv->fnSingletons[i].waitQ             = makeVMSQ();
       semanticEnv->transactionStrucs[i].waitingVPQ   = makeVMSQ();
     }
-   
-   VMSCommNode *
-   systemNode = VMS_SS__give_comm_hierarchy(); //this is read only!!
-   
-   //Do something with the comm system here.. make own, faster, data
-   // structure that is used by assigner -- it can include info about
-   // measured miss rates, and structures that track the data..
-   //Next step would be to take a shot at a function call to put into the
-   // application that gives a "name" to collection of data consumed by
-   // a work-unit, and a name to the data produced..  along with the size
-   // each of those named collections.
-   //Then, can come up with a way to represent how much of each 
-   // named data collection that resides in each of the caches.  Keep that
-   // representation in the data structure that build from parsing the
-   // comm system returned from VMS.
+ 
+   initAssigner( semanticEnv );
  }
 
 
diff -r a4bc9d8a8f11 -r ce07f1a42ddf SSR_PluginFns.c
--- a/SSR_PluginFns.c	Fri Dec 28 09:38:49 2012 -0800
+++ b/SSR_PluginFns.c	Thu Jan 10 11:16:42 2013 -0800
@@ -27,11 +27,7 @@
 
 //============================== Assigner ==================================
 //
-/*For SSR, assigning a slave simply takes the next work-unit off the
- * ready-to-go work-unit queue and assigns it to the offered slot.
- *If the ready-to-go work-unit queue is empty, then nothing to assign
- * to the animation slot -- return FALSE to let Master loop know assigning
- * that slot failed.
+/*
  */
 SlaveVP *
 SSR__assign_slaveVP_to_slot( void *_semEnv, AnimSlot *slot )
@@ -44,7 +40,9 @@
    
    semEnv  = (SSRSemEnv *)_semEnv;
 
-   assignPr = readPrivQ( semEnv->readyVPQs[coreNum] );
+//   assignPr = readPrivQ( semEnv->readyVPQs[coreNum] );
+   assignPr = pickASlave( slot, coreNum );
+   
       //Note, using a non-blocking queue -- it returns NULL if queue empty
    #ifdef IDLE_SLAVES
    if(!assignPr){
@@ -321,3 +319,5 @@
    #endif
    writePrivQ( procr, semEnv->readyVPQs[ procr->coreAnimatedBy] );
  }
+
+