# HG changeset patch
# User Some Random Person <seanhalle@yahoo.com>
# Date 1331688605 25200
# Node ID bd5ab695145c6a36479acc50e4427b3e811882d0
# Parent  a32504bb2a1cee072f697e509d9c649709d57ed7
MEAS__ macros for language added, and renamed a few things

diff -r a32504bb2a1c -r bd5ab695145c DESIGN_NOTES.txt
--- a/DESIGN_NOTES.txt	Tue Mar 13 10:04:14 2012 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,212 +0,0 @@
-
-From e-mail to Albert, on design of app-virt-procr to core-loop animation
-switch and back.
-
-====================
-General warnings about this code:
-It only compiles in GCC 4.x  (label addr and computed goto)
-Has assembly for x86  32bit
-
-
-====================
-AVProcr data-struc has: stack-ptr, jump-ptr, data-ptr, slotNum, coreloop-ptr
- and semantic-custom-ptr
-
-The VMS Creator: takes ptr to function and ptr to initial data
--- creates a new AVProcr struc
--- sets the jmp-ptr field to the ptr-to-function passed in
--- sets the data-ptr to ptr to initial data passed in
--- if this is for a suspendable virt  processor, then create a stack and set
-   the stack-ptr
-
-VMS_int__create_slaveVP( AVProcrFnPtr fnPtr, void *initialData )
-{
-AVProcr   newSlv = malloc( sizeof(AVProcr) );
-newSlv->jmpPtr = fnPtr;
-newSlv->coreCtlrDonePt = &CoreCtlrDonePt; //label is in coreCtlr
-newSlv->data = initialData;
-newSlv->stackPtr = createNewStack();
-return newSlv;
-}
-
-The semantic layer can then add its own state in the cusom-ptr field
-
-The Scheduler plug-in:
--- Sets slave-ptr in AVProcr, and points the slave to AVProcr
--- if non-suspendable, sets the AVProcr's stack-ptr to the slave's stack-ptr
-
-MasterLoop:
--- puts AVProcr structures onto the workQ
-
-CoreCtlr:
--- gets stack-ptr out of AVProcr and sets the core's stack-ptr to that
--- gets data-ptr out of AVProcr and puts it into reg GCC uses for that param
--- puts AVProcr's addr into reg GCC uses for the AVProcr-pointer param
--- jumps to the addr in AVProcr's jmp-ptr field
-CoreCtlr()
-{ while( FOREVER )
- { nextSlv = readQ( workQ );  //workQ is static (global) var declared volatile
-   <dataPtr-param-register>       = nextSlv->data;
-   <AVProcrPtr-param-register> = nextSlv;
-   <stack-pointer register>          = nextSlv->stackPtr;
-   jmp nextSlv->jmpPtr;
-CoreCtlrDonePt:   //label's addr put into AVProcr when create new one
- }
-}
-(Note, for suspendable processors coming back from suspension, there is no
- need to fill the parameter registers -- they will be discarded)
-
-Suspend an application-level virtual processor:
-VMS_int__AVPSuspend( AVProcr *pr )
-{
-pr->jmpPtr = &ResumePt;  //label defined a few lines below
-pr->slave->doneFlag = TRUE;
-pr->stackPtr = <current SP reg value>;
-jmp pr->coreCtlrDonePt;
-ResumePt: return;
-}
-
-This works because the core controller will have switched back to this stack
- before jumping to ResumePt..    also, the core controller never modifies the
- stack pointer, it simply switches to whatever stack pointer is in the
- next AVProcr it gets off the workQ.
-
-
-
-=============================================================================
-As it is now, there's only one major unknown about GCC (first thing below
-  the line),  and there are a few restrictions, the most intrusive being
-  that the functions the application gives to the semantic layer have a
-  pre-defined prototype -- return nothing, take a pointer to initial data
-  and a pointer to an AVProcr struc, which they're not allowed to modify
-  -- only pass it to semantic-lib calls.
-
-So, here are the assumptions, restrictions, and so forth:
-===========================
-Major assumption:  that GCC will do the following the same way every time:
-  say the application defines a function that fits this typedef:
-typedef void (*AVProcrFnPtr)  ( void *, AVProcr * );
-
-and let's say somewhere in the code they do this:
-AVProcrFnPtr   fnPtr = &someFunc;
-
-then they do this:
-(*fnPtr)( dataPtr, animatingSlaveVPPtr );
-
-Can the registers that GCC uses to pass the two pointers be predicted?
- Will they always be the same registers, in every program that has the
- same typedef?
-If that typedef fixes, guaranteed, the registers (on x86) that GCC will use
- to send the two pointers, then the rest of this solution works.
-
-Change in model: Instead of a virtual processor whose execution trace is
- divided into work-units, replacing that with the pattern that a virtual
- processor is suspended.  Which means, no more "work unit" data structure
- -- instead, it's now an "Application Virtual Processor" structure
- -- AVProcr -- which is given directly to the application function!
-
-   -- You were right, don't need slaves to be virtual processors, only need
-      "scheduling buckets" -- just a way to keep track of things..
-
-Restrictions:
--- the  "virtual entities"  created by the semantic layer must be virtual
-   processors, created with a function-to-execute and initial data -- the
-   function is restricted to return nothing and only take a pointer to the
-   initial data plus a pointer to an AVProcr structure, which represents
-   "self", the virtual processor created.  (This is the interface I showed
-   you for "Hello World" semantic layer).
-What this means for synchronous dataflow, is that the nodes in the graph
-  are virtual processors that in turn spawn a new virtual processor for
-  every "firing" of the node.  This should be fine because the function
-  that the node itself is created with is a "canned" function that is part
-  of the semantic layer -- the function that is spawned is the user-provided
-  function.  The restriction only means that the values from the inputs to
-  the node are packaged as the "initial data" given to the spawned virtual
-  processor -- so the user-function has to cast a void * to the
-  semantic-layer-defined structure by which it gets the inputs to the node.
-
--- Second restriction is that the semantic layer has to use VMS supplied
-   stuff -- for example, the data structure that represents the
-   application-level virtual processor is defined in VMS, and the semantic
-   layer has to call a VMS function in order to suspend a virtual processor.
-
--- Third restriction is that the application code never do anything with
-   the AVProcr structure except pass it to semantic-layer lib calls.
-
--- Fourth restriction is that every virtual processor must call a
-   "dissipate" function as its last act -- the user-supplied
-   virtual-processor function can't just end -- it has to call
-   SemLib__dissipate( AVProcr ) before the closing brace.. and after the
-   semantic layer is done cleaning up its own data, it has to in turn call
-   VMS_int__disspate( AVProcr ).
-
--- For performance reasons, I think I want to have two different kinds of
-   app-virtual processor -- suspendable ones and non-suspendable -- where
-   non-suspendable are not allowed to perform any communication with other
-   virtual processors, except at birth and death.  Suspendable ones, of
-   course can perform communications, create other processors, and so forth
-   -- all of which cause it to suspend.
-The performance difference is that I need a separate stack for each
-  suspendable, but non-suspendable can re-use a fixed number of stacks
-  (one for each slave).
-
-
-==================== May 29
-
-Qs:
---1 how to safely jump between virt processor's trace and coreloop
---2 how to set up __cdecl style stack + frame for just-born virtual processor
---3 how to switch stack-pointers + frame-pointers
-
-
---1:
-Not sure if GCC's computed goto is safe, because modify the stack pointer
-without GCC's knowledge -- although, don't use the stack in the coreloop
-segment, so, actually, that should be safe!
-
-So, GCC has its own special C extensions, one of which gets address of label:
-
-void *labelAddr;
-labelAddr = &&label;
-goto *labelAddr;
-
---2
-In CoreCtlr, will check whether VirtProc just born, or was suspended.
-If just born, do bit of code that sets up the virtual processor's stack
-and frame according to the __cdecl convention for the standard virt proc
-fn typedef -- save the pointer to data and pointer to virt proc struc into
-correct places in the frame
-   __cdecl says, according to:
-http://unixwiz.net/techtips/win32-callconv-asm.html
-To do this:
-push the parameters onto the stack, right most first, working backwards to
- the left.
-Then perform call instr, which pushes return addr onto stack.
-Then callee first pushes the frame pointer, %EBP followed by placing the
-then-current value of stack pointer into %EBP
-push ebp
-mov  ebp, esp    // ebp « esp
-
-Once %ebp has been changed, it can now refer directly to the function's
- arguments as 8(%ebp), 12(%ebp). Note that 0(%ebp) is the old base pointer
- and 4(%ebp) is the old instruction pointer.
-
-Then callee pushes regs it will use then adds to stack pointer the size of
- its local vars.
-
-Stack in callee looks like this:
-16(%ebp)	 - third function parameter
-12(%ebp)	 - second function parameter
-8(%ebp)	 - first function parameter
-4(%ebp)	 - old %EIP (the function's "return address")
-----------^^ State seen at first instr of callee ^^-----------
-0(%ebp)	- old %EBP (previous function's base pointer)
--4(%ebp)	 - save of EAX, the only reg used in function
--8(%ebp)	 - first local variable
--12(%ebp)	 - second local variable
--16(%ebp)	 - third local variable
-
-
---3
-It might be just as simple as two mov instrs, one for %ESP, one for %EBP..
- the stack and frame pointer regs
diff -r a32504bb2a1c -r bd5ab695145c SSR.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SSR.c	Tue Mar 13 18:30:05 2012 -0700
@@ -0,0 +1,785 @@
+/*
+ * Copyright 2010  OpenSourceCodeStewardshipFoundation
+ *
+ * Licensed under BSD
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <malloc.h>
+
+#include "Queue_impl/PrivateQueue.h"
+#include "Hash_impl/PrivateHash.h"
+
+#include "SSR.h"
+#include "SSR_Counter_Recording.h"
+
+//==========================================================================
+
+void
+SSR__init();
+
+void
+SSR__init_Helper();
+//==========================================================================
+
+
+/*TODO: Q: dealing with library f()s and DKU vs WT vs FoR
+ * (still want to do FoR, with time-lines as syntax, could be super cool)
+ * A: thinking pin the coreCtlrs for all of BLIS -- let Master arbitrate
+ * among library, DKU, WT, FoR -- all the patterns in terms of virtual
+ * processors (or equivalently work-units), so Master picks which virt procr
+ * from which portions of app (DKU, WT, FoR) onto which sched slots
+ *Might even do hierarchy of masters -- group of sched slots for each core
+ * has its own master, that keeps generated work local
+ * single-reader-single-writer sync everywhere -- no atomic primitives
+ * Might have the different assigners talk to each other, to negotiate
+ * larger-grain sharing of resources, according to predicted critical
+ * path, and expansion of work
+ */
+
+
+
+//===========================================================================
+
+
+/*These are the library functions *called in the application*
+ * 
+ *There's a pattern for the outside sequential code to interact with the
+ * VMS_HW code.
+ *The VMS_HW system is inside a boundary..  every SSR system is in its
+ * own directory that contains the functions for each of the processor types.
+ * One of the processor types is the "seed" processor that starts the
+ * cascade of creating all the processors that do the work.
+ *So, in the directory is a file called "EntryPoint.c" that contains the
+ * function, named appropriately to the work performed, that the outside
+ * sequential code calls.  This function follows a pattern:
+ *1) it calls SSR__init()
+ *2) it creates the initial data for the seed processor, which is passed
+ *    in to the function
+ *3) it creates the seed SSR processor, with the data to start it with.
+ *4) it calls startSSRThenWaitUntilWorkDone
+ *5) it gets the returnValue from the transfer struc and returns that
+ *    from the function
+ *
+ *For now, a new SSR system has to be created via SSR__init every
+ * time an entry point function is called -- later, might add letting the
+ * SSR system be created once, and let all the entry points just reuse
+ * it -- want to be as simple as possible now, and see by using what makes
+ * sense for later..
+ */
+
+
+
+//===========================================================================
+
+/*This is the "border crossing" function -- the thing that crosses from the
+ * outside world, into the VMS_HW world.  It initializes and starts up the
+ * VMS system, then creates one processor from the specified function and
+ * puts it into the readyQ.  From that point, that one function is resp.
+ * for creating all the other processors, that then create others, and so
+ * forth.
+ *When all the processors, including the seed, have dissipated, then this
+ * function returns.  The results will have been written by side-effect via
+ * pointers read from, or written into initData.
+ *
+ *NOTE: no Threads should exist in the outside program that might touch
+ * any of the data reachable from initData passed in to here
+ */
+void
+SSR__create_seed_procr_and_do_work( TopLevelFnPtr fnPtr, void *initData )
+ { SSRSemEnv *ssrEnv;
+   SlaveVP   *seedVP;
+
+      //Bring up VMS plus the SSR semantic layer (normal multi-thd mode)
+   SSR__init();
+   ssrEnv = _VMSMasterEnv->semanticEnv;
+
+      //SSR begins with a single seed VP, placed on the core named by the
+      // nextCoreToGetNewPr counter; the seed then creates all further VPs
+   seedVP = SSR__create_procr_helper( fnPtr, initData, ssrEnv,
+                                      ssrEnv->nextCoreToGetNewPr++ );
+   resume_slaveVP( seedVP, ssrEnv );
+
+      //normal multi-thd: start the work and wait here until it is done
+   VMS_SS__start_the_work_then_wait_until_done();
+
+   SSR__cleanup_after_shutdown();
+ }
+
+
+int32
+SSR__giveMinWorkUnitCycles( float32 percentOverhead )
+ {    //percentOverhead is not consulted; the same constant is always returned
+   return MIN_WORK_UNIT_CYCLES;
+ }
+
+int32
+SSR__giveIdealNumWorkUnits()
+ {    //product of the sched-slot count and the core count
+   return NUM_SCHED_SLOTS * NUM_CORES;
+ }
+
+int32
+SSR__give_number_of_cores_to_schedule_onto()
+ {    //simply reports NUM_CORES
+   return NUM_CORES;
+ }
+
+/*For now, use TSC -- later, make these two macros with assembly that first
+ * saves jump point, and second jumps back several times to get reliable time
+ */
+void
+SSR__start_primitive()
+ { SSRSemEnv *ssrEnv = (SSRSemEnv *)(_VMSMasterEnv->semanticEnv);
+   saveLowTimeStampCountInto( ssrEnv->primitiveStartTime ); //read by end call
+ }
+
+/*Just quick and dirty for now -- make reliable later
+ * will want this to jump back several times -- to be sure cache is warm
+ * because don't want comm time included in calc-time measurement -- and
+ * also to throw out any "weird" values due to OS interrupt or TSC rollover
+ */
+int32
+SSR__end_primitive_and_give_cycles()
+ { int32 stopStamp, beginStamp;
+   //TODO: fix by repeating time-measurement
+   saveLowTimeStampCountInto( stopStamp );   //take the end stamp first
+   beginStamp = ((SSRSemEnv *)(_VMSMasterEnv->semanticEnv))->primitiveStartTime;
+   return stopStamp - beginStamp;   //difference of the two stamps
+ }
+
+//===========================================================================
+
+/*Initializes all the data-structures for a SSR system -- but doesn't
+ * start it running yet!
+ *
+ *This runs in the main thread -- before VMS starts up
+ * 
+ *This sets up the semantic layer over the VMS system
+ *
+ *First, calls VMS_SS__init, then creates own environment, making it ready
+ * for creating the seed processor and then starting the work.
+ */
+void
+SSR__init()
+ {
+   VMS_SS__init();
+      //_VMSMasterEnv, a global var, is now partially set up by VMS_SS__init;
+      // after this, have VMS_int__malloc and VMS_int__free available
+
+   SSR__init_Helper();
+ }
+
+
+void idle_fn(void* data, SlaveVP *animatingSlv){
+    for(;;){   //never returns: every time it runs again it suspends again
+        VMS_int__suspend_slaveVP_and_send_req(animatingSlv);
+    }
+}
+
+void
+SSR__init_Helper()
+ { SSRSemEnv       *semanticEnv;
+   PrivQueueStruc **readyVPQs;
+   int              coreIdx, i, j;
+      //Builds SSR's semantic environment and attaches it to _VMSMasterEnv
+      //Hook up the semantic layer's plug-ins to the Master virt procr
+   _VMSMasterEnv->requestHandler = &SSR__Request_Handler;
+   _VMSMasterEnv->slaveAssigner  = &SSR__assign_slaveVP;
+   #ifdef HOLISTIC__TURN_ON_PERF_COUNTERS
+   _VMSMasterEnv->counterHandler = &SSR__counter_handler;
+   #endif
+
+      //create the semantic layer's environment (all its data) and add to
+      // the master environment
+   semanticEnv = VMS_int__malloc( sizeof( SSRSemEnv ) );
+   _VMSMasterEnv->semanticEnv = semanticEnv;
+   
+   #ifdef HOLISTIC__TURN_ON_PERF_COUNTERS
+   SSR__init_counter_data_structs();
+   #endif
+   for(i=0;i<NUM_CORES;++i){   //one idle VP, running idle_fn, per core+slot
+       for(j=0;j<NUM_SCHED_SLOTS;++j){
+           semanticEnv->idlePr[i][j] = VMS_int__create_slaveVP(&idle_fn,NULL);
+           semanticEnv->idlePr[i][j]->coreAnimatedBy = i;
+       }
+   }
+
+   #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC
+   semanticEnv->unitList = makeListOfArrays(sizeof(Unit),128);
+   semanticEnv->ctlDependenciesList = makeListOfArrays(sizeof(Dependency),128);
+   semanticEnv->commDependenciesList = makeListOfArrays(sizeof(Dependency),128);
+   semanticEnv->dynDependenciesList = makeListOfArrays(sizeof(Dependency),128);
+   semanticEnv->ntonGroupsInfo = makePrivDynArrayOfSize((void***)&(semanticEnv->ntonGroups),8);
+      //clear ALL of last_in_slot -- assumes it is an array member of SSRSemEnv
+   semanticEnv->hwArcs = makeListOfArrays(sizeof(Dependency),128);
+   memset(semanticEnv->last_in_slot,0,sizeof(semanticEnv->last_in_slot));
+   #endif
+
+      //create the ready queue, hash tables used for pairing send to receive
+      // and so forth
+      //TODO: add hash tables for pairing sends with receives, and
+      // initialize the data ownership system
+   readyVPQs = VMS_int__malloc( NUM_CORES * sizeof(PrivQueueStruc *) );
+
+   for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
+    {
+      readyVPQs[ coreIdx ] = makeVMSQ();
+    }
+   
+   semanticEnv->readyVPQs = readyVPQs;
+   
+   semanticEnv->nextCoreToGetNewPr = 0;
+   semanticEnv->numSlaveVP = 0;
+   
+   semanticEnv->commHashTbl  = makeHashTable( 1<<16, &VMS_int__free );//start big
+
+   //TODO: bug -- turn these arrays into dyn arrays to eliminate limit
+   //semanticEnv->singletonHasBeenExecutedFlags = makeDynArrayInfo( );
+   //semanticEnv->transactionStrucs = makeDynArrayInfo( );
+   for( i = 0; i < NUM_STRUCS_IN_SEM_ENV; i++ )
+    {
+      semanticEnv->fnSingletons[i].endInstrAddr      = NULL;
+      semanticEnv->fnSingletons[i].hasBeenStarted    = FALSE;
+      semanticEnv->fnSingletons[i].hasFinished       = FALSE;
+      semanticEnv->fnSingletons[i].waitQ             = makeVMSQ();
+      semanticEnv->transactionStrucs[i].waitingVPQ   = makeVMSQ();
+    }
+ }
+
+
+/*Frees memory allocated by SSR__init(), then calls VMS_SS__cleanup_at_end_of_shutdown
+ */
+void
+SSR__cleanup_after_shutdown()
+ { SSRSemEnv *semanticEnv;
+      //scratch state shared by the UCC, LoopGraph and Counters dumps below
+   #if defined(HOLISTIC__TURN_ON_OBSERVE_UCC) || defined(HOLISTIC__TURN_ON_PERF_COUNTERS)
+   FILE* output;
+   int n;
+   char filename[255];
+   #endif
+   semanticEnv = _VMSMasterEnv->semanticEnv;
+   #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC
+    for(n=0;n<255;n++)   //UCC: probe for the first name that can't be opened
+    {
+        sprintf(filename, "./counters/UCC.%d",n);
+        output = fopen(filename,"r");
+        if(output)
+        {
+            fclose(output);
+        }else{
+            break;
+        }
+    }
+   if(n<255){
+    printf("Saving UCC to File: %s ...\n", filename);
+    output = fopen(filename,"w+");
+    if(output!=NULL){
+        set_dependency_file(output);
+        //fprintf(output,"digraph Dependencies {\n");
+        //set_dot_file(output);
+        //FIXME:  first line still depends on counters being enabled, replace w/ unit struct!
+        //forAllInDynArrayDo(_VMSMasterEnv->counter_history_array_info, &print_dot_node_info );
+        forAllInListOfArraysDo(semanticEnv->unitList, &print_unit_to_file);
+        forAllInListOfArraysDo( semanticEnv->commDependenciesList, &print_comm_dependency_to_file );
+        forAllInListOfArraysDo( semanticEnv->ctlDependenciesList, &print_ctl_dependency_to_file );
+        forAllInDynArrayDo(semanticEnv->ntonGroupsInfo,&print_nton_to_file);
+        //fprintf(output,"}\n");
+        fflush(output);
+        fclose(output);   //dump finished; don't leak the handle
+    } else
+        printf("Opening UCC file failed. Please check that folder \"counters\" exists in run directory and has write permission.\n");
+   } else {
+       printf("Could not open UCC file, please clean \"counters\" folder. (Must contain less than 255 files.)\n");
+   }
+   //Loop Graph
+   for(n=0;n<255;n++)
+    {
+        sprintf(filename, "./counters/LoopGraph.%d",n);
+        output = fopen(filename,"r");
+        if(output)
+        {
+            fclose(output);
+        }else{
+            break;
+        }
+    }
+   if(n<255){
+    printf("Saving LoopGraph to File: %s ...\n", filename);
+    output = fopen(filename,"w+");
+    if(output!=NULL){
+        set_dependency_file(output);
+        //fprintf(output,"digraph Dependencies {\n");
+        //set_dot_file(output);
+        //FIXME:  first line still depends on counters being enabled, replace w/ unit struct!
+        //forAllInDynArrayDo(_VMSMasterEnv->counter_history_array_info, &print_dot_node_info );
+        forAllInListOfArraysDo( semanticEnv->unitList, &print_unit_to_file );
+        forAllInListOfArraysDo( semanticEnv->commDependenciesList, &print_comm_dependency_to_file );
+        forAllInListOfArraysDo( semanticEnv->ctlDependenciesList, &print_ctl_dependency_to_file );
+        forAllInListOfArraysDo( semanticEnv->dynDependenciesList, &print_dyn_dependency_to_file );
+        forAllInListOfArraysDo( semanticEnv->hwArcs, &print_hw_dependency_to_file );
+        //fprintf(output,"}\n");
+        fflush(output);
+        fclose(output);   //dump finished; don't leak the handle
+    } else
+        printf("Opening LoopGraph file failed. Please check that folder \"counters\" exists in run directory and has write permission.\n");
+   } else {
+       printf("Could not open LoopGraph file, please clean \"counters\" folder. (Must contain less than 255 files.)\n");
+   }
+
+   freeListOfArrays(semanticEnv->unitList);
+   freeListOfArrays(semanticEnv->commDependenciesList);
+   freeListOfArrays(semanticEnv->ctlDependenciesList);
+   freeListOfArrays(semanticEnv->dynDependenciesList);
+   freeListOfArrays(semanticEnv->hwArcs);   //created in init alongside the others
+   
+   #endif
+#ifdef HOLISTIC__TURN_ON_PERF_COUNTERS    
+    for(n=0;n<255;n++)
+    {
+        sprintf(filename, "./counters/Counters.%d.csv",n);
+        output = fopen(filename,"r");
+        if(output)
+        {
+            fclose(output);
+        }else{
+            break;
+        }
+    }
+    if(n<255){
+    printf("Saving Counter measurements to File: %s ...\n", filename);
+    output = fopen(filename,"w+");
+    if(output!=NULL){
+        set_counter_file(output);
+        int i;
+        for(i=0;i<NUM_CORES;i++){
+            forAllInListOfArraysDo( semanticEnv->counterList[i], &print_counter_events_to_file );
+            fflush(output);
+        }
+        fclose(output);   //dump finished; don't leak the handle
+    } else
+        printf("Opening Counters file failed. Please check that folder \"counters\" exists in run directory and has write permission.\n");
+   } else {
+       printf("Could not open Counters file, please clean \"counters\" folder. (Must contain less than 255 files.)\n");
+   }
+    
+#endif
+/* It's all allocated inside VMS's big chunk -- that's about to be freed, so
+ *  nothing to do here
+   
+
+   for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
+    {
+      VMS_int__free( semanticEnv->readyVPQs[coreIdx]->startOfData );
+      VMS_int__free( semanticEnv->readyVPQs[coreIdx] );
+    }
+   VMS_int__free( semanticEnv->readyVPQs );
+   
+   freeHashTable( semanticEnv->commHashTbl );
+   VMS_int__free( _VMSMasterEnv->semanticEnv );
+ */
+   VMS_SS__cleanup_at_end_of_shutdown();
+ }
+
+
+//===========================================================================
+
+/*Sends a VMS create request for a new VP that runs fnPtr on initData, with
+ * round-robin core choice; returns creatingPr->dataRetFromReq afterwards */
+  SlaveVP *
+SSR__create_procr_with( TopLevelFnPtr fnPtr,   void *initData,
+                        SlaveVP *creatingPr )
+ { SSRSemReq createReq;
+
+      //createReq lives in this frame: it is gone once this call returns,
+      // but is guaranteed to stay on the VP's stack for as long as the VP
+      // is suspended.
+   createReq.sendPr           = creatingPr;
+   createReq.fnPtr            = fnPtr;
+   createReq.initData         = initData;
+   createReq.coreToAssignOnto = -1; //means round-robin assign
+   createReq.reqType          = 0; //know type because in a VMS create req
+
+   VMS_WL__send_create_slaveVP_req( &createReq, creatingPr );
+
+   return creatingPr->dataRetFromReq;
+ }
+
+  SlaveVP *
+SSR__create_procr_with_affinity( TopLevelFnPtr fnPtr, void *initData,
+                        SlaveVP *creatingPr,  int32  coreToAssignOnto )
+ { SSRSemReq  createReq;
+
+      //Create request in which the caller names the core to assign onto.
+      // createReq lives in this frame: gone once this call returns, but
+      // guaranteed to stay on the VP's stack while the VP is suspended.
+   createReq.sendPr           = creatingPr;
+   createReq.fnPtr            = fnPtr;
+   createReq.initData         = initData;
+   createReq.coreToAssignOnto = coreToAssignOnto;
+   createReq.reqType          = 0; //know type because in a VMS create req
+
+   VMS_WL__send_create_slaveVP_req( &createReq, creatingPr );
+
+   return creatingPr->dataRetFromReq;
+ }
+
+
+  void
+SSR__dissipate_procr( SlaveVP *procrToDissipate )
+ {    //forwards straight to VMS; no SSR request data is attached
+   VMS_WL__send_dissipate_req( procrToDissipate );
+ }
+
+
+//===========================================================================
+
+void *
+SSR__malloc_to( int32 sizeToMalloc, SlaveVP *owningPr )
+ { SSRSemReq mallocReq;
+
+   mallocReq.sizeToMalloc = sizeToMalloc;
+   mallocReq.sendPr       = owningPr;
+   mallocReq.reqType      = malloc_req;
+
+   VMS_WL__send_sem_request( &mallocReq, owningPr );
+      //the request's result is read back out of the requesting VP
+   return owningPr->dataRetFromReq;
+ }
+
+
+/*Sends request to Master, which does the work of freeing
+ */
+void
+SSR__free( void *ptrToFree, SlaveVP *owningPr )
+ { SSRSemReq freeReq;
+
+   freeReq.ptrToFree    = ptrToFree;
+   freeReq.sendPr       = owningPr;
+   freeReq.reqType      = free_req;
+
+   VMS_WL__send_sem_request( &freeReq, owningPr );
+ }
+
+
+void
+SSR__transfer_ownership_of_from_to( void *data, SlaveVP *oldOwnerSlv,
+                                                  SlaveVP *newOwnerPr )
+ {    //currently a no-op: all three arguments are ignored
+   //TODO: put in the ownership system that automatically frees when no
+   // owners of data left -- will need keeper for keeping data around when
+   // future created processors might need it but don't exist yet
+ }
+
+
+void
+SSR__add_ownership_by_to( SlaveVP *newOwnerSlv, void *data )
+ {
+      //TODO: not implemented -- currently a no-op, both arguments ignored
+ }
+
+
+void
+SSR__remove_ownership_by_from( SlaveVP *loserSlv, void *dataLosing )
+ {
+      //TODO: not implemented -- currently a no-op, both arguments ignored
+ }
+
+
+/*Causes the SSR system to remove internal ownership, so data won't be
+ * freed when SSR shuts down, and will persist in the external program.
+ *
+ *Must be called from the processor that currently owns the data.
+ *
+ *IMPL: Transferring ownership touches two different virtual processor's
+ * state -- which means it has to be done carefully -- the VMS rules for
+ * semantic layers say that a work-unit is only allowed to touch the
+ * virtual processor it is part of, and that only a single work-unit per
+ * virtual processor be assigned to a slave at a time.  So, this has to
+ * modify the virtual processor that owns the work-unit that called this
+ * function, then create a request to have the other processor modified.
+ *However, in this case, the TO processor is the outside, and transfers
+ * are only allowed to be called by the giver-upper, so can mark caller of
+ * this function as no longer owner, and return -- done.
+ */
+void
+SSR__transfer_ownership_to_outside( void *data )
+ {    //currently a no-op: data is ignored until the TODO is done
+   //TODO: removeAllOwnersFrom( data );
+ }
+
+
+//===========================================================================
+
+void
+SSR__send_of_type_to( SlaveVP *sendPr, void *msg, const int type,
+                        SlaveVP *receivePr)
+ { SSRSemReq  sendReq;
+
+   sendReq.reqType   = send_type;
+   sendReq.msgType   = type;
+   sendReq.msg       = msg;
+   sendReq.sendPr    = sendPr;
+   sendReq.receivePr = receivePr;
+   sendReq.nextReqInHashEntry = NULL;
+
+      //Ownership plan (not implemented yet): give it up inside the send and
+      // leave it in limbo with the hash-table entry; once this send is
+      // paired with a receive, ownership moves to receivePr, so the next
+      // work-unit in receivePr's trace holds it.
+   VMS_WL__send_sem_request( &sendReq, sendPr );
+
+      //On return from the suspend, data reachable from msg is no longer ours
+      //TODO: release ownership here
+ }
+
+void
+SSR__send_from_to( void *msg, SlaveVP *sendPr, SlaveVP *receivePr )
+ { SSRSemReq  sendReq;
+
+      //pairing is keyed on the receiver: it is always known, whereas a
+      // receive may be willing to accept from an anonymous sender
+
+   sendReq.reqType   = send_from_to;
+   sendReq.msg       = msg;
+   sendReq.sendPr    = sendPr;
+   sendReq.receivePr = receivePr;
+   sendReq.nextReqInHashEntry = NULL;
+
+   VMS_WL__send_sem_request( &sendReq, sendPr );
+ }
+
+
+//===========================================================================
+
+void *
+SSR__receive_any_to( SlaveVP *receivePr )
+ {    //TODO: not implemented -- no request is sent, nothing is received
+   return NULL;   //defined result instead of falling off a non-void fn
+ }
+
+void *
+SSR__receive_type_to( const int type, SlaveVP *receivePr )
+ { SSRSemReq  recvReq;
+
+      //only the receiver and the wanted message type are filled in
+   recvReq.reqType   = receive_type;
+   recvReq.msgType   = type;
+   recvReq.receivePr = receivePr;
+   recvReq.nextReqInHashEntry = NULL;
+
+   VMS_WL__send_sem_request( &recvReq, receivePr );
+      //the request's result is picked up from the receiving VP
+   return receivePr->dataRetFromReq;
+ }
+
+
+
+/*Call this at point receiving virt pr wants in-coming data.
+ * 
+ *The reason receivePr must call this is that it modifies the receivePr
+ * loc structure directly -- and the VMS rules state a virtual processor
+ * loc structure can only be modified by itself.
+ */
+void *
+SSR__receive_from_to( SlaveVP *sendPr, SlaveVP *receivePr )
+ { SSRSemReq  recvReq;
+
+      //pairing is keyed on the receiver: it is always known, whereas a
+      // receive may be willing to accept from an anonymous sender
+
+   recvReq.reqType   = receive_from_to;
+   recvReq.sendPr    = sendPr;
+   recvReq.receivePr = receivePr;
+   recvReq.nextReqInHashEntry = NULL;
+
+   VMS_WL__send_sem_request( &recvReq, receivePr );
+
+   return receivePr->dataRetFromReq;
+ }
+
+
+//===========================================================================
+//
+/*A function singleton is a function whose body executes exactly once, on a
+ * single core, no matter how many times the function is called and no
+ * matter how many cores or the timing of cores calling it.
+ *
+ *A data singleton is a ticket attached to data.  That ticket can be used
+ * to get the data through the function exactly once, no matter how many
+ * times the data is given to the function, and no matter the timing of
+ * trying to get the data through from different cores.
+ */
+
+/*asm function declarations*/
+void asm_save_ret_to_singleton(SSRSingleton *singletonPtrAddr);
+void asm_write_ret_from_singleton(SSRSingleton *singletonPtrAddr);
+
+/*Fn singleton uses ID as index into array of singleton structs held in the
+ * semantic environment.
+ */
+void
+SSR__start_fn_singleton( int32 singletonID,   SlaveVP *animPr )
+ {
+   SSRSemReq  reqData;
+
+      //the request carries only its type and the singleton's ID
+   reqData.reqType     = singleton_fn_start;
+   reqData.singletonID = singletonID;
+
+   VMS_WL__send_sem_request( &reqData, animPr );
+   if( animPr->dataRetFromReq ) //will be 0 or addr of label in end singleton
+    {    //non-zero: hand the indexed singleton struct to the asm helper
+       SSRSemEnv *semEnv = VMS_int__give_sem_env_for( animPr );
+       asm_write_ret_from_singleton(&(semEnv->fnSingletons[ singletonID]));
+    }
+ }
+
+/*Data singleton hands addr of loc holding a pointer to a singleton struct.
+ * The start_data_singleton makes the structure and puts its addr into the
+ * location.
+ */
+void
+SSR__start_data_singleton( SSRSingleton **singletonAddr,  SlaveVP *animPr )
+ {
+   SSRSemReq  reqData;
+      //fast path: struct exists and is marked finished, so send no request
+   if( *singletonAddr && (*singletonAddr)->hasFinished )
+       goto JmpToEndSingleton;
+      //otherwise send a request carrying the addr of the singleton pointer
+   reqData.reqType          = singleton_data_start;
+   reqData.singletonPtrAddr = singletonAddr;
+
+   VMS_WL__send_sem_request( &reqData, animPr );
+   if( animPr->dataRetFromReq ) //either 0 or end singleton's return addr
+    {    //Assembly code changes the return addr on the stack to the one
+         // saved into the singleton by the end-singleton-fn
+         //The return addr is at 0x4(%%ebp)
+        JmpToEndSingleton:
+          asm_write_ret_from_singleton(*singletonAddr);
+    }
+   //now, simply return
+   //will exit either from the start singleton call or the end-singleton call
+ }
+
+/*Ends a function singleton: saves the return addr of this call into the
+ * singleton struct at fnSingletons[ singletonID ], then sends the end request.
+ *
+ *Note, this call cannot be inlined because the asm helper reads the return
+ * addr out of this function's own stack frame.
+ */
+void
+SSR__end_fn_singleton( int32 singletonID, SlaveVP *animPr )
+ {
+   SSRSemReq  reqData;
+
+      //save (not write!) the return addr, same as SSR__end_data_singleton;
+      // start_fn_singleton later jumps to it via asm_write_ret_from_singleton
+   SSRSemEnv *semEnv = VMS_int__give_sem_env_for( animPr );
+   asm_save_ret_to_singleton(&(semEnv->fnSingletons[ singletonID]));
+
+   reqData.reqType     = singleton_fn_end;
+   reqData.singletonID = singletonID;
+
+   VMS_WL__send_sem_request( &reqData, animPr );
+
+      //callers redirected by start_fn_singleton resume right after this call
+   return;
+ }
+
+void
+SSR__end_data_singleton(  SSRSingleton **singletonPtrAddr, SlaveVP *animPr )
+ {
+   SSRSemReq  reqData;
+
+      //Ends a data singleton.  The asm call below saves the return addr of
+      // this fn call into the data singleton, so it is not needed until the
+      // singleton struct has reached this function for the first time.
+      //A given data-singleton can only be given to exactly one instance in
+      // the code of this function.  However, can use this function in
+      // different places for different data-singletons.
+      //(The asm call replaces taking the addr of a label in this function.)
+
+
+   asm_save_ret_to_singleton(*singletonPtrAddr);
+
+   reqData.reqType          = singleton_data_end;
+   reqData.singletonPtrAddr = singletonPtrAddr;
+
+   VMS_WL__send_sem_request( &reqData, animPr );
+ }
+
+/*This executes the function in the masterVP, so it executes in isolation
+ * from any other copies -- only one copy of the function can ever execute
+ * at a time.
+ *
+ *It suspends to the master, and the request handler takes the function
+ * pointer out of the request and calls it, then resumes the VP.
+ *Only very short functions should be called this way -- for longer-running
+ * isolation, use transaction-start and transaction-end, which run the code
+ * between as work-code.
+ */
+void
+SSR__animate_short_fn_in_isolation( PtrToAtomicFn ptrToFnToExecInMaster,
+                                    void *data, SlaveVP *animPr )
+ {
+   SSRSemReq  reqData;
+
+      //request carries the fn pointer plus its data pointer (dataForFn)
+   reqData.reqType          = atomic;
+   reqData.fnToExecInMaster = ptrToFnToExecInMaster;
+   reqData.dataForFn        = data;
+
+   VMS_WL__send_sem_request( &reqData, animPr );
+ }
+
+
+/*This suspends to the master.
+ *First, it looks at the VP's data, to see the highest transactionID that VP
+ * already has entered.  If the current ID is not larger, it throws an
+ * exception stating a bug in the code.  Otherwise it puts the current ID
+ * there, and adds the ID to a linked list of IDs entered -- the list is
+ * used to check that exits are properly ordered.
+ *Next it uses transactionID as index into an array of transaction
+ * structures.
+ *If the "VP_currently_executing" field is non-null, then put requesting VP
+ * into queue in the struct.  (At some point a holder will request
+ * end-transaction, which will take this VP from the queue and resume it.)
+ *If NULL, then write the requesting VP into the field and resume.
+ */
+void
+SSR__start_transaction( int32 transactionID, SlaveVP *animPr )
+ {
+   SSRSemReq  reqData;
+
+      //sendPr is the requesting VP itself
+   reqData.sendPr      = animPr;
+   reqData.reqType     = trans_start;
+   reqData.transID     = transactionID;
+
+   VMS_WL__send_sem_request( &reqData, animPr );
+ }
+
+/*This suspends to the master, then uses transactionID as index into an
+ * array of transaction structures.
+ *It looks at VP_currently_executing to be sure it's same as requesting VP.
+ * If different, throws an exception, stating there's a bug in the code.
+ *Next it looks at the queue in the structure.
+ *If it's empty, it sets VP_currently_executing field to NULL and resumes.
+ *If something is in it, gets it, sets VP_currently_executing to that VP,
+ * then resumes both.
+ */
+void
+SSR__end_transaction( int32 transactionID, SlaveVP *animPr )
+ {
+   SSRSemReq  reqData;
+
+      //sendPr is the requesting VP itself
+   reqData.sendPr      = animPr;
+   reqData.reqType     = trans_end;
+   reqData.transID     = transactionID;
+
+   VMS_WL__send_sem_request( &reqData, animPr );
+ }
diff -r a32504bb2a1c -r bd5ab695145c SSR.h
--- a/SSR.h	Tue Mar 13 10:04:14 2012 -0700
+++ b/SSR.h	Tue Mar 13 18:30:05 2012 -0700
@@ -84,7 +84,7 @@
 
    void              *initData;
    TopLevelFnPtr     fnPtr;
-   int32              coreToScheduleOnto;
+   int32              coreToAssignOnto;
 
    int32              sizeToMalloc;
    void              *ptrToFree;
@@ -182,7 +182,7 @@
 
   SlaveVP *
 SSR__create_procr_with_affinity( TopLevelFnPtr fnPtr,    void *initData,
-                            SlaveVP *creatingPr, int32 coreToScheduleOnto);
+                            SlaveVP *creatingPr, int32 coreToAssignOnto);
 
 void
 SSR__dissipate_procr( SlaveVP *procrToDissipate );
@@ -253,11 +253,15 @@
 SSR__Request_Handler( SlaveVP *requestingPr, void *_semEnv );
 
 SlaveVP *
-SSR__schedule_slaveVP( void *_semEnv, int coreNum, int slotNum );
+SSR__assign_slaveVP( void *_semEnv, int coreNum, SchedSlot *slot );
 
 SlaveVP*
 SSR__create_procr_helper( TopLevelFnPtr fnPtr, void *initData,
-                          SSRSemEnv *semEnv,    int32 coreToScheduleOnto );
+                          SSRSemEnv *semEnv,    int32 coreToAssignOnto );
 
+//=====================  Measurement of Lang Overheads  =====================
+#include "SSR_Measurement.h"
+
+//===========================================================================
 #endif	/* _SSR_H */
 
diff -r a32504bb2a1c -r bd5ab695145c SSR.s
--- a/SSR.s	Tue Mar 13 10:04:14 2012 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,21 +0,0 @@
-
-//Assembly code takes the return addr off the stack and saves
-// into the singleton.  The first field in the singleton is the
-// "endInstrAddr" field, and the return addr is at 0x4(%ebp)
-.globl asm_save_ret_to_singleton
-asm_save_ret_to_singleton:
-    movq 0x8(%rbp),     %rax   #get ret address, ebp is the same as in the calling function
-    movq     %rax,     (%rdi) #write ret addr to endInstrAddr field
-    ret
-
-
-//Assembly code changes the return addr on the stack to the one
-// saved into the singleton by the end-singleton-fn
-//The stack's return addr is at 0x4(%%ebp)
-.globl asm_write_ret_from_singleton
-asm_write_ret_from_singleton:
-    movq    (%rdi),    %rax  #get endInstrAddr field
-    movq      %rax,    0x8(%rbp) #write return addr to the stack of the caller
-    ret
-
-
diff -r a32504bb2a1c -r bd5ab695145c SSR_Measurement.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SSR_Measurement.h	Tue Mar 13 18:30:05 2012 -0700
@@ -0,0 +1,87 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ */
+
+#ifndef _SSR_MEAS_H
+#define	_SSR_MEAS_H
+
+
+#ifdef MEAS__TURN_ON_LANG_MEAS
+   //drop any earlier definition so that the one given below takes effect
+   #ifdef MEAS__Make_Meas_Hists_for_Language
+   #undef MEAS__Make_Meas_Hists_for_Language
+   #endif
+
+
+//===================  Language-specific Measurement Stuff ===================
+//Each Meas_startX declares startStamp/endStamp in the enclosing scope, so use
+// at most one start/end pair per scope.
+   #define SendFromToHistIdx      1 //note: starts at 1
+   #define SendOfTypeHistIdx      2
+   #define ReceiveFromToHistIdx   3
+   #define ReceiveOfTypeHistIdx   4
+
+   #define MEAS__Make_Meas_Hists_for_Language \
+      _VMSMasterEnv->measHistsInfo = \
+                 makePrivDynArrayOfSize( (void***)&(_VMSMasterEnv->measHists), 200); \
+       makeAMeasHist( SendFromToHistIdx,   "SendFromTo",    50, 0, 100 ) \
+       makeAMeasHist( SendOfTypeHistIdx,   "SendOfType",    50, 0, 100 ) \
+       makeAMeasHist( ReceiveFromToHistIdx,"ReceiveFromTo", 50, 0, 100 ) \
+       makeAMeasHist( ReceiveOfTypeHistIdx,"ReceiveOfType", 50, 0, 100 )
+
+   #define Meas_startSendFromTo \
+       int32 startStamp, endStamp; \
+       saveLowTimeStampCountInto( startStamp );
+
+   #define Meas_endSendFromTo \
+       saveLowTimeStampCountInto( endStamp ); \
+       addIntervalToHist( startStamp, endStamp, \
+                                _VMSMasterEnv->measHists[ SendFromToHistIdx ] );
+
+   #define Meas_startSendOfType \
+       int32 startStamp, endStamp; \
+       saveLowTimeStampCountInto( startStamp );
+
+   #define Meas_endSendOfType \
+       saveLowTimeStampCountInto( endStamp ); \
+       addIntervalToHist( startStamp, endStamp, \
+                                _VMSMasterEnv->measHists[ SendOfTypeHistIdx ] );
+
+   #define Meas_startReceiveFromTo \
+       int32 startStamp, endStamp; \
+       saveLowTimeStampCountInto( startStamp );
+
+   #define Meas_endReceiveFromTo \
+       saveLowTimeStampCountInto( endStamp ); \
+       addIntervalToHist( startStamp, endStamp, \
+                                _VMSMasterEnv->measHists[ ReceiveFromToHistIdx ] );
+
+   #define Meas_startReceiveOfType \
+       int32 startStamp, endStamp; \
+       saveLowTimeStampCountInto( startStamp );
+
+   #define Meas_endReceiveOfType \
+       saveLowTimeStampCountInto( endStamp ); \
+       addIntervalToHist( startStamp, endStamp, \
+                                _VMSMasterEnv->measHists[ReceiveOfTypeHistIdx ] );
+
+#else //===================== turned off ==========================
+
+   #define MEAS__Make_Meas_Hists_for_Language 
+   #define Meas_startSendFromTo
+   #define Meas_endSendFromTo
+   #define Meas_startSendOfType
+   #define Meas_endSendOfType
+   #define Meas_startReceiveFromTo
+   #define Meas_endReceiveFromTo
+   #define Meas_startReceiveOfType
+   #define Meas_endReceiveOfType
+
+#endif  /* MEAS__TURN_ON_LANG_MEAS */
+
+#endif	/* _SSR_MEAS_H */
+
diff -r a32504bb2a1c -r bd5ab695145c SSR_PluginFns.c
--- a/SSR_PluginFns.c	Tue Mar 13 10:04:14 2012 -0700
+++ b/SSR_PluginFns.c	Tue Mar 13 18:30:05 2012 -0700
@@ -7,7 +7,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#include "../../C_Libraries/Queue_impl/PrivateQueue.h"
+#include "Queue_impl/PrivateQueue.h"
 #include "SSR.h"
 #include "SSR_Request_Handlers.h"
 
@@ -25,21 +25,22 @@
 handleCreate( VMSReqst *req, SlaveVP *requestingPr, SSRSemEnv *semEnv  );
 
 
-//============================== Scheduler ==================================
+//============================== Assigner ==================================
 //
-/*For SSR, scheduling a slave simply takes the next work-unit off the
+/*For SSR, assigning a slave simply takes the next work-unit off the
  * ready-to-go work-unit queue and assigns it to the slaveToSched.
- *If the ready-to-go work-unit queue is empty, then nothing to schedule
+ *If the ready-to-go work-unit queue is empty, then nothing to assign
  * to the slave -- return FALSE to let Master loop know scheduling that
  * slave failed.
  */
-char __Scheduler[] = "FIFO Scheduler"; //Gobal variable for name in saved histogram
-
 SlaveVP *
-SSR__schedule_slaveVP( void *_semEnv, int coreNum, int slotNum )
+SSR__assign_slaveVP( void *_semEnv, int coreNum, SchedSlot *slot )
  { SlaveVP   *schedPr;
    SSRSemEnv *semEnv;
-
+   int32      slotNum;
+   
+   slotNum = slot->slotIdx;
+   
    semEnv  = (SSRSemEnv *)_semEnv;
 
    schedPr = readPrivQ( semEnv->readyVPQs[coreNum] );
@@ -48,36 +49,36 @@
        schedPr = semEnv->idlePr[coreNum][slotNum];
      //things that would normally happen in resume(), but these VPs never go there
      #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC
-        schedPr->numTimesScheduled++; //Somewhere here!
+        schedPr->numTimesAssigned++; //Somewhere here!
         Unit newu;
         newu.vp = schedPr->slaveID;
-        newu.task = schedPr->numTimesScheduled;
+        newu.task = schedPr->numTimesAssigned;
         addToListOfArrays(Unit,newu,semEnv->unitList);
    
-        if (schedPr->numTimesScheduled > 1){
+        if (schedPr->numTimesAssigned > 1){
                 Dependency newd;
                 newd.from_vp = schedPr->slaveID;
-                newd.from_task = schedPr->numTimesScheduled - 1;
+                newd.from_task = schedPr->numTimesAssigned - 1;
                 newd.to_vp = schedPr->slaveID;
-                newd.to_task = schedPr->numTimesScheduled;
+                newd.to_task = schedPr->numTimesAssigned;
                 addToListOfArrays(Dependency, newd ,semEnv->ctlDependenciesList);  
         }
       #endif
    }
    #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC
    if (schedPr) {
-        //schedPr->numTimesScheduled++;
+        //schedPr->numTimesAssigned++;
         Unit prev_in_slot = semEnv->last_in_slot[coreNum * NUM_SCHED_SLOTS + slotNum];
         if(prev_in_slot.vp != 0){
                 Dependency newd;
                 newd.from_vp = prev_in_slot.vp;
                 newd.from_task = prev_in_slot.task;
                 newd.to_vp = schedPr->slaveID;
-                newd.to_task = schedPr->numTimesScheduled;
+                newd.to_task = schedPr->numTimesAssigned;
                 addToListOfArrays(Dependency,newd,semEnv->hwArcs);   
         }
         prev_in_slot.vp = schedPr->slaveID;
-        prev_in_slot.task = schedPr->numTimesScheduled;
+        prev_in_slot.task = schedPr->numTimesAssigned;
         semEnv->last_in_slot[coreNum * NUM_SCHED_SLOTS + slotNum] = prev_in_slot;        
    }
    #endif
@@ -91,7 +92,7 @@
  * Upon send, check the hash to see if a receive is waiting.
  * Upon receive, check hash to see if a send has already happened.
  * When other is not there, put in.  When other is there, the comm.
- *  completes, which means the receiver P gets scheduled and
+ *  completes, which means the receiver P gets assigned and
  *  picks up right after the receive request.  So make the work-unit
  *  and put it into the queue of work-units ready to go.
  * Other request is create a new Processor, with the function to run in the
@@ -194,7 +195,7 @@
  */
   SlaveVP *
 SSR__create_procr_helper( TopLevelFnPtr fnPtr, void *initData,
-                          SSRSemEnv *semEnv,    int32 coreToScheduleOnto )
+                          SSRSemEnv *semEnv,    int32 coreToAssignOnto )
  { SlaveVP    *newPr;
    SSRSemData   *semData;
 
@@ -215,7 +216,7 @@
 
    #else
 
-   if(coreToScheduleOnto < 0 || coreToScheduleOnto >= NUM_CORES )
+   if(coreToAssignOnto < 0 || coreToAssignOnto >= NUM_CORES )
     {    //out-of-range, so round-robin assignment
       newPr->coreAnimatedBy = semEnv->nextCoreToGetNewPr;
 
@@ -225,7 +226,7 @@
           semEnv->nextCoreToGetNewPr += 1;
     }
    else //core num in-range, so use it
-    { newPr->coreAnimatedBy = coreToScheduleOnto;
+    { newPr->coreAnimatedBy = coreToAssignOnto;
     }
    #endif
    //========================================================================
@@ -243,14 +244,14 @@
    semReq = VMS_PI__take_sem_reqst_from( req );
  
    newPr = SSR__create_procr_helper( semReq->fnPtr, semReq->initData, semEnv,
-                                     semReq->coreToScheduleOnto );
+                                     semReq->coreToAssignOnto );
    
    DEBUG_Print1(dbgRqstHdlr,"(new VP: %d)\n",newPr->slaveID)
 
    #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC
    Dependency newd;
    newd.from_vp = requestingPr->slaveID;
-   newd.from_task = requestingPr->numTimesScheduled;
+   newd.from_task = requestingPr->numTimesAssigned;
    newd.to_vp = newPr->slaveID;
    newd.to_task = 1;
    //addToListOfArraysDependency(newd,semEnv->commDependenciesList);  
@@ -277,18 +278,18 @@
 */
    #endif
    #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC
-   procr->numTimesScheduled++; //Somewhere here!
+   procr->numTimesAssigned++; //Somewhere here!
    Unit newu;
    newu.vp = procr->slaveID;
-   newu.task = procr->numTimesScheduled;
+   newu.task = procr->numTimesAssigned;
    addToListOfArrays(Unit,newu,semEnv->unitList);
    
-   if (procr->numTimesScheduled > 1){
+   if (procr->numTimesAssigned > 1){
         Dependency newd;
         newd.from_vp = procr->slaveID;
-        newd.from_task = procr->numTimesScheduled - 1;
+        newd.from_task = procr->numTimesAssigned - 1;
         newd.to_vp = procr->slaveID;
-        newd.to_task = procr->numTimesScheduled;
+        newd.to_task = procr->numTimesAssigned;
         addToListOfArrays(Dependency, newd ,semEnv->ctlDependenciesList);  
    }
    #endif
diff -r a32504bb2a1c -r bd5ab695145c SSR_Request_Handlers.c
--- a/SSR_Request_Handlers.c	Tue Mar 13 10:04:14 2012 -0700
+++ b/SSR_Request_Handlers.c	Tue Mar 13 18:30:05 2012 -0700
@@ -129,9 +129,9 @@
        #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC
         Dependency newd;
         newd.from_vp = sendPr->slaveID;
-        newd.from_task = sendPr->numTimesScheduled;
+        newd.from_task = sendPr->numTimesAssigned;
         newd.to_vp = receivePr->slaveID;
-        newd.to_task = receivePr->numTimesScheduled +1;
+        newd.to_task = receivePr->numTimesAssigned +1;
         //(newd,semEnv->commDependenciesList);  
         addToListOfArrays(Dependency,newd,semEnv->dynDependenciesList);  
                 int32 groupId = semReq->msgType;
@@ -143,10 +143,10 @@
         }
         Unit u;
         u.vp = sendPr->slaveID;
-        u.task = sendPr->numTimesScheduled;
+        u.task = sendPr->numTimesAssigned;
         addToListOfArrays(Unit,u,semEnv->ntonGroups[groupId]->senders);
         u.vp = receivePr->slaveID;
-        u.task = receivePr->numTimesScheduled +1;
+        u.task = receivePr->numTimesAssigned +1;
         addToListOfArrays(Unit,u,semEnv->ntonGroups[groupId]->receivers);
        #endif
 
@@ -209,9 +209,9 @@
       #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC
         Dependency newd;
         newd.from_vp = sendPr->slaveID;
-        newd.from_task = sendPr->numTimesScheduled;
+        newd.from_task = sendPr->numTimesAssigned;
         newd.to_vp = receivePr->slaveID;
-        newd.to_task = receivePr->numTimesScheduled +1;
+        newd.to_task = receivePr->numTimesAssigned +1;
         //addToListOfArraysDependency(newd,semEnv->commDependenciesList);  
         addToListOfArrays(Dependency,newd,semEnv->commDependenciesList);   
       #endif 
@@ -318,9 +318,9 @@
        #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC
         Dependency newd;
         newd.from_vp = sendPr->slaveID;
-        newd.from_task = sendPr->numTimesScheduled;
+        newd.from_task = sendPr->numTimesAssigned;
         newd.to_vp = receivePr->slaveID;
-        newd.to_task = receivePr->numTimesScheduled +1;
+        newd.to_task = receivePr->numTimesAssigned +1;
         //addToListOfArraysDependency(newd,semEnv->commDependenciesList);  
         addToListOfArrays(Dependency,newd,semEnv->dynDependenciesList); 
         int32 groupId = semReq->msgType;
@@ -332,10 +332,10 @@
         }
         Unit u;
         u.vp = sendPr->slaveID;
-        u.task = sendPr->numTimesScheduled;
+        u.task = sendPr->numTimesAssigned;
         addToListOfArrays(Unit,u,semEnv->ntonGroups[groupId]->senders);
         u.vp = receivePr->slaveID;
-        u.task = receivePr->numTimesScheduled +1;
+        u.task = receivePr->numTimesAssigned +1;
         addToListOfArrays(Unit,u,semEnv->ntonGroups[groupId]->receivers);
        #endif
       
@@ -378,9 +378,9 @@
       #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC
         Dependency newd;
         newd.from_vp = sendPr->slaveID;
-        newd.from_task = sendPr->numTimesScheduled;
+        newd.from_task = sendPr->numTimesAssigned;
         newd.to_vp = receivePr->slaveID;
-        newd.to_task = receivePr->numTimesScheduled +1;
+        newd.to_task = receivePr->numTimesAssigned +1;
         //addToListOfArraysDependency(newd,semEnv->commDependenciesList);  
         addToListOfArrays(Dependency,newd,semEnv->commDependenciesList);    
       #endif  
diff -r a32504bb2a1c -r bd5ab695145c SSR_asm.s
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SSR_asm.s	Tue Mar 13 18:30:05 2012 -0700
@@ -0,0 +1,21 @@
+
+//Assembly code copies the return addr found on the stack at 0x8(%rbp)
+// into the singleton.  The first field in the singleton is the
+// "endInstrAddr" field, which is why the store goes to offset 0 of (%rdi)
+.globl asm_save_ret_to_singleton
+asm_save_ret_to_singleton:
+    movq 0x8(%rbp),     %rax   #get ret address, rbp is the same as in the calling function
+    movq     %rax,     (%rdi) #write ret addr to endInstrAddr field
+    ret
+
+
+//Assembly code changes the return addr on the stack to the one
+// saved into the singleton by the end-singleton-fn
+//The stack's return addr is at 0x8(%rbp)
+.globl asm_write_ret_from_singleton
+asm_write_ret_from_singleton:
+    movq    (%rdi),    %rax  #get endInstrAddr field
+    movq      %rax,    0x8(%rbp) #write return addr to the stack of the caller
+    ret
+
+
diff -r a32504bb2a1c -r bd5ab695145c SSR_lib.c
--- a/SSR_lib.c	Tue Mar 13 10:04:14 2012 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,807 +0,0 @@
-/*
- * Copyright 2010  OpenSourceCodeStewardshipFoundation
- *
- * Licensed under BSD
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <malloc.h>
-
-#include "../VMS_impl/VMS.h"
-#include "SSR.h"
-#include "../../C_Libraries/Queue_impl/PrivateQueue.h"
-#include "../../C_Libraries/Hash_impl/PrivateHash.h"
-#include "SSR.h"
-#include "SSR_Counter_Recording.h"
-
-//==========================================================================
-
-void
-SSR__init();
-
-void
-SSR__init_Seq();
-
-void
-SSR__init_Helper();
-//==========================================================================
-
-
-/*TODO: Q: dealing with library f()s and DKU vs WT vs FoR
- * (still want to do FoR, with time-lines as syntax, could be super cool)
- * A: thinking pin the coreCtlrs for all of BLIS -- let Master arbitrate
- * among library, DKU, WT, FoR -- all the patterns in terms of virtual
- * processors (or equivalently work-units), so Master picks which virt procr
- * from which portions of app (DKU, WT, FoR) onto which sched slots
- *Might even do hierarchy of masters -- group of sched slots for each core
- * has its own master, that keeps generated work local
- * single-reader-single-writer sync everywhere -- no atomic primitives
- * Might have the different schedulers talk to each other, to negotiate
- * larger-grain sharing of resources, according to predicted critical
- * path, and expansion of work
- */
-
-
-
-//===========================================================================
-
-
-/*These are the library functions *called in the application*
- * 
- *There's a pattern for the outside sequential code to interact with the
- * VMS_HW code.
- *The VMS_HW system is inside a boundary..  every SSR system is in its
- * own directory that contains the functions for each of the processor types.
- * One of the processor types is the "seed" processor that starts the
- * cascade of creating all the processors that do the work.
- *So, in the directory is a file called "EntryPoint.c" that contains the
- * function, named appropriately to the work performed, that the outside
- * sequential code calls.  This function follows a pattern:
- *1) it calls SSR__init()
- *2) it creates the initial data for the seed processor, which is passed
- *    in to the function
- *3) it creates the seed SSR processor, with the data to start it with.
- *4) it calls startSSRThenWaitUntilWorkDone
- *5) it gets the returnValue from the transfer struc and returns that
- *    from the function
- *
- *For now, a new SSR system has to be created via SSR__init every
- * time an entry point function is called -- later, might add letting the
- * SSR system be created once, and let all the entry points just reuse
- * it -- want to be as simple as possible now, and see by using what makes
- * sense for later..
- */
-
-
-
-//===========================================================================
-
-/*This is the "border crossing" function -- the thing that crosses from the
- * outside world, into the VMS_HW world.  It initializes and starts up the
- * VMS system, then creates one processor from the specified function and
- * puts it into the readyQ.  From that point, that one function is resp.
- * for creating all the other processors, that then create others, and so
- * forth.
- *When all the processors, including the seed, have dissipated, then this
- * function returns.  The results will have been written by side-effect via
- * pointers read from, or written into initData.
- *
- *NOTE: no Threads should exist in the outside program that might touch
- * any of the data reachable from initData passed in to here
- */
-void
-SSR__create_seed_procr_and_do_work( TopLevelFnPtr fnPtr, void *initData )
- { SSRSemEnv *semEnv;
-   SlaveVP *seedPr;
-
-   #ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE
-   SSR__init_Seq();  //debug sequential exe
-   #else
-   SSR__init();      //normal multi-thd
-   #endif
-   semEnv = _VMSMasterEnv->semanticEnv;
-
-      //SSR starts with one processor, which is put into initial environ,
-      // and which then calls create() to create more, thereby expanding work
-   seedPr = SSR__create_procr_helper( fnPtr, initData,
-                                      semEnv, semEnv->nextCoreToGetNewPr++ );
-
-   resume_slaveVP( seedPr, semEnv );
-   
-   #ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE
-   VMS_SS__start_the_work_then_wait_until_done_Seq();  //debug sequential exe
-   #else
-   VMS_SS__start_the_work_then_wait_until_done();      //normal multi-thd
-   #endif
-
-   SSR__cleanup_after_shutdown();
- }
-
-
-int32
-SSR__giveMinWorkUnitCycles( float32 percentOverhead )
- {
-   return MIN_WORK_UNIT_CYCLES;
- }
-
-int32
-SSR__giveIdealNumWorkUnits()
- {
-   return NUM_SCHED_SLOTS * NUM_CORES;
- }
-
-int32
-SSR__give_number_of_cores_to_schedule_onto()
- {
-   return NUM_CORES;
- }
-
-/*For now, use TSC -- later, make these two macros with assembly that first
- * saves jump point, and second jumps back several times to get reliable time
- */
-void
-SSR__start_primitive()
- { saveLowTimeStampCountInto( ((SSRSemEnv *)(_VMSMasterEnv->semanticEnv))->
-                              primitiveStartTime );
- }
-
-/*Just quick and dirty for now -- make reliable later
- * will want this to jump back several times -- to be sure cache is warm
- * because don't want comm time included in calc-time measurement -- and
- * also to throw out any "weird" values due to OS interrupt or TSC rollover
- */
-int32
-SSR__end_primitive_and_give_cycles()
- { int32 endTime, startTime;
-   //TODO: fix by repeating time-measurement
-   saveLowTimeStampCountInto( endTime );
-   startTime =((SSRSemEnv*)(_VMSMasterEnv->semanticEnv))->primitiveStartTime;
-   return (endTime - startTime);
- }
-
-//===========================================================================
-
-/*Initializes all the data-structures for a SSR system -- but doesn't
- * start it running yet!
- *
- *This runs in the main thread -- before VMS starts up
- * 
- *This sets up the semantic layer over the VMS system
- *
- *First, calls VMS_Setup, then creates own environment, making it ready
- * for creating the seed processor and then starting the work.
- */
-void
-SSR__init()
- {
-   VMS_SS__init();
-      //masterEnv, a global var, now is partially set up by init_VMS
-      // after this, have VMS_int__malloc and VMS_int__free available
-
-   SSR__init_Helper();
- }
-
-#ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE
-void
-SSR__init_Seq()
- {
-   VMS_SS__init_Seq();
-   flushRegisters();
-      //masterEnv, a global var, now is partially set up by init_VMS
-
-   SSR__init_Helper();
- }
-#endif
-
-void idle_fn(void* data, SlaveVP *animatingSlv){
-    while(1){
-        VMS_int__suspend_slaveVP_and_send_req(animatingSlv);
-    }
-}
-
-void
-SSR__init_Helper()
- { SSRSemEnv       *semanticEnv;
-   PrivQueueStruc **readyVPQs;
-   int              coreIdx, i, j;
- 
-      //Hook up the semantic layer's plug-ins to the Master virt procr
-   _VMSMasterEnv->requestHandler = &SSR__Request_Handler;
-   _VMSMasterEnv->slaveAssigner  = &SSR__schedule_slaveVP;
-   #ifdef HOLISTIC__TURN_ON_PERF_COUNTERS
-   _VMSMasterEnv->counterHandler = &SSR__counter_handler;
-   #endif
-
-      //create the semantic layer's environment (all its data) and add to
-      // the master environment
-   semanticEnv = VMS_int__malloc( sizeof( SSRSemEnv ) );
-   _VMSMasterEnv->semanticEnv = semanticEnv;
-   
-   #ifdef HOLISTIC__TURN_ON_PERF_COUNTERS
-   SSR__init_counter_data_structs();
-   #endif
-   for(i=0;i<NUM_CORES;++i){
-       for(j=0;j<NUM_SCHED_SLOTS;++j){
-           semanticEnv->idlePr[i][j] = VMS_int__create_slaveVP(&idle_fn,NULL);
-           semanticEnv->idlePr[i][j]->coreAnimatedBy = i;
-       }
-   }
-
-   #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC
-   semanticEnv->unitList = makeListOfArrays(sizeof(Unit),128);
-   semanticEnv->ctlDependenciesList = makeListOfArrays(sizeof(Dependency),128);
-   semanticEnv->commDependenciesList = makeListOfArrays(sizeof(Dependency),128);
-   semanticEnv->dynDependenciesList = makeListOfArrays(sizeof(Dependency),128);
-   semanticEnv->ntonGroupsInfo = makePrivDynArrayOfSize((void***)&(semanticEnv->ntonGroups),8);
-   
-   semanticEnv->hwArcs = makeListOfArrays(sizeof(Dependency),128);
-   memset(semanticEnv->last_in_slot,0,sizeof(NUM_CORES * NUM_SCHED_SLOTS * sizeof(Unit)));
-   #endif
-
-      //create the ready queue, hash tables used for pairing send to receive
-      // and so forth
-      //TODO: add hash tables for pairing sends with receives, and
-      // initialize the data ownership system
-   readyVPQs = VMS_int__malloc( NUM_CORES * sizeof(PrivQueueStruc *) );
-
-   for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
-    {
-      readyVPQs[ coreIdx ] = makeVMSQ();
-    }
-   
-   semanticEnv->readyVPQs = readyVPQs;
-   
-   semanticEnv->nextCoreToGetNewPr = 0;
-   semanticEnv->numSlaveVP = 0;
-   
-   semanticEnv->commHashTbl  = makeHashTable( 1<<16, &VMS_int__free );//start big
-
-   //TODO: bug -- turn these arrays into dyn arrays to eliminate limit
-   //semanticEnv->singletonHasBeenExecutedFlags = makeDynArrayInfo( );
-   //semanticEnv->transactionStrucs = makeDynArrayInfo( );
-   for( i = 0; i < NUM_STRUCS_IN_SEM_ENV; i++ )
-    {
-      semanticEnv->fnSingletons[i].endInstrAddr      = NULL;
-      semanticEnv->fnSingletons[i].hasBeenStarted    = FALSE;
-      semanticEnv->fnSingletons[i].hasFinished       = FALSE;
-      semanticEnv->fnSingletons[i].waitQ             = makeVMSQ();
-      semanticEnv->transactionStrucs[i].waitingVPQ   = makeVMSQ();
-    }
- }
-
-
-/*Frees any memory allocated by SSR__init() then calls VMS_int__shutdown
- */
-void
-SSR__cleanup_after_shutdown()
- { SSRSemEnv *semanticEnv;
-   
-   semanticEnv = _VMSMasterEnv->semanticEnv;
-
-   #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC
-   //UCC
-   FILE* output;
-   int n;
-   char filename[255];    
-    for(n=0;n<255;n++)
-    {
-        sprintf(filename, "./counters/UCC.%d",n);
-        output = fopen(filename,"r");
-        if(output)
-        {
-            fclose(output);
-        }else{
-            break;
-        }
-    }
-   if(n<255){
-    printf("Saving UCC to File: %s ...\n", filename);
-    output = fopen(filename,"w+");
-    if(output!=NULL){
-        set_dependency_file(output);
-        //fprintf(output,"digraph Dependencies {\n");
-        //set_dot_file(output);
-        //FIXME:  first line still depends on counters being enabled, replace w/ unit struct!
-        //forAllInDynArrayDo(_VMSMasterEnv->counter_history_array_info, &print_dot_node_info );
-        forAllInListOfArraysDo(semanticEnv->unitList, &print_unit_to_file);
-        forAllInListOfArraysDo( semanticEnv->commDependenciesList, &print_comm_dependency_to_file );
-        forAllInListOfArraysDo( semanticEnv->ctlDependenciesList, &print_ctl_dependency_to_file );
-        forAllInDynArrayDo(semanticEnv->ntonGroupsInfo,&print_nton_to_file);
-        //fprintf(output,"}\n");
-        fflush(output);
-
-    } else
-        printf("Opening UCC file failed. Please check that folder \"counters\" exists in run directory and has write permission.\n");
-   } else {
-       printf("Could not open UCC file, please clean \"counters\" folder. (Must contain less than 255 files.)\n");
-   }
-   //Loop Graph
-   for(n=0;n<255;n++)
-    {
-        sprintf(filename, "./counters/LoopGraph.%d",n);
-        output = fopen(filename,"r");
-        if(output)
-        {
-            fclose(output);
-        }else{
-            break;
-        }
-    }
-   if(n<255){
-    printf("Saving LoopGraph to File: %s ...\n", filename);
-    output = fopen(filename,"w+");
-    if(output!=NULL){
-        set_dependency_file(output);
-        //fprintf(output,"digraph Dependencies {\n");
-        //set_dot_file(output);
-        //FIXME:  first line still depends on counters being enabled, replace w/ unit struct!
-        //forAllInDynArrayDo(_VMSMasterEnv->counter_history_array_info, &print_dot_node_info );
-        forAllInListOfArraysDo( semanticEnv->unitList, &print_unit_to_file );
-        forAllInListOfArraysDo( semanticEnv->commDependenciesList, &print_comm_dependency_to_file );
-        forAllInListOfArraysDo( semanticEnv->ctlDependenciesList, &print_ctl_dependency_to_file );
-        forAllInListOfArraysDo( semanticEnv->dynDependenciesList, &print_dyn_dependency_to_file );
-        forAllInListOfArraysDo( semanticEnv->hwArcs, &print_hw_dependency_to_file );
-        //fprintf(output,"}\n");
-        fflush(output);
-
-    } else
-        printf("Opening LoopGraph file failed. Please check that folder \"counters\" exists in run directory and has write permission.\n");
-   } else {
-       printf("Could not open LoopGraph file, please clean \"counters\" folder. (Must contain less than 255 files.)\n");
-   }
-   
-   
-   freeListOfArrays(semanticEnv->unitList);
-   freeListOfArrays(semanticEnv->commDependenciesList);
-   freeListOfArrays(semanticEnv->ctlDependenciesList);
-   freeListOfArrays(semanticEnv->dynDependenciesList);
-   
-   #endif
-#ifdef HOLISTIC__TURN_ON_PERF_COUNTERS    
-    for(n=0;n<255;n++)
-    {
-        sprintf(filename, "./counters/Counters.%d.csv",n);
-        output = fopen(filename,"r");
-        if(output)
-        {
-            fclose(output);
-        }else{
-            break;
-        }
-    }
-    if(n<255){
-    printf("Saving Counter measurements to File: %s ...\n", filename);
-    output = fopen(filename,"w+");
-    if(output!=NULL){
-        set_counter_file(output);
-        int i;
-        for(i=0;i<NUM_CORES;i++){
-            forAllInListOfArraysDo( semanticEnv->counterList[i], &print_counter_events_to_file );
-            fflush(output);
-        }
-
-    } else
-        printf("Opening UCC file failed. Please check that folder \"counters\" exists in run directory and has write permission.\n");
-   } else {
-       printf("Could not open UCC file, please clean \"counters\" folder. (Must contain less than 255 files.)\n");
-   }
-    
-#endif
-/* It's all allocated inside VMS's big chunk -- that's about to be freed, so
- *  nothing to do here
-   
-
-   for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
-    {
-      VMS_int__free( semanticEnv->readyVPQs[coreIdx]->startOfData );
-      VMS_int__free( semanticEnv->readyVPQs[coreIdx] );
-    }
-   VMS_int__free( semanticEnv->readyVPQs );
-   
-   freeHashTable( semanticEnv->commHashTbl );
-   VMS_int__free( _VMSMasterEnv->semanticEnv );
- */
-   VMS_SS__cleanup_at_end_of_shutdown();
- }
-
-
-//===========================================================================
-
-/*
- */
-  SlaveVP *
-SSR__create_procr_with( TopLevelFnPtr fnPtr,   void *initData,
-                        SlaveVP *creatingPr )
- { SSRSemReq reqData;
-
-      //the semantic request data is on the stack and disappears when this
-      // call returns -- it's guaranteed to remain in the VP's stack for as
-      // long as the VP is suspended.
-   reqData.reqType            = 0; //know type because in a VMS create req
-   reqData.coreToScheduleOnto = -1; //means round-robin schedule
-   reqData.fnPtr              = fnPtr;
-   reqData.initData           = initData;
-   reqData.sendPr             = creatingPr;
-
-   VMS_WL__send_create_slaveVP_req( &reqData, creatingPr );
-
-   return creatingPr->dataRetFromReq;
- }
-
-  SlaveVP *
-SSR__create_procr_with_affinity( TopLevelFnPtr fnPtr, void *initData,
-                        SlaveVP *creatingPr,  int32  coreToScheduleOnto )
- { SSRSemReq  reqData;
-
-      //the semantic request data is on the stack and disappears when this
-      // call returns -- it's guaranteed to remain in the VP's stack for as
-      // long as the VP is suspended.
-   reqData.reqType            = 0; //know type because in a VMS create req
-   reqData.coreToScheduleOnto = coreToScheduleOnto;
-   reqData.fnPtr              = fnPtr;
-   reqData.initData           = initData;
-   reqData.sendPr             = creatingPr;
-
-   VMS_WL__send_create_slaveVP_req( &reqData, creatingPr );
-
-   return creatingPr->dataRetFromReq;
- }
-
-
-  void
-SSR__dissipate_procr( SlaveVP *procrToDissipate )
- {
-   VMS_WL__send_dissipate_req( procrToDissipate );
- }
-
-
-//===========================================================================
-
-void *
-SSR__malloc_to( int32 sizeToMalloc, SlaveVP *owningPr )
- { SSRSemReq reqData;
-
-   reqData.reqType      = malloc_req;
-   reqData.sendPr       = owningPr;
-   reqData.sizeToMalloc = sizeToMalloc;
-
-   VMS_WL__send_sem_request( &reqData, owningPr );
-
-   return owningPr->dataRetFromReq;
- }
-
-
-/*Sends request to Master, which does the work of freeing
- */
-void
-SSR__free( void *ptrToFree, SlaveVP *owningPr )
- { SSRSemReq reqData;
-
-   reqData.reqType      = free_req;
-   reqData.sendPr       = owningPr;
-   reqData.ptrToFree    = ptrToFree;
-
-   VMS_WL__send_sem_request( &reqData, owningPr );
- }
-
-
-void
-SSR__transfer_ownership_of_from_to( void *data, SlaveVP *oldOwnerSlv,
-                                                  SlaveVP *newOwnerPr )
- {
-   //TODO: put in the ownership system that automatically frees when no
-   // owners of data left -- will need keeper for keeping data around when
-   // future created processors might need it but don't exist yet
- }
-
-
-void
-SSR__add_ownership_by_to( SlaveVP *newOwnerSlv, void *data )
- {
-
- }
-
-
-void
-SSR__remove_ownership_by_from( SlaveVP *loserSlv, void *dataLosing )
- {
-
- }
-
-
-/*Causes the SSR system to remove internal ownership, so data won't be
- * freed when SSR shuts down, and will persist in the external program.
- *
- *Must be called from the processor that currently owns the data.
- *
- *IMPL: Transferring ownership touches two different virtual processor's
- * state -- which means it has to be done carefully -- the VMS rules for
- * semantic layers say that a work-unit is only allowed to touch the
- * virtual processor it is part of, and that only a single work-unit per
- * virtual processor be scheduled to a slave at a time.  So, this has to
- * modify the virtual processor that owns the work-unit that called this
- * function, then create a request to have the other processor modified.
- *However, in this case, the TO processor is the outside, and transfers
- * are only allowed to be called by the giver-upper, so can mark caller of
- * this function as no longer owner, and return -- done.
- */
-void
-SSR__transfer_ownership_to_outside( void *data )
- {
-   //TODO: removeAllOwnersFrom( data );
- }
-
-
-//===========================================================================
-
-void
-SSR__send_of_type_to( SlaveVP *sendPr, void *msg, const int type,
-                        SlaveVP *receivePr)
- { SSRSemReq  reqData;
-
-   reqData.receivePr = receivePr;
-   reqData.sendPr    = sendPr;
-   reqData.reqType   = send_type;
-   reqData.msgType   = type;
-   reqData.msg       = msg;
-   reqData.nextReqInHashEntry = NULL;
-
-      //On ownership -- remove inside the send and let ownership sit in limbo
-      // as a potential in an entry in the hash table, when this receive msg
-      // gets paired to a send, the ownership gets added to the receivePr --
-      // the next work-unit in the receivePr's trace will have ownership.
-   VMS_WL__send_sem_request( &reqData, sendPr );
-
-      //When come back from suspend, no longer own data reachable from msg
-      //TODO: release ownership here
- }
-
-void
-SSR__send_from_to( void *msg, SlaveVP *sendPr, SlaveVP *receivePr )
- { SSRSemReq  reqData;
-
-      //hash on the receiver, 'cause always know it, but sometimes want to
-      // receive from anonymous sender
-
-   reqData.receivePr = receivePr;
-   reqData.sendPr    = sendPr;
-   reqData.reqType   = send_from_to;
-   reqData.msg       = msg;
-   reqData.nextReqInHashEntry = NULL;
-
-   VMS_WL__send_sem_request( &reqData, sendPr );
- }
-
-
-//===========================================================================
-
-void *
-SSR__receive_any_to( SlaveVP *receivePr )
- {
-
- }
-
-void *
-SSR__receive_type_to( const int type, SlaveVP *receivePr )
- { 
-   SSRSemReq  reqData;
-
-   reqData.receivePr = receivePr;
-   reqData.reqType   = receive_type;
-   reqData.msgType   = type;
-   reqData.nextReqInHashEntry = NULL;
-
-   VMS_WL__send_sem_request( &reqData, receivePr );
-   
-   return receivePr->dataRetFromReq;
- }
-
-
-
-/*Call this at point receiving virt pr wants in-coming data.
- * 
- *The reason receivePr must call this is that it modifies the receivPr
- * loc structure directly -- and the VMS rules state a virtual processor
- * loc structure can only be modified by itself.
- */
-void *
-SSR__receive_from_to( SlaveVP *sendPr, SlaveVP *receivePr )
- { SSRSemReq  reqData;
-
-      //hash on the receiver, 'cause always know it, but sometimes want to
-      // receive from anonymous sender
-
-   reqData.receivePr = receivePr;
-   reqData.sendPr    = sendPr;
-   reqData.reqType   = receive_from_to;
-   reqData.nextReqInHashEntry = NULL;
-
-   VMS_WL__send_sem_request( &reqData, receivePr );
-
-   return receivePr->dataRetFromReq;
- }
-
-
-//===========================================================================
-//
-/*A function singleton is a function whose body executes exactly once, on a
- * single core, no matter how many times the fuction is called and no
- * matter how many cores or the timing of cores calling it.
- *
- *A data singleton is a ticket attached to data.  That ticket can be used
- * to get the data through the function exactly once, no matter how many
- * times the data is given to the function, and no matter the timing of
- * trying to get the data through from different cores.
- */
-
-/*asm function declarations*/
-void asm_save_ret_to_singleton(SSRSingleton *singletonPtrAddr);
-void asm_write_ret_from_singleton(SSRSingleton *singletonPtrAddr);
-
-/*Fn singleton uses ID as index into array of singleton structs held in the
- * semantic environment.
- */
-void
-SSR__start_fn_singleton( int32 singletonID,   SlaveVP *animPr )
- {
-   SSRSemReq  reqData;
-
-      //
-   reqData.reqType     = singleton_fn_start;
-   reqData.singletonID = singletonID;
-
-   VMS_WL__send_sem_request( &reqData, animPr );
-   if( animPr->dataRetFromReq ) //will be 0 or addr of label in end singleton
-    {
-       SSRSemEnv *semEnv = VMS_int__give_sem_env_for( animPr );
-       asm_write_ret_from_singleton(&(semEnv->fnSingletons[ singletonID]));
-    }
- }
-
-/*Data singleton hands addr of loc holding a pointer to a singleton struct.
- * The start_data_singleton makes the structure and puts its addr into the
- * location.
- */
-void
-SSR__start_data_singleton( SSRSingleton **singletonAddr,  SlaveVP *animPr )
- {
-   SSRSemReq  reqData;
-
-   if( *singletonAddr && (*singletonAddr)->hasFinished )
-       goto JmpToEndSingleton;
-   
-   reqData.reqType          = singleton_data_start;
-   reqData.singletonPtrAddr = singletonAddr;
-
-   VMS_WL__send_sem_request( &reqData, animPr );
-   if( animPr->dataRetFromReq ) //either 0 or end singleton's return addr
-    {    //Assembly code changes the return addr on the stack to the one
-         // saved into the singleton by the end-singleton-fn
-         //The return addr is at 0x4(%%ebp)
-        JmpToEndSingleton:
-          asm_write_ret_from_singleton(*singletonAddr);
-    }
-   //now, simply return
-   //will exit either from the start singleton call or the end-singleton call
- }
-
-/*Uses ID as index into array of flags.  If flag already set, resumes from
- * end-label.  Else, sets flag and resumes normally.
- *
- *Note, this call cannot be inlined because the instr addr at the label
- * inside is shared by all invocations of a given singleton ID.
- */
-void
-SSR__end_fn_singleton( int32 singletonID, SlaveVP *animPr )
- {
-   SSRSemReq  reqData;
-
-      //don't need this addr until after at least one singleton has reached
-      // this function
-   SSRSemEnv *semEnv = VMS_int__give_sem_env_for( animPr );
-   asm_write_ret_from_singleton(&(semEnv->fnSingletons[ singletonID]));
-
-   reqData.reqType     = singleton_fn_end;
-   reqData.singletonID = singletonID;
-
-   VMS_WL__send_sem_request( &reqData, animPr );
-
-EndSingletonInstrAddr:
-   return;
- }
-
-void
-SSR__end_data_singleton(  SSRSingleton **singletonPtrAddr, SlaveVP *animPr )
- {
-   SSRSemReq  reqData;
-
-      //don't need this addr until after singleton struct has reached
-      // this function for first time
-      //do assembly that saves the return addr of this fn call into the
-      // data singleton -- that data-singleton can only be given to exactly
-      // one instance in the code of this function.  However, can use this
-      // function in different places for different data-singletons.
-//   (*(singletonAddr))->endInstrAddr =  &&EndDataSingletonInstrAddr;
-
-
-   asm_save_ret_to_singleton(*singletonPtrAddr);
-
-   reqData.reqType          = singleton_data_end;
-   reqData.singletonPtrAddr = singletonPtrAddr;
-
-   VMS_WL__send_sem_request( &reqData, animPr );
- }
-
-/*This executes the function in the masterVP, so it executes in isolation
- * from any other copies -- only one copy of the function can ever execute
- * at a time.
- *
- *It suspends to the master, and the request handler takes the function
- * pointer out of the request and calls it, then resumes the VP.
- *Only very short functions should be called this way -- for longer-running
- * isolation, use transaction-start and transaction-end, which run the code
- * between as work-code.
- */
-void
-SSR__animate_short_fn_in_isolation( PtrToAtomicFn ptrToFnToExecInMaster,
-                                    void *data, SlaveVP *animPr )
- {
-   SSRSemReq  reqData;
-
-      //
-   reqData.reqType          = atomic;
-   reqData.fnToExecInMaster = ptrToFnToExecInMaster;
-   reqData.dataForFn        = data;
-
-   VMS_WL__send_sem_request( &reqData, animPr );
- }
-
-
-/*This suspends to the master.
- *First, it looks at the VP's data, to see the highest transactionID that VP
- * already has entered.  If the current ID is not larger, it throws an
- * exception stating a bug in the code.  Otherwise it puts the current ID
- * there, and adds the ID to a linked list of IDs entered -- the list is
- * used to check that exits are properly ordered.
- *Next it is uses transactionID as index into an array of transaction
- * structures.
- *If the "VP_currently_executing" field is non-null, then put requesting VP
- * into queue in the struct.  (At some point a holder will request
- * end-transaction, which will take this VP from the queue and resume it.)
- *If NULL, then write requesting into the field and resume.
- */
-void
-SSR__start_transaction( int32 transactionID, SlaveVP *animPr )
- {
-   SSRSemReq  reqData;
-
-      //
-   reqData.sendPr      = animPr;
-   reqData.reqType     = trans_start;
-   reqData.transID     = transactionID;
-
-   VMS_WL__send_sem_request( &reqData, animPr );
- }
-
-/*This suspends to the master, then uses transactionID as index into an
- * array of transaction structures.
- *It looks at VP_currently_executing to be sure it's same as requesting VP.
- * If different, throws an exception, stating there's a bug in the code.
- *Next it looks at the queue in the structure.
- *If it's empty, it sets VP_currently_executing field to NULL and resumes.
- *If something in, gets it, sets VP_currently_executing to that VP, then
- * resumes both.
- */
-void
-SSR__end_transaction( int32 transactionID, SlaveVP *animPr )
- {
-   SSRSemReq  reqData;
-
-      //
-   reqData.sendPr      = animPr;
-   reqData.reqType     = trans_end;
-   reqData.transID     = transactionID;
-
-   VMS_WL__send_sem_request( &reqData, animPr );
- }
diff -r a32504bb2a1c -r bd5ab695145c dependency.c
--- a/dependency.c	Tue Mar 13 10:04:14 2012 -0700
+++ b/dependency.c	Tue Mar 13 18:30:05 2012 -0700
@@ -1,5 +1,5 @@
 #include "dependency.h"
-#include "../VMS_impl/VMS.h"
+#include "VMS_impl/VMS.h"
 
 Dependency* new_dependency(int from_vp, int from_task, int to_vp, int to_task){
     Dependency* newDep = (Dependency*) VMS_int__malloc(sizeof(Dependency));