# HG changeset patch # User Sean Halle # Date 1346058875 25200 # Node ID 9f2a7bd26dd98176fcaa50a76e53b5dfd59cd32d Initial add -- code is straight copy of VSs implementation.. to be modified diff -r 000000000000 -r 9f2a7bd26dd9 .hgeol --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.hgeol Mon Aug 27 02:14:35 2012 -0700 @@ -0,0 +1,12 @@ + +[patterns] +**.py = native +**.txt = native +**.c = native +**.h = native +**.cpp = native +**.java = native +**.sh = native +**.pl = native +**.jpg = bin +**.gif = bin diff -r 000000000000 -r 9f2a7bd26dd9 .hgignore --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.hgignore Mon Aug 27 02:14:35 2012 -0700 @@ -0,0 +1,3 @@ +syntax: glob + +*.o diff -r 000000000000 -r 9f2a7bd26dd9 DKU.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DKU.c Mon Aug 27 02:14:35 2012 -0700 @@ -0,0 +1,853 @@ +/* + * Copyright 2010 OpenSourceCodeStewardshipFoundation + * + * Licensed under BSD + */ + +#include +#include +#include + +#include "Queue_impl/PrivateQueue.h" +#include "Hash_impl/PrivateHash.h" + +#include "VSs.h" +#include "Measurement/VSs_Counter_Recording.h" + +//========================================================================== + +void +VSs__init(); + +void +VSs__init_Helper(); +//========================================================================== + + + +//=========================================================================== + + +/*These are the library functions *called in the application* + * + *There's a pattern for the outside sequential code to interact with the + * VMS_HW code. + *The VMS_HW system is inside a boundary.. every VSs system is in its + * own directory that contains the functions for each of the processor types. + * One of the processor types is the "seed" processor that starts the + * cascade of creating all the processors that do the work. 
 *So, in the directory is a file called "EntryPoint.c" that contains the
 * function, named appropriately to the work performed, that the outside
 * sequential code calls.  This function follows a pattern:
 *1) it calls VSs__init()
 *2) it creates the initial data for the seed processor, which is passed
 *   in to the function
 *3) it creates the seed VSs processor, with the data to start it with.
 *4) it calls startVSsThenWaitUntilWorkDone
 *5) it gets the returnValue from the transfer struct and returns that
 *   from the function
 *
 *For now, a new VSs system has to be created via VSs__init every
 * time an entry point function is called -- later, might add letting the
 * VSs system be created once, and let all the entry points just reuse
 * it -- want to be as simple as possible now, and see by using what makes
 * sense for later..
 */



//===========================================================================

/*This is the "border crossing" function -- the thing that crosses from the
 * outside world, into the VMS_HW world.  It initializes and starts up the
 * VMS system, then creates one processor from the specified function and
 * puts it into the readyQ.  From that point, that one function is responsible
 * for creating all the other processors, that then create others, and so
 * forth.
 *When all the processors, including the seed, have dissipated, then this
 * function returns.  The results will have been written by side-effect via
 * pointers read from, or written into initData.
+ * + *NOTE: no Threads should exist in the outside program that might touch + * any of the data reachable from initData passed in to here + */ +void +VSs__create_seed_slave_and_do_work( TopLevelFnPtr fnPtr, void *initData ) + { VSsSemEnv *semEnv; + SlaveVP *seedSlv; + VSsSemData *semData; + VSsTaskStub *threadTaskStub, *parentTaskStub; + + VSs__init(); //normal multi-thd + + semEnv = _VMSMasterEnv->semanticEnv; + + //VSs starts with one processor, which is put into initial environ, + // and which then calls create() to create more, thereby expanding work + seedSlv = VSs__create_slave_helper( fnPtr, initData, + semEnv, semEnv->nextCoreToGetNewSlv++ ); + + //seed slave is a thread slave, so make a thread's task stub for it + // and then make another to stand for the seed's parent task. Make + // the parent be already ended, and have one child (the seed). This + // will make the dissipate handler do the right thing when the seed + // is dissipated. + threadTaskStub = create_thread_task_stub( initData ); + parentTaskStub = create_thread_task_stub( NULL ); + parentTaskStub->isEnded = TRUE; + parentTaskStub->numLiveChildThreads = 1; //so dissipate works for seed + threadTaskStub->parentTaskStub = parentTaskStub; + + semData = (VSsSemData *)seedSlv->semanticData; + //seedVP is a thread, so has a permanent task + semData->needsTaskAssigned = FALSE; + semData->taskStub = threadTaskStub; + semData->slaveType = ThreadSlv; + + resume_slaveVP( seedSlv, semEnv ); //returns right away, just queues Slv + + VMS_SS__start_the_work_then_wait_until_done(); //normal multi-thd + + VSs__cleanup_after_shutdown(); + } + + +int32 +VSs__giveMinWorkUnitCycles( float32 percentOverhead ) + { + return MIN_WORK_UNIT_CYCLES; + } + +int32 +VSs__giveIdealNumWorkUnits() + { + return NUM_ANIM_SLOTS * NUM_CORES; + } + +int32 +VSs__give_number_of_cores_to_schedule_onto() + { + return NUM_CORES; + } + +/*For now, use TSC -- later, make these two macros with assembly that first + * saves jump point, and 
second jumps back several times to get reliable time + */ +void +VSs__start_primitive() + { saveLowTimeStampCountInto( ((VSsSemEnv *)(_VMSMasterEnv->semanticEnv))-> + primitiveStartTime ); + } + +/*Just quick and dirty for now -- make reliable later + * will want this to jump back several times -- to be sure cache is warm + * because don't want comm time included in calc-time measurement -- and + * also to throw out any "weird" values due to OS interrupt or TSC rollover + */ +int32 +VSs__end_primitive_and_give_cycles() + { int32 endTime, startTime; + //TODO: fix by repeating time-measurement + saveLowTimeStampCountInto( endTime ); + startTime =((VSsSemEnv*)(_VMSMasterEnv->semanticEnv))->primitiveStartTime; + return (endTime - startTime); + } + +//=========================================================================== + +/*Initializes all the data-structures for a VSs system -- but doesn't + * start it running yet! + * + *This runs in the main thread -- before VMS starts up + * + *This sets up the semantic layer over the VMS system + * + *First, calls VMS_Setup, then creates own environment, making it ready + * for creating the seed processor and then starting the work. 
+ */ +void +VSs__init() + { + VMS_SS__init(); + //masterEnv, a global var, now is partially set up by init_VMS + // after this, have VMS_int__malloc and VMS_int__free available + + VSs__init_Helper(); + } + + +void idle_fn(void* data, SlaveVP *animatingSlv){ + while(1){ + VMS_int__suspend_slaveVP_and_send_req(animatingSlv); + } +} + +void +VSs__init_Helper() + { VSsSemEnv *semanticEnv; + int32 i, coreNum, slotNum; + VSsSemData *semData; + + //Hook up the semantic layer's plug-ins to the Master virt procr + _VMSMasterEnv->requestHandler = &VSs__Request_Handler; + _VMSMasterEnv->slaveAssigner = &VSs__assign_slaveVP_to_slot; + + //create the semantic layer's environment (all its data) and add to + // the master environment + semanticEnv = VMS_int__malloc( sizeof( VSsSemEnv ) ); + _VMSMasterEnv->semanticEnv = semanticEnv; + + #ifdef HOLISTIC__TURN_ON_PERF_COUNTERS + _VMSMasterEnv->counterHandler = &VSs__counter_handler; + VSs__init_counter_data_structs(); + #endif + + semanticEnv->shutdownInitiated = FALSE; + semanticEnv->coreIsDone = VMS_int__malloc( NUM_CORES * sizeof( bool32 ) ); + //For each animation slot, there is an idle slave, and an initial + // slave assigned as the current-task-slave. Create them here. 
+ SlaveVP *idleSlv, *slotTaskSlv; + for( coreNum = 0; coreNum < NUM_CORES; coreNum++ ) + { semanticEnv->coreIsDone[coreNum] = FALSE; //use during shutdown + + for( slotNum = 0; slotNum < NUM_ANIM_SLOTS; ++slotNum ) + { idleSlv = VSs__create_slave_helper( &idle_fn, NULL, semanticEnv, 0); + idleSlv->coreAnimatedBy = coreNum; + idleSlv->animSlotAssignedTo = + _VMSMasterEnv->allAnimSlots[coreNum][slotNum]; + semanticEnv->idleSlv[coreNum][slotNum] = idleSlv; + + slotTaskSlv = VSs__create_slave_helper( &idle_fn, NULL, semanticEnv, 0); + slotTaskSlv->coreAnimatedBy = coreNum; + slotTaskSlv->animSlotAssignedTo = + _VMSMasterEnv->allAnimSlots[coreNum][slotNum]; + + semData = slotTaskSlv->semanticData; + semData->needsTaskAssigned = TRUE; + semData->slaveType = SlotTaskSlv; + semanticEnv->slotTaskSlvs[coreNum][slotNum] = slotTaskSlv; + } + } + + //create the ready queues, hash tables used for matching and so forth + semanticEnv->slavesReadyToResumeQ = makeVMSQ(); + semanticEnv->freeExtraTaskSlvQ = makeVMSQ(); + semanticEnv->taskReadyQ = makeVMSQ(); + + semanticEnv->argPtrHashTbl = makeHashTable32( 16, &VMS_int__free ); + semanticEnv->commHashTbl = makeHashTable32( 16, &VMS_int__free ); + + semanticEnv->nextCoreToGetNewSlv = 0; + + + //TODO: bug -- turn these arrays into dyn arrays to eliminate limit + //semanticEnv->singletonHasBeenExecutedFlags = makeDynArrayInfo( ); + //semanticEnv->transactionStrucs = makeDynArrayInfo( ); + for( i = 0; i < NUM_STRUCS_IN_SEM_ENV; i++ ) + { + semanticEnv->fnSingletons[i].endInstrAddr = NULL; + semanticEnv->fnSingletons[i].hasBeenStarted = FALSE; + semanticEnv->fnSingletons[i].hasFinished = FALSE; + semanticEnv->fnSingletons[i].waitQ = makeVMSQ(); + semanticEnv->transactionStrucs[i].waitingVPQ = makeVMSQ(); + } + + semanticEnv->numLiveExtraTaskSlvs = 0; //must be last + semanticEnv->numLiveThreadSlvs = 1; //must be last, counts the seed + + #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC + semanticEnv->unitList = makeListOfArrays(sizeof(Unit),128); + 
semanticEnv->ctlDependenciesList = makeListOfArrays(sizeof(Dependency),128); + semanticEnv->commDependenciesList = makeListOfArrays(sizeof(Dependency),128); + semanticEnv->dynDependenciesList = makeListOfArrays(sizeof(Dependency),128); + semanticEnv->ntonGroupsInfo = makePrivDynArrayOfSize((void***)&(semanticEnv->ntonGroups),8); + + semanticEnv->hwArcs = makeListOfArrays(sizeof(Dependency),128); + memset(semanticEnv->last_in_slot,0,sizeof(NUM_CORES * NUM_ANIM_SLOTS * sizeof(Unit))); + #endif + } + + +/*Frees any memory allocated by VSs__init() then calls VMS_int__shutdown + */ +void +VSs__cleanup_after_shutdown() + { VSsSemEnv *semanticEnv; + + semanticEnv = _VMSMasterEnv->semanticEnv; + + #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC + //UCC + FILE* output; + int n; + char filename[255]; + for(n=0;n<255;n++) + { + sprintf(filename, "./counters/UCC.%d",n); + output = fopen(filename,"r"); + if(output) + { + fclose(output); + }else{ + break; + } + } + if(n<255){ + printf("Saving UCC to File: %s ...\n", filename); + output = fopen(filename,"w+"); + if(output!=NULL){ + set_dependency_file(output); + //fprintf(output,"digraph Dependencies {\n"); + //set_dot_file(output); + //FIXME: first line still depends on counters being enabled, replace w/ unit struct! + //forAllInDynArrayDo(_VMSMasterEnv->counter_history_array_info, &print_dot_node_info ); + forAllInListOfArraysDo(semanticEnv->unitList, &print_unit_to_file); + forAllInListOfArraysDo( semanticEnv->commDependenciesList, &print_comm_dependency_to_file ); + forAllInListOfArraysDo( semanticEnv->ctlDependenciesList, &print_ctl_dependency_to_file ); + forAllInDynArrayDo(semanticEnv->ntonGroupsInfo,&print_nton_to_file); + //fprintf(output,"}\n"); + fflush(output); + + } else + printf("Opening UCC file failed. Please check that folder \"counters\" exists in run directory and has write permission.\n"); + } else { + printf("Could not open UCC file, please clean \"counters\" folder. 
(Must contain less than 255 files.)\n"); + } + //Loop Graph + for(n=0;n<255;n++) + { + sprintf(filename, "./counters/LoopGraph.%d",n); + output = fopen(filename,"r"); + if(output) + { + fclose(output); + }else{ + break; + } + } + if(n<255){ + printf("Saving LoopGraph to File: %s ...\n", filename); + output = fopen(filename,"w+"); + if(output!=NULL){ + set_dependency_file(output); + //fprintf(output,"digraph Dependencies {\n"); + //set_dot_file(output); + //FIXME: first line still depends on counters being enabled, replace w/ unit struct! + //forAllInDynArrayDo(_VMSMasterEnv->counter_history_array_info, &print_dot_node_info ); + forAllInListOfArraysDo( semanticEnv->unitList, &print_unit_to_file ); + forAllInListOfArraysDo( semanticEnv->commDependenciesList, &print_comm_dependency_to_file ); + forAllInListOfArraysDo( semanticEnv->ctlDependenciesList, &print_ctl_dependency_to_file ); + forAllInListOfArraysDo( semanticEnv->dynDependenciesList, &print_dyn_dependency_to_file ); + forAllInListOfArraysDo( semanticEnv->hwArcs, &print_hw_dependency_to_file ); + //fprintf(output,"}\n"); + fflush(output); + + } else + printf("Opening LoopGraph file failed. Please check that folder \"counters\" exists in run directory and has write permission.\n"); + } else { + printf("Could not open LoopGraph file, please clean \"counters\" folder. 
(Must contain less than 255 files.)\n"); + } + + + freeListOfArrays(semanticEnv->unitList); + freeListOfArrays(semanticEnv->commDependenciesList); + freeListOfArrays(semanticEnv->ctlDependenciesList); + freeListOfArrays(semanticEnv->dynDependenciesList); + + #endif +#ifdef HOLISTIC__TURN_ON_PERF_COUNTERS + for(n=0;n<255;n++) + { + sprintf(filename, "./counters/Counters.%d.csv",n); + output = fopen(filename,"r"); + if(output) + { + fclose(output); + }else{ + break; + } + } + if(n<255){ + printf("Saving Counter measurements to File: %s ...\n", filename); + output = fopen(filename,"w+"); + if(output!=NULL){ + set_counter_file(output); + int i; + for(i=0;icounterList[i], &print_counter_events_to_file ); + fflush(output); + } + + } else + printf("Opening UCC file failed. Please check that folder \"counters\" exists in run directory and has write permission.\n"); + } else { + printf("Could not open UCC file, please clean \"counters\" folder. (Must contain less than 255 files.)\n"); + } + +#endif +/* It's all allocated inside VMS's big chunk -- that's about to be freed, so + * nothing to do here + + + for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ ) + { + VMS_int__free( semanticEnv->readyVPQs[coreIdx]->startOfData ); + VMS_int__free( semanticEnv->readyVPQs[coreIdx] ); + } + VMS_int__free( semanticEnv->readyVPQs ); + + freeHashTable( semanticEnv->commHashTbl ); + VMS_int__free( _VMSMasterEnv->semanticEnv ); + */ + VMS_SS__cleanup_at_end_of_shutdown(); + } + + +//=========================================================================== + +SlaveVP * +VSs__create_thread( TopLevelFnPtr fnPtr, void *initData, + SlaveVP *creatingThd ) + { VSsSemReq reqData; + + //the semantic request data is on the stack and disappears when this + // call returns -- it's guaranteed to remain in the VP's stack for as + // long as the VP is suspended. 
+ reqData.reqType = 0; //know type because in a VMS create req + reqData.fnPtr = fnPtr; + reqData.initData = initData; + reqData.callingSlv = creatingThd; + + VMS_WL__send_create_slaveVP_req( &reqData, creatingThd ); + + return creatingThd->dataRetFromReq; + } + +/*This is always the last thing done in the code animated by a thread VP. + * Normally, this would be the last line of the thread's top level function. + * But, if the thread exits from any point, it has to do so by calling + * this. + * + *It simply sends a dissipate request, which handles all the state cleanup. + */ +void +VSs__end_thread( SlaveVP *thdToEnd ) + { VSsSemData *semData; + + VMS_WL__send_dissipate_req( thdToEnd ); + } + + + +//=========================================================================== + + +//======================= task submit and end ============================== +/* + */ +void +VSs__submit_task( VSsTaskType *taskType, void *args, SlaveVP *animSlv) + { VSsSemReq reqData; + + reqData.reqType = submit_task; + + reqData.taskType = taskType; + reqData.args = args; + reqData.callingSlv = animSlv; + + reqData.taskID = NULL; + + VMS_WL__send_sem_request( &reqData, animSlv ); + } + +inline int32 * +VSs__create_taskID_of_size( int32 numInts, SlaveVP *animSlv ) + { int32 *taskID; + + taskID = VMS_WL__malloc( sizeof(int32) + numInts * sizeof(int32) ); + taskID[0] = numInts; + return taskID; + } + +void +VSs__submit_task_with_ID( VSsTaskType *taskType, void *args, int32 *taskID, + SlaveVP *animSlv) + { VSsSemReq reqData; + + reqData.reqType = submit_task; + + reqData.taskType = taskType; + reqData.args = args; + reqData.taskID = taskID; + reqData.callingSlv = animSlv; + + VMS_WL__send_sem_request( &reqData, animSlv ); + } + + +/*This call is the last to happen in every task. It causes the slave to + * suspend and get the next task out of the task-queue. Notice there is no + * assigner here.. only one slave, no slave ReadyQ, and so on.. 
+ *Can either make the assigner take the next task out of the taskQ, or can + * leave all as it is, and make task-end take the next task. + *Note: this fits the case in the new VMS for no-context tasks, so will use + * the built-in taskQ of new VMS, and should be local and much faster. + * + *The task-stub is saved in the animSlv, so the request handler will get it + * from there, along with the task-type which has arg types, and so on.. + * + * NOTE: if want, don't need to send the animating SlaveVP around.. + * instead, can make a single slave per core, and coreCtrlr looks up the + * slave from having the core number. + * + *But, to stay compatible with all the other VMS languages, leave it in.. + */ +void +VSs__end_task( SlaveVP *animSlv ) + { VSsSemReq reqData; + + reqData.reqType = end_task; + reqData.callingSlv = animSlv; + + VMS_WL__send_sem_request( &reqData, animSlv ); + } + + +void +VSs__taskwait(SlaveVP *animSlv) +{ + VSsSemReq reqData; + + reqData.reqType = taskwait; + reqData.callingSlv = animSlv; + + VMS_WL__send_sem_request( &reqData, animSlv ); +} + + + +//========================== send and receive ============================ +// + +inline int32 * +VSs__give_self_taskID( SlaveVP *animSlv ) + { + return ((VSsSemData*)animSlv->semanticData)->taskStub->taskID; + } + +//================================ send =================================== + +void +VSs__send_of_type_to( void *msg, const int32 type, int32 *receiverID, + SlaveVP *senderSlv ) + { VSsSemReq reqData; + + reqData.reqType = send_type_to; + + reqData.msg = msg; + reqData.msgType = type; + reqData.receiverID = receiverID; + reqData.senderSlv = senderSlv; + + reqData.nextReqInHashEntry = NULL; + + VMS_WL__send_sem_request( &reqData, senderSlv ); + + //When come back from suspend, no longer own data reachable from msg + } + +void +VSs__send_from_to( void *msg, int32 *senderID, int32 *receiverID, SlaveVP *senderSlv ) + { VSsSemReq reqData; + + reqData.reqType = send_from_to; + + reqData.msg = 
msg; + reqData.senderID = senderID; + reqData.receiverID = receiverID; + reqData.senderSlv = senderSlv; + + reqData.nextReqInHashEntry = NULL; + + VMS_WL__send_sem_request( &reqData, senderSlv ); + } + + +//================================ receive ================================ + +/*The "type" version of send and receive creates a many-to-one relationship. + * The sender is anonymous, and many sends can stack up, waiting to be + * received. The same receiver can also have send from-to's + * waiting for it, and those will be kept separate from the "type" + * messages. + */ +void * +VSs__receive_type_to( const int32 type, int32* receiverID, SlaveVP *receiverSlv ) + { DEBUG__printf1(dbgRqstHdlr,"WL: receive type to %d",receiverID[1] ); + VSsSemReq reqData; + + reqData.reqType = receive_type_to; + + reqData.msgType = type; + reqData.receiverID = receiverID; + reqData.receiverSlv = receiverSlv; + + reqData.nextReqInHashEntry = NULL; + + VMS_WL__send_sem_request( &reqData, receiverSlv ); + + return receiverSlv->dataRetFromReq; + } + + + +/*Call this at the point a receiving task wants in-coming data. + * Use this from-to form when know senderID -- it makes a direct channel + * between sender and receiver. 
+ */ +void * +VSs__receive_from_to( int32 *senderID, int32 *receiverID, SlaveVP *receiverSlv ) + { + VSsSemReq reqData; + + reqData.reqType = receive_from_to; + + reqData.senderID = senderID; + reqData.receiverID = receiverID; + reqData.receiverSlv = receiverSlv; + + reqData.nextReqInHashEntry = NULL; + DEBUG__printf2(dbgRqstHdlr,"WL: receive from %d to: %d", reqData.senderID[1], reqData.receiverID[1]); + + VMS_WL__send_sem_request( &reqData, receiverSlv ); + + return receiverSlv->dataRetFromReq; + } + + + + +//========================================================================== +// +/*A function singleton is a function whose body executes exactly once, on a + * single core, no matter how many times the fuction is called and no + * matter how many cores or the timing of cores calling it. + * + *A data singleton is a ticket attached to data. That ticket can be used + * to get the data through the function exactly once, no matter how many + * times the data is given to the function, and no matter the timing of + * trying to get the data through from different cores. + */ + +/*asm function declarations*/ +void asm_save_ret_to_singleton(VSsSingleton *singletonPtrAddr); +void asm_write_ret_from_singleton(VSsSingleton *singletonPtrAddr); + +/*Fn singleton uses ID as index into array of singleton structs held in the + * semantic environment. + */ +void +VSs__start_fn_singleton( int32 singletonID, SlaveVP *animSlv ) + { + VSsSemReq reqData; + + // + reqData.reqType = singleton_fn_start; + reqData.singletonID = singletonID; + + VMS_WL__send_sem_request( &reqData, animSlv ); + if( animSlv->dataRetFromReq ) //will be 0 or addr of label in end singleton + { + VSsSemEnv *semEnv = VMS_int__give_sem_env_for( animSlv ); + asm_write_ret_from_singleton(&(semEnv->fnSingletons[ singletonID])); + } + } + +/*Data singleton hands addr of loc holding a pointer to a singleton struct. + * The start_data_singleton makes the structure and puts its addr into the + * location. 
+ */ +void +VSs__start_data_singleton( VSsSingleton **singletonAddr, SlaveVP *animSlv ) + { + VSsSemReq reqData; + + if( *singletonAddr && (*singletonAddr)->hasFinished ) + goto JmpToEndSingleton; + + reqData.reqType = singleton_data_start; + reqData.singletonPtrAddr = singletonAddr; + + VMS_WL__send_sem_request( &reqData, animSlv ); + if( animSlv->dataRetFromReq ) //either 0 or end singleton's return addr + { //Assembly code changes the return addr on the stack to the one + // saved into the singleton by the end-singleton-fn + //The return addr is at 0x4(%%ebp) + JmpToEndSingleton: + asm_write_ret_from_singleton(*singletonAddr); + } + //now, simply return + //will exit either from the start singleton call or the end-singleton call + } + +/*Uses ID as index into array of flags. If flag already set, resumes from + * end-label. Else, sets flag and resumes normally. + * + *Note, this call cannot be inlined because the instr addr at the label + * inside is shared by all invocations of a given singleton ID. + */ +void +VSs__end_fn_singleton( int32 singletonID, SlaveVP *animSlv ) + { + VSsSemReq reqData; + + //don't need this addr until after at least one singleton has reached + // this function + VSsSemEnv *semEnv = VMS_int__give_sem_env_for( animSlv ); + asm_write_ret_from_singleton(&(semEnv->fnSingletons[ singletonID])); + + reqData.reqType = singleton_fn_end; + reqData.singletonID = singletonID; + + VMS_WL__send_sem_request( &reqData, animSlv ); + +EndSingletonInstrAddr: + return; + } + +void +VSs__end_data_singleton( VSsSingleton **singletonPtrAddr, SlaveVP *animSlv ) + { + VSsSemReq reqData; + + //don't need this addr until after singleton struct has reached + // this function for first time + //do assembly that saves the return addr of this fn call into the + // data singleton -- that data-singleton can only be given to exactly + // one instance in the code of this function. However, can use this + // function in different places for different data-singletons. 
+// (*(singletonAddr))->endInstrAddr = &&EndDataSingletonInstrAddr; + + + asm_save_ret_to_singleton(*singletonPtrAddr); + + reqData.reqType = singleton_data_end; + reqData.singletonPtrAddr = singletonPtrAddr; + + VMS_WL__send_sem_request( &reqData, animSlv ); + } + +/*This executes the function in the masterVP, so it executes in isolation + * from any other copies -- only one copy of the function can ever execute + * at a time. + * + *It suspends to the master, and the request handler takes the function + * pointer out of the request and calls it, then resumes the VP. + *Only very short functions should be called this way -- for longer-running + * isolation, use transaction-start and transaction-end, which run the code + * between as work-code. + */ +void +VSs__animate_short_fn_in_isolation( PtrToAtomicFn ptrToFnToExecInMaster, + void *data, SlaveVP *animSlv ) + { + VSsSemReq reqData; + + // + reqData.reqType = atomic; + reqData.fnToExecInMaster = ptrToFnToExecInMaster; + reqData.dataForFn = data; + + VMS_WL__send_sem_request( &reqData, animSlv ); + } + + +/*This suspends to the master. + *First, it looks at the VP's data, to see the highest transactionID that VP + * already has entered. If the current ID is not larger, it throws an + * exception stating a bug in the code. Otherwise it puts the current ID + * there, and adds the ID to a linked list of IDs entered -- the list is + * used to check that exits are properly ordered. + *Next it is uses transactionID as index into an array of transaction + * structures. + *If the "VP_currently_executing" field is non-null, then put requesting VP + * into queue in the struct. (At some point a holder will request + * end-transaction, which will take this VP from the queue and resume it.) + *If NULL, then write requesting into the field and resume. 
+ */ +void +VSs__start_transaction( int32 transactionID, SlaveVP *animSlv ) + { + VSsSemReq reqData; + + // + reqData.callingSlv = animSlv; + reqData.reqType = trans_start; + reqData.transID = transactionID; + + VMS_WL__send_sem_request( &reqData, animSlv ); + } + +/*This suspends to the master, then uses transactionID as index into an + * array of transaction structures. + *It looks at VP_currently_executing to be sure it's same as requesting VP. + * If different, throws an exception, stating there's a bug in the code. + *Next it looks at the queue in the structure. + *If it's empty, it sets VP_currently_executing field to NULL and resumes. + *If something in, gets it, sets VP_currently_executing to that VP, then + * resumes both. + */ +void +VSs__end_transaction( int32 transactionID, SlaveVP *animSlv ) + { + VSsSemReq reqData; + + // + reqData.callingSlv = animSlv; + reqData.reqType = trans_end; + reqData.transID = transactionID; + + VMS_WL__send_sem_request( &reqData, animSlv ); + } + +//======================== Internal ================================== +/* + */ +SlaveVP * +VSs__create_slave_with( TopLevelFnPtr fnPtr, void *initData, + SlaveVP *creatingSlv ) + { VSsSemReq reqData; + + //the semantic request data is on the stack and disappears when this + // call returns -- it's guaranteed to remain in the VP's stack for as + // long as the VP is suspended. 
+ reqData.reqType = 0; //know type because in a VMS create req + reqData.coreToAssignOnto = -1; //means round-robin assign + reqData.fnPtr = fnPtr; + reqData.initData = initData; + reqData.callingSlv = creatingSlv; + + VMS_WL__send_create_slaveVP_req( &reqData, creatingSlv ); + + return creatingSlv->dataRetFromReq; + } + +SlaveVP * +VSs__create_slave_with_affinity( TopLevelFnPtr fnPtr, void *initData, + SlaveVP *creatingSlv, int32 coreToAssignOnto ) + { VSsSemReq reqData; + + //the semantic request data is on the stack and disappears when this + // call returns -- it's guaranteed to remain in the VP's stack for as + // long as the VP is suspended. + reqData.reqType = create_slave_w_aff; //not used, May 2012 + reqData.coreToAssignOnto = coreToAssignOnto; + reqData.fnPtr = fnPtr; + reqData.initData = initData; + reqData.callingSlv = creatingSlv; + + VMS_WL__send_create_slaveVP_req( &reqData, creatingSlv ); + + return creatingSlv->dataRetFromReq; + } + diff -r 000000000000 -r 9f2a7bd26dd9 DKU.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DKU.h Mon Aug 27 02:14:35 2012 -0700 @@ -0,0 +1,374 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#ifndef _VSs_H +#define _VSs_H + +#include "Queue_impl/PrivateQueue.h" +#include "Hash_impl/PrivateHash.h" +#include "VMS_impl/VMS.h" +#include "Measurement/dependency.h" + + +//=========================================================================== +#define NUM_STRUCS_IN_SEM_ENV 1000 + + //This is hardware dependent -- it's the number of cycles of scheduling + // overhead -- if a work unit is fewer than this, it is better being + // combined sequentially with other work + //This value depends on both VMS overhead and VSs's plugin. 
At some point + // it will be derived by perf-counter measurements during init of VSs +#define MIN_WORK_UNIT_CYCLES 20000 + +//=========================================================================== +/*This header defines everything specific to the VSs semantic plug-in + */ +typedef struct _VSsSemReq VSsSemReq; +typedef void (*VSsTaskFnPtr ) ( void *, SlaveVP *); +typedef void (*PtrToAtomicFn ) ( void * ); //executed atomically in master +//=========================================================================== + +#define NONCTLD 0 +#define IN 1 /*Trick -- READER same as IN*/ +#define OUT 2 /*Trick -- WRITER same as OUT and INOUT*/ +#define INOUT 2 /*Trick -- WRITER same as OUT and INOUT*/ + +#define READER 1 /*Trick -- READER same as IN*/ +#define WRITER 2 /*Trick -- WRITER same as OUT and INOUT*/ + +#define IS_A_THREAD NULL +#define IS_ENDED NULL +#define SEED_SLV NULL + +typedef struct + { + VSsTaskFnPtr fn; + int32 numTotalArgs;//the number of inputs to function + int32 numCtldArgs;//how many of args have dependencies + int32 *argTypes; //says reader, writer, or non-ctld + int32 *argSizes; //for detecting overlap + int32 sizeOfArgs; //for memcpy of args struct + } +VSsTaskType; + + +typedef struct + { + bool32 hasEnabledNonFinishedWriter; + int32 numEnabledNonDoneReaders; + PrivQueueStruc *waitersQ; + } +VSsPointerEntry; + +typedef struct + { + void **args; //ctld args must come first, as ptrs + VSsTaskType *taskType; + int32 *taskID; + int32 numBlockingProp; + SlaveVP *slaveAssignedTo; //only valid before end task (thread) + VSsPointerEntry **ptrEntries; + void* parentTaskStub; + int32 numLiveChildTasks; + int32 numLiveChildThreads; + bool32 isWaitingForChildTasksToEnd; + bool32 isWaitingForChildThreadsToEnd; + bool32 isEnded; + } +VSsTaskStub; + + +typedef struct + { + VSsTaskStub *taskStub; + int32 argNum; + int32 isReader; + } +VSsTaskStubCarrier; + + +/*Semantic layer-specific data sent inside a request from lib called in app + * to request handler 
called in AnimationMaster + */ + +typedef struct + { + SlaveVP *VPCurrentlyExecuting; + PrivQueueStruc *waitingVPQ; + } +VSsTrans; + +/*WARNING: assembly hard-codes position of endInstrAddr as first field + */ +typedef struct + { + void *endInstrAddr; + int32 hasBeenStarted; + int32 hasFinished; + PrivQueueStruc *waitQ; + } +VSsSingleton; + +enum VSsReqType + { + submit_task = 1, + end_task, + create_slave, + create_slave_w_aff, + dissipate_slave, + //=============================== + send_type_to, + receive_type_to, + send_from_to, + receive_from_to, + //=============================== + taskwait, + malloc_req, + free_req, + singleton_fn_start, + singleton_fn_end, + singleton_data_start, + singleton_data_end, + atomic, + trans_start, + trans_end + }; + +struct _VSsSemReq + { enum VSsReqType reqType; + SlaveVP *callingSlv; + VSsTaskType *taskType; + void *args; + VSsTaskStub *taskStub; + + SlaveVP *senderSlv; + SlaveVP *receiverSlv; + int32 *senderID; + int32 *receiverID; + int32 msgType; + void *msg; + VSsSemReq *nextReqInHashEntry; + int32 *taskID; + + TopLevelFnPtr fnPtr; + void *initData; + int32 coreToAssignOnto; + + int32 sizeToMalloc; + void *ptrToFree; + + int32 singletonID; + VSsSingleton **singletonPtrAddr; + + PtrToAtomicFn fnToExecInMaster; + void *dataForFn; + + int32 transID; + } +/* VSsSemReq */; + + +typedef struct + { + PrivQueueStruc *slavesReadyToResumeQ; //Shared (slaves not pinned) + PrivQueueStruc *freeExtraTaskSlvQ; //Shared + PrivQueueStruc *taskReadyQ; //Shared (tasks not pinned) + SlaveVP *slotTaskSlvs[NUM_CORES][NUM_ANIM_SLOTS]; + HashTable *argPtrHashTbl; + HashTable *commHashTbl; + int32 numLiveExtraTaskSlvs; + int32 numLiveThreadSlvs; + int32 nextCoreToGetNewSlv; + int32 primitiveStartTime; + + //fix limit on num with dynArray + VSsSingleton fnSingletons[NUM_STRUCS_IN_SEM_ENV]; + VSsTrans transactionStrucs[NUM_STRUCS_IN_SEM_ENV]; + + bool32 *coreIsDone; + int32 numCoresDone; + + #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC + ListOfArrays* 
unitList; + ListOfArrays* ctlDependenciesList; + ListOfArrays* commDependenciesList; + NtoN** ntonGroups; + PrivDynArrayInfo* ntonGroupsInfo; + ListOfArrays* dynDependenciesList; + Unit last_in_slot[NUM_CORES * NUM_ANIM_SLOTS]; + ListOfArrays* hwArcs; + #endif + + #ifdef HOLISTIC__TURN_ON_PERF_COUNTERS + ListOfArrays* counterList[NUM_CORES]; + #endif + SlaveVP* idleSlv[NUM_CORES][NUM_ANIM_SLOTS]; + int shutdownInitiated; + } +VSsSemEnv; + + +typedef struct _TransListElem TransListElem; +struct _TransListElem + { + int32 transID; + TransListElem *nextTrans; + }; +//TransListElem + +enum VSsSlvType + { ExtraTaskSlv = 1, + SlotTaskSlv, + ThreadSlv + }; + +typedef struct + { + int32 highestTransEntered; + TransListElem *lastTransEntered; + bool32 needsTaskAssigned; + VSsTaskStub *taskStub; + enum VSsSlvType slaveType; + } +VSsSemData; + +//=========================================================================== + +void +VSs__create_seed_slave_and_do_work( TopLevelFnPtr fn, void *initData ); + +int32 +VSs__giveMinWorkUnitCycles( float32 percentOverhead ); + +void +VSs__start_primitive(); + +int32 +VSs__end_primitive_and_give_cycles(); + +int32 +VSs__giveIdealNumWorkUnits(); + +int32 +VSs__give_number_of_cores_to_schedule_onto(); + +//======================= + +void +VSs__init(); + +void +VSs__cleanup_after_shutdown(); + +//======================= + +SlaveVP * +VSs__create_thread( TopLevelFnPtr fnPtr, void *initData, + SlaveVP *creatingThd ); + +void +VSs__end_thread( SlaveVP *thdToEnd ); + +//======================= + +#define VSs__malloc( numBytes, callingSlave ) VMS_App__malloc( numBytes, callingSlave) + +#define VSs__free(ptrToFree, callingSlave ) VMS_App__free( ptrToFree, callingSlave ) + + +//======================= +void +VSs__submit_task( VSsTaskType *taskType, void *args, SlaveVP *animSlv); + +inline int32 * +VSs__create_taskID_of_size( int32 numInts, SlaveVP *animSlv ); + +void +VSs__submit_task_with_ID( VSsTaskType *taskType, void *args, int32 *taskID, + 
SlaveVP *animSlv); + +void +VSs__end_task( SlaveVP *animSlv ); + +//========================= +void +VSs__taskwait(SlaveVP *animSlv); + + +inline int32 * +VSs__give_self_taskID( SlaveVP *animSlv ); + +void +VSs__send_of_type_to( void *msg, const int32 type, int32 *receiverID, + SlaveVP *senderSlv ); + +void +VSs__send_from_to( void *msg, int32 *senderID, int32 *receiverID, SlaveVP *senderSlv ); + +void * +VSs__receive_type_to( const int32 type, int32* receiverID, SlaveVP *receiverSlv ); + +void * +VSs__receive_from_to( int32 *senderID, int32 *receiverID, SlaveVP *receiverSlv ); + +//======================= Concurrency Stuff ====================== +void +VSs__start_fn_singleton( int32 singletonID, SlaveVP *animSlv ); + +void +VSs__end_fn_singleton( int32 singletonID, SlaveVP *animSlv ); + +void +VSs__start_data_singleton( VSsSingleton **singeltonAddr, SlaveVP *animSlv ); + +void +VSs__end_data_singleton( VSsSingleton **singletonAddr, SlaveVP *animSlv ); + +void +VSs__animate_short_fn_in_isolation( PtrToAtomicFn ptrToFnToExecInMaster, + void *data, SlaveVP *animSlv ); + +void +VSs__start_transaction( int32 transactionID, SlaveVP *animSlv ); + +void +VSs__end_transaction( int32 transactionID, SlaveVP *animSlv ); + + +//========================= Internal use only ============================= +void +VSs__Request_Handler( SlaveVP *requestingSlv, void *_semEnv ); + +SlaveVP * +VSs__assign_slaveVP_to_slot( void *_semEnv, AnimSlot *slot ); + +SlaveVP* +VSs__create_slave_helper( TopLevelFnPtr fnPtr, void *initData, + VSsSemEnv *semEnv, int32 coreToAssignOnto ); + +VSsTaskStub * +create_thread_task_stub( void *initData ); + + +SlaveVP * +VSs__create_slave_with( TopLevelFnPtr fnPtr, void *initData, + SlaveVP *creatingSlv ); + +SlaveVP * +VSs__create_slave_with_affinity( TopLevelFnPtr fnPtr, void *initData, + SlaveVP *creatingSlv, int32 coreToAssignOnto); + +void +idle_fn(void* data, SlaveVP *animatingSlv); + +//===================== Measurement of Lang Overheads 
===================== +#include "Measurement/VSs_Measurement.h" + +//=========================================================================== +#endif /* _VSs_H */ + diff -r 000000000000 -r 9f2a7bd26dd9 DKU_PluginFns.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DKU_PluginFns.c Mon Aug 27 02:14:35 2012 -0700 @@ -0,0 +1,505 @@ +/* + * Copyright 2010 OpenSourceCodeStewardshipFoundation + * + * Licensed under BSD + */ + +#include +#include + +#include "Queue_impl/PrivateQueue.h" +#include "VSs.h" +#include "VSs_Request_Handlers.h" + +//=========================== Local Fn Prototypes =========================== +void +resume_slaveVP( SlaveVP *slave, VSsSemEnv *semEnv ); + +inline void +handleSemReq( VMSReqst *req, SlaveVP *requestingSlv, VSsSemEnv *semEnv ); + +inline void +handleDissipate( SlaveVP *requestingSlv, VSsSemEnv *semEnv ); + +inline void +handleCreate( VMSReqst *req, SlaveVP *requestingSlv, VSsSemEnv *semEnv ); + +//============================== Assigner ================================== +// +/*The assigner is complicated by having both tasks and explicitly created + * VPs, and by tasks being able to suspend. + *It can't use an explicit slave to animate a task because of stack + * pollution. So, it has to keep the two kinds separate. + *Simplest way for the assigner logic is with a Q for extra empty task + * slaves, and another Q for slaves of both types that are ready to resume. + * + *Keep a current task slave for each anim slot. The request handler manages + * it by pulling from the extraTaskSlvQ when a task suspends, or else + * creating a new task slave if taskSlvQ empty. + *Assigner only assigns a task to the current task slave for the slot. + *If no more tasks, then takes a ready to resume slave, if also none of them + * then dissipates extra task slaves (one per invocation). + *Shutdown condition is: must have no suspended tasks, and no suspended + * explicit slaves and no more tasks in taskQ. 
Will only have the masters + * plus a current task slave for each slot.. detects this condition. + * + *Having the two types of slave is part of having communications directly + * between tasks, and tasks to explicit slaves, which requires the ability + * to suspend both kinds, but also to keep explicit slave stacks clean from + * the junk tasks are allowed to leave behind. + */ +SlaveVP * +VSs__assign_slaveVP_to_slot( void *_semEnv, AnimSlot *slot ) + { SlaveVP *returnSlv; + VSsSemEnv *semEnv; + VSsSemData *semData; + int32 coreNum, slotNum; + VSsTaskStub *newTaskStub; + SlaveVP *extraSlv; + + coreNum = slot->coreSlotIsOn; + slotNum = slot->slotIdx; + + semEnv = (VSsSemEnv *)_semEnv; + + //Check for suspended slaves that are ready to resume + returnSlv = readPrivQ( semEnv->slavesReadyToResumeQ ); + if( returnSlv != NULL ) //Yes, have a slave, so return it. + { returnSlv->coreAnimatedBy = coreNum; + + //have work, so reset Done flag (when work generated on other core) + if( semEnv->coreIsDone[coreNum] == TRUE ) //reads are higher perf + semEnv->coreIsDone[coreNum] = FALSE; //don't just write always + goto ReturnTheSlv; + } + + //If none, speculate will have a task, so get the slot slave + //TODO: false sharing ? (think not bad cause mostly read..) 
+ returnSlv = semEnv->slotTaskSlvs[coreNum][slotNum]; + + semData = (VSsSemData *)returnSlv->semanticData; + + //There is always a curr task slave, and it always needs a task + // (task slaves that are resuming are in resumeQ) + newTaskStub = readPrivQ( semEnv->taskReadyQ ); + if( newTaskStub != NULL ) + { //point slave to task's function, and mark slave as having task + VMS_int__reset_slaveVP_to_TopLvlFn( returnSlv, + newTaskStub->taskType->fn, newTaskStub->args ); + semData->taskStub = newTaskStub; + newTaskStub->slaveAssignedTo = returnSlv; + semData->needsTaskAssigned = FALSE; + + //have work, so reset Done flag, if was set + if( semEnv->coreIsDone[coreNum] == TRUE ) //reads are higher perf + semEnv->coreIsDone[coreNum] = FALSE; //don't just write always + goto ReturnTheSlv; + } + else + { //no task, so try to clean up unused extra task slaves + extraSlv = readPrivQ( semEnv->freeExtraTaskSlvQ ); + if( extraSlv != NULL ) + { //have two slaves need tasks, so delete one + //This both bounds the num extras, and delivers shutdown cond + handleDissipate( extraSlv, semEnv ); + //then return NULL + returnSlv = NULL; + goto ReturnTheSlv; + } + else + { //candidate for shutdown.. if all extras dissipated, and no tasks + // and no ready to resume slaves, then no way to generate + // more tasks (on this core -- other core might have task still) + if( semEnv->numLiveExtraTaskSlvs == 0 && + semEnv->numLiveThreadSlvs == 0 ) + { //This core sees no way to generate more tasks, so say it + if( semEnv->coreIsDone[coreNum] == FALSE ) + { semEnv->numCoresDone += 1; + semEnv->coreIsDone[coreNum] = TRUE; + #ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE + semEnv->shutdownInitiated = TRUE; + + #else + if( semEnv->numCoresDone == NUM_CORES ) + { //means no cores have work, and none can generate more + semEnv->shutdownInitiated = TRUE; + } + #endif + } + } + //return NULL.. 
no task and none to resume + returnSlv = NULL; + //except if shutdown has been initiated by this or other core + if(semEnv->shutdownInitiated) + { returnSlv = VMS_SS__create_shutdown_slave(); + } + goto ReturnTheSlv; //don't need, but completes pattern + } //if( extraSlv != NULL ) + } //if( newTaskStub == NULL ) + //outcome: 1)slave was just pointed to task, 2)no tasks, so slave NULL + +ReturnTheSlv: //Nina, doing gotos to here should help with holistic.. + + #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC + if( returnSlv == NULL ) + { returnSlv = semEnv->idleSlv[coreNum][slotNum]; + + //things that would normally happen in resume(), but these VPs + // never go there + returnSlv->assignCount++; //Somewhere here! + Unit newu; + newu.vp = returnSlv->slaveID; + newu.task = returnSlv->assignCount; + addToListOfArrays(Unit,newu,semEnv->unitList); + + if (returnSlv->assignCount > 1) + { Dependency newd; + newd.from_vp = returnSlv->slaveID; + newd.from_task = returnSlv->assignCount - 1; + newd.to_vp = returnSlv->slaveID; + newd.to_task = returnSlv->assignCount; + addToListOfArrays(Dependency, newd ,semEnv->ctlDependenciesList); + } + } + #endif + #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC + if( returnSlv != NULL ) + { //assignSlv->numTimesAssigned++; + Unit prev_in_slot = + semEnv->last_in_slot[coreNum * NUM_ANIM_SLOTS + slotNum]; + if(prev_in_slot.vp != 0) + { Dependency newd; + newd.from_vp = prev_in_slot.vp; + newd.from_task = prev_in_slot.task; + newd.to_vp = returnSlv->slaveID; + newd.to_task = returnSlv->assignCount; + addToListOfArrays(Dependency,newd,semEnv->hwArcs); + } + prev_in_slot.vp = returnSlv->slaveID; + prev_in_slot.task = returnSlv->assignCount; + semEnv->last_in_slot[coreNum * NUM_ANIM_SLOTS + slotNum] = + prev_in_slot; + } + #endif + return( returnSlv ); + } + + +//=========================== Request Handler ============================ +// +/* + * (BTW not inline because invoked indirectly via a pointer) + */ +void +VSs__Request_Handler( SlaveVP *requestingSlv, void 
*_semEnv ) + { VSsSemEnv *semEnv; + VMSReqst *req; + + semEnv = (VSsSemEnv *)_semEnv; + + req = VMS_PI__take_next_request_out_of( requestingSlv ); + + while( req != NULL ) + { + switch( req->reqType ) + { case semantic: handleSemReq( req, requestingSlv, semEnv); + break; + case createReq: handleCreate( req, requestingSlv, semEnv); + break; + case dissipate: handleDissipate( requestingSlv, semEnv); + break; + case VMSSemantic: VMS_PI__handle_VMSSemReq(req, requestingSlv, semEnv, + (ResumeSlvFnPtr) &resume_slaveVP); + break; + default: + break; + } + + req = VMS_PI__take_next_request_out_of( requestingSlv ); + } //while( req != NULL ) + + } + + +inline void +handleSemReq( VMSReqst *req, SlaveVP *reqSlv, VSsSemEnv *semEnv ) + { VSsSemReq *semReq; + + semReq = VMS_PI__take_sem_reqst_from(req); + if( semReq == NULL ) return; + switch( semReq->reqType ) //sem handlers are all in other file + { + case submit_task: handleSubmitTask( semReq, semEnv); + break; + case end_task: handleEndTask( semReq, semEnv); + break; + case send_type_to: handleSendTypeTo( semReq, semEnv); + break; + case send_from_to: handleSendFromTo( semReq, semEnv); + break; + case receive_type_to: handleReceiveTypeTo(semReq, semEnv); + break; + case receive_from_to: handleReceiveFromTo(semReq, semEnv); + break; + case taskwait: handleTaskwait( semReq, reqSlv, semEnv); + break; + + //==================================================================== + case malloc_req: handleMalloc( semReq, reqSlv, semEnv); + break; + case free_req: handleFree( semReq, reqSlv, semEnv); + break; + case singleton_fn_start: handleStartFnSingleton(semReq, reqSlv, semEnv); + break; + case singleton_fn_end: handleEndFnSingleton( semReq, reqSlv, semEnv); + break; + case singleton_data_start:handleStartDataSingleton(semReq,reqSlv,semEnv); + break; + case singleton_data_end: handleEndDataSingleton(semReq, reqSlv, semEnv); + break; + case atomic: handleAtomic( semReq, reqSlv, semEnv); + break; + case trans_start: handleTransStart( 
semReq, reqSlv, semEnv); + break; + case trans_end: handleTransEnd( semReq, reqSlv, semEnv); + break; + } + } + + + +//=========================== VMS Request Handlers ============================== +/*SlaveVP dissipate -- this is NOT task-end!, only call this to get rid of + * extra task slaves, and to end explicitly created threads + */ +inline void +handleDissipate( SlaveVP *requestingSlv, VSsSemEnv *semEnv ) + { VSsSemData *semData; + VSsTaskStub *parentTaskStub, *ownTaskStub; + + DEBUG__printf1(dbgRqstHdlr,"Dissipate request from processor %d", + requestingSlv->slaveID) + semData = (VSsSemData *)requestingSlv->semanticData; + + if( semData->slaveType == ExtraTaskSlv ) + { semEnv->numLiveExtraTaskSlvs -= 1; //for detecting shutdown condition + //Has no task assigned, so no parents and no children, so free self + goto FreeSlaveStateAndReturn; + } + + if( semData->slaveType == SlotTaskSlv ) + { //should never call dissipate on a slot assigned slave + VMS_PI__throw_exception( "dissipate a slot-assigned slave", requestingSlv, NULL ); + } + + //if make it to here, then is a thread slave ending + semEnv->numLiveThreadSlvs -= 1; //for detecting shutdown condition + + ownTaskStub = semData->taskStub; + parentTaskStub = ownTaskStub->parentTaskStub; + parentTaskStub->numLiveChildThreads -= 1; //not freed, even if ended + + //if all children ended, then free this task's stub + // else, keep stub around, and last child will free it (below) + if( ownTaskStub->numLiveChildTasks == 0 && + ownTaskStub->numLiveChildThreads == 0 ) + free_task_stub( ownTaskStub ); + else + ownTaskStub->isEnded = TRUE; //for children to see when they end + + //Now, check on parents waiting on child threads to end + if( parentTaskStub->isWaitingForChildThreadsToEnd && + parentTaskStub->numLiveChildThreads == 0 ) + { parentTaskStub->isWaitingForChildThreadsToEnd = FALSE; + if( parentTaskStub->isWaitingForChildTasksToEnd ) + return; //still waiting on tasks (should be impossible) + else //parent free 
to resume + resume_slaveVP( parentTaskStub->slaveAssignedTo, semEnv ); + } + + //check if this is last child of ended parent (note, not possible to + // have more than one level of ancestor waiting to be freed) + if( parentTaskStub->isEnded ) + { if( parentTaskStub->numLiveChildTasks == 0 && + parentTaskStub->numLiveChildThreads == 0 ) + free_task_stub( parentTaskStub ); //just stub, semData already freed + } + + //Free the semData and requesting slave's base state for all cases + FreeSlaveStateAndReturn: + VMS_PI__free( semData ); + VMS_PI__dissipate_slaveVP( requestingSlv ); + return; + //Note, this is not a location to check for shutdown because doesn't + // say anything about work availability here.. check for shutdown in + // places try to get work for the core (in the assigner) + } + + + +/*Re-use this in the entry-point fn + */ +inline SlaveVP * +VSs__create_slave_helper( TopLevelFnPtr fnPtr, void *initData, + VSsSemEnv *semEnv, int32 coreToAssignOnto ) + { SlaveVP *newSlv; + VSsSemData *semData; + + //This is running in master, so use internal version + newSlv = VMS_PI__create_slaveVP( fnPtr, initData ); + + //task slaves differ from thread slaves by the settings in the taskStub + //so, don't create task stub here, only create semData, which is same + // for all kinds of slaves + semData = VMS_PI__malloc( sizeof(VSsSemData) ); + semData->highestTransEntered = -1; + semData->lastTransEntered = NULL; + semData->needsTaskAssigned = TRUE; + semData->taskStub = NULL; + + newSlv->semanticData = semData; + + //=================== Assign new processor to a core ===================== + #ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE + newSlv->coreAnimatedBy = 0; + + #else + //Assigning slaves to cores is part of SSR code.. 
+ if(coreToAssignOnto < 0 || coreToAssignOnto >= NUM_CORES ) + { //out-of-range, so round-robin assignment + newSlv->coreAnimatedBy = semEnv->nextCoreToGetNewSlv; + + if( semEnv->nextCoreToGetNewSlv >= NUM_CORES - 1 ) + semEnv->nextCoreToGetNewSlv = 0; + else + semEnv->nextCoreToGetNewSlv += 1; + } + else //core num in-range, so use it + { newSlv->coreAnimatedBy = coreToAssignOnto; + } + #endif + //======================================================================== + + return newSlv; + } + +VSsTaskStub * +create_thread_task_stub( void *initData ) + { VSsTaskStub *newStub; + + newStub = VMS_PI__malloc( sizeof(VSsTaskStub) ); + newStub->numBlockingProp = 0; + newStub->slaveAssignedTo = NULL; //set later + newStub->taskType = IS_A_THREAD; + newStub->ptrEntries = NULL; + newStub->args = initData; + newStub->numLiveChildTasks = 0; + newStub->numLiveChildThreads = 0; + newStub->parentTaskStub = NULL; + newStub->isWaitingForChildTasksToEnd = FALSE; + newStub->isWaitingForChildThreadsToEnd = FALSE; + newStub->taskID = NULL; + + return newStub; + } + +/*Application invokes this when it explicitly creates a thread via the + * "VSs__create_thread()" command. + * + *The request handlers create new task slaves directly, not via this hdlr. + * + *Make everything in VSs be a task. An explicitly created VP is just a + * suspendable task, and the seedVP is also a suspendable task. + *So, here, create a task Stub. + * Then, see if there are any extra slaveVPs hanging around, and if not, + * call the helper to make a new one. + * Then, put the task stub into the slave's semantic Data. + *When the slave calls dissipate, have to recycle the task stub. 
+ */ +inline void +handleCreate( VMSReqst *req, SlaveVP *requestingSlv, VSsSemEnv *semEnv ) + { VSsSemReq *semReq; + SlaveVP *newSlv; + VSsSemData *semData, *parentSemData; + + semReq = VMS_PI__take_sem_reqst_from( req ); + + semEnv->numLiveThreadSlvs += 1; + + //Deceptive -- must work when creator is a normal task, or seed, + // or another thd.. think have valid sem data and task stub for all + //This hdlr is NOT called when creating the seed slave + parentSemData = (VSsSemData *)semReq->callingSlv->semanticData; + parentSemData->taskStub->numLiveChildThreads += 1; + + //use an idle "extra" slave, if have one + newSlv = readPrivQ( semEnv->freeExtraTaskSlvQ ); + if( newSlv != NULL ) //got an idle one, so reset it + { semData = (VSsSemData *)newSlv->semanticData; + semData->highestTransEntered = -1; + semData->lastTransEntered = NULL; + VMS_int__reset_slaveVP_to_TopLvlFn( newSlv, semReq->fnPtr, + semReq->initData ); + } + else //no idle ones, create a new + { newSlv = VSs__create_slave_helper( semReq->fnPtr, semReq->initData, + semEnv, semReq->coreToAssignOnto ); + semData = (VSsSemData *)newSlv->semanticData; + } + + //now, create a new task and assign to the thread + semData->needsTaskAssigned = FALSE; //thread has a permanent task + semData->taskStub = create_thread_task_stub( semReq->initData ); + semData->taskStub->parentTaskStub = parentSemData->taskStub; + semData->slaveType = ThreadSlv; //this hdlr only creates thread slvs + + DEBUG__printf2(dbgRqstHdlr,"Create from: %d, new VP: %d", + requestingSlv->slaveID, newSlv->slaveID) + + #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC + Dependency newd; + newd.from_vp = requestingSlv->slaveID; + newd.from_task = requestingSlv->assignCount; + newd.to_vp = newSlv->slaveID; + newd.to_task = 1; + addToListOfArrays(Dependency,newd,semEnv->commDependenciesList); + #endif + + //For VSs, caller needs ptr to created thread returned to it + requestingSlv->dataRetFromReq = newSlv; + resume_slaveVP(requestingSlv , semEnv ); + 
resume_slaveVP( newSlv, semEnv ); + } + + +//=========================== Helper ============================== +void +resume_slaveVP( SlaveVP *slave, VSsSemEnv *semEnv ) + { + //both suspended tasks and suspended explicit slaves resumed with this + writePrivQ( slave, semEnv->slavesReadyToResumeQ ); + + #ifdef HOLISTIC__TURN_ON_PERF_COUNTERS +/* + int lastRecordIdx = slave->counter_history_array_info->numInArray -1; + CounterRecord* lastRecord = slave->counter_history[lastRecordIdx]; + saveLowTimeStampCountInto(lastRecord->unblocked_timestamp); +*/ + #endif + #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC + slave->assignCount++; //Somewhere here! + Unit newu; + newu.vp = slave->slaveID; + newu.task = slave->assignCount; + addToListOfArrays(Unit,newu,semEnv->unitList); + + if (slave->assignCount > 1){ + Dependency newd; + newd.from_vp = slave->slaveID; + newd.from_task = slave->assignCount - 1; + newd.to_vp = slave->slaveID; + newd.to_task = slave->assignCount; + addToListOfArrays(Dependency, newd ,semEnv->ctlDependenciesList); + } + #endif + } diff -r 000000000000 -r 9f2a7bd26dd9 DKU_Request_Handlers.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DKU_Request_Handlers.c Mon Aug 27 02:14:35 2012 -0700 @@ -0,0 +1,1213 @@ +/* + * Copyright 2010 OpenSourceCodeStewardshipFoundation + * + * Licensed under BSD + */ + +#include +#include + +#include "VMS_impl/VMS.h" +#include "Queue_impl/PrivateQueue.h" +#include "Hash_impl/PrivateHash.h" +#include "VSs.h" +#include "VSs_Request_Handlers.h" + + + + +//=========================== Local Fn Prototypes =========================== +void +resume_slaveVP( SlaveVP *slave, VSsSemEnv *semEnv ); + + + +//========================================================================== +// Helpers +// + +/*Only clone the elements of req used in these reqst handlers + */ +VSsSemReq * +cloneReq( VSsSemReq *semReq ) + { VSsSemReq *clonedReq; + + clonedReq = VMS_PI__malloc( sizeof(VSsSemReq) ); + clonedReq->reqType = semReq->reqType; + 
clonedReq->senderSlv = semReq->senderSlv; + clonedReq->receiverSlv= semReq->receiverSlv; + clonedReq->msg = semReq->msg; + clonedReq->nextReqInHashEntry = NULL; + + return clonedReq; + } + + + +HashEntry * +giveEntryElseInsertReqst32( int32 *key, VSsSemReq *semReq, + HashTable *commHashTbl ) + { HashEntry *entry; + VSsSemReq *waitingReq; + + entry = getEntryFromTable32( key, commHashTbl ); + if( entry == NULL ) + { //no waiting sends or receives, so add this request and exit + // note: have to clone the request because it's on stack of sender + addValueIntoTable32( key, cloneReq( semReq ), commHashTbl ); + return NULL; + } + waitingReq = (VSsSemReq *)entry->content; + if( waitingReq == NULL ) //might happen when last waiting gets paired + { //no waiting sends or receives, so add this request and exit + entry->content = semReq; + return NULL; + } + return entry; + } + + +inline VSsPointerEntry * +create_pointer_entry( ) + { VSsPointerEntry *newEntry; + + newEntry = VMS_PI__malloc( sizeof(VSsPointerEntry) ); + newEntry->hasEnabledNonFinishedWriter = FALSE; + newEntry->numEnabledNonDoneReaders = 0; + newEntry->waitersQ = makePrivQ(); + + return newEntry; + } + +/*malloc's space and initializes fields -- and COPIES the arg values + * to new space + */ +inline VSsTaskStub * +create_task_stub( VSsTaskType *taskType, void **args ) + { void **newArgs; + VSsTaskStub* newStub = VMS_int__malloc( sizeof(VSsTaskStub) + taskType->sizeOfArgs ); + newStub->numBlockingProp = taskType->numCtldArgs; + newStub->slaveAssignedTo = NULL; + newStub->taskType = taskType; + newStub->ptrEntries = + VMS_int__malloc( taskType->numCtldArgs * sizeof(VSsPointerEntry *) ); + newArgs = (void **)( (uint8 *)newStub + sizeof(VSsTaskStub) ); + newStub->args = newArgs; + newStub->numLiveChildTasks = 0; + newStub->numLiveChildThreads = 0; + newStub->isEnded = FALSE; + + //Copy the arg-pointers.. can be more arguments than just the ones + // that StarSs uses to control ordering of task execution. 
+ memcpy( newArgs, args, taskType->sizeOfArgs ); + + return newStub; + } + +inline VSsTaskStubCarrier * +create_task_carrier( VSsTaskStub *taskStub, int32 argNum, int32 rdOrWrite ) + { VSsTaskStubCarrier *newCarrier; + + newCarrier = VMS_PI__malloc( sizeof(VSsTaskStubCarrier) ); + newCarrier->taskStub = taskStub; + newCarrier->argNum = argNum; + newCarrier->isReader = rdOrWrite == READER; + } + +//========================================================================== +// +// +/*Submit Task + * + *Uses a hash table to match the arg-pointers to each other. So, an + * argument-pointer is one-to-one with a hash-table entry. + * + *If overlapping region detection is enabled, then a hash entry is one + * link in a ring of all entries that overlap each other. For example, + * say region A shared common addresses with region B, but the pointers + * to them are different, then the hash entries for the two would be + * linked in a ring. When a pointer is processed, all the pointers in + * the ring are processed (Doesn't differentiate independent siblings + * from parent-child or conjoined twins overlap..) + * NOT ENABLED AS OF MAY 25 2012 + * + *A hash entry has a queue of tasks that are waiting to access the + * pointed-to region. The queue goes in the order of creation of + * the tasks. Each entry in the queue has a pointer to the task-stub + * and whether the task reads-only vs writes to the hash-entry's region. + * + *A hash entry also has a count of the enabled but not yet finished readers + * of the region. It also has a flag that says whether a writer has been + * enabled and is not yet finished. + * + *There are two kinds of events that access a hash entry: creation of a + * task and end of a task. + * + * + * ========================== creation ========================== + * + *At creation, make a task-stub. 
Set the count of blocking propendents + * to the number of controlled arguments (a task can have + * arguments that are not controlled by the language, like simple integer + * inputs from the sequential portion. Note that all controlled arguments + * are pointers, and marked as controlled in the application code). + * + *The controlled arguments are then processed one by one. + *Processing an argument means getting the hash of the pointer. Then, + * looking up the hash entry. (If none, create one). + *With the hash entry: + * + *If the arg is a reader, and the entry does not have an enabled + * non-finished writer, and the queue is empty (could be prev readers, + * then a writer that got queued and now new readers that have to also be + * queued). + *The reader is free. So, decrement the blocking-propendent count in + * the task-stub. If the count is zero, then put the task-stub into the + * readyQ. + *At the same time, increment the hash-entry's count of enabled and + * non-finished readers. + * + *Otherwise, the reader is put into the hash-entry's Q of waiters + * + *If the arg is a writer, plus the entry does not have a current writer, + * plus the number of enabled non-finished readers is zero, plus the Q is + * empty, then the writer is free. Mark the entry has having an + * enabled and non-finished writer. Decrement the blocking-propendent + * count in the writer's task-stub. If the count is zero, then put the + * task-stub into the readyQ. + * + *Otherwise, put the writer into the entry's Q of waiters. + * + *No matter what, if the hash entry was chained, put it at the start of + * the chain. (Means no-longer-used pointers accumulate at end of chain, + * decide garbage collection of no-longer-used pointers later) + * + * + * ========================== end of task =========================== + * + *At the end of a task, + *The task's controlled arguments are processed one by one. + *Processing an argument means getting the hash of the pointer. 
Then, + * looking up the hash entry (and putting the entry at the start of the + * chain, if there was a chain). + *With the hash entry: + * + *If the arg is a reader, then decrement the enabled and non-finished + * reader-count in the hash-entry. If the count becomes zero, then take + * the next entry from the Q. It should be a writer, or else there's a + * bug in this algorithm. + *Set the hash-entry to have an enabled non-finished writer. Decrement + * the blocking-propendent-count of the writer's task-stub. If the count + * has reached zero, then put the task-stub into the readyQ. + * + *If the arg is a writer, then clear the enabled non-finished writer flag + * of the hash-entry. Take the next entry from the Q. + *If it is a writer, then turn the flag back on. Decrement the writer's + * blocking-propendent-count in its task-stub. If it becomes zero, then + * put the task-stub into the readyQ. + * + *If it is a reader, then increment the hash-entry's count of enabled + * non-finished readers. Decrement the blocking propendents count of the + * reader's task-stub. If it reaches zero, then put the task-stub into the + * readyQ. + *Then repeat until encounter a writer -- put that writer back into the Q. + * + *That should be it -- that should work. + */ +inline void +handleSubmitTask( VSsSemReq *semReq, VSsSemEnv *semEnv ) + { uint32 key[3]; + HashEntry *rawHashEntry; //has char *, but use with uint32 * + VSsPointerEntry *ptrEntry; //contents of hash table entry for an arg pointer + void **args; + VSsTaskStub *taskStub; + VSsTaskType *taskType; + VSsTaskStubCarrier *taskCarrier; + + HashTable * + argPtrHashTbl = semEnv->argPtrHashTbl; + + //suspending a task always makes the slave into an extra slot slave, + // because it ends up in the resumeQ, even when resumes immediately. 
+ //Eventually task_end will put the slave into the freeExtraTaskSlvQ + replaceWithNewSlotSlvIfNeeded( semReq->callingSlv, semEnv ); + + /* ========================== creation ========================== + * + *At creation, make a task-stub. Set the count of blocking propendents + * to the number of controlled arguments (a task can have + * arguments that are not controlled by the language, like simple integer + * inputs from the sequential portion. Note that all controlled arguments + * are pointers, and marked as controlled in the application code). + */ + args = semReq->args; + taskType = semReq->taskType; + taskStub = create_task_stub( taskType, args );//copies arg ptrs + taskStub->numBlockingProp = taskType->numCtldArgs; + taskStub->taskID = semReq->taskID; //may be NULL + + VSsSemData* + parentSemData = (VSsSemData*) semReq->callingSlv->semanticData; + taskStub->parentTaskStub = (void*) parentSemData->taskStub; + parentSemData->taskStub->numLiveChildTasks += 1; + + //DEBUG__printf3(dbgRqstHdlr,"Submit req from slaveID: %d, from task: %d, for task: %d", semReq->callingSlv->slaveID, parentSemData->taskStub->taskID[1], taskStub->taskID[1]) + DEBUG__printf2(dbgRqstHdlr,"Submit req from slaveID: %d, for task: %d", semReq->callingSlv->slaveID, taskStub->taskID[1]) + + /*The controlled arguments are then processed one by one. + *Processing an argument means getting the hash of the pointer. Then, + * looking up the hash entry. (If none, create one). + */ + int32 argNum; + for( argNum = 0; argNum < taskType->numCtldArgs; argNum++ ) + { + key[0] = 2; //two 32b values in key + *( (uint64*)&key[1]) = (uint64)args[argNum]; //write 64b into two 32b + + /*If the hash entry was chained, put it at the + * start of the chain. 
(Means no-longer-used pointers accumulate + * at end of chain, decide garbage collection later) */ + rawHashEntry = getEntryFromTable32( key, argPtrHashTbl ); + if( rawHashEntry == NULL ) + { //adding a value auto-creates the hash-entry + ptrEntry = create_pointer_entry(); + rawHashEntry = addValueIntoTable32( key, ptrEntry, argPtrHashTbl ); + } + else + { ptrEntry = (VSsPointerEntry *)rawHashEntry->content; + if( ptrEntry == NULL ) + { ptrEntry = create_pointer_entry(); + rawHashEntry = addValueIntoTable32(key, ptrEntry, argPtrHashTbl); + } + } + taskStub->ptrEntries[argNum] = ptrEntry; + + /*Have the hash entry. + *If the arg is a reader and the entry does not have an enabled + * non-finished writer, and the queue is empty. */ + if( taskType->argTypes[argNum] == READER ) + { if( !ptrEntry->hasEnabledNonFinishedWriter && + isEmptyPrivQ( ptrEntry->waitersQ ) ) + { /*The reader is free. So, decrement the blocking-propendent + * count in the task-stub. If the count is zero, then put the + * task-stub into the readyQ. At the same time, increment + * the hash-entry's count of enabled and non-finished readers.*/ + taskStub->numBlockingProp -= 1; + if( taskStub->numBlockingProp == 0 ) + { writePrivQ( taskStub, semEnv->taskReadyQ ); + } + ptrEntry->numEnabledNonDoneReaders += 1; + } + else + { /*Otherwise, the reader is put into the hash-entry's Q of + * waiters*/ + taskCarrier = create_task_carrier( taskStub, argNum, READER ); + writePrivQ( taskCarrier, ptrEntry->waitersQ ); + } + } + else //arg is a writer + { /*the arg is a writer, plus the entry does not have a current + * writer, plus the number of enabled non-finished readers is + * zero, (the Q must be empty, else bug!) then the writer is free*/ + if( !ptrEntry->hasEnabledNonFinishedWriter && + ptrEntry->numEnabledNonDoneReaders == 0 ) + { /*Mark the entry has having a enabled and non-finished writer. + * Decrement the blocking-propenden count in the writer's + * task-stub. 
If the count is zero, then put the task-stub + * into the readyQ.*/ + taskStub->numBlockingProp -= 1; + if( taskStub->numBlockingProp == 0 ) + { writePrivQ( taskStub, semEnv->taskReadyQ ); + } + ptrEntry->hasEnabledNonFinishedWriter = TRUE; + } + else + {/*Otherwise, put the writer into the entry's Q of waiters.*/ + taskCarrier = create_task_carrier( taskStub, argNum, WRITER ); + writePrivQ( taskCarrier, ptrEntry->waitersQ ); + } + } + } //for argNum + + + resume_slaveVP( semReq->callingSlv, semEnv ); + + return; + } + + +/* ========================== end of task =========================== + * + *At the end of a task, + *The task's controlled arguments are processed one by one. + *Processing an argument means getting the hash of the pointer. Then, + * looking up the hash entry (and putting the entry at the start of the + * chain, if there was a chain). + *With the hash entry: + * + *If the arg is a reader, then decrement the enabled and non-finished + * reader-count in the hash-entry. If the count becomes zero, then take + * the next entry from the Q. It should be a writer, or else there's a + * bug in this algorithm. + *Set the hash-entry to have an enabled non-finished writer. Decrement + * the blocking-propendent-count of the writer's task-stub. If the count + * has reached zero, then put the task-stub into the readyQ. + * + *If the arg is a writer, then clear the enabled non-finished writer flag + * of the hash-entry. Take the next entry from the waiters Q. + *If it is a writer, then turn the flag back on. Decrement the writer's + * blocking-propendent-count in its task-stub. If it becomes zero, then + * put the task-stub into the readyQ. + * + *If waiter is a reader, then do a loop, getting all waiting readers. + * For each, increment the hash-entry's count of enabled + * non-finished readers. Decrement the blocking propendents count of the + * reader's task-stub. If it reaches zero, then put the task-stub into the + * readyQ. 
+ *Repeat until encounter a writer -- put that writer back into the Q. + * + *May 2012 -- not keeping track of how many references to a given ptrEntry + * exist, so no way to garbage collect.. + *TODO: Might be safe to delete an entry when task ends and waiterQ empty + * and no readers and no writers.. + */ +inline void +handleEndTask( VSsSemReq *semReq, VSsSemEnv *semEnv ) + { VSsPointerEntry *ptrEntry; //contents of hash table entry for an arg pointer + void **args; + VSsSemData *endingSlvSemData; + VSsTaskStub *endingTaskStub, *waitingTaskStub, *parent; + VSsTaskType *endingTaskType; + VSsTaskStubCarrier *waitingTaskCarrier; + VSsPointerEntry **ptrEntries; + + + endingSlvSemData = (VSsSemData *)semReq->callingSlv->semanticData; + endingTaskStub = endingSlvSemData->taskStub; + args = endingTaskStub->args; + endingTaskType = endingTaskStub->taskType; + ptrEntries = endingTaskStub->ptrEntries; //saved in stub when create + + DEBUG__printf2(dbgRqstHdlr,"EndTask req from slaveID: %d, task: %d",semReq->callingSlv->slaveID, endingTaskStub->taskID[1]) + + //Check if parent was waiting on this task + parent = (VSsTaskStub *) endingTaskStub->parentTaskStub; + parent->numLiveChildTasks -= 1; + if( parent->isWaitingForChildTasksToEnd && parent->numLiveChildTasks == 0) + { + parent->isWaitingForChildTasksToEnd = FALSE; + resume_slaveVP( parent->slaveAssignedTo, semEnv ); + } + + //Check if parent ended, and this was last descendent, then free it + if( parent->isEnded && parent->numLiveChildTasks == 0 ) + { VMS_PI__free( parent ); + } + + + //Now, update state of dependents and start ready tasks + /*The task's controlled arguments are processed one by one. + *Processing an argument means getting arg-pointer's entry. 
+ */ + int32 argNum; + for( argNum = 0; argNum < endingTaskType->numCtldArgs; argNum++ ) + { + /* commented out 'cause remembering entry ptr when create stub + key[0] = 2; //says are 2 32b values in key + *( (uint64*)&key[1] ) = args[argNum]; //write 64b ptr into two 32b + + /*If the hash entry was chained, put it at the + * start of the chain. (Means no-longer-used pointers accumulate + * at end of chain, decide garbage collection later) + */ + /*NOTE: don't do hash lookups here, instead, have a pointer to the + * hash entry inside task-stub, put there during task creation. + rawHashEntry = getEntryFromTable32( key, ptrHashTbl ); + ptrEntry = (VSsPointerEntry *)rawHashEntry->content; + if( ptrEntry == NULL ) + VMS_App__throw_exception("hash entry NULL", NULL, NULL); + */ + + ptrEntry = ptrEntries[argNum]; + /*check if the ending task was reader of this arg*/ + if( endingTaskType->argTypes[argNum] == READER ) + { /*then decrement the enabled and non-finished reader-count in + * the hash-entry. */ + ptrEntry->numEnabledNonDoneReaders -= 1; + + /*If the count becomes zero, then take the next entry from the Q. + *It should be a writer, or else there's a bug in this algorithm.*/ + if( ptrEntry->numEnabledNonDoneReaders == 0 ) + { waitingTaskCarrier = readPrivQ( ptrEntry->waitersQ ); + if( waitingTaskCarrier == NULL ) + { //TODO: looks safe to delete the ptr entry at this point + continue; //next iter of loop + } + if( waitingTaskCarrier->isReader ) + VMS_App__throw_exception("READER waiting", NULL, NULL); + + waitingTaskStub = waitingTaskCarrier->taskStub; + + /*Set the hash-entry to have an enabled non-finished writer.*/ + ptrEntry->hasEnabledNonFinishedWriter = TRUE; + + /* Decrement the blocking-propendent-count of the writer's + * task-stub. 
If the count has reached zero, then put the + * task-stub into the readyQ.*/ + waitingTaskStub->numBlockingProp -= 1; + if( waitingTaskStub->numBlockingProp == 0 ) + { writePrivQ( waitingTaskStub, semEnv->taskReadyQ ); + } + } + } + else /*the ending task is a writer of this arg*/ + { /*clear the enabled non-finished writer flag of the hash-entry.*/ + ptrEntry->hasEnabledNonFinishedWriter = FALSE; + + /*Take the next waiter from the hash-entry's Q.*/ + waitingTaskCarrier = readPrivQ( ptrEntry->waitersQ ); + if( waitingTaskCarrier == NULL ) + { //TODO: looks safe to delete ptr entry at this point + continue; //go to next iter of loop, done here. + } + waitingTaskStub = waitingTaskCarrier->taskStub; + + /*If task is a writer of this hash-entry's pointer*/ + if( !waitingTaskCarrier->isReader ) + { /* then turn the flag back on.*/ + ptrEntry->hasEnabledNonFinishedWriter = TRUE; + /*Decrement the writer's blocking-propendent-count in task-stub + * If it becomes zero, then put the task-stub into the readyQ.*/ + waitingTaskStub->numBlockingProp -= 1; + if( waitingTaskStub->numBlockingProp == 0 ) + { writePrivQ( waitingTaskStub, semEnv->taskReadyQ ); + } + } + else + { /*Waiting task is a reader, so do a loop, of all waiting readers + * until encounter a writer or waitersQ is empty*/ + while( TRUE ) /*The checks guarantee have a waiting reader*/ + { /*Increment the hash-entry's count of enabled non-finished + * readers.*/ + ptrEntry->numEnabledNonDoneReaders += 1; + + /*Decrement the blocking propendents count of the reader's + * task-stub. 
If it reaches zero, then put the task-stub + * into the readyQ.*/ + waitingTaskStub->numBlockingProp -= 1; + if( waitingTaskStub->numBlockingProp == 0 ) + { writePrivQ( waitingTaskStub, semEnv->taskReadyQ ); + } + /*Get next waiting task*/ + waitingTaskCarrier = peekPrivQ( ptrEntry->waitersQ ); + if( waitingTaskCarrier == NULL ) break; + if( !waitingTaskCarrier->isReader ) break; + waitingTaskCarrier = readPrivQ( ptrEntry->waitersQ ); + waitingTaskStub = waitingTaskCarrier->taskStub; + }//while waiter is a reader + }//if-else, first waiting task is a reader + }//if-else, check of ending task, whether writer or reader + }//for argnum in ending task + + + //done ending the task, now free the stub + args copy + // if still has live children, then keep stub around + if( endingTaskStub->numLiveChildTasks == 0 && + endingTaskStub->numLiveChildThreads == 0 ) + { free_task_stub( endingTaskStub ); + } + + + endingSlvSemData->needsTaskAssigned = TRUE; + + //Check if the slave is an extra task slave, and put into free Q + if( endingSlvSemData->slaveType == ExtraTaskSlv ) + { writePrivQ( semReq->callingSlv, semEnv->freeExtraTaskSlvQ ); + } + + //otherwise, it's a slot slave, so it will get used from matrix + // so, do nothing with it, just return + return; + } + +inline void +free_task_stub( VSsTaskStub *stubToFree ) + { if(stubToFree->ptrEntries != NULL ) //a thread stub has NULL entry + { VMS_PI__free( stubToFree->ptrEntries ); + } + VMS_PI__free( stubToFree ); + } + +//========================== Task Comm handlers =========================== + + + +//============================ Send Handlers ============================== +/*Send of Type -- The semantic request has the receiving task ID and Type + * + *Messages of a given Type have to be kept separate.. 
so need a separate + * entry in the hash table for each pair: receiverID, Type + * + *Also, if same sender sends multiple before any get received, then need to + * stack the sends up -- even if a send waits until it's paired, several + * separate tasks can send to the same receiver, and doing hash on the + * receive task, so they will stack up. + */ +inline void +handleSendTypeTo( VSsSemReq *semReq, VSsSemEnv *semEnv ) + { SlaveVP *senderSlv, *receiverSlv; + int32 *senderID, *receiverID; + int32 *key, keySz, receiverIDNumInt; + VSsSemReq *waitingReq; + HashEntry *entry; + HashTable *commHashTbl = semEnv->commHashTbl; + + receiverID = semReq->receiverID; //For "send", know both send & recv procrs + senderSlv = semReq->senderSlv; + + DEBUG__printf2(dbgRqstHdlr,"SendType req from sender slaveID: %d, recTask: %d", senderSlv->slaveID, receiverID[1]) + + //suspending a task always makes the slave into an extra slot slave, + // because it ends up in the resumeQ, even when resumes immediately. + //Eventually task_end will put the slave into the freeExtraTaskSlvQ + replaceWithNewSlotSlvIfNeeded( senderSlv, semEnv ); + + receiverIDNumInt = receiverID[0] + 1; //pos 0 doesn't include itself + keySz = receiverIDNumInt * sizeof(int32) + 2 * sizeof(int32); + key = VMS_PI__malloc( keySz ); + key[0] = receiverIDNumInt + 1; //loc 0 is num int32 in key + memcpy( &key[1], receiverID, receiverIDNumInt * sizeof(int32) ); + key[ 1 + receiverIDNumInt ] = semReq->msgType; + + entry = giveEntryElseInsertReqst32( key, semReq, commHashTbl ); + if( entry == NULL ) //was just inserted, means task has to wait + { return; + } + + //if here, found a waiting request with same key + waitingReq = (VSsSemReq *)entry->content; + + //At this point, know have waiting request(s) -- either sends or recv + //Note, can only have max of one receive waiting, and cannot have both + // sends and receives waiting (they would have paired off) + // but can have multiple sends from diff sending VPs, all same 
msg-type + if( waitingReq->reqType == send_type_to ) + { //waiting request is another send, so stack this up on list + // but first clone the sending request so it persists. + VSsSemReq *clonedReq = cloneReq( semReq ); + clonedReq-> nextReqInHashEntry = waitingReq->nextReqInHashEntry; + waitingReq->nextReqInHashEntry = clonedReq; + DEBUG__printf2( dbgRqstHdlr, "linked requests: %p, %p ", clonedReq,\ + waitingReq ) + return; + } + else + { + #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC + Dependency newd; + newd.from_vp = senderID->slaveID; + newd.from_task = senderID->assignCount; + newd.to_vp = receiverID->slaveID; + newd.to_task = receiverID->assignCount +1; + //(newd,semEnv->commDependenciesList); + addToListOfArrays(Dependency,newd,semEnv->dynDependenciesList); + int32 groupId = semReq->msgType; + if(semEnv->ntonGroupsInfo->numInArray <= groupId){ + makeHighestDynArrayIndexBeAtLeast(semEnv->ntonGroupsInfo, groupId); + } + if(semEnv->ntonGroups[groupId] == NULL){ + semEnv->ntonGroups[groupId] = new_NtoN(groupId); + } + Unit u; + u.vp = senderID->slaveID; + u.task = senderID->assignCount; + addToListOfArrays(Unit,u,semEnv->ntonGroups[groupId]->senders); + u.vp = receiverID->slaveID; + u.task = receiverID->assignCount +1; + addToListOfArrays(Unit,u,semEnv->ntonGroups[groupId]->receivers); + #endif + + //set receiver slave, from the waiting request + receiverSlv = waitingReq->receiverSlv; + + //waiting request is a receive_type_to, so it pairs to this send + //First, remove the waiting receive request from the entry + entry->content = waitingReq->nextReqInHashEntry; + VMS_PI__free( waitingReq ); //Don't use contents -- so free it + + if( entry->content == NULL ) + { //TODO: mod hash table to double-link, so can delete entry from + // table without hashing the key and looking it up again + deleteEntryFromTable32( (uint32*)entry->key, commHashTbl ); //frees hashEntry + } + + //attach msg that's in this send request to receiving task's Slv + // when comes back from suspend 
will have msg in dataRetFromReq + receiverSlv->dataRetFromReq = semReq->msg; + + //bring both processors back from suspend + resume_slaveVP( senderSlv, semEnv ); + resume_slaveVP( receiverSlv, semEnv ); + + return; + } + } + + +/*Looks like can make single handler for both sends.. + */ +//TODO: combine both send handlers into single handler +inline void +handleSendFromTo( VSsSemReq *semReq, VSsSemEnv *semEnv) + { SlaveVP *senderSlv, *receiverSlv; + int32 *senderID, *receiverID; + int32 *key, keySz, receiverIDNumInt, senderIDNumInt; + VSsSemReq *waitingReq; + HashEntry *entry; + HashTable *commHashTbl = semEnv->commHashTbl; + + DEBUG__printf2(dbgRqstHdlr,"SendFromTo req from task %d to %d", + semReq->senderID[1],semReq->receiverID[1]) + + receiverID = semReq->receiverID; //For "send", know both send & recv procrs + senderID = semReq->senderID; + senderSlv = semReq->senderSlv; + + //suspending a task always makes the slave into an extra slot slave, + // because it ends up in the resumeQ, even when resumes immediately. 
+ //Eventually task_end will put the slave into the freeExtraTaskSlvQ + replaceWithNewSlotSlvIfNeeded( senderSlv, semEnv ); + + receiverIDNumInt = receiverID[0] + 1; //include the count in the key + senderIDNumInt = senderID[0] + 1; + keySz = (receiverIDNumInt + senderIDNumInt) * sizeof(int32) + sizeof(int32); + key = VMS_PI__malloc( keySz ); + key[0] = receiverIDNumInt + senderIDNumInt; + memcpy( &key[1], receiverID, receiverIDNumInt * sizeof(int32) ); + memcpy( &key[1 + receiverIDNumInt], senderID, senderIDNumInt * sizeof(int32) ); + + entry = giveEntryElseInsertReqst32( key, semReq, commHashTbl ); + if( entry == NULL ) //was just inserted, means task has to wait + { return; + } + + waitingReq = (VSsSemReq *)entry->content; + + //At this point, know have waiting request(s) -- either sends or recv + if( waitingReq->reqType == send_from_to ) + { printf("\n ERROR: shouldn't be two send-from-tos waiting \n"); + } + else + { //waiting request is a receive, so it completes pair with this send + #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC + Dependency newd; + newd.from_vp = sendPr->slaveID; + newd.from_task = sendPr->assignCount; + newd.to_vp = receivePr->slaveID; + newd.to_task = receivePr->assignCount +1; + //addToListOfArraysDependency(newd,semEnv->commDependenciesList); + addToListOfArrays(Dependency,newd,semEnv->commDependenciesList); + #endif + + //set receiver slave, from the waiting request + receiverSlv = waitingReq->receiverSlv; + + //First, remove the waiting receive request from the entry + entry->content = waitingReq->nextReqInHashEntry; + VMS_PI__free( waitingReq ); //Don't use contents -- so free it + + //can only be one waiting req for "from-to" semantics + if( entry->content != NULL ) + { + printf("\nERROR in handleSendFromTo\n"); + } + deleteEntryFromTable32( (uint32*)entry->key, commHashTbl ); //frees HashEntry + + //attach msg that's in this send request to receiving procr + // when comes back from suspend, will have msg in dataRetFromReq + 
receiverSlv->dataRetFromReq = semReq->msg; + + //bring both processors back from suspend + resume_slaveVP( senderSlv, semEnv ); + resume_slaveVP( receiverSlv, semEnv ); + + return; + } + } + + + +//============================== Receives =========================== +// + + +inline void +handleReceiveTypeTo( VSsSemReq *semReq, VSsSemEnv *semEnv) + { SlaveVP *senderSlv, *receiverSlv; + int32 *receiverID; + int32 *key, keySz, receiverIDNumInt; + VSsSemReq *waitingReq; + HashEntry *entry; + HashTable *commHashTbl = semEnv->commHashTbl; + + DEBUG__printf2(dbgRqstHdlr,"ReceiveType req to ID: %d type: %d",semReq->receiverID[1], semReq->msgType) + + receiverID = semReq->receiverID; //For "send", know both send & recv procrs + receiverSlv = semReq->receiverSlv; + + //suspending a task always makes the slave into an extra slot slave, + // because it ends up in the resumeQ, even when resumes immediately. + //Eventually task_end will put the slave into the freeExtraTaskSlvQ + replaceWithNewSlotSlvIfNeeded( receiverSlv, semEnv ); + + //key is the receiverID plus the type -- have to copy them into key + receiverIDNumInt = receiverID[0] + 1; //pos 0 doesn't include itself + keySz = receiverIDNumInt * sizeof(int32) + 2 * sizeof(int32); + key = VMS_PI__malloc( keySz ); + key[0] = receiverIDNumInt + 1; //loc 0 is num int32s in key + memcpy( &key[1], receiverID, receiverIDNumInt * sizeof(int32) ); + key[ 1 + receiverIDNumInt ] = semReq->msgType; + + entry = giveEntryElseInsertReqst32( key, semReq, commHashTbl );//clones + if( entry == NULL ) //was just inserted, means task has to wait + { return; + } + + waitingReq = (VSsSemReq *)entry->content; //previously cloned by insert + + //At this point, know have waiting request(s) -- should be send(s) + if( waitingReq->reqType == send_type_to ) + { + //set sending slave from the request + senderSlv = waitingReq->senderSlv; + + //waiting request is a send, so pair it with this receive + //first, remove the waiting send request from the list 
in entry + entry->content = waitingReq->nextReqInHashEntry; + if( entry->content == NULL ) + { deleteEntryFromTable32( (uint32*)entry->key, commHashTbl ); //frees HashEntry + } + + //attach msg that's in the send request to receiving procr + // when comes back from suspend, will have msg in dataRetFromReq + receiverSlv->dataRetFromReq = waitingReq->msg; + + //bring both processors back from suspend + VMS_PI__free( waitingReq ); + + #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC + Dependency newd; + newd.from_vp = sendPr->slaveID; + newd.from_task = sendPr->assignCount; + newd.to_vp = receivePr->slaveID; + newd.to_task = receivePr->assignCount +1; + //addToListOfArraysDependency(newd,semEnv->commDependenciesList); + addToListOfArrays(Dependency,newd,semEnv->dynDependenciesList); + int32 groupId = semReq->msgType; + if(semEnv->ntonGroupsInfo->numInArray <= groupId){ + makeHighestDynArrayIndexBeAtLeast(semEnv->ntonGroupsInfo, groupId); + } + if(semEnv->ntonGroups[groupId] == NULL){ + semEnv->ntonGroups[groupId] = new_NtoN(groupId); + } + Unit u; + u.vp = sendPr->slaveID; + u.task = sendPr->assignCount; + addToListOfArrays(Unit,u,semEnv->ntonGroups[groupId]->senders); + u.vp = receivePr->slaveID; + u.task = receivePr->assignCount +1; + addToListOfArrays(Unit,u,semEnv->ntonGroups[groupId]->receivers); + #endif + + resume_slaveVP( senderSlv, semEnv ); + resume_slaveVP( receiverSlv, semEnv ); + + return; + } + printf("\nLang Impl Error: Should never be two waiting receives!\n"); + } + + +/* + */ +inline void +handleReceiveFromTo( VSsSemReq *semReq, VSsSemEnv *semEnv) + { SlaveVP *senderSlv, *receiverSlv; + int32 *senderID, *receiverID; + int32 *key, keySz, receiverIDNumInt, senderIDNumInt; + VSsSemReq *waitingReq; + HashEntry *entry; + HashTable *commHashTbl = semEnv->commHashTbl; + + DEBUG__printf2(dbgRqstHdlr,"RecFromTo req from ID: %d to ID: %d",semReq->senderID[1],semReq->receiverID[1]) + + receiverID = semReq->receiverID; //For "send", know both send & recv procrs + senderID = 
semReq->senderID; + receiverSlv = semReq->receiverSlv; + + //suspending a task always makes the slave into an extra slot slave, + // because it ends up in the resumeQ, even when resumes immediately. + //Eventually task_end will put the slave into the freeExtraTaskSlvQ + replaceWithNewSlotSlvIfNeeded( receiverSlv, semEnv ); + + receiverIDNumInt = receiverID[0] + 1; //pos 0 doesn't include itself + senderIDNumInt = senderID[0] + 1; + keySz = (receiverIDNumInt + senderIDNumInt) * sizeof(int32) + sizeof(int32); + key = VMS_PI__malloc( keySz ); + key[0] = receiverIDNumInt + senderIDNumInt; //loc 0 is num int32s in key + memcpy( &key[1], receiverID, receiverIDNumInt * sizeof(int32) ); + memcpy( &key[1 + receiverIDNumInt], senderID, senderIDNumInt * sizeof(int32)); + + entry = giveEntryElseInsertReqst32( key, semReq, commHashTbl ); + if( entry == NULL ) //was just inserted, means task has to wait + { return; + } + + waitingReq = (VSsSemReq *)entry->content; + + //At this point, know have a request to rendez-vous -- should be send + if( waitingReq->reqType == send_from_to ) + { //waiting request is a send, so pair it with this receive + #ifdef HOLISTIC__TURN_ON_OBSERVE_UCC + Dependency newd; + newd.from_vp = sendPr->slaveID; + newd.from_task = sendPr->assignCount; + newd.to_vp = receivePr->slaveID; + newd.to_task = receivePr->assignCount +1; + //addToListOfArraysDependency(newd,semEnv->commDependenciesList); + addToListOfArrays(Dependency,newd,semEnv->commDependenciesList); + #endif + + //have receiver slave, now set sender slave + senderSlv = waitingReq->senderSlv; + + //For from-to, should only ever be a single reqst waiting tobe paird + entry->content = waitingReq->nextReqInHashEntry; + if( entry->content != NULL ) printf("\nERROR in handleRecvFromTo\n"); + deleteEntryFromTable32( (uint32*)entry->key, commHashTbl ); //frees entry too + + //attach msg that's in the send request to receiving procr + // when comes back from suspend, will have msg in dataRetFromReq + 
receiverSlv->dataRetFromReq = waitingReq->msg; + + //bring both processors back from suspend + VMS_PI__free( waitingReq ); + + resume_slaveVP( senderSlv, semEnv ); + resume_slaveVP( receiverSlv, semEnv ); + + return; + } + printf("\nLang Impl Error: Should never be two waiting receives!\n"); + } + +//========================================================================== +inline void +replaceWithNewSlotSlvIfNeeded( SlaveVP *requestingSlv, VSsSemEnv *semEnv ) + { SlaveVP *newSlotSlv; + VSsSemData *semData, *reqSemData; + + reqSemData = (VSsSemData *)requestingSlv->semanticData; + if( reqSemData->slaveType != SlotTaskSlv ) + return; //already replaced, so just return + + //get a new slave to be the slot slave + newSlotSlv = readPrivQ( semEnv->freeExtraTaskSlvQ ); + if( newSlotSlv == NULL ) + { newSlotSlv = VSs__create_slave_helper( &idle_fn, NULL, semEnv, 0); + //just made a new extra task slave, so count it + semEnv->numLiveExtraTaskSlvs += 1; + } + + //set slave values to make it the slot slave + semData = newSlotSlv->semanticData; + semData->taskStub = NULL; + semData->slaveType = SlotTaskSlv; + semData->needsTaskAssigned = TRUE; + + //a slot slave is pinned to a particular slot on a particular core + newSlotSlv->animSlotAssignedTo = requestingSlv->animSlotAssignedTo; + newSlotSlv->coreAnimatedBy = requestingSlv->coreAnimatedBy; + + //put it into the slot slave matrix + int32 slotNum = requestingSlv->animSlotAssignedTo->slotIdx; + int32 coreNum = requestingSlv->coreAnimatedBy; + semEnv->slotTaskSlvs[coreNum][slotNum] = newSlotSlv; + + //Fix up requester, to be an extra slave now (but not a free one) + // because it's not free, doesn't go into freeExtraTaskSlvQ + semData = requestingSlv->semanticData; + semData->slaveType = ExtraTaskSlv; + } + +inline void +handleTaskwait( VSsSemReq *semReq, SlaveVP *requestingSlv, VSsSemEnv *semEnv) + { VSsTaskStub* requestingTaskStub; + VSsSemData* semData; + DEBUG__printf1(dbgRqstHdlr,"Taskwait request from processor %d", + 
requestingSlv->slaveID) + + semData = (VSsSemData *)semReq->callingSlv->semanticData; + requestingTaskStub = semData->taskStub; + + if( semData->taskStub->numLiveChildTasks == 0 ) + { //nobody to wait for, resume + resume_slaveVP( requestingSlv, semEnv ); + } + else //have to wait, replace requester with new slot slv & mark waiting + { + if(semData->slaveType == SlotTaskSlv){ + replaceWithNewSlotSlvIfNeeded( requestingSlv, semEnv ); + } + + requestingTaskStub->isWaitingForChildTasksToEnd = TRUE; + } + } + + +//========================================================================== +/* + */ +void +handleMalloc( VSsSemReq *semReq, SlaveVP *requestingSlv, VSsSemEnv *semEnv ) + { void *ptr; + + DEBUG__printf1(dbgRqstHdlr,"Malloc request from processor %d",requestingSlv->slaveID) + + ptr = VMS_PI__malloc( semReq->sizeToMalloc ); + requestingSlv->dataRetFromReq = ptr; + resume_slaveVP( requestingSlv, semEnv ); + } + +/* + */ +void +handleFree( VSsSemReq *semReq, SlaveVP *requestingSlv, VSsSemEnv *semEnv ) + { + DEBUG__printf1(dbgRqstHdlr,"Free request from processor %d",requestingSlv->slaveID) + VMS_PI__free( semReq->ptrToFree ); + resume_slaveVP( requestingSlv, semEnv ); + } + + +//=========================================================================== +// +/*Uses ID as index into array of flags. If flag already set, resumes from + * end-label. Else, sets flag and resumes normally. 
+ */ +void inline +handleStartSingleton_helper( VSsSingleton *singleton, SlaveVP *reqstingSlv, + VSsSemEnv *semEnv ) + { + if( singleton->hasFinished ) + { //the code that sets the flag to true first sets the end instr addr + reqstingSlv->dataRetFromReq = singleton->endInstrAddr; + resume_slaveVP( reqstingSlv, semEnv ); + return; + } + else if( singleton->hasBeenStarted ) + { //singleton is in-progress in a diff slave, so wait for it to finish + writePrivQ(reqstingSlv, singleton->waitQ ); + return; + } + else + { //hasn't been started, so this is the first attempt at the singleton + singleton->hasBeenStarted = TRUE; + reqstingSlv->dataRetFromReq = 0x0; + resume_slaveVP( reqstingSlv, semEnv ); + return; + } + } +void inline +handleStartFnSingleton( VSsSemReq *semReq, SlaveVP *requestingSlv, + VSsSemEnv *semEnv ) + { VSsSingleton *singleton; + DEBUG__printf1(dbgRqstHdlr,"StartFnSingleton request from processor %d",requestingSlv->slaveID) + + singleton = &(semEnv->fnSingletons[ semReq->singletonID ]); + handleStartSingleton_helper( singleton, requestingSlv, semEnv ); + } +void inline +handleStartDataSingleton( VSsSemReq *semReq, SlaveVP *requestingSlv, + VSsSemEnv *semEnv ) + { VSsSingleton *singleton; + + DEBUG__printf1(dbgRqstHdlr,"StartDataSingleton request from processor %d",requestingSlv->slaveID) + if( *(semReq->singletonPtrAddr) == NULL ) + { singleton = VMS_PI__malloc( sizeof(VSsSingleton) ); + singleton->waitQ = makeVMSQ(); + singleton->endInstrAddr = 0x0; + singleton->hasBeenStarted = FALSE; + singleton->hasFinished = FALSE; + *(semReq->singletonPtrAddr) = singleton; + } + else + singleton = *(semReq->singletonPtrAddr); + handleStartSingleton_helper( singleton, requestingSlv, semEnv ); + } + + +void inline +handleEndSingleton_helper( VSsSingleton *singleton, SlaveVP *requestingSlv, + VSsSemEnv *semEnv ) + { PrivQueueStruc *waitQ; + int32 numWaiting, i; + SlaveVP *resumingSlv; + + if( singleton->hasFinished ) + { //by definition, only one slave should ever be 
able to run end singleton + // so if this is true, is an error + ERROR1( "singleton code ran twice", requestingSlv ); + } + + singleton->hasFinished = TRUE; + waitQ = singleton->waitQ; + numWaiting = numInPrivQ( waitQ ); + for( i = 0; i < numWaiting; i++ ) + { //they will resume inside start singleton, then jmp to end singleton + resumingSlv = readPrivQ( waitQ ); + resumingSlv->dataRetFromReq = singleton->endInstrAddr; + resume_slaveVP( resumingSlv, semEnv ); + } + + resume_slaveVP( requestingSlv, semEnv ); + +} +void inline +handleEndFnSingleton( VSsSemReq *semReq, SlaveVP *requestingSlv, + VSsSemEnv *semEnv ) + { + VSsSingleton *singleton; + + DEBUG__printf1(dbgRqstHdlr,"EndFnSingleton request from processor %d",requestingSlv->slaveID) + + singleton = &(semEnv->fnSingletons[ semReq->singletonID ]); + handleEndSingleton_helper( singleton, requestingSlv, semEnv ); + } +void inline +handleEndDataSingleton( VSsSemReq *semReq, SlaveVP *requestingSlv, + VSsSemEnv *semEnv ) + { + VSsSingleton *singleton; + + DEBUG__printf1(dbgRqstHdlr,"EndDataSingleton request from processor %d",requestingSlv->slaveID) + + singleton = *(semReq->singletonPtrAddr); + handleEndSingleton_helper( singleton, requestingSlv, semEnv ); + } + + +/*This executes the function in the masterVP, take the function + * pointer out of the request and call it, then resume the VP. + */ +void +handleAtomic( VSsSemReq *semReq, SlaveVP *requestingSlv, VSsSemEnv *semEnv ) + { + DEBUG__printf1(dbgRqstHdlr,"Atomic request from processor %d",requestingSlv->slaveID) + semReq->fnToExecInMaster( semReq->dataForFn ); + resume_slaveVP( requestingSlv, semEnv ); + } + +/*First, it looks at the VP's semantic data, to see the highest transactionID + * that VP + * already has entered. If the current ID is not larger, it throws an + * exception stating a bug in the code. 
+ *Otherwise it puts the current ID + * there, and adds the ID to a linked list of IDs entered -- the list is + * used to check that exits are properly ordered. + *Next it is uses transactionID as index into an array of transaction + * structures. + *If the "VP_currently_executing" field is non-null, then put requesting VP + * into queue in the struct. (At some point a holder will request + * end-transaction, which will take this VP from the queue and resume it.) + *If NULL, then write requesting into the field and resume. + */ +void +handleTransStart( VSsSemReq *semReq, SlaveVP *requestingSlv, + VSsSemEnv *semEnv ) + { VSsSemData *semData; + TransListElem *nextTransElem; + + DEBUG__printf1(dbgRqstHdlr,"TransStart request from processor %d",requestingSlv->slaveID) + + //check ordering of entering transactions is correct + semData = requestingSlv->semanticData; + if( semData->highestTransEntered > semReq->transID ) + { //throw VMS exception, which shuts down VMS. + VMS_PI__throw_exception( "transID smaller than prev", requestingSlv, NULL); + } + //add this trans ID to the list of transactions entered -- check when + // end a transaction + semData->highestTransEntered = semReq->transID; + nextTransElem = VMS_PI__malloc( sizeof(TransListElem) ); + nextTransElem->transID = semReq->transID; + nextTransElem->nextTrans = semData->lastTransEntered; + semData->lastTransEntered = nextTransElem; + + //get the structure for this transaction ID + VSsTrans * + transStruc = &(semEnv->transactionStrucs[ semReq->transID ]); + + if( transStruc->VPCurrentlyExecuting == NULL ) + { + transStruc->VPCurrentlyExecuting = requestingSlv; + resume_slaveVP( requestingSlv, semEnv ); + } + else + { //note, might make future things cleaner if save request with VP and + // add this trans ID to the linked list when gets out of queue. + // but don't need for now, and lazy.. 
+ writePrivQ( requestingSlv, transStruc->waitingVPQ ); + } + } + + +/*Use the trans ID to get the transaction structure from the array. + *Look at VP_currently_executing to be sure it's same as requesting VP. + * If different, throw an exception, stating there's a bug in the code. + *Next, take the first element off the list of entered transactions. + * Check to be sure the ending transaction is the same ID as the next on + * the list. If not, incorrectly nested so throw an exception. + * + *Next, get from the queue in the structure. + *If it's empty, set VP_currently_executing field to NULL and resume + * requesting VP. + *If get somethine, set VP_currently_executing to the VP from the queue, then + * resume both. + */ +void +handleTransEnd(VSsSemReq *semReq, SlaveVP *requestingSlv, VSsSemEnv *semEnv) + { VSsSemData *semData; + SlaveVP *waitingSlv; + VSsTrans *transStruc; + TransListElem *lastTrans; + + DEBUG__printf1(dbgRqstHdlr,"TransEnd request from processor %d",requestingSlv->slaveID) + + transStruc = &(semEnv->transactionStrucs[ semReq->transID ]); + + //make sure transaction ended in same VP as started it. 
+ if( transStruc->VPCurrentlyExecuting != requestingSlv ) + { + VMS_PI__throw_exception( "trans ended in diff VP", requestingSlv, NULL ); + } + + //make sure nesting is correct -- last ID entered should == this ID + semData = requestingSlv->semanticData; + lastTrans = semData->lastTransEntered; + if( lastTrans->transID != semReq->transID ) + { + VMS_PI__throw_exception( "trans incorrectly nested", requestingSlv, NULL ); + } + + semData->lastTransEntered = semData->lastTransEntered->nextTrans; + + + waitingSlv = readPrivQ( transStruc->waitingVPQ ); + transStruc->VPCurrentlyExecuting = waitingSlv; + + if( waitingSlv != NULL ) + resume_slaveVP( waitingSlv, semEnv ); + + resume_slaveVP( requestingSlv, semEnv ); + } diff -r 000000000000 -r 9f2a7bd26dd9 DKU_Request_Handlers.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DKU_Request_Handlers.h Mon Aug 27 02:14:35 2012 -0700 @@ -0,0 +1,62 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#ifndef _VSs_REQ_H +#define _VSs_REQ_H + +#include "VSs.h" + +/*This header defines everything specific to the VSs semantic plug-in + */ + +inline void +handleSubmitTask( VSsSemReq *semReq, VSsSemEnv *semEnv); +inline void +handleEndTask( VSsSemReq *semReq, VSsSemEnv *semEnv); +inline void +handleSendTypeTo( VSsSemReq *semReq, VSsSemEnv *semEnv); +inline void +handleSendFromTo( VSsSemReq *semReq, VSsSemEnv *semEnv); +inline void +handleReceiveTypeTo( VSsSemReq *semReq, VSsSemEnv *semEnv); +inline void +handleReceiveFromTo( VSsSemReq *semReq, VSsSemEnv *semEnv); +inline void +handleTaskwait(VSsSemReq *semReq, SlaveVP *requestingSlv, VSsSemEnv *semEnv); + +inline void +handleMalloc( VSsSemReq *semReq, SlaveVP *requestingSlv, VSsSemEnv *semEnv); +inline void +handleFree( VSsSemReq *semReq, SlaveVP *requestingSlv, VSsSemEnv *semEnv ); +inline void +handleTransEnd(VSsSemReq *semReq, SlaveVP *requestingSlv, VSsSemEnv*semEnv); 
+inline void +handleTransStart( VSsSemReq *semReq, SlaveVP *requestingSlv, + VSsSemEnv *semEnv ); +inline void +handleAtomic( VSsSemReq *semReq, SlaveVP *requestingSlv, VSsSemEnv *semEnv); +inline void +handleStartFnSingleton( VSsSemReq *semReq, SlaveVP *reqstingSlv, + VSsSemEnv *semEnv ); +inline void +handleEndFnSingleton( VSsSemReq *semReq, SlaveVP *requestingSlv, + VSsSemEnv *semEnv ); +inline void +handleStartDataSingleton( VSsSemReq *semReq, SlaveVP *reqstingSlv, + VSsSemEnv *semEnv ); +inline void +handleEndDataSingleton( VSsSemReq *semReq, SlaveVP *requestingSlv, + VSsSemEnv *semEnv ); +inline void +free_task_stub( VSsTaskStub *stubToFree ); +inline void +replaceWithNewSlotSlvIfNeeded( SlaveVP *requestingSlv, VSsSemEnv *semEnv ); + + +#endif /* _VSs_REQ_H */ + diff -r 000000000000 -r 9f2a7bd26dd9 DKU_singleton_asm.s --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DKU_singleton_asm.s Mon Aug 27 02:14:35 2012 -0700 @@ -0,0 +1,21 @@ + +//Assembly code takes the return addr off the stack and saves +// into the singleton. 
The first field in the singleton is the +// "endInstrAddr" field, and the return addr is at 0x4(%ebp) +.globl asm_save_ret_to_singleton +asm_save_ret_to_singleton: + movq 0x8(%rbp), %rax #get ret address, ebp is the same as in the calling function + movq %rax, (%rdi) #write ret addr to endInstrAddr field + ret + + +//Assembly code changes the return addr on the stack to the one +// saved into the singleton by the end-singleton-fn +//The stack's return addr is at 0x4(%%ebp) +.globl asm_write_ret_from_singleton +asm_write_ret_from_singleton: + movq (%rdi), %rax #get endInstrAddr field + movq %rax, 0x8(%rbp) #write return addr to the stack of the caller + ret + + diff -r 000000000000 -r 9f2a7bd26dd9 Measurement/DKU_Counter_Recording.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Measurement/DKU_Counter_Recording.c Mon Aug 27 02:14:35 2012 -0700 @@ -0,0 +1,121 @@ +/* + * + * author: Nina Engelhardt + */ + +#include "VSs_Counter_Recording.h" +#include "VMS_impl/VMS.h" +#include "VSs_impl/VSs.h" + +#ifdef HOLISTIC__TURN_ON_PERF_COUNTERS + +void VSs__init_counter_data_structs(){ + VSsSemEnv *semanticEnv = _VMSMasterEnv->semanticEnv; + int i; + for(i=0;icounterList[i] = makeListOfArrays(sizeof(CounterEvent), 128); + } +} + +void addToListOfArraysCounterEvent(CounterEvent value, ListOfArrays* list){ + int offset_in_fragment = list->next_free_index % list->num_entries_per_fragment; + if(offset_in_fragment == 0){ + void* newBlock = malloc(list->entry_size * list->num_entries_per_fragment); + addToDynArray(newBlock,list->dim1info); + } + CounterEvent* typedFragment = (CounterEvent*) ((list->dim1)[list->dim1info->numInArray -1]); + typedFragment[offset_in_fragment] = value; + list->next_free_index++; +} + +void VSs__counter_handler(int evt_type, int vpid, int task, SlaveVP* pr, uint64 cycles, uint64 instrs) +{ + + if (pr->typeOfVP == Master || pr->typeOfVP == Shutdown) + { //Only save values for application work, done in a SlaveVP + return; + } + + VSsSemEnv *semanticEnv 
= _VMSMasterEnv->semanticEnv; + + CounterEvent e; + e.event_type = evt_type; + e.vp = vpid; + e.task = task; + + e.cycles = cycles; + e.instrs = instrs; + + if(pr){ + e.coreID = pr->coreAnimatedBy; + e.slot = pr->animSlotAssignedTo; + } else { + e.coreID = -1; + e.slot = NULL; + } + + int corenum; + + if(pr) corenum = pr->coreAnimatedBy; else return; + + if(evt_type==Work_start || evt_type==Work_end || evt_type==AppResponderInvocation_start){ + addToListOfArrays_ext(CounterEvent,e,semanticEnv->counterList[corenum]); + } else { + addToListOfArraysCounterEvent(e,semanticEnv->counterList[corenum]); + } +} + +void set_counter_file(FILE* f){ + counterfile = f; +} + +void print_counter_events_to_file(void* _e){ + CounterEvent* e = (CounterEvent*) _e; + fprintf(counterfile, "event, "); + switch(e->event_type){ + case AppResponderInvocation_start: + fprintf(counterfile, "AppResponderInvocation_start"); + break; + case AppResponder_start: + fprintf(counterfile, "AppResponder_start"); + break; + case AppResponder_end: + fprintf(counterfile, "AppResponder_end"); + break; + case AssignerInvocation_start: + fprintf(counterfile, "AssignerInvocation_start"); + break; + case NextAssigner_start: + fprintf(counterfile, "NextAssigner_start"); + break; + case Assigner_start: + fprintf(counterfile, "Assigner_start"); + break; + case Assigner_end: + fprintf(counterfile, "Assigner_end"); + break; + case Work_end: + fprintf(counterfile, "Work_end"); + break; + case Work_start: + fprintf(counterfile, "Work_start"); + break; + case HwResponderInvocation_start: + fprintf(counterfile, "HwResponderInvocation_start"); + break; + case Timestamp_start: + fprintf(counterfile, "Timestamp_start"); + break; + case Timestamp_end: + fprintf(counterfile, "Timestamp_end"); + break; + default: + fprintf(counterfile, "unknown event"); + } + fprintf(counterfile,", %d, %d, %llu, %llu",e->vp,e->task,e->cycles,e->instrs); + if(e->coreID >=0) + fprintf(counterfile,", %d",e->coreID); + fprintf(counterfile,"\n"); 
+ fflush(counterfile); +} +#endif diff -r 000000000000 -r 9f2a7bd26dd9 Measurement/DKU_Counter_Recording.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Measurement/DKU_Counter_Recording.h Mon Aug 27 02:14:35 2012 -0700 @@ -0,0 +1,33 @@ +/* + * File: VSs_Counter_Recording.h + * Author: nengel + * + * Created on January 11, 2012, 3:03 PM + */ + +#ifndef VSs_COUNTER_RECORDING_H +#define VSs_COUNTER_RECORDING_H + +#include "VMS_impl/VMS.h" + +typedef struct { + int event_type; + int coreID; + AnimSlot* slot; + int vp; + int task; + uint64 cycles; + uint64 instrs; +} CounterEvent; + +FILE* counterfile; + +void VSs__init_counter_data_structs(); + +void VSs__counter_handler(int evt_type, int vpid, int task, SlaveVP* pr, uint64 cycles, uint64 instrs); + +void set_counter_file(FILE* f); + +void print_counter_events_to_file(void* _e); +#endif /* VSs_COUNTER_RECORDING_H */ + diff -r 000000000000 -r 9f2a7bd26dd9 Measurement/DKU_Measurement.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Measurement/DKU_Measurement.h Mon Aug 27 02:14:35 2012 -0700 @@ -0,0 +1,87 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#ifndef _VSs_MEAS_H +#define _VSs_MEAS_H + + +#ifdef MEAS__TURN_ON_LANG_MEAS + + #ifdef MEAS__Make_Meas_Hists_for_Language + #undef MEAS__Make_Meas_Hists_for_Language + #endif + + +//=================== Language-specific Measurement Stuff =================== +// +// + #define SendFromToHistIdx 1 //note: starts at 1 + #define SendOfTypeHistIdx 2 + #define ReceiveFromToHistIdx 3 + #define ReceiveOfTypeHistIdx 4 + + #define MEAS__Make_Meas_Hists_for_Language \ + _VMSMasterEnv->measHistsInfo = \ + makePrivDynArrayOfSize( (void***)&(_VMSMasterEnv->measHists), 200); \ + makeAMeasHist( SendFromToHistIdx, "SendFromTo", 50, 0, 100 ) \ + makeAMeasHist( SendOfTypeHistIdx, "SendOfType", 50, 0, 100 ) \ + makeAMeasHist( ReceiveFromToHistIdx,"ReceiveFromTo", 50, 
0, 100 ) \ + makeAMeasHist( ReceiveOfTypeHistIdx,"ReceiveOfType", 50, 0, 100 ) + + #define Meas_startSendFromTo \ + int32 startStamp, endStamp; \ + saveLowTimeStampCountInto( startStamp ); \ + + #define Meas_endSendFromTo \ + saveLowTimeStampCountInto( endStamp ); \ + addIntervalToHist( startStamp, endStamp, \ + _VMSMasterEnv->measHists[ SendFromToHistIdx ] ); + + #define Meas_startSendOfType \ + int32 startStamp, endStamp; \ + saveLowTimeStampCountInto( startStamp ); \ + + #define Meas_endSendOfType \ + saveLowTimeStampCountInto( endStamp ); \ + addIntervalToHist( startStamp, endStamp, \ + _VMSMasterEnv->measHists[ SendOfTypeHistIdx ] ); + + #define Meas_startReceiveFromTo \ + int32 startStamp, endStamp; \ + saveLowTimeStampCountInto( startStamp ); \ + + #define Meas_endReceiveFromTo \ + saveLowTimeStampCountInto( endStamp ); \ + addIntervalToHist( startStamp, endStamp, \ + _VMSMasterEnv->measHists[ ReceiveFromToHistIdx ] ); + + #define Meas_startReceiveOfType \ + int32 startStamp, endStamp; \ + saveLowTimeStampCountInto( startStamp ); \ + + #define Meas_endReceiveOfType \ + saveLowTimeStampCountInto( endStamp ); \ + addIntervalToHist( startStamp, endStamp, \ + _VMSMasterEnv->measHists[ReceiveOfTypeHistIdx ] ); + +#else //===================== turned off ========================== + + #define MEAS__Make_Meas_Hists_for_Language + #define Meas_startSendFromTo + #define Meas_endSendFromTo + #define Meas_startSendOfType + #define Meas_endSendOfType + #define Meas_startReceiveFromTo + #define Meas_endReceiveFromTo + #define Meas_startReceiveOfType + #define Meas_endReceiveOfType + +#endif /* MEAS__TURN_ON_LANG_MEAS */ + +#endif /* */ + diff -r 000000000000 -r 9f2a7bd26dd9 Measurement/dependency.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Measurement/dependency.c Mon Aug 27 02:14:35 2012 -0700 @@ -0,0 +1,78 @@ +#include "dependency.h" +#include "VMS_impl/VMS.h" + +Dependency* new_dependency(int from_vp, int from_task, int to_vp, int to_task){ + Dependency* 
newDep = (Dependency*) VMS_int__malloc(sizeof(Dependency)); + if (newDep!=NULL){ + newDep->from_vp = from_vp; + newDep->from_task = from_task; + newDep->to_vp = to_vp; + newDep->to_task = to_task; + } + return newDep; +} + +NtoN* new_NtoN(int id){ + NtoN* newn = (NtoN*) VMS_int__malloc(sizeof(NtoN)); + newn->id = id; + newn->senders = makeListOfArrays(sizeof(Unit), 64); + newn->receivers = makeListOfArrays(sizeof(Unit), 64); + return newn; +} + +int set_dependency_file(FILE* file){ + dependency_file = file; +} + +void print_ctl_dependency_to_file(void* _dep){ + Dependency* dep = (Dependency*) _dep; + if(!dep) return; + fprintf(dependency_file,"ctlDep,%d,%d,%d,%d\n",dep->from_vp,dep->from_task,dep->to_vp,dep->to_task); +} + +void print_comm_dependency_to_file(void* _dep){ + Dependency* dep = (Dependency*) _dep; + if(!dep) return; + fprintf(dependency_file,"commDep,%d,%d,%d,%d\n",dep->from_vp,dep->from_task,dep->to_vp,dep->to_task); +} + +void print_dyn_dependency_to_file(void* _dep){ + Dependency* dep = (Dependency*) _dep; + if(!dep) return; + fprintf(dependency_file,"dynDep,%d,%d,%d,%d\n",dep->from_vp,dep->from_task,dep->to_vp,dep->to_task); +} + +void print_hw_dependency_to_file(void* _dep){ + Dependency* dep = (Dependency*) _dep; + if(!dep) return; + fprintf(dependency_file,"hwDep,%d,%d,%d,%d\n",dep->from_vp,dep->from_task,dep->to_vp,dep->to_task); +} + +void print_dependency_to_file(void* _dep){ + Dependency* dep = (Dependency*) _dep; + if(!dep) return; + fprintf(dependency_file,"VP_%d_%d -> VP_%d_%d;\n",dep->from_vp,dep->from_task,dep->to_vp,dep->to_task); +} + +void print_unit_to_file(void* _unit){ + Unit* unit = (Unit*) _unit; + if(!unit) return; + fprintf(dependency_file,"unit,%d,%d\n",unit->vp,unit->task); +} + +void print_nton_set_helper(void* _u){ + Unit* u = (Unit*) _u; + if(!u) return; + fprintf(dependency_file,",%d,%d",u->vp,u->task); +} + +void print_nton_to_file(void* _nton){ + NtoN* nton = (NtoN*) _nton; + if(!nton) return; + 
//assert(nton->senders->next_free_index==nton->receivers->next_free_index); + int numInSet = nton->senders->next_free_index; + fprintf(dependency_file,"NtoN,%d",numInSet); + forAllInListOfArraysDo(nton->senders,&print_nton_set_helper); + forAllInListOfArraysDo(nton->receivers,&print_nton_set_helper); + fprintf(dependency_file,"\n"); +} \ No newline at end of file diff -r 000000000000 -r 9f2a7bd26dd9 Measurement/dependency.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Measurement/dependency.h Mon Aug 27 02:14:35 2012 -0700 @@ -0,0 +1,56 @@ +/* + * File: dependency.h + * Author: Nina Engelhardt + * + * Created on 29. August 2011, 17:41 + */ + +#ifndef _DEPENDENCY_H +#define _DEPENDENCY_H + + +#include +#include "ListOfArrays/ListOfArrays.h" + +typedef struct { + int vp; + int task; +} Unit; + +typedef struct { + int from_vp; + int from_task; + int to_vp; + int to_task; +} Dependency; + +typedef struct { + int32 id; + ListOfArrays* senders; + ListOfArrays* receivers; +} NtoN; + +FILE* dependency_file; + +Dependency* new_dependency(int from_vp, int from_task, int to_vp, int to_task); + +NtoN* new_NtoN(int id); + +int set_dependency_file(FILE* file); + +void print_ctl_dependency_to_file(void* _dep); + +void print_comm_dependency_to_file(void* _dep); + +void print_dyn_dependency_to_file(void* _dep); + +void print_hw_dependency_to_file(void* _dep); + +void print_dependency_to_file(void* dep); + +void print_unit_to_file(void* unit); + +void print_nton_to_file(void* _nton); + +#endif /* DEPENDENCY_H */ + diff -r 000000000000 -r 9f2a7bd26dd9 __brch__default --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/__brch__default Mon Aug 27 02:14:35 2012 -0700 @@ -0,0 +1,1 @@ +This branch is standard default