# HG changeset patch
# User Some Random Person <seanhalle@yahoo.com>
# Date 1331440538 28800
# Node ID 712218cdc4ba46a217d65d143fe099aaf6d26237
# Parent  10a72bcedbf02196ff080ab6b6b74a5a71d612c3
more changes to vms_impl and ssr_impl

diff -r 10a72bcedbf0 -r 712218cdc4ba CoreLoop.c
--- a/CoreLoop.c	Fri Mar 09 22:30:26 2012 -0800
+++ b/CoreLoop.c	Sat Mar 10 20:35:38 2012 -0800
@@ -14,180 +14,255 @@
 #include <pthread.h>
 #include <sched.h>
 
-void *terminateCoreLoop(SlaveVP *currSlv);
+//=====================  Functions local to this file =======================
+void *terminateCoreController(SlaveVP *currSlv);
+//===========================================================================
 
-/*This is the loop that runs in the OS Thread pinned to each core
- *Get Slv from queue,
- * save state of current animator, then load in state of Slv, using
- * jmp instr to switch the program-counter state -- making the Slv
- * the new animator.
- *At some point, the Slv will suspend itself by saving out its
- * animator state (stack ptr, frame ptr, program counter) and switching
- * back to the OS Thread's animator state, which means restoring the
- * stack and frame and jumping to the core loop start point.
- *This cycle then repeats, until a special shutdown virtual processor is
- * animated, which jumps to the end point at the bottom of core loop.
+
+/*The Core Controller is logically "beneath" the masterVP and slave VPs.  Its
+ * job is to control which of those VPs the core animates.  Any time one of
+ * those VPs suspends, the suspend-primitive switches the core over to
+ * animating the core controller.  The core controller then follows a very
+ * basic pattern to choose which VP will get animated next, then switches
+ * the core over to animating that VP.  So, all VPs switch the core to
+ * core controller, which then chooses which VP the core animates next.
+ *
+ *The way the core controller decides which VP to switch the core to next is:
+ * 1) There are a number of "scheduling slots", which the master VP fills up
+ *    with slave VPs that are ready to be animated.  So, the core controller
+ *    just iterates through the scheduling slots.  When the next slot has a
+ *    slave VP in it, the core controller switches the core over to animate
+ *    that slave.
+ * 2) When the core controller checks a scheduling slot, and it's empty,
+ *    then the controller switches the core over to animating the master VP,
+ *    whose job is to find more slave VPs ready, and assign those to 
+ *    scheduling slots.
+ *
+ *So, in effect, a scheduling slot functions as another layer of virtual
+ * processor.  A slot has the logical meaning of being an animator that
+ * animates the slave assigned to it.  However, the core controller sits
+ * below the slots, and sequences down them, assigning the actual physical
+ * core to each slot, in turn.
+ *The reason for having the scheduling slots and core controller is to 
+ * amortize the overhead of switching to the master VP and running it.  With
+ * multiple scheduling slots, the time to switch-to-master and the code in
+ * the master loop is divided by the number of scheduling slots.
+ *The core controller and scheduling slots are not fundamental parts of VMS,
+ * but rather optimizations put into the shared-semantic-state version of
+ * VMS.  Other versions of VMS will not have a core controller nor scheduling
+ * slots.
+ * 
+ *The core controller "owns" the physical core, in effect, and is the 
+ * function given to the pthread creation call.  Hence, it contains code
+ * related to pthread startup, synchronizing the controllers to all start
+ * at the same time-point, and pinning the pthreads to physical cores.
+ * 
  */
 void *
-coreLoop( void *paramsIn )
+coreController( void *paramsIn )
  { 
-   ThdParams      *coreLoopThdParams;
-   int32           thisCoresIdx, currSlotIdx;
+   int32           thisCoresIdx;
+   int32           numRepetitionsWithNoWork;
    SlaveVP        *currVP;
    SchedSlot      *currSlot, **schedSlots;
-   cpu_set_t       coreMask;  //has 1 in bit positions of allowed cores
+   int32           currSlotIdx;
+   int32          *addrOfMasterLock;
+   SlaveVP        *thisCoresMasterVP;
+      //Variables used for pthread related things
+   ThdParams      *coreCtlrThdParams;
+   cpu_set_t       coreMask;  //used during pinning pthread to CPU core
    int32           errorCode;
+      //Variables used during measurements
    TSCountLowHigh  endSusp;
+      //Variables used in random-backoff, for master-lock and waiting for work
+   volatile double workspace1,workspace2; //busy-wait fake work
+   uint32_t seed1 = rand()%1000; // init random number generator for retries
+   uint32_t seed2 = rand()%1000;
+      //Variable for work-stealing -- a gate protects a critical section
+   volatile GateStruc gate;      //on stack to avoid false-sharing
 
-      //work-stealing struc on stack to prevent false-sharing in cache-line
-   volatile GateStruc gate;
-   //preGateProgress, waitProgress, exitProgress, gateClosed;
-
-
-   coreLoopThdParams = (ThdParams *)paramsIn;
-   thisCoresIdx = coreLoopThdParams->coreNum;
+   
+   //===============  Initializations ===================
+   coreCtlrThdParams = (ThdParams *)paramsIn;
+   thisCoresIdx = coreCtlrThdParams->coreNum;
 
    gate.gateClosed      = FALSE;
    gate.preGateProgress = 0;
    gate.waitProgress    = 0;
    gate.exitProgress    = 0;
-   _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = (GateStruc*)&gate;//race @startup
+   //TODO: pad these to prevent false-sharing, and fix the race at startup
+   _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = (GateStruc*)&gate;
 
-      //wait until signalled that setup is complete
+      //Assembly that saves addr of label of return instr -- label in assembly
+   recordCoreCtlrReturnLabelAddr((void**)&(_VMSMasterEnv->coreCtlrReturnPt));
+
+   schedSlots = _VMSMasterEnv->allSchedSlots[thisCoresIdx];
+   currSlotIdx = 0; //start at slot 0, go up until one empty, then do master
+   numRepetitionsWithNoWork = 0;
+   addrOfMasterLock = &(_VMSMasterEnv->masterLock);
+   thisCoresMasterVP = _VMSMasterEnv->masterVPs[thisCoresIdx];
+   
+   //==================== pthread related stuff ======================
+      //pin the pthread to the core
+      //Linux requires pinning to be done inside the thread-function
+      //Designate a core by a 1 in bit-position corresponding to the core
+   CPU_ZERO(&coreMask); //initialize mask bits to zero
+   CPU_SET(coreCtlrThdParams->coreNum,&coreMask); //set bit repr the coreNum
+   pthread_t selfThd = pthread_self();
+   errorCode =
+   pthread_setaffinity_np( selfThd, sizeof(coreMask), &coreMask);
+   if(errorCode){ printf("\n pinning thd to core failed \n"); exit(1); } //non-zero status: this is a failure
+
+      //make sure the controllers all start at same time, by making them wait
    pthread_mutex_lock(   &suspendLock );
    while( !(_VMSMasterEnv->setupComplete) )
-    {
-      pthread_cond_wait( &suspendCond,
-                         &suspendLock );
+    { pthread_cond_wait( &suspendCond, &suspendLock );
     }
    pthread_mutex_unlock( &suspendLock );
 
-      //printf( "\nCore unsuspended: %d\n", coreLoopThdParams->coreNum );
+   //====================== The Core Controller ======================
+   while(1)  //An endless loop is just one way of doing the control structure
+    {        //Assembly code switches the core between animating a VP and
+             // animating this core controller.  The switch is done by
+             // changing the stack-pointer and frame-pointer and then doing
+             // an assembly jmp.  When reading this code, the effect is 
+             // that the "switchToSlv()" at the end of the loop is sort of a
+             // "warp in time" -- the core disappears inside this, jmps to
+             // animating a VP, and when that VP suspends, the suspend
+             // jmps back. This has the effect of "returning" from the
+             // switchToSlv() call. Then control loops back to here.
+             //Alternatively, the VP suspend primitive could just not bother
+             // returning from switchToSlv, and instead jmp directly to here.
+      
+      if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster;
+      currSlot = schedSlots[ currSlotIdx ];
 
-      //set thread affinity
-      //Linux requires pinning thd to core inside thread-function
-      //Designate a core by a 1 in bit-position corresponding to the core
-   CPU_ZERO(&coreMask);
-   CPU_SET(coreLoopThdParams->coreNum,&coreMask);
-   //coreMask = 1L << coreLoopThdParams->coreNum;
+      if( ! currSlot->needsSlaveAssigned ) //slot does have slave assigned
+       { numRepetitionsWithNoWork = 0;     //reset B2B master count
+         currSlotIdx ++;
+         currVP = currSlot->slaveAssignedToSlot;
+       }
+      else //slot is empty, so switch to master
+       {
+       switchToMaster:
+         currSlotIdx = 0; //doing switch to master, so start over at slot 0
+         currVP = NULL;
 
-   pthread_t selfThd = pthread_self();
-   errorCode =
-   pthread_setaffinity_np( selfThd, sizeof(coreMask), &coreMask);
-   
-   if(errorCode){ printf("\nset affinity failure\n"); exit(0); }
+               MEAS__Capture_Pre_Master_Lock_Point;
 
-   
-      //Save return addr from stack into master-env for use later
-   recordCoreLoopReturnLabelAddr((void**)&(_VMSMasterEnv->coreLoopReturnPt));
+         int tries = 0; int gotLock = 0;
+         while( currVP == NULL ) //keep going until get master lock
+          { 
+            gotLock = __sync_bool_compare_and_swap( addrOfMasterLock,
+                                                    UNLOCKED, LOCKED );
+            if( gotLock )
+             {    //At this point, have run out of slaves, so tried to get
+                  // the master lock, and have successfully gotten it.
+                  //So, set the currVP to this core's masterVP and break out
+                  // of the get-lock loop.  Below, assembly code will switch
+                  // the core over to animating the masterVP.  When it's 
+                  // done, the masterVP will use assembly to switch the core
+                  // back to animating this core controller
+               currVP = thisCoresMasterVP;
+               if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD )
+                {       DEBUG_Print( dbgB2BMaster,"Lots of reps w/o work\n");
+                  sched_yield(); //POSIX call; pthread_yield() is non-standard and deprecated
+                }
+               numRepetitionsWithNoWork += 1;
+               break;  //end while -- have a VP to animate now
+             }
 
-   currSlotIdx = 0; //start at slot 0, go up until one empty, then do master
-   
-   while(1){
-   
-   schedSlots = _VMSMasterEnv->allSchedSlots[thisCoresIdx];
+            tries++;   //if too many, means too much contention, so give up the core for a while
+            if( tries > MASTERLOCK_RETRIES_BEFORE_YIELD ) { tries = 0; sched_yield(); }
+          }
+               MEAS__Capture_Post_Master_Lock_Point;
+       }
 
-   if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster;
-   
-   currSlot = schedSlots[ currSlotIdx ];
-   
-   if( ! currSlot->needsSlaveAssigned ) //slot does have slave assigned
-    { _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0; //reset B2B master count
-      currSlotIdx ++;
-      currVP = currSlot->slaveAssignedToSlot;
-    }
-   else //slot is empty, so switch to master
-    {
-   switchToMaster:
-      currSlotIdx = 0; //switch to master, so start over at slot 0
-      currVP = NULL;
-   
-            MEAS__Capture_Pre_Master_Lock_Point;
-            
-      int tries = 0; int gotLock = 0;
-      while( currVP == NULL ) //keep going until get master lock
-       { 
-         gotLock = __sync_bool_compare_and_swap(&(_VMSMasterEnv->masterLock),
-                                                          UNLOCKED, LOCKED );
-         if( gotLock )
-          {    //run own MasterVP -- jmps to coreLoops startPt when done
-            currVP = _VMSMasterEnv->masterVPs[thisCoresIdx];
-            if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 10 )
-             {       DEBUG( dbgB2BMaster,"Many back to back MasterVPs\n");
-               pthread_yield();
-             }
-            _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1;
-            break;  //end while -- have a Slv to animate now
-          }
 
-         tries++;   //if too many, means master on other core taking too long
-         if( tries > MASTERLOCK_RETRIES ) { tries = 0; pthread_yield(); }
-       }
-            MEAS__Capture_Post_Master_Lock_Point;
-    }
+      switchToSlv(currVP); //Slave suspend makes core "return" from this call
+      flushRegisters();    //prevent GCC optimization from doing bad things 
 
-   
-   switchToSlv(currVP); //Slave suspend makes core "return" from this call
-   flushRegisters();
-   
-          MEAS__capture_end_susp_in_CoreLoop_forSys;
+             MEAS__Capture_End_Susp_in_CoreCtlr_ForSys;
           
-   }//CoreLoop      
+    }//while(1)
  }
 
 
 void *
-terminateCoreLoop(SlaveVP *currSlv){
+terminateCoreController(SlaveVP *currSlv)
+ {
    //first free shutdown Slv that jumped here -- it first restores the
    // coreloop's stack, so addr of currSlv in stack frame is still correct
    VMS_int__dissipate_slaveVP( currSlv );
    pthread_exit( NULL );
-}
+ }
 
 
 
 #ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE
 
 //===========================================================================
-/*This sequential version is exact same as threaded, except doesn't do the
- * pin-threads part, nor the wait until setup complete part.
+/*This sequential version does the same as threaded, except doesn't do the
+ * pin-threads part, nor the wait until setup complete and acquire master
+ * lock parts.
  */
 void *
-coreLoop_Seq( void *paramsIn )
+coreCtlr_Seq( void *paramsIn )
  {
-   SlaveVP      *currSlv;
-   VMSQueueStruc *readyToAnimateQ;
+   int32           thisCoresIdx;
+   int32           numRepetitionsWithNoWork;
+   SlaveVP        *currVP;
+   SchedSlot      *currSlot, **schedSlots;
+   int32           currSlotIdx;
+   int32          *addrOfMasterLock;
+   SlaveVP        *thisCoresMasterVP;
    
-   ThdParams      *coreLoopThdParams;
-   int             thisCoresIdx;
+   //===============  Initializations ===================
+   thisCoresIdx = 0; //sequential version -- must be set before used as index
+   schedSlots = _VMSMasterEnv->allSchedSlots[thisCoresIdx];
+   currSlotIdx = 0; //start at slot 0, go up until one empty, then do master
+   numRepetitionsWithNoWork = 0;
+   addrOfMasterLock = &(_VMSMasterEnv->masterLock);
+   thisCoresMasterVP = _VMSMasterEnv->masterVPs[thisCoresIdx];
    
-   coreLoopThdParams = (ThdParams *)paramsIn;
-//   thisCoresIdx = coreLoopThdParams->coreNum;
-   thisCoresIdx = 0;
 
-   //Save the return address in the SwitchSlv function
-   recordCoreLoopReturnLabelAddr(&(_VMSMasterEnv->coreLoopReturnPt));
+      //Assembly that saves addr of label of return instr -- label in assembly
+   recordCoreCtlrReturnLabelAddr((void**)&(_VMSMasterEnv->coreCtlrReturnPt));
 
    
-   while(1){
-      //Get virtual processor from queue
-      //_VMSWorkQ must be a global, static volatile var, so not kept in reg,
-      // which forces reloading the pointer after each jmp to this point
-   readyToAnimateQ  = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx];
-   currSlv = (SlaveVP *) readVMSQ( readyToAnimateQ );
-   if( currSlv == NULL )
-    { if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 )
-       { printf("too many back to back MasterVP\n"); exit(1); }
-      _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1;
-      
-      currSlv = _VMSMasterEnv->masterVPs[thisCoresIdx];
-    }
-   else
-      _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0;
+   //====================== The Core Controller ======================
+   while(1)
+    {
+      if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster;
+      currSlot = schedSlots[ currSlotIdx ];
 
+      if( ! currSlot->needsSlaveAssigned ) //slot does have slave assigned
+       { numRepetitionsWithNoWork = 0;     //reset B2B master count
+         currSlotIdx ++;
+         currVP = currSlot->slaveAssignedToSlot;
+       }
+      else //slot is empty, so switch to master
+       {
+       switchToMaster:
+         currSlotIdx = 0; //doing switch to master, so start over at slot 0
+         
+         currVP = thisCoresMasterVP;
+         
+               MEAS__Capture_Pre_Master_Lock_Point;  //back to back because
+               MEAS__Capture_Post_Master_Lock_Point; // sequential version
+         
+         if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD )
+          { printf("Lots of reps w/o work\n");
+            exit(1); //if no work, no way to ever get it in sequential! -- report as failure
+          }
+         numRepetitionsWithNoWork += 1;
+       }
 
-   switchToSlv( currSlv );
-   flushRegisters();
-   }
+      switchToSlv(currVP); //Slave suspend makes core "return" from this call
+      flushRegisters();    //prevent GCC optimization from doing bad things 
+
+             MEAS__Capture_End_Susp_in_CoreCtlr_ForSys;
+             
+    } //while(1)
  }
 #endif
diff -r 10a72bcedbf0 -r 712218cdc4ba Defines/VMS_defs__DEBUG.h
--- a/Defines/VMS_defs__DEBUG.h	Fri Mar 09 22:30:26 2012 -0800
+++ b/Defines/VMS_defs__DEBUG.h	Sat Mar 10 20:35:38 2012 -0800
@@ -13,16 +13,16 @@
 /*
  */
 #ifdef DEBUG__TURN_ON_DEBUG_MSGS
-   #define DEBUG(  bool, msg) \
-      if( bool){ printf(msg); fflush(stdin);}
-   #define DEBUG1( bool, msg, param)  \
-      if(bool){printf(msg, param); fflush(stdin);}
-   #define DEBUG2( bool, msg, p1, p2) \
-      if(bool) {printf(msg, p1, p2); fflush(stdin);}
+   #define DEBUG_Print(  bool, msg) \
+      if( bool){ printf(msg); fflush(stdout);} //flush the stream printf wrote to
+   #define DEBUG_Print1( bool, msg, param)  \
+      if(bool){printf(msg, param); fflush(stdout);}
+   #define DEBUG_Print2( bool, msg, p1, p2) \
+      if(bool) {printf(msg, p1, p2); fflush(stdout);}
 #else
-   #define DEBUG(  bool, msg)         
-   #define DEBUG1( bool, msg, param)  
-   #define DEBUG2( bool, msg, p1, p2) 
+   #define DEBUG_Print(  bool, msg)         
+   #define DEBUG_Print1( bool, msg, param)  
+   #define DEBUG_Print2( bool, msg, p1, p2) 
 #endif
 
 //============================= ERROR MSGs ============================
diff -r 10a72bcedbf0 -r 712218cdc4ba Defines/VMS_defs__HW_constants.h
--- a/Defines/VMS_defs__HW_constants.h	Fri Mar 09 22:30:26 2012 -0800
+++ b/Defines/VMS_defs__HW_constants.h	Sat Mar 10 20:35:38 2012 -0800
@@ -22,7 +22,8 @@
 
 #define MIN_WORK_UNIT_CYCLES 20000
 
-#define MASTERLOCK_RETRIES 100
+#define NUM_REPS_W_NO_WORK_BEFORE_YIELD 10
+#define MASTERLOCK_RETRIES_BEFORE_YIELD 100
 
    // stack size in virtual processors created
 #define VIRT_PROCR_STACK_SIZE 0x8000 /* 32K */
diff -r 10a72bcedbf0 -r 712218cdc4ba Defines/VMS_defs__MEAS.h
--- a/Defines/VMS_defs__MEAS.h	Fri Mar 09 22:30:26 2012 -0800
+++ b/Defines/VMS_defs__MEAS.h	Sat Mar 10 20:35:38 2012 -0800
@@ -270,7 +270,7 @@
     * Now, measures cycles from there to here
     * Master and Plugin will add this value to other trace-seg measures
     */
-   #define MEAS__capture_end_susp_in_CoreLoop_forSys\
+   #define MEAS__Capture_End_Susp_in_CoreCtlr_ForSys\
           saveTSCLowHigh(endSusp); \
           numCycles = endSusp.longVal - currVP->startSusp.longVal; \
           /*sanity check (400K is about 20K iters)*/ \
@@ -288,7 +288,7 @@
    #define MEAS__startMasterLoop_forSys 
    #define MEAS__startReqHdlr_forSys
    #define MEAS__endMasterLoop_forSys
-   #define MEAS__capture_end_susp_in_CoreLoop_forSys
+   #define MEAS__Capture_End_Susp_in_CoreCtlr_ForSys
    #define MEAS__Print_Hists_for_System_Meas 
 #endif
 
diff -r 10a72bcedbf0 -r 712218cdc4ba Defines/VMS_defs__turn_on_and_off.h
--- a/Defines/VMS_defs__turn_on_and_off.h	Fri Mar 09 22:30:26 2012 -0800
+++ b/Defines/VMS_defs__turn_on_and_off.h	Sat Mar 10 20:35:38 2012 -0800
@@ -21,7 +21,7 @@
 /*turns on the probe-instrumentation in the application -- when not
  * defined, the calls to the probe functions turn into comments
  */
-//#define DEBUG__TURN_ON_DEBUG_MSGS
+#define DEBUG__TURN_ON_DEBUG_MSGS
 //#define DEBUG__TURN_ON_ERROR_MSGS
 
 /*These defines turn types of bug messages on and off
diff -r 10a72bcedbf0 -r 712218cdc4ba MasterLoop.c
--- a/MasterLoop.c	Fri Mar 09 22:30:26 2012 -0800
+++ b/MasterLoop.c	Sat Mar 10 20:35:38 2012 -0800
@@ -41,7 +41,7 @@
  *
  */
 
-/*May 29, 2010 -- birth a Master during init so that first core loop to
+/*May 29, 2010 -- birth a Master during init so that first core controller to
  * start running gets it and does all the stuff for a newly born --
  * from then on, will be doing continuation, but do suspension self
  * directly at end of master loop
@@ -49,8 +49,8 @@
  * all the others -- then does any extra setup needed and puts it into the
  * work queue.
  *However means have to make masterEnv a global static volatile the same way
- * did with readyToAnimateQ in core loop.  -- for performance, put the
- * jump to the core loop directly in here, and have it directly jump back.
+ * did with readyToAnimateQ in core controller.  -- for performance, put the
+ * jump to the core controller directly in here, and have it directly jump back.
  *
  *
  *Aug 18, 2010 -- Going to a separate MasterVP for each core, to see if this
@@ -62,7 +62,7 @@
  * animates this function has a different one.
  *
  *At this point, the masterLoop does not write itself into the queue anymore,
- * instead, the coreLoop acquires the masterLock when it has nothing to
+ * instead, the coreCtlr acquires the masterLock when it has nothing to
  * animate, and then animates its own masterLoop.  However, still try to put
  * several AppSlvs into the queue to amortize the startup cost of switching
  * to the MasterVP.  Note, don't have to worry about latency of requests much
@@ -164,7 +164,7 @@
 
          MEAS__Capture_Post_Master_Point;
    
-   masterSwitchToCoreLoop(animatingSlv);
+   masterSwitchToCoreCtlr(animatingSlv);
    flushRegisters();
    }//MasterLoop
 
@@ -260,7 +260,7 @@
  *Note, have single-reader, single-writer pattern for all variables used to
  * communicate between stealer and victims
  *
- *So, scan the queues of the core loops, until find non-empty.  Each core
+ *So, scan the queues of the core controllers, until find non-empty.  Each core
  * has its own list that it scans.  The list goes in order from closest to
  * furthest core, so it steals first from close cores.  Later can add
  * taking info from the app about overlapping footprints, and scan all the
@@ -316,7 +316,7 @@
       //have a victim core, now get the stealer-lock
    gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock),
                                                           UNLOCKED, LOCKED );
-   if( !gotLock ) return; //go back to core loop, which will re-start master
+   if( !gotLock ) return; //go back to core controller, which will re-start master
 
 
    //====== Start Gate-protection =======
diff -r 10a72bcedbf0 -r 712218cdc4ba Probes/probes.c
--- a/Probes/probes.c	Fri Mar 09 22:30:26 2012 -0800
+++ b/Probes/probes.c	Sat Mar 10 20:35:38 2012 -0800
@@ -199,7 +199,7 @@
 VMS_impl__record_interval_start_in_probe( int32 probeID )
  { IntervalProbe *probe;
 
-         DEBUG( dbgProbes, "record start of interval\n" )
+         DEBUG_Print( dbgProbes, "record start of interval\n" )
    probe = _VMSMasterEnv->intervalProbes[ probeID ];
 
       //record *start* point as last thing, after lookup
@@ -253,7 +253,7 @@
 
 #endif
    
-         DEBUG( dbgProbes, "record end of interval\n" )
+         DEBUG_Print( dbgProbes, "record end of interval\n" )
  }
 
 
diff -r 10a72bcedbf0 -r 712218cdc4ba VMS.h
--- a/VMS.h	Fri Mar 09 22:30:26 2012 -0800
+++ b/VMS.h	Sat Mar 10 20:35:38 2012 -0800
@@ -52,7 +52,8 @@
 
 //============================ HW Dependent Fns ================================
 
-#include "VMS__HW_dependent.h"
+#include "Hardware_Dependent/VMS__HW_measurement.h"
+#include "Hardware_Dependent/VMS__primitives.h"
 
 //============================= Statistics ==================================
 
@@ -114,9 +115,9 @@
    void       *framePtr;
    void       *resumeInstrPtr;
    
-   void       *coreLoopStartPt;  //allows proto-runtime to be linked later
-   void       *coreLoopFramePtr; //restore before jmp back to core loop
-   void       *coreLoopStackPtr; //restore before jmp back to core loop
+   void       *coreCtlrStartPt;  //allows proto-runtime to be linked later
+   void       *coreCtlrFramePtr; //restore before jmp back to core controller
+   void       *coreCtlrStackPtr; //restore before jmp back to core controller
 
    SchedSlot  *schedSlot;
    VMSReqst   *requests;
@@ -151,7 +152,7 @@
    MallocArrays   *freeLists;
    int32            amtOfOutstandingMem; //total currently allocated
 
-   void            *coreLoopReturnPt;//addr to jump to to re-enter coreLoop
+   void            *coreCtlrReturnPt;//addr to jump to to re-enter coreCtlr
 
    int32            setupComplete;
    int32            numMasterInARow[NUM_CORES];//detect back-to-back masterVP
@@ -198,8 +199,8 @@
 
 //=======================  OS Thread related  ===============================
 
-void * coreLoop( void *paramsIn );  //standard PThreads fn prototype
-void * coreLoop_Seq( void *paramsIn );  //standard PThreads fn prototype
+void * coreController( void *paramsIn );  //standard PThreads fn prototype
+void * coreCtlr_Seq( void *paramsIn );  //standard PThreads fn prototype
 void masterLoop( void *initData, SlaveVP *masterVP );
 
 
@@ -214,8 +215,8 @@
 
 volatile MasterEnv      *_VMSMasterEnv __align_to_cacheline__;
 
-pthread_t       coreLoopThdHandles[ NUM_CORES ];  //pthread's virt-procr state
-ThdParams      *coreLoopThdParams [ NUM_CORES ];
+pthread_t       coreCtlrThdHandles[ NUM_CORES ];  //pthread's virt-procr state
+ThdParams      *coreCtlrThdParams [ NUM_CORES ];
 pthread_mutex_t suspendLock   = PTHREAD_MUTEX_INITIALIZER;
 pthread_cond_t  suspendCond  = PTHREAD_COND_INITIALIZER;
 
diff -r 10a72bcedbf0 -r 712218cdc4ba VMS__WL.c
--- a/VMS__WL.c	Fri Mar 09 22:30:26 2012 -0800
+++ b/VMS__WL.c	Sat Mar 10 20:35:38 2012 -0800
@@ -47,7 +47,7 @@
  * does the work of freeing memory and removing the processor from the
  * semantic environment's data structures.
  *The request handler also is what figures out when to shutdown the VMS
- * system -- which causes all the core loop threads to die, and returns from
+ * system -- which causes all the core controller threads to die, and returns from
  * the call that started up VMS to perform the work.
  *
  *This form is a bit misleading to understand if one is trying to figure out
diff -r 10a72bcedbf0 -r 712218cdc4ba VMS__int.c
--- a/VMS__int.c	Fri Mar 09 22:30:26 2012 -0800
+++ b/VMS__int.c	Sat Mar 10 20:35:38 2012 -0800
@@ -53,7 +53,7 @@
 /*there is a label inside this function -- save the addr of this label in
  * the callingSlv struc, as the pick-up point from which to start the next
  * work-unit for that slave.  If turns out have to save registers, then
- * save them in the slave struc too.  Then do assembly jump to the CoreLoop's
+ * save them in the slave struc too.  Then do assembly jump to the CoreCtlr's
  * "done with work-unit" label.  The slave struc is in the request in the
  * slave that animated the just-ended work-unit, so all the state is saved
  * there, and will get passed along, inside the request handler, to the
@@ -64,7 +64,7 @@
  { 
 
       //The request to master will cause this suspended Slv to get
-      // scheduled again at some future point -- to resume, core loop jumps
+      // scheduled again at some future point -- to resume, core ctlr jumps
       // to the resume point (below), which causes restore of saved regs and
       // "return" from this call.
    //animatingSlv->resumeInstrPtr = &&ResumePt;
@@ -73,7 +73,7 @@
    animatingSlv->schedSlot->workIsDone = TRUE;
 
          MEAS__Capture_Pre_Susp_Point;
-   switchToCoreLoop(animatingSlv);
+   switchToCoreCtlr(animatingSlv);
    flushRegisters();
          MEAS__Capture_Post_Susp_Point;
 		 
diff -r 10a72bcedbf0 -r 712218cdc4ba VMS__startup_and_shutdown.c
--- a/VMS__startup_and_shutdown.c	Fri Mar 09 22:30:26 2012 -0800
+++ b/VMS__startup_and_shutdown.c	Sat Mar 10 20:35:38 2012 -0800
@@ -27,7 +27,7 @@
 create_masterEnv();
 
 void
-create_the_coreLoop_OS_threads();
+create_the_coreCtlr_OS_threads();
 
 MallocProlog *
 create_free_list();
@@ -74,7 +74,7 @@
       flushRegisters();  //? not sure why here -- merten added it..?
    #else
       create_masterEnv();
-      create_the_coreLoop_OS_threads();
+      create_the_coreCtlr_OS_threads();
    #endif
  }
 
@@ -155,7 +155,7 @@
    #ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE
       //Nothing else to create for sequential mode
    #else
-      create_the_coreLoop_OS_threads();
+      create_the_coreCtlr_OS_threads();
    #endif    
  }
 */
@@ -277,7 +277,7 @@
    
    masterEnv     = (MasterEnv*)_VMSMasterEnv;
    
-      //Make a readyToAnimateQ for each core loop
+      //Make a readyToAnimateQ for each core controller
    readyToAnimateQs = VMS_int__malloc( NUM_CORES * sizeof(VMSQueueStruc *) );
    masterVPs        = VMS_int__malloc( NUM_CORES * sizeof(SlaveVP *) );
 
@@ -359,7 +359,7 @@
 
 
 void
-create_the_coreLoop_OS_threads()
+create_the_coreCtlr_OS_threads()
  {
    //========================================================================
    //                      Create the Threads
@@ -367,19 +367,19 @@
 
       //Need the threads to be created suspended, and wait for a signal
       // before proceeding -- gives time after creating to initialize other
-      // stuff before the coreLoops set off.
+      // stuff before the coreCtlrs set off.
    _VMSMasterEnv->setupComplete = 0;
 
-      //Make the threads that animate the core loops
+      //Make the threads that animate the core controllers
    for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ )
-    { coreLoopThdParams[coreIdx]          = VMS_int__malloc( sizeof(ThdParams) );
-      coreLoopThdParams[coreIdx]->coreNum = coreIdx;
+    { coreCtlrThdParams[coreIdx]          = VMS_int__malloc( sizeof(ThdParams) );
+      coreCtlrThdParams[coreIdx]->coreNum = coreIdx;
 
       retCode =
-      pthread_create( &(coreLoopThdHandles[coreIdx]),
+      pthread_create( &(coreCtlrThdHandles[coreIdx]),
                         thdAttrs,
-                       &coreLoop,
-               (void *)(coreLoopThdParams[coreIdx]) );
+                       &coreController,
+               (void *)(coreCtlrThdParams[coreIdx]) );
       if(retCode){printf("ERROR creating thread: %d\n", retCode); exit(1);}
     }
  }
@@ -412,17 +412,17 @@
  { 
 #ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE
    /*Only difference between version with an OS thread pinned to each core and
-    * the sequential version of VMS is VMS__init_Seq, this, and coreLoop_Seq.
+    * the sequential version of VMS is VMS__init_Seq, this, and coreCtlr_Seq.
     */
          //Instead of un-suspending threads, just call the one and only
-         // core loop (sequential version), in the main thread.
-      coreLoop_Seq( NULL );
+         // core ctlr (sequential version), in the main thread.
+      coreCtlr_Seq( NULL );
       flushRegisters();
 #else
    int coreIdx;
-      //Start the core loops running
+      //Start the core controllers running
    
-      //tell the core loop threads that setup is complete
+      //tell the core controller threads that setup is complete
       //get lock, to lock out any threads still starting up -- they'll see
       // that setupComplete is true before entering while loop, and so never
       // wait on the condition
@@ -435,7 +435,7 @@
       //wait for all to complete
    for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ )
     {
-      pthread_join( coreLoopThdHandles[coreIdx], NULL );
+      pthread_join( coreCtlrThdHandles[coreIdx], NULL );
     }
    
       //NOTE: do not clean up VMS env here -- semantic layer has to have
@@ -454,7 +454,7 @@
 
 
 /*This is called by the semantic layer's request handler when it decides its
- * time to shut down the VMS system.  Calling this causes the core loop OS
+ * time to shut down the VMS system.  Calling this causes the core controller OS
  * threads to exit, which unblocks the entry-point function that started up
  * VMS, and allows it to grab the result and return to the original single-
  * threaded application.
@@ -469,7 +469,7 @@
  * locations it needs, and give ownership to masterVP.  Then, they will be
  * automatically freed.
  *
- *In here,create one core-loop shut-down processor for each core loop and put
+ *In here, create one shut-down processor for each core controller and put
  * them all directly into the readyToAnimateQ.
  *Note, this function can ONLY be called after the semantic environment no
  * longer cares if AppSlvs get animated after the point this is called.  In
@@ -482,7 +482,7 @@
  { int coreIdx;
    SlaveVP *shutDownSlv;
 
-      //create the shutdown processors, one for each core loop -- put them
+      //create the shutdown processors, one for each core controller -- put them
       // directly into the Q -- each core will die when gets one
    for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ )
     {    //Note, this is running in the master
@@ -492,26 +492,26 @@
  }
 
 
-/*Am trying to be cute, avoiding IF statement in coreLoop that checks for
+/*Am trying to be cute, avoiding IF statement in coreCtlr that checks for
  * a special shutdown slaveVP.  Ended up with extra-complex shutdown sequence.
  *This function has the sole purpose of setting the stack and framePtr
- * to the coreLoop's stack and framePtr.. it does that then jumps to the
- * core loop's shutdown point -- might be able to just call Pthread_exit
+ * to the coreCtlr's stack and framePtr.. it does that then jumps to the
+ * core ctlr's shutdown point -- might be able to just call Pthread_exit
  * from here, but am going back to the pthread's stack and setting everything
  * up just as if it never jumped out, before calling pthread_exit.
- *The end-point of core loop will free the stack and so forth of the
+ *The end-point of core ctlr will free the stack and so forth of the
  * processor that animates this function, (this fn is transfering the
  * animator of the AppSlv that is in turn animating this function over
- * to core loop function -- note that this slices out a level of virtual
+ * to core controller function -- note that this slices out a level of virtual
  * processors).
  */
 void
 endOSThreadFn( void *initData, SlaveVP *animatingSlv )
  { 
    #ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE
-    asmTerminateCoreLoopSeq(animatingSlv);
+    asmTerminateCoreCtlrSeq(animatingSlv);
    #else
-    asmTerminateCoreLoop(animatingSlv);
+    asmTerminateCoreCtlr(animatingSlv);
    #endif
  }