# HG changeset patch # User Some Random Person # Date 1331658126 25200 # Node ID c88ce1db91ef3c39f00a390d13f29d754876aca3 # Parent 8059fb8d5465aba70aa50a537b60d55f25b8d3f8 Compiles, but does not run properly -- and changed MasterLoop to SchedulingMaster diff -r 8059fb8d5465 -r c88ce1db91ef CoreController.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CoreController.c Tue Mar 13 10:02:06 2012 -0700 @@ -0,0 +1,333 @@ +/* + * Copyright 2010 OpenSourceStewardshipFoundation + * + * Licensed under BSD + */ + + +#include "VMS.h" + +#include +#include +#include + +#include +#include + +//===================== Functions local to this file ======================= +void *terminateCoreController(SlaveVP *currSlv); +inline void +doBackoff_for_TooLongToGetLock( int32 numTriesToGetLock, uint32 *seed1, + uint32 *seed2 ); +inline void +doBackoff_for_TooLongWithNoWork( int32 numRepsWithNoWork, uint32 *seed1, + uint32 *seed2 ); + +//=========================================================================== + + +/*The Core Controller is logically "beneath" the masterVP and slave VPs. Its + * job is to control which of those VPs the core animates. Any time one of + * those VPs suspends, the suspend-primitive switches the core over to + * animating the core controller. The core controller then follows a very + * basic pattern to choose which VP will get animated next, then switches + * the core over to animating that VP. So, all VPs switch the core to + * core controller, which then chooses which VP the core animates next. + * + *The way the core controller decides which VP to switch the core to next is: + * 1) There are a number of "scheduling slots", which the master VP fills up + * with slave VPs that are ready to be animated. So, the core controller + * just iterates through the scheduling slots. When the next slot has a + * slave VP in it, the core controller switches the core over to animate + * that slave. 
+ * 2) When the core controller checks a scheduling slot, and it's empty, + * then the controller switches the core over to animating the master VP, + * whose job is to find more slave VPs ready, and assign those to + * scheduling slots. + * + *So, in effect, a scheduling slot functions as another layer of virtual + * processor. A slot has the logical meaning of being an animator that + * animates the slave assigned to it. However, the core controller sits + * below the slots, and sequences down them, assigning the actual physical + * core to each slot, in turn. + *The reason for having the scheduling slots and core controller is to + * amortize the overhead of switching to the master VP and running it. With + * multiple scheduling slots, the time to switch-to-master and the code in + * the master loop is divided by the number of scheduling slots. + *The core controller and scheduling slots are not fundamental parts of VMS, + * but rather optimizations put into the shared-semantic-state version of + * VMS. Other versions of VMS will not have a core controller nor scheduling + * slots. + * + *The core controller "owns" the physical core, in effect, and is the + * function given to the pthread creation call. Hence, it contains code + * related to pthread startup, synchronizing the controllers to all start + * at the same time-point, and pinning the pthreads to physical cores. 
+ * + */ +void * +coreController( void *paramsIn ) + { + int32 thisCoresIdx; + int32 numRepetitionsWithNoWork; + SlaveVP *currVP; + SchedSlot *currSlot, **schedSlots; + int32 currSlotIdx; + volatile int32 *addrOfMasterLock; //thing pointed to is volatile, not ptr + SlaveVP *thisCoresMasterVP; + //Variables used for pthread related things + ThdParams *coreCtlrThdParams; + cpu_set_t coreMask; //used during pinning pthread to CPU core + int32 errorCode; + //Variables used during measurements + TSCountLowHigh endSusp; + //Variables used in random-backoff, for master-lock and waiting for work + uint32_t seed1 = rand()%1000; // init random number generator for retries + uint32_t seed2 = rand()%1000; + //Variable for work-stealing -- a gate protects a critical section + volatile GateStruc gate; //on stack to avoid false-sharing + + + //=============== Initializations =================== + coreCtlrThdParams = (ThdParams *)paramsIn; + thisCoresIdx = coreCtlrThdParams->coreNum; + + gate.gateClosed = FALSE; + gate.preGateProgress = 0; + gate.waitProgress = 0; + gate.exitProgress = 0; + //TODO: pad these to prevent false-sharing, and fix the race at startup + _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = (GateStruc*)&gate; + + //Assembly that saves addr of label of return instr -- label in assmbly + recordCoreCtlrReturnLabelAddr((void**)&(_VMSMasterEnv->coreCtlrReturnPt)); + + schedSlots = _VMSMasterEnv->allSchedSlots[thisCoresIdx]; + currSlotIdx = 0; //start at slot 0, go up until one empty, then do master + numRepetitionsWithNoWork = 0; + addrOfMasterLock = &(_VMSMasterEnv->masterLock); + thisCoresMasterVP = _VMSMasterEnv->masterVPs[thisCoresIdx]; + + //==================== pthread related stuff ====================== + //pin the pthread to the core + //Linux requires pinning to be done inside the thread-function + //Designate a core by a 1 in bit-position corresponding to the core + CPU_ZERO(&coreMask); //initialize mask bits to zero + 
CPU_SET(coreCtlrThdParams->coreNum,&coreMask); //set bit repr the coreNum + pthread_t selfThd = pthread_self(); + errorCode = + pthread_setaffinity_np( selfThd, sizeof(coreMask), &coreMask); + if(errorCode){ printf("\n pinning thd to core failed \n"); exit(0); } + + //make sure the controllers all start at same time, by making them wait + pthread_mutex_lock( &suspendLock ); + while( !(_VMSMasterEnv->setupComplete) ) + { pthread_cond_wait( &suspendCond, &suspendLock ); + } + pthread_mutex_unlock( &suspendLock ); + + //====================== The Core Controller ====================== + while(1) //An endless loop is just one way of doing the control structure + { //Assembly code switches the core between animating a VP and + // animating this core controller. The switch is done by + // changing the stack-pointer and frame-pointer and then doing + // an assembly jmp. When reading this code, the effect is + // that the "switchToSlv()" at the end of the loop is sort of a + // "warp in time" -- the core disappears inside this, jmps to + // animating a VP, and when that VP suspends, the suspend + // jmps back. This has the effect of "returning" from the + // switchToSlv() call. Then control loops back to here. + //Alternatively, the VP suspend primitive could just not bother + // returning from switchToSlv, and instead jmp directly to here. + + if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster; + currSlot = schedSlots[ currSlotIdx ]; + + + if( ! 
currSlot->needsSlaveAssigned ) //slot does have slave assigned + { numRepetitionsWithNoWork = 0; //reset B2B master count + currSlotIdx ++; + currVP = currSlot->slaveAssignedToSlot; + } + else //slot is empty, so switch to master + { + switchToMaster: + currSlotIdx = 0; //doing switch to master, so start over at slot 0 + currVP = NULL; + + MEAS__Capture_Pre_Master_Lock_Point; + + int numTriesToGetLock = 0; int gotLock = 0; + while( currVP == NULL ) //keep going until get master lock + { + //At this point, first thing to do is get lock. But, want to + // reduce lock contention from cores with no work, so first + // check if this is a core with no work, and busy wait if so. + //Then, if it's been way too long without work, yield pthread + if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_BACKOFF) + doBackoff_for_TooLongWithNoWork( numRepetitionsWithNoWork, &seed1, &seed2 ); + if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD ) + { numRepetitionsWithNoWork = 0; pthread_yield(); } + + + //Now, try to get the lock + gotLock = __sync_bool_compare_and_swap( addrOfMasterLock, + UNLOCKED, LOCKED ); + if( gotLock ) + { //At this point, have run out of slaves, so tried to get + // the master lock, and have successfully gotten it. + //So, set the currVP to this core's masterVP and break out + // of the get-lock loop. Below, assembly code will switch + // the core over to animating the masterVP. 
When it's + // done, the masterVP will use assembly to switch the core + // back to animating this core controller + currVP = thisCoresMasterVP; + numRepetitionsWithNoWork += 1; + break; //end while -- have a VP to animate now + } + //Get here only when failed to get lock + + numTriesToGetLock++; //if too many, means too much contention + if( numTriesToGetLock > NUM_TRIES_BEFORE_DO_BACKOFF ) + doBackoff_for_TooLongToGetLock( numTriesToGetLock, &seed1, &seed2 ); + if( numTriesToGetLock > MASTERLOCK_RETRIES_BEFORE_YIELD ) + { numTriesToGetLock = 0; pthread_yield(); } + } + MEAS__Capture_Post_Master_Lock_Point; + } + + + switchToSlv(currVP); //Slave suspend makes core "return" from this call + flushRegisters(); //prevent GCC optimization from doing bad things + + MEAS__Capture_End_Susp_in_CoreCtlr_ForSys; + + }//while(1) + } + + +void * +terminateCoreCtlr(SlaveVP *currSlv) + { + //first free shutdown Slv that jumped here -- it first restores the + // coreloop's stack, so addr of currSlv in stack frame is still correct + VMS_int__dissipate_slaveVP( currSlv ); + pthread_exit( NULL ); + } + + +/*Used by the backoff to pick a random amount of busy-wait. Can't use the + * system rand because it takes much too long. 
+ *Note, are passing pointers to the seeds, which are then modified + */ +inline uint32_t +randomNumber(uint32_t* seed1, uint32_t* seed2) + { + *seed1 = 36969 * (*seed1 & 65535) + (*seed1 >> 16); + *seed2 = 18000 * (*seed2 & 65535) + (*seed2 >> 16); + return (*seed1 << 16) + *seed2; + } + +/*Busy-wait for a random number of cycles -- chooses number of cycles + * differently than for the too-many-tries-to-get-lock backoff + */ +inline void +doBackoff_for_TooLongWithNoWork( int32 numRepsWithNoWork, uint32 *seed1, + uint32 *seed2 ) + { int32 i, waitIterations; + volatile double fakeWorkVar; //busy-wait fake work + + waitIterations = + randomNumber(seed1, seed2) % + (numRepsWithNoWork * numRepsWithNoWork * NUM_CORES); + for( i = 0; i < waitIterations; i++ ) + { fakeWorkVar += (fakeWorkVar + 32.0) / 2.0; //busy-wait + } + } + +/*Busy-waits for a random number of cycles -- chooses number of cycles + * differently than for the no-work backoff + */ +inline void +doBackoff_for_TooLongToGetLock( int32 numTriesToGetLock, uint32 *seed1, + uint32 *seed2 ) + { int32 i, waitIterations; + volatile double fakeWorkVar; //busy-wait fake work + + waitIterations = + randomNumber(seed1, seed2) % + (numTriesToGetLock * NUM_TRIES_TO_GET_LOCK_BACKOFF_WEIGHT); + //addToHist( wait_iterations, coreLoopThdParams->wait_iterations_hist ); + for( i = 0; i < waitIterations; i++ ) + { fakeWorkVar += (fakeWorkVar + 32.0) / 2.0; //busy-wait + } + } + + +#ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE + +//=========================================================================== +/*This sequential version does the same as threaded, except doesn't do the + * pin-threads part, nor the wait until setup complete and acquire master + * lock parts. 
+ */ +void * +coreCtlr_Seq( void *paramsIn ) + { + int32 thisCoresIdx; + int32 numRepetitionsWithNoWork; + SlaveVP *currVP; + SchedSlot *currSlot, **schedSlots; + int32 currSlotIdx; + int32 *addrOfMasterLock; + SlaveVP *thisCoresMasterVP; + + //=============== Initializations =================== + schedSlots = _VMSMasterEnv->allSchedSlots[thisCoresIdx]; + currSlotIdx = 0; //start at slot 0, go up until one empty, then do master + numRepetitionsWithNoWork = 0; + addrOfMasterLock = &(_VMSMasterEnv->masterLock); + thisCoresMasterVP = _VMSMasterEnv->masterVPs[thisCoresIdx]; + + thisCoresIdx = 0; //sequential version + + //Assembly that saves addr of label of return instr -- label in assmbly + recordCoreCtlrReturnLabelAddr((void**)&(_VMSMasterEnv->coreCtlrReturnPt)); + + + //====================== The Core Controller ====================== + while(1) + { + if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster; + currSlot = schedSlots[ currSlotIdx ]; + + if( ! currSlot->needsSlaveAssigned ) //slot does have slave assigned + { numRepetitionsWithNoWork = 0; //reset B2B master count + currSlotIdx ++; + currVP = currSlot->slaveAssignedToSlot; + } + else //slot is empty, so switch to master + { + switchToMaster: + currSlotIdx = 0; //doing switch to master, so start over at slot 0 + + currVP = thisCoresMasterVP; + + MEAS__Capture_Pre_Master_Lock_Point; //back to back because + MEAS__Capture_Post_Master_Lock_Point; // sequential version + + if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD ) + { printf("Lots of reps w/o work\n"); + exit(0); //if no work, no way to ever get it in sequential! 
+ } + numRepetitionsWithNoWork += 1; + } + + switchToSlv(currVP); //Slave suspend makes core "return" from this call + flushRegisters(); //prevent GCC optimization from doing bad things + + MEAS__Capture_End_Susp_in_CoreCtlr_ForSys; + + } //while(1) + } +#endif diff -r 8059fb8d5465 -r c88ce1db91ef CoreLoop.c --- a/CoreLoop.c Mon Mar 12 05:38:07 2012 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,332 +0,0 @@ -/* - * Copyright 2010 OpenSourceStewardshipFoundation - * - * Licensed under BSD - */ - - -#include "VMS.h" - -#include -#include -#include - -#include -#include - -//===================== Functions local to this file ======================= -void *terminateCoreController(SlaveVP *currSlv); -inline void -doBackoff_for_TooLongToGetLock( int32 numTriesToGetLock, uint32 *seed1, - uint32 *seed2 ); -inline void -doBackoff_for_TooLongWithNoWork( int32 numRepsWithNoWork, uint32 *seed1, - uint32 *seed2 ); -//=========================================================================== - - -/*The Core Controller is logically "beneath" the masterVP and slave VPs. Its - * job is to control which of those VPs the core animates. Any time one of - * those VPs suspends, the suspend-primitive switches the core over to - * animating the core controller. The core controller then follows a very - * basic pattern to choose which VP will get animated next, then switches - * the core over to animating that VP. So, all VPs switch the core to - * core controller, which then chooses which VP the core animates next. - * - *The way the core controller decides which VP to switch the core to next is: - * 1) There are a number of "scheduling slots", which the master VP fills up - * with slave VPs that are ready to be animated. So, the core controller - * just iterates through the scheduling slots. When the next slot has a - * slave VP in it, the core controller switches the core over to animate - * that slave. 
- * 2) When the core controller checks a scheduling slot, and it's empty, - * then the controller switches the core over to animating the master VP, - * whose job is to find more slave VPs ready, and assign those to - * scheduling slots. - * - *So, in effect, a scheduling slot functions as another layer of virtual - * processor. A slot has the logical meaning of being an animator that - * animates the slave assigned to it. However, the core controller sits - * below the slots, and sequences down them, assigning the actual physical - * core to each slot, in turn. - *The reason for having the scheduling slots and core controller is to - * amortize the overhead of switching to the master VP and running it. With - * multiple scheduling slots, the time to switch-to-master and the code in - * the master loop is divided by the number of scheduling slots. - *The core controller and scheduling slots are not fundamental parts of VMS, - * but rather optimizations put into the shared-semantic-state version of - * VMS. Other versions of VMS will not have a core controller nor scheduling - * slots. - * - *The core controller "owns" the physical core, in effect, and is the - * function given to the pthread creation call. Hence, it contains code - * related to pthread startup, synchronizing the controllers to all start - * at the same time-point, and pinning the pthreads to physical cores. 
- * - */ -void * -coreController( void *paramsIn ) - { - int32 thisCoresIdx; - int32 numRepetitionsWithNoWork; - SlaveVP *currVP; - SchedSlot *currSlot, **schedSlots; - int32 currSlotIdx; - volatile int32 *addrOfMasterLock; //thing pointed to is volatile, not ptr - SlaveVP *thisCoresMasterVP; - //Variables used for pthread related things - ThdParams *coreCtlrThdParams; - cpu_set_t coreMask; //used during pinning pthread to CPU core - int32 errorCode; - //Variables used during measurements - TSCountLowHigh endSusp; - //Variables used in random-backoff, for master-lock and waiting for work - uint32_t seed1 = rand()%1000; // init random number generator for retries - uint32_t seed2 = rand()%1000; - //Variable for work-stealing -- a gate protects a critical section - volatile GateStruc gate; //on stack to avoid false-sharing - - - //=============== Initializations =================== - coreCtlrThdParams = (ThdParams *)paramsIn; - thisCoresIdx = coreCtlrThdParams->coreNum; - - gate.gateClosed = FALSE; - gate.preGateProgress = 0; - gate.waitProgress = 0; - gate.exitProgress = 0; - //TODO: pad these to prevent false-sharing, and fix the race at startup - _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = (GateStruc*)&gate; - - //Assembly that saves addr of label of return instr -- label in assmbly - recordCoreCtlrReturnLabelAddr((void**)&(_VMSMasterEnv->coreCtlrReturnPt)); - - schedSlots = _VMSMasterEnv->allSchedSlots[thisCoresIdx]; - currSlotIdx = 0; //start at slot 0, go up until one empty, then do master - numRepetitionsWithNoWork = 0; - addrOfMasterLock = &(_VMSMasterEnv->masterLock); - thisCoresMasterVP = _VMSMasterEnv->masterVPs[thisCoresIdx]; - - //==================== pthread related stuff ====================== - //pin the pthread to the core - //Linux requires pinning to be done inside the thread-function - //Designate a core by a 1 in bit-position corresponding to the core - CPU_ZERO(&coreMask); //initialize mask bits to zero - 
CPU_SET(coreCtlrThdParams->coreNum,&coreMask); //set bit repr the coreNum - pthread_t selfThd = pthread_self(); - errorCode = - pthread_setaffinity_np( selfThd, sizeof(coreMask), &coreMask); - if(errorCode){ printf("\n pinning thd to core failed \n"); exit(0); } - - //make sure the controllers all start at same time, by making them wait - pthread_mutex_lock( &suspendLock ); - while( !(_VMSMasterEnv->setupComplete) ) - { pthread_cond_wait( &suspendCond, &suspendLock ); - } - pthread_mutex_unlock( &suspendLock ); - - //====================== The Core Controller ====================== - while(1) //An endless loop is just one way of doing the control structure - { //Assembly code switches the core between animating a VP and - // animating this core controller. The switch is done by - // changing the stack-pointer and frame-pointer and then doing - // an assembly jmp. When reading this code, the effect is - // that the "switchToSlv()" at the end of the loop is sort of a - // "warp in time" -- the core disappears inside this, jmps to - // animating a VP, and when that VP suspends, the suspend - // jmps back. This has the effect of "returning" from the - // switchToSlv() call. Then control loops back to here. - //Alternatively, the VP suspend primitive could just not bother - // returning from switchToSlv, and instead jmp directly to here. - - if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster; - currSlot = schedSlots[ currSlotIdx ]; - - - if( ! 
currSlot->needsSlaveAssigned ) //slot does have slave assigned - { numRepetitionsWithNoWork = 0; //reset B2B master count - currSlotIdx ++; - currVP = currSlot->slaveAssignedToSlot; - } - else //slot is empty, so switch to master - { - switchToMaster: - currSlotIdx = 0; //doing switch to master, so start over at slot 0 - currVP = NULL; - - MEAS__Capture_Pre_Master_Lock_Point; - - int numTriesToGetLock = 0; int gotLock = 0; - while( currVP == NULL ) //keep going until get master lock - { - //At this point, first thing to do is get lock. But, want to - // reduce lock contention from cores with no work, so first - // check if this is a core with no work, and busy wait if so. - //Then, if it's been way too long without work, yield pthread - if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_BACKOFF) - doBackoff_for_TooLongWithNoWork( numRepetitionsWithNoWork, &seed1, &seed2 ); - if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD ) - { numRepetitionsWithNoWork = 0; pthread_yield(); } - - - //Now, try to get the lock - gotLock = __sync_bool_compare_and_swap( addrOfMasterLock, - UNLOCKED, LOCKED ); - if( gotLock ) - { //At this point, have run out of slaves, so tried to get - // the master lock, and have successfully gotten it. - //So, set the currVP to this core's masterVP and break out - // of the get-lock loop. Below, assembly code will switch - // the core over to animating the masterVP. 
When it's - // done, the masterVP will use assembly to switch the core - // back to animating this core controller - currVP = thisCoresMasterVP; - numRepetitionsWithNoWork += 1; - break; //end while -- have a VP to animate now - } - //Get here only when failed to get lock - - numTriesToGetLock++; //if too many, means too much contention - if( numTriesToGetLock > NUM_TRIES_BEFORE_DO_BACKOFF ) - doBackoff_for_TooLongToGetLock( numTriesToGetLock, &seed1, &seed2 ); - if( numTriesToGetLock > MASTERLOCK_RETRIES_BEFORE_YIELD ) - { numTriesToGetLock = 0; pthread_yield(); } - } - MEAS__Capture_Post_Master_Lock_Point; - } - - - switchToSlv(currVP); //Slave suspend makes core "return" from this call - flushRegisters(); //prevent GCC optimization from doing bad things - - MEAS__Capture_End_Susp_in_CoreCtlr_ForSys; - - }//while(1) - } - - -void * -terminateCoreController(SlaveVP *currSlv) - { - //first free shutdown Slv that jumped here -- it first restores the - // coreloop's stack, so addr of currSlv in stack frame is still correct - VMS_int__dissipate_slaveVP( currSlv ); - pthread_exit( NULL ); - } - - -/*Used by the backoff to pick a random amount of busy-wait. Can't use the - * system rand because it takes much too long. 
- *Note, are passing pointers to the seeds, which are then modified - */ -inline uint32_t -randomNumber(uint32_t* seed1, uint32_t* seed2) - { - *seed1 = 36969 * (*seed1 & 65535) + (*seed1 >> 16); - *seed2 = 18000 * (*seed2 & 65535) + (*seed2 >> 16); - return (*seed1 << 16) + *seed2; - } - -/*Busy-wait for a random number of cycles -- chooses number of cycles - * differently than for the too-many-tries-to-get-lock backoff - */ -inline void -doBackoff_for_TooLongWithNoWork( int32 numRepsWithNoWork, uint32 *seed1, - uint32 *seed2 ) - { int32 i, waitIterations; - volatile double fakeWorkVar; //busy-wait fake work - - waitIterations = - randomNumber(seed1, seed2) % - (numRepsWithNoWork * numRepsWithNoWork * NUM_CORES); - for( i = 0; i < waitIterations; i++ ) - { fakeWorkVar += (fakeWorkVar + 32.0) / 2.0; //busy-wait - } - } - -/*Busy-waits for a random number of cycles -- chooses number of cycles - * differently than for the no-work backoff - */ -inline void -doBackoff_for_TooLongToGetLock( int32 numTriesToGetLock, uint32 *seed1, - uint32 *seed2 ) - { int32 i, waitIterations; - volatile double fakeWorkVar; //busy-wait fake work - - waitIterations = - randomNumber(seed1, seed2) % - (numTriesToGetLock * NUM_TRIES_TO_GET_LOCK_BACKOFF_WEIGHT); - //addToHist( wait_iterations, coreLoopThdParams->wait_iterations_hist ); - for( i = 0; i < waitIterations; i++ ) - { fakeWorkVar += (fakeWorkVar + 32.0) / 2.0; //busy-wait - } - } - - -#ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE - -//=========================================================================== -/*This sequential version does the same as threaded, except doesn't do the - * pin-threads part, nor the wait until setup complete and acquire master - * lock parts. 
- */ -void * -coreCtlr_Seq( void *paramsIn ) - { - int32 thisCoresIdx; - int32 numRepetitionsWithNoWork; - SlaveVP *currVP; - SchedSlot *currSlot, **schedSlots; - int32 currSlotIdx; - int32 *addrOfMasterLock; - SlaveVP *thisCoresMasterVP; - - //=============== Initializations =================== - schedSlots = _VMSMasterEnv->allSchedSlots[thisCoresIdx]; - currSlotIdx = 0; //start at slot 0, go up until one empty, then do master - numRepetitionsWithNoWork = 0; - addrOfMasterLock = &(_VMSMasterEnv->masterLock); - thisCoresMasterVP = _VMSMasterEnv->masterVPs[thisCoresIdx]; - - thisCoresIdx = 0; //sequential version - - //Assembly that saves addr of label of return instr -- label in assmbly - recordCoreCtlrReturnLabelAddr((void**)&(_VMSMasterEnv->coreCtlrReturnPt)); - - - //====================== The Core Controller ====================== - while(1) - { - if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster; - currSlot = schedSlots[ currSlotIdx ]; - - if( ! currSlot->needsSlaveAssigned ) //slot does have slave assigned - { numRepetitionsWithNoWork = 0; //reset B2B master count - currSlotIdx ++; - currVP = currSlot->slaveAssignedToSlot; - } - else //slot is empty, so switch to master - { - switchToMaster: - currSlotIdx = 0; //doing switch to master, so start over at slot 0 - - currVP = thisCoresMasterVP; - - MEAS__Capture_Pre_Master_Lock_Point; //back to back because - MEAS__Capture_Post_Master_Lock_Point; // sequential version - - if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD ) - { printf("Lots of reps w/o work\n"); - exit(0); //if no work, no way to ever get it in sequential! 
- } - numRepetitionsWithNoWork += 1; - } - - switchToSlv(currVP); //Slave suspend makes core "return" from this call - flushRegisters(); //prevent GCC optimization from doing bad things - - MEAS__Capture_End_Susp_in_CoreCtlr_ForSys; - - } //while(1) - } -#endif diff -r 8059fb8d5465 -r c88ce1db91ef Defines/VMS_defs__MEAS.h --- a/Defines/VMS_defs__MEAS.h Mon Mar 12 05:38:07 2012 -0700 +++ b/Defines/VMS_defs__MEAS.h Tue Mar 13 10:02:06 2012 -0700 @@ -6,8 +6,8 @@ * */ -#ifndef _VMS_DEFS_MEAS_H -#define _VMS_DEFS_MEAS_H +#ifndef _VMS_DEFS_MEAS_H +#define _VMS_DEFS_MEAS_H #define _GNU_SOURCE //================== Macros define types of meas want ===================== @@ -321,5 +321,5 @@ //=========================================================================== -#endif /* _VMS_DEFS_H */ +#endif /* _VMS_DEFS_MEAS_H */ diff -r 8059fb8d5465 -r c88ce1db91ef Defines/VMS_defs__turn_on_and_off.h --- a/Defines/VMS_defs__turn_on_and_off.h Mon Mar 12 05:38:07 2012 -0700 +++ b/Defines/VMS_defs__turn_on_and_off.h Tue Mar 13 10:02:06 2012 -0700 @@ -15,7 +15,7 @@ * It still does co-routines and all the mechanisms are the same, it just * has only a single thread and animates Slvs one at a time */ -//#define DEBUG__TURN_ON_SEQUENTIAL_MODE +#define DEBUG__TURN_ON_SEQUENTIAL_MODE /*turns on the probe-instrumentation in the application -- when not diff -r 8059fb8d5465 -r c88ce1db91ef MasterLoop.c --- a/MasterLoop.c Mon Mar 12 05:38:07 2012 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,349 +0,0 @@ -/* - * Copyright 2010 OpenSourceStewardshipFoundation - * - * Licensed under BSD - */ - - - -#include -#include - -#include "VMS.h" - - -//=========================================================================== -void inline -stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ, - SlaveVP *masterVP ); - -//=========================================================================== - - - -/*This code is animated by the virtual Master processor. 
- * - *Polls each sched slot exactly once, hands any requests made by a newly - * done slave to the "request handler" plug-in function - * - *Any slots that need a Slv assigned are given to the "schedule" - * plug-in function, which tries to assign a Slv (slave) to it. - * - *When all slots needing a processor have been given to the schedule plug-in, - * a fraction of the slaves successfully scheduled are put into the - * work queue, then a continuation of this function is put in, then the rest - * of the Slvs that were successfully scheduled. - * - *The first thing the continuation does is busy-wait until the previous - * animation completes. This is because an (unlikely) continuation may - * sneak through queue before previous continuation is done putting second - * part of scheduled slaves in, which is the only race condition. - * - */ - -/*May 29, 2010 -- birth a Master during init so that first core controller to - * start running gets it and does all the stuff for a newly born -- - * from then on, will be doing continuation, but do suspension self - * directly at end of master loop - *So VMS_WL__init just births the master virtual processor same way it births - * all the others -- then does any extra setup needed and puts it into the - * work queue. - *However means have to make masterEnv a global static volatile the same way - * did with readyToAnimateQ in core controller. -- for performance, put the - * jump to the core controller directly in here, and have it directly jump back. - * - * - *Aug 18, 2010 -- Going to a separate MasterVP for each core, to see if this - * avoids the suspected bug in the system stack that causes bizarre faults - * at random places in the system code. - * - *So, this function is coupled to each of the MasterVPs, -- meaning this - * function can't rely on a particular stack and frame -- each MasterVP that - * animates this function has a different one. 
- * - *At this point, the masterLoop does not write itself into the queue anymore, - * instead, the coreCtlr acquires the masterLock when it has nothing to - * animate, and then animates its own masterLoop. However, still try to put - * several AppSlvs into the queue to amortize the startup cost of switching - * to the MasterVP. Note, don't have to worry about latency of requests much - * because most requests generate work for same core -- only latency issue - * is case when other cores starved and one core's requests generate work - * for them -- so keep max in queue to 3 or 4.. - */ -void masterLoop( void *initData, SlaveVP *animatingSlv ) - { - int32 slotIdx, numSlotsFilled; - SlaveVP *schedSlaveVP; - SchedSlot *currSlot, **schedSlots; - MasterEnv *masterEnv; - VMSQueueStruc *readyToAnimateQ; - - Sched_Assigner slaveAssigner; - RequestHandler requestHandler; - void *semanticEnv; - - int32 thisCoresIdx; - SlaveVP *masterVP; - volatile SlaveVP *volatileMasterVP; - - volatileMasterVP = animatingSlv; - masterVP = (SlaveVP*)volatileMasterVP; //used to force re-define after jmp - - //First animation of each MasterVP will in turn animate this part - // of setup code.. (Slv creator sets up the stack as if this function - // was called normally, but actually get here by jmp) - //So, setup values about stack ptr, jmp pt and all that - //masterVP->resumeInstrPtr = &&masterLoopStartPt; - - - //Note, got rid of writing the stack and frame ptr up here, because - // only one - // core can ever animate a given MasterVP, so don't need to communicate - // new frame and stack ptr to the MasterVP storage before a second - // version of that MasterVP can get animated on a different core. - //Also got rid of the busy-wait. 
- - - //masterLoopStartPt: - while(1){ - - MEAS__Capture_Pre_Master_Point - - masterEnv = (MasterEnv*)_VMSMasterEnv; - - //GCC may optimize so doesn't always re-define from frame-storage - masterVP = (SlaveVP*)volatileMasterVP; //just to make sure after jmp - thisCoresIdx = masterVP->coreAnimatedBy; - readyToAnimateQ = masterEnv->readyToAnimateQs[thisCoresIdx]; - schedSlots = masterEnv->allSchedSlots[thisCoresIdx]; - - requestHandler = masterEnv->requestHandler; - slaveAssigner = masterEnv->slaveAssigner; - semanticEnv = masterEnv->semanticEnv; - - - //Poll each slot's Done flag - numSlotsFilled = 0; - for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++) - { - currSlot = schedSlots[ slotIdx ]; - - if( currSlot->workIsDone ) - { - currSlot->workIsDone = FALSE; - currSlot->needsSlaveAssigned = TRUE; - - MEAS__startReqHdlr; - - //process the requests made by the slave (held inside slave struc) - (*requestHandler)( currSlot->slaveAssignedToSlot, semanticEnv ); - - MEAS__endReqHdlr; - } - if( currSlot->needsSlaveAssigned ) - { //give slot a new Slv - schedSlaveVP = - (*slaveAssigner)( semanticEnv, thisCoresIdx ); - - if( schedSlaveVP != NULL ) - { currSlot->slaveAssignedToSlot = schedSlaveVP; - schedSlaveVP->schedSlot = currSlot; - currSlot->needsSlaveAssigned = FALSE; - numSlotsFilled += 1; - } - } - } - - - #ifdef SYS__TURN_ON_WORK_STEALING - //If no slots filled, means no more work, look for work to steal. 
- if( numSlotsFilled == 0 ) - { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterVP ); - } - #endif - - MEAS__Capture_Post_Master_Point; - - masterSwitchToCoreCtlr(animatingSlv); - flushRegisters(); - }//MasterLoop - - - } - - - -/*This has a race condition -- the coreloops are accessing their own queues - * at the same time that this work-stealer on a different core is trying to - */ -void inline -stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ, - SlaveVP *masterVP ) - { - SlaveVP *stolenSlv; - int32 coreIdx, i; - VMSQueueStruc *currQ; - - stolenSlv = NULL; - coreIdx = masterVP->coreAnimatedBy; - for( i = 0; i < NUM_CORES -1; i++ ) - { - if( coreIdx >= NUM_CORES -1 ) - { coreIdx = 0; - } - else - { coreIdx++; - } - currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx]; - if( numInVMSQ( currQ ) > 0 ) - { stolenSlv = readVMSQ (currQ ); - break; - } - } - - if( stolenSlv != NULL ) - { currSlot->slaveAssignedToSlot = stolenSlv; - stolenSlv->schedSlot = currSlot; - currSlot->needsSlaveAssigned = FALSE; - - writeVMSQ( stolenSlv, readyToAnimateQ ); - } - } - -/*This algorithm makes the common case fast. Make the coreloop passive, - * and show its progress. Make the stealer control a gate that coreloop - * has to pass. - *To avoid interference, only one stealer at a time. Use a global - * stealer-lock. - * - *The pattern is based on a gate -- stealer shuts the gate, then monitors - * to be sure any already past make it all the way out, before starting. - *So, have a "progress" measure just before the gate, then have two after it, - * one is in a "waiting room" outside the gate, the other is at the exit. - *Then, the stealer first shuts the gate, then checks the progress measure - * outside it, then looks to see if the progress measure at the exit is the - * same. If yes, it knows the protected area is empty 'cause no other way - * to get in and the last to get in also exited. 
- *If the progress measure at the exit is not the same, then the stealer goes - * into a loop checking both the waiting-area and the exit progress-measures - * until one of them shows the same as the measure outside the gate. Might - * as well re-read the measure outside the gate each go around, just to be - * sure. It is guaranteed that one of the two will eventually match the one - * outside the gate. - * - *Here's an informal proof of correctness: - *The gate can be closed at any point, and have only four cases: - * 1) coreloop made it past the gate-closing but not yet past the exit - * 2) coreloop made it past the pre-gate progress update but not yet past - * the gate, - * 3) coreloop is right before the pre-gate update - * 4) coreloop is past the exit and far from the pre-gate update. - * - * Covering the cases in reverse order, - * 4) is not a problem -- stealer will read pre-gate progress, see that it - * matches exit progress, and the gate is closed, so stealer can proceed. - * 3) stealer will read pre-gate progress just after coreloop updates it.. - * so stealer goes into a loop until the coreloop causes wait-progress - * to match pre-gate progress, so then stealer can proceed - * 2) same as 3.. - * 1) stealer reads pre-gate progress, sees that it's different than exit, - * so goes into loop until exit matches pre-gate, now it knows coreloop - * is not in protected and cannot get back in, so can proceed. - * - *Implementation for the stealer: - * - *First, acquire the stealer lock -- only cores with no work to do will - * compete to steal, so not a big performance penalty having only one -- - * will rarely have multiple stealers in a system with plenty of work -- and - * in a system with little work, it doesn't matter. - * - *Note, have single-reader, single-writer pattern for all variables used to - * communicate between stealer and victims - * - *So, scan the queues of the core controllers, until find non-empty. 
Each core - * has its own list that it scans. The list goes in order from closest to - * furthest core, so it steals first from close cores. Later can add - * taking info from the app about overlapping footprints, and scan all the - * others then choose work with the most footprint overlap with the contents - * of this core's cache. - * - *Now, have a victim want to take work from. So, shut the gate in that - * coreloop, by setting the "gate closed" var on its stack to TRUE. - *Then, read the core's pre-gate progress and compare to the core's exit - * progress. - *If same, can proceed to take work from the coreloop's queue. When done, - * write FALSE to gate closed var. - *If different, then enter a loop that reads the pre-gate progress, then - * compares to exit progress then to wait progress. When one of two - * matches, proceed. Take work from the coreloop's queue. When done, - * write FALSE to the gate closed var. - * - */ -void inline -gateProtected_stealWorkInto( SchedSlot *currSlot, - VMSQueueStruc *myReadyToAnimateQ, - SlaveVP *masterVP ) - { - SlaveVP *stolenSlv; - int32 coreIdx, i, haveAVictim, gotLock; - VMSQueueStruc *victimsQ; - - volatile GateStruc *vicGate; - int32 coreMightBeInProtected; - - - - //see if any other cores have work available to steal - haveAVictim = FALSE; - coreIdx = masterVP->coreAnimatedBy; - for( i = 0; i < NUM_CORES -1; i++ ) - { - if( coreIdx >= NUM_CORES -1 ) - { coreIdx = 0; - } - else - { coreIdx++; - } - victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx]; - if( numInVMSQ( victimsQ ) > 0 ) - { haveAVictim = TRUE; - vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ]; - break; - } - } - if( !haveAVictim ) return; //no work to steal, exit - - //have a victim core, now get the stealer-lock - gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock), - UNLOCKED, LOCKED ); - if( !gotLock ) return; //go back to core controller, which will re-start master - - - //====== Start Gate-protection ======= - 
vicGate->gateClosed = TRUE; - coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress; - while( coreMightBeInProtected ) - { //wait until sure - if( vicGate->preGateProgress == vicGate->waitProgress ) - coreMightBeInProtected = FALSE; - if( vicGate->preGateProgress == vicGate->exitProgress ) - coreMightBeInProtected = FALSE; - } - - stolenSlv = readVMSQ ( victimsQ ); - - vicGate->gateClosed = FALSE; - //======= End Gate-protection ======= - - - if( stolenSlv != NULL ) //victim could have been in protected and taken - { currSlot->slaveAssignedToSlot = stolenSlv; - stolenSlv->schedSlot = currSlot; - currSlot->needsSlaveAssigned = FALSE; - - writeVMSQ( stolenSlv, myReadyToAnimateQ ); - } - - //unlock the work stealing lock - _VMSMasterEnv->workStealingLock = UNLOCKED; - } diff -r 8059fb8d5465 -r c88ce1db91ef Probes/probes.c --- a/Probes/probes.c Mon Mar 12 05:38:07 2012 -0700 +++ b/Probes/probes.c Tue Mar 13 10:02:06 2012 -0700 @@ -298,7 +298,6 @@ VMS_impl__print_stats_of_all_probes() { forAllInDynArrayDo( _VMSMasterEnv->dynIntervalProbesInfo, - &VMS_impl__print_stats_of_probe ); + (DynArrayFnPtr) &VMS_impl__print_stats_of_probe ); fflush( stdout ); } -typedef void (*DynArrayFnPtr) ( void * ); //fn has to cast void * diff -r 8059fb8d5465 -r c88ce1db91ef Probes/probes.h --- a/Probes/probes.h Mon Mar 12 05:38:07 2012 -0700 +++ b/Probes/probes.h Tue Mar 13 10:02:06 2012 -0700 @@ -107,7 +107,7 @@ VMS_impl__record_interval_end_in_probe( int32 probeID ); void -VMS_impl__print_stats_of_probe( IntervalProbe *probe ) +VMS_impl__print_stats_of_probe( IntervalProbe *probe ); void VMS_impl__print_stats_of_all_probes(); diff -r 8059fb8d5465 -r c88ce1db91ef SchedulingMaster.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SchedulingMaster.c Tue Mar 13 10:02:06 2012 -0700 @@ -0,0 +1,349 @@ +/* + * Copyright 2010 OpenSourceStewardshipFoundation + * + * Licensed under BSD + */ + + + +#include +#include + +#include "VMS.h" + + 
+//===========================================================================
+void inline
+stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
+               SlaveVP *masterVP );
+
+void inline
+gateProtected_stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *myReadyToAnimateQ,
+               SlaveVP *masterVP );
+//===========================================================================
+
+/*This code is animated by the virtual Master processor.
+ *
+ *Polls each sched slot exactly once, hands any requests made by a newly
+ * done slave to the "request handler" plug-in function
+ *
+ *Any slots that need a Slv assigned are given to the "schedule"
+ * plug-in function, which tries to assign a Slv (slave) to it.
+ *
+ *When all slots needing a processor have been given to the schedule plug-in,
+ * a fraction of the slaves successfully scheduled are put into the
+ * work queue, then a continuation of this function is put in, then the rest
+ * of the Slvs that were successfully scheduled.
+ *
+ *The first thing the continuation does is busy-wait until the previous
+ * animation completes.  This is because an (unlikely) continuation may
+ * sneak through queue before previous continuation is done putting second
+ * part of scheduled slaves in, which is the only race condition.
+ *
+ */
+
+/*May 29, 2010 -- birth a Master during init so that first core controller to
+ * start running gets it and does all the stuff for a newly born --
+ * from then on, will be doing continuation, but do suspension self
+ * directly at end of master loop
+ *So VMS_WL__init just births the master virtual processor same way it births
+ * all the others -- then does any extra setup needed and puts it into the
+ * work queue.
+ *However means have to make masterEnv a global static volatile the same way
+ * did with readyToAnimateQ in core controller. -- for performance, put the
+ * jump to the core controller directly in here, and have it directly jump back.
+ * + * + *Aug 18, 2010 -- Going to a separate MasterVP for each core, to see if this + * avoids the suspected bug in the system stack that causes bizarre faults + * at random places in the system code. + * + *So, this function is coupled to each of the MasterVPs, -- meaning this + * function can't rely on a particular stack and frame -- each MasterVP that + * animates this function has a different one. + * + *At this point, the schedulingMaster does not write itself into the queue anymore, + * instead, the coreCtlr acquires the masterLock when it has nothing to + * animate, and then animates its own schedulingMaster. However, still try to put + * several AppSlvs into the queue to amortize the startup cost of switching + * to the MasterVP. Note, don't have to worry about latency of requests much + * because most requests generate work for same core -- only latency issue + * is case when other cores starved and one core's requests generate work + * for them -- so keep max in queue to 3 or 4.. + */ +void schedulingMaster( void *initData, SlaveVP *animatingSlv ) + { + int32 slotIdx, numSlotsFilled; + SlaveVP *schedSlaveVP; + SchedSlot *currSlot, **schedSlots; + MasterEnv *masterEnv; + VMSQueueStruc *readyToAnimateQ; + + Sched_Assigner slaveAssigner; + RequestHandler requestHandler; + void *semanticEnv; + + int32 thisCoresIdx; + SlaveVP *masterVP; + volatile SlaveVP *volatileMasterVP; + + volatileMasterVP = animatingSlv; + masterVP = (SlaveVP*)volatileMasterVP; //used to force re-define after jmp + + //First animation of each MasterVP will in turn animate this part + // of setup code.. 
(Slv creator sets up the stack as if this function + // was called normally, but actually get here by jmp) + //So, setup values about stack ptr, jmp pt and all that + //masterVP->resumeInstrPtr = &&schedulingMasterStartPt; + + + //Note, got rid of writing the stack and frame ptr up here, because + // only one + // core can ever animate a given MasterVP, so don't need to communicate + // new frame and stack ptr to the MasterVP storage before a second + // version of that MasterVP can get animated on a different core. + //Also got rid of the busy-wait. + + + //schedulingMasterStartPt: + while(1){ + + MEAS__Capture_Pre_Master_Point + + masterEnv = (MasterEnv*)_VMSMasterEnv; + + //GCC may optimize so doesn't always re-define from frame-storage + masterVP = (SlaveVP*)volatileMasterVP; //just to make sure after jmp + thisCoresIdx = masterVP->coreAnimatedBy; + readyToAnimateQ = masterEnv->readyToAnimateQs[thisCoresIdx]; + schedSlots = masterEnv->allSchedSlots[thisCoresIdx]; + + requestHandler = masterEnv->requestHandler; + slaveAssigner = masterEnv->slaveAssigner; + semanticEnv = masterEnv->semanticEnv; + + + //Poll each slot's Done flag + numSlotsFilled = 0; + for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++) + { + currSlot = schedSlots[ slotIdx ]; + + if( currSlot->workIsDone ) + { + currSlot->workIsDone = FALSE; + currSlot->needsSlaveAssigned = TRUE; + + MEAS__startReqHdlr; + + //process the requests made by the slave (held inside slave struc) + (*requestHandler)( currSlot->slaveAssignedToSlot, semanticEnv ); + + MEAS__endReqHdlr; + } + if( currSlot->needsSlaveAssigned ) + { //give slot a new Slv + schedSlaveVP = + (*slaveAssigner)( semanticEnv, thisCoresIdx ); + + if( schedSlaveVP != NULL ) + { currSlot->slaveAssignedToSlot = schedSlaveVP; + schedSlaveVP->schedSlot = currSlot; + currSlot->needsSlaveAssigned = FALSE; + numSlotsFilled += 1; + } + } + } + + + #ifdef SYS__TURN_ON_WORK_STEALING + //If no slots filled, means no more work, look for work to steal. 
+ if( numSlotsFilled == 0 ) + { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterVP ); + } + #endif + + MEAS__Capture_Post_Master_Point; + + masterSwitchToCoreCtlr(animatingSlv); + flushRegisters(); + }//MasterLoop + + + } + + + +/*This has a race condition -- the coreloops are accessing their own queues + * at the same time that this work-stealer on a different core is trying to + */ +void inline +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ, + SlaveVP *masterVP ) + { + SlaveVP *stolenSlv; + int32 coreIdx, i; + VMSQueueStruc *currQ; + + stolenSlv = NULL; + coreIdx = masterVP->coreAnimatedBy; + for( i = 0; i < NUM_CORES -1; i++ ) + { + if( coreIdx >= NUM_CORES -1 ) + { coreIdx = 0; + } + else + { coreIdx++; + } + currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx]; + if( numInVMSQ( currQ ) > 0 ) + { stolenSlv = readVMSQ (currQ ); + break; + } + } + + if( stolenSlv != NULL ) + { currSlot->slaveAssignedToSlot = stolenSlv; + stolenSlv->schedSlot = currSlot; + currSlot->needsSlaveAssigned = FALSE; + + writeVMSQ( stolenSlv, readyToAnimateQ ); + } + } + +/*This algorithm makes the common case fast. Make the coreloop passive, + * and show its progress. Make the stealer control a gate that coreloop + * has to pass. + *To avoid interference, only one stealer at a time. Use a global + * stealer-lock. + * + *The pattern is based on a gate -- stealer shuts the gate, then monitors + * to be sure any already past make it all the way out, before starting. + *So, have a "progress" measure just before the gate, then have two after it, + * one is in a "waiting room" outside the gate, the other is at the exit. + *Then, the stealer first shuts the gate, then checks the progress measure + * outside it, then looks to see if the progress measure at the exit is the + * same. If yes, it knows the protected area is empty 'cause no other way + * to get in and the last to get in also exited. 
+ *If the progress measure at the exit is not the same, then the stealer goes + * into a loop checking both the waiting-area and the exit progress-measures + * until one of them shows the same as the measure outside the gate. Might + * as well re-read the measure outside the gate each go around, just to be + * sure. It is guaranteed that one of the two will eventually match the one + * outside the gate. + * + *Here's an informal proof of correctness: + *The gate can be closed at any point, and have only four cases: + * 1) coreloop made it past the gate-closing but not yet past the exit + * 2) coreloop made it past the pre-gate progress update but not yet past + * the gate, + * 3) coreloop is right before the pre-gate update + * 4) coreloop is past the exit and far from the pre-gate update. + * + * Covering the cases in reverse order, + * 4) is not a problem -- stealer will read pre-gate progress, see that it + * matches exit progress, and the gate is closed, so stealer can proceed. + * 3) stealer will read pre-gate progress just after coreloop updates it.. + * so stealer goes into a loop until the coreloop causes wait-progress + * to match pre-gate progress, so then stealer can proceed + * 2) same as 3.. + * 1) stealer reads pre-gate progress, sees that it's different than exit, + * so goes into loop until exit matches pre-gate, now it knows coreloop + * is not in protected and cannot get back in, so can proceed. + * + *Implementation for the stealer: + * + *First, acquire the stealer lock -- only cores with no work to do will + * compete to steal, so not a big performance penalty having only one -- + * will rarely have multiple stealers in a system with plenty of work -- and + * in a system with little work, it doesn't matter. + * + *Note, have single-reader, single-writer pattern for all variables used to + * communicate between stealer and victims + * + *So, scan the queues of the core controllers, until find non-empty. 
Each core + * has its own list that it scans. The list goes in order from closest to + * furthest core, so it steals first from close cores. Later can add + * taking info from the app about overlapping footprints, and scan all the + * others then choose work with the most footprint overlap with the contents + * of this core's cache. + * + *Now, have a victim want to take work from. So, shut the gate in that + * coreloop, by setting the "gate closed" var on its stack to TRUE. + *Then, read the core's pre-gate progress and compare to the core's exit + * progress. + *If same, can proceed to take work from the coreloop's queue. When done, + * write FALSE to gate closed var. + *If different, then enter a loop that reads the pre-gate progress, then + * compares to exit progress then to wait progress. When one of two + * matches, proceed. Take work from the coreloop's queue. When done, + * write FALSE to the gate closed var. + * + */ +void inline +gateProtected_stealWorkInto( SchedSlot *currSlot, + VMSQueueStruc *myReadyToAnimateQ, + SlaveVP *masterVP ) + { + SlaveVP *stolenSlv; + int32 coreIdx, i, haveAVictim, gotLock; + VMSQueueStruc *victimsQ; + + volatile GateStruc *vicGate; + int32 coreMightBeInProtected; + + + + //see if any other cores have work available to steal + haveAVictim = FALSE; + coreIdx = masterVP->coreAnimatedBy; + for( i = 0; i < NUM_CORES -1; i++ ) + { + if( coreIdx >= NUM_CORES -1 ) + { coreIdx = 0; + } + else + { coreIdx++; + } + victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx]; + if( numInVMSQ( victimsQ ) > 0 ) + { haveAVictim = TRUE; + vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ]; + break; + } + } + if( !haveAVictim ) return; //no work to steal, exit + + //have a victim core, now get the stealer-lock + gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock), + UNLOCKED, LOCKED ); + if( !gotLock ) return; //go back to core controller, which will re-start master + + + //====== Start Gate-protection ======= + 
vicGate->gateClosed = TRUE; + coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress; + while( coreMightBeInProtected ) + { //wait until sure + if( vicGate->preGateProgress == vicGate->waitProgress ) + coreMightBeInProtected = FALSE; + if( vicGate->preGateProgress == vicGate->exitProgress ) + coreMightBeInProtected = FALSE; + } + + stolenSlv = readVMSQ ( victimsQ ); + + vicGate->gateClosed = FALSE; + //======= End Gate-protection ======= + + + if( stolenSlv != NULL ) //victim could have been in protected and taken + { currSlot->slaveAssignedToSlot = stolenSlv; + stolenSlv->schedSlot = currSlot; + currSlot->needsSlaveAssigned = FALSE; + + writeVMSQ( stolenSlv, myReadyToAnimateQ ); + } + + //unlock the work stealing lock + _VMSMasterEnv->workStealingLock = UNLOCKED; + } diff -r 8059fb8d5465 -r c88ce1db91ef VMS.h --- a/VMS.h Mon Mar 12 05:38:07 2012 -0700 +++ b/VMS.h Tue Mar 13 10:02:06 2012 -0700 @@ -201,7 +201,7 @@ void * coreController( void *paramsIn ); //standard PThreads fn prototype void * coreCtlr_Seq( void *paramsIn ); //standard PThreads fn prototype -void masterLoop( void *initData, SlaveVP *masterVP ); +void schedulingMaster( void *initData, SlaveVP *masterVP ); typedef struct @@ -215,10 +215,11 @@ volatile MasterEnv *_VMSMasterEnv __align_to_cacheline__; -pthread_t coreCtlrThdHandles[ NUM_CORES ]; //pthread's virt-procr state +pthread_t coreCtlrThdHandles[ NUM_CORES ]; //pthread's virt-procr state ThdParams *coreCtlrThdParams [ NUM_CORES ]; -pthread_mutex_t suspendLock = PTHREAD_MUTEX_INITIALIZER; -pthread_cond_t suspendCond = PTHREAD_COND_INITIALIZER; + +pthread_mutex_t suspendLock; +pthread_cond_t suspendCond; //========================= Function Prototypes =========================== diff -r 8059fb8d5465 -r c88ce1db91ef VMS__startup_and_shutdown.c --- a/VMS__startup_and_shutdown.c Mon Mar 12 05:38:07 2012 -0700 +++ b/VMS__startup_and_shutdown.c Tue Mar 13 10:02:06 2012 -0700 @@ -10,6 +10,7 @@ #include #include #include +#include 
#include "VMS.h" @@ -43,7 +44,7 @@ * the master Slv into the work-queue, ready for first "call" * 2) Semantic layer then does its own init, which creates the seed virt * slave inside the semantic layer, ready to schedule it when - * asked by the first run of the masterLoop. + * asked by the first run of the schedulingMaster. * *This part is bit weird because VMS really wants to be "always there", and * have applications attach and detach.. for now, this VMS is part of @@ -51,7 +52,7 @@ * *The semantic layer is isolated from the VMS internals by making the * semantic layer do setup to a state that it's ready with its - * initial Slvs, ready to schedule them to slots when the masterLoop + * initial Slvs, ready to schedule them to slots when the schedulingMaster * asks. Without this pattern, the semantic layer's setup would * have to modify slots directly to assign the initial virt-procrs, and put * them into the readyToAnimateQ itself, breaking the isolation completely. @@ -71,7 +72,7 @@ { #ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE create_masterEnv(); - flushRegisters(); //? not sure why here -- merten added it..? + printf( "\n\n Running in SEQUENTIAL mode \n\n" ); #else create_masterEnv(); create_the_coreCtlr_OS_threads(); @@ -292,7 +293,7 @@ readyToAnimateQs[ coreIdx ] = makeVMSQ(); //Q: should give masterVP core-specific info as its init data? 
- masterVPs[ coreIdx ] = VMS_int__create_slaveVP( (TopLevelFnPtr)&masterLoop, (void*)masterEnv ); + masterVPs[ coreIdx ] = VMS_int__create_slaveVP( (TopLevelFnPtr)&schedulingMaster, (void*)masterEnv ); masterVPs[ coreIdx ]->coreAnimatedBy = coreIdx; allSchedSlots[ coreIdx ] = create_sched_slots(); //makes for one core _VMSMasterEnv->numMasterInARow[ coreIdx ] = 0; @@ -426,6 +427,8 @@ //get lock, to lock out any threads still starting up -- they'll see // that setupComplete is true before entering while loop, and so never // wait on the condition + pthread_mutex_init( &suspendLock, NULL ); + pthread_cond_init( &suspendCond, NULL ); pthread_mutex_lock( &suspendLock ); _VMSMasterEnv->setupComplete = 1; pthread_mutex_unlock( &suspendLock ); diff -r 8059fb8d5465 -r c88ce1db91ef VMS_primitive_data_types.h --- a/VMS_primitive_data_types.h Mon Mar 12 05:38:07 2012 -0700 +++ b/VMS_primitive_data_types.h Tue Mar 13 10:02:06 2012 -0700 @@ -7,8 +7,8 @@ */ -#ifndef _PRIMITIVE_DATA_TYPES_H -#define _PRIMITIVE_DATA_TYPES_H +#ifndef _PRIMITIVE_DATA_TYPES_H +#define _PRIMITIVE_DATA_TYPES_H /*For portability, need primitive data types that have a well defined diff -r 8059fb8d5465 -r c88ce1db91ef vmalloc.c --- a/vmalloc.c Mon Mar 12 05:38:07 2012 -0700 +++ b/vmalloc.c Tue Mar 13 10:02:06 2012 -0700 @@ -15,7 +15,7 @@ #include #include "VMS.h" -#include "C_Libraries/Histogram/Histogram.h" +#include "Histogram/Histogram.h" #define MAX_UINT64 0xFFFFFFFFFFFFFFFF