changeset 222:c88ce1db91ef Common_Ancestor

Compiles, but does not run properly -- and changed MasterLoop to SchedulingMaster
author Some Random Person <seanhalle@yahoo.com>
date Tue, 13 Mar 2012 10:02:06 -0700
parents 8059fb8d5465
children b0b93147adfb
files CoreController.c CoreLoop.c Defines/VMS_defs__MEAS.h Defines/VMS_defs__turn_on_and_off.h MasterLoop.c Probes/probes.c Probes/probes.h SchedulingMaster.c VMS.h VMS__startup_and_shutdown.c VMS_primitive_data_types.h vmalloc.c
diffstat 12 files changed, 703 insertions(+), 699 deletions(-) [+]
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/CoreController.c	Tue Mar 13 10:02:06 2012 -0700
     1.3 @@ -0,0 +1,333 @@
     1.4 +/*
     1.5 + * Copyright 2010  OpenSourceStewardshipFoundation
     1.6 + *
     1.7 + * Licensed under BSD
     1.8 + */
     1.9 +
    1.10 +
    1.11 +#include "VMS.h"
    1.12 +
    1.13 +#include <stdlib.h>
    1.14 +#include <stdio.h>
    1.15 +#include <time.h>
    1.16 +
    1.17 +#include <pthread.h>
    1.18 +#include <sched.h>
    1.19 +
    1.20 +//=====================  Functions local to this file =======================
     1.21 +void *terminateCoreCtlr(SlaveVP *currSlv);
    1.22 +inline void
    1.23 +doBackoff_for_TooLongToGetLock( int32  numTriesToGetLock, uint32 *seed1, 
    1.24 +                                uint32 *seed2 );
    1.25 +inline void
    1.26 +doBackoff_for_TooLongWithNoWork( int32   numRepsWithNoWork, uint32 *seed1, 
    1.27 +                                 uint32 *seed2 );
    1.28 +
    1.29 +//===========================================================================
    1.30 +
    1.31 +
    1.32 +/*The Core Controller is logically "beneath" the masterVP and slave VPs.  Its
    1.33 + * job is to control which of those VPs the core animates.  Any time one of
    1.34 + * those VPs suspends, the suspend-primitive switches the core over to
    1.35 + * animating the core controller.  The core controller then follows a very
    1.36 + * basic pattern to choose which VP will get animated next, then switches
    1.37 + * the core over to animating that VP.  So, all VPs switch the core to
    1.38 + * core controller, which then chooses which VP the core animates next.
    1.39 + *
    1.40 + *The way the core controller decides which VP to switch the core to next is:
    1.41 + * 1) There are a number of "scheduling slots", which the master VP fills up
    1.42 + *    with slave VPs that are ready to be animated.  So, the core controller
    1.43 + *    just iterates through the scheduling slots.  When the next slot has a
    1.44 + *    slave VP in it, the core controller switches the core over to animate
    1.45 + *    that slave.
    1.46 + * 2) When the core controller checks a scheduling slot, and it's empty,
    1.47 + *    then the controller switches the core over to animating the master VP,
    1.48 + *    whose job is to find more slave VPs ready, and assign those to 
    1.49 + *    scheduling slots.
    1.50 + *
    1.51 + *So, in effect, a scheduling slot functions as another layer of virtual
    1.52 + * processor.  A slot has the logical meaning of being an animator that
    1.53 + * animates the slave assigned to it.  However, the core controller sits
    1.54 + * below the slots, and sequences down them, assigning the actual physical
    1.55 + * core to each slot, in turn.
    1.56 + *The reason for having the scheduling slots and core controller is to 
    1.57 + * amortize the overhead of switching to the master VP and running it.  With
    1.58 + * multiple scheduling slots, the time to switch-to-master and the code in
    1.59 + * the master loop is divided by the number of scheduling slots.
    1.60 + *The core controller and scheduling slots are not fundamental parts of VMS,
    1.61 + * but rather optimizations put into the shared-semantic-state version of
    1.62 + * VMS.  Other versions of VMS will not have a core controller nor scheduling
    1.63 + * slots.
    1.64 + * 
    1.65 + *The core controller "owns" the physical core, in effect, and is the 
    1.66 + * function given to the pthread creation call.  Hence, it contains code
    1.67 + * related to pthread startup, synchronizing the controllers to all start
    1.68 + * at the same time-point, and pinning the pthreads to physical cores.
    1.69 + * 
    1.70 + */
    1.71 +void *
    1.72 +coreController( void *paramsIn )
    1.73 + { 
    1.74 +   int32           thisCoresIdx;
    1.75 +   int32           numRepetitionsWithNoWork;
    1.76 +   SlaveVP        *currVP;
    1.77 +   SchedSlot      *currSlot, **schedSlots;
    1.78 +   int32           currSlotIdx;
    1.79 +   volatile int32 *addrOfMasterLock; //thing pointed to is volatile, not ptr
    1.80 +   SlaveVP        *thisCoresMasterVP;
    1.81 +      //Variables used for pthread related things
    1.82 +   ThdParams      *coreCtlrThdParams;
    1.83 +   cpu_set_t       coreMask;  //used during pinning pthread to CPU core
    1.84 +   int32           errorCode;
    1.85 +      //Variables used during measurements
    1.86 +   TSCountLowHigh  endSusp;
    1.87 +      //Variables used in random-backoff, for master-lock and waiting for work
    1.88 +   uint32_t seed1 = rand()%1000; // init random number generator for retries
    1.89 +   uint32_t seed2 = rand()%1000;
    1.90 +      //Variable for work-stealing -- a gate protects a critical section
    1.91 +   volatile GateStruc gate;      //on stack to avoid false-sharing
    1.92 +
    1.93 +   
    1.94 +   //===============  Initializations ===================
    1.95 +   coreCtlrThdParams = (ThdParams *)paramsIn;
    1.96 +   thisCoresIdx = coreCtlrThdParams->coreNum;
    1.97 +
    1.98 +   gate.gateClosed      = FALSE;
    1.99 +   gate.preGateProgress = 0;
   1.100 +   gate.waitProgress    = 0;
   1.101 +   gate.exitProgress    = 0;
   1.102 +   //TODO: pad these to prevent false-sharing, and fix the race at startup
   1.103 +   _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = (GateStruc*)&gate;
   1.104 +
   1.105 +      //Assembly that saves addr of label of return instr -- label in assmbly
   1.106 +   recordCoreCtlrReturnLabelAddr((void**)&(_VMSMasterEnv->coreCtlrReturnPt));
   1.107 +
   1.108 +   schedSlots = _VMSMasterEnv->allSchedSlots[thisCoresIdx];
   1.109 +   currSlotIdx = 0; //start at slot 0, go up until one empty, then do master
   1.110 +   numRepetitionsWithNoWork = 0;
   1.111 +   addrOfMasterLock = &(_VMSMasterEnv->masterLock);
   1.112 +   thisCoresMasterVP = _VMSMasterEnv->masterVPs[thisCoresIdx];
   1.113 +   
   1.114 +   //==================== pthread related stuff ======================
   1.115 +      //pin the pthread to the core
   1.116 +      //Linux requires pinning to be done inside the thread-function
   1.117 +      //Designate a core by a 1 in bit-position corresponding to the core
   1.118 +   CPU_ZERO(&coreMask); //initialize mask bits to zero
   1.119 +   CPU_SET(coreCtlrThdParams->coreNum,&coreMask); //set bit repr the coreNum
   1.120 +   pthread_t selfThd = pthread_self();
   1.121 +   errorCode =
   1.122 +   pthread_setaffinity_np( selfThd, sizeof(coreMask), &coreMask);
   1.123 +   if(errorCode){ printf("\n pinning thd to core failed \n"); exit(0); }
   1.124 +
   1.125 +      //make sure the controllers all start at same time, by making them wait
   1.126 +   pthread_mutex_lock(   &suspendLock );
   1.127 +   while( !(_VMSMasterEnv->setupComplete) )
   1.128 +    { pthread_cond_wait( &suspendCond, &suspendLock );
   1.129 +    }
   1.130 +   pthread_mutex_unlock( &suspendLock );
   1.131 +
   1.132 +   //====================== The Core Controller ======================
   1.133 +   while(1)  //An endless loop is just one way of doing the control structure
   1.134 +    {        //Assembly code switches the core between animating a VP and
   1.135 +             // animating this core controller.  The switch is done by
   1.136 +             // changing the stack-pointer and frame-pointer and then doing
   1.137 +             // an assembly jmp.  When reading this code, the effect is 
   1.138 +             // that the "switchToSlv()" at the end of the loop is sort of a
   1.139 +             // "warp in time" -- the core disappears inside this, jmps to
   1.140 +             // animating a VP, and when that VP suspends, the suspend
   1.141 +             // jmps back. This has the effect of "returning" from the
   1.142 +             // switchToSlv() call. Then control loops back to here.
   1.143 +             //Alternatively, the VP suspend primitive could just not bother
   1.144 +             // returning from switchToSlv, and instead jmp directly to here.
   1.145 +      
   1.146 +      if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster;
   1.147 +      currSlot = schedSlots[ currSlotIdx ];
   1.148 +
   1.149 +      
   1.150 +      if( ! currSlot->needsSlaveAssigned ) //slot does have slave assigned
   1.151 +       { numRepetitionsWithNoWork = 0;     //reset B2B master count
   1.152 +         currSlotIdx ++;
   1.153 +         currVP = currSlot->slaveAssignedToSlot;
   1.154 +       }
   1.155 +      else //slot is empty, so switch to master
   1.156 +       {
   1.157 +       switchToMaster:
   1.158 +         currSlotIdx = 0; //doing switch to master, so start over at slot 0
   1.159 +         currVP = NULL;
   1.160 +
   1.161 +               MEAS__Capture_Pre_Master_Lock_Point;
   1.162 +
   1.163 +         int numTriesToGetLock = 0; int gotLock = 0;
   1.164 +         while( currVP == NULL ) //keep going until get master lock
   1.165 +          { 
   1.166 +               //At this point, first thing to do is get lock.  But, want to
   1.167 +               // reduce lock contention from cores with no work, so first
   1.168 +               // check if this is a core with no work, and busy wait if so.
   1.169 +               //Then, if it's been way too long without work, yield pthread
   1.170 +            if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_BACKOFF)
   1.171 +               doBackoff_for_TooLongWithNoWork( numRepetitionsWithNoWork, &seed1, &seed2 );
   1.172 +            if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD )
   1.173 +             { numRepetitionsWithNoWork = 0; pthread_yield(); }
   1.174 +
   1.175 +               
   1.176 +               //Now, try to get the lock
   1.177 +            gotLock = __sync_bool_compare_and_swap( addrOfMasterLock,
   1.178 +                                                    UNLOCKED, LOCKED );
   1.179 +            if( gotLock )
   1.180 +             {    //At this point, have run out of slaves, so tried to get
   1.181 +                  // the master lock, and have successfully gotten it.
   1.182 +                  //So, set the currVP to this core's masterVP and break out
   1.183 +                  // of the get-lock loop.  Below, assembly code will switch
   1.184 +                  // the core over to animating the masterVP.  When it's 
   1.185 +                  // done, the masterVP will use assembly to switch the core
   1.186 +                  // back to animating this core controller
   1.187 +               currVP = thisCoresMasterVP;
   1.188 +               numRepetitionsWithNoWork += 1;
   1.189 +               break;  //end while -- have a VP to animate now
   1.190 +             }
   1.191 +               //Get here only when failed to get lock
   1.192 +
   1.193 +            numTriesToGetLock++;   //if too many, means too much contention
   1.194 +            if( numTriesToGetLock > NUM_TRIES_BEFORE_DO_BACKOFF ) 
   1.195 +               doBackoff_for_TooLongToGetLock( numTriesToGetLock, &seed1, &seed2 );
   1.196 +            if( numTriesToGetLock > MASTERLOCK_RETRIES_BEFORE_YIELD ) 
   1.197 +             { numTriesToGetLock = 0; pthread_yield(); }
   1.198 +          }
   1.199 +               MEAS__Capture_Post_Master_Lock_Point;
   1.200 +       }
   1.201 +
   1.202 +
   1.203 +      switchToSlv(currVP); //Slave suspend makes core "return" from this call
   1.204 +      flushRegisters();    //prevent GCC optimization from doing bad things 
   1.205 +
   1.206 +             MEAS__Capture_End_Susp_in_CoreCtlr_ForSys;
   1.207 +          
   1.208 +    }//while(1)
   1.209 + }
   1.210 +
   1.211 +
   1.212 +void *
   1.213 +terminateCoreCtlr(SlaveVP *currSlv)
   1.214 + {
   1.215 +   //first free shutdown Slv that jumped here -- it first restores the
   1.216 +   // coreloop's stack, so addr of currSlv in stack frame is still correct
   1.217 +   VMS_int__dissipate_slaveVP( currSlv );
   1.218 +   pthread_exit( NULL );
   1.219 + }
   1.220 +
   1.221 +
   1.222 +/*Used by the backoff to pick a random amount of busy-wait.  Can't use the
   1.223 + * system rand because it takes much too long.
   1.224 + *Note, are passing pointers to the seeds, which are then modified
   1.225 + */
   1.226 +inline uint32_t
   1.227 +randomNumber(uint32_t* seed1, uint32_t* seed2)
   1.228 + {
   1.229 +	*seed1 = 36969 * (*seed1 & 65535) + (*seed1 >> 16);
   1.230 +	*seed2 = 18000 * (*seed2 & 65535) + (*seed2 >> 16);
   1.231 +	return (*seed1 << 16) + *seed2;
   1.232 + }
   1.233 +
   1.234 +/*Busy-wait for a random number of cycles -- chooses number of cycles 
   1.235 + * differently than for the too-many-tries-to-get-lock backoff
   1.236 + */
   1.237 +inline void
   1.238 +doBackoff_for_TooLongWithNoWork( int32   numRepsWithNoWork, uint32 *seed1, 
   1.239 +                                 uint32 *seed2 )
   1.240 + { int32 i, waitIterations;
    1.241 +   volatile double fakeWorkVar = 0.0; //busy-wait fake work
   1.242 + 
   1.243 +   waitIterations = 
   1.244 +    randomNumber(seed1, seed2) % 
   1.245 +    (numRepsWithNoWork * numRepsWithNoWork * NUM_CORES);
   1.246 +   for( i = 0; i < waitIterations; i++ )
   1.247 +    { fakeWorkVar += (fakeWorkVar + 32.0) / 2.0; //busy-wait
   1.248 +    }
   1.249 + }
   1.250 +
   1.251 +/*Busy-waits for a random number of cycles -- chooses number of cycles 
   1.252 + * differently than for the no-work backoff
   1.253 + */
   1.254 +inline void
   1.255 +doBackoff_for_TooLongToGetLock( int32 numTriesToGetLock, uint32 *seed1, 
   1.256 +                                uint32 *seed2 )
   1.257 + { int32 i, waitIterations;
    1.258 +   volatile double fakeWorkVar = 0.0; //busy-wait fake work
   1.259 +
   1.260 +   waitIterations = 
   1.261 +    randomNumber(seed1, seed2) % 
   1.262 +    (numTriesToGetLock * NUM_TRIES_TO_GET_LOCK_BACKOFF_WEIGHT);   
   1.263 +   //addToHist( wait_iterations, coreLoopThdParams->wait_iterations_hist );
   1.264 +   for( i = 0; i < waitIterations; i++ )
   1.265 +    { fakeWorkVar += (fakeWorkVar + 32.0) / 2.0; //busy-wait
   1.266 +    }
   1.267 + }
   1.268 +
   1.269 +
   1.270 +#ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE
   1.271 +
   1.272 +//===========================================================================
   1.273 +/*This sequential version does the same as threaded, except doesn't do the
   1.274 + * pin-threads part, nor the wait until setup complete and acquire master
   1.275 + * lock parts.
   1.276 + */
   1.277 +void *
   1.278 +coreCtlr_Seq( void *paramsIn )
   1.279 + {
    1.280 +   int32           thisCoresIdx = 0; //sequential version: core 0
   1.281 +   int32           numRepetitionsWithNoWork;
   1.282 +   SlaveVP        *currVP;
   1.283 +   SchedSlot      *currSlot, **schedSlots;
   1.284 +   int32           currSlotIdx;
   1.285 +   int32          *addrOfMasterLock;
   1.286 +   SlaveVP        *thisCoresMasterVP;
   1.287 +   
   1.288 +   //===============  Initializations ===================
   1.289 +   schedSlots = _VMSMasterEnv->allSchedSlots[thisCoresIdx];
   1.290 +   currSlotIdx = 0; //start at slot 0, go up until one empty, then do master
   1.291 +   numRepetitionsWithNoWork = 0;
   1.292 +   addrOfMasterLock = &(_VMSMasterEnv->masterLock);
   1.293 +   thisCoresMasterVP = _VMSMasterEnv->masterVPs[thisCoresIdx];
   1.294 +   
   1.295 +   thisCoresIdx = 0; //sequential version
   1.296 +
   1.297 +      //Assembly that saves addr of label of return instr -- label in assmbly
   1.298 +   recordCoreCtlrReturnLabelAddr((void**)&(_VMSMasterEnv->coreCtlrReturnPt));
   1.299 +
   1.300 +   
   1.301 +   //====================== The Core Controller ======================
   1.302 +   while(1)
   1.303 +    {
   1.304 +      if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster;
   1.305 +      currSlot = schedSlots[ currSlotIdx ];
   1.306 +
   1.307 +      if( ! currSlot->needsSlaveAssigned ) //slot does have slave assigned
   1.308 +       { numRepetitionsWithNoWork = 0;     //reset B2B master count
   1.309 +         currSlotIdx ++;
   1.310 +         currVP = currSlot->slaveAssignedToSlot;
   1.311 +       }
   1.312 +      else //slot is empty, so switch to master
   1.313 +       {
   1.314 +       switchToMaster:
   1.315 +         currSlotIdx = 0; //doing switch to master, so start over at slot 0
   1.316 +         
   1.317 +         currVP = thisCoresMasterVP;
   1.318 +         
   1.319 +               MEAS__Capture_Pre_Master_Lock_Point;  //back to back because
   1.320 +               MEAS__Capture_Post_Master_Lock_Point; // sequential version
   1.321 +         
   1.322 +         if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD )
   1.323 +          { printf("Lots of reps w/o work\n");
   1.324 +            exit(0); //if no work, no way to ever get it in sequential!
   1.325 +          }
   1.326 +         numRepetitionsWithNoWork += 1;
   1.327 +       }
   1.328 +
   1.329 +      switchToSlv(currVP); //Slave suspend makes core "return" from this call
   1.330 +      flushRegisters();    //prevent GCC optimization from doing bad things 
   1.331 +
   1.332 +             MEAS__Capture_End_Susp_in_CoreCtlr_ForSys;
   1.333 +             
   1.334 +    } //while(1)
   1.335 + }
   1.336 +#endif
     2.1 --- a/CoreLoop.c	Mon Mar 12 05:38:07 2012 -0700
     2.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.3 @@ -1,332 +0,0 @@
     2.4 -/*
     2.5 - * Copyright 2010  OpenSourceStewardshipFoundation
     2.6 - *
     2.7 - * Licensed under BSD
     2.8 - */
     2.9 -
    2.10 -
    2.11 -#include "VMS.h"
    2.12 -
    2.13 -#include <stdlib.h>
    2.14 -#include <stdio.h>
    2.15 -#include <time.h>
    2.16 -
    2.17 -#include <pthread.h>
    2.18 -#include <sched.h>
    2.19 -
    2.20 -//=====================  Functions local to this file =======================
    2.21 -void *terminateCoreController(SlaveVP *currSlv);
    2.22 -inline void
    2.23 -doBackoff_for_TooLongToGetLock( int32  numTriesToGetLock, uint32 *seed1, 
    2.24 -                                uint32 *seed2 );
    2.25 -inline void
    2.26 -doBackoff_for_TooLongWithNoWork( int32   numRepsWithNoWork, uint32 *seed1, 
    2.27 -                                 uint32 *seed2 );
    2.28 -//===========================================================================
    2.29 -
    2.30 -
    2.31 -/*The Core Controller is logically "beneath" the masterVP and slave VPs.  Its
    2.32 - * job is to control which of those VPs the core animates.  Any time one of
    2.33 - * those VPs suspends, the suspend-primitive switches the core over to
    2.34 - * animating the core controller.  The core controller then follows a very
    2.35 - * basic pattern to choose which VP will get animated next, then switches
    2.36 - * the core over to animating that VP.  So, all VPs switch the core to
    2.37 - * core controller, which then chooses which VP the core animates next.
    2.38 - *
    2.39 - *The way the core controller decides which VP to switch the core to next is:
    2.40 - * 1) There are a number of "scheduling slots", which the master VP fills up
    2.41 - *    with slave VPs that are ready to be animated.  So, the core controller
    2.42 - *    just iterates through the scheduling slots.  When the next slot has a
    2.43 - *    slave VP in it, the core controller switches the core over to animate
    2.44 - *    that slave.
    2.45 - * 2) When the core controller checks a scheduling slot, and it's empty,
    2.46 - *    then the controller switches the core over to animating the master VP,
    2.47 - *    whose job is to find more slave VPs ready, and assign those to 
    2.48 - *    scheduling slots.
    2.49 - *
    2.50 - *So, in effect, a scheduling slot functions as another layer of virtual
    2.51 - * processor.  A slot has the logical meaning of being an animator that
    2.52 - * animates the slave assigned to it.  However, the core controller sits
    2.53 - * below the slots, and sequences down them, assigning the actual physical
    2.54 - * core to each slot, in turn.
    2.55 - *The reason for having the scheduling slots and core controller is to 
    2.56 - * amortize the overhead of switching to the master VP and running it.  With
    2.57 - * multiple scheduling slots, the time to switch-to-master and the code in
    2.58 - * the master loop is divided by the number of scheduling slots.
    2.59 - *The core controller and scheduling slots are not fundamental parts of VMS,
    2.60 - * but rather optimizations put into the shared-semantic-state version of
    2.61 - * VMS.  Other versions of VMS will not have a core controller nor scheduling
    2.62 - * slots.
    2.63 - * 
    2.64 - *The core controller "owns" the physical core, in effect, and is the 
    2.65 - * function given to the pthread creation call.  Hence, it contains code
    2.66 - * related to pthread startup, synchronizing the controllers to all start
    2.67 - * at the same time-point, and pinning the pthreads to physical cores.
    2.68 - * 
    2.69 - */
    2.70 -void *
    2.71 -coreController( void *paramsIn )
    2.72 - { 
    2.73 -   int32           thisCoresIdx;
    2.74 -   int32           numRepetitionsWithNoWork;
    2.75 -   SlaveVP        *currVP;
    2.76 -   SchedSlot      *currSlot, **schedSlots;
    2.77 -   int32           currSlotIdx;
    2.78 -   volatile int32 *addrOfMasterLock; //thing pointed to is volatile, not ptr
    2.79 -   SlaveVP        *thisCoresMasterVP;
    2.80 -      //Variables used for pthread related things
    2.81 -   ThdParams      *coreCtlrThdParams;
    2.82 -   cpu_set_t       coreMask;  //used during pinning pthread to CPU core
    2.83 -   int32           errorCode;
    2.84 -      //Variables used during measurements
    2.85 -   TSCountLowHigh  endSusp;
    2.86 -      //Variables used in random-backoff, for master-lock and waiting for work
    2.87 -   uint32_t seed1 = rand()%1000; // init random number generator for retries
    2.88 -   uint32_t seed2 = rand()%1000;
    2.89 -      //Variable for work-stealing -- a gate protects a critical section
    2.90 -   volatile GateStruc gate;      //on stack to avoid false-sharing
    2.91 -
    2.92 -   
    2.93 -   //===============  Initializations ===================
    2.94 -   coreCtlrThdParams = (ThdParams *)paramsIn;
    2.95 -   thisCoresIdx = coreCtlrThdParams->coreNum;
    2.96 -
    2.97 -   gate.gateClosed      = FALSE;
    2.98 -   gate.preGateProgress = 0;
    2.99 -   gate.waitProgress    = 0;
   2.100 -   gate.exitProgress    = 0;
   2.101 -   //TODO: pad these to prevent false-sharing, and fix the race at startup
   2.102 -   _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = (GateStruc*)&gate;
   2.103 -
   2.104 -      //Assembly that saves addr of label of return instr -- label in assmbly
   2.105 -   recordCoreCtlrReturnLabelAddr((void**)&(_VMSMasterEnv->coreCtlrReturnPt));
   2.106 -
   2.107 -   schedSlots = _VMSMasterEnv->allSchedSlots[thisCoresIdx];
   2.108 -   currSlotIdx = 0; //start at slot 0, go up until one empty, then do master
   2.109 -   numRepetitionsWithNoWork = 0;
   2.110 -   addrOfMasterLock = &(_VMSMasterEnv->masterLock);
   2.111 -   thisCoresMasterVP = _VMSMasterEnv->masterVPs[thisCoresIdx];
   2.112 -   
   2.113 -   //==================== pthread related stuff ======================
   2.114 -      //pin the pthread to the core
   2.115 -      //Linux requires pinning to be done inside the thread-function
   2.116 -      //Designate a core by a 1 in bit-position corresponding to the core
   2.117 -   CPU_ZERO(&coreMask); //initialize mask bits to zero
   2.118 -   CPU_SET(coreCtlrThdParams->coreNum,&coreMask); //set bit repr the coreNum
   2.119 -   pthread_t selfThd = pthread_self();
   2.120 -   errorCode =
   2.121 -   pthread_setaffinity_np( selfThd, sizeof(coreMask), &coreMask);
   2.122 -   if(errorCode){ printf("\n pinning thd to core failed \n"); exit(0); }
   2.123 -
   2.124 -      //make sure the controllers all start at same time, by making them wait
   2.125 -   pthread_mutex_lock(   &suspendLock );
   2.126 -   while( !(_VMSMasterEnv->setupComplete) )
   2.127 -    { pthread_cond_wait( &suspendCond, &suspendLock );
   2.128 -    }
   2.129 -   pthread_mutex_unlock( &suspendLock );
   2.130 -
   2.131 -   //====================== The Core Controller ======================
   2.132 -   while(1)  //An endless loop is just one way of doing the control structure
   2.133 -    {        //Assembly code switches the core between animating a VP and
   2.134 -             // animating this core controller.  The switch is done by
   2.135 -             // changing the stack-pointer and frame-pointer and then doing
   2.136 -             // an assembly jmp.  When reading this code, the effect is 
   2.137 -             // that the "switchToSlv()" at the end of the loop is sort of a
   2.138 -             // "warp in time" -- the core disappears inside this, jmps to
   2.139 -             // animating a VP, and when that VP suspends, the suspend
   2.140 -             // jmps back. This has the effect of "returning" from the
   2.141 -             // switchToSlv() call. Then control loops back to here.
   2.142 -             //Alternatively, the VP suspend primitive could just not bother
   2.143 -             // returning from switchToSlv, and instead jmp directly to here.
   2.144 -      
   2.145 -      if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster;
   2.146 -      currSlot = schedSlots[ currSlotIdx ];
   2.147 -
   2.148 -      
   2.149 -      if( ! currSlot->needsSlaveAssigned ) //slot does have slave assigned
   2.150 -       { numRepetitionsWithNoWork = 0;     //reset B2B master count
   2.151 -         currSlotIdx ++;
   2.152 -         currVP = currSlot->slaveAssignedToSlot;
   2.153 -       }
   2.154 -      else //slot is empty, so switch to master
   2.155 -       {
   2.156 -       switchToMaster:
   2.157 -         currSlotIdx = 0; //doing switch to master, so start over at slot 0
   2.158 -         currVP = NULL;
   2.159 -
   2.160 -               MEAS__Capture_Pre_Master_Lock_Point;
   2.161 -
   2.162 -         int numTriesToGetLock = 0; int gotLock = 0;
   2.163 -         while( currVP == NULL ) //keep going until get master lock
   2.164 -          { 
   2.165 -               //At this point, first thing to do is get lock.  But, want to
   2.166 -               // reduce lock contention from cores with no work, so first
   2.167 -               // check if this is a core with no work, and busy wait if so.
   2.168 -               //Then, if it's been way too long without work, yield pthread
   2.169 -            if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_BACKOFF)
   2.170 -               doBackoff_for_TooLongWithNoWork( numRepetitionsWithNoWork, &seed1, &seed2 );
   2.171 -            if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD )
   2.172 -             { numRepetitionsWithNoWork = 0; pthread_yield(); }
   2.173 -
   2.174 -               
   2.175 -               //Now, try to get the lock
   2.176 -            gotLock = __sync_bool_compare_and_swap( addrOfMasterLock,
   2.177 -                                                    UNLOCKED, LOCKED );
   2.178 -            if( gotLock )
   2.179 -             {    //At this point, have run out of slaves, so tried to get
   2.180 -                  // the master lock, and have successfully gotten it.
   2.181 -                  //So, set the currVP to this core's masterVP and break out
   2.182 -                  // of the get-lock loop.  Below, assembly code will switch
   2.183 -                  // the core over to animating the masterVP.  When it's 
   2.184 -                  // done, the masterVP will use assembly to switch the core
   2.185 -                  // back to animating this core controller
   2.186 -               currVP = thisCoresMasterVP;
   2.187 -               numRepetitionsWithNoWork += 1;
   2.188 -               break;  //end while -- have a VP to animate now
   2.189 -             }
   2.190 -               //Get here only when failed to get lock
   2.191 -
   2.192 -            numTriesToGetLock++;   //if too many, means too much contention
   2.193 -            if( numTriesToGetLock > NUM_TRIES_BEFORE_DO_BACKOFF ) 
   2.194 -               doBackoff_for_TooLongToGetLock( numTriesToGetLock, &seed1, &seed2 );
   2.195 -            if( numTriesToGetLock > MASTERLOCK_RETRIES_BEFORE_YIELD ) 
   2.196 -             { numTriesToGetLock = 0; pthread_yield(); }
   2.197 -          }
   2.198 -               MEAS__Capture_Post_Master_Lock_Point;
   2.199 -       }
   2.200 -
   2.201 -
   2.202 -      switchToSlv(currVP); //Slave suspend makes core "return" from this call
   2.203 -      flushRegisters();    //prevent GCC optimization from doing bad things 
   2.204 -
   2.205 -             MEAS__Capture_End_Susp_in_CoreCtlr_ForSys;
   2.206 -          
   2.207 -    }//while(1)
   2.208 - }
   2.209 -
   2.210 -
   2.211 -void *
   2.212 -terminateCoreController(SlaveVP *currSlv)
   2.213 - {
   2.214 -   //first free shutdown Slv that jumped here -- it first restores the
   2.215 -   // coreloop's stack, so addr of currSlv in stack frame is still correct
   2.216 -   VMS_int__dissipate_slaveVP( currSlv );
   2.217 -   pthread_exit( NULL );
   2.218 - }
   2.219 -
   2.220 -
   2.221 -/*Used by the backoff to pick a random amount of busy-wait.  Can't use the
   2.222 - * system rand because it takes much too long.
   2.223 - *Note, are passing pointers to the seeds, which are then modified
   2.224 - */
   2.225 -inline uint32_t
   2.226 -randomNumber(uint32_t* seed1, uint32_t* seed2)
   2.227 - {
   2.228 -	*seed1 = 36969 * (*seed1 & 65535) + (*seed1 >> 16);
   2.229 -	*seed2 = 18000 * (*seed2 & 65535) + (*seed2 >> 16);
   2.230 -	return (*seed1 << 16) + *seed2;
   2.231 - }
   2.232 -
   2.233 -/*Busy-wait for a random number of cycles -- chooses number of cycles 
   2.234 - * differently than for the too-many-tries-to-get-lock backoff
   2.235 - */
   2.236 -inline void
   2.237 -doBackoff_for_TooLongWithNoWork( int32   numRepsWithNoWork, uint32 *seed1, 
   2.238 -                                 uint32 *seed2 )
   2.239 - { int32 i, waitIterations;
   2.240 -   volatile double fakeWorkVar; //busy-wait fake work
   2.241 - 
   2.242 -   waitIterations = 
   2.243 -    randomNumber(seed1, seed2) % 
   2.244 -    (numRepsWithNoWork * numRepsWithNoWork * NUM_CORES);
   2.245 -   for( i = 0; i < waitIterations; i++ )
   2.246 -    { fakeWorkVar += (fakeWorkVar + 32.0) / 2.0; //busy-wait
   2.247 -    }
   2.248 - }
   2.249 -
   2.250 -/*Busy-waits for a random number of cycles -- chooses number of cycles 
   2.251 - * differently than for the no-work backoff
   2.252 - */
   2.253 -inline void
   2.254 -doBackoff_for_TooLongToGetLock( int32 numTriesToGetLock, uint32 *seed1, 
   2.255 -                                uint32 *seed2 )
   2.256 - { int32 i, waitIterations;
   2.257 -   volatile double fakeWorkVar; //busy-wait fake work
   2.258 -
   2.259 -   waitIterations = 
   2.260 -    randomNumber(seed1, seed2) % 
   2.261 -    (numTriesToGetLock * NUM_TRIES_TO_GET_LOCK_BACKOFF_WEIGHT);   
   2.262 -   //addToHist( wait_iterations, coreLoopThdParams->wait_iterations_hist );
   2.263 -   for( i = 0; i < waitIterations; i++ )
   2.264 -    { fakeWorkVar += (fakeWorkVar + 32.0) / 2.0; //busy-wait
   2.265 -    }
   2.266 - }
   2.267 -
   2.268 -
   2.269 -#ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE
   2.270 -
   2.271 -//===========================================================================
   2.272 -/*This sequential version does the same as threaded, except doesn't do the
   2.273 - * pin-threads part, nor the wait until setup complete and acquire master
   2.274 - * lock parts.
   2.275 - */
   2.276 -void *
   2.277 -coreCtlr_Seq( void *paramsIn )
   2.278 - {
   2.279 -   int32           thisCoresIdx;
   2.280 -   int32           numRepetitionsWithNoWork;
   2.281 -   SlaveVP        *currVP;
   2.282 -   SchedSlot      *currSlot, **schedSlots;
   2.283 -   int32           currSlotIdx;
   2.284 -   int32          *addrOfMasterLock;
   2.285 -   SlaveVP        *thisCoresMasterVP;
   2.286 -   
   2.287 -   //===============  Initializations ===================
   2.288 -   schedSlots = _VMSMasterEnv->allSchedSlots[thisCoresIdx];
   2.289 -   currSlotIdx = 0; //start at slot 0, go up until one empty, then do master
   2.290 -   numRepetitionsWithNoWork = 0;
   2.291 -   addrOfMasterLock = &(_VMSMasterEnv->masterLock);
   2.292 -   thisCoresMasterVP = _VMSMasterEnv->masterVPs[thisCoresIdx];
   2.293 -   
   2.294 -   thisCoresIdx = 0; //sequential version
   2.295 -
   2.296 -      //Assembly that saves addr of label of return instr -- label in assmbly
   2.297 -   recordCoreCtlrReturnLabelAddr((void**)&(_VMSMasterEnv->coreCtlrReturnPt));
   2.298 -
   2.299 -   
   2.300 -   //====================== The Core Controller ======================
   2.301 -   while(1)
   2.302 -    {
   2.303 -      if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster;
   2.304 -      currSlot = schedSlots[ currSlotIdx ];
   2.305 -
   2.306 -      if( ! currSlot->needsSlaveAssigned ) //slot does have slave assigned
   2.307 -       { numRepetitionsWithNoWork = 0;     //reset B2B master count
   2.308 -         currSlotIdx ++;
   2.309 -         currVP = currSlot->slaveAssignedToSlot;
   2.310 -       }
   2.311 -      else //slot is empty, so switch to master
   2.312 -       {
   2.313 -       switchToMaster:
   2.314 -         currSlotIdx = 0; //doing switch to master, so start over at slot 0
   2.315 -         
   2.316 -         currVP = thisCoresMasterVP;
   2.317 -         
   2.318 -               MEAS__Capture_Pre_Master_Lock_Point;  //back to back because
   2.319 -               MEAS__Capture_Post_Master_Lock_Point; // sequential version
   2.320 -         
   2.321 -         if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD )
   2.322 -          { printf("Lots of reps w/o work\n");
   2.323 -            exit(0); //if no work, no way to ever get it in sequential!
   2.324 -          }
   2.325 -         numRepetitionsWithNoWork += 1;
   2.326 -       }
   2.327 -
   2.328 -      switchToSlv(currVP); //Slave suspend makes core "return" from this call
   2.329 -      flushRegisters();    //prevent GCC optimization from doing bad things 
   2.330 -
   2.331 -             MEAS__Capture_End_Susp_in_CoreCtlr_ForSys;
   2.332 -             
   2.333 -    } //while(1)
   2.334 - }
   2.335 -#endif
     3.1 --- a/Defines/VMS_defs__MEAS.h	Mon Mar 12 05:38:07 2012 -0700
     3.2 +++ b/Defines/VMS_defs__MEAS.h	Tue Mar 13 10:02:06 2012 -0700
     3.3 @@ -6,8 +6,8 @@
     3.4   * 
     3.5   */
     3.6  
     3.7 -#ifndef  _VMS_DEFS_MEAS_H
     3.8 -#define	_VMS_DEFS_MEAS_H
     3.9 +#ifndef _VMS_DEFS_MEAS_H
    3.10 +#define _VMS_DEFS_MEAS_H
    3.11  #define _GNU_SOURCE
    3.12  
    3.13  //==================  Macros define types of meas want  =====================
    3.14 @@ -321,5 +321,5 @@
    3.15  
    3.16  
    3.17  //===========================================================================
    3.18 -#endif	/* _VMS_DEFS_H */
    3.19 +#endif	/* _VMS_DEFS_MEAS_H */
    3.20  
     4.1 --- a/Defines/VMS_defs__turn_on_and_off.h	Mon Mar 12 05:38:07 2012 -0700
     4.2 +++ b/Defines/VMS_defs__turn_on_and_off.h	Tue Mar 13 10:02:06 2012 -0700
     4.3 @@ -15,7 +15,7 @@
     4.4   * It still does co-routines and all the mechanisms are the same, it just
     4.5   * has only a single thread and animates Slvs one at a time
     4.6   */
     4.7 -//#define DEBUG__TURN_ON_SEQUENTIAL_MODE
     4.8 +#define DEBUG__TURN_ON_SEQUENTIAL_MODE
     4.9  
    4.10  
    4.11  /*turns on the probe-instrumentation in the application -- when not
     5.1 --- a/MasterLoop.c	Mon Mar 12 05:38:07 2012 -0700
     5.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.3 @@ -1,349 +0,0 @@
     5.4 -/*
     5.5 - * Copyright 2010  OpenSourceStewardshipFoundation
     5.6 - * 
     5.7 - * Licensed under BSD
     5.8 - */
     5.9 -
    5.10 -
    5.11 -
    5.12 -#include <stdio.h>
    5.13 -#include <stddef.h>
    5.14 -
    5.15 -#include "VMS.h"
    5.16 -
    5.17 -
    5.18 -//===========================================================================
    5.19 -void inline
    5.20 -stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
    5.21 -               SlaveVP *masterVP );
    5.22 -
    5.23 -//===========================================================================
    5.24 -
    5.25 -
    5.26 -
    5.27 -/*This code is animated by the virtual Master processor.
    5.28 - *
    5.29 - *Polls each sched slot exactly once, hands any requests made by a newly
    5.30 - * done slave to the "request handler" plug-in function
    5.31 - *
    5.32 - *Any slots that need a Slv assigned are given to the "schedule"
    5.33 - * plug-in function, which tries to assign a Slv (slave) to it.
    5.34 - *
    5.35 - *When all slots needing a processor have been given to the schedule plug-in,
    5.36 - * a fraction of the slaves successfully scheduled are put into the
    5.37 - * work queue, then a continuation of this function is put in, then the rest
    5.38 - * of the Slvs that were successfully scheduled.
    5.39 - *
    5.40 - *The first thing the continuation does is busy-wait until the previous
    5.41 - * animation completes.  This is because an (unlikely) continuation may
    5.42 - * sneak through queue before previous continuation is done putting second
    5.43 - * part of scheduled slaves in, which is the only race condition.
    5.44 - *
    5.45 - */
    5.46 -
    5.47 -/*May 29, 2010 -- birth a Master during init so that first core controller to
    5.48 - * start running gets it and does all the stuff for a newly born --
    5.49 - * from then on, will be doing continuation, but do suspension self
    5.50 - * directly at end of master loop
    5.51 - *So VMS_WL__init just births the master virtual processor same way it births
    5.52 - * all the others -- then does any extra setup needed and puts it into the
    5.53 - * work queue.
    5.54 - *However means have to make masterEnv a global static volatile the same way
    5.55 - * did with readyToAnimateQ in core controller.  -- for performance, put the
    5.56 - * jump to the core controller directly in here, and have it directly jump back.
    5.57 - *
    5.58 - *
    5.59 - *Aug 18, 2010 -- Going to a separate MasterVP for each core, to see if this
    5.60 - * avoids the suspected bug in the system stack that causes bizarre faults
    5.61 - * at random places in the system code.
    5.62 - *
    5.63 - *So, this function is coupled to each of the MasterVPs, -- meaning this
    5.64 - * function can't rely on a particular stack and frame -- each MasterVP that
    5.65 - * animates this function has a different one.
    5.66 - *
    5.67 - *At this point, the masterLoop does not write itself into the queue anymore,
    5.68 - * instead, the coreCtlr acquires the masterLock when it has nothing to
    5.69 - * animate, and then animates its own masterLoop.  However, still try to put
    5.70 - * several AppSlvs into the queue to amortize the startup cost of switching
    5.71 - * to the MasterVP.  Note, don't have to worry about latency of requests much
    5.72 - * because most requests generate work for same core -- only latency issue
    5.73 - * is case when other cores starved and one core's requests generate work
    5.74 - * for them -- so keep max in queue to 3 or 4..
    5.75 - */
    5.76 -void masterLoop( void *initData, SlaveVP *animatingSlv )
    5.77 - { 
    5.78 -   int32           slotIdx, numSlotsFilled;
    5.79 -   SlaveVP        *schedSlaveVP;
    5.80 -   SchedSlot      *currSlot, **schedSlots;
    5.81 -   MasterEnv      *masterEnv;
    5.82 -   VMSQueueStruc  *readyToAnimateQ;
    5.83 -   
    5.84 -   Sched_Assigner  slaveAssigner;
    5.85 -   RequestHandler  requestHandler;
    5.86 -   void           *semanticEnv;
    5.87 -
    5.88 -   int32           thisCoresIdx;
    5.89 -   SlaveVP        *masterVP;
    5.90 -   volatile        SlaveVP *volatileMasterVP;
    5.91 -   
    5.92 -   volatileMasterVP = animatingSlv;
    5.93 -   masterVP         = (SlaveVP*)volatileMasterVP; //used to force re-define after jmp
    5.94 -
    5.95 -      //First animation of each MasterVP will in turn animate this part
    5.96 -      // of setup code.. (Slv creator sets up the stack as if this function
    5.97 -      // was called normally, but actually get here by jmp)
    5.98 -      //So, setup values about stack ptr, jmp pt and all that
    5.99 -   //masterVP->resumeInstrPtr = &&masterLoopStartPt;
   5.100 -
   5.101 -
   5.102 -      //Note, got rid of writing the stack and frame ptr up here, because
   5.103 -      // only one
   5.104 -      // core can ever animate a given MasterVP, so don't need to communicate
   5.105 -      // new frame and stack ptr to the MasterVP storage before a second
   5.106 -      // version of that MasterVP can get animated on a different core.
   5.107 -      //Also got rid of the busy-wait.
   5.108 -
   5.109 -   
   5.110 -   //masterLoopStartPt:
   5.111 -   while(1){
   5.112 -       
   5.113 -      MEAS__Capture_Pre_Master_Point
   5.114 -
   5.115 -   masterEnv        = (MasterEnv*)_VMSMasterEnv;
   5.116 -   
   5.117 -      //GCC may optimize so doesn't always re-define from frame-storage
   5.118 -   masterVP         = (SlaveVP*)volatileMasterVP;  //just to make sure after jmp
   5.119 -   thisCoresIdx     = masterVP->coreAnimatedBy;
   5.120 -   readyToAnimateQ  = masterEnv->readyToAnimateQs[thisCoresIdx];
   5.121 -   schedSlots       = masterEnv->allSchedSlots[thisCoresIdx];
   5.122 -
   5.123 -   requestHandler   = masterEnv->requestHandler;
   5.124 -   slaveAssigner    = masterEnv->slaveAssigner;
   5.125 -   semanticEnv      = masterEnv->semanticEnv;
   5.126 -
   5.127 -
   5.128 -      //Poll each slot's Done flag
   5.129 -   numSlotsFilled = 0;
   5.130 -   for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++)
   5.131 -    {
   5.132 -      currSlot = schedSlots[ slotIdx ];
   5.133 -
   5.134 -      if( currSlot->workIsDone )
   5.135 -       {
   5.136 -         currSlot->workIsDone         = FALSE;
   5.137 -         currSlot->needsSlaveAssigned = TRUE;
   5.138 -
   5.139 -               MEAS__startReqHdlr;
   5.140 -               
   5.141 -            //process the requests made by the slave (held inside slave struc)
   5.142 -         (*requestHandler)( currSlot->slaveAssignedToSlot, semanticEnv );
   5.143 -         
   5.144 -               MEAS__endReqHdlr;
   5.145 -       }
   5.146 -      if( currSlot->needsSlaveAssigned )
   5.147 -       {    //give slot a new Slv
   5.148 -         schedSlaveVP =
   5.149 -          (*slaveAssigner)( semanticEnv, thisCoresIdx );
   5.150 -         
   5.151 -         if( schedSlaveVP != NULL )
   5.152 -          { currSlot->slaveAssignedToSlot = schedSlaveVP;
   5.153 -            schedSlaveVP->schedSlot       = currSlot;
   5.154 -            currSlot->needsSlaveAssigned  = FALSE;
   5.155 -            numSlotsFilled               += 1;
   5.156 -          }
   5.157 -       }
   5.158 -    }
   5.159 -
   5.160 -   
   5.161 -   #ifdef SYS__TURN_ON_WORK_STEALING
   5.162 -      //If no slots filled, means no more work, look for work to steal.
   5.163 -   if( numSlotsFilled == 0 )
   5.164 -    { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterVP );
   5.165 -    }
   5.166 -   #endif
   5.167 -
   5.168 -         MEAS__Capture_Post_Master_Point;
   5.169 -   
   5.170 -   masterSwitchToCoreCtlr(animatingSlv);
   5.171 -   flushRegisters();
   5.172 -   }//MasterLoop
   5.173 -
   5.174 -
   5.175 - }
   5.176 -
   5.177 -
   5.178 -
   5.179 -/*This has a race condition -- the coreloops are accessing their own queues
   5.180 - * at the same time that this work-stealer on a different core is trying to
   5.181 - */
   5.182 -void inline
   5.183 -stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
   5.184 -               SlaveVP *masterVP )
   5.185 - { 
   5.186 -   SlaveVP   *stolenSlv;
   5.187 -   int32        coreIdx, i;
   5.188 -   VMSQueueStruc *currQ;
   5.189 -
   5.190 -   stolenSlv = NULL;
   5.191 -   coreIdx = masterVP->coreAnimatedBy;
   5.192 -   for( i = 0; i < NUM_CORES -1; i++ )
   5.193 -    {
   5.194 -      if( coreIdx >= NUM_CORES -1 )
   5.195 -       { coreIdx = 0;
   5.196 -       }
   5.197 -      else
   5.198 -       { coreIdx++;
   5.199 -       }
   5.200 -      currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
   5.201 -      if( numInVMSQ( currQ ) > 0 )
   5.202 -       { stolenSlv = readVMSQ (currQ );
   5.203 -         break;
   5.204 -       }
   5.205 -    }
   5.206 -
   5.207 -   if( stolenSlv != NULL )
   5.208 -    { currSlot->slaveAssignedToSlot = stolenSlv;
   5.209 -      stolenSlv->schedSlot           = currSlot;
   5.210 -      currSlot->needsSlaveAssigned  = FALSE;
   5.211 -
   5.212 -      writeVMSQ( stolenSlv, readyToAnimateQ );
   5.213 -    }
   5.214 - }
   5.215 -
   5.216 -/*This algorithm makes the common case fast.  Make the coreloop passive,
   5.217 - * and show its progress.  Make the stealer control a gate that coreloop
   5.218 - * has to pass.
   5.219 - *To avoid interference, only one stealer at a time.  Use a global
   5.220 - * stealer-lock.
   5.221 - *
   5.222 - *The pattern is based on a gate -- stealer shuts the gate, then monitors
   5.223 - * to be sure any already past make it all the way out, before starting.
   5.224 - *So, have a "progress" measure just before the gate, then have two after it,
   5.225 - * one is in a "waiting room" outside the gate, the other is at the exit.
   5.226 - *Then, the stealer first shuts the gate, then checks the progress measure
   5.227 - * outside it, then looks to see if the progress measure at the exit is the
   5.228 - * same.  If yes, it knows the protected area is empty 'cause no other way
   5.229 - * to get in and the last to get in also exited.
   5.230 - *If the progress measure at the exit is not the same, then the stealer goes
   5.231 - * into a loop checking both the waiting-area and the exit progress-measures
   5.232 - * until one of them shows the same as the measure outside the gate.  Might
   5.233 - * as well re-read the measure outside the gate each go around, just to be
   5.234 - * sure.  It is guaranteed that one of the two will eventually match the one
   5.235 - * outside the gate.
   5.236 - *
   5.237 - *Here's an informal proof of correctness:
   5.238 - *The gate can be closed at any point, and have only four cases:
   5.239 - *  1) coreloop made it past the gate-closing but not yet past the exit
   5.240 - *  2) coreloop made it past the pre-gate progress update but not yet past
   5.241 - *     the gate,
   5.242 - *  3) coreloop is right before the pre-gate update
   5.243 - *  4) coreloop is past the exit and far from the pre-gate update.
   5.244 - *
   5.245 - * Covering the cases in reverse order,
   5.246 - *  4) is not a problem -- stealer will read pre-gate progress, see that it
   5.247 - *     matches exit progress, and the gate is closed, so stealer can proceed.
   5.248 - *  3) stealer will read pre-gate progress just after coreloop updates it..
   5.249 - *     so stealer goes into a loop until the coreloop causes wait-progress
   5.250 - *     to match pre-gate progress, so then stealer can proceed
   5.251 - *  2) same as 3..
   5.252 - *  1) stealer reads pre-gate progress, sees that it's different than exit,
   5.253 - *     so goes into loop until exit matches pre-gate, now it knows coreloop
   5.254 - *     is not in protected and cannot get back in, so can proceed.
   5.255 - *
   5.256 - *Implementation for the stealer:
   5.257 - *
   5.258 - *First, acquire the stealer lock -- only cores with no work to do will
   5.259 - * compete to steal, so not a big performance penalty having only one --
   5.260 - * will rarely have multiple stealers in a system with plenty of work -- and
   5.261 - * in a system with little work, it doesn't matter.
   5.262 - *
   5.263 - *Note, have single-reader, single-writer pattern for all variables used to
   5.264 - * communicate between stealer and victims
   5.265 - *
   5.266 - *So, scan the queues of the core controllers, until find non-empty.  Each core
   5.267 - * has its own list that it scans.  The list goes in order from closest to
   5.268 - * furthest core, so it steals first from close cores.  Later can add
   5.269 - * taking info from the app about overlapping footprints, and scan all the
   5.270 - * others then choose work with the most footprint overlap with the contents
   5.271 - * of this core's cache.
   5.272 - *
   5.273 - *Now, have a victim want to take work from.  So, shut the gate in that
   5.274 - * coreloop, by setting the "gate closed" var on its stack to TRUE.
   5.275 - *Then, read the core's pre-gate progress and compare to the core's exit
   5.276 - * progress.
   5.277 - *If same, can proceed to take work from the coreloop's queue.  When done,
   5.278 - * write FALSE to gate closed var.
   5.279 - *If different, then enter a loop that reads the pre-gate progress, then
   5.280 - * compares to exit progress then to wait progress.  When one of two
   5.281 - * matches, proceed.  Take work from the coreloop's queue.  When done,
   5.282 - * write FALSE to the gate closed var.
   5.283 - * 
   5.284 - */
   5.285 -void inline
   5.286 -gateProtected_stealWorkInto( SchedSlot *currSlot,
   5.287 -                             VMSQueueStruc *myReadyToAnimateQ,
   5.288 -                             SlaveVP *masterVP )
   5.289 - {
   5.290 -   SlaveVP     *stolenSlv;
   5.291 -   int32          coreIdx, i, haveAVictim, gotLock;
   5.292 -   VMSQueueStruc *victimsQ;
   5.293 -
   5.294 -   volatile GateStruc *vicGate;
   5.295 -   int32               coreMightBeInProtected;
   5.296 -
   5.297 -
   5.298 -
   5.299 -      //see if any other cores have work available to steal
   5.300 -   haveAVictim = FALSE;
   5.301 -   coreIdx = masterVP->coreAnimatedBy;
   5.302 -   for( i = 0; i < NUM_CORES -1; i++ )
   5.303 -    {
   5.304 -      if( coreIdx >= NUM_CORES -1 )
   5.305 -       { coreIdx = 0;
   5.306 -       }
   5.307 -      else
   5.308 -       { coreIdx++;
   5.309 -       }
   5.310 -      victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
   5.311 -      if( numInVMSQ( victimsQ ) > 0 )
   5.312 -       { haveAVictim = TRUE;
   5.313 -         vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ];
   5.314 -         break;
   5.315 -       }
   5.316 -    }
   5.317 -   if( !haveAVictim ) return;  //no work to steal, exit
   5.318 -
   5.319 -      //have a victim core, now get the stealer-lock
   5.320 -   gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock),
   5.321 -                                                          UNLOCKED, LOCKED );
   5.322 -   if( !gotLock ) return; //go back to core controller, which will re-start master
   5.323 -
   5.324 -
   5.325 -   //====== Start Gate-protection =======
   5.326 -   vicGate->gateClosed = TRUE;
   5.327 -   coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress;
   5.328 -   while( coreMightBeInProtected )
   5.329 -    {    //wait until sure
   5.330 -      if( vicGate->preGateProgress == vicGate->waitProgress )
   5.331 -         coreMightBeInProtected = FALSE;
   5.332 -      if( vicGate->preGateProgress == vicGate->exitProgress )
   5.333 -         coreMightBeInProtected = FALSE;
   5.334 -    }
   5.335 -
   5.336 -   stolenSlv = readVMSQ ( victimsQ );
   5.337 -
   5.338 -   vicGate->gateClosed = FALSE;
   5.339 -   //======= End Gate-protection  =======
   5.340 -
   5.341 -
   5.342 -   if( stolenSlv != NULL )  //victim could have been in protected and taken
   5.343 -    { currSlot->slaveAssignedToSlot = stolenSlv;
   5.344 -      stolenSlv->schedSlot           = currSlot;
   5.345 -      currSlot->needsSlaveAssigned  = FALSE;
   5.346 -
   5.347 -      writeVMSQ( stolenSlv, myReadyToAnimateQ );
   5.348 -    }
   5.349 -
   5.350 -      //unlock the work stealing lock
   5.351 -   _VMSMasterEnv->workStealingLock = UNLOCKED;
   5.352 - }
     6.1 --- a/Probes/probes.c	Mon Mar 12 05:38:07 2012 -0700
     6.2 +++ b/Probes/probes.c	Tue Mar 13 10:02:06 2012 -0700
     6.3 @@ -298,7 +298,6 @@
     6.4  VMS_impl__print_stats_of_all_probes()
     6.5   {
     6.6     forAllInDynArrayDo( _VMSMasterEnv->dynIntervalProbesInfo,
     6.7 -                       &VMS_impl__print_stats_of_probe );
     6.8 +                          (DynArrayFnPtr) &VMS_impl__print_stats_of_probe );
     6.9     fflush( stdout );
    6.10   }
    6.11 -typedef void  (*DynArrayFnPtr)  ( void * );  //fn has to cast void *
     7.1 --- a/Probes/probes.h	Mon Mar 12 05:38:07 2012 -0700
     7.2 +++ b/Probes/probes.h	Tue Mar 13 10:02:06 2012 -0700
     7.3 @@ -107,7 +107,7 @@
     7.4  VMS_impl__record_interval_end_in_probe( int32 probeID );
     7.5  
     7.6  void
     7.7 -VMS_impl__print_stats_of_probe( IntervalProbe *probe )
     7.8 +VMS_impl__print_stats_of_probe( IntervalProbe *probe );
     7.9  
    7.10  void
    7.11  VMS_impl__print_stats_of_all_probes();
     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/SchedulingMaster.c	Tue Mar 13 10:02:06 2012 -0700
     8.3 @@ -0,0 +1,349 @@
     8.4 +/*
     8.5 + * Copyright 2010  OpenSourceStewardshipFoundation
     8.6 + * 
     8.7 + * Licensed under BSD
     8.8 + */
     8.9 +
    8.10 +
    8.11 +
    8.12 +#include <stdio.h>
    8.13 +#include <stddef.h>
    8.14 +
    8.15 +#include "VMS.h"
    8.16 +
    8.17 +
    8.18 +//===========================================================================
    8.19 +void inline
    8.20 +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
    8.21 +               SlaveVP *masterVP );
    8.22 +
    8.23 +//===========================================================================
    8.24 +
    8.25 +
    8.26 +
    8.27 +/*This code is animated by the virtual Master processor.
    8.28 + *
    8.29 + *Polls each sched slot exactly once, hands any requests made by a newly
    8.30 + * done slave to the "request handler" plug-in function
    8.31 + *
    8.32 + *Any slots that need a Slv assigned are given to the "schedule"
    8.33 + * plug-in function, which tries to assign a Slv (slave) to it.
    8.34 + *
    8.35 + *When all slots needing a processor have been given to the schedule plug-in,
    8.36 + * a fraction of the slaves successfully scheduled are put into the
    8.37 + * work queue, then a continuation of this function is put in, then the rest
    8.38 + * of the Slvs that were successfully scheduled.
    8.39 + *
    8.40 + *The first thing the continuation does is busy-wait until the previous
    8.41 + * animation completes.  This is because an (unlikely) continuation may
    8.42 + * sneak through queue before previous continuation is done putting second
    8.43 + * part of scheduled slaves in, which is the only race condition.
    8.44 + *
    8.45 + */
    8.46 +
    8.47 +/*May 29, 2010 -- birth a Master during init so that first core controller to
    8.48 + * start running gets it and does all the stuff for a newly born --
    8.49 + * from then on, will be doing continuation, but do suspension self
    8.50 + * directly at end of master loop
    8.51 + *So VMS_WL__init just births the master virtual processor same way it births
    8.52 + * all the others -- then does any extra setup needed and puts it into the
    8.53 + * work queue.
    8.54 + *However means have to make masterEnv a global static volatile the same way
    8.55 + * did with readyToAnimateQ in core controller.  -- for performance, put the
    8.56 + * jump to the core controller directly in here, and have it directly jump back.
    8.57 + *
    8.58 + *
    8.59 + *Aug 18, 2010 -- Going to a separate MasterVP for each core, to see if this
    8.60 + * avoids the suspected bug in the system stack that causes bizarre faults
    8.61 + * at random places in the system code.
    8.62 + *
    8.63 + *So, this function is coupled to each of the MasterVPs, -- meaning this
    8.64 + * function can't rely on a particular stack and frame -- each MasterVP that
    8.65 + * animates this function has a different one.
    8.66 + *
    8.67 + *At this point, the schedulingMaster does not write itself into the queue anymore,
    8.68 + * instead, the coreCtlr acquires the masterLock when it has nothing to
    8.69 + * animate, and then animates its own schedulingMaster.  However, still try to put
    8.70 + * several AppSlvs into the queue to amortize the startup cost of switching
    8.71 + * to the MasterVP.  Note, don't have to worry about latency of requests much
    8.72 + * because most requests generate work for same core -- only latency issue
    8.73 + * is case when other cores starved and one core's requests generate work
    8.74 + * for them -- so keep max in queue to 3 or 4..
    8.75 + */
    8.76 +void schedulingMaster( void *initData, SlaveVP *animatingSlv )
    8.77 + { 
    8.78 +   int32           slotIdx, numSlotsFilled;
    8.79 +   SlaveVP        *schedSlaveVP;
    8.80 +   SchedSlot      *currSlot, **schedSlots;
    8.81 +   MasterEnv      *masterEnv;
    8.82 +   VMSQueueStruc  *readyToAnimateQ;
    8.83 +   
    8.84 +   Sched_Assigner  slaveAssigner;
    8.85 +   RequestHandler  requestHandler;
    8.86 +   void           *semanticEnv;
    8.87 +
    8.88 +   int32           thisCoresIdx;
    8.89 +   SlaveVP        *masterVP;
    8.90 +   volatile        SlaveVP *volatileMasterVP;
    8.91 +   
    8.92 +   volatileMasterVP = animatingSlv;
    8.93 +   masterVP         = (SlaveVP*)volatileMasterVP; //used to force re-define after jmp
    8.94 +
    8.95 +      //First animation of each MasterVP will in turn animate this part
    8.96 +      // of setup code.. (Slv creator sets up the stack as if this function
    8.97 +      // was called normally, but actually get here by jmp)
    8.98 +      //So, setup values about stack ptr, jmp pt and all that
    8.99 +   //masterVP->resumeInstrPtr = &&schedulingMasterStartPt;
   8.100 +
   8.101 +
   8.102 +      //Note, got rid of writing the stack and frame ptr up here, because
   8.103 +      // only one
   8.104 +      // core can ever animate a given MasterVP, so don't need to communicate
   8.105 +      // new frame and stack ptr to the MasterVP storage before a second
   8.106 +      // version of that MasterVP can get animated on a different core.
   8.107 +      //Also got rid of the busy-wait.
   8.108 +
   8.109 +   
   8.110 +   //schedulingMasterStartPt:
   8.111 +   while(1){
   8.112 +       
   8.113 +      MEAS__Capture_Pre_Master_Point
   8.114 +
   8.115 +   masterEnv        = (MasterEnv*)_VMSMasterEnv;
   8.116 +   
   8.117 +      //GCC may optimize so doesn't always re-define from frame-storage
   8.118 +   masterVP         = (SlaveVP*)volatileMasterVP;  //just to make sure after jmp
   8.119 +   thisCoresIdx     = masterVP->coreAnimatedBy;
   8.120 +   readyToAnimateQ  = masterEnv->readyToAnimateQs[thisCoresIdx];
   8.121 +   schedSlots       = masterEnv->allSchedSlots[thisCoresIdx];
   8.122 +
   8.123 +   requestHandler   = masterEnv->requestHandler;
   8.124 +   slaveAssigner    = masterEnv->slaveAssigner;
   8.125 +   semanticEnv      = masterEnv->semanticEnv;
   8.126 +
   8.127 +
   8.128 +      //Poll each slot's Done flag
   8.129 +   numSlotsFilled = 0;
   8.130 +   for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++)
   8.131 +    {
   8.132 +      currSlot = schedSlots[ slotIdx ];
   8.133 +
   8.134 +      if( currSlot->workIsDone )
   8.135 +       {
   8.136 +         currSlot->workIsDone         = FALSE;
   8.137 +         currSlot->needsSlaveAssigned = TRUE;
   8.138 +
   8.139 +               MEAS__startReqHdlr;
   8.140 +               
   8.141 +            //process the requests made by the slave (held inside slave struc)
   8.142 +         (*requestHandler)( currSlot->slaveAssignedToSlot, semanticEnv );
   8.143 +         
   8.144 +               MEAS__endReqHdlr;
   8.145 +       }
   8.146 +      if( currSlot->needsSlaveAssigned )
   8.147 +       {    //give slot a new Slv
   8.148 +         schedSlaveVP =
   8.149 +          (*slaveAssigner)( semanticEnv, thisCoresIdx );
   8.150 +         
   8.151 +         if( schedSlaveVP != NULL )
   8.152 +          { currSlot->slaveAssignedToSlot = schedSlaveVP;
   8.153 +            schedSlaveVP->schedSlot       = currSlot;
   8.154 +            currSlot->needsSlaveAssigned  = FALSE;
   8.155 +            numSlotsFilled               += 1;
   8.156 +          }
   8.157 +       }
   8.158 +    }
   8.159 +
   8.160 +   
   8.161 +   #ifdef SYS__TURN_ON_WORK_STEALING
   8.162 +      //If no slots filled, means no more work, look for work to steal.
   8.163 +   if( numSlotsFilled == 0 )
   8.164 +    { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterVP );
   8.165 +    }
   8.166 +   #endif
   8.167 +
   8.168 +         MEAS__Capture_Post_Master_Point;
   8.169 +   
   8.170 +   masterSwitchToCoreCtlr(animatingSlv);
   8.171 +   flushRegisters();
   8.172 +   }//MasterLoop
   8.173 +
   8.174 +
   8.175 + }
   8.176 +
   8.177 +
   8.178 +
   8.179 +/*This has a race condition -- the coreloops are accessing their own queues
   8.180 + * at the same time that this work-stealer on a different core is trying to
   8.181 + */
   8.182 +void inline
   8.183 +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
   8.184 +               SlaveVP *masterVP )
   8.185 + { 
   8.186 +   SlaveVP   *stolenSlv;
   8.187 +   int32        coreIdx, i;
   8.188 +   VMSQueueStruc *currQ;
   8.189 +
   8.190 +   stolenSlv = NULL;
   8.191 +   coreIdx = masterVP->coreAnimatedBy;
   8.192 +   for( i = 0; i < NUM_CORES -1; i++ )
   8.193 +    {
   8.194 +      if( coreIdx >= NUM_CORES -1 )
   8.195 +       { coreIdx = 0;
   8.196 +       }
   8.197 +      else
   8.198 +       { coreIdx++;
   8.199 +       }
   8.200 +      currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
   8.201 +      if( numInVMSQ( currQ ) > 0 )
   8.202 +       { stolenSlv = readVMSQ (currQ );
   8.203 +         break;
   8.204 +       }
   8.205 +    }
   8.206 +
   8.207 +   if( stolenSlv != NULL )
   8.208 +    { currSlot->slaveAssignedToSlot = stolenSlv;
   8.209 +      stolenSlv->schedSlot           = currSlot;
   8.210 +      currSlot->needsSlaveAssigned  = FALSE;
   8.211 +
   8.212 +      writeVMSQ( stolenSlv, readyToAnimateQ );
   8.213 +    }
   8.214 + }
   8.215 +
   8.216 +/*This algorithm makes the common case fast.  Make the coreloop passive,
   8.217 + * and show its progress.  Make the stealer control a gate that coreloop
   8.218 + * has to pass.
   8.219 + *To avoid interference, only one stealer at a time.  Use a global
   8.220 + * stealer-lock.
   8.221 + *
   8.222 + *The pattern is based on a gate -- stealer shuts the gate, then monitors
   8.223 + * to be sure any already past make it all the way out, before starting.
   8.224 + *So, have a "progress" measure just before the gate, then have two after it,
   8.225 + * one is in a "waiting room" outside the gate, the other is at the exit.
   8.226 + *Then, the stealer first shuts the gate, then checks the progress measure
   8.227 + * outside it, then looks to see if the progress measure at the exit is the
   8.228 + * same.  If yes, it knows the protected area is empty 'cause no other way
   8.229 + * to get in and the last to get in also exited.
   8.230 + *If the progress measure at the exit is not the same, then the stealer goes
   8.231 + * into a loop checking both the waiting-area and the exit progress-measures
   8.232 + * until one of them shows the same as the measure outside the gate.  Might
   8.233 + * as well re-read the measure outside the gate each go around, just to be
   8.234 + * sure.  It is guaranteed that one of the two will eventually match the one
   8.235 + * outside the gate.
   8.236 + *
   8.237 + *Here's an informal proof of correctness:
   8.238 + *The gate can be closed at any point, and have only four cases:
   8.239 + *  1) coreloop made it past the gate-closing but not yet past the exit
   8.240 + *  2) coreloop made it past the pre-gate progress update but not yet past
   8.241 + *     the gate,
   8.242 + *  3) coreloop is right before the pre-gate update
   8.243 + *  4) coreloop is past the exit and far from the pre-gate update.
   8.244 + *
   8.245 + * Covering the cases in reverse order,
   8.246 + *  4) is not a problem -- stealer will read pre-gate progress, see that it
   8.247 + *     matches exit progress, and the gate is closed, so stealer can proceed.
   8.248 + *  3) stealer will read pre-gate progress just after coreloop updates it..
   8.249 + *     so stealer goes into a loop until the coreloop causes wait-progress
   8.250 + *     to match pre-gate progress, so then stealer can proceed
   8.251 + *  2) same as 3..
   8.252 + *  1) stealer reads pre-gate progress, sees that it's different than exit,
   8.253 + *     so goes into loop until exit matches pre-gate, now it knows coreloop
   8.254 + *     is not in protected and cannot get back in, so can proceed.
   8.255 + *
   8.256 + *Implementation for the stealer:
   8.257 + *
   8.258 + *First, acquire the stealer lock -- only cores with no work to do will
   8.259 + * compete to steal, so not a big performance penalty having only one --
   8.260 + * will rarely have multiple stealers in a system with plenty of work -- and
   8.261 + * in a system with little work, it doesn't matter.
   8.262 + *
   8.263 + *Note, have single-reader, single-writer pattern for all variables used to
   8.264 + * communicate between stealer and victims
   8.265 + *
   8.266 + *So, scan the queues of the core controllers, until find non-empty.  Each core
   8.267 + * has its own list that it scans.  The list goes in order from closest to
   8.268 + * furthest core, so it steals first from close cores.  Later can add
   8.269 + * taking info from the app about overlapping footprints, and scan all the
   8.270 + * others then choose work with the most footprint overlap with the contents
   8.271 + * of this core's cache.
   8.272 + *
 *Now we have a victim to take work from.  So, shut the gate in that
 * victim's coreloop, by setting the "gate closed" var on its stack to TRUE.
   8.275 + *Then, read the core's pre-gate progress and compare to the core's exit
   8.276 + * progress.
   8.277 + *If same, can proceed to take work from the coreloop's queue.  When done,
   8.278 + * write FALSE to gate closed var.
   8.279 + *If different, then enter a loop that reads the pre-gate progress, then
   8.280 + * compares to exit progress then to wait progress.  When one of two
   8.281 + * matches, proceed.  Take work from the coreloop's queue.  When done,
   8.282 + * write FALSE to the gate closed var.
   8.283 + * 
   8.284 + */
   8.285 +void inline
   8.286 +gateProtected_stealWorkInto( SchedSlot *currSlot,
   8.287 +                             VMSQueueStruc *myReadyToAnimateQ,
   8.288 +                             SlaveVP *masterVP )
   8.289 + {
   8.290 +   SlaveVP     *stolenSlv;
   8.291 +   int32          coreIdx, i, haveAVictim, gotLock;
   8.292 +   VMSQueueStruc *victimsQ;
   8.293 +
   8.294 +   volatile GateStruc *vicGate;
   8.295 +   int32               coreMightBeInProtected;
   8.296 +
   8.297 +
   8.298 +
   8.299 +      //see if any other cores have work available to steal
   8.300 +   haveAVictim = FALSE;
   8.301 +   coreIdx = masterVP->coreAnimatedBy;
   8.302 +   for( i = 0; i < NUM_CORES -1; i++ )
   8.303 +    {
   8.304 +      if( coreIdx >= NUM_CORES -1 )
   8.305 +       { coreIdx = 0;
   8.306 +       }
   8.307 +      else
   8.308 +       { coreIdx++;
   8.309 +       }
   8.310 +      victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
   8.311 +      if( numInVMSQ( victimsQ ) > 0 )
   8.312 +       { haveAVictim = TRUE;
   8.313 +         vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ];
   8.314 +         break;
   8.315 +       }
   8.316 +    }
   8.317 +   if( !haveAVictim ) return;  //no work to steal, exit
   8.318 +
   8.319 +      //have a victim core, now get the stealer-lock
   8.320 +   gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock),
   8.321 +                                                          UNLOCKED, LOCKED );
   8.322 +   if( !gotLock ) return; //go back to core controller, which will re-start master
   8.323 +
   8.324 +
   8.325 +   //====== Start Gate-protection =======
   8.326 +   vicGate->gateClosed = TRUE;
   8.327 +   coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress;
   8.328 +   while( coreMightBeInProtected )
   8.329 +    {    //wait until sure
   8.330 +      if( vicGate->preGateProgress == vicGate->waitProgress )
   8.331 +         coreMightBeInProtected = FALSE;
   8.332 +      if( vicGate->preGateProgress == vicGate->exitProgress )
   8.333 +         coreMightBeInProtected = FALSE;
   8.334 +    }
   8.335 +
   8.336 +   stolenSlv = readVMSQ ( victimsQ );
   8.337 +
   8.338 +   vicGate->gateClosed = FALSE;
   8.339 +   //======= End Gate-protection  =======
   8.340 +
   8.341 +
   8.342 +   if( stolenSlv != NULL )  //victim could have been in protected and taken
   8.343 +    { currSlot->slaveAssignedToSlot = stolenSlv;
   8.344 +      stolenSlv->schedSlot           = currSlot;
   8.345 +      currSlot->needsSlaveAssigned  = FALSE;
   8.346 +
   8.347 +      writeVMSQ( stolenSlv, myReadyToAnimateQ );
   8.348 +    }
   8.349 +
   8.350 +      //unlock the work stealing lock
   8.351 +   _VMSMasterEnv->workStealingLock = UNLOCKED;
   8.352 + }
     9.1 --- a/VMS.h	Mon Mar 12 05:38:07 2012 -0700
     9.2 +++ b/VMS.h	Tue Mar 13 10:02:06 2012 -0700
     9.3 @@ -201,7 +201,7 @@
     9.4  
     9.5  void * coreController( void *paramsIn );  //standard PThreads fn prototype
     9.6  void * coreCtlr_Seq( void *paramsIn );  //standard PThreads fn prototype
     9.7 -void masterLoop( void *initData, SlaveVP *masterVP );
     9.8 +void schedulingMaster( void *initData, SlaveVP *masterVP );
     9.9  
    9.10  
    9.11  typedef struct
    9.12 @@ -215,10 +215,11 @@
    9.13  
    9.14  volatile MasterEnv      *_VMSMasterEnv __align_to_cacheline__;
    9.15  
    9.16 -pthread_t       coreCtlrThdHandles[ NUM_CORES ];  //pthread's virt-procr state
    9.17 +pthread_t       coreCtlrThdHandles[ NUM_CORES ]; //pthread's virt-procr state
    9.18  ThdParams      *coreCtlrThdParams [ NUM_CORES ];
    9.19 -pthread_mutex_t suspendLock  = PTHREAD_MUTEX_INITIALIZER;
    9.20 -pthread_cond_t  suspendCond  = PTHREAD_COND_INITIALIZER;
    9.21 +
    9.22 +pthread_mutex_t suspendLock;
    9.23 +pthread_cond_t  suspendCond;
    9.24  
    9.25  //=========================  Function Prototypes  ===========================
    9.26  
    10.1 --- a/VMS__startup_and_shutdown.c	Mon Mar 12 05:38:07 2012 -0700
    10.2 +++ b/VMS__startup_and_shutdown.c	Tue Mar 13 10:02:06 2012 -0700
    10.3 @@ -10,6 +10,7 @@
    10.4  #include <malloc.h>
    10.5  #include <inttypes.h>
    10.6  #include <sys/time.h>
    10.7 +#include <pthread.h>
    10.8  
    10.9  #include "VMS.h"
   10.10  
   10.11 @@ -43,7 +44,7 @@
   10.12   *    the master Slv into the work-queue, ready for first "call"
   10.13   * 2) Semantic layer then does its own init, which creates the seed virt
   10.14   *    slave inside the semantic layer, ready to schedule it when
   10.15 - *    asked by the first run of the masterLoop.
   10.16 + *    asked by the first run of the schedulingMaster.
   10.17   *
   10.18   *This part is bit weird because VMS really wants to be "always there", and
   10.19   * have applications attach and detach..  for now, this VMS is part of
   10.20 @@ -51,7 +52,7 @@
   10.21   *
   10.22   *The semantic layer is isolated from the VMS internals by making the
   10.23   * semantic layer do setup to a state that it's ready with its
   10.24 - * initial Slvs, ready to schedule them to slots when the masterLoop
   10.25 + * initial Slvs, ready to schedule them to slots when the schedulingMaster
   10.26   * asks.  Without this pattern, the semantic layer's setup would
   10.27   * have to modify slots directly to assign the initial virt-procrs, and put
   10.28   * them into the readyToAnimateQ itself, breaking the isolation completely.
   10.29 @@ -71,7 +72,7 @@
   10.30   {
   10.31     #ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE
   10.32        create_masterEnv();
   10.33 -      flushRegisters();  //? not sure why here -- merten added it..?
   10.34 +      printf( "\n\n Running in SEQUENTIAL mode \n\n" );
   10.35     #else
   10.36        create_masterEnv();
   10.37        create_the_coreCtlr_OS_threads();
   10.38 @@ -292,7 +293,7 @@
   10.39        readyToAnimateQs[ coreIdx ] = makeVMSQ();
   10.40        
   10.41           //Q: should give masterVP core-specific info as its init data?
   10.42 -      masterVPs[ coreIdx ] = VMS_int__create_slaveVP( (TopLevelFnPtr)&masterLoop, (void*)masterEnv );
   10.43 +      masterVPs[ coreIdx ] = VMS_int__create_slaveVP( (TopLevelFnPtr)&schedulingMaster, (void*)masterEnv );
   10.44        masterVPs[ coreIdx ]->coreAnimatedBy = coreIdx;
   10.45        allSchedSlots[ coreIdx ] = create_sched_slots(); //makes for one core
   10.46        _VMSMasterEnv->numMasterInARow[ coreIdx ] = 0;
   10.47 @@ -426,6 +427,8 @@
   10.48        //get lock, to lock out any threads still starting up -- they'll see
   10.49        // that setupComplete is true before entering while loop, and so never
   10.50        // wait on the condition
   10.51 +   pthread_mutex_init( &suspendLock, NULL );
   10.52 +   pthread_cond_init( &suspendCond, NULL );
   10.53     pthread_mutex_lock(     &suspendLock );
   10.54     _VMSMasterEnv->setupComplete = 1;
   10.55     pthread_mutex_unlock(   &suspendLock );
    11.1 --- a/VMS_primitive_data_types.h	Mon Mar 12 05:38:07 2012 -0700
    11.2 +++ b/VMS_primitive_data_types.h	Tue Mar 13 10:02:06 2012 -0700
    11.3 @@ -7,8 +7,8 @@
    11.4  
    11.5   */
    11.6  
    11.7 -#ifndef  _PRIMITIVE_DATA_TYPES_H
    11.8 -#define	_PRIMITIVE_DATA_TYPES_H
    11.9 +#ifndef _PRIMITIVE_DATA_TYPES_H
   11.10 +#define _PRIMITIVE_DATA_TYPES_H
   11.11  
   11.12  
   11.13  /*For portability, need primitive data types that have a well defined
    12.1 --- a/vmalloc.c	Mon Mar 12 05:38:07 2012 -0700
    12.2 +++ b/vmalloc.c	Tue Mar 13 10:02:06 2012 -0700
    12.3 @@ -15,7 +15,7 @@
    12.4  #include <math.h>
    12.5  
    12.6  #include "VMS.h"
    12.7 -#include "C_Libraries/Histogram/Histogram.h"
    12.8 +#include "Histogram/Histogram.h"
    12.9  
   12.10  #define MAX_UINT64 0xFFFFFFFFFFFFFFFF
   12.11