Mercurial > cgi-bin > hgwebdir.cgi > VMS > VMS_Implementations > VMS_impls > VMS__MC_shared_impl

changeset 216:712218cdc4ba Common_Ancestor
more changes to vms_impl and ssr_impl
author: Some Random Person <seanhalle@yahoo.com>
date: Sat, 10 Mar 2012 20:35:38 -0800
parents: 10a72bcedbf0
children: ecbdb74cad97
files: CoreLoop.c Defines/VMS_defs__DEBUG.h Defines/VMS_defs__HW_constants.h Defines/VMS_defs__MEAS.h Defines/VMS_defs__turn_on_and_off.h MasterLoop.c Probes/probes.c VMS.h VMS__WL.c VMS__int.c VMS__startup_and_shutdown.c
diffstat: 11 files changed, 260 insertions(+), 183 deletions(-) [+]
[-]

CoreLoop.c 321

Defines/VMS_defs__DEBUG.h 12

Defines/VMS_defs__HW_constants.h 3

Defines/VMS_defs__MEAS.h 4

Defines/VMS_defs__turn_on_and_off.h 2

MasterLoop.c 14

Probes/probes.c 4

VMS.h 19

VMS__WL.c 2

VMS__int.c 6

VMS__startup_and_shutdown.c 56
CoreLoop.c 321
Defines/VMS_defs__DEBUG.h 12
     1.1 --- a/CoreLoop.c	Fri Mar 09 22:30:26 2012 -0800
     1.2 +++ b/CoreLoop.c	Sat Mar 10 20:35:38 2012 -0800
     1.3 @@ -14,180 +14,255 @@
     1.4  #include <pthread.h>
     1.5  #include <sched.h>
     1.6  
     1.7 -void *terminateCoreLoop(SlaveVP *currSlv);
     1.8 +//=====================  Functions local to this file =======================
     1.9 +void *terminateCoreController(SlaveVP *currSlv);
    1.10 +//===========================================================================
    1.11  
    1.12 -/*This is the loop that runs in the OS Thread pinned to each core
    1.13 - *Get Slv from queue,
    1.14 - * save state of current animator, then load in state of Slv, using
    1.15 - * jmp instr to switch the program-counter state -- making the Slv
    1.16 - * the new animator.
    1.17 - *At some point, the Slv will suspend itself by saving out its
    1.18 - * animator state (stack ptr, frame ptr, program counter) and switching
    1.19 - * back to the OS Thread's animator state, which means restoring the
    1.20 - * stack and frame and jumping to the core loop start point.
    1.21 - *This cycle then repeats, until a special shutdown virtual processor is
    1.22 - * animated, which jumps to the end point at the bottom of core loop.
    1.23 +
    1.24 +/*The Core Controller is logically "beneath" the masterVP and slave VPs.  Its
    1.25 + * job is to control which of those VPs the core animates.  Any time one of
    1.26 + * those VPs suspends, the suspend-primitive switches the core over to
    1.27 + * animating the core controller.  The core controller then follows a very
    1.28 + * basic pattern to choose which VP will get animated next, then switches
    1.29 + * the core over to animating that VP.  So, all VPs switch the core to
    1.30 + * core controller, which then chooses which VP the core animates next.
    1.31 + *
    1.32 + *The way the core controller decides which VP to switch the core to next is:
    1.33 + * 1) There are a number of "scheduling slots", which the master VP fills up
    1.34 + *    with slave VPs that are ready to be animated.  So, the core controller
    1.35 + *    just iterates through the scheduling slots.  When the next slot has a
    1.36 + *    slave VP in it, the core controller switches the core over to animate
    1.37 + *    that slave.
    1.38 + * 2) When the core controller checks a scheduling slot, and it's empty,
    1.39 + *    then the controller switches the core over to animating the master VP,
    1.40 + *    whose job is to find more slave VPs ready, and assign those to 
    1.41 + *    scheduling slots.
    1.42 + *
    1.43 + *So, in effect, a scheduling slot functions as another layer of virtual
    1.44 + * processor.  A slot has the logical meaning of being an animator that
    1.45 + * animates the slave assigned to it.  However, the core controller sits
    1.46 + * below the slots, and sequences down them, assigning the actual physical
    1.47 + * core to each slot, in turn.
    1.48 + *The reason for having the scheduling slots and core controller is to 
    1.49 + * amortize the overhead of switching to the master VP and running it.  With
    1.50 + * multiple scheduling slots, the time to switch-to-master and the code in
    1.51 + * the master loop is divided by the number of scheduling slots.
    1.52 + *The core controller and scheduling slots are not fundamental parts of VMS,
    1.53 + * but rather optimizations put into the shared-semantic-state version of
    1.54 + * VMS.  Other versions of VMS will not have a core controller nor scheduling
    1.55 + * slots.
    1.56 + * 
    1.57 + *The core controller "owns" the physical core, in effect, and is the 
    1.58 + * function given to the pthread creation call.  Hence, it contains code
    1.59 + * related to pthread startup, synchronizing the controllers to all start
    1.60 + * at the same time-point, and pinning the pthreads to physical cores.
    1.61 + * 
    1.62   */
    1.63  void *
    1.64 -coreLoop( void *paramsIn )
    1.65 +coreController( void *paramsIn )
    1.66   { 
    1.67 -   ThdParams      *coreLoopThdParams;
    1.68 -   int32           thisCoresIdx, currSlotIdx;
    1.69 +   int32           thisCoresIdx;
    1.70 +   int32           numRepetitionsWithNoWork;
    1.71     SlaveVP        *currVP;
    1.72     SchedSlot      *currSlot, **schedSlots;
    1.73 -   cpu_set_t       coreMask;  //has 1 in bit positions of allowed cores
    1.74 +   int32           currSlotIdx;
    1.75 +   int32          *addrOfMasterLock;
    1.76 +   SlaveVP        *thisCoresMasterVP;
    1.77 +      //Variables used for pthread related things
    1.78 +   ThdParams      *coreCtlrThdParams;
    1.79 +   cpu_set_t       coreMask;  //used during pinning pthread to CPU core
    1.80     int32           errorCode;
    1.81 +      //Variables used during measurements
    1.82     TSCountLowHigh  endSusp;
    1.83 +      //Variables used in random-backoff, for master-lock and waiting for work
    1.84 +   volatile double workspace1,workspace2; //busy-wait fake work
    1.85 +   uint32_t seed1 = rand()%1000; // init random number generator for retries
    1.86 +   uint32_t seed2 = rand()%1000;
    1.87 +      //Variable for work-stealing -- a gate protects a critical section
    1.88 +   volatile GateStruc gate;      //on stack to avoid false-sharing
    1.89  
    1.90 -      //work-stealing struc on stack to prevent false-sharing in cache-line
    1.91 -   volatile GateStruc gate;
    1.92 -   //preGateProgress, waitProgress, exitProgress, gateClosed;
    1.93 -
    1.94 -
    1.95 -   coreLoopThdParams = (ThdParams *)paramsIn;
    1.96 -   thisCoresIdx = coreLoopThdParams->coreNum;
    1.97 +   
    1.98 +   //===============  Initializations ===================
    1.99 +   coreCtlrThdParams = (ThdParams *)paramsIn;
   1.100 +   thisCoresIdx = coreCtlrThdParams->coreNum;
   1.101  
   1.102     gate.gateClosed      = FALSE;
   1.103     gate.preGateProgress = 0;
   1.104     gate.waitProgress    = 0;
   1.105     gate.exitProgress    = 0;
   1.106 -   _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = (GateStruc*)&gate;//race @startup
   1.107 +   //TODO: pad these to prevent false-sharing, and fix the race at startup
   1.108 +   _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = (GateStruc*)&gate;
   1.109  
   1.110 -      //wait until signalled that setup is complete
   1.111 +      //Assembly that saves addr of label of return instr -- label in assembly
   1.112 +   recordCoreCtlrReturnLabelAddr((void**)&(_VMSMasterEnv->coreCtlrReturnPt));
   1.113 +
   1.114 +   schedSlots = _VMSMasterEnv->allSchedSlots[thisCoresIdx];
   1.115 +   currSlotIdx = 0; //start at slot 0, go up until one empty, then do master
   1.116 +   numRepetitionsWithNoWork = 0;
   1.117 +   addrOfMasterLock = &(_VMSMasterEnv->masterLock);
   1.118 +   thisCoresMasterVP = _VMSMasterEnv->masterVPs[thisCoresIdx];
   1.119 +   
   1.120 +   //==================== pthread related stuff ======================
   1.121 +      //pin the pthread to the core
   1.122 +      //Linux requires pinning to be done inside the thread-function
   1.123 +      //Designate a core by a 1 in bit-position corresponding to the core
   1.124 +   CPU_ZERO(&coreMask); //initialize mask bits to zero
   1.125 +   CPU_SET(coreCtlrThdParams->coreNum,&coreMask); //set bit repr the coreNum
   1.126 +   pthread_t selfThd = pthread_self();
   1.127 +   errorCode =
   1.128 +   pthread_setaffinity_np( selfThd, sizeof(coreMask), &coreMask);
   1.129 +   if(errorCode){ printf("\n pinning thd to core failed \n"); exit(0); }
   1.130 +
   1.131 +      //make sure the controllers all start at same time, by making them wait
   1.132     pthread_mutex_lock(   &suspendLock );
   1.133     while( !(_VMSMasterEnv->setupComplete) )
   1.134 -    {
   1.135 -      pthread_cond_wait( &suspendCond,
   1.136 -                         &suspendLock );
   1.137 +    { pthread_cond_wait( &suspendCond, &suspendLock );
   1.138      }
   1.139     pthread_mutex_unlock( &suspendLock );
   1.140  
   1.141 -      //printf( "\nCore unsuspended: %d\n", coreLoopThdParams->coreNum );
   1.142 +   //====================== The Core Controller ======================
   1.143 +   while(1)  //An endless loop is just one way of doing the control structure
   1.144 +    {        //Assembly code switches the core between animating a VP and
   1.145 +             // animating this core controller.  The switch is done by
   1.146 +             // changing the stack-pointer and frame-pointer and then doing
   1.147 +             // an assembly jmp.  When reading this code, the effect is 
   1.148 +             // that the "switchToSlv()" at the end of the loop is sort of a
   1.149 +             // "warp in time" -- the core disappears inside this, jmps to
   1.150 +             // animating a VP, and when that VP suspends, the suspend
   1.151 +             // jmps back. This has the effect of "returning" from the
   1.152 +             // switchToSlv() call. Then control loops back to here.
   1.153 +             //Alternatively, the VP suspend primitive could just not bother
   1.154 +             // returning from switchToSlv, and instead jmp directly to here.
   1.155 +      
   1.156 +      if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster;
   1.157 +      currSlot = schedSlots[ currSlotIdx ];
   1.158  
   1.159 -      //set thread affinity
   1.160 -      //Linux requires pinning thd to core inside thread-function
   1.161 -      //Designate a core by a 1 in bit-position corresponding to the core
   1.162 -   CPU_ZERO(&coreMask);
   1.163 -   CPU_SET(coreLoopThdParams->coreNum,&coreMask);
   1.164 -   //coreMask = 1L << coreLoopThdParams->coreNum;
   1.165 +      if( ! currSlot->needsSlaveAssigned ) //slot does have slave assigned
   1.166 +       { numRepetitionsWithNoWork = 0;     //reset B2B master count
   1.167 +         currSlotIdx ++;
   1.168 +         currVP = currSlot->slaveAssignedToSlot;
   1.169 +       }
   1.170 +      else //slot is empty, so switch to master
   1.171 +       {
   1.172 +       switchToMaster:
   1.173 +         currSlotIdx = 0; //doing switch to master, so start over at slot 0
   1.174 +         currVP = NULL;
   1.175  
   1.176 -   pthread_t selfThd = pthread_self();
   1.177 -   errorCode =
   1.178 -   pthread_setaffinity_np( selfThd, sizeof(coreMask), &coreMask);
   1.179 -   
   1.180 -   if(errorCode){ printf("\nset affinity failure\n"); exit(0); }
   1.181 +               MEAS__Capture_Pre_Master_Lock_Point;
   1.182  
   1.183 -   
   1.184 -      //Save return addr from stack into master-env for use later
   1.185 -   recordCoreLoopReturnLabelAddr((void**)&(_VMSMasterEnv->coreLoopReturnPt));
   1.186 +         int tries = 0; int gotLock = 0;
   1.187 +         while( currVP == NULL ) //keep going until get master lock
   1.188 +          { 
   1.189 +            gotLock = __sync_bool_compare_and_swap( addrOfMasterLock,
   1.190 +                                                    UNLOCKED, LOCKED );
   1.191 +            if( gotLock )
   1.192 +             {    //At this point, have run out of slaves, so tried to get
   1.193 +                  // the master lock, and have successfully gotten it.
   1.194 +                  //So, set the currVP to this core's masterVP and break out
   1.195 +                  // of the get-lock loop.  Below, assembly code will switch
   1.196 +                  // the core over to animating the masterVP.  When it's 
   1.197 +                  // done, the masterVP will use assembly to switch the core
   1.198 +                  // back to animating this core controller
   1.199 +               currVP = thisCoresMasterVP;
   1.200 +               if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD )
   1.201 +                {       DEBUG_Print( dbgB2BMaster,"Lots of reps w/o work\n");
   1.202 +                  pthread_yield();
   1.203 +                }
   1.204 +               numRepetitionsWithNoWork += 1;
   1.205 +               break;  //end while -- have a VP to animate now
   1.206 +             }
   1.207  
   1.208 -   currSlotIdx = 0; //start at slot 0, go up until one empty, then do master
   1.209 -   
   1.210 -   while(1){
   1.211 -   
   1.212 -   schedSlots = _VMSMasterEnv->allSchedSlots[thisCoresIdx];
   1.213 +            tries++;   //if too many, means too much contention
   1.214 +            if( tries > MASTERLOCK_RETRIES_BEFORE_YIELD ) { tries = 0; pthread_yield(); }
   1.215 +          }
   1.216 +               MEAS__Capture_Post_Master_Lock_Point;
   1.217 +       }
   1.218  
   1.219 -   if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster;
   1.220 -   
   1.221 -   currSlot = schedSlots[ currSlotIdx ];
   1.222 -   
   1.223 -   if( ! currSlot->needsSlaveAssigned ) //slot does have slave assigned
   1.224 -    { _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0; //reset B2B master count
   1.225 -      currSlotIdx ++;
   1.226 -      currVP = currSlot->slaveAssignedToSlot;
   1.227 -    }
   1.228 -   else //slot is empty, so switch to master
   1.229 -    {
   1.230 -   switchToMaster:
   1.231 -      currSlotIdx = 0; //switch to master, so start over at slot 0
   1.232 -      currVP = NULL;
   1.233 -   
   1.234 -            MEAS__Capture_Pre_Master_Lock_Point;
   1.235 -            
   1.236 -      int tries = 0; int gotLock = 0;
   1.237 -      while( currVP == NULL ) //keep going until get master lock
   1.238 -       { 
   1.239 -         gotLock = __sync_bool_compare_and_swap(&(_VMSMasterEnv->masterLock),
   1.240 -                                                          UNLOCKED, LOCKED );
   1.241 -         if( gotLock )
   1.242 -          {    //run own MasterVP -- jmps to coreLoops startPt when done
   1.243 -            currVP = _VMSMasterEnv->masterVPs[thisCoresIdx];
   1.244 -            if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 10 )
   1.245 -             {       DEBUG( dbgB2BMaster,"Many back to back MasterVPs\n");
   1.246 -               pthread_yield();
   1.247 -             }
   1.248 -            _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1;
   1.249 -            break;  //end while -- have a Slv to animate now
   1.250 -          }
   1.251  
   1.252 -         tries++;   //if too many, means master on other core taking too long
   1.253 -         if( tries > MASTERLOCK_RETRIES ) { tries = 0; pthread_yield(); }
   1.254 -       }
   1.255 -            MEAS__Capture_Post_Master_Lock_Point;
   1.256 -    }
   1.257 +      switchToSlv(currVP); //Slave suspend makes core "return" from this call
   1.258 +      flushRegisters();    //prevent GCC optimization from doing bad things 
   1.259  
   1.260 -   
   1.261 -   switchToSlv(currVP); //Slave suspend makes core "return" from this call
   1.262 -   flushRegisters();
   1.263 -   
   1.264 -          MEAS__capture_end_susp_in_CoreLoop_forSys;
   1.265 +             MEAS__Capture_End_Susp_in_CoreCtlr_ForSys;
   1.266            
   1.267 -   }//CoreLoop      
   1.268 +    }//while(1)
   1.269   }
   1.270  
   1.271  
   1.272  void *
   1.273 -terminateCoreLoop(SlaveVP *currSlv){
   1.274 +terminateCoreController(SlaveVP *currSlv)
   1.275 + {
   1.276     //first free shutdown Slv that jumped here -- it first restores the
   1.277     // coreloop's stack, so addr of currSlv in stack frame is still correct
   1.278     VMS_int__dissipate_slaveVP( currSlv );
   1.279     pthread_exit( NULL );
   1.280 -}
   1.281 + }
   1.282  
   1.283  
   1.284  
   1.285  #ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE
   1.286  
   1.287  //===========================================================================
   1.288 -/*This sequential version is exact same as threaded, except doesn't do the
   1.289 - * pin-threads part, nor the wait until setup complete part.
   1.290 +/*This sequential version does the same as threaded, except doesn't do the
   1.291 + * pin-threads part, nor the wait until setup complete and acquire master
   1.292 + * lock parts.
   1.293   */
   1.294  void *
   1.295 -coreLoop_Seq( void *paramsIn )
   1.296 +coreCtlr_Seq( void *paramsIn )
   1.297   {
   1.298 -   SlaveVP      *currSlv;
   1.299 -   VMSQueueStruc *readyToAnimateQ;
   1.300 +   int32           thisCoresIdx;
   1.301 +   int32           numRepetitionsWithNoWork;
   1.302 +   SlaveVP        *currVP;
   1.303 +   SchedSlot      *currSlot, **schedSlots;
   1.304 +   int32           currSlotIdx;
   1.305 +   int32          *addrOfMasterLock;
   1.306 +   SlaveVP        *thisCoresMasterVP;
   1.307     
   1.308 -   ThdParams      *coreLoopThdParams;
   1.309 -   int             thisCoresIdx;
   1.310 +   //===============  Initializations ===================
   1.311 +   schedSlots = _VMSMasterEnv->allSchedSlots[thisCoresIdx];
   1.312 +   currSlotIdx = 0; //start at slot 0, go up until one empty, then do master
   1.313 +   numRepetitionsWithNoWork = 0;
   1.314 +   addrOfMasterLock = &(_VMSMasterEnv->masterLock);
   1.315 +   thisCoresMasterVP = _VMSMasterEnv->masterVPs[thisCoresIdx];
   1.316     
   1.317 -   coreLoopThdParams = (ThdParams *)paramsIn;
   1.318 -//   thisCoresIdx = coreLoopThdParams->coreNum;
   1.319 -   thisCoresIdx = 0;
   1.320 +   thisCoresIdx = 0; //sequential version
   1.321  
   1.322 -   //Save the return address in the SwitchSlv function
   1.323 -   recordCoreLoopReturnLabelAddr(&(_VMSMasterEnv->coreLoopReturnPt));
   1.324 +      //Assembly that saves addr of label of return instr -- label in assembly
   1.325 +   recordCoreCtlrReturnLabelAddr((void**)&(_VMSMasterEnv->coreCtlrReturnPt));
   1.326  
   1.327     
   1.328 -   while(1){
   1.329 -      //Get virtual processor from queue
   1.330 -      //_VMSWorkQ must be a global, static volatile var, so not kept in reg,
   1.331 -      // which forces reloading the pointer after each jmp to this point
   1.332 -   readyToAnimateQ  = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx];
   1.333 -   currSlv = (SlaveVP *) readVMSQ( readyToAnimateQ );
   1.334 -   if( currSlv == NULL )
   1.335 -    { if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 )
   1.336 -       { printf("too many back to back MasterVP\n"); exit(1); }
   1.337 -      _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1;
   1.338 -      
   1.339 -      currSlv = _VMSMasterEnv->masterVPs[thisCoresIdx];
   1.340 -    }
   1.341 -   else
   1.342 -      _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0;
   1.343 +   //====================== The Core Controller ======================
   1.344 +   while(1)
   1.345 +    {
   1.346 +      if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster;
   1.347 +      currSlot = schedSlots[ currSlotIdx ];
   1.348  
   1.349 +      if( ! currSlot->needsSlaveAssigned ) //slot does have slave assigned
   1.350 +       { numRepetitionsWithNoWork = 0;     //reset B2B master count
   1.351 +         currSlotIdx ++;
   1.352 +         currVP = currSlot->slaveAssignedToSlot;
   1.353 +       }
   1.354 +      else //slot is empty, so switch to master
   1.355 +       {
   1.356 +       switchToMaster:
   1.357 +         currSlotIdx = 0; //doing switch to master, so start over at slot 0
   1.358 +         
   1.359 +         currVP = thisCoresMasterVP;
   1.360 +         
   1.361 +               MEAS__Capture_Pre_Master_Lock_Point;  //back to back because
   1.362 +               MEAS__Capture_Post_Master_Lock_Point; // sequential version
   1.363 +         
   1.364 +         if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD )
   1.365 +          { printf("Lots of reps w/o work\n");
   1.366 +            exit(0); //if no work, no way to ever get it in sequential!
   1.367 +          }
   1.368 +         numRepetitionsWithNoWork += 1;
   1.369 +       }
   1.370  
   1.371 -   switchToSlv( currSlv );
   1.372 -   flushRegisters();
   1.373 -   }
   1.374 +      switchToSlv(currVP); //Slave suspend makes core "return" from this call
   1.375 +      flushRegisters();    //prevent GCC optimization from doing bad things 
   1.376 +
   1.377 +             MEAS__Capture_End_Susp_in_CoreCtlr_ForSys;
   1.378 +             
   1.379 +    } //while(1)
   1.380   }
   1.381  #endif

     2.1 --- a/Defines/VMS_defs__DEBUG.h	Fri Mar 09 22:30:26 2012 -0800
     2.2 +++ b/Defines/VMS_defs__DEBUG.h	Sat Mar 10 20:35:38 2012 -0800
     2.3 @@ -13,16 +13,16 @@
     2.4  /*
     2.5   */
     2.6  #ifdef DEBUG__TURN_ON_DEBUG_MSGS
     2.7 -   #define DEBUG(  bool, msg) \
     2.8 +   #define DEBUG_Print(  bool, msg) \
     2.9        if( bool){ printf(msg); fflush(stdin);}
    2.10 -   #define DEBUG1( bool, msg, param)  \
    2.11 +   #define DEBUG_Print1( bool, msg, param)  \
    2.12        if(bool){printf(msg, param); fflush(stdin);}
    2.13 -   #define DEBUG2( bool, msg, p1, p2) \
    2.14 +   #define DEBUG_Print2( bool, msg, p1, p2) \
    2.15        if(bool) {printf(msg, p1, p2); fflush(stdin);}
    2.16  #else
    2.17 -   #define DEBUG(  bool, msg)         
    2.18 -   #define DEBUG1( bool, msg, param)  
    2.19 -   #define DEBUG2( bool, msg, p1, p2) 
    2.20 +   #define DEBUG_Print(  bool, msg)         
    2.21 +   #define DEBUG_Print1( bool, msg, param)  
    2.22 +   #define DEBUG_Print2( bool, msg, p1, p2) 
    2.23  #endif
    2.24  
    2.25  //============================= ERROR MSGs ============================

     3.1 --- a/Defines/VMS_defs__HW_constants.h	Fri Mar 09 22:30:26 2012 -0800
     3.2 +++ b/Defines/VMS_defs__HW_constants.h	Sat Mar 10 20:35:38 2012 -0800
     3.3 @@ -22,7 +22,8 @@
     3.4  
     3.5  #define MIN_WORK_UNIT_CYCLES 20000
     3.6  
     3.7 -#define MASTERLOCK_RETRIES 100
     3.8 +#define NUM_REPS_W_NO_WORK_BEFORE_YIELD 10
     3.9 +#define MASTERLOCK_RETRIES_BEFORE_YIELD 100
    3.10  
    3.11     // stack size in virtual processors created
    3.12  #define VIRT_PROCR_STACK_SIZE 0x8000 /* 32K */

     4.1 --- a/Defines/VMS_defs__MEAS.h	Fri Mar 09 22:30:26 2012 -0800
     4.2 +++ b/Defines/VMS_defs__MEAS.h	Sat Mar 10 20:35:38 2012 -0800
     4.3 @@ -270,7 +270,7 @@
     4.4      * Now, measures cycles from there to here
     4.5      * Master and Plugin will add this value to other trace-seg measures
     4.6      */
     4.7 -   #define MEAS__capture_end_susp_in_CoreLoop_forSys\
     4.8 +   #define MEAS__Capture_End_Susp_in_CoreCtlr_ForSys\
     4.9            saveTSCLowHigh(endSusp); \
    4.10            numCycles = endSusp.longVal - currVP->startSusp.longVal; \
    4.11            /*sanity check (400K is about 20K iters)*/ \
    4.12 @@ -288,7 +288,7 @@
    4.13     #define MEAS__startMasterLoop_forSys 
    4.14     #define MEAS__startReqHdlr_forSys
    4.15     #define MEAS__endMasterLoop_forSys
    4.16 -   #define MEAS__capture_end_susp_in_CoreLoop_forSys
    4.17 +   #define MEAS__Capture_End_Susp_in_CoreCtlr_ForSys
    4.18     #define MEAS__Print_Hists_for_System_Meas 
    4.19  #endif
    4.20  

     5.1 --- a/Defines/VMS_defs__turn_on_and_off.h	Fri Mar 09 22:30:26 2012 -0800
     5.2 +++ b/Defines/VMS_defs__turn_on_and_off.h	Sat Mar 10 20:35:38 2012 -0800
     5.3 @@ -21,7 +21,7 @@
     5.4  /*turns on the probe-instrumentation in the application -- when not
     5.5   * defined, the calls to the probe functions turn into comments
     5.6   */
     5.7 -//#define DEBUG__TURN_ON_DEBUG_MSGS
     5.8 +#define DEBUG__TURN_ON_DEBUG_MSGS
     5.9  //#define DEBUG__TURN_ON_ERROR_MSGS
    5.10  
    5.11  /*These defines turn types of bug messages on and off

     6.1 --- a/MasterLoop.c	Fri Mar 09 22:30:26 2012 -0800
     6.2 +++ b/MasterLoop.c	Sat Mar 10 20:35:38 2012 -0800
     6.3 @@ -41,7 +41,7 @@
     6.4   *
     6.5   */
     6.6  
     6.7 -/*May 29, 2010 -- birth a Master during init so that first core loop to
     6.8 +/*May 29, 2010 -- birth a Master during init so that first core controller to
     6.9   * start running gets it and does all the stuff for a newly born --
    6.10   * from then on, will be doing continuation, but do suspension self
    6.11   * directly at end of master loop
    6.12 @@ -49,8 +49,8 @@
    6.13   * all the others -- then does any extra setup needed and puts it into the
    6.14   * work queue.
    6.15   *However means have to make masterEnv a global static volatile the same way
    6.16 - * did with readyToAnimateQ in core loop.  -- for performance, put the
    6.17 - * jump to the core loop directly in here, and have it directly jump back.
    6.18 + * did with readyToAnimateQ in core controller.  -- for performance, put the
    6.19 + * jump to the core controller directly in here, and have it directly jump back.
    6.20   *
    6.21   *
    6.22   *Aug 18, 2010 -- Going to a separate MasterVP for each core, to see if this
    6.23 @@ -62,7 +62,7 @@
    6.24   * animates this function has a different one.
    6.25   *
    6.26   *At this point, the masterLoop does not write itself into the queue anymore,
    6.27 - * instead, the coreLoop acquires the masterLock when it has nothing to
    6.28 + * instead, the coreCtlr acquires the masterLock when it has nothing to
    6.29   * animate, and then animates its own masterLoop.  However, still try to put
    6.30   * several AppSlvs into the queue to amortize the startup cost of switching
    6.31   * to the MasterVP.  Note, don't have to worry about latency of requests much
    6.32 @@ -164,7 +164,7 @@
    6.33  
    6.34           MEAS__Capture_Post_Master_Point;
    6.35     
    6.36 -   masterSwitchToCoreLoop(animatingSlv);
    6.37 +   masterSwitchToCoreCtlr(animatingSlv);
    6.38     flushRegisters();
    6.39     }//MasterLoop
    6.40  
    6.41 @@ -260,7 +260,7 @@
    6.42   *Note, have single-reader, single-writer pattern for all variables used to
    6.43   * communicate between stealer and victims
    6.44   *
    6.45 - *So, scan the queues of the core loops, until find non-empty.  Each core
    6.46 + *So, scan the queues of the core controllers, until find non-empty.  Each core
    6.47   * has its own list that it scans.  The list goes in order from closest to
    6.48   * furthest core, so it steals first from close cores.  Later can add
    6.49   * taking info from the app about overlapping footprints, and scan all the
    6.50 @@ -316,7 +316,7 @@
    6.51        //have a victim core, now get the stealer-lock
    6.52     gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock),
    6.53                                                            UNLOCKED, LOCKED );
    6.54 -   if( !gotLock ) return; //go back to core loop, which will re-start master
    6.55 +   if( !gotLock ) return; //go back to core controller, which will re-start master
    6.56  
    6.57  
    6.58     //====== Start Gate-protection =======

     7.1 --- a/Probes/probes.c	Fri Mar 09 22:30:26 2012 -0800
     7.2 +++ b/Probes/probes.c	Sat Mar 10 20:35:38 2012 -0800
     7.3 @@ -199,7 +199,7 @@
     7.4  VMS_impl__record_interval_start_in_probe( int32 probeID )
     7.5   { IntervalProbe *probe;
     7.6  
     7.7 -         DEBUG( dbgProbes, "record start of interval\n" )
     7.8 +         DEBUG_Print( dbgProbes, "record start of interval\n" )
     7.9     probe = _VMSMasterEnv->intervalProbes[ probeID ];
    7.10  
    7.11        //record *start* point as last thing, after lookup
    7.12 @@ -253,7 +253,7 @@
    7.13  
    7.14  #endif
    7.15     
    7.16 -         DEBUG( dbgProbes, "record end of interval\n" )
    7.17 +         DEBUG_Print( dbgProbes, "record end of interval\n" )
    7.18   }
    7.19  
    7.20  

     8.1 --- a/VMS.h	Fri Mar 09 22:30:26 2012 -0800
     8.2 +++ b/VMS.h	Sat Mar 10 20:35:38 2012 -0800
     8.3 @@ -52,7 +52,8 @@
     8.4  
     8.5  //============================ HW Dependent Fns ================================
     8.6  
     8.7 -#include "VMS__HW_dependent.h"
     8.8 +#include "Hardware_Dependent/VMS__HW_measurement.h"
     8.9 +#include "Hardware_Dependent/VMS__primitives.h"
    8.10  
    8.11  //============================= Statistics ==================================
    8.12  
    8.13 @@ -114,9 +115,9 @@
    8.14     void       *framePtr;
    8.15     void       *resumeInstrPtr;
    8.16     
    8.17 -   void       *coreLoopStartPt;  //allows proto-runtime to be linked later
    8.18 -   void       *coreLoopFramePtr; //restore before jmp back to core loop
    8.19 -   void       *coreLoopStackPtr; //restore before jmp back to core loop
    8.20 +   void       *coreCtlrStartPt;  //allows proto-runtime to be linked later
    8.21 +   void       *coreCtlrFramePtr; //restore before jmp back to core controller
    8.22 +   void       *coreCtlrStackPtr; //restore before jmp back to core controller
    8.23  
    8.24     SchedSlot  *schedSlot;
    8.25     VMSReqst   *requests;
    8.26 @@ -151,7 +152,7 @@
    8.27     MallocArrays   *freeLists;
    8.28     int32            amtOfOutstandingMem; //total currently allocated
    8.29  
    8.30 -   void            *coreLoopReturnPt;//addr to jump to to re-enter coreLoop
    8.31 +   void            *coreCtlrReturnPt;//addr to jump to to re-enter coreCtlr
    8.32  
    8.33     int32            setupComplete;
    8.34     int32            numMasterInARow[NUM_CORES];//detect back-to-back masterVP
    8.35 @@ -198,8 +199,8 @@
    8.36  
    8.37  //=======================  OS Thread related  ===============================
    8.38  
    8.39 -void * coreLoop( void *paramsIn );  //standard PThreads fn prototype
    8.40 -void * coreLoop_Seq( void *paramsIn );  //standard PThreads fn prototype
    8.41 +void * coreController( void *paramsIn );  //standard PThreads fn prototype
    8.42 +void * coreCtlr_Seq( void *paramsIn );  //standard PThreads fn prototype
    8.43  void masterLoop( void *initData, SlaveVP *masterVP );
    8.44  
    8.45  
    8.46 @@ -214,8 +215,8 @@
    8.47  
    8.48  volatile MasterEnv      *_VMSMasterEnv __align_to_cacheline__;
    8.49  
    8.50 -pthread_t       coreLoopThdHandles[ NUM_CORES ];  //pthread's virt-procr state
    8.51 -ThdParams      *coreLoopThdParams [ NUM_CORES ];
    8.52 +pthread_t       coreCtlrThdHandles[ NUM_CORES ];  //pthread's virt-procr state
    8.53 +ThdParams      *coreCtlrThdParams [ NUM_CORES ];
    8.54  pthread_mutex_t suspendLock   = PTHREAD_MUTEX_INITIALIZER;
    8.55  pthread_cond_t  suspendCond  = PTHREAD_COND_INITIALIZER;
    8.56  

     9.1 --- a/VMS__WL.c	Fri Mar 09 22:30:26 2012 -0800
     9.2 +++ b/VMS__WL.c	Sat Mar 10 20:35:38 2012 -0800
     9.3 @@ -47,7 +47,7 @@
     9.4   * does the work of freeing memory and removing the processor from the
     9.5   * semantic environment's data structures.
     9.6   *The request handler also is what figures out when to shutdown the VMS
     9.7 - * system -- which causes all the core loop threads to die, and returns from
     9.8 + * system -- which causes all the core controller threads to die, and returns from
     9.9   * the call that started up VMS to perform the work.
    9.10   *
    9.11   *This form is a bit misleading to understand if one is trying to figure out

    10.1 --- a/VMS__int.c	Fri Mar 09 22:30:26 2012 -0800
    10.2 +++ b/VMS__int.c	Sat Mar 10 20:35:38 2012 -0800
    10.3 @@ -53,7 +53,7 @@
    10.4  /*there is a label inside this function -- save the addr of this label in
    10.5   * the callingSlv struc, as the pick-up point from which to start the next
    10.6   * work-unit for that slave.  If turns out have to save registers, then
    10.7 - * save them in the slave struc too.  Then do assembly jump to the CoreLoop's
    10.8 + * save them in the slave struc too.  Then do assembly jump to the CoreCtlr's
    10.9   * "done with work-unit" label.  The slave struc is in the request in the
   10.10   * slave that animated the just-ended work-unit, so all the state is saved
   10.11   * there, and will get passed along, inside the request handler, to the
   10.12 @@ -64,7 +64,7 @@
   10.13   { 
   10.14  
   10.15        //The request to master will cause this suspended Slv to get
   10.16 -      // scheduled again at some future point -- to resume, core loop jumps
   10.17 +      // scheduled again at some future point -- to resume, core ctlr jumps
   10.18        // to the resume point (below), which causes restore of saved regs and
   10.19        // "return" from this call.
   10.20     //animatingSlv->resumeInstrPtr = &&ResumePt;
   10.21 @@ -73,7 +73,7 @@
   10.22     animatingSlv->schedSlot->workIsDone = TRUE;
   10.23  
   10.24           MEAS__Capture_Pre_Susp_Point;
   10.25 -   switchToCoreLoop(animatingSlv);
   10.26 +   switchToCoreCtlr(animatingSlv);
   10.27     flushRegisters();
   10.28           MEAS__Capture_Post_Susp_Point;
   10.29  		 

    11.1 --- a/VMS__startup_and_shutdown.c	Fri Mar 09 22:30:26 2012 -0800
    11.2 +++ b/VMS__startup_and_shutdown.c	Sat Mar 10 20:35:38 2012 -0800
    11.3 @@ -27,7 +27,7 @@
    11.4  create_masterEnv();
    11.5  
    11.6  void
    11.7 -create_the_coreLoop_OS_threads();
    11.8 +create_the_coreCtlr_OS_threads();
    11.9  
   11.10  MallocProlog *
   11.11  create_free_list();
   11.12 @@ -74,7 +74,7 @@
   11.13        flushRegisters();  //? not sure why here -- merten added it..?
   11.14     #else
   11.15        create_masterEnv();
   11.16 -      create_the_coreLoop_OS_threads();
   11.17 +      create_the_coreCtlr_OS_threads();
   11.18     #endif
   11.19   }
   11.20  
   11.21 @@ -155,7 +155,7 @@
   11.22     #ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE
   11.23        //Nothing else to create for sequential mode
   11.24     #else
   11.25 -      create_the_coreLoop_OS_threads();
   11.26 +      create_the_coreCtlr_OS_threads();
   11.27     #endif    
   11.28   }
   11.29  */
   11.30 @@ -277,7 +277,7 @@
   11.31     
   11.32     masterEnv     = (MasterEnv*)_VMSMasterEnv;
   11.33     
   11.34 -      //Make a readyToAnimateQ for each core loop
   11.35 +      //Make a readyToAnimateQ for each core controller
   11.36     readyToAnimateQs = VMS_int__malloc( NUM_CORES * sizeof(VMSQueueStruc *) );
   11.37     masterVPs        = VMS_int__malloc( NUM_CORES * sizeof(SlaveVP *) );
   11.38  
   11.39 @@ -359,7 +359,7 @@
   11.40  
   11.41  
   11.42  void
   11.43 -create_the_coreLoop_OS_threads()
   11.44 +create_the_coreCtlr_OS_threads()
   11.45   {
   11.46     //========================================================================
   11.47     //                      Create the Threads
   11.48 @@ -367,19 +367,19 @@
   11.49  
   11.50        //Need the threads to be created suspended, and wait for a signal
   11.51        // before proceeding -- gives time after creating to initialize other
   11.52 -      // stuff before the coreLoops set off.
   11.53 +      // stuff before the coreCtlrs set off.
   11.54     _VMSMasterEnv->setupComplete = 0;
   11.55  
   11.56 -      //Make the threads that animate the core loops
   11.57 +      //Make the threads that animate the core controllers
   11.58     for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ )
   11.59 -    { coreLoopThdParams[coreIdx]          = VMS_int__malloc( sizeof(ThdParams) );
   11.60 -      coreLoopThdParams[coreIdx]->coreNum = coreIdx;
   11.61 +    { coreCtlrThdParams[coreIdx]          = VMS_int__malloc( sizeof(ThdParams) );
   11.62 +      coreCtlrThdParams[coreIdx]->coreNum = coreIdx;
   11.63  
   11.64        retCode =
   11.65 -      pthread_create( &(coreLoopThdHandles[coreIdx]),
   11.66 +      pthread_create( &(coreCtlrThdHandles[coreIdx]),
   11.67                          thdAttrs,
   11.68 -                       &coreLoop,
   11.69 -               (void *)(coreLoopThdParams[coreIdx]) );
   11.70 +                       &coreController,
   11.71 +               (void *)(coreCtlrThdParams[coreIdx]) );
   11.72        if(retCode){printf("ERROR creating thread: %d\n", retCode); exit(1);}
   11.73      }
   11.74   }
   11.75 @@ -412,17 +412,17 @@
   11.76   { 
   11.77  #ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE
   11.78     /*Only difference between version with an OS thread pinned to each core and
   11.79 -    * the sequential version of VMS is VMS__init_Seq, this, and coreLoop_Seq.
   11.80 +    * the sequential version of VMS is VMS__init_Seq, this, and coreCtlr_Seq.
   11.81      */
   11.82           //Instead of un-suspending threads, just call the one and only
   11.83 -         // core loop (sequential version), in the main thread.
   11.84 -      coreLoop_Seq( NULL );
   11.85 +         // core ctlr (sequential version), in the main thread.
   11.86 +      coreCtlr_Seq( NULL );
   11.87        flushRegisters();
   11.88  #else
   11.89     int coreIdx;
   11.90 -      //Start the core loops running
   11.91 +      //Start the core controllers running
   11.92     
   11.93 -      //tell the core loop threads that setup is complete
   11.94 +      //tell the core controller threads that setup is complete
   11.95        //get lock, to lock out any threads still starting up -- they'll see
   11.96        // that setupComplete is true before entering while loop, and so never
   11.97        // wait on the condition
   11.98 @@ -435,7 +435,7 @@
   11.99        //wait for all to complete
  11.100     for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ )
  11.101      {
  11.102 -      pthread_join( coreLoopThdHandles[coreIdx], NULL );
  11.103 +      pthread_join( coreCtlrThdHandles[coreIdx], NULL );
  11.104      }
  11.105     
  11.106        //NOTE: do not clean up VMS env here -- semantic layer has to have
  11.107 @@ -454,7 +454,7 @@
  11.108  
  11.109  
  11.110  /*This is called by the semantic layer's request handler when it decides its
  11.111 - * time to shut down the VMS system.  Calling this causes the core loop OS
  11.112 + * time to shut down the VMS system.  Calling this causes the core controller OS
  11.113   * threads to exit, which unblocks the entry-point function that started up
  11.114   * VMS, and allows it to grab the result and return to the original single-
  11.115   * threaded application.
  11.116 @@ -469,7 +469,7 @@
  11.117   * locations it needs, and give ownership to masterVP.  Then, they will be
  11.118   * automatically freed.
  11.119   *
  11.120 - *In here,create one core-loop shut-down processor for each core loop and put
   11.121 + *In here, create one core-controller shut-down processor for each core controller and put
  11.122   * them all directly into the readyToAnimateQ.
  11.123   *Note, this function can ONLY be called after the semantic environment no
  11.124   * longer cares if AppSlvs get animated after the point this is called.  In
  11.125 @@ -482,7 +482,7 @@
  11.126   { int coreIdx;
  11.127     SlaveVP *shutDownSlv;
  11.128  
  11.129 -      //create the shutdown processors, one for each core loop -- put them
  11.130 +      //create the shutdown processors, one for each core controller -- put them
  11.131        // directly into the Q -- each core will die when gets one
  11.132     for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ )
  11.133      {    //Note, this is running in the master
  11.134 @@ -492,26 +492,26 @@
  11.135   }
  11.136  
  11.137  
  11.138 -/*Am trying to be cute, avoiding IF statement in coreLoop that checks for
  11.139 +/*Am trying to be cute, avoiding IF statement in coreCtlr that checks for
  11.140   * a special shutdown slaveVP.  Ended up with extra-complex shutdown sequence.
  11.141   *This function has the sole purpose of setting the stack and framePtr
  11.142 - * to the coreLoop's stack and framePtr.. it does that then jumps to the
  11.143 - * core loop's shutdown point -- might be able to just call Pthread_exit
  11.144 + * to the coreCtlr's stack and framePtr.. it does that then jumps to the
   11.145 + * core ctlr's shutdown point -- might be able to just call pthread_exit
  11.146   * from here, but am going back to the pthread's stack and setting everything
  11.147   * up just as if it never jumped out, before calling pthread_exit.
  11.148 - *The end-point of core loop will free the stack and so forth of the
  11.149 + *The end-point of core ctlr will free the stack and so forth of the
  11.150   * processor that animates this function, (this fn is transfering the
  11.151   * animator of the AppSlv that is in turn animating this function over
  11.152 - * to core loop function -- note that this slices out a level of virtual
  11.153 + * to core controller function -- note that this slices out a level of virtual
  11.154   * processors).
  11.155   */
  11.156  void
  11.157  endOSThreadFn( void *initData, SlaveVP *animatingSlv )
  11.158   { 
  11.159     #ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE
  11.160 -    asmTerminateCoreLoopSeq(animatingSlv);
  11.161 +    asmTerminateCoreCtlrSeq(animatingSlv);
  11.162     #else
  11.163 -    asmTerminateCoreLoop(animatingSlv);
  11.164 +    asmTerminateCoreCtlr(animatingSlv);
  11.165     #endif
  11.166   }
  11.167