changeset 55:3bac84e4e56e

Works with correct matrix mult (Nov 4) -- switch-animators macros, many updates. Changed all queues back to VMSQ variants; #defines correct, protected, work-stealing, with compiler switch in and out.
author Me
date Thu, 04 Nov 2010 18:13:18 -0700
parents f8508572f3de
children 26d53313a8f2
files CoreLoop.c MasterLoop.c SwitchAnimators.h VMS.c VMS.h probes.c vmalloc.c
diffstat 7 files changed, 442 insertions(+), 183 deletions(-) [+]
line diff
     1.1 --- a/CoreLoop.c	Tue Nov 02 16:43:01 2010 -0700
     1.2 +++ b/CoreLoop.c	Thu Nov 04 18:13:18 2010 -0700
     1.3 @@ -34,13 +34,24 @@
     1.4     ThdParams      *coreLoopThdParams;
     1.5     int             thisCoresIdx;
     1.6     VirtProcr      *currPr;
     1.7 -   SRSWQueueStruc *readyToAnimateQ;
     1.8 +   VMSQueueStruc *readyToAnimateQ;
     1.9     unsigned long   coreMask;  //has 1 in bit positions of allowed cores
    1.10     int             errorCode;
    1.11 -   
    1.12 +
    1.13 +      //work-stealing struc on stack to prevent false-sharing in cache-line
    1.14 +   volatile GateStruc gate;
    1.15 +   //preGateProgress, waitProgress, exitProgress, gateClosed;
    1.16 +
    1.17 +
    1.18     coreLoopThdParams = (ThdParams *)paramsIn;
    1.19     thisCoresIdx = coreLoopThdParams->coreNum;
    1.20  
    1.21 +   gate.gateClosed      = FALSE;
    1.22 +   gate.preGateProgress = 0;
    1.23 +   gate.waitProgress    = 0;
    1.24 +   gate.exitProgress    = 0;
    1.25 +   _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = &gate;//race @startup
    1.26 +
    1.27        //wait until signalled that setup is complete
    1.28     pthread_mutex_lock(   &suspendLock );
    1.29     while( !(_VMSMasterEnv->setupComplete) )
    1.30 @@ -87,32 +98,38 @@
    1.31        // which forces reloading the pointer after each jmp to this point
    1.32     readyToAnimateQ  = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx];
    1.33  
    1.34 -   currPr = (VirtProcr *) readSRSWQ_NonBlocking( readyToAnimateQ );
    1.35 -   
    1.36 +   #ifdef USE_WORK_STEALING
    1.37 +      //Alg for work-stealing designed to make common case fast.  Comment
    1.38 +      // in stealer code explains.
    1.39 +   gate.preGateProgress++;
    1.40 +   if( gate.gateClosed )
    1.41 +    {    //now, set coreloop's progress, so stealer can see that core loop
    1.42 +         // has made it into the waiting area.
    1.43 +      gate.waitProgress = gate.preGateProgress;
    1.44 +      while( gate.gateClosed ) /*busy wait*/;
    1.45 +    }
    1.46 +
    1.47 +   currPr = (VirtProcr *) readVMSQ( readyToAnimateQ );
    1.48 +
    1.49 +      //Set the coreloop's progress, so stealer can see it has made it out
    1.50 +      // of the protected area
    1.51 +   gate.exitProgress = gate.preGateProgress;
    1.52 +   #else
    1.53 +   currPr = (VirtProcr *) readVMSQ( readyToAnimateQ );
    1.54 +   #endif
    1.55 +
    1.56     if( currPr != NULL ) _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0;
    1.57  
    1.58     int tries = 0; int gotLock = 0;
    1.59 -   while( currPr == NULL )
    1.60 -    {    //no VPs ready to animate, so run MasterVP --later make "try Master"
    1.61 -         // VPs & put one in every queue at strategic point -- so have work
    1.62 -         // avail if don't get lock & short-circuit out of it if master has
    1.63 -         // recently run on another core
    1.64 -         //TODO: perf -- "try Master" VP that checks if should run Master Fn
    1.65 -         //But just letting queue run empty is quickest to see if pinning VP
    1.66 -         // to core will solve the bizarre random seg-faults in system stack.
    1.67 -
    1.68 -         //check if get the MasterLock
    1.69 +   while( currPr == NULL ) //if queue was empty, enter get masterLock loop
    1.70 +    {    //queue was empty, so get master lock
    1.71        gotLock = __sync_bool_compare_and_swap( &(_VMSMasterEnv->masterLock), \
    1.72 -                                                 UNLOCKED, LOCKED );
    1.73 +                                                          UNLOCKED, LOCKED );
    1.74        if( gotLock )
    1.75 -       {    //run own MasterVP -- when its done, unlocks MasterLock and
    1.76 -            // jumps back to coreLoops's startPt
    1.77 +       {    //run own MasterVP -- jmps to coreLoop's startPt when done
    1.78           currPr = _VMSMasterEnv->masterVPs[thisCoresIdx];
    1.79 -         if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 100 )
    1.80 -          { //printf("1000 back to back MasterVP\n");
    1.81 -            //TODO: turn this into work-stealing from another core
    1.82 -            //only yield if no work to steal -- and count consecutive yields
    1.83 -            // if too many of those, then sleep for 10ms or whatever
    1.84 +         if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 )
    1.85 +          {       DEBUG( dbgB2BMaster,"Many back to back MasterVPs\n");
    1.86              pthread_yield();
    1.87            }
    1.88           _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1;
    1.89 @@ -124,38 +141,7 @@
    1.90      }
    1.91     
    1.92  
    1.93 -      //switch to virt procr's stack and frame ptr then jump to virt procr fn
    1.94 -   void *stackPtr, *framePtr, *jmpPt, *coreLoopFramePtrAddr, \
    1.95 -        *coreLoopStackPtrAddr;
    1.96 -   
    1.97 -   stackPtr = currPr->stackPtr;
    1.98 -   framePtr = currPr->framePtr;
    1.99 -   jmpPt    = currPr->nextInstrPt;
   1.100 -   coreLoopFramePtrAddr = &(currPr->coreLoopFramePtr);
   1.101 -   coreLoopStackPtrAddr = &(currPr->coreLoopStackPtr);
   1.102 -
   1.103 -      //Save the core loop's stack and frame pointers into virt procr struct
   1.104 -      // then switch to stack ptr and frame ptr of virt procr & jmp to it
   1.105 -      //This was a pain to get right because GCC converts the "(jmpPt)" to
   1.106 -      // frame-relative mem-op -- so generated machine code first changed the
   1.107 -      // frame pointer, then tried to jump to an addr stored on stack, which
   1.108 -      // it accessed as an offset from frame-ptr!  (wrong frame-ptr now)
   1.109 -      //Explicitly loading into eax before changing frame-ptr fixed it
   1.110 -      //Also, it turns "(currPr->coreLoopFramePtr)" into a temporary on the
   1.111 -      // stack, so "movl %%ebp, %0" saves to the temp, NOT the data-struc!
   1.112 -   asm volatile("movl %0, %%eax;      \
   1.113 -                 movl %%esp, (%%eax); \
   1.114 -                 movl %1, %%eax;      \
   1.115 -                 movl %%ebp, (%%eax); \
   1.116 -                 movl %2, %%eax;      \
   1.117 -                 movl %3, %%esp;      \
   1.118 -                 movl %4, %%ebp;      \
   1.119 -                 jmp  %%eax"          \
   1.120 -   /* outputs */ : "=g"(coreLoopStackPtrAddr),                 \
   1.121 -                   "=g"(coreLoopFramePtrAddr)                  \
   1.122 -   /* inputs  */ : "g" (jmpPt), "g" (stackPtr), "g" (framePtr) \
   1.123 -   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
   1.124 -                );
   1.125 +   SwitchToVP( currPr )
   1.126  
   1.127     //=========== jmp to here when want to shut down the VMS system ==========
   1.128     CoreLoopEndPt:
   1.129 @@ -176,7 +162,7 @@
   1.130  coreLoop_Seq( void *paramsIn )
   1.131   {
   1.132     VirtProcr      *currPr;
   1.133 -   SRSWQueueStruc *readyToAnimateQ;
   1.134 +   VMSQueueStruc *readyToAnimateQ;
   1.135     
   1.136     ThdParams      *coreLoopThdParams;
   1.137     int             thisCoresIdx;
   1.138 @@ -207,7 +193,7 @@
   1.139        //_VMSWorkQ must be a global, static volatile var, so not kept in reg,
   1.140        // which forces reloading the pointer after each jmp to this point
   1.141     readyToAnimateQ  = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx];
   1.142 -   currPr = (VirtProcr *) readSRSWQ_NonBlocking( readyToAnimateQ );
   1.143 +   currPr = (VirtProcr *) readVMSQ( readyToAnimateQ );
   1.144     if( currPr == NULL )
   1.145      { if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 )
   1.146         { printf("too many back to back MasterVP\n"); exit(1); }
   1.147 @@ -219,38 +205,7 @@
   1.148        _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0;
   1.149  
   1.150  
   1.151 -      //switch to virt procr's stack and frame ptr then jump to virt procr
   1.152 -   void *stackPtr, *framePtr, *jmpPt, *coreLoopFramePtrAddr, \
   1.153 -        *coreLoopStackPtrAddr;
   1.154 -
   1.155 -   stackPtr = currPr->stackPtr;
   1.156 -   framePtr = currPr->framePtr;
   1.157 -   jmpPt    = currPr->nextInstrPt;
   1.158 -   coreLoopFramePtrAddr = &(currPr->coreLoopFramePtr);
   1.159 -   coreLoopStackPtrAddr = &(currPr->coreLoopStackPtr);
   1.160 -
   1.161 -      //Save the core loop's stack and frame pointers into virt procr struct
   1.162 -      // then switch to stack ptr and frame ptr of virt procr & jmp to it
   1.163 -      //This was a pain to get right because GCC converts the "(jmpPt)" to
   1.164 -      // frame-relative mem-op -- so generated machine code first changed the
   1.165 -      // frame pointer, then tried to jump to an addr stored on stack, which
   1.166 -      // it accessed as an offset from frame-ptr!  (wrong frame-ptr now)
   1.167 -      //Explicitly loading into eax before changing frame-ptr fixed it
   1.168 -      //Also, it turns "(currPr->coreLoopFramePtr)" into a temporary on the
   1.169 -      // stack, so "movl %%ebp, %0" saves to the temp, NOT the data-struc!
   1.170 -   asm volatile("movl %0, %%eax;      \
   1.171 -                 movl %%esp, (%%eax); \
   1.172 -                 movl %1, %%eax;      \
   1.173 -                 movl %%ebp, (%%eax); \
   1.174 -                 movl %2, %%eax;      \
   1.175 -                 movl %3, %%esp;      \
   1.176 -                 movl %4, %%ebp;      \
   1.177 -                 jmp  %%eax"          \
   1.178 -   /* outputs */ : "=g"(coreLoopStackPtrAddr),                 \
   1.179 -                   "=g"(coreLoopFramePtrAddr)                  \
   1.180 -   /* inputs  */ : "g" (jmpPt), "g" (stackPtr), "g" (framePtr) \
   1.181 -   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
   1.182 -                );
   1.183 +   SwitchToVP( currPr )
   1.184  
   1.185     //========================================================================
   1.186        //jmp to here when want to shut down the VMS system.  A shutdown VP is
     2.1 --- a/MasterLoop.c	Tue Nov 02 16:43:01 2010 -0700
     2.2 +++ b/MasterLoop.c	Thu Nov 04 18:13:18 2010 -0700
     2.3 @@ -12,6 +12,14 @@
     2.4  #include "VMS.h"
     2.5  
     2.6  
     2.7 +//===========================================================================
     2.8 +void inline
     2.9 +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
    2.10 +               VirtProcr *masterPr );
    2.11 +
    2.12 +//===========================================================================
    2.13 +
    2.14 +
    2.15  
    2.16  /*This code is animated by the virtual Master processor.
    2.17   *
    2.18 @@ -64,7 +72,7 @@
    2.19   */
    2.20  void masterLoop( void *initData, VirtProcr *animatingPr )
    2.21   { 
    2.22 -   int             slotIdx;
    2.23 +   int32           slotIdx, numSlotsFilled;
    2.24     VirtProcr      *schedVirtPr;
    2.25     SchedSlot      *currSlot, **schedSlots;
    2.26     MasterEnv      *masterEnv;
    2.27 @@ -74,7 +82,7 @@
    2.28     RequestHandler  requestHandler;
    2.29     void           *semanticEnv;
    2.30  
    2.31 -   int             thisCoresIdx;
    2.32 +   int32           thisCoresIdx;
    2.33     VirtProcr      *masterPr;
    2.34     volatile        VirtProcr *volatileMasterPr;
    2.35     
    2.36 @@ -108,7 +116,7 @@
    2.37  
    2.38     masterEnv        = _VMSMasterEnv;
    2.39     
    2.40 -//TODO: check that compiles so that always re-define from frame-storage
    2.41 +      //GCC may optimize so doesn't always re-define from frame-storage
    2.42     masterPr         = volatileMasterPr;  //just to make sure after jmp
    2.43     thisCoresIdx     = masterPr->coreAnimatedBy;
    2.44     readyToAnimateQ  = masterEnv->readyToAnimateQs[thisCoresIdx];
    2.45 @@ -120,6 +128,7 @@
    2.46  
    2.47  
    2.48        //Poll each slot's Done flag
    2.49 +   numSlotsFilled = 0;
    2.50     for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++)
    2.51      {
    2.52        currSlot = schedSlots[ slotIdx ];
    2.53 @@ -141,46 +150,203 @@
    2.54            { currSlot->procrAssignedToSlot = schedVirtPr;
    2.55              schedVirtPr->schedSlot        = currSlot;
    2.56              currSlot->needsProcrAssigned  = FALSE;
    2.57 -
    2.58 -            writeSRSWQ( schedVirtPr, readyToAnimateQ );
    2.59 +            numSlotsFilled               += 1;
    2.60 +            
    2.61 +            writeVMSQ( schedVirtPr, readyToAnimateQ );
    2.62            }
    2.63         }
    2.64      }
    2.65  
    2.66 +   
    2.67 +   #ifdef USE_WORK_STEALING
    2.68 +      //If no slots filled, means no more work, look for work to steal.
    2.69 +   if( numSlotsFilled == 0 )
    2.70 +    { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterPr );
    2.71 +    }
    2.72 +   #endif
    2.73  
    2.74 -      //Save stack ptr and frame, restore CoreLoop's stack and frame,
    2.75 -      // and clear the MasterLock
    2.76 -      //TODO: cafefully verify don't need to force saving anything to stack
    2.77 -      // before jumping back to core loop.
    2.78 -   void           *stackPtrAddr, *framePtrAddr, *masterLockAddr;
    2.79 -   void           *jmpPt, *coreLoopFramePtr, *coreLoopStackPtr;
    2.80 -
    2.81 -   stackPtrAddr      = &(masterPr->stackPtr);
    2.82 -   framePtrAddr      = &(masterPr->framePtr);
    2.83 -   masterLockAddr    = &(_VMSMasterEnv->masterLock);
    2.84 -
    2.85 -   jmpPt             = _VMSMasterEnv->coreLoopStartPt;
    2.86 -   coreLoopFramePtr  = masterPr->coreLoopFramePtr;//need this only
    2.87 -   coreLoopStackPtr  = masterPr->coreLoopStackPtr;//shouldn't need -- safety
    2.88     
    2.89     #ifdef MEAS__TIME_MASTER
    2.90     saveLowTimeStampCountInto( masterPr->endMasterTSCLow );
    2.91     #endif
    2.92  
    2.93 -   asm volatile("movl %0,     %%eax;  \
    2.94 -                 movl %%esp, (%%eax); \
    2.95 -                 movl %1,     %%eax;  \
    2.96 -                 movl %%ebp, (%%eax); \
    2.97 -                 movl %2, %%ebx;      \
    2.98 -                 movl %3, %%eax;      \
    2.99 -                 movl %4, %%esp;      \
   2.100 -                 movl %5, %%ebp;      \
   2.101 -                 movl $0x0, (%%ebx);  \
   2.102 -                 jmp  %%eax;"         \
   2.103 -   /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr),                \
   2.104 -                   "=g"(masterLockAddr)                                     \
   2.105 -   /* inputs  */ : "g" (jmpPt), "g"(coreLoopStackPtr), "g"(coreLoopFramePtr)\
   2.106 -   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
   2.107 -                );//can probably make clobber list empty -- but safe for now
   2.108 +   
   2.109 +   masterSwitchToCoreLoop( masterPr )
   2.110   }
   2.111  
   2.112 +
   2.113 +
    2.114 +/*This has a race condition -- the coreloops are accessing their own queues
    2.115 + * at the same time that this work-stealer on a different core is trying to
    2.116 + * read from them; unsafe -- use gateProtected_stealWorkInto instead. */
   2.117 +void inline
   2.118 +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
   2.119 +               VirtProcr *masterPr )
   2.120 + { 
   2.121 +   VirtProcr   *stolenPr;
   2.122 +   int32        coreIdx, i;
   2.123 +   VMSQueueStruc *currQ;
   2.124 +
   2.125 +   stolenPr = NULL;
   2.126 +   coreIdx = masterPr->coreAnimatedBy;
   2.127 +   for( i = 0; i < NUM_CORES -1; i++ )
   2.128 +    {
   2.129 +      if( coreIdx >= NUM_CORES -1 )
   2.130 +       { coreIdx = 0;
   2.131 +       }
   2.132 +      else
   2.133 +       { coreIdx++;
   2.134 +       }
   2.135 +      currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
   2.136 +      if( numInVMSQ( currQ ) > 0 )
   2.137 +       { stolenPr = readVMSQ (currQ );
   2.138 +         break;
   2.139 +       }
   2.140 +    }
   2.141 +
   2.142 +   if( stolenPr != NULL )
   2.143 +    { currSlot->procrAssignedToSlot = stolenPr;
   2.144 +      stolenPr->schedSlot           = currSlot;
   2.145 +      currSlot->needsProcrAssigned  = FALSE;
   2.146 +
   2.147 +      writeVMSQ( stolenPr, readyToAnimateQ );
   2.148 +    }
   2.149 + }
   2.150 +
   2.151 +/*This algorithm makes the common case fast.  Make the coreloop passive,
   2.152 + * and show its progress.  Make the stealer control a gate that coreloop
   2.153 + * has to pass.
   2.154 + *To avoid interference, only one stealer at a time.  Use a global
   2.155 + * stealer-lock.
   2.156 + *
   2.157 + *The pattern is based on a gate -- stealer shuts the gate, then monitors
   2.158 + * to be sure any already past make it all the way out, before starting.
   2.159 + *So, have a "progress" measure just before the gate, then have two after it,
   2.160 + * one is in a "waiting room" outside the gate, the other is at the exit.
   2.161 + *Then, the stealer first shuts the gate, then checks the progress measure
   2.162 + * outside it, then looks to see if the progress measure at the exit is the
   2.163 + * same.  If yes, it knows the protected area is empty 'cause no other way
   2.164 + * to get in and the last to get in also exited.
   2.165 + *If the progress measure at the exit is not the same, then the stealer goes
   2.166 + * into a loop checking both the waiting-area and the exit progress-measures
   2.167 + * until one of them shows the same as the measure outside the gate.  Might
   2.168 + * as well re-read the measure outside the gate each go around, just to be
   2.169 + * sure.  It is guaranteed that one of the two will eventually match the one
   2.170 + * outside the gate.
   2.171 + *
   2.172 + *Here's an informal proof of correctness:
   2.173 + *The gate can be closed at any point, and have only four cases:
   2.174 + *  1) coreloop made it past the gate-closing but not yet past the exit
   2.175 + *  2) coreloop made it past the pre-gate progress update but not yet past
   2.176 + *     the gate,
   2.177 + *  3) coreloop is right before the pre-gate update
   2.178 + *  4) coreloop is past the exit and far from the pre-gate update.
   2.179 + *
   2.180 + * Covering the cases in reverse order,
   2.181 + *  4) is not a problem -- stealer will read pre-gate progress, see that it
   2.182 + *     matches exit progress, and the gate is closed, so stealer can proceed.
    2.183 + *  3) stealer will read pre-gate progress just after coreloop updates it,
   2.184 + *     so stealer goes into a loop until the coreloop causes wait-progress
   2.185 + *     to match pre-gate progress, so then stealer can proceed
    2.186 + *  2) same as case 3.
   2.187 + *  1) stealer reads pre-gate progress, sees that it's different than exit,
   2.188 + *     so goes into loop until exit matches pre-gate, now it knows coreloop
   2.189 + *     is not in protected and cannot get back in, so can proceed.
   2.190 + *
   2.191 + *Implementation for the stealer:
   2.192 + *
   2.193 + *First, acquire the stealer lock -- only cores with no work to do will
   2.194 + * compete to steal, so not a big performance penalty having only one --
   2.195 + * will rarely have multiple stealers in a system with plenty of work -- and
   2.196 + * in a system with little work, it doesn't matter.
   2.197 + *
   2.198 + *Note, have single-reader, single-writer pattern for all variables used to
   2.199 + * communicate between stealer and victims
   2.200 + *
   2.201 + *So, scan the queues of the core loops, until find non-empty.  Each core
   2.202 + * has its own list that it scans.  The list goes in order from closest to
   2.203 + * furthest core, so it steals first from close cores.  Later can add
   2.204 + * taking info from the app about overlapping footprints, and scan all the
   2.205 + * others then choose work with the most footprint overlap with the contents
   2.206 + * of this core's cache.
   2.207 + *
   2.208 + *Now, have a victim want to take work from.  So, shut the gate in that
   2.209 + * coreloop, by setting the "gate closed" var on its stack to TRUE.
   2.210 + *Then, read the core's pre-gate progress and compare to the core's exit
   2.211 + * progress.
   2.212 + *If same, can proceed to take work from the coreloop's queue.  When done,
   2.213 + * write FALSE to gate closed var.
   2.214 + *If different, then enter a loop that reads the pre-gate progress, then
   2.215 + * compares to exit progress then to wait progress.  When one of two
   2.216 + * matches, proceed.  Take work from the coreloop's queue.  When done,
   2.217 + * write FALSE to the gate closed var.
   2.218 + * 
   2.219 + */
   2.220 +void inline
   2.221 +gateProtected_stealWorkInto( SchedSlot *currSlot,
   2.222 +                             VMSQueueStruc *myReadyToAnimateQ,
   2.223 +                             VirtProcr *masterPr )
   2.224 + {
   2.225 +   VirtProcr     *stolenPr;
   2.226 +   int32          coreIdx, i, haveAVictim, gotLock;
   2.227 +   VMSQueueStruc *victimsQ;
   2.228 +
   2.229 +   volatile GateStruc *vicGate;
   2.230 +   int32               coreMightBeInProtected;
   2.231 +
   2.232 +
   2.233 +
   2.234 +      //see if any other cores have work available to steal
   2.235 +   haveAVictim = FALSE;
   2.236 +   coreIdx = masterPr->coreAnimatedBy;
   2.237 +   for( i = 0; i < NUM_CORES -1; i++ )
   2.238 +    {
   2.239 +      if( coreIdx >= NUM_CORES -1 )
   2.240 +       { coreIdx = 0;
   2.241 +       }
   2.242 +      else
   2.243 +       { coreIdx++;
   2.244 +       }
   2.245 +      victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
   2.246 +      if( numInVMSQ( victimsQ ) > 0 )
   2.247 +       { haveAVictim = TRUE;
   2.248 +         vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ];
   2.249 +         break;
   2.250 +       }
   2.251 +    }
   2.252 +   if( !haveAVictim ) return;  //no work to steal, exit
   2.253 +
   2.254 +      //have a victim core, now get the stealer-lock
   2.255 +   gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock),
   2.256 +                                                          UNLOCKED, LOCKED );
   2.257 +   if( !gotLock ) return; //go back to core loop, which will re-start master
   2.258 +
   2.259 +
   2.260 +   //====== Start Gate-protection =======
   2.261 +   vicGate->gateClosed = TRUE;
   2.262 +   coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress;
   2.263 +   while( coreMightBeInProtected )
   2.264 +    {    //wait until sure
   2.265 +      if( vicGate->preGateProgress == vicGate->waitProgress )
   2.266 +         coreMightBeInProtected = FALSE;
   2.267 +      if( vicGate->preGateProgress == vicGate->exitProgress )
   2.268 +         coreMightBeInProtected = FALSE;
   2.269 +    }
   2.270 +
   2.271 +   stolenPr = readVMSQ ( victimsQ );
   2.272 +
   2.273 +   vicGate->gateClosed = FALSE;
   2.274 +   //======= End Gate-protection  =======
   2.275 +
   2.276 +
   2.277 +   if( stolenPr != NULL )  //victim could have been in protected and taken
   2.278 +    { currSlot->procrAssignedToSlot = stolenPr;
   2.279 +      stolenPr->schedSlot           = currSlot;
   2.280 +      currSlot->needsProcrAssigned  = FALSE;
   2.281 +
   2.282 +      writeVMSQ( stolenPr, myReadyToAnimateQ );
   2.283 +    }
   2.284 +
   2.285 +      //unlock the work stealing lock
   2.286 +   _VMSMasterEnv->workStealingLock = UNLOCKED;
   2.287 + }
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/SwitchAnimators.h	Thu Nov 04 18:13:18 2010 -0700
     3.3 @@ -0,0 +1,138 @@
     3.4 +/*
     3.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     3.6 + *  Licensed under GNU General Public License version 2
     3.7 + *
     3.8 + * Author: seanhalle@yahoo.com
     3.9 + * 
    3.10 + */
    3.11 +
    3.12 +#ifndef _SwitchAnimators_H
    3.13 +#define	_SwitchAnimators_H
    3.14 +#define __USE_GNU
    3.15 +
    3.16 +/*Isolating code for switching between animators within these macros -- at
    3.17 + * some point will make switches to compile for 32 bit or for 64 bit, which
    3.18 + * having these isolated will make cleaner
    3.19 + *
    3.20 + *This also makes it easier to change architectures, at some point
    3.21 + *And it cleans the code up, having the ugly assembly out of the way
    3.22 + */
    3.23 +
    3.24 +//=========================== MasterVP to CoreLoop ==========================
    3.25 +//
    3.26 +      //Save stack ptr and frame, restore CoreLoop's stack and frame,
    3.27 +      // and clear the MasterLock
    3.28 +      //GCC's -O3 messes with this -- go through generated -- protect somehow
    3.29 +      //
    3.30 +#define masterSwitchToCoreLoop( masterPr )   \
    3.31 +   void           *stackPtrAddr, *framePtrAddr, *masterLockAddr; \
    3.32 +   void           *jmpPt, *coreLoopFramePtr, *coreLoopStackPtr;  \
    3.33 +\
    3.34 +   stackPtrAddr      = &(masterPr->stackPtr); \
    3.35 +   framePtrAddr      = &(masterPr->framePtr); \
    3.36 +   masterLockAddr    = &(_VMSMasterEnv->masterLock); \
    3.37 +\
    3.38 +   jmpPt             = _VMSMasterEnv->coreLoopStartPt; \
    3.39 +   coreLoopFramePtr  = masterPr->coreLoopFramePtr; \
    3.40 +   coreLoopStackPtr  = masterPr->coreLoopStackPtr; \
    3.41 +\
    3.42 +   asm volatile("movl %0,     %%eax;  \
    3.43 +                 movl %%esp, (%%eax); \
    3.44 +                 movl %1,     %%eax;  \
    3.45 +                 movl %%ebp, (%%eax); \
    3.46 +                 movl %2, %%ebx;      \
    3.47 +                 movl %3, %%eax;      \
    3.48 +                 movl %4, %%esp;      \
    3.49 +                 movl %5, %%ebp;      \
    3.50 +                 movl $0x0, (%%ebx);  \
    3.51 +                 jmp  %%eax;"         \
    3.52 +   /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr),                \
    3.53 +                   "=g"(masterLockAddr)                                     \
    3.54 +   /* inputs  */ : "g" (jmpPt), "g"(coreLoopStackPtr), "g"(coreLoopFramePtr)\
    3.55 +   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
    3.56 +                );//can probably make clobber list empty -- but safe for now
    3.57 +
    3.58 +
    3.59 +//=========================== SlaveVP to CoreLoop ===========================
    3.60 +//
    3.61 +
    3.62 +#define    SwitchToCoreLoop( animatingPr ) \
    3.63 +   void *jmpPt, *stackPtrAddr, *framePtrAddr, *coreLoopStackPtr; \
    3.64 +   void *coreLoopFramePtr; \
    3.65 +\
    3.66 +   stackPtrAddr      = &(animatingPr->stackPtr); \
    3.67 +   framePtrAddr      = &(animatingPr->framePtr); \
    3.68 +\
    3.69 +   jmpPt             = _VMSMasterEnv->coreLoopStartPt; \
    3.70 +   coreLoopFramePtr  = animatingPr->coreLoopFramePtr; \
    3.71 +   coreLoopStackPtr  = animatingPr->coreLoopStackPtr; \
    3.72 +\
    3.73 +      /*Save the virt procr's stack and frame ptrs*/ \
    3.74 +   asm volatile("movl %0,     %%eax;  \
    3.75 +                 movl %%esp, (%%eax); \
    3.76 +                 movl %1,     %%eax;  \
    3.77 +                 movl %%ebp, (%%eax) "\
    3.78 +   /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr) \
    3.79 +   /* inputs  */ :        \
    3.80 +   /* clobber */ : "%eax" \
    3.81 +                ); \
    3.82 +\
    3.83 +     /*restore coreloop's frame ptr, then jump back to "start" of core loop*/\
    3.84 +     /*Note, GCC compiles to assembly that saves esp and ebp in the stack*/ \
    3.85 +     /* frame -- so have to explicitly do assembly that saves to memory*/ \
    3.86 +   asm volatile("movl %0, %%eax;      \
    3.87 +                 movl %1, %%esp;      \
    3.88 +                 movl %2, %%ebp;      \
    3.89 +                 jmp  %%eax    "      \
    3.90 +   /* outputs */ :                    \
    3.91 +   /* inputs  */ : "m" (jmpPt), "m"(coreLoopStackPtr), "m"(coreLoopFramePtr)\
    3.92 +   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi","%esi"  \
    3.93 +                );
    3.94 + //list everything as clobbered to force GCC to save all
    3.95 + // live vars that are in regs on stack before this
    3.96 + // assembly, so that stack pointer is correct, before jmp
    3.97 +
    3.98 +
    3.99 +
   3.100 +//============================== CoreLoop to VP =============================
   3.101 +//
   3.102 +      //Save the core loop's stack and frame pointers into virt procr struct
   3.103 +      // then switch to stack ptr and frame ptr of virt procr & jmp to it
   3.104 +      //This was a pain to get right because GCC converts the "(jmpPt)" to
   3.105 +      // frame-relative mem-op -- so generated machine code first changed the
   3.106 +      // frame pointer, then tried to jump to an addr stored on stack, which
   3.107 +      // it accessed as an offset from frame-ptr!  (wrong frame-ptr now)
   3.108 +      //Explicitly loading into eax before changing frame-ptr fixed it
   3.109 +      //Also, it turns "(currPr->coreLoopFramePtr)" into a temporary on the
   3.110 +      // stack, so "movl %%ebp, %0" saves to the temp, NOT the data-struc!
   3.111 +
   3.112 +
   3.113 +      //switch to virt procr's stack and frame ptr then jump to virt procr fn
   3.114 +
   3.115 +#define SwitchToVP( currPr ) \
   3.116 +   void *stackPtr, *framePtr, *jmpPt, *coreLoopFramePtrAddr, \
   3.117 +        *coreLoopStackPtrAddr; \
   3.118 +\
   3.119 +   stackPtr = currPr->stackPtr; \
   3.120 +   framePtr = currPr->framePtr; \
   3.121 +   jmpPt    = currPr->nextInstrPt; \
   3.122 +   coreLoopFramePtrAddr = &(currPr->coreLoopFramePtr); \
   3.123 +   coreLoopStackPtrAddr = &(currPr->coreLoopStackPtr); \
   3.124 +\
   3.125 +   asm volatile("movl %0, %%eax;      \
   3.126 +                 movl %%esp, (%%eax); \
   3.127 +                 movl %1, %%eax;      \
   3.128 +                 movl %%ebp, (%%eax); \
   3.129 +                 movl %2, %%eax;      \
   3.130 +                 movl %3, %%esp;      \
   3.131 +                 movl %4, %%ebp;      \
   3.132 +                 jmp  %%eax"          \
   3.133 +   /* outputs */ : "=g"(coreLoopStackPtrAddr),                 \
   3.134 +                   "=g"(coreLoopFramePtrAddr)                  \
   3.135 +   /* inputs  */ : "g" (jmpPt), "g" (stackPtr), "g" (framePtr) \
   3.136 +   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
   3.137 +                );
   3.138 +
   3.139 +   
   3.140 +#endif	/* _SwitchAnimators_H */
   3.141 +
     4.1 --- a/VMS.c	Tue Nov 02 16:43:01 2010 -0700
     4.2 +++ b/VMS.c	Thu Nov 04 18:13:18 2010 -0700
     4.3 @@ -87,7 +87,7 @@
     4.4  void
     4.5  create_masterEnv()
     4.6   { MasterEnv       *masterEnv;
     4.7 -   SRSWQueueStruc **readyToAnimateQs;
     4.8 +   VMSQueueStruc **readyToAnimateQs;
     4.9     int              coreIdx;
    4.10     VirtProcr      **masterVPs;
    4.11     SchedSlot     ***allSchedSlots; //ptr to array of ptrs
    4.12 @@ -105,7 +105,7 @@
    4.13     masterEnv     = _VMSMasterEnv;
    4.14     
    4.15        //Make a readyToAnimateQ for each core loop
    4.16 -   readyToAnimateQs = VMS__malloc( NUM_CORES * sizeof(SRSWQueueStruc *) );
    4.17 +   readyToAnimateQs = VMS__malloc( NUM_CORES * sizeof(VMSQueueStruc *) );
    4.18     masterVPs        = VMS__malloc( NUM_CORES * sizeof(VirtProcr *) );
    4.19  
    4.20        //One array for each core, 3 in array, core's masterVP scheds all
    4.21 @@ -114,18 +114,20 @@
    4.22     _VMSMasterEnv->numProcrsCreated = 0;  //used by create procr
    4.23     for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
    4.24      {    
    4.25 -      readyToAnimateQs[ coreIdx ] = makeSRSWQ();
    4.26 +      readyToAnimateQs[ coreIdx ] = makeVMSQ();
    4.27        
    4.28           //Q: should give masterVP core-specific info as its init data?
    4.29        masterVPs[ coreIdx ] = VMS__create_procr( &masterLoop, masterEnv );
    4.30        masterVPs[ coreIdx ]->coreAnimatedBy = coreIdx;
    4.31        allSchedSlots[ coreIdx ] = create_sched_slots(); //makes for one core
    4.32        _VMSMasterEnv->numMasterInARow[ coreIdx ] = 0;
    4.33 +      _VMSMasterEnv->workStealingGates[ coreIdx ] = NULL;
    4.34      }
    4.35     _VMSMasterEnv->readyToAnimateQs = readyToAnimateQs;
    4.36     _VMSMasterEnv->masterVPs        = masterVPs;
    4.37     _VMSMasterEnv->masterLock       = UNLOCKED;
    4.38     _VMSMasterEnv->allSchedSlots    = allSchedSlots;
    4.39 +   _VMSMasterEnv->workStealingLock = UNLOCKED;
    4.40  
    4.41  
    4.42        //Aug 19, 2010:  no longer need to place initial masterVP into queue
    4.43 @@ -338,8 +340,7 @@
    4.44   */
    4.45  void
    4.46  VMS__suspend_procr( VirtProcr *animatingPr )
    4.47 - { void *jmpPt, *stackPtrAddr, *framePtrAddr, *coreLoopStackPtr;
    4.48 -   void *coreLoopFramePtr;
    4.49 + { 
    4.50  
    4.51        //The request to master will cause this suspended virt procr to get
    4.52        // scheduled again at some future point -- to resume, core loop jumps
    4.53 @@ -350,23 +351,6 @@
    4.54        //return ownership of the virt procr and sched slot to Master virt pr
    4.55     animatingPr->schedSlot->workIsDone = TRUE;
    4.56  
    4.57 -   stackPtrAddr      = &(animatingPr->stackPtr);
    4.58 -   framePtrAddr      = &(animatingPr->framePtr);
    4.59 -
    4.60 -   jmpPt             = _VMSMasterEnv->coreLoopStartPt;
    4.61 -   coreLoopFramePtr  = animatingPr->coreLoopFramePtr;//need this only
    4.62 -   coreLoopStackPtr  = animatingPr->coreLoopStackPtr;//safety
    4.63 -
    4.64 -      //Save the virt procr's stack and frame ptrs,
    4.65 -   asm volatile("movl %0,     %%eax;  \
    4.66 -                 movl %%esp, (%%eax); \
    4.67 -                 movl %1,     %%eax;  \
    4.68 -                 movl %%ebp, (%%eax) "\
    4.69 -   /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr) \
    4.70 -   /* inputs  */ :        \
    4.71 -   /* clobber */ : "%eax" \
    4.72 -                );
    4.73 -
    4.74     //===========================  Measurement stuff ========================
    4.75     #ifdef MEAS__TIME_STAMP_SUSP
    4.76        //record time stamp: compare to time-stamp recorded below
    4.77 @@ -374,20 +358,10 @@
    4.78     #endif
    4.79     //=======================================================================
    4.80  
    4.81 -      //restore coreloop's frame ptr, then jump back to "start" of core loop
    4.82 -      //Note, GCC compiles to assembly that saves esp and ebp in the stack
    4.83 -      // frame -- so have to explicitly do assembly that saves to memory
    4.84 -   asm volatile("movl %0, %%eax;      \
    4.85 -                 movl %1, %%esp;      \
    4.86 -                 movl %2, %%ebp;      \
    4.87 -                 jmp  %%eax    "      \
    4.88 -   /* outputs */ :                    \
    4.89 -   /* inputs  */ : "m" (jmpPt), "m"(coreLoopStackPtr), "m"(coreLoopFramePtr)\
    4.90 -   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi","%esi"  \
    4.91 -                ); //list everything as clobbered to force GCC to save all
    4.92 -                   // live vars that are in regs on stack before this
    4.93 -                   // assembly, so that stack pointer is correct, before jmp
    4.94  
    4.95 +   SwitchToCoreLoop( animatingPr )
    4.96 +
    4.97 +   //=======================================================================
    4.98  ResumePt:
    4.99     #ifdef MEAS__TIME_STAMP_SUSP
   4.100        //NOTE: only take low part of count -- do sanity check when take diff
   4.101 @@ -673,7 +647,7 @@
   4.102     for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ )
   4.103      {    //Note, this is running in the master
   4.104        shutDownPr = VMS__create_procr( &endOSThreadFn, NULL );
   4.105 -      writeSRSWQ( shutDownPr, _VMSMasterEnv->readyToAnimateQs[coreIdx] );
   4.106 +      writeVMSQ( shutDownPr, _VMSMasterEnv->readyToAnimateQs[coreIdx] );
   4.107      }
   4.108  
   4.109   }
   4.110 @@ -717,7 +691,7 @@
   4.111  void
   4.112  VMS__cleanup_at_end_of_shutdown()
   4.113   { 
   4.114 -   SRSWQueueStruc **readyToAnimateQs;
   4.115 +   VMSQueueStruc **readyToAnimateQs;
   4.116     int              coreIdx;
   4.117     VirtProcr      **masterVPs;
   4.118     SchedSlot     ***allSchedSlots; //ptr to array of ptrs
   4.119 @@ -731,7 +705,7 @@
   4.120     
   4.121     for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
   4.122      {
   4.123 -      freeSRSWQ( readyToAnimateQs[ coreIdx ] );
   4.124 +      freeVMSQ( readyToAnimateQs[ coreIdx ] );
   4.125           //master VPs were created external to VMS, so use external free
   4.126        VMS__dissipate_procr( masterVPs[ coreIdx ] );
   4.127        
     5.1 --- a/VMS.h	Tue Nov 02 16:43:01 2010 -0700
     5.2 +++ b/VMS.h	Thu Nov 04 18:13:18 2010 -0700
     5.3 @@ -11,7 +11,7 @@
     5.4  #define __USE_GNU
     5.5  
     5.6  #include "VMS_primitive_data_types.h"
     5.7 -#include "Queue_impl/BlockingQueue.h"
     5.8 +#include "Queue_impl/PrivateQueue.h"
     5.9  #include "Histogram/Histogram.h"
    5.10  #include "DynArray/DynArray.h"
    5.11  #include "Hash_impl/PrivateHash.h"
    5.12 @@ -22,28 +22,36 @@
    5.13  
    5.14  
    5.15  //===============================  Debug  ===================================
    5.16 -   //These defines turn types of bug messages on and off
    5.17 -#define dbgProbes FALSE
    5.18 -#define dbgAppFlow FALSE
    5.19 -
    5.20 +//
    5.21     //When SEQUENTIAL is defined, VMS does sequential exe in the main thread
    5.22     // It still does co-routines and all the mechanisms are the same, it just
    5.23     // has only a single thread and animates VPs one at a time
    5.24  //#define SEQUENTIAL
    5.25  
    5.26 +//#define USE_WORK_STEALING
    5.27 +
    5.28     //turns on the probe-instrumentation in the application -- when not
    5.29     // defined, the calls to the probe functions turn into comments
    5.30  #define STATS__ENABLE_PROBES
    5.31  
    5.32 +   //These defines turn types of bug messages on and off
    5.33 +   // be sure debug messages are un-commented (next block of defines)
    5.34 +#define dbgProbes FALSE   /* for issues inside probes themselves*/
    5.35 +#define dbgAppFlow TRUE  /* Top level flow of application code -- general*/
    5.36 +#define dbgB2BMaster FALSE/* in coreloop, back to back master VPs*/
    5.37 +#define dbgRqstHdlr FALSE /* in request handler code*/
    5.38  
    5.39 -#define DEBUG(msg)// printf(msg); fflush(stdin);
    5.40 -#define DEBUG_MSG( bool, msg) //if( bool){ printf(msg); fflush(stdin);}
    5.41 -#define PRINT1_DEBUG(msg, param) //printf(msg, param); fflush(stdin);
    5.42 -#define PRINT2_DEBUG(msg, p1, p2) //printf(msg, p1, p2); fflush(stdin);
    5.43 +   //Comment or un- the substitute half to turn on/off types of debug message
    5.44 +#define DEBUG(  bool, msg)         \
    5.45 +   if( bool){ printf(msg); fflush(stdout);}
    5.46 +#define DEBUG1( bool, msg, param)  \
    5.47 +   if(bool){printf(msg, param); fflush(stdout);}
    5.48 +#define DEBUG2( bool, msg, p1, p2) \
    5.49 +   //if(bool) {printf(msg, p1, p2); fflush(stdout);}
    5.50  
    5.51 -#define PRINT_ERROR(msg) printf(msg); fflush(stdin);
    5.52 -#define PRINT1_ERROR(msg, param) printf(msg, param); fflush(stdin);
    5.53 -#define PRINT2_ERROR(msg, p1, p2) printf(msg, p1, p2); fflush(stdin);
    5.54 +#define ERROR(msg) printf(msg); fflush(stdout);
    5.55 +#define ERROR1(msg, param) printf(msg, param); fflush(stdout);
    5.56 +#define ERROR2(msg, p1, p2) printf(msg, p1, p2); fflush(stdout);
    5.57  
    5.58  //===========================  STATS =======================
    5.59  
    5.60 @@ -56,6 +64,8 @@
    5.61  #define MEAS__TIME_MASTER
    5.62  #define MEAS__NUM_TIMES_TO_RUN 100000
    5.63  
    5.64 +   //For code that calculates normalization-offset between TSC counts of
    5.65 +   // different cores.
    5.66  #define NUM_TSC_ROUND_TRIPS 10
    5.67  
    5.68  
    5.69 @@ -64,8 +74,9 @@
    5.70     // machine
    5.71  #define NUM_CORES        4
    5.72  
    5.73 -   // balance amortizing master fixed overhead vs imbalance potential
    5.74 -#define NUM_SCHED_SLOTS  3
    5.75 +   // tradeoff amortizing master fixed overhead vs imbalance potential
    5.76 +   // when work-stealing, can make bigger, at risk of losing cache affinity
    5.77 +#define NUM_SCHED_SLOTS  5
    5.78  
    5.79  #define MIN_WORK_UNIT_CYCLES 20000
    5.80  
    5.81 @@ -82,10 +93,11 @@
    5.82  
    5.83  #define SUCCESS 0
    5.84  
    5.85 -#define writeVMSQ     writeSRSWQ
    5.86 -#define readVMSQ      readSRSWQ
    5.87 -#define makeVMSQ      makeSRSWQ
    5.88 -#define VMSQueueStruc SRSWQueueStruc
    5.89 +#define writeVMSQ     writePrivQ
    5.90 +#define readVMSQ      readPrivQ
    5.91 +#define makeVMSQ      makePrivQ
    5.92 +#define numInVMSQ     numInPrivQ
    5.93 +#define VMSQueueStruc PrivQueueStruc
    5.94  
    5.95  
    5.96  
    5.97 @@ -96,6 +108,8 @@
    5.98  typedef struct _VMSReqst      VMSReqst;
    5.99  typedef struct _VirtProcr     VirtProcr;
   5.100  typedef struct _IntervalProbe IntervalProbe;
   5.101 +typedef struct _GateStruc     GateStruc;
   5.102 +
   5.103  
   5.104  typedef VirtProcr * (*SlaveScheduler)  ( void *, int );   //semEnv, coreIdx
   5.105  typedef void  (*RequestHandler)  ( VirtProcr *, void * ); //prWReqst, semEnv
   5.106 @@ -190,7 +204,7 @@
   5.107     RequestHandler   requestHandler;
   5.108     
   5.109     SchedSlot     ***allSchedSlots;
   5.110 -   SRSWQueueStruc **readyToAnimateQs;
   5.111 +   VMSQueueStruc **readyToAnimateQs;
   5.112     VirtProcr      **masterVPs;
   5.113  
   5.114     void            *semanticEnv;
   5.115 @@ -205,6 +219,9 @@
   5.116     int32            masterLock;
   5.117  
   5.118     int32            numMasterInARow[NUM_CORES];//detect back-to-back masterVP
    5.119 +   volatile GateStruc *workStealingGates[ NUM_CORES ]; //concurrent work-steal
   5.120 +   int32            workStealingLock;
   5.121 +   
   5.122     int32            numProcrsCreated; //gives ordering to processor creation
   5.123  
   5.124        //=========== MEASUREMENT STUFF =============
   5.125 @@ -216,13 +233,21 @@
   5.126   }
   5.127  MasterEnv;
   5.128  
   5.129 -//=============================
   5.130 +//=========================  Extra Stuff Data Strucs  =======================
   5.131  typedef struct
   5.132   {
   5.133  
   5.134   }
   5.135  VMSExcp;
   5.136  
   5.137 +struct _GateStruc
   5.138 + {
   5.139 +   int32 gateClosed;
   5.140 +   int32 preGateProgress;
   5.141 +   int32 waitProgress;
   5.142 +   int32 exitProgress;
   5.143 + };
   5.144 +//GateStruc
   5.145  
   5.146  //=======================  OS Thread related  ===============================
   5.147  
   5.148 @@ -342,6 +367,7 @@
   5.149                  );
   5.150  //=====
   5.151  
   5.152 +#include "SwitchAnimators.h"
   5.153  #include "probes.h"
   5.154  
   5.155  #endif	/* _VMS_H */
     6.1 --- a/probes.c	Tue Nov 02 16:43:01 2010 -0700
     6.2 +++ b/probes.c	Thu Nov 04 18:13:18 2010 -0700
     6.3 @@ -253,7 +253,7 @@
     6.4  VMS_impl__record_interval_start_in_probe( int32 probeID )
     6.5   { IntervalProbe *probe;
     6.6  
     6.7 -         DEBUG_MSG( dbgProbes, "record start of interval\n" )
     6.8 +         DEBUG( dbgProbes, "record start of interval\n" )
     6.9     probe = _VMSMasterEnv->intervalProbes[ probeID ];
    6.10     gettimeofday( &(probe->startStamp), NULL );
    6.11   }
    6.12 @@ -268,7 +268,7 @@
    6.13     struct timeval *endStamp, *startStamp;
    6.14     float64 startSecs, endSecs;
    6.15  
    6.16 -         DEBUG_MSG( dbgProbes, "record end of interval\n" )
    6.17 +         DEBUG( dbgProbes, "record end of interval\n" )
    6.18        //possible seg-fault if array resized by diff core right after this
    6.19        // one gets probe..?  Something like that?  Might be safe.. don't care
    6.20     probe = _VMSMasterEnv->intervalProbes[ probeID ];
     7.1 --- a/vmalloc.c	Tue Nov 02 16:43:01 2010 -0700
     7.2 +++ b/vmalloc.c	Thu Nov 04 18:13:18 2010 -0700
     7.3 @@ -67,7 +67,7 @@
     7.4      }
     7.5  
     7.6     if( foundElem == NULL )
     7.7 -    { PRINT_ERROR("\nmalloc failed\n")
     7.8 +    { ERROR("\nmalloc failed\n")
     7.9        return NULL;  //indicates malloc failed
    7.10      }
    7.11        //Using a kludge to identify the element that is the top chunk in the