# HG changeset patch
# User Me
# Date 1288919598 25200
# Node ID 3bac84e4e56e3e88a9e4977cee9946143551f863
# Parent  f8508572f3de9080da6148d30219271c7ecff065
Works with correct matrix mult Nov 4 -- switch animators macros, many updates

Changed all queues back to the VMSQ variants (#defines).
Work-stealing is now correct and gate-protected, with a compiler switch to turn it in and out.

diff -r f8508572f3de -r 3bac84e4e56e CoreLoop.c
--- a/CoreLoop.c	Tue Nov 02 16:43:01 2010 -0700
+++ b/CoreLoop.c	Thu Nov 04 18:13:18 2010 -0700
@@ -34,13 +34,24 @@
    ThdParams      *coreLoopThdParams;
    int             thisCoresIdx;
    VirtProcr      *currPr;
-   SRSWQueueStruc *readyToAnimateQ;
+   VMSQueueStruc *readyToAnimateQ;
    unsigned long   coreMask;  //has 1 in bit positions of allowed cores
    int             errorCode;
-   
+
+      //work-stealing struc on stack to prevent false-sharing in cache-line
+   volatile GateStruc gate;
+   //preGateProgress, waitProgress, exitProgress, gateClosed;
+
+
    coreLoopThdParams = (ThdParams *)paramsIn;
    thisCoresIdx = coreLoopThdParams->coreNum;
 
+   gate.gateClosed      = FALSE;
+   gate.preGateProgress = 0;
+   gate.waitProgress    = 0;
+   gate.exitProgress    = 0;
+   _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = &gate;//race @startup
+
       //wait until signalled that setup is complete
    pthread_mutex_lock(   &suspendLock );
    while( !(_VMSMasterEnv->setupComplete) )
@@ -87,32 +98,38 @@
       // which forces reloading the pointer after each jmp to this point
    readyToAnimateQ  = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx];
 
-   currPr = (VirtProcr *) readSRSWQ_NonBlocking( readyToAnimateQ );
-   
+   #ifdef USE_WORK_STEALING
+      //Alg for work-stealing designed to make common case fast.  Comment
+      // in stealer code explains.
+   gate.preGateProgress++;
+   if( gate.gateClosed )
+    {    //now, set coreloop's progress, so stealer can see that core loop
+         // has made it into the waiting area.
+      gate.waitProgress = gate.preGateProgress;
+      while( gate.gateClosed ) /*busy wait*/;
+    }
+
+   currPr = (VirtProcr *) readVMSQ( readyToAnimateQ );
+
+      //Set the coreloop's progress, so stealer can see it has made it out
+      // of the protected area
+   gate.exitProgress = gate.preGateProgress;
+   #else
+   currPr = (VirtProcr *) readVMSQ( readyToAnimateQ );
+   #endif
+
    if( currPr != NULL ) _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0;
 
    int tries = 0; int gotLock = 0;
-   while( currPr == NULL )
-    {    //no VPs ready to animate, so run MasterVP --later make "try Master"
-         // VPs & put one in every queue at strategic point -- so have work
-         // avail if don't get lock & short-circuit out of it if master has
-         // recently run on another core
-         //TODO: perf -- "try Master" VP that checks if should run Master Fn
-         //But just letting queue run empty is quickest to see if pinning VP
-         // to core will solve the bizarre random seg-faults in system stack.
-
-         //check if get the MasterLock
+   while( currPr == NULL ) //if queue was empty, enter get masterLock loop
+    {    //queue was empty, so get master lock
       gotLock = __sync_bool_compare_and_swap( &(_VMSMasterEnv->masterLock), \
-                                                 UNLOCKED, LOCKED );
+                                                          UNLOCKED, LOCKED );
       if( gotLock )
-       {    //run own MasterVP -- when its done, unlocks MasterLock and
-            // jumps back to coreLoops's startPt
+       {    //run own MasterVP -- jmps to coreLoops startPt when done
          currPr = _VMSMasterEnv->masterVPs[thisCoresIdx];
-         if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 100 )
-          { //printf("1000 back to back MasterVP\n");
-            //TODO: turn this into work-stealing from another core
-            //only yield if no work to steal -- and count consecutive yields
-            // if too many of those, then sleep for 10ms or whatever
+         if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 )
+          {       DEBUG( dbgB2BMaster,"Many back to back MasterVPs\n");
             pthread_yield();
           }
          _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1;
@@ -124,38 +141,7 @@
     }
    
 
-      //switch to virt procr's stack and frame ptr then jump to virt procr fn
-   void *stackPtr, *framePtr, *jmpPt, *coreLoopFramePtrAddr, \
-        *coreLoopStackPtrAddr;
-   
-   stackPtr = currPr->stackPtr;
-   framePtr = currPr->framePtr;
-   jmpPt    = currPr->nextInstrPt;
-   coreLoopFramePtrAddr = &(currPr->coreLoopFramePtr);
-   coreLoopStackPtrAddr = &(currPr->coreLoopStackPtr);
-
-      //Save the core loop's stack and frame pointers into virt procr struct
-      // then switch to stack ptr and frame ptr of virt procr & jmp to it
-      //This was a pain to get right because GCC converts the "(jmpPt)" to
-      // frame-relative mem-op -- so generated machine code first changed the
-      // frame pointer, then tried to jump to an addr stored on stack, which
-      // it accessed as an offset from frame-ptr!  (wrong frame-ptr now)
-      //Explicitly loading into eax before changing frame-ptr fixed it
-      //Also, it turns "(currPr->coreLoopFramePtr)" into a temporary on the
-      // stack, so "movl %%ebp, %0" saves to the temp, NOT the data-struc!
-   asm volatile("movl %0, %%eax;      \
-                 movl %%esp, (%%eax); \
-                 movl %1, %%eax;      \
-                 movl %%ebp, (%%eax); \
-                 movl %2, %%eax;      \
-                 movl %3, %%esp;      \
-                 movl %4, %%ebp;      \
-                 jmp  %%eax"          \
-   /* outputs */ : "=g"(coreLoopStackPtrAddr),                 \
-                   "=g"(coreLoopFramePtrAddr)                  \
-   /* inputs  */ : "g" (jmpPt), "g" (stackPtr), "g" (framePtr) \
-   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
-                );
+   SwitchToVP( currPr )
 
    //=========== jmp to here when want to shut down the VMS system ==========
    CoreLoopEndPt:
@@ -176,7 +162,7 @@
 coreLoop_Seq( void *paramsIn )
  {
    VirtProcr      *currPr;
-   SRSWQueueStruc *readyToAnimateQ;
+   VMSQueueStruc *readyToAnimateQ;
    
    ThdParams      *coreLoopThdParams;
    int             thisCoresIdx;
@@ -207,7 +193,7 @@
       //_VMSWorkQ must be a global, static volatile var, so not kept in reg,
       // which forces reloading the pointer after each jmp to this point
    readyToAnimateQ  = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx];
-   currPr = (VirtProcr *) readSRSWQ_NonBlocking( readyToAnimateQ );
+   currPr = (VirtProcr *) readVMSQ( readyToAnimateQ );
    if( currPr == NULL )
     { if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 )
        { printf("too many back to back MasterVP\n"); exit(1); }
@@ -219,38 +205,7 @@
       _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0;
 
 
-      //switch to virt procr's stack and frame ptr then jump to virt procr
-   void *stackPtr, *framePtr, *jmpPt, *coreLoopFramePtrAddr, \
-        *coreLoopStackPtrAddr;
-
-   stackPtr = currPr->stackPtr;
-   framePtr = currPr->framePtr;
-   jmpPt    = currPr->nextInstrPt;
-   coreLoopFramePtrAddr = &(currPr->coreLoopFramePtr);
-   coreLoopStackPtrAddr = &(currPr->coreLoopStackPtr);
-
-      //Save the core loop's stack and frame pointers into virt procr struct
-      // then switch to stack ptr and frame ptr of virt procr & jmp to it
-      //This was a pain to get right because GCC converts the "(jmpPt)" to
-      // frame-relative mem-op -- so generated machine code first changed the
-      // frame pointer, then tried to jump to an addr stored on stack, which
-      // it accessed as an offset from frame-ptr!  (wrong frame-ptr now)
-      //Explicitly loading into eax before changing frame-ptr fixed it
-      //Also, it turns "(currPr->coreLoopFramePtr)" into a temporary on the
-      // stack, so "movl %%ebp, %0" saves to the temp, NOT the data-struc!
-   asm volatile("movl %0, %%eax;      \
-                 movl %%esp, (%%eax); \
-                 movl %1, %%eax;      \
-                 movl %%ebp, (%%eax); \
-                 movl %2, %%eax;      \
-                 movl %3, %%esp;      \
-                 movl %4, %%ebp;      \
-                 jmp  %%eax"          \
-   /* outputs */ : "=g"(coreLoopStackPtrAddr),                 \
-                   "=g"(coreLoopFramePtrAddr)                  \
-   /* inputs  */ : "g" (jmpPt), "g" (stackPtr), "g" (framePtr) \
-   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
-                );
+   SwitchToVP( currPr )
 
    //========================================================================
       //jmp to here when want to shut down the VMS system.  A shutdown VP is
diff -r f8508572f3de -r 3bac84e4e56e MasterLoop.c
--- a/MasterLoop.c	Tue Nov 02 16:43:01 2010 -0700
+++ b/MasterLoop.c	Thu Nov 04 18:13:18 2010 -0700
@@ -12,6 +12,14 @@
 #include "VMS.h"
 
 
+//============== Forward declarations of the work-stealers ==================
+void inline
+stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
+               VirtProcr *masterPr );
+void inline
+gateProtected_stealWorkInto( SchedSlot *currSlot,
+               VMSQueueStruc *myReadyToAnimateQ, VirtProcr *masterPr );
+//===========================================================================
 
 /*This code is animated by the virtual Master processor.
  *
@@ -64,7 +72,7 @@
  */
 void masterLoop( void *initData, VirtProcr *animatingPr )
  { 
-   int             slotIdx;
+   int32           slotIdx, numSlotsFilled;
    VirtProcr      *schedVirtPr;
    SchedSlot      *currSlot, **schedSlots;
    MasterEnv      *masterEnv;
@@ -74,7 +82,7 @@
    RequestHandler  requestHandler;
    void           *semanticEnv;
 
-   int             thisCoresIdx;
+   int32           thisCoresIdx;
    VirtProcr      *masterPr;
    volatile        VirtProcr *volatileMasterPr;
    
@@ -108,7 +116,7 @@
 
    masterEnv        = _VMSMasterEnv;
    
-//TODO: check that compiles so that always re-define from frame-storage
+      //GCC may optimize so doesn't always re-define from frame-storage
    masterPr         = volatileMasterPr;  //just to make sure after jmp
    thisCoresIdx     = masterPr->coreAnimatedBy;
    readyToAnimateQ  = masterEnv->readyToAnimateQs[thisCoresIdx];
@@ -120,6 +128,7 @@
 
 
       //Poll each slot's Done flag
+   numSlotsFilled = 0;
    for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++)
     {
       currSlot = schedSlots[ slotIdx ];
@@ -141,46 +150,203 @@
           { currSlot->procrAssignedToSlot = schedVirtPr;
             schedVirtPr->schedSlot        = currSlot;
             currSlot->needsProcrAssigned  = FALSE;
-
-            writeSRSWQ( schedVirtPr, readyToAnimateQ );
+            numSlotsFilled               += 1;
+            
+            writeVMSQ( schedVirtPr, readyToAnimateQ );
           }
        }
     }
 
+   
+   #ifdef USE_WORK_STEALING
+      //If no slots filled, means no more work, look for work to steal.
+   if( numSlotsFilled == 0 )
+    { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterPr );
+    }
+   #endif
 
-      //Save stack ptr and frame, restore CoreLoop's stack and frame,
-      // and clear the MasterLock
-      //TODO: cafefully verify don't need to force saving anything to stack
-      // before jumping back to core loop.
-   void           *stackPtrAddr, *framePtrAddr, *masterLockAddr;
-   void           *jmpPt, *coreLoopFramePtr, *coreLoopStackPtr;
-
-   stackPtrAddr      = &(masterPr->stackPtr);
-   framePtrAddr      = &(masterPr->framePtr);
-   masterLockAddr    = &(_VMSMasterEnv->masterLock);
-
-   jmpPt             = _VMSMasterEnv->coreLoopStartPt;
-   coreLoopFramePtr  = masterPr->coreLoopFramePtr;//need this only
-   coreLoopStackPtr  = masterPr->coreLoopStackPtr;//shouldn't need -- safety
    
    #ifdef MEAS__TIME_MASTER
    saveLowTimeStampCountInto( masterPr->endMasterTSCLow );
    #endif
 
-   asm volatile("movl %0,     %%eax;  \
-                 movl %%esp, (%%eax); \
-                 movl %1,     %%eax;  \
-                 movl %%ebp, (%%eax); \
-                 movl %2, %%ebx;      \
-                 movl %3, %%eax;      \
-                 movl %4, %%esp;      \
-                 movl %5, %%ebp;      \
-                 movl $0x0, (%%ebx);  \
-                 jmp  %%eax;"         \
-   /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr),                \
-                   "=g"(masterLockAddr)                                     \
-   /* inputs  */ : "g" (jmpPt), "g"(coreLoopStackPtr), "g"(coreLoopFramePtr)\
-   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
-                );//can probably make clobber list empty -- but safe for now
+   
+   masterSwitchToCoreLoop( masterPr )
  }
 
+
+
+/*Unprotected stealer: takes one VP from the first other core whose queue
+ * is non-empty, assigns it to currSlot and writes it to readyToAnimateQ.
+ *Has a race -- the victim's coreloop may read that queue at the same time*/
+void inline
+stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
+               VirtProcr *masterPr )
+ { 
+   VirtProcr   *stolenPr;
+   int32        coreIdx, i;
+   VMSQueueStruc *currQ;
+      //visit each other core once, starting after own core and wrapping
+   stolenPr = NULL;
+   coreIdx = masterPr->coreAnimatedBy;
+   for( i = 0; i < NUM_CORES -1; i++ )
+    {
+      if( coreIdx >= NUM_CORES -1 )
+       { coreIdx = 0;
+       }
+      else
+       { coreIdx++;
+       }
+      currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
+      if( numInVMSQ( currQ ) > 0 )
+       { stolenPr = readVMSQ (currQ );
+         break;
+       }
+    }
+      //if got one, give it to the slot and write it to readyToAnimateQ
+   if( stolenPr != NULL )
+    { currSlot->procrAssignedToSlot = stolenPr;
+      stolenPr->schedSlot           = currSlot;
+      currSlot->needsProcrAssigned  = FALSE;
+
+      writeVMSQ( stolenPr, readyToAnimateQ );
+    }
+ }
+
+/*This algorithm makes the common case fast.  Make the coreloop passive,
+ * and show its progress.  Make the stealer control a gate that coreloop
+ * has to pass.
+ *To avoid interference, only one stealer at a time.  Use a global
+ * stealer-lock.
+ *
+ *The pattern is based on a gate -- stealer shuts the gate, then monitors
+ * to be sure any already past make it all the way out, before starting.
+ *So, have a "progress" measure just before the gate, then have two after it,
+ * one is in a "waiting room" outside the gate, the other is at the exit.
+ *Then, the stealer first shuts the gate, then checks the progress measure
+ * outside it, then looks to see if the progress measure at the exit is the
+ * same.  If yes, it knows the protected area is empty 'cause no other way
+ * to get in and the last to get in also exited.
+ *If the progress measure at the exit is not the same, then the stealer goes
+ * into a loop checking both the waiting-area and the exit progress-measures
+ * until one of them shows the same as the measure outside the gate.  Might
+ * as well re-read the measure outside the gate each go around, just to be
+ * sure.  It is guaranteed that one of the two will eventually match the one
+ * outside the gate.
+ *
+ *Here's an informal proof of correctness:
+ *The gate can be closed at any point, and have only four cases:
+ *  1) coreloop made it past the gate-closing but not yet past the exit
+ *  2) coreloop made it past the pre-gate progress update but not yet past
+ *     the gate,
+ *  3) coreloop is right before the pre-gate update
+ *  4) coreloop is past the exit and far from the pre-gate update.
+ *
+ * Covering the cases in reverse order,
+ *  4) is not a problem -- stealer will read pre-gate progress, see that it
+ *     matches exit progress, and the gate is closed, so stealer can proceed.
+ *  3) stealer will read pre-gate progress just after coreloop updates it..
+ *     so stealer goes into a loop until the coreloop causes wait-progress
+ *     to match pre-gate progress, so then stealer can proceed
+ *  2) same as 3..
+ *  1) stealer reads pre-gate progress, sees that it's different than exit,
+ *     so goes into loop until exit matches pre-gate, now it knows coreloop
+ *     is not in protected and cannot get back in, so can proceed.
+ *
+ *Implementation for the stealer:
+ *
+ *First, acquire the stealer lock -- only cores with no work to do will
+ * compete to steal, so not a big performance penalty having only one --
+ * will rarely have multiple stealers in a system with plenty of work -- and
+ * in a system with little work, it doesn't matter.
+ *
+ *Note, have single-reader, single-writer pattern for all variables used to
+ * communicate between stealer and victims
+ *
+ *So, scan the queues of the core loops, until find non-empty.  Each core
+ * has its own list that it scans.  The list goes in order from closest to
+ * furthest core, so it steals first from close cores.  Later can add
+ * taking info from the app about overlapping footprints, and scan all the
+ * others then choose work with the most footprint overlap with the contents
+ * of this core's cache.
+ *
+ *Now, have a victim to take work from.  So, shut the gate in that
+ * coreloop, by setting the "gate closed" var on its stack to TRUE.
+ *Then, read the core's pre-gate progress and compare to the core's exit
+ * progress.
+ *If same, can proceed to take work from the coreloop's queue.  When done,
+ * write FALSE to gate closed var.
+ *If different, then enter a loop that reads the pre-gate progress, then
+ * compares to exit progress then to wait progress.  When one of two
+ * matches, proceed.  Take work from the coreloop's queue.  When done,
+ * write FALSE to the gate closed var.
+ * 
+ */
+void inline
+gateProtected_stealWorkInto( SchedSlot *currSlot,
+                             VMSQueueStruc *myReadyToAnimateQ,
+                             VirtProcr *masterPr )
+ {    //Steals at most one VP from another core's ready queue.  The take is
+      // done holding the stealer lock with the victim's gate shut; the
+      // stolen VP is assigned to currSlot and written to myReadyToAnimateQ.
+      //Returns without stealing if no victim is found or the lock is held.
+   VirtProcr     *stolenPr;
+   int32          coreIdx, i, haveAVictim, gotLock;
+   VMSQueueStruc *victimsQ;
+   volatile GateStruc *vicGate;
+   int32               coreMightBeInProtected;
+      //see if any other cores have work available to steal
+   haveAVictim = FALSE;
+   coreIdx = masterPr->coreAnimatedBy;
+   for( i = 0; i < NUM_CORES -1; i++ )
+    {
+      if( coreIdx >= NUM_CORES -1 )
+       { coreIdx = 0;
+       }
+      else
+       { coreIdx++;
+       }
+      victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
+      vicGate  = _VMSMasterEnv->workStealingGates[ coreIdx ];
+         //gate ptr is NULL until that coreloop registers it -- skip if so
+      if( vicGate != NULL  &&  numInVMSQ( victimsQ ) > 0 )
+       { haveAVictim = TRUE;
+         break;
+       }
+    }
+   if( !haveAVictim ) return;  //no work to steal, exit
+
+      //have a victim core, now get the stealer-lock
+   gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock),
+                                                          UNLOCKED, LOCKED );
+   if( !gotLock ) return; //go back to core loop, which will re-start master
+
+   //====== Start Gate-protection =======
+   vicGate->gateClosed = TRUE;
+      //Full fence: the gate-closed store must be visible before the progress
+      // loads below -- volatile alone does not stop store-load reordering.
+      //NOTE(review): the coreloop needs the matching fence between its
+      // preGateProgress++ and its read of gateClosed -- confirm it has one.
+   __sync_synchronize();
+   coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress;
+   while( coreMightBeInProtected )
+    {    //wait until sure
+      if( vicGate->preGateProgress == vicGate->waitProgress )
+         coreMightBeInProtected = FALSE;
+      if( vicGate->preGateProgress == vicGate->exitProgress )
+         coreMightBeInProtected = FALSE;
+    }
+   stolenPr = readVMSQ ( victimsQ );
+   vicGate->gateClosed = FALSE;
+   //======= End Gate-protection  =======
+
+   if( stolenPr != NULL )  //victim could have been in protected and taken
+    { currSlot->procrAssignedToSlot = stolenPr;
+      stolenPr->schedSlot           = currSlot;
+      currSlot->needsProcrAssigned  = FALSE;
+      writeVMSQ( stolenPr, myReadyToAnimateQ );
+    }
+      //fence so the writes above complete before the lock is seen as free
+   __sync_synchronize();
+   _VMSMasterEnv->workStealingLock = UNLOCKED;
+ }
diff -r f8508572f3de -r 3bac84e4e56e SwitchAnimators.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SwitchAnimators.h	Thu Nov 04 18:13:18 2010 -0700
@@ -0,0 +1,138 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ * 
+ */
+
+#ifndef _SwitchAnimators_H
+#define	_SwitchAnimators_H
+#define __USE_GNU
+
+/*Isolating code for switching between animators within these macros -- at
+ * some point will make switches to compile for 32 bit or for 64 bit, which
+ * having these isolated will make cleaner
+ *
+ *This also makes it easier to change architectures, at some point
+ *And it cleans the code up, having the ugly assembly out of the way
+ */
+
+//=========================== MasterVP to CoreLoop ==========================
+//Expands to declarations plus statements, so usable only once per scope.
+      //Saves esp and ebp into masterPr->stackPtr and masterPr->framePtr,
+      // loads the core loop's saved stack and frame ptrs, writes 0 into
+      // _VMSMasterEnv->masterLock, then jmps to coreLoopStartPt.
+      //GCC's -O3 messes with this -- go through generated -- protect somehow
+#define masterSwitchToCoreLoop( masterPr )   \
+   void           *stackPtrAddr, *framePtrAddr, *masterLockAddr; \
+   void           *jmpPt, *coreLoopFramePtr, *coreLoopStackPtr;  \
+   /*where to save this VP's stack and frame ptrs, and the lock's addr*/ \
+   stackPtrAddr      = &(masterPr->stackPtr); \
+   framePtrAddr      = &(masterPr->framePtr); \
+   masterLockAddr    = &(_VMSMasterEnv->masterLock); \
+   /*where to resume: core loop's start point and its saved ptrs*/ \
+   jmpPt             = _VMSMasterEnv->coreLoopStartPt; \
+   coreLoopFramePtr  = masterPr->coreLoopFramePtr; \
+   coreLoopStackPtr  = masterPr->coreLoopStackPtr; \
+   /*NOTE(review): %0-%2 are read by the asm yet declared "=g" -- confirm*/ \
+   asm volatile("movl %0,     %%eax;  \
+                 movl %%esp, (%%eax); \
+                 movl %1,     %%eax;  \
+                 movl %%ebp, (%%eax); \
+                 movl %2, %%ebx;      \
+                 movl %3, %%eax;      \
+                 movl %4, %%esp;      \
+                 movl %5, %%ebp;      \
+                 movl $0x0, (%%ebx);  \
+                 jmp  %%eax;"         \
+   /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr),                \
+                   "=g"(masterLockAddr)                                     \
+   /* inputs  */ : "g" (jmpPt), "g"(coreLoopStackPtr), "g"(coreLoopFramePtr)\
+   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
+                );//can probably make clobber list empty -- but safe for now
+
+
+//=========================== SlaveVP to CoreLoop ===========================
+//Saves esp and ebp into animatingPr->stackPtr and ->framePtr, then loads
+// the core loop's saved stack and frame ptrs and jmps to coreLoopStartPt.
+#define    SwitchToCoreLoop( animatingPr ) \
+   void *jmpPt, *stackPtrAddr, *framePtrAddr, *coreLoopStackPtr; \
+   void *coreLoopFramePtr; \
+   /*addresses of the fields that receive this VP's stack and frame ptrs*/ \
+   stackPtrAddr      = &(animatingPr->stackPtr); \
+   framePtrAddr      = &(animatingPr->framePtr); \
+   /*core loop's resume point and its saved stack and frame ptrs*/ \
+   jmpPt             = _VMSMasterEnv->coreLoopStartPt; \
+   coreLoopFramePtr  = animatingPr->coreLoopFramePtr; \
+   coreLoopStackPtr  = animatingPr->coreLoopStackPtr; \
+   /*first asm only saves; the second asm does the actual switch and jmp*/ \
+      /*Save the virt procr's stack and frame ptrs*/ \
+   asm volatile("movl %0,     %%eax;  \
+                 movl %%esp, (%%eax); \
+                 movl %1,     %%eax;  \
+                 movl %%ebp, (%%eax) "\
+   /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr) \
+   /* inputs  */ :        \
+   /* clobber */ : "%eax" \
+                ); \
+   /*jmpPt goes into eax before esp and ebp are replaced*/ \
+     /*restore coreloop's frame ptr, then jump back to "start" of core loop*/\
+     /*Note, GCC compiles to assembly that saves esp and ebp in the stack*/ \
+     /* frame -- so have to explicitly do assembly that saves to memory*/ \
+   asm volatile("movl %0, %%eax;      \
+                 movl %1, %%esp;      \
+                 movl %2, %%ebp;      \
+                 jmp  %%eax    "      \
+   /* outputs */ :                    \
+   /* inputs  */ : "m" (jmpPt), "m"(coreLoopStackPtr), "m"(coreLoopFramePtr)\
+   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi","%esi"  \
+                );
+ //list everything as clobbered to force GCC to save all
+ // live vars that are in regs on stack before this
+ // assembly, so that stack pointer is correct, before jmp
+
+
+
+//============================== CoreLoop to VP =============================
+//
+      //Save the core loop's stack and frame pointers into virt procr struct
+      // then switch to stack ptr and frame ptr of virt procr & jmp to it
+      //This was a pain to get right because GCC converts the "(jmpPt)" to
+      // frame-relative mem-op -- so generated machine code first changed the
+      // frame pointer, then tried to jump to an addr stored on stack, which
+      // it accessed as an offset from frame-ptr!  (wrong frame-ptr now)
+      //Explicitly loading into eax before changing frame-ptr fixed it
+      //Also, it turns "(currPr->coreLoopFramePtr)" into a temporary on the
+      // stack, so "movl %%ebp, %0" saves to the temp, NOT the data-struc!
+      //The jmp target is currPr->nextInstrPt, loaded into eax before the
+      // stack and frame ptrs are changed.
+      //switch to virt procr's stack and frame ptr then jump to virt procr fn
+      //Expands to declarations plus statements -- use once per scope.
+#define SwitchToVP( currPr ) \
+   void *stackPtr, *framePtr, *jmpPt, *coreLoopFramePtrAddr, \
+        *coreLoopStackPtrAddr; \
+   /*the VP's saved stack ptr, frame ptr and next-instruction address*/ \
+   stackPtr = currPr->stackPtr; \
+   framePtr = currPr->framePtr; \
+   jmpPt    = currPr->nextInstrPt; \
+   coreLoopFramePtrAddr = &(currPr->coreLoopFramePtr); \
+   coreLoopStackPtrAddr = &(currPr->coreLoopStackPtr); \
+   /*operands: %0,%1 save addresses; %2 jmpPt; %3 stackPtr; %4 framePtr*/ \
+   asm volatile("movl %0, %%eax;      \
+                 movl %%esp, (%%eax); \
+                 movl %1, %%eax;      \
+                 movl %%ebp, (%%eax); \
+                 movl %2, %%eax;      \
+                 movl %3, %%esp;      \
+                 movl %4, %%ebp;      \
+                 jmp  %%eax"          \
+   /* outputs */ : "=g"(coreLoopStackPtrAddr),                 \
+                   "=g"(coreLoopFramePtrAddr)                  \
+   /* inputs  */ : "g" (jmpPt), "g" (stackPtr), "g" (framePtr) \
+   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
+                );
+
+   
+#endif	/* _SwitchAnimators_H */
+
diff -r f8508572f3de -r 3bac84e4e56e VMS.c
--- a/VMS.c	Tue Nov 02 16:43:01 2010 -0700
+++ b/VMS.c	Thu Nov 04 18:13:18 2010 -0700
@@ -87,7 +87,7 @@
 void
 create_masterEnv()
  { MasterEnv       *masterEnv;
-   SRSWQueueStruc **readyToAnimateQs;
+   VMSQueueStruc **readyToAnimateQs;
    int              coreIdx;
    VirtProcr      **masterVPs;
    SchedSlot     ***allSchedSlots; //ptr to array of ptrs
@@ -105,7 +105,7 @@
    masterEnv     = _VMSMasterEnv;
    
       //Make a readyToAnimateQ for each core loop
-   readyToAnimateQs = VMS__malloc( NUM_CORES * sizeof(SRSWQueueStruc *) );
+   readyToAnimateQs = VMS__malloc( NUM_CORES * sizeof(VMSQueueStruc *) );
    masterVPs        = VMS__malloc( NUM_CORES * sizeof(VirtProcr *) );
 
       //One array for each core, 3 in array, core's masterVP scheds all
@@ -114,18 +114,20 @@
    _VMSMasterEnv->numProcrsCreated = 0;  //used by create procr
    for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
     {    
-      readyToAnimateQs[ coreIdx ] = makeSRSWQ();
+      readyToAnimateQs[ coreIdx ] = makeVMSQ();
       
          //Q: should give masterVP core-specific info as its init data?
       masterVPs[ coreIdx ] = VMS__create_procr( &masterLoop, masterEnv );
       masterVPs[ coreIdx ]->coreAnimatedBy = coreIdx;
       allSchedSlots[ coreIdx ] = create_sched_slots(); //makes for one core
       _VMSMasterEnv->numMasterInARow[ coreIdx ] = 0;
+      _VMSMasterEnv->workStealingGates[ coreIdx ] = NULL;
     }
    _VMSMasterEnv->readyToAnimateQs = readyToAnimateQs;
    _VMSMasterEnv->masterVPs        = masterVPs;
    _VMSMasterEnv->masterLock       = UNLOCKED;
    _VMSMasterEnv->allSchedSlots    = allSchedSlots;
+   _VMSMasterEnv->workStealingLock = UNLOCKED;
 
 
       //Aug 19, 2010:  no longer need to place initial masterVP into queue
@@ -338,8 +340,7 @@
  */
 void
 VMS__suspend_procr( VirtProcr *animatingPr )
- { void *jmpPt, *stackPtrAddr, *framePtrAddr, *coreLoopStackPtr;
-   void *coreLoopFramePtr;
+ { 
 
       //The request to master will cause this suspended virt procr to get
       // scheduled again at some future point -- to resume, core loop jumps
@@ -350,23 +351,6 @@
       //return ownership of the virt procr and sched slot to Master virt pr
    animatingPr->schedSlot->workIsDone = TRUE;
 
-   stackPtrAddr      = &(animatingPr->stackPtr);
-   framePtrAddr      = &(animatingPr->framePtr);
-
-   jmpPt             = _VMSMasterEnv->coreLoopStartPt;
-   coreLoopFramePtr  = animatingPr->coreLoopFramePtr;//need this only
-   coreLoopStackPtr  = animatingPr->coreLoopStackPtr;//safety
-
-      //Save the virt procr's stack and frame ptrs,
-   asm volatile("movl %0,     %%eax;  \
-                 movl %%esp, (%%eax); \
-                 movl %1,     %%eax;  \
-                 movl %%ebp, (%%eax) "\
-   /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr) \
-   /* inputs  */ :        \
-   /* clobber */ : "%eax" \
-                );
-
    //===========================  Measurement stuff ========================
    #ifdef MEAS__TIME_STAMP_SUSP
       //record time stamp: compare to time-stamp recorded below
@@ -374,20 +358,10 @@
    #endif
    //=======================================================================
 
-      //restore coreloop's frame ptr, then jump back to "start" of core loop
-      //Note, GCC compiles to assembly that saves esp and ebp in the stack
-      // frame -- so have to explicitly do assembly that saves to memory
-   asm volatile("movl %0, %%eax;      \
-                 movl %1, %%esp;      \
-                 movl %2, %%ebp;      \
-                 jmp  %%eax    "      \
-   /* outputs */ :                    \
-   /* inputs  */ : "m" (jmpPt), "m"(coreLoopStackPtr), "m"(coreLoopFramePtr)\
-   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi","%esi"  \
-                ); //list everything as clobbered to force GCC to save all
-                   // live vars that are in regs on stack before this
-                   // assembly, so that stack pointer is correct, before jmp
 
+   SwitchToCoreLoop( animatingPr )
+
+   //=======================================================================
 ResumePt:
    #ifdef MEAS__TIME_STAMP_SUSP
       //NOTE: only take low part of count -- do sanity check when take diff
@@ -673,7 +647,7 @@
    for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ )
     {    //Note, this is running in the master
       shutDownPr = VMS__create_procr( &endOSThreadFn, NULL );
-      writeSRSWQ( shutDownPr, _VMSMasterEnv->readyToAnimateQs[coreIdx] );
+      writeVMSQ( shutDownPr, _VMSMasterEnv->readyToAnimateQs[coreIdx] );
     }
 
  }
@@ -717,7 +691,7 @@
 void
 VMS__cleanup_at_end_of_shutdown()
  { 
-   SRSWQueueStruc **readyToAnimateQs;
+   VMSQueueStruc **readyToAnimateQs;
    int              coreIdx;
    VirtProcr      **masterVPs;
    SchedSlot     ***allSchedSlots; //ptr to array of ptrs
@@ -731,7 +705,7 @@
    
    for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
     {
-      freeSRSWQ( readyToAnimateQs[ coreIdx ] );
+      freeVMSQ( readyToAnimateQs[ coreIdx ] );
          //master VPs were created external to VMS, so use external free
       VMS__dissipate_procr( masterVPs[ coreIdx ] );
       
diff -r f8508572f3de -r 3bac84e4e56e VMS.h
--- a/VMS.h	Tue Nov 02 16:43:01 2010 -0700
+++ b/VMS.h	Thu Nov 04 18:13:18 2010 -0700
@@ -11,7 +11,7 @@
 #define __USE_GNU
 
 #include "VMS_primitive_data_types.h"
-#include "Queue_impl/BlockingQueue.h"
+#include "Queue_impl/PrivateQueue.h"
 #include "Histogram/Histogram.h"
 #include "DynArray/DynArray.h"
 #include "Hash_impl/PrivateHash.h"
@@ -22,28 +22,36 @@
 
 
 //===============================  Debug  ===================================
-   //These defines turn types of bug messages on and off
-#define dbgProbes FALSE
-#define dbgAppFlow FALSE
-
+//
    //When SEQUENTIAL is defined, VMS does sequential exe in the main thread
    // It still does co-routines and all the mechanisms are the same, it just
    // has only a single thread and animates VPs one at a time
 //#define SEQUENTIAL
 
+//#define USE_WORK_STEALING
+
    //turns on the probe-instrumentation in the application -- when not
    // defined, the calls to the probe functions turn into comments
 #define STATS__ENABLE_PROBES
 
+   //These defines turn types of bug messages on and off
+   // be sure debug messages are un-commented (next block of defines)
+#define dbgProbes FALSE   /* for issues inside probes themselves*/
+#define dbgAppFlow TRUE  /* Top level flow of application code -- general*/
+#define dbgB2BMaster FALSE/* in coreloop, back to back master VPs*/
+#define dbgRqstHdlr FALSE /* in request handler code*/
 
-#define DEBUG(msg)// printf(msg); fflush(stdin);
-#define DEBUG_MSG( bool, msg) //if( bool){ printf(msg); fflush(stdin);}
-#define PRINT1_DEBUG(msg, param) //printf(msg, param); fflush(stdin);
-#define PRINT2_DEBUG(msg, p1, p2) //printf(msg, p1, p2); fflush(stdin);
+   //Swap a macro body with its "//"-commented form to turn that message type off/on; stdout is flushed after each print
+#define DEBUG(  bool, msg)         \
+   if( bool){ printf(msg); fflush(stdout);}
+#define DEBUG1( bool, msg, param)  \
+   if(bool){printf(msg, param); fflush(stdout);}
+#define DEBUG2( bool, msg, p1, p2) \
+   //if(bool) {printf(msg, p1, p2); fflush(stdout);}
 
-#define PRINT_ERROR(msg) printf(msg); fflush(stdin);
-#define PRINT1_ERROR(msg, param) printf(msg, param); fflush(stdin);
-#define PRINT2_ERROR(msg, p1, p2) printf(msg, p1, p2); fflush(stdin);
+#define ERROR(msg) printf(msg); fflush(stdout); /* flush so the message is not left in the buffer */
+#define ERROR1(msg, param) printf(msg, param); fflush(stdout);
+#define ERROR2(msg, p1, p2) printf(msg, p1, p2); fflush(stdout);
 
 //===========================  STATS =======================
 
@@ -56,6 +64,8 @@
 #define MEAS__TIME_MASTER
 #define MEAS__NUM_TIMES_TO_RUN 100000
 
+   //For code that calculates normalization-offset between TSC counts of
+   // different cores.
 #define NUM_TSC_ROUND_TRIPS 10
 
 
@@ -64,8 +74,9 @@
    // machine
 #define NUM_CORES        4
 
-   // balance amortizing master fixed overhead vs imbalance potential
-#define NUM_SCHED_SLOTS  3
+   // tradeoff amortizing master fixed overhead vs imbalance potential
+   // when work-stealing, can make bigger, at risk of losing cache affinity
+#define NUM_SCHED_SLOTS  5
 
 #define MIN_WORK_UNIT_CYCLES 20000
 
@@ -82,10 +93,11 @@
 
 #define SUCCESS 0
 
-#define writeVMSQ     writeSRSWQ
-#define readVMSQ      readSRSWQ
-#define makeVMSQ      makeSRSWQ
-#define VMSQueueStruc SRSWQueueStruc
+#define writeVMSQ     writePrivQ
+#define readVMSQ      readPrivQ
+#define makeVMSQ      makePrivQ
+#define numInVMSQ     numInPrivQ
+#define VMSQueueStruc PrivQueueStruc
 
 
 
@@ -96,6 +108,8 @@
 typedef struct _VMSReqst      VMSReqst;
 typedef struct _VirtProcr     VirtProcr;
 typedef struct _IntervalProbe IntervalProbe;
+typedef struct _GateStruc     GateStruc;
+
 
 typedef VirtProcr * (*SlaveScheduler)  ( void *, int );   //semEnv, coreIdx
 typedef void  (*RequestHandler)  ( VirtProcr *, void * ); //prWReqst, semEnv
@@ -190,7 +204,7 @@
    RequestHandler   requestHandler;
    
    SchedSlot     ***allSchedSlots;
-   SRSWQueueStruc **readyToAnimateQs;
+   VMSQueueStruc **readyToAnimateQs;
    VirtProcr      **masterVPs;
 
    void            *semanticEnv;
@@ -205,6 +219,9 @@
    int32            masterLock;
 
    int32            numMasterInARow[NUM_CORES];//detect back-to-back masterVP
+   GateStruc      **workStealingGates[ NUM_CORES ]; //concurrent work-steal; NOTE(review): CoreLoop stores &gate (a GateStruc *) here -- confirm the extra '*'
+   int32            workStealingLock;
+   
    int32            numProcrsCreated; //gives ordering to processor creation
 
       //=========== MEASUREMENT STUFF =============
@@ -216,13 +233,21 @@
  }
 MasterEnv;
 
-//=============================
+//=========================  Extra Stuff Data Strucs  =======================
 typedef struct
  {
 
  }
 VMSExcp;
 
+struct _GateStruc
+ {  //one per core loop, used for work-stealing; CoreLoop sets gateClosed FALSE and the three counters to 0 at start-up
+   int32 gateClosed;      //core loop tests this right after bumping preGateProgress
+   int32 preGateProgress; //incremented by the core loop before it tests gateClosed
+   int32 waitProgress;    //presumably advanced once the core loop is in the waiting area -- confirm
+   int32 exitProgress;    //presumably advanced when the core loop leaves the gate -- confirm
+ };
+//GateStruc
 
 //=======================  OS Thread related  ===============================
 
@@ -342,6 +367,7 @@
                 );
 //=====
 
+#include "SwitchAnimators.h"
 #include "probes.h"
 
 #endif	/* _VMS_H */
diff -r f8508572f3de -r 3bac84e4e56e probes.c
--- a/probes.c	Tue Nov 02 16:43:01 2010 -0700
+++ b/probes.c	Thu Nov 04 18:13:18 2010 -0700
@@ -253,7 +253,7 @@
 VMS_impl__record_interval_start_in_probe( int32 probeID )
  { IntervalProbe *probe;
 
-         DEBUG_MSG( dbgProbes, "record start of interval\n" )
+         DEBUG( dbgProbes, "record start of interval\n" )
    probe = _VMSMasterEnv->intervalProbes[ probeID ];
    gettimeofday( &(probe->startStamp), NULL );
  }
@@ -268,7 +268,7 @@
    struct timeval *endStamp, *startStamp;
    float64 startSecs, endSecs;
 
-         DEBUG_MSG( dbgProbes, "record end of interval\n" )
+         DEBUG( dbgProbes, "record end of interval\n" )
       //possible seg-fault if array resized by diff core right after this
       // one gets probe..?  Something like that?  Might be safe.. don't care
    probe = _VMSMasterEnv->intervalProbes[ probeID ];
diff -r f8508572f3de -r 3bac84e4e56e vmalloc.c
--- a/vmalloc.c	Tue Nov 02 16:43:01 2010 -0700
+++ b/vmalloc.c	Thu Nov 04 18:13:18 2010 -0700
@@ -67,7 +67,7 @@
     }
 
    if( foundElem == NULL )
-    { PRINT_ERROR("\nmalloc failed\n")
+    { ERROR("\nmalloc failed\n")
       return NULL;  //indicates malloc failed
     }
       //Using a kludge to identify the element that is the top chunk in the