# HG changeset patch
# User SeanHalle
# Date 1289485191 28800
# Node ID 984f7d78bfdf0b3d976557bdf8511e95fed3368b
# Parent  4fbc2165e493d10b78cccfd8b8298e46cef04dd3# Parent  7b799a46cc8721572b30ffa19773b7a4b54bb582
Merge See what happens -- merged test stuff into Nov 8 VMS version

diff -r 4fbc2165e493 -r 984f7d78bfdf CoreLoop.c
--- a/CoreLoop.c	Tue Oct 26 18:31:34 2010 -0700
+++ b/CoreLoop.c	Thu Nov 11 06:19:51 2010 -0800
@@ -41,10 +41,32 @@
    VMSQueueStruc *readyToAnimateQ;
    unsigned long   coreMask;  //has 1 in bit positions of allowed cores
    int             errorCode;
-   
+
+      //work-stealing struc on stack to prevent false-sharing in cache-line
+   volatile GateStruc gate;
+   //preGateProgress, waitProgress, exitProgress, gateClosed;
+
+
    coreLoopThdParams = (ThdParams *)paramsIn;
    thisCoresIdx = coreLoopThdParams->coreNum;
 
+   gate.gateClosed      = FALSE;
+   gate.preGateProgress = 0;
+   gate.waitProgress    = 0;
+   gate.exitProgress    = 0;
+   _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = &gate;//race @startup
+
+      //wait until signalled that setup is complete
+   pthread_mutex_lock(   &suspendLock );
+   while( !(_VMSMasterEnv->setupComplete) )
+    {
+      pthread_cond_wait( &suspend_cond,
+                         &suspendLock );
+    }
+   pthread_mutex_unlock( &suspendLock );
+
+      //printf( "\nCore unsuspended: %d\n", coreLoopThdParams->coreNum );
+
       //set thread affinity
       //Linux requires pinning thd to core inside thread-function
       //Designate a core by a 1 in bit-position corresponding to the core
@@ -53,25 +75,9 @@
    pthread_t selfThd = pthread_self();
    errorCode =
    pthread_setaffinity_np( selfThd, sizeof(coreMask), &coreMask);
-
+   
    if(errorCode){ printf("\nset affinity failure\n"); exit(0); }
 
-      //measure offsets between TSCs
-      //Core 0 is the reference core, the rest react to it.
-   if( thisCoresIdx == 0 ) measureTSCOffsetsAsCore0();
-   else measureTSCOffsetsAsRemoteCore( thisCoresIdx );
-   
-      //wait until signalled that setup is complete
-   pthread_mutex_lock(   &suspendLock );
-   while( !(_VMSMasterEnv->setupComplete) )
-    { pthread_cond_wait( &suspend_cond, &suspendLock );
-    }
-   pthread_mutex_unlock( &suspendLock );
-
-
-      //printf( "\nCore unsuspended: %d\n", coreLoopThdParams->coreNum );
-
-
    
       //Save addr of "end core loop" label - jump to it to shut down coreloop
       //To get label addr in non-gcc compiler, can trick it by making a call
@@ -88,82 +94,64 @@
    
       // Get to work!  --  virt procr jumps back here when suspends
       //Note, have to restore the frame-pointer before jump to here, to get
-      // this code to work right (readyToAnimateQ and so forth are frame-ptr
-      // relative)
+      // this code to work right (readyToAnimateQ and so forth are frame-ptr relative)
 CoreLoopStartPt:
    
       //Get virtual processor from queue
-      //_VMSWorkQ must be a global, static volatile var, so not kept in reg,
+      //The Q must be a global, static volatile var, so not kept in reg,
       // which forces reloading the pointer after each jmp to this point
    readyToAnimateQ  = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx];
 
-   currPr = (VirtProcr *) readSRSWQ_NonBlocking( readyToAnimateQ );
+   #ifdef USE_WORK_STEALING
+      //Alg for work-stealing designed to make common case fast.  Comment
+      // in stealer code explains.
+   for(;;)
+    { gate.preGateProgress++;
+      __sync_synchronize(); //publish progress before sampling gateClosed
+      if( !gate.gateClosed ) break;
+         //gate is shut: show the stealer this core loop is in waiting area
+      gate.waitProgress = gate.preGateProgress;
+      while( gate.gateClosed ) /*busy wait*/;
+    }    //loop: take a fresh progress number so old waitProgress is stale
+   currPr = (VirtProcr *) readVMSQ( readyToAnimateQ );
+
+      //Set the coreloop's progress, so stealer can see it has made it out
+      // of the protected area
+   gate.exitProgress = gate.preGateProgress;
+   #else
+   currPr = (VirtProcr *) readVMSQ( readyToAnimateQ );
+   #endif
+
+   if( currPr != NULL ) _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0;
+
    int tries = 0; int gotLock = 0;
-   while( currPr == NULL )
-    {    //no VPs ready to animate, so run MasterVP --later make "try Master"
-         // VPs & put one in every queue at strategic point -- so have work
-         // avail if don't get lock & short-circuit out of it if master has
-         // recently run on another core
-         //TODO: perf -- "try Master" VP that checks if should run Master Fn
-         //But just letting queue run empty is quickest to see if pinning VP
-         // to core will solve the bizarre random seg-faults in system stack.
-
-         //check if get the MasterLock
+   while( currPr == NULL ) //if queue was empty, enter get masterLock loop
+    {    //queue was empty, so get master lock
       gotLock = __sync_bool_compare_and_swap( &(_VMSMasterEnv->masterLock), \
-                                                 UNLOCKED, LOCKED );
-
+                                                          UNLOCKED, LOCKED );
       if( gotLock )
-       {
-            //run own MasterVP -- when its done, unlocks MasterLock and
-            // jumps back to coreLoops's startPt
+       {    //run own MasterVP -- jmps to coreLoops startPt when done
          currPr = _VMSMasterEnv->masterVPs[thisCoresIdx];
-         addToHist( tries, _VMSMasterEnv->stats->masterLockHist );
+         if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 )
+          {       DEBUG( dbgB2BMaster,"Many back to back MasterVPs\n");
+            pthread_yield();
+          }
+         _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1;
          break;  //end while -- have a VP to animate now
        }
       
-      tries++;
-      
-      if( tries % READYTOANIMATE_RETRIES == 0 ) pthread_yield();
+      tries++;      //if too many, means master on other core taking too long
+      if( tries > MASTERLOCK_RETRIES ) { tries = 0; pthread_yield(); }
     }
    
-      //switch to virt procr's stack and frame ptr then jump to virt procr fn
-   void *stackPtr, *framePtr, *jmpPt, *coreLoopFramePtrAddr, \
-        *coreLoopStackPtrAddr;
-   
-   stackPtr = currPr->stackPtr;
-   framePtr = currPr->framePtr;
-   jmpPt    = currPr->nextInstrPt;
-   coreLoopFramePtrAddr = &(currPr->coreLoopFramePtr);
-   coreLoopStackPtrAddr = &(currPr->coreLoopStackPtr);
 
-      //Save the core loop's stack and frame pointers into virt procr struct
-      // then switch to stack ptr and frame ptr of virt procr & jmp to it
-      //This was a pain to get right because GCC converts the "(jmpPt)" to
-      // frame-relative mem-op -- so generated machine code first changed the
-      // frame pointer, then tried to jump to an addr stored on stack, which
-      // it accessed as an offset from frame-ptr!  (wrong frame-ptr now)
-      //Explicitly loading into eax before changing frame-ptr fixed it
-      //Also, it turns "(currPr->coreLoopFramePtr)" into a temporary on the
-      // stack, so "movl %%ebp, %0" saves to the temp, NOT the data-struc!
-   asm volatile("movl %0, %%eax;      \
-                 movl %%esp, (%%eax); \
-                 movl %1, %%eax;      \
-                 movl %%ebp, (%%eax); \
-                 movl %2, %%eax;      \
-                 movl %3, %%esp;      \
-                 movl %4, %%ebp;      \
-                 jmp  %%eax"          \
-   /* outputs */ : "=g"(coreLoopStackPtrAddr),                 \
-                   "=g"(coreLoopFramePtrAddr)                  \
-   /* inputs  */ : "g" (jmpPt), "g" (stackPtr), "g" (framePtr) \
-   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
-                );
+   SwitchToVP( currPr )
 
    //=========== jmp to here when want to shut down the VMS system ==========
    CoreLoopEndPt:
       //first free shutdown VP that jumped here -- it first restores the
       // coreloop's stack, so addr of currPr in stack frame is still correct
-   VMS__handle_dissipate_reqst( currPr );
+   VMS__dissipate_procr( currPr );
    pthread_exit( NULL );
  }
 
@@ -195,62 +183,33 @@
    _VMSMasterEnv->coreLoopStartPt = &&SeqCoreLoopStartPt;
    _VMSMasterEnv->coreLoopEndPt   = &&SeqCoreLoopEndPt;
 
-      //Core loop has no values live upon CoreLoopStartPt except
-      // readyToAnimateQ
+      //Core loop has no values live upon CoreLoopStartPt except readyToAnimateQ
       // every value in the code is defined by a statement in core loop,
       // after the start point -- with the one exception of _VMSWorkQ
 
 
       // Get to work!  --  virt procr jumps back here when done or suspends
       //Note, have to restore the frame-pointer before jump to here, to get
-      // this code to work right (readyToAnimateQ and so forth are frame-ptr
-      // relative)
+      // this code to work right (readyToAnimateQ and so forth are frame-ptr relative)
 SeqCoreLoopStartPt:
 
       //Get virtual processor from queue
       //_VMSWorkQ must be a global, static volatile var, so not kept in reg,
       // which forces reloading the pointer after each jmp to this point
    readyToAnimateQ  = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx];
-   currPr = (VirtProcr *) readSRSWQ_NonBlocking( readyToAnimateQ );
+   currPr = (VirtProcr *) readVMSQ( readyToAnimateQ );
    if( currPr == NULL )
+    { if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 )
+       { printf("too many back to back MasterVP\n"); exit(1); }
+      _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1;
+      
       currPr = _VMSMasterEnv->masterVPs[thisCoresIdx];
-   
+    }
+   else
+      _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0;
 
-//   printf("core %d loop procr addr: %d\n", coreLoopThdParams->coreNum, \
-//       (int)currPr ); fflush(stdin);
 
-      //switch to virt procr's stack and frame ptr then jump to virt procr
-   void *stackPtr, *framePtr, *jmpPt, *coreLoopFramePtrAddr, \
-        *coreLoopStackPtrAddr;
-
-   stackPtr = currPr->stackPtr;
-   framePtr = currPr->framePtr;
-   jmpPt    = currPr->nextInstrPt;
-   coreLoopFramePtrAddr = &(currPr->coreLoopFramePtr);
-   coreLoopStackPtrAddr = &(currPr->coreLoopStackPtr);
-
-      //Save the core loop's stack and frame pointers into virt procr struct
-      // then switch to stack ptr and frame ptr of virt procr & jmp to it
-      //This was a pain to get right because GCC converts the "(jmpPt)" to
-      // frame-relative mem-op -- so generated machine code first changed the
-      // frame pointer, then tried to jump to an addr stored on stack, which
-      // it accessed as an offset from frame-ptr!  (wrong frame-ptr now)
-      //Explicitly loading into eax before changing frame-ptr fixed it
-      //Also, it turns "(currPr->coreLoopFramePtr)" into a temporary on the
-      // stack, so "movl %%ebp, %0" saves to the temp, NOT the data-struc!
-   asm volatile("movl %0, %%eax;      \
-                 movl %%esp, (%%eax); \
-                 movl %1, %%eax;      \
-                 movl %%ebp, (%%eax); \
-                 movl %2, %%eax;      \
-                 movl %3, %%esp;      \
-                 movl %4, %%ebp;      \
-                 jmp  %%eax"          \
-   /* outputs */ : "=g"(coreLoopStackPtrAddr),                 \
-                   "=g"(coreLoopFramePtrAddr)                  \
-   /* inputs  */ : "g" (jmpPt), "g" (stackPtr), "g" (framePtr) \
-   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
-                );
+   SwitchToVP( currPr )
 
    //========================================================================
       //jmp to here when want to shut down the VMS system.  A shutdown VP is
@@ -260,7 +219,7 @@
       // all the threads to die will proceed, gather the result, and
       // return to the calling application.
 SeqCoreLoopEndPt:
-   VMS__handle_dissipate_reqst( currPr ); //free shutdown pr, that jmpd here
+   VMS__dissipate_procr( currPr ); //free shutdown pr, that jmpd here
    return;
  }
 
@@ -380,6 +339,3 @@
  }
 
 
-
-
-
diff -r 4fbc2165e493 -r 984f7d78bfdf DESIGN_NOTES__VMS.txt
--- a/DESIGN_NOTES__VMS.txt	Tue Oct 26 18:31:34 2010 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-
-Implement VMS this way:
diff -r 4fbc2165e493 -r 984f7d78bfdf MasterLoop.c
--- a/MasterLoop.c	Tue Oct 26 18:31:34 2010 -0700
+++ b/MasterLoop.c	Thu Nov 11 06:19:51 2010 -0800
@@ -7,12 +7,19 @@
 
 
 #include <stdio.h>
-#include <malloc.h>
 #include <stddef.h>
 
 #include "VMS.h"
 
 
+//===========================================================================
+void inline
+stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
+               VirtProcr *masterPr );
+void inline  //called in masterLoop ahead of its definition, so declare here
+gateProtected_stealWorkInto( SchedSlot *currSlot,
+               VMSQueueStruc *myReadyToAnimateQ, VirtProcr *masterPr );
+//===========================================================================
 
 /*This code is animated by the virtual Master processor.
  *
@@ -65,7 +72,7 @@
  */
 void masterLoop( void *initData, VirtProcr *animatingPr )
  { 
-   int             slotIdx;
+   int32           slotIdx, numSlotsFilled;
    VirtProcr      *schedVirtPr;
    SchedSlot      *currSlot, **schedSlots;
    MasterEnv      *masterEnv;
@@ -75,7 +82,7 @@
    RequestHandler  requestHandler;
    void           *semanticEnv;
 
-   int             thisCoresIdx;
+   int32           thisCoresIdx;
    VirtProcr      *masterPr;
    volatile        VirtProcr *volatileMasterPr;
    
@@ -110,7 +117,7 @@
 
    masterEnv        = _VMSMasterEnv;
    
-//TODO: check that compiles so that always re-define from frame-storage
+      //GCC may optimize so doesn't always re-define from frame-storage
    masterPr         = volatileMasterPr;  //just to make sure after jmp
    thisCoresIdx     = masterPr->coreAnimatedBy;
    readyToAnimateQ  = masterEnv->readyToAnimateQs[thisCoresIdx];
@@ -122,6 +129,7 @@
 
 
       //Poll each slot's Done flag
+   numSlotsFilled = 0;
    for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++)
     {
       currSlot = schedSlots[ slotIdx ];
@@ -143,27 +151,21 @@
           { currSlot->procrAssignedToSlot = schedVirtPr;
             schedVirtPr->schedSlot        = currSlot;
             currSlot->needsProcrAssigned  = FALSE;
-
-            writeSRSWQ( schedVirtPr, readyToAnimateQ );
+            numSlotsFilled               += 1;
+            
+            writeVMSQ( schedVirtPr, readyToAnimateQ );
           }
        }
     }
 
+   
+   #ifdef USE_WORK_STEALING
+      //If no slots filled, means no more work, look for work to steal.
+   if( numSlotsFilled == 0 )
+    { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterPr );
+    }
+   #endif
 
-      //Save stack ptr and frame, restore CoreLoop's stack and frame,
-      // and clear the MasterLock
-      //TODO: cafefully verify don't need to force saving anything to stack
-      // before jumping back to core loop.
-   void           *stackPtrAddr, *framePtrAddr, *masterLockAddr;
-   void           *jmpPt, *coreLoopFramePtr, *coreLoopStackPtr;
-
-   stackPtrAddr      = &(masterPr->stackPtr);
-   framePtrAddr      = &(masterPr->framePtr);
-   masterLockAddr    = &(_VMSMasterEnv->masterLock);
-
-   jmpPt             = _VMSMasterEnv->coreLoopStartPt;
-   coreLoopFramePtr  = masterPr->coreLoopFramePtr;//need this only
-   coreLoopStackPtr  = masterPr->coreLoopStackPtr;//shouldn't need -- safety
    
    //============================= MEASUREMENT STUFF ========================
    #ifdef MEAS__TIME_MASTER
@@ -172,21 +174,183 @@
    #endif
    //========================================================================
 
-   asm volatile("movl %0,     %%eax;  \
-                 movl %%esp, (%%eax); \
-                 movl %1,     %%eax;  \
-                 movl %%ebp, (%%eax); \
-                 movl %2, %%ebx;      \
-                 movl %3, %%eax;      \
-                 movl %4, %%esp;      \
-                 movl %5, %%ebp;      \
-                 movl $0x0, (%%ebx);  \
-                 jmp  %%eax;"         \
-   /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr),                \
-                   "=g"(masterLockAddr)                                     \
-   /* inputs  */ : "g" (jmpPt), "g"(coreLoopStackPtr), "g"(coreLoopFramePtr)\
-   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
-                );//can probably make clobber list empty -- but safe for now
+   
+   masterSwitchToCoreLoop( masterPr )
  }
 
 
+
+/*Ungated work-stealing.  This has a race condition -- the coreloops are
+ * accessing their own queues at the same time that this work-stealer, on a
+ * different core, is trying to read from those same queues.
+ */
+void inline
+stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
+               VirtProcr *masterPr )
+ {
+   VirtProcr     *takenPr;
+   VMSQueueStruc *candidateQ;
+   int32          victimIdx, numTried;
+
+      //Visit every other core once: begin at the core after this one and
+      // wrap from the last core back to core 0
+   takenPr   = NULL;
+   victimIdx = masterPr->coreAnimatedBy;
+   for( numTried = 0; numTried < NUM_CORES -1; numTried++ )
+    {
+      victimIdx  = ( victimIdx >= NUM_CORES -1 ) ?  0  :  victimIdx + 1;
+      candidateQ = _VMSMasterEnv->readyToAnimateQs[victimIdx];
+      if( numInVMSQ( candidateQ ) > 0 )
+       { takenPr = readVMSQ (candidateQ );
+         break;
+       }
+    }
+
+      //Nothing taken: leave the slot and this core's queue untouched
+   if( takenPr == NULL ) return;
+
+      //Tie the stolen VP to the slot, then queue it for this core to run
+   currSlot->procrAssignedToSlot = takenPr;
+   takenPr->schedSlot            = currSlot;
+   currSlot->needsProcrAssigned  = FALSE;
+
+   writeVMSQ( takenPr, readyToAnimateQ );
+ }
+
+/*This algorithm makes the common case fast.  Make the coreloop passive,
+ * and show its progress.  Make the stealer control a gate that coreloop
+ * has to pass.
+ *To avoid interference, only one stealer at a time.  Use a global
+ * stealer-lock.
+ *
+ *The pattern is based on a gate -- stealer shuts the gate, then monitors
+ * to be sure any already past make it all the way out, before starting.
+ *So, have a "progress" measure just before the gate, then have two after it,
+ * one is in a "waiting room" outside the gate, the other is at the exit.
+ *Then, the stealer first shuts the gate, then checks the progress measure
+ * outside it, then looks to see if the progress measure at the exit is the
+ * same.  If yes, it knows the protected area is empty 'cause no other way
+ * to get in and the last to get in also exited.
+ *If the progress measure at the exit is not the same, then the stealer goes
+ * into a loop checking both the waiting-area and the exit progress-measures
+ * until one of them shows the same as the measure outside the gate.  Might
+ * as well re-read the measure outside the gate each go around, just to be
+ * sure.  It is guaranteed that one of the two will eventually match the one
+ * outside the gate.
+ *
+ *Here's an informal proof of correctness:
+ *The gate can be closed at any point, and have only four cases:
+ *  1) coreloop made it past the gate-closing but not yet past the exit
+ *  2) coreloop made it past the pre-gate progress update but not yet past
+ *     the gate,
+ *  3) coreloop is right before the pre-gate update
+ *  4) coreloop is past the exit and far from the pre-gate update.
+ *
+ * Covering the cases in reverse order,
+ *  4) is not a problem -- stealer will read pre-gate progress, see that it
+ *     matches exit progress, and the gate is closed, so stealer can proceed.
+ *  3) stealer will read pre-gate progress just after coreloop updates it..
+ *     so stealer goes into a loop until the coreloop causes wait-progress
+ *     to match pre-gate progress, so then stealer can proceed
+ *  2) same as 3..
+ *  1) stealer reads pre-gate progress, sees that it's different than exit,
+ *     so goes into loop until exit matches pre-gate, now it knows coreloop
+ *     is not in protected and cannot get back in, so can proceed.
+ *
+ *Implementation for the stealer:
+ *
+ *First, acquire the stealer lock -- only cores with no work to do will
+ * compete to steal, so not a big performance penalty having only one --
+ * will rarely have multiple stealers in a system with plenty of work -- and
+ * in a system with little work, it doesn't matter.
+ *
+ *Note, have single-reader, single-writer pattern for all variables used to
+ * communicate between stealer and victims
+ *
+ *So, scan the queues of the core loops, until find non-empty.  Each core
+ * has its own list that it scans.  The list goes in order from closest to
+ * furthest core, so it steals first from close cores.  Later can add
+ * taking info from the app about overlapping footprints, and scan all the
+ * others then choose work with the most footprint overlap with the contents
+ * of this core's cache.
+ *
+ *Now have a victim to take work from.  So, shut the gate in that victim's
+ * coreloop, by setting the "gate closed" var on its stack to TRUE.
+ *Then, read the core's pre-gate progress and compare to the core's exit
+ * progress.
+ *If same, can proceed to take work from the coreloop's queue.  When done,
+ * write FALSE to gate closed var.
+ *If different, then enter a loop that reads the pre-gate progress, then
+ * compares to exit progress then to wait progress.  When one of two
+ * matches, proceed.  Take work from the coreloop's queue.  When done,
+ * write FALSE to the gate closed var.
+ * 
+ */
+void inline
+gateProtected_stealWorkInto( SchedSlot *currSlot,
+                             VMSQueueStruc *myReadyToAnimateQ,
+                             VirtProcr *masterPr )
+ {
+   VirtProcr     *stolenPr;
+   int32          coreIdx, i, haveAVictim, gotLock;
+   VMSQueueStruc *victimsQ;
+
+   volatile GateStruc *vicGate;
+   int32               coreMightBeInProtected;
+
+      //See if any other core has work available to steal.  A core whose
+      // gate pointer is still NULL has not registered its gate yet, and
+      // closing a NULL gate would crash, so such a core is never a victim.
+   haveAVictim = FALSE;
+   coreIdx = masterPr->coreAnimatedBy;
+   for( i = 0; i < NUM_CORES -1; i++ )
+    {
+      if( coreIdx >= NUM_CORES -1 )
+       { coreIdx = 0;
+       }
+      else
+       { coreIdx++;
+       }
+      victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
+      vicGate  = _VMSMasterEnv->workStealingGates[ coreIdx ];
+      if( vicGate != NULL  &&  numInVMSQ( victimsQ ) > 0 )
+       { haveAVictim = TRUE;
+         break;
+       }
+    }
+   if( !haveAVictim ) return;  //no work to steal, exit
+
+      //have a victim core, now get the stealer-lock
+   gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock),
+                                                          UNLOCKED, LOCKED );
+   if( !gotLock ) return; //go back to core loop, which will re-start master
+
+   //====== Start Gate-protection =======
+   vicGate->gateClosed = TRUE;
+   __sync_synchronize(); //gate store must be visible before progress reads
+   coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress;
+   while( coreMightBeInProtected )
+    {    //wait until sure
+      if( vicGate->preGateProgress == vicGate->waitProgress )
+         coreMightBeInProtected = FALSE;
+      if( vicGate->preGateProgress == vicGate->exitProgress )
+         coreMightBeInProtected = FALSE;
+    }
+
+   stolenPr = readVMSQ ( victimsQ );
+
+   vicGate->gateClosed = FALSE;
+   //======= End Gate-protection  =======
+      //NOTE(review): assumes currSlot is free to take a VP -- confirm caller
+
+   if( stolenPr != NULL )  //victim could have been in protected and taken
+    { currSlot->procrAssignedToSlot = stolenPr;
+      stolenPr->schedSlot           = currSlot;
+      currSlot->needsProcrAssigned  = FALSE;
+
+      writeVMSQ( stolenPr, myReadyToAnimateQ );
+    }
+
+      //unlock the work stealing lock
+   _VMSMasterEnv->workStealingLock = UNLOCKED;
+ }
diff -r 4fbc2165e493 -r 984f7d78bfdf SwitchAnimators.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SwitchAnimators.h	Thu Nov 11 06:19:51 2010 -0800
@@ -0,0 +1,138 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ * 
+ */
+
+#ifndef _SwitchAnimators_H
+#define	_SwitchAnimators_H
+#define __USE_GNU
+
+/*The code for switching between animators is isolated in these macros.  At
+ * some point there will be switches to compile for 32 bit or for 64 bit,
+ * and having the code isolated here will make that cleaner.
+ *
+ *This also makes it easier to change architectures, at some point
+ *And it cleans the code up, having the ugly assembly out of the way
+ */
+
+//=========================== MasterVP to CoreLoop ==========================
+//
+      //Save stack ptr and frame, restore CoreLoop's stack and frame,
+      // and clear the MasterLock
+      //Every operand is only read by the asm, so all are listed as inputs;
+      // as "=g" outputs GCC could drop the assignments made before the asm
+#define masterSwitchToCoreLoop( masterPr )   \
+   void           *stackPtrAddr, *framePtrAddr, *masterLockAddr; \
+   void           *jmpPt, *coreLoopFramePtr, *coreLoopStackPtr;  \
+\
+   stackPtrAddr      = &((masterPr)->stackPtr); \
+   framePtrAddr      = &((masterPr)->framePtr); \
+   masterLockAddr    = &(_VMSMasterEnv->masterLock); \
+   jmpPt             = _VMSMasterEnv->coreLoopStartPt; \
+   coreLoopFramePtr  = (masterPr)->coreLoopFramePtr; \
+   coreLoopStackPtr  = (masterPr)->coreLoopStackPtr; \
+\
+   asm volatile("movl %0,     %%eax;  \
+                 movl %%esp, (%%eax); \
+                 movl %1,     %%eax;  \
+                 movl %%ebp, (%%eax); \
+                 movl %2, %%ebx;      \
+                 movl %3, %%eax;      \
+                 movl %4, %%esp;      \
+                 movl %5, %%ebp;      \
+                 movl $0x0, (%%ebx);  \
+                 jmp  %%eax;"         \
+   /* outputs */ :                                                          \
+   /* inputs  */ : "g" (stackPtrAddr), "g" (framePtrAddr),                  \
+                   "g" (masterLockAddr),                                    \
+                   "g" (jmpPt), "g"(coreLoopStackPtr), "g"(coreLoopFramePtr)\
+   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
+                );//all GP regs clobbered so no operand is held in a register
+
+
+//=========================== SlaveVP to CoreLoop ===========================
+      //Save the slave VP's stack and frame ptrs into its struct, then put
+      // back the core loop's stack and frame ptrs and jump to its start pt
+#define    SwitchToCoreLoop( animatingPr ) \
+   void *jmpPt, *stackPtrAddr, *framePtrAddr, *coreLoopStackPtr; \
+   void *coreLoopFramePtr; \
+\
+   stackPtrAddr      = &((animatingPr)->stackPtr); \
+   framePtrAddr      = &((animatingPr)->framePtr); \
+\
+   jmpPt             = _VMSMasterEnv->coreLoopStartPt; \
+   coreLoopFramePtr  = (animatingPr)->coreLoopFramePtr; \
+   coreLoopStackPtr  = (animatingPr)->coreLoopStackPtr; \
+      /*Save the virt procr's stack and frame ptrs -- the two addresses*/ \
+      /* are only read by the asm, so they are inputs, not outputs*/ \
+   asm volatile("movl %0,     %%eax;  \
+                 movl %%esp, (%%eax); \
+                 movl %1,     %%eax;  \
+                 movl %%ebp, (%%eax) "\
+   /* outputs */ :        \
+   /* inputs  */ : "g" (stackPtrAddr), "g" (framePtrAddr) \
+   /* clobber */ : "memory", "%eax" \
+                ); \
+\
+     /*restore coreloop's frame ptr, then jump back to "start" of core loop*/\
+     /*Note, GCC compiles to assembly that saves esp and ebp in the stack*/ \
+     /* frame -- so have to explicitly do assembly that saves to memory*/ \
+   asm volatile("movl %0, %%eax;      \
+                 movl %1, %%esp;      \
+                 movl %2, %%ebp;      \
+                 jmp  %%eax    "      \
+   /* outputs */ :                    \
+   /* inputs  */ : "m" (jmpPt), "m"(coreLoopStackPtr), "m"(coreLoopFramePtr)\
+   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi","%esi"  \
+                );
+ //list everything as clobbered to force GCC to save all
+ // live vars that are in regs on stack before this
+ // assembly, so that stack pointer is correct, before jmp
+
+
+
+//============================== CoreLoop to VP =============================
+//
+      //Save the core loop's stack and frame pointers into virt procr struct
+      // then switch to stack ptr and frame ptr of virt procr & jmp to it
+      //This was a pain to get right because GCC converts the "(jmpPt)" to
+      // frame-relative mem-op -- so generated machine code first changed the
+      // frame pointer, then tried to jump to an addr stored on stack, which
+      // it accessed as an offset from frame-ptr!  (wrong frame-ptr now)
+      //Explicitly loading into eax before changing frame-ptr fixed it
+      //Also, it turns "(currPr->coreLoopFramePtr)" into a temporary on the
+      // stack, so "movl %%ebp, %0" saves to the temp, NOT the data-struc!
+
+
+      //switch to virt procr's stack and frame ptr then jump to virt procr fn
+
+#define SwitchToVP( currPr ) \
+   void *stackPtr, *framePtr, *jmpPt, *coreLoopFramePtrAddr, \
+        *coreLoopStackPtrAddr; \
+   stackPtr = (currPr)->stackPtr; \
+   framePtr = (currPr)->framePtr; \
+   jmpPt    = (currPr)->nextInstrPt; \
+   coreLoopFramePtrAddr = &((currPr)->coreLoopFramePtr); \
+   coreLoopStackPtrAddr = &((currPr)->coreLoopStackPtr); \
+      /*asm only reads its operands, so all of them are inputs*/ \
+   asm volatile("movl %0, %%eax;      \
+                 movl %%esp, (%%eax); \
+                 movl %1, %%eax;      \
+                 movl %%ebp, (%%eax); \
+                 movl %2, %%eax;      \
+                 movl %3, %%esp;      \
+                 movl %4, %%ebp;      \
+                 jmp  %%eax"          \
+   /* outputs */ :                                             \
+   /* inputs  */ : "g" (coreLoopStackPtrAddr),                 \
+                   "g" (coreLoopFramePtrAddr),                 \
+                   "g" (jmpPt), "g" (stackPtr), "g" (framePtr) \
+   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
+                );
+
+   
+#endif	/* _SwitchAnimators_H */
+
diff -r 4fbc2165e493 -r 984f7d78bfdf VMS.c
--- a/VMS.c	Tue Oct 26 18:31:34 2010 -0700
+++ b/VMS.c	Thu Nov 11 06:19:51 2010 -0800
@@ -6,7 +6,9 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 #include <malloc.h>
+#include <sys/time.h>
 
 #include "VMS.h"
 #include "Queue_impl/BlockingQueue.h"
@@ -28,6 +30,12 @@
 void
 create_the_coreLoop_OS_threads();
 
+MallocProlog *
+create_free_list();
+
+void
+endOSThreadFn( void *initData, VirtProcr *animatingPr );
+
 pthread_mutex_t suspendLock = PTHREAD_MUTEX_INITIALIZER;
 pthread_cond_t  suspend_cond  = PTHREAD_COND_INITIALIZER;
 
@@ -83,34 +91,43 @@
    int              coreIdx;
    VirtProcr      **masterVPs;
    SchedSlot     ***allSchedSlots; //ptr to array of ptrs
-   
+
+
       //Make the master env, which holds everything else
    _VMSMasterEnv = malloc( sizeof(MasterEnv) );
+
+        //Very first thing put into the master env is the free-list, seeded
+        // with a massive initial chunk of memory.
+        //After this, all other mallocs are VMS__malloc.
+   _VMSMasterEnv->freeListHead        = VMS_ext__create_free_list();
+
+   //===================== Only VMS__malloc after this ====================
    masterEnv     = _VMSMasterEnv;
-      //Need to set start pt here 'cause used by seed procr, which is created
-      // before the first core loop starts up. -- not sure how yet..
-//   masterEnv->coreLoopStartPt = ;
-//   masterEnv->coreLoopEndPt   = ;
    
       //Make a readyToAnimateQ for each core loop
-   readyToAnimateQs = malloc( NUM_CORES * sizeof(VMSQueueStruc *) );
-   masterVPs        = malloc( NUM_CORES * sizeof(VirtProcr *) );
+   readyToAnimateQs = VMS__malloc( NUM_CORES * sizeof(VMSQueueStruc *) );
+   masterVPs        = VMS__malloc( NUM_CORES * sizeof(VirtProcr *) );
 
       //One array for each core, 3 in array, core's masterVP scheds all
-   allSchedSlots    = malloc( NUM_CORES * sizeof(SchedSlot *) );
+   allSchedSlots    = VMS__malloc( NUM_CORES * sizeof(SchedSlot *) );
 
+   _VMSMasterEnv->numProcrsCreated = 0;  //used by create procr
    for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
-    {
-      readyToAnimateQs[ coreIdx ] = makeSRSWQ();
+    {    
+      readyToAnimateQs[ coreIdx ] = makeVMSQ();
       
-         //Q: should give masterVP core-specific into as its init data?
+         //Q: should give masterVP core-specific info as its init data?
       masterVPs[ coreIdx ] = VMS__create_procr( &masterLoop, masterEnv );
       masterVPs[ coreIdx ]->coreAnimatedBy = coreIdx;
       allSchedSlots[ coreIdx ] = create_sched_slots(); //makes for one core
+      _VMSMasterEnv->numMasterInARow[ coreIdx ] = 0;
+      _VMSMasterEnv->workStealingGates[ coreIdx ] = NULL;
     }
    _VMSMasterEnv->readyToAnimateQs = readyToAnimateQs;
    _VMSMasterEnv->masterVPs        = masterVPs;
+   _VMSMasterEnv->masterLock       = UNLOCKED;
    _VMSMasterEnv->allSchedSlots    = allSchedSlots;
+   _VMSMasterEnv->workStealingLock = UNLOCKED;
 
    //============================= MEASUREMENT STUFF ========================
    #ifdef MEAS__TIME_MASTER
@@ -125,59 +142,33 @@
       // because coreLoop now controls -- animates its masterVP when no work
 
 
-   //==================== malloc substitute ========================
-   //
-   //Testing whether malloc is using thread-local storage and therefore
-   // causing unreliable behavior.
-   //Just allocate a massive chunk of memory and roll own malloc/free and
-   // make app use VMS__malloc_to, which will suspend and perform malloc
-   // in the master, taking from this massive chunk.
+   //============================= MEASUREMENT STUFF ========================
+   #ifdef STATS__TURN_ON_PROBES
+   _VMSMasterEnv->dynIntervalProbesInfo =
+              makePrivDynArrayOfSize( &(_VMSMasterEnv->intervalProbes), 200);
 
-//   initFreeList();
+   _VMSMasterEnv->probeNameHashTbl = makeHashTable( 1000, &VMS__free );
+   
+      //put creation time directly into master env, for fast retrieval
+   struct timeval timeStamp;
+   gettimeofday( &(timeStamp), NULL);
+   _VMSMasterEnv->createPtInSecs =
+                           timeStamp.tv_sec +(timeStamp.tv_usec/1000000.0);
+   #endif
+   //========================================================================
 
  }
 
-/*
-void
-initMasterMalloc()
- {
-   _VMSMasterEnv->mallocChunk = malloc( MASSIVE_MALLOC_SIZE );
-
-      //The free-list element is the first several locations of an
-      // allocated chunk -- the address given to the application is pre-
-      // pended with both the ownership structure and the free-list struc.
-      //So, write the values of these into the first locations of
-      // mallocChunk -- which marks it as free & puts in its size.
-   listElem = (FreeListElem *)_VMSMasterEnv->mallocChunk;
-   listElem->size = MASSIVE_MALLOC_SIZE - NUM_PREPEND_BYTES
-   listElem->next = NULL;
- }
-
-void
-dissipateMasterMalloc()
- {
-      //Just foo code -- to get going -- doing as if free list were link-list
-   currElem = _VMSMasterEnv->freeList;
-   while( currElem != NULL )
-    {
-      nextElem = currElem->next;
-      masterFree( currElem );
-      currElem = nextElem;
-    }
-   free( _VMSMasterEnv->freeList );
- }
- */
-
 SchedSlot **
 create_sched_slots()
  { SchedSlot  **schedSlots;
    int i;
 
-   schedSlots  = malloc( NUM_SCHED_SLOTS * sizeof(SchedSlot *) );
+   schedSlots  = VMS__malloc( NUM_SCHED_SLOTS * sizeof(SchedSlot *) );
 
    for( i = 0; i < NUM_SCHED_SLOTS; i++ )
     {
-      schedSlots[i] = malloc( sizeof(SchedSlot) );
+      schedSlots[i] = VMS__malloc( sizeof(SchedSlot) );
 
          //Set state to mean "handling requests done, slot needs filling"
       schedSlots[i]->workIsDone         = FALSE;
@@ -192,9 +183,9 @@
  { int i;
    for( i = 0; i < NUM_SCHED_SLOTS; i++ )
     {
-      free( schedSlots[i] );
+      VMS__free( schedSlots[i] );
     }
-   free( schedSlots );
+   VMS__free( schedSlots );
  }
 
 
@@ -203,7 +194,7 @@
  {
    //========================================================================
    //                      Create the Threads
-   int coreIdx, retCode, i;
+   int coreIdx, retCode;
 
       //create the arrays used to measure TSC offsets between cores
    pongNums  = malloc( NUM_CORES * sizeof( int ) );
@@ -227,7 +218,7 @@
 
       //Make the threads that animate the core loops
    for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ )
-    { coreLoopThdParams[coreIdx]          = malloc( sizeof(ThdParams) );
+    { coreLoopThdParams[coreIdx]          = VMS__malloc( sizeof(ThdParams) );
       coreLoopThdParams[coreIdx]->coreNum = coreIdx;
 
       retCode =
@@ -235,7 +226,7 @@
                         thdAttrs,
                        &coreLoop,
                (void *)(coreLoopThdParams[coreIdx]) );
-      if(retCode){printf("ERROR creating thread: %d\n", retCode); exit(0);}
+      if(retCode){printf("ERROR creating thread: %d\n", retCode); exit(1);}
     }
  }
 
@@ -307,10 +298,11 @@
  * animator state to return to --
  *
  */
-VirtProcr *
-VMS__create_procr( VirtProcrFnPtr fnPtr, void *initialData )
- { VirtProcr *newPr;
-   char      *stackLocs, *stackPtr;
+inline VirtProcr *
+create_procr_helper( VirtProcr *newPr,       VirtProcrFnPtr  fnPtr,
+                     void      *initialData, char           *stackLocs )
+ {
+   char  *stackPtr;
 
    //============================= MEASUREMENT STUFF ========================
    #ifdef MEAS__TIME_MASTER
@@ -318,23 +310,19 @@
    saveLowTimeStampCountInto( startStamp );
    #endif
    //========================================================================
-
-   newPr              = malloc( sizeof(VirtProcr) );
-   newPr->procrID     = numProcrsCreated++;
-   newPr->nextInstrPt = fnPtr;
-   newPr->initialData = initialData;
-   newPr->requests    = NULL;
-   newPr->schedSlot   = NULL;
-//   newPr->coreLoopStartPt = _VMSMasterEnv->coreLoopStartPt;
+   newPr->startOfStack = stackLocs;
+   newPr->procrID      = _VMSMasterEnv->numProcrsCreated++;
+   newPr->nextInstrPt  = fnPtr;
+   newPr->initialData  = initialData;
+   newPr->requests     = NULL;
+   newPr->schedSlot    = NULL;
 
       //fnPtr takes two params -- void *initData & void *animProcr
       //alloc stack locations, make stackPtr be the highest addr minus room
       // for 2 params + return addr.  Return addr (NULL) is in loc pointed to
       // by stackPtr, initData at stackPtr + 4 bytes, animatingPr just above
-   stackLocs = malloc( VIRT_PROCR_STACK_SIZE );
-         if(stackLocs == 0) {perror("error: malloc stack"); exit(1);}
-   newPr->startOfStack = stackLocs;
    stackPtr = ( (char *)stackLocs + VIRT_PROCR_STACK_SIZE - 0x10 );
+   
       //setup __cdecl on stack -- coreloop will switch to stackPtr before jmp
    *( (int *)stackPtr + 2 ) = (int) newPr; //rightmost param -- 32bit pointer
    *( (int *)stackPtr + 1 ) = (int) initialData;  //next  param to left
@@ -347,12 +335,48 @@
    saveLowTimeStampCountInto( endStamp );
    addIntervalToHist( startStamp, endStamp,
                       _VMSMasterEnv->stats->createHist );
+   //============================= MEASUREMENT STUFF ========================
+   #ifdef STATS__TURN_ON_PROBES
+   struct timeval timeStamp;
+   gettimeofday( &(timeStamp), NULL);
+   newPr->createPtInSecs = timeStamp.tv_sec +(timeStamp.tv_usec/1000000.0) -
+                                               _VMSMasterEnv->createPtInSecs;
    #endif
    //========================================================================
-   
+
    return newPr;
  }
 
+inline VirtProcr *
+VMS__create_procr( VirtProcrFnPtr fnPtr, void *initialData )
+ { VirtProcr *newPr;
+   char      *stackLocs;
+      //VMS__malloc the VP struc & its stack; helper writes both, so check both
+   newPr      = VMS__malloc( sizeof(VirtProcr) );
+   stackLocs  = VMS__malloc( VIRT_PROCR_STACK_SIZE );
+   if( newPr == 0 || stackLocs == 0 )
+    { perror("VMS__malloc procr/stack"); exit(1); }
+
+   return create_procr_helper( newPr, fnPtr, initialData, stackLocs );
+ }
+
+/* "ext" = for use outside the VMS system: call only from the main thread or
+ * another OS thread, never from code animated by a VMS virtual processor.
+ *Uses system malloc (pair with VMS_ext__dissipate_procr); exits on failure.
+ */
+inline VirtProcr *
+VMS_ext__create_procr( VirtProcrFnPtr fnPtr, void *initialData )
+ { VirtProcr *newPr;
+   char      *stackLocs;
+
+   newPr      = malloc( sizeof(VirtProcr) );
+   stackLocs  = malloc( VIRT_PROCR_STACK_SIZE );
+   if( newPr == 0 || stackLocs == 0 )  //helper writes through both pointers
+    { perror("malloc procr/stack"); exit(1); }
+
+   return create_procr_helper( newPr, fnPtr, initialData, stackLocs );
+ }
+
 
 /*there is a label inside this function -- save the addr of this label in
  * the callingPr struc, as the pick-up point from which to start the next
@@ -365,8 +389,7 @@
  */
 void
 VMS__suspend_procr( VirtProcr *animatingPr )
- { void *jmpPt, *stackPtrAddr, *framePtrAddr, *coreLoopStackPtr;
-   void *coreLoopFramePtr;
+ { 
 
       //The request to master will cause this suspended virt procr to get
       // scheduled again at some future point -- to resume, core loop jumps
@@ -376,24 +399,6 @@
 
       //return ownership of the virt procr and sched slot to Master virt pr
    animatingPr->schedSlot->workIsDone = TRUE;
-//   coreIdx = callingPr->coreAnimatedBy;
-
-   stackPtrAddr      = &(animatingPr->stackPtr);
-   framePtrAddr      = &(animatingPr->framePtr);
-
-   jmpPt             = _VMSMasterEnv->coreLoopStartPt;
-   coreLoopFramePtr  = animatingPr->coreLoopFramePtr;//need this only
-   coreLoopStackPtr  = animatingPr->coreLoopStackPtr;//safety
-
-      //Save the virt procr's stack and frame ptrs,
-   asm volatile("movl %0,     %%eax;  \
-                 movl %%esp, (%%eax); \
-                 movl %1,     %%eax;  \
-                 movl %%ebp, (%%eax) "\
-   /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr) \
-   /* inputs  */ :        \
-   /* clobber */ : "%eax" \
-                );
 
    //===========================  Measurement stuff ========================
    #ifdef MEAS__TIME_STAMP_SUSP
@@ -402,20 +407,10 @@
    #endif
    //=======================================================================
 
-      //restore coreloop's frame ptr, then jump back to "start" of core loop
-      //Note, GCC compiles to assembly that saves esp and ebp in the stack
-      // frame -- so have to explicitly do assembly that saves to memory
-   asm volatile("movl %0, %%eax;      \
-                 movl %1, %%esp;      \
-                 movl %2, %%ebp;      \
-                 jmp  %%eax    "      \
-   /* outputs */ :                    \
-   /* inputs  */ : "m" (jmpPt), "m"(coreLoopStackPtr), "m"(coreLoopFramePtr)\
-   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi","%esi"  \
-                ); //list everything as clobbered to force GCC to save all
-                   // live vars that are in regs on stack before this
-                   // assembly, so that stack pointer is correct, before jmp
 
+   SwitchToCoreLoop( animatingPr )
+
+   //=======================================================================
 ResumePt:
    #ifdef MEAS__TIME_STAMP_SUSP
       //NOTE: only take low part of count -- do sanity check when take diff
@@ -427,6 +422,31 @@
 
 
 
+/*For this implementation of VMS, it may not make much sense to have the
+ * system of requests for creating a new processor done this way.. but over
+ * the scope of single-master, multi-master, multi-tasking, OS-implementing,
+ * distributed-memory, and so on, this gives VMS implementation a chance to
+ * do stuff before suspend, in the AppVP, and in the Master before the plugin
+ * is called, as well as in the lang-lib before this is called, and in the
+ * plugin.  So, this gives both VMS and language implementations a chance to
+ * intercept at various points and do order-dependent stuff.
+ *Having a standard VMSNewPrReqData struc allows the language to create and
+ * free the struc, while VMS knows how to get the newPr if it wants it, and
+ * it lets the lang have lang-specific data related to creation transported
+ * to the plugin.
+ */
+void
+VMS__send_create_procr_req( void *semReqData, VirtProcr *reqstingPr )
+ { VMSReqst req;
+
+   req.reqType          = createReq;
+   req.semReqData       = semReqData;
+   req.nextReqst        = reqstingPr->requests;
+   reqstingPr->requests = &req;
+
+   VMS__suspend_procr( reqstingPr );
+ }
+
 
 /*
  *This adds a request to dissipate, then suspends the processor so that the
@@ -450,81 +470,102 @@
  * pears -- making that suspend the last thing in the virt procr's trace.
  */
 void
-VMS__dissipate_procr( VirtProcr *procrToDissipate )
+VMS__send_dissipate_req( VirtProcr *procrToDissipate )
+ { VMSReqst req;
+
+   req.reqType                = dissipate;
+   req.nextReqst              = procrToDissipate->requests;
+   procrToDissipate->requests = &req;
+
+   VMS__suspend_procr( procrToDissipate );
+ }
+
+
+/* "ext" designates that it's for use outside the VMS system -- should only
+ * be called from main thread or other thread -- never from code animated by
+ * a VMS virtual processor.
+ *
+ *Use this version to dissipate VPs created outside the VMS system.
+ */
+void
+VMS_ext__dissipate_procr( VirtProcr *procrToDissipate )
+ {
+      //NOTE: initialData was given to the processor, so should either have
+      // been alloc'd with VMS__malloc, or freed by the level above animPr.
+      //So, all that's left to free here is the stack and the VirtProcr struc
+      // itself
+      //Note, should not stack-allocate initial data -- no guarantee, in
+      // general that creating processor will outlive ones it creates.
+   free( procrToDissipate->startOfStack );
+   free( procrToDissipate );
+ }
+
+
+
+/*This call's name indicates that request is malloc'd -- so req handler
+ * has to free any extra requests tacked on before a send, using this.
+ *
+ * This inserts the semantic-layer's request data into standard VMS carrier
+ * request data-struct that is mallocd.  The sem request doesn't need to
+ * be malloc'd if this is called inside the same call chain before the
+ * send of the last request is called.
+ *
+ *The request handler has to call VMS__free_VMSReq for any of these
+ */
+inline void
+VMS__add_sem_request_in_mallocd_VMSReqst( void *semReqData,
+                                          VirtProcr *callingPr )
  { VMSReqst *req;
 
-   req = malloc( sizeof(VMSReqst) );
-//   req->virtProcrFrom      = callingPr;
-   req->reqType               = dissipate;
-   req->nextReqst             = procrToDissipate->requests;
-   procrToDissipate->requests = req;
-   
-   VMS__suspend_procr( procrToDissipate );
-}
-
-
-/*This inserts the semantic-layer's request data into standard VMS carrier
- */
-inline void
-VMS__add_sem_request( void *semReqData, VirtProcr *callingPr )
- { VMSReqst *req;
-
-   req = malloc( sizeof(VMSReqst) );
-//   req->virtProcrFrom      = callingPr;
-   req->reqType        = semantic;
-   req->semReqData     = semReqData;
-   req->nextReqst      = callingPr->requests;
+   req = VMS__malloc( sizeof(VMSReqst) );
+   req->reqType         = semantic;
+   req->semReqData      = semReqData;
+   req->nextReqst       = callingPr->requests;
    callingPr->requests = req;
  }
 
+/*This inserts the semantic-layer's request data into a standard VMS carrier
+ * request data-struct, which is allocated on the stack of this call, and a
+ * ptr to it is sent to the plugin.
+ *Then it suspends, to cause the request to be sent.
+ */
+inline void
+VMS__send_sem_request( void *semReqData, VirtProcr *callingPr )
+ { VMSReqst req;
 
-/*Use this to get first request before starting request handler's loop
+   req.reqType         = semantic;
+   req.semReqData      = semReqData;
+   req.nextReqst       = callingPr->requests;
+   callingPr->requests = &req;
+   
+   VMS__suspend_procr( callingPr );
+ }
+
+
+inline void
+VMS__send_VMSSem_request( void *semReqData, VirtProcr *callingPr )
+ { VMSReqst req;
+      //req lives on this call's stack; its address is linked into the VP's list
+   req.reqType         = VMSSemantic;
+   req.semReqData      = semReqData;
+   req.nextReqst       = callingPr->requests; //grab any preceding requests
+   callingPr->requests = &req;
+      //suspend, to cause the request to be sent
+   VMS__suspend_procr( callingPr );
+ }
+
+
+/*Unlinks and returns the head of procrWithReq's request list; NULL if empty
  */
 VMSReqst *
-VMS__take_top_request_from( VirtProcr *procrWithReq )
- { VMSReqst *req;
-
-   req = procrWithReq->requests;
-   if( req == NULL ) return req;
-
-   procrWithReq->requests = procrWithReq->requests->nextReqst;
-   return req;
- }
-
-/*A subtle bug due to freeing then accessing "next" after freed caused this
- * form of call to be put in -- so call this at end of request handler loop
- * that iterates through the requests.
- */
-VMSReqst *
-VMS__free_top_and_give_next_request_from( VirtProcr *procrWithReq )
+VMS__take_next_request_out_of( VirtProcr *procrWithReq )
  { VMSReqst *req;
 
    req = procrWithReq->requests;
    if( req == NULL ) return NULL;
 
    procrWithReq->requests = procrWithReq->requests->nextReqst;
-   VMS__free_request( req );
-   return procrWithReq->requests;
- }
-
-
-//TODO: add a semantic-layer supplied "freer" for the semantic-data portion
-// of a request -- IE call with both a virt procr and a fn-ptr to request
-// freer (also maybe put sem request freer as a field in virt procr?)
-//MeasVMS relies right now on this only freeing VMS layer of request -- the
-// semantic portion of request is alloc'd and freed by request handler
-void
-VMS__free_request( VMSReqst *req )
- {
-   free( req );
- }
-
-
-
-inline int
-VMS__isSemanticReqst( VMSReqst *req )
- {
-   return ( req->reqType == semantic );
+   return req;
  }
 
 
@@ -534,36 +575,52 @@
    return req->semReqData;
  }
 
-inline int
-VMS__isDissipateReqst( VMSReqst *req )
- {
-   return ( req->reqType == dissipate );
- }
 
-inline int
-VMS__isCreateReqst( VMSReqst *req )
- {
-   return ( req->reqType == regCreated );
- }
 
-void
-VMS__send_req_to_register_new_procr(VirtProcr *newPr, VirtProcr *reqstingPr)
- { VMSReqst *req;
+/* This is for OS requests and VMS infrastructure requests, such as to create
+ *  a probe -- a probe is inside the heart of VMS-core, it's not part of any
+ *  language -- but it's also a semantic thing that's triggered from and used
+ *  in the application.. so it crosses abstractions..  so, need some special
+ *  pattern here for handling such requests.
+ * Doing this just like it were a second language sharing VMS-core.
+ * 
+ * This is called from the language's request handler when it sees a request
+ *  of type VMSSemReq
+ *
+ * TODO: Later change this, to give probes their own separate plugin & have
+ *  VMS-core steer the request to appropriate plugin
+ * Do the same for OS calls -- look later at it..
+ */
+void inline
+VMS__handle_VMSSemReq( VMSReqst *req, VirtProcr *requestingPr, void *semEnv,
+                       ResumePrFnPtr resumePrFnPtr )
+ { VMSSemReq     *semReq;
+   IntervalProbe *newProbe;
+   int32          nameLen;
 
-   req                  = malloc( sizeof(VMSReqst) );
-   req->reqType         = regCreated;
-   req->semReqData      = newPr;
-   req->nextReqst       = reqstingPr->requests;
-   reqstingPr->requests = req;
+   semReq = req->semReqData;
 
-   VMS__suspend_procr( reqstingPr );
+   newProbe          = VMS__malloc( sizeof(IntervalProbe) );
+   nameLen = strlen( semReq->nameStr ) + 1;  //+1: copy the terminating NUL
+   newProbe->nameStr = VMS__malloc( nameLen );
+   memcpy( newProbe->nameStr, semReq->nameStr, nameLen );
+   newProbe->hist    = NULL;
+   newProbe->schedChoiceWasRecorded = FALSE;
+
+      //This runs in masterVP, so no race-condition worries
+   newProbe->probeID =
+             addToDynArray( newProbe, _VMSMasterEnv->dynIntervalProbesInfo );
+
+   requestingPr->dataRetFromReq = newProbe;
+
+   (*resumePrFnPtr)( requestingPr, semEnv );
  }
 
 
 
 /*This must be called by the request handler plugin -- it cannot be called
  * from the semantic library "dissipate processor" function -- instead, the
- * semantic layer has to generate a request for the plug-in to call this
+ * semantic layer has to generate a request, and the plug-in calls this
  * function.
  *The reason is that this frees the virtual processor's stack -- which is
  * still in use inside semantic library calls!
@@ -579,33 +636,31 @@
  * of dis-owning it.
  */
 void
-VMS__handle_dissipate_reqst( VirtProcr *animatingPr )
+VMS__dissipate_procr( VirtProcr *animatingPr )
  {
       //dis-own all locations owned by this processor, causing to be freed
       // any locations that it is (was) sole owner of
 //TODO: implement VMS__malloc system, including "give up ownership"
 
-      //The dissipate request might still be attached, so remove and free it
-   VMS__free_top_and_give_next_request_from( animatingPr );
 
       //NOTE: initialData was given to the processor, so should either have
       // been alloc'd with VMS__malloc, or freed by the level above animPr.
       //So, all that's left to free here is the stack and the VirtProcr struc
       // itself
-   free( animatingPr->startOfStack );
-   free( animatingPr );
+      //Note, should not stack-allocate initial data -- no guarantee, in
+      // general that creating processor will outlive ones it creates.
+   VMS__free( animatingPr->startOfStack );
+   VMS__free( animatingPr );
  }
 
 
-//TODO: re-architect so that have clean separation between request handler
+//TODO: look at architecting cleanest separation between request handler
 // and master loop, for dissipate, create, shutdown, and other non-semantic
 // requests.  Issue is chain: one removes requests from AppVP, one dispatches
 // on type of request, and one handles each type..  but some types require
 // action from both request handler and master loop -- maybe just give the
 // request handler calls like:  VMS__handle_X_request_type
 
-void
-endOSThreadFn( void *initData, VirtProcr *animatingPr );
 
 /*This is called by the semantic layer's request handler when it decides its
  * time to shut down the VMS system.  Calling this causes the core loop OS
@@ -619,10 +674,9 @@
  * masterVP any AppVPs that might still be allocated and sitting in the
  * semantic environment, or have been orphaned in the _VMSWorkQ.
  * 
- *NOTE: the semantic plug-in is expected to use VMS__malloc_to to get all the
+ *NOTE: the semantic plug-in is expected to use VMS__malloc to get all the
  * locations it needs, and give ownership to masterVP.  Then, they will be
- * automatically freed when the masterVP is dissipated.  (This happens after
- * the core loop threads have all exited)
+ * automatically freed.
  *
  *In here,create one core-loop shut-down processor for each core loop and put
  * them all directly into the readyToAnimateQ.
@@ -633,16 +687,16 @@
  * point is it sure that all results have completed.
  */
 void
-VMS__handle_shutdown_reqst( void *dummy, VirtProcr *animatingPr )
+VMS__shutdown()
  { int coreIdx;
    VirtProcr *shutDownPr;
 
       //create the shutdown processors, one for each core loop -- put them
       // directly into the Q -- each core will die when gets one
    for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ )
-    {
+    {    //Note, this is running in the master
       shutDownPr = VMS__create_procr( &endOSThreadFn, NULL );
-      writeSRSWQ( shutDownPr, _VMSMasterEnv->readyToAnimateQs[coreIdx] );
+      writeVMSQ( shutDownPr, _VMSMasterEnv->readyToAnimateQs[coreIdx] );
     }
 
  }
@@ -681,49 +735,60 @@
  }
 
 
-/*This is called after the threads have shut down and control has returned
- * to the semantic layer, in the entry point function in the main thread.
- * It has to free anything allocated during VMS_init, and any other alloc'd
- * locations that might be left over.
+/*This is called from the startup & shutdown
  */
 void
-VMS__cleanup_after_shutdown()
+VMS__cleanup_at_end_of_shutdown()
  { 
    VMSQueueStruc **readyToAnimateQs;
    int              coreIdx;
    VirtProcr      **masterVPs;
    SchedSlot     ***allSchedSlots; //ptr to array of ptrs
 
+      //All the environment data has been allocated with VMS__malloc, so just
+      // free its internal big-chunk and all inside it disappear.
+/*
    readyToAnimateQs = _VMSMasterEnv->readyToAnimateQs;
    masterVPs        = _VMSMasterEnv->masterVPs;
    allSchedSlots    = _VMSMasterEnv->allSchedSlots;
    
    for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
     {
-      freeSRSWQ( readyToAnimateQs[ coreIdx ] );
-
-      VMS__handle_dissipate_reqst( masterVPs[ coreIdx ] );
+      freeVMSQ( readyToAnimateQs[ coreIdx ] );
+         //master VPs were created external to VMS, so use external free
+      VMS__dissipate_procr( masterVPs[ coreIdx ] );
       
       freeSchedSlots( allSchedSlots[ coreIdx ] );
     }
    
-   free( _VMSMasterEnv->readyToAnimateQs );
-   free( _VMSMasterEnv->masterVPs );
-   free( _VMSMasterEnv->allSchedSlots );
-
-   free( _VMSMasterEnv );
+   VMS__free( _VMSMasterEnv->readyToAnimateQs );
+   VMS__free( _VMSMasterEnv->masterVPs );
+   VMS__free( _VMSMasterEnv->allSchedSlots );
+   
+   //============================= MEASUREMENT STUFF ========================
+   #ifdef STATS__TURN_ON_PROBES
+   freeDynArrayDeep( _VMSMasterEnv->dynIntervalProbesInfo, &VMS__free_probe);
+   #endif
+   //========================================================================
+*/
+      //These are the only two that use system free 
+   VMS_ext__free_free_list( _VMSMasterEnv->freeListHead );
+   free( (void *)_VMSMasterEnv );
  }
 
 
-//===========================================================================
+//================================
 
-inline TSCount getTSC()
- { unsigned int low, high;
-   TSCount  out;
 
-   saveTimeStampCountInto( low, high );
-   out = high;
-   out = (out << 32) + low;
-   return out;
+/*Later, improve this -- for now, just prints msgStr verbatim to stdout and
+ * exits the application with status 1.  reqstPr and excpData are unused.
+ */
+void
+VMS__throw_exception( char *msgStr, VirtProcr *reqstPr, VMSExcp *excpData )
+ {
+   printf( "%s", msgStr );  //never use caller text as the format string
+   fflush( stdout );        //fflush on an input stream is undefined
+   exit(1);
+ }
 
+
diff -r 4fbc2165e493 -r 984f7d78bfdf VMS.h
--- a/VMS.h	Tue Oct 26 18:31:34 2010 -0700
+++ b/VMS.h	Thu Nov 11 06:19:51 2010 -0800
@@ -7,22 +7,54 @@
  */
 
 #ifndef _VMS_H
-#define _VMS_H
+#define	_VMS_H
 #define __USE_GNU
 
 #include "VMS_primitive_data_types.h"
-#include "Queue_impl/BlockingQueue.h"
+#include "Queue_impl/PrivateQueue.h"
 #include "Histogram/Histogram.h"
+#include "DynArray/DynArray.h"
+#include "Hash_impl/PrivateHash.h"
+#include "vmalloc.h"
+
 #include <pthread.h>
+#include <sys/time.h>
 
+
+//===============================  Debug  ===================================
+//
    //When SEQUENTIAL is defined, VMS does sequential exe in the main thread
    // It still does co-routines and all the mechanisms are the same, it just
    // has only a single thread and animates VPs one at a time
 //#define SEQUENTIAL
 
-#define PRINT_DEBUG(msg) //printf(msg); fflush(stdin);
-#define PRINT1_DEBUG(msg, param) //printf(msg, param); fflush(stdin);
-#define PRINT2_DEBUG(msg, p1, p2) //printf(msg, p1, p2); fflush(stdin);
+//#define USE_WORK_STEALING
+
+   //turns on the probe-instrumentation in the application -- when not
+   // defined, the calls to the probe functions turn into comments
+#define STATS__ENABLE_PROBES
+//#define TURN_ON_DEBUG_PROBES
+
+   //These defines turn types of bug messages on and off
+   // be sure debug messages are un-commented (next block of defines)
+#define dbgProbes    FALSE /* for issues inside probes themselves*/
+#define dbgAppFlow   FALSE /* Top level flow of application code -- general*/
+#define dbgB2BMaster FALSE /* in coreloop, back to back master VPs*/
+#define dbgRqstHdlr  FALSE /* in request handler code*/
+
+   //Comment or un- the substitute half to turn on/off types of debug message
+#define DEBUG(  bool, msg)         \
+//   if( bool){ printf(msg); fflush(stdout);}
+#define DEBUG1( bool, msg, param)  \
+//   if(bool){printf(msg, param); fflush(stdout);}
+#define DEBUG2( bool, msg, p1, p2) \
+//   if(bool) {printf(msg, p1, p2); fflush(stdout);}
+   //ERROR macros expand to two statements -- brace them inside if/else
+#define ERROR(msg) printf(msg); fflush(stdout);
+#define ERROR1(msg, param) printf(msg, param); fflush(stdout);
+#define ERROR2(msg, p1, p2) printf(msg, p1, p2); fflush(stdout);
+
+//===========================  STATS =======================
 
    //when MEAS__TIME_STAMP_SUSP is defined, causes code to be inserted and
    // compiled-in that saves the low part of the time stamp count just before
@@ -33,53 +65,97 @@
 #define MEAS__TIME_MASTER
 #define MEAS__NUM_TIMES_TO_RUN 100000
 
+   //For code that calculates normalization-offset between TSC counts of
+   // different cores.
 #define NUM_TSC_ROUND_TRIPS 10
 
+
+//=========================  Hardware related Constants =====================
    //This value is the number of hardware threads in the shared memory
    // machine
 #define NUM_CORES        4
 
-   // balance amortizing master fixed overhead vs imbalance potential
-#define NUM_SCHED_SLOTS  3
+   // tradeoff amortizing master fixed overhead vs imbalance potential
+   // when work-stealing, can make bigger, at risk of losing cache affinity
+#define NUM_SCHED_SLOTS  5
 
 #define MIN_WORK_UNIT_CYCLES 20000
 
-#define READYTOANIMATE_RETRIES 10000
+#define MASTERLOCK_RETRIES 10000
 
-   // stack
-#define VIRT_PROCR_STACK_SIZE 0x10000
+   // stack size in virtual processors created
+#define VIRT_PROCR_STACK_SIZE 0x4000 /* 16K */
 
-   //256M of total memory for VMS__malloc
-#define MASSIVE_MALLOC_SIZE 0x10000000
+   // memory for VMS__malloc
+#define MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE 0x10000000 /* 256M */
 
-#define NUM_PREPEND_BYTES sizeof(FreeListElem) + sizeof(ownerElem);
+
+//==============================
 
 #define SUCCESS 0
 
-#define writeVMSQ     writeSRSWQ
-#define readVMSQ      readSRSWQ
-#define makeVMSQ      makeSRSWQ
-#define VMSQueueStruc SRSWQueueStruc
+#define writeVMSQ     writePrivQ
+#define readVMSQ      readPrivQ
+#define makeVMSQ      makePrivQ
+#define numInVMSQ     numInPrivQ
+#define VMSQueueStruc PrivQueueStruc
 
-//#define thdAttrs NULL  //For PThreads
 
-typedef struct _SchedSlot  SchedSlot;
-typedef struct _VMSReqst   VMSReqst;
-typedef struct _VirtProcr  VirtProcr;
+
+//===========================================================================
+typedef unsigned long long TSCount;
+
+typedef struct _SchedSlot     SchedSlot;
+typedef struct _VMSReqst      VMSReqst;
+typedef struct _VirtProcr     VirtProcr;
+typedef struct _IntervalProbe IntervalProbe;
+typedef struct _GateStruc     GateStruc;
+
 
 typedef VirtProcr * (*SlaveScheduler)  ( void *, int );   //semEnv, coreIdx
 typedef void  (*RequestHandler)  ( VirtProcr *, void * ); //prWReqst, semEnv
 typedef void  (*VirtProcrFnPtr)  ( void *, VirtProcr * ); //initData, animPr
 typedef void    VirtProcrFn      ( void *, VirtProcr * ); //initData, animPr
+typedef void  (*ResumePrFnPtr)   ( VirtProcr *, void * );
+
+
+//============= Requests ===========
+//
+
+enum VMSReqstType   //avoid starting enums at 0, for debug reasons
+ {
+   semantic = 1,
+   createReq,
+   dissipate,
+   VMSSemantic      //goes with VMSSemReqst below
+ };
+
+struct _VMSReqst
+ {
+   enum VMSReqstType  reqType;//used for dissipate and in future for IO requests
+   void              *semReqData;
+
+   VMSReqst *nextReqst;
+ };
+//VMSReqst
+
+enum VMSSemReqstType   //These are equivalent to semantic requests, but for
+ {                     // VMS's services available directly to app, like OS
+   createProbe = 1,    // and probe services -- like a VMS-wide built-in lang
+   openFile,
+   otherIO
+ };
 
 typedef struct
- {
-   void           *endThdPt;
-   unsigned int    coreNum;
+ { enum VMSSemReqstType reqType;
+   VirtProcr           *requestingPr;
+   char                *nameStr;  //for create probe
  }
-ThdParams;
+ VMSSemReq;
 
 
+//====================  Core data structures  ===================
+
 struct _SchedSlot
  {
    int         workIsDone;
@@ -87,24 +163,6 @@
    VirtProcr  *procrAssignedToSlot;
  };
 //SchedSlot
- 
-enum ReqstType
- {
-   semantic = 1,
-   dissipate,
-   regCreated,
-   IO
- };
-
-struct _VMSReqst
- {
-//   VirtProcr   *virtProcrFrom;
-   enum ReqstType  reqType;//used for dissipate and in future for IO requests
-   void           *semReqData;
-
-   VMSReqst *nextReqst;
- };
-//VMSReqst
 
 struct _VirtProcr
  { int         procrID;  //for debugging -- count up each time create
@@ -123,9 +181,10 @@
    SchedSlot  *schedSlot;
    VMSReqst   *requests;
 
-   void       *semanticData;
+   void       *semanticData; //this lives here for the life of VP
+   void       *dataRetFromReq;//values returned from plugin to VP go here
 
-   //============================= MEASUREMENT STUFF ========================
+      //=========== MEASUREMENT STUFF ==========
    #ifdef MEAS__TIME_STAMP_SUSP
    unsigned int preSuspTSCLow;
    unsigned int postSuspTSCLow;
@@ -134,7 +193,8 @@
    unsigned int startMasterTSCLow;
    unsigned int endMasterTSCLow;
    #endif
-   //========================================================================
+   
+   float64      createPtInSecs;  //have space but don't use on some configs
  };
 //VirtProcr
 
@@ -158,37 +218,79 @@
 
    void            *semanticEnv;
    void            *OSEventStruc;   //for future, when add I/O to BLIS
+   MallocProlog    *freeListHead;
+   int32            amtOfOutstandingMem; //total currently allocated
 
    void            *coreLoopStartPt;//addr to jump to to re-enter coreLoop
    void            *coreLoopEndPt;  //addr to jump to to shut down a coreLoop
 
-   int              setupComplete;
-   int              masterLock;
+   int32            setupComplete;
+   int32            masterLock;
 
    VMSStats        *stats;
+   int32            numMasterInARow[NUM_CORES];//detect back-to-back masterVP
+   GateStruc       *workStealingGates[ NUM_CORES ]; //concurrent work-steal
+   int32            workStealingLock;
+   
+   int32            numProcrsCreated; //gives ordering to processor creation
+
+      //=========== MEASUREMENT STUFF =============
+   IntervalProbe  **intervalProbes;
+   PrivDynArrayInfo    *dynIntervalProbesInfo;
+   HashTable       *probeNameHashTbl;
+   int32            masterCreateProbeID;
+   float64          createPtInSecs;
  }
 MasterEnv;
 
+//=========================  Extra Stuff Data Strucs  =======================
+typedef struct
+ {
 
-//==========================================================
+ }
+VMSExcp;
+
+struct _GateStruc          //one per core loop, for work-stealing; coreLoop
+ {                         // keeps its instance on its own stack and stores
+   int32 gateClosed;       // a pointer to it in the master env's
+   int32 preGateProgress;  // workStealingGates array.  At core loop start,
+   int32 waitProgress;     // gateClosed is set FALSE and the three progress
+   int32 exitProgress;     // fields are set to 0
+ };
+//GateStruc
+
+//=======================  OS Thread related  ===============================
 
 void * coreLoop( void *paramsIn );  //standard PThreads fn prototype
 void * coreLoop_Seq( void *paramsIn );  //standard PThreads fn prototype
 void masterLoop( void *initData, VirtProcr *masterPr );
 
 
-//=====================  Global Vars ===================
-
+typedef struct             //what coreLoop receives through its void *paramsIn
+ {
+   void           *endThdPt;
+   unsigned int    coreNum; //core loop takes this as the index of its core
+ }
+ThdParams;
 
 pthread_t       coreLoopThdHandles[ NUM_CORES ];  //pthread's virt-procr state
 ThdParams      *coreLoopThdParams [ NUM_CORES ];
 pthread_mutex_t suspendLock;
 pthread_cond_t  suspend_cond;
 
+
+
+//=====================  Global Vars ===================
+
 volatile MasterEnv      *_VMSMasterEnv;
 
 
-//==========================
+
+
+//===========================  Function Prototypes  =========================
+
+
+//========== Setup and shutdown ==========
 void
 VMS__init();
 
@@ -204,69 +306,59 @@
 VirtProcr *
 VMS__create_procr( VirtProcrFnPtr fnPtr, void *initialData );
 
+void
+VMS__dissipate_procr( VirtProcr *procrToDissipate );
+
+   //Use this to create processor inside entry point & other places outside
+   // the VMS system boundary (IE, not run in slave nor Master)
 VirtProcr *
-VMS__create_the_shutdown_procr();
-
-//==========================
-inline void
-VMS__add_sem_request( void *semReqData, VirtProcr *callingPr );
+VMS_ext__create_procr( VirtProcrFnPtr fnPtr, void *initialData );
 
 void
-VMS__send_req_to_register_new_procr( VirtProcr *newPrToRegister,
-                                      VirtProcr *reqstingPr );
+VMS_ext__dissipate_procr( VirtProcr *procrToDissipate );
 
 void
-VMS__free_request( VMSReqst *req );
+VMS__throw_exception( char *msgStr, VirtProcr *reqstPr, VMSExcp *excpData );
 
 void
-VMS__remove_and_free_top_request( VirtProcr *reqstingPr );
+VMS__shutdown();
+
+void
+VMS__cleanup_at_end_of_shutdown();
+
+
+//==============  Request Related  ===============
+
+void
+VMS__suspend_procr( VirtProcr *callingPr );
+
+inline void
+VMS__add_sem_request_in_mallocd_VMSReqst( void *semReqData, VirtProcr *callingPr );
+
+inline void
+VMS__send_sem_request( void *semReqData, VirtProcr *callingPr );
+
+void
+VMS__send_create_procr_req( void *semReqData, VirtProcr *reqstingPr );
+
+void inline
+VMS__send_dissipate_req( VirtProcr *prToDissipate );
+
+inline void
+VMS__send_VMSSem_request( void *semReqData, VirtProcr *callingPr );
 
 VMSReqst *
-VMS__take_top_request_from( VirtProcr *reqstingPr );
-
-VMSReqst *
-VMS__free_top_and_give_next_request_from( VirtProcr *procrWithReq );
+VMS__take_next_request_out_of( VirtProcr *procrWithReq );
 
 inline void *
 VMS__take_sem_reqst_from( VMSReqst *req );
 
-inline int
-VMS__isSemanticReqst( VMSReqst *req );
-
-inline int
-VMS__isDissipateReqst( VMSReqst *req );
-
-inline int
-VMS__isCreateReqst( VMSReqst *req );
-
-//==========================
-
-void
-VMS__suspend_procr( VirtProcr *callingPr );
-
-void
-VMS__dissipate_procr( VirtProcr *prToDissipate );
-
-void
-VMS__handle_dissipate_reqst( VirtProcr *procrToDissipate );
-
-void
-VMS__cleanup_after_shutdown();
-
-//==========================
-void
-measureTSCOffsetsAsCore0();
-
-void
-measureTSCOffsetsAsRemoteCore( int coreIdx );
-
-//============================= Statistics ==================================
-
-typedef unsigned long long TSCount;
-
    //Frequency of TS counts
    //TODO: change freq for each machine
 #define TSCOUNT_FREQ 3180000000
+//======================== STATS ======================
+
+//===== RDTSC wrapper =====
 
 #define saveTimeStampCountInto(low, high) \
    asm volatile("RDTSC;                   \
@@ -284,10 +376,12 @@
    /* inputs  */ :                        \
    /* clobber */ : "%eax", "%edx"         \
                 );
+//=====
 
-inline TSCount getTSC();
+#include "SwitchAnimators.h"
+#include "probes.h"
 
-inline TSCount getTSC();
+
 
 //===================== Debug ==========================
 int numProcrsCreated;
@@ -298,4 +392,3 @@
 TSCount  *pingTimes;
 
 #endif	/* _VMS_H */
-
diff -r 4fbc2165e493 -r 984f7d78bfdf VMS__DESIGN_NOTES.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VMS__DESIGN_NOTES.txt	Thu Nov 11 06:19:51 2010 -0800
@@ -0,0 +1,2 @@
+
+Implement VMS this way:
diff -r 4fbc2165e493 -r 984f7d78bfdf probes.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/probes.c	Thu Nov 11 06:19:51 2010 -0800
@@ -0,0 +1,354 @@
+/*
+ * Copyright 2010  OpenSourceStewardshipFoundation
+ *
+ * Licensed under BSD
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <sys/time.h>
+#include <string.h>
+
+#include "VMS.h"
+#include "Queue_impl/BlockingQueue.h"
+#include "Histogram/Histogram.h"
+
+
+//================================ STATS ====================================
+
+inline TSCount getTSCount()
+ { unsigned int loHalf, hiHalf;
+   TSCount      stamp;
+
+      //macro fills in the two halves; high one goes into the upper 32 bits
+   saveTimeStampCountInto( loHalf, hiHalf );
+   stamp = ((TSCount)hiHalf << 32) + loHalf;
+   return stamp;
+ }
+
+
+
+//====================  Probes =================
+#ifdef STATS__USE_TSC_PROBES
+
+int32   //TSC variant -- only compiled when STATS__USE_TSC_PROBES is defined
+VMS__create_histogram_probe( int32 numBins, float32 startValue,
+                             float32 binWidth, char *nameStr )
+ { IntervalProbe *newProbe;
+   int32 idx;
+   FloatHist *hist;
+      //NOTE(review): probes.h defines the macro below with two params
+   idx = VMS__create_single_interval_probe( nameStr );
+   newProbe =  _VMSMasterEnv->intervalProbes[ idx ];
+      //NOTE(review): the probe's hist field is declared as DblHist * in probes.h
+   hist =  makeFloatHistogram( numBins, startValue, binWidth );
+   newProbe->hist = hist;
+   return idx;  //index of the new probe within the master env's probe array
+ }
+
+void
+VMS_impl__record_interval_start_in_probe( int32 probeID )
+ {    //TSC variant: stamp the start of an interval into the probe that the
+      // master env's probe array holds at index probeID
+   IntervalProbe *startedProbe = _VMSMasterEnv->intervalProbes[ probeID ];
+   startedProbe->startStamp = getTSCount();
+ }
+
+void
+VMS_impl__record_interval_end_in_probe( int32 probeID )
+ { IntervalProbe *endedProbe;
+   TSCount        stampAtEnd, ticksElapsed;
+
+      //TSC variant: the stamp is taken before the probe is looked up
+   stampAtEnd = getTSCount();
+   endedProbe = _VMSMasterEnv->intervalProbes[ probeID ];
+   endedProbe->endStamp = stampAtEnd;
+
+      //a probe without a histogram only keeps the two raw stamps
+   if( endedProbe->hist == NULL ) return;
+   ticksElapsed = endedProbe->endStamp - endedProbe->startStamp;
+      //if the interval is sane, then add to histogram
+   if( ticksElapsed < endedProbe->hist->endOfRange * 10 )
+      addToFloatHist( ticksElapsed, endedProbe->hist );
+ }
+
+void
+VMS_impl__print_stats_of_probe( int32 probeID )
+ { IntervalProbe *shownProbe = _VMSMasterEnv->intervalProbes[ probeID ];
+
+      //TSC variant: a probe with a histogram prints its name and then the
+      // histogram; a probe without one prints name and interval on one line
+   if( shownProbe->hist != NULL )
+    {
+      printf( "probe: %s\n", shownProbe->nameStr );
+      printFloatHist( shownProbe->hist );
+      return;
+    }
+
+   printf( "probe: %s, interval: %.6lf\n",
+           shownProbe->nameStr,
+           shownProbe->interval );
+ }
+#else
+
+/*
+ * In practice, probe operations are called from the app, from inside slaves
+ *  -- so have to be sure each probe is single-VP owned, and be sure that
+ *  any place common structures are modified it's done inside the master.
+ * So -- the only place common structures are modified is during creation.
+ *  after that, all mods are to individual instances.
+ *
+ * Thinking perhaps should change the semantics to be that probes are
+ *  attached to the virtual processor -- and then everything is guaranteed
+ *  to be isolated -- except then can't take any intervals that span VPs,
+ *  and would have to transfer the probes to Master env when VP dissipates..
+ *  gets messy..
+ *
+ * For now, just making so that probe creation causes a suspend, so that
+ *  the dynamic array in the master env is only modified from the master
+ * 
+ */
+IntervalProbe *
+create_generic_probe( char *nameStr, VirtProcr *animPr )
+ { VMSSemReq reqData;   //on the stack; its address is handed to the request
+
+      //fill in every field, so none is indeterminate when the request is
+   reqData.reqType      = createProbe;      // read on the receiving side
+   reqData.requestingPr = animPr;
+   reqData.nameStr      = nameStr;
+   VMS__send_VMSSem_request( &reqData, animPr );
+
+   return animPr->dataRetFromReq; //result comes back through requesting VP
+ }
+
+/*Use this version from outside VMS -- it uses external malloc, and modifies
+ * dynamic array, so can't be animated in a slave VP
+ */
+IntervalProbe *
+ext__create_generic_probe( char *nameStr )
+ { IntervalProbe *newProbe;
+   int32          nameSize;
+      //probe keeps its own copy of the name, terminating NUL included
+   newProbe          = malloc( sizeof(IntervalProbe) );
+   nameSize          = strlen( nameStr ) + 1; //+1: strlen leaves out the NUL
+   newProbe->nameStr = malloc( nameSize );
+   memcpy( newProbe->nameStr, nameStr, nameSize );
+   newProbe->hist    = NULL;
+   newProbe->schedChoiceWasRecorded = FALSE;
+   newProbe->probeID =
+             addToDynArray( newProbe, _VMSMasterEnv->dynIntervalProbesInfo );
+
+   return newProbe;
+ }
+
+
+/*Only call from inside master or main startup/shutdown thread.  Releases the
+ * histogram, if any, then the name string, if any, then the probe itself */
+void
+VMS_impl__free_probe( IntervalProbe *probe )
+ { if( probe->hist != NULL )   freeDblHist( probe->hist );
+   if( probe->nameStr != NULL) VMS__free( probe->nameStr ); //NOTE(review): VMS__free returns without
+   VMS__free( probe ); // freeing when addr is outside VMS's heap, and ext__create_generic_probe uses malloc
+ }
+
+
+int32
+VMS_impl__record_time_point_into_new_probe( char *nameStr, VirtProcr *animPr)
+ { IntervalProbe  *pointProbe;
+   struct timeval *stamp;
+
+      //endSecs of 0 is what print_probe_helper takes as "single time point"
+   pointProbe          = create_generic_probe( nameStr, animPr );
+   pointProbe->endSecs = 0;
+
+      //the stamp is written straight into the probe
+   stamp = &(pointProbe->startStamp);
+   gettimeofday( stamp, NULL );
+
+      //whole seconds plus micro-seconds, as one double
+   pointProbe->startSecs = stamp->tv_sec + ( stamp->tv_usec / 1000000.0 );
+
+   return pointProbe->probeID;
+ }
+
+int32
+VMS_ext_impl__record_time_point_into_new_probe( char *nameStr )
+ { IntervalProbe  *pointProbe;
+   struct timeval *stamp;
+
+      //same steps as the version that takes a VP, except the probe comes
+      // from the ext creator, which is called directly
+   pointProbe          = ext__create_generic_probe( nameStr );
+   pointProbe->endSecs = 0;
+
+   stamp = &(pointProbe->startStamp);
+   gettimeofday( stamp, NULL );
+
+      //whole seconds plus micro-seconds, as one double
+   pointProbe->startSecs = stamp->tv_sec + ( stamp->tv_usec / 1000000.0 );
+
+   return pointProbe->probeID;
+ }
+
+int32
+VMS_impl__create_single_interval_probe( char *nameStr, VirtProcr *animPr )
+ {    //nothing beyond the generic probe is set up here; the caller gets
+      // the new probe's ID back
+   IntervalProbe *madeProbe = create_generic_probe( nameStr, animPr );
+
+   return madeProbe->probeID;
+ }
+
+int32
+VMS_impl__create_histogram_probe( int32   numBins, float64    startValue,
+               float64 binWidth, char   *nameStr, VirtProcr *animPr )
+ { IntervalProbe *histProbe;
+
+      //generic probe first, then hang a fresh histogram on it; the
+      // record-interval-end code adds to the histogram when hist is non-NULL
+   histProbe       = create_generic_probe( nameStr, animPr );
+   histProbe->hist = makeDblHistogram( numBins, startValue, binWidth );
+
+   return histProbe->probeID;
+ }
+
+void
+VMS_impl__index_probe_by_its_name( int32 probeID, VirtProcr *animPr )
+ { IntervalProbe *namedProbe;   //animPr is not used in here
+
+   //TODO: fix this To be in Master -- race condition
+   namedProbe = _VMSMasterEnv->intervalProbes[ probeID ];
+   addValueIntoTable( namedProbe->nameStr, namedProbe,
+                      _VMSMasterEnv->probeNameHashTbl );
+ }
+
+IntervalProbe *
+VMS_impl__get_probe_by_name( char *probeName, VirtProcr *animPr )
+ { HashTable *nameIndex = _VMSMasterEnv->probeNameHashTbl;
+   //TODO: fix this To be in Master -- race condition
+   return getValueFromTable( probeName, nameIndex );
+ }
+
+
+/*Everything is local to the animating procr, so no need for request, do
+ * work locally, in the anim Pr
+ */
+void
+VMS_impl__record_sched_choice_into_probe( int32 probeID, VirtProcr *animatingPr )
+ { IntervalProbe *chosenProbe = _VMSMasterEnv->intervalProbes[ probeID ];
+
+      //flag is what makes print_probe_helper show the three values below
+   chosenProbe->schedChoiceWasRecorded = TRUE;
+   chosenProbe->coreNum                = animatingPr->coreAnimatedBy;
+   chosenProbe->procrID                = animatingPr->procrID;
+   chosenProbe->procrCreateSecs        = animatingPr->createPtInSecs;
+ }
+
+/*Everything is local to the animating procr, so no need for request, do
+ * work locally, in the anim Pr
+ */
+void
+VMS_impl__record_interval_start_in_probe( int32 probeID )
+ { struct timeval *whereStampGoes;
+
+         DEBUG( dbgProbes, "record start of interval\n" )
+   whereStampGoes = &(_VMSMasterEnv->intervalProbes[ probeID ]->startStamp);
+   gettimeofday( whereStampGoes, NULL );
+ }
+
+
+/*Everything is local to the animating procr, so no need for request, do
+ * work locally, in the anim Pr
+ */
+void
+VMS_impl__record_interval_end_in_probe( int32 probeID )
+ { IntervalProbe  *endedProbe;
+   struct timeval *began, *ended;
+
+         DEBUG( dbgProbes, "record end of interval\n" )
+      //possible seg-fault if array resized by diff core right after this
+      // one gets probe..?  Something like that?  Might be safe.. don't care
+   endedProbe = _VMSMasterEnv->intervalProbes[ probeID ];
+   began      = &(endedProbe->startStamp);
+   ended      = &(endedProbe->endStamp);
+
+      //end stamp goes straight into the probe, next to the start stamp
+      // that the record-start call wrote there
+   gettimeofday( ended, NULL );
+
+      //each stamp becomes seconds held in a double: whole secs plus
+      // micro-secs; the interval is the difference of the two
+   endedProbe->startSecs = began->tv_sec + ( began->tv_usec / 1000000.0 );
+   endedProbe->endSecs   = ended->tv_sec + ( ended->tv_usec / 1000000.0 );
+   endedProbe->interval  = endedProbe->endSecs - endedProbe->startSecs;
+
+      //a probe with no histogram just keeps this one interval
+   if( endedProbe->hist == NULL )
+      return;
+
+      //if the interval is sane, then add to histogram
+   if( endedProbe->interval < endedProbe->hist->endOfRange * 10 )
+      addToDblHist( endedProbe->interval, endedProbe->hist );
+ }
+
+void
+print_probe_helper( IntervalProbe *probe )
+ {    //every probe's output starts with its name; the sched choice, if one
+      // was recorded, comes next; what follows depends on the probe's kind
+   printf( "\nprobe: %s, ",  probe->nameStr );
+
+   if( probe->schedChoiceWasRecorded )
+      printf( "coreNum: %d, procrID: %d, procrCreated: %.6lf | ",
+              probe->coreNum, probe->procrID, probe->procrCreateSecs );
+
+   if( probe->endSecs == 0 ) //just a single point in time
+    { printf( " time point: %.6lf\n",
+              probe->startSecs - _VMSMasterEnv->createPtInSecs );
+      return;
+    }
+   if( probe->hist != NULL ) //a full histogram of intervals
+    { printDblHist( probe->hist );
+      return;
+    }
+      //what's left is just an interval; its start is printed relative to
+      // the master env's createPtInSecs, same as the time point above
+   printf( " startSecs: %.6lf, interval: %.6lf\n",
+           probe->startSecs - _VMSMasterEnv->createPtInSecs,
+           probe->interval );
+ }
+
+//TODO: change so pass around pointer to probe instead of its array-index..
+// will eliminate chance for timing of resize to cause problems with the
+// lookup -- even though don't think it actually can cause problems..
+// there's no need to pass index around -- have hash table for names, and
+// only need it once, then have ptr to probe..  the thing about enum the
+// index and use that as name is clunky in practice -- just hash.
+void
+VMS_impl__print_stats_of_probe( int32 probeID )
+ {
+      //probeID is the probe's index in the master env's probe array, and
+      // the shared helper decides what to print, from the probe's contents
+
+   print_probe_helper( _VMSMasterEnv->intervalProbes[ probeID ] );
+ }
+
+
+
+void
+generic_print_probe( void *_probe )
+ {    //takes a void * so that it can be handed to forAllInDynArrayDo; the
+      // element is treated as an IntervalProbe and printed by the helper
+
+   print_probe_helper( (IntervalProbe *)_probe );
+ }
+
+void
+VMS_impl__print_stats_of_all_probes()
+ {    //hand the print fn to the dyn-array walker, so that it is applied to
+      // the probes held in the array, then flush what was printed
+   forAllInDynArrayDo( _VMSMasterEnv->dynIntervalProbesInfo,
+                       generic_print_probe );
+   fflush( stdout );
+ }
+#endif
diff -r 4fbc2165e493 -r 984f7d78bfdf probes.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/probes.h	Thu Nov 11 06:19:51 2010 -0800
@@ -0,0 +1,194 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ * 
+ */
+
+#ifndef _PROBES_H
+#define	_PROBES_H
+#define __USE_GNU
+
+#include "VMS_primitive_data_types.h"
+
+#include <sys/time.h>
+
+
+   //when STATS__TURN_ON_PROBES is defined allows using probes to measure
+   // time intervals.  The probes are macros that only compile to something
+   // when STATS__TURN_ON_PROBES is defined.  The probes are saved in the
+   // master env -- but only when this is defined.
+   //The TSC probes use RDTSC instr, can be unreliable, Dbl uses gettimeofday
+#define STATS__TURN_ON_PROBES
+//#define STATS__USE_TSC_PROBES
+#define STATS__USE_DBL_PROBES
+
+//typedef struct _IntervalProbe IntervalProbe; //in VMS.h
+
+struct _IntervalProbe
+ {
+   char           *nameStr;  //printed with the probe; key in name hash table
+   int32           probeID;   //index of this probe in master env's array
+
+   int32           schedChoiceWasRecorded; //TRUE once next three are filled
+   int32           coreNum;         //coreAnimatedBy of the recording VP
+   int32           procrID;         //procrID of the recording VP
+   float64         procrCreateSecs; //createPtInSecs of the recording VP
+
+   #ifdef STATS__USE_TSC_PROBES
+   TSCount    startStamp;           //raw values from getTSCount
+   TSCount    endStamp;
+   #else
+   struct timeval  startStamp;      //raw values from gettimeofday
+   struct timeval  endStamp;
+   #endif
+   float64         startSecs;  //start stamp converted to seconds
+   float64         endSecs;    //end stamp in seconds; 0 prints as time point
+   float64         interval;   //endSecs - startSecs
+   DblHist        *hist;//if NULL, then is single interval probe
+ };
+
+
+//============================= Statistics ==================================
+
+   //Frequency of TS counts
+   //TODO: change freq for each machine
+#define TSCOUNT_FREQ 3180000000
+
+inline TSCount getTSCount();
+
+
+//======================== Probes =============================
+//
+// Use macros to allow turning probes off with a #define switch
+#ifdef STATS__ENABLE_PROBES  /*NOTE(review): the switch #defined near the top of this file is STATS__TURN_ON_PROBES -- confirm which name is meant*/
+int32
+VMS_impl__record_time_point_into_new_probe( char *nameStr,VirtProcr *animPr);
+#define VMS__record_time_point_into_new_probe( nameStr, animPr ) \
+        VMS_impl__record_time_point_into_new_probe( nameStr, animPr )
+
+int32
+VMS_ext_impl__record_time_point_into_new_probe( char *nameStr );
+#define VMS_ext__record_time_point_into_new_probe( nameStr ) \
+        VMS_ext_impl__record_time_point_into_new_probe( nameStr )
+
+
+int32
+VMS_impl__create_single_interval_probe( char *nameStr, VirtProcr *animPr );
+#define VMS__create_single_interval_probe( nameStr, animPr ) \
+        VMS_impl__create_single_interval_probe( nameStr, animPr )
+
+
+int32
+VMS_impl__create_histogram_probe( int32   numBins, float64    startValue,
+               float64 binWidth, char    *nameStr, VirtProcr *animPr );
+#define VMS__create_histogram_probe(      numBins, startValue,              \
+                                          binWidth, nameStr, animPr )       \
+        VMS_impl__create_histogram_probe( numBins, startValue,              \
+                                          binWidth, nameStr, animPr )
+void
+VMS_impl__free_probe( IntervalProbe *probe );
+#define VMS__free_probe( probe ) \
+        VMS_impl__free_probe( probe )
+
+void
+VMS_impl__index_probe_by_its_name( int32 probeID, VirtProcr *animPr );
+#define VMS__index_probe_by_its_name( probeID, animPr ) \
+        VMS_impl__index_probe_by_its_name( probeID, animPr )
+
+IntervalProbe *
+VMS_impl__get_probe_by_name( char *probeName, VirtProcr *animPr );
+#define VMS__get_probe_by_name( probeName, animPr ) \
+        VMS_impl__get_probe_by_name( probeName, animPr )
+
+void
+VMS_impl__record_sched_choice_into_probe( int32 probeID, VirtProcr *animPr );
+#define VMS__record_sched_choice_into_probe( probeID, animPr ) \
+        VMS_impl__record_sched_choice_into_probe( probeID, animPr )
+
+void
+VMS_impl__record_interval_start_in_probe( int32 probeID );
+#define VMS__record_interval_start_in_probe( probeID ) \
+        VMS_impl__record_interval_start_in_probe( probeID )
+
+void
+VMS_impl__record_interval_end_in_probe( int32 probeID );
+#define VMS__record_interval_end_in_probe( probeID ) \
+        VMS_impl__record_interval_end_in_probe( probeID )
+
+void
+VMS_impl__print_stats_of_probe( int32 probeID );
+#define VMS__print_stats_of_probe( probeID ) \
+        VMS_impl__print_stats_of_probe( probeID )
+
+void
+VMS_impl__print_stats_of_all_probes();
+#define VMS__print_stats_of_all_probes \
+        VMS_impl__print_stats_of_all_probes
+
+   //Below: probes switched off.  Prototypes stay, to match the definitions
+   // in probes.c, while each macro expands to nothing, or to 0 / NULL
+#else
+int32
+VMS_impl__record_time_point_into_new_probe( char *nameStr,VirtProcr *animPr);
+#define VMS__record_time_point_into_new_probe( nameStr, animPr ) \
+       0 /* do nothing */
+
+int32
+VMS_ext_impl__record_time_point_into_new_probe( char *nameStr );
+#define VMS_ext__record_time_point_into_new_probe( nameStr ) \
+       0 /* do nothing */
+
+
+int32
+VMS_impl__create_single_interval_probe( char *nameStr, VirtProcr *animPr );
+#define VMS__create_single_interval_probe( nameStr, animPr ) \
+       0 /* do nothing */
+
+
+int32
+VMS_impl__create_histogram_probe( int32   numBins, float64    startValue,
+               float64 binWidth, char    *nameStr, VirtProcr *animPr );
+#define VMS__create_histogram_probe(      numBins, startValue,              \
+                                          binWidth, nameStr, animPr )       \
+       0 /* do nothing */
+
+void
+VMS_impl__index_probe_by_its_name( int32 probeID, VirtProcr *animPr );
+#define VMS__index_probe_by_its_name( probeID, animPr ) \
+        /* do nothing */
+
+IntervalProbe *
+VMS_impl__get_probe_by_name( char *probeName, VirtProcr *animPr );
+#define VMS__get_probe_by_name( probeName, animPr ) \
+       NULL /* do nothing */
+
+void
+VMS_impl__record_sched_choice_into_probe( int32 probeID, VirtProcr *animPr );
+#define VMS__record_sched_choice_into_probe( probeID, animPr ) \
+        /* do nothing */
+
+void
+VMS_impl__record_interval_start_in_probe( int32 probeID );
+#define VMS__record_interval_start_in_probe( probeID ) \
+        /* do nothing */
+
+void
+VMS_impl__record_interval_end_in_probe( int32 probeID );
+#define VMS__record_interval_end_in_probe( probeID ) \
+        /* do nothing */
+
+void
+VMS_impl__print_stats_of_probe( int32 probeID );
+#define VMS__print_stats_of_probe( probeID ) \
+        /* do nothing */
+   //function-like here, so that a call written with parens expands to nothing
+void
+VMS_impl__print_stats_of_all_probes();
+#define VMS__print_stats_of_all_probes() \
+        /* do nothing */
+
+#endif   /* defined STATS__ENABLE_PROBES */
+
+#endif	/* _PROBES_H */
+
diff -r 4fbc2165e493 -r 984f7d78bfdf vmalloc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vmalloc.c	Thu Nov 11 06:19:51 2010 -0800
@@ -0,0 +1,327 @@
+/*
+ *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ * Created on November 14, 2009, 9:07 PM
+ */
+
+#include <malloc.h>
+#include <stdlib.h>
+
+#include "VMS.h"
+
+/*Helper function
+ *Insert a newly generated free chunk into the first spot on the free list.
+ * The chunk is cast as a MallocProlog, so the various pointers in it are
+ * accessed with C's help -- and the size of the prolog is easily added to
+ * the pointer when a chunk is returned to the app -- so C handles changes
+ * in pointer sizes among machines.
+ *
+ *The list head is a normal MallocProlog struct -- identified by its
+ * prevChunkInFreeList being NULL -- the only one.
+ *
+ *The end of the list is identified by next chunk being NULL, as usual.
+ */
+void inline
+add_chunk_to_free_list( MallocProlog *chunk, MallocProlog *listHead )
+ { MallocProlog *formerFirst = listHead->nextChunkInFreeList;
+      //chunk goes in between the head and whatever was first before
+   chunk->nextChunkInFreeList = formerFirst;
+   if( formerFirst != NULL ) formerFirst->prevChunkInFreeList = chunk;
+   chunk->prevChunkInFreeList    = listHead;
+   listHead->nextChunkInFreeList = chunk;
+ }
+
+
+/*This is sequential code, meant to only be called from the Master, not from
+ * any slave VPs.
+ *Search down list, checking size by the nextHigherInMem pointer, to find
+ * first chunk bigger than size needed.
+ *Shave off the extra and make it into a new free-list element, hook it in
+ * then return the address of the found element plus size of prolog.
+ *Returns NULL, after invoking ERROR, when no chunk in the free list is big
+ * enough.  Adds what it hands out to the master env's amtOfOutstandingMem.
+ */
+void *
+VMS__malloc( int32 sizeRequested )
+ { MallocProlog *foundElem = NULL, *currElem, *newElem;
+   int32         amountExtra, foundElemIsTopOfHeap, sizeConsumed,sizeOfFound;
+
+      //step up the size to be aligned at 16-byte boundary, prob better ways
+   sizeRequested = ((sizeRequested + 16) >> 4) << 4; //a multiple of 16 still gets 16 more
+   currElem = (_VMSMasterEnv->freeListHead)->nextChunkInFreeList;
+
+   while( currElem != NULL )
+    {    //check if size of currElem is big enough
+      sizeOfFound=(int32)((char*)currElem->nextHigherInMem -(char*)currElem); //chunk's own prolog included
+      amountExtra = sizeOfFound - sizeRequested - sizeof(MallocProlog);
+      if( amountExtra > 0 )
+       {    //found it, get out of loop
+         foundElem = currElem;
+         currElem = NULL;  //sizeOfFound and amountExtra keep found chunk's values
+       }
+      else
+         currElem = currElem->nextChunkInFreeList;
+    }
+
+   if( foundElem == NULL )
+    { ERROR("\nmalloc failed\n")
+      return (void *)NULL;  //indicates malloc failed
+    }
+      //Using a kludge to identify the element that is the top chunk in the
+      // heap -- saving top-of-heap addr in head's nextHigherInMem -- and
+      // save addr of start of heap in head's nextLowerInMem
+      //Will handle top of Heap specially
+   foundElemIsTopOfHeap = foundElem->nextHigherInMem ==
+                          _VMSMasterEnv->freeListHead->nextHigherInMem;
+
+      //before shave off and try to insert new elem, remove found elem
+      //note, foundElem will never be the head, so always has valid prevChunk
+   foundElem->prevChunkInFreeList->nextChunkInFreeList =
+                                              foundElem->nextChunkInFreeList;
+   if( foundElem->nextChunkInFreeList != NULL )
+    { foundElem->nextChunkInFreeList->prevChunkInFreeList =
+                                              foundElem->prevChunkInFreeList;
+    }
+   foundElem->prevChunkInFreeList = NULL;//indicates elem currently allocated
+      //64 or fewer bytes of extra are not split off: they stay in the chunk
+      //if enough, turn extra into new elem & insert it
+   if( amountExtra > 64 )
+    {    //make new elem by adding to addr of curr elem then casting
+      sizeConsumed = sizeof(MallocProlog) + sizeRequested;
+      newElem = (MallocProlog *)( (char *)foundElem + sizeConsumed );
+      newElem->nextHigherInMem   = foundElem->nextHigherInMem;
+      newElem->nextLowerInMem    = foundElem;
+      foundElem->nextHigherInMem = newElem;
+         //the new elem inherits top-of-heap status from the found elem
+      if( ! foundElemIsTopOfHeap )
+       {    //there is no next higher for top of heap, so can't write to it
+         newElem->nextHigherInMem->nextLowerInMem = newElem;
+       }
+      add_chunk_to_free_list( newElem, _VMSMasterEnv->freeListHead );
+    }
+   else
+    {
+      sizeConsumed = sizeOfFound;  //whole chunk handed out, prolog included
+    }
+  _VMSMasterEnv->amtOfOutstandingMem += sizeConsumed;
+
+      //skip over the prolog by adding its size to the pointer return
+   return (void *)((char *)foundElem + sizeof(MallocProlog));
+ }
+
+
+/*This is sequential code -- only to be called from the Master
+ *Subtracts the size of prolog from the pointer to get at the chunk, then
+ * checks the nextLower and nextHigher chunks: coalesces with whichever are
+ * free, and if neither is, adds this chunk to the free-list.
+ *Pointers outside the VMS heap are ignored; freeing a chunk that is
+ * still in the free-list prints an error and exits.  */
+void
+VMS__free( void *ptrToFree )
+ { MallocProlog *elemToFree, *nextLowerElem, *nextHigherElem;
+   int32         lowerExistsAndIsFree, higherExistsAndIsFree, sizeOfElem;
+
+   if( ptrToFree < (void*)_VMSMasterEnv->freeListHead->nextLowerInMem ||
+       ptrToFree > (void*)_VMSMasterEnv->freeListHead->nextHigherInMem )
+    {    //outside the range of data owned by VMS's malloc, so do nothing
+      return;
+    }
+      //subtract size of prolog to get pointer to prolog, then cast
+   elemToFree = (MallocProlog *)((char *)ptrToFree - sizeof(MallocProlog));
+   sizeOfElem =(int32)((char*)elemToFree->nextHigherInMem-(char*)elemToFree);
+
+   if( elemToFree->prevChunkInFreeList != NULL )
+    { printf( "error: freeing same element twice!" ); exit(1);
+    }
+
+   _VMSMasterEnv->amtOfOutstandingMem -= sizeOfElem;
+
+   nextLowerElem  = elemToFree->nextLowerInMem;
+   nextHigherElem = elemToFree->nextHigherInMem;
+
+      //top chunk's nextHigherInMem holds the top-of-heap addr, where no
+      // chunk lives, so must not look inside it -- same test as used below
+   if( nextHigherElem == NULL ||
+       nextHigherElem == _VMSMasterEnv->freeListHead->nextHigherInMem )
+      higherExistsAndIsFree = FALSE;
+   else //okay exists, now check if in the free-list by checking back ptr
+      higherExistsAndIsFree = (nextHigherElem->prevChunkInFreeList != NULL);
+   if( nextLowerElem == NULL )
+      lowerExistsAndIsFree = FALSE;
+   else //okay, it exists, now check if it's free
+      lowerExistsAndIsFree = (nextLowerElem->prevChunkInFreeList != NULL);
+      //now, know what exists and what's free
+   if( lowerExistsAndIsFree )
+    { if( higherExistsAndIsFree )
+       {    //both exist and are free, so coalesce all three
+            //First, remove higher from free-list
+         nextHigherElem->prevChunkInFreeList->nextChunkInFreeList =
+                                         nextHigherElem->nextChunkInFreeList;
+         if( nextHigherElem->nextChunkInFreeList != NULL ) //end-of-list?
+            nextHigherElem->nextChunkInFreeList->prevChunkInFreeList =
+                                         nextHigherElem->prevChunkInFreeList;
+            //Now, fix-up sequence-in-mem list -- by side-effect, this also
+            // changes size of the lower elem, which is still in free-list
+         nextLowerElem->nextHigherInMem = nextHigherElem->nextHigherInMem;
+         if( nextHigherElem->nextHigherInMem !=
+             _VMSMasterEnv->freeListHead->nextHigherInMem )
+            nextHigherElem->nextHigherInMem->nextLowerInMem = nextLowerElem;
+            //notice didn't do anything to elemToFree -- it simply is no
+            // longer reachable from any of the lists.  Wonder if could be a
+            // security leak because left valid addresses in it,
+            // but don't care for now.
+       }
+      else
+       {    //lower is the only of the two that exists and is free,
+            //In this case, no adjustment to free-list, just change mem-list.
+            // By side-effect, changes size of the lower elem
+         nextLowerElem->nextHigherInMem = elemToFree->nextHigherInMem;
+         if( elemToFree->nextHigherInMem !=
+             _VMSMasterEnv->freeListHead->nextHigherInMem )
+            elemToFree->nextHigherInMem->nextLowerInMem = nextLowerElem;
+       }
+    }
+   else
+    {    //lower either doesn't exist or isn't free, so check higher
+      if( higherExistsAndIsFree )
+       {    //higher exists and is the only of the two free
+            //First, in free-list, replace higher elem with the one to free
+         elemToFree->nextChunkInFreeList=nextHigherElem->nextChunkInFreeList;
+         elemToFree->prevChunkInFreeList=nextHigherElem->prevChunkInFreeList;
+         elemToFree->prevChunkInFreeList->nextChunkInFreeList = elemToFree;
+         if( elemToFree->nextChunkInFreeList != NULL ) // end-of-list?
+            elemToFree->nextChunkInFreeList->prevChunkInFreeList =elemToFree;
+            //Now chg mem-list. By side-effect, changes size of elemToFree
+         elemToFree->nextHigherInMem = nextHigherElem->nextHigherInMem;
+         if( elemToFree->nextHigherInMem !=
+             _VMSMasterEnv->freeListHead->nextHigherInMem )
+            elemToFree->nextHigherInMem->nextLowerInMem = elemToFree;
+       }
+      else
+       {    //neither lower nor higher is available to coalesce so add to list
+            // this makes prev chunk ptr non-null, which indicates it's free
+         elemToFree->nextChunkInFreeList =
+                            _VMSMasterEnv->freeListHead->nextChunkInFreeList;
+         _VMSMasterEnv->freeListHead->nextChunkInFreeList = elemToFree;
+         if( elemToFree->nextChunkInFreeList != NULL ) // end-of-list?
+            elemToFree->nextChunkInFreeList->prevChunkInFreeList =elemToFree;
+         elemToFree->prevChunkInFreeList = _VMSMasterEnv->freeListHead;
+       }
+    }
+
+ }
+
+
+/*Allocates memory from the external system -- higher overhead
+ *
+ *Intended design, NOT implemented yet (sketch is commented out in the body):
+ * because of Linux's malloc throwing bizarre random faults when malloc is
+ * used inside a VMS virtual processor, pass this as a request and have the
+ * core loop do it when it next goes to animate the masterVP -- so takes two
+ * separate masterVP animations, separated by work, to complete an external
+ * malloc or external free request.  Thinking core loop accepts signals --
+ * just looks if signal-location is empty or not.
+ *
+ *For now: calls malloc directly and returns its result unchanged, so NULL
+ * can come back -- caller has to check.
+ */
+void *
+VMS__malloc_in_ext( int32 sizeRequested )
+ {
+ /*Sketch of the request-based design -- not compiled
+      //This is running in the master, so no chance for multiple cores to be
+      // competing for the core's flag.
+   if(  *(_VMSMasterEnv->coreLoopSignalAddr[ 0 ]) != 0 )
+    {    //something has already signalled to core loop, so save the signal
+         // and look, next time master animated, to see if can send it.
+         //Note, the addr to put a signal is in the coreloop's frame, so just
+         // checks it each time through -- make it volatile to avoid GCC
+         // optimizations -- it's a coreloop local var that only changes
+         // after jumping away.  The signal includes the addr to send the
+         //return to -- even if just empty return completion-signal
+         //
+         //save the signal in some queue that the master looks at each time
+         // it starts up -- one loc says if empty for fast common case --
+         //something like that -- want to hide this inside this call -- but
+         // think this has to come as a request -- req handler gives procr
+         // back to master loop, which gives it back to req handler at point
+         // it sees that core loop has sent return signal.  Something like
+         // that.
+      saveTheSignal
+
+    }
+  coreSigData->type = malloc;
+  coreSigData->sizeToMalloc = sizeRequested;
+  coreSigData->locToSignalCompletion = &figureOut;
+   _VMSMasterEnv->coreLoopSignals[ 0 ] = coreSigData;
+  */
+      //just risk system-stack faults until get this figured out
+   return malloc( sizeRequested );
+ }
+
+
+/*Frees memory that was allocated in the external system -- higher overhead
+ *
+ *As noted in external malloc comment, the intent is for the free to be
+ * done in the core loop; for now this calls free directly.
+ */
+void
+VMS__free_in_ext( void *ptrToFree )
+ {
+      //just risk system-stack faults until get this figured out
+   free( ptrToFree );
+
+      //TODO: fix this -- so the free is carried out by the core loop
+ }
+
+
+/*Designed to be called from the main thread outside of VMS, during init.
+ *Returns the free-list head; exits if either malloc fails.
+ */
+MallocProlog *
+VMS_ext__create_free_list( void )
+ { MallocProlog *freeListHead, *firstChunk;
+      //Note, this is running in the main thread -- all increases in malloc
+      // mem and all frees of it must be done in this thread, with the
+      // thread's original stack available
+   freeListHead = malloc( sizeof(MallocProlog) );
+   firstChunk   = malloc( MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE );
+      //both get written through just below, so both have to be checked
+   if( freeListHead == NULL || firstChunk == NULL )
+    { printf("malloc error\n"); exit(1); }
+
+   freeListHead->prevChunkInFreeList = NULL;
+      //Use this addr to free the heap when cleanup
+   freeListHead->nextLowerInMem      = firstChunk;
+      //to identify top-of-heap elem, compare this addr to elem's next higher
+   freeListHead->nextHigherInMem     = (void*)( (char*)firstChunk +
+                                         MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE);
+   freeListHead->nextChunkInFreeList = firstChunk;
+
+   firstChunk->nextChunkInFreeList   = NULL;
+   firstChunk->prevChunkInFreeList   = freeListHead;
+      //next Higher has to be set to top of chunk, so can calc size in malloc
+   firstChunk->nextHigherInMem       = (void*)( (char*)firstChunk +
+                                         MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE);
+   firstChunk->nextLowerInMem        = NULL; //identifies as bott of heap
+   _VMSMasterEnv->amtOfOutstandingMem = 0; //none allocated yet
+   return freeListHead;
+ }
+
+
+/*Designed to be called from the main thread outside of VMS, during cleanup
+ *A NULL head is ignored */
+void
+VMS_ext__free_free_list( MallocProlog *freeListHead )
+ { if( freeListHead == NULL ) return;
+      //ptr to the one and only big chunk malloc'd from OS was stashed in the
+      // head's next lower in mem pointer; clear it so a repeat call is safe
+   free( freeListHead->nextLowerInMem );
+   freeListHead->nextLowerInMem = NULL;
+   //don't free the head -- it'll be in an array eventually -- free whole
+   // array when all the free lists linked from it have already been freed
+ }
+
diff -r 4fbc2165e493 -r 984f7d78bfdf vmalloc.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vmalloc.h	Thu Nov 11 06:19:51 2010 -0800
@@ -0,0 +1,52 @@
+/*
+ *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ * Created on November 14, 2009, 9:07 PM
+ */
+#pragma once   //struct _MallocProlog below must not get defined twice
+#include <malloc.h>
+#include "VMS_primitive_data_types.h"
+
+typedef struct _MallocProlog MallocProlog;
+   //sits at the start of each chunk; links it into free list and mem order
+struct _MallocProlog
+ {
+   MallocProlog *nextChunkInFreeList; //NULL marks end of free list
+   MallocProlog *prevChunkInFreeList; //non-NULL indicates chunk is free
+   MallocProlog *nextHigherInMem;     //used to calc chunk size in malloc
+   MallocProlog *nextLowerInMem;      //NULL identifies bottom of heap
+ };
+//MallocProlog
+
+typedef struct    //NOTE(review): the visible free-list code uses a
+ {                // MallocProlog as list head -- confirm this is still needed
+   MallocProlog *firstChunkInFreeList;
+   int32         numInList;
+ }
+FreeListHead;
+
+void *
+VMS__malloc( int32 sizeRequested );
+   //free: appears to coalesce w/ free neighbors in mem, else add to free list
+void
+VMS__free( void *ptrToFree );
+
+/*Allocates memory from the external system -- higher overhead.
+ *For now, simply returns whatever malloc( sizeRequested ) returns */
+void *
+VMS__malloc_in_ext( int32 sizeRequested );
+
+/*Frees memory that was allocated in the external system -- higher overhead.
+ *For now, simply calls free( ptrToFree ) */
+void
+VMS__free_in_ext( void *ptrToFree );
+
+/*Main thread only, outside of VMS: create during init, free during cleanup */
+MallocProlog *
+VMS_ext__create_free_list( void );
+
+void
+VMS_ext__free_free_list( MallocProlog *freeListHead );