# HG changeset patch
# User Some Random Person <seanhalle@yahoo.com>
# Date 1329939552 28800
# Node ID eaf7e4c58c9e0b9279d9cbf16e3a17c18b4a0bc1
# Parent  bc4cb994f11451921a3c960a34de1747560a672b
Create common_ancestor brch -- all branches will be closed, then new ones
created with this as the common ancestor of all branches -- it is incomplete!
only code that is common to all HW and Feat and FeatDev branches is in here

diff -r bc4cb994f114 -r eaf7e4c58c9e .hgignore
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgignore	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,3 @@
+syntax: glob
+
+*.o
diff -r bc4cb994f114 -r eaf7e4c58c9e .hgtags
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgtags	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,1 @@
+9c3107044f86c36fea3a8f72f64910b1363555be Dec27_2010_about_to_add_sched_record
diff -r bc4cb994f114 -r eaf7e4c58c9e CoreLoop.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CoreLoop.c	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,214 @@
+/*
+ * Copyright 2010  OpenSourceStewardshipFoundation
+ *
+ * Licensed under BSD
+ */
+
+
+#include "VMS.h"
+#include "ProcrContext.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+
+#include <pthread.h>
+#include <sched.h>
+
+void *terminateCoreLoop(SlaveVP *currPr);
+
+/*This is the loop that runs in the OS Thread pinned to each core
+ *Get virt procr from queue,
+ * save state of current animator, then load in state of virt procr, using
+ * jmp instr to switch the program-counter state -- making the virt procr
+ * the new animator.
+ *At some point, the virt procr will suspend itself by saving out its
+ * animator state (stack ptr, frame ptr, program counter) and switching
+ * back to the OS Thread's animator state, which means restoring the
+ * stack and frame and jumping to the core loop start point.
+ *This cycle then repeats, until a special shutdown virtual processor is
+ * animated, which jumps to the end point at the bottom of core loop.
+ */
+void *
+coreLoop( void *paramsIn )
+ { 
+   ThdParams      *coreLoopThdParams;
+   int             thisCoresIdx;
+   SlaveVP        *currPr;
+   VMSQueueStruc  *readyToAnimateQ;
+   cpu_set_t       coreMask;  //has 1 in bit positions of allowed cores
+   int             errorCode;
+
+      //work-stealing struc on stack to prevent false-sharing in cache-line
+   volatile GateStruc gate;
+      //its fields: preGateProgress, waitProgress, exitProgress, gateClosed
+
+      //paramsIn is the ThdParams of this thread; it carries the core number
+   coreLoopThdParams = (ThdParams *)paramsIn;
+   thisCoresIdx = coreLoopThdParams->coreNum;
+
+   gate.gateClosed      = FALSE;
+   gate.preGateProgress = 0;
+   gate.waitProgress    = 0;
+   gate.exitProgress    = 0;
+   _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = (GateStruc*)&gate;//race @startup
+
+      //wait until signalled that setup is complete
+   pthread_mutex_lock(   &suspendLock );
+   while( !(_VMSMasterEnv->setupComplete) )
+    {
+      pthread_cond_wait( &suspend_cond,
+                         &suspendLock );
+    }
+   pthread_mutex_unlock( &suspendLock );
+
+      //printf( "\nCore unsuspended: %d\n", coreLoopThdParams->coreNum );
+
+      //set thread affinity
+      //Linux requires pinning thd to core inside thread-function
+      //Designate a core by a 1 in bit-position corresponding to the core
+   CPU_ZERO(&coreMask);
+   CPU_SET(coreLoopThdParams->coreNum,&coreMask);
+   //coreMask = 1L << coreLoopThdParams->coreNum;
+
+   pthread_t selfThd = pthread_self();
+   errorCode =
+   pthread_setaffinity_np( selfThd, sizeof(coreMask), &coreMask);
+   if( errorCode )  //a failed pin is an error: report on stderr, exit non-zero
+    { fprintf( stderr, "\nset affinity failure\n" ); exit( EXIT_FAILURE ); }
+
+   
+   //Save the return address in the SwitchVP function
+   saveCoreLoopReturnAddr((void**)&(_VMSMasterEnv->coreLoopReturnPt));
+
+   
+   while(1){
+   
+      //Get virtual processor from queue
+      //The Q must be a global, static volatile var, so not kept in reg,
+      // which forces reloading the pointer after each jmp to this point
+   readyToAnimateQ  = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx];
+
+   #ifdef USE_WORK_STEALING
+      //Alg for work-stealing designed to make common case fast.  Comment
+      // in stealer code explains.
+   gate.preGateProgress++;
+   if( gate.gateClosed )
+    {    //now, set coreloop's progress, so stealer can see that core loop
+         // has made it into the waiting area.
+      gate.waitProgress = gate.preGateProgress;
+      while( gate.gateClosed ) /*busy wait*/;
+    }
+
+   currPr = (SlaveVP *) readVMSQ( readyToAnimateQ );
+
+      //Set the coreloop's progress, so stealer can see it has made it out
+      // of the protected area
+   gate.exitProgress = gate.preGateProgress;
+   #else
+   currPr = (SlaveVP *) readVMSQ( readyToAnimateQ );
+   #endif
+
+   if( currPr != NULL ) _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0;
+   else
+    {
+      //============================= MEASUREMENT STUFF =====================
+      #ifdef MEAS__TIME_MASTER_LOCK
+      int32 startStamp, endStamp;
+      saveLowTimeStampCountInto( startStamp );
+      #endif
+      //=====================================================================
+      int tries = 0; int gotLock = 0;
+      while( currPr == NULL ) //if queue was empty, enter get masterLock loop
+       {    //queue was empty, so get master lock
+
+         gotLock = __sync_bool_compare_and_swap(&(_VMSMasterEnv->masterLock),
+                                                          UNLOCKED, LOCKED );
+         if( gotLock )
+          {    //run own MasterVP -- jmps to coreLoops startPt when done
+            currPr = _VMSMasterEnv->masterVPs[thisCoresIdx];
+            if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 )
+             {       DEBUG( dbgB2BMaster,"Many back to back MasterVPs\n");
+               sched_yield();  //pthread_yield() is deprecated in glibc
+             }
+            _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1;
+            break;  //end while -- have a VP to animate now
+          }
+
+         tries++;      //if too many, means master on other core taking too long
+         if( tries > MASTERLOCK_RETRIES ) { tries = 0; sched_yield(); }
+       }
+      //============================= MEASUREMENT STUFF =====================
+      #ifdef MEAS__TIME_MASTER_LOCK
+      saveLowTimeStampCountInto( endStamp );
+      addIntervalToHist( startStamp, endStamp,
+                         _VMSMasterEnv->masterLockLowTimeHist );
+      addIntervalToHist( startStamp, endStamp,
+                         _VMSMasterEnv->masterLockHighTimeHist );
+      #endif
+      //=====================================================================
+
+    }
+
+   
+   switchToVP(currPr); //The VPs return in here
+   flushRegisters();
+   }//CoreLoop      
+ }
+
+
+void *
+terminateCoreLoop(SlaveVP *currPr){
+   //Dissipates the shutdown VP that jumped here, then ends the calling OS
+   // thread. That VP first restores the coreloop's stack, so currPr is valid
+   VMS_int__dissipate_procr( currPr );
+   pthread_exit( NULL );   //ends the thread, so control never reaches the brace
+}
+
+
+
+#ifdef SEQUENTIAL
+
+//===========================================================================
+/*This sequential version is exact same as threaded, except doesn't do the
+ * pin-threads part, nor the wait until setup complete part.
+ */
+void *
+coreLoop_Seq( void *paramsIn )
+ {
+   SlaveVP       *currPr;
+   VMSQueueStruc *readyToAnimateQ;
+   int            thisCoresIdx;
+   
+      //The sequential version always acts as core 0, so the thread params
+      // handed in are not looked at.
+   (void)paramsIn;
+   thisCoresIdx = 0;
+
+
+   //Save the return address in the SwitchVP function (same cast as coreLoop)
+   saveCoreLoopReturnAddr((void**)&(_VMSMasterEnv->coreLoopReturnPt));
+
+   
+   while(1){
+      //Get virtual processor from queue
+      //_VMSWorkQ must be a global, static volatile var, so not kept in reg,
+      // which forces reloading the pointer after each jmp to this point
+   readyToAnimateQ  = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx];
+   currPr = (SlaveVP *) readVMSQ( readyToAnimateQ );
+   if( currPr == NULL )
+    {    //nothing ready, so animate the MasterVP; give up after too many in a row
+      if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 )
+       { printf("too many back to back MasterVP\n"); exit(1); }
+      _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1;
+      currPr = _VMSMasterEnv->masterVPs[thisCoresIdx];
+    }
+   else
+      _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0;
+
+
+   switchToVP( currPr );
+   flushRegisters();
+   }
+ }
+#endif
diff -r bc4cb994f114 -r eaf7e4c58c9e MasterLoop.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MasterLoop.c	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,373 @@
+/*
+ * Copyright 2010  OpenSourceStewardshipFoundation
+ * 
+ * Licensed under BSD
+ */
+
+
+
+#include <stdio.h>
+#include <stddef.h>
+
+#include "VMS.h"
+#include "ProcrContext.h"
+
+
+//===========================================================================
+void inline
+stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
+               SlaveVP *masterPr );
+void inline   //masterLoop calls this before its definition further down
+gateProtected_stealWorkInto( SchedSlot *currSlot,
+                    VMSQueueStruc *myReadyToAnimateQ, SlaveVP *masterPr );
+//===========================================================================
+
+/*This code is animated by the virtual Master processor.
+ *
+ *Polls each sched slot exactly once, hands any requests made by a newly
+ * done slave to the "request handler" plug-in function
+ *
+ *Any slots that need a virt procr assigned are given to the "schedule"
+ * plug-in function, which tries to assign a virt procr (slave) to it.
+ *
+ *When all slots needing a processor have been given to the schedule plug-in,
+ * a fraction of the procrs successfully scheduled are put into the
+ * work queue, then a continuation of this function is put in, then the rest
+ * of the virt procrs that were successfully scheduled.
+ *
+ *The first thing the continuation does is busy-wait until the previous
+ * animation completes.  This is because an (unlikely) continuation may
+ * sneak through queue before previous continuation is done putting second
+ * part of scheduled slaves in, which is the only race condition.
+ *
+ */
+
+/*May 29, 2010 -- birth a Master during init so that first core loop to
+ * start running gets it and does all the stuff for a newly born --
+ * from then on, will be doing continuation, but do suspension self
+ * directly at end of master loop
+ *So VMS__init just births the master virtual processor same way it births
+ * all the others -- then does any extra setup needed and puts it into the
+ * work queue.
+ *However means have to make masterEnv a global static volatile the same way
+ * did with readyToAnimateQ in core loop.  -- for performance, put the
+ * jump to the core loop directly in here, and have it directly jump back.
+ *
+ *
+ *Aug 18, 2010 -- Going to a separate MasterVP for each core, to see if this
+ * avoids the suspected bug in the system stack that causes bizarre faults
+ * at random places in the system code.
+ *
+ *So, this function is coupled to each of the MasterVPs, -- meaning this
+ * function can't rely on a particular stack and frame -- each MasterVP that
+ * animates this function has a different one.
+ *
+ *At this point, the masterLoop does not write itself into the queue anymore,
+ * instead, the coreLoop acquires the masterLock when it has nothing to
+ * animate, and then animates its own masterLoop.  However, still try to put
+ * several AppVPs into the queue to amortize the startup cost of switching
+ * to the MasterVP.  Note, don't have to worry about latency of requests much
+ * because most requests generate work for same core -- only latency issue
+ * is case when other cores starved and one core's requests generate work
+ * for them -- so keep max in queue to 3 or 4..
+ */
+void masterLoop( void *initData, SlaveVP *animatingPr )
+ { 
+   int32           slotIdx, numSlotsFilled;
+   SlaveVP      *schedVirtPr;
+   SchedSlot      *currSlot, **schedSlots;
+   MasterEnv      *masterEnv;
+   VMSQueueStruc  *readyToAnimateQ;
+   
+   Sched_Assigner  slaveScheduler;
+   RequestHandler  requestHandler;
+   void           *semanticEnv;
+
+   int32           thisCoresIdx;
+   SlaveVP      *masterPr;
+   SlaveVP *volatile volatileMasterPr; //the pointer itself is the volatile object
+   
+   volatileMasterPr = animatingPr;
+   masterPr         = volatileMasterPr; //used to force re-define after jmp
+
+      //First animation of each MasterVP will in turn animate this part
+      // of setup code.. (VP creator sets up the stack as if this function
+      // was called normally, but actually get here by jmp)
+      //So, setup values about stack ptr, jmp pt and all that
+   //masterPr->resumeInstrPtr = &&masterLoopStartPt;
+
+
+      //Note, got rid of writing the stack and frame ptr up here, because
+      // only one
+      // core can ever animate a given MasterVP, so don't need to communicate
+      // new frame and stack ptr to the MasterVP storage before a second
+      // version of that MasterVP can get animated on a different core.
+      //Also got rid of the busy-wait.
+
+   
+   //masterLoopStartPt:
+   while(1){
+       
+   //============================= MEASUREMENT STUFF ========================
+   #ifdef MEAS__TIME_MASTER
+      //Total Master time includes one coreloop time -- just assume the core
+      // loop time is same for Master as for AppVPs, even though it may be
+      // smaller due to higher predictability of the fixed jmp.
+   saveLowTimeStampCountInto( masterPr->startMasterTSCLow );
+   #endif
+   //========================================================================
+
+   masterEnv        = (MasterEnv*)_VMSMasterEnv;
+   
+      //Reading the volatile pointer makes the compiler reload it from memory
+   masterPr         = volatileMasterPr;  //just to make sure after jmp
+   thisCoresIdx     = masterPr->coreAnimatedBy;
+   readyToAnimateQ  = masterEnv->readyToAnimateQs[thisCoresIdx];
+   schedSlots       = masterEnv->allSchedSlots[thisCoresIdx];
+
+   requestHandler   = masterEnv->requestHandler;
+   slaveScheduler   = masterEnv->slaveSchedAssigner;
+   semanticEnv      = masterEnv->semanticEnv;
+
+
+      //Poll each slot's Done flag
+   numSlotsFilled = 0;
+   for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++)
+    {
+      currSlot = schedSlots[ slotIdx ];
+
+      if( currSlot->workIsDone )
+       {
+         currSlot->workIsDone         = FALSE;
+         currSlot->needsProcrAssigned = TRUE;
+
+            //process requests from slave to master
+               //====================== MEASUREMENT STUFF ===================
+               #ifdef MEAS__TIME_PLUGIN
+               int32 startStamp1, endStamp1;
+               saveLowTimeStampCountInto( startStamp1 );
+               #endif
+               //============================================================
+         (*requestHandler)( currSlot->procrAssignedToSlot, semanticEnv );
+               //====================== MEASUREMENT STUFF ===================
+               #ifdef MEAS__TIME_PLUGIN
+               saveLowTimeStampCountInto( endStamp1 );
+               addIntervalToHist( startStamp1, endStamp1,
+                                        _VMSMasterEnv->reqHdlrLowTimeHist );
+               addIntervalToHist( startStamp1, endStamp1,
+                                        _VMSMasterEnv->reqHdlrHighTimeHist );
+               #endif
+               //============================================================
+       }
+      if( currSlot->needsProcrAssigned )
+       {    //give slot a new virt procr
+         schedVirtPr =
+          (*slaveScheduler)( semanticEnv, thisCoresIdx );
+         
+         if( schedVirtPr != NULL )
+          { currSlot->procrAssignedToSlot = schedVirtPr;
+            schedVirtPr->schedSlot        = currSlot;
+            currSlot->needsProcrAssigned  = FALSE;
+            numSlotsFilled               += 1;
+            
+            writeVMSQ( schedVirtPr, readyToAnimateQ );
+          }
+       }
+    }
+
+   
+   #ifdef USE_WORK_STEALING
+      //If no slots filled, means no more work, look for work to steal.
+   if( numSlotsFilled == 0 )
+    { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterPr );
+    } //NOTE(review): currSlot is simply the last slot polled -- confirm it is free
+   #endif
+
+   
+   #ifdef MEAS__TIME_MASTER
+   saveLowTimeStampCountInto( masterPr->endMasterTSCLow );
+   #endif
+
+   masterSwitchToCoreLoop( masterPr ); //same VP as animatingPr, but reloaded
+   flushRegisters();
+   }//MasterLoop
+
+
+ }
+
+
+
+/*This has a race condition -- the coreloops are accessing their own queues
+ * at the same time that this work-stealer on a different core is trying to
+ * read from them. */
+void inline
+stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
+               SlaveVP *masterPr )
+ {
+   SlaveVP       *taken = NULL;
+   VMSQueueStruc *candidateQ;
+   int32          victimIdx, numTried;
+
+      //Look at each of the other cores' queues once, beginning with the
+      // core after the animating one and wrapping from the last core to 0.
+      //The first queue that reports any contents is read from.
+   victimIdx = masterPr->coreAnimatedBy;
+   for( numTried = 0; numTried < NUM_CORES -1; numTried++ )
+    {
+      victimIdx  = ( victimIdx >= NUM_CORES -1 ) ? 0 : victimIdx + 1;
+      candidateQ = _VMSMasterEnv->readyToAnimateQs[victimIdx];
+      if( numInVMSQ( candidateQ ) <= 0 ) continue;
+
+      taken = readVMSQ( candidateQ );
+      break;
+    }
+
+      //Either no queue had work, or the read came back empty
+   if( taken == NULL ) return;
+
+      //Tie the taken VP to the slot handed in, then queue it on the
+      // caller's own ready queue
+   currSlot->procrAssignedToSlot = taken;
+   taken->schedSlot              = currSlot;
+   currSlot->needsProcrAssigned  = FALSE;
+
+   writeVMSQ( taken, readyToAnimateQ );
+ }
+
+/*This algorithm makes the common case fast.  Make the coreloop passive,
+ * and show its progress.  Make the stealer control a gate that coreloop
+ * has to pass.
+ *To avoid interference, only one stealer at a time.  Use a global
+ * stealer-lock.
+ *
+ *The pattern is based on a gate -- stealer shuts the gate, then monitors
+ * to be sure any already past make it all the way out, before starting.
+ *So, have a "progress" measure just before the gate, then have two after it,
+ * one is in a "waiting room" outside the gate, the other is at the exit.
+ *Then, the stealer first shuts the gate, then checks the progress measure
+ * outside it, then looks to see if the progress measure at the exit is the
+ * same.  If yes, it knows the protected area is empty 'cause no other way
+ * to get in and the last to get in also exited.
+ *If the progress measure at the exit is not the same, then the stealer goes
+ * into a loop checking both the waiting-area and the exit progress-measures
+ * until one of them shows the same as the measure outside the gate.  Might
+ * as well re-read the measure outside the gate each go around, just to be
+ * sure.  It is guaranteed that one of the two will eventually match the one
+ * outside the gate.
+ *
+ *Here's an informal proof of correctness:
+ *The gate can be closed at any point, and have only four cases:
+ *  1) coreloop made it past the gate-closing but not yet past the exit
+ *  2) coreloop made it past the pre-gate progress update but not yet past
+ *     the gate,
+ *  3) coreloop is right before the pre-gate update
+ *  4) coreloop is past the exit and far from the pre-gate update.
+ *
+ * Covering the cases in reverse order,
+ *  4) is not a problem -- stealer will read pre-gate progress, see that it
+ *     matches exit progress, and the gate is closed, so stealer can proceed.
+ *  3) stealer will read pre-gate progress just after coreloop updates it..
+ *     so stealer goes into a loop until the coreloop causes wait-progress
+ *     to match pre-gate progress, so then stealer can proceed
+ *  2) same as 3..
+ *  1) stealer reads pre-gate progress, sees that it's different than exit,
+ *     so goes into loop until exit matches pre-gate, now it knows coreloop
+ *     is not in protected and cannot get back in, so can proceed.
+ *
+ *Implementation for the stealer:
+ *
+ *First, acquire the stealer lock -- only cores with no work to do will
+ * compete to steal, so not a big performance penalty having only one --
+ * will rarely have multiple stealers in a system with plenty of work -- and
+ * in a system with little work, it doesn't matter.
+ *
+ *Note, have single-reader, single-writer pattern for all variables used to
+ * communicate between stealer and victims
+ *
+ *So, scan the queues of the core loops, until find non-empty.  Each core
+ * has its own list that it scans.  The list goes in order from closest to
+ * furthest core, so it steals first from close cores.  Later can add
+ * taking info from the app about overlapping footprints, and scan all the
+ * others then choose work with the most footprint overlap with the contents
+ * of this core's cache.
+ *
+ *Now, have a victim want to take work from.  So, shut the gate in that
+ * coreloop, by setting the "gate closed" var on its stack to TRUE.
+ *Then, read the core's pre-gate progress and compare to the core's exit
+ * progress.
+ *If same, can proceed to take work from the coreloop's queue.  When done,
+ * write FALSE to gate closed var.
+ *If different, then enter a loop that reads the pre-gate progress, then
+ * compares to exit progress then to wait progress.  When one of two
+ * matches, proceed.  Take work from the coreloop's queue.  When done,
+ * write FALSE to the gate closed var.
+ * 
+ */
+void inline
+gateProtected_stealWorkInto( SchedSlot *currSlot,
+                             VMSQueueStruc *myReadyToAnimateQ,
+                             SlaveVP *masterPr )
+ {
+   SlaveVP     *stolenPr;
+   int32          coreIdx, i, haveAVictim, gotLock;
+   VMSQueueStruc *victimsQ;
+
+   volatile GateStruc *vicGate;   //gate of the core chosen as victim
+   int32               coreMightBeInProtected;
+
+      //Steals at most one VP: finds another core whose ready queue is not
+      // empty, takes the work-stealing lock, closes that core's gate, reads
+      // one entry from its queue, then puts it into currSlot and own queue.
+      //see if any other cores have work available to steal
+   haveAVictim = FALSE;
+   coreIdx = masterPr->coreAnimatedBy;
+   for( i = 0; i < NUM_CORES -1; i++ )
+    {
+      if( coreIdx >= NUM_CORES -1 )
+       { coreIdx = 0;
+       }
+      else
+       { coreIdx++;
+       }
+      victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
+      if( numInVMSQ( victimsQ ) > 0 )
+       { haveAVictim = TRUE;
+         vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ];
+         break;
+       }
+    }
+   if( !haveAVictim ) return;  //no work to steal, exit
+
+      //have a victim core, now get the stealer-lock
+   gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock),
+                                                          UNLOCKED, LOCKED );
+   if( !gotLock ) return; //go back to core loop, which will re-start master
+
+      //From here on the lock is held; it is released at the bottom
+   //====== Start Gate-protection =======
+   vicGate->gateClosed = TRUE;
+   coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress;
+   while( coreMightBeInProtected )
+    {    //spin until victim is seen waiting at the gate or past the exit
+      if( vicGate->preGateProgress == vicGate->waitProgress )
+         coreMightBeInProtected = FALSE;
+      if( vicGate->preGateProgress == vicGate->exitProgress )
+         coreMightBeInProtected = FALSE;
+    }
+
+   stolenPr = readVMSQ ( victimsQ );
+
+   vicGate->gateClosed = FALSE;   //victim's busy-wait on the gate ends here
+   //======= End Gate-protection  =======
+
+
+   if( stolenPr != NULL )  //victim could have been in protected and taken
+    { currSlot->procrAssignedToSlot = stolenPr;
+      stolenPr->schedSlot           = currSlot;
+      currSlot->needsProcrAssigned  = FALSE;
+
+      writeVMSQ( stolenPr, myReadyToAnimateQ );
+    }
+
+      //unlock the work stealing lock
+   _VMSMasterEnv->workStealingLock = UNLOCKED;
+ }
diff -r bc4cb994f114 -r eaf7e4c58c9e VMS.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VMS.h	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,377 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ * 
+ */
+
+#ifndef _VMS_H
+#define	_VMS_H
+#define _GNU_SOURCE
+
+#include "VMS_primitive_data_types.h"
+#include "C_Libraries/DynArray/DynArray.h"
+#include "C_Libraries/Hash_impl/PrivateHash.h"
+#include "C_Libraries/Histogram/Histogram.h"
+#include "C_Libraries/Queue_impl/PrivateQueue.h"
+#include "vmalloc.h"
+
+#include <pthread.h>
+#include <sys/time.h>
+
+//=================  Defines: included from separate files  =================
+//
+// Note: ALL defines are in other files, none are in here
+//
+#include "VMS_defs__main.h"
+
+
+//================================ Typedefs =================================
+//
+typedef unsigned long long TSCount;
+typedef union
+ { uint32 lowHigh[2];
+   uint64 longVal;
+ }
+TSCountLowHigh;
+
+typedef struct _SchedSlot     SchedSlot;
+typedef struct _VMSReqst      VMSReqst;
+typedef struct _SlaveVP       SlaveVP;
+typedef struct _MasterVP      MasterVP;
+typedef struct _IntervalProbe IntervalProbe;
+typedef struct _GateStruc     GateStruc;
+
+
+typedef SlaveVP * (*Sched_Assigner)  ( void *, int );   //semEnv, coreIdx
+typedef void  (*RequestHandler)  ( SlaveVP *, void * ); //prWReqst, semEnv
+typedef void  (*TopLevelFnPtr)  ( void *, SlaveVP * ); //initData, animPr
+typedef void    TopLevelFn      ( void *, SlaveVP * ); //initData, animPr
+typedef void  (*ResumeVPFnPtr)   ( SlaveVP *, void * );
+
+//============================= Statistics ==================================
+
+inline TSCount getTSCount();
+
+//============= Request Related ===========
+//
+
+enum VMSReqstType   //avoid starting enums at 0, for debug reasons
+ {
+   semantic = 1,
+   createReq,
+   dissipate,
+   VMSSemantic      //goes with VMSSemReqst below
+ };
+
+struct _VMSReqst
+ {
+   enum VMSReqstType  reqType;//used for dissipate and in future for IO requests
+   void              *semReqData;
+
+   VMSReqst *nextReqst;
+ };
+//VMSReqst
+
+enum VMSSemReqstType   //These are equivalent to semantic requests, but for
+ {                     // VMS's services available directly to app, like OS
+   createProbe = 1,    // and probe services -- like a VMS-wide built-in lang
+   openFile,
+   otherIO
+ };
+
+typedef struct
+ { enum VMSSemReqstType reqType;
+   SlaveVP           *requestingPr;
+   char                *nameStr;  //for create probe
+ }
+ VMSSemReq;
+
+
+//====================  Core data structures  ===================
+
+struct _SchedSlot
+ {
+   int         workIsDone;          //masterLoop polls this and resets it to FALSE
+   int         needsProcrAssigned;  //TRUE until masterLoop or a stealer fills slot
+   SlaveVP  *procrAssignedToSlot;   //VP placed in the slot; passed to request handler
+ };
+//SchedSlot
+
+/*WARNING: re-arranging this data structure could cause VP switching
+ *         assembly code to fail -- hard-codes offsets of fields
+ */
+struct _SlaveVP
+ { int         procrID;  //for debugging -- count up each time create
+   int         coreAnimatedBy; //core index; masterLoop uses it to pick its queue
+   void       *startOfStack;   //lowest addr of the stack (top = this + stack size)
+   void       *stackPtr;
+   void       *framePtr;
+   void       *resumeInstrPtr;
+   
+   void       *coreLoopStartPt;  //allows proto-runtime to be linked later
+   void       *coreLoopFramePtr; //restore before jmp back to core loop
+   void       *coreLoopStackPtr; //restore before jmp back to core loop
+
+   SchedSlot  *schedSlot;   //slot this VP was assigned to by masterLoop/stealer
+   VMSReqst   *requests;
+
+   void       *semanticData; //this lives here for the life of VP
+   void       *dataRetFromReq;//values returned from plugin to VP go here
+
+      //=========== MEASUREMENT STUFF ==========
+       #ifdef MEAS__TIME_STAMP_SUSP
+       uint32  preSuspTSCLow;
+       uint32  postSuspTSCLow;
+       #endif
+       #ifdef MEAS__TIME_MASTER /* in SlaveVP because multiple masterVPs*/
+       uint32  startMasterTSCLow;
+       uint32  endMasterTSCLow;
+       #endif
+       #ifdef MEAS__TIME_2011_SYS
+       TSCountLowHigh  startSusp;
+       uint64  totalSuspCycles;
+       uint32  numGoodSusp;
+       #endif
+      //========================================
+   
+   float64      createPtInSecs;  //have space but don't use on some configs
+ };
+//SlaveVP
+
+
+/*WARNING: re-arranging this data structure could cause VP-switching
+ *         assembly code to fail -- hard-codes offsets of fields
+ *         (because -O3 messes with things otherwise)
+ */
+typedef struct
+ {
+   union{ //adds padding to put masterLock on its own cache-line to elim
+          // false sharing (masterLock is most-accessed var in VMS)
+        volatile int32   masterLock;
+        char             padding[CACHE_LINE_SZ];    
+   } masterLockUnion; //NOTE(review): coreLoop does its CAS on ->masterLock directly -- confirm how that name resolves
+   Sched_Assigner   slaveSchedAssigner; //masterLoop calls it for each slot needing a VP
+   RequestHandler   requestHandler;     //masterLoop calls it for each slot whose work is done
+   
+   SchedSlot     ***allSchedSlots;    //indexed by core, gives that core's slot array
+   VMSQueueStruc **readyToAnimateQs;  //indexed by core
+   SlaveVP      **masterVPs;          //indexed by core
+
+   void            *semanticEnv;    //handed to both plugin functions by masterLoop
+   void            *OSEventStruc;   //for future, when add I/O to BLIS
+   MallocArrays    *freeLists;
+   int32            amtOfOutstandingMem; //total currently allocated
+
+   void            *coreLoopReturnPt;//addr to jump to to re-enter coreLoop
+
+   int32            setupComplete;  //coreLoop waits on suspend_cond until this is non-zero
+   //int32  numMasterInARow[NUM_CORES]; //NOTE(review): disabled here, yet CoreLoop.c reads and writes it
+   GateStruc       *workStealingGates[ NUM_CORES ]; //concurrent work-steal
+   int32            workStealingLock; //taken by compare-and-swap in the gate-protected stealer
+   
+   int32            numVPsCreated; //gives ordering to processor creation
+
+      //=========== MEASUREMENT STUFF =============
+       IntervalProbe   **intervalProbes;
+       PrivDynArrayInfo *dynIntervalProbesInfo;
+       HashTable        *probeNameHashTbl;
+       int32             masterCreateProbeID;
+       float64           createPtInSecs;
+       Histogram       **measHists;
+       PrivDynArrayInfo *measHistsInfo;
+       #ifdef MEAS__TIME_PLUGIN
+       Histogram       *reqHdlrLowTimeHist;
+       Histogram       *reqHdlrHighTimeHist;
+       #endif
+       #ifdef MEAS__TIME_MALLOC
+       Histogram       *mallocTimeHist;
+       Histogram       *freeTimeHist;
+       #endif
+       #ifdef MEAS__TIME_MASTER_LOCK
+       Histogram       *masterLockLowTimeHist;
+       Histogram       *masterLockHighTimeHist;
+       #endif
+       #ifdef MEAS__TIME_2011_SYS
+       TSCountLowHigh   startMaster;
+       uint64           totalMasterCycles;
+       uint32           numMasterAnimations;
+       TSCountLowHigh   startReqHdlr;
+       uint64           totalPluginCycles;
+       uint32           numPluginAnimations;
+       uint64           cyclesTillStartMasterLoop;
+       TSCountLowHigh   endMasterLoop;
+       #endif
+      //==========================================
+ }
+MasterEnv;
+
+//=========================  Extra Stuff Data Strucs  =======================
+typedef struct
+ {
+
+ }
+VMSExcp;
+
+struct _GateStruc
+ {
+   int32 gateClosed;      //stealer sets TRUE around its read of the victim's queue; coreLoop busy-waits while set
+   int32 preGateProgress; //coreLoop increments this each pass, just before checking the gate
+   int32 waitProgress;    //coreLoop copies preGateProgress here when it finds the gate closed
+   int32 exitProgress;    //coreLoop copies preGateProgress here after it has read its own queue
+ };
+//GateStruc
+
+//=======================  OS Thread related  ===============================
+
+void * coreLoop( void *paramsIn );  //standard PThreads fn prototype
+void * coreLoop_Seq( void *paramsIn );  //standard PThreads fn prototype
+void masterLoop( void *initData, SlaveVP *masterVP );
+
+
+typedef struct
+ {
+   void           *endThdPt;
+   unsigned int    coreNum;
+ }
+ThdParams;
+
+pthread_t       coreLoopThdHandles[ NUM_CORES ];  //pthread's virt-procr state
+ThdParams      *coreLoopThdParams [ NUM_CORES ];
+pthread_mutex_t suspendLock;
+pthread_cond_t  suspend_cond;
+
+
+
+//=============================  Global Vars ================================
+
+volatile MasterEnv      *_VMSMasterEnv __align_to_cacheline__;
+
+
+
+
+//=========================  Function Prototypes  ===========================
+
+
+//========== Setup and shutdown ==========
+void
+VMS_int__init();
+
+/*Fix seed-procr creation -- put box around language, have lang register stuff
+ *       with VMS.
+ *       Have main program explicitly INIT Lang! -- makes more sense to
+ *       C programmers -- makes it clear that there's a transition.
+ *(might need to have the pthreads remain waiting for
+ *       cond until work is scheduled)
+ *Have main do call to tell language to perform work -- as was done with DKU
+ *
+ *Ex: "HWSim__run_a_simulation(netlist, paramBag);"
+ *       "processID = SSR__run_program(seed_fn, seedData); "
+ *       "SSR__Wait_for_program_to_end(processID);"
+ *       "SSR__run_program_and_wait_till_it_ends(seed_fn, seedData);"
+ *
+ *       This allows multiple languages to be started, and programs run in
+ *       several, overlapped, or one program to be run that uses multiple langs..?
+ *       So, each program is in separate directory:
+ *           "HWSim_ArchDef__PingPong"  "SSR_Program__Blocked_Matrix_Mult"
+ *
+ *       Those programs can talk to each other, via VMS, by handles they each
+ *       return
+ *       "processIDs[0] = SSR__run_program(seed_fn1, seedData1);"
+ *       "processIDs[1] = SSR__run_program(seed_fn2, seedData2);"
+ *       "SSR__link_programs(processIDs, 2);"
+ *or even
+ *       "processIDs[0] = Vthread__run_program(seed_fn1, seedData1);"
+ *       "processIDs[1] = SSR__run_program(seed_fn2, seedData2);"
+ *       "VMS__link_programs(processIDs, 2);"
+ *       Then, the programs just know they sync with other prog, but use own
+ *       lang's sync constructs -- VMS uses message system to establish tie-pt,
+ *       each lang defines what a tie-point means to it..  (work with the
+ *       diff semantics?)  */
+void
+VMS_WL__start_the_work_then_wait_until_done();
+
+void
+VMS_int__shutdown();
+
+void
+VMS_int__cleanup_at_end_of_shutdown();
+
+
+//==============    ===============
+
+inline SlaveVP *
+VMS_int__create_procr( TopLevelFnPtr fnPtr, void *dataParam );
+
+inline void
+VMS_int__point_slave_to_Fn( SlaveVP *slaveVP, TopLevelFnPtr fnPtr,
+                            void    *dataParam);
+
+void
+VMS_int__save_return_addr_into_ptd_to_loc(void *ptrToReturnAddrHoldingLoc);
+
+void
+VMS_int__write_return_addr_from_ptd_to_loc(void *ptrToReturnAddrHoldingLoc);
+
+void
+VMS_int__dissipate_procr( SlaveVP *procrToDissipate );
+
+   //Use this to create processor inside entry point & other places outside
+   // the VMS system boundary (IE, not run in slave nor Master)
+SlaveVP *
+VMS_ext__create_procr( TopLevelFnPtr fnPtr, void *dataParam );
+
+void
+VMS_ext__dissipate_procr( SlaveVP *procrToDissipate );
+
+void
+VMS_PI__throw_exception( char *msgStr, SlaveVP *reqstPr, VMSExcp *excpData );
+
+void *
+VMS_WL__give_sem_env_for( SlaveVP *animPr );
+
+//==============  Request Related  ===============
+
+void
+VMS_int__suspend_procr( SlaveVP *callingPr );
+
+inline void
+VMS_WL__add_sem_request_in_mallocd_VMSReqst( void *semReqData, SlaveVP *callingPr );
+
+inline void
+VMS_WL__send_sem_request( void *semReqData, SlaveVP *callingPr );
+
+void
+VMS_WL__send_create_procr_req( void *semReqData, SlaveVP *reqstingPr );
+
+void inline
+VMS_WL__send_dissipate_req( SlaveVP *prToDissipate );
+
+inline void
+VMS_WL__send_VMSSem_request( void *semReqData, SlaveVP *callingPr );
+
+VMSReqst *
+VMS_PI__take_next_request_out_of( SlaveVP *procrWithReq );
+
+inline void *
+VMS_PI__take_sem_reqst_from( VMSReqst *req );
+
+void inline
+VMS_PI__handle_VMSSemReq( VMSReqst *req, SlaveVP *requestingPr, void *semEnv,
+                       ResumeVPFnPtr resumePrFnPtr );
+
+//======================== MEASUREMENT ======================
+uint64
+VMS_WL__give_num_plugin_cycles();
+uint32
+VMS_WL__give_num_plugin_animations();
+
+
+
+#include "VMS__HW_dependent.h"
+#include "probes.h"
+#include "vutilities.h"
+
+#endif	/* _VMS_H */
+
diff -r bc4cb994f114 -r eaf7e4c58c9e VMS__HW_dependent.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VMS__HW_dependent.c	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,47 @@
+/*
+ * This File contains all hardware dependent C code.
+ */
+
+
+#include "VMS.h"
+
+/*Set up the stack with __cdecl structure on it
+ * Except doing a trick for 64 bits, where put top-level fn pointer on
+ * stack, then call an assembly helper that copies it into a reg and
+ * jumps to it.  So, set the resumeInstrPtr to the helper-assembly.
+ *No need to save registers on old stack frame, because there's no old
+ * animator state to return to
+ */
+void
+VMS_int__point_slave_to_Fn( SlaveVP *slaveVP, TopLevelFnPtr fnPtr,
+                            void    *dataParam)
+ { void  *stackPtr;
+
+// Start of Hardware dependent part
+    //Set slave's instr pointer to a helper Fn that copies params from stack
+   slaveVP->resumeInstrPtr  = (TopLevelFnPtr)&startUpTopLevelFn;
+
+    //fnPtr takes two params -- void *dataParam & void *animProcr
+    // Stack grows *down*, so start it at highest stack addr, minus room
+    // for 2 params + return addr.  The byte arithmetic is done on a char*
+    // because arithmetic on a void* is a compiler extension.
+   stackPtr =
+     (char *)slaveVP->startOfStack + VIRT_PROCR_STACK_SIZE - 4*sizeof(void*);
+
+    //setup __cdecl on stack
+    //Normally, return Addr is in loc pointed to by stackPtr, but doing a
+    // trick for 64 bit arch, where put ptr to top-level fn there instead,
+    // and set resumeInstrPtr to a helper-fn that copies the top-level
+    // fn ptr and params into registers.
+    //Then, dataParam is at stackPtr + 8 bytes, & animating SlaveVP above
+   *((SlaveVP**)stackPtr + 2 ) = slaveVP; //rightmost param
+   *((void**)stackPtr + 1 ) = dataParam;  //next  param to left
+   *((void**)stackPtr) = (void*)fnPtr;    //copied to reg by helper Fn
+// end of Hardware dependent part
+
+      //core loop switches to the stack & frame ptrs stored in the slave, so
+      // they have to be the prepared top-of-stack location -- it is where
+      // the start-up helper reads fnPtr and dataParam from
+   slaveVP->stackPtr = stackPtr;
+   slaveVP->framePtr = stackPtr;
+ }
\ No newline at end of file
diff -r bc4cb994f114 -r eaf7e4c58c9e VMS__HW_dependent.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VMS__HW_dependent.h	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,33 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ * 
+ */
+
+#ifndef _ProcrContext_H
+#define	_ProcrContext_H
+#define _GNU_SOURCE
+
+void saveCoreLoopReturnAddr(void **returnAddress);
+
+void switchToVP(SlaveVP *nextProcr);
+
+void switchToCoreLoop(SlaveVP *nextProcr);
+
+void masterSwitchToCoreLoop(SlaveVP *nextProcr);
+
+void startUpTopLevelFn();
+
+void *asmTerminateCoreLoop(SlaveVP *currPr);
+//Empty asm stmt whose clobber list names rbx and r12-r15 (no instrs emitted)
+#define flushRegisters() \
+        asm volatile ("":::"%rbx", "%r12", "%r13","%r14","%r15")
+
+inline SlaveVP *
+create_procr_helper( SlaveVP *newPr,       TopLevelFnPtr  fnPtr,
+                     void      *dataParam, void           *stackLocs );
+
+#endif	/* _ProcrContext_H */
+
diff -r bc4cb994f114 -r eaf7e4c58c9e VMS__HW_dependent.s
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VMS__HW_dependent.s	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,167 @@
+.data
+
+
+.text
+
+//Stores the address of the coreLoopReturn label into a caller-given location
+//Argument: %rdi -- pointer to the variable that receives the address
+.globl saveCoreLoopReturnAddr
+saveCoreLoopReturnAddr:
+    movq    $coreLoopReturn, %rcx   #rcx = address of coreLoopReturn label
+    movq    %rcx, (%rdi)           #write that address to the pointed-to loc
+    ret
+
+
+//Trick for 64 bit arch -- moves the args into the argument regs, then jumps
+// to the top-level function, whose address sits where the stack-ptr points
+.globl startUpTopLevelFn
+startUpTopLevelFn:
+    movq    %rdi      , %rsi #2nd arg = value in rdi on entry (switchToVP's arg)
+    movq    0x08(%rsp), %rdi #1st arg = value stored 8 bytes above stack-ptr
+    movq    (%rsp)    , %rax #top-level function's addr, stored at stack-ptr
+    jmp     *%rax            #jump (not call) to the top-level function
+
+//Switches from CoreLoop to a VP -- either a normal VP or the Master Loop:
+//switch to virt procr's stack and frame ptr then jump to virt procr fn
+/* SlaveVP  offsets:
+ * 0x10  stackPtr
+ * 0x18 framePtr
+ * 0x20 resumeInstrPtr
+ * 0x30 coreLoopFramePtr
+ * 0x38 coreLoopStackPtr
+ *
+ * _VMSMasterEnv  offsets:
+ * 0x48 coreLoopReturnPt
+ * 0x54 masterLock
+ */
+.globl switchToVP
+switchToVP:
+    #SlaveVP in %rdi
+    movq    %rsp      , 0x38(%rdi)   #save core loop stack pointer
+    movq    %rbp      , 0x30(%rdi)   #save core loop frame pointer
+    movq    0x10(%rdi), %rsp         #load the VP's stack pointer
+    movq    0x18(%rdi), %rbp         #load the VP's frame pointer
+    movq    0x20(%rdi), %rax         #get the VP's resumeInstrPtr
+    jmp     *%rax                    #jmp to VP
+coreLoopReturn:                      #label whose addr saveCoreLoopReturnAddr stores
+    ret
+
+    
+//Switches from a VP back to the core loop; stores VPReturn as resume address
+/* SlaveVP  offsets:
+ * 0x10  stackPtr
+ * 0x18 framePtr
+ * 0x20 resumeInstrPtr
+ * 0x30 coreLoopFramePtr
+ * 0x38 coreLoopStackPtr
+ *
+ * _VMSMasterEnv  offsets:
+ * 0x48 coreLoopReturnPt
+ * 0x54 masterLock
+ */
+.globl switchToCoreLoop
+switchToCoreLoop:
+    #SlaveVP in %rdi
+    movq    $VPReturn , 0x20(%rdi)   #resumeInstrPtr = VPReturn label below
+    movq    %rsp      , 0x10(%rdi)   #save the VP's stack pointer
+    movq    %rbp      , 0x18(%rdi)   #save the VP's frame pointer
+    movq    0x38(%rdi), %rsp         #load core loop's stack pointer
+    movq    0x30(%rdi), %rbp         #load core loop's frame pointer
+    movq    $_VMSMasterEnv, %rcx     #address of the _VMSMasterEnv variable
+    movq    (%rcx)    , %rcx         #the pointer value it holds
+    movq    0x48(%rcx), %rax         #get coreLoopReturnPt (offset table above)
+    jmp     *%rax                    #jmp to CoreLoop
+VPReturn:                            #a later switchToVP jumps here to resume
+    ret
+
+
+
+//Switches from master to core loop; stores MasterReturn as resume address
+//Writes 0 into masterLock before jumping, so the next MasterLoop can run
+/* SlaveVP  offsets:
+ * 0x10  stackPtr
+ * 0x18 framePtr
+ * 0x20 resumeInstrPtr
+ * 0x30 coreLoopFramePtr
+ * 0x38 coreLoopStackPtr
+ *
+ * _VMSMasterEnv  offsets:
+ * 0x48 coreLoopReturnPt
+ * 0x54 masterLock
+ */
+.globl masterSwitchToCoreLoop
+masterSwitchToCoreLoop:
+    #SlaveVP in %rdi
+    movq    $MasterReturn, 0x20(%rdi)   #resumeInstrPtr = MasterReturn label
+    movq    %rsp      , 0x10(%rdi)   #save the master's stack pointer
+    movq    %rbp      , 0x18(%rdi)   #save the master's frame pointer
+    movq    0x38(%rdi), %rsp         #load core loop's stack pointer
+    movq    0x30(%rdi), %rbp         #load core loop's frame pointer
+    movq    $_VMSMasterEnv, %rcx     #address of the _VMSMasterEnv variable
+    movq    (%rcx)    , %rcx         #the pointer value it holds
+    movq    0x48(%rcx), %rax         #get coreLoopReturnPt (offset table above)
+    movl    $0x0      , 0x54(%rcx)   #release lock: 32-bit store of 0
+    jmp     *%rax                    #jmp to CoreLoop
+MasterReturn:                        #a later switchToVP jumps here to resume
+    ret
+
+
+//Switch to terminateCoreLoop
+//therefore switch to coreLoop context from master context
+// no need to call because the stack is already set up for switchVP
+// and virtPr is in %rdi
+// and both functions have the same argument.
+// do not save registers of VP because this function will never return
+/* SlaveVP  offsets:
+ * 0x10  stackPtr
+ * 0x18 framePtr
+ * 0x20 resumeInstrPtr
+ * 0x30 coreLoopFramePtr
+ * 0x38 coreLoopStackPtr
+ *
+ * _VMSMasterEnv  offsets (not used by this routine):
+ * 0x48 coreLoopReturnPt
+ * 0x54 masterLock
+ */
+.globl asmTerminateCoreLoop
+asmTerminateCoreLoop:
+    #SlaveVP in %rdi
+    movq    0x38(%rdi), %rsp         #load core loop's stack pointer
+    movq    0x30(%rdi), %rbp         #load core loop's frame pointer
+    movq    $terminateCoreLoop, %rax #address of the C fn terminateCoreLoop
+    jmp     *%rax                    #jmp (not call), %rdi is still the arg
+
+
+/*Sequential-version variant: discards the current stack, calls the dissipate
+ * function on the core loop's stack, then returns directly from coreLoop.
+ * NOTE(review): callee is spelled VMS__dissipate_procr -- confirm that symbol
+ * still exists next to the VMS_int__/VMS_ext__ variants in the C sources */
+.globl asmTerminateCoreLoopSeq
+asmTerminateCoreLoopSeq:
+    #SlaveVP in %rdi
+    movq    0x38(%rdi), %rsp         #load core loop's stack pointer
+    movq    0x30(%rdi), %rbp         #load core loop's frame pointer
+    #argument is in %rdi
+    call    VMS__dissipate_procr
+    movq    %rbp      , %rsp        #goto the coreLoops stack
+    pop     %rbp        #restore the old framepointer
+    ret                 #return from core loop
+    
+
+//Copies the caller's return addr into the loc pointed to by rdi.  The return
+// addr is read from 0x8(%rbp); .globl must name this label for C to link it
+.globl VMS_int__save_return_addr_into_ptd_to_loc
+VMS_int__save_return_addr_into_ptd_to_loc:
+    movq 0x8(%rbp),     %rax  #get ret address, rbp is the same as in the calling function
+    movq     %rax,     (%rdi) #write ret addr to endInstrAddr field
+    ret
+
+
+//Overwrites the caller's return addr, at 0x8(%rbp), with the addr held in
+// the loc pointed to by rdi.  .globl must name this label for C to link it
+.globl VMS_int__write_return_addr_from_ptd_to_loc
+VMS_int__write_return_addr_from_ptd_to_loc:
+    movq    (%rdi),    %rax      #get return addr
+    movq      %rax,    0x8(%rbp) #write return addr to the stack of the caller
+    ret
+
diff -r bc4cb994f114 -r eaf7e4c58c9e VMS__PI.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VMS__PI.c	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2010  OpenSourceStewardshipFoundation
+ *
+ * Licensed under BSD
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <malloc.h>
+#include <inttypes.h>
+#include <sys/time.h>
+
+#include "VMS.h"
+
+
+/*Pops the request at the head of the slave's request list and returns it
+ * -- returns NULL, leaving the list untouched, when the list is empty */
+VMSReqst *
+VMS_PI__take_next_request_out_of( SlaveVP *procrWithReq )
+ { VMSReqst *headReq = procrWithReq->requests;
+
+   if( headReq != NULL )
+    {    //unlink the head, making its successor the new head
+      procrWithReq->requests = headReq->nextReqst;
+    }
+   return headReq;
+ }
+
+
+inline void *  //hands back the semantic-layer payload carried by a request
+VMS_PI__take_sem_reqst_from( VMSReqst *req )
+ { void *semPayload = req->semReqData;
+   return semPayload;
+ }
+
+
+
+/* This is for OS requests and VMS infrastructure requests, such as to create
+ *  a probe -- a probe is inside the heart of VMS-core, it's not part of any
+ *  language -- but it's also a semantic thing that's triggered from and used
+ *  in the application.. so it crosses abstractions..  so, need some special
+ *  pattern here for handling such requests.
+ * Doing this just like it were a second language sharing VMS-core.
+ * 
+ * This is called from the language's request handler when it sees a request
+ *  of type VMSSemReq
+ *
+ * TODO: Later change this, to give probes their own separate plugin & have
+ *  VMS-core steer the request to appropriate plugin
+ * Do the same for OS calls -- look later at it..
+ */
+void inline
+VMS_PI__handle_VMSSemReq( VMSReqst *req, SlaveVP *requestingPr, void *semEnv,
+                       ResumeVPFnPtr resumePrFnPtr )
+ { VMSSemReq     *probeReq = req->semReqData;
+   IntervalProbe *probe;
+
+      //build the probe record, taking a duplicate of the requested name
+   probe          = VMS_int__malloc( sizeof(IntervalProbe) );
+   probe->nameStr = VMS_int__strDup( probeReq->nameStr );
+   probe->hist    = NULL;
+   probe->schedChoiceWasRecorded = FALSE;
+
+      //This runs in masterVP, so no race-condition worries
+   probe->probeID =
+             addToDynArray( probe, _VMSMasterEnv->dynIntervalProbesInfo );
+
+      //hand the probe back through the requester, then have it resumed
+   requestingPr->dataRetFromReq = probe;
+
+   (*resumePrFnPtr)( requestingPr, semEnv );
+ }
+
+
+/*Later, improve this -- for now, just prints msgStr to stdout and exits the
+ * application with status 1.  reqstPr and excpData are not used yet.
+ */
+void
+VMS_PI__throw_exception( char *msgStr, SlaveVP *reqstPr, VMSExcp *excpData )
+ {
+   printf("%s",msgStr);
+   fflush(stdout);  //flush the stream just written; fflush(stdin) is undefined
+   exit(1);
+ }
+
+
diff -r bc4cb994f114 -r eaf7e4c58c9e VMS__WL.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VMS__WL.c	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2010  OpenSourceStewardshipFoundation
+ *
+ * Licensed under BSD
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <malloc.h>
+#include <inttypes.h>
+#include <sys/time.h>
+
+#include "VMS.h"
+
+
+/*Anticipating multi-tasking -- animPr is accepted but not consulted yet,
+ * every caller is handed the one semantic env held in the master env */
+void *
+VMS_WL__give_sem_env_for( SlaveVP *animPr )
+ { void *sharedSemEnv = _VMSMasterEnv->semanticEnv;
+   return sharedSemEnv;
+ }
+
+
+/*For this implementation of VMS, it may not make much sense to have the
+ * system of requests for creating a new processor done this way.. but over
+ * the scope of single-master, multi-master, mult-tasking, OS-implementing,
+ * distributed-memory, and so on, this gives VMS implementation a chance to
+ * do stuff before suspend, in the AppVP, and in the Master before the plugin
+ * is called, as well as in the lang-lib before this is called, and in the
+ * plugin.  So, this gives both VMS and language implementations a chance to
+ * intercept at various points and do order-dependent stuff.
+ *Having a standard VMSNewPrReqData struc allows the language to create and
+ * free the struc, while VMS knows how to get the newPr if it wants it, and
+ * it lets the lang have lang-specific data related to creation transported
+ * to the plugin.
+ */
+void
+VMS_WL__send_create_procr_req( void *semReqData, SlaveVP *reqstingPr )
+ { VMSReqst createCarrier;  //carrier is a local of this call
+      //push carrier on front of slave's request list, chained to older ones
+   createCarrier.nextReqst  = reqstingPr->requests;
+   reqstingPr->requests     = &createCarrier;
+   createCarrier.reqType    = createReq;
+   createCarrier.semReqData = semReqData;
+
+   VMS_int__suspend_procr( reqstingPr );
+ }
+
+
+/*
+ *This adds a request to dissipate, then suspends the processor so that the
+ * request handler will receive the request.  The request handler is what
+ * does the work of freeing memory and removing the processor from the
+ * semantic environment's data structures.
+ *The request handler also is what figures out when to shutdown the VMS
+ * system -- which causes all the core loop threads to die, and returns from
+ * the call that started up VMS to perform the work.
+ *
+ *This form is a bit misleading to understand if one is trying to figure out
+ * how VMS works -- it looks like a normal function call, but inside it
+ * sends a request to the request handler and suspends the processor, which
+ * jumps out of the VMS__dissipate_procr function, and out of all nestings
+ * above it, transferring the work of dissipating to the request handler,
+ * which then does the actual work -- causing the processor that animated
+ * the call of this function to disappear and the "hanging" state of this
+ * function to just poof into thin air -- the virtual processor's trace
+ * never returns from this call, but instead the virtual processor's trace
+ * gets suspended in this call and all the virt processor's state disap-
+ * pears -- making that suspend the last thing in the virt procr's trace.
+ */
+void
+VMS_WL__send_dissipate_req( SlaveVP *procrToDissipate )
+ { VMSReqst req;
+
+   req.reqType                = dissipate;
+   req.semReqData             = NULL; //no payload -- don't leave it garbage
+   req.nextReqst              = procrToDissipate->requests;
+   procrToDissipate->requests = &req;
+   VMS_int__suspend_procr( procrToDissipate ); //request goes out on suspend
+ }
+
+
+
+/*This call's name indicates that request is malloc'd -- so req handler
+ * has to free any extra requests tacked on before a send, using this.
+ *
+ * This inserts the semantic-layer's request data into standard VMS carrier
+ * request data-struct that is mallocd.  The sem request doesn't need to
+ * be malloc'd if this is called inside the same call chain before the
+ * send of the last request is called.
+ *
+ *The request handler has to call VMS__free_VMSReq for any of these
+ */
+inline void
+VMS_WL__add_sem_request_in_mallocd_VMSReqst( void *semReqData,
+                                          SlaveVP *callingPr )
+ { VMSReqst *carrier = VMS_int__malloc( sizeof(VMSReqst) );
+      //fill in the carrier, then push it on front of the slave's req list
+      // -- no suspend in here, the carrier just waits on the list
+   carrier->semReqData  = semReqData;
+   carrier->reqType     = semantic;
+   carrier->nextReqst   = callingPr->requests;
+   callingPr->requests  = carrier;
+ }
+
+/*This inserts the semantic-layer's request data into standard VMS carrier
+ * request data-struct is allocated on stack of this call & ptr to it sent
+ * to plugin
+ *Then it does suspend, to cause request to be sent.
+ */
+inline void
+VMS_WL__send_sem_request( void *semReqData, SlaveVP *callingPr )
+ { VMSReqst semCarrier;
+
+      //link ahead of any requests added earlier, then publish as list head
+   semCarrier.nextReqst  = callingPr->requests;
+   semCarrier.semReqData = semReqData;
+   semCarrier.reqType    = semantic;
+   callingPr->requests   = &semCarrier;
+   VMS_int__suspend_procr( callingPr );  //the suspend is what sends it
+ }
+
+
+inline void
+VMS_WL__send_VMSSem_request( void *semReqData, SlaveVP *callingPr )
+ { VMSReqst vmsCarrier;
+
+      //stack-allocated carrier, typed VMSSemantic, pushed on the list head
+   vmsCarrier.semReqData = semReqData;
+   vmsCarrier.reqType    = VMSSemantic;
+   vmsCarrier.nextReqst  = callingPr->requests; //grab any preceding requests
+   callingPr->requests   = &vmsCarrier;
+   VMS_int__suspend_procr( callingPr );
+ }
+
+
diff -r bc4cb994f114 -r eaf7e4c58c9e VMS__int.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VMS__int.c	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,155 @@
+/*
+ * Copyright 2010  OpenSourceStewardshipFoundation
+ *
+ * Licensed under BSD
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <malloc.h>
+#include <inttypes.h>
+#include <sys/time.h>
+
+#include "VMS.h"
+
+
+inline SlaveVP *
+VMS_int__create_procr( TopLevelFnPtr fnPtr, void *dataParam )
+ { SlaveVP *newPr;
+   void      *stackLocs;
+
+      //both the VP struct and its stack come from the VMS allocator
+   newPr      = VMS_int__malloc( sizeof(SlaveVP) );
+   stackLocs  = VMS_int__malloc( VIRT_PROCR_STACK_SIZE );
+   if( newPr == NULL || stackLocs == NULL )  //neither may reach the helper
+    { perror("VMS__malloc stack"); exit(1); }
+
+   _VMSMasterEnv->numSlaves += 1;  //one more live processor to count down
+   return create_procr_helper( newPr, fnPtr, dataParam, stackLocs );
+ }
+
+/* "ext" designates that it's for use outside the VMS system -- should only
+ * be called from main thread or other thread -- never from code animated by
+ * a VMS virtual processor.  Uses plain malloc, and exits if that fails.
+ */
+inline SlaveVP *
+VMS_ext__create_procr( TopLevelFnPtr fnPtr, void *dataParam )
+ { SlaveVP *newPr;
+   char      *stackLocs;
+
+   newPr      = malloc( sizeof(SlaveVP) );
+   stackLocs  = malloc( VIRT_PROCR_STACK_SIZE );
+   if( newPr == NULL || stackLocs == NULL )  //either allocation can fail
+    { perror("malloc stack"); exit(1); }
+
+   return create_procr_helper( newPr, fnPtr, dataParam, stackLocs );
+ }
+
+
+//===========================================================================
+/*there is a label inside this function -- save the addr of this label in
+ * the callingPr struc, as the pick-up point from which to start the next
+ * work-unit for that procr.  If turns out have to save registers, then
+ * save them in the procr struc too.  Then do assembly jump to the CoreLoop's
+ * "done with work-unit" label.  The procr struc is in the request in the
+ * slave that animated the just-ended work-unit, so all the state is saved
+ * there, and will get passed along, inside the request handler, to the
+ * next work-unit for that procr.
+ */
+void
+VMS_int__suspend_procr( SlaveVP *animatingPr )
+ { 
+      //Suspends the calling virt procr by switching back to the core loop.
+      //The request to master will cause this suspended virt procr to get
+      // scheduled again at some future point -- to resume, core loop jumps
+      // to the resume point that switchToCoreLoop stores in resumeInstrPtr
+      // (no label in here any more), which "returns" from that call, below.
+   //animatingPr->resumeInstrPtr = &&ResumePt;
+
+      //return ownership of the virt procr and sched slot to Master virt pr
+   animatingPr->schedSlot->workIsDone = TRUE;
+
+   //===========================  Measurement stuff ========================
+   #ifdef MEAS__TIME_STAMP_SUSP
+      //record time stamp: compare to time-stamp recorded below
+   saveLowTimeStampCountInto( animatingPr->preSuspTSCLow );
+   #endif
+   //=======================================================================
+      //control leaves here, and comes back when this procr is next resumed
+   switchToCoreLoop(animatingPr);
+   flushRegisters();
+
+   //=======================================================================
+
+   #ifdef MEAS__TIME_STAMP_SUSP
+      //NOTE: only take low part of count -- do sanity check when take diff
+   saveLowTimeStampCountInto( animatingPr->postSuspTSCLow );
+   #endif
+
+   return;
+ }
+
+
+/* "ext" designates that it's for use outside the VMS system -- should only
+ * be called from main thread or other thread -- never from code animated by
+ * a SlaveVP, nor from a masterVP.
+ *
+ *Use this version to dissipate VPs created outside the VMS system.
+ */
+void
+VMS_ext__dissipate_procr( SlaveVP *procrToDissipate )
+ { void *stackLocs = procrToDissipate->startOfStack;
+      //Only the stack and the SlaveVP struc are released here.  dataParam
+      // was given to the processor, so it should either have been alloc'd
+      // with VMS__malloc, or be freed by the level above animPr.
+      //For that reason the data param should not be stack-allocated -- no
+      // guarantee, in general, that the creator outlives the ones it creates.
+      //Plain free() is used, pairing with the plain malloc() of ext create.
+   free( stackLocs );
+   free( procrToDissipate );
+ }
+
+
+
+/*This must be called by the request handler plugin -- it cannot be called
+ * from the semantic library "dissipate processor" function -- instead, the
+ * semantic layer has to generate a request, and the plug-in calls this
+ * function.
+ *The reason is that this frees the virtual processor's stack -- which is
+ * still in use inside semantic library calls!
+ *
+ *This frees or recycles all the state owned by and comprising the VMS
+ * portion of the animating virtual procr.  The request handler must first
+ * free any semantic data created for the processor that didn't use the
+ * VMS_malloc mechanism.  Then it calls this, which first asks the malloc
+ * system to disown any state that did use VMS_malloc, and then frees the
+ * statck and the processor-struct itself.
+ *If the dissipated processor is the sole (remaining) owner of VMS__malloc'd
+ * state, then that state gets freed (or sent to recycling) as a side-effect
+ * of dis-owning it.
+ */
+void
+VMS_int__dissipate_procr( SlaveVP *animatingPr )
+ { void *stackLocs;
+
+//TODO: implement VMS__malloc system, including "give up ownership" -- the
+// plan is to dis-own all locations owned by this processor here, causing
+// to be freed any locations that it is (was) sole owner of
+
+      //one fewer live slave -- when the count reaches zero there is no more
+      // work, so the one dissipating last starts the shutdown
+   _VMSMasterEnv->numSlaves -= 1;
+   if( _VMSMasterEnv->numSlaves == 0 ) VMS_int__shutdown();
+
+      //dataParam was given to the processor, so it should either have been
+      // alloc'd with VMS__malloc, or be freed by the level above animPr --
+      // which leaves just the stack and the SlaveVP struc to free here.
+      //Note, should not stack-allocate initial data -- no guarantee, in
+      // general that creating processor will outlive ones it creates.
+   stackLocs = animatingPr->startOfStack;
+   VMS_int__free( stackLocs );
+   VMS_int__free( animatingPr );
+ }
+
+
diff -r bc4cb994f114 -r eaf7e4c58c9e VMS__startup_and_shutdown.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VMS__startup_and_shutdown.c	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,458 @@
+/*
+ * Copyright 2010  OpenSourceStewardshipFoundation
+ *
+ * Licensed under BSD
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <malloc.h>
+#include <inttypes.h>
+#include <sys/time.h>
+
+#include "VMS.h"
+#include "VMS__HW_dependent.h"
+
+
+#define thdAttrs NULL
+
+//===========================================================================
+void
+shutdownFn( void *dummy, SlaveVP *dummy2 );
+
+SchedSlot **
+create_sched_slots();
+
+void
+create_masterEnv();
+
+void
+create_the_coreLoop_OS_threads();
+
+MallocProlog *
+create_free_list();
+
+void
+endOSThreadFn( void *initData, SlaveVP *animatingPr );
+
+pthread_mutex_t suspendLock = PTHREAD_MUTEX_INITIALIZER;
+pthread_cond_t  suspend_cond  = PTHREAD_COND_INITIALIZER;
+
+//===========================================================================
+
+/*Setup has two phases:
+ * 1) Semantic layer first calls init_VMS, which creates masterEnv, and puts
+ *    the master virt procr into the work-queue, ready for first "call"
+ * 2) Semantic layer then does its own init, which creates the seed virt
+ *    procr inside the semantic layer, ready to schedule it when
+ *    asked by the first run of the masterLoop.
+ *
+ *This part is bit weird because VMS really wants to be "always there", and
+ * have applications attach and detach..  for now, this VMS is part of
+ * the app, so the VMS system starts up as part of running the app.
+ *
+ *The semantic layer is isolated from the VMS internals by making the
+ * semantic layer do setup to a state that it's ready with its
+ * initial virt procrs, ready to schedule them to slots when the masterLoop
+ * asks.  Without this pattern, the semantic layer's setup would
+ * have to modify slots directly to assign the initial virt-procrs, and put
+ * them into the readyToAnimateQ itself, breaking the isolation completely.
+ *
+ * 
+ *The semantic layer creates the initial virt procr(s), and adds its
+ * own environment to masterEnv, and fills in the pointers to
+ * the requestHandler and slaveScheduler plug-in functions
+ */
+
+/*This allocates the VMS data structures and populates the master virt
+ * procrs and the master environment.  Nothing is returned -- the master
+ * environment is left in the global _VMSMasterEnv.
+ */
+void
+VMS_int__init()
+ {
+      //both build variants start by making the master environment
+   create_masterEnv();
+#ifdef SEQUENTIAL
+   flushRegisters();  //? not sure why here -- merten added it..?
+#else
+      //threaded build: also make the OS threads that run the core loops
+   create_the_coreLoop_OS_threads();
+#endif
+ }
+
+void
+create_masterEnv()
+ { MasterEnv       *masterEnv;
+   VMSQueueStruc **readyToAnimateQs;
+   int              coreIdx;
+   SlaveVP      **masterVPs;
+   SchedSlot     ***allSchedSlots; //ptr to array of ptrs
+
+      //Make the master env, which holds everything else -- exit if it fails
+   _VMSMasterEnv = malloc( sizeof(MasterEnv) );
+   if( _VMSMasterEnv == NULL ) { perror("malloc master env"); exit(1); }
+
+        //Very first thing put into the master env is the free-list, seeded
+        // with a massive initial chunk of memory.
+        //After this, all other mallocs are VMS__malloc.
+   _VMSMasterEnv->freeListHead        = VMS_ext__create_free_list();
+
+
+   //============================= MEASUREMENT STUFF ========================
+   #ifdef MEAS__TIME_MALLOC
+   _VMSMasterEnv->mallocTimeHist  = makeFixedBinHistExt( 100, 0, 30,
+                                                       "malloc_time_hist");
+   _VMSMasterEnv->freeTimeHist  = makeFixedBinHistExt( 100, 0, 30,
+                                                       "free_time_hist");
+   #endif
+   #ifdef MEAS__TIME_PLUGIN
+   _VMSMasterEnv->reqHdlrLowTimeHist  = makeFixedBinHistExt( 100, 0, 200,
+                                                     "plugin_low_time_hist");
+   _VMSMasterEnv->reqHdlrHighTimeHist  = makeFixedBinHistExt( 100, 0, 200,
+                                                    "plugin_high_time_hist");
+   #endif
+   //========================================================================
+
+   //===================== Only VMS__malloc after this ====================
+   masterEnv     = (MasterEnv*)_VMSMasterEnv;
+   
+      //Make a readyToAnimateQ for each core loop
+   readyToAnimateQs = VMS_int__malloc( NUM_CORES * sizeof(VMSQueueStruc *) );
+   masterVPs        = VMS_int__malloc( NUM_CORES * sizeof(SlaveVP *) );
+
+      //One array per core, NUM_SCHED_SLOTS in array, core's masterVP scheds all
+   allSchedSlots    = VMS_int__malloc( NUM_CORES * sizeof(SchedSlot **) );
+      //NOTE(review): VMS_int__create_procr (called in the loop below) adds
+      // 1 to numSlaves per masterVP -- confirm shut-down count expects that
+   _VMSMasterEnv->numSlaves = 0;  //used to detect shut-down condition
+   _VMSMasterEnv->numVPsCreated = 0;  //used by create procr to set ID
+   for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
+    {    
+      readyToAnimateQs[ coreIdx ] = makeVMSQ();
+      
+         //Q: should give masterVP core-specific info as its init data?
+      masterVPs[ coreIdx ] = VMS_int__create_procr( (TopLevelFnPtr)&masterLoop, (void*)masterEnv );
+      masterVPs[ coreIdx ]->coreAnimatedBy = coreIdx;
+      allSchedSlots[ coreIdx ] = create_sched_slots(); //makes for one core
+      _VMSMasterEnv->numMasterInARow[ coreIdx ] = 0;
+      _VMSMasterEnv->workStealingGates[ coreIdx ] = NULL;
+    }
+   _VMSMasterEnv->readyToAnimateQs = readyToAnimateQs;
+   _VMSMasterEnv->masterVPs        = masterVPs;
+   _VMSMasterEnv->masterLock       = UNLOCKED;
+   _VMSMasterEnv->allSchedSlots    = allSchedSlots;
+   _VMSMasterEnv->workStealingLock = UNLOCKED;
+
+
+      //Aug 19, 2010:  no longer need to place initial masterVP into queue
+      // because coreLoop now controls -- animates its masterVP when no work
+
+
+   //============================= MEASUREMENT STUFF ========================
+   #ifdef STATS__TURN_ON_PROBES
+   _VMSMasterEnv->dynIntervalProbesInfo =
+              makePrivDynArrayOfSize( (void***)&(_VMSMasterEnv->intervalProbes), 200);
+
+   _VMSMasterEnv->probeNameHashTbl = makeHashTable( 1000, &VMS_int__free );
+   
+      //put creation time directly into master env, for fast retrieval
+   struct timeval timeStamp;
+   gettimeofday( &(timeStamp), NULL);
+   _VMSMasterEnv->createPtInSecs =
+                           timeStamp.tv_sec +(timeStamp.tv_usec/1000000.0);
+   #endif
+   #ifdef MEAS__TIME_MASTER_LOCK
+   _VMSMasterEnv->masterLockLowTimeHist  = makeFixedBinHist( 50, 0, 2,
+                                                "master lock low time hist");
+   _VMSMasterEnv->masterLockHighTimeHist  = makeFixedBinHist( 50, 0, 100,
+                                               "master lock high time hist");
+   #endif
+   
+   MakeTheMeasHists();
+   //========================================================================
+ }
+
+SchedSlot **
+create_sched_slots()
+ { SchedSlot **slotArray, *slot;
+   int         slotIdx;
+
+      //array of ptrs first, then one separately allocated slot per entry
+   slotArray = VMS_int__malloc( NUM_SCHED_SLOTS * sizeof(SchedSlot *) );
+   for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++ )
+    {
+      slot = VMS_int__malloc( sizeof(SchedSlot) );
+         //Initial state means "handling requests done, slot needs filling"
+      slot->workIsDone         = FALSE;
+      slot->needsProcrAssigned = TRUE;
+      slotArray[ slotIdx ]     = slot;
+    }
+   return slotArray;
+ }
+
+
+void
+freeSchedSlots( SchedSlot **schedSlots )
+ { int slotIdx = 0;  //slots are released first, then the array itself
+   while( slotIdx < NUM_SCHED_SLOTS )
+    { VMS_int__free( schedSlots[ slotIdx ] );
+      slotIdx += 1;
+    }
+   VMS_int__free( schedSlots );
+ }
+
+
+void
+create_the_coreLoop_OS_threads()
+ { int coreIdx, retCode;
+
+      //The core loops must not set off until the rest of setup is done, so
+      // clear the setup-complete flag here; it is set to 1 by the
+      // start-the-work call, giving time to initialize other stuff first.
+   _VMSMasterEnv->setupComplete = 0;
+
+      //One OS thread per core, each handed its own ThdParams holding the
+      // number of the core it animates
+   for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
+    {
+      coreLoopThdParams[coreIdx] = VMS_int__malloc( sizeof(ThdParams) );
+      coreLoopThdParams[coreIdx]->coreNum = coreIdx;
+
+      retCode = pthread_create( &(coreLoopThdHandles[coreIdx]), thdAttrs,
+                                &coreLoop,
+                                (void *)(coreLoopThdParams[coreIdx]) );
+      if( retCode != 0 )
+       { printf("ERROR creating thread: %d\n", retCode);
+         exit(1);
+       }
+    }
+ }
+
+
+
+void  //stores the given request handler in the master env
+VMS_WL__register_request_handler( RequestHandler requestHandler )
+ { _VMSMasterEnv->requestHandler = requestHandler;
+ }
+
+
+void  //stores the given assigner in the master env's slaveSchedAssigner
+VMS_WL__register_sched_assigner( Sched_Assigner schedAssigner )
+ { _VMSMasterEnv->slaveSchedAssigner = schedAssigner;
+ }
+
+void  //explicit return type: implicit int is not valid since C99
+VMS_WL__register_semantic_env( void *semanticEnv )
+ { _VMSMasterEnv->semanticEnv = semanticEnv;
+ }
+
+/*This is what causes the VMS system to initialize.. then waits for it to
+ * exit.
+ * 
+ *Wrapper lib layer calls this when it wants the system to start running..
+ */
+void
+VMS_WL__start_the_work_then_wait_until_done()
+ { 
+#ifdef SEQUENTIAL
+   /*Only difference between version with an OS thread pinned to each core and
+    * the sequential version is the SEQUENTIAL part of VMS_int__init, this,
+    * and coreLoop_Seq. */
+         //Instead of un-suspending threads, just call the one and only
+         // core loop (sequential version), in the main thread.
+      coreLoop_Seq( NULL );
+      flushRegisters();
+#else
+   int coreIdx;
+      //Start the core loops running
+   
+      //tell the core loop threads that setup is complete
+      //get lock, to lock out any threads still starting up -- they'll see
+      // that setupComplete is true before entering while loop, and so never
+      // wait on the condition
+   pthread_mutex_lock(     &suspendLock );
+   _VMSMasterEnv->setupComplete = 1;
+   pthread_mutex_unlock(   &suspendLock );
+   pthread_cond_broadcast( &suspend_cond );
+   
+      //block here until every core loop thread has exited
+      //wait for all to complete
+   for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ )
+    {
+      pthread_join( coreLoopThdHandles[coreIdx], NULL );
+    }
+   
+      //NOTE: do not clean up VMS env here -- semantic layer has to have
+      // a chance to clean up its environment first, then do a call to free
+      // the Master env and rest of VMS locations
+#endif
+ }
+
+
+//TODO: look at architecting cleanest separation between request handler
+// and master loop, for dissipate, create, shutdown, and other non-semantic
+// requests.  Issue is chain: one removes requests from AppVP, one dispatches
+// on type of request, and one handles each type..  but some types require
+// action from both request handler and master loop -- maybe just give the
+// request handler calls like:  VMS__handle_X_request_type
+
+
+/*This is called by the semantic layer's request handler when it decides it's
+ * time to shut down the VMS system.  Calling this causes the core loop OS
+ * threads to exit, which unblocks the entry-point function that started up
+ * VMS, and allows it to grab the result and return to the original single-
+ * threaded application.
+ * 
+ *The _VMSMasterEnv is needed by this shut down function, so the create-seed-
+ * and-wait function has to free a bunch of stuff after it detects the
+ * threads have all died: the masterEnv, the thread-related locations,
+ * masterVP any AppVPs that might still be allocated and sitting in the
+ * semantic environment, or have been orphaned in the _VMSWorkQ.
+ * 
+ *NOTE: the semantic plug-in is expected to use VMS__malloc to get all the
+ * locations it needs, and give ownership to masterVP.  Then, they will be
+ * automatically freed.
+ *
+ *In here,create one core-loop shut-down processor for each core loop and put
+ * them all directly into the readyToAnimateQ.
+ *Note, this function can ONLY be called after the semantic environment no
+ * longer cares if AppVPs get animated after the point this is called.  In
+ * other words, this can be used as an abort, or else it should only be
+ * called when all AppVPs have finished dissipate requests -- only at that
+ * point is it sure that all results have completed.
+ */
+void
+VMS_int__shutdown()
+ { int      coreNum = 0;
+   SlaveVP *killerVP;
+
+      //one shutdown processor per core loop, written straight into that
+      // core's ready-to-animate Q -- the core loop dies when it gets one
+   while( coreNum < NUM_CORES )
+    {    //Note, this is running in the master
+      killerVP = VMS_int__create_procr( &endOSThreadFn, NULL );
+      writeVMSQ( killerVP, _VMSMasterEnv->readyToAnimateQs[coreNum] );
+      coreNum++;
+    }
+ }
+
+
+/*Am trying to be cute, avoiding IF statement in coreLoop that checks for
+ * a special shutdown procr.  Ended up with extra-complex shutdown sequence.
+ *This function has the sole purpose of setting the stack and framePtr
+ * to the coreLoop's stack and framePtr.. it does that then jumps to the
+ * core loop's shutdown point -- might be able to just call Pthread_exit
+ * from here, but am going back to the pthread's stack and setting everything
+ * up just as if it never jumped out, before calling pthread_exit.
+ *The end-point of core loop will free the stack and so forth of the
+ * processor that animates this function, (this fn is transfering the
+ * animator of the AppVP that is in turn animating this function over
+ * to core loop function -- note that this slices out a level of virtual
+ * processors).
+ */
+void
+endOSThreadFn( void *initData, SlaveVP *animatingPr )
+ {    //initData is not used -- only hands animatingPr to the asm terminator
+#ifndef SEQUENTIAL
+    asmTerminateCoreLoop(animatingPr);
+#else
+    asmTerminateCoreLoopSeq(animatingPr);
+#endif
+ }
+
+
+/*Last step of shutdown: prints and saves whatever measurement histograms
+ * were collected, then releases the free list and the master env itself.
+ *_VMSMasterEnv points at freed memory once this returns, so nothing may
+ * touch the master env afterwards.
+ *
+ *The MEAS__ sections below only print, save and free histograms.  Do NOT
+ * put per-core freeing (freeVMSQ, VMS_int__dissipate_procr, freeSchedSlots)
+ * inside them: that teardown is intentionally disabled -- see the
+ * commented-out block near the bottom -- and copies of it that used to sit
+ * in the MEAS__TIME_MASTER and MEAS__TIME_STAMP_SUSP sections referenced
+ * coreIdx, readyToAnimateQs, masterVPs and allSchedSlots, none of which is
+ * declared in this function, so enabling either measurement broke the
+ * build.
+ */
+void
+VMS_int__cleanup_at_end_of_shutdown()
+ { 
+      //Before getting rid of everything, print out any measurements made
+   forAllInDynArrayDo( _VMSMasterEnv->measHistsInfo, (DynArrayFnPtr)&printHist );
+   forAllInDynArrayDo( _VMSMasterEnv->measHistsInfo, (DynArrayFnPtr)&saveHistToFile);
+   forAllInDynArrayDo( _VMSMasterEnv->measHistsInfo, &freeHist );
+
+      //request-handler time hists: printed, saved, then freed
+   #ifdef MEAS__TIME_PLUGIN
+   printHist( _VMSMasterEnv->reqHdlrLowTimeHist );
+   saveHistToFile( _VMSMasterEnv->reqHdlrLowTimeHist );
+   printHist( _VMSMasterEnv->reqHdlrHighTimeHist );
+   saveHistToFile( _VMSMasterEnv->reqHdlrHighTimeHist );
+   freeHistExt( _VMSMasterEnv->reqHdlrLowTimeHist );
+   freeHistExt( _VMSMasterEnv->reqHdlrHighTimeHist );
+   #endif
+
+      //malloc and free time hists: printed, saved, then freed
+   #ifdef MEAS__TIME_MALLOC
+   printHist( _VMSMasterEnv->mallocTimeHist   );
+   saveHistToFile( _VMSMasterEnv->mallocTimeHist   );
+   printHist( _VMSMasterEnv->freeTimeHist     );
+   saveHistToFile( _VMSMasterEnv->freeTimeHist     );
+   freeHistExt( _VMSMasterEnv->mallocTimeHist );
+   freeHistExt( _VMSMasterEnv->freeTimeHist   );
+   #endif
+
+      //master-lock time hists are only printed
+   #ifdef MEAS__TIME_MASTER_LOCK
+   printHist( _VMSMasterEnv->masterLockLowTimeHist );
+   printHist( _VMSMasterEnv->masterLockHighTimeHist );
+   #endif
+
+      //plugin time hist is only printed
+   #ifdef MEAS__TIME_MASTER
+   printHist( _VMSMasterEnv->pluginTimeHist );
+   #endif
+
+      //the suspend-stamp measurement prints the plugin time hist as well
+   #ifdef MEAS__TIME_STAMP_SUSP
+   printHist( _VMSMasterEnv->pluginTimeHist );
+   #endif
+
+      //All the environment data has been allocated with VMS__malloc, so just
+      // free its internal big-chunk and all inside it disappear.
+/*
+   readyToAnimateQs = _VMSMasterEnv->readyToAnimateQs;
+   masterVPs        = _VMSMasterEnv->masterVPs;
+   allSchedSlots    = _VMSMasterEnv->allSchedSlots;
+   
+   for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
+    {
+      freeVMSQ( readyToAnimateQs[ coreIdx ] );
+         //master VPs were created external to VMS, so use external free
+      VMS__dissipate_procr( masterVPs[ coreIdx ] );
+      
+      freeSchedSlots( allSchedSlots[ coreIdx ] );
+    }
+   
+   VMS__free( _VMSMasterEnv->readyToAnimateQs );
+   VMS__free( _VMSMasterEnv->masterVPs );
+   VMS__free( _VMSMasterEnv->allSchedSlots );
+   
+   //============================= MEASUREMENT STUFF ========================
+   #ifdef STATS__TURN_ON_PROBES
+   freeDynArrayDeep( _VMSMasterEnv->dynIntervalProbesInfo, &VMS__free_probe);
+   #endif
+   //========================================================================
+*/
+      //These are the only two that use system free 
+   VMS_ext__free_free_list( _VMSMasterEnv->freeListHead );
+   free( (void *)_VMSMasterEnv );
+ }
+
+
+//================================
+
+
diff -r bc4cb994f114 -r eaf7e4c58c9e VMS_defs__HW_specific.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VMS_defs__HW_specific.h	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,51 @@
+/*
+ *  Copyright 2012 OpenSourceStewardshipFoundation
+ *  Licensed under BSD
+ *
+ * Author: seanhalle@yahoo.com
+ * 
+ */
+
+#ifndef _VMS_HW_SPEC_DEFS_H
+#define	_VMS_HW_SPEC_DEFS_H
+#define _GNU_SOURCE
+
+
+//=========================  Hardware related Constants =====================
+   //This value is the number of hardware threads in the shared memory
+   // machine
+#define NUM_CORES        4
+
+   // tradeoff amortizing master fixed overhead vs imbalance potential
+   // when work-stealing, can make bigger, at risk of losing cache affinity
+#define NUM_SCHED_SLOTS  3
+
+#define MIN_WORK_UNIT_CYCLES 20000
+
+#define MASTERLOCK_RETRIES 10000
+
+   // stack size in virtual processors created
+#define VIRT_PROCR_STACK_SIZE 0x8000 /* 32K */
+
+   // memory for VMS__malloc
+#define MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE 0x40000000 /* 1G */
+
+   //Frequency of TS counts -- have to do tests to verify
+   //NOTE: turn off (in BIOS)  TURBO-BOOST and SPEED-STEP else won't be const
+#define TSCOUNT_FREQ 3180000000
+
+#define CACHE_LINE_SZ 256
+#define PAGE_SIZE 4096
+
+//To prevent false-sharing, aligns a variable to a cache-line boundary.
+//No need to use for local vars because those are never shared between cores
+#define __align_to_cacheline__ __attribute__ ((aligned(CACHE_LINE_SZ)))
+
+//rounds a pointer DOWN to a cacheline boundary (CACHE_LINE_SZ must be a power
+//of 2). The memory area has to contain at least CACHE_LINE_SZ bytes more than needed
+#define __align_address(ptr) ((void*)(((uintptr_t)(ptr))&~((uintptr_t)(CACHE_LINE_SZ)-1)))
+
+//===========================================================================
+
+#endif	/* _VMS_HW_SPEC_DEFS_H */
+
diff -r bc4cb994f114 -r eaf7e4c58c9e VMS_defs__lang_specific.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VMS_defs__lang_specific.h	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,182 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ * 
+ */
+
+#ifndef _VMS_LANG_SPEC_DEFS_H
+#define	_VMS_LANG_SPEC_DEFS_H
+
+
+
+//===================  Language-specific Measurement Stuff ===================
+//
+//TODO:  Figure out way to move these into language dir..
+//   wrap them in #ifdef MEAS__...
+//NOTE(review): also re-#defined below under VTHREAD/VCILK/SSR -- confirm ok
+#ifndef MAKE_HISTS_FOR_MEASUREMENTS
+#define MakeTheMeasHists() 
+#endif
+
+//===========================================================================
+//VPThread
+#ifdef VTHREAD
+
+#define createHistIdx      1  //note: starts at 1
+#define mutexLockHistIdx   2
+#define mutexUnlockHistIdx 3
+#define condWaitHistIdx    4
+#define condSignalHistIdx  5
+
+#define MakeTheMeasHists() \
+   _VMSMasterEnv->measHistsInfo = \
+              makePrivDynArrayOfSize( (void***)&(_VMSMasterEnv->measHists), 200); \
+   makeAMeasHist( createHistIdx,      "create",        250, 0, 100 ) \
+   makeAMeasHist( mutexLockHistIdx,   "mutex_lock",    50, 0, 100 ) \
+   makeAMeasHist( mutexUnlockHistIdx, "mutex_unlock",  50, 0, 100 ) \
+   makeAMeasHist( condWaitHistIdx,    "cond_wait",     50, 0, 100 ) \
+   makeAMeasHist( condSignalHistIdx,  "cond_signal",   50, 0, 100 )
+
+   
+#define Meas_startCreate \
+    int32 startStamp, endStamp; \
+    saveLowTimeStampCountInto( startStamp ); \
+
+#define Meas_endCreate \
+    saveLowTimeStampCountInto( endStamp ); \
+    addIntervalToHist( startStamp, endStamp, \
+                                 _VMSMasterEnv->measHists[ createHistIdx ] );
+
+#define Meas_startMutexLock \
+    int32 startStamp, endStamp; \
+    saveLowTimeStampCountInto( startStamp ); \
+
+#define Meas_endMutexLock \
+    saveLowTimeStampCountInto( endStamp ); \
+    addIntervalToHist( startStamp, endStamp, \
+                              _VMSMasterEnv->measHists[ mutexLockHistIdx ] );
+
+#define Meas_startMutexUnlock \
+    int32 startStamp, endStamp; \
+    saveLowTimeStampCountInto( startStamp ); \
+
+#define Meas_endMutexUnlock \
+    saveLowTimeStampCountInto( endStamp ); \
+    addIntervalToHist( startStamp, endStamp, \
+                            _VMSMasterEnv->measHists[ mutexUnlockHistIdx ] );
+
+#define Meas_startCondWait \
+    int32 startStamp, endStamp; \
+    saveLowTimeStampCountInto( startStamp ); \
+
+#define Meas_endCondWait \
+    saveLowTimeStampCountInto( endStamp ); \
+    addIntervalToHist( startStamp, endStamp, \
+                               _VMSMasterEnv->measHists[ condWaitHistIdx ] );
+
+#define Meas_startCondSignal \
+    int32 startStamp, endStamp; \
+    saveLowTimeStampCountInto( startStamp ); \
+
+#define Meas_endCondSignal \
+    saveLowTimeStampCountInto( endStamp ); \
+    addIntervalToHist( startStamp, endStamp, \
+                             _VMSMasterEnv->measHists[ condSignalHistIdx ] );
+
+#endif
+
+
+
+//===========================================================================
+//VCilk
+
+#ifdef VCILK
+
+#define spawnHistIdx      1 //note: starts at 1
+#define syncHistIdx       2
+
+#define MakeTheMeasHists() \
+   _VMSMasterEnv->measHistsInfo = \
+          makePrivDynArrayOfSize( (void***)&(_VMSMasterEnv->measHists), 200); \
+    makeAMeasHist( spawnHistIdx,      "Spawn",        50, 0, 200 ) \
+    makeAMeasHist( syncHistIdx,       "Sync",         50, 0, 200 )
+
+
+#define Meas_startSpawn \
+    int32 startStamp, endStamp; \
+    saveLowTimeStampCountInto( startStamp ); \
+
+#define Meas_endSpawn \
+    saveLowTimeStampCountInto( endStamp ); \
+    addIntervalToHist( startStamp, endStamp, \
+                             _VMSMasterEnv->measHists[ spawnHistIdx ] );
+
+#define Meas_startSync \
+    int32 startStamp, endStamp; \
+    saveLowTimeStampCountInto( startStamp ); \
+
+#define Meas_endSync \
+    saveLowTimeStampCountInto( endStamp ); \
+    addIntervalToHist( startStamp, endStamp, \
+                             _VMSMasterEnv->measHists[ syncHistIdx ] );
+#endif
+
+//===========================================================================
+// SSR
+
+#ifdef SSR
+
+#define SendFromToHistIdx      1 //note: starts at 1
+#define SendOfTypeHistIdx      2
+#define ReceiveFromToHistIdx   3
+#define ReceiveOfTypeHistIdx   4
+
+#define MakeTheMeasHists() \
+   _VMSMasterEnv->measHistsInfo = \
+              makePrivDynArrayOfSize( (void***)&(_VMSMasterEnv->measHists), 200); \
+    makeAMeasHist( SendFromToHistIdx,   "SendFromTo",    50, 0, 100 ) \
+    makeAMeasHist( SendOfTypeHistIdx,   "SendOfType",    50, 0, 100 ) \
+    makeAMeasHist( ReceiveFromToHistIdx,"ReceiveFromTo", 50, 0, 100 ) \
+    makeAMeasHist( ReceiveOfTypeHistIdx,"ReceiveOfType", 50, 0, 100 )
+
+#define Meas_startSendFromTo \
+    int32 startStamp, endStamp; \
+    saveLowTimeStampCountInto( startStamp ); \
+
+#define Meas_endSendFromTo \
+    saveLowTimeStampCountInto( endStamp ); \
+    addIntervalToHist( startStamp, endStamp, \
+                             _VMSMasterEnv->measHists[ SendFromToHistIdx ] );
+
+#define Meas_startSendOfType \
+    int32 startStamp, endStamp; \
+    saveLowTimeStampCountInto( startStamp ); \
+
+#define Meas_endSendOfType \
+    saveLowTimeStampCountInto( endStamp ); \
+    addIntervalToHist( startStamp, endStamp, \
+                             _VMSMasterEnv->measHists[ SendOfTypeHistIdx ] );
+
+#define Meas_startReceiveFromTo \
+    int32 startStamp, endStamp; \
+    saveLowTimeStampCountInto( startStamp ); \
+
+#define Meas_endReceiveFromTo \
+    saveLowTimeStampCountInto( endStamp ); \
+    addIntervalToHist( startStamp, endStamp, \
+                             _VMSMasterEnv->measHists[ ReceiveFromToHistIdx ] );
+
+#define Meas_startReceiveOfType \
+    int32 startStamp, endStamp; \
+    saveLowTimeStampCountInto( startStamp ); \
+
+#define Meas_endReceiveOfType \
+    saveLowTimeStampCountInto( endStamp ); \
+    addIntervalToHist( startStamp, endStamp, \
+                             _VMSMasterEnv->measHists[ReceiveOfTypeHistIdx ] );
+#endif  /* SSR */
+
+#endif	/* _VMS_LANG_SPEC_DEFS_H */
+
diff -r bc4cb994f114 -r eaf7e4c58c9e VMS_defs__main.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VMS_defs__main.h	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,185 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ * 
+ */
+
+#ifndef _VMS_DEFS_H
+#define	_VMS_DEFS_H
+#define _GNU_SOURCE
+
+//===========================  VMS-wide defs  ===============================
+#include "VMS_primitive_data_types.h"
+
+#define SUCCESS 0
+
+   //only after macro-expansion are the defs of writePrivQ, aso looked up
+   // so these defs can be at the top, and writePrivQ defined later on..
+#define writeVMSQ     writePrivQ
+#define readVMSQ      readPrivQ
+#define makeVMSQ      makeVMSPrivQ
+#define numInVMSQ     numInPrivQ
+#define VMSQueueStruc PrivQueueStruc
+
+
+//======================  Hardware Specific Defs ============================
+#include "VMS_defs__HW_specific.h"
+
+//=========================  Debug Related Defs =============================
+//
+//When SEQUENTIAL is defined, VMS does sequential exe in the main thread
+// It still does co-routines and all the mechanisms are the same, it just
+// has only a single thread and animates VPs one at a time
+//#define SEQUENTIAL
+
+//#define USE_WORK_STEALING
+
+//turns on the probe-instrumentation in the application -- when not
+// defined, the calls to the probe functions turn into comments
+#define STATS__ENABLE_PROBES
+//#define TURN_ON_DEBUG_PROBES
+
+//These defines turn types of bug messages on and off
+// be sure debug messages are un-commented (next block of defines)
+#define dbgAppFlow   TRUE /* Top level flow of application code -- general*/
+#define dbgProbes    FALSE /* for issues inside probes themselves*/
+#define dbgB2BMaster FALSE /* in coreloop, back to back master VPs*/
+#define dbgRqstHdlr  FALSE /* in request handler code*/
+
+//Un-comment the second line of a macro to turn that kind of message on
+#define DEBUG(  bool, msg)         \
+//  if( bool){ printf(msg); fflush(stdout);}
+#define DEBUG1( bool, msg, param)  \
+//   if(bool){printf(msg, param); fflush(stdout);}
+#define DEBUG2( bool, msg, p1, p2) \
+//   if(bool) {printf(msg, p1, p2); fflush(stdout);}
+
+#define ERROR(msg) printf(msg);
+#define ERROR1(msg, param) printf(msg, param); 
+#define ERROR2(msg, p1, p2) printf(msg, p1, p2);
+
+//======================  Measurement Related Defs ==========================
+//
+//
+   //when STATS__TURN_ON_PROBES is defined allows using probes to measure
+   // time intervals.  The probes are macros that only compile to something
+   // when STATS__TURN_ON_PROBES is defined.  The probes are saved in the
+   // master env -- but only when this is defined.
+   //The TSC probes use RDTSC instr, can be unreliable, Dbl uses gettimeofday
+#define STATS__TURN_ON_PROBES
+//#define STATS__USE_TSC_PROBES
+#define STATS__USE_DBL_PROBES
+
+//==================  Turn Measurement Things on and off ====================
+
+//#define MEAS__TIME_2011_SYS
+
+//define this if any MEAS__... below are
+//#define MAKE_HISTS_FOR_MEASUREMENTS
+   //when MEAS__TIME_STAMP_SUSP is defined, causes code to be inserted and
+   // compiled-in that saves the low part of the time stamp count just before
+   // suspending a processor and just after resuming that processor.  It is
+   // saved into a field added to VirtProcr.  Have to sanity-check for
+   // rollover of low portion into high portion.
+//#define MEAS__TIME_STAMP_SUSP
+//#define MEAS__TIME_MASTER
+//#define MEAS__TIME_PLUGIN
+//#define MEAS__TIME_MALLOC
+//#define MEAS__TIME_MASTER_LOCK
+
+   //For code that calculates normalization-offset between TSC counts of
+   // different cores.
+//#define NUM_TSC_ROUND_TRIPS 10
+
+
+
+//===================  Macros to Capture Measurements  ======================
+//
+//===== RDTSC wrapper ===== 
+//Also runs with x86_64 code
+#define saveTSCLowHigh(lowHighIn) \
+   asm volatile("RDTSC;                   \
+                 movl %%eax, %0;          \
+                 movl %%edx, %1;"         \
+   /* outputs */ : "=m" (lowHighIn.lowHigh[0]), "=m" (lowHighIn.lowHigh[1])\
+   /* inputs  */ :                        \
+   /* clobber */ : "%eax", "%edx"         \
+                );
+
+#define saveTimeStampCountInto(low, high) \
+   asm volatile("RDTSC;                   \
+                 movl %%eax, %0;          \
+                 movl %%edx, %1;"         \
+   /* outputs */ : "=m" (low), "=m" (high)\
+   /* inputs  */ :                        \
+   /* clobber */ : "%eax", "%edx"         \
+                );
+
+#define saveLowTimeStampCountInto(low)    \
+   asm volatile("RDTSC;                   \
+                 movl %%eax, %0;"         \
+   /* outputs */ : "=m" (low)             \
+   /* inputs  */ :                        \
+   /* clobber */ : "%eax", "%edx"         \
+                );
+
+
+//==================  Macros define types of meas want  =====================
+#ifdef MEAS__TIME_PLUGIN
+#define Meas_startMasterLoop  /*master loop is not measured in this mode*/
+#define Meas_startReqHdlr \
+        int32 startStamp1, endStamp1; \
+        saveLowTimeStampCountInto( startStamp1 );
+
+#define Meas_endReqHdlr \
+        saveLowTimeStampCountInto( endStamp1 ); \
+        addIntervalToHist( startStamp1, endStamp1, \
+                           _VMSMasterEnv->reqHdlrLowTimeHist ); \
+        addIntervalToHist( startStamp1, endStamp1, \
+                           _VMSMasterEnv->reqHdlrHighTimeHist );
+#define Meas_endMasterLoop    /*expands to nothing, same as in the #else*/
+#elif defined MEAS__TIME_2011_SYS
+#define Meas_startMasterLoop \
+        TSCountLowHigh startStamp1, endStamp1; \
+        saveTSCLowHigh( endStamp1 ); \
+        _VMSMasterEnv->cyclesTillStartMasterLoop = \
+        endStamp1.longVal - masterVP->startSusp.longVal;
+
+#define Meas_startReqHdlr \
+        saveTSCLowHigh( startStamp1 ); \
+        _VMSMasterEnv->startReqHdlr.longVal = startStamp1.longVal;
+
+#define Meas_endReqHdlr 
+
+#define Meas_endMasterLoop \
+        saveTSCLowHigh( startStamp1 ); \
+        _VMSMasterEnv->endMasterLoop.longVal = startStamp1.longVal;
+
+#else
+#define Meas_startMasterLoop 
+#define Meas_startReqHdlr 
+#define Meas_endReqHdlr 
+#define Meas_endMasterLoop
+#endif
+
+//======================  Histogram Macros -- Create ========================
+//
+//
+#ifdef MAKE_HISTS_FOR_MEASUREMENTS
+#define makeAMeasHist( idx, name, numBins, startVal, binWidth ) \
+   makeHighestDynArrayIndexBeAtLeast( _VMSMasterEnv->measHistsInfo, idx ); \
+   _VMSMasterEnv->measHists[idx] =  \
+                       makeFixedBinHist( numBins, startVal, binWidth, name );
+#else
+#define makeAMeasHist( idx, name, numBins, startVal, binWidth )
+#endif
+
+
+#define MEAS__SUB_CREATE  /*turn on/off subtraction of create from plugin*/
+
+#include "VMS_defs__lang_specific.h"
+
+#endif	/* _VMS_DEFS_H */
+
diff -r bc4cb994f114 -r eaf7e4c58c9e VMS_primitive_data_types.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VMS_primitive_data_types.h	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,53 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *  
+ * Author: seanhalle@yahoo.com
+ *  
+
+ */
+
+#ifndef _BLIS_PRIMITIVE_DATA_TYPES_H
+#define	_BLIS_PRIMITIVE_DATA_TYPES_H
+
+
+/*For portability, need primitive data types that have a well defined
+ * size, and well-defined layout into bytes
+ *To do this, provide BLIS standard aliases for all primitive data types
+ *These aliases must be used in all BLIS functions instead of the ANSI types
+ *
+ *These definitions will be replaced inside each specialization module
+ * according to the compiler used in that module and the hardware being
+ * specialized to.
+ */
+/*
+#define    int8  char
+#define   uint8  char
+#define    int16 short
+#define   uint16 unsigned short
+#define    int32 int
+#define   uint32 unsigned int
+#define    int64 long long
+#define   uint64 unsigned long long
+#define  float32 float
+#define  float64 double
+*/
+typedef char               bool8;
+typedef char               int8;   //NOTE(review): plain char may be unsigned on some targets
+typedef unsigned char      uint8;  //must be unsigned; plain char is signed on x86 gcc
+typedef short              int16;
+typedef unsigned short     uint16;
+typedef int                int32;
+typedef unsigned int       uint32;
+typedef long long          int64;
+typedef unsigned long long uint64;
+typedef float              float32;
+typedef double             float64;
+//"double double" is not a C type; long double is the widest standard float
+#define float128 long double
+
+#define TRUE  1
+#define FALSE 0
+
+#endif	/* _BLIS_PRIMITIVE_DATA_TYPES_H */
+
diff -r bc4cb994f114 -r eaf7e4c58c9e __brch__Common_ancestor
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/__brch__Common_ancestor	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,33 @@
+A HW branch for:
+
+generic MultiCore machines with x86 64bit instruction set
+
+This branch shouldn't be used, except as a lazy fall-back.  Instead, try out other branches tuned to specific hardware platforms to find the one that performs best on your machine.  Use the "exe_time_vs_task_size" project to generate curves of overhead, and compare result from various branches.
+
+Note, if this branch is used, then NUM_CORES in the VMS_defs__HW_specific.h file has to be updated with the number of cores in your machine
+
+========  Background on branch naming  =========
+
+There are two kinds of branches: ones used to develop features, and ones tuned to particular hardware.  A given HW branch may combine features from several feature-branches, picking and choosing among them.
+
+After Feb 2012, branches are named by the scheme:
+
+feat__<feat_descr>__<HW_feat_dev_on>
+
+HW__<desc_of_HW_brch_tuned_for>
+
+where <HW_feat_dev_on> and <desc_of_HW_brch_tuned_for> follow the pattern:
+
+<num_socket> x <num_cores>_<Manuf>_<special_features>
+
+Examples:
+
+feat__exp_array_malloc
+
+feat__rand_backoff__4x10_Intel_WestmereEx
+
+HW__1x4_Intel_SandyBridge
+
+HW__4x10_Intel_WestmereEx
+
+HW__1x4_AMD_mobile
diff -r bc4cb994f114 -r eaf7e4c58c9e __brch__DEPRECATED_README
--- a/__brch__DEPRECATED_README	Mon Feb 13 13:34:13 2012 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,29 +0,0 @@
-*DEPRECATED*  as of Feb 2012, this branch should not be used.  Too many variations of VMS for MC_shared exist.
-
-Instead, choose a branch that has the best implementation for the machine being run on.  For example, single-socket with 2 cores, or with 4 cores, or with 8 cores all have their own branches with code tuned to that number of cores.  AMD processors require different low-level tweaking than Intel, and so on.
-
-============== Background on Branch Naming ============
-
-There are two kinds of branchs: ones used to develop features, and ones tuned to particular hardware.  A given HW branch may combine features from several feature-branches, picking and choosing among them.
-
-Legacy branches, from before Feb 2012 have random names.  After Feb 2012, they're named by the scheme:
-
-feat__<feat_descr>__<HW_feat_dev_on>
-
-HW__<desc_of_HW_brch_tuned_for>
-
-where <HW_feat_dev_on> and <desc_of_HW_brch_tuned_for> follow the pattern:
-
-<num_socket> x <num_cores>_<ArchName>_<optional_special_features>
-
-Examples:
-
-feat__exp_array_malloc__generic_MC
-
-feat__rand_backoff__4x10_WestmereEx
-
-HW__1x4_SandyBridge
-
-HW__4x10_WestmereEx
-
-HW__1x4_AMD_mobile
\ No newline at end of file
diff -r bc4cb994f114 -r eaf7e4c58c9e probes.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/probes.c	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,339 @@
+/*
+ * Copyright 2010  OpenSourceStewardshipFoundation
+ *
+ * Licensed under BSD
+ */
+
+#include <stdio.h>
+#include <malloc.h>
+#include <sys/time.h>
+
+#include "VMS.h"
+
+
+
+//====================  Probes =================
+#ifdef STATS__USE_TSC_PROBES
+
+int32
+VMS__create_histogram_probe( int32 numBins, float32 startValue,
+                             float32 binWidth, char *nameStr )
+ { int32          probeIdx;
+   IntervalProbe *histProbe;
+
+      //make a plain interval probe first, then look it up by its index
+   probeIdx  = VMS__create_single_interval_probe( nameStr );
+   histProbe = _VMSMasterEnv->intervalProbes[ probeIdx ];
+
+      //attach a fresh float histogram to the probe
+   histProbe->hist = makeFloatHistogram( numBins, startValue, binWidth );
+   return probeIdx;
+ }
+
+void
+VMS_impl__record_interval_start_in_probe( int32 probeID )
+ { IntervalProbe *startingProbe = _VMSMasterEnv->intervalProbes[ probeID ];
+
+      //start of interval = whatever getTSCount() returns at this point
+   startingProbe->startStamp = getTSCount();
+ }
+
+void
+VMS_impl__record_interval_end_in_probe( int32 probeID )
+ { TSCount        stampAtEnd, elapsed;
+   IntervalProbe *endingProbe;
+
+      //take the stamp first, before the probe is looked up
+   stampAtEnd = getTSCount();
+
+   endingProbe           = _VMSMasterEnv->intervalProbes[ probeID ];
+   endingProbe->endStamp = stampAtEnd;
+
+   if( endingProbe->hist == NULL ) return;  //no histogram attached
+      //only intervals below 10x the histogram's endOfRange are added
+   elapsed = endingProbe->endStamp - endingProbe->startStamp;
+   if( elapsed < endingProbe->hist->endOfRange * 10 )
+      addToFloatHist( elapsed, endingProbe->hist );
+ }
+
+void
+VMS_impl__print_stats_of_probe( int32 probeID )
+ { IntervalProbe *shownProbe = _VMSMasterEnv->intervalProbes[ probeID ];
+
+      //a probe with a histogram prints its name then the histogram;
+      // one without prints its name and its single interval on one line
+   if( shownProbe->hist != NULL )
+    {
+      printf( "probe: %s\n", shownProbe->nameStr );
+      printFloatHist( shownProbe->hist );
+    }
+   else
+    {
+      printf("probe: %s, interval: %.6lf\n", shownProbe->nameStr,
+                                             shownProbe->interval);
+    }
+ }
+#else
+
+/*
+ * In practice, probe operations are called from the app, from inside slaves
+ *  -- so have to be sure each probe is single-VP owned, and be sure that
+ *  any place common structures are modified it's done inside the master.
+ * So -- the only place common structures are modified is during creation.
+ *  after that, all mods are to individual instances.
+ *
+ * Thinking perhaps should change the semantics to be that probes are
+ *  attached to the virtual processor -- and then everything is guaranteed
+ *  to be isolated -- except then can't take any intervals that span VPs,
+ *  and would have to transfer the probes to Master env when VP dissipates..
+ *  gets messy..
+ *
+ * For now, just making so that probe creation causes a suspend, so that
+ *  the dynamic array in the master env is only modified from the master
+ * 
+ */
+IntervalProbe *
+create_generic_probe( char *nameStr, SlaveVP *animPr )
+ { VMSSemReq probeReq;
+
+      //fill in a createProbe request carrying the name, send it on behalf
+      // of animPr, then hand back what came back in dataRetFromReq
+   probeReq.nameStr = nameStr;
+   probeReq.reqType = createProbe;
+   VMS_WL__send_VMSSem_request( &probeReq, animPr );
+
+   return animPr->dataRetFromReq;
+ }
+
+/*Use this version from outside VMS -- it uses external malloc, and modifies
+ * dynamic array, so can't be animated in a slave VP.  Name is copied with '\0'
+ */
+IntervalProbe *
+ext__create_generic_probe( char *nameStr )
+ { IntervalProbe *newProbe;
+   int32          nameLen;
+
+   newProbe          = malloc( sizeof(IntervalProbe) );
+   nameLen = strlen( nameStr ) + 1;  //+1 so the terminator is copied too
+   newProbe->nameStr = malloc( nameLen );
+   memcpy( newProbe->nameStr, nameStr, nameLen );
+   newProbe->hist    = NULL;
+   newProbe->schedChoiceWasRecorded = FALSE;
+   newProbe->probeID =
+             addToDynArray( newProbe, _VMSMasterEnv->dynIntervalProbesInfo );
+
+   return newProbe;
+ }
+
+
+/*Only call from inside master or main startup/shutdown thread.  NOTE(review):
+ * frees with VMS_int__free, but ext__create_generic_probe uses malloc -- confirm*/
+void
+VMS_impl__free_probe( IntervalProbe *probe )
+ { if( probe->hist != NULL )   freeDblHist( probe->hist );
+   if( probe->nameStr != NULL) VMS_int__free( probe->nameStr );
+   VMS_int__free( probe );
+ }
+
+
+int32
+VMS_impl__record_time_point_into_new_probe( char *nameStr, SlaveVP *animPr)
+ { IntervalProbe  *pointProbe;
+   struct timeval *stamp;
+
+      //a time point is a new probe whose start is "now" and whose end is 0
+   pointProbe          = create_generic_probe( nameStr, animPr );
+   pointProbe->endSecs = 0;
+
+   stamp = &(pointProbe->startStamp);
+   gettimeofday( stamp, NULL);
+
+      //seconds plus microseconds, as one float64 count of seconds
+   pointProbe->startSecs = (float64)( stamp->tv_sec
+                                      + ( stamp->tv_usec / 1000000.0 ) );
+
+   return pointProbe->probeID;
+ }
+
+int32
+VMS_ext_impl__record_time_point_into_new_probe( char *nameStr )
+ { IntervalProbe  *pointProbe;
+   struct timeval *stamp;
+
+      //probe comes from the ext creator; start is "now" and end is 0
+   pointProbe          = ext__create_generic_probe( nameStr );
+   pointProbe->endSecs = 0;
+
+   stamp = &(pointProbe->startStamp);
+   gettimeofday( stamp, NULL);
+
+      //seconds plus microseconds, as one float64 count of seconds
+   pointProbe->startSecs = (float64)( stamp->tv_sec
+                                      + ( stamp->tv_usec / 1000000.0 ) );
+
+   return pointProbe->probeID;
+ }
+
+int32
+VMS_impl__create_single_interval_probe( char *nameStr, SlaveVP *animPr )
+ {    //a single-interval probe is a generic probe with nothing added to it;
+      // callers only get the ID of the probe back
+   IntervalProbe *plainProbe = create_generic_probe( nameStr, animPr );
+
+   return plainProbe->probeID;
+ }
+
+int32
+VMS_impl__create_histogram_probe( int32   numBins, float64    startValue,
+               float64 binWidth, char   *nameStr, SlaveVP *animPr )
+ { IntervalProbe *histProbe;
+
+      //generic probe first, then attach a new DblHist built from the
+      // bin count, start value and bin width given by the caller
+   histProbe       = create_generic_probe( nameStr, animPr );
+   histProbe->hist = makeDblHistogram( numBins, startValue, binWidth );
+
+   return histProbe->probeID;
+ }
+
+/*Enters the probe into the name hash table, keyed by its own nameStr, so
+ * VMS_impl__get_probe_by_name can find it.  animPr is not used.
+ */
+void
+VMS_impl__index_probe_by_its_name( int32 probeID, SlaveVP *animPr )
+ { IntervalProbe *probe = _VMSMasterEnv->intervalProbes[ probeID ];
+      //TODO: fix this To be in Master -- race condition
+   addValueIntoTable(probe->nameStr, probe, _VMSMasterEnv->probeNameHashTbl);
+ }
+
+IntervalProbe *
+VMS_impl__get_probe_by_name( char *probeName, SlaveVP *animPr )
+ { IntervalProbe *found;  //TODO: fix this To be in Master -- race condition
+   found = getValueFromTable( probeName, _VMSMasterEnv->probeNameHashTbl );
+   return found;
+ }
+
+
+/*Copies the animating procr's core number, ID and creation time into the
+ * probe and flags that a sched choice was recorded.  No request needed --
+ * all work is done locally, in the anim Pr
+ */
+void
+VMS_impl__record_sched_choice_into_probe( int32 probeID, SlaveVP *animatingPr )
+ { IntervalProbe *probe = _VMSMasterEnv->intervalProbes[ probeID ];
+
+   probe->schedChoiceWasRecorded = TRUE;
+   probe->coreNum                = animatingPr->coreAnimatedBy;
+   probe->procrID                = animatingPr->procrID;
+   probe->procrCreateSecs        = animatingPr->createPtInSecs;
+ }
+
+/*Stamps the probe's startStamp with the current time of day.
+ *Everything is local to the animating procr, so no need for request, do
+ * work locally, in the anim Pr
+ */
+void
+VMS_impl__record_interval_start_in_probe( int32 probeID )
+ {
+         DEBUG( dbgProbes, "record start of interval\n" )
+   gettimeofday( &(_VMSMasterEnv->intervalProbes[ probeID ]->startStamp),
+                 NULL );
+ }
+
+
+/*Stamps the probe's end time, converts both stamps to seconds, and stores
+ * startSecs, endSecs and interval.  Histogram probes also get the interval
+ * added to the histogram when it passes the sanity check.
+ *Everything is local to the animating procr, so no need for request, do
+ * work locally, in the anim Pr
+ */
+void
+VMS_impl__record_interval_end_in_probe( int32 probeID )
+ { IntervalProbe  *probe;
+   struct timeval *begStamp, *finStamp;
+   float64         begSecs, finSecs;
+
+         DEBUG( dbgProbes, "record end of interval\n" )
+      //possible seg-fault if array resized by diff core right after this
+      // one gets probe..?  Something like that?  Might be safe.. don't care
+   probe = _VMSMasterEnv->intervalProbes[ probeID ];
+   gettimeofday( &(probe->endStamp), NULL);
+
+      //now turn into an interval held in a double
+   begStamp = &(probe->startStamp);
+   finStamp = &(probe->endStamp);
+   begSecs  = begStamp->tv_sec + ( begStamp->tv_usec / 1000000.0 );
+   finSecs  = finStamp->tv_sec + ( finStamp->tv_usec / 1000000.0 );
+   probe->startSecs = begSecs;
+   probe->endSecs   = finSecs;
+   probe->interval  = finSecs - begSecs;
+
+      //no histogram means single-interval probe, so nothing more to do
+   if( probe->hist == NULL ) return;
+      //sane means: below ten times the histogram's end of range
+   if( probe->interval < probe->hist->endOfRange * 10 )
+      addToDblHist( probe->interval, probe->hist );
+ }
+
+/*Prints one probe to stdout: its name, then the sched choice if one was
+ * recorded, then one of: a time point, a single interval, or a histogram.
+ */
+void
+print_probe_helper( IntervalProbe *probe )
+ {
+   printf( "\nprobe: %s, ",  probe->nameStr );
+
+   if( probe->schedChoiceWasRecorded )
+      printf( "coreNum: %d, procrID: %d, procrCreated: %0.6f | ",
+              probe->coreNum, probe->procrID, probe->procrCreateSecs );
+
+   if( probe->endSecs == 0 ) //just a single point in time
+    { printf( " time point: %.6f\n",
+              probe->startSecs - _VMSMasterEnv->createPtInSecs );
+      return;
+    }
+   if( probe->hist != NULL ) //a full histogram of intervals
+    { printDblHist( probe->hist );
+      return;
+    }
+
+      //otherwise, just an interval
+   printf( " startSecs: %.6f interval: %.6f\n",
+      (probe->startSecs - _VMSMasterEnv->createPtInSecs), probe->interval);
+ }
+
+//TODO: change so pass around pointer to probe instead of its array-index..
+// will eliminate chance for timing of resize to cause problems with the
+// lookup -- even though don't think it actually can cause problems..
+// there's no need to pass index around -- have hash table for names, and
+// only need it once, then have ptr to probe..  the thing about enum the
+// index and use that as name is clunky in practice -- just hash.
+void
+VMS_impl__print_stats_of_probe( int32 probeID )
+ {    //probeID is the probe's index in the master env's intervalProbes
+      // array; the printing itself is done by print_probe_helper
+   IntervalProbe *probe = _VMSMasterEnv->intervalProbes[ probeID ];
+
+   print_probe_helper( probe );
+ }
+
+
+inline void doNothing(){};  //empty stub; probes.h's no-op print macros expand to this name
+
+void   //callback handed to forAllInDynArrayDo; currently a no-op
+generic_print_probe( void *_probe )
+ { 
+   IntervalProbe *probe = (IntervalProbe *)_probe; //only used by disabled call
+   
+   //TODO segfault in printf -- until fixed, the call stays commented out
+   //print_probe_helper( probe );
+ }
+
+void
+VMS_impl__print_stats_of_all_probes()
+ {    //run the print callback on every element of the probes dyn array,
+      // then push whatever was printed out of stdout's buffer
+   forAllInDynArrayDo(_VMSMasterEnv->dynIntervalProbesInfo, generic_print_probe);
+   fflush( stdout );
+ }
+#endif
diff -r bc4cb994f114 -r eaf7e4c58c9e probes.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/probes.h	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,182 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ * 
+ */
+
+#ifndef _PROBES_H
+#define	_PROBES_H
+#define _GNU_SOURCE
+
+#include "VMS_primitive_data_types.h"
+
+#include <sys/time.h>
+
+/*Note on order of include files:  
+ * This file relies on #defines that appear in other files..
+ */
+
+
+//typedef struct _IntervalProbe IntervalProbe; //in VMS.h
+
+struct _IntervalProbe
+ {
+   char           *nameStr;   //printed as the probe's name; key in name hash
+   int32           probeID;   //value addToDynArray returned at creation
+
+   int32           schedChoiceWasRecorded; //TRUE once sched choice recorded
+   int32           coreNum;          //coreAnimatedBy of the recording procr
+   int32           procrID;          //procrID of the recording procr
+   float64         procrCreateSecs;  //createPtInSecs of the recording procr
+
+   #ifdef STATS__USE_TSC_PROBES
+   TSCount    startStamp; //NOTE(review): probes.c hands &startStamp to gettimeofday -- confirm for TSC
+   TSCount    endStamp;
+   #else
+   struct timeval  startStamp;
+   struct timeval  endStamp;
+   #endif
+   float64         startSecs; //startStamp converted to seconds
+   float64         endSecs;   //endStamp in seconds; 0 means single time point
+   float64         interval;  //endSecs - startSecs
+   DblHist        *hist;//if NULL, then is single interval probe
+ };
+
+
+
+//======================== Probes =============================
+//
+// Use macros to allow turning probes off with a #define switch
+#ifdef STATS__ENABLE_PROBES
+int32   //ID of the newly created probe
+VMS_impl__record_time_point_into_new_probe( char *nameStr,SlaveVP *animPr);
+#define VMS__record_time_point_into_new_probe( nameStr, animPr ) \
+        VMS_impl__record_time_point_into_new_probe( nameStr, animPr )
+
+int32   //ID of the new probe; built with ext__create_generic_probe
+VMS_ext_impl__record_time_point_into_new_probe( char *nameStr );
+#define VMS_ext__record_time_point_into_new_probe( nameStr ) \
+        VMS_ext_impl__record_time_point_into_new_probe( nameStr )
+
+//Makes a generic probe and returns its ID
+int32
+VMS_impl__create_single_interval_probe( char *nameStr, SlaveVP *animPr );
+#define VMS__create_single_interval_probe( nameStr, animPr ) \
+        VMS_impl__create_single_interval_probe( nameStr, animPr )
+
+//Makes a probe that also gets a histogram (numBins, startValue, binWidth)
+int32
+VMS_impl__create_histogram_probe( int32   numBins, float64    startValue,
+               float64 binWidth, char    *nameStr, SlaveVP *animPr );
+#define VMS__create_histogram_probe(      numBins, startValue,              \
+                                          binWidth, nameStr, animPr )       \
+        VMS_impl__create_histogram_probe( numBins, startValue,              \
+                                          binWidth, nameStr, animPr )
+void    //frees hist and nameStr if non-NULL, then the probe itself
+VMS_impl__free_probe( IntervalProbe *probe );
+#define VMS__free_probe( probe ) \
+        VMS_impl__free_probe( probe )
+
+void    //adds the probe to the name hash table, keyed by its nameStr
+VMS_impl__index_probe_by_its_name( int32 probeID, SlaveVP *animPr );
+#define VMS__index_probe_by_its_name( probeID, animPr ) \
+        VMS_impl__index_probe_by_its_name( probeID, animPr )
+
+IntervalProbe *   //whatever the name hash table holds under probeName
+VMS_impl__get_probe_by_name( char *probeName, SlaveVP *animPr );
+#define VMS__get_probe_by_name( probeName, animPr ) \
+        VMS_impl__get_probe_by_name( probeName, animPr )
+
+void    //copies core num, procr ID and create time of animPr into the probe
+VMS_impl__record_sched_choice_into_probe( int32 probeID, SlaveVP *animPr );
+#define VMS__record_sched_choice_into_probe( probeID, animPr ) \
+        VMS_impl__record_sched_choice_into_probe( probeID, animPr )
+
+void    //stamps the probe's start with the current time of day
+VMS_impl__record_interval_start_in_probe( int32 probeID );
+#define VMS__record_interval_start_in_probe( probeID ) \
+        VMS_impl__record_interval_start_in_probe( probeID )
+
+void    //stamps the end, computes the interval, feeds histogram if present
+VMS_impl__record_interval_end_in_probe( int32 probeID );
+#define VMS__record_interval_end_in_probe( probeID ) \
+        VMS_impl__record_interval_end_in_probe( probeID )
+
+void    //prints the one probe found at index probeID
+VMS_impl__print_stats_of_probe( int32 probeID );
+#define VMS__print_stats_of_probe( probeID ) \
+        VMS_impl__print_stats_of_probe( probeID )
+
+void    //runs the print callback over all probes, then flushes stdout
+VMS_impl__print_stats_of_all_probes();
+#define VMS__print_stats_of_all_probes() \
+        VMS_impl__print_stats_of_all_probes()
+
+
+#else
+int32   //prototypes stay declared in this branch; only the macros change
+VMS_impl__record_time_point_into_new_probe( char *nameStr,SlaveVP *animPr);
+#define VMS__record_time_point_into_new_probe( nameStr, animPr ) \
+       0 /* do nothing -- 0 stands in for the probe ID */
+
+int32
+VMS_ext_impl__record_time_point_into_new_probe( char *nameStr );
+#define VMS_ext__record_time_point_into_new_probe( nameStr ) \
+       0 /* do nothing -- 0 stands in for the probe ID */
+
+//NOTE(review): enabled branch also defines VMS__free_probe; none exists here
+int32
+VMS_impl__create_single_interval_probe( char *nameStr, SlaveVP *animPr );
+#define VMS__create_single_interval_probe( nameStr, animPr ) \
+       0 /* do nothing -- 0 stands in for the probe ID */
+
+
+int32
+VMS_impl__create_histogram_probe( int32   numBins, float64    startValue,
+               float64 binWidth, char    *nameStr, SlaveVP *animPr );
+#define VMS__create_histogram_probe(      numBins, startValue,              \
+                                          binWidth, nameStr, animPr )       \
+       0 /* do nothing -- 0 stands in for the probe ID */
+
+void
+VMS_impl__index_probe_by_its_name( int32 probeID, SlaveVP *animPr );
+#define VMS__index_probe_by_its_name( probeID, animPr ) \
+        /* do nothing */
+
+IntervalProbe *
+VMS_impl__get_probe_by_name( char *probeName, SlaveVP *animPr );
+#define VMS__get_probe_by_name( probeID, animPr ) \
+       NULL /* do nothing -- callers get NULL instead of a probe */
+
+void
+VMS_impl__record_sched_choice_into_probe( int32 probeID, SlaveVP *animPr );
+#define VMS__record_sched_choice_into_probe( probeID, animPr ) \
+        /* do nothing */
+
+void
+VMS_impl__record_interval_start_in_probe( int32 probeID );
+#define VMS__record_interval_start_in_probe( probeID ) \
+        /* do nothing */
+
+void
+VMS_impl__record_interval_end_in_probe( int32 probeID );
+#define VMS__record_interval_end_in_probe( probeID ) \
+        /* do nothing */
+
+inline void doNothing(); //both print macros below expand to this name
+void
+VMS_impl__print_stats_of_probe( int32 probeID );
+#define VMS__print_stats_of_probe( probeID ) \
+        doNothing/* do nothing -- expands to the bare name, no call */
+
+void
+VMS_impl__print_stats_of_all_probes();
+#define VMS__print_stats_of_all_probes \
+        doNothing/* do nothing -- the caller's own () makes the call */
+
+#endif   /* defined STATS__ENABLE_PROBES */
+
+#endif	/* _PROBES_H */
+
diff -r bc4cb994f114 -r eaf7e4c58c9e vmalloc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vmalloc.c	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,494 @@
+/*
+ *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ * Created on November 14, 2009, 9:07 PM
+ */
+
+#include <malloc.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "VMS.h"
+#include "C_Libraries/Histogram/Histogram.h"
+
+/*Helper function
+ *Links a free chunk in as the first element of the free list, directly
+ * behind the list head.
+ *Chunks are handled as MallocProlog structs, so the link pointers are
+ * reached through ordinary struct fields, and the prolog's size can simply
+ * be added to a chunk's address when it is handed to the app -- C takes
+ * care of pointer-size differences among machines.
+ *
+ *The list head is itself a MallocProlog -- the only one in the list whose
+ * prevChunkInFreeList is NULL.
+ *The end of the list is identified by next chunk being NULL, as usual.
+ */
+void inline
+add_chunk_to_free_list( MallocProlog *chunk, MallocProlog *listHead )
+ { MallocProlog *oldFirst = listHead->nextChunkInFreeList;
+   chunk->nextChunkInFreeList = oldFirst;
+   if( oldFirst != NULL ) //then chunk is not last in free list
+      oldFirst->prevChunkInFreeList = chunk;
+   chunk->prevChunkInFreeList    = listHead;
+   listHead->nextChunkInFreeList = chunk;
+ }
+
+
+/*This is sequential code, meant to only be called from the Master, not from
+ * any slave VPs.
+ *Search down list, checking size by the nextHigherInMem pointer, to find
+ * first chunk bigger than size needed.
+ *Shave off the extra and make it into a new free-list element, hook it in
+ * then return the address of the found element plus size of prolog.
+ *Returns NULL, after invoking ERROR, when no free chunk is big enough.
+ */
+void *VMS_int__malloc( size_t sizeRequested )
+ { MallocProlog *foundElem = NULL, *currElem, *newElem;
+   ssize_t        amountExtra, sizeConsumed,sizeOfFound;
+   uint32        foundElemIsTopOfHeap;
+
+   //============================= MEASUREMENT STUFF ========================
+   #ifdef MEAS__TIME_MALLOC
+   int32 startStamp, endStamp;
+   saveLowTimeStampCountInto( startStamp );
+   #endif
+   //========================================================================
+   
+      //step up the size to be aligned at 16-byte boundary, prob better ways
+   sizeRequested = (sizeRequested + 16) & ~15; //a multiple of 16 still grows by 16
+   currElem = (_VMSMasterEnv->freeListHead)->nextChunkInFreeList;
+
+   while( currElem != NULL )
+    {    //check if size of currElem is big enough
+      sizeOfFound=(size_t)((uintptr_t)currElem->nextHigherInMem -(uintptr_t)currElem);
+      amountExtra = sizeOfFound - sizeRequested - sizeof(MallocProlog); //left after prolog + payload
+      if( amountExtra > 0 )
+       {    //found it, get out of loop
+         foundElem = currElem;
+         currElem = NULL;
+       }
+      else
+         currElem = currElem->nextChunkInFreeList;
+    }
+   
+   if( foundElem == NULL )
+    { ERROR("\nmalloc failed\n")
+      return (void *)NULL;  //indicates malloc failed
+    }
+      //Using a kludge to identify the element that is the top chunk in the
+      // heap -- saving top-of-heap addr in head's nextHigherInMem -- and
+      // save addr of start of heap in head's nextLowerInMem
+      //Will handle top of Heap specially
+   foundElemIsTopOfHeap = foundElem->nextHigherInMem ==
+                          _VMSMasterEnv->freeListHead->nextHigherInMem;
+   
+      //before shave off and try to insert new elem, remove found elem
+      //note, foundElem will never be the head, so always has valid prevChunk
+   foundElem->prevChunkInFreeList->nextChunkInFreeList =
+                                              foundElem->nextChunkInFreeList;
+   if( foundElem->nextChunkInFreeList != NULL )
+    { foundElem->nextChunkInFreeList->prevChunkInFreeList =
+                                              foundElem->prevChunkInFreeList;
+    }
+   foundElem->prevChunkInFreeList = NULL;//indicates elem currently allocated
+   
+      //if enough, turn extra into new elem & insert it
+   if( amountExtra > 64 )
+    {   //make new elem by adding to addr of curr elem then casting
+        sizeConsumed = sizeof(MallocProlog) + sizeRequested; 
+        newElem = (MallocProlog *)( (uintptr_t)foundElem + sizeConsumed );
+        newElem->nextLowerInMem    = foundElem; //This is evil (but why?) 
+        newElem->nextHigherInMem   = foundElem->nextHigherInMem; //This is evil (but why?)
+        foundElem->nextHigherInMem = newElem;
+        if( ! foundElemIsTopOfHeap )
+        {  //there is no next higher for top of heap, so can't write to it
+           newElem->nextHigherInMem->nextLowerInMem = newElem;
+        }
+        add_chunk_to_free_list( newElem, _VMSMasterEnv->freeListHead );
+    }
+   else
+    {    //remainder too small to split off, so the whole chunk is handed out
+      sizeConsumed = sizeOfFound;
+    }
+  _VMSMasterEnv->amtOfOutstandingMem += sizeConsumed;
+
+   //============================= MEASUREMENT STUFF ========================
+   #ifdef MEAS__TIME_MALLOC
+   saveLowTimeStampCountInto( endStamp );
+   addIntervalToHist( startStamp, endStamp, _VMSMasterEnv->mallocTimeHist );
+   #endif
+   //========================================================================
+
+      //skip over the prolog by adding its size to the pointer return
+   return (void*)((uintptr_t)foundElem + sizeof(MallocProlog));
+ }
+
+/*This is sequential code, meant to only be called from the Master, not from
+ * any slave VPs.
+ *Search down list, checking size by the nextHigherInMem pointer, to find
+ * first chunk bigger than size needed.
+ *Shave off the extra and make it into a new free-list element, hook it in
+ * then return the address of the found element plus size of prolog.
+ *
+ * The difference from the regular malloc is that the returned address is
+ * aligned to CACHE_LINE_SZ and the size is padded to a multiple of it.  When
+ * a free chunk is not aligned, it is split, leaving a free chunk below it.
+ */
+void *VMS_int__malloc_aligned( size_t sizeRequested )
+ { MallocProlog *foundElem = NULL, *currElem, *newElem;
+   ssize_t        amountExtra, sizeConsumed,sizeOfFound,prevAmount;
+   uint32        foundElemIsTopOfHeap;
+
+   //============================= MEASUREMENT STUFF ========================
+   #ifdef MEAS__TIME_MALLOC
+   uint32 startStamp, endStamp;
+   saveLowTimeStampCountInto( startStamp );
+   #endif
+   //========================================================================
+   
+      //step up the size to be multiple of the cache line size
+   sizeRequested = (sizeRequested + CACHE_LINE_SZ) & ~(CACHE_LINE_SZ-1);
+   currElem = (_VMSMasterEnv->freeListHead)->nextChunkInFreeList;
+
+   while( currElem != NULL )
+    {    //check if size of currElem is big enough
+      sizeOfFound=(size_t)((uintptr_t)currElem->nextHigherInMem -(uintptr_t)currElem);
+      amountExtra = sizeOfFound - sizeRequested - sizeof(MallocProlog);
+      if( amountExtra > 0 )
+       {    
+         //look if the found element is already aligned
+         if((((uintptr_t)currElem+sizeof(MallocProlog)) & (uintptr_t)(CACHE_LINE_SZ-1)) == 0){
+             //found it, get out of loop
+             foundElem = currElem;
+             break;
+         }else{
+             //find first aligned address and check if it's still big enough
+             //check also if the space before the aligned address is big enough
+             //for a new element
+             void *firstAlignedAddr = (void*)(((uintptr_t)currElem + 2*CACHE_LINE_SZ) & ~((uintptr_t)(CACHE_LINE_SZ-1)));
+             prevAmount = (uintptr_t)firstAlignedAddr - (uintptr_t)currElem; //gap from chunk start to aligned addr
+             sizeOfFound=(uintptr_t)currElem->nextHigherInMem -(uintptr_t)firstAlignedAddr + sizeof(MallocProlog);
+             amountExtra= sizeOfFound - sizeRequested - sizeof(MallocProlog);
+             if(prevAmount > 2*sizeof(MallocProlog) && amountExtra > 0 ){
+                 //found suitable element
+                 //create new previous element and exit loop
+                 MallocProlog *newAlignedElem = (MallocProlog*)firstAlignedAddr - 1; //its prolog ends at the aligned addr
+                 
+                 //insert new element into free list
+                 if(currElem->nextChunkInFreeList != NULL)
+                     currElem->nextChunkInFreeList->prevChunkInFreeList = newAlignedElem;                     
+                 newAlignedElem->prevChunkInFreeList = currElem;
+                 newAlignedElem->nextChunkInFreeList = currElem->nextChunkInFreeList;
+                 currElem->nextChunkInFreeList = newAlignedElem;
+                 
+                 //set higherInMem and lowerInMem
+                 newAlignedElem->nextHigherInMem = currElem->nextHigherInMem;
+                 foundElemIsTopOfHeap = currElem->nextHigherInMem ==
+                          _VMSMasterEnv->freeListHead->nextHigherInMem;
+                 if(!foundElemIsTopOfHeap)
+                     currElem->nextHigherInMem->nextLowerInMem = newAlignedElem;
+                 currElem->nextHigherInMem = newAlignedElem;
+                 newAlignedElem->nextLowerInMem = currElem;
+                 
+                 //Found new element leaving loop
+                 foundElem = newAlignedElem;
+                 break;
+             }
+         }
+         
+       }
+       currElem = currElem->nextChunkInFreeList; //reached when this chunk was not usable
+    }
+
+   if( foundElem == NULL )
+    { ERROR("\nmalloc failed\n")
+      return (void *)NULL;  //indicates malloc failed
+    }
+      //Using a kludge to identify the element that is the top chunk in the
+      // heap -- saving top-of-heap addr in head's nextHigherInMem -- and
+      // save addr of start of heap in head's nextLowerInMem
+      //Will handle top of Heap specially
+   foundElemIsTopOfHeap = foundElem->nextHigherInMem ==
+                          _VMSMasterEnv->freeListHead->nextHigherInMem;
+
+      //before shave off and try to insert new elem, remove found elem
+      //note, foundElem will never be the head, so always has valid prevChunk
+   foundElem->prevChunkInFreeList->nextChunkInFreeList =
+                                              foundElem->nextChunkInFreeList;
+   if( foundElem->nextChunkInFreeList != NULL )
+    { foundElem->nextChunkInFreeList->prevChunkInFreeList =
+                                              foundElem->prevChunkInFreeList;
+    }
+   foundElem->prevChunkInFreeList = NULL;//indicates elem currently allocated
+   
+      //if enough, turn extra into new elem & insert it
+   if( amountExtra > 64 )
+    {    //make new elem by adding to addr of curr elem then casting
+      sizeConsumed = sizeof(MallocProlog) + sizeRequested;
+      newElem = (MallocProlog *)( (uintptr_t)foundElem + sizeConsumed );
+      newElem->nextHigherInMem   = foundElem->nextHigherInMem;
+      newElem->nextLowerInMem    = foundElem;
+      foundElem->nextHigherInMem = newElem;
+      
+      if( ! foundElemIsTopOfHeap )
+       {    //there is no next higher for top of heap, so can't write to it
+         newElem->nextHigherInMem->nextLowerInMem = newElem;
+       }
+      add_chunk_to_free_list( newElem, _VMSMasterEnv->freeListHead );
+    }
+   else
+    {    //remainder too small to split off, so the whole chunk is handed out
+      sizeConsumed = sizeOfFound;
+    }
+  _VMSMasterEnv->amtOfOutstandingMem += sizeConsumed;
+
+   //============================= MEASUREMENT STUFF ========================
+   #ifdef MEAS__TIME_MALLOC
+   saveLowTimeStampCountInto( endStamp );
+   addIntervalToHist( startStamp, endStamp, _VMSMasterEnv->mallocTimeHist );
+   #endif
+   //========================================================================
+
+      //skip over the prolog by adding its size to the pointer return
+   return (void*)((uintptr_t)foundElem + sizeof(MallocProlog));
+ }
+
+
+/*This is sequential code -- only to be called from the Master
+ *Pointers outside the VMS heap are ignored; freeing a chunk twice exits.
+ *Otherwise: subtract size of prolog from pointer to get the MallocProlog,
+ * then check the nextLower and nextHigher chunks -- coalesce with whichever
+ * of them is free, and if neither is free, add this one to the free-list.
+ */
+void
+VMS_int__free( void *ptrToFree )
+ { MallocProlog *elemToFree, *nextLowerElem, *nextHigherElem;
+   size_t         sizeOfElem;
+   uint32         lowerExistsAndIsFree, higherExistsAndIsFree;
+
+   //============================= MEASUREMENT STUFF ========================
+   #ifdef MEAS__TIME_MALLOC
+   int32 startStamp, endStamp;
+   saveLowTimeStampCountInto( startStamp );
+   #endif
+   //========================================================================
+
+   if( ptrToFree < (void*)_VMSMasterEnv->freeListHead->nextLowerInMem ||
+       ptrToFree > (void*)_VMSMasterEnv->freeListHead->nextHigherInMem )
+    {    //outside the range of data owned by VMS's malloc, so do nothing
+      return;
+    }
+      //subtract size of prolog to get pointer to prolog, then cast
+   elemToFree = (MallocProlog *)((uintptr_t)ptrToFree - sizeof(MallocProlog));
+   sizeOfElem =(size_t)((uintptr_t)elemToFree->nextHigherInMem-(uintptr_t)elemToFree);
+
+   if( elemToFree->prevChunkInFreeList != NULL )
+    { printf( "error: freeing same element twice!" ); exit(1);
+    }
+
+   _VMSMasterEnv->amtOfOutstandingMem -= sizeOfElem;
+
+   nextLowerElem  = elemToFree->nextLowerInMem;
+   nextHigherElem = elemToFree->nextHigherInMem;
+
+      //top chunk's nextHigherInMem is the heap's end addr, not a chunk
+   if( nextHigherElem == NULL ||
+       nextHigherElem == _VMSMasterEnv->freeListHead->nextHigherInMem )
+      higherExistsAndIsFree = FALSE;
+   else //okay exists, now check if in the free-list by checking back ptr
+      higherExistsAndIsFree = (nextHigherElem->prevChunkInFreeList != NULL);
+
+   if( nextLowerElem == NULL )
+      lowerExistsAndIsFree = FALSE;
+   else //okay, it exists, now check if it's free
+      lowerExistsAndIsFree = (nextLowerElem->prevChunkInFreeList != NULL);
+      //now, know what exists and what's free
+   if( lowerExistsAndIsFree )
+    { if( higherExistsAndIsFree )
+       {    //both exist and are free, so coalesce all three
+            //First, remove higher from free-list
+         nextHigherElem->prevChunkInFreeList->nextChunkInFreeList =
+                                         nextHigherElem->nextChunkInFreeList;
+         if( nextHigherElem->nextChunkInFreeList != NULL ) //end-of-list?
+            nextHigherElem->nextChunkInFreeList->prevChunkInFreeList =
+                                         nextHigherElem->prevChunkInFreeList;
+            //Now, fix-up sequence-in-mem list -- by side-effect, this also
+            // changes size of the lower elem, which is still in free-list
+         nextLowerElem->nextHigherInMem = nextHigherElem->nextHigherInMem;
+         if( nextHigherElem->nextHigherInMem !=
+             _VMSMasterEnv->freeListHead->nextHigherInMem )
+            nextHigherElem->nextHigherInMem->nextLowerInMem = nextLowerElem;
+            //notice didn't do anything to elemToFree -- it simply is no
+            // longer reachable from any of the lists.  Wonder if could be a
+            // security leak because left valid addresses in it,
+            // but don't care for now.
+       }
+      else
+       {    //lower is the only of the two that exists and is free,
+            //In this case, no adjustment to free-list, just change mem-list.
+            // By side-effect, changes size of the lower elem
+         nextLowerElem->nextHigherInMem = elemToFree->nextHigherInMem;
+         if( elemToFree->nextHigherInMem !=
+             _VMSMasterEnv->freeListHead->nextHigherInMem )
+            elemToFree->nextHigherInMem->nextLowerInMem = nextLowerElem;
+       }
+    }
+   else
+    {    //lower either doesn't exist or isn't free, so check higher
+      if( higherExistsAndIsFree )
+       {    //higher exists and is the only of the two free
+            //First, in free-list, replace higher elem with the one to free
+         elemToFree->nextChunkInFreeList=nextHigherElem->nextChunkInFreeList;
+         elemToFree->prevChunkInFreeList=nextHigherElem->prevChunkInFreeList;
+         elemToFree->prevChunkInFreeList->nextChunkInFreeList = elemToFree;
+         if( elemToFree->nextChunkInFreeList != NULL ) // end-of-list?
+            elemToFree->nextChunkInFreeList->prevChunkInFreeList =elemToFree;
+            //Now chg mem-list. By side-effect, changes size of elemToFree
+         elemToFree->nextHigherInMem = nextHigherElem->nextHigherInMem;
+         if( elemToFree->nextHigherInMem !=
+             _VMSMasterEnv->freeListHead->nextHigherInMem )
+            elemToFree->nextHigherInMem->nextLowerInMem = elemToFree;
+       }
+      else
+       {    //neither lower nor higher is availabe to coalesce so add to list
+            // this makes prev chunk ptr non-null, which indicates it's free
+         elemToFree->nextChunkInFreeList =
+                            _VMSMasterEnv->freeListHead->nextChunkInFreeList;
+         _VMSMasterEnv->freeListHead->nextChunkInFreeList = elemToFree;
+         if( elemToFree->nextChunkInFreeList != NULL ) // end-of-list?
+            elemToFree->nextChunkInFreeList->prevChunkInFreeList =elemToFree;
+         elemToFree->prevChunkInFreeList = _VMSMasterEnv->freeListHead;
+       }
+    }
+   //============================= MEASUREMENT STUFF ========================
+   #ifdef MEAS__TIME_MALLOC
+   saveLowTimeStampCountInto( endStamp );
+   addIntervalToHist( startStamp, endStamp, _VMSMasterEnv->freeTimeHist );
+   #endif
+   //========================================================================
+
+ }
+
+
+/*Allocates memory from the external system -- higher overhead
+ *
+ *As the code stands, this simply calls the C library's malloc and returns
+ * its result.
+ *The intended design is different:  because of Linux's malloc throwing
+ * bizarre random faults when malloc is used inside a VMS virtual processor,
+ * the allocation is to be passed as a request that the core loop carries
+ * out when it gets around to it -- the core loop would look for such
+ * leftover chores the next time it goes to animate the masterVP, so an
+ * external malloc or external free would take two separate masterVP
+ * animations, separated by work, to complete.
+ *Thinking core loop accepts signals -- just looks if signal-location is
+ * empty or not -- the sketch of that is kept, commented out, in the body.
+ */
+void *
+VMS__malloc_in_ext( size_t sizeRequested )
+ { void *fromOS;
+ /*
+      //This is running in the master, so no chance for multiple cores to be
+      // competing for the core's flag.
+   if(  *(_VMSMasterEnv->coreLoopSignalAddr[ 0 ]) != 0 )
+    {    //something has already signalled to core loop, so save the signal
+         // and look, next time master animated, to see if can send it.
+         //Note, the addr to put a signal is in the coreloop's frame, so just
+         // checks it each time through -- make it volatile to avoid GCC
+         // optimizations -- it's a coreloop local var that only changes
+         // after jumping away.  The signal includes the addr to send the
+         //return to -- even if just empty return completion-signal
+         //
+         //save the signal in some queue that the master looks at each time
+         // it starts up -- one loc says if empty for fast common case --
+         //something like that -- want to hide this inside this call -- but
+         // think this has to come as a request -- req handler gives procr
+         // back to master loop, which gives it back to req handler at point
+         // it sees that core loop has sent return signal.  Something like
+         // that.
+      saveTheSignal
+    }
+  coreSigData->type = malloc;
+  coreSigData->sizeToMalloc = sizeRequested;
+  coreSigData->locToSignalCompletion = &figureOut;
+   _VMSMasterEnv->coreLoopSignals[ 0 ] = coreSigData;
+  */
+      //just risk system-stack faults until get this figured out
+   fromOS = malloc( sizeRequested );
+   return fromOS;
+ }
+
+
+/*Frees memory that was allocated in the external system -- higher overhead
+ *As the code stands, this simply calls the C library's free.
+ *As noted in external malloc comment, this is clunky 'cause the free has
+ * to be called in the core loop.
+ */
+void
+VMS__free_in_ext( void *ptrToFree )
+ {
+      //just risk system-stack faults until get this figured out
+   free( ptrToFree );
+
+      //TODO: fix this -- so the free is done in the core loop, as noted above
+ }
+
+
+/*Designed to be called from the main thread outside of VMS, during init
+ *Mallocs the heap chunk and a separate list head; returns the head. */
+MallocProlog *
+VMS_ext__create_free_list()
+ { MallocProlog *freeListHead, *firstChunk;
+   char         *heapBytes;  //byte-wise view of firstChunk, for touching
+   size_t        offset;
+      //Note, this is running in the main thread -- all increases in malloc
+      // mem and all frees of it must be done in this thread, with the
+      // thread's original stack available
+   freeListHead = malloc( sizeof(MallocProlog) );
+   firstChunk   = malloc( MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE );
+   if( freeListHead == NULL || firstChunk == NULL ) {printf("malloc error\n"); exit(1);}
+
+      //Touch one byte in each page now, to avoid page faults later
+   heapBytes = (char*)firstChunk;
+   for( offset = 0; offset < MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE; offset += PAGE_SIZE )
+   {
+       heapBytes[ offset ] = 0;
+   }
+
+   freeListHead->prevChunkInFreeList = NULL;
+      //Use this addr to free the heap when cleanup
+   freeListHead->nextLowerInMem      = firstChunk;
+      //to identify top-of-heap elem, compare this addr to elem's next higher
+   freeListHead->nextHigherInMem     = (void*)( (uintptr_t)firstChunk +
+                                         MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE);
+   freeListHead->nextChunkInFreeList = firstChunk;
+
+   firstChunk->nextChunkInFreeList   = NULL;
+   firstChunk->prevChunkInFreeList   = freeListHead;
+      //next Higher has to be set to top of chunk, so can calc size in malloc
+   firstChunk->nextHigherInMem       = (void*)( (uintptr_t)firstChunk +
+                                         MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE);
+   firstChunk->nextLowerInMem        = NULL; //identifies as bott of heap
+   
+   _VMSMasterEnv->amtOfOutstandingMem = 0; //none allocated yet
+
+   return freeListHead;
+ }
+
+
+/*Designed to be called from the main thread outside of VMS, during cleanup
+ */
+void
+VMS_ext__free_free_list( MallocProlog *freeListHead )
+ {    
+      //stashed a ptr to the one and only big chunk malloc'd from OS in the
+      // free list head's next lower in mem pointer
+   free( freeListHead->nextLowerInMem );
+
+   //don't free the head -- it'll be in an array eventually -- free whole
+   // array when all the free lists linked from it have already been freed
+ }
+
diff -r bc4cb994f114 -r eaf7e4c58c9e vmalloc.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vmalloc.h	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,90 @@
+/*
+ *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ * Created on November 14, 2009, 9:07 PM
+ */
+
+#ifndef _VMALLOC_H
+#define	_VMALLOC_H
+
+#include <malloc.h>
+#include <inttypes.h>
+#include "VMS_primitive_data_types.h"
+
+typedef struct _MallocProlog MallocProlog;
+   //Header at the start of a heap chunk; also used stand-alone as list head
+struct _MallocProlog
+ {
+   MallocProlog *nextChunkInFreeList; //NULL when no further chunk is in the free list
+   MallocProlog *prevChunkInFreeList; //first chunk points back at the head; NULL in the head
+   MallocProlog *nextHigherInMem;     //top-of-chunk addr, so malloc can calc the chunk size
+   MallocProlog *nextLowerInMem;      //NULL marks bottom of heap; head keeps the OS chunk addr here
+ };
+//MallocProlog
+
+typedef struct   //NOTE(review): visible free-list code uses a MallocProlog as head, not this -- confirm still needed
+ {
+   MallocProlog *firstChunkInFreeList;
+   int32         numInList; //TODO not used
+ }
+FreeListHead;
+
+void *   //NOTE(review): presumably serves requests from the VMS free list -- confirm in vmalloc.c
+VMS_int__malloc( size_t sizeRequested );
+
+void *   //NOTE(review): the alignment granted is not visible from this header -- confirm in vmalloc.c
+VMS_int__malloc_aligned( size_t sizeRequested );
+
+void     //NOTE(review): presumably the release call for ptrs from the two mallocs above -- confirm
+VMS_int__free( void *ptrToFree );
+
+#define VMS_PI__malloc VMS_int__malloc
+#define VMS_PI__malloc_aligned VMS_int__malloc_aligned
+#define VMS_PI__free VMS_int__free
+/* For now the PI is protected by the master lock, so aliasing PI to the int versions is fine; separate prototypes kept disabled:
+void *
+VMS_PI__malloc( size_t sizeRequested );
+
+void *
+VMS_PI__malloc_aligned( size_t sizeRequested );
+
+void
+VMS_PI__free( void *ptrToFree );
+*/
+
+//TODO: protect WL malloc from concurrency!! The WL names alias the int versions, so the shared free list can be corrupted
+#define VMS_WL__malloc VMS_int__malloc
+#define VMS_WL__malloc_aligned VMS_int__malloc_aligned
+#define VMS_WL__free VMS_int__free
+/* Separate WL prototypes, disabled while the macros above alias WL to int:
+void *
+VMS_WL__malloc( size_t sizeRequested );
+
+void *
+VMS_WL__malloc_aligned( size_t sizeRequested );
+
+void
+VMS_WL__free( void *ptrToFree );
+*/
+
+/*Allocates memory from the external system -- higher overhead
+ *NOTE(review): presumably must be released with VMS__free_in_ext -- confirm */
+void *
+VMS__malloc_in_ext( size_t sizeRequested );
+
+/*Frees memory that was allocated in the external system -- higher overhead
+ *NOTE(review): presumably only for ptrs from VMS__malloc_in_ext -- confirm */
+void
+VMS__free_in_ext( void *ptrToFree );
+
+
+MallocProlog *   //returns the head of a newly built free list; takes no arguments
+VMS_ext__create_free_list( void );
+
+void   //cleanup: frees the OS chunk recorded in the head; the head itself is not freed
+VMS_ext__free_free_list( MallocProlog *freeListHead );
+
+#endif
\ No newline at end of file
diff -r bc4cb994f114 -r eaf7e4c58c9e vutilities.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vutilities.c	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,25 @@
+/*
+ *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ * Created on November 14, 2009, 9:07 PM
+ */
+
+#include <malloc.h>
+#include <stdlib.h>
+
+#include "VMS.h"
+
+
+/*Returns a copy of str held in memory from VMS_int__malloc, or NULL when
+ * str is NULL or the allocation yields NULL. */
+inline char *
+VMS_int__strDup( char *str )
+ { char *retStr;
+   if( str == NULL ) return NULL; //test first -- strlen(NULL) is undefined
+   retStr = VMS_int__malloc( strlen(str) + 1 ); //+1 for the terminating '\0'
+   if( retStr != NULL ) strcpy( retStr, str );
+   return retStr;
+ }
diff -r bc4cb994f114 -r eaf7e4c58c9e vutilities.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vutilities.h	Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,20 @@
+/*
+ *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ * Created on November 14, 2009, 9:07 PM
+ */
+
+
+#ifndef  _UTILITIES_H
+#define	_UTILITIES_H
+
+#include <string.h>
+#include "VMS_primitive_data_types.h"
+
+inline char *   //copy of str, placed in memory obtained from VMS_int__malloc
+VMS_int__strDup( char *str );
+ 
+#endif