changeset 61:984f7d78bfdf measure_brch

Merge See what happens -- merged test stuff into Nov 8 VMS version
author SeanHalle
date Thu, 11 Nov 2010 06:19:51 -0800
parents 4fbc2165e493 7b799a46cc87
children
files CoreLoop.c DESIGN_NOTES__VMS.txt MasterLoop.c VMS.c VMS.h
diffstat 11 files changed, 1821 insertions(+), 478 deletions(-) [+]
line diff
     1.1 --- a/CoreLoop.c	Tue Oct 26 18:31:34 2010 -0700
     1.2 +++ b/CoreLoop.c	Thu Nov 11 06:19:51 2010 -0800
     1.3 @@ -41,10 +41,32 @@
     1.4     VMSQueueStruc *readyToAnimateQ;
     1.5     unsigned long   coreMask;  //has 1 in bit positions of allowed cores
     1.6     int             errorCode;
     1.7 -   
     1.8 +
     1.9 +      //work-stealing struc on stack to prevent false-sharing in cache-line
    1.10 +   volatile GateStruc gate;
    1.11 +   //preGateProgress, waitProgress, exitProgress, gateClosed;
    1.12 +
    1.13 +
    1.14     coreLoopThdParams = (ThdParams *)paramsIn;
    1.15     thisCoresIdx = coreLoopThdParams->coreNum;
    1.16  
    1.17 +   gate.gateClosed      = FALSE;
    1.18 +   gate.preGateProgress = 0;
    1.19 +   gate.waitProgress    = 0;
    1.20 +   gate.exitProgress    = 0;
    1.21 +   _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = &gate;//race @startup
    1.22 +
    1.23 +      //wait until signalled that setup is complete
    1.24 +   pthread_mutex_lock(   &suspendLock );
    1.25 +   while( !(_VMSMasterEnv->setupComplete) )
    1.26 +    {
    1.27 +      pthread_cond_wait( &suspend_cond,
    1.28 +                         &suspendLock );
    1.29 +    }
    1.30 +   pthread_mutex_unlock( &suspendLock );
    1.31 +
    1.32 +      //printf( "\nCore unsuspended: %d\n", coreLoopThdParams->coreNum );
    1.33 +
    1.34        //set thread affinity
    1.35        //Linux requires pinning thd to core inside thread-function
    1.36        //Designate a core by a 1 in bit-position corresponding to the core
    1.37 @@ -53,25 +75,9 @@
    1.38     pthread_t selfThd = pthread_self();
    1.39     errorCode =
    1.40     pthread_setaffinity_np( selfThd, sizeof(coreMask), &coreMask);
    1.41 -
    1.42 +   
    1.43     if(errorCode){ printf("\nset affinity failure\n"); exit(0); }
    1.44  
    1.45 -      //measure offsets between TSCs
    1.46 -      //Core 0 is the reference core, the rest react to it.
    1.47 -   if( thisCoresIdx == 0 ) measureTSCOffsetsAsCore0();
    1.48 -   else measureTSCOffsetsAsRemoteCore( thisCoresIdx );
    1.49 -   
    1.50 -      //wait until signalled that setup is complete
    1.51 -   pthread_mutex_lock(   &suspendLock );
    1.52 -   while( !(_VMSMasterEnv->setupComplete) )
    1.53 -    { pthread_cond_wait( &suspend_cond, &suspendLock );
    1.54 -    }
    1.55 -   pthread_mutex_unlock( &suspendLock );
    1.56 -
    1.57 -
    1.58 -      //printf( "\nCore unsuspended: %d\n", coreLoopThdParams->coreNum );
    1.59 -
    1.60 -
    1.61     
    1.62        //Save addr of "end core loop" label - jump to it to shut down coreloop
    1.63        //To get label addr in non-gcc compiler, can trick it by making a call
    1.64 @@ -88,82 +94,64 @@
    1.65     
    1.66        // Get to work!  --  virt procr jumps back here when suspends
    1.67        //Note, have to restore the frame-pointer before jump to here, to get
    1.68 -      // this code to work right (readyToAnimateQ and so forth are frame-ptr
    1.69 -      // relative)
    1.70 +      // this code to work right (readyToAnimateQ and so forth are frame-ptr relative)
    1.71  CoreLoopStartPt:
    1.72     
    1.73        //Get virtual processor from queue
    1.74 -      //_VMSWorkQ must be a global, static volatile var, so not kept in reg,
    1.75 +      //The Q must be a global, static volatile var, so not kept in reg,
    1.76        // which forces reloading the pointer after each jmp to this point
    1.77     readyToAnimateQ  = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx];
    1.78  
    1.79 -   currPr = (VirtProcr *) readSRSWQ_NonBlocking( readyToAnimateQ );
    1.80 +   #ifdef USE_WORK_STEALING
    1.81 +      //Alg for work-stealing designed to make common case fast.  Comment
    1.82 +      // in stealer code explains.
    1.83 +   gate.preGateProgress++;
    1.84 +   if( gate.gateClosed )
    1.85 +    {    //now, set coreloop's progress, so stealer can see that core loop
    1.86 +         // has made it into the waiting area.
    1.87 +      gate.waitProgress = gate.preGateProgress;
    1.88 +      while( gate.gateClosed ) /*busy wait*/;
    1.89 +    }
    1.90 +
    1.91 +   currPr = (VirtProcr *) readVMSQ( readyToAnimateQ );
    1.92 +
    1.93 +      //Set the coreloop's progress, so stealer can see it has made it out
    1.94 +      // of the protected area
    1.95 +   gate.exitProgress = gate.preGateProgress;
    1.96 +   #else
    1.97 +   currPr = (VirtProcr *) readVMSQ( readyToAnimateQ );
    1.98 +   #endif
    1.99 +
   1.100 +   if( currPr != NULL ) _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0;
   1.101 +
   1.102     int tries = 0; int gotLock = 0;
   1.103 -   while( currPr == NULL )
   1.104 -    {    //no VPs ready to animate, so run MasterVP --later make "try Master"
   1.105 -         // VPs & put one in every queue at strategic point -- so have work
   1.106 -         // avail if don't get lock & short-circuit out of it if master has
   1.107 -         // recently run on another core
   1.108 -         //TODO: perf -- "try Master" VP that checks if should run Master Fn
   1.109 -         //But just letting queue run empty is quickest to see if pinning VP
   1.110 -         // to core will solve the bizarre random seg-faults in system stack.
   1.111 -
   1.112 -         //check if get the MasterLock
   1.113 +   while( currPr == NULL ) //if queue was empty, enter get masterLock loop
   1.114 +    {    //queue was empty, so get master lock
   1.115        gotLock = __sync_bool_compare_and_swap( &(_VMSMasterEnv->masterLock), \
   1.116 -                                                 UNLOCKED, LOCKED );
   1.117 -
   1.118 +                                                          UNLOCKED, LOCKED );
   1.119        if( gotLock )
   1.120 -       {
   1.121 -            //run own MasterVP -- when its done, unlocks MasterLock and
   1.122 -            // jumps back to coreLoops's startPt
   1.123 +       {    //run own MasterVP -- jmps to coreLoops startPt when done
   1.124           currPr = _VMSMasterEnv->masterVPs[thisCoresIdx];
   1.125 -         addToHist( tries, _VMSMasterEnv->stats->masterLockHist );
   1.126 +         if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 )
   1.127 +          {       DEBUG( dbgB2BMaster,"Many back to back MasterVPs\n");
   1.128 +            pthread_yield();
   1.129 +          }
   1.130 +         _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1;
   1.131           break;  //end while -- have a VP to animate now
   1.132         }
   1.133        
   1.134 -      tries++;
   1.135 -      
   1.136 -      if( tries % READYTOANIMATE_RETRIES == 0 ) pthread_yield();
   1.137 +      tries++;      //if too many, means master on other core taking too long
   1.138 +      if( tries > MASTERLOCK_RETRIES ) { tries = 0; pthread_yield(); }
   1.139      }
   1.140     
   1.141 -      //switch to virt procr's stack and frame ptr then jump to virt procr fn
   1.142 -   void *stackPtr, *framePtr, *jmpPt, *coreLoopFramePtrAddr, \
   1.143 -        *coreLoopStackPtrAddr;
   1.144 -   
   1.145 -   stackPtr = currPr->stackPtr;
   1.146 -   framePtr = currPr->framePtr;
   1.147 -   jmpPt    = currPr->nextInstrPt;
   1.148 -   coreLoopFramePtrAddr = &(currPr->coreLoopFramePtr);
   1.149 -   coreLoopStackPtrAddr = &(currPr->coreLoopStackPtr);
   1.150  
   1.151 -      //Save the core loop's stack and frame pointers into virt procr struct
   1.152 -      // then switch to stack ptr and frame ptr of virt procr & jmp to it
   1.153 -      //This was a pain to get right because GCC converts the "(jmpPt)" to
   1.154 -      // frame-relative mem-op -- so generated machine code first changed the
   1.155 -      // frame pointer, then tried to jump to an addr stored on stack, which
   1.156 -      // it accessed as an offset from frame-ptr!  (wrong frame-ptr now)
   1.157 -      //Explicitly loading into eax before changing frame-ptr fixed it
   1.158 -      //Also, it turns "(currPr->coreLoopFramePtr)" into a temporary on the
   1.159 -      // stack, so "movl %%ebp, %0" saves to the temp, NOT the data-struc!
   1.160 -   asm volatile("movl %0, %%eax;      \
   1.161 -                 movl %%esp, (%%eax); \
   1.162 -                 movl %1, %%eax;      \
   1.163 -                 movl %%ebp, (%%eax); \
   1.164 -                 movl %2, %%eax;      \
   1.165 -                 movl %3, %%esp;      \
   1.166 -                 movl %4, %%ebp;      \
   1.167 -                 jmp  %%eax"          \
   1.168 -   /* outputs */ : "=g"(coreLoopStackPtrAddr),                 \
   1.169 -                   "=g"(coreLoopFramePtrAddr)                  \
   1.170 -   /* inputs  */ : "g" (jmpPt), "g" (stackPtr), "g" (framePtr) \
   1.171 -   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
   1.172 -                );
   1.173 +   SwitchToVP( currPr )
   1.174  
   1.175     //=========== jmp to here when want to shut down the VMS system ==========
   1.176     CoreLoopEndPt:
   1.177        //first free shutdown VP that jumped here -- it first restores the
   1.178        // coreloop's stack, so addr of currPr in stack frame is still correct
   1.179 -   VMS__handle_dissipate_reqst( currPr );
   1.180 +   VMS__dissipate_procr( currPr );
   1.181     pthread_exit( NULL );
   1.182   }
   1.183  
   1.184 @@ -195,62 +183,33 @@
   1.185     _VMSMasterEnv->coreLoopStartPt = &&SeqCoreLoopStartPt;
   1.186     _VMSMasterEnv->coreLoopEndPt   = &&SeqCoreLoopEndPt;
   1.187  
   1.188 -      //Core loop has no values live upon CoreLoopStartPt except
   1.189 -      // readyToAnimateQ
   1.190 +      //Core loop has no values live upon CoreLoopStartPt except readyToAnimateQ
   1.191        // every value in the code is defined by a statement in core loop,
   1.192        // after the start point -- with the one exception of _VMSWorkQ
   1.193  
   1.194  
   1.195        // Get to work!  --  virt procr jumps back here when done or suspends
   1.196        //Note, have to restore the frame-pointer before jump to here, to get
   1.197 -      // this code to work right (readyToAnimateQ and so forth are frame-ptr
   1.198 -      // relative)
   1.199 +      // this code to work right (readyToAnimateQ and so forth are frame-ptr relative)
   1.200  SeqCoreLoopStartPt:
   1.201  
   1.202        //Get virtual processor from queue
   1.203        //_VMSWorkQ must be a global, static volatile var, so not kept in reg,
   1.204        // which forces reloading the pointer after each jmp to this point
   1.205     readyToAnimateQ  = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx];
   1.206 -   currPr = (VirtProcr *) readSRSWQ_NonBlocking( readyToAnimateQ );
   1.207 +   currPr = (VirtProcr *) readVMSQ( readyToAnimateQ );
   1.208     if( currPr == NULL )
   1.209 +    { if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 )
   1.210 +       { printf("too many back to back MasterVP\n"); exit(1); }
   1.211 +      _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1;
   1.212 +      
   1.213        currPr = _VMSMasterEnv->masterVPs[thisCoresIdx];
   1.214 -   
   1.215 +    }
   1.216 +   else
   1.217 +      _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0;
   1.218  
   1.219 -//   printf("core %d loop procr addr: %d\n", coreLoopThdParams->coreNum, \
   1.220 -//       (int)currPr ); fflush(stdin);
   1.221  
   1.222 -      //switch to virt procr's stack and frame ptr then jump to virt procr
   1.223 -   void *stackPtr, *framePtr, *jmpPt, *coreLoopFramePtrAddr, \
   1.224 -        *coreLoopStackPtrAddr;
   1.225 -
   1.226 -   stackPtr = currPr->stackPtr;
   1.227 -   framePtr = currPr->framePtr;
   1.228 -   jmpPt    = currPr->nextInstrPt;
   1.229 -   coreLoopFramePtrAddr = &(currPr->coreLoopFramePtr);
   1.230 -   coreLoopStackPtrAddr = &(currPr->coreLoopStackPtr);
   1.231 -
   1.232 -      //Save the core loop's stack and frame pointers into virt procr struct
   1.233 -      // then switch to stack ptr and frame ptr of virt procr & jmp to it
   1.234 -      //This was a pain to get right because GCC converts the "(jmpPt)" to
   1.235 -      // frame-relative mem-op -- so generated machine code first changed the
   1.236 -      // frame pointer, then tried to jump to an addr stored on stack, which
   1.237 -      // it accessed as an offset from frame-ptr!  (wrong frame-ptr now)
   1.238 -      //Explicitly loading into eax before changing frame-ptr fixed it
   1.239 -      //Also, it turns "(currPr->coreLoopFramePtr)" into a temporary on the
   1.240 -      // stack, so "movl %%ebp, %0" saves to the temp, NOT the data-struc!
   1.241 -   asm volatile("movl %0, %%eax;      \
   1.242 -                 movl %%esp, (%%eax); \
   1.243 -                 movl %1, %%eax;      \
   1.244 -                 movl %%ebp, (%%eax); \
   1.245 -                 movl %2, %%eax;      \
   1.246 -                 movl %3, %%esp;      \
   1.247 -                 movl %4, %%ebp;      \
   1.248 -                 jmp  %%eax"          \
   1.249 -   /* outputs */ : "=g"(coreLoopStackPtrAddr),                 \
   1.250 -                   "=g"(coreLoopFramePtrAddr)                  \
   1.251 -   /* inputs  */ : "g" (jmpPt), "g" (stackPtr), "g" (framePtr) \
   1.252 -   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
   1.253 -                );
   1.254 +   SwitchToVP( currPr )
   1.255  
   1.256     //========================================================================
   1.257        //jmp to here when want to shut down the VMS system.  A shutdown VP is
   1.258 @@ -260,7 +219,7 @@
   1.259        // all the threads to die will proceed, gather the result, and
   1.260        // return to the calling application.
   1.261  SeqCoreLoopEndPt:
   1.262 -   VMS__handle_dissipate_reqst( currPr ); //free shutdown pr, that jmpd here
   1.263 +   VMS__dissipate_procr( currPr ); //free shutdown pr, that jmpd here
   1.264     return;
   1.265   }
   1.266  
   1.267 @@ -380,6 +339,3 @@
   1.268   }
   1.269  
   1.270  
   1.271 -
   1.272 -
   1.273 -
     2.1 --- a/DESIGN_NOTES__VMS.txt	Tue Oct 26 18:31:34 2010 -0700
     2.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.3 @@ -1,2 +0,0 @@
     2.4 -
     2.5 -Implement VMS this way:
     3.1 --- a/MasterLoop.c	Tue Oct 26 18:31:34 2010 -0700
     3.2 +++ b/MasterLoop.c	Thu Nov 11 06:19:51 2010 -0800
     3.3 @@ -7,12 +7,19 @@
     3.4  
     3.5  
     3.6  #include <stdio.h>
     3.7 -#include <malloc.h>
     3.8  #include <stddef.h>
     3.9  
    3.10  #include "VMS.h"
    3.11  
    3.12  
    3.13 +//===========================================================================
    3.14 +void inline
    3.15 +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
    3.16 +               VirtProcr *masterPr );
    3.17 +
    3.18 +//===========================================================================
    3.19 +
    3.20 +
    3.21  
    3.22  /*This code is animated by the virtual Master processor.
    3.23   *
    3.24 @@ -65,7 +72,7 @@
    3.25   */
    3.26  void masterLoop( void *initData, VirtProcr *animatingPr )
    3.27   { 
    3.28 -   int             slotIdx;
    3.29 +   int32           slotIdx, numSlotsFilled;
    3.30     VirtProcr      *schedVirtPr;
    3.31     SchedSlot      *currSlot, **schedSlots;
    3.32     MasterEnv      *masterEnv;
    3.33 @@ -75,7 +82,7 @@
    3.34     RequestHandler  requestHandler;
    3.35     void           *semanticEnv;
    3.36  
    3.37 -   int             thisCoresIdx;
    3.38 +   int32           thisCoresIdx;
    3.39     VirtProcr      *masterPr;
    3.40     volatile        VirtProcr *volatileMasterPr;
    3.41     
    3.42 @@ -110,7 +117,7 @@
    3.43  
    3.44     masterEnv        = _VMSMasterEnv;
    3.45     
    3.46 -//TODO: check that compiles so that always re-define from frame-storage
    3.47 +      //GCC may optimize so doesn't always re-define from frame-storage
    3.48     masterPr         = volatileMasterPr;  //just to make sure after jmp
    3.49     thisCoresIdx     = masterPr->coreAnimatedBy;
    3.50     readyToAnimateQ  = masterEnv->readyToAnimateQs[thisCoresIdx];
    3.51 @@ -122,6 +129,7 @@
    3.52  
    3.53  
    3.54        //Poll each slot's Done flag
    3.55 +   numSlotsFilled = 0;
    3.56     for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++)
    3.57      {
    3.58        currSlot = schedSlots[ slotIdx ];
    3.59 @@ -143,27 +151,21 @@
    3.60            { currSlot->procrAssignedToSlot = schedVirtPr;
    3.61              schedVirtPr->schedSlot        = currSlot;
    3.62              currSlot->needsProcrAssigned  = FALSE;
    3.63 -
    3.64 -            writeSRSWQ( schedVirtPr, readyToAnimateQ );
    3.65 +            numSlotsFilled               += 1;
    3.66 +            
    3.67 +            writeVMSQ( schedVirtPr, readyToAnimateQ );
    3.68            }
    3.69         }
    3.70      }
    3.71  
    3.72 +   
    3.73 +   #ifdef USE_WORK_STEALING
    3.74 +      //If no slots filled, means no more work, look for work to steal.
    3.75 +   if( numSlotsFilled == 0 )
    3.76 +    { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterPr );
    3.77 +    }
    3.78 +   #endif
    3.79  
    3.80 -      //Save stack ptr and frame, restore CoreLoop's stack and frame,
    3.81 -      // and clear the MasterLock
    3.82 -      //TODO: cafefully verify don't need to force saving anything to stack
    3.83 -      // before jumping back to core loop.
    3.84 -   void           *stackPtrAddr, *framePtrAddr, *masterLockAddr;
    3.85 -   void           *jmpPt, *coreLoopFramePtr, *coreLoopStackPtr;
    3.86 -
    3.87 -   stackPtrAddr      = &(masterPr->stackPtr);
    3.88 -   framePtrAddr      = &(masterPr->framePtr);
    3.89 -   masterLockAddr    = &(_VMSMasterEnv->masterLock);
    3.90 -
    3.91 -   jmpPt             = _VMSMasterEnv->coreLoopStartPt;
    3.92 -   coreLoopFramePtr  = masterPr->coreLoopFramePtr;//need this only
    3.93 -   coreLoopStackPtr  = masterPr->coreLoopStackPtr;//shouldn't need -- safety
    3.94     
    3.95     //============================= MEASUREMENT STUFF ========================
    3.96     #ifdef MEAS__TIME_MASTER
    3.97 @@ -172,21 +174,183 @@
    3.98     #endif
    3.99     //========================================================================
   3.100  
   3.101 -   asm volatile("movl %0,     %%eax;  \
   3.102 -                 movl %%esp, (%%eax); \
   3.103 -                 movl %1,     %%eax;  \
   3.104 -                 movl %%ebp, (%%eax); \
   3.105 -                 movl %2, %%ebx;      \
   3.106 -                 movl %3, %%eax;      \
   3.107 -                 movl %4, %%esp;      \
   3.108 -                 movl %5, %%ebp;      \
   3.109 -                 movl $0x0, (%%ebx);  \
   3.110 -                 jmp  %%eax;"         \
   3.111 -   /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr),                \
   3.112 -                   "=g"(masterLockAddr)                                     \
   3.113 -   /* inputs  */ : "g" (jmpPt), "g"(coreLoopStackPtr), "g"(coreLoopFramePtr)\
   3.114 -   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
   3.115 -                );//can probably make clobber list empty -- but safe for now
   3.116 +   
   3.117 +   masterSwitchToCoreLoop( masterPr )
   3.118   }
   3.119  
   3.120  
   3.121 +
   3.122 +/*This has a race condition -- the coreloops are accessing their own queues
    3.123 + * at the same time that this work-stealer on a different core is trying to read from them.
   3.124 + */
   3.125 +void inline
   3.126 +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
   3.127 +               VirtProcr *masterPr )
   3.128 + { 
   3.129 +   VirtProcr   *stolenPr;
   3.130 +   int32        coreIdx, i;
   3.131 +   VMSQueueStruc *currQ;
   3.132 +
   3.133 +   stolenPr = NULL;
   3.134 +   coreIdx = masterPr->coreAnimatedBy;
   3.135 +   for( i = 0; i < NUM_CORES -1; i++ )
   3.136 +    {
   3.137 +      if( coreIdx >= NUM_CORES -1 )
   3.138 +       { coreIdx = 0;
   3.139 +       }
   3.140 +      else
   3.141 +       { coreIdx++;
   3.142 +       }
   3.143 +      currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
   3.144 +      if( numInVMSQ( currQ ) > 0 )
   3.145 +       { stolenPr = readVMSQ (currQ );
   3.146 +         break;
   3.147 +       }
   3.148 +    }
   3.149 +
   3.150 +   if( stolenPr != NULL )
   3.151 +    { currSlot->procrAssignedToSlot = stolenPr;
   3.152 +      stolenPr->schedSlot           = currSlot;
   3.153 +      currSlot->needsProcrAssigned  = FALSE;
   3.154 +
   3.155 +      writeVMSQ( stolenPr, readyToAnimateQ );
   3.156 +    }
   3.157 + }
   3.158 +
   3.159 +/*This algorithm makes the common case fast.  Make the coreloop passive,
   3.160 + * and show its progress.  Make the stealer control a gate that coreloop
   3.161 + * has to pass.
   3.162 + *To avoid interference, only one stealer at a time.  Use a global
   3.163 + * stealer-lock.
   3.164 + *
   3.165 + *The pattern is based on a gate -- stealer shuts the gate, then monitors
   3.166 + * to be sure any already past make it all the way out, before starting.
   3.167 + *So, have a "progress" measure just before the gate, then have two after it,
   3.168 + * one is in a "waiting room" outside the gate, the other is at the exit.
   3.169 + *Then, the stealer first shuts the gate, then checks the progress measure
   3.170 + * outside it, then looks to see if the progress measure at the exit is the
   3.171 + * same.  If yes, it knows the protected area is empty 'cause no other way
   3.172 + * to get in and the last to get in also exited.
   3.173 + *If the progress measure at the exit is not the same, then the stealer goes
   3.174 + * into a loop checking both the waiting-area and the exit progress-measures
   3.175 + * until one of them shows the same as the measure outside the gate.  Might
   3.176 + * as well re-read the measure outside the gate each go around, just to be
   3.177 + * sure.  It is guaranteed that one of the two will eventually match the one
   3.178 + * outside the gate.
   3.179 + *
   3.180 + *Here's an informal proof of correctness:
   3.181 + *The gate can be closed at any point, and have only four cases:
   3.182 + *  1) coreloop made it past the gate-closing but not yet past the exit
   3.183 + *  2) coreloop made it past the pre-gate progress update but not yet past
   3.184 + *     the gate,
   3.185 + *  3) coreloop is right before the pre-gate update
   3.186 + *  4) coreloop is past the exit and far from the pre-gate update.
   3.187 + *
   3.188 + * Covering the cases in reverse order,
   3.189 + *  4) is not a problem -- stealer will read pre-gate progress, see that it
   3.190 + *     matches exit progress, and the gate is closed, so stealer can proceed.
   3.191 + *  3) stealer will read pre-gate progress just after coreloop updates it..
   3.192 + *     so stealer goes into a loop until the coreloop causes wait-progress
   3.193 + *     to match pre-gate progress, so then stealer can proceed
   3.194 + *  2) same as 3..
   3.195 + *  1) stealer reads pre-gate progress, sees that it's different than exit,
   3.196 + *     so goes into loop until exit matches pre-gate, now it knows coreloop
   3.197 + *     is not in protected and cannot get back in, so can proceed.
   3.198 + *
   3.199 + *Implementation for the stealer:
   3.200 + *
   3.201 + *First, acquire the stealer lock -- only cores with no work to do will
   3.202 + * compete to steal, so not a big performance penalty having only one --
   3.203 + * will rarely have multiple stealers in a system with plenty of work -- and
   3.204 + * in a system with little work, it doesn't matter.
   3.205 + *
   3.206 + *Note, have single-reader, single-writer pattern for all variables used to
   3.207 + * communicate between stealer and victims
   3.208 + *
   3.209 + *So, scan the queues of the core loops, until find non-empty.  Each core
   3.210 + * has its own list that it scans.  The list goes in order from closest to
   3.211 + * furthest core, so it steals first from close cores.  Later can add
   3.212 + * taking info from the app about overlapping footprints, and scan all the
   3.213 + * others then choose work with the most footprint overlap with the contents
   3.214 + * of this core's cache.
   3.215 + *
    3.216 + *Now, have a victim to take work from.  So, shut the gate in that
   3.217 + * coreloop, by setting the "gate closed" var on its stack to TRUE.
   3.218 + *Then, read the core's pre-gate progress and compare to the core's exit
   3.219 + * progress.
   3.220 + *If same, can proceed to take work from the coreloop's queue.  When done,
   3.221 + * write FALSE to gate closed var.
   3.222 + *If different, then enter a loop that reads the pre-gate progress, then
   3.223 + * compares to exit progress then to wait progress.  When one of two
   3.224 + * matches, proceed.  Take work from the coreloop's queue.  When done,
   3.225 + * write FALSE to the gate closed var.
   3.226 + * 
   3.227 + */
   3.228 +void inline
   3.229 +gateProtected_stealWorkInto( SchedSlot *currSlot,
   3.230 +                             VMSQueueStruc *myReadyToAnimateQ,
   3.231 +                             VirtProcr *masterPr )
   3.232 + {
   3.233 +   VirtProcr     *stolenPr;
   3.234 +   int32          coreIdx, i, haveAVictim, gotLock;
   3.235 +   VMSQueueStruc *victimsQ;
   3.236 +
   3.237 +   volatile GateStruc *vicGate;
   3.238 +   int32               coreMightBeInProtected;
   3.239 +
   3.240 +
   3.241 +
   3.242 +      //see if any other cores have work available to steal
   3.243 +   haveAVictim = FALSE;
   3.244 +   coreIdx = masterPr->coreAnimatedBy;
   3.245 +   for( i = 0; i < NUM_CORES -1; i++ )
   3.246 +    {
   3.247 +      if( coreIdx >= NUM_CORES -1 )
   3.248 +       { coreIdx = 0;
   3.249 +       }
   3.250 +      else
   3.251 +       { coreIdx++;
   3.252 +       }
   3.253 +      victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
   3.254 +      if( numInVMSQ( victimsQ ) > 0 )
   3.255 +       { haveAVictim = TRUE;
   3.256 +         vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ];
   3.257 +         break;
   3.258 +       }
   3.259 +    }
   3.260 +   if( !haveAVictim ) return;  //no work to steal, exit
   3.261 +
   3.262 +      //have a victim core, now get the stealer-lock
   3.263 +   gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock),
   3.264 +                                                          UNLOCKED, LOCKED );
   3.265 +   if( !gotLock ) return; //go back to core loop, which will re-start master
   3.266 +
   3.267 +
   3.268 +   //====== Start Gate-protection =======
   3.269 +   vicGate->gateClosed = TRUE;
   3.270 +   coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress;
   3.271 +   while( coreMightBeInProtected )
   3.272 +    {    //wait until sure
   3.273 +      if( vicGate->preGateProgress == vicGate->waitProgress )
   3.274 +         coreMightBeInProtected = FALSE;
   3.275 +      if( vicGate->preGateProgress == vicGate->exitProgress )
   3.276 +         coreMightBeInProtected = FALSE;
   3.277 +    }
   3.278 +
   3.279 +   stolenPr = readVMSQ ( victimsQ );
   3.280 +
   3.281 +   vicGate->gateClosed = FALSE;
   3.282 +   //======= End Gate-protection  =======
   3.283 +
   3.284 +
   3.285 +   if( stolenPr != NULL )  //victim could have been in protected and taken
   3.286 +    { currSlot->procrAssignedToSlot = stolenPr;
   3.287 +      stolenPr->schedSlot           = currSlot;
   3.288 +      currSlot->needsProcrAssigned  = FALSE;
   3.289 +
   3.290 +      writeVMSQ( stolenPr, myReadyToAnimateQ );
   3.291 +    }
   3.292 +
   3.293 +      //unlock the work stealing lock
   3.294 +   _VMSMasterEnv->workStealingLock = UNLOCKED;
   3.295 + }
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/SwitchAnimators.h	Thu Nov 11 06:19:51 2010 -0800
     4.3 @@ -0,0 +1,138 @@
     4.4 +/*
     4.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     4.6 + *  Licensed under GNU General Public License version 2
     4.7 + *
     4.8 + * Author: seanhalle@yahoo.com
     4.9 + * 
    4.10 + */
    4.11 +
    4.12 +#ifndef _SwitchAnimators_H
    4.13 +#define	_SwitchAnimators_H
    4.14 +#define __USE_GNU
    4.15 +
    4.16 +/*Isolating code for switching between animators within these macros -- at
    4.17 + * some point will make switches to compile for 32 bit or for 64 bit, which
    4.18 + * having these isolated will make cleaner
    4.19 + *
    4.20 + *This also makes it easier to change architectures, at some point
    4.21 + *And it cleans the code up, having the ugly assembly out of the way
    4.22 + */
    4.23 +
    4.24 +//=========================== MasterVP to CoreLoop ==========================
    4.25 +//
    4.26 +      //Save stack ptr and frame, restore CoreLoop's stack and frame,
    4.27 +      // and clear the MasterLock
     4.28 +      //GCC's -O3 messes with this -- go through generated assembly -- protect somehow
    4.29 +      //
    4.30 +#define masterSwitchToCoreLoop( masterPr )   \
    4.31 +   void           *stackPtrAddr, *framePtrAddr, *masterLockAddr; \
    4.32 +   void           *jmpPt, *coreLoopFramePtr, *coreLoopStackPtr;  \
    4.33 +\
    4.34 +   stackPtrAddr      = &(masterPr->stackPtr); \
    4.35 +   framePtrAddr      = &(masterPr->framePtr); \
    4.36 +   masterLockAddr    = &(_VMSMasterEnv->masterLock); \
    4.37 +\
    4.38 +   jmpPt             = _VMSMasterEnv->coreLoopStartPt; \
    4.39 +   coreLoopFramePtr  = masterPr->coreLoopFramePtr; \
    4.40 +   coreLoopStackPtr  = masterPr->coreLoopStackPtr; \
    4.41 +\
    4.42 +   asm volatile("movl %0,     %%eax;  \
    4.43 +                 movl %%esp, (%%eax); \
    4.44 +                 movl %1,     %%eax;  \
    4.45 +                 movl %%ebp, (%%eax); \
    4.46 +                 movl %2, %%ebx;      \
    4.47 +                 movl %3, %%eax;      \
    4.48 +                 movl %4, %%esp;      \
    4.49 +                 movl %5, %%ebp;      \
    4.50 +                 movl $0x0, (%%ebx);  \
    4.51 +                 jmp  %%eax;"         \
    4.52 +   /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr),                \
    4.53 +                   "=g"(masterLockAddr)                                     \
    4.54 +   /* inputs  */ : "g" (jmpPt), "g"(coreLoopStackPtr), "g"(coreLoopFramePtr)\
    4.55 +   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
    4.56 +                );//can probably make clobber list empty -- but safe for now
    4.57 +
    4.58 +
    4.59 +//=========================== SlaveVP to CoreLoop ===========================
    4.60 +//
    4.61 +
    4.62 +#define    SwitchToCoreLoop( animatingPr ) \
    4.63 +   void *jmpPt, *stackPtrAddr, *framePtrAddr, *coreLoopStackPtr; \
    4.64 +   void *coreLoopFramePtr; \
    4.65 +\
    4.66 +   stackPtrAddr      = &(animatingPr->stackPtr); \
    4.67 +   framePtrAddr      = &(animatingPr->framePtr); \
    4.68 +\
    4.69 +   jmpPt             = _VMSMasterEnv->coreLoopStartPt; \
    4.70 +   coreLoopFramePtr  = animatingPr->coreLoopFramePtr; \
    4.71 +   coreLoopStackPtr  = animatingPr->coreLoopStackPtr; \
    4.72 +\
    4.73 +      /*Save the virt procr's stack and frame ptrs*/ \
    4.74 +   asm volatile("movl %0,     %%eax;  \
    4.75 +                 movl %%esp, (%%eax); \
    4.76 +                 movl %1,     %%eax;  \
    4.77 +                 movl %%ebp, (%%eax) "\
    4.78 +   /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr) \
    4.79 +   /* inputs  */ :        \
    4.80 +   /* clobber */ : "%eax" \
    4.81 +                ); \
    4.82 +\
    4.83 +     /*restore coreloop's frame ptr, then jump back to "start" of core loop*/\
    4.84 +     /*Note, GCC compiles to assembly that saves esp and ebp in the stack*/ \
    4.85 +     /* frame -- so have to explicitly do assembly that saves to memory*/ \
    4.86 +   asm volatile("movl %0, %%eax;      \
    4.87 +                 movl %1, %%esp;      \
    4.88 +                 movl %2, %%ebp;      \
    4.89 +                 jmp  %%eax    "      \
    4.90 +   /* outputs */ :                    \
    4.91 +   /* inputs  */ : "m" (jmpPt), "m"(coreLoopStackPtr), "m"(coreLoopFramePtr)\
    4.92 +   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi","%esi"  \
    4.93 +                );
    4.94 + //list everything as clobbered to force GCC to save all
    4.95 + // live vars that are in regs on stack before this
    4.96 + // assembly, so that stack pointer is correct, before jmp
    4.97 +
    4.98 +
    4.99 +
   4.100 +//============================== CoreLoop to VP =============================
   4.101 +//
   4.102 +      //Save the core loop's stack and frame pointers into virt procr struct
   4.103 +      // then switch to stack ptr and frame ptr of virt procr & jmp to it
   4.104 +      //This was a pain to get right because GCC converts the "(jmpPt)" to
   4.105 +      // frame-relative mem-op -- so generated machine code first changed the
   4.106 +      // frame pointer, then tried to jump to an addr stored on stack, which
   4.107 +      // it accessed as an offset from frame-ptr!  (wrong frame-ptr now)
   4.108 +      //Explicitly loading into eax before changing frame-ptr fixed it
   4.109 +      //Also, it turns "(currPr->coreLoopFramePtr)" into a temporary on the
   4.110 +      // stack, so "movl %%ebp, %0" saves to the temp, NOT the data-struc!
   4.111 +
   4.112 +
   4.113 +      //switch to virt procr's stack and frame ptr then jump to virt procr fn
   4.114 +
   4.115 +#define SwitchToVP( currPr ) \
   4.116 +   void *stackPtr, *framePtr, *jmpPt, *coreLoopFramePtrAddr, \
   4.117 +        *coreLoopStackPtrAddr; \
   4.118 +\
   4.119 +   stackPtr = currPr->stackPtr; \
   4.120 +   framePtr = currPr->framePtr; \
   4.121 +   jmpPt    = currPr->nextInstrPt; \
   4.122 +   coreLoopFramePtrAddr = &(currPr->coreLoopFramePtr); \
   4.123 +   coreLoopStackPtrAddr = &(currPr->coreLoopStackPtr); \
   4.124 +\
   4.125 +   asm volatile("movl %0, %%eax;      \
   4.126 +                 movl %%esp, (%%eax); \
   4.127 +                 movl %1, %%eax;      \
   4.128 +                 movl %%ebp, (%%eax); \
   4.129 +                 movl %2, %%eax;      \
   4.130 +                 movl %3, %%esp;      \
   4.131 +                 movl %4, %%ebp;      \
   4.132 +                 jmp  %%eax"          \
   4.133 +   /* outputs */ : "=g"(coreLoopStackPtrAddr),                 \
   4.134 +                   "=g"(coreLoopFramePtrAddr)                  \
   4.135 +   /* inputs  */ : "g" (jmpPt), "g" (stackPtr), "g" (framePtr) \
   4.136 +   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
   4.137 +                );
   4.138 +
   4.139 +   
   4.140 +#endif	/* _SwitchAnimators_H */
   4.141 +
     5.1 --- a/VMS.c	Tue Oct 26 18:31:34 2010 -0700
     5.2 +++ b/VMS.c	Thu Nov 11 06:19:51 2010 -0800
     5.3 @@ -6,7 +6,9 @@
     5.4  
     5.5  #include <stdio.h>
     5.6  #include <stdlib.h>
     5.7 +#include <string.h>
     5.8  #include <malloc.h>
     5.9 +#include <sys/time.h>
    5.10  
    5.11  #include "VMS.h"
    5.12  #include "Queue_impl/BlockingQueue.h"
    5.13 @@ -28,6 +30,12 @@
    5.14  void
    5.15  create_the_coreLoop_OS_threads();
    5.16  
    5.17 +MallocProlog *
    5.18 +create_free_list();
    5.19 +
    5.20 +void
    5.21 +endOSThreadFn( void *initData, VirtProcr *animatingPr );
    5.22 +
    5.23  pthread_mutex_t suspendLock = PTHREAD_MUTEX_INITIALIZER;
    5.24  pthread_cond_t  suspend_cond  = PTHREAD_COND_INITIALIZER;
    5.25  
    5.26 @@ -83,34 +91,43 @@
    5.27     int              coreIdx;
    5.28     VirtProcr      **masterVPs;
    5.29     SchedSlot     ***allSchedSlots; //ptr to array of ptrs
    5.30 -   
    5.31 +
    5.32 +
    5.33        //Make the master env, which holds everything else
    5.34     _VMSMasterEnv = malloc( sizeof(MasterEnv) );
    5.35 +
    5.36 +        //Very first thing put into the master env is the free-list, seeded
    5.37 +        // with a massive initial chunk of memory.
    5.38 +        //After this, all other mallocs are VMS__malloc.
    5.39 +   _VMSMasterEnv->freeListHead        = VMS_ext__create_free_list();
    5.40 +
    5.41 +   //===================== Only VMS__malloc after this ====================
    5.42     masterEnv     = _VMSMasterEnv;
    5.43 -      //Need to set start pt here 'cause used by seed procr, which is created
    5.44 -      // before the first core loop starts up. -- not sure how yet..
    5.45 -//   masterEnv->coreLoopStartPt = ;
    5.46 -//   masterEnv->coreLoopEndPt   = ;
    5.47     
    5.48        //Make a readyToAnimateQ for each core loop
    5.49 -   readyToAnimateQs = malloc( NUM_CORES * sizeof(VMSQueueStruc *) );
    5.50 -   masterVPs        = malloc( NUM_CORES * sizeof(VirtProcr *) );
    5.51 +   readyToAnimateQs = VMS__malloc( NUM_CORES * sizeof(VMSQueueStruc *) );
    5.52 +   masterVPs        = VMS__malloc( NUM_CORES * sizeof(VirtProcr *) );
    5.53  
    5.54        //One array for each core, 3 in array, core's masterVP scheds all
    5.55 -   allSchedSlots    = malloc( NUM_CORES * sizeof(SchedSlot *) );
    5.56 +   allSchedSlots    = VMS__malloc( NUM_CORES * sizeof(SchedSlot *) );
    5.57  
    5.58 +   _VMSMasterEnv->numProcrsCreated = 0;  //used by create procr
    5.59     for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
    5.60 -    {
    5.61 -      readyToAnimateQs[ coreIdx ] = makeSRSWQ();
    5.62 +    {    
    5.63 +      readyToAnimateQs[ coreIdx ] = makeVMSQ();
    5.64        
    5.65 -         //Q: should give masterVP core-specific into as its init data?
    5.66 +         //Q: should give masterVP core-specific info as its init data?
    5.67        masterVPs[ coreIdx ] = VMS__create_procr( &masterLoop, masterEnv );
    5.68        masterVPs[ coreIdx ]->coreAnimatedBy = coreIdx;
    5.69        allSchedSlots[ coreIdx ] = create_sched_slots(); //makes for one core
    5.70 +      _VMSMasterEnv->numMasterInARow[ coreIdx ] = 0;
    5.71 +      _VMSMasterEnv->workStealingGates[ coreIdx ] = NULL;
    5.72      }
    5.73     _VMSMasterEnv->readyToAnimateQs = readyToAnimateQs;
    5.74     _VMSMasterEnv->masterVPs        = masterVPs;
    5.75 +   _VMSMasterEnv->masterLock       = UNLOCKED;
    5.76     _VMSMasterEnv->allSchedSlots    = allSchedSlots;
    5.77 +   _VMSMasterEnv->workStealingLock = UNLOCKED;
    5.78  
    5.79     //============================= MEASUREMENT STUFF ========================
    5.80     #ifdef MEAS__TIME_MASTER
    5.81 @@ -125,59 +142,33 @@
    5.82        // because coreLoop now controls -- animates its masterVP when no work
    5.83  
    5.84  
    5.85 -   //==================== malloc substitute ========================
    5.86 -   //
    5.87 -   //Testing whether malloc is using thread-local storage and therefore
    5.88 -   // causing unreliable behavior.
    5.89 -   //Just allocate a massive chunk of memory and roll own malloc/free and
    5.90 -   // make app use VMS__malloc_to, which will suspend and perform malloc
    5.91 -   // in the master, taking from this massive chunk.
    5.92 +   //============================= MEASUREMENT STUFF ========================
    5.93 +   #ifdef STATS__TURN_ON_PROBES
    5.94 +   _VMSMasterEnv->dynIntervalProbesInfo =
    5.95 +              makePrivDynArrayOfSize( &(_VMSMasterEnv->intervalProbes), 200);
    5.96  
    5.97 -//   initFreeList();
    5.98 +   _VMSMasterEnv->probeNameHashTbl = makeHashTable( 1000, &VMS__free );
    5.99 +   
   5.100 +      //put creation time directly into master env, for fast retrieval
   5.101 +   struct timeval timeStamp;
   5.102 +   gettimeofday( &(timeStamp), NULL);
   5.103 +   _VMSMasterEnv->createPtInSecs =
   5.104 +                           timeStamp.tv_sec +(timeStamp.tv_usec/1000000.0);
   5.105 +   #endif
   5.106 +   //========================================================================
   5.107  
   5.108   }
   5.109  
   5.110 -/*
   5.111 -void
   5.112 -initMasterMalloc()
   5.113 - {
   5.114 -   _VMSMasterEnv->mallocChunk = malloc( MASSIVE_MALLOC_SIZE );
   5.115 -
   5.116 -      //The free-list element is the first several locations of an
   5.117 -      // allocated chunk -- the address given to the application is pre-
   5.118 -      // pended with both the ownership structure and the free-list struc.
   5.119 -      //So, write the values of these into the first locations of
   5.120 -      // mallocChunk -- which marks it as free & puts in its size.
   5.121 -   listElem = (FreeListElem *)_VMSMasterEnv->mallocChunk;
   5.122 -   listElem->size = MASSIVE_MALLOC_SIZE - NUM_PREPEND_BYTES
   5.123 -   listElem->next = NULL;
   5.124 - }
   5.125 -
   5.126 -void
   5.127 -dissipateMasterMalloc()
   5.128 - {
   5.129 -      //Just foo code -- to get going -- doing as if free list were link-list
   5.130 -   currElem = _VMSMasterEnv->freeList;
   5.131 -   while( currElem != NULL )
   5.132 -    {
   5.133 -      nextElem = currElem->next;
   5.134 -      masterFree( currElem );
   5.135 -      currElem = nextElem;
   5.136 -    }
   5.137 -   free( _VMSMasterEnv->freeList );
   5.138 - }
   5.139 - */
   5.140 -
   5.141  SchedSlot **
   5.142  create_sched_slots()
   5.143   { SchedSlot  **schedSlots;
   5.144     int i;
   5.145  
   5.146 -   schedSlots  = malloc( NUM_SCHED_SLOTS * sizeof(SchedSlot *) );
   5.147 +   schedSlots  = VMS__malloc( NUM_SCHED_SLOTS * sizeof(SchedSlot *) );
   5.148  
   5.149     for( i = 0; i < NUM_SCHED_SLOTS; i++ )
   5.150      {
   5.151 -      schedSlots[i] = malloc( sizeof(SchedSlot) );
   5.152 +      schedSlots[i] = VMS__malloc( sizeof(SchedSlot) );
   5.153  
   5.154           //Set state to mean "handling requests done, slot needs filling"
   5.155        schedSlots[i]->workIsDone         = FALSE;
   5.156 @@ -192,9 +183,9 @@
   5.157   { int i;
   5.158     for( i = 0; i < NUM_SCHED_SLOTS; i++ )
   5.159      {
   5.160 -      free( schedSlots[i] );
   5.161 +      VMS__free( schedSlots[i] );
   5.162      }
   5.163 -   free( schedSlots );
   5.164 +   VMS__free( schedSlots );
   5.165   }
   5.166  
   5.167  
   5.168 @@ -203,7 +194,7 @@
   5.169   {
   5.170     //========================================================================
   5.171     //                      Create the Threads
   5.172 -   int coreIdx, retCode, i;
   5.173 +   int coreIdx, retCode;
   5.174  
   5.175        //create the arrays used to measure TSC offsets between cores
   5.176     pongNums  = malloc( NUM_CORES * sizeof( int ) );
   5.177 @@ -227,7 +218,7 @@
   5.178  
   5.179        //Make the threads that animate the core loops
   5.180     for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ )
   5.181 -    { coreLoopThdParams[coreIdx]          = malloc( sizeof(ThdParams) );
   5.182 +    { coreLoopThdParams[coreIdx]          = VMS__malloc( sizeof(ThdParams) );
   5.183        coreLoopThdParams[coreIdx]->coreNum = coreIdx;
   5.184  
   5.185        retCode =
   5.186 @@ -235,7 +226,7 @@
   5.187                          thdAttrs,
   5.188                         &coreLoop,
   5.189                 (void *)(coreLoopThdParams[coreIdx]) );
   5.190 -      if(retCode){printf("ERROR creating thread: %d\n", retCode); exit(0);}
   5.191 +      if(retCode){printf("ERROR creating thread: %d\n", retCode); exit(1);}
   5.192      }
   5.193   }
   5.194  
   5.195 @@ -307,10 +298,11 @@
   5.196   * animator state to return to --
   5.197   *
   5.198   */
   5.199 -VirtProcr *
   5.200 -VMS__create_procr( VirtProcrFnPtr fnPtr, void *initialData )
   5.201 - { VirtProcr *newPr;
   5.202 -   char      *stackLocs, *stackPtr;
   5.203 +inline VirtProcr *
   5.204 +create_procr_helper( VirtProcr *newPr,       VirtProcrFnPtr  fnPtr,
   5.205 +                     void      *initialData, char           *stackLocs )
   5.206 + {
   5.207 +   char  *stackPtr;
   5.208  
   5.209     //============================= MEASUREMENT STUFF ========================
   5.210     #ifdef MEAS__TIME_MASTER
   5.211 @@ -318,23 +310,19 @@
   5.212     saveLowTimeStampCountInto( startStamp );
   5.213     #endif
   5.214     //========================================================================
   5.215 -
   5.216 -   newPr              = malloc( sizeof(VirtProcr) );
   5.217 -   newPr->procrID     = numProcrsCreated++;
   5.218 -   newPr->nextInstrPt = fnPtr;
   5.219 -   newPr->initialData = initialData;
   5.220 -   newPr->requests    = NULL;
   5.221 -   newPr->schedSlot   = NULL;
   5.222 -//   newPr->coreLoopStartPt = _VMSMasterEnv->coreLoopStartPt;
   5.223 +   newPr->startOfStack = stackLocs;
   5.224 +   newPr->procrID      = _VMSMasterEnv->numProcrsCreated++;
   5.225 +   newPr->nextInstrPt  = fnPtr;
   5.226 +   newPr->initialData  = initialData;
   5.227 +   newPr->requests     = NULL;
   5.228 +   newPr->schedSlot    = NULL;
   5.229  
   5.230        //fnPtr takes two params -- void *initData & void *animProcr
   5.231        //alloc stack locations, make stackPtr be the highest addr minus room
   5.232        // for 2 params + return addr.  Return addr (NULL) is in loc pointed to
   5.233        // by stackPtr, initData at stackPtr + 4 bytes, animatingPr just above
   5.234 -   stackLocs = malloc( VIRT_PROCR_STACK_SIZE );
   5.235 -         if(stackLocs == 0) {perror("error: malloc stack"); exit(1);}
   5.236 -   newPr->startOfStack = stackLocs;
   5.237     stackPtr = ( (char *)stackLocs + VIRT_PROCR_STACK_SIZE - 0x10 );
   5.238 +   
   5.239        //setup __cdecl on stack -- coreloop will switch to stackPtr before jmp
   5.240     *( (int *)stackPtr + 2 ) = (int) newPr; //rightmost param -- 32bit pointer
   5.241     *( (int *)stackPtr + 1 ) = (int) initialData;  //next  param to left
   5.242 @@ -347,12 +335,48 @@
   5.243     saveLowTimeStampCountInto( endStamp );
   5.244     addIntervalToHist( startStamp, endStamp,
   5.245                        _VMSMasterEnv->stats->createHist );
   5.246 +   //============================= MEASUREMENT STUFF ========================
   5.247 +   #ifdef STATS__TURN_ON_PROBES
   5.248 +   struct timeval timeStamp;
   5.249 +   gettimeofday( &(timeStamp), NULL);
   5.250 +   newPr->createPtInSecs = timeStamp.tv_sec +(timeStamp.tv_usec/1000000.0) -
   5.251 +                                               _VMSMasterEnv->createPtInSecs;
   5.252     #endif
   5.253     //========================================================================
   5.254 -   
   5.255 +
   5.256     return newPr;
   5.257   }
   5.258  
   5.259 +inline VirtProcr *
   5.260 +VMS__create_procr( VirtProcrFnPtr fnPtr, void *initialData )
   5.261 + { VirtProcr *newPr;
   5.262 +   char      *stackLocs;
   5.263 +
   5.264 +   newPr      = VMS__malloc( sizeof(VirtProcr) );
   5.265 +   stackLocs  = VMS__malloc( VIRT_PROCR_STACK_SIZE );
   5.266 +   if( stackLocs == 0 )
   5.267 +    { perror("VMS__malloc stack"); exit(1); }
   5.268 +
   5.269 +   return create_procr_helper( newPr, fnPtr, initialData, stackLocs );
   5.270 + }
   5.271 +
   5.272 +/* "ext" designates that it's for use outside the VMS system -- should only
   5.273 + * be called from main thread or other thread -- never from code animated by
   5.274 + * a VMS virtual processor.
   5.275 + */
   5.276 +inline VirtProcr *
   5.277 +VMS_ext__create_procr( VirtProcrFnPtr fnPtr, void *initialData )
   5.278 + { VirtProcr *newPr;
   5.279 +   char      *stackLocs;
   5.280 +
   5.281 +   newPr      = malloc( sizeof(VirtProcr) );
   5.282 +   stackLocs  = malloc( VIRT_PROCR_STACK_SIZE );
   5.283 +   if( stackLocs == 0 )
   5.284 +    { perror("malloc stack"); exit(1); }
   5.285 +
   5.286 +   return create_procr_helper( newPr, fnPtr, initialData, stackLocs );
   5.287 + }
   5.288 +
   5.289  
   5.290  /*there is a label inside this function -- save the addr of this label in
   5.291   * the callingPr struc, as the pick-up point from which to start the next
   5.292 @@ -365,8 +389,7 @@
   5.293   */
   5.294  void
   5.295  VMS__suspend_procr( VirtProcr *animatingPr )
   5.296 - { void *jmpPt, *stackPtrAddr, *framePtrAddr, *coreLoopStackPtr;
   5.297 -   void *coreLoopFramePtr;
   5.298 + { 
   5.299  
   5.300        //The request to master will cause this suspended virt procr to get
   5.301        // scheduled again at some future point -- to resume, core loop jumps
   5.302 @@ -376,24 +399,6 @@
   5.303  
   5.304        //return ownership of the virt procr and sched slot to Master virt pr
   5.305     animatingPr->schedSlot->workIsDone = TRUE;
   5.306 -//   coreIdx = callingPr->coreAnimatedBy;
   5.307 -
   5.308 -   stackPtrAddr      = &(animatingPr->stackPtr);
   5.309 -   framePtrAddr      = &(animatingPr->framePtr);
   5.310 -
   5.311 -   jmpPt             = _VMSMasterEnv->coreLoopStartPt;
   5.312 -   coreLoopFramePtr  = animatingPr->coreLoopFramePtr;//need this only
   5.313 -   coreLoopStackPtr  = animatingPr->coreLoopStackPtr;//safety
   5.314 -
   5.315 -      //Save the virt procr's stack and frame ptrs,
   5.316 -   asm volatile("movl %0,     %%eax;  \
   5.317 -                 movl %%esp, (%%eax); \
   5.318 -                 movl %1,     %%eax;  \
   5.319 -                 movl %%ebp, (%%eax) "\
   5.320 -   /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr) \
   5.321 -   /* inputs  */ :        \
   5.322 -   /* clobber */ : "%eax" \
   5.323 -                );
   5.324  
   5.325     //===========================  Measurement stuff ========================
   5.326     #ifdef MEAS__TIME_STAMP_SUSP
   5.327 @@ -402,20 +407,10 @@
   5.328     #endif
   5.329     //=======================================================================
   5.330  
   5.331 -      //restore coreloop's frame ptr, then jump back to "start" of core loop
   5.332 -      //Note, GCC compiles to assembly that saves esp and ebp in the stack
   5.333 -      // frame -- so have to explicitly do assembly that saves to memory
   5.334 -   asm volatile("movl %0, %%eax;      \
   5.335 -                 movl %1, %%esp;      \
   5.336 -                 movl %2, %%ebp;      \
   5.337 -                 jmp  %%eax    "      \
   5.338 -   /* outputs */ :                    \
   5.339 -   /* inputs  */ : "m" (jmpPt), "m"(coreLoopStackPtr), "m"(coreLoopFramePtr)\
   5.340 -   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi","%esi"  \
   5.341 -                ); //list everything as clobbered to force GCC to save all
   5.342 -                   // live vars that are in regs on stack before this
   5.343 -                   // assembly, so that stack pointer is correct, before jmp
   5.344  
   5.345 +   SwitchToCoreLoop( animatingPr )
   5.346 +
   5.347 +   //=======================================================================
   5.348  ResumePt:
   5.349     #ifdef MEAS__TIME_STAMP_SUSP
   5.350        //NOTE: only take low part of count -- do sanity check when take diff
   5.351 @@ -427,6 +422,31 @@
   5.352  
   5.353  
   5.354  
   5.355 +/*For this implementation of VMS, it may not make much sense to have the
   5.356 + * system of requests for creating a new processor done this way.. but over
   5.357 + * the scope of single-master, multi-master, multi-tasking, OS-implementing,
   5.358 + * distributed-memory, and so on, this gives VMS implementation a chance to
   5.359 + * do stuff before suspend, in the AppVP, and in the Master before the plugin
   5.360 + * is called, as well as in the lang-lib before this is called, and in the
   5.361 + * plugin.  So, this gives both VMS and language implementations a chance to
   5.362 + * intercept at various points and do order-dependent stuff.
   5.363 + *Having a standard VMSNewPrReqData struc allows the language to create and
   5.364 + * free the struc, while VMS knows how to get the newPr if it wants it, and
   5.365 + * it lets the lang have lang-specific data related to creation transported
   5.366 + * to the plugin.
   5.367 + */
   5.368 +void
   5.369 +VMS__send_create_procr_req( void *semReqData, VirtProcr *reqstingPr )
   5.370 + { VMSReqst req;
   5.371 +
   5.372 +   req.reqType          = createReq;
   5.373 +   req.semReqData       = semReqData;
   5.374 +   req.nextReqst        = reqstingPr->requests;
   5.375 +   reqstingPr->requests = &req;
   5.376 +
   5.377 +   VMS__suspend_procr( reqstingPr );
   5.378 + }
   5.379 +
   5.380  
   5.381  /*
   5.382   *This adds a request to dissipate, then suspends the processor so that the
   5.383 @@ -450,81 +470,102 @@
   5.384   * pears -- making that suspend the last thing in the virt procr's trace.
   5.385   */
   5.386  void
   5.387 -VMS__dissipate_procr( VirtProcr *procrToDissipate )
   5.388 +VMS__send_dissipate_req( VirtProcr *procrToDissipate )
   5.389 + { VMSReqst req;
   5.390 +
   5.391 +   req.reqType                = dissipate;
   5.392 +   req.nextReqst              = procrToDissipate->requests;
   5.393 +   procrToDissipate->requests = &req;
   5.394 +
   5.395 +   VMS__suspend_procr( procrToDissipate );
   5.396 + }
   5.397 +
   5.398 +
   5.399 +/* "ext" designates that it's for use outside the VMS system -- should only
   5.400 + * be called from main thread or other thread -- never from code animated by
   5.401 + * a VMS virtual processor.
   5.402 + *
   5.403 + *Use this version to dissipate VPs created outside the VMS system.
   5.404 + */
   5.405 +void
   5.406 +VMS_ext__dissipate_procr( VirtProcr *procrToDissipate )
   5.407 + {
   5.408 +      //NOTE: initialData was given to the processor, so should either have
   5.409 +      // been alloc'd with VMS__malloc, or freed by the level above animPr.
   5.410 +      //So, all that's left to free here is the stack and the VirtProcr struc
   5.411 +      // itself
   5.412 +      //Note, should not stack-allocate initial data -- no guarantee, in
   5.413 +      // general, that the creating processor will outlive ones it creates.
   5.414 +   free( procrToDissipate->startOfStack );
   5.415 +   free( procrToDissipate );
   5.416 + }
   5.417 +
   5.418 +
   5.419 +
   5.420 +/*This call's name indicates that request is malloc'd -- so req handler
   5.421 + * has to free any extra requests tacked on before a send, using this.
   5.422 + *
   5.423 + * This inserts the semantic-layer's request data into standard VMS carrier
   5.424 + * request data-struct that is malloc'd.  The sem request doesn't need to
   5.425 + * be malloc'd if this is called inside the same call chain before the
   5.426 + * send of the last request is called.
   5.427 + *
   5.428 + *The request handler has to call VMS__free_VMSReq for any of these
   5.429 + */
   5.430 +inline void
   5.431 +VMS__add_sem_request_in_mallocd_VMSReqst( void *semReqData,
   5.432 +                                          VirtProcr *callingPr )
   5.433   { VMSReqst *req;
   5.434  
   5.435 -   req = malloc( sizeof(VMSReqst) );
   5.436 -//   req->virtProcrFrom      = callingPr;
   5.437 -   req->reqType               = dissipate;
   5.438 -   req->nextReqst             = procrToDissipate->requests;
   5.439 -   procrToDissipate->requests = req;
   5.440 -   
   5.441 -   VMS__suspend_procr( procrToDissipate );
   5.442 -}
   5.443 -
   5.444 -
   5.445 -/*This inserts the semantic-layer's request data into standard VMS carrier
   5.446 - */
   5.447 -inline void
   5.448 -VMS__add_sem_request( void *semReqData, VirtProcr *callingPr )
   5.449 - { VMSReqst *req;
   5.450 -
   5.451 -   req = malloc( sizeof(VMSReqst) );
   5.452 -//   req->virtProcrFrom      = callingPr;
   5.453 -   req->reqType        = semantic;
   5.454 -   req->semReqData     = semReqData;
   5.455 -   req->nextReqst      = callingPr->requests;
   5.456 +   req = VMS__malloc( sizeof(VMSReqst) );
   5.457 +   req->reqType         = semantic;
   5.458 +   req->semReqData      = semReqData;
   5.459 +   req->nextReqst       = callingPr->requests;
   5.460     callingPr->requests = req;
   5.461   }
   5.462  
   5.463 +/*This inserts the semantic-layer's request data into standard VMS carrier
   5.464 + * request data-struct, which is allocated on the stack of this call; a
   5.465 + * ptr to it is sent to the plugin.
   5.466 + *Then it does suspend, to cause request to be sent.
   5.467 + */
   5.468 +inline void
   5.469 +VMS__send_sem_request( void *semReqData, VirtProcr *callingPr )
   5.470 + { VMSReqst req;
   5.471  
   5.472 -/*Use this to get first request before starting request handler's loop
   5.473 +   req.reqType         = semantic;
   5.474 +   req.semReqData      = semReqData;
   5.475 +   req.nextReqst       = callingPr->requests;
   5.476 +   callingPr->requests = &req;
   5.477 +   
   5.478 +   VMS__suspend_procr( callingPr );
   5.479 + }
   5.480 +
   5.481 +
   5.482 +inline void
   5.483 +VMS__send_VMSSem_request( void *semReqData, VirtProcr *callingPr )
   5.484 + { VMSReqst req;
   5.485 +
   5.486 +   req.reqType         = VMSSemantic;
   5.487 +   req.semReqData      = semReqData;
   5.488 +   req.nextReqst       = callingPr->requests; //grab any other preceding requests
   5.489 +   callingPr->requests = &req;
   5.490 +
   5.491 +   VMS__suspend_procr( callingPr );
   5.492 + }
   5.493 +
   5.494 +
   5.495 +/*Takes the first request off the procr's list & returns it, NULL if empty
   5.496   */
   5.497  VMSReqst *
   5.498 -VMS__take_top_request_from( VirtProcr *procrWithReq )
   5.499 - { VMSReqst *req;
   5.500 -
   5.501 -   req = procrWithReq->requests;
   5.502 -   if( req == NULL ) return req;
   5.503 -
   5.504 -   procrWithReq->requests = procrWithReq->requests->nextReqst;
   5.505 -   return req;
   5.506 - }
   5.507 -
   5.508 -/*A subtle bug due to freeing then accessing "next" after freed caused this
   5.509 - * form of call to be put in -- so call this at end of request handler loop
   5.510 - * that iterates through the requests.
   5.511 - */
   5.512 -VMSReqst *
   5.513 -VMS__free_top_and_give_next_request_from( VirtProcr *procrWithReq )
   5.514 +VMS__take_next_request_out_of( VirtProcr *procrWithReq )
   5.515   { VMSReqst *req;
   5.516  
   5.517     req = procrWithReq->requests;
   5.518     if( req == NULL ) return NULL;
   5.519  
   5.520     procrWithReq->requests = procrWithReq->requests->nextReqst;
   5.521 -   VMS__free_request( req );
   5.522 -   return procrWithReq->requests;
   5.523 - }
   5.524 -
   5.525 -
   5.526 -//TODO: add a semantic-layer supplied "freer" for the semantic-data portion
   5.527 -// of a request -- IE call with both a virt procr and a fn-ptr to request
   5.528 -// freer (also maybe put sem request freer as a field in virt procr?)
   5.529 -//MeasVMS relies right now on this only freeing VMS layer of request -- the
   5.530 -// semantic portion of request is alloc'd and freed by request handler
   5.531 -void
   5.532 -VMS__free_request( VMSReqst *req )
   5.533 - {
   5.534 -   free( req );
   5.535 - }
   5.536 -
   5.537 -
   5.538 -
   5.539 -inline int
   5.540 -VMS__isSemanticReqst( VMSReqst *req )
   5.541 - {
   5.542 -   return ( req->reqType == semantic );
   5.543 +   return req;
   5.544   }
   5.545  
   5.546  
   5.547 @@ -534,36 +575,52 @@
   5.548     return req->semReqData;
   5.549   }
   5.550  
   5.551 -inline int
   5.552 -VMS__isDissipateReqst( VMSReqst *req )
   5.553 - {
   5.554 -   return ( req->reqType == dissipate );
   5.555 - }
   5.556  
   5.557 -inline int
   5.558 -VMS__isCreateReqst( VMSReqst *req )
   5.559 - {
   5.560 -   return ( req->reqType == regCreated );
   5.561 - }
   5.562  
   5.563 -void
   5.564 -VMS__send_req_to_register_new_procr(VirtProcr *newPr, VirtProcr *reqstingPr)
   5.565 - { VMSReqst *req;
   5.566 +/* This is for OS requests and VMS infrastructure requests, such as to create
   5.567 + *  a probe -- a probe is inside the heart of VMS-core, it's not part of any
   5.568 + *  language -- but it's also a semantic thing that's triggered from and used
   5.569 + *  in the application.. so it crosses abstractions..  so, need some special
   5.570 + *  pattern here for handling such requests.
   5.571 + * Doing this just as if it were a second language sharing VMS-core.
   5.572 + * 
   5.573 + * This is called from the language's request handler when it sees a request
   5.575 + *  whose reqType is VMSSemantic (its semReqData is a VMSSemReq)
   5.575 + *
   5.576 + * TODO: Later change this, to give probes their own separate plugin & have
   5.577 + *  VMS-core steer the request to appropriate plugin
   5.578 + * Do the same for OS calls -- look later at it..
   5.579 + */
   5.580 +void inline
   5.581 +VMS__handle_VMSSemReq( VMSReqst *req, VirtProcr *requestingPr, void *semEnv,
   5.582 +                       ResumePrFnPtr resumePrFnPtr )
   5.583 + { VMSSemReq     *semReq;
   5.584 +   IntervalProbe *newProbe;
   5.585 +   int32          nameLen;
   5.586  
   5.587 -   req                  = malloc( sizeof(VMSReqst) );
   5.588 -   req->reqType         = regCreated;
   5.589 -   req->semReqData      = newPr;
   5.590 -   req->nextReqst       = reqstingPr->requests;
   5.591 -   reqstingPr->requests = req;
   5.592 +   semReq = req->semReqData;
   5.593  
   5.594 -   VMS__suspend_procr( reqstingPr );
   5.595 +   newProbe          = VMS__malloc( sizeof(IntervalProbe) );
   5.596 +   nameLen = strlen( semReq->nameStr );
   5.597 +   newProbe->nameStr = VMS__malloc( nameLen );
   5.598 +   memcpy( newProbe->nameStr, semReq->nameStr, nameLen );
   5.599 +   newProbe->hist    = NULL;
   5.600 +   newProbe->schedChoiceWasRecorded = FALSE;
   5.601 +
   5.602 +      //This runs in masterVP, so no race-condition worries
   5.603 +   newProbe->probeID =
   5.604 +             addToDynArray( newProbe, _VMSMasterEnv->dynIntervalProbesInfo );
   5.605 +
   5.606 +   requestingPr->dataRetFromReq = newProbe;
   5.607 +
   5.608 +   (*resumePrFnPtr)( requestingPr, semEnv );
   5.609   }
   5.610  
   5.611  
   5.612  
   5.613  /*This must be called by the request handler plugin -- it cannot be called
   5.614   * from the semantic library "dissipate processor" function -- instead, the
   5.615 - * semantic layer has to generate a request for the plug-in to call this
   5.616 + * semantic layer has to generate a request, and the plug-in calls this
   5.617   * function.
   5.618   *The reason is that this frees the virtual processor's stack -- which is
   5.619   * still in use inside semantic library calls!
   5.620 @@ -579,33 +636,31 @@
   5.621   * of dis-owning it.
   5.622   */
   5.623  void
   5.624 -VMS__handle_dissipate_reqst( VirtProcr *animatingPr )
   5.625 +VMS__dissipate_procr( VirtProcr *animatingPr )
   5.626   {
   5.627        //dis-own all locations owned by this processor, causing to be freed
   5.628        // any locations that it is (was) sole owner of
   5.629  //TODO: implement VMS__malloc system, including "give up ownership"
   5.630  
   5.631 -      //The dissipate request might still be attached, so remove and free it
   5.632 -   VMS__free_top_and_give_next_request_from( animatingPr );
   5.633  
   5.634        //NOTE: initialData was given to the processor, so should either have
   5.635        // been alloc'd with VMS__malloc, or freed by the level above animPr.
   5.636        //So, all that's left to free here is the stack and the VirtProcr struc
   5.637        // itself
   5.638 -   free( animatingPr->startOfStack );
   5.639 -   free( animatingPr );
   5.640 +      //Note, should not stack-allocate initial data -- no guarantee, in
   5.641 +      // general that creating processor will outlive ones it creates.
   5.642 +   VMS__free( animatingPr->startOfStack );
   5.643 +   VMS__free( animatingPr );
   5.644   }
   5.645  
   5.646  
   5.647 -//TODO: re-architect so that have clean separation between request handler
   5.648 +//TODO: look at architecting cleanest separation between request handler
   5.649  // and master loop, for dissipate, create, shutdown, and other non-semantic
   5.650  // requests.  Issue is chain: one removes requests from AppVP, one dispatches
   5.651  // on type of request, and one handles each type..  but some types require
   5.652  // action from both request handler and master loop -- maybe just give the
   5.653  // request handler calls like:  VMS__handle_X_request_type
   5.654  
   5.655 -void
   5.656 -endOSThreadFn( void *initData, VirtProcr *animatingPr );
   5.657  
   5.658  /*This is called by the semantic layer's request handler when it decides its
   5.659   * time to shut down the VMS system.  Calling this causes the core loop OS
   5.660 @@ -619,10 +674,9 @@
   5.661   * masterVP any AppVPs that might still be allocated and sitting in the
   5.662   * semantic environment, or have been orphaned in the _VMSWorkQ.
   5.663   * 
   5.664 - *NOTE: the semantic plug-in is expected to use VMS__malloc_to to get all the
   5.665 + *NOTE: the semantic plug-in is expected to use VMS__malloc to get all the
   5.666   * locations it needs, and give ownership to masterVP.  Then, they will be
   5.667 - * automatically freed when the masterVP is dissipated.  (This happens after
   5.668 - * the core loop threads have all exited)
   5.669 + * automatically freed.
   5.670   *
   5.671   *In here,create one core-loop shut-down processor for each core loop and put
   5.672   * them all directly into the readyToAnimateQ.
   5.673 @@ -633,16 +687,16 @@
   5.674   * point is it sure that all results have completed.
   5.675   */
   5.676  void
   5.677 -VMS__handle_shutdown_reqst( void *dummy, VirtProcr *animatingPr )
   5.678 +VMS__shutdown()
   5.679   { int coreIdx;
   5.680     VirtProcr *shutDownPr;
   5.681  
   5.682        //create the shutdown processors, one for each core loop -- put them
   5.683        // directly into the Q -- each core will die when gets one
   5.684     for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ )
   5.685 -    {
   5.686 +    {    //Note, this is running in the master
   5.687        shutDownPr = VMS__create_procr( &endOSThreadFn, NULL );
   5.688 -      writeSRSWQ( shutDownPr, _VMSMasterEnv->readyToAnimateQs[coreIdx] );
   5.689 +      writeVMSQ( shutDownPr, _VMSMasterEnv->readyToAnimateQs[coreIdx] );
   5.690      }
   5.691  
   5.692   }
   5.693 @@ -681,49 +735,60 @@
   5.694   }
   5.695  
   5.696  
   5.697 -/*This is called after the threads have shut down and control has returned
   5.698 - * to the semantic layer, in the entry point function in the main thread.
   5.699 - * It has to free anything allocated during VMS_init, and any other alloc'd
   5.700 - * locations that might be left over.
   5.701 +/*This is called from the startup & shutdown
   5.702   */
   5.703  void
   5.704 -VMS__cleanup_after_shutdown()
   5.705 +VMS__cleanup_at_end_of_shutdown()
   5.706   { 
   5.707     VMSQueueStruc **readyToAnimateQs;
   5.708     int              coreIdx;
   5.709     VirtProcr      **masterVPs;
   5.710     SchedSlot     ***allSchedSlots; //ptr to array of ptrs
   5.711  
   5.712 +      //All the environment data has been allocated with VMS__malloc, so just
   5.713 +      // free its internal big-chunk and all inside it disappear.
   5.714 +/*
   5.715     readyToAnimateQs = _VMSMasterEnv->readyToAnimateQs;
   5.716     masterVPs        = _VMSMasterEnv->masterVPs;
   5.717     allSchedSlots    = _VMSMasterEnv->allSchedSlots;
   5.718     
   5.719     for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
   5.720      {
   5.721 -      freeSRSWQ( readyToAnimateQs[ coreIdx ] );
   5.722 -
   5.723 -      VMS__handle_dissipate_reqst( masterVPs[ coreIdx ] );
   5.724 +      freeVMSQ( readyToAnimateQs[ coreIdx ] );
   5.725 +         //master VPs were created external to VMS, so use external free
   5.726 +      VMS__dissipate_procr( masterVPs[ coreIdx ] );
   5.727        
   5.728        freeSchedSlots( allSchedSlots[ coreIdx ] );
   5.729      }
   5.730     
   5.731 -   free( _VMSMasterEnv->readyToAnimateQs );
   5.732 -   free( _VMSMasterEnv->masterVPs );
   5.733 -   free( _VMSMasterEnv->allSchedSlots );
   5.734 -
   5.735 -   free( _VMSMasterEnv );
   5.736 +   VMS__free( _VMSMasterEnv->readyToAnimateQs );
   5.737 +   VMS__free( _VMSMasterEnv->masterVPs );
   5.738 +   VMS__free( _VMSMasterEnv->allSchedSlots );
   5.739 +   
   5.740 +   //============================= MEASUREMENT STUFF ========================
   5.741 +   #ifdef STATS__TURN_ON_PROBES
   5.742 +   freeDynArrayDeep( _VMSMasterEnv->dynIntervalProbesInfo, &VMS__free_probe);
   5.743 +   #endif
   5.744 +   //========================================================================
   5.745 +*/
   5.746 +      //These are the only two that use system free 
   5.747 +   VMS_ext__free_free_list( _VMSMasterEnv->freeListHead );
   5.748 +   free( (void *)_VMSMasterEnv );
   5.749   }
   5.750  
   5.751  
   5.752 -//===========================================================================
   5.753 +//================================
   5.754  
   5.755 -inline TSCount getTSC()
   5.756 - { unsigned int low, high;
   5.757 -   TSCount  out;
   5.758  
   5.759 -   saveTimeStampCountInto( low, high );
   5.760 -   out = high;
   5.761 -   out = (out << 32) + low;
   5.762 -   return out;
   5.763 +/*Later, improve this -- for now, just prints msgStr verbatim and exits the
   5.764 + * application with status 1.  reqstPr and excpData are currently unused.
   5.765 + */
   5.766 +void
   5.767 +VMS__throw_exception( char *msgStr, VirtProcr *reqstPr, VMSExcp *excpData )
   5.768 + {
   5.769 +   printf( "%s", msgStr ); //never use caller's text as the format string
   5.770 +   fflush( stdout );       //flush output -- fflush(stdin) is undefined
   5.771 +   exit(1);
   5.772 + }
   5.773  
   5.774 +
     6.1 --- a/VMS.h	Tue Oct 26 18:31:34 2010 -0700
     6.2 +++ b/VMS.h	Thu Nov 11 06:19:51 2010 -0800
     6.3 @@ -7,22 +7,54 @@
     6.4   */
     6.5  
     6.6  #ifndef _VMS_H
     6.7 -#define _VMS_H
     6.8 +#define	_VMS_H
     6.9  #define __USE_GNU
    6.10  
    6.11  #include "VMS_primitive_data_types.h"
    6.12 -#include "Queue_impl/BlockingQueue.h"
    6.13 +#include "Queue_impl/PrivateQueue.h"
    6.14  #include "Histogram/Histogram.h"
    6.15 +#include "DynArray/DynArray.h"
    6.16 +#include "Hash_impl/PrivateHash.h"
    6.17 +#include "vmalloc.h"
    6.18 +
    6.19  #include <pthread.h>
    6.20 +#include <sys/time.h>
    6.21  
    6.22 +
    6.23 +//===============================  Debug  ===================================
    6.24 +//
    6.25     //When SEQUENTIAL is defined, VMS does sequential exe in the main thread
    6.26     // It still does co-routines and all the mechanisms are the same, it just
    6.27     // has only a single thread and animates VPs one at a time
    6.28  //#define SEQUENTIAL
    6.29  
    6.30 -#define PRINT_DEBUG(msg) //printf(msg); fflush(stdin);
    6.31 -#define PRINT1_DEBUG(msg, param) //printf(msg, param); fflush(stdin);
    6.32 -#define PRINT2_DEBUG(msg, p1, p2) //printf(msg, p1, p2); fflush(stdin);
    6.33 +//#define USE_WORK_STEALING
    6.34 +
    6.35 +   //turns on the probe-instrumentation in the application -- when not
    6.36 +   // defined, the calls to the probe functions turn into comments
    6.37 +#define STATS__ENABLE_PROBES
    6.38 +//#define TURN_ON_DEBUG_PROBES
    6.39 +
    6.40 +   //These defines turn types of debug messages on and off
    6.41 +   // be sure debug messages are un-commented (next block of defines)
    6.42 +#define dbgProbes    FALSE /* for issues inside probes themselves*/
    6.43 +#define dbgAppFlow   FALSE /* Top level flow of application code -- general*/
    6.44 +#define dbgB2BMaster FALSE /* in coreloop, back to back master VPs*/
    6.45 +#define dbgRqstHdlr  FALSE /* in request handler code*/
    6.46 +
    6.47 +   //Comment or un-comment the substitute half to turn on/off debug messages
    6.48 +#define DEBUG(  bool, msg)         \
    6.49 +//   if( bool){ printf(msg); fflush(stdin);}
    6.50 +#define DEBUG1( bool, msg, param)  \
    6.51 +//   if(bool){printf(msg, param); fflush(stdin);}
    6.52 +#define DEBUG2( bool, msg, p1, p2) \
    6.53 +//   if(bool) {printf(msg, p1, p2); fflush(stdin);}
    6.54 +   //Error messages always print; stdout is flushed so they appear at once
    6.55 +#define ERROR(msg) printf(msg); fflush(stdout);
    6.56 +#define ERROR1(msg, param) printf(msg, param); fflush(stdout);
    6.57 +#define ERROR2(msg, p1, p2) printf(msg, p1, p2); fflush(stdout);
    6.58 +
    6.59 +//===========================  STATS =======================
    6.60  
    6.61     //when MEAS__TIME_STAMP_SUSP is defined, causes code to be inserted and
    6.62     // compiled-in that saves the low part of the time stamp count just before
    6.63 @@ -33,53 +65,97 @@
    6.64  #define MEAS__TIME_MASTER
    6.65  #define MEAS__NUM_TIMES_TO_RUN 100000
    6.66  
    6.67 +   //For code that calculates normalization-offset between TSC counts of
    6.68 +   // different cores.
    6.69  #define NUM_TSC_ROUND_TRIPS 10
    6.70  
    6.71 +
    6.72 +//=========================  Hardware related Constants =====================
    6.73     //This value is the number of hardware threads in the shared memory
    6.74     // machine
    6.75  #define NUM_CORES        4
    6.76  
    6.77 -   // balance amortizing master fixed overhead vs imbalance potential
    6.78 -#define NUM_SCHED_SLOTS  3
    6.79 +   // tradeoff amortizing master fixed overhead vs imbalance potential
    6.80 +   // when work-stealing, can make bigger, at risk of losing cache affinity
    6.81 +#define NUM_SCHED_SLOTS  5
    6.82  
    6.83  #define MIN_WORK_UNIT_CYCLES 20000
    6.84  
    6.85 -#define READYTOANIMATE_RETRIES 10000
    6.86 +#define MASTERLOCK_RETRIES 10000
    6.87  
    6.88 -   // stack
    6.89 -#define VIRT_PROCR_STACK_SIZE 0x10000
    6.90 +   // stack size in virtual processors created
    6.91 +#define VIRT_PROCR_STACK_SIZE 0x4000 /* 16K */
    6.92  
    6.93 -   //256M of total memory for VMS__malloc
    6.94 -#define MASSIVE_MALLOC_SIZE 0x10000000
    6.95 +   // memory for VMS__malloc
    6.96 +#define MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE 0x10000000 /* 256M */
    6.97  
    6.98 -#define NUM_PREPEND_BYTES sizeof(FreeListElem) + sizeof(ownerElem);
    6.99 +
   6.100 +//==============================
   6.101  
   6.102  #define SUCCESS 0
   6.103  
   6.104 -#define writeVMSQ     writeSRSWQ
   6.105 -#define readVMSQ      readSRSWQ
   6.106 -#define makeVMSQ      makeSRSWQ
   6.107 -#define VMSQueueStruc SRSWQueueStruc
   6.108 +#define writeVMSQ     writePrivQ
   6.109 +#define readVMSQ      readPrivQ
   6.110 +#define makeVMSQ      makePrivQ
   6.111 +#define numInVMSQ     numInPrivQ
   6.112 +#define VMSQueueStruc PrivQueueStruc
   6.113  
   6.114 -//#define thdAttrs NULL  //For PThreads
   6.115  
   6.116 -typedef struct _SchedSlot  SchedSlot;
   6.117 -typedef struct _VMSReqst   VMSReqst;
   6.118 -typedef struct _VirtProcr  VirtProcr;
   6.119 +
   6.120 +//===========================================================================
   6.121 +typedef unsigned long long TSCount;
   6.122 +
   6.123 +typedef struct _SchedSlot     SchedSlot;
   6.124 +typedef struct _VMSReqst      VMSReqst;
   6.125 +typedef struct _VirtProcr     VirtProcr;
   6.126 +typedef struct _IntervalProbe IntervalProbe;
   6.127 +typedef struct _GateStruc     GateStruc;
   6.128 +
   6.129  
   6.130  typedef VirtProcr * (*SlaveScheduler)  ( void *, int );   //semEnv, coreIdx
   6.131  typedef void  (*RequestHandler)  ( VirtProcr *, void * ); //prWReqst, semEnv
   6.132  typedef void  (*VirtProcrFnPtr)  ( void *, VirtProcr * ); //initData, animPr
   6.133  typedef void    VirtProcrFn      ( void *, VirtProcr * ); //initData, animPr
   6.134 +typedef void  (*ResumePrFnPtr)   ( VirtProcr *, void * );
   6.135 +
   6.136 +
   6.137 +//============= Requests ===========
   6.138 +//
   6.139 +
   6.140 +enum VMSReqstType   //avoid starting enums at 0, for debug reasons
   6.141 + {
   6.142 +   semantic = 1,
   6.143 +   createReq,
   6.144 +   dissipate,
   6.145 +   VMSSemantic      //goes with VMSSemReqst below
   6.146 + };
   6.147 +
   6.148 +struct _VMSReqst
   6.149 + {
   6.150 +   enum VMSReqstType  reqType;//used for dissipate and in future for IO requests
   6.151 +   void              *semReqData;
   6.152 +
   6.153 +   VMSReqst *nextReqst;
   6.154 + };
   6.155 +//VMSReqst
   6.156 +
   6.157 +enum VMSSemReqstType   //These are equivalent to semantic requests, but for
   6.158 + {                     // VMS's services available directly to app, like OS
   6.159 +   createProbe = 1,    // and probe services -- like a VMS-wide built-in lang
   6.160 +   openFile,
   6.161 +   otherIO
   6.162 + };
   6.163  
   6.164  typedef struct
   6.165 - {
   6.166 -   void           *endThdPt;
   6.167 -   unsigned int    coreNum;
   6.168 + { enum VMSSemReqstType reqType;
   6.169 +   VirtProcr           *requestingPr;
   6.170 +   char                *nameStr;  //for create probe
   6.171   }
   6.172 -ThdParams;
   6.173 + VMSSemReq;
   6.174  
   6.175  
   6.176 +//====================  Core data structures  ===================
   6.177 +
   6.178  struct _SchedSlot
   6.179   {
   6.180     int         workIsDone;
   6.181 @@ -87,24 +163,6 @@
   6.182     VirtProcr  *procrAssignedToSlot;
   6.183   };
   6.184  //SchedSlot
   6.185 - 
   6.186 -enum ReqstType
   6.187 - {
   6.188 -   semantic = 1,
   6.189 -   dissipate,
   6.190 -   regCreated,
   6.191 -   IO
   6.192 - };
   6.193 -
   6.194 -struct _VMSReqst
   6.195 - {
   6.196 -//   VirtProcr   *virtProcrFrom;
   6.197 -   enum ReqstType  reqType;//used for dissipate and in future for IO requests
   6.198 -   void           *semReqData;
   6.199 -
   6.200 -   VMSReqst *nextReqst;
   6.201 - };
   6.202 -//VMSReqst
   6.203  
   6.204  struct _VirtProcr
   6.205   { int         procrID;  //for debugging -- count up each time create
   6.206 @@ -123,9 +181,10 @@
   6.207     SchedSlot  *schedSlot;
   6.208     VMSReqst   *requests;
   6.209  
   6.210 -   void       *semanticData;
   6.211 +   void       *semanticData; //this lives here for the life of VP
   6.212 +   void       *dataRetFromReq;//values returned from plugin to VP go here
   6.213  
   6.214 -   //============================= MEASUREMENT STUFF ========================
   6.215 +      //=========== MEASUREMENT STUFF ==========
   6.216     #ifdef MEAS__TIME_STAMP_SUSP
   6.217     unsigned int preSuspTSCLow;
   6.218     unsigned int postSuspTSCLow;
   6.219 @@ -134,7 +193,8 @@
   6.220     unsigned int startMasterTSCLow;
   6.221     unsigned int endMasterTSCLow;
   6.222     #endif
   6.223 -   //========================================================================
   6.224 +   
   6.225 +   float64      createPtInSecs;  //have space but don't use on some configs
   6.226   };
   6.227  //VirtProcr
   6.228  
   6.229 @@ -158,37 +218,79 @@
   6.230  
   6.231     void            *semanticEnv;
   6.232     void            *OSEventStruc;   //for future, when add I/O to BLIS
   6.233 +   MallocProlog    *freeListHead;
   6.234 +   int32            amtOfOutstandingMem; //total currently allocated
   6.235  
   6.236     void            *coreLoopStartPt;//addr to jump to to re-enter coreLoop
   6.237     void            *coreLoopEndPt;  //addr to jump to to shut down a coreLoop
   6.238  
   6.239 -   int              setupComplete;
   6.240 -   int              masterLock;
   6.241 +   int32            setupComplete;
   6.242 +   int32            masterLock;
   6.243  
   6.244     VMSStats        *stats;
   6.245 +   int32            numMasterInARow[NUM_CORES];//detect back-to-back masterVP
   6.246 +   GateStruc       *workStealingGates[ NUM_CORES ]; //concurrent work-steal
   6.247 +   int32            workStealingLock;
   6.248 +   
   6.249 +   int32            numProcrsCreated; //gives ordering to processor creation
   6.250 +
   6.251 +      //=========== MEASUREMENT STUFF =============
   6.252 +   IntervalProbe  **intervalProbes;
   6.253 +   PrivDynArrayInfo    *dynIntervalProbesInfo;
   6.254 +   HashTable       *probeNameHashTbl;
   6.255 +   int32            masterCreateProbeID;
   6.256 +   float64          createPtInSecs;
   6.257   }
   6.258  MasterEnv;
   6.259  
   6.260 +//=========================  Extra Stuff Data Strucs  =======================
   6.261 +typedef struct
   6.262 + {
   6.263  
   6.264 -//==========================================================
   6.265 + }
   6.266 +VMSExcp;
   6.267 +
   6.268 +struct _GateStruc
   6.269 + {
   6.270 +   int32 gateClosed;
   6.271 +   int32 preGateProgress;
   6.272 +   int32 waitProgress;
   6.273 +   int32 exitProgress;
   6.274 + };
   6.275 +//GateStruc
   6.276 +
   6.277 +//=======================  OS Thread related  ===============================
   6.278  
   6.279  void * coreLoop( void *paramsIn );  //standard PThreads fn prototype
   6.280  void * coreLoop_Seq( void *paramsIn );  //standard PThreads fn prototype
   6.281  void masterLoop( void *initData, VirtProcr *masterPr );
   6.282  
   6.283  
   6.284 -//=====================  Global Vars ===================
   6.285 -
   6.286 +typedef struct
   6.287 + {
   6.288 +   void           *endThdPt;
   6.289 +   unsigned int    coreNum;
   6.290 + }
   6.291 +ThdParams;
   6.292  
   6.293  pthread_t       coreLoopThdHandles[ NUM_CORES ];  //pthread's virt-procr state
   6.294  ThdParams      *coreLoopThdParams [ NUM_CORES ];
   6.295  pthread_mutex_t suspendLock;
   6.296  pthread_cond_t  suspend_cond;
   6.297  
   6.298 +
   6.299 +
   6.300 +//=====================  Global Vars ===================
   6.301 +
   6.302  volatile MasterEnv      *_VMSMasterEnv;
   6.303  
   6.304  
   6.305 -//==========================
   6.306 +
   6.307 +
   6.308 +//===========================  Function Prototypes  =========================
   6.309 +
   6.310 +
   6.311 +//========== Setup and shutdown ==========
   6.312  void
   6.313  VMS__init();
   6.314  
   6.315 @@ -204,69 +306,59 @@
   6.316  VirtProcr *
   6.317  VMS__create_procr( VirtProcrFnPtr fnPtr, void *initialData );
   6.318  
   6.319 +void
   6.320 +VMS__dissipate_procr( VirtProcr *procrToDissipate );
   6.321 +
   6.322 +   //Use this to create processor inside entry point & other places outside
   6.323 +   // the VMS system boundary (IE, not run in slave nor Master)
   6.324  VirtProcr *
   6.325 -VMS__create_the_shutdown_procr();
   6.326 -
   6.327 -//==========================
   6.328 -inline void
   6.329 -VMS__add_sem_request( void *semReqData, VirtProcr *callingPr );
   6.330 +VMS_ext__create_procr( VirtProcrFnPtr fnPtr, void *initialData );
   6.331  
   6.332  void
   6.333 -VMS__send_req_to_register_new_procr( VirtProcr *newPrToRegister,
   6.334 -                                      VirtProcr *reqstingPr );
   6.335 +VMS_ext__dissipate_procr( VirtProcr *procrToDissipate );
   6.336  
   6.337  void
   6.338 -VMS__free_request( VMSReqst *req );
   6.339 +VMS__throw_exception( char *msgStr, VirtProcr *reqstPr, VMSExcp *excpData );
   6.340  
   6.341  void
   6.342 -VMS__remove_and_free_top_request( VirtProcr *reqstingPr );
   6.343 +VMS__shutdown();
   6.344 +
   6.345 +void
   6.346 +VMS__cleanup_at_end_of_shutdown();
   6.347 +
   6.348 +
   6.349 +//==============  Request Related  ===============
   6.350 +
   6.351 +void
   6.352 +VMS__suspend_procr( VirtProcr *callingPr );
   6.353 +
   6.354 +inline void
   6.355 +VMS__add_sem_request_in_mallocd_VMSReqst( void *semReqData, VirtProcr *callingPr );
   6.356 +
   6.357 +inline void
   6.358 +VMS__send_sem_request( void *semReqData, VirtProcr *callingPr );
   6.359 +
   6.360 +void
   6.361 +VMS__send_create_procr_req( void *semReqData, VirtProcr *reqstingPr );
   6.362 +
   6.363 +void inline
   6.364 +VMS__send_dissipate_req( VirtProcr *prToDissipate );
   6.365 +
   6.366 +inline void
   6.367 +VMS__send_VMSSem_request( void *semReqData, VirtProcr *callingPr );
   6.368  
   6.369  VMSReqst *
   6.370 -VMS__take_top_request_from( VirtProcr *reqstingPr );
   6.371 -
   6.372 -VMSReqst *
   6.373 -VMS__free_top_and_give_next_request_from( VirtProcr *procrWithReq );
   6.374 +VMS__take_next_request_out_of( VirtProcr *procrWithReq );
   6.375  
   6.376  inline void *
   6.377  VMS__take_sem_reqst_from( VMSReqst *req );
   6.378  
   6.379 -inline int
   6.380 -VMS__isSemanticReqst( VMSReqst *req );
   6.381 -
   6.382 -inline int
   6.383 -VMS__isDissipateReqst( VMSReqst *req );
   6.384 -
   6.385 -inline int
   6.386 -VMS__isCreateReqst( VMSReqst *req );
   6.387 -
   6.388 -//==========================
   6.389 -
   6.390 -void
   6.391 -VMS__suspend_procr( VirtProcr *callingPr );
   6.392 -
   6.393 -void
   6.394 -VMS__dissipate_procr( VirtProcr *prToDissipate );
   6.395 -
   6.396 -void
   6.397 -VMS__handle_dissipate_reqst( VirtProcr *procrToDissipate );
   6.398 -
   6.399 -void
   6.400 -VMS__cleanup_after_shutdown();
   6.401 -
   6.402 -//==========================
   6.403 -void
   6.404 -measureTSCOffsetsAsCore0();
   6.405 -
   6.406 -void
   6.407 -measureTSCOffsetsAsRemoteCore( int coreIdx );
   6.408 -
   6.409 -//============================= Statistics ==================================
   6.410 -
   6.411 -typedef unsigned long long TSCount;
   6.412 -
   6.413     //Frequency of TS counts
   6.414     //TODO: change freq for each machine
   6.415  #define TSCOUNT_FREQ 3180000000
   6.416 +//======================== STATS ======================
   6.417 +
   6.418 +//===== RDTSC wrapper =====
   6.419  
   6.420  #define saveTimeStampCountInto(low, high) \
   6.421     asm volatile("RDTSC;                   \
   6.422 @@ -284,10 +376,12 @@
   6.423     /* inputs  */ :                        \
   6.424     /* clobber */ : "%eax", "%edx"         \
   6.425                  );
   6.426 +//=====
   6.427  
   6.428 -inline TSCount getTSC();
   6.429 +#include "SwitchAnimators.h"
   6.430 +#include "probes.h"
   6.431  
   6.432 -inline TSCount getTSC();
   6.433 +
   6.434  
   6.435  //===================== Debug ==========================
   6.436  int numProcrsCreated;
   6.437 @@ -298,4 +392,3 @@
   6.438  TSCount  *pingTimes;
   6.439  
   6.440  #endif	/* _VMS_H */
   6.441 -
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/VMS__DESIGN_NOTES.txt	Thu Nov 11 06:19:51 2010 -0800
     7.3 @@ -0,0 +1,2 @@
     7.4 +
     7.5 +Implement VMS this way:
     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/probes.c	Thu Nov 11 06:19:51 2010 -0800
     8.3 @@ -0,0 +1,354 @@
     8.4 +/*
     8.5 + * Copyright 2010  OpenSourceStewardshipFoundation
     8.6 + *
     8.7 + * Licensed under BSD
     8.8 + */
     8.9 +
    8.10 +#include <stdio.h>
    8.11 +#include <stdlib.h>
    8.12 +#include <malloc.h>
    8.13 +#include <sys/time.h>
    8.14 +#include <string.h>
    8.15 +
    8.16 +#include "VMS.h"
    8.17 +#include "Queue_impl/BlockingQueue.h"
    8.18 +#include "Histogram/Histogram.h"
    8.19 +
    8.20 +
    8.21 +//================================ STATS ====================================
    8.22 +
    8.23 +inline TSCount getTSCount()
    8.24 + { unsigned int low, high;
    8.25 +   TSCount  out;
    8.26 +
    8.27 +   saveTimeStampCountInto( low, high );
    8.28 +   out = high;
    8.29 +   out = (out << 32) + low;
    8.30 +   return out;
    8.31 + }
    8.32 +
    8.33 +
    8.34 +
    8.35 +//====================  Probes =================
    8.36 +#ifdef STATS__USE_TSC_PROBES
    8.37 +
    8.38 +int32
    8.39 +VMS__create_histogram_probe( int32 numBins, float32 startValue,
    8.40 +                             float32 binWidth, char *nameStr )
    8.41 + { IntervalProbe *newProbe;
    8.42 +   int32 idx;
    8.43 +   FloatHist *hist;
    8.44 +
    8.45 +   idx = VMS__create_single_interval_probe( nameStr );
    8.46 +   newProbe =  _VMSMasterEnv->intervalProbes[ idx ];
    8.47 +
    8.48 +   hist =  makeFloatHistogram( numBins, startValue, binWidth );
    8.49 +   newProbe->hist = hist;
    8.50 +   return idx;
    8.51 + }
    8.52 +
    8.53 +void
    8.54 +VMS_impl__record_interval_start_in_probe( int32 probeID )
    8.55 + { IntervalProbe *probe;
    8.56 +
    8.57 +   probe = _VMSMasterEnv->intervalProbes[ probeID ];
    8.58 +   probe->startStamp = getTSCount();
    8.59 + }
    8.60 +
    8.61 +void
    8.62 +VMS_impl__record_interval_end_in_probe( int32 probeID )
    8.63 + { IntervalProbe *probe;
    8.64 +   TSCount endStamp;
    8.65 +
    8.66 +   endStamp = getTSCount();
    8.67 +
    8.68 +   probe = _VMSMasterEnv->intervalProbes[ probeID ];
    8.69 +   probe->endStamp = endStamp;
    8.70 +
    8.71 +   if( probe->hist != NULL )
    8.72 +    { TSCount interval = probe->endStamp - probe->startStamp;
    8.73 +         //if the interval is sane, then add to histogram
    8.74 +      if( interval < probe->hist->endOfRange * 10 )
    8.75 +         addToFloatHist( interval, probe->hist );
    8.76 +    }
    8.77 + }
    8.78 +
    8.79 +void
    8.80 +VMS_impl__print_stats_of_probe( int32 probeID )
    8.81 + { IntervalProbe *probe;
    8.82 +
    8.83 +   probe = _VMSMasterEnv->intervalProbes[ probeID ];
    8.84 +
    8.85 +   if( probe->hist == NULL )
    8.86 +    {
    8.87 +      printf("probe: %s, interval: %.6lf\n", probe->nameStr,probe->interval);
    8.88 +    }
    8.89 +
    8.90 +   else
    8.91 +    {
    8.92 +      printf( "probe: %s\n", probe->nameStr );
    8.93 +      printFloatHist( probe->hist );
    8.94 +    }
    8.95 + }
    8.96 +#else
    8.97 +
    8.98 +/*
    8.99 + * In practice, probe operations are called from the app, from inside slaves
   8.100 + *  -- so have to be sure each probe is single-VP owned, and be sure that
   8.101 + *  any place common structures are modified it's done inside the master.
   8.102 + * So -- the only place common structures are modified is during creation.
   8.103 + *  after that, all mods are to individual instances.
   8.104 + *
   8.105 + * Thinking perhaps should change the semantics to be that probes are
   8.106 + *  attached to the virtual processor -- and then everything is guaranteed
   8.107 + *  to be isolated -- except then can't take any intervals that span VPs,
   8.108 + *  and would have to transfer the probes to Master env when VP dissipates..
   8.109 + *  gets messy..
   8.110 + *
   8.111 + * For now, just making so that probe creation causes a suspend, so that
   8.112 + *  the dynamic array in the master env is only modified from the master
   8.113 + * 
   8.114 + */
   8.115 +IntervalProbe *
   8.116 +create_generic_probe( char *nameStr, VirtProcr *animPr )
   8.117 + { IntervalProbe *newProbe;
   8.118 +   VMSSemReq reqData;
   8.119 +
   8.120 +   reqData.reqType  = createProbe;
   8.121 +   reqData.nameStr  = nameStr;
   8.122 +
   8.123 +   VMS__send_VMSSem_request( &reqData, animPr );
   8.124 +
   8.125 +   return animPr->dataRetFromReq;
   8.126 + }
   8.127 +
   8.128 +/*Use this version from outside VMS -- it uses external malloc, and modifies
   8.129 + * dynamic array, so can't be animated in a slave VP.  Stores a private,
   8.130 + * NUL-terminated copy of nameStr in the probe and returns the new probe. */
   8.131 +IntervalProbe *
   8.132 +ext__create_generic_probe( char *nameStr )
   8.133 + { IntervalProbe *newProbe;
   8.134 +   int32          nameLen;
   8.135 +
   8.136 +   newProbe          = malloc( sizeof(IntervalProbe) );
   8.137 +   nameLen = strlen( nameStr ) + 1; //+1 so the terminating NUL is copied
   8.138 +   newProbe->nameStr = malloc( nameLen );
   8.139 +   memcpy( newProbe->nameStr, nameStr, nameLen );
   8.140 +   newProbe->hist    = NULL;
   8.141 +   newProbe->schedChoiceWasRecorded = FALSE;
   8.142 +   newProbe->probeID =
   8.143 +             addToDynArray( newProbe, _VMSMasterEnv->dynIntervalProbesInfo );
   8.144 +
   8.145 +   return newProbe;
   8.146 + }
   8.147 +
   8.148 +
   8.149 +/*Only call from inside master or main startup/shutdown thread.
   8.150 + *NOTE(review): frees via VMS__free -- confirm ext__ (malloc'd) probes OK*/
   8.151 +void
   8.152 +VMS_impl__free_probe( IntervalProbe *probe )
   8.153 + { if( probe->hist != NULL )   freeDblHist( probe->hist );
   8.154 +   if( probe->nameStr != NULL) VMS__free( probe->nameStr );
   8.155 +   VMS__free( probe );
   8.156 + }
   8.157 +
   8.158 +
   8.159 +int32
   8.160 +VMS_impl__record_time_point_into_new_probe( char *nameStr, VirtProcr *animPr)
   8.161 + { IntervalProbe *newProbe;
   8.162 +   struct timeval *startStamp;
   8.163 +   float64 startSecs;
   8.164 +
   8.165 +   newProbe           = create_generic_probe( nameStr, animPr );
   8.166 +   newProbe->endSecs  = 0;
   8.167 +
   8.168 +   gettimeofday( &(newProbe->startStamp), NULL);
   8.169 +
   8.170 +      //turn into a double
   8.171 +   startStamp = &(newProbe->startStamp);
   8.172 +   startSecs = startStamp->tv_sec + ( startStamp->tv_usec / 1000000.0 );
   8.173 +   newProbe->startSecs = startSecs;
   8.174 +
   8.175 +   return newProbe->probeID;
   8.176 + }
   8.177 +
   8.178 +int32
   8.179 +VMS_ext_impl__record_time_point_into_new_probe( char *nameStr )
   8.180 + { IntervalProbe *newProbe;
   8.181 +   struct timeval *startStamp;
   8.182 +   float64 startSecs;
   8.183 +
   8.184 +   newProbe           = ext__create_generic_probe( nameStr );
   8.185 +   newProbe->endSecs  = 0;
   8.186 +
   8.187 +   gettimeofday( &(newProbe->startStamp), NULL);
   8.188 +
   8.189 +      //turn into a double
   8.190 +   startStamp = &(newProbe->startStamp);
   8.191 +   startSecs = startStamp->tv_sec + ( startStamp->tv_usec / 1000000.0 );
   8.192 +   newProbe->startSecs = startSecs;
   8.193 +
   8.194 +   return newProbe->probeID;
   8.195 + }
   8.196 +
   8.197 +int32
   8.198 +VMS_impl__create_single_interval_probe( char *nameStr, VirtProcr *animPr )
   8.199 + { IntervalProbe *newProbe;
   8.200 +
   8.201 +   newProbe = create_generic_probe( nameStr, animPr );
   8.202 +   
   8.203 +   return newProbe->probeID;
   8.204 + }
   8.205 +
   8.206 +int32
   8.207 +VMS_impl__create_histogram_probe( int32   numBins, float64    startValue,
   8.208 +               float64 binWidth, char   *nameStr, VirtProcr *animPr )
   8.209 + { IntervalProbe *newProbe;
   8.210 +   DblHist *hist;
   8.211 +
   8.212 +   newProbe = create_generic_probe( nameStr, animPr );
   8.213 +   
   8.214 +   hist =  makeDblHistogram( numBins, startValue, binWidth );
   8.215 +   newProbe->hist = hist;
   8.216 +   return newProbe->probeID;
   8.217 + }
   8.218 +
   8.219 +void
   8.220 +VMS_impl__index_probe_by_its_name( int32 probeID, VirtProcr *animPr )
   8.221 + { IntervalProbe *probe;
   8.222 +
   8.223 +   //TODO: fix this To be in Master -- race condition
   8.224 +   probe = _VMSMasterEnv->intervalProbes[ probeID ];
   8.225 +
   8.226 +   addValueIntoTable(probe->nameStr, probe, _VMSMasterEnv->probeNameHashTbl);
   8.227 + }
   8.228 +
   8.229 +IntervalProbe *
   8.230 +VMS_impl__get_probe_by_name( char *probeName, VirtProcr *animPr )
   8.231 + {
   8.232 +   //TODO: fix this To be in Master -- race condition
   8.233 +   return getValueFromTable( probeName, _VMSMasterEnv->probeNameHashTbl );
   8.234 + }
   8.235 +
   8.236 +
   8.237 +/*Copies the animating procr's core number, ID and creation time into the
   8.238 + * probe.  Everything is local to the animating procr, so no request needed
   8.239 + */
   8.240 +void
   8.241 +VMS_impl__record_sched_choice_into_probe( int32 probeID, VirtProcr *animatingPr )
   8.242 + { IntervalProbe *probe;
   8.243 + 
   8.244 +   probe = _VMSMasterEnv->intervalProbes[ probeID ];
   8.245 +   probe->schedChoiceWasRecorded = TRUE; //makes print include these fields
   8.246 +   probe->coreNum = animatingPr->coreAnimatedBy;
   8.247 +   probe->procrID = animatingPr->procrID;
   8.248 +   probe->procrCreateSecs = animatingPr->createPtInSecs;
   8.249 + }
   8.250 +
   8.251 +/*Stamps the probe's start time using gettimeofday.  Everything is local to
   8.252 + * the animating procr, so no need for a request -- work is done in anim Pr
   8.253 + */
   8.254 +void
   8.255 +VMS_impl__record_interval_start_in_probe( int32 probeID )
   8.256 + { IntervalProbe *probe;
   8.257 +
   8.258 +         DEBUG( dbgProbes, "record start of interval\n" )
   8.259 +   probe = _VMSMasterEnv->intervalProbes[ probeID ];
   8.260 +   gettimeofday( &(probe->startStamp), NULL );
   8.261 + }
   8.262 +
   8.263 +
   8.264 +/*Stamps the end time, then stores start, end and interval as seconds held
   8.265 + * in doubles.  Everything is local to the anim Pr, so no request is needed
   8.266 + */
   8.267 +void
   8.268 +VMS_impl__record_interval_end_in_probe( int32 probeID )
   8.269 + { IntervalProbe *probe;
   8.270 +   struct timeval *endStamp, *startStamp;
   8.271 +   float64 startSecs, endSecs;
   8.272 +
   8.273 +         DEBUG( dbgProbes, "record end of interval\n" )
   8.274 +      //NOTE: may be unsafe if a different core resizes the probe array right
   8.275 +      // after this one gets the probe -- unverified, ignored for now
   8.276 +   probe = _VMSMasterEnv->intervalProbes[ probeID ];
   8.277 +   gettimeofday( &(probe->endStamp), NULL);
   8.278 +
   8.279 +      //now turn into an interval held in a double
   8.280 +   startStamp = &(probe->startStamp);
   8.281 +   endStamp   = &(probe->endStamp);
   8.282 +
   8.283 +   startSecs = startStamp->tv_sec + ( startStamp->tv_usec / 1000000.0 );
   8.284 +   endSecs   = endStamp->tv_sec   + ( endStamp->tv_usec / 1000000.0 );
   8.285 +
   8.286 +   probe->interval  = endSecs - startSecs;
   8.287 +   probe->startSecs = startSecs;
   8.288 +   probe->endSecs   = endSecs;
   8.289 +
   8.290 +   if( probe->hist != NULL )
   8.291 +    {
   8.292 +         //sanity check: only add intervals below 10x the hist's endOfRange
   8.293 +      if( probe->interval < probe->hist->endOfRange * 10 )
   8.294 +         addToDblHist( probe->interval, probe->hist );
   8.295 +    }
   8.296 + }
   8.297 +
   8.298 +void
   8.299 +print_probe_helper( IntervalProbe *probe )
   8.300 + {    //prints one probe to stdout; what is printed depends on what it holds
   8.301 +   printf( "\nprobe: %s, ",  probe->nameStr );
   8.302 +   
   8.303 +   if( probe->schedChoiceWasRecorded )
   8.304 +    { printf( "coreNum: %d, procrID: %d, procrCreated: %.6lf | ",
   8.305 +              probe->coreNum, probe->procrID, probe->procrCreateSecs );
   8.306 +    }
   8.307 +
   8.308 +   if( probe->endSecs == 0 ) //just a single point in time
   8.309 +    {
   8.310 +      printf( " time point: %.6lf\n",  //relative to master env's create pt
   8.311 +              probe->startSecs - _VMSMasterEnv->createPtInSecs );
   8.312 +    }
   8.313 +   else if( probe->hist == NULL ) //just an interval
   8.314 +    {
   8.315 +      printf( " startSecs: %.6lf, interval: %.6lf\n", 
   8.316 +         probe->startSecs - _VMSMasterEnv->createPtInSecs, probe->interval);
   8.317 +    }
   8.318 +   else  //a full histogram of intervals
   8.319 +    {
   8.320 +      printDblHist( probe->hist );
   8.321 +    }
   8.322 + }
   8.323 +
   8.324 +//TODO: pass around a pointer to the probe instead of its array index.
   8.325 +// That eliminates any chance of a resize's timing breaking the lookup
   8.326 +// (even though don't think it actually can cause problems).  There's no
   8.327 +// need to pass the index around -- have the hash table for names, so only
   8.328 +// look up once, then keep the ptr to the probe.  Enum'ing the index and
   8.329 +// using that as the name is clunky in practice -- just hash.
   8.330 +void
   8.331 +VMS_impl__print_stats_of_probe( int32 probeID )
   8.332 + { IntervalProbe *probe;
   8.333 +
   8.334 +   probe = _VMSMasterEnv->intervalProbes[ probeID ];
   8.335 +
   8.336 +   print_probe_helper( probe );
   8.337 + }
   8.338 +
   8.339 +
   8.340 +
   8.341 +void
   8.342 +generic_print_probe( void *_probe )
   8.343 + {
   8.344 +      //takes void* so it can be handed to forAllInDynArrayDo; the pointer
   8.345 +      // is treated as an IntervalProbe and printed
   8.346 +   print_probe_helper( (IntervalProbe *)_probe );
   8.347 + }
   8.348 +
   8.349 +void
   8.350 +VMS_impl__print_stats_of_all_probes()
   8.351 + {    //prints every probe held in the dyn array, then flushes stdout
   8.352 +
   8.353 +   forAllInDynArrayDo( _VMSMasterEnv->dynIntervalProbesInfo,
   8.354 +                       &generic_print_probe );
   8.355 +   fflush( stdout );
   8.356 + }
   8.357 +#endif
     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/probes.h	Thu Nov 11 06:19:51 2010 -0800
     9.3 @@ -0,0 +1,194 @@
     9.4 +/*
     9.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     9.6 + *  Licensed under GNU General Public License version 2
     9.7 + *
     9.8 + * Author: seanhalle@yahoo.com
     9.9 + * 
    9.10 + */
    9.11 +
    9.12 +#ifndef _PROBES_H
    9.13 +#define	_PROBES_H
    9.14 +#define __USE_GNU
    9.15 +
    9.16 +#include "VMS_primitive_data_types.h"
    9.17 +
    9.18 +#include <sys/time.h>
    9.19 +
    9.20 +
    9.21 +   //Probes measure time intervals.  They are macros that only compile to
    9.22 +   // something when enabled, and are saved in the master env only then.
    9.23 +   // NOTE(review): the #ifdef below tests STATS__ENABLE_PROBES, but this
    9.24 +   // file defines STATS__TURN_ON_PROBES -- confirm which name is intended.
    9.25 +   //The TSC probes use RDTSC instr, can be unreliable, Dbl uses gettimeofday
    9.26 +#define STATS__TURN_ON_PROBES
    9.27 +//#define STATS__USE_TSC_PROBES
    9.28 +#define STATS__USE_DBL_PROBES
    9.29 +
    9.30 +//typedef struct _IntervalProbe IntervalProbe; //in VMS.h
    9.31 +
    9.32 +struct _IntervalProbe
    9.33 + {
    9.34 +   char           *nameStr;  //key used when indexed in the name hash table
    9.35 +   int32           probeID;  //used to index master env's intervalProbes
    9.36 +
    9.37 +   int32           schedChoiceWasRecorded; //TRUE once the next 3 are set
    9.38 +   int32           coreNum;
    9.39 +   int32           procrID;
    9.40 +   float64         procrCreateSecs;
    9.41 +
    9.42 +   #ifdef STATS__USE_TSC_PROBES
    9.43 +   TSCount    startStamp;
    9.44 +   TSCount    endStamp;
    9.45 +   #else
    9.46 +   struct timeval  startStamp;
    9.47 +   struct timeval  endStamp;
    9.48 +   #endif
    9.49 +   float64         startSecs; //start stamp converted to seconds
    9.50 +   float64         endSecs;   //end stamp converted to seconds
    9.51 +   float64         interval;  //endSecs minus startSecs
    9.52 +   DblHist        *hist;//if NULL, then is single interval probe
    9.53 + };
    9.54 +
    9.55 +
    9.56 +//============================= Statistics ==================================
    9.57 +
    9.58 +   //Frequency of TS counts
    9.59 +   //TODO: change freq for each machine
    9.60 +#define TSCOUNT_FREQ 3180000000
    9.61 +
    9.62 +inline TSCount getTSCount();
    9.63 +
    9.64 +
    9.65 +//======================== Probes =============================
    9.66 +//
    9.67 +// Use macros to allow turning probes off with a #define switch
    9.68 +#ifdef STATS__ENABLE_PROBES
    9.69 +int32
    9.70 +VMS_impl__record_time_point_into_new_probe( char *nameStr,VirtProcr *animPr);
    9.71 +#define VMS__record_time_point_into_new_probe( nameStr, animPr ) \
    9.72 +        VMS_impl__record_time_point_into_new_probe( nameStr, animPr )
    9.73 +
    9.74 +int32
    9.75 +VMS_ext_impl__record_time_point_into_new_probe( char *nameStr );
    9.76 +#define VMS_ext__record_time_point_into_new_probe( nameStr ) \
    9.77 +        VMS_ext_impl__record_time_point_into_new_probe( nameStr )
    9.78 +
    9.79 +
    9.80 +int32
    9.81 +VMS_impl__create_single_interval_probe( char *nameStr, VirtProcr *animPr );
    9.82 +#define VMS__create_single_interval_probe( nameStr, animPr ) \
    9.83 +        VMS_impl__create_single_interval_probe( nameStr, animPr )
    9.84 +
    9.85 +
    9.86 +int32
    9.87 +VMS_impl__create_histogram_probe( int32   numBins, float64    startValue,
    9.88 +               float64 binWidth, char    *nameStr, VirtProcr *animPr );
    9.89 +#define VMS__create_histogram_probe(      numBins, startValue,              \
    9.90 +                                          binWidth, nameStr, animPr )       \
    9.91 +        VMS_impl__create_histogram_probe( numBins, startValue,              \
    9.92 +                                          binWidth, nameStr, animPr )
    9.93 +void
    9.94 +VMS_impl__free_probe( IntervalProbe *probe );
    9.95 +#define VMS__free_probe( probe ) \
    9.96 +        VMS_impl__free_probe( probe )
    9.97 +
    9.98 +void
    9.99 +VMS_impl__index_probe_by_its_name( int32 probeID, VirtProcr *animPr );
   9.100 +#define VMS__index_probe_by_its_name( probeID, animPr ) \
   9.101 +        VMS_impl__index_probe_by_its_name( probeID, animPr )
   9.102 +
   9.103 +IntervalProbe *
   9.104 +VMS_impl__get_probe_by_name( char *probeName, VirtProcr *animPr );
   9.105 +#define VMS__get_probe_by_name( probeName, animPr ) \
   9.106 +        VMS_impl__get_probe_by_name( probeName, animPr )
   9.107 +
   9.108 +void
   9.109 +VMS_impl__record_sched_choice_into_probe( int32 probeID, VirtProcr *animPr );
   9.110 +#define VMS__record_sched_choice_into_probe( probeID, animPr ) \
   9.111 +        VMS_impl__record_sched_choice_into_probe( probeID, animPr )
   9.112 +
   9.113 +void
   9.114 +VMS_impl__record_interval_start_in_probe( int32 probeID );
   9.115 +#define VMS__record_interval_start_in_probe( probeID ) \
   9.116 +        VMS_impl__record_interval_start_in_probe( probeID )
   9.117 +
   9.118 +void
   9.119 +VMS_impl__record_interval_end_in_probe( int32 probeID );
   9.120 +#define VMS__record_interval_end_in_probe( probeID ) \
   9.121 +        VMS_impl__record_interval_end_in_probe( probeID )
   9.122 +
   9.123 +void
   9.124 +VMS_impl__print_stats_of_probe( int32 probeID );
   9.125 +#define VMS__print_stats_of_probe( probeID ) \
   9.126 +        VMS_impl__print_stats_of_probe( probeID )
   9.127 +
   9.128 +void
   9.129 +VMS_impl__print_stats_of_all_probes();
   9.130 +#define VMS__print_stats_of_all_probes \
   9.131 +        VMS_impl__print_stats_of_all_probes
   9.132 +
   9.133 +
   9.134 +#else
   9.135 +int32
   9.136 +VMS_impl__record_time_point_into_new_probe( char *nameStr,VirtProcr *animPr);
   9.137 +#define VMS__record_time_point_into_new_probe( nameStr, animPr ) \
   9.138 +       0 /* do nothing */
   9.139 +
   9.140 +int32
   9.141 +VMS_ext_impl__record_time_point_into_new_probe( char *nameStr );
   9.142 +#define VMS_ext__record_time_point_into_new_probe( nameStr ) \
   9.143 +       0 /* do nothing */
   9.144 +
   9.145 +
   9.146 +int32
   9.147 +VMS_impl__create_single_interval_probe( char *nameStr, VirtProcr *animPr );
   9.148 +#define VMS__create_single_interval_probe( nameStr, animPr ) \
   9.149 +       0 /* do nothing */
   9.150 +
   9.151 +
   9.152 +int32
   9.153 +VMS_impl__create_histogram_probe( int32   numBins, float64    startValue,
   9.154 +               float64 binWidth, char    *nameStr, VirtProcr *animPr );
   9.155 +#define VMS__create_histogram_probe(      numBins, startValue,              \
   9.156 +                                          binWidth, nameStr, animPr )       \
   9.157 +       0 /* do nothing */
   9.158 +
   9.159 +void
   9.160 +VMS_impl__index_probe_by_its_name( int32 probeID, VirtProcr *animPr );
   9.161 +#define VMS__index_probe_by_its_name( probeID, animPr ) \
   9.162 +        /* do nothing */
   9.163 +
   9.164 +IntervalProbe *
   9.165 +VMS_impl__get_probe_by_name( char *probeName, VirtProcr *animPr );
   9.166 +#define VMS__get_probe_by_name( probeID, animPr ) \
   9.167 +       NULL /* do nothing */
   9.168 +
   9.169 +void
   9.170 +VMS_impl__record_sched_choice_into_probe( int32 probeID, VirtProcr *animPr );
   9.171 +#define VMS__record_sched_choice_into_probe( probeID, animPr ) \
   9.172 +        /* do nothing */
   9.173 +
   9.174 +void
   9.175 +VMS_impl__record_interval_start_in_probe( int32 probeID );
   9.176 +#define VMS__record_interval_start_in_probe( probeID ) \
   9.177 +        /* do nothing */
   9.178 +
   9.179 +void
   9.180 +VMS_impl__record_interval_end_in_probe( int32 probeID );
   9.181 +#define VMS__record_interval_end_in_probe( probeID ) \
   9.182 +        /* do nothing */
   9.183 +
   9.184 +void
   9.185 +VMS_impl__print_stats_of_probe( int32 probeID );
   9.186 +#define VMS__print_stats_of_probe( probeID ) \
   9.187 +        /* do nothing */
   9.188 +
   9.189 +void
   9.190 +VMS_impl__print_stats_of_all_probes();
   9.191 +#define VMS__print_stats_of_all_probes() \
   9.192 +        /* do nothing -- function-like so a call leaves no stray () */
   9.193 +
   9.194 +#endif   /* defined STATS__ENABLE_PROBES */
   9.195 +
   9.196 +#endif	/* _PROBES_H */
   9.197 +
    10.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.2 +++ b/vmalloc.c	Thu Nov 11 06:19:51 2010 -0800
    10.3 @@ -0,0 +1,327 @@
    10.4 +/*
    10.5 + *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
    10.6 + *  Licensed under GNU General Public License version 2
    10.7 + *
    10.8 + * Author: seanhalle@yahoo.com
    10.9 + *
   10.10 + * Created on November 14, 2009, 9:07 PM
   10.11 + */
   10.12 +
   10.13 +#include <malloc.h>
   10.14 +#include <stdlib.h>
   10.15 +
   10.16 +#include "VMS.h"
   10.17 +
   10.18 +/*Helper function
   10.19 + *Insert a newly generated free chunk into the first spot on the free list.
   10.20 + * The chunk is cast as a MallocProlog, so the various pointers in it are
   10.21 + * accessed with C's help -- and the size of the prolog is easily added to
   10.22 + * the pointer when a chunk is returned to the app -- so C handles changes
   10.23 + * in pointer sizes among machines.
   10.24 + *
   10.25 + *The list head is a normal MallocProlog struct -- identified by its
   10.26 + * prevChunkInFreeList being NULL -- the only one in the free list like that.
   10.27 + *
   10.28 + *The end of the list is identified by next chunk being NULL, as usual.
   10.29 + */
   10.30 +void inline
   10.31 +add_chunk_to_free_list( MallocProlog *chunk, MallocProlog *listHead )
   10.32 + { 
   10.33 +   chunk->nextChunkInFreeList     = listHead->nextChunkInFreeList;
   10.34 +   if( chunk->nextChunkInFreeList != NULL ) //if not last in free list
   10.35 +      chunk->nextChunkInFreeList->prevChunkInFreeList = chunk;
   10.36 +   chunk->prevChunkInFreeList     = listHead; //non-NULL prev marks it free
   10.37 +   listHead->nextChunkInFreeList  = chunk;
   10.38 + }
   10.39 +
   10.40 +
   10.41 +/*This is sequential code, meant to only be called from the Master, not from
   10.42 + * any slave VPs.
   10.43 + *Search down list, checking size by the nextHigherInMem pointer, to find
   10.44 + * first chunk bigger than size needed.
   10.45 + *Shave off the extra and make it into a new free-list element, hook it in
   10.46 + * then return the address of the found element plus size of prolog.
   10.47 + *
   10.48 + *Will find a chunk only when its size exceeds the request plus one prolog.
   10.49 + */
   10.50 +void *
   10.51 +VMS__malloc( int32 sizeRequested )
   10.52 + { MallocProlog *foundElem = NULL, *currElem, *newElem;
   10.53 +   int32         amountExtra, foundElemIsTopOfHeap, sizeConsumed,sizeOfFound;
   10.54 +
   10.55 +      //step up the size to a multiple of 16 (adds 16 if already a multiple)
   10.56 +   sizeRequested = ((sizeRequested + 16) >> 4) << 4;
   10.57 +   currElem = (_VMSMasterEnv->freeListHead)->nextChunkInFreeList;
   10.58 +
   10.59 +   while( currElem != NULL )
   10.60 +    {    //check if size of currElem is big enough
   10.61 +      sizeOfFound=(int32)((char*)currElem->nextHigherInMem -(char*)currElem);
   10.62 +      amountExtra = sizeOfFound - sizeRequested - sizeof(MallocProlog);
   10.63 +      if( amountExtra > 0 )
   10.64 +       {    //found it, get out of loop
   10.65 +         foundElem = currElem;
   10.66 +         currElem = NULL;
   10.67 +       }
   10.68 +      else
   10.69 +         currElem = currElem->nextChunkInFreeList;
   10.70 +    }
   10.71 +
   10.72 +   if( foundElem == NULL )
   10.73 +    { ERROR("\nmalloc failed\n")
   10.74 +      return (void *)NULL;  //indicates malloc failed
   10.75 +    }
   10.76 +      //Using a kludge to identify the element that is the top chunk in the
   10.77 +      // heap -- saving top-of-heap addr in head's nextHigherInMem -- and
   10.78 +      // save addr of start of heap in head's nextLowerInMem
   10.79 +      //Will handle top of Heap specially
   10.80 +   foundElemIsTopOfHeap = foundElem->nextHigherInMem ==
   10.81 +                          _VMSMasterEnv->freeListHead->nextHigherInMem;
   10.82 +
   10.83 +      //before shave off and try to insert new elem, remove found elem
   10.84 +      //note, foundElem will never be the head, so always has valid prevChunk
   10.85 +   foundElem->prevChunkInFreeList->nextChunkInFreeList =
   10.86 +                                              foundElem->nextChunkInFreeList;
   10.87 +   if( foundElem->nextChunkInFreeList != NULL )
   10.88 +    { foundElem->nextChunkInFreeList->prevChunkInFreeList =
   10.89 +                                              foundElem->prevChunkInFreeList;
   10.90 +    }
   10.91 +   foundElem->prevChunkInFreeList = NULL;//indicates elem currently allocated
   10.92 +   
   10.93 +      //if more than 64 bytes extra, turn extra into new elem & insert it
   10.94 +   if( amountExtra > 64 )
   10.95 +    {    //make new elem by adding to addr of curr elem then casting
   10.96 +      sizeConsumed = sizeof(MallocProlog) + sizeRequested;
   10.97 +      newElem = (MallocProlog *)( (char *)foundElem + sizeConsumed );
   10.98 +      newElem->nextHigherInMem   = foundElem->nextHigherInMem;
   10.99 +      newElem->nextLowerInMem    = foundElem;
  10.100 +      foundElem->nextHigherInMem = newElem;
  10.101 +      
  10.102 +      if( ! foundElemIsTopOfHeap )
  10.103 +       {    //there is no next higher for top of heap, so can't write to it
  10.104 +         newElem->nextHigherInMem->nextLowerInMem = newElem;
  10.105 +       }
  10.106 +      add_chunk_to_free_list( newElem, _VMSMasterEnv->freeListHead );
  10.107 +    }
  10.108 +   else
  10.109 +    {    //too little left over to split, so the whole chunk is handed out
  10.110 +      sizeConsumed = sizeOfFound;
  10.111 +    }
  10.112 +  _VMSMasterEnv->amtOfOutstandingMem += sizeConsumed;
  10.113 +
  10.114 +      //skip over the prolog by adding its size to the pointer return
  10.115 +   return (void *)((char *)foundElem + sizeof(MallocProlog));
  10.116 + }
  10.117 +
  10.118 +
  10.119 +/*This is sequential code -- only to be called from the Master
  10.120 + * When free, subtract the size of prolog from pointer, then cast it to a
  10.121 + * MallocProlog.  Then check the nextLower and nextHigher chunks to see if
  10.122 + * one or both are also free, and coalesce if so, and if neither free, then
  10.123 + * add this one to free-list.
  10.124 + */
  10.125 +void
  10.126 +VMS__free( void *ptrToFree )
  10.127 + { MallocProlog *elemToFree, *nextLowerElem, *nextHigherElem;
  10.128 +   int32         lowerExistsAndIsFree, higherExistsAndIsFree, sizeOfElem;
  10.129 +
  10.130 +   if( ptrToFree <  (void*)_VMSMasterEnv->freeListHead->nextLowerInMem ||
  10.131 +       ptrToFree >= (void*)_VMSMasterEnv->freeListHead->nextHigherInMem )
  10.132 +    {    //outside the range of data owned by VMS's malloc, so do nothing
  10.133 +      return;
  10.134 +    }
  10.135 +      //subtract size of prolog to get pointer to prolog, then cast
  10.136 +   elemToFree = (MallocProlog *)((char *)ptrToFree - sizeof(MallocProlog));
  10.137 +   sizeOfElem =(int32)((char*)elemToFree->nextHigherInMem-(char*)elemToFree);
  10.138 +
  10.139 +   if( elemToFree->prevChunkInFreeList != NULL )
  10.140 +    { printf( "error: freeing same element twice!" ); exit(1);
  10.141 +    }
  10.142 +
  10.143 +   _VMSMasterEnv->amtOfOutstandingMem -= sizeOfElem;
  10.144 +
  10.145 +   nextLowerElem  = elemToFree->nextLowerInMem;
  10.146 +   nextHigherElem = elemToFree->nextHigherInMem;
  10.147 +
  10.148 +      //top chunk's nextHigher is the top-of-heap addr, which is past the
  10.149 +      // end of the heap, so it must never be dereferenced
  10.150 +   if( nextHigherElem == NULL ||
  10.151 +       nextHigherElem == _VMSMasterEnv->freeListHead->nextHigherInMem )
  10.152 +      higherExistsAndIsFree = FALSE;
  10.153 +   else //okay exists, now check if in the free-list by checking back ptr
  10.154 +      higherExistsAndIsFree = (nextHigherElem->prevChunkInFreeList != NULL);
  10.155 +   if( nextLowerElem == NULL )
  10.156 +      lowerExistsAndIsFree = FALSE;
  10.157 +   else //okay, it exists, now check if it's free
  10.158 +      lowerExistsAndIsFree = (nextLowerElem->prevChunkInFreeList != NULL);
  10.159 +      //now, know what exists and what's free
  10.160 +   if( lowerExistsAndIsFree )
  10.161 +    { if( higherExistsAndIsFree )
  10.162 +       {    //both exist and are free, so coalesce all three
  10.163 +            //First, remove higher from free-list
  10.164 +         nextHigherElem->prevChunkInFreeList->nextChunkInFreeList =
  10.165 +                                         nextHigherElem->nextChunkInFreeList;
  10.166 +         if( nextHigherElem->nextChunkInFreeList != NULL ) //end-of-list?
  10.167 +            nextHigherElem->nextChunkInFreeList->prevChunkInFreeList =
  10.168 +                                         nextHigherElem->prevChunkInFreeList;
  10.169 +            //Now, fix-up sequence-in-mem list -- by side-effect, this also
  10.170 +            // changes size of the lower elem, which is still in free-list
  10.171 +         nextLowerElem->nextHigherInMem = nextHigherElem->nextHigherInMem;
  10.172 +         if( nextHigherElem->nextHigherInMem !=
  10.173 +             _VMSMasterEnv->freeListHead->nextHigherInMem )
  10.174 +            nextHigherElem->nextHigherInMem->nextLowerInMem = nextLowerElem;
  10.175 +            //notice didn't do anything to elemToFree -- it simply is no
  10.176 +            // longer reachable from any of the lists.  Wonder if could be a
  10.177 +            // security leak because left valid addresses in it,
  10.178 +            // but don't care for now.
  10.179 +       }
  10.180 +      else
  10.181 +       {    //lower is the only of the two that exists and is free,
  10.182 +            //In this case, no adjustment to free-list, just change mem-list.
  10.183 +            // By side-effect, changes size of the lower elem
  10.184 +         nextLowerElem->nextHigherInMem = elemToFree->nextHigherInMem;
  10.185 +         if( elemToFree->nextHigherInMem !=
  10.186 +             _VMSMasterEnv->freeListHead->nextHigherInMem )
  10.187 +            elemToFree->nextHigherInMem->nextLowerInMem = nextLowerElem;
  10.188 +       }
  10.189 +    }
  10.190 +   else
  10.191 +    {    //lower either doesn't exist or isn't free, so check higher
  10.192 +      if( higherExistsAndIsFree )
  10.193 +       {    //higher exists and is the only of the two free
  10.194 +            //First, in free-list, replace higher elem with the one to free
  10.195 +         elemToFree->nextChunkInFreeList=nextHigherElem->nextChunkInFreeList;
  10.196 +         elemToFree->prevChunkInFreeList=nextHigherElem->prevChunkInFreeList;
  10.197 +         elemToFree->prevChunkInFreeList->nextChunkInFreeList = elemToFree;
  10.198 +         if( elemToFree->nextChunkInFreeList != NULL ) // end-of-list?
  10.199 +            elemToFree->nextChunkInFreeList->prevChunkInFreeList =elemToFree;
  10.200 +            //Now chg mem-list. By side-effect, changes size of elemToFree
  10.201 +         elemToFree->nextHigherInMem = nextHigherElem->nextHigherInMem;
  10.202 +         if( elemToFree->nextHigherInMem !=
  10.203 +             _VMSMasterEnv->freeListHead->nextHigherInMem )
  10.204 +            elemToFree->nextHigherInMem->nextLowerInMem = elemToFree;
  10.205 +       }
  10.206 +      else
  10.207 +       {    //neither lower nor higher is availabe to coalesce so add to list
  10.208 +            // this makes prev chunk ptr non-null, which indicates it's free
  10.209 +         elemToFree->nextChunkInFreeList =
  10.210 +                            _VMSMasterEnv->freeListHead->nextChunkInFreeList;
  10.211 +         _VMSMasterEnv->freeListHead->nextChunkInFreeList = elemToFree;
  10.212 +         if( elemToFree->nextChunkInFreeList != NULL ) // end-of-list?
  10.213 +            elemToFree->nextChunkInFreeList->prevChunkInFreeList =elemToFree;
  10.214 +         elemToFree->prevChunkInFreeList = _VMSMasterEnv->freeListHead;
  10.215 +       }
  10.216 +    }
  10.217 +
  10.218 + }
  10.219 +
  10.220 +
  10.221 +/*Allocates memory from the external system -- higher overhead
  10.222 + *NOTE: body currently just calls malloc; the scheme below is not built yet
  10.223 + *Because of Linux's malloc throwing bizarre random faults when malloc is
  10.224 + * used inside a VMS virtual processor, have to pass this as a request and
  10.225 + * have the core loop do it when it gets around to it -- will look for these
  10.226 + * chores leftover from the previous animation of masterVP the next time it
  10.227 + * goes to animate the masterVP -- so it takes two separate masterVP
  10.228 + * animations, separated by work, to complete an external malloc or
  10.229 + * external free request.
  10.230 + *
  10.231 + *Thinking core loop accepts signals -- just looks if signal-location is
  10.232 + * empty or not --
  10.233 + */
  10.234 +void *
  10.235 +VMS__malloc_in_ext( int32 sizeRequested )
  10.236 + {
  10.237 + /*
  10.238 +      //This is running in the master, so no chance for multiple cores to be
  10.239 +      // competing for the core's flag.
  10.240 +   if(  *(_VMSMasterEnv->coreLoopSignalAddr[ 0 ]) != 0 )
  10.241 +    {    //something has already signalled to core loop, so save the signal
  10.242 +         // and look, next time master animated, to see if can send it.
  10.243 +         //Note, the addr to put a signal is in the coreloop's frame, so just
  10.244 +         // checks it each time through -- make it volatile to avoid GCC
  10.245 +         // optimizations -- it's a coreloop local var that only changes
  10.246 +         // after jumping away.  The signal includes the addr to send the
  10.247 +         //return to -- even if just empty return completion-signal
  10.248 +         //
  10.249 +         //save the signal in some queue that the master looks at each time
  10.250 +         // it starts up -- one loc says if empty for fast common case --
  10.251 +         //something like that -- want to hide this inside this call -- but
  10.252 +         // think this has to come as a request -- req handler gives procr
  10.253 +         // back to master loop, which gives it back to req handler at point
  10.254 +         // it sees that core loop has sent return signal.  Something like
  10.255 +         // that.
  10.256 +      saveTheSignal
  10.257 +
  10.258 +    }
  10.259 +  coreSigData->type = malloc;
  10.260 +  coreSigData->sizeToMalloc = sizeRequested;
  10.261 +  coreSigData->locToSignalCompletion = &figureOut;
  10.262 +   _VMSMasterEnv->coreLoopSignals[ 0 ] = coreSigData;
  10.263 +  */
  10.264 +      //just risk system-stack faults until get this figured out
  10.265 +   return malloc( sizeRequested );
  10.266 + }
  10.267 +
  10.268 +
  10.269 +/*Frees memory that was allocated in the external system -- higher overhead
  10.270 + *
  10.271 + *As noted in external malloc comment, this is clunky 'cause the free has
  10.272 + * to be called in the core loop.
  10.273 + */
  10.274 +void
  10.275 +VMS__free_in_ext( void *ptrToFree )
  10.276 + {
  10.277 +      //just risk system-stack faults until get this figured out
  10.278 +   free( ptrToFree );
  10.279 +
  10.280 +      //TODO: fix this -- so the free is done in the core loop, as noted above
  10.281 + }
  10.282 +
  10.283 +
  10.284 +/*Designed to be called from the main thread outside of VMS, during init
  10.285 + */
  10.286 +MallocProlog *
  10.287 +VMS_ext__create_free_list()
  10.288 + { MallocProlog *freeListHead, *firstChunk;
  10.289 +
  10.290 +      //Note, this is running in the main thread -- all increases in malloc
  10.291 +      // mem and all frees of it must be done in this thread, with the
  10.292 +      // thread's original stack available
  10.293 +   freeListHead = malloc( sizeof(MallocProlog) );
  10.294 +   firstChunk   = malloc( MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE );
  10.295 +   if( !freeListHead || !firstChunk ) {printf("malloc error\n"); exit(1);}
  10.296 +
  10.297 +   freeListHead->prevChunkInFreeList = NULL;
  10.298 +      //Use this addr to free the heap when cleanup
  10.299 +   freeListHead->nextLowerInMem      = firstChunk;
  10.300 +      //to identify top-of-heap elem, compare this addr to elem's next higher
  10.301 +   freeListHead->nextHigherInMem     = (void*)( (char*)firstChunk +
  10.302 +                                         MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE);
  10.303 +   freeListHead->nextChunkInFreeList = firstChunk;
  10.304 +
  10.305 +   firstChunk->nextChunkInFreeList   = NULL;
  10.306 +   firstChunk->prevChunkInFreeList   = freeListHead;
  10.307 +      //next Higher has to be set to top of chunk, so can calc size in malloc
  10.308 +   firstChunk->nextHigherInMem       = (void*)( (char*)firstChunk +
  10.309 +                                         MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE);
  10.310 +   firstChunk->nextLowerInMem        = NULL; //identifies as bott of heap
  10.311 +   
  10.312 +   _VMSMasterEnv->amtOfOutstandingMem = 0; //none allocated yet
  10.313 +
  10.314 +   return freeListHead;
  10.315 + }
  10.316 +
  10.317 +
  10.318 +/*Designed to be called from the main thread outside of VMS, during cleanup
  10.319 + */
  10.320 +void
  10.321 +VMS_ext__free_free_list( MallocProlog *freeListHead )
  10.322 + {    
  10.323 +      //stashed a ptr to the one and only big chunk malloc'd from OS in the
  10.324 +      // free list head's next lower in mem pointer
  10.325 +   free( freeListHead->nextLowerInMem );
  10.326 +
  10.327 +   //don't free the head -- it'll be in an array eventually -- free whole
  10.328 +   // array when all the free lists linked from it have already been freed
  10.329 + }
  10.330 +
    11.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.2 +++ b/vmalloc.h	Thu Nov 11 06:19:51 2010 -0800
    11.3 @@ -0,0 +1,52 @@
    11.4 +/*
    11.5 + *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
    11.6 + *  Licensed under GNU General Public License version 2
    11.7 + *
    11.8 + * Author: seanhalle@yahoo.com
    11.9 + *
   11.10 + * Created on November 14, 2009, 9:07 PM
   11.11 + */
   11.12 +
   11.13 +#include <malloc.h>
   11.14 +#include "VMS_primitive_data_types.h"
   11.15 +
   11.16 +typedef struct _MallocProlog MallocProlog;
   11.17 +
   11.18 +struct _MallocProlog
   11.19 + {
   11.20 +   MallocProlog *nextChunkInFreeList; //NULL at the end of the free list
   11.21 +   MallocProlog *prevChunkInFreeList; //set to NULL while chunk is allocated
   11.22 +   MallocProlog *nextHigherInMem;     //this minus chunk's addr = chunk size
   11.23 +   MallocProlog *nextLowerInMem;      //NULL for the bottom-of-heap chunk
   11.24 + };
   11.25 +//MallocProlog
   11.26 +
   11.27 +typedef struct
   11.28 + {
   11.29 +   MallocProlog *firstChunkInFreeList;
   11.30 +   int32         numInList;
   11.31 + }
   11.32 +FreeListHead;
   11.33 +
   11.34 +void *
   11.35 +VMS__malloc( int32 sizeRequested );
   11.36 +
   11.37 +void
   11.38 +VMS__free( void *ptrToFree );
   11.39 +
   11.40 +/*Allocates memory from the external system -- higher overhead
   11.41 + */
   11.42 +void *
   11.43 +VMS__malloc_in_ext( int32 sizeRequested );
   11.44 +
   11.45 +/*Frees memory that was allocated in the external system -- higher overhead
   11.46 + */
   11.47 +void
   11.48 +VMS__free_in_ext( void *ptrToFree );
   11.49 +
   11.50 +
   11.51 +MallocProlog *
   11.52 +VMS_ext__create_free_list();
   11.53 +
   11.54 +void
   11.55 +VMS_ext__free_free_list( MallocProlog *freeListHead );