changeset 208:eaf7e4c58c9e Common_Ancestor

Create common_ancestor brch -- all branches will be closed, then new ones created with this as the common ancestor of all branches -- it is incomplete! only code that is common to all HW and Feat and FeatDev branches is in here
author Some Random Person <seanhalle@yahoo.com>
date Wed, 22 Feb 2012 11:39:12 -0800
parents bc4cb994f114
children 0c83ea8adefc
files .hgignore .hgtags CoreLoop.c MasterLoop.c VMS.h VMS__HW_dependent.c VMS__HW_dependent.h VMS__HW_dependent.s VMS__PI.c VMS__WL.c VMS__int.c VMS__startup_and_shutdown.c VMS_defs__HW_specific.h VMS_defs__lang_specific.h VMS_defs__main.h VMS_primitive_data_types.h __brch__Common_ancestor __brch__DEPRECATED_README probes.c probes.h vmalloc.c vmalloc.h vutilities.c vutilities.h
diffstat 24 files changed, 3707 insertions(+), 29 deletions(-) [+]
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/.hgignore	Wed Feb 22 11:39:12 2012 -0800
     1.3 @@ -0,0 +1,3 @@
     1.4 +syntax: glob
     1.5 +
     1.6 +*.o
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/.hgtags	Wed Feb 22 11:39:12 2012 -0800
     2.3 @@ -0,0 +1,1 @@
     2.4 +9c3107044f86c36fea3a8f72f64910b1363555be Dec27_2010_about_to_add_sched_record
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/CoreLoop.c	Wed Feb 22 11:39:12 2012 -0800
     3.3 @@ -0,0 +1,214 @@
     3.4 +/*
     3.5 + * Copyright 2010  OpenSourceStewardshipFoundation
     3.6 + *
     3.7 + * Licensed under BSD
     3.8 + */
     3.9 +
    3.10 +
    3.11 +#include "VMS.h"
    3.12 +#include "ProcrContext.h"
    3.13 +
    3.14 +#include <stdlib.h>
    3.15 +#include <stdio.h>
    3.16 +#include <time.h>
    3.17 +
    3.18 +#include <pthread.h>
    3.19 +#include <sched.h>
    3.20 +
    3.21 +void *terminateCoreLoop(SlaveVP *currPr);
    3.22 +
    3.23 +/*This is the loop that runs in the OS Thread pinned to each core.
    3.24 + *paramsIn is a ThdParams*; its coreNum selects both the core to pin to
    3.25 + * and the ready queue to read.  The function never returns normally.
    3.26 + *Each pass gets a virt procr from the queue and calls switchToVP on it,
    3.27 + * making the virt procr the new animator.  If the queue is empty, the
    3.28 + * loop takes the masterLock and animates this core's MasterVP instead.
    3.29 + *At some point, the virt procr will suspend itself by saving out its
    3.30 + * animator state (stack ptr, frame ptr, program counter) and switching
    3.31 + * back to the OS Thread's animator state, which resumes the core loop.
    3.32 + *This cycle then repeats, until a special shutdown virtual processor is
    3.33 + * animated; that VP jumps to terminateCoreLoop, which ends the thread.
    3.34 + */
    3.35 +void *
    3.36 +coreLoop( void *paramsIn )
    3.37 + { 
    3.38 +   ThdParams      *coreLoopThdParams;
    3.39 +   int             thisCoresIdx;
    3.40 +   SlaveVP        *currPr;
    3.41 +   VMSQueueStruc  *readyToAnimateQ;
    3.42 +   cpu_set_t       coreMask;  //has 1 in bit positions of allowed cores
    3.43 +   int             errorCode;
    3.44 +
    3.45 +      //work-stealing struc on stack to prevent false-sharing in cache-line
    3.46 +   volatile GateStruc gate;
    3.47 +   //fields: preGateProgress, waitProgress, exitProgress, gateClosed
    3.48 +
    3.49 +
    3.50 +   coreLoopThdParams = (ThdParams *)paramsIn;
    3.51 +   thisCoresIdx = coreLoopThdParams->coreNum;
    3.52 +
    3.53 +   gate.gateClosed      = FALSE;
    3.54 +   gate.preGateProgress = 0;
    3.55 +   gate.waitProgress    = 0;
    3.56 +   gate.exitProgress    = 0;
    3.57 +   _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = (GateStruc*)&gate;//race @startup
    3.58 +
    3.59 +      //wait until signalled that setup is complete
    3.60 +   pthread_mutex_lock(   &suspendLock );
    3.61 +   while( !(_VMSMasterEnv->setupComplete) )
    3.62 +    {
    3.63 +      pthread_cond_wait( &suspend_cond,
    3.64 +                         &suspendLock );
    3.65 +    }
    3.66 +   pthread_mutex_unlock( &suspendLock );
    3.67 +
    3.68 +      //printf( "\nCore unsuspended: %d\n", coreLoopThdParams->coreNum );
    3.69 +
    3.70 +      //set thread affinity
    3.71 +      //Linux requires pinning thd to core inside thread-function
    3.72 +      //Designate a core by a 1 in bit-position corresponding to the core
    3.73 +   CPU_ZERO(&coreMask);
    3.74 +   CPU_SET(coreLoopThdParams->coreNum,&coreMask);
    3.75 +   //coreMask = 1L << coreLoopThdParams->coreNum;
    3.76 +
    3.77 +   pthread_t selfThd = pthread_self();
    3.78 +   errorCode =
    3.79 +   pthread_setaffinity_np( selfThd, sizeof(coreMask), &coreMask);
    3.80 +      //pinning failed: report it and end the process with a failure status
    3.81 +   if(errorCode){ printf("\nset affinity failure\n"); exit(1); }
    3.82 +
    3.83 +   
    3.84 +   //Save the return address in the SwitchVP function
    3.85 +   saveCoreLoopReturnAddr((void**)&(_VMSMasterEnv->coreLoopReturnPt));
    3.86 +
    3.87 +   
    3.88 +   while(1){
    3.89 +   
    3.90 +      //Get virtual processor from queue
    3.91 +      //The Q must be a global, static volatile var, so not kept in reg,
    3.92 +      // which forces reloading the pointer after each jmp to this point
    3.93 +   readyToAnimateQ  = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx];
    3.94 +
    3.95 +   #ifdef USE_WORK_STEALING
    3.96 +      //Alg for work-stealing designed to make common case fast.  Comment
    3.97 +      // in stealer code explains.
    3.98 +   gate.preGateProgress++;
    3.99 +   if( gate.gateClosed )
   3.100 +    {    //now, set coreloop's progress, so stealer can see that core loop
   3.101 +         // has made it into the waiting area.
   3.102 +      gate.waitProgress = gate.preGateProgress;
   3.103 +      while( gate.gateClosed ) /*busy wait*/;
   3.104 +    }
   3.105 +
   3.106 +   currPr = (SlaveVP *) readVMSQ( readyToAnimateQ );
   3.107 +
   3.108 +      //Set the coreloop's progress, so stealer can see it has made it out
   3.109 +      // of the protected area
   3.110 +   gate.exitProgress = gate.preGateProgress;
   3.111 +   #else
   3.112 +   currPr = (SlaveVP *) readVMSQ( readyToAnimateQ );
   3.113 +   #endif
   3.114 +
   3.115 +   if( currPr != NULL ) _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0;
   3.116 +   else
   3.117 +    {
   3.118 +      //============================= MEASUREMENT STUFF =====================
   3.119 +      #ifdef MEAS__TIME_MASTER_LOCK
   3.120 +      int32 startStamp, endStamp;
   3.121 +      saveLowTimeStampCountInto( startStamp );
   3.122 +      #endif
   3.123 +      //=====================================================================
   3.124 +      int tries = 0; int gotLock = 0;
   3.125 +      while( currPr == NULL ) //if queue was empty, enter get masterLock loop
   3.126 +       {    //queue was empty, so get master lock
   3.127 +
   3.128 +         gotLock = __sync_bool_compare_and_swap(&(_VMSMasterEnv->masterLock),
   3.129 +                                                          UNLOCKED, LOCKED );
   3.130 +         if( gotLock )
   3.131 +          {    //run own MasterVP -- jmps to coreLoops startPt when done
   3.132 +            currPr = _VMSMasterEnv->masterVPs[thisCoresIdx];
   3.133 +            if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 )
   3.134 +             {       DEBUG( dbgB2BMaster,"Many back to back MasterVPs\n");
   3.135 +               sched_yield();  //standard replacement for deprecated pthread_yield
   3.136 +             }
   3.137 +            _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1;
   3.138 +            break;  //end while -- have a VP to animate now
   3.139 +          }
   3.140 +
   3.141 +         tries++;      //if too many, means master on other core taking too long
   3.142 +         if( tries > MASTERLOCK_RETRIES ) { tries = 0; sched_yield(); }
   3.143 +       }
   3.144 +      //============================= MEASUREMENT STUFF =====================
   3.145 +      #ifdef MEAS__TIME_MASTER_LOCK
   3.146 +      saveLowTimeStampCountInto( endStamp );
   3.147 +      addIntervalToHist( startStamp, endStamp,
   3.148 +                         _VMSMasterEnv->masterLockLowTimeHist );
   3.149 +      addIntervalToHist( startStamp, endStamp,
   3.150 +                         _VMSMasterEnv->masterLockHighTimeHist );
   3.151 +      #endif
   3.152 +      //=====================================================================
   3.153 +
   3.154 +    }
   3.155 +
   3.156 +   
   3.157 +   switchToVP(currPr); //The VPs return in here
   3.158 +   flushRegisters();
   3.159 +   }//CoreLoop      
   3.160 + }
   3.161 +
   3.162 +
   3.163 +void *   //declared void*, but no value is returned: pthread_exit ends the thread
   3.164 +terminateCoreLoop(SlaveVP *currPr){
   3.165 +   //First free the shutdown VP that jumped here.  That VP restores the
   3.166 +   // coreloop's stack before jumping, so currPr in this frame is still valid
   3.167 +   VMS_int__dissipate_procr( currPr );
   3.168 +   pthread_exit( NULL );   //ends the calling OS thread with NULL as exit value
   3.169 +}
   3.170 +
   3.171 +
   3.172 +
   3.173 +#ifdef SEQUENTIAL
   3.174 +
   3.175 +//===========================================================================
   3.176 +/*Sequential version of coreLoop: skips the pin-threads part, the wait
   3.177 + * until setup complete, and the masterLock; always uses core index 0 and
   3.178 + * exits the process after more than 1000 back-to-back MasterVPs. */
   3.179 +void *
   3.180 +coreLoop_Seq( void *paramsIn )
   3.181 + {
   3.182 +   SlaveVP      *currPr;
   3.183 +   VMSQueueStruc *readyToAnimateQ;
   3.184 +   
   3.185 +   ThdParams      *coreLoopThdParams;
   3.186 +   int             thisCoresIdx;
   3.187 +   
   3.188 +   coreLoopThdParams = (ThdParams *)paramsIn; //kept for parity; not read below
   3.189 +//   thisCoresIdx = coreLoopThdParams->coreNum;
   3.190 +   thisCoresIdx = 0;
   3.191 +
   3.192 +   //Save the return address in the SwitchVP function (same cast as coreLoop)
   3.193 +   saveCoreLoopReturnAddr((void**)&(_VMSMasterEnv->coreLoopReturnPt));
   3.194 +
   3.195 +   
   3.196 +   while(1){
   3.197 +      //Get virtual processor from queue
   3.198 +      //_VMSWorkQ must be a global, static volatile var, so not kept in reg,
   3.199 +      // which forces reloading the pointer after each jmp to this point
   3.200 +   readyToAnimateQ  = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx];
   3.201 +   currPr = (SlaveVP *) readVMSQ( readyToAnimateQ );
   3.202 +   if( currPr == NULL )   //queue empty: animate the MasterVP instead
   3.203 +    { if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 )
   3.204 +       { printf("too many back to back MasterVP\n"); exit(1); }
   3.205 +      _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1;
   3.206 +      
   3.207 +      currPr = _VMSMasterEnv->masterVPs[thisCoresIdx];
   3.208 +    }
   3.209 +   else   //got a slave, so the back-to-back MasterVP count starts over
   3.210 +      _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0;
   3.211 +
   3.212 +
   3.213 +   switchToVP( currPr );
   3.214 +   flushRegisters();
   3.215 +   }
   3.216 + }
   3.217 +#endif
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/MasterLoop.c	Wed Feb 22 11:39:12 2012 -0800
     4.3 @@ -0,0 +1,373 @@
     4.4 +/*
     4.5 + * Copyright 2010  OpenSourceStewardshipFoundation
     4.6 + * 
     4.7 + * Licensed under BSD
     4.8 + */
     4.9 +
    4.10 +
    4.11 +
    4.12 +#include <stdio.h>
    4.13 +#include <stddef.h>
    4.14 +
    4.15 +#include "VMS.h"
    4.16 +#include "ProcrContext.h"
    4.17 +
    4.18 +
    4.19 +//===========================================================================
    4.20 +//Forward declarations of the work-stealers defined below masterLoop
    4.21 +void inline stealWorkInto( SchedSlot *currSlot,
    4.22 +               VMSQueueStruc *readyToAnimateQ, SlaveVP *masterPr );
    4.23 +void inline gateProtected_stealWorkInto( SchedSlot *currSlot,
    4.24 +               VMSQueueStruc *myReadyToAnimateQ, SlaveVP *masterPr );
    4.25 +//===========================================================================
    4.26 +
    4.27 +
    4.28 +/*This code is animated by the virtual Master processor.
    4.29 + *
    4.30 + *Polls each sched slot exactly once, hands any requests made by a newly
    4.31 + * done slave to the "request handler" plug-in function
    4.32 + *
    4.33 + *Any slots that need a virt procr assigned are given to the "schedule"
    4.34 + * plug-in function, which tries to assign a virt procr (slave) to it.
    4.35 + *
    4.36 + *When all slots needing a processor have been given to the schedule plug-in,
    4.37 + * a fraction of the procrs successfully scheduled are put into the
    4.38 + * work queue, then a continuation of this function is put in, then the rest
    4.39 + * of the virt procrs that were successfully scheduled.
    4.40 + *
    4.41 + *The first thing the continuation does is busy-wait until the previous
    4.42 + * animation completes.  This is because an (unlikely) continuation may
    4.43 + * sneak through queue before previous continuation is done putting second
    4.44 + * part of scheduled slaves in, which is the only race condition.
    4.45 + *
    4.46 + */
    4.47 +
    4.48 +/*May 29, 2010 -- birth a Master during init so that first core loop to
    4.49 + * start running gets it and does all the stuff for a newly born --
    4.50 + * from then on, will be doing continuation, but do suspension self
    4.51 + * directly at end of master loop
    4.52 + *So VMS__init just births the master virtual processor same way it births
    4.53 + * all the others -- then does any extra setup needed and puts it into the
    4.54 + * work queue.
    4.55 + *However means have to make masterEnv a global static volatile the same way
    4.56 + * did with readyToAnimateQ in core loop.  -- for performance, put the
    4.57 + * jump to the core loop directly in here, and have it directly jump back.
    4.58 + *
    4.59 + *
    4.60 + *Aug 18, 2010 -- Going to a separate MasterVP for each core, to see if this
    4.61 + * avoids the suspected bug in the system stack that causes bizarre faults
    4.62 + * at random places in the system code.
    4.63 + *
    4.64 + *So, this function is coupled to each of the MasterVPs, -- meaning this
    4.65 + * function can't rely on a particular stack and frame -- each MasterVP that
    4.66 + * animates this function has a different one.
    4.67 + *
    4.68 + *At this point, the masterLoop does not write itself into the queue anymore,
    4.69 + * instead, the coreLoop acquires the masterLock when it has nothing to
    4.70 + * animate, and then animates its own masterLoop.  However, still try to put
    4.71 + * several AppVPs into the queue to amortize the startup cost of switching
    4.72 + * to the MasterVP.  Note, don't have to worry about latency of requests much
    4.73 + * because most requests generate work for same core -- only latency issue
    4.74 + * is case when other cores starved and one core's requests generate work
    4.75 + * for them -- so keep max in queue to 3 or 4..
    4.76 + */
    4.77 +void masterLoop( void *initData, SlaveVP *animatingPr ) //initData is not read in this body
    4.78 + { 
    4.79 +   int32           slotIdx, numSlotsFilled;
    4.80 +   SlaveVP      *schedVirtPr;
    4.81 +   SchedSlot      *currSlot, **schedSlots;
    4.82 +   MasterEnv      *masterEnv;
    4.83 +   VMSQueueStruc  *readyToAnimateQ;
    4.84 +   
    4.85 +   Sched_Assigner  slaveScheduler;
    4.86 +   RequestHandler  requestHandler;
    4.87 +   void           *semanticEnv;
    4.88 +
    4.89 +   int32           thisCoresIdx;
    4.90 +   SlaveVP      *masterPr;
    4.91 +   volatile        SlaveVP *volatileMasterPr; //NOTE(review): qualifies the pointee, not the pointer -- confirm intent
    4.92 +   
    4.93 +   volatileMasterPr = animatingPr;
    4.94 +   masterPr         = (SlaveVP*)volatileMasterPr; //used to force re-define after jmp
    4.95 +
    4.96 +      //First animation of each MasterVP will in turn animate this part
    4.97 +      // of setup code.. (VP creator sets up the stack as if this function
    4.98 +      // was called normally, but actually get here by jmp)
    4.99 +      //So, setup values about stack ptr, jmp pt and all that
   4.100 +   //masterPr->resumeInstrPtr = &&masterLoopStartPt;
   4.101 +
   4.102 +
   4.103 +      //Note, got rid of writing the stack and frame ptr up here, because
   4.104 +      // only one
   4.105 +      // core can ever animate a given MasterVP, so don't need to communicate
   4.106 +      // new frame and stack ptr to the MasterVP storage before a second
   4.107 +      // version of that MasterVP can get animated on a different core.
   4.108 +      //Also got rid of the busy-wait.
   4.109 +
   4.110 +   
   4.111 +   //masterLoopStartPt:
   4.112 +   while(1){
   4.113 +      //each pass polls all sched slots once, then calls masterSwitchToCoreLoop
   4.114 +   //============================= MEASUREMENT STUFF ========================
   4.115 +   #ifdef MEAS__TIME_MASTER
   4.116 +      //Total Master time includes one coreloop time -- just assume the core
   4.117 +      // loop time is same for Master as for AppVPs, even though it may be
   4.118 +      // smaller due to higher predictability of the fixed jmp.
   4.119 +   saveLowTimeStampCountInto( masterPr->startMasterTSCLow );
   4.120 +   #endif
   4.121 +   //========================================================================
   4.122 +
   4.123 +   masterEnv        = (MasterEnv*)_VMSMasterEnv;
   4.124 +   
   4.125 +      //GCC may optimize so doesn't always re-define from frame-storage
   4.126 +   masterPr         = (SlaveVP*)volatileMasterPr;  //just to make sure after jmp
   4.127 +   thisCoresIdx     = masterPr->coreAnimatedBy;
   4.128 +   readyToAnimateQ  = masterEnv->readyToAnimateQs[thisCoresIdx];
   4.129 +   schedSlots       = masterEnv->allSchedSlots[thisCoresIdx];
   4.130 +
   4.131 +   requestHandler   = masterEnv->requestHandler;
   4.132 +   slaveScheduler   = masterEnv->slaveSchedAssigner;
   4.133 +   semanticEnv      = masterEnv->semanticEnv;
   4.134 +
   4.135 +
   4.136 +      //Poll each slot's Done flag
   4.137 +   numSlotsFilled = 0;
   4.138 +   for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++)
   4.139 +    {
   4.140 +      currSlot = schedSlots[ slotIdx ];
   4.141 +
   4.142 +      if( currSlot->workIsDone )
   4.143 +       {
   4.144 +         currSlot->workIsDone         = FALSE;  //consume the Done flag
   4.145 +         currSlot->needsProcrAssigned = TRUE;   //slot may now be refilled
   4.146 +
   4.147 +            //process requests from slave to master
   4.148 +               //====================== MEASUREMENT STUFF ===================
   4.149 +               #ifdef MEAS__TIME_PLUGIN
   4.150 +               int32 startStamp1, endStamp1;
   4.151 +               saveLowTimeStampCountInto( startStamp1 );
   4.152 +               #endif
   4.153 +               //============================================================
   4.154 +         (*requestHandler)( currSlot->procrAssignedToSlot, semanticEnv );
   4.155 +               //====================== MEASUREMENT STUFF ===================
   4.156 +               #ifdef MEAS__TIME_PLUGIN
   4.157 +               saveLowTimeStampCountInto( endStamp1 );
   4.158 +               addIntervalToHist( startStamp1, endStamp1,
   4.159 +                                        _VMSMasterEnv->reqHdlrLowTimeHist );
   4.160 +               addIntervalToHist( startStamp1, endStamp1,
   4.161 +                                        _VMSMasterEnv->reqHdlrHighTimeHist );
   4.162 +               #endif
   4.163 +               //============================================================
   4.164 +       }
   4.165 +      if( currSlot->needsProcrAssigned )
   4.166 +       {    //give slot a new virt procr
   4.167 +         schedVirtPr =
   4.168 +          (*slaveScheduler)( semanticEnv, thisCoresIdx );
   4.169 +            //a NULL result leaves the slot marked as still needing a procr
   4.170 +         if( schedVirtPr != NULL )
   4.171 +          { currSlot->procrAssignedToSlot = schedVirtPr;
   4.172 +            schedVirtPr->schedSlot        = currSlot;
   4.173 +            currSlot->needsProcrAssigned  = FALSE;
   4.174 +            numSlotsFilled               += 1;
   4.175 +               //put the slave in the queue this core's coreLoop reads from
   4.176 +            writeVMSQ( schedVirtPr, readyToAnimateQ );
   4.177 +          }
   4.178 +       }
   4.179 +    }
   4.180 +
   4.181 +      //NOTE(review): currSlot below is the last slot polled; confirm it is free when numSlotsFilled == 0
   4.182 +   #ifdef USE_WORK_STEALING
   4.183 +      //If no slots filled, means no more work, look for work to steal.
   4.184 +   if( numSlotsFilled == 0 )
   4.185 +    { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterPr );
   4.186 +    }
   4.187 +   #endif
   4.188 +
   4.189 +   
   4.190 +   #ifdef MEAS__TIME_MASTER
   4.191 +   saveLowTimeStampCountInto( masterPr->endMasterTSCLow );
   4.192 +   #endif
   4.193 +
   4.194 +   masterSwitchToCoreLoop(animatingPr);
   4.195 +   flushRegisters();
   4.196 +   }//MasterLoop
   4.197 +
   4.198 +
   4.199 + }
   4.200 +
   4.201 +
   4.202 +
   4.203 +/*This has a race condition -- a coreloop may be reading its own queue at
   4.204 + * the same time that this work-stealer, on a different core, reads from it.
   4.205 + */
   4.206 +void inline
   4.207 +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
   4.208 +               SlaveVP *masterPr )
   4.209 + { 
   4.210 +   SlaveVP   *stolenPr;
   4.211 +   int32        coreIdx, i;
   4.212 +   VMSQueueStruc *currQ;
   4.213 +
   4.214 +   stolenPr = NULL;
   4.215 +   coreIdx = masterPr->coreAnimatedBy;  //start from the stealer's own core
   4.216 +   for( i = 0; i < NUM_CORES -1; i++ )  //at most one visit to each other core
   4.217 +    {
   4.218 +      if( coreIdx >= NUM_CORES -1 )     //step to the next core, wrapping to 0
   4.219 +       { coreIdx = 0;
   4.220 +       }
   4.221 +      else
   4.222 +       { coreIdx++;
   4.223 +       }
   4.224 +      currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
   4.225 +      if( numInVMSQ( currQ ) > 0 )      //take from the first non-empty queue
   4.226 +       { stolenPr = readVMSQ (currQ );
   4.227 +         break;
   4.228 +       }
   4.229 +    }
   4.230 +
   4.231 +   if( stolenPr != NULL )   //bind the stolen VP to currSlot, then queue it locally
   4.232 +    { currSlot->procrAssignedToSlot = stolenPr;
   4.233 +      stolenPr->schedSlot           = currSlot;
   4.234 +      currSlot->needsProcrAssigned  = FALSE;
   4.235 +
   4.236 +      writeVMSQ( stolenPr, readyToAnimateQ );
   4.237 +    }
   4.238 + }
   4.239 +
   4.240 +/*This algorithm makes the common case fast.  Make the coreloop passive,
   4.241 + * and show its progress.  Make the stealer control a gate that coreloop
   4.242 + * has to pass.
   4.243 + *To avoid interference, only one stealer at a time.  Use a global
   4.244 + * stealer-lock.
   4.245 + *
   4.246 + *The pattern is based on a gate -- stealer shuts the gate, then monitors
   4.247 + * to be sure any already past make it all the way out, before starting.
   4.248 + *So, have a "progress" measure just before the gate, then have two after it,
   4.249 + * one is in a "waiting room" outside the gate, the other is at the exit.
   4.250 + *Then, the stealer first shuts the gate, then checks the progress measure
   4.251 + * outside it, then looks to see if the progress measure at the exit is the
   4.252 + * same.  If yes, it knows the protected area is empty 'cause no other way
   4.253 + * to get in and the last to get in also exited.
   4.254 + *If the progress measure at the exit is not the same, then the stealer goes
   4.255 + * into a loop checking both the waiting-area and the exit progress-measures
   4.256 + * until one of them shows the same as the measure outside the gate.  Might
   4.257 + * as well re-read the measure outside the gate each go around, just to be
   4.258 + * sure.  It is guaranteed that one of the two will eventually match the one
   4.259 + * outside the gate.
   4.260 + *
   4.261 + *Here's an informal proof of correctness:
   4.262 + *The gate can be closed at any point, and have only four cases:
   4.263 + *  1) coreloop made it past the gate-closing but not yet past the exit
   4.264 + *  2) coreloop made it past the pre-gate progress update but not yet past
   4.265 + *     the gate,
   4.266 + *  3) coreloop is right before the pre-gate update
   4.267 + *  4) coreloop is past the exit and far from the pre-gate update.
   4.268 + *
   4.269 + * Covering the cases in reverse order,
   4.270 + *  4) is not a problem -- stealer will read pre-gate progress, see that it
   4.271 + *     matches exit progress, and the gate is closed, so stealer can proceed.
   4.272 + *  3) stealer will read pre-gate progress just after coreloop updates it..
   4.273 + *     so stealer goes into a loop until the coreloop causes wait-progress
   4.274 + *     to match pre-gate progress, so then stealer can proceed
   4.275 + *  2) same as 3..
   4.276 + *  1) stealer reads pre-gate progress, sees that it's different than exit,
   4.277 + *     so goes into loop until exit matches pre-gate, now it knows coreloop
   4.278 + *     is not in protected and cannot get back in, so can proceed.
   4.279 + *
   4.280 + *Implementation for the stealer:
   4.281 + *
   4.282 + *First, acquire the stealer lock -- only cores with no work to do will
   4.283 + * compete to steal, so not a big performance penalty having only one --
   4.284 + * will rarely have multiple stealers in a system with plenty of work -- and
   4.285 + * in a system with little work, it doesn't matter.
   4.286 + *
   4.287 + *Note, have single-reader, single-writer pattern for all variables used to
   4.288 + * communicate between stealer and victims
   4.289 + *
   4.290 + *So, scan the queues of the core loops, until find non-empty.  Each core
   4.291 + * has its own list that it scans.  The list goes in order from closest to
   4.292 + * furthest core, so it steals first from close cores.  Later can add
   4.293 + * taking info from the app about overlapping footprints, and scan all the
   4.294 + * others then choose work with the most footprint overlap with the contents
   4.295 + * of this core's cache.
   4.296 + *
   4.297 + *Now, have a victim want to take work from.  So, shut the gate in that
   4.298 + * coreloop, by setting the "gate closed" var on its stack to TRUE.
   4.299 + *Then, read the core's pre-gate progress and compare to the core's exit
   4.300 + * progress.
   4.301 + *If same, can proceed to take work from the coreloop's queue.  When done,
   4.302 + * write FALSE to gate closed var.
   4.303 + *If different, then enter a loop that reads the pre-gate progress, then
   4.304 + * compares to exit progress then to wait progress.  When one of two
   4.305 + * matches, proceed.  Take work from the coreloop's queue.  When done,
   4.306 + * write FALSE to the gate closed var.
   4.307 + * 
   4.308 + */
   4.309 +void inline
   4.310 +gateProtected_stealWorkInto( SchedSlot *currSlot,
   4.311 +                             VMSQueueStruc *myReadyToAnimateQ,
   4.312 +                             SlaveVP *masterPr )
   4.313 + {
   4.314 +   SlaveVP     *stolenPr;
   4.315 +   int32          coreIdx, i, haveAVictim, gotLock;
   4.316 +   VMSQueueStruc *victimsQ;
   4.317 +
   4.318 +   volatile GateStruc *vicGate;
   4.319 +   int32               coreMightBeInProtected;
   4.320 +
   4.321 +
   4.322 +
   4.323 +      //see if any other cores have work available to steal
   4.324 +   haveAVictim = FALSE;
   4.325 +   coreIdx = masterPr->coreAnimatedBy;  //start from the stealer's own core
   4.326 +   for( i = 0; i < NUM_CORES -1; i++ )  //at most one visit to each other core
   4.327 +    {
   4.328 +      if( coreIdx >= NUM_CORES -1 )     //step to the next core, wrapping to 0
   4.329 +       { coreIdx = 0;
   4.330 +       }
   4.331 +      else
   4.332 +       { coreIdx++;
   4.333 +       }
   4.334 +      victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
   4.335 +      if( numInVMSQ( victimsQ ) > 0 )
   4.336 +       { haveAVictim = TRUE;
   4.337 +         vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ]; //NOTE(review): set by coreLoop at its start -- confirm never unset here
   4.338 +         break;
   4.339 +       }
   4.340 +    }
   4.341 +   if( !haveAVictim ) return;  //no work to steal, exit
   4.342 +
   4.343 +      //have a victim core, now get the stealer-lock
   4.344 +   gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock),
   4.345 +                                                          UNLOCKED, LOCKED );
   4.346 +   if( !gotLock ) return; //go back to core loop, which will re-start master
   4.347 +
   4.348 +      //NOTE(review): nothing orders the gateClosed store before the progress loads below -- confirm no fence is needed
   4.349 +   //====== Start Gate-protection =======
   4.350 +   vicGate->gateClosed = TRUE;
   4.351 +   coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress;
   4.352 +   while( coreMightBeInProtected )
   4.353 +    {    //wait until sure
   4.354 +      if( vicGate->preGateProgress == vicGate->waitProgress )  //victim is waiting at the gate
   4.355 +         coreMightBeInProtected = FALSE;
   4.356 +      if( vicGate->preGateProgress == vicGate->exitProgress )  //victim has left the protected area
   4.357 +         coreMightBeInProtected = FALSE;
   4.358 +    }
   4.359 +
   4.360 +   stolenPr = readVMSQ ( victimsQ );
   4.361 +
   4.362 +   vicGate->gateClosed = FALSE;   //lets a victim spinning in coreLoop continue
   4.363 +   //======= End Gate-protection  =======
   4.364 +
   4.365 +
   4.366 +   if( stolenPr != NULL )  //victim could have been in protected and taken
   4.367 +    { currSlot->procrAssignedToSlot = stolenPr;
   4.368 +      stolenPr->schedSlot           = currSlot;
   4.369 +      currSlot->needsProcrAssigned  = FALSE;
   4.370 +
   4.371 +      writeVMSQ( stolenPr, myReadyToAnimateQ );
   4.372 +    }
   4.373 +
   4.374 +      //unlock the work stealing lock (plain store; the lock was taken by the CAS above)
   4.375 +   _VMSMasterEnv->workStealingLock = UNLOCKED;
   4.376 + }
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/VMS.h	Wed Feb 22 11:39:12 2012 -0800
     5.3 @@ -0,0 +1,377 @@
     5.4 +/*
     5.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     5.6 + *  Licensed under GNU General Public License version 2
     5.7 + *
     5.8 + * Author: seanhalle@yahoo.com
     5.9 + * 
    5.10 + */
    5.11 +
    5.12 +#ifndef _VMS_H
    5.13 +#define	_VMS_H
    5.14 +#define _GNU_SOURCE
    5.15 +
    5.16 +#include "VMS_primitive_data_types.h"
    5.17 +#include "C_Libraries/DynArray/DynArray.h"
    5.18 +#include "C_Libraries/Hash_impl/PrivateHash.h"
    5.19 +#include "C_Libraries/Histogram/Histogram.h"
    5.20 +#include "C_Libraries/Queue_impl/PrivateQueue.h"
    5.21 +#include "vmalloc.h"
    5.22 +
    5.23 +#include <pthread.h>
    5.24 +#include <sys/time.h>
    5.25 +
    5.26 +//=================  Defines: included from separate files  =================
    5.27 +//
    5.28 +// Note: ALL defines are in other files, none are in here
    5.29 +//
    5.30 +#include "VMS_defs__main.h"
    5.31 +
    5.32 +
    5.33 +//================================ Typedefs =================================
    5.34 +//
    5.35 +typedef unsigned long long TSCount;  //presumably a time-stamp-counter value (see getTSCount)
    5.36 +typedef union   //one storage area viewed two ways
    5.37 + { uint32 lowHigh[2];   //as two words; which index is the low half depends on byte order
    5.38 +   uint64 longVal;      //as a single value (assumes uint64 is as wide as two uint32)
    5.39 + }
    5.40 +TSCountLowHigh;
    5.41 +
    5.42 +typedef struct _SchedSlot     SchedSlot;
    5.43 +typedef struct _VMSReqst      VMSReqst;
    5.44 +typedef struct _SlaveVP       SlaveVP;
    5.45 +typedef struct _MasterVP      MasterVP;
    5.46 +typedef struct _IntervalProbe IntervalProbe;
    5.47 +typedef struct _GateStruc     GateStruc;
    5.48 +
    5.49 +   //plug-in and entry-point signatures; masterLoop calls the first two
    5.50 +typedef SlaveVP * (*Sched_Assigner)  ( void *, int );   //semEnv, coreIdx; may return NULL
    5.51 +typedef void  (*RequestHandler)  ( SlaveVP *, void * ); //prWReqst, semEnv
    5.52 +typedef void  (*TopLevelFnPtr)  ( void *, SlaveVP * ); //initData, animPr
    5.53 +typedef void    TopLevelFn      ( void *, SlaveVP * ); //initData, animPr (function type, not a pointer)
    5.54 +typedef void  (*ResumeVPFnPtr)   ( SlaveVP *, void * ); //same argument types as RequestHandler
    5.55 +
    5.56 +//============================= Statistics ==================================
    5.57 +
    5.58 +inline TSCount getTSCount();   //returns a TSCount; empty list declares no parameter types
    5.59 +
    5.60 +//============= Request Related ===========
    5.61 +//
    5.62 +
    5.63 +enum VMSReqstType   //avoid starting enums at 0, for debug reasons
    5.64 + {
    5.65 +   semantic = 1,
    5.66 +   createReq,       //= 2
    5.67 +   dissipate,       //= 3
    5.68 +   VMSSemantic      //= 4; goes with VMSSemReq / VMSSemReqstType below
    5.69 + };
    5.70 +
    5.71 +struct _VMSReqst   //one request, chained through nextReqst
    5.72 + {
    5.73 +   enum VMSReqstType  reqType;//used for dissipate and in future for IO requests
    5.74 +   void              *semReqData; //untyped payload; presumably interpreted per reqType -- confirm
    5.75 +
    5.76 +   VMSReqst *nextReqst;   //link to another request of the same struct type
    5.77 + };
    5.78 +//VMSReqst
    5.79 +
    5.80 +enum VMSSemReqstType   //These are equivalent to semantic requests, but for
    5.81 + {                     // VMS's services available directly to app, like OS
    5.82 +   createProbe = 1,    // and probe services -- like a VMS-wide built-in lang
    5.83 +   openFile,           //= 2
    5.84 +   otherIO             //= 3
    5.85 + };
    5.86 +
    5.87 +typedef struct
    5.88 + { enum VMSSemReqstType reqType;
    5.89 +   SlaveVP           *requestingPr;
    5.90 +   char                *nameStr;  //for create probe
    5.91 + }
    5.92 + VMSSemReq;
    5.93 +
    5.94 +
    5.95 +//====================  Core data structures  ===================
    5.96 +
    5.97 +struct _SchedSlot
    5.98 + {
    5.99 +   int         workIsDone;
   5.100 +   int         needsProcrAssigned;
   5.101 +   SlaveVP  *procrAssignedToSlot;
   5.102 + };
   5.103 +//SchedSlot
   5.104 +
   5.105 +/*WARNING: re-arranging this data structure could cause VP switching
   5.106 + *         assembly code to fail -- hard-codes offsets of fields
   5.107 + */
   5.108 +struct _SlaveVP
   5.109 + { int         procrID;  //for debugging -- count up each time create
   5.110 +   int         coreAnimatedBy;
   5.111 +   void       *startOfStack;
   5.112 +   void       *stackPtr;         //asm uses offset 0x10
   5.113 +   void       *framePtr;         //asm uses offset 0x18
   5.114 +   void       *resumeInstrPtr;   //asm uses offset 0x20
   5.115 +   
   5.116 +   void       *coreLoopStartPt;  //allows proto-runtime to be linked later
   5.117 +   void       *coreLoopFramePtr; //asm offset 0x30, restored on jmp to core loop
   5.118 +   void       *coreLoopStackPtr; //asm offset 0x38, restored on jmp to core loop
   5.119 +
   5.120 +   SchedSlot  *schedSlot;
   5.121 +   VMSReqst   *requests;     //head of this VP's linked list of requests
   5.122 +
   5.123 +   void       *semanticData; //this lives here for the life of VP
   5.124 +   void       *dataRetFromReq;//values returned from plugin to VP go here
   5.125 +
   5.126 +      //=========== MEASUREMENT STUFF ==========
   5.127 +       #ifdef MEAS__TIME_STAMP_SUSP
   5.128 +       uint32  preSuspTSCLow;
   5.129 +       uint32  postSuspTSCLow;
   5.130 +       #endif
   5.131 +       #ifdef MEAS__TIME_MASTER /* in SlaveVP because multiple masterVPs*/
   5.132 +       uint32  startMasterTSCLow;
   5.133 +       uint32  endMasterTSCLow;
   5.134 +       #endif
   5.135 +       #ifdef MEAS__TIME_2011_SYS
   5.136 +       TSCountLowHigh  startSusp;
   5.137 +       uint64  totalSuspCycles;
   5.138 +       uint32  numGoodSusp;
   5.139 +       #endif
   5.140 +      //========================================
   5.141 +   
   5.142 +   float64      createPtInSecs;  //have space but don't use on some configs
   5.143 + };
   5.144 +//SlaveVP
   5.145 +
   5.146 +
   5.147 +/*WARNING: re-arranging this data structure could cause VP-switching
   5.148 + *         assembly code to fail -- hard-codes offsets of fields
   5.149 + *         (because -O3 messes with things otherwise).  NOTE(review): asm
   5.150 + *         uses 0x48 coreLoopReturnPt, 0x54 masterLock -- confirm vs layout */
   5.151 +typedef struct
   5.152 + {
   5.153 +   union{ //adds padding to put masterLock on its own cache-line to elim
   5.154 +          // false sharing (masterLock is most-accessed var in VMS)
   5.155 +        volatile int32   masterLock;
   5.156 +        char             padding[CACHE_LINE_SZ];    
   5.157 +   } masterLockUnion;
   5.158 +   Sched_Assigner   slaveSchedAssigner;
   5.159 +   RequestHandler   requestHandler;
   5.160 +   
   5.161 +   SchedSlot     ***allSchedSlots;
   5.162 +   VMSQueueStruc **readyToAnimateQs;
   5.163 +   SlaveVP      **masterVPs;
   5.164 +
   5.165 +   void            *semanticEnv;
   5.166 +   void            *OSEventStruc;   //for future, when add I/O to BLIS
   5.167 +   MallocArrays    *freeLists;
   5.168 +   int32            amtOfOutstandingMem; //total currently allocated
   5.169 +
   5.170 +   void            *coreLoopReturnPt;//addr to jump to to re-enter coreLoop
   5.171 +
   5.172 +   int32            setupComplete;
   5.173 +   //int32            numMasterInARow[NUM_CORES];//detect back-to-back masterVP
   5.174 +   GateStruc       *workStealingGates[ NUM_CORES ]; //concurrent work-steal
   5.175 +   int32            workStealingLock;
   5.176 +   
   5.177 +   int32            numVPsCreated; //gives ordering to processor creation
   5.178 +
   5.179 +      //=========== MEASUREMENT STUFF =============
   5.180 +       IntervalProbe   **intervalProbes;
   5.181 +       PrivDynArrayInfo *dynIntervalProbesInfo;
   5.182 +       HashTable        *probeNameHashTbl;
   5.183 +       int32             masterCreateProbeID;
   5.184 +       float64           createPtInSecs;
   5.185 +       Histogram       **measHists;
   5.186 +       PrivDynArrayInfo *measHistsInfo;
   5.187 +       #ifdef MEAS__TIME_PLUGIN
   5.188 +       Histogram       *reqHdlrLowTimeHist;
   5.189 +       Histogram       *reqHdlrHighTimeHist;
   5.190 +       #endif
   5.191 +       #ifdef MEAS__TIME_MALLOC
   5.192 +       Histogram       *mallocTimeHist;
   5.193 +       Histogram       *freeTimeHist;
   5.194 +       #endif
   5.195 +       #ifdef MEAS__TIME_MASTER_LOCK
   5.196 +       Histogram       *masterLockLowTimeHist;
   5.197 +       Histogram       *masterLockHighTimeHist;
   5.198 +       #endif
   5.199 +       #ifdef MEAS__TIME_2011_SYS
   5.200 +       TSCountLowHigh   startMaster;
   5.201 +       uint64           totalMasterCycles;
   5.202 +       uint32           numMasterAnimations;
   5.203 +       TSCountLowHigh   startReqHdlr;
   5.204 +       uint64           totalPluginCycles;
   5.205 +       uint32           numPluginAnimations;
   5.206 +       uint64           cyclesTillStartMasterLoop;
   5.207 +       TSCountLowHigh   endMasterLoop;
   5.208 +       #endif
   5.209 +      //==========================================
   5.210 + }
   5.211 +MasterEnv;
   5.212 +
   5.213 +//=========================  Extra Stuff Data Strucs  =======================
   5.214 +typedef struct
   5.215 + {
   5.216 +
   5.217 + }
   5.218 +VMSExcp;
   5.219 +
   5.220 +struct _GateStruc
   5.221 + {
   5.222 +   int32 gateClosed;
   5.223 +   int32 preGateProgress;
   5.224 +   int32 waitProgress;
   5.225 +   int32 exitProgress;
   5.226 + };
   5.227 +//GateStruc
   5.228 +
   5.229 +//=======================  OS Thread related  ===============================
   5.230 +
   5.231 +void * coreLoop( void *paramsIn );  //standard PThreads fn prototype
   5.232 +void * coreLoop_Seq( void *paramsIn );  //standard PThreads fn prototype
   5.233 +void masterLoop( void *initData, SlaveVP *masterVP );
   5.234 +
   5.235 +
   5.236 +typedef struct
   5.237 + {
   5.238 +   void           *endThdPt;
   5.239 +   unsigned int    coreNum;
   5.240 + }
   5.241 +ThdParams;
   5.242 +
   5.243 +pthread_t       coreLoopThdHandles[ NUM_CORES ];  //pthread's virt-procr state
   5.244 +ThdParams      *coreLoopThdParams [ NUM_CORES ];
   5.245 +pthread_mutex_t suspendLock;
   5.246 +pthread_cond_t  suspend_cond;
   5.247 +
   5.248 +
   5.249 +
   5.250 +//=============================  Global Vars ================================
   5.251 +
   5.252 +volatile MasterEnv      *_VMSMasterEnv __align_to_cacheline__;
   5.253 +
   5.254 +
   5.255 +
   5.256 +
   5.257 +//=========================  Function Prototypes  ===========================
   5.258 +
   5.259 +
   5.260 +//========== Setup and shutdown ==========
   5.261 +void
   5.262 +VMS_int__init();
   5.263 +
   5.264 +/*TODO: Fix seed-procr creation -- put box around language, have lang register
   5.265 +        stuff with VMS.
   5.266 +        have main program explicitly INIT Lang! -- makes more sense to
   5.267 +        C programmers -- makes it clear that there's a transition.
   5.268 +(might need to have the pthreads remain waiting for
   5.269 +        cond until work is scheduled)
   5.270 +Have main do call to tell language to perform work -- like did with DKU
   5.271 +
   5.272 +Ex: "HWSim__run_a_simulation(netlist, paramBag);"
   5.273 +        "processID = SSR__run_program(seed_fn, seedData); "
   5.274 +        "SSR__Wait_for_program_to_end(processID);"
   5.275 +        "SSR__run_program_and_wait_till_it_ends(seed_fn, seedData);"
   5.276 +        
   5.277 +        allows multiple languages to be started, and programs run in several,
   5.278 +        overlapped, or one program to be run that uses multiple langs..?
   5.279 +        So, each program is in separate directory:
   5.280 +            "HWSim_ArchDef__PingPong"  "SSR_Program__Blocked_Matrix_Mult"
   5.281 +        
   5.282 +        Those programs can talk to each other, via VMS, by handles they each
   5.283 +        return
   5.284 +        "processIDs[0] = SSR__run_program(seed_fn1, seedData1);"
   5.285 +        "processIDs[1] = SSR__run_program(seed_fn2, seedData2);"
   5.286 +        "SSR__link_programs(processIDs, 2);"
   5.287 +or even
   5.288 +        "processIDs[0] = Vthread__run_program(seed_fn1, seedData1);"
   5.289 +        "processIDs[1] = SSR__run_program(seed_fn2, seedData2);"
   5.290 +        "VMS__link_programs(processIDs, 2);"
   5.291 +        Then, the programs just know they sync with other prog, but use own
   5.292 +        lang's sync constructs -- VMS uses message system to establish tie-pt,
   5.293 +        each lang defines what a tie-point means to it..  (work with the
   5.294 +        diff semantics?) */
   5.295 +void
   5.296 +VMS_WL__start_the_work_then_wait_until_done();
   5.297 +
   5.298 +void
   5.299 +VMS_int__shutdown();
   5.300 +
   5.301 +void
   5.302 +VMS_int__cleanup_at_end_of_shutdown();
   5.303 +
   5.304 +
   5.305 +//==============    ===============
   5.306 +
   5.307 +inline SlaveVP *
   5.308 +VMS_int__create_procr( TopLevelFnPtr fnPtr, void *dataParam );
   5.309 +
   5.310 +inline void
   5.311 +VMS_int__point_slave_to_Fn( SlaveVP *slaveVP, TopLevelFnPtr fnPtr,
   5.312 +                            void    *dataParam);
   5.313 +
   5.314 +void
   5.315 +VMS_int__save_return_addr_into_ptd_to_loc(void *ptrToReturnAddrHoldingLoc);
   5.316 +
   5.317 +void
   5.318 +VMS_int__write_return_addr_from_ptd_to_loc(void *ptrToReturnAddrHoldingLoc);
   5.319 +
   5.320 +void
   5.321 +VMS_int__dissipate_procr( SlaveVP *procrToDissipate );
   5.322 +
   5.323 +   //Use this to create processor inside entry point & other places outside
   5.324 +   // the VMS system boundary (IE, not run in slave nor Master)
   5.325 +SlaveVP *
   5.326 +VMS_ext__create_procr( TopLevelFnPtr fnPtr, void *dataParam );
   5.327 +
   5.328 +void
   5.329 +VMS_ext__dissipate_procr( SlaveVP *procrToDissipate );
   5.330 +
   5.331 +void
   5.332 +VMS_PI__throw_exception( char *msgStr, SlaveVP *reqstPr, VMSExcp *excpData );
   5.333 +
   5.334 +void *
   5.335 +VMS_WL__give_sem_env_for( SlaveVP *animPr );
   5.336 +
   5.337 +//==============  Request Related  ===============
   5.338 +
   5.339 +void
   5.340 +VMS_int__suspend_procr( SlaveVP *callingPr );
   5.341 +
   5.342 +inline void
   5.343 +VMS_WL__add_sem_request_in_mallocd_VMSReqst( void *semReqData, SlaveVP *callingPr );
   5.344 +
   5.345 +inline void
   5.346 +VMS_WL__send_sem_request( void *semReqData, SlaveVP *callingPr );
   5.347 +
   5.348 +void
   5.349 +VMS_WL__send_create_procr_req( void *semReqData, SlaveVP *reqstingPr );
   5.350 +
   5.351 +void inline
   5.352 +VMS_WL__send_dissipate_req( SlaveVP *prToDissipate );
   5.353 +
   5.354 +inline void
   5.355 +VMS_WL__send_VMSSem_request( void *semReqData, SlaveVP *callingPr );
   5.356 +
   5.357 +VMSReqst *
   5.358 +VMS_PI__take_next_request_out_of( SlaveVP *procrWithReq );
   5.359 +
   5.360 +inline void *
   5.361 +VMS_PI__take_sem_reqst_from( VMSReqst *req );
   5.362 +
   5.363 +void inline
   5.364 +VMS_PI__handle_VMSSemReq( VMSReqst *req, SlaveVP *requestingPr, void *semEnv,
   5.365 +                       ResumeVPFnPtr resumePrFnPtr );
   5.366 +
   5.367 +//======================== MEASUREMENT ======================
   5.368 +uint64
   5.369 +VMS_WL__give_num_plugin_cycles();
   5.370 +uint32
   5.371 +VMS_WL__give_num_plugin_animations();
   5.372 +
   5.373 +
   5.374 +
   5.375 +#include "VMS__HW_dependent.h"
   5.376 +#include "probes.h"
   5.377 +#include "vutilities.h"
   5.378 +
   5.379 +#endif	/* _VMS_H */
   5.380 +
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/VMS__HW_dependent.c	Wed Feb 22 11:39:12 2012 -0800
     6.3 @@ -0,0 +1,47 @@
     6.4 +/*
     6.5 + * This File contains all hardware dependent C code.
     6.6 + */
     6.7 +
     6.8 +
     6.9 +#include "VMS.h"
    6.10 +
    6.11 +/*Point a slave VP at its top-level Fn: puts fnPtr, dataParam and slaveVP
    6.12 + * at the top of the slave's stack and sets resumeInstrPtr to the asm
    6.13 + * helper startUpTopLevelFn (trick for 64 bits: the helper copies the
    6.14 + * top-level fn ptr from the stack into a reg and jumps to it).
    6.15 + *No need to save registers on old stack frame, because there's no old
    6.16 + * animator state to return to.  slaveVP->startOfStack must already be set.
    6.17 + */
    6.18 +void VMS_int__point_slave_to_Fn( SlaveVP *slaveVP, TopLevelFnPtr fnPtr,
    6.19 +                            void    *dataParam)
    6.20 + { void  *stackPtr;
    6.21 +
    6.22 +// Start of Hardware dependent part
    6.23 +
    6.24 +    //Set slave's instr pointer to a helper Fn that copies params from stack
    6.25 +   slaveVP->resumeInstrPtr  = (TopLevelFnPtr)&startUpTopLevelFn;
    6.26 +
    6.27 +    //fnPtr takes two params -- void *dataParam & void *animProcr
    6.28 +    // Stack grows *down*, so start it at highest stack addr, minus room
    6.29 +    // for 2 params + return addr.  (char* avoids void* arithmetic)
    6.30 +   stackPtr =
    6.31 +     (char *)slaveVP->startOfStack + VIRT_PROCR_STACK_SIZE - 4*sizeof(void*);
    6.32 +
    6.33 +    //setup __cdecl on stack
    6.34 +    //Normally, return Addr is in loc pointed to by stackPtr, but doing a
    6.35 +    // trick for 64 bit arch, where put ptr to top-level fn there instead,
    6.36 +    // and set resumeInstrPtr to a helper-fn that copies the top-level
    6.37 +    // fn ptr and params into registers.
    6.38 +    //Then, dataParam is at stackPtr + 8 bytes, & animating SlaveVP above
    6.39 +   *((SlaveVP**)stackPtr + 2 ) = slaveVP; //rightmost param
    6.40 +   *((void**)stackPtr + 1 ) = dataParam;  //next  param to left
    6.41 +   *((void**)stackPtr) = (void*)fnPtr;    //copied to reg by helper Fn
    6.42 +
    6.43 +
    6.44 +// end of Hardware dependent part
    6.45 +
    6.46 +      //core loop switches to the stack & frame pointers stored in slave, so
    6.47 +      // they must point at the frame built above, not at the stack's base
    6.48 +   slaveVP->stackPtr = stackPtr;
    6.49 +   slaveVP->framePtr = stackPtr;
    6.50 + }
    6.51 \ No newline at end of file
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/VMS__HW_dependent.h	Wed Feb 22 11:39:12 2012 -0800
     7.3 @@ -0,0 +1,33 @@
     7.4 +/*
     7.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     7.6 + *  Licensed under GNU General Public License version 2
     7.7 + *
     7.8 + * Author: seanhalle@yahoo.com
     7.9 + * 
    7.10 + */
    7.11 +
    7.12 +#ifndef _ProcrContext_H
    7.13 +#define	_ProcrContext_H
    7.14 +#define _GNU_SOURCE
    7.15 +
    7.16 +void saveCoreLoopReturnAddr(void **returnAddress);
    7.17 +
    7.18 +void switchToVP(SlaveVP *nextProcr);
    7.19 +
    7.20 +void switchToCoreLoop(SlaveVP *nextProcr);
    7.21 +
    7.22 +void masterSwitchToCoreLoop(SlaveVP *nextProcr);
    7.23 +
    7.24 +void startUpTopLevelFn();
    7.25 +
    7.26 +void *asmTerminateCoreLoop(SlaveVP *currPr);
    7.27 +
    7.28 +#define flushRegisters() \
    7.29 +        asm volatile ("":::"%rbx", "%r12", "%r13","%r14","%r15")
    7.30 +
    7.31 +inline SlaveVP *
    7.32 +create_procr_helper( SlaveVP *newPr,       TopLevelFnPtr  fnPtr,
    7.33 +                     void      *dataParam, void           *stackLocs );
    7.34 +
    7.35 +#endif	/* _ProcrContext_H */
    7.36 +
     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/VMS__HW_dependent.s	Wed Feb 22 11:39:12 2012 -0800
     8.3 @@ -0,0 +1,167 @@
     8.4 +.data
     8.5 +
     8.6 +
     8.7 +.text
     8.8 +
     8.9 +//Save return label address for the coreLoop to pointer
    8.10 +//Arguments: Pointer to variable holding address
    8.11 +.globl saveCoreLoopReturnAddr
    8.12 +saveCoreLoopReturnAddr:
    8.13 +    movq    $coreLoopReturn, %rcx   #load label address
    8.14 +    movq    %rcx, (%rdi)           #save address to pointer
    8.15 +    ret
    8.16 +
    8.17 +
    8.18 +//Trick for 64 bit arch -- copies args from stack into regs, then does jmp to
    8.19 +// the top-level function, which was pointed to by the stack-ptr
    8.20 +.globl startUpTopLevelFn
    8.21 +startUpTopLevelFn:
    8.22 +    movq    %rdi      , %rsi #get second argument from first argument of switchVP
    8.23 +    movq    0x08(%rsp), %rdi #get first argument from stack
    8.24 +    movq    (%rsp)    , %rax #get top-level function's addr from stack
    8.25 +    jmp     *%rax            #jump to the top-level function
    8.26 +
    8.27 +//Switches from CoreLoop to a VP, either a normal VP or the Master Loop
    8.28 +//switch to virt procr's stack and frame ptr then jump to virt procr fn
    8.29 +/* SlaveVP  offsets:
    8.30 + * 0x10  stackPtr
    8.31 + * 0x18 framePtr
    8.32 + * 0x20 resumeInstrPtr
    8.33 + * 0x30 coreLoopFramePtr
    8.34 + * 0x38 coreLoopStackPtr
    8.35 + *
    8.36 + * _VMSMasterEnv  offsets:
    8.37 + * 0x48 coreLoopReturnPt
    8.38 + * 0x54 masterLock
    8.39 + */
    8.40 +.globl switchToVP
    8.41 +switchToVP:
    8.42 +    #SlaveVP in %rdi
    8.43 +    movq    %rsp      , 0x38(%rdi)   #save core loop stack pointer 
    8.44 +    movq    %rbp      , 0x30(%rdi)   #save core loop frame pointer
    8.45 +    movq    0x10(%rdi), %rsp         #restore stack pointer
    8.46 +    movq    0x18(%rdi), %rbp         #restore frame pointer
    8.47 +    movq    0x20(%rdi), %rax         #get jmp pointer
    8.48 +    jmp     *%rax                    #jmp to VP
    8.49 +coreLoopReturn:
    8.50 +    ret
    8.51 +
    8.52 +    
    8.53 +//switches to core loop. saves return address
    8.54 +/* SlaveVP  offsets:
    8.55 + * 0x10  stackPtr
    8.56 + * 0x18 framePtr
    8.57 + * 0x20 resumeInstrPtr
    8.58 + * 0x30 coreLoopFramePtr
    8.59 + * 0x38 coreLoopStackPtr
    8.60 + *
    8.61 + * _VMSMasterEnv  offsets:
    8.62 + * 0x48 coreLoopReturnPt
    8.63 + * 0x54 masterLock
    8.64 + */
    8.65 +.globl switchToCoreLoop
    8.66 +switchToCoreLoop:
    8.67 +    #SlaveVP in %rdi
    8.68 +    movq    $VPReturn , 0x20(%rdi)   #store return address
    8.69 +    movq    %rsp      , 0x10(%rdi)   #save stack pointer 
    8.70 +    movq    %rbp      , 0x18(%rdi)   #save frame pointer
    8.71 +    movq    0x38(%rdi), %rsp         #restore stack pointer
    8.72 +    movq    0x30(%rdi), %rbp         #restore frame pointer
    8.73 +    movq    $_VMSMasterEnv, %rcx
    8.74 +    movq    (%rcx)    , %rcx
    8.75 +    movq    0x48(%rcx), %rax         #get CoreLoopStartPt
    8.76 +    jmp     *%rax                    #jmp to CoreLoop
    8.77 +VPReturn:
    8.78 +    ret
    8.79 +
    8.80 +
    8.81 +
    8.82 +//switches to core loop from master. saves return address
    8.83 +//Releases masterLock so the next MasterLoop can be executed
    8.84 +/* SlaveVP  offsets:
    8.85 + * 0x10  stackPtr
    8.86 + * 0x18 framePtr
    8.87 + * 0x20 resumeInstrPtr
    8.88 + * 0x30 coreLoopFramePtr
    8.89 + * 0x38 coreLoopStackPtr
    8.90 + *
    8.91 + * _VMSMasterEnv  offsets:
    8.92 + * 0x48 coreLoopReturnPt
    8.93 + * 0x54 masterLock
    8.94 + */
    8.95 +.globl masterSwitchToCoreLoop
    8.96 +masterSwitchToCoreLoop:
    8.97 +    #SlaveVP in %rdi
    8.98 +    movq    $MasterReturn, 0x20(%rdi)   #store return address
    8.99 +    movq    %rsp      , 0x10(%rdi)   #save stack pointer 
   8.100 +    movq    %rbp      , 0x18(%rdi)   #save frame pointer
   8.101 +    movq    0x38(%rdi), %rsp         #restore stack pointer
   8.102 +    movq    0x30(%rdi), %rbp         #restore frame pointer
   8.103 +    movq    $_VMSMasterEnv, %rcx
   8.104 +    movq    (%rcx)    , %rcx
   8.105 +    movq    0x48(%rcx), %rax         #get CoreLoopStartPt
   8.106 +    movl    $0x0      , 0x54(%rcx)   #release lock
   8.107 +    jmp     *%rax                    #jmp to CoreLoop
   8.108 +MasterReturn:
   8.109 +    ret
   8.110 +
   8.111 +
   8.112 +//Switch to terminateCoreLoop
   8.113 +//therefore switch to coreLoop context from master context
   8.114 +// no need to call because the stack is already set up for switchVP
   8.115 +// and virtPr is in %rdi
   8.116 +// and both functions have the same argument.
   8.117 +// do not save register of VP because this function will never return
   8.118 +/* SlaveVP  offsets:
   8.119 + * 0x10  stackPtr
   8.120 + * 0x18 framePtr
   8.121 + * 0x20 resumeInstrPtr
   8.122 + * 0x30 coreLoopFramePtr
   8.123 + * 0x38 coreLoopStackPtr
   8.124 + *
   8.125 + * _VMSMasterEnv  offsets:
   8.126 + * 0x48 coreLoopReturnPt
   8.127 + * 0x58 masterLock
   8.128 + */
   8.129 +.globl asmTerminateCoreLoop
   8.130 +asmTerminateCoreLoop:
   8.131 +    #SlaveVP in %rdi
   8.132 +    movq    0x38(%rdi), %rsp         #restore stack pointer
   8.133 +    movq    0x30(%rdi), %rbp         #restore frame pointer
   8.134 +    movq    $terminateCoreLoop, %rax
   8.135 +    jmp     *%rax                    #jmp to CoreLoop
   8.136 +
   8.137 +
   8.138 +/*
   8.139 + * This one for the sequential version is special. It discards the current stack
   8.140 + * and returns directly from the coreLoop after VMS__dissipate_procr was called
   8.141 + */
   8.142 +.globl asmTerminateCoreLoopSeq
   8.143 +asmTerminateCoreLoopSeq:
   8.144 +    #SlaveVP in %rdi
   8.145 +    movq    0x38(%rdi), %rsp         #restore stack pointer
   8.146 +    movq    0x30(%rdi), %rbp         #restore frame pointer
   8.147 +    #argument is in %rdi
   8.148 +    call    VMS__dissipate_procr
   8.149 +    movq    %rbp      , %rsp        #goto the coreLoops stack
   8.150 +    pop     %rbp        #restore the old framepointer
   8.151 +    ret                 #return from core loop
   8.152 +    
   8.153 +
   8.154 +//Assembly code takes the return addr off the stack and saves
   8.155 +// into the loc pointed to by rdi.  The return addr is at 0x8(%rbp) for 64bit
   8.156 +.globl VMS_int__save_return_addr_into_ptd_to_loc
   8.157 +VMS_int__save_return_addr_into_ptd_to_loc:
   8.158 +    movq 0x8(%rbp),     %rax  #get ret address, rbp is the same as in the calling function
   8.159 +    movq     %rax,     (%rdi) #write ret addr to endInstrAddr field
   8.160 +    ret
   8.161 +
   8.162 +
   8.163 +//Assembly code changes the return addr on the stack to the one
   8.164 +// pointed to by the parameter. The stack's return addr is at 0x8(%rbp)
   8.165 +.globl VMS_int__write_return_addr_from_ptd_to_loc
   8.166 +VMS_int__write_return_addr_from_ptd_to_loc:
   8.167 +    movq    (%rdi),    %rax      #get return addr
   8.168 +    movq      %rax,    0x8(%rbp) #write return addr to the stack of the caller
   8.169 +    ret
   8.170 +
     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/VMS__PI.c	Wed Feb 22 11:39:12 2012 -0800
     9.3 @@ -0,0 +1,87 @@
     9.4 +/*
     9.5 + * Copyright 2010  OpenSourceStewardshipFoundation
     9.6 + *
     9.7 + * Licensed under BSD
     9.8 + */
     9.9 +
    9.10 +#include <stdio.h>
    9.11 +#include <stdlib.h>
    9.12 +#include <string.h>
    9.13 +#include <malloc.h>
    9.14 +#include <inttypes.h>
    9.15 +#include <sys/time.h>
    9.16 +
    9.17 +#include "VMS.h"
    9.18 +
    9.19 +
    9.20 +/*Unlinks and returns the request at the head of procrWithReq's request
    9.21 + * list; returns NULL when the list is empty.  */
    9.22 +VMSReqst *
    9.23 +VMS_PI__take_next_request_out_of( SlaveVP *procrWithReq )
    9.24 + { VMSReqst *req;
    9.25 +
    9.26 +   req = procrWithReq->requests;
    9.27 +   if( req == NULL ) return NULL;
    9.28 +
    9.29 +   procrWithReq->requests = procrWithReq->requests->nextReqst;
    9.30 +   return req;
    9.31 + }
    9.32 +
    9.33 +
    9.34 +inline void *
    9.35 +VMS_PI__take_sem_reqst_from( VMSReqst *req )
    9.36 + {
    9.37 +   return req->semReqData;
    9.38 + }
    9.39 +
    9.40 +
    9.41 +
    9.42 +/* This is for OS requests and VMS infrastructure requests, such as to create
    9.43 + *  a probe -- a probe is inside the heart of VMS-core, it's not part of any
    9.44 + *  language -- but it's also a semantic thing that's triggered from and used
    9.45 + *  in the application.. so it crosses abstractions..  so, need some special
    9.46 + *  pattern here for handling such requests.
    9.47 + * Doing this just like it were a second language sharing VMS-core.
    9.48 + * 
    9.49 + * This is called from the language's request handler when it sees a request
    9.50 + *  of type VMSSemReq
    9.51 + *
    9.52 + * TODO: Later change this, to give probes their own separate plugin & have
    9.53 + *  VMS-core steer the request to appropriate plugin
    9.54 + * Do the same for OS calls -- look later at it..
    9.55 + */
    9.56 +void inline
    9.57 +VMS_PI__handle_VMSSemReq( VMSReqst *req, SlaveVP *requestingPr, void *semEnv,
    9.58 +                       ResumeVPFnPtr resumePrFnPtr )
    9.59 + { VMSSemReq     *semReq;
    9.60 +   IntervalProbe *newProbe;
    9.61 +
    9.62 +   semReq = req->semReqData;
    9.63 +
    9.64 +   newProbe          = VMS_int__malloc( sizeof(IntervalProbe) );
    9.65 +   newProbe->nameStr = VMS_int__strDup( semReq->nameStr );
    9.66 +   newProbe->hist    = NULL;
    9.67 +   newProbe->schedChoiceWasRecorded = FALSE;
    9.68 +
    9.69 +      //This runs in masterVP, so no race-condition worries
    9.70 +   newProbe->probeID =
    9.71 +             addToDynArray( newProbe, _VMSMasterEnv->dynIntervalProbesInfo );
    9.72 +
    9.73 +   requestingPr->dataRetFromReq = newProbe;
    9.74 +
    9.75 +   (*resumePrFnPtr)( requestingPr, semEnv );
    9.76 + }
    9.77 +
    9.78 +
    9.79 +/*Later, improve this -- for now, just prints msgStr to stdout and exits
    9.80 + * the application with status 1.  reqstPr and excpData are not used yet.
    9.81 + */
    9.82 +void
    9.83 +VMS_PI__throw_exception( char *msgStr, SlaveVP *reqstPr, VMSExcp *excpData )
    9.84 + {
    9.85 +   printf("%s",msgStr);
    9.86 +   fflush(stdout);  //flush the output stream; fflush on stdin is undefined
    9.87 +   exit(1);
    9.88 + }
    9.89 +
    9.90 +
    10.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.2 +++ b/VMS__WL.c	Wed Feb 22 11:39:12 2012 -0800
    10.3 @@ -0,0 +1,138 @@
    10.4 +/*
    10.5 + * Copyright 2010  OpenSourceStewardshipFoundation
    10.6 + *
    10.7 + * Licensed under BSD
    10.8 + */
    10.9 +
   10.10 +#include <stdio.h>
   10.11 +#include <stdlib.h>
   10.12 +#include <string.h>
   10.13 +#include <malloc.h>
   10.14 +#include <inttypes.h>
   10.15 +#include <sys/time.h>
   10.16 +
   10.17 +#include "VMS.h"
   10.18 +
   10.19 +
   10.20 +/*Anticipating multi-tasking
   10.21 + */
   10.22 +void *
   10.23 +VMS_WL__give_sem_env_for( SlaveVP *animPr )
   10.24 + {
   10.25 +   return _VMSMasterEnv->semanticEnv;
   10.26 + }
   10.27 +
   10.28 +
   10.29 +/*For this implementation of VMS, it may not make much sense to have the
   10.30 + * system of requests for creating a new processor done this way.. but over
   10.31 + * the scope of single-master, multi-master, mult-tasking, OS-implementing,
   10.32 + * distributed-memory, and so on, this gives VMS implementation a chance to
   10.33 + * do stuff before suspend, in the AppVP, and in the Master before the plugin
   10.34 + * is called, as well as in the lang-lib before this is called, and in the
   10.35 + * plugin.  So, this gives both VMS and language implementations a chance to
   10.36 + * intercept at various points and do order-dependent stuff.
   10.37 + *Having a standard VMSNewPrReqData struc allows the language to create and
   10.38 + * free the struc, while VMS knows how to get the newPr if it wants it, and
   10.39 + * it lets the lang have lang-specific data related to creation transported
   10.40 + * to the plugin.
   10.41 + */
   10.42 +void
   10.43 +VMS_WL__send_create_procr_req( void *semReqData, SlaveVP *reqstingPr )
   10.44 + { VMSReqst req;
   10.45 +
   10.46 +   req.reqType          = createReq;
   10.47 +   req.semReqData       = semReqData;
   10.48 +   req.nextReqst        = reqstingPr->requests;
   10.49 +   reqstingPr->requests = &req;
   10.50 +
   10.51 +   VMS_int__suspend_procr( reqstingPr );
   10.52 + }
   10.53 +
   10.54 +
   10.55 +/*
   10.56 + *This adds a request to dissipate, then suspends the processor so that the
   10.57 + * request handler will receive the request.  The request handler is what
   10.58 + * does the work of freeing memory and removing the processor from the
   10.59 + * semantic environment's data structures.
   10.60 + *The request handler also is what figures out when to shutdown the VMS
   10.61 + * system -- which causes all the core loop threads to die, and returns from
   10.62 + * the call that started up VMS to perform the work.
   10.63 + *
   10.64 + *This form is a bit misleading to understand if one is trying to figure out
   10.65 + * how VMS works -- it looks like a normal function call, but inside it
   10.66 + * sends a request to the request handler and suspends the processor, which
   10.67 + * jumps out of the VMS__dissipate_procr function, and out of all nestings
   10.68 + * above it, transferring the work of dissipating to the request handler,
   10.69 + * which then does the actual work -- causing the processor that animated
   10.70 + * the call of this function to disappear and the "hanging" state of this
   10.71 + * function to just poof into thin air -- the virtual processor's trace
   10.72 + * never returns from this call, but instead the virtual processor's trace
   10.73 + * gets suspended in this call and all the virt processor's state disap-
   10.74 + * pears -- making that suspend the last thing in the virt procr's trace.
   10.75 + */
   10.76 +void
   10.77 +VMS_WL__send_dissipate_req( SlaveVP *procrToDissipate )
   10.78 + { VMSReqst req;  //carrier lives on this VP's stack while it is suspended
   10.79 +   req.reqType                = dissipate;
   10.80 +   req.semReqData             = NULL; //no payload -- don't hand over garbage
   10.81 +   req.nextReqst              = procrToDissipate->requests;
   10.82 +   procrToDissipate->requests = &req;
   10.83 +
   10.84 +   VMS_int__suspend_procr( procrToDissipate );
   10.85 + }
   10.86 +
   10.87 +
   10.88 +
   10.89 +/*This call's name indicates that request is malloc'd -- so req handler
   10.90 + * has to free any extra requests tacked on before a send, using this.
   10.91 + *
   10.92 + * This inserts the semantic-layer's request data into standard VMS carrier
   10.93 + * request data-struct that is mallocd.  The sem request doesn't need to
   10.94 + * be malloc'd if this is called inside the same call chain before the
   10.95 + * send of the last request is called.
   10.96 + *
   10.97 + *The request handler has to call VMS__free_VMSReq for any of these
   10.98 + */
   10.99 +inline void
  10.100 +VMS_WL__add_sem_request_in_mallocd_VMSReqst( void *semReqData,
  10.101 +                                          SlaveVP *callingPr )
  10.102 + { VMSReqst *req;
  10.103 +
  10.104 +   req = VMS_int__malloc( sizeof(VMSReqst) );
  10.105 +   req->reqType         = semantic;
  10.106 +   req->semReqData      = semReqData;
  10.107 +   req->nextReqst       = callingPr->requests;
  10.108 +   callingPr->requests = req;
  10.109 + }
  10.110 +
  10.111 +/*This inserts the semantic-layer's request data into standard VMS carrier
  10.112 + * request data-struct is allocated on stack of this call & ptr to it sent
  10.113 + * to plugin
  10.114 + *Then it does suspend, to cause request to be sent.
  10.115 + */
  10.116 +inline void
  10.117 +VMS_WL__send_sem_request( void *semReqData, SlaveVP *callingPr )
  10.118 + { VMSReqst req;
  10.119 +
  10.120 +   req.reqType         = semantic;
  10.121 +   req.semReqData      = semReqData;
  10.122 +   req.nextReqst       = callingPr->requests;
  10.123 +   callingPr->requests = &req;
  10.124 +   
  10.125 +   VMS_int__suspend_procr( callingPr );
  10.126 + }
  10.127 +
  10.128 +/*Like VMS_WL__send_sem_request, but tags the carrier as VMSSemantic*/
  10.129 +inline void
  10.130 +VMS_WL__send_VMSSem_request( void *semReqData, SlaveVP *callingPr )
  10.131 + { VMSReqst req;
  10.132 +
  10.133 +   req.reqType         = VMSSemantic;
  10.134 +   req.semReqData      = semReqData;
  10.135 +   req.nextReqst       = callingPr->requests; //chain any preceding requests
  10.136 +   callingPr->requests = &req;
  10.137 +
  10.138 +   VMS_int__suspend_procr( callingPr );
  10.139 + }
  10.140 +
  10.141 +
    11.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.2 +++ b/VMS__int.c	Wed Feb 22 11:39:12 2012 -0800
    11.3 @@ -0,0 +1,155 @@
    11.4 +/*
    11.5 + * Copyright 2010  OpenSourceStewardshipFoundation
    11.6 + *
    11.7 + * Licensed under BSD
    11.8 + */
    11.9 +
   11.10 +#include <stdio.h>
   11.11 +#include <stdlib.h>
   11.12 +#include <string.h>
   11.13 +#include <malloc.h>
   11.14 +#include <inttypes.h>
   11.15 +#include <sys/time.h>
   11.16 +
   11.17 +#include "VMS.h"
   11.18 +
   11.19 +/*Creates a virt procr from inside VMS; struct and stack use VMS_int__malloc*/
   11.20 +inline SlaveVP *
   11.21 +VMS_int__create_procr( TopLevelFnPtr fnPtr, void *dataParam )
   11.22 + { SlaveVP *newPr;
   11.23 +   void      *stackLocs;
   11.24 +
   11.25 +   newPr      = VMS_int__malloc( sizeof(SlaveVP) );
   11.26 +   stackLocs  = VMS_int__malloc( VIRT_PROCR_STACK_SIZE );
   11.27 +   if( newPr == 0 || stackLocs == 0 ) //both go to the helper, check both
   11.28 +    { perror("VMS__malloc procr or stack"); exit(1); }
   11.29 +
   11.30 +   _VMSMasterEnv->numSlaves += 1; //VMS_int__dissipate_procr undoes this
   11.31 +
   11.32 +   return create_procr_helper( newPr, fnPtr, dataParam, stackLocs );
   11.33 + }
   11.34 +
   11.35 +/* "ext" means for use outside the VMS system -- only call from the main
   11.36 + * thread or other OS thread, never from code animated by a VMS virtual
   11.37 + * processor.  The struct and stack come from the system malloc, so free
   11.38 + * them with VMS_ext__dissipate_procr.  Exits if either malloc fails. */
   11.39 +inline SlaveVP *
   11.40 +VMS_ext__create_procr( TopLevelFnPtr fnPtr, void *dataParam )
   11.41 + { SlaveVP *newPr;
   11.42 +   char      *stackLocs;
   11.43 +
   11.44 +   newPr      = malloc( sizeof(SlaveVP) );
   11.45 +   stackLocs  = malloc( VIRT_PROCR_STACK_SIZE );
   11.46 +   if( newPr == 0 || stackLocs == 0 )
   11.47 +    { perror("malloc procr or stack"); exit(1); }
   11.48 +      //note: numSlaves is not incremented here, unlike the "int" version
   11.49 +   return create_procr_helper( newPr, fnPtr, dataParam, stackLocs );
   11.50 + }
   11.51 +
   11.52 +
   11.53 +//===========================================================================
   11.54 +/*Suspends animatingPr and hands control back to the core loop.
   11.55 + *Marks the procr's sched slot as workIsDone, then calls switchToCoreLoop.
   11.56 + * The intent (per the original notes) is that the pick-up point, and any
   11.57 + * registers that need saving, are kept in the procr struc, so that the
   11.58 + * core loop can later jump back and this call then "returns" to its caller.
   11.59 + *NOTE(review): the label-based resume (&&ResumePt) is commented out below;
   11.60 + * presumably switchToCoreLoop now records the pick-up point itself --
   11.61 + * confirm against VMS__HW_dependent.
   11.62 + */
   11.63 +void
   11.64 +VMS_int__suspend_procr( SlaveVP *animatingPr )
   11.65 + { 
   11.66 +
   11.67 +      //The request to master will cause this suspended virt procr to get
   11.68 +      // scheduled again at some future point -- to resume, core loop jumps
   11.69 +      // to the resume point (below), which causes restore of saved regs and
   11.70 +      // "return" from this call.
   11.71 +   //animatingPr->resumeInstrPtr = &&ResumePt;
   11.72 +
   11.73 +      //return ownership of the virt procr and sched slot to Master virt pr
   11.74 +   animatingPr->schedSlot->workIsDone = TRUE;
   11.75 +
   11.76 +   //===========================  Measurement stuff ========================
   11.77 +   #ifdef MEAS__TIME_STAMP_SUSP
   11.78 +      //record time stamp: compare to time-stamp recorded below
   11.79 +   saveLowTimeStampCountInto( animatingPr->preSuspTSCLow );
   11.80 +   #endif
   11.81 +   //=======================================================================
   11.82 +
   11.83 +   switchToCoreLoop(animatingPr); //leaves this procr; resumes here later
   11.84 +   flushRegisters(); //NOTE(review): presumably forces reload of regs -- confirm
   11.85 +
   11.86 +   //=======================================================================
   11.87 +
   11.88 +   #ifdef MEAS__TIME_STAMP_SUSP
   11.89 +      //NOTE: only take low part of count -- do sanity check when take diff
   11.90 +   saveLowTimeStampCountInto( animatingPr->postSuspTSCLow );
   11.91 +   #endif
   11.92 +
   11.93 +   return;
   11.94 + }
   11.95 +
   11.96 +
   11.97 +/* "ext" version: use this to dissipate VPs that were created outside the
   11.98 + * VMS system -- the struct and the stack are handed to the system free.
   11.99 + *
  11.100 + *Should only be called from main thread or other thread -- never from code
  11.101 + * animated by a SlaveVP, nor from a masterVP.
  11.102 + */
  11.103 +void
  11.104 +VMS_ext__dissipate_procr( SlaveVP *procrToDissipate )
  11.105 + { void *stackToFree;
  11.106 +
  11.107 +      //dataParam was handed to the processor, so it was either alloc'd
  11.108 +      // with VMS__malloc or gets freed by the level above -- which leaves
  11.109 +      // only the stack and the SlaveVP struc to free here.  (Don't
  11.110 +      // stack-allocate the data param: the creator may not outlive it.)
  11.111 +   stackToFree = procrToDissipate->startOfStack; //read before struc freed
  11.112 +   free( stackToFree );
  11.113 +   free( procrToDissipate );
  11.114 + }
  11.115 +
  11.116 +
  11.117 +
  11.118 +/*This must be called by the request handler plugin -- it cannot be called
  11.119 + * from the semantic library "dissipate processor" function -- instead, the
  11.120 + * semantic layer has to generate a request, and the plug-in calls this
  11.121 + * function.
  11.122 + *The reason is that this frees the virtual processor's stack -- which is
  11.123 + * still in use inside semantic library calls!
  11.124 + *
  11.125 + *This frees or recycles all the state owned by and comprising the VMS
  11.126 + * portion of the animating virtual procr.  The request handler must first
  11.127 + * free any semantic data created for the processor that didn't use the
  11.128 + * VMS_malloc mechanism.  Then it calls this, which is meant to first ask
  11.129 + * the malloc system to disown any state that did use VMS_malloc (still a
  11.130 + * TODO, see below), and then frees the stack and the processor-struct.
  11.131 + *Once disowning exists: if the dissipated processor is the sole (remaining)
  11.132 + * owner of VMS__malloc'd state, then that state gets freed (or sent to
  11.133 + * recycling) as a side-effect of dis-owning it.
  11.134 + */
  11.135 +void
  11.136 +VMS_int__dissipate_procr( SlaveVP *animatingPr )
  11.137 + {
  11.138 +      //dis-own all locations owned by this processor, causing to be freed
  11.139 +      // any locations that it is (was) sole owner of
  11.140 +//TODO: implement VMS__malloc system, including "give up ownership"
  11.141 +      //one fewer live procr -- when the count hits zero, start the shutdown
  11.142 +   _VMSMasterEnv->numSlaves -= 1;
  11.143 +   if( _VMSMasterEnv->numSlaves == 0 )
  11.144 +    {    //no more work, so shutdown
  11.145 +      VMS_int__shutdown();  //note, creates one shut-down procr per core
  11.146 +    }
  11.147 +
  11.148 +      //NOTE: dataParam was given to the processor, so should either have
  11.149 +      // been alloc'd with VMS__malloc, or freed by the level above animPr.
  11.150 +      //So, all that's left to free here is the stack and the SlaveVP struc
  11.151 +      // itself
  11.152 +      //Note, should not stack-allocate initial data -- no guarantee, in
  11.153 +      // general that creating processor will outlive ones it creates.
  11.154 +   VMS_int__free( animatingPr->startOfStack );
  11.155 +   VMS_int__free( animatingPr );
  11.156 + }
  11.157 +
  11.158 +
    12.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.2 +++ b/VMS__startup_and_shutdown.c	Wed Feb 22 11:39:12 2012 -0800
    12.3 @@ -0,0 +1,458 @@
    12.4 +/*
    12.5 + * Copyright 2010  OpenSourceStewardshipFoundation
    12.6 + *
    12.7 + * Licensed under BSD
    12.8 + */
    12.9 +
   12.10 +#include <stdio.h>
   12.11 +#include <stdlib.h>
   12.12 +#include <string.h>
   12.13 +#include <malloc.h>
   12.14 +#include <inttypes.h>
   12.15 +#include <sys/time.h>
   12.16 +
   12.17 +#include "VMS.h"
   12.18 +#include "VMS__HW_dependent.h"
   12.19 +
   12.20 +
   12.21 +#define thdAttrs NULL
   12.22 +
   12.23 +//===========================================================================
   12.24 +void
   12.25 +shutdownFn( void *dummy, SlaveVP *dummy2 );
   12.26 +
   12.27 +SchedSlot **
   12.28 +create_sched_slots();
   12.29 +
   12.30 +void
   12.31 +create_masterEnv();
   12.32 +
   12.33 +void
   12.34 +create_the_coreLoop_OS_threads();
   12.35 +
   12.36 +MallocProlog *
   12.37 +create_free_list();
   12.38 +
   12.39 +void
   12.40 +endOSThreadFn( void *initData, SlaveVP *animatingPr );
   12.41 +
   12.42 +pthread_mutex_t suspendLock = PTHREAD_MUTEX_INITIALIZER;
   12.43 +pthread_cond_t  suspend_cond  = PTHREAD_COND_INITIALIZER;
   12.44 +
   12.45 +//===========================================================================
   12.46 +
   12.47 +/*Setup has two phases:
   12.48 + * 1) Semantic layer first calls init_VMS, which creates masterEnv, and puts
   12.49 + *    the master virt procr into the work-queue, ready for first "call"
   12.50 + * 2) Semantic layer then does its own init, which creates the seed virt
   12.51 + *    procr inside the semantic layer, ready to schedule it when
   12.52 + *    asked by the first run of the masterLoop.
   12.53 + *
   12.54 + *This part is a bit weird because VMS really wants to be "always there",
   12.55 + * and have applications attach and detach.  For now, this VMS is part of
   12.56 + * the app, so the VMS system starts up as part of running the app.
   12.57 + *
   12.58 + *The semantic layer is isolated from the VMS internals by making the
   12.59 + * semantic layer do setup to a state that it's ready with its
   12.60 + * initial virt procrs, ready to schedule them to slots when the masterLoop
   12.61 + * asks.  Without this pattern, the semantic layer's setup would
   12.62 + * have to modify slots directly to assign the initial virt-procrs, and put
   12.63 + * them into the readyToAnimateQ itself, breaking the isolation completely.
   12.64 + *
   12.65 + * 
   12.66 + *The semantic layer creates the initial virt procr(s), and adds its
   12.67 + * own environment to masterEnv, and fills in the pointers to
   12.68 + * the requestHandler and slaveScheduler plug-in functions
   12.69 + */
   12.70 +
   12.71 +/*First phase of setup: makes the master environment and, unless this is a
   12.72 + * SEQUENTIAL build, creates the OS threads that will run the core loops.
   12.73 + *Nothing is returned -- the environment is reached through _VMSMasterEnv.
   12.74 + */
   12.75 +void
   12.76 +VMS_int__init()
   12.77 + {
   12.78 +      //both build flavors start by making the master env
   12.79 +   create_masterEnv();
   12.80 +
   12.81 +#ifdef SEQUENTIAL
   12.82 +   flushRegisters();  //? not sure why here -- merten added it..?
   12.83 +#else
   12.84 +   create_the_coreLoop_OS_threads();
   12.85 +#endif
   12.86 + }
   12.87 +
   12.88 +void
   12.89 +create_masterEnv()
   12.90 + { MasterEnv       *masterEnv;
   12.91 +   VMSQueueStruc **readyToAnimateQs;
   12.92 +   int              coreIdx;
   12.93 +   SlaveVP      **masterVPs;
   12.94 +   SchedSlot     ***allSchedSlots; //ptr to array of ptrs
   12.95 +      //Builds the global _VMSMasterEnv: the free-list first, then the
   12.96 +      // per-core queues, master VPs and sched slots.  Exits on malloc fail.
   12.97 +      //Make the master env, which holds everything else
   12.98 +   _VMSMasterEnv = malloc( sizeof(MasterEnv) );
   12.99 +   if( _VMSMasterEnv == 0 ) { perror("malloc masterEnv"); exit(1); }
  12.100 +        //Very first thing put into the master env is the free-list, seeded
  12.101 +        // with a massive initial chunk of memory.
  12.102 +        //After this, all other mallocs are VMS__malloc.
  12.103 +   _VMSMasterEnv->freeListHead        = VMS_ext__create_free_list();
  12.104 +
  12.105 +
  12.106 +   //============================= MEASUREMENT STUFF ========================
  12.107 +   #ifdef MEAS__TIME_MALLOC
  12.108 +   _VMSMasterEnv->mallocTimeHist  = makeFixedBinHistExt( 100, 0, 30,
  12.109 +                                                       "malloc_time_hist");
  12.110 +   _VMSMasterEnv->freeTimeHist  = makeFixedBinHistExt( 100, 0, 30,
  12.111 +                                                       "free_time_hist");
  12.112 +   #endif
  12.113 +   #ifdef MEAS__TIME_PLUGIN
  12.114 +   _VMSMasterEnv->reqHdlrLowTimeHist  = makeFixedBinHistExt( 100, 0, 200,
  12.115 +                                                     "plugin_low_time_hist");
  12.116 +   _VMSMasterEnv->reqHdlrHighTimeHist  = makeFixedBinHistExt( 100, 0, 200,
  12.117 +                                                    "plugin_high_time_hist");
  12.118 +   #endif
  12.119 +   //========================================================================
  12.120 +
  12.121 +   //===================== Only VMS__malloc after this ====================
  12.122 +   masterEnv     = (MasterEnv*)_VMSMasterEnv;
  12.123 +
  12.124 +      //Make a readyToAnimateQ for each core loop
  12.125 +   readyToAnimateQs = VMS_int__malloc( NUM_CORES * sizeof(VMSQueueStruc *) );
  12.126 +   masterVPs        = VMS_int__malloc( NUM_CORES * sizeof(SlaveVP *) );
  12.127 +
  12.128 +      //One array per core, NUM_SCHED_SLOTS in each, core's masterVP scheds all
  12.129 +   allSchedSlots    = VMS_int__malloc( NUM_CORES * sizeof(SchedSlot **) );
  12.130 +
  12.131 +   _VMSMasterEnv->numSlaves = 0;  //used to detect shut-down condition
  12.132 +      //NOTE(review): master VPs made below each add 1 to numSlaves -- intended?
  12.133 +   _VMSMasterEnv->numVPsCreated = 0;  //used by create procr to set ID
  12.134 +   for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
  12.135 +    {
  12.136 +      readyToAnimateQs[ coreIdx ] = makeVMSQ();
  12.137 +
  12.138 +         //Q: should give masterVP core-specific info as its init data?
  12.139 +      masterVPs[ coreIdx ] = VMS_int__create_procr( (TopLevelFnPtr)&masterLoop, (void*)masterEnv );
  12.140 +      masterVPs[ coreIdx ]->coreAnimatedBy = coreIdx;
  12.141 +      allSchedSlots[ coreIdx ] = create_sched_slots(); //makes for one core
  12.142 +      _VMSMasterEnv->numMasterInARow[ coreIdx ] = 0;
  12.143 +      _VMSMasterEnv->workStealingGates[ coreIdx ] = NULL;
  12.144 +    }
  12.145 +   _VMSMasterEnv->readyToAnimateQs = readyToAnimateQs;
  12.146 +   _VMSMasterEnv->masterVPs        = masterVPs;
  12.147 +   _VMSMasterEnv->masterLock       = UNLOCKED;
  12.148 +   _VMSMasterEnv->allSchedSlots    = allSchedSlots;
  12.149 +   _VMSMasterEnv->workStealingLock = UNLOCKED;
  12.150 +
  12.151 +
  12.152 +      //Aug 19, 2010:  no longer need to place initial masterVP into queue
  12.153 +      // because coreLoop now controls -- animates its masterVP when no work
  12.154 +
  12.155 +
  12.156 +   //============================= MEASUREMENT STUFF ========================
  12.157 +   #ifdef STATS__TURN_ON_PROBES
  12.158 +   _VMSMasterEnv->dynIntervalProbesInfo =
  12.159 +              makePrivDynArrayOfSize( (void***)&(_VMSMasterEnv->intervalProbes), 200);
  12.160 +
  12.161 +   _VMSMasterEnv->probeNameHashTbl = makeHashTable( 1000, &VMS_int__free );
  12.162 +
  12.163 +      //put creation time directly into master env, for fast retrieval
  12.164 +   struct timeval timeStamp;
  12.165 +   gettimeofday( &(timeStamp), NULL);
  12.166 +   _VMSMasterEnv->createPtInSecs =
  12.167 +                           timeStamp.tv_sec +(timeStamp.tv_usec/1000000.0);
  12.168 +   #endif
  12.169 +   #ifdef MEAS__TIME_MASTER_LOCK
  12.170 +   _VMSMasterEnv->masterLockLowTimeHist  = makeFixedBinHist( 50, 0, 2,
  12.171 +                                                "master lock low time hist");
  12.172 +   _VMSMasterEnv->masterLockHighTimeHist  = makeFixedBinHist( 50, 0, 100,
  12.173 +                                               "master lock high time hist");
  12.174 +   #endif
  12.175 +
  12.176 +   MakeTheMeasHists();
  12.177 +   //========================================================================
  12.178 + }
  12.179 +
  12.180 +SchedSlot **
  12.181 +create_sched_slots()
  12.182 + { SchedSlot  **slotArray, *slot;
  12.183 +   int          slotIdx;
  12.184 +      //one array of NUM_SCHED_SLOTS ptrs, each slot alloc'd on its own
  12.185 +   slotArray = VMS_int__malloc( NUM_SCHED_SLOTS * sizeof(SchedSlot *) );
  12.186 +
  12.187 +   for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++ )
  12.188 +    {
  12.189 +      slot = VMS_int__malloc( sizeof(SchedSlot) );
  12.190 +         //Set state to mean "handling requests done, slot needs filling"
  12.191 +      slot->needsProcrAssigned = TRUE;
  12.192 +      slot->workIsDone         = FALSE;
  12.193 +      slotArray[ slotIdx ]     = slot;
  12.194 +    }
  12.195 +   return slotArray;
  12.196 + }
  12.197 +
  12.198 +/*Undoes create_sched_slots: frees each slot, then the array of ptrs*/
  12.199 +void
  12.200 +freeSchedSlots( SchedSlot **schedSlots )
  12.201 + { int slotIdx = 0;
  12.202 +   while( slotIdx < NUM_SCHED_SLOTS )
  12.203 +    {
  12.204 +      VMS_int__free( schedSlots[ slotIdx++ ] );
  12.205 +    }
  12.206 +   VMS_int__free( schedSlots );
  12.207 + }
  12.208 +
  12.209 +
  12.210 +void
  12.211 +create_the_coreLoop_OS_threads()
  12.212 + {
  12.213 +   //========================================================================
  12.214 +   //                      Create the Threads
  12.215 +   int        coreIdx, retCode;
  12.216 +   ThdParams *params;
  12.217 +      //Threads have to hold off after creation until signalled -- gives
  12.218 +      // time to initialize other stuff before the coreLoops set off -- so
  12.219 +      // the "setup complete" flag starts out cleared.
  12.220 +   _VMSMasterEnv->setupComplete = 0;
  12.221 +
  12.222 +      //Make the threads that animate the core loops, one per core
  12.223 +   for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ )
  12.224 +    { params          = VMS_int__malloc( sizeof(ThdParams) );
  12.225 +      params->coreNum = coreIdx;
  12.226 +      coreLoopThdParams[coreIdx] = params;
  12.227 +      retCode =
  12.228 +      pthread_create( &(coreLoopThdHandles[coreIdx]),
  12.229 +                        thdAttrs,
  12.230 +                       &coreLoop,
  12.231 +               (void *)params );
  12.232 +      if(retCode){printf("ERROR creating thread: %d\n", retCode); exit(1);}
  12.233 +    }
  12.234 + }
  12.235 +
  12.236 +
  12.237 +/*Records the plug-in's request handler function in the master env*/
  12.238 +void
  12.239 +VMS_WL__register_request_handler( RequestHandler requestHandler )
  12.240 + { _VMSMasterEnv->requestHandler = requestHandler;
  12.241 + }
  12.242 +
  12.243 +/*Records the plug-in's slave sched-assigner function in the master env*/
  12.244 +void
  12.245 +VMS_WL__register_sched_assigner( Sched_Assigner schedAssigner )
  12.246 + { _VMSMasterEnv->slaveSchedAssigner = schedAssigner;
  12.247 + }
  12.248 +/*Records the semantic layer's environment pointer in the master env*/
  12.249 +void VMS_WL__register_semantic_env( void *semanticEnv )
  12.250 + { _VMSMasterEnv->semanticEnv = semanticEnv;
  12.251 + }
  12.252 +
  12.253 +
  12.254 +/*This is what sets the VMS system running.. then waits for it to
  12.255 + * exit.  (The setup itself is done beforehand, by VMS_int__init.)
  12.256 + * 
  12.257 + *Wrapper lib layer calls this when it wants the system to start running..
  12.258 + */
  12.259 +void
  12.260 +VMS_WL__start_the_work_then_wait_until_done()
  12.261 + { 
  12.262 +#ifdef SEQUENTIAL
  12.263 +   /*Only difference between version with an OS thread pinned to each core and
  12.264 +    * the sequential version of VMS is in VMS_int__init, this, and coreLoop_Seq.
  12.265 +    */
  12.266 +         //Instead of un-suspending threads, just call the one and only
  12.267 +         // core loop (sequential version), in the main thread.
  12.268 +      coreLoop_Seq( NULL );
  12.269 +      flushRegisters();
  12.270 +#else
  12.271 +   int coreIdx;
  12.272 +      //Start the core loops running
  12.273 +   
  12.274 +      //tell the core loop threads that setup is complete
  12.275 +      //get lock, to lock out any threads still starting up -- they'll see
  12.276 +      // that setupComplete is true before entering while loop, and so never
  12.277 +      // wait on the condition
  12.278 +   pthread_mutex_lock(     &suspendLock );
  12.279 +   _VMSMasterEnv->setupComplete = 1;
  12.280 +   pthread_mutex_unlock(   &suspendLock );
  12.281 +   pthread_cond_broadcast( &suspend_cond ); //wake any already waiting
  12.282 +   
  12.283 +   
  12.284 +      //wait for all to complete (return codes of the joins are not checked)
  12.285 +   for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ )
  12.286 +    {
  12.287 +      pthread_join( coreLoopThdHandles[coreIdx], NULL );
  12.288 +    }
  12.289 +   
  12.290 +      //NOTE: do not clean up VMS env here -- semantic layer has to have
  12.291 +      // a chance to clean up its environment first, then do a call to free
  12.292 +      // the Master env and rest of VMS locations
  12.293 +#endif
  12.294 + }
  12.295 +
  12.296 +
  12.297 +//TODO: look at architecting cleanest separation between request handler
  12.298 +// and master loop, for dissipate, create, shutdown, and other non-semantic
  12.299 +// requests.  Issue is chain: one removes requests from AppVP, one dispatches
  12.300 +// on type of request, and one handles each type..  but some types require
  12.301 +// action from both request handler and master loop -- maybe just give the
  12.302 +// request handler calls like:  VMS__handle_X_request_type
  12.303 +
  12.304 +
  12.305 +/*This is called by the semantic layer's request handler when it decides it's
  12.306 + * time to shut down the VMS system.  Calling this causes the core loop OS
  12.307 + * threads to exit, which unblocks the entry-point function that started up
  12.308 + * VMS, and allows it to grab the result and return to the original single-
  12.309 + * threaded application.
  12.310 + * 
  12.311 + *The _VMSMasterEnv is needed by this shut down function, so the create-seed-
  12.312 + * and-wait function has to free a bunch of stuff after it detects the
  12.313 + * threads have all died: the masterEnv, the thread-related locations,
  12.314 + * masterVP any AppVPs that might still be allocated and sitting in the
  12.315 + * semantic environment, or have been orphaned in the _VMSWorkQ.
  12.316 + * 
  12.317 + *NOTE: the semantic plug-in is expected to use VMS__malloc to get all the
  12.318 + * locations it needs, and give ownership to masterVP.  Then, they will be
  12.319 + * automatically freed.
  12.320 + *
  12.321 + *In here, create one core-loop shut-down processor for each core loop and put
  12.322 + * them all directly into the readyToAnimateQ.
  12.323 + *Note, this function can ONLY be called after the semantic environment no
  12.324 + * longer cares if AppVPs get animated after the point this is called.  In
  12.325 + * other words, this can be used as an abort, or else it should only be
  12.326 + * called when all AppVPs have finished dissipate requests -- only at that
  12.327 + * point is it sure that all results have completed.
  12.328 + */
  12.329 +void
  12.330 +VMS_int__shutdown()
  12.331 + { SlaveVP *shutDownPr;
  12.332 +   int      coreIdx = 0;
  12.333 +
  12.334 +      //One shutdown processor per core loop, written straight into that
  12.335 +      // core's readyToAnimateQ -- each core will die when gets one
  12.336 +   while( coreIdx < NUM_CORES )
  12.337 +    {    //Note, this is running in the master
  12.338 +      shutDownPr = VMS_int__create_procr( &endOSThreadFn, NULL );
  12.339 +      writeVMSQ( shutDownPr, _VMSMasterEnv->readyToAnimateQs[coreIdx] );
  12.340 +      coreIdx++;
  12.341 +    }
  12.342 + }
  12.343 +
  12.344 +
  12.345 +/*Am trying to be cute, avoiding IF statement in coreLoop that checks for
  12.346 + * a special shutdown procr.  Ended up with extra-complex shutdown sequence.
  12.347 + *This function has the sole purpose of setting the stack and framePtr
  12.348 + * to the coreLoop's stack and framePtr.. it does that then jumps to the
  12.349 + * core loop's shutdown point -- might be able to just call pthread_exit
  12.350 + * from here, but am going back to the pthread's stack and setting everything
  12.351 + * up just as if it never jumped out, before calling pthread_exit.
  12.352 + *The end-point of core loop will free the stack and so forth of the
  12.353 + * processor that animates this function, (this fn is transferring the
  12.354 + * animator of the AppVP that is in turn animating this function over
  12.355 + * to core loop function -- note that this slices out a level of virtual
  12.356 + * processors).
  12.357 + */
  12.358 +void
  12.359 +endOSThreadFn( void *initData, SlaveVP *animatingPr )
  12.360 + { 
  12.361 +#ifdef SEQUENTIAL
  12.362 +    asmTerminateCoreLoopSeq(animatingPr); //initData is not used
  12.363 +#else
  12.364 +    asmTerminateCoreLoop(animatingPr); //initData is not used
  12.365 +#endif
  12.366 + }
  12.367 +
  12.368 +
  12.369 +/*Last step of shutdown: prints and saves any measurement histograms that
  12.370 + * were made, then releases the VMS free-list and the master env itself.
  12.371 + *
  12.372 + *_VMSMasterEnv is freed in here, so call this only after the semantic
  12.373 + * layer has had its chance to clean up its own environment.
  12.374 + */
  12.375 +void
  12.376 +VMS_int__cleanup_at_end_of_shutdown()
  12.377 + {
  12.378 +   //unused
  12.379 +   //VMSQueueStruc **readyToAnimateQs;
  12.380 +   //int              coreIdx;
  12.381 +   //SlaveVP      **masterVPs;
  12.382 +   //SchedSlot     ***allSchedSlots; //ptr to array of ptrs
  12.383 +      //Before getting rid of everything, print out any measurements made
  12.384 +      //NOTE(review): assumes MakeTheMeasHists() set measHistsInfo -- confirm
  12.385 +   forAllInDynArrayDo( _VMSMasterEnv->measHistsInfo, (DynArrayFnPtr)&printHist );
  12.386 +   forAllInDynArrayDo( _VMSMasterEnv->measHistsInfo, (DynArrayFnPtr)&saveHistToFile);
  12.387 +   forAllInDynArrayDo( _VMSMasterEnv->measHistsInfo, (DynArrayFnPtr)&freeHist );
  12.388 +
  12.389 +   #ifdef MEAS__TIME_PLUGIN
  12.390 +   printHist( _VMSMasterEnv->reqHdlrLowTimeHist );
  12.391 +   saveHistToFile( _VMSMasterEnv->reqHdlrLowTimeHist );
  12.392 +   printHist( _VMSMasterEnv->reqHdlrHighTimeHist );
  12.393 +   saveHistToFile( _VMSMasterEnv->reqHdlrHighTimeHist );
  12.394 +   freeHistExt( _VMSMasterEnv->reqHdlrLowTimeHist );
  12.395 +   freeHistExt( _VMSMasterEnv->reqHdlrHighTimeHist );
  12.396 +   #endif
  12.397 +
  12.398 +   #ifdef MEAS__TIME_MALLOC
  12.399 +   printHist( _VMSMasterEnv->mallocTimeHist   );
  12.400 +   saveHistToFile( _VMSMasterEnv->mallocTimeHist   );
  12.401 +   printHist( _VMSMasterEnv->freeTimeHist     );
  12.402 +   saveHistToFile( _VMSMasterEnv->freeTimeHist     );
  12.403 +   freeHistExt( _VMSMasterEnv->mallocTimeHist );
  12.404 +   freeHistExt( _VMSMasterEnv->freeTimeHist   );
  12.405 +   #endif
  12.406 +
  12.407 +   #ifdef MEAS__TIME_MASTER_LOCK
  12.408 +   printHist( _VMSMasterEnv->masterLockLowTimeHist );
  12.409 +   printHist( _VMSMasterEnv->masterLockHighTimeHist );
  12.410 +   #endif
  12.411 +
  12.412 +   #ifdef MEAS__TIME_MASTER
  12.413 +   printHist( _VMSMasterEnv->pluginTimeHist );
  12.414 +   #endif
  12.415 +
  12.416 +   #ifdef MEAS__TIME_STAMP_SUSP
  12.417 +   printHist( _VMSMasterEnv->pluginTimeHist );
  12.418 +   #endif
  12.419 +
  12.420 +      //Do NOT free the per-core readyToAnimateQs, master VPs or sched slots
  12.421 +      // one by one in here (and never inside a measurement section):
  12.422 +      // -- all the environment data has been allocated with VMS__malloc, so
  12.423 +      //    it disappears when the free-list's big chunk is freed below
  12.424 +      // -- VMS_int__dissipate_procr decrements numSlaves and can call
  12.425 +      //    VMS_int__shutdown, which would create new shut-down processors
  12.426 +      // -- done under two measurement flags, it would all be freed twice
  12.427 +      //The commented-out block below is kept as a record of what a
  12.428 +      // piece-by-piece cleanup would free.
  12.429 +/*
  12.430 +   readyToAnimateQs = _VMSMasterEnv->readyToAnimateQs;
  12.431 +   masterVPs        = _VMSMasterEnv->masterVPs;
  12.432 +   allSchedSlots    = _VMSMasterEnv->allSchedSlots;
  12.433 +   
  12.434 +   for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
  12.435 +    {
  12.436 +      freeVMSQ( readyToAnimateQs[ coreIdx ] );
  12.437 +         //master VPs were created external to VMS, so use external free
  12.438 +      VMS__dissipate_procr( masterVPs[ coreIdx ] );
  12.439 +      
  12.440 +      freeSchedSlots( allSchedSlots[ coreIdx ] );
  12.441 +    }
  12.442 +   
  12.443 +   VMS__free( _VMSMasterEnv->readyToAnimateQs );
  12.444 +   VMS__free( _VMSMasterEnv->masterVPs );
  12.445 +   VMS__free( _VMSMasterEnv->allSchedSlots );
  12.446 +   
  12.447 +   //============================= MEASUREMENT STUFF ========================
  12.448 +   #ifdef STATS__TURN_ON_PROBES
  12.449 +   freeDynArrayDeep( _VMSMasterEnv->dynIntervalProbesInfo, &VMS__free_probe);
  12.450 +   #endif
  12.451 +   //========================================================================
  12.452 +*/
  12.453 +      //These are the only two that use system free 
  12.454 +   VMS_ext__free_free_list( _VMSMasterEnv->freeListHead );
  12.455 +   free( (void *)_VMSMasterEnv );
  12.456 + }
  12.457 +
  12.458 +
  12.459 +//================================
  12.460 +
  12.461 +
    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/VMS_defs__HW_specific.h	Wed Feb 22 11:39:12 2012 -0800
    13.3 @@ -0,0 +1,51 @@
    13.4 +/*
    13.5 + *  Copyright 2012 OpenSourceStewardshipFoundation
    13.6 + *  Licensed under BSD
    13.7 + *
    13.8 + * Author: seanhalle@yahoo.com
    13.9 + * 
   13.10 + */
   13.11 +
   13.12 +#ifndef _VMS_HW_SPEC_DEFS_H
   13.13 +#define	_VMS_HW_SPEC_DEFS_H
   13.14 +#define _GNU_SOURCE
   13.15 +   //NOTE(review): _GNU_SOURCE only takes effect if it is seen before the
   13.16 +   // first system header -- confirm this file is included ahead of those
   13.17 +//=========================  Hardware related Constants =====================
   13.18 +   //This value is the number of hardware threads in the shared memory
   13.19 +   // machine
   13.20 +#define NUM_CORES        4
   13.21 +
   13.22 +   // tradeoff amortizing master fixed overhead vs imbalance potential
   13.23 +   // when work-stealing, can make bigger, at risk of losing cache affinity
   13.24 +#define NUM_SCHED_SLOTS  3
   13.25 +
   13.26 +#define MIN_WORK_UNIT_CYCLES 20000
   13.27 +
   13.28 +#define MASTERLOCK_RETRIES 10000
   13.29 +
   13.30 +   // stack size in virtual processors created
   13.31 +#define VIRT_PROCR_STACK_SIZE 0x8000 /* 32K */
   13.32 +
   13.33 +   // memory for VMS__malloc
   13.34 +#define MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE 0x40000000 /* 1G */
   13.35 +
   13.36 +   //Frequency of TS counts -- have to do tests to verify
   13.37 +   //NOTE: turn off (in BIOS)  TURBO-BOOST and SPEED-STEP else won't be const
   13.38 +#define TSCOUNT_FREQ 3180000000
   13.39 +
   13.40 +#define CACHE_LINE_SZ 256
   13.41 +#define PAGE_SIZE 4096
   13.42 +
   13.43 +//To prevent false-sharing, aligns a variable to a cache-line boundary.
   13.44 +//No need to use for local vars because those are never shared between cores
   13.45 +#define __align_to_cacheline__ __attribute__ ((aligned(CACHE_LINE_SZ)))
   13.46 +
   13.47 +//rounds ptr DOWN to a cache-line boundary (CACHE_LINE_SZ must be a power of
   13.48 +//2). The memory area has to contain at least CACHE_LINE_SZ bytes more than needed
   13.49 +#define __align_address(ptr) ((void*)(((uintptr_t)(ptr)) & ~((uintptr_t)CACHE_LINE_SZ - 1)))
   13.50 +
   13.51 +//===========================================================================
   13.52 +
   13.53 +#endif	/* _VMS_HW_SPEC_DEFS_H */
   13.54 +
    14.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.2 +++ b/VMS_defs__lang_specific.h	Wed Feb 22 11:39:12 2012 -0800
    14.3 @@ -0,0 +1,182 @@
    14.4 +/*
    14.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
    14.6 + *  Licensed under GNU General Public License version 2
    14.7 + *
    14.8 + * Author: seanhalle@yahoo.com
    14.9 + * 
   14.10 + */
   14.11 +
   14.12 +#ifndef _VMS_LANG_SPEC_DEFS_H
   14.13 +#define	_VMS_LANG_SPEC_DEFS_H
   14.14 +
   14.15 +
   14.16 +
   14.17 +//===================  Language-specific Measurement Stuff ===================
   14.18 +//
   14.19 +//TODO:  Figure out way to move these into language dir..
   14.20 +//   wrap them in #ifdef MEAS__...
   14.21 +//
   14.22 +#ifndef MAKE_HISTS_FOR_MEASUREMENTS
   14.23 +#define MakeTheMeasHists() 
   14.24 +#endif
   14.25 +
   14.26 +//===========================================================================
   14.27 +//VPThread
   14.28 +#ifdef VTHREAD
   14.29 +
   14.30 +#define createHistIdx      1  //note: starts at 1
   14.31 +#define mutexLockHistIdx   2
   14.32 +#define mutexUnlockHistIdx 3
   14.33 +#define condWaitHistIdx    4
   14.34 +#define condSignalHistIdx  5
   14.35 +
   14.36 +#define MakeTheMeasHists() \
   14.37 +   _VMSMasterEnv->measHistsInfo = \
   14.38 +              makePrivDynArrayOfSize( (void***)&(_VMSMasterEnv->measHists), 200); \
   14.39 +   makeAMeasHist( createHistIdx,      "create",        250, 0, 100 ) \
   14.40 +   makeAMeasHist( mutexLockHistIdx,   "mutex_lock",    50, 0, 100 ) \
   14.41 +   makeAMeasHist( mutexUnlockHistIdx, "mutex_unlock",  50, 0, 100 ) \
   14.42 +   makeAMeasHist( condWaitHistIdx,    "cond_wait",     50, 0, 100 ) \
   14.43 +   makeAMeasHist( condSignalHistIdx,  "cond_signal",   50, 0, 100 )
   14.44 +
   14.45 +   
   14.46 +#define Meas_startCreate \
   14.47 +    int32 startStamp, endStamp; \
   14.48 +    saveLowTimeStampCountInto( startStamp ); \
   14.49 +
   14.50 +#define Meas_endCreate \
   14.51 +    saveLowTimeStampCountInto( endStamp ); \
   14.52 +    addIntervalToHist( startStamp, endStamp, \
   14.53 +                                 _VMSMasterEnv->measHists[ createHistIdx ] );
   14.54 +
   14.55 +#define Meas_startMutexLock \
   14.56 +    int32 startStamp, endStamp; \
   14.57 +    saveLowTimeStampCountInto( startStamp ); \
   14.58 +
   14.59 +#define Meas_endMutexLock \
   14.60 +    saveLowTimeStampCountInto( endStamp ); \
   14.61 +    addIntervalToHist( startStamp, endStamp, \
   14.62 +                              _VMSMasterEnv->measHists[ mutexLockHistIdx ] );
   14.63 +
   14.64 +#define Meas_startMutexUnlock \
   14.65 +    int32 startStamp, endStamp; \
   14.66 +    saveLowTimeStampCountInto( startStamp ); \
   14.67 +
   14.68 +#define Meas_endMutexUnlock \
   14.69 +    saveLowTimeStampCountInto( endStamp ); \
   14.70 +    addIntervalToHist( startStamp, endStamp, \
   14.71 +                            _VMSMasterEnv->measHists[ mutexUnlockHistIdx ] );
   14.72 +
   14.73 +#define Meas_startCondWait \
   14.74 +    int32 startStamp, endStamp; \
   14.75 +    saveLowTimeStampCountInto( startStamp ); \
   14.76 +
   14.77 +#define Meas_endCondWait \
   14.78 +    saveLowTimeStampCountInto( endStamp ); \
   14.79 +    addIntervalToHist( startStamp, endStamp, \
   14.80 +                               _VMSMasterEnv->measHists[ condWaitHistIdx ] );
   14.81 +
   14.82 +#define Meas_startCondSignal \
   14.83 +    int32 startStamp, endStamp; \
   14.84 +    saveLowTimeStampCountInto( startStamp ); \
   14.85 +
   14.86 +#define Meas_endCondSignal \
   14.87 +    saveLowTimeStampCountInto( endStamp ); \
   14.88 +    addIntervalToHist( startStamp, endStamp, \
   14.89 +                             _VMSMasterEnv->measHists[ condSignalHistIdx ] );
   14.90 +
   14.91 +#endif
   14.92 +
   14.93 +
   14.94 +
   14.95 +//===========================================================================
   14.96 +//VCilk
   14.97 +
   14.98 +#ifdef VCILK
   14.99 +
  14.100 +#define spawnHistIdx      1 //note: starts at 1
  14.101 +#define syncHistIdx       2
  14.102 +
  14.103 +#define MakeTheMeasHists() \
  14.104 +   _VMSMasterEnv->measHistsInfo = \
  14.105 +          makePrivDynArrayOfSize( (void***)&(_VMSMasterEnv->measHists), 200); \
  14.106 +    makeAMeasHist( spawnHistIdx,      "Spawn",        50, 0, 200 ) \
  14.107 +    makeAMeasHist( syncHistIdx,       "Sync",         50, 0, 200 )
  14.108 +
  14.109 +
  14.110 +#define Meas_startSpawn \
  14.111 +    int32 startStamp, endStamp; \
  14.112 +    saveLowTimeStampCountInto( startStamp ); \
  14.113 +
  14.114 +#define Meas_endSpawn \
  14.115 +    saveLowTimeStampCountInto( endStamp ); \
  14.116 +    addIntervalToHist( startStamp, endStamp, \
  14.117 +                             _VMSMasterEnv->measHists[ spawnHistIdx ] );
  14.118 +
  14.119 +#define Meas_startSync \
  14.120 +    int32 startStamp, endStamp; \
  14.121 +    saveLowTimeStampCountInto( startStamp ); \
  14.122 +
  14.123 +#define Meas_endSync \
  14.124 +    saveLowTimeStampCountInto( endStamp ); \
  14.125 +    addIntervalToHist( startStamp, endStamp, \
  14.126 +                             _VMSMasterEnv->measHists[ syncHistIdx ] );
  14.127 +#endif
  14.128 +
  14.129 +//===========================================================================
  14.130 +// SSR
  14.131 +
  14.132 +#ifdef SSR
  14.133 +
  14.134 +#define SendFromToHistIdx      1 //note: starts at 1
  14.135 +#define SendOfTypeHistIdx      2
  14.136 +#define ReceiveFromToHistIdx   3
  14.137 +#define ReceiveOfTypeHistIdx   4
  14.138 +
  14.139 +#define MakeTheMeasHists() \
  14.140 +   _VMSMasterEnv->measHistsInfo = \
  14.141 +              makePrivDynArrayOfSize( (void***)&(_VMSMasterEnv->measHists), 200); \
  14.142 +    makeAMeasHist( SendFromToHistIdx,   "SendFromTo",    50, 0, 100 ) \
  14.143 +    makeAMeasHist( SendOfTypeHistIdx,   "SendOfType",    50, 0, 100 ) \
  14.144 +    makeAMeasHist( ReceiveFromToHistIdx,"ReceiveFromTo", 50, 0, 100 ) \
  14.145 +    makeAMeasHist( ReceiveOfTypeHistIdx,"ReceiveOfType", 50, 0, 100 )
  14.146 +
  14.147 +#define Meas_startSendFromTo \
  14.148 +    int32 startStamp, endStamp; \
  14.149 +    saveLowTimeStampCountInto( startStamp ); \
  14.150 +
  14.151 +#define Meas_endSendFromTo \
  14.152 +    saveLowTimeStampCountInto( endStamp ); \
  14.153 +    addIntervalToHist( startStamp, endStamp, \
  14.154 +                             _VMSMasterEnv->measHists[ SendFromToHistIdx ] );
  14.155 +
  14.156 +#define Meas_startSendOfType \
  14.157 +    int32 startStamp, endStamp; \
  14.158 +    saveLowTimeStampCountInto( startStamp ); \
  14.159 +
  14.160 +#define Meas_endSendOfType \
  14.161 +    saveLowTimeStampCountInto( endStamp ); \
  14.162 +    addIntervalToHist( startStamp, endStamp, \
  14.163 +                             _VMSMasterEnv->measHists[ SendOfTypeHistIdx ] );
  14.164 +
  14.165 +#define Meas_startReceiveFromTo \
  14.166 +    int32 startStamp, endStamp; \
  14.167 +    saveLowTimeStampCountInto( startStamp ); \
  14.168 +
  14.169 +#define Meas_endReceiveFromTo \
  14.170 +    saveLowTimeStampCountInto( endStamp ); \
  14.171 +    addIntervalToHist( startStamp, endStamp, \
  14.172 +                             _VMSMasterEnv->measHists[ ReceiveFromToHistIdx ] );
  14.173 +
  14.174 +#define Meas_startReceiveOfType \
  14.175 +    int32 startStamp, endStamp; \
  14.176 +    saveLowTimeStampCountInto( startStamp ); \
  14.177 +
  14.178 +#define Meas_endReceiveOfType \
  14.179 +    saveLowTimeStampCountInto( endStamp ); \
  14.180 +    addIntervalToHist( startStamp, endStamp, \
  14.181 +                             _VMSMasterEnv->measHists[ReceiveOfTypeHistIdx ] );
  14.182 +#endif  /* SSR */
  14.183 +
  14.184 +#endif	/* _VMS_LANG_SPEC_DEFS_H */
  14.185 +
    15.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.2 +++ b/VMS_defs__main.h	Wed Feb 22 11:39:12 2012 -0800
    15.3 @@ -0,0 +1,185 @@
    15.4 +/*
    15.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
    15.6 + *  Licensed under GNU General Public License version 2
    15.7 + *
    15.8 + * Author: seanhalle@yahoo.com
    15.9 + * 
   15.10 + */
   15.11 +
   15.12 +#ifndef _VMS_DEFS_H
   15.13 +#define	_VMS_DEFS_H
   15.14 +#define _GNU_SOURCE
   15.15 +
   15.16 +//===========================  VMS-wide defs  ===============================
   15.17 +#include "VMS_primitive_data_types.h"
   15.18 +
   15.19 +#define SUCCESS 0
   15.20 +
   15.21 +   //only after macro-expansion are the defs of writePrivQ, etc., looked up
   15.22 +   // so these defs can be at the top, and writePrivQ defined later on..
   15.23 +#define writeVMSQ     writePrivQ
   15.24 +#define readVMSQ      readPrivQ
   15.25 +#define makeVMSQ      makeVMSPrivQ
   15.26 +#define numInVMSQ     numInPrivQ
   15.27 +#define VMSQueueStruc PrivQueueStruc
   15.28 +
   15.29 +
   15.30 +//======================  Hardware Specific Defs ============================
   15.31 +#include "VMS_defs__HW_specific.h"
   15.32 +
   15.33 +//=========================  Debug Related Defs =============================
   15.34 +//
   15.35 +//When SEQUENTIAL is defined, VMS does sequential exe in the main thread
   15.36 +// It still does co-routines and all the mechanisms are the same, it just
   15.37 +// has only a single thread and animates VPs one at a time
   15.38 +//#define SEQUENTIAL
   15.39 +
   15.40 +//#define USE_WORK_STEALING
   15.41 +
   15.42 +//turns on the probe-instrumentation in the application -- when not
   15.43 +// defined, the calls to the probe functions turn into comments
   15.44 +#define STATS__ENABLE_PROBES
   15.45 +//#define TURN_ON_DEBUG_PROBES
   15.46 +
   15.47 +//These defines turn types of debug messages on and off
   15.48 +// be sure debug messages are un-commented (next block of defines)
   15.49 +#define dbgAppFlow   TRUE /* Top level flow of application code -- general*/
   15.50 +#define dbgProbes    FALSE /* for issues inside probes themselves*/
   15.51 +#define dbgB2BMaster FALSE /* in coreloop, back to back master VPs*/
   15.52 +#define dbgRqstHdlr  FALSE /* in request handler code*/
   15.53 +
   15.54 +//Comment or un-comment the substitute half to turn on/off types of debug message
   15.55 +#define DEBUG(  bool, msg)         \
   15.56 +//  if( bool){ printf(msg); fflush(stdout);}
   15.57 +#define DEBUG1( bool, msg, param)  \
   15.58 +//   if(bool){printf(msg, param); fflush(stdout);}
   15.59 +#define DEBUG2( bool, msg, p1, p2) \
   15.60 +//   if(bool) {printf(msg, p1, p2); fflush(stdout);}
   15.61 +
   15.62 +#define ERROR(msg) printf(msg);
   15.63 +#define ERROR1(msg, param) printf(msg, param); 
   15.64 +#define ERROR2(msg, p1, p2) printf(msg, p1, p2);
   15.65 +
   15.66 +//======================  Measurement Related Defs ==========================
   15.67 +//
   15.68 +//
   15.69 +   //when STATS__TURN_ON_PROBES is defined allows using probes to measure
   15.70 +   // time intervals.  The probes are macros that only compile to something
   15.71 +   // when STATS__TURN_ON_PROBES is defined.  The probes are saved in the
   15.72 +   // master env -- but only when this is defined.
   15.73 +   //The TSC probes use RDTSC instr, can be unreliable, Dbl uses gettimeofday
   15.74 +#define STATS__TURN_ON_PROBES
   15.75 +//#define STATS__USE_TSC_PROBES
   15.76 +#define STATS__USE_DBL_PROBES
   15.77 +
   15.78 +//==================  Turn Measurement Things on and off ====================
   15.79 +
   15.80 +//#define MEAS__TIME_2011_SYS
   15.81 +
   15.82 +//define this if any MEAS__... below are
   15.83 +//#define MAKE_HISTS_FOR_MEASUREMENTS
   15.84 +   //when MEAS__TIME_STAMP_SUSP is defined, causes code to be inserted and
   15.85 +   // compiled-in that saves the low part of the time stamp count just before
   15.86 +   // suspending a processor and just after resuming that processor.  It is
   15.87 +   // saved into a field added to VirtProcr.  Have to sanity-check for
   15.88 +   // rollover of low portion into high portion.
   15.89 +//#define MEAS__TIME_STAMP_SUSP
   15.90 +//#define MEAS__TIME_MASTER
   15.91 +//#define MEAS__TIME_PLUGIN
   15.92 +//#define MEAS__TIME_MALLOC
   15.93 +//#define MEAS__TIME_MASTER_LOCK
   15.94 +
   15.95 +   //For code that calculates normalization-offset between TSC counts of
   15.96 +   // different cores.
   15.97 +//#define NUM_TSC_ROUND_TRIPS 10
   15.98 +
   15.99 +
  15.100 +
  15.101 +//===================  Macros to Capture Measurements  ======================
  15.102 +//
  15.103 +//===== RDTSC wrapper ===== 
  15.104 +//Also runs with x86_64 code
  15.105 +#define saveTSCLowHigh(lowHighIn) \
  15.106 +   asm volatile("RDTSC;                   \
  15.107 +                 movl %%eax, %0;          \
  15.108 +                 movl %%edx, %1;"         \
  15.109 +   /* outputs */ : "=m" (lowHighIn.lowHigh[0]), "=m" (lowHighIn.lowHigh[1])\
  15.110 +   /* inputs  */ :                        \
  15.111 +   /* clobber */ : "%eax", "%edx"         \
  15.112 +                );
  15.113 +
  15.114 +#define saveTimeStampCountInto(low, high) \
  15.115 +   asm volatile("RDTSC;                   \
  15.116 +                 movl %%eax, %0;          \
  15.117 +                 movl %%edx, %1;"         \
  15.118 +   /* outputs */ : "=m" (low), "=m" (high)\
  15.119 +   /* inputs  */ :                        \
  15.120 +   /* clobber */ : "%eax", "%edx"         \
  15.121 +                );
  15.122 +
  15.123 +#define saveLowTimeStampCountInto(low)    \
  15.124 +   asm volatile("RDTSC;                   \
  15.125 +                 movl %%eax, %0;"         \
  15.126 +   /* outputs */ : "=m" (low)             \
  15.127 +   /* inputs  */ :                        \
  15.128 +   /* clobber */ : "%eax", "%edx"         \
  15.129 +                );
  15.130 +
  15.131 +
  15.132 +//==================  Macros define types of meas want  =====================
  15.133 +#ifdef MEAS__TIME_PLUGIN
  15.134 +
  15.135 +#define Meas_startReqHdlr \
  15.136 +        int32 startStamp1, endStamp1; \
  15.137 +        saveLowTimeStampCountInto( startStamp1 );
  15.138 +
  15.139 +#define Meas_endReqHdlr \
  15.140 +        saveLowTimeStampCountInto( endStamp1 ); \
  15.141 +        addIntervalToHist( startStamp1, endStamp1, \
  15.142 +                           _VMSMasterEnv->reqHdlrLowTimeHist ); \
  15.143 +        addIntervalToHist( startStamp1, endStamp1, \
  15.144 +                           _VMSMasterEnv->reqHdlrHighTimeHist );
  15.145 +               
  15.146 +#elif defined MEAS__TIME_2011_SYS
  15.147 +#define Meas_startMasterLoop \
  15.148 +        TSCountLowHigh startStamp1, endStamp1; \
  15.149 +        saveTSCLowHigh( endStamp1 ); \
  15.150 +        _VMSMasterEnv->cyclesTillStartMasterLoop = \
  15.151 +        endStamp1.longVal - masterVP->startSusp.longVal;
  15.152 +
  15.153 +#define Meas_startReqHdlr \
  15.154 +        saveTSCLowHigh( startStamp1 ); \
  15.155 +        _VMSMasterEnv->startReqHdlr.longVal = startStamp1.longVal;
  15.156 +
  15.157 +#define Meas_endReqHdlr 
  15.158 +
  15.159 +#define Meas_endMasterLoop \
  15.160 +        saveTSCLowHigh( startStamp1 ); \
  15.161 +        _VMSMasterEnv->endMasterLoop.longVal = startStamp1.longVal;
  15.162 +
  15.163 +#else
  15.164 +#define Meas_startMasterLoop 
  15.165 +#define Meas_startReqHdlr 
  15.166 +#define Meas_endReqHdlr 
  15.167 +#define Meas_endMasterLoop
  15.168 +#endif
  15.169 +
  15.170 +//======================  Histogram Macros -- Create ========================
  15.171 +//
  15.172 +//
  15.173 +#ifdef MAKE_HISTS_FOR_MEASUREMENTS
  15.174 +#define makeAMeasHist( idx, name, numBins, startVal, binWidth ) \
  15.175 +   makeHighestDynArrayIndexBeAtLeast( _VMSMasterEnv->measHistsInfo, idx ); \
  15.176 +   _VMSMasterEnv->measHists[idx] =  \
  15.177 +                       makeFixedBinHist( numBins, startVal, binWidth, name );
  15.178 +#else
  15.179 +#define makeAMeasHist( idx, name, numBins, startVal, binWidth )
  15.180 +#endif
  15.181 +
  15.182 +
  15.183 +#define MEAS__SUB_CREATE  /*turn on/off subtraction of create from plugin*/
  15.184 +
  15.185 +#include "VMS_defs__lang_specific.h"
  15.186 +
  15.187 +#endif	/* _VMS_DEFS_H */
  15.188 +
    16.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    16.2 +++ b/VMS_primitive_data_types.h	Wed Feb 22 11:39:12 2012 -0800
    16.3 @@ -0,0 +1,53 @@
    16.4 +/*
    16.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
    16.6 + *  Licensed under GNU General Public License version 2
    16.7 + *  
    16.8 + * Author: seanhalle@yahoo.com
    16.9 + *  
   16.10 +
   16.11 + */
   16.12 +
   16.13 +#ifndef _BLIS_PRIMITIVE_DATA_TYPES_H
   16.14 +#define	_BLIS_PRIMITIVE_DATA_TYPES_H
   16.15 +
   16.16 +
   16.17 +/*For portability, need primitive data types that have a well defined
   16.18 + * size, and well-defined layout into bytes
   16.19 + *To do this, provide BLIS standard aliases for all primitive data types
   16.20 + *These aliases must be used in all BLIS functions instead of the ANSI types
   16.21 + *
   16.22 + *These definitions will be replaced inside each specialization module
   16.23 + * according to the compiler used in that module and the hardware being
   16.24 + * specialized to.
   16.25 + */
   16.26 +/*
   16.27 +#define    int8  char
   16.28 +#define   uint8  char
   16.29 +#define    int16 short
   16.30 +#define   uint16 unsigned short
   16.31 +#define    int32 int
   16.32 +#define   uint32 unsigned int
   16.33 +#define    int64 long long
   16.34 +#define   uint64 unsigned long long
   16.35 +#define  float32 float
   16.36 +#define  float64 double
   16.37 +*/
   16.38 +typedef char               bool8;
   16.39 +typedef char               int8;
   16.40 +typedef unsigned char      uint8;  /* plain char may be signed */
   16.41 +typedef short              int16;
   16.42 +typedef unsigned short     uint16;
   16.43 +typedef int                int32;
   16.44 +typedef unsigned int       uint32;
   16.45 +typedef long long          int64;
   16.46 +typedef unsigned long long uint64;
   16.47 +typedef float              float32;
   16.48 +typedef double             float64;
   16.49 +//was "double double" (not valid C); long double width is platform-dependent
   16.50 +#define float128 long double
   16.51 +
   16.52 +#define TRUE  1
   16.53 +#define FALSE 0
   16.54 +
   16.55 +#endif	/* _BLIS_PRIMITIVE_DATA_TYPES_H */
   16.56 +
    17.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    17.2 +++ b/__brch__Common_ancestor	Wed Feb 22 11:39:12 2012 -0800
    17.3 @@ -0,0 +1,33 @@
    17.4 +A HW branch for:
    17.5 +
    17.6 +generic MultiCore machines with x86 64bit instruction set
    17.7 +
    17.8 +This branch shouldn't be used, except as a lazy fall-back.  Instead, try out other branches tuned to specific hardware platforms to find the one that performs best on your machine.  Use the "exe_time_vs_task_size" project to generate curves of overhead, and compare the results from the various branches.
    17.9 +
   17.10 +Note, if this branch is used, then NUM_CORES in the VMS_defs__HW_specific.h file has to be updated with the number of cores in your machine
   17.11 +
   17.12 +========  Background on branch naming  =========
   17.13 +
   17.14 +There are two kinds of branches: ones used to develop features, and ones tuned to particular hardware.  A given HW branch may combine features from several feature-branches, picking and choosing among them.
   17.15 +
   17.16 +After Feb 2012, branches are named by the scheme:
   17.17 +
   17.18 +feat__<feat_descr>__<HW_feat_dev_on>
   17.19 +
   17.20 +HW__<desc_of_HW_brch_tuned_for>
   17.21 +
   17.22 +where <HW_feat_dev_on> and <desc_of_HW_brch_tuned_for> follow the pattern:
   17.23 +
   17.24 +<num_socket> x <num_cores>_<Manuf>_<special_features>
   17.25 +
   17.26 +Examples:
   17.27 +
   17.28 +feat__exp_array_malloc
   17.29 +
   17.30 +feat__rand_backoff__4x10_Intel_WestmereEx
   17.31 +
   17.32 +HW__1x4_Intel_SandyBridge
   17.33 +
   17.34 +HW__4x10_Intel_WestmereEx
   17.35 +
   17.36 +HW__1x4_AMD_mobile
    18.1 --- a/__brch__DEPRECATED_README	Mon Feb 13 13:34:13 2012 -0800
    18.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    18.3 @@ -1,29 +0,0 @@
    18.4 -*DEPRECATED*  as of Feb 2012, this branch should not be used.  Too many variations of VMS for MC_shared exist.
    18.5 -
    18.6 -Instead, choose a branch that has the best implementation for the machine being run on.  For example, single-socket with 2 cores, or with 4 cores, or with 8 cores all have their own branches with code tuned to that number of cores.  AMD processors require different low-level tweaking than Intel, and so on.
    18.7 -
    18.8 -============== Background on Branch Naming ============
    18.9 -
   18.10 -There are two kinds of branchs: ones used to develop features, and ones tuned to particular hardware.  A given HW branch may combine features from several feature-branches, picking and choosing among them.
   18.11 -
   18.12 -Legacy branches, from before Feb 2012 have random names.  After Feb 2012, they're named by the scheme:
   18.13 -
   18.14 -feat__<feat_descr>__<HW_feat_dev_on>
   18.15 -
   18.16 -HW__<desc_of_HW_brch_tuned_for>
   18.17 -
   18.18 -where <HW_feat_dev_on> and <desc_of_HW_brch_tuned_for> follow the pattern:
   18.19 -
   18.20 -<num_socket> x <num_cores>_<ArchName>_<optional_special_features>
   18.21 -
   18.22 -Examples:
   18.23 -
   18.24 -feat__exp_array_malloc__generic_MC
   18.25 -
   18.26 -feat__rand_backoff__4x10_WestmereEx
   18.27 -
   18.28 -HW__1x4_SandyBridge
   18.29 -
   18.30 -HW__4x10_WestmereEx
   18.31 -
   18.32 -HW__1x4_AMD_mobile
   18.33 \ No newline at end of file
    19.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    19.2 +++ b/probes.c	Wed Feb 22 11:39:12 2012 -0800
    19.3 @@ -0,0 +1,339 @@
    19.4 +/*
    19.5 + * Copyright 2010  OpenSourceStewardshipFoundation
    19.6 + *
    19.7 + * Licensed under BSD
    19.8 + */
    19.9 +
   19.10 +#include <stdio.h>
   19.11 +#include <malloc.h>
   19.12 +#include <sys/time.h>
   19.13 +
   19.14 +#include "VMS.h"
   19.15 +
   19.16 +
   19.17 +
   19.18 +//====================  Probes =================
   19.19 +#ifdef STATS__USE_TSC_PROBES
   19.20 +
   19.21 +int32
   19.22 +VMS__create_histogram_probe( int32 numBins, float32 startValue,
   19.23 +                             float32 binWidth, char *nameStr )
   19.24 + { IntervalProbe *newProbe;
   19.25 +   int32 idx;
   19.26 +   FloatHist *hist;
   19.27 +
   19.28 +   idx = VMS__create_single_interval_probe( nameStr );
   19.29 +   newProbe =  _VMSMasterEnv->intervalProbes[ idx ];
   19.30 +
   19.31 +   hist =  makeFloatHistogram( numBins, startValue, binWidth );
   19.32 +   newProbe->hist = hist;
   19.33 +   return idx;
   19.34 + }
   19.35 +
   19.36 +void
   19.37 +VMS_impl__record_interval_start_in_probe( int32 probeID )
   19.38 + { IntervalProbe *probe;
   19.39 +
   19.40 +   probe = _VMSMasterEnv->intervalProbes[ probeID ];
   19.41 +   probe->startStamp = getTSCount();
   19.42 + }
   19.43 +
   19.44 +void
   19.45 +VMS_impl__record_interval_end_in_probe( int32 probeID )
   19.46 + { IntervalProbe *probe;
   19.47 +   TSCount endStamp;
   19.48 +
   19.49 +   endStamp = getTSCount();
   19.50 +
   19.51 +   probe = _VMSMasterEnv->intervalProbes[ probeID ];
   19.52 +   probe->endStamp = endStamp;
   19.53 +
   19.54 +   if( probe->hist != NULL )
   19.55 +    { TSCount interval = probe->endStamp - probe->startStamp;
   19.56 +         //if the interval is sane, then add to histogram
   19.57 +      if( interval < probe->hist->endOfRange * 10 )
   19.58 +         addToFloatHist( interval, probe->hist );
   19.59 +    }
   19.60 + }
   19.61 +
   19.62 +void
   19.63 +VMS_impl__print_stats_of_probe( int32 probeID )
   19.64 + { IntervalProbe *probe;
   19.65 +
   19.66 +   probe = _VMSMasterEnv->intervalProbes[ probeID ];
   19.67 +
   19.68 +   if( probe->hist == NULL )
   19.69 +    {
   19.70 +      printf("probe: %s, interval: %.6lf\n", probe->nameStr,probe->interval);
   19.71 +    }
   19.72 +
   19.73 +   else
   19.74 +    {
   19.75 +      printf( "probe: %s\n", probe->nameStr );
   19.76 +      printFloatHist( probe->hist );
   19.77 +    }
   19.78 + }
   19.79 +#else
   19.80 +
   19.81 +/*
   19.82 + * In practice, probe operations are called from the app, from inside slaves
   19.83 + *  -- so have to be sure each probe is single-VP owned, and be sure that
   19.84 + *  any place common structures are modified it's done inside the master.
   19.85 + * So -- the only place common structures are modified is during creation.
   19.86 + *  after that, all mods are to individual instances.
   19.87 + *
   19.88 + * Thinking perhaps should change the semantics to be that probes are
   19.89 + *  attached to the virtual processor -- and then everything is guaranteed
   19.90 + *  to be isolated -- except then can't take any intervals that span VPs,
   19.91 + *  and would have to transfer the probes to Master env when VP dissipates..
   19.92 + *  gets messy..
   19.93 + *
   19.94 + * For now, just making so that probe creation causes a suspend, so that
   19.95 + *  the dynamic array in the master env is only modified from the master
   19.96 + * 
   19.97 + */
   19.98 +IntervalProbe *
   19.99 +create_generic_probe( char *nameStr, SlaveVP *animPr )
  19.100 +{
  19.101 +   VMSSemReq reqData;
  19.102 +
  19.103 +   reqData.reqType  = createProbe;
  19.104 +   reqData.nameStr  = nameStr;
  19.105 +
  19.106 +   VMS_WL__send_VMSSem_request( &reqData, animPr );
  19.107 +
  19.108 +   return animPr->dataRetFromReq;
  19.109 + }
  19.110 +
  19.111 +/*Use this version from outside VMS -- it uses external malloc, and modifies
  19.112 + * dynamic array, so can't be animated in a slave VP
  19.113 + */
  19.114 +IntervalProbe *
  19.115 +ext__create_generic_probe( char *nameStr )
  19.116 + { IntervalProbe *newProbe;
  19.117 +   int32          nameLen;
  19.118 +
  19.119 +   newProbe          = malloc( sizeof(IntervalProbe) );
  19.120 +   nameLen = strlen( nameStr ) + 1; //+1 so the terminating NUL is copied too
  19.121 +   newProbe->nameStr = malloc( nameLen );
  19.122 +   memcpy( newProbe->nameStr, nameStr, nameLen ); //private, NUL-terminated copy
  19.123 +   newProbe->hist    = NULL;  //no histogram attached yet
  19.124 +   newProbe->schedChoiceWasRecorded = FALSE;
  19.125 +   newProbe->probeID =          //ID is whatever addToDynArray returns
  19.126 +             addToDynArray( newProbe, _VMSMasterEnv->dynIntervalProbesInfo );
  19.127 +
  19.128 +   return newProbe;
  19.129 + }
  19.130 +
  19.131 +
  19.132 +/*Only call from inside master or main startup/shutdown thread
  19.133 + */
  19.134 +void
  19.135 +VMS_impl__free_probe( IntervalProbe *probe )
  19.136 + { if( probe->hist != NULL )   freeDblHist( probe->hist );
  19.137 +   if( probe->nameStr != NULL) VMS_int__free( probe->nameStr );
  19.138 +   VMS_int__free( probe );
  19.139 + }
  19.140 +
  19.141 +
  19.142 +int32
  19.143 +VMS_impl__record_time_point_into_new_probe( char *nameStr, SlaveVP *animPr)
  19.144 + { IntervalProbe *newProbe;
  19.145 +   struct timeval *startStamp;
  19.146 +   float64 startSecs;
  19.147 +
  19.148 +   newProbe           = create_generic_probe( nameStr, animPr );
  19.149 +   newProbe->endSecs  = 0;
  19.150 +
  19.151 +   gettimeofday( &(newProbe->startStamp), NULL);
  19.152 +
  19.153 +      //turn into a double
  19.154 +   startStamp = &(newProbe->startStamp);
  19.155 +   startSecs = startStamp->tv_sec + ( startStamp->tv_usec / 1000000.0 );
  19.156 +   newProbe->startSecs = startSecs;
  19.157 +
  19.158 +   return newProbe->probeID;
  19.159 + }
  19.160 +
  19.161 +int32
  19.162 +VMS_ext_impl__record_time_point_into_new_probe( char *nameStr )
  19.163 + { IntervalProbe *newProbe;
  19.164 +   struct timeval *startStamp;
  19.165 +   float64 startSecs;
  19.166 +
  19.167 +   newProbe           = ext__create_generic_probe( nameStr );
  19.168 +   newProbe->endSecs  = 0;
  19.169 +
  19.170 +   gettimeofday( &(newProbe->startStamp), NULL);
  19.171 +
  19.172 +      //turn into a double
  19.173 +   startStamp = &(newProbe->startStamp);
  19.174 +   startSecs = startStamp->tv_sec + ( startStamp->tv_usec / 1000000.0 );
  19.175 +   newProbe->startSecs = startSecs;
  19.176 +
  19.177 +   return newProbe->probeID;
  19.178 + }
  19.179 +
  19.180 +int32
  19.181 +VMS_impl__create_single_interval_probe( char *nameStr, SlaveVP *animPr )
  19.182 + { IntervalProbe *newProbe;
  19.183 +
  19.184 +   newProbe = create_generic_probe( nameStr, animPr );
  19.185 +   
  19.186 +   return newProbe->probeID;
  19.187 + }
  19.188 +
  19.189 +int32
  19.190 +VMS_impl__create_histogram_probe( int32   numBins, float64    startValue,
  19.191 +               float64 binWidth, char   *nameStr, SlaveVP *animPr )
  19.192 + { IntervalProbe *newProbe;
  19.193 +   DblHist *hist;
  19.194 +
  19.195 +   newProbe = create_generic_probe( nameStr, animPr );
  19.196 +   
  19.197 +   hist =  makeDblHistogram( numBins, startValue, binWidth );
  19.198 +   newProbe->hist = hist;
  19.199 +   return newProbe->probeID;
  19.200 + }
  19.201 +
  19.202 +void
  19.203 +VMS_impl__index_probe_by_its_name( int32 probeID, SlaveVP *animPr )
  19.204 + { IntervalProbe *probe;
  19.205 +
  19.206 +   //TODO: fix this To be in Master -- race condition
  19.207 +   probe = _VMSMasterEnv->intervalProbes[ probeID ];
  19.208 +
  19.209 +   addValueIntoTable(probe->nameStr, probe, _VMSMasterEnv->probeNameHashTbl);
  19.210 + }
  19.211 +
  19.212 +IntervalProbe *
  19.213 +VMS_impl__get_probe_by_name( char *probeName, SlaveVP *animPr )
  19.214 + {
  19.215 +   //TODO: fix this To be in Master -- race condition
  19.216 +   return getValueFromTable( probeName, _VMSMasterEnv->probeNameHashTbl );
  19.217 + }
  19.218 +
  19.219 +
  19.220 +/*Everything is local to the animating procr, so no need for request, do
  19.221 + * work locally, in the anim Pr
  19.222 + */
  19.223 +void
  19.224 +VMS_impl__record_sched_choice_into_probe( int32 probeID, SlaveVP *animatingPr )
  19.225 + { IntervalProbe *probe;
  19.226 + 
  19.227 +   probe = _VMSMasterEnv->intervalProbes[ probeID ];
  19.228 +   probe->schedChoiceWasRecorded = TRUE;
  19.229 +   probe->coreNum = animatingPr->coreAnimatedBy;
  19.230 +   probe->procrID = animatingPr->procrID;
  19.231 +   probe->procrCreateSecs = animatingPr->createPtInSecs;
  19.232 + }
  19.233 +
  19.234 +/*Everything is local to the animating procr, so no need for request, do
  19.235 + * work locally, in the anim Pr
  19.236 + */
  19.237 +void
  19.238 +VMS_impl__record_interval_start_in_probe( int32 probeID )
  19.239 + { IntervalProbe *probe;
  19.240 +
  19.241 +         DEBUG( dbgProbes, "record start of interval\n" )
  19.242 +   probe = _VMSMasterEnv->intervalProbes[ probeID ];
  19.243 +   gettimeofday( &(probe->startStamp), NULL );
  19.244 + }
  19.245 +
  19.246 +
  19.247 +/*Everything is local to the animating procr, so no need for request, do
  19.248 + * work locally, in the anim Pr
  19.249 + */
  19.250 +void
  19.251 +VMS_impl__record_interval_end_in_probe( int32 probeID )
  19.252 + { IntervalProbe *probe;
  19.253 +   struct timeval *endStamp, *startStamp;
  19.254 +   float64 startSecs, endSecs;
  19.255 +
  19.256 +         DEBUG( dbgProbes, "record end of interval\n" )
  19.257 +      //possible seg-fault if array resized by diff core right after this
  19.258 +      // one gets probe..?  Something like that?  Might be safe.. don't care
  19.259 +   probe = _VMSMasterEnv->intervalProbes[ probeID ];
  19.260 +   gettimeofday( &(probe->endStamp), NULL);
  19.261 +
  19.262 +      //now turn into an interval held in a double
  19.263 +   startStamp = &(probe->startStamp);
  19.264 +   endStamp   = &(probe->endStamp);
  19.265 +
  19.266 +   startSecs = startStamp->tv_sec + ( startStamp->tv_usec / 1000000.0 );
  19.267 +   endSecs   = endStamp->tv_sec   + ( endStamp->tv_usec / 1000000.0 );
  19.268 +
  19.269 +   probe->interval  = endSecs - startSecs;
  19.270 +   probe->startSecs = startSecs;
  19.271 +   probe->endSecs   = endSecs;
  19.272 +
  19.273 +   if( probe->hist != NULL )
  19.274 +    {
  19.275 +         //if the interval is sane, then add to histogram
  19.276 +      if( probe->interval < probe->hist->endOfRange * 10 )
  19.277 +         addToDblHist( probe->interval, probe->hist );
  19.278 +    }
  19.279 + }
  19.280 +
  19.281 +void
  19.282 +print_probe_helper( IntervalProbe *probe )
  19.283 + {
  19.284 +   printf( "\nprobe: %s, ",  probe->nameStr );
  19.285 +   
  19.286 +   
  19.287 +   if( probe->schedChoiceWasRecorded )
  19.288 +    { printf( "coreNum: %d, procrID: %d, procrCreated: %0.6f | ",
  19.289 +              probe->coreNum, probe->procrID, probe->procrCreateSecs );
  19.290 +    }
  19.291 +
  19.292 +   if( probe->endSecs == 0 ) //just a single point in time
  19.293 +    {
  19.294 +      printf( " time point: %.6f\n",
  19.295 +              probe->startSecs - _VMSMasterEnv->createPtInSecs );
  19.296 +    }
  19.297 +   else if( probe->hist == NULL ) //just an interval
  19.298 +    {
  19.299 +      printf( " startSecs: %.6f interval: %.6f\n", 
  19.300 +         (probe->startSecs - _VMSMasterEnv->createPtInSecs), probe->interval);
  19.301 +    }
  19.302 +   else  //a full histogram of intervals
  19.303 +    {
  19.304 +      printDblHist( probe->hist );
  19.305 +    }
  19.306 + }
  19.307 +
  19.308 +//TODO: change this to pass around a pointer to the probe instead of its
  19.309 +// array index.  That eliminates any chance of an array resize interfering
  19.310 +// with the lookup -- even though it probably cannot actually cause problems.
  19.311 +// There is no need to pass the index around: there is a hash table for names,
  19.312 +// so the lookup is needed only once and then the pointer is in hand.  Using an
  19.313 +// enum of indexes as probe names is clunky in practice -- just hash.
  19.314 +void
  19.315 +VMS_impl__print_stats_of_probe( int32 probeID )
  19.316 + { IntervalProbe *probe;
  19.317 +
  19.318 +   probe = _VMSMasterEnv->intervalProbes[ probeID ];
  19.319 +
  19.320 +   print_probe_helper( probe );
  19.321 + }
  19.322 +
  19.323 +
  19.324 +inline void doNothing(){};
  19.325 +
  19.326 +void
  19.327 +generic_print_probe( void *_probe )
  19.328 + { 
  19.329 +   IntervalProbe *probe = (IntervalProbe *)_probe;
  19.330 +   
  19.331 +   //TODO segfault in printf
  19.332 +   //print_probe_helper( probe );
  19.333 + }
  19.334 +
  19.335 +void
  19.336 +VMS_impl__print_stats_of_all_probes()
  19.337 + {
  19.338 +   forAllInDynArrayDo( _VMSMasterEnv->dynIntervalProbesInfo,
  19.339 +                       &generic_print_probe );
  19.340 +   fflush( stdout );
  19.341 + }
  19.342 +#endif
    20.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    20.2 +++ b/probes.h	Wed Feb 22 11:39:12 2012 -0800
    20.3 @@ -0,0 +1,182 @@
    20.4 +/*
    20.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
    20.6 + *  Licensed under GNU General Public License version 2
    20.7 + *
    20.8 + * Author: seanhalle@yahoo.com
    20.9 + * 
   20.10 + */
   20.11 +
   20.12 +#ifndef _PROBES_H
   20.13 +#define	_PROBES_H
   20.14 +#define _GNU_SOURCE
   20.15 +
   20.16 +#include "VMS_primitive_data_types.h"
   20.17 +
   20.18 +#include <sys/time.h>
   20.19 +
   20.20 +/*Note on order of include files:  
   20.21 + * This file relies on #defines that appear in other files..
   20.22 + */
   20.23 +
   20.24 +
   20.25 +//typedef struct _IntervalProbe IntervalProbe; //in VMS.h
   20.26 +
   20.27 +struct _IntervalProbe
   20.28 + {
   20.29 +   char           *nameStr;
   20.30 +   int32           probeID;
   20.31 +
   20.32 +   int32           schedChoiceWasRecorded;
   20.33 +   int32           coreNum;
   20.34 +   int32           procrID;
   20.35 +   float64         procrCreateSecs;
   20.36 +
   20.37 +   #ifdef STATS__USE_TSC_PROBES
   20.38 +   TSCount    startStamp;
   20.39 +   TSCount    endStamp;
   20.40 +   #else
   20.41 +   struct timeval  startStamp;
   20.42 +   struct timeval  endStamp;
   20.43 +   #endif
   20.44 +   float64         startSecs;
   20.45 +   float64         endSecs;
   20.46 +   float64         interval;
   20.47 +   DblHist        *hist;//if NULL, then is single interval probe
   20.48 + };
   20.49 +
   20.50 +
   20.51 +
   20.52 +//======================== Probes =============================
   20.53 +//
   20.54 +// Use macros to allow turning probes off with a #define switch
   20.55 +#ifdef STATS__ENABLE_PROBES
   20.56 +int32
   20.57 +VMS_impl__record_time_point_into_new_probe( char *nameStr,SlaveVP *animPr);
   20.58 +#define VMS__record_time_point_into_new_probe( nameStr, animPr ) \
   20.59 +        VMS_impl__record_time_point_into_new_probe( nameStr, animPr ) /*same name as prototype*/
   20.60 +
   20.61 +int32
   20.62 +VMS_ext_impl__record_time_point_into_new_probe( char *nameStr );
   20.63 +#define VMS_ext__record_time_point_into_new_probe( nameStr ) \
   20.64 +        VMS_ext_impl__record_time_point_into_new_probe( nameStr )
   20.65 +
   20.66 +
   20.67 +int32
   20.68 +VMS_impl__create_single_interval_probe( char *nameStr, SlaveVP *animPr );
   20.69 +#define VMS__create_single_interval_probe( nameStr, animPr ) \
   20.70 +        VMS_impl__create_single_interval_probe( nameStr, animPr )
   20.71 +
   20.72 +
   20.73 +int32
   20.74 +VMS_impl__create_histogram_probe( int32   numBins, float64    startValue,
   20.75 +               float64 binWidth, char    *nameStr, SlaveVP *animPr );
   20.76 +#define VMS__create_histogram_probe(      numBins, startValue,              \
   20.77 +                                          binWidth, nameStr, animPr )       \
   20.78 +        VMS_impl__create_histogram_probe( numBins, startValue,              \
   20.79 +                                          binWidth, nameStr, animPr )
   20.80 +void
   20.81 +VMS_impl__free_probe( IntervalProbe *probe );
   20.82 +#define VMS__free_probe( probe ) \
   20.83 +        VMS_impl__free_probe( probe )
   20.84 +
   20.85 +void
   20.86 +VMS_impl__index_probe_by_its_name( int32 probeID, SlaveVP *animPr );
   20.87 +#define VMS__index_probe_by_its_name( probeID, animPr ) \
   20.88 +        VMS_impl__index_probe_by_its_name( probeID, animPr )
   20.89 +
   20.90 +IntervalProbe *
   20.91 +VMS_impl__get_probe_by_name( char *probeName, SlaveVP *animPr );
   20.92 +#define VMS__get_probe_by_name( probeName, animPr ) \
   20.93 +        VMS_impl__get_probe_by_name( probeName, animPr ) /*forwards both macro args*/
   20.94 +
   20.95 +void
   20.96 +VMS_impl__record_sched_choice_into_probe( int32 probeID, SlaveVP *animPr );
   20.97 +#define VMS__record_sched_choice_into_probe( probeID, animPr ) \
   20.98 +        VMS_impl__record_sched_choice_into_probe( probeID, animPr )
   20.99 +
  20.100 +void
  20.101 +VMS_impl__record_interval_start_in_probe( int32 probeID );
  20.102 +#define VMS__record_interval_start_in_probe( probeID ) \
  20.103 +        VMS_impl__record_interval_start_in_probe( probeID )
  20.104 +
  20.105 +void
  20.106 +VMS_impl__record_interval_end_in_probe( int32 probeID );
  20.107 +#define VMS__record_interval_end_in_probe( probeID ) \
  20.108 +        VMS_impl__record_interval_end_in_probe( probeID )
  20.109 +
  20.110 +void
  20.111 +VMS_impl__print_stats_of_probe( int32 probeID );
  20.112 +#define VMS__print_stats_of_probe( probeID ) \
  20.113 +        VMS_impl__print_stats_of_probe( probeID )
  20.114 +
  20.115 +void
  20.116 +VMS_impl__print_stats_of_all_probes();
  20.117 +#define VMS__print_stats_of_all_probes() \
  20.118 +        VMS_impl__print_stats_of_all_probes()
  20.119 +
  20.120 +
  20.121 +#else
  20.122 +int32
  20.123 +VMS_impl__record_time_point_into_new_probe( char *nameStr,SlaveVP *animPr);
  20.124 +#define VMS__record_time_point_into_new_probe( nameStr, animPr ) \
  20.125 +       0 /* do nothing */
  20.126 +
  20.127 +int32
  20.128 +VMS_ext_impl__record_time_point_into_new_probe( char *nameStr );
  20.129 +#define VMS_ext__record_time_point_into_new_probe( nameStr ) \
  20.130 +       0 /* do nothing */
  20.131 +
  20.132 +
  20.133 +int32
  20.134 +VMS_impl__create_single_interval_probe( char *nameStr, SlaveVP *animPr );
  20.135 +#define VMS__create_single_interval_probe( nameStr, animPr ) \
  20.136 +       0 /* do nothing */
  20.137 +
  20.138 +
  20.139 +int32
  20.140 +VMS_impl__create_histogram_probe( int32   numBins, float64    startValue,
  20.141 +               float64 binWidth, char    *nameStr, SlaveVP *animPr );
  20.142 +#define VMS__create_histogram_probe(      numBins, startValue,              \
  20.143 +                                          binWidth, nameStr, animPr )       \
  20.144 +       0 /* do nothing */
  20.145 +
  20.146 +void
  20.147 +VMS_impl__index_probe_by_its_name( int32 probeID, SlaveVP *animPr );
  20.148 +#define VMS__index_probe_by_its_name( probeID, animPr ) \
  20.149 +        /* do nothing */
  20.150 +
  20.151 +IntervalProbe *
  20.152 +VMS_impl__get_probe_by_name( char *probeName, SlaveVP *animPr );
  20.153 +#define VMS__get_probe_by_name( probeID, animPr ) \
  20.154 +       NULL /* do nothing */
  20.155 +
  20.156 +void
  20.157 +VMS_impl__record_sched_choice_into_probe( int32 probeID, SlaveVP *animPr );
  20.158 +#define VMS__record_sched_choice_into_probe( probeID, animPr ) \
  20.159 +        /* do nothing */
  20.160 +
  20.161 +void
  20.162 +VMS_impl__record_interval_start_in_probe( int32 probeID );
  20.163 +#define VMS__record_interval_start_in_probe( probeID ) \
  20.164 +        /* do nothing */
  20.165 +
  20.166 +void
  20.167 +VMS_impl__record_interval_end_in_probe( int32 probeID );
  20.168 +#define VMS__record_interval_end_in_probe( probeID ) \
  20.169 +        /* do nothing */
  20.170 +
  20.171 +inline void doNothing();
  20.172 +void
  20.173 +VMS_impl__print_stats_of_probe( int32 probeID );
  20.174 +#define VMS__print_stats_of_probe( probeID ) \
  20.175 +        doNothing/* do nothing */
  20.176 +
  20.177 +void
  20.178 +VMS_impl__print_stats_of_all_probes();
  20.179 +#define VMS__print_stats_of_all_probes \
  20.180 +        doNothing/* do nothing */
  20.181 +
  20.182 +#endif   /* defined STATS__ENABLE_PROBES */
  20.183 +
  20.184 +#endif	/* _PROBES_H */
  20.185 +
    21.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    21.2 +++ b/vmalloc.c	Wed Feb 22 11:39:12 2012 -0800
    21.3 @@ -0,0 +1,494 @@
    21.4 +/*
    21.5 + *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
    21.6 + *  Licensed under GNU General Public License version 2
    21.7 + *
    21.8 + * Author: seanhalle@yahoo.com
    21.9 + *
   21.10 + * Created on November 14, 2009, 9:07 PM
   21.11 + */
   21.12 +
   21.13 +#include <malloc.h>
   21.14 +#include <inttypes.h>
   21.15 +#include <stdlib.h>
   21.16 +#include <stdio.h>
   21.17 +
   21.18 +#include "VMS.h"
   21.19 +#include "C_Libraries/Histogram/Histogram.h"
   21.20 +
   21.21 +/*Helper function
   21.22 + *Insert a newly generated free chunk into the first spot on the free list.
   21.23 + * The chunk is cast as a MallocProlog, so the various pointers in it are
   21.24 + * accessed with C's help -- and the size of the prolog is easily added to
   21.25 + * the pointer when a chunk is returned to the app -- so C handles changes
   21.26 + * in pointer sizes among machines.
   21.27 + *
   21.28 + *The list head is a normal MallocProlog struct -- identified by its
   21.29 + * prevChunkInFreeList being NULL -- the only one.
   21.30 + *
   21.31 + *The end of the list is identified by next chunk being NULL, as usual.
   21.32 + */
   21.33 +void inline
   21.34 +add_chunk_to_free_list( MallocProlog *chunk, MallocProlog *listHead )
   21.35 + { 
   21.36 +   chunk->nextChunkInFreeList     = listHead->nextChunkInFreeList;
   21.37 +   if( chunk->nextChunkInFreeList != NULL ) //if not last in free list
   21.38 +      chunk->nextChunkInFreeList->prevChunkInFreeList = chunk;
   21.39 +   chunk->prevChunkInFreeList     = listHead;
   21.40 +   listHead->nextChunkInFreeList  = chunk;
   21.41 + }
   21.42 +
   21.43 +
   21.44 +/*This is sequential code, meant to only be called from the Master, not from
   21.45 + * any slave VPs.
   21.46 + *Search down list, checking size by the nextHigherInMem pointer, to find
   21.47 + * first chunk bigger than size needed.
   21.48 + *Shave off the extra and make it into a new free-list element, hook it in
   21.49 + * then return the address of the found element plus size of prolog.
   21.50 + *
   21.51 + */
   21.52 +void *VMS_int__malloc( size_t sizeRequested )
   21.53 + { MallocProlog *foundElem = NULL, *currElem, *newElem;
   21.54 +   ssize_t        amountExtra, sizeConsumed,sizeOfFound;
   21.55 +   uint32        foundElemIsTopOfHeap;
   21.56 +
   21.57 +   //============================= MEASUREMENT STUFF ========================
   21.58 +   #ifdef MEAS__TIME_MALLOC
   21.59 +   int32 startStamp, endStamp;
   21.60 +   saveLowTimeStampCountInto( startStamp );
   21.61 +   #endif
   21.62 +   //========================================================================
   21.63 +   
   21.64 +      //step up the size to be aligned at 16-byte boundary, prob better ways
   21.65 +   sizeRequested = (sizeRequested + 16) & ~15;
   21.66 +   currElem = (_VMSMasterEnv->freeListHead)->nextChunkInFreeList;
   21.67 +
   21.68 +   while( currElem != NULL )
   21.69 +    {    //check if size of currElem is big enough
   21.70 +      sizeOfFound=(size_t)((uintptr_t)currElem->nextHigherInMem -(uintptr_t)currElem);
   21.71 +      amountExtra = sizeOfFound - sizeRequested - sizeof(MallocProlog);
   21.72 +      if( amountExtra > 0 )
   21.73 +       {    //found it, get out of loop
   21.74 +         foundElem = currElem;
   21.75 +         currElem = NULL;
   21.76 +       }
   21.77 +      else
   21.78 +         currElem = currElem->nextChunkInFreeList;
   21.79 +    }
   21.80 +   
   21.81 +   if( foundElem == NULL )
   21.82 +    { ERROR("\nmalloc failed\n")
   21.83 +      return (void *)NULL;  //indicates malloc failed
   21.84 +    }
   21.85 +      //Using a kludge to identify the element that is the top chunk in the
   21.86 +      // heap -- saving top-of-heap addr in head's nextHigherInMem -- and
   21.87 +      // save addr of start of heap in head's nextLowerInMem
   21.88 +      //Will handle top of Heap specially
   21.89 +   foundElemIsTopOfHeap = foundElem->nextHigherInMem ==
   21.90 +                          _VMSMasterEnv->freeListHead->nextHigherInMem;
   21.91 +   
   21.92 +      //before shave off and try to insert new elem, remove found elem
   21.93 +      //note, foundElem will never be the head, so always has valid prevChunk
   21.94 +   foundElem->prevChunkInFreeList->nextChunkInFreeList =
   21.95 +                                              foundElem->nextChunkInFreeList;
   21.96 +   if( foundElem->nextChunkInFreeList != NULL )
   21.97 +    { foundElem->nextChunkInFreeList->prevChunkInFreeList =
   21.98 +                                              foundElem->prevChunkInFreeList;
   21.99 +    }
  21.100 +   foundElem->prevChunkInFreeList = NULL;//indicates elem currently allocated
  21.101 +   
  21.102 +      //if enough, turn extra into new elem & insert it
  21.103 +   if( amountExtra > 64 )
  21.104 +    {   //make new elem by adding to addr of curr elem then casting
  21.105 +        sizeConsumed = sizeof(MallocProlog) + sizeRequested; 
  21.106 +        newElem = (MallocProlog *)( (uintptr_t)foundElem + sizeConsumed );
  21.107 +        newElem->nextLowerInMem    = foundElem; //This is evil (but why?) 
  21.108 +        newElem->nextHigherInMem   = foundElem->nextHigherInMem; //This is evil (but why?)
  21.109 +        foundElem->nextHigherInMem = newElem;
  21.110 +        if( ! foundElemIsTopOfHeap )
  21.111 +        {  //there is no next higher for top of heap, so can't write to it
  21.112 +           newElem->nextHigherInMem->nextLowerInMem = newElem;
  21.113 +        }
  21.114 +        add_chunk_to_free_list( newElem, _VMSMasterEnv->freeListHead );
  21.115 +    }
  21.116 +   else
  21.117 +    {
  21.118 +      sizeConsumed = sizeOfFound;
  21.119 +    }
  21.120 +  _VMSMasterEnv->amtOfOutstandingMem += sizeConsumed;
  21.121 +
  21.122 +   //============================= MEASUREMENT STUFF ========================
  21.123 +   #ifdef MEAS__TIME_MALLOC
  21.124 +   saveLowTimeStampCountInto( endStamp );
  21.125 +   addIntervalToHist( startStamp, endStamp, _VMSMasterEnv->mallocTimeHist );
  21.126 +   #endif
  21.127 +   //========================================================================
  21.128 +
  21.129 +      //skip over the prolog by adding its size to the pointer return
  21.130 +   return (void*)((uintptr_t)foundElem + sizeof(MallocProlog));
  21.131 + }
  21.132 +
  21.133 +/*This is sequential code, meant to only be called from the Master, not from
  21.134 + * any slave VPs.
  21.135 + *Search down list, checking size by the nextHigherInMem pointer, to find
  21.136 + * first chunk bigger than size needed.
  21.137 + *Shave off the extra and make it into a new free-list element, hook it in
  21.138 + * then return the address of the found element plus size of prolog.
  21.139 + *
  21.140 + * The difference from the regular malloc is that every returned address is
  21.141 + * aligned to CACHE_LINE_SZ and every size is padded to a multiple of it.  A free
  21.142 + * chunk that is not aligned is split, leaving a chunk before the aligned one.
  21.143 + */
  21.144 +void *VMS_int__malloc_aligned( size_t sizeRequested )
  21.145 + { MallocProlog *foundElem = NULL, *currElem, *newElem;
  21.146 +   ssize_t        amountExtra, sizeConsumed,sizeOfFound,prevAmount;
  21.147 +   uint32        foundElemIsTopOfHeap;
  21.148 +
  21.149 +   //============================= MEASUREMENT STUFF ========================
  21.150 +   #ifdef MEAS__TIME_MALLOC
  21.151 +   uint32 startStamp, endStamp;
  21.152 +   saveLowTimeStampCountInto( startStamp );
  21.153 +   #endif
  21.154 +   //========================================================================
  21.155 +   
  21.156 +      //step up the size to be multiple of the cache line size
  21.157 +   sizeRequested = (sizeRequested + CACHE_LINE_SZ) & ~(CACHE_LINE_SZ-1);
  21.158 +   currElem = (_VMSMasterEnv->freeListHead)->nextChunkInFreeList;
  21.159 +
  21.160 +   while( currElem != NULL )
  21.161 +    {    //check if size of currElem is big enough
  21.162 +      sizeOfFound=(size_t)((uintptr_t)currElem->nextHigherInMem -(uintptr_t)currElem);
  21.163 +      amountExtra = sizeOfFound - sizeRequested - sizeof(MallocProlog);
  21.164 +      if( amountExtra > 0 )
  21.165 +       {    
  21.166 +         //look if the found element is already aligned
  21.167 +         if((((uintptr_t)currElem+sizeof(MallocProlog)) & (uintptr_t)(CACHE_LINE_SZ-1)) == 0){
  21.168 +             //found it, get out of loop
  21.169 +             foundElem = currElem;
  21.170 +             break;
  21.171 +         }else{
  21.172 +             //find first aligned address and check if it's still big enough
  21.173 +             //check also if the space before the aligned address is big enough
  21.174 +             //for a new element
  21.175 +             void *firstAlignedAddr = (void*)(((uintptr_t)currElem + 2*CACHE_LINE_SZ) & ~((uintptr_t)(CACHE_LINE_SZ-1)));
  21.176 +             prevAmount = (uintptr_t)firstAlignedAddr - (uintptr_t)currElem;
  21.177 +             sizeOfFound=(uintptr_t)currElem->nextHigherInMem -(uintptr_t)firstAlignedAddr + sizeof(MallocProlog);
  21.178 +             amountExtra= sizeOfFound - sizeRequested - sizeof(MallocProlog);
  21.179 +             if(prevAmount > 2*sizeof(MallocProlog) && amountExtra > 0 ){
  21.180 +                 //found suitable element
  21.181 +                 //create new previous element and exit loop
  21.182 +                 MallocProlog *newAlignedElem = (MallocProlog*)firstAlignedAddr - 1;
  21.183 +                 
  21.184 +                 //insert new element into free list
  21.185 +                 if(currElem->nextChunkInFreeList != NULL)
  21.186 +                     currElem->nextChunkInFreeList->prevChunkInFreeList = newAlignedElem;                     
  21.187 +                 newAlignedElem->prevChunkInFreeList = currElem;
  21.188 +                 newAlignedElem->nextChunkInFreeList = currElem->nextChunkInFreeList;
  21.189 +                 currElem->nextChunkInFreeList = newAlignedElem;
  21.190 +                 
  21.191 +                 //set higherInMem and lowerInMem
  21.192 +                 newAlignedElem->nextHigherInMem = currElem->nextHigherInMem;
  21.193 +                 foundElemIsTopOfHeap = currElem->nextHigherInMem ==
  21.194 +                          _VMSMasterEnv->freeListHead->nextHigherInMem;
  21.195 +                 if(!foundElemIsTopOfHeap)
  21.196 +                     currElem->nextHigherInMem->nextLowerInMem = newAlignedElem;
  21.197 +                 currElem->nextHigherInMem = newAlignedElem;
  21.198 +                 newAlignedElem->nextLowerInMem = currElem;
  21.199 +                 
  21.200 +                 //Found new element leaving loop
  21.201 +                 foundElem = newAlignedElem;
  21.202 +                 break;
  21.203 +             }
  21.204 +         }
  21.205 +         
  21.206 +       }
  21.207 +       currElem = currElem->nextChunkInFreeList;
  21.208 +    }
  21.209 +
  21.210 +   if( foundElem == NULL )
  21.211 +    { ERROR("\nmalloc failed\n")
  21.212 +      return (void *)NULL;  //indicates malloc failed
  21.213 +    }
  21.214 +      //Using a kludge to identify the element that is the top chunk in the
  21.215 +      // heap -- saving top-of-heap addr in head's nextHigherInMem -- and
  21.216 +      // save addr of start of heap in head's nextLowerInMem
  21.217 +      //Will handle top of Heap specially
  21.218 +   foundElemIsTopOfHeap = foundElem->nextHigherInMem ==
  21.219 +                          _VMSMasterEnv->freeListHead->nextHigherInMem;
  21.220 +
  21.221 +      //before shave off and try to insert new elem, remove found elem
  21.222 +      //note, foundElem will never be the head, so always has valid prevChunk
  21.223 +   foundElem->prevChunkInFreeList->nextChunkInFreeList =
  21.224 +                                              foundElem->nextChunkInFreeList;
  21.225 +   if( foundElem->nextChunkInFreeList != NULL )
  21.226 +    { foundElem->nextChunkInFreeList->prevChunkInFreeList =
  21.227 +                                              foundElem->prevChunkInFreeList;
  21.228 +    }
  21.229 +   foundElem->prevChunkInFreeList = NULL;//indicates elem currently allocated
  21.230 +   
  21.231 +      //if enough, turn extra into new elem & insert it
  21.232 +   if( amountExtra > 64 )
  21.233 +    {    //make new elem by adding to addr of curr elem then casting
  21.234 +      sizeConsumed = sizeof(MallocProlog) + sizeRequested;
  21.235 +      newElem = (MallocProlog *)( (uintptr_t)foundElem + sizeConsumed );
  21.236 +      newElem->nextHigherInMem   = foundElem->nextHigherInMem;
  21.237 +      newElem->nextLowerInMem    = foundElem;
  21.238 +      foundElem->nextHigherInMem = newElem;
  21.239 +      
  21.240 +      if( ! foundElemIsTopOfHeap )
  21.241 +       {    //there is no next higher for top of heap, so can't write to it
  21.242 +         newElem->nextHigherInMem->nextLowerInMem = newElem;
  21.243 +       }
  21.244 +      add_chunk_to_free_list( newElem, _VMSMasterEnv->freeListHead );
  21.245 +    }
  21.246 +   else
  21.247 +    {
  21.248 +      sizeConsumed = sizeOfFound;
  21.249 +    }
  21.250 +  _VMSMasterEnv->amtOfOutstandingMem += sizeConsumed;
  21.251 +
  21.252 +   //============================= MEASUREMENT STUFF ========================
  21.253 +   #ifdef MEAS__TIME_MALLOC
  21.254 +   saveLowTimeStampCountInto( endStamp );
  21.255 +   addIntervalToHist( startStamp, endStamp, _VMSMasterEnv->mallocTimeHist );
  21.256 +   #endif
  21.257 +   //========================================================================
  21.258 +
  21.259 +      //skip over the prolog by adding its size to the pointer return
  21.260 +   return (void*)((uintptr_t)foundElem + sizeof(MallocProlog));
  21.261 + }
  21.262 +
  21.263 +
  21.264 +/*This is sequential code -- only to be called from the Master
  21.265 + * When free, subtract the size of prolog from pointer, then cast it to a
  21.266 + * MallocProlog.  Then check the nextLower and nextHigher chunks to see if
  21.267 + * one or both are also free, and coalesce if so, and if neither free, then
  21.268 + * add this one to free-list.
  21.269 + */
  21.270 +void
  21.271 +VMS_int__free( void *ptrToFree )
  21.272 + { MallocProlog *elemToFree, *nextLowerElem, *nextHigherElem;
  21.273 +   size_t         sizeOfElem;
  21.274 +   uint32         lowerExistsAndIsFree, higherExistsAndIsFree;
  21.275 +
  21.276 +   //============================= MEASUREMENT STUFF ========================
  21.277 +   #ifdef MEAS__TIME_MALLOC
  21.278 +   int32 startStamp, endStamp;
  21.279 +   saveLowTimeStampCountInto( startStamp );
  21.280 +   #endif
  21.281 +   //========================================================================
  21.282 +
  21.283 +   if( ptrToFree < (void*)_VMSMasterEnv->freeListHead->nextLowerInMem ||
  21.284 +       ptrToFree > (void*)_VMSMasterEnv->freeListHead->nextHigherInMem )
  21.285 +    {    //outside the range of data owned by VMS's malloc, so do nothing
  21.286 +      return;
  21.287 +    }
  21.288 +      //subtract size of prolog to get pointer to prolog, then cast
  21.289 +   elemToFree = (MallocProlog *)((uintptr_t)ptrToFree - sizeof(MallocProlog));
  21.290 +   sizeOfElem =(size_t)((uintptr_t)elemToFree->nextHigherInMem-(uintptr_t)elemToFree);
  21.291 +
  21.292 +   if( elemToFree->prevChunkInFreeList != NULL )
  21.293 +    { printf( "error: freeing same element twice!" ); exit(1);
  21.294 +    }
  21.295 +
  21.296 +   _VMSMasterEnv->amtOfOutstandingMem -= sizeOfElem;
  21.297 +
  21.298 +   nextLowerElem  = elemToFree->nextLowerInMem;
  21.299 +   nextHigherElem = elemToFree->nextHigherInMem;
  21.300 +
  21.301 +   if( nextHigherElem == NULL )
  21.302 +      higherExistsAndIsFree = FALSE;
  21.303 +   else //okay exists, now check if in the free-list by checking back ptr
  21.304 +      higherExistsAndIsFree = (nextHigherElem->prevChunkInFreeList != NULL);
  21.305 +    
  21.306 +   if( nextLowerElem == NULL )
  21.307 +      lowerExistsAndIsFree = FALSE;
  21.308 +   else //okay, it exists, now check if it's free
  21.309 +      lowerExistsAndIsFree = (nextLowerElem->prevChunkInFreeList != NULL);
  21.310 +    
  21.311 +
  21.312 +      //now, know what exists and what's free
  21.313 +   if( lowerExistsAndIsFree )
  21.314 +    { if( higherExistsAndIsFree )
  21.315 +       {    //both exist and are free, so coalesce all three
  21.316 +            //First, remove higher from free-list
  21.317 +         nextHigherElem->prevChunkInFreeList->nextChunkInFreeList =
  21.318 +                                         nextHigherElem->nextChunkInFreeList;
  21.319 +         if( nextHigherElem->nextChunkInFreeList != NULL ) //end-of-list?
  21.320 +            nextHigherElem->nextChunkInFreeList->prevChunkInFreeList =
  21.321 +                                         nextHigherElem->prevChunkInFreeList;
  21.322 +            //Now, fix-up sequence-in-mem list -- by side-effect, this also
  21.323 +            // changes size of the lower elem, which is still in free-list
  21.324 +         nextLowerElem->nextHigherInMem = nextHigherElem->nextHigherInMem;
  21.325 +         if( nextHigherElem->nextHigherInMem !=
  21.326 +             _VMSMasterEnv->freeListHead->nextHigherInMem )
  21.327 +            nextHigherElem->nextHigherInMem->nextLowerInMem = nextLowerElem;
  21.328 +            //notice didn't do anything to elemToFree -- it simply is no
  21.329 +            // longer reachable from any of the lists.  Wonder if could be a
  21.330 +            // security leak because left valid addresses in it,
  21.331 +            // but don't care for now.
  21.332 +       }
  21.333 +      else
  21.334 +       {    //lower is the only of the two that exists and is free,
  21.335 +            //In this case, no adjustment to free-list, just change mem-list.
  21.336 +            // By side-effect, changes size of the lower elem
  21.337 +         nextLowerElem->nextHigherInMem = elemToFree->nextHigherInMem;
  21.338 +         if( elemToFree->nextHigherInMem !=
  21.339 +             _VMSMasterEnv->freeListHead->nextHigherInMem )
  21.340 +            elemToFree->nextHigherInMem->nextLowerInMem = nextLowerElem;
  21.341 +       }
  21.342 +    }
  21.343 +   else
  21.344 +    {    //lower either doesn't exist or isn't free, so check higher
  21.345 +      if( higherExistsAndIsFree )
  21.346 +       {    //higher exists and is the only of the two free
  21.347 +            //First, in free-list, replace higher elem with the one to free
  21.348 +         elemToFree->nextChunkInFreeList=nextHigherElem->nextChunkInFreeList;
  21.349 +         elemToFree->prevChunkInFreeList=nextHigherElem->prevChunkInFreeList;
  21.350 +         elemToFree->prevChunkInFreeList->nextChunkInFreeList = elemToFree;
  21.351 +         if( elemToFree->nextChunkInFreeList != NULL ) // end-of-list?
  21.352 +            elemToFree->nextChunkInFreeList->prevChunkInFreeList =elemToFree;
  21.353 +            //Now chg mem-list. By side-effect, changes size of elemToFree
  21.354 +         elemToFree->nextHigherInMem = nextHigherElem->nextHigherInMem;
  21.355 +         if( elemToFree->nextHigherInMem !=
  21.356 +             _VMSMasterEnv->freeListHead->nextHigherInMem )
  21.357 +            elemToFree->nextHigherInMem->nextLowerInMem = elemToFree;
  21.358 +       }
  21.359 +      else
  21.360 +       {    //neither lower nor higher is available to coalesce so add to list
  21.361 +            // this makes prev chunk ptr non-null, which indicates it's free
  21.362 +         elemToFree->nextChunkInFreeList =
  21.363 +                            _VMSMasterEnv->freeListHead->nextChunkInFreeList;
  21.364 +         _VMSMasterEnv->freeListHead->nextChunkInFreeList = elemToFree;
  21.365 +         if( elemToFree->nextChunkInFreeList != NULL ) // end-of-list?
  21.366 +            elemToFree->nextChunkInFreeList->prevChunkInFreeList =elemToFree;
  21.367 +         elemToFree->prevChunkInFreeList = _VMSMasterEnv->freeListHead;
  21.368 +       }
  21.369 +    }
  21.370 +   //============================= MEASUREMENT STUFF ========================
  21.371 +   #ifdef MEAS__TIME_MALLOC
  21.372 +   saveLowTimeStampCountInto( endStamp );
  21.373 +   addIntervalToHist( startStamp, endStamp, _VMSMasterEnv->freeTimeHist );
  21.374 +   #endif
  21.375 +   //========================================================================
  21.376 +
  21.377 + }
  21.378 +
  21.379 +
  21.380 +/*Allocates memory from the external system -- higher overhead
  21.381 + *
  21.382 + *Because of Linux's malloc throwing bizarre random faults when malloc is
  21.383 + * used inside a VMS virtual processor, have to pass this as a request and
  21.384 + * have the core loop do it when it gets around to it -- will look for these
  21.385 + * chores leftover from the previous animation of masterVP the next time it
  21.386 + * goes to animate the masterVP -- so it takes two separate masterVP
  21.387 + * animations, separated by work, to complete an external malloc or
  21.388 + * external free request.
  21.389 + *
  21.390 + *Thinking core loop accepts signals -- just looks if signal-location is
  21.391 + * empty or not --
  21.392 + */
  21.393 +void *
  21.394 +VMS__malloc_in_ext( size_t sizeRequested )
  21.395 + {
  21.396 + /*
  21.397 +      //This is running in the master, so no chance for multiple cores to be
  21.398 +      // competing for the core's flag.
  21.399 +   if(  *(_VMSMasterEnv->coreLoopSignalAddr[ 0 ]) != 0 )
  21.400 +    {    //something has already signalled to core loop, so save the signal
  21.401 +         // and look, next time master animated, to see if can send it.
  21.402 +         //Note, the addr to put a signal is in the coreloop's frame, so just
  21.403 +         // checks it each time through -- make it volatile to avoid GCC
  21.404 +         // optimizations -- it's a coreloop local var that only changes
  21.405 +         // after jumping away.  The signal includes the addr to send the
  21.406 +         //return to -- even if just empty return completion-signal
  21.407 +         //
  21.408 +         //save the signal in some queue that the master looks at each time
  21.409 +         // it starts up -- one loc says if empty for fast common case --
  21.410 +         //something like that -- want to hide this inside this call -- but
  21.411 +         // think this has to come as a request -- req handler gives procr
  21.412 +         // back to master loop, which gives it back to req handler at point
  21.413 +         // it sees that core loop has sent return signal.  Something like
  21.414 +         // that.
  21.415 +      saveTheSignal
  21.416 +
  21.417 +    }
  21.418 +  coreSigData->type = malloc;
  21.419 +  coreSigData->sizeToMalloc = sizeRequested;
  21.420 +  coreSigData->locToSignalCompletion = &figureOut;
  21.421 +   _VMSMasterEnv->coreLoopSignals[ 0 ] = coreSigData;
  21.422 +  */
  21.423 +      //just risk system-stack faults until get this figured out
  21.424 +   return malloc( sizeRequested );
  21.425 + }
  21.426 +
  21.427 +
  21.428 +/*Frees memory that was allocated in the external system -- higher overhead
  21.429 + *
  21.430 + *As noted in external malloc comment, this is clunky 'cause the free has
  21.431 + * to be called in the core loop.
  21.432 + */
  21.433 +void
  21.434 +VMS__free_in_ext( void *ptrToFree )
  21.435 + {
  21.436 +      //just risk system-stack faults until get this figured out
  21.437 +   free( ptrToFree );
  21.438 +
  21.439 +      //TODO: fix this -- so 
  21.440 + }
  21.441 +
  21.442 +
  21.443 +/*Designed to be called from the main thread outside of VMS, during init
  21.444 + */
  21.445 +MallocProlog *
  21.446 +VMS_ext__create_free_list()
  21.447 + { MallocProlog *freeListHead, *firstChunk;
  21.448 +
  21.449 +      //Note, this is running in the main thread -- all increases in malloc
  21.450 +      // mem and all frees of it must be done in this thread, with the
  21.451 +      // thread's original stack available
  21.452 +   freeListHead = malloc( sizeof(MallocProlog) );
  21.453 +   firstChunk   = malloc( MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE );
  21.454 +   if( firstChunk == NULL ) {printf("malloc error\n"); exit(1);}
  21.455 +   
  21.456 +   //Touch memory to avoid page faults
  21.457 +   void *ptr,*endPtr; 
  21.458 +   endPtr = (void*)firstChunk+MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE;
  21.459 +   for(ptr = firstChunk; ptr < endPtr; ptr+=PAGE_SIZE)
  21.460 +   {
  21.461 +       *(char*)ptr = 0;
  21.462 +   }
  21.463 +
  21.464 +   freeListHead->prevChunkInFreeList = NULL;
  21.465 +      //Use this addr to free the heap when cleanup
  21.466 +   freeListHead->nextLowerInMem      = firstChunk;
  21.467 +      //to identify top-of-heap elem, compare this addr to elem's next higher
  21.468 +   freeListHead->nextHigherInMem     = (void*)( (uintptr_t)firstChunk +
  21.469 +                                         MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE);
  21.470 +   freeListHead->nextChunkInFreeList = firstChunk;
  21.471 +
  21.472 +   firstChunk->nextChunkInFreeList   = NULL;
  21.473 +   firstChunk->prevChunkInFreeList   = freeListHead;
  21.474 +      //next Higher has to be set to top of chunk, so can calc size in malloc
  21.475 +   firstChunk->nextHigherInMem       = (void*)( (uintptr_t)firstChunk +
  21.476 +                                         MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE);
  21.477 +   firstChunk->nextLowerInMem        = NULL; //identifies as bott of heap
  21.478 +   
  21.479 +   _VMSMasterEnv->amtOfOutstandingMem = 0; //none allocated yet
  21.480 +
  21.481 +   return freeListHead;
  21.482 + }
  21.483 +
  21.484 +
  21.485 +/*Designed to be called from the main thread outside of VMS, during cleanup
  21.486 + */
  21.487 +void
  21.488 +VMS_ext__free_free_list( MallocProlog *freeListHead )
  21.489 + {    
  21.490 +      //stashed a ptr to the one and only bug chunk malloc'd from OS in the
  21.491 +      // free list head's next lower in mem pointer
  21.492 +   free( freeListHead->nextLowerInMem );
  21.493 +
  21.494 +   //don't free the head -- it'll be in an array eventually -- free whole
  21.495 +   // array when all the free lists linked from it have already been freed
  21.496 + }
  21.497 +
    22.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    22.2 +++ b/vmalloc.h	Wed Feb 22 11:39:12 2012 -0800
    22.3 @@ -0,0 +1,90 @@
    22.4 +/*
    22.5 + *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
    22.6 + *  Licensed under GNU General Public License version 2
    22.7 + *
    22.8 + * Author: seanhalle@yahoo.com
    22.9 + *
   22.10 + * Created on November 14, 2009, 9:07 PM
   22.11 + */
   22.12 +
   22.13 +#ifndef _VMALLOC_H
   22.14 +#define	_VMALLOC_H
   22.15 +
   22.16 +#include <malloc.h>
   22.17 +#include <inttypes.h>
   22.18 +#include "VMS_primitive_data_types.h"
   22.19 +
   22.20 +typedef struct _MallocProlog MallocProlog;
   22.21 +
   22.22 +struct _MallocProlog
   22.23 + {
   22.24 +   MallocProlog *nextChunkInFreeList;
   22.25 +   MallocProlog *prevChunkInFreeList;
   22.26 +   MallocProlog *nextHigherInMem;
   22.27 +   MallocProlog *nextLowerInMem;
   22.28 + };
   22.29 +//MallocProlog
   22.30 +
   22.31 +typedef struct
   22.32 + {
   22.33 +   MallocProlog *firstChunkInFreeList;
   22.34 +   int32         numInList; //TODO not used
   22.35 + }
   22.36 +FreeListHead;
   22.37 +
   22.38 +void *
   22.39 +VMS_int__malloc( size_t sizeRequested );
   22.40 +
   22.41 +void *
   22.42 +VMS_int__malloc_aligned( size_t sizeRequested );
   22.43 +
   22.44 +void
   22.45 +VMS_int__free( void *ptrToFree );
   22.46 +
   22.47 +#define VMS_PI__malloc VMS_int__malloc
   22.48 +#define VMS_PI__malloc_aligned VMS_int__malloc_aligned
   22.49 +#define VMS_PI__free VMS_int__free
   22.50 +/* For now, the PI is protected by master lock, so int malloc fine
   22.51 +void *
   22.52 +VMS_PI__malloc( size_t sizeRequested );
   22.53 +
   22.54 +void *
   22.55 +VMS_PI__malloc_aligned( size_t sizeRequested );
   22.56 +
   22.57 +void
   22.58 +VMS_PI__free( void *ptrToFree );
   22.59 +*/
   22.60 +
   22.61 +//TODO: protect WL malloc from concurrency!! shared freelist can be corrupted
   22.62 +#define VMS_WL__malloc VMS_int__malloc
   22.63 +#define VMS_WL__malloc_aligned VMS_int__malloc_aligned
   22.64 +#define VMS_WL__free VMS_int__free
   22.65 +/*
   22.66 +void *
   22.67 +VMS_WL__malloc( size_t sizeRequested );
   22.68 +
   22.69 +void *
   22.70 +VMS_WL__malloc_aligned( size_t sizeRequested );
   22.71 +
   22.72 +void
   22.73 +VMS_WL__free( void *ptrToFree );
   22.74 +*/
   22.75 +
   22.76 +/*Allocates memory from the external system -- higher overhead
   22.77 + */
   22.78 +void *
   22.79 +VMS__malloc_in_ext( size_t sizeRequested );
   22.80 +
   22.81 +/*Frees memory that was allocated in the external system -- higher overhead
   22.82 + */
   22.83 +void
   22.84 +VMS__free_in_ext( void *ptrToFree );
   22.85 +
   22.86 +
   22.87 +MallocProlog *
   22.88 +VMS_ext__create_free_list();
   22.89 +
   22.90 +void
   22.91 +VMS_ext__free_free_list( MallocProlog *freeListHead );
   22.92 +
   22.93 +#endif
   22.94 \ No newline at end of file
    23.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    23.2 +++ b/vutilities.c	Wed Feb 22 11:39:12 2012 -0800
    23.3 @@ -0,0 +1,25 @@
    23.4 +/*
    23.5 + *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
    23.6 + *  Licensed under GNU General Public License version 2
    23.7 + *
    23.8 + * Author: seanhalle@yahoo.com
    23.9 + *
   23.10 + * Created on November 14, 2009, 9:07 PM
   23.11 + */
   23.12 +
   23.13 +#include <malloc.h>
   23.14 +#include <stdlib.h>
   23.15 +
   23.16 +#include "VMS.h"
   23.17 +
   23.18 +
   23.19 +inline char *
   23.20 +VMS_int__strDup( char *str )
   23.21 + { char *retStr;
   23.22 +
   23.23 +   retStr = VMS_int__malloc( strlen(str) + 1 );
   23.24 +   if( str == NULL ) return str;
   23.25 +   strcpy( retStr, str );
   23.26 +
   23.27 +   return retStr;
   23.28 + }
    24.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    24.2 +++ b/vutilities.h	Wed Feb 22 11:39:12 2012 -0800
    24.3 @@ -0,0 +1,20 @@
    24.4 +/*
    24.5 + *  Copyright 2009 OpenSourceCodeStewardshipFoundation.org
    24.6 + *  Licensed under GNU General Public License version 2
    24.7 + *
    24.8 + * Author: seanhalle@yahoo.com
    24.9 + *
   24.10 + * Created on November 14, 2009, 9:07 PM
   24.11 + */
   24.12 +
   24.13 +
   24.14 +#ifndef  _UTILITIES_H
   24.15 +#define	_UTILITIES_H
   24.16 +
   24.17 +#include <string.h>
   24.18 +#include "VMS_primitive_data_types.h"
   24.19 +
   24.20 +inline char *
   24.21 +VMS_int__strDup( char *str );
   24.22 + 
   24.23 +#endif