Mercurial > cgi-bin > hgwebdir.cgi > VMS > VMS_Implementations > VMS_impls > VMS__MC_shared_impl

diff AnimationMaster.c @ 230:f2a7831352dc
changed SchedulingMaster.c to AnimationMaster.c and cleaned up all comments
author: Some Random Person <seanhalle@yahoo.com>
date: Thu, 15 Mar 2012 20:31:41 -0700
children: 88fd85921d7f
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/AnimationMaster.c	Thu Mar 15 20:31:41 2012 -0700
     1.3 @@ -0,0 +1,393 @@
     1.4 +/*
     1.5 + * Copyright 2010  OpenSourceStewardshipFoundation
     1.6 + * 
     1.7 + * Licensed under BSD
     1.8 + */
     1.9 +
    1.10 +
    1.11 +
    1.12 +#include <stdio.h>
    1.13 +#include <stddef.h>
    1.14 +
    1.15 +#include "VMS.h"
    1.16 +
    1.17 +
    1.18 +//========================= Local Fn Prototypes =============================
    1.19 +void inline
    1.20 +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
    1.21 +               SlaveVP *masterVP );
    1.22 +
    1.23 +//===========================================================================
    1.24 +
    1.25 +
    1.26 +
    1.27 +/*The animationMaster embodies most of the animator of the language.  The
    1.28 + * animator is what emodies the behavior of language constructs. 
    1.29 + * As such, it is the animationMaster, in combination with the plugin
    1.30 + * functions, that make the language constructs do their behavior.   
    1.31 + * 
    1.32 + *Within the code, this is the top-level-function of the masterVPs, and
    1.33 + * runs when the coreController has no more slave VPs.  It's job is to
    1.34 + * refill the animation slots with slaves.
    1.35 + *
    1.36 + *To do this, it scans the animation slots for just-completed slaves.
    1.37 + * Each of these has a request in it.  So, the master hands each to the
    1.38 + * plugin's request handler.
    1.39 + *Each request represents a language construct that has been encountered
    1.40 + * by the application code in the slave. Passing the request to the
    1.41 + * request handler is how that language construct's behavior gets invoked.
    1.42 + * The request handler then performs the actions of the construct's
    1.43 + * behavior. So, the request handler encodes the behavior of the 
    1.44 + * language's parallelism constructs, and performs that when the master
    1.45 + * hands it a slave containing a request to perform that construct.
    1.46 + * 
    1.47 + *On a shared-memory machine, the behavior of parallelism constructs
    1.48 + * equals control, over order of execution of code.  Hence, the behavior
    1.49 + * of the language constructs performed by the request handler is to 
    1.50 + * choose the order that slaves get animated, and thereby control the
    1.51 + * order that application code in the slaves executes.
    1.52 + * 
    1.53 + *To control order of animation of slaves, the request handler has a
    1.54 + * semantic environment that holds data structures used to hold slaves
    1.55 + * and choose when they're ready to be animated.
    1.56 + *
    1.57 + *Once a slave is marked as ready to be animated by the request handler,
    1.58 + * it is the second plugin function, the Assigner, which chooses the core
    1.59 + * the slave gets assigned to for animation.  Hence, the Assigner doesn't
    1.60 + * perform any of the semantic behavior of language constructs, rather
    1.61 + * it gives the language a chance to improve performance. The performance
    1.62 + * of application code is strongly related to communication between
    1.63 + * cores. On shared-memory machines, communication is caused during
    1.64 + * execution of code, by memory accesses, and how much depends on contents
    1.65 + * of caches connected to the core executing the code.  So, the placement
    1.66 + * of slaves determines the communication caused during execution of the
    1.67 + * slave's code.
    1.68 + *The point of the Assigner, then, is to use application information during
    1.69 + * execution of the program, to make choices about slave placement onto
    1.70 + * cores, with the aim to put slaves close to caches containing the data
    1.71 + * used by the slave's code.
    1.72 + * 
    1.73 + *==========================================================================
    1.74 + *In summary, the animationMaster scans the slots, finds slaves
    1.75 + * just-finished, which hold requests, pass those to the request handler,
    1.76 + * along with the semantic environment, and the request handler then manages
    1.77 + * the structures in the semantic env, which controls the order of
    1.78 + * animation of slaves, and so embodies the behavior of the language
    1.79 + * constructs.
    1.80 + *The animationMaster then rescans the slots, offering each empty one to
    1.81 + * the Assigner, along with the semantic environment.  The Assigner chooses
    1.82 + * among the ready slaves in the semantic Env, finding the one best suited
    1.83 + * to be animated by that slot's associated core.
    1.84 + * 
    1.85 + *==========================================================================
    1.86 + *Implementation Details:
    1.87 + * 
    1.88 + *There is a separate masterVP for each core, but a single semantic
    1.89 + * environment shared by all cores.  Each core also has its own scheduling
    1.90 + * slots, which are used to communicate slaves between animationMaster and
    1.91 + * coreController.  There is only one global variable, _VMSMasterEnv, which
    1.92 + * holds the semantic env and other things shared by the different
    1.93 + * masterVPs.  The request handler and Assigner are registered with
    1.94 + * the animationMaster by the language's init function, and a pointer to
    1.95 + * each is in the _VMSMasterEnv. (There are also some pthread related global
    1.96 + * vars, but they're only used during init of VMS).
    1.97 + *VMS gains control over the cores by essentially "turning off" the OS's
    1.98 + * scheduler, using pthread pin-to-core commands.
    1.99 + *
   1.100 + *The masterVPs are created during init, with this animationMaster as their
   1.101 + * top level function.  The masterVPs use the same SlaveVP data structure,
   1.102 + * even though they're not slave VPs.
   1.103 + *A "seed slave" is also created during init -- this is equivalent to the
   1.104 + * "main" function in C, and acts as the entry-point to the VMS-language-
   1.105 + * based application.
   1.106 + *The masterVPs shared a single system-wide master-lock, so only one
   1.107 + * masterVP may be animated at a time.
   1.108 + *The core controllers access _VMSMasterEnv to get the masterVP, and when
   1.109 + * they start, the slots are all empty, so they run their associated core's
   1.110 + * masterVP.  The first of those to get the master lock sees the seed slave
   1.111 + * in the shared semantic environment, so when it runs the Assigner, that
   1.112 + * returns the seed slave, which the animationMaster puts into a scheduling
   1.113 + * slot then switches to the core controller.  That then switches the core
   1.114 + * over to the seed slave, which then proceeds to execute language
   1.115 + * constructs to create more slaves, and so on.  Each of those constructs
   1.116 + * causes the seed slave to suspend, switching over to the core controller,
   1.117 + * which eventually switches to the masterVP, which executes the 
   1.118 + * request handler, which uses VMS primitives to carry out the creation of
   1.119 + * new slave VPs, which are marked as ready for the Assigner, and so on..
   1.120 + * 
   1.121 + *On animation slots, and system behavior:
   1.122 + * A request may linger in a animation slot for a long time while
   1.123 + * the slaves in the other slots are animated.  This only becomes a problem
   1.124 + * when such a request is a choke-point in the constraints, and is needed
   1.125 + * to free work for *other* cores.  To reduce this occurance, the number
   1.126 + * of animation slots should be kept low.  In balance, having multiple
   1.127 + * animation slots amortizes the overhead of switching to the masterVP and
   1.128 + * executing the animationMaster code, which drives for more than one. In
   1.129 + * practice, the best balance should be discovered by profiling.
   1.130 + */
   1.131 +void animationMaster( void *initData, SlaveVP *masterVP )
   1.132 + { 
   1.133 +      //Used while scanning and filling animation slots
   1.134 +   int32           slotIdx, numSlotsFilled;
   1.135 +   SchedSlot      *currSlot, **schedSlots;
   1.136 +   SlaveVP        *assignedSlaveVP;  //the slave chosen by the assigner
   1.137 +
   1.138 +      //Local copies, for performance
   1.139 +   MasterEnv      *masterEnv;
   1.140 +   SlaveAssigner   slaveAssigner;
   1.141 +   RequestHandler  requestHandler;
   1.142 +   void           *semanticEnv;
   1.143 +   int32           thisCoresIdx;
   1.144 +      
   1.145 +   //======================== Initializations ========================
   1.146 +   masterEnv        = (MasterEnv*)_VMSMasterEnv;
   1.147 +   
   1.148 +   thisCoresIdx     = masterVP->coreAnimatedBy;
   1.149 +   schedSlots       = masterEnv->allSchedSlots[thisCoresIdx];
   1.150 +
   1.151 +   requestHandler   = masterEnv->requestHandler;
   1.152 +   slaveAssigner    = masterEnv->slaveAssigner;
   1.153 +   semanticEnv      = masterEnv->semanticEnv;
   1.154 +
   1.155 +   
   1.156 +   //======================== animationMaster ========================
   1.157 +   while(1){
   1.158 +       
   1.159 +      MEAS__Capture_Pre_Master_Point
   1.160 +
   1.161 +      //Scan the animation slots
   1.162 +   numSlotsFilled = 0;
   1.163 +   for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++)
   1.164 +    {
   1.165 +      currSlot = schedSlots[ slotIdx ];
   1.166 +
   1.167 +         //Check if newly-done slave in slot, which will need request handld
   1.168 +      if( currSlot->workIsDone )
   1.169 +       {
   1.170 +         currSlot->workIsDone         = FALSE;
   1.171 +         currSlot->needsSlaveAssigned = TRUE;
   1.172 +
   1.173 +               MEAS__startReqHdlr;
   1.174 +               
   1.175 +            //process the requests made by the slave (held inside slave struc)
   1.176 +         (*requestHandler)( currSlot->slaveAssignedToSlot, semanticEnv );
   1.177 +         
   1.178 +               MEAS__endReqHdlr;
   1.179 +       }
   1.180 +         //If slot empty, hand to Assigner to fill with a slave
   1.181 +      if( currSlot->needsSlaveAssigned )
   1.182 +       {    //Call plugin's Assigner to give slot a new slave
   1.183 +         assignedSlaveVP =
   1.184 +          (*slaveAssigner)( semanticEnv, currSlot );
   1.185 +         
   1.186 +            //put the chosen slave into slot, and adjust flags and state
   1.187 +         if( assignedSlaveVP != NULL )
   1.188 +          { currSlot->slaveAssignedToSlot = assignedSlaveVP;
   1.189 +            assignedSlaveVP->schedSlot       = currSlot;
   1.190 +            currSlot->needsSlaveAssigned  = FALSE;
   1.191 +            numSlotsFilled               += 1;
   1.192 +          }
   1.193 +       }
   1.194 +    }
   1.195 +
   1.196 +   
   1.197 +   #ifdef SYS__TURN_ON_WORK_STEALING
   1.198 +      /*If no slots filled, means no more work, look for work to steal. */
   1.199 +   if( numSlotsFilled == 0 )
   1.200 +    { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterVP );
   1.201 +    }
   1.202 +   #endif
   1.203 +
   1.204 +         MEAS__Capture_Post_Master_Point;
   1.205 +   
   1.206 +   masterSwitchToCoreCtlr(animatingSlv);
   1.207 +   flushRegisters();
   1.208 +   }//MasterLoop
   1.209 +
   1.210 +
   1.211 + }
   1.212 +
   1.213 +
   1.214 +//===========================  Work Stealing  ==============================
   1.215 +
   1.216 +/*This is first of two work-stealing approaches.  It's not used, but left
   1.217 + * in the code as a simple illustration of the principle.  This version
   1.218 + * has a race condition -- the core controllers are accessing their own
   1.219 + * animation slots at the same time that this work-stealer on a different
   1.220 + * core is..
   1.221 + *Because the core controllers run outside the master lock, this interaction
   1.222 + * is not protected.
   1.223 + */
   1.224 +void inline
   1.225 +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
   1.226 +               SlaveVP *masterVP )
   1.227 + { 
   1.228 +   SlaveVP   *stolenSlv;
   1.229 +   int32        coreIdx, i;
   1.230 +   VMSQueueStruc *currQ;
   1.231 +
   1.232 +   stolenSlv = NULL;
   1.233 +   coreIdx = masterVP->coreAnimatedBy;
   1.234 +   for( i = 0; i < NUM_CORES -1; i++ )
   1.235 +    {
   1.236 +      if( coreIdx >= NUM_CORES -1 )
   1.237 +       { coreIdx = 0;
   1.238 +       }
   1.239 +      else
   1.240 +       { coreIdx++;
   1.241 +       }
   1.242 +      //TODO: fix this for coreCtlr scans slots
   1.243 +//      currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
   1.244 +      if( numInVMSQ( currQ ) > 0 )
   1.245 +       { stolenSlv = readVMSQ (currQ );
   1.246 +         break;
   1.247 +       }
   1.248 +    }
   1.249 +
   1.250 +   if( stolenSlv != NULL )
   1.251 +    { currSlot->slaveAssignedToSlot = stolenSlv;
   1.252 +      stolenSlv->schedSlot           = currSlot;
   1.253 +      currSlot->needsSlaveAssigned  = FALSE;
   1.254 +
   1.255 +      writeVMSQ( stolenSlv, readyToAnimateQ );
   1.256 +    }
   1.257 + }
   1.258 +
   1.259 +/*This algorithm makes the common case fast.  Make the coreloop passive,
   1.260 + * and show its progress.  Make the stealer control a gate that coreloop
   1.261 + * has to pass.
   1.262 + *To avoid interference, only one stealer at a time.  Use a global
   1.263 + * stealer-lock, so only the stealer is slowed.
   1.264 + *
   1.265 + *The pattern is based on a gate -- stealer shuts the gate, then monitors
   1.266 + * to be sure any already past make it all the way out, before starting.
   1.267 + *So, have a "progress" measure just before the gate, then have two after it,
   1.268 + * one is in a "waiting room" outside the gate, the other is at the exit.
   1.269 + *Then, the stealer first shuts the gate, then checks the progress measure
   1.270 + * outside it, then looks to see if the progress measure at the exit is the
   1.271 + * same.  If yes, it knows the protected area is empty 'cause no other way
   1.272 + * to get in and the last to get in also exited.
   1.273 + *If the progress measure at the exit is not the same, then the stealer goes
   1.274 + * into a loop checking both the waiting-area and the exit progress-measures
   1.275 + * until one of them shows the same as the measure outside the gate.  Might
   1.276 + * as well re-read the measure outside the gate each go around, just to be
   1.277 + * sure.  It is guaranteed that one of the two will eventually match the one
   1.278 + * outside the gate.
   1.279 + *
   1.280 + *Here's an informal proof of correctness:
   1.281 + *The gate can be closed at any point, and have only four cases:
   1.282 + *  1) coreloop made it past the gate-closing but not yet past the exit
   1.283 + *  2) coreloop made it past the pre-gate progress update but not yet past
   1.284 + *     the gate,
   1.285 + *  3) coreloop is right before the pre-gate update
   1.286 + *  4) coreloop is past the exit and far from the pre-gate update.
   1.287 + *
   1.288 + * Covering the cases in reverse order,
   1.289 + *  4) is not a problem -- stealer will read pre-gate progress, see that it
   1.290 + *     matches exit progress, and the gate is closed, so stealer can proceed.
   1.291 + *  3) stealer will read pre-gate progress just after coreloop updates it..
   1.292 + *     so stealer goes into a loop until the coreloop causes wait-progress
   1.293 + *     to match pre-gate progress, so then stealer can proceed
   1.294 + *  2) same as 3..
   1.295 + *  1) stealer reads pre-gate progress, sees that it's different than exit,
   1.296 + *     so goes into loop until exit matches pre-gate, now it knows coreloop
   1.297 + *     is not in protected and cannot get back in, so can proceed.
   1.298 + *
   1.299 + *Implementation for the stealer:
   1.300 + *
   1.301 + *First, acquire the stealer lock -- only cores with no work to do will
   1.302 + * compete to steal, so not a big performance penalty having only one --
   1.303 + * will rarely have multiple stealers in a system with plenty of work -- and
   1.304 + * in a system with little work, it doesn't matter.
   1.305 + *
   1.306 + *Note, have single-reader, single-writer pattern for all variables used to
   1.307 + * communicate between stealer and victims
   1.308 + *
   1.309 + *So, scan the queues of the core controllers, until find non-empty.  Each core
   1.310 + * has its own list that it scans.  The list goes in order from closest to
   1.311 + * furthest core, so it steals first from close cores.  Later can add
   1.312 + * taking info from the app about overlapping footprints, and scan all the
   1.313 + * others then choose work with the most footprint overlap with the contents
   1.314 + * of this core's cache.
   1.315 + *
   1.316 + *Now, have a victim want to take work from.  So, shut the gate in that
   1.317 + * coreloop, by setting the "gate closed" var on its stack to TRUE.
   1.318 + *Then, read the core's pre-gate progress and compare to the core's exit
   1.319 + * progress.
   1.320 + *If same, can proceed to take work from the coreloop's queue.  When done,
   1.321 + * write FALSE to gate closed var.
   1.322 + *If different, then enter a loop that reads the pre-gate progress, then
   1.323 + * compares to exit progress then to wait progress.  When one of two
   1.324 + * matches, proceed.  Take work from the coreloop's queue.  When done,
   1.325 + * write FALSE to the gate closed var.
   1.326 + * 
   1.327 + */
   1.328 +void inline
   1.329 +gateProtected_stealWorkInto( SchedSlot *currSlot,
   1.330 +                             VMSQueueStruc *myReadyToAnimateQ,
   1.331 +                             SlaveVP *masterVP )
   1.332 + {
   1.333 +   SlaveVP     *stolenSlv;
   1.334 +   int32          coreIdx, i, haveAVictim, gotLock;
   1.335 +   VMSQueueStruc *victimsQ;
   1.336 +
   1.337 +   volatile GateStruc *vicGate;
   1.338 +   int32               coreMightBeInProtected;
   1.339 +
   1.340 +
   1.341 +
   1.342 +      //see if any other cores have work available to steal
   1.343 +   haveAVictim = FALSE;
   1.344 +   coreIdx = masterVP->coreAnimatedBy;
   1.345 +   for( i = 0; i < NUM_CORES -1; i++ )
   1.346 +    {
   1.347 +      if( coreIdx >= NUM_CORES -1 )
   1.348 +       { coreIdx = 0;
   1.349 +       }
   1.350 +      else
   1.351 +       { coreIdx++;
   1.352 +       }
   1.353 +      //TODO: fix this for coreCtlr scans slots
   1.354 +//      victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
   1.355 +      if( numInVMSQ( victimsQ ) > 0 )
   1.356 +       { haveAVictim = TRUE;
   1.357 +         vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ];
   1.358 +         break;
   1.359 +       }
   1.360 +    }
   1.361 +   if( !haveAVictim ) return;  //no work to steal, exit
   1.362 +
   1.363 +      //have a victim core, now get the stealer-lock
   1.364 +   gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock),
   1.365 +                                                          UNLOCKED, LOCKED );
   1.366 +   if( !gotLock ) return; //go back to core controller, which will re-start master
   1.367 +
   1.368 +
   1.369 +   //====== Start Gate-protection =======
   1.370 +   vicGate->gateClosed = TRUE;
   1.371 +   coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress;
   1.372 +   while( coreMightBeInProtected )
   1.373 +    {    //wait until sure
   1.374 +      if( vicGate->preGateProgress == vicGate->waitProgress )
   1.375 +         coreMightBeInProtected = FALSE;
   1.376 +      if( vicGate->preGateProgress == vicGate->exitProgress )
   1.377 +         coreMightBeInProtected = FALSE;
   1.378 +    }
   1.379 +
   1.380 +   stolenSlv = readVMSQ ( victimsQ );
   1.381 +
   1.382 +   vicGate->gateClosed = FALSE;
   1.383 +   //======= End Gate-protection  =======
   1.384 +
   1.385 +
   1.386 +   if( stolenSlv != NULL )  //victim could have been in protected and taken
   1.387 +    { currSlot->slaveAssignedToSlot = stolenSlv;
   1.388 +      stolenSlv->schedSlot           = currSlot;
   1.389 +      currSlot->needsSlaveAssigned  = FALSE;
   1.390 +
   1.391 +      writeVMSQ( stolenSlv, myReadyToAnimateQ );
   1.392 +    }
   1.393 +
   1.394 +      //unlock the work stealing lock
   1.395 +   _VMSMasterEnv->workStealingLock = UNLOCKED;
   1.396 + }
author	Some Random Person <seanhalle@yahoo.com>
date	Thu, 15 Mar 2012 20:31:41 -0700
parents
children	88fd85921d7f