diff MasterLoop.c @ 61:984f7d78bfdf

Merge See what happens -- merged test stuff into Nov 8 VMS version
author SeanHalle
date Thu, 11 Nov 2010 06:19:51 -0800
parents 054006c26b92 3bac84e4e56e
children
line diff
     1.1 --- a/MasterLoop.c	Tue Oct 26 18:31:34 2010 -0700
     1.2 +++ b/MasterLoop.c	Thu Nov 11 06:19:51 2010 -0800
     1.3 @@ -7,12 +7,19 @@
     1.4  
     1.5  
     1.6  #include <stdio.h>
     1.7 -#include <malloc.h>
     1.8  #include <stddef.h>
     1.9  
    1.10  #include "VMS.h"
    1.11  
    1.12  
    1.13 +//===========================================================================
    1.14 +void inline
    1.15 +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
    1.16 +               VirtProcr *masterPr );
    1.17 +
    1.18 +//===========================================================================
    1.19 +
    1.20 +
    1.21  
    1.22  /*This code is animated by the virtual Master processor.
    1.23   *
    1.24 @@ -65,7 +72,7 @@
    1.25   */
    1.26  void masterLoop( void *initData, VirtProcr *animatingPr )
    1.27   { 
    1.28 -   int             slotIdx;
    1.29 +   int32           slotIdx, numSlotsFilled;
    1.30     VirtProcr      *schedVirtPr;
    1.31     SchedSlot      *currSlot, **schedSlots;
    1.32     MasterEnv      *masterEnv;
    1.33 @@ -75,7 +82,7 @@
    1.34     RequestHandler  requestHandler;
    1.35     void           *semanticEnv;
    1.36  
    1.37 -   int             thisCoresIdx;
    1.38 +   int32           thisCoresIdx;
    1.39     VirtProcr      *masterPr;
    1.40     volatile        VirtProcr *volatileMasterPr;
    1.41     
    1.42 @@ -110,7 +117,7 @@
    1.43  
    1.44     masterEnv        = _VMSMasterEnv;
    1.45     
    1.46 -//TODO: check that compiles so that always re-define from frame-storage
    1.47 +      //GCC may optimize so doesn't always re-define from frame-storage
    1.48     masterPr         = volatileMasterPr;  //just to make sure after jmp
    1.49     thisCoresIdx     = masterPr->coreAnimatedBy;
    1.50     readyToAnimateQ  = masterEnv->readyToAnimateQs[thisCoresIdx];
    1.51 @@ -122,6 +129,7 @@
    1.52  
    1.53  
    1.54        //Poll each slot's Done flag
    1.55 +   numSlotsFilled = 0;
    1.56     for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++)
    1.57      {
    1.58        currSlot = schedSlots[ slotIdx ];
    1.59 @@ -143,27 +151,21 @@
    1.60            { currSlot->procrAssignedToSlot = schedVirtPr;
    1.61              schedVirtPr->schedSlot        = currSlot;
    1.62              currSlot->needsProcrAssigned  = FALSE;
    1.63 -
    1.64 -            writeSRSWQ( schedVirtPr, readyToAnimateQ );
    1.65 +            numSlotsFilled               += 1;
    1.66 +            
    1.67 +            writeVMSQ( schedVirtPr, readyToAnimateQ );
    1.68            }
    1.69         }
    1.70      }
    1.71  
    1.72 +   
    1.73 +   #ifdef USE_WORK_STEALING
    1.74 +      //If no slots filled, means no more work, look for work to steal.
    1.75 +   if( numSlotsFilled == 0 )
    1.76 +    { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterPr );
    1.77 +    }
    1.78 +   #endif
    1.79  
    1.80 -      //Save stack ptr and frame, restore CoreLoop's stack and frame,
    1.81 -      // and clear the MasterLock
    1.82 -      //TODO: cafefully verify don't need to force saving anything to stack
    1.83 -      // before jumping back to core loop.
    1.84 -   void           *stackPtrAddr, *framePtrAddr, *masterLockAddr;
    1.85 -   void           *jmpPt, *coreLoopFramePtr, *coreLoopStackPtr;
    1.86 -
    1.87 -   stackPtrAddr      = &(masterPr->stackPtr);
    1.88 -   framePtrAddr      = &(masterPr->framePtr);
    1.89 -   masterLockAddr    = &(_VMSMasterEnv->masterLock);
    1.90 -
    1.91 -   jmpPt             = _VMSMasterEnv->coreLoopStartPt;
    1.92 -   coreLoopFramePtr  = masterPr->coreLoopFramePtr;//need this only
    1.93 -   coreLoopStackPtr  = masterPr->coreLoopStackPtr;//shouldn't need -- safety
    1.94     
    1.95     //============================= MEASUREMENT STUFF ========================
    1.96     #ifdef MEAS__TIME_MASTER
    1.97 @@ -172,21 +174,183 @@
    1.98     #endif
    1.99     //========================================================================
   1.100  
   1.101 -   asm volatile("movl %0,     %%eax;  \
   1.102 -                 movl %%esp, (%%eax); \
   1.103 -                 movl %1,     %%eax;  \
   1.104 -                 movl %%ebp, (%%eax); \
   1.105 -                 movl %2, %%ebx;      \
   1.106 -                 movl %3, %%eax;      \
   1.107 -                 movl %4, %%esp;      \
   1.108 -                 movl %5, %%ebp;      \
   1.109 -                 movl $0x0, (%%ebx);  \
   1.110 -                 jmp  %%eax;"         \
   1.111 -   /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr),                \
   1.112 -                   "=g"(masterLockAddr)                                     \
   1.113 -   /* inputs  */ : "g" (jmpPt), "g"(coreLoopStackPtr), "g"(coreLoopFramePtr)\
   1.114 -   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
   1.115 -                );//can probably make clobber list empty -- but safe for now
   1.116 +   
   1.117 +   masterSwitchToCoreLoop( masterPr )
   1.118   }
   1.119  
   1.120  
   1.121 +
   1.122 +/*This has a race condition -- the coreloops are accessing their own queues
   1.123 + * at the same time that this work-stealer on a different core is trying to
   1.124 + */
   1.125 +void inline
   1.126 +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
   1.127 +               VirtProcr *masterPr )
   1.128 + { 
   1.129 +   VirtProcr   *stolenPr;
   1.130 +   int32        coreIdx, i;
   1.131 +   VMSQueueStruc *currQ;
   1.132 +
   1.133 +   stolenPr = NULL;
   1.134 +   coreIdx = masterPr->coreAnimatedBy;
   1.135 +   for( i = 0; i < NUM_CORES -1; i++ )
   1.136 +    {
   1.137 +      if( coreIdx >= NUM_CORES -1 )
   1.138 +       { coreIdx = 0;
   1.139 +       }
   1.140 +      else
   1.141 +       { coreIdx++;
   1.142 +       }
   1.143 +      currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
   1.144 +      if( numInVMSQ( currQ ) > 0 )
   1.145 +       { stolenPr = readVMSQ (currQ );
   1.146 +         break;
   1.147 +       }
   1.148 +    }
   1.149 +
   1.150 +   if( stolenPr != NULL )
   1.151 +    { currSlot->procrAssignedToSlot = stolenPr;
   1.152 +      stolenPr->schedSlot           = currSlot;
   1.153 +      currSlot->needsProcrAssigned  = FALSE;
   1.154 +
   1.155 +      writeVMSQ( stolenPr, readyToAnimateQ );
   1.156 +    }
   1.157 + }
   1.158 +
   1.159 +/*This algorithm makes the common case fast.  Make the coreloop passive,
   1.160 + * and show its progress.  Make the stealer control a gate that coreloop
   1.161 + * has to pass.
   1.162 + *To avoid interference, only one stealer at a time.  Use a global
   1.163 + * stealer-lock.
   1.164 + *
   1.165 + *The pattern is based on a gate -- stealer shuts the gate, then monitors
   1.166 + * to be sure any already past make it all the way out, before starting.
   1.167 + *So, have a "progress" measure just before the gate, then have two after it,
   1.168 + * one is in a "waiting room" outside the gate, the other is at the exit.
   1.169 + *Then, the stealer first shuts the gate, then checks the progress measure
   1.170 + * outside it, then looks to see if the progress measure at the exit is the
   1.171 + * same.  If yes, it knows the protected area is empty 'cause no other way
   1.172 + * to get in and the last to get in also exited.
   1.173 + *If the progress measure at the exit is not the same, then the stealer goes
   1.174 + * into a loop checking both the waiting-area and the exit progress-measures
   1.175 + * until one of them shows the same as the measure outside the gate.  Might
   1.176 + * as well re-read the measure outside the gate each go around, just to be
   1.177 + * sure.  It is guaranteed that one of the two will eventually match the one
   1.178 + * outside the gate.
   1.179 + *
   1.180 + *Here's an informal proof of correctness:
   1.181 + *The gate can be closed at any point, and have only four cases:
   1.182 + *  1) coreloop made it past the gate-closing but not yet past the exit
   1.183 + *  2) coreloop made it past the pre-gate progress update but not yet past
   1.184 + *     the gate,
   1.185 + *  3) coreloop is right before the pre-gate update
   1.186 + *  4) coreloop is past the exit and far from the pre-gate update.
   1.187 + *
   1.188 + * Covering the cases in reverse order,
   1.189 + *  4) is not a problem -- stealer will read pre-gate progress, see that it
   1.190 + *     matches exit progress, and the gate is closed, so stealer can proceed.
    1.191 + *  3) stealer will read pre-gate progress just after coreloop updates it,
   1.192 + *     so stealer goes into a loop until the coreloop causes wait-progress
   1.193 + *     to match pre-gate progress, so then stealer can proceed
    1.194 + *  2) same as 3.
   1.195 + *  1) stealer reads pre-gate progress, sees that it's different than exit,
   1.196 + *     so goes into loop until exit matches pre-gate, now it knows coreloop
   1.197 + *     is not in protected and cannot get back in, so can proceed.
   1.198 + *
   1.199 + *Implementation for the stealer:
   1.200 + *
   1.201 + *First, acquire the stealer lock -- only cores with no work to do will
   1.202 + * compete to steal, so not a big performance penalty having only one --
   1.203 + * will rarely have multiple stealers in a system with plenty of work -- and
   1.204 + * in a system with little work, it doesn't matter.
   1.205 + *
   1.206 + *Note, have single-reader, single-writer pattern for all variables used to
   1.207 + * communicate between stealer and victims
   1.208 + *
   1.209 + *So, scan the queues of the core loops, until find non-empty.  Each core
   1.210 + * has its own list that it scans.  The list goes in order from closest to
   1.211 + * furthest core, so it steals first from close cores.  Later can add
   1.212 + * taking info from the app about overlapping footprints, and scan all the
   1.213 + * others then choose work with the most footprint overlap with the contents
   1.214 + * of this core's cache.
   1.215 + *
   1.216 + *Now, have a victim want to take work from.  So, shut the gate in that
   1.217 + * coreloop, by setting the "gate closed" var on its stack to TRUE.
   1.218 + *Then, read the core's pre-gate progress and compare to the core's exit
   1.219 + * progress.
   1.220 + *If same, can proceed to take work from the coreloop's queue.  When done,
   1.221 + * write FALSE to gate closed var.
   1.222 + *If different, then enter a loop that reads the pre-gate progress, then
   1.223 + * compares to exit progress then to wait progress.  When one of two
   1.224 + * matches, proceed.  Take work from the coreloop's queue.  When done,
   1.225 + * write FALSE to the gate closed var.
   1.226 + * 
   1.227 + */
   1.228 +void inline
   1.229 +gateProtected_stealWorkInto( SchedSlot *currSlot,
   1.230 +                             VMSQueueStruc *myReadyToAnimateQ,
   1.231 +                             VirtProcr *masterPr )
   1.232 + {
   1.233 +   VirtProcr     *stolenPr;
   1.234 +   int32          coreIdx, i, haveAVictim, gotLock;
   1.235 +   VMSQueueStruc *victimsQ;
   1.236 +
   1.237 +   volatile GateStruc *vicGate;
   1.238 +   int32               coreMightBeInProtected;
   1.239 +
   1.240 +
   1.241 +
   1.242 +      //see if any other cores have work available to steal
   1.243 +   haveAVictim = FALSE;
   1.244 +   coreIdx = masterPr->coreAnimatedBy;
   1.245 +   for( i = 0; i < NUM_CORES -1; i++ )
   1.246 +    {
   1.247 +      if( coreIdx >= NUM_CORES -1 )
   1.248 +       { coreIdx = 0;
   1.249 +       }
   1.250 +      else
   1.251 +       { coreIdx++;
   1.252 +       }
   1.253 +      victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
   1.254 +      if( numInVMSQ( victimsQ ) > 0 )
   1.255 +       { haveAVictim = TRUE;
   1.256 +         vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ];
   1.257 +         break;
   1.258 +       }
   1.259 +    }
   1.260 +   if( !haveAVictim ) return;  //no work to steal, exit
   1.261 +
   1.262 +      //have a victim core, now get the stealer-lock
   1.263 +   gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock),
   1.264 +                                                          UNLOCKED, LOCKED );
   1.265 +   if( !gotLock ) return; //go back to core loop, which will re-start master
   1.266 +
   1.267 +
   1.268 +   //====== Start Gate-protection =======
   1.269 +   vicGate->gateClosed = TRUE;
   1.270 +   coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress;
   1.271 +   while( coreMightBeInProtected )
   1.272 +    {    //wait until sure
   1.273 +      if( vicGate->preGateProgress == vicGate->waitProgress )
   1.274 +         coreMightBeInProtected = FALSE;
   1.275 +      if( vicGate->preGateProgress == vicGate->exitProgress )
   1.276 +         coreMightBeInProtected = FALSE;
   1.277 +    }
   1.278 +
   1.279 +   stolenPr = readVMSQ ( victimsQ );
   1.280 +
   1.281 +   vicGate->gateClosed = FALSE;
   1.282 +   //======= End Gate-protection  =======
   1.283 +
   1.284 +
   1.285 +   if( stolenPr != NULL )  //victim could have been in protected and taken
   1.286 +    { currSlot->procrAssignedToSlot = stolenPr;
   1.287 +      stolenPr->schedSlot           = currSlot;
   1.288 +      currSlot->needsProcrAssigned  = FALSE;
   1.289 +
   1.290 +      writeVMSQ( stolenPr, myReadyToAnimateQ );
   1.291 +    }
   1.292 +
   1.293 +      //unlock the work stealing lock
   1.294 +   _VMSMasterEnv->workStealingLock = UNLOCKED;
   1.295 + }