diff MasterLoop.c @ 55:3bac84e4e56e

Works with correct matrix mult (Nov 4) -- switched animator macros; many updates. Changed all queues back to VMSQ variants; #defines select the correct, protected, or work-stealing variant, with a compiler switch to toggle them in and out
author Me
date Thu, 04 Nov 2010 18:13:18 -0700
parents 42dd44df1bb0
children 984f7d78bfdf dd3e60aeae26
line diff
     1.1 --- a/MasterLoop.c	Tue Nov 02 16:43:01 2010 -0700
     1.2 +++ b/MasterLoop.c	Thu Nov 04 18:13:18 2010 -0700
     1.3 @@ -12,6 +12,14 @@
     1.4  #include "VMS.h"
     1.5  
     1.6  
     1.7 +//===========================================================================
     1.8 +void inline
     1.9 +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
    1.10 +               VirtProcr *masterPr );
    1.11 +
    1.12 +//===========================================================================
    1.13 +
    1.14 +
    1.15  
    1.16  /*This code is animated by the virtual Master processor.
    1.17   *
    1.18 @@ -64,7 +72,7 @@
    1.19   */
    1.20  void masterLoop( void *initData, VirtProcr *animatingPr )
    1.21   { 
    1.22 -   int             slotIdx;
    1.23 +   int32           slotIdx, numSlotsFilled;
    1.24     VirtProcr      *schedVirtPr;
    1.25     SchedSlot      *currSlot, **schedSlots;
    1.26     MasterEnv      *masterEnv;
    1.27 @@ -74,7 +82,7 @@
    1.28     RequestHandler  requestHandler;
    1.29     void           *semanticEnv;
    1.30  
    1.31 -   int             thisCoresIdx;
    1.32 +   int32           thisCoresIdx;
    1.33     VirtProcr      *masterPr;
    1.34     volatile        VirtProcr *volatileMasterPr;
    1.35     
    1.36 @@ -108,7 +116,7 @@
    1.37  
    1.38     masterEnv        = _VMSMasterEnv;
    1.39     
    1.40 -//TODO: check that compiles so that always re-define from frame-storage
    1.41 +      //GCC may optimize so doesn't always re-define from frame-storage
    1.42     masterPr         = volatileMasterPr;  //just to make sure after jmp
    1.43     thisCoresIdx     = masterPr->coreAnimatedBy;
    1.44     readyToAnimateQ  = masterEnv->readyToAnimateQs[thisCoresIdx];
    1.45 @@ -120,6 +128,7 @@
    1.46  
    1.47  
    1.48        //Poll each slot's Done flag
    1.49 +   numSlotsFilled = 0;
    1.50     for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++)
    1.51      {
    1.52        currSlot = schedSlots[ slotIdx ];
    1.53 @@ -141,46 +150,203 @@
    1.54            { currSlot->procrAssignedToSlot = schedVirtPr;
    1.55              schedVirtPr->schedSlot        = currSlot;
    1.56              currSlot->needsProcrAssigned  = FALSE;
    1.57 -
    1.58 -            writeSRSWQ( schedVirtPr, readyToAnimateQ );
    1.59 +            numSlotsFilled               += 1;
    1.60 +            
    1.61 +            writeVMSQ( schedVirtPr, readyToAnimateQ );
    1.62            }
    1.63         }
    1.64      }
    1.65  
    1.66 +   
    1.67 +   #ifdef USE_WORK_STEALING
    1.68 +      //If no slots filled, means no more work, look for work to steal.
    1.69 +   if( numSlotsFilled == 0 )
    1.70 +    { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterPr );
    1.71 +    }
    1.72 +   #endif
    1.73  
    1.74 -      //Save stack ptr and frame, restore CoreLoop's stack and frame,
    1.75 -      // and clear the MasterLock
    1.76 -      //TODO: cafefully verify don't need to force saving anything to stack
    1.77 -      // before jumping back to core loop.
    1.78 -   void           *stackPtrAddr, *framePtrAddr, *masterLockAddr;
    1.79 -   void           *jmpPt, *coreLoopFramePtr, *coreLoopStackPtr;
    1.80 -
    1.81 -   stackPtrAddr      = &(masterPr->stackPtr);
    1.82 -   framePtrAddr      = &(masterPr->framePtr);
    1.83 -   masterLockAddr    = &(_VMSMasterEnv->masterLock);
    1.84 -
    1.85 -   jmpPt             = _VMSMasterEnv->coreLoopStartPt;
    1.86 -   coreLoopFramePtr  = masterPr->coreLoopFramePtr;//need this only
    1.87 -   coreLoopStackPtr  = masterPr->coreLoopStackPtr;//shouldn't need -- safety
    1.88     
    1.89     #ifdef MEAS__TIME_MASTER
    1.90     saveLowTimeStampCountInto( masterPr->endMasterTSCLow );
    1.91     #endif
    1.92  
    1.93 -   asm volatile("movl %0,     %%eax;  \
    1.94 -                 movl %%esp, (%%eax); \
    1.95 -                 movl %1,     %%eax;  \
    1.96 -                 movl %%ebp, (%%eax); \
    1.97 -                 movl %2, %%ebx;      \
    1.98 -                 movl %3, %%eax;      \
    1.99 -                 movl %4, %%esp;      \
   1.100 -                 movl %5, %%ebp;      \
   1.101 -                 movl $0x0, (%%ebx);  \
   1.102 -                 jmp  %%eax;"         \
   1.103 -   /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr),                \
   1.104 -                   "=g"(masterLockAddr)                                     \
   1.105 -   /* inputs  */ : "g" (jmpPt), "g"(coreLoopStackPtr), "g"(coreLoopFramePtr)\
   1.106 -   /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \
   1.107 -                );//can probably make clobber list empty -- but safe for now
   1.108 +   
   1.109 +   masterSwitchToCoreLoop( masterPr )
   1.110   }
   1.111  
   1.112 +
   1.113 +
   1.114 +/*This has a race condition -- the coreloops are accessing their own queues
   1.115 + * at the same time that this work-stealer on a different core is trying to
   1.116 + */
   1.117 +void inline
   1.118 +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
   1.119 +               VirtProcr *masterPr )
   1.120 + { 
   1.121 +   VirtProcr   *stolenPr;
   1.122 +   int32        coreIdx, i;
   1.123 +   VMSQueueStruc *currQ;
   1.124 +
   1.125 +   stolenPr = NULL;
   1.126 +   coreIdx = masterPr->coreAnimatedBy;
   1.127 +   for( i = 0; i < NUM_CORES -1; i++ )
   1.128 +    {
   1.129 +      if( coreIdx >= NUM_CORES -1 )
   1.130 +       { coreIdx = 0;
   1.131 +       }
   1.132 +      else
   1.133 +       { coreIdx++;
   1.134 +       }
   1.135 +      currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
   1.136 +      if( numInVMSQ( currQ ) > 0 )
   1.137 +       { stolenPr = readVMSQ (currQ );
   1.138 +         break;
   1.139 +       }
   1.140 +    }
   1.141 +
   1.142 +   if( stolenPr != NULL )
   1.143 +    { currSlot->procrAssignedToSlot = stolenPr;
   1.144 +      stolenPr->schedSlot           = currSlot;
   1.145 +      currSlot->needsProcrAssigned  = FALSE;
   1.146 +
   1.147 +      writeVMSQ( stolenPr, readyToAnimateQ );
   1.148 +    }
   1.149 + }
   1.150 +
   1.151 +/*This algorithm makes the common case fast.  Make the coreloop passive,
   1.152 + * and show its progress.  Make the stealer control a gate that coreloop
   1.153 + * has to pass.
   1.154 + *To avoid interference, only one stealer at a time.  Use a global
   1.155 + * stealer-lock.
   1.156 + *
   1.157 + *The pattern is based on a gate -- stealer shuts the gate, then monitors
   1.158 + * to be sure any already past make it all the way out, before starting.
   1.159 + *So, have a "progress" measure just before the gate, then have two after it,
   1.160 + * one is in a "waiting room" outside the gate, the other is at the exit.
   1.161 + *Then, the stealer first shuts the gate, then checks the progress measure
   1.162 + * outside it, then looks to see if the progress measure at the exit is the
   1.163 + * same.  If yes, it knows the protected area is empty 'cause no other way
   1.164 + * to get in and the last to get in also exited.
   1.165 + *If the progress measure at the exit is not the same, then the stealer goes
   1.166 + * into a loop checking both the waiting-area and the exit progress-measures
   1.167 + * until one of them shows the same as the measure outside the gate.  Might
   1.168 + * as well re-read the measure outside the gate each go around, just to be
   1.169 + * sure.  It is guaranteed that one of the two will eventually match the one
   1.170 + * outside the gate.
   1.171 + *
   1.172 + *Here's an informal proof of correctness:
   1.173 + *The gate can be closed at any point, and have only four cases:
   1.174 + *  1) coreloop made it past the gate-closing but not yet past the exit
   1.175 + *  2) coreloop made it past the pre-gate progress update but not yet past
   1.176 + *     the gate,
   1.177 + *  3) coreloop is right before the pre-gate update
   1.178 + *  4) coreloop is past the exit and far from the pre-gate update.
   1.179 + *
   1.180 + * Covering the cases in reverse order,
   1.181 + *  4) is not a problem -- stealer will read pre-gate progress, see that it
   1.182 + *     matches exit progress, and the gate is closed, so stealer can proceed.
   1.183 + *  3) stealer will read pre-gate progress just after coreloop updates it..
   1.184 + *     so stealer goes into a loop until the coreloop causes wait-progress
   1.185 + *     to match pre-gate progress, so then stealer can proceed
    1.186 + *  2) same as 3.
   1.187 + *  1) stealer reads pre-gate progress, sees that it's different than exit,
   1.188 + *     so goes into loop until exit matches pre-gate, now it knows coreloop
   1.189 + *     is not in protected and cannot get back in, so can proceed.
   1.190 + *
   1.191 + *Implementation for the stealer:
   1.192 + *
   1.193 + *First, acquire the stealer lock -- only cores with no work to do will
   1.194 + * compete to steal, so not a big performance penalty having only one --
   1.195 + * will rarely have multiple stealers in a system with plenty of work -- and
   1.196 + * in a system with little work, it doesn't matter.
   1.197 + *
   1.198 + *Note, have single-reader, single-writer pattern for all variables used to
   1.199 + * communicate between stealer and victims
   1.200 + *
   1.201 + *So, scan the queues of the core loops, until find non-empty.  Each core
   1.202 + * has its own list that it scans.  The list goes in order from closest to
   1.203 + * furthest core, so it steals first from close cores.  Later can add
   1.204 + * taking info from the app about overlapping footprints, and scan all the
   1.205 + * others then choose work with the most footprint overlap with the contents
   1.206 + * of this core's cache.
   1.207 + *
   1.208 + *Now, have a victim want to take work from.  So, shut the gate in that
   1.209 + * coreloop, by setting the "gate closed" var on its stack to TRUE.
   1.210 + *Then, read the core's pre-gate progress and compare to the core's exit
   1.211 + * progress.
   1.212 + *If same, can proceed to take work from the coreloop's queue.  When done,
   1.213 + * write FALSE to gate closed var.
   1.214 + *If different, then enter a loop that reads the pre-gate progress, then
   1.215 + * compares to exit progress then to wait progress.  When one of two
   1.216 + * matches, proceed.  Take work from the coreloop's queue.  When done,
   1.217 + * write FALSE to the gate closed var.
   1.218 + * 
   1.219 + */
   1.220 +void inline
   1.221 +gateProtected_stealWorkInto( SchedSlot *currSlot,
   1.222 +                             VMSQueueStruc *myReadyToAnimateQ,
   1.223 +                             VirtProcr *masterPr )
   1.224 + {
   1.225 +   VirtProcr     *stolenPr;
   1.226 +   int32          coreIdx, i, haveAVictim, gotLock;
   1.227 +   VMSQueueStruc *victimsQ;
   1.228 +
   1.229 +   volatile GateStruc *vicGate;
   1.230 +   int32               coreMightBeInProtected;
   1.231 +
   1.232 +
   1.233 +
   1.234 +      //see if any other cores have work available to steal
   1.235 +   haveAVictim = FALSE;
   1.236 +   coreIdx = masterPr->coreAnimatedBy;
   1.237 +   for( i = 0; i < NUM_CORES -1; i++ )
   1.238 +    {
   1.239 +      if( coreIdx >= NUM_CORES -1 )
   1.240 +       { coreIdx = 0;
   1.241 +       }
   1.242 +      else
   1.243 +       { coreIdx++;
   1.244 +       }
   1.245 +      victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
   1.246 +      if( numInVMSQ( victimsQ ) > 0 )
   1.247 +       { haveAVictim = TRUE;
   1.248 +         vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ];
   1.249 +         break;
   1.250 +       }
   1.251 +    }
   1.252 +   if( !haveAVictim ) return;  //no work to steal, exit
   1.253 +
   1.254 +      //have a victim core, now get the stealer-lock
   1.255 +   gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock),
   1.256 +                                                          UNLOCKED, LOCKED );
   1.257 +   if( !gotLock ) return; //go back to core loop, which will re-start master
   1.258 +
   1.259 +
   1.260 +   //====== Start Gate-protection =======
   1.261 +   vicGate->gateClosed = TRUE;
   1.262 +   coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress;
   1.263 +   while( coreMightBeInProtected )
   1.264 +    {    //wait until sure
   1.265 +      if( vicGate->preGateProgress == vicGate->waitProgress )
   1.266 +         coreMightBeInProtected = FALSE;
   1.267 +      if( vicGate->preGateProgress == vicGate->exitProgress )
   1.268 +         coreMightBeInProtected = FALSE;
   1.269 +    }
   1.270 +
   1.271 +   stolenPr = readVMSQ ( victimsQ );
   1.272 +
   1.273 +   vicGate->gateClosed = FALSE;
   1.274 +   //======= End Gate-protection  =======
   1.275 +
   1.276 +
   1.277 +   if( stolenPr != NULL )  //victim could have been in protected and taken
   1.278 +    { currSlot->procrAssignedToSlot = stolenPr;
   1.279 +      stolenPr->schedSlot           = currSlot;
   1.280 +      currSlot->needsProcrAssigned  = FALSE;
   1.281 +
   1.282 +      writeVMSQ( stolenPr, myReadyToAnimateQ );
   1.283 +    }
   1.284 +
   1.285 +      //unlock the work stealing lock
   1.286 +   _VMSMasterEnv->workStealingLock = UNLOCKED;
   1.287 + }