Mercurial > cgi-bin > hgwebdir.cgi > VMS > VMS_Implementations > VMS_impls > VMS__MC_shared_impl
comparison MasterLoop.c @ 55:3bac84e4e56e
Works with correct matrix mult Nov 4 -- switch animators macros, many updates
Changed all queues back to VMSQ variants #defines
correct, protected, work-stealing, with compiler switch in and out
| author | Me |
|---|---|
| date | Thu, 04 Nov 2010 18:13:18 -0700 |
| parents | 42dd44df1bb0 |
| children | 984f7d78bfdf dd3e60aeae26 |
comparison
equal
deleted
inserted
replaced
| 15:e929fe72639c | 16:1c7bfb48cc44 |
|---|---|
| 8 | 8 |
| 9 #include <stdio.h> | 9 #include <stdio.h> |
| 10 #include <stddef.h> | 10 #include <stddef.h> |
| 11 | 11 |
| 12 #include "VMS.h" | 12 #include "VMS.h" |
| 13 | |
| 14 | |
| 15 //=========================================================================== | |
| 16 void inline | |
| 17 stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ, | |
| 18 VirtProcr *masterPr ); | |
| 19 | |
| 20 //=========================================================================== | |
| 13 | 21 |
| 14 | 22 |
| 15 | 23 |
| 16 /*This code is animated by the virtual Master processor. | 24 /*This code is animated by the virtual Master processor. |
| 17 * | 25 * |
| 62 * is case when other cores starved and one core's requests generate work | 70 * is case when other cores starved and one core's requests generate work |
| 63 * for them -- so keep max in queue to 3 or 4.. | 71 * for them -- so keep max in queue to 3 or 4.. |
| 64 */ | 72 */ |
| 65 void masterLoop( void *initData, VirtProcr *animatingPr ) | 73 void masterLoop( void *initData, VirtProcr *animatingPr ) |
| 66 { | 74 { |
| 67 int slotIdx; | 75 int32 slotIdx, numSlotsFilled; |
| 68 VirtProcr *schedVirtPr; | 76 VirtProcr *schedVirtPr; |
| 69 SchedSlot *currSlot, **schedSlots; | 77 SchedSlot *currSlot, **schedSlots; |
| 70 MasterEnv *masterEnv; | 78 MasterEnv *masterEnv; |
| 71 VMSQueueStruc *readyToAnimateQ; | 79 VMSQueueStruc *readyToAnimateQ; |
| 72 | 80 |
| 73 SlaveScheduler slaveScheduler; | 81 SlaveScheduler slaveScheduler; |
| 74 RequestHandler requestHandler; | 82 RequestHandler requestHandler; |
| 75 void *semanticEnv; | 83 void *semanticEnv; |
| 76 | 84 |
| 77 int thisCoresIdx; | 85 int32 thisCoresIdx; |
| 78 VirtProcr *masterPr; | 86 VirtProcr *masterPr; |
| 79 volatile VirtProcr *volatileMasterPr; | 87 volatile VirtProcr *volatileMasterPr; |
| 80 | 88 |
| 81 volatileMasterPr = animatingPr; | 89 volatileMasterPr = animatingPr; |
| 82 masterPr = volatileMasterPr; //used to force re-define after jmp | 90 masterPr = volatileMasterPr; //used to force re-define after jmp |
| 106 #endif | 114 #endif |
| 107 //======================================================================== | 115 //======================================================================== |
| 108 | 116 |
| 109 masterEnv = _VMSMasterEnv; | 117 masterEnv = _VMSMasterEnv; |
| 110 | 118 |
| 111 //TODO: check that compiles so that always re-define from frame-storage | 119 //GCC may optimize so doesn't always re-define from frame-storage |
| 112 masterPr = volatileMasterPr; //just to make sure after jmp | 120 masterPr = volatileMasterPr; //just to make sure after jmp |
| 113 thisCoresIdx = masterPr->coreAnimatedBy; | 121 thisCoresIdx = masterPr->coreAnimatedBy; |
| 114 readyToAnimateQ = masterEnv->readyToAnimateQs[thisCoresIdx]; | 122 readyToAnimateQ = masterEnv->readyToAnimateQs[thisCoresIdx]; |
| 115 schedSlots = masterEnv->allSchedSlots[thisCoresIdx]; | 123 schedSlots = masterEnv->allSchedSlots[thisCoresIdx]; |
| 116 | 124 |
| 118 slaveScheduler = masterEnv->slaveScheduler; | 126 slaveScheduler = masterEnv->slaveScheduler; |
| 119 semanticEnv = masterEnv->semanticEnv; | 127 semanticEnv = masterEnv->semanticEnv; |
| 120 | 128 |
| 121 | 129 |
| 122 //Poll each slot's Done flag | 130 //Poll each slot's Done flag |
| 131 numSlotsFilled = 0; | |
| 123 for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++) | 132 for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++) |
| 124 { | 133 { |
| 125 currSlot = schedSlots[ slotIdx ]; | 134 currSlot = schedSlots[ slotIdx ]; |
| 126 | 135 |
| 127 if( currSlot->workIsDone ) | 136 if( currSlot->workIsDone ) |
| 139 | 148 |
| 140 if( schedVirtPr != NULL ) | 149 if( schedVirtPr != NULL ) |
| 141 { currSlot->procrAssignedToSlot = schedVirtPr; | 150 { currSlot->procrAssignedToSlot = schedVirtPr; |
| 142 schedVirtPr->schedSlot = currSlot; | 151 schedVirtPr->schedSlot = currSlot; |
| 143 currSlot->needsProcrAssigned = FALSE; | 152 currSlot->needsProcrAssigned = FALSE; |
| 144 | 153 numSlotsFilled += 1; |
| 145 writeSRSWQ( schedVirtPr, readyToAnimateQ ); | 154 |
| 155 writeVMSQ( schedVirtPr, readyToAnimateQ ); | |
| 146 } | 156 } |
| 147 } | 157 } |
| 148 } | 158 } |
| 149 | 159 |
| 150 | 160 |
| 151 //Save stack ptr and frame, restore CoreLoop's stack and frame, | 161 #ifdef USE_WORK_STEALING |
| 152 // and clear the MasterLock | 162 //If no slots filled, means no more work, look for work to steal. |
| 153 //TODO: carefully verify don't need to force saving anything to stack | 163 if( numSlotsFilled == 0 ) |
| 154 // before jumping back to core loop. | 164 { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterPr ); |
| 155 void *stackPtrAddr, *framePtrAddr, *masterLockAddr; | 165 } |
| 156 void *jmpPt, *coreLoopFramePtr, *coreLoopStackPtr; | 166 #endif |
| 157 | 167 |
| 158 stackPtrAddr = &(masterPr->stackPtr); | |
| 159 framePtrAddr = &(masterPr->framePtr); | |
| 160 masterLockAddr = &(_VMSMasterEnv->masterLock); | |
| 161 | |
| 162 jmpPt = _VMSMasterEnv->coreLoopStartPt; | |
| 163 coreLoopFramePtr = masterPr->coreLoopFramePtr;//need this only | |
| 164 coreLoopStackPtr = masterPr->coreLoopStackPtr;//shouldn't need -- safety | |
| 165 | 168 |
| 166 #ifdef MEAS__TIME_MASTER | 169 #ifdef MEAS__TIME_MASTER |
| 167 saveLowTimeStampCountInto( masterPr->endMasterTSCLow ); | 170 saveLowTimeStampCountInto( masterPr->endMasterTSCLow ); |
| 168 #endif | 171 #endif |
| 169 | 172 |
| 170 asm volatile("movl %0, %%eax; \ | 173 |
| 171 movl %%esp, (%%eax); \ | 174 masterSwitchToCoreLoop( masterPr ) |
| 172 movl %1, %%eax; \ | |
| 173 movl %%ebp, (%%eax); \ | |
| 174 movl %2, %%ebx; \ | |
| 175 movl %3, %%eax; \ | |
| 176 movl %4, %%esp; \ | |
| 177 movl %5, %%ebp; \ | |
| 178 movl $0x0, (%%ebx); \ | |
| 179 jmp %%eax;" \ | |
| 180 /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr), \ | |
| 181 "=g"(masterLockAddr) \ | |
| 182 /* inputs */ : "g" (jmpPt), "g"(coreLoopStackPtr), "g"(coreLoopFramePtr)\ | |
| 183 /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \ | |
| 184 );//can probably make clobber list empty -- but safe for now | |
| 185 } | 175 } |
| 186 | 176 |
| 177 | |
| 178 | |
| 179 /*This has a race condition -- the coreloops are accessing their own queues | |
| 180 * at the same time that this work-stealer on a different core is trying to | |
| 181 */ | |
| 182 void inline | |
| 183 stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ, | |
| 184 VirtProcr *masterPr ) | |
| 185 { | |
| 186 VirtProcr *stolenPr; | |
| 187 int32 coreIdx, i; | |
| 188 VMSQueueStruc *currQ; | |
| 189 | |
| 190 stolenPr = NULL; | |
| 191 coreIdx = masterPr->coreAnimatedBy; | |
| 192 for( i = 0; i < NUM_CORES -1; i++ ) | |
| 193 { | |
| 194 if( coreIdx >= NUM_CORES -1 ) | |
| 195 { coreIdx = 0; | |
| 196 } | |
| 197 else | |
| 198 { coreIdx++; | |
| 199 } | |
| 200 currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx]; | |
| 201 if( numInVMSQ( currQ ) > 0 ) | |
| 202 { stolenPr = readVMSQ (currQ ); | |
| 203 break; | |
| 204 } | |
| 205 } | |
| 206 | |
| 207 if( stolenPr != NULL ) | |
| 208 { currSlot->procrAssignedToSlot = stolenPr; | |
| 209 stolenPr->schedSlot = currSlot; | |
| 210 currSlot->needsProcrAssigned = FALSE; | |
| 211 | |
| 212 writeVMSQ( stolenPr, readyToAnimateQ ); | |
| 213 } | |
| 214 } | |
| 215 | |
| 216 /*This algorithm makes the common case fast. Make the coreloop passive, | |
| 217 * and show its progress. Make the stealer control a gate that coreloop | |
| 218 * has to pass. | |
| 219 *To avoid interference, only one stealer at a time. Use a global | |
| 220 * stealer-lock. | |
| 221 * | |
| 222 *The pattern is based on a gate -- stealer shuts the gate, then monitors | |
| 223 * to be sure any already past make it all the way out, before starting. | |
| 224 *So, have a "progress" measure just before the gate, then have two after it, | |
| 225 * one is in a "waiting room" outside the gate, the other is at the exit. | |
| 226 *Then, the stealer first shuts the gate, then checks the progress measure | |
| 227 * outside it, then looks to see if the progress measure at the exit is the | |
| 228 * same. If yes, it knows the protected area is empty 'cause no other way | |
| 229 * to get in and the last to get in also exited. | |
| 230 *If the progress measure at the exit is not the same, then the stealer goes | |
| 231 * into a loop checking both the waiting-area and the exit progress-measures | |
| 232 * until one of them shows the same as the measure outside the gate. Might | |
| 233 * as well re-read the measure outside the gate each go around, just to be | |
| 234 * sure. It is guaranteed that one of the two will eventually match the one | |
| 235 * outside the gate. | |
| 236 * | |
| 237 *Here's an informal proof of correctness: | |
| 238 *The gate can be closed at any point, and have only four cases: | |
| 239 * 1) coreloop made it past the gate-closing but not yet past the exit | |
| 240 * 2) coreloop made it past the pre-gate progress update but not yet past | |
| 241 * the gate, | |
| 242 * 3) coreloop is right before the pre-gate update | |
| 243 * 4) coreloop is past the exit and far from the pre-gate update. | |
| 244 * | |
| 245 * Covering the cases in reverse order, | |
| 246 * 4) is not a problem -- stealer will read pre-gate progress, see that it | |
| 247 * matches exit progress, and the gate is closed, so stealer can proceed. | |
| 248 * 3) stealer will read pre-gate progress just after coreloop updates it.. | |
| 249 * so stealer goes into a loop until the coreloop causes wait-progress | |
| 250 * to match pre-gate progress, so then stealer can proceed | |
| 251 * 2) same as 3.. | |
| 252 * 1) stealer reads pre-gate progress, sees that it's different than exit, | |
| 253 * so goes into loop until exit matches pre-gate, now it knows coreloop | |
| 254 * is not in protected and cannot get back in, so can proceed. | |
| 255 * | |
| 256 *Implementation for the stealer: | |
| 257 * | |
| 258 *First, acquire the stealer lock -- only cores with no work to do will | |
| 259 * compete to steal, so not a big performance penalty having only one -- | |
| 260 * will rarely have multiple stealers in a system with plenty of work -- and | |
| 261 * in a system with little work, it doesn't matter. | |
| 262 * | |
| 263 *Note, have single-reader, single-writer pattern for all variables used to | |
| 264 * communicate between stealer and victims | |
| 265 * | |
| 266 *So, scan the queues of the core loops, until find non-empty. Each core | |
| 267 * has its own list that it scans. The list goes in order from closest to | |
| 268 * furthest core, so it steals first from close cores. Later can add | |
| 269 * taking info from the app about overlapping footprints, and scan all the | |
| 270 * others then choose work with the most footprint overlap with the contents | |
| 271 * of this core's cache. | |
| 272 * | |
| 273 *Now, have a victim want to take work from. So, shut the gate in that | |
| 274 * coreloop, by setting the "gate closed" var on its stack to TRUE. | |
| 275 *Then, read the core's pre-gate progress and compare to the core's exit | |
| 276 * progress. | |
| 277 *If same, can proceed to take work from the coreloop's queue. When done, | |
| 278 * write FALSE to gate closed var. | |
| 279 *If different, then enter a loop that reads the pre-gate progress, then | |
| 280 * compares to exit progress then to wait progress. When one of two | |
| 281 * matches, proceed. Take work from the coreloop's queue. When done, | |
| 282 * write FALSE to the gate closed var. | |
| 283 * | |
| 284 */ | |
| 285 void inline | |
| 286 gateProtected_stealWorkInto( SchedSlot *currSlot, | |
| 287 VMSQueueStruc *myReadyToAnimateQ, | |
| 288 VirtProcr *masterPr ) | |
| 289 { | |
| 290 VirtProcr *stolenPr; | |
| 291 int32 coreIdx, i, haveAVictim, gotLock; | |
| 292 VMSQueueStruc *victimsQ; | |
| 293 | |
| 294 volatile GateStruc *vicGate; | |
| 295 int32 coreMightBeInProtected; | |
| 296 | |
| 297 | |
| 298 | |
| 299 //see if any other cores have work available to steal | |
| 300 haveAVictim = FALSE; | |
| 301 coreIdx = masterPr->coreAnimatedBy; | |
| 302 for( i = 0; i < NUM_CORES -1; i++ ) | |
| 303 { | |
| 304 if( coreIdx >= NUM_CORES -1 ) | |
| 305 { coreIdx = 0; | |
| 306 } | |
| 307 else | |
| 308 { coreIdx++; | |
| 309 } | |
| 310 victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx]; | |
| 311 if( numInVMSQ( victimsQ ) > 0 ) | |
| 312 { haveAVictim = TRUE; | |
| 313 vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ]; | |
| 314 break; | |
| 315 } | |
| 316 } | |
| 317 if( !haveAVictim ) return; //no work to steal, exit | |
| 318 | |
| 319 //have a victim core, now get the stealer-lock | |
| 320 gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock), | |
| 321 UNLOCKED, LOCKED ); | |
| 322 if( !gotLock ) return; //go back to core loop, which will re-start master | |
| 323 | |
| 324 | |
| 325 //====== Start Gate-protection ======= | |
| 326 vicGate->gateClosed = TRUE; | |
| 327 coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress; | |
| 328 while( coreMightBeInProtected ) | |
| 329 { //wait until sure | |
| 330 if( vicGate->preGateProgress == vicGate->waitProgress ) | |
| 331 coreMightBeInProtected = FALSE; | |
| 332 if( vicGate->preGateProgress == vicGate->exitProgress ) | |
| 333 coreMightBeInProtected = FALSE; | |
| 334 } | |
| 335 | |
| 336 stolenPr = readVMSQ ( victimsQ ); | |
| 337 | |
| 338 vicGate->gateClosed = FALSE; | |
| 339 //======= End Gate-protection ======= | |
| 340 | |
| 341 | |
| 342 if( stolenPr != NULL ) //victim could have been in protected and taken | |
| 343 { currSlot->procrAssignedToSlot = stolenPr; | |
| 344 stolenPr->schedSlot = currSlot; | |
| 345 currSlot->needsProcrAssigned = FALSE; | |
| 346 | |
| 347 writeVMSQ( stolenPr, myReadyToAnimateQ ); | |
| 348 } | |
| 349 | |
| 350 //unlock the work stealing lock | |
| 351 _VMSMasterEnv->workStealingLock = UNLOCKED; | |
| 352 } |
