# HG changeset patch # User Some Random Person # Date 1331658126 25200 # Node ID c88ce1db91ef3c39f00a390d13f29d754876aca3 # Parent 8059fb8d5465aba70aa50a537b60d55f25b8d3f8 Compiles, but does not run properly -- and changed MasterLoop to SchedulingMaster diff -r 8059fb8d5465 -r c88ce1db91ef CoreController.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CoreController.c Tue Mar 13 10:02:06 2012 -0700 @@ -0,0 +1,333 @@ +/* + * Copyright 2010 OpenSourceStewardshipFoundation + * + * Licensed under BSD + */ + + +#include "VMS.h" + +#include +#include +#include + +#include +#include + +//===================== Functions local to this file ======================= +void *terminateCoreController(SlaveVP *currSlv); +inline void +doBackoff_for_TooLongToGetLock( int32 numTriesToGetLock, uint32 *seed1, + uint32 *seed2 ); +inline void +doBackoff_for_TooLongWithNoWork( int32 numRepsWithNoWork, uint32 *seed1, + uint32 *seed2 ); + +//=========================================================================== + + +/*The Core Controller is logically "beneath" the masterVP and slave VPs. Its + * job is to control which of those VPs the core animates. Any time one of + * those VPs suspends, the suspend-primitive switches the core over to + * animating the core controller. The core controller then follows a very + * basic pattern to choose which VP will get animated next, then switches + * the core over to animating that VP. So, all VPs switch the core to + * core controller, which then chooses which VP the core animates next. + * + *The way the core controller decides which VP to switch the core to next is: + * 1) There are a number of "scheduling slots", which the master VP fills up + * with slave VPs that are ready to be animated. So, the core controller + * just iterates through the scheduling slots. When the next slot has a + * slave VP in it, the core controller switches the core over to animate + * that slave. 
+ * 2) When the core controller checks a scheduling slot, and it's empty, + * then the controller switches the core over to animating the master VP, + * whose job is to find more slave VPs ready, and assign those to + * scheduling slots. + * + *So, in effect, a scheduling slot functions as another layer of virtual + * processor. A slot has the logical meaning of being an animator that + * animates the slave assigned to it. However, the core controller sits + * below the slots, and sequences down them, assigning the actual physical + * core to each slot, in turn. + *The reason for having the scheduling slots and core controller is to + * amortize the overhead of switching to the master VP and running it. With + * multiple scheduling slots, the time to switch-to-master and the code in + * the master loop is divided by the number of scheduling slots. + *The core controller and scheduling slots are not fundamental parts of VMS, + * but rather optimizations put into the shared-semantic-state version of + * VMS. Other versions of VMS will not have a core controller nor scheduling + * slots. + * + *The core controller "owns" the physical core, in effect, and is the + * function given to the pthread creation call. Hence, it contains code + * related to pthread startup, synchronizing the controllers to all start + * at the same time-point, and pinning the pthreads to physical cores. 
+ * + */ +void * +coreController( void *paramsIn ) + { + int32 thisCoresIdx; + int32 numRepetitionsWithNoWork; + SlaveVP *currVP; + SchedSlot *currSlot, **schedSlots; + int32 currSlotIdx; + volatile int32 *addrOfMasterLock; //thing pointed to is volatile, not ptr + SlaveVP *thisCoresMasterVP; + //Variables used for pthread related things + ThdParams *coreCtlrThdParams; + cpu_set_t coreMask; //used during pinning pthread to CPU core + int32 errorCode; + //Variables used during measurements + TSCountLowHigh endSusp; + //Variables used in random-backoff, for master-lock and waiting for work + uint32_t seed1 = rand()%1000; // init random number generator for retries + uint32_t seed2 = rand()%1000; + //Variable for work-stealing -- a gate protects a critical section + volatile GateStruc gate; //on stack to avoid false-sharing + + + //=============== Initializations =================== + coreCtlrThdParams = (ThdParams *)paramsIn; + thisCoresIdx = coreCtlrThdParams->coreNum; + + gate.gateClosed = FALSE; + gate.preGateProgress = 0; + gate.waitProgress = 0; + gate.exitProgress = 0; + //TODO: pad these to prevent false-sharing, and fix the race at startup + _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = (GateStruc*)&gate; + + //Assembly that saves addr of label of return instr -- label in assmbly + recordCoreCtlrReturnLabelAddr((void**)&(_VMSMasterEnv->coreCtlrReturnPt)); + + schedSlots = _VMSMasterEnv->allSchedSlots[thisCoresIdx]; + currSlotIdx = 0; //start at slot 0, go up until one empty, then do master + numRepetitionsWithNoWork = 0; + addrOfMasterLock = &(_VMSMasterEnv->masterLock); + thisCoresMasterVP = _VMSMasterEnv->masterVPs[thisCoresIdx]; + + //==================== pthread related stuff ====================== + //pin the pthread to the core + //Linux requires pinning to be done inside the thread-function + //Designate a core by a 1 in bit-position corresponding to the core + CPU_ZERO(&coreMask); //initialize mask bits to zero + 
CPU_SET(coreCtlrThdParams->coreNum,&coreMask); //set bit repr the coreNum + pthread_t selfThd = pthread_self(); + errorCode = + pthread_setaffinity_np( selfThd, sizeof(coreMask), &coreMask); + if(errorCode){ printf("\n pinning thd to core failed \n"); exit(0); } + + //make sure the controllers all start at same time, by making them wait + pthread_mutex_lock( &suspendLock ); + while( !(_VMSMasterEnv->setupComplete) ) + { pthread_cond_wait( &suspendCond, &suspendLock ); + } + pthread_mutex_unlock( &suspendLock ); + + //====================== The Core Controller ====================== + while(1) //An endless loop is just one way of doing the control structure + { //Assembly code switches the core between animating a VP and + // animating this core controller. The switch is done by + // changing the stack-pointer and frame-pointer and then doing + // an assembly jmp. When reading this code, the effect is + // that the "switchToSlv()" at the end of the loop is sort of a + // "warp in time" -- the core disappears inside this, jmps to + // animating a VP, and when that VP suspends, the suspend + // jmps back. This has the effect of "returning" from the + // switchToSlv() call. Then control loops back to here. + //Alternatively, the VP suspend primitive could just not bother + // returning from switchToSlv, and instead jmp directly to here. + + if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster; + currSlot = schedSlots[ currSlotIdx ]; + + + if( ! 
currSlot->needsSlaveAssigned ) //slot does have slave assigned + { numRepetitionsWithNoWork = 0; //reset B2B master count + currSlotIdx ++; + currVP = currSlot->slaveAssignedToSlot; + } + else //slot is empty, so switch to master + { + switchToMaster: + currSlotIdx = 0; //doing switch to master, so start over at slot 0 + currVP = NULL; + + MEAS__Capture_Pre_Master_Lock_Point; + + int numTriesToGetLock = 0; int gotLock = 0; + while( currVP == NULL ) //keep going until get master lock + { + //At this point, first thing to do is get lock. But, want to + // reduce lock contention from cores with no work, so first + // check if this is a core with no work, and busy wait if so. + //Then, if it's been way too long without work, yield pthread + if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_BACKOFF) + doBackoff_for_TooLongWithNoWork( numRepetitionsWithNoWork, &seed1, &seed2 ); + if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD ) + { numRepetitionsWithNoWork = 0; pthread_yield(); } + + + //Now, try to get the lock + gotLock = __sync_bool_compare_and_swap( addrOfMasterLock, + UNLOCKED, LOCKED ); + if( gotLock ) + { //At this point, have run out of slaves, so tried to get + // the master lock, and have successfully gotten it. + //So, set the currVP to this core's masterVP and break out + // of the get-lock loop. Below, assembly code will switch + // the core over to animating the masterVP. 
When it's + // done, the masterVP will use assembly to switch the core + // back to animating this core controller + currVP = thisCoresMasterVP; + numRepetitionsWithNoWork += 1; + break; //end while -- have a VP to animate now + } + //Get here only when failed to get lock + + numTriesToGetLock++; //if too many, means too much contention + if( numTriesToGetLock > NUM_TRIES_BEFORE_DO_BACKOFF ) + doBackoff_for_TooLongToGetLock( numTriesToGetLock, &seed1, &seed2 ); + if( numTriesToGetLock > MASTERLOCK_RETRIES_BEFORE_YIELD ) + { numTriesToGetLock = 0; pthread_yield(); } + } + MEAS__Capture_Post_Master_Lock_Point; + } + + + switchToSlv(currVP); //Slave suspend makes core "return" from this call + flushRegisters(); //prevent GCC optimization from doing bad things + + MEAS__Capture_End_Susp_in_CoreCtlr_ForSys; + + }//while(1) + } + + +void * +terminateCoreCtlr(SlaveVP *currSlv) + { + //first free shutdown Slv that jumped here -- it first restores the + // coreloop's stack, so addr of currSlv in stack frame is still correct + VMS_int__dissipate_slaveVP( currSlv ); + pthread_exit( NULL ); + } + + +/*Used by the backoff to pick a random amount of busy-wait. Can't use the + * system rand because it takes much too long. 
+ *Note, are passing pointers to the seeds, which are then modified + */ +inline uint32_t +randomNumber(uint32_t* seed1, uint32_t* seed2) + { + *seed1 = 36969 * (*seed1 & 65535) + (*seed1 >> 16); + *seed2 = 18000 * (*seed2 & 65535) + (*seed2 >> 16); + return (*seed1 << 16) + *seed2; + } + +/*Busy-wait for a random number of cycles -- chooses number of cycles + * differently than for the too-many-tries-to-get-lock backoff + */ +inline void +doBackoff_for_TooLongWithNoWork( int32 numRepsWithNoWork, uint32 *seed1, + uint32 *seed2 ) + { int32 i, waitIterations; + volatile double fakeWorkVar; //busy-wait fake work + + waitIterations = + randomNumber(seed1, seed2) % + (numRepsWithNoWork * numRepsWithNoWork * NUM_CORES); + for( i = 0; i < waitIterations; i++ ) + { fakeWorkVar += (fakeWorkVar + 32.0) / 2.0; //busy-wait + } + } + +/*Busy-waits for a random number of cycles -- chooses number of cycles + * differently than for the no-work backoff + */ +inline void +doBackoff_for_TooLongToGetLock( int32 numTriesToGetLock, uint32 *seed1, + uint32 *seed2 ) + { int32 i, waitIterations; + volatile double fakeWorkVar; //busy-wait fake work + + waitIterations = + randomNumber(seed1, seed2) % + (numTriesToGetLock * NUM_TRIES_TO_GET_LOCK_BACKOFF_WEIGHT); + //addToHist( wait_iterations, coreLoopThdParams->wait_iterations_hist ); + for( i = 0; i < waitIterations; i++ ) + { fakeWorkVar += (fakeWorkVar + 32.0) / 2.0; //busy-wait + } + } + + +#ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE + +//=========================================================================== +/*This sequential version does the same as threaded, except doesn't do the + * pin-threads part, nor the wait until setup complete and acquire master + * lock parts. 
+ */ +void * +coreCtlr_Seq( void *paramsIn ) + { + int32 thisCoresIdx; + int32 numRepetitionsWithNoWork; + SlaveVP *currVP; + SchedSlot *currSlot, **schedSlots; + int32 currSlotIdx; + int32 *addrOfMasterLock; + SlaveVP *thisCoresMasterVP; + + //=============== Initializations =================== + schedSlots = _VMSMasterEnv->allSchedSlots[thisCoresIdx]; + currSlotIdx = 0; //start at slot 0, go up until one empty, then do master + numRepetitionsWithNoWork = 0; + addrOfMasterLock = &(_VMSMasterEnv->masterLock); + thisCoresMasterVP = _VMSMasterEnv->masterVPs[thisCoresIdx]; + + thisCoresIdx = 0; //sequential version + + //Assembly that saves addr of label of return instr -- label in assmbly + recordCoreCtlrReturnLabelAddr((void**)&(_VMSMasterEnv->coreCtlrReturnPt)); + + + //====================== The Core Controller ====================== + while(1) + { + if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster; + currSlot = schedSlots[ currSlotIdx ]; + + if( ! currSlot->needsSlaveAssigned ) //slot does have slave assigned + { numRepetitionsWithNoWork = 0; //reset B2B master count + currSlotIdx ++; + currVP = currSlot->slaveAssignedToSlot; + } + else //slot is empty, so switch to master + { + switchToMaster: + currSlotIdx = 0; //doing switch to master, so start over at slot 0 + + currVP = thisCoresMasterVP; + + MEAS__Capture_Pre_Master_Lock_Point; //back to back because + MEAS__Capture_Post_Master_Lock_Point; // sequential version + + if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD ) + { printf("Lots of reps w/o work\n"); + exit(0); //if no work, no way to ever get it in sequential! 
+ } + numRepetitionsWithNoWork += 1; + } + + switchToSlv(currVP); //Slave suspend makes core "return" from this call + flushRegisters(); //prevent GCC optimization from doing bad things + + MEAS__Capture_End_Susp_in_CoreCtlr_ForSys; + + } //while(1) + } +#endif diff -r 8059fb8d5465 -r c88ce1db91ef CoreLoop.c --- a/CoreLoop.c Mon Mar 12 05:38:07 2012 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,332 +0,0 @@ -/* - * Copyright 2010 OpenSourceStewardshipFoundation - * - * Licensed under BSD - */ - - -#include "VMS.h" - -#include -#include -#include - -#include -#include - -//===================== Functions local to this file ======================= -void *terminateCoreController(SlaveVP *currSlv); -inline void -doBackoff_for_TooLongToGetLock( int32 numTriesToGetLock, uint32 *seed1, - uint32 *seed2 ); -inline void -doBackoff_for_TooLongWithNoWork( int32 numRepsWithNoWork, uint32 *seed1, - uint32 *seed2 ); -//=========================================================================== - - -/*The Core Controller is logically "beneath" the masterVP and slave VPs. Its - * job is to control which of those VPs the core animates. Any time one of - * those VPs suspends, the suspend-primitive switches the core over to - * animating the core controller. The core controller then follows a very - * basic pattern to choose which VP will get animated next, then switches - * the core over to animating that VP. So, all VPs switch the core to - * core controller, which then chooses which VP the core animates next. - * - *The way the core controller decides which VP to switch the core to next is: - * 1) There are a number of "scheduling slots", which the master VP fills up - * with slave VPs that are ready to be animated. So, the core controller - * just iterates through the scheduling slots. When the next slot has a - * slave VP in it, the core controller switches the core over to animate - * that slave. 
- * 2) When the core controller checks a scheduling slot, and it's empty, - * then the controller switches the core over to animating the master VP, - * whose job is to find more slave VPs ready, and assign those to - * scheduling slots. - * - *So, in effect, a scheduling slot functions as another layer of virtual - * processor. A slot has the logical meaning of being an animator that - * animates the slave assigned to it. However, the core controller sits - * below the slots, and sequences down them, assigning the actual physical - * core to each slot, in turn. - *The reason for having the scheduling slots and core controller is to - * amortize the overhead of switching to the master VP and running it. With - * multiple scheduling slots, the time to switch-to-master and the code in - * the master loop is divided by the number of scheduling slots. - *The core controller and scheduling slots are not fundamental parts of VMS, - * but rather optimizations put into the shared-semantic-state version of - * VMS. Other versions of VMS will not have a core controller nor scheduling - * slots. - * - *The core controller "owns" the physical core, in effect, and is the - * function given to the pthread creation call. Hence, it contains code - * related to pthread startup, synchronizing the controllers to all start - * at the same time-point, and pinning the pthreads to physical cores. 
- * - */ -void * -coreController( void *paramsIn ) - { - int32 thisCoresIdx; - int32 numRepetitionsWithNoWork; - SlaveVP *currVP; - SchedSlot *currSlot, **schedSlots; - int32 currSlotIdx; - volatile int32 *addrOfMasterLock; //thing pointed to is volatile, not ptr - SlaveVP *thisCoresMasterVP; - //Variables used for pthread related things - ThdParams *coreCtlrThdParams; - cpu_set_t coreMask; //used during pinning pthread to CPU core - int32 errorCode; - //Variables used during measurements - TSCountLowHigh endSusp; - //Variables used in random-backoff, for master-lock and waiting for work - uint32_t seed1 = rand()%1000; // init random number generator for retries - uint32_t seed2 = rand()%1000; - //Variable for work-stealing -- a gate protects a critical section - volatile GateStruc gate; //on stack to avoid false-sharing - - - //=============== Initializations =================== - coreCtlrThdParams = (ThdParams *)paramsIn; - thisCoresIdx = coreCtlrThdParams->coreNum; - - gate.gateClosed = FALSE; - gate.preGateProgress = 0; - gate.waitProgress = 0; - gate.exitProgress = 0; - //TODO: pad these to prevent false-sharing, and fix the race at startup - _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = (GateStruc*)&gate; - - //Assembly that saves addr of label of return instr -- label in assmbly - recordCoreCtlrReturnLabelAddr((void**)&(_VMSMasterEnv->coreCtlrReturnPt)); - - schedSlots = _VMSMasterEnv->allSchedSlots[thisCoresIdx]; - currSlotIdx = 0; //start at slot 0, go up until one empty, then do master - numRepetitionsWithNoWork = 0; - addrOfMasterLock = &(_VMSMasterEnv->masterLock); - thisCoresMasterVP = _VMSMasterEnv->masterVPs[thisCoresIdx]; - - //==================== pthread related stuff ====================== - //pin the pthread to the core - //Linux requires pinning to be done inside the thread-function - //Designate a core by a 1 in bit-position corresponding to the core - CPU_ZERO(&coreMask); //initialize mask bits to zero - 
CPU_SET(coreCtlrThdParams->coreNum,&coreMask); //set bit repr the coreNum - pthread_t selfThd = pthread_self(); - errorCode = - pthread_setaffinity_np( selfThd, sizeof(coreMask), &coreMask); - if(errorCode){ printf("\n pinning thd to core failed \n"); exit(0); } - - //make sure the controllers all start at same time, by making them wait - pthread_mutex_lock( &suspendLock ); - while( !(_VMSMasterEnv->setupComplete) ) - { pthread_cond_wait( &suspendCond, &suspendLock ); - } - pthread_mutex_unlock( &suspendLock ); - - //====================== The Core Controller ====================== - while(1) //An endless loop is just one way of doing the control structure - { //Assembly code switches the core between animating a VP and - // animating this core controller. The switch is done by - // changing the stack-pointer and frame-pointer and then doing - // an assembly jmp. When reading this code, the effect is - // that the "switchToSlv()" at the end of the loop is sort of a - // "warp in time" -- the core disappears inside this, jmps to - // animating a VP, and when that VP suspends, the suspend - // jmps back. This has the effect of "returning" from the - // switchToSlv() call. Then control loops back to here. - //Alternatively, the VP suspend primitive could just not bother - // returning from switchToSlv, and instead jmp directly to here. - - if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster; - currSlot = schedSlots[ currSlotIdx ]; - - - if( ! 
currSlot->needsSlaveAssigned ) //slot does have slave assigned - { numRepetitionsWithNoWork = 0; //reset B2B master count - currSlotIdx ++; - currVP = currSlot->slaveAssignedToSlot; - } - else //slot is empty, so switch to master - { - switchToMaster: - currSlotIdx = 0; //doing switch to master, so start over at slot 0 - currVP = NULL; - - MEAS__Capture_Pre_Master_Lock_Point; - - int numTriesToGetLock = 0; int gotLock = 0; - while( currVP == NULL ) //keep going until get master lock - { - //At this point, first thing to do is get lock. But, want to - // reduce lock contention from cores with no work, so first - // check if this is a core with no work, and busy wait if so. - //Then, if it's been way too long without work, yield pthread - if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_BACKOFF) - doBackoff_for_TooLongWithNoWork( numRepetitionsWithNoWork, &seed1, &seed2 ); - if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD ) - { numRepetitionsWithNoWork = 0; pthread_yield(); } - - - //Now, try to get the lock - gotLock = __sync_bool_compare_and_swap( addrOfMasterLock, - UNLOCKED, LOCKED ); - if( gotLock ) - { //At this point, have run out of slaves, so tried to get - // the master lock, and have successfully gotten it. - //So, set the currVP to this core's masterVP and break out - // of the get-lock loop. Below, assembly code will switch - // the core over to animating the masterVP. 
When it's - // done, the masterVP will use assembly to switch the core - // back to animating this core controller - currVP = thisCoresMasterVP; - numRepetitionsWithNoWork += 1; - break; //end while -- have a VP to animate now - } - //Get here only when failed to get lock - - numTriesToGetLock++; //if too many, means too much contention - if( numTriesToGetLock > NUM_TRIES_BEFORE_DO_BACKOFF ) - doBackoff_for_TooLongToGetLock( numTriesToGetLock, &seed1, &seed2 ); - if( numTriesToGetLock > MASTERLOCK_RETRIES_BEFORE_YIELD ) - { numTriesToGetLock = 0; pthread_yield(); } - } - MEAS__Capture_Post_Master_Lock_Point; - } - - - switchToSlv(currVP); //Slave suspend makes core "return" from this call - flushRegisters(); //prevent GCC optimization from doing bad things - - MEAS__Capture_End_Susp_in_CoreCtlr_ForSys; - - }//while(1) - } - - -void * -terminateCoreController(SlaveVP *currSlv) - { - //first free shutdown Slv that jumped here -- it first restores the - // coreloop's stack, so addr of currSlv in stack frame is still correct - VMS_int__dissipate_slaveVP( currSlv ); - pthread_exit( NULL ); - } - - -/*Used by the backoff to pick a random amount of busy-wait. Can't use the - * system rand because it takes much too long. 
- *Note, are passing pointers to the seeds, which are then modified - */ -inline uint32_t -randomNumber(uint32_t* seed1, uint32_t* seed2) - { - *seed1 = 36969 * (*seed1 & 65535) + (*seed1 >> 16); - *seed2 = 18000 * (*seed2 & 65535) + (*seed2 >> 16); - return (*seed1 << 16) + *seed2; - } - -/*Busy-wait for a random number of cycles -- chooses number of cycles - * differently than for the too-many-tries-to-get-lock backoff - */ -inline void -doBackoff_for_TooLongWithNoWork( int32 numRepsWithNoWork, uint32 *seed1, - uint32 *seed2 ) - { int32 i, waitIterations; - volatile double fakeWorkVar; //busy-wait fake work - - waitIterations = - randomNumber(seed1, seed2) % - (numRepsWithNoWork * numRepsWithNoWork * NUM_CORES); - for( i = 0; i < waitIterations; i++ ) - { fakeWorkVar += (fakeWorkVar + 32.0) / 2.0; //busy-wait - } - } - -/*Busy-waits for a random number of cycles -- chooses number of cycles - * differently than for the no-work backoff - */ -inline void -doBackoff_for_TooLongToGetLock( int32 numTriesToGetLock, uint32 *seed1, - uint32 *seed2 ) - { int32 i, waitIterations; - volatile double fakeWorkVar; //busy-wait fake work - - waitIterations = - randomNumber(seed1, seed2) % - (numTriesToGetLock * NUM_TRIES_TO_GET_LOCK_BACKOFF_WEIGHT); - //addToHist( wait_iterations, coreLoopThdParams->wait_iterations_hist ); - for( i = 0; i < waitIterations; i++ ) - { fakeWorkVar += (fakeWorkVar + 32.0) / 2.0; //busy-wait - } - } - - -#ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE - -//=========================================================================== -/*This sequential version does the same as threaded, except doesn't do the - * pin-threads part, nor the wait until setup complete and acquire master - * lock parts. 
- */ -void * -coreCtlr_Seq( void *paramsIn ) - { - int32 thisCoresIdx; - int32 numRepetitionsWithNoWork; - SlaveVP *currVP; - SchedSlot *currSlot, **schedSlots; - int32 currSlotIdx; - int32 *addrOfMasterLock; - SlaveVP *thisCoresMasterVP; - - //=============== Initializations =================== - schedSlots = _VMSMasterEnv->allSchedSlots[thisCoresIdx]; - currSlotIdx = 0; //start at slot 0, go up until one empty, then do master - numRepetitionsWithNoWork = 0; - addrOfMasterLock = &(_VMSMasterEnv->masterLock); - thisCoresMasterVP = _VMSMasterEnv->masterVPs[thisCoresIdx]; - - thisCoresIdx = 0; //sequential version - - //Assembly that saves addr of label of return instr -- label in assmbly - recordCoreCtlrReturnLabelAddr((void**)&(_VMSMasterEnv->coreCtlrReturnPt)); - - - //====================== The Core Controller ====================== - while(1) - { - if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster; - currSlot = schedSlots[ currSlotIdx ]; - - if( ! currSlot->needsSlaveAssigned ) //slot does have slave assigned - { numRepetitionsWithNoWork = 0; //reset B2B master count - currSlotIdx ++; - currVP = currSlot->slaveAssignedToSlot; - } - else //slot is empty, so switch to master - { - switchToMaster: - currSlotIdx = 0; //doing switch to master, so start over at slot 0 - - currVP = thisCoresMasterVP; - - MEAS__Capture_Pre_Master_Lock_Point; //back to back because - MEAS__Capture_Post_Master_Lock_Point; // sequential version - - if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD ) - { printf("Lots of reps w/o work\n"); - exit(0); //if no work, no way to ever get it in sequential! 
- } - numRepetitionsWithNoWork += 1; - } - - switchToSlv(currVP); //Slave suspend makes core "return" from this call - flushRegisters(); //prevent GCC optimization from doing bad things - - MEAS__Capture_End_Susp_in_CoreCtlr_ForSys; - - } //while(1) - } -#endif diff -r 8059fb8d5465 -r c88ce1db91ef Defines/VMS_defs__MEAS.h --- a/Defines/VMS_defs__MEAS.h Mon Mar 12 05:38:07 2012 -0700 +++ b/Defines/VMS_defs__MEAS.h Tue Mar 13 10:02:06 2012 -0700 @@ -6,8 +6,8 @@ * */ -#ifndef _VMS_DEFS_MEAS_H -#define _VMS_DEFS_MEAS_H +#ifndef _VMS_DEFS_MEAS_H +#define _VMS_DEFS_MEAS_H #define _GNU_SOURCE //================== Macros define types of meas want ===================== @@ -321,5 +321,5 @@ //=========================================================================== -#endif /* _VMS_DEFS_H */ +#endif /* _VMS_DEFS_MEAS_H */ diff -r 8059fb8d5465 -r c88ce1db91ef Defines/VMS_defs__turn_on_and_off.h --- a/Defines/VMS_defs__turn_on_and_off.h Mon Mar 12 05:38:07 2012 -0700 +++ b/Defines/VMS_defs__turn_on_and_off.h Tue Mar 13 10:02:06 2012 -0700 @@ -15,7 +15,7 @@ * It still does co-routines and all the mechanisms are the same, it just * has only a single thread and animates Slvs one at a time */ -//#define DEBUG__TURN_ON_SEQUENTIAL_MODE +#define DEBUG__TURN_ON_SEQUENTIAL_MODE /*turns on the probe-instrumentation in the application -- when not diff -r 8059fb8d5465 -r c88ce1db91ef MasterLoop.c --- a/MasterLoop.c Mon Mar 12 05:38:07 2012 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,349 +0,0 @@ -/* - * Copyright 2010 OpenSourceStewardshipFoundation - * - * Licensed under BSD - */ - - - -#include -#include - -#include "VMS.h" - - -//=========================================================================== -void inline -stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ, - SlaveVP *masterVP ); - -//=========================================================================== - - - -/*This code is animated by the virtual Master processor. 
- * - *Polls each sched slot exactly once, hands any requests made by a newly - * done slave to the "request handler" plug-in function - * - *Any slots that need a Slv assigned are given to the "schedule" - * plug-in function, which tries to assign a Slv (slave) to it. - * - *When all slots needing a processor have been given to the schedule plug-in, - * a fraction of the slaves successfully scheduled are put into the - * work queue, then a continuation of this function is put in, then the rest - * of the Slvs that were successfully scheduled. - * - *The first thing the continuation does is busy-wait until the previous - * animation completes. This is because an (unlikely) continuation may - * sneak through queue before previous continuation is done putting second - * part of scheduled slaves in, which is the only race condition. - * - */ - -/*May 29, 2010 -- birth a Master during init so that first core controller to - * start running gets it and does all the stuff for a newly born -- - * from then on, will be doing continuation, but do suspension self - * directly at end of master loop - *So VMS_WL__init just births the master virtual processor same way it births - * all the others -- then does any extra setup needed and puts it into the - * work queue. - *However means have to make masterEnv a global static volatile the same way - * did with readyToAnimateQ in core controller. -- for performance, put the - * jump to the core controller directly in here, and have it directly jump back. - * - * - *Aug 18, 2010 -- Going to a separate MasterVP for each core, to see if this - * avoids the suspected bug in the system stack that causes bizarre faults - * at random places in the system code. - * - *So, this function is coupled to each of the MasterVPs, -- meaning this - * function can't rely on a particular stack and frame -- each MasterVP that - * animates this function has a different one. 
- * - *At this point, the masterLoop does not write itself into the queue anymore, - * instead, the coreCtlr acquires the masterLock when it has nothing to - * animate, and then animates its own masterLoop. However, still try to put - * several AppSlvs into the queue to amortize the startup cost of switching - * to the MasterVP. Note, don't have to worry about latency of requests much - * because most requests generate work for same core -- only latency issue - * is case when other cores starved and one core's requests generate work - * for them -- so keep max in queue to 3 or 4.. - */ -void masterLoop( void *initData, SlaveVP *animatingSlv ) - { - int32 slotIdx, numSlotsFilled; - SlaveVP *schedSlaveVP; - SchedSlot *currSlot, **schedSlots; - MasterEnv *masterEnv; - VMSQueueStruc *readyToAnimateQ; - - Sched_Assigner slaveAssigner; - RequestHandler requestHandler; - void *semanticEnv; - - int32 thisCoresIdx; - SlaveVP *masterVP; - volatile SlaveVP *volatileMasterVP; - - volatileMasterVP = animatingSlv; - masterVP = (SlaveVP*)volatileMasterVP; //used to force re-define after jmp - - //First animation of each MasterVP will in turn animate this part - // of setup code.. (Slv creator sets up the stack as if this function - // was called normally, but actually get here by jmp) - //So, setup values about stack ptr, jmp pt and all that - //masterVP->resumeInstrPtr = &&masterLoopStartPt; - - - //Note, got rid of writing the stack and frame ptr up here, because - // only one - // core can ever animate a given MasterVP, so don't need to communicate - // new frame and stack ptr to the MasterVP storage before a second - // version of that MasterVP can get animated on a different core. - //Also got rid of the busy-wait. 
- - - //masterLoopStartPt: - while(1){ - - MEAS__Capture_Pre_Master_Point - - masterEnv = (MasterEnv*)_VMSMasterEnv; - - //GCC may optimize so doesn't always re-define from frame-storage - masterVP = (SlaveVP*)volatileMasterVP; //just to make sure after jmp - thisCoresIdx = masterVP->coreAnimatedBy; - readyToAnimateQ = masterEnv->readyToAnimateQs[thisCoresIdx]; - schedSlots = masterEnv->allSchedSlots[thisCoresIdx]; - - requestHandler = masterEnv->requestHandler; - slaveAssigner = masterEnv->slaveAssigner; - semanticEnv = masterEnv->semanticEnv; - - - //Poll each slot's Done flag - numSlotsFilled = 0; - for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++) - { - currSlot = schedSlots[ slotIdx ]; - - if( currSlot->workIsDone ) - { - currSlot->workIsDone = FALSE; - currSlot->needsSlaveAssigned = TRUE; - - MEAS__startReqHdlr; - - //process the requests made by the slave (held inside slave struc) - (*requestHandler)( currSlot->slaveAssignedToSlot, semanticEnv ); - - MEAS__endReqHdlr; - } - if( currSlot->needsSlaveAssigned ) - { //give slot a new Slv - schedSlaveVP = - (*slaveAssigner)( semanticEnv, thisCoresIdx ); - - if( schedSlaveVP != NULL ) - { currSlot->slaveAssignedToSlot = schedSlaveVP; - schedSlaveVP->schedSlot = currSlot; - currSlot->needsSlaveAssigned = FALSE; - numSlotsFilled += 1; - } - } - } - - - #ifdef SYS__TURN_ON_WORK_STEALING - //If no slots filled, means no more work, look for work to steal. 
- if( numSlotsFilled == 0 ) - { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterVP ); - } - #endif - - MEAS__Capture_Post_Master_Point; - - masterSwitchToCoreCtlr(animatingSlv); - flushRegisters(); - }//MasterLoop - - - } - - - -/*This has a race condition -- the coreloops are accessing their own queues - * at the same time that this work-stealer on a different core is trying to - */ -void inline -stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ, - SlaveVP *masterVP ) - { - SlaveVP *stolenSlv; - int32 coreIdx, i; - VMSQueueStruc *currQ; - - stolenSlv = NULL; - coreIdx = masterVP->coreAnimatedBy; - for( i = 0; i < NUM_CORES -1; i++ ) - { - if( coreIdx >= NUM_CORES -1 ) - { coreIdx = 0; - } - else - { coreIdx++; - } - currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx]; - if( numInVMSQ( currQ ) > 0 ) - { stolenSlv = readVMSQ (currQ ); - break; - } - } - - if( stolenSlv != NULL ) - { currSlot->slaveAssignedToSlot = stolenSlv; - stolenSlv->schedSlot = currSlot; - currSlot->needsSlaveAssigned = FALSE; - - writeVMSQ( stolenSlv, readyToAnimateQ ); - } - } - -/*This algorithm makes the common case fast. Make the coreloop passive, - * and show its progress. Make the stealer control a gate that coreloop - * has to pass. - *To avoid interference, only one stealer at a time. Use a global - * stealer-lock. - * - *The pattern is based on a gate -- stealer shuts the gate, then monitors - * to be sure any already past make it all the way out, before starting. - *So, have a "progress" measure just before the gate, then have two after it, - * one is in a "waiting room" outside the gate, the other is at the exit. - *Then, the stealer first shuts the gate, then checks the progress measure - * outside it, then looks to see if the progress measure at the exit is the - * same. If yes, it knows the protected area is empty 'cause no other way - * to get in and the last to get in also exited. 
- *If the progress measure at the exit is not the same, then the stealer goes - * into a loop checking both the waiting-area and the exit progress-measures - * until one of them shows the same as the measure outside the gate. Might - * as well re-read the measure outside the gate each go around, just to be - * sure. It is guaranteed that one of the two will eventually match the one - * outside the gate. - * - *Here's an informal proof of correctness: - *The gate can be closed at any point, and have only four cases: - * 1) coreloop made it past the gate-closing but not yet past the exit - * 2) coreloop made it past the pre-gate progress update but not yet past - * the gate, - * 3) coreloop is right before the pre-gate update - * 4) coreloop is past the exit and far from the pre-gate update. - * - * Covering the cases in reverse order, - * 4) is not a problem -- stealer will read pre-gate progress, see that it - * matches exit progress, and the gate is closed, so stealer can proceed. - * 3) stealer will read pre-gate progress just after coreloop updates it.. - * so stealer goes into a loop until the coreloop causes wait-progress - * to match pre-gate progress, so then stealer can proceed - * 2) same as 3.. - * 1) stealer reads pre-gate progress, sees that it's different than exit, - * so goes into loop until exit matches pre-gate, now it knows coreloop - * is not in protected and cannot get back in, so can proceed. - * - *Implementation for the stealer: - * - *First, acquire the stealer lock -- only cores with no work to do will - * compete to steal, so not a big performance penalty having only one -- - * will rarely have multiple stealers in a system with plenty of work -- and - * in a system with little work, it doesn't matter. - * - *Note, have single-reader, single-writer pattern for all variables used to - * communicate between stealer and victims - * - *So, scan the queues of the core controllers, until find non-empty. 
Each core - * has its own list that it scans. The list goes in order from closest to - * furthest core, so it steals first from close cores. Later can add - * taking info from the app about overlapping footprints, and scan all the - * others then choose work with the most footprint overlap with the contents - * of this core's cache. - * - *Now, have a victim want to take work from. So, shut the gate in that - * coreloop, by setting the "gate closed" var on its stack to TRUE. - *Then, read the core's pre-gate progress and compare to the core's exit - * progress. - *If same, can proceed to take work from the coreloop's queue. When done, - * write FALSE to gate closed var. - *If different, then enter a loop that reads the pre-gate progress, then - * compares to exit progress then to wait progress. When one of two - * matches, proceed. Take work from the coreloop's queue. When done, - * write FALSE to the gate closed var. - * - */ -void inline -gateProtected_stealWorkInto( SchedSlot *currSlot, - VMSQueueStruc *myReadyToAnimateQ, - SlaveVP *masterVP ) - { - SlaveVP *stolenSlv; - int32 coreIdx, i, haveAVictim, gotLock; - VMSQueueStruc *victimsQ; - - volatile GateStruc *vicGate; - int32 coreMightBeInProtected; - - - - //see if any other cores have work available to steal - haveAVictim = FALSE; - coreIdx = masterVP->coreAnimatedBy; - for( i = 0; i < NUM_CORES -1; i++ ) - { - if( coreIdx >= NUM_CORES -1 ) - { coreIdx = 0; - } - else - { coreIdx++; - } - victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx]; - if( numInVMSQ( victimsQ ) > 0 ) - { haveAVictim = TRUE; - vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ]; - break; - } - } - if( !haveAVictim ) return; //no work to steal, exit - - //have a victim core, now get the stealer-lock - gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock), - UNLOCKED, LOCKED ); - if( !gotLock ) return; //go back to core controller, which will re-start master - - - //====== Start Gate-protection ======= - 
vicGate->gateClosed = TRUE; - coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress; - while( coreMightBeInProtected ) - { //wait until sure - if( vicGate->preGateProgress == vicGate->waitProgress ) - coreMightBeInProtected = FALSE; - if( vicGate->preGateProgress == vicGate->exitProgress ) - coreMightBeInProtected = FALSE; - } - - stolenSlv = readVMSQ ( victimsQ ); - - vicGate->gateClosed = FALSE; - //======= End Gate-protection ======= - - - if( stolenSlv != NULL ) //victim could have been in protected and taken - { currSlot->slaveAssignedToSlot = stolenSlv; - stolenSlv->schedSlot = currSlot; - currSlot->needsSlaveAssigned = FALSE; - - writeVMSQ( stolenSlv, myReadyToAnimateQ ); - } - - //unlock the work stealing lock - _VMSMasterEnv->workStealingLock = UNLOCKED; - } diff -r 8059fb8d5465 -r c88ce1db91ef Probes/probes.c --- a/Probes/probes.c Mon Mar 12 05:38:07 2012 -0700 +++ b/Probes/probes.c Tue Mar 13 10:02:06 2012 -0700 @@ -298,7 +298,6 @@ VMS_impl__print_stats_of_all_probes() { forAllInDynArrayDo( _VMSMasterEnv->dynIntervalProbesInfo, - &VMS_impl__print_stats_of_probe ); + (DynArrayFnPtr) &VMS_impl__print_stats_of_probe ); fflush( stdout ); } -typedef void (*DynArrayFnPtr) ( void * ); //fn has to cast void * diff -r 8059fb8d5465 -r c88ce1db91ef Probes/probes.h --- a/Probes/probes.h Mon Mar 12 05:38:07 2012 -0700 +++ b/Probes/probes.h Tue Mar 13 10:02:06 2012 -0700 @@ -107,7 +107,7 @@ VMS_impl__record_interval_end_in_probe( int32 probeID ); void -VMS_impl__print_stats_of_probe( IntervalProbe *probe ) +VMS_impl__print_stats_of_probe( IntervalProbe *probe ); void VMS_impl__print_stats_of_all_probes(); diff -r 8059fb8d5465 -r c88ce1db91ef SchedulingMaster.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SchedulingMaster.c Tue Mar 13 10:02:06 2012 -0700 @@ -0,0 +1,349 @@ +/* + * Copyright 2010 OpenSourceStewardshipFoundation + * + * Licensed under BSD + */ + + + +#include +#include + +#include "VMS.h" + + 
+//===========================================================================
+void inline
+stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
+               SlaveVP *masterVP );
+
+void inline
+gateProtected_stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *myReadyToAnimateQ,
+               SlaveVP *masterVP );
+//===========================================================================
+
+/*This code is animated by the virtual Master processor.
+ *
+ *Polls each sched slot exactly once, hands any requests made by a newly
+ * done slave to the "request handler" plug-in function
+ *
+ *Any slots that need a Slv assigned are given to the "schedule"
+ * plug-in function, which tries to assign a Slv (slave) to it.
+ *
+ *When all slots needing a processor have been given to the schedule plug-in,
+ * a fraction of the slaves successfully scheduled are put into the
+ * work queue, then a continuation of this function is put in, then the rest
+ * of the Slvs that were successfully scheduled.
+ *
+ *The first thing the continuation does is busy-wait until the previous
+ * animation completes.  This is because an (unlikely) continuation may
+ * sneak through queue before previous continuation is done putting second
+ * part of scheduled slaves in, which is the only race condition.
+ *
+ */
+
+/*May 29, 2010 -- birth a Master during init so that first core controller to
+ * start running gets it and does all the stuff for a newly born --
+ * from then on, will be doing continuation, but do suspension self
+ * directly at end of master loop
+ *So VMS_WL__init just births the master virtual processor same way it births
+ * all the others -- then does any extra setup needed and puts it into the
+ * work queue.
+ *However means have to make masterEnv a global static volatile the same way
+ * did with readyToAnimateQ in core controller. -- for performance, put the
+ * jump to the core controller directly in here, and have it directly jump back.
+ * + * + *Aug 18, 2010 -- Going to a separate MasterVP for each core, to see if this + * avoids the suspected bug in the system stack that causes bizarre faults + * at random places in the system code. + * + *So, this function is coupled to each of the MasterVPs, -- meaning this + * function can't rely on a particular stack and frame -- each MasterVP that + * animates this function has a different one. + * + *At this point, the schedulingMaster does not write itself into the queue anymore, + * instead, the coreCtlr acquires the masterLock when it has nothing to + * animate, and then animates its own schedulingMaster. However, still try to put + * several AppSlvs into the queue to amortize the startup cost of switching + * to the MasterVP. Note, don't have to worry about latency of requests much + * because most requests generate work for same core -- only latency issue + * is case when other cores starved and one core's requests generate work + * for them -- so keep max in queue to 3 or 4.. + */ +void schedulingMaster( void *initData, SlaveVP *animatingSlv ) + { + int32 slotIdx, numSlotsFilled; + SlaveVP *schedSlaveVP; + SchedSlot *currSlot, **schedSlots; + MasterEnv *masterEnv; + VMSQueueStruc *readyToAnimateQ; + + Sched_Assigner slaveAssigner; + RequestHandler requestHandler; + void *semanticEnv; + + int32 thisCoresIdx; + SlaveVP *masterVP; + volatile SlaveVP *volatileMasterVP; + + volatileMasterVP = animatingSlv; + masterVP = (SlaveVP*)volatileMasterVP; //used to force re-define after jmp + + //First animation of each MasterVP will in turn animate this part + // of setup code.. 
(Slv creator sets up the stack as if this function + // was called normally, but actually get here by jmp) + //So, setup values about stack ptr, jmp pt and all that + //masterVP->resumeInstrPtr = &&schedulingMasterStartPt; + + + //Note, got rid of writing the stack and frame ptr up here, because + // only one + // core can ever animate a given MasterVP, so don't need to communicate + // new frame and stack ptr to the MasterVP storage before a second + // version of that MasterVP can get animated on a different core. + //Also got rid of the busy-wait. + + + //schedulingMasterStartPt: + while(1){ + + MEAS__Capture_Pre_Master_Point + + masterEnv = (MasterEnv*)_VMSMasterEnv; + + //GCC may optimize so doesn't always re-define from frame-storage + masterVP = (SlaveVP*)volatileMasterVP; //just to make sure after jmp + thisCoresIdx = masterVP->coreAnimatedBy; + readyToAnimateQ = masterEnv->readyToAnimateQs[thisCoresIdx]; + schedSlots = masterEnv->allSchedSlots[thisCoresIdx]; + + requestHandler = masterEnv->requestHandler; + slaveAssigner = masterEnv->slaveAssigner; + semanticEnv = masterEnv->semanticEnv; + + + //Poll each slot's Done flag + numSlotsFilled = 0; + for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++) + { + currSlot = schedSlots[ slotIdx ]; + + if( currSlot->workIsDone ) + { + currSlot->workIsDone = FALSE; + currSlot->needsSlaveAssigned = TRUE; + + MEAS__startReqHdlr; + + //process the requests made by the slave (held inside slave struc) + (*requestHandler)( currSlot->slaveAssignedToSlot, semanticEnv ); + + MEAS__endReqHdlr; + } + if( currSlot->needsSlaveAssigned ) + { //give slot a new Slv + schedSlaveVP = + (*slaveAssigner)( semanticEnv, thisCoresIdx ); + + if( schedSlaveVP != NULL ) + { currSlot->slaveAssignedToSlot = schedSlaveVP; + schedSlaveVP->schedSlot = currSlot; + currSlot->needsSlaveAssigned = FALSE; + numSlotsFilled += 1; + } + } + } + + + #ifdef SYS__TURN_ON_WORK_STEALING + //If no slots filled, means no more work, look for work to steal. 
+ if( numSlotsFilled == 0 ) + { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterVP ); + } + #endif + + MEAS__Capture_Post_Master_Point; + + masterSwitchToCoreCtlr(animatingSlv); + flushRegisters(); + }//MasterLoop + + + } + + + +/*This has a race condition -- the coreloops are accessing their own queues + * at the same time that this work-stealer on a different core is trying to + */ +void inline +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ, + SlaveVP *masterVP ) + { + SlaveVP *stolenSlv; + int32 coreIdx, i; + VMSQueueStruc *currQ; + + stolenSlv = NULL; + coreIdx = masterVP->coreAnimatedBy; + for( i = 0; i < NUM_CORES -1; i++ ) + { + if( coreIdx >= NUM_CORES -1 ) + { coreIdx = 0; + } + else + { coreIdx++; + } + currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx]; + if( numInVMSQ( currQ ) > 0 ) + { stolenSlv = readVMSQ (currQ ); + break; + } + } + + if( stolenSlv != NULL ) + { currSlot->slaveAssignedToSlot = stolenSlv; + stolenSlv->schedSlot = currSlot; + currSlot->needsSlaveAssigned = FALSE; + + writeVMSQ( stolenSlv, readyToAnimateQ ); + } + } + +/*This algorithm makes the common case fast. Make the coreloop passive, + * and show its progress. Make the stealer control a gate that coreloop + * has to pass. + *To avoid interference, only one stealer at a time. Use a global + * stealer-lock. + * + *The pattern is based on a gate -- stealer shuts the gate, then monitors + * to be sure any already past make it all the way out, before starting. + *So, have a "progress" measure just before the gate, then have two after it, + * one is in a "waiting room" outside the gate, the other is at the exit. + *Then, the stealer first shuts the gate, then checks the progress measure + * outside it, then looks to see if the progress measure at the exit is the + * same. If yes, it knows the protected area is empty 'cause no other way + * to get in and the last to get in also exited. 
+ *If the progress measure at the exit is not the same, then the stealer goes + * into a loop checking both the waiting-area and the exit progress-measures + * until one of them shows the same as the measure outside the gate. Might + * as well re-read the measure outside the gate each go around, just to be + * sure. It is guaranteed that one of the two will eventually match the one + * outside the gate. + * + *Here's an informal proof of correctness: + *The gate can be closed at any point, and have only four cases: + * 1) coreloop made it past the gate-closing but not yet past the exit + * 2) coreloop made it past the pre-gate progress update but not yet past + * the gate, + * 3) coreloop is right before the pre-gate update + * 4) coreloop is past the exit and far from the pre-gate update. + * + * Covering the cases in reverse order, + * 4) is not a problem -- stealer will read pre-gate progress, see that it + * matches exit progress, and the gate is closed, so stealer can proceed. + * 3) stealer will read pre-gate progress just after coreloop updates it.. + * so stealer goes into a loop until the coreloop causes wait-progress + * to match pre-gate progress, so then stealer can proceed + * 2) same as 3.. + * 1) stealer reads pre-gate progress, sees that it's different than exit, + * so goes into loop until exit matches pre-gate, now it knows coreloop + * is not in protected and cannot get back in, so can proceed. + * + *Implementation for the stealer: + * + *First, acquire the stealer lock -- only cores with no work to do will + * compete to steal, so not a big performance penalty having only one -- + * will rarely have multiple stealers in a system with plenty of work -- and + * in a system with little work, it doesn't matter. + * + *Note, have single-reader, single-writer pattern for all variables used to + * communicate between stealer and victims + * + *So, scan the queues of the core controllers, until find non-empty. 
Each core + * has its own list that it scans. The list goes in order from closest to + * furthest core, so it steals first from close cores. Later can add + * taking info from the app about overlapping footprints, and scan all the + * others then choose work with the most footprint overlap with the contents + * of this core's cache. + * + *Now, have a victim want to take work from. So, shut the gate in that + * coreloop, by setting the "gate closed" var on its stack to TRUE. + *Then, read the core's pre-gate progress and compare to the core's exit + * progress. + *If same, can proceed to take work from the coreloop's queue. When done, + * write FALSE to gate closed var. + *If different, then enter a loop that reads the pre-gate progress, then + * compares to exit progress then to wait progress. When one of two + * matches, proceed. Take work from the coreloop's queue. When done, + * write FALSE to the gate closed var. + * + */ +void inline +gateProtected_stealWorkInto( SchedSlot *currSlot, + VMSQueueStruc *myReadyToAnimateQ, + SlaveVP *masterVP ) + { + SlaveVP *stolenSlv; + int32 coreIdx, i, haveAVictim, gotLock; + VMSQueueStruc *victimsQ; + + volatile GateStruc *vicGate; + int32 coreMightBeInProtected; + + + + //see if any other cores have work available to steal + haveAVictim = FALSE; + coreIdx = masterVP->coreAnimatedBy; + for( i = 0; i < NUM_CORES -1; i++ ) + { + if( coreIdx >= NUM_CORES -1 ) + { coreIdx = 0; + } + else + { coreIdx++; + } + victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx]; + if( numInVMSQ( victimsQ ) > 0 ) + { haveAVictim = TRUE; + vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ]; + break; + } + } + if( !haveAVictim ) return; //no work to steal, exit + + //have a victim core, now get the stealer-lock + gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock), + UNLOCKED, LOCKED ); + if( !gotLock ) return; //go back to core controller, which will re-start master + + + //====== Start Gate-protection ======= + 
vicGate->gateClosed = TRUE; + coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress; + while( coreMightBeInProtected ) + { //wait until sure + if( vicGate->preGateProgress == vicGate->waitProgress ) + coreMightBeInProtected = FALSE; + if( vicGate->preGateProgress == vicGate->exitProgress ) + coreMightBeInProtected = FALSE; + } + + stolenSlv = readVMSQ ( victimsQ ); + + vicGate->gateClosed = FALSE; + //======= End Gate-protection ======= + + + if( stolenSlv != NULL ) //victim could have been in protected and taken + { currSlot->slaveAssignedToSlot = stolenSlv; + stolenSlv->schedSlot = currSlot; + currSlot->needsSlaveAssigned = FALSE; + + writeVMSQ( stolenSlv, myReadyToAnimateQ ); + } + + //unlock the work stealing lock + _VMSMasterEnv->workStealingLock = UNLOCKED; + } diff -r 8059fb8d5465 -r c88ce1db91ef VMS.h --- a/VMS.h Mon Mar 12 05:38:07 2012 -0700 +++ b/VMS.h Tue Mar 13 10:02:06 2012 -0700 @@ -201,7 +201,7 @@ void * coreController( void *paramsIn ); //standard PThreads fn prototype void * coreCtlr_Seq( void *paramsIn ); //standard PThreads fn prototype -void masterLoop( void *initData, SlaveVP *masterVP ); +void schedulingMaster( void *initData, SlaveVP *masterVP ); typedef struct @@ -215,10 +215,11 @@ volatile MasterEnv *_VMSMasterEnv __align_to_cacheline__; -pthread_t coreCtlrThdHandles[ NUM_CORES ]; //pthread's virt-procr state +pthread_t coreCtlrThdHandles[ NUM_CORES ]; //pthread's virt-procr state ThdParams *coreCtlrThdParams [ NUM_CORES ]; -pthread_mutex_t suspendLock = PTHREAD_MUTEX_INITIALIZER; -pthread_cond_t suspendCond = PTHREAD_COND_INITIALIZER; + +pthread_mutex_t suspendLock; +pthread_cond_t suspendCond; //========================= Function Prototypes =========================== diff -r 8059fb8d5465 -r c88ce1db91ef VMS__startup_and_shutdown.c --- a/VMS__startup_and_shutdown.c Mon Mar 12 05:38:07 2012 -0700 +++ b/VMS__startup_and_shutdown.c Tue Mar 13 10:02:06 2012 -0700 @@ -10,6 +10,7 @@ #include #include #include +#include 
#include "VMS.h" @@ -43,7 +44,7 @@ * the master Slv into the work-queue, ready for first "call" * 2) Semantic layer then does its own init, which creates the seed virt * slave inside the semantic layer, ready to schedule it when - * asked by the first run of the masterLoop. + * asked by the first run of the schedulingMaster. * *This part is bit weird because VMS really wants to be "always there", and * have applications attach and detach.. for now, this VMS is part of @@ -51,7 +52,7 @@ * *The semantic layer is isolated from the VMS internals by making the * semantic layer do setup to a state that it's ready with its - * initial Slvs, ready to schedule them to slots when the masterLoop + * initial Slvs, ready to schedule them to slots when the schedulingMaster * asks. Without this pattern, the semantic layer's setup would * have to modify slots directly to assign the initial virt-procrs, and put * them into the readyToAnimateQ itself, breaking the isolation completely. @@ -71,7 +72,7 @@ { #ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE create_masterEnv(); - flushRegisters(); //? not sure why here -- merten added it..? + printf( "\n\n Running in SEQUENTIAL mode \n\n" ); #else create_masterEnv(); create_the_coreCtlr_OS_threads(); @@ -292,7 +293,7 @@ readyToAnimateQs[ coreIdx ] = makeVMSQ(); //Q: should give masterVP core-specific info as its init data? 
- masterVPs[ coreIdx ] = VMS_int__create_slaveVP( (TopLevelFnPtr)&masterLoop, (void*)masterEnv ); + masterVPs[ coreIdx ] = VMS_int__create_slaveVP( (TopLevelFnPtr)&schedulingMaster, (void*)masterEnv ); masterVPs[ coreIdx ]->coreAnimatedBy = coreIdx; allSchedSlots[ coreIdx ] = create_sched_slots(); //makes for one core _VMSMasterEnv->numMasterInARow[ coreIdx ] = 0; @@ -426,6 +427,8 @@ //get lock, to lock out any threads still starting up -- they'll see // that setupComplete is true before entering while loop, and so never // wait on the condition + pthread_mutex_init( &suspendLock, NULL ); + pthread_cond_init( &suspendCond, NULL ); pthread_mutex_lock( &suspendLock ); _VMSMasterEnv->setupComplete = 1; pthread_mutex_unlock( &suspendLock ); diff -r 8059fb8d5465 -r c88ce1db91ef VMS_primitive_data_types.h --- a/VMS_primitive_data_types.h Mon Mar 12 05:38:07 2012 -0700 +++ b/VMS_primitive_data_types.h Tue Mar 13 10:02:06 2012 -0700 @@ -7,8 +7,8 @@ */ -#ifndef _PRIMITIVE_DATA_TYPES_H -#define _PRIMITIVE_DATA_TYPES_H +#ifndef _PRIMITIVE_DATA_TYPES_H +#define _PRIMITIVE_DATA_TYPES_H /*For portability, need primitive data types that have a well defined diff -r 8059fb8d5465 -r c88ce1db91ef vmalloc.c --- a/vmalloc.c Mon Mar 12 05:38:07 2012 -0700 +++ b/vmalloc.c Tue Mar 13 10:02:06 2012 -0700 @@ -15,7 +15,7 @@ #include #include "VMS.h" -#include "C_Libraries/Histogram/Histogram.h" +#include "Histogram/Histogram.h" #define MAX_UINT64 0xFFFFFFFFFFFFFFFF