# HG changeset patch # User Some Random Person # Date 1331444933 28800 # Node ID 82f7defac8517258c324bf033d091f50f27245bd # Parent ecbdb74cad970b205c06999b082f24331f619fa9 Added backoff to core loop, cleaned up core loop code, cleaned comments diff -r ecbdb74cad97 -r 82f7defac851 CoreLoop.c --- a/CoreLoop.c Sat Mar 10 20:38:25 2012 -0800 +++ b/CoreLoop.c Sat Mar 10 21:48:53 2012 -0800 @@ -16,6 +16,12 @@ //===================== Functions local to this file ======================= void *terminateCoreController(SlaveVP *currSlv); +inline void +doBackoff_for_TooLongToGetLock( int32 numTriesToGetLock, uint32 *seed1, + uint32 *seed2 ); +inline void +doBackoff_for_TooLongWithNoWork( int32 numRepsWithNoWork, uint32 *seed1, + uint32 *seed2 ); //=========================================================================== @@ -75,7 +81,6 @@ //Variables used during measurements TSCountLowHigh endSusp; //Variables used in random-backoff, for master-lock and waiting for work - volatile double workspace1,workspace2; //busy-wait fake work uint32_t seed1 = rand()%1000; // init random number generator for retries uint32_t seed2 = rand()%1000; //Variable for work-stealing -- a gate protects a critical section @@ -137,6 +142,7 @@ if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster; currSlot = schedSlots[ currSlotIdx ]; + if( ! currSlot->needsSlaveAssigned ) //slot does have slave assigned { numRepetitionsWithNoWork = 0; //reset B2B master count currSlotIdx ++; @@ -150,9 +156,20 @@ MEAS__Capture_Pre_Master_Lock_Point; - int tries = 0; int gotLock = 0; + int numTriesToGetLock = 0; int gotLock = 0; while( currVP == NULL ) //keep going until get master lock { + //At this point, first thing to do is get lock. But, want to + // reduce lock contention from cores with no work, so first + // check if this is a core with no work, and busy wait if so. 
+ //Then, if it's been way too long without work, yield pthread + if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_BACKOFF) + doBackoff_for_TooLongWithNoWork( numRepetitionsWithNoWork, &seed1, &seed2 ); + if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD ) + { numRepetitionsWithNoWork = 0; pthread_yield(); } + + + //Now, try to get the lock gotLock = __sync_bool_compare_and_swap( addrOfMasterLock, UNLOCKED, LOCKED ); if( gotLock ) @@ -164,16 +181,16 @@ // done, the masterVP will use assembly to switch the core // back to animating this core controller currVP = thisCoresMasterVP; - if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD ) - { DEBUG_Print( dbgB2BMaster,"Lots of reps w/o work\n"); - pthread_yield(); - } numRepetitionsWithNoWork += 1; break; //end while -- have a VP to animate now } + //Get here only when failed to get lock - tries++; //if too many, means too much contention - if( tries > MASTERLOCK_RETRIES_BEFORE_YIELD ) { tries = 0; pthread_yield(); } + numTriesToGetLock++; //if too many, means too much contention + if( numTriesToGetLock > NUM_TRIES_BEFORE_DO_BACKOFF ) + doBackoff_for_TooLongToGetLock( numTriesToGetLock, &seed1, &seed2 ); + if( numTriesToGetLock > MASTERLOCK_RETRIES_BEFORE_YIELD ) + { numTriesToGetLock = 0; pthread_yield(); } } MEAS__Capture_Post_Master_Lock_Point; } @@ -198,6 +215,53 @@ } +/*Used by the backoff to pick a random amount of busy-wait. Can't use the + * system rand because it takes much too long. 
+ *Note: pointers to the seeds are passed in, and the seeds are modified + */ +inline uint32_t +randomNumber(uint32_t* seed1, uint32_t* seed2) + { + *seed1 = 36969 * (*seed1 & 65535) + (*seed1 >> 16); + *seed2 = 18000 * (*seed2 & 65535) + (*seed2 >> 16); + return (*seed1 << 16) + *seed2; + } + +/*Busy-wait for a random number of cycles -- chooses number of cycles + * differently than for the too-many-tries-to-get-lock backoff + */ +inline void +doBackoff_for_TooLongWithNoWork( int32 numRepsWithNoWork, uint32 *seed1, + uint32 *seed2 ) + { int32 i, waitIterations; + volatile double fakeWorkVar = 0.0; //busy-wait fake work, zeroed so first read is defined + + waitIterations = + randomNumber(seed1, seed2) % + (numRepsWithNoWork * numRepsWithNoWork * NUM_CORES); + for( i = 0; i < waitIterations; i++ ) + { fakeWorkVar += (fakeWorkVar + 32.0) / 2.0; //busy-wait + } + } + +/*Busy-waits for a random number of cycles -- chooses number of cycles + * differently than for the no-work backoff + */ +inline void +doBackoff_for_TooLongToGetLock( int32 numTriesToGetLock, uint32 *seed1, + uint32 *seed2 ) + { int32 i, waitIterations; + volatile double fakeWorkVar = 0.0; //busy-wait fake work, zeroed so first read is defined + + waitIterations = + randomNumber(seed1, seed2) % + (numTriesToGetLock * NUM_TRIES_TO_GET_LOCK_BACKOFF_WEIGHT); + //addToHist( wait_iterations, coreLoopThdParams->wait_iterations_hist ); + for( i = 0; i < waitIterations; i++ ) + { fakeWorkVar += (fakeWorkVar + 32.0) / 2.0; //busy-wait + } + } + #ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE diff -r ecbdb74cad97 -r 82f7defac851 Defines/VMS_defs__HW_constants.h --- a/Defines/VMS_defs__HW_constants.h Sat Mar 10 20:38:25 2012 -0800 +++ b/Defines/VMS_defs__HW_constants.h Sat Mar 10 21:48:53 2012 -0800 @@ -20,23 +20,25 @@ // when work-stealing, can make bigger, at risk of losing cache affinity #define NUM_SCHED_SLOTS 3 -#define MIN_WORK_UNIT_CYCLES 20000 - -#define NUM_REPS_W_NO_WORK_BEFORE_YIELD 10 -#define MASTERLOCK_RETRIES_BEFORE_YIELD 100 - + //These are for backoff inside core-loop, which reduces lock 
contention +#define NUM_REPS_W_NO_WORK_BEFORE_YIELD 10 +#define NUM_REPS_W_NO_WORK_BEFORE_BACKOFF 2 +#define MASTERLOCK_RETRIES_BEFORE_YIELD 100 +#define NUM_TRIES_BEFORE_DO_BACKOFF 10 +#define NUM_TRIES_TO_GET_LOCK_BACKOFF_WEIGHT 100 + // stack size in virtual processors created #define VIRT_PROCR_STACK_SIZE 0x8000 /* 32K */ - // memory for VMS_WL__malloc + // memory for VMS_int__malloc #define MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE 0x40000000 /* 1G */ //Frequency of TS counts -- have to do tests to verify //NOTE: turn off (in BIOS) TURBO-BOOST and SPEED-STEP else won't be const #define TSCOUNT_FREQ 3180000000 -#define CACHE_LINE_SZ 256 -#define PAGE_SIZE 4096 +#define CACHE_LINE_SZ 256 +#define PAGE_SIZE 4096 //To prevent false-sharing, aligns a variable to a cache-line boundary. //No need to use for local vars because those are never shared between cores