changeset 218:82f7defac851 Common_Ancestor

Added backoff to core loop, cleaned up core loop code, cleaned comments
author Some Random Person <seanhalle@yahoo.com>
date Sat, 10 Mar 2012 21:48:53 -0800
parents ecbdb74cad97
children 8059fb8d5465
files CoreLoop.c Defines/VMS_defs__HW_constants.h
diffstat 2 files changed, 82 insertions(+), 16 deletions(-) [+]
line diff
     1.1 --- a/CoreLoop.c	Sat Mar 10 20:38:25 2012 -0800
     1.2 +++ b/CoreLoop.c	Sat Mar 10 21:48:53 2012 -0800
     1.3 @@ -16,6 +16,12 @@
     1.4  
     1.5  //=====================  Functions local to this file =======================
     1.6  void *terminateCoreController(SlaveVP *currSlv);
     1.7 +inline void
     1.8 +doBackoff_for_TooLongToGetLock( int32  numTriesToGetLock, uint32 *seed1, 
     1.9 +                                uint32 *seed2 );
    1.10 +inline void
    1.11 +doBackoff_for_TooLongWithNoWork( int32   numRepsWithNoWork, uint32 *seed1, 
    1.12 +                                 uint32 *seed2 );
    1.13  //===========================================================================
    1.14  
    1.15  
    1.16 @@ -75,7 +81,6 @@
    1.17        //Variables used during measurements
    1.18     TSCountLowHigh  endSusp;
    1.19        //Variables used in random-backoff, for master-lock and waiting for work
    1.20 -   volatile double workspace1,workspace2; //busy-wait fake work
    1.21     uint32_t seed1 = rand()%1000; // init random number generator for retries
    1.22     uint32_t seed2 = rand()%1000;
    1.23        //Variable for work-stealing -- a gate protects a critical section
    1.24 @@ -137,6 +142,7 @@
    1.25        if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster;
    1.26        currSlot = schedSlots[ currSlotIdx ];
    1.27  
    1.28 +      
    1.29        if( ! currSlot->needsSlaveAssigned ) //slot does have slave assigned
    1.30         { numRepetitionsWithNoWork = 0;     //reset B2B master count
    1.31           currSlotIdx ++;
    1.32 @@ -150,9 +156,20 @@
    1.33  
    1.34                 MEAS__Capture_Pre_Master_Lock_Point;
    1.35  
    1.36 -         int tries = 0; int gotLock = 0;
    1.37 +         int numTriesToGetLock = 0; int gotLock = 0;
    1.38           while( currVP == NULL ) //keep going until get master lock
    1.39            { 
    1.40 +               //The goal at this point is to get the master lock.  But, to
    1.41 +               // reduce lock contention from cores that have no work, first
    1.42 +               // check whether this core has had no work, and busy-wait if so.
    1.43 +               //Then, if it has been far too long without work, yield the pthread
    1.44 +            if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_BACKOFF)
    1.45 +               doBackoff_for_TooLongWithNoWork( numRepetitionsWithNoWork, &seed1, &seed2 );
    1.46 +            if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD )
    1.47 +             { numRepetitionsWithNoWork = 0; pthread_yield(); }
    1.48 +
    1.49 +               
    1.50 +               //Now, try to get the lock
    1.51              gotLock = __sync_bool_compare_and_swap( addrOfMasterLock,
    1.52                                                      UNLOCKED, LOCKED );
    1.53              if( gotLock )
    1.54 @@ -164,16 +181,16 @@
    1.55                    // done, the masterVP will use assembly to switch the core
    1.56                    // back to animating this core controller
    1.57                 currVP = thisCoresMasterVP;
    1.58 -               if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD )
    1.59 -                {       DEBUG_Print( dbgB2BMaster,"Lots of reps w/o work\n");
    1.60 -                  pthread_yield();
    1.61 -                }
    1.62                 numRepetitionsWithNoWork += 1;
    1.63                 break;  //end while -- have a VP to animate now
    1.64               }
    1.65 +               //Get here only when failed to get lock
    1.66  
    1.67 -            tries++;   //if too many, means too much contention
    1.68 -            if( tries > MASTERLOCK_RETRIES_BEFORE_YIELD ) { tries = 0; pthread_yield(); }
    1.69 +            numTriesToGetLock++;   //if too many, means too much contention
    1.70 +            if( numTriesToGetLock > NUM_TRIES_BEFORE_DO_BACKOFF ) 
    1.71 +               doBackoff_for_TooLongToGetLock( numTriesToGetLock, &seed1, &seed2 );
    1.72 +            if( numTriesToGetLock > MASTERLOCK_RETRIES_BEFORE_YIELD ) 
    1.73 +             { numTriesToGetLock = 0; pthread_yield(); }
    1.74            }
    1.75                 MEAS__Capture_Post_Master_Lock_Point;
    1.76         }
    1.77 @@ -198,6 +215,53 @@
    1.78   }
    1.79  
    1.80  
    1.81 +/*Returns a pseudo-random 32-bit value that the backoff code uses to size its
    1.82 + * busy-wait.  The system rand is not used because it takes much too long.
    1.83 + *Both seeds are passed by pointer and are updated in place on every call.
    1.84 + */
    1.85 +inline uint32_t
    1.86 +randomNumber(uint32_t* seed1, uint32_t* seed2)
    1.87 + {
    1.88 +	*seed1 = 36969 * (*seed1 & 65535) + (*seed1 >> 16); //low 16 bits scaled, plus high 16 bits
    1.89 +	*seed2 = 18000 * (*seed2 & 65535) + (*seed2 >> 16); //same recurrence, different multiplier
    1.90 +	return (*seed1 << 16) + *seed2; //new seed1 shifted into the upper half, plus new seed2
    1.91 + }
    1.92 +
    1.93 +/*Busy-waits for a random number of loop iterations: a random value taken
    1.94 + * modulo (numRepsWithNoWork * numRepsWithNoWork * NUM_CORES).  Updates seeds.
    1.95 + */
    1.96 +inline void
    1.97 +doBackoff_for_TooLongWithNoWork( int32   numRepsWithNoWork, uint32 *seed1, 
    1.98 +                                 uint32 *seed2 )
    1.99 + { int32 i, waitIterations;
   1.100 +   volatile double fakeWorkVar; //fake work; NOTE(review): read by += below without ever being initialized
   1.101 + 
   1.102 +   waitIterations = 
   1.103 +    randomNumber(seed1, seed2) % 
   1.104 +    (numRepsWithNoWork * numRepsWithNoWork * NUM_CORES);
   1.105 +   for( i = 0; i < waitIterations; i++ )
   1.106 +    { fakeWorkVar += (fakeWorkVar + 32.0) / 2.0; //busy-wait; volatile keeps each access in the loop
   1.107 +    }
   1.108 + }
   1.109 +
   1.110 +/*Busy-waits for a random number of loop iterations: a random value taken
   1.111 + * modulo (numTriesToGetLock * NUM_TRIES_TO_GET_LOCK_BACKOFF_WEIGHT).  Updates seeds.
   1.112 + */
   1.113 +inline void
   1.114 +doBackoff_for_TooLongToGetLock( int32 numTriesToGetLock, uint32 *seed1, 
   1.115 +                                uint32 *seed2 )
   1.116 + { int32 i, waitIterations;
   1.117 +   volatile double fakeWorkVar; //fake work; NOTE(review): read by += below without ever being initialized
   1.118 +
   1.119 +   waitIterations = 
   1.120 +    randomNumber(seed1, seed2) % 
   1.121 +    (numTriesToGetLock * NUM_TRIES_TO_GET_LOCK_BACKOFF_WEIGHT);   
   1.122 +   //addToHist( wait_iterations, coreLoopThdParams->wait_iterations_hist );
   1.123 +   for( i = 0; i < waitIterations; i++ )
   1.124 +    { fakeWorkVar += (fakeWorkVar + 32.0) / 2.0; //busy-wait; volatile keeps each access in the loop
   1.125 +    }
   1.126 + }
   1.127 +
   1.128  
   1.129  #ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE
   1.130  
     2.1 --- a/Defines/VMS_defs__HW_constants.h	Sat Mar 10 20:38:25 2012 -0800
     2.2 +++ b/Defines/VMS_defs__HW_constants.h	Sat Mar 10 21:48:53 2012 -0800
     2.3 @@ -20,23 +20,25 @@
     2.4     // when work-stealing, can make bigger, at risk of losing cache affinity
     2.5  #define NUM_SCHED_SLOTS  3
     2.6  
     2.7 -#define MIN_WORK_UNIT_CYCLES 20000
     2.8 -
     2.9 -#define NUM_REPS_W_NO_WORK_BEFORE_YIELD 10
    2.10 -#define MASTERLOCK_RETRIES_BEFORE_YIELD 100
    2.11 -
    2.12 +   //These are for backoff inside core-loop, which reduces lock contention
    2.13 +#define NUM_REPS_W_NO_WORK_BEFORE_YIELD      10
    2.14 +#define NUM_REPS_W_NO_WORK_BEFORE_BACKOFF    2
    2.15 +#define MASTERLOCK_RETRIES_BEFORE_YIELD      100
    2.16 +#define NUM_TRIES_BEFORE_DO_BACKOFF          10
    2.17 +#define NUM_TRIES_TO_GET_LOCK_BACKOFF_WEIGHT 100
    2.18 +   
    2.19     // stack size in virtual processors created
    2.20  #define VIRT_PROCR_STACK_SIZE 0x8000 /* 32K */
    2.21  
    2.22 -   // memory for VMS_WL__malloc
    2.23 +   // memory for VMS_int__malloc
    2.24  #define MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE 0x40000000 /* 1G */
    2.25  
    2.26     //Frequency of TS counts -- have to do tests to verify
    2.27     //NOTE: turn off (in BIOS)  TURBO-BOOST and SPEED-STEP else won't be const
    2.28  #define TSCOUNT_FREQ 3180000000
    2.29  
    2.30 -#define CACHE_LINE_SZ 256
    2.31 -#define PAGE_SIZE 4096
    2.32 +#define CACHE_LINE_SZ  256
    2.33 +#define PAGE_SIZE     4096
    2.34  
    2.35  //To prevent false-sharing, aligns a variable to a cache-line boundary.
    2.36  //No need to use for local vars because those are never shared between cores