# HG changeset patch
# User Some Random Person <seanhalle@yahoo.com>
# Date 1331444933 28800
# Node ID 82f7defac8517258c324bf033d091f50f27245bd
# Parent  ecbdb74cad970b205c06999b082f24331f619fa9
Added backoff to core loop, cleaned up core loop code, cleaned comments

diff -r ecbdb74cad97 -r 82f7defac851 CoreLoop.c
--- a/CoreLoop.c	Sat Mar 10 20:38:25 2012 -0800
+++ b/CoreLoop.c	Sat Mar 10 21:48:53 2012 -0800
@@ -16,6 +16,12 @@
 
 //=====================  Functions local to this file =======================
 void *terminateCoreController(SlaveVP *currSlv);
+static inline void  //static: a plain 'inline' prototype gives no external definition in C99+
+doBackoff_for_TooLongToGetLock( int32  numTriesToGetLock, uint32 *seed1, 
+                                uint32 *seed2 );
+static inline void  //busy-wait backoff helper, defined later in this file
+doBackoff_for_TooLongWithNoWork( int32   numRepsWithNoWork, uint32 *seed1, 
+                                 uint32 *seed2 );
 //===========================================================================
 
 
@@ -75,7 +81,6 @@
       //Variables used during measurements
    TSCountLowHigh  endSusp;
       //Variables used in random-backoff, for master-lock and waiting for work
-   volatile double workspace1,workspace2; //busy-wait fake work
    uint32_t seed1 = rand()%1000; // init random number generator for retries
    uint32_t seed2 = rand()%1000;
       //Variable for work-stealing -- a gate protects a critical section
@@ -137,6 +142,7 @@
       if( currSlotIdx >= NUM_SCHED_SLOTS ) goto switchToMaster;
       currSlot = schedSlots[ currSlotIdx ];
 
+      
       if( ! currSlot->needsSlaveAssigned ) //slot does have slave assigned
        { numRepetitionsWithNoWork = 0;     //reset B2B master count
          currSlotIdx ++;
@@ -150,9 +156,20 @@
 
                MEAS__Capture_Pre_Master_Lock_Point;
 
-         int tries = 0; int gotLock = 0;
+         int numTriesToGetLock = 0; int gotLock = 0;
          while( currVP == NULL ) //keep going until get master lock
           { 
+               //At this point, first thing to do is get lock.  But, want to
+               // reduce lock contention from cores with no work, so first
+               // check if this is a core with no work, and busy wait if so.
+               //Then, if it's been way too long without work, yield pthread
+            if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_BACKOFF)
+               doBackoff_for_TooLongWithNoWork( numRepetitionsWithNoWork, &seed1, &seed2 );
+            if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD )
+             { numRepetitionsWithNoWork = 0; pthread_yield(); }
+
+               
+               //Now, try to get the lock
             gotLock = __sync_bool_compare_and_swap( addrOfMasterLock,
                                                     UNLOCKED, LOCKED );
             if( gotLock )
@@ -164,16 +181,16 @@
                   // done, the masterVP will use assembly to switch the core
                   // back to animating this core controller
                currVP = thisCoresMasterVP;
-               if( numRepetitionsWithNoWork > NUM_REPS_W_NO_WORK_BEFORE_YIELD )
-                {       DEBUG_Print( dbgB2BMaster,"Lots of reps w/o work\n");
-                  pthread_yield();
-                }
                numRepetitionsWithNoWork += 1;
                break;  //end while -- have a VP to animate now
              }
+               //Get here only when failed to get lock
 
-            tries++;   //if too many, means too much contention
-            if( tries > MASTERLOCK_RETRIES_BEFORE_YIELD ) { tries = 0; pthread_yield(); }
+            numTriesToGetLock++;   //if too many, means too much contention
+            if( numTriesToGetLock > NUM_TRIES_BEFORE_DO_BACKOFF ) 
+               doBackoff_for_TooLongToGetLock( numTriesToGetLock, &seed1, &seed2 );
+            if( numTriesToGetLock > MASTERLOCK_RETRIES_BEFORE_YIELD ) 
+             { numTriesToGetLock = 0; pthread_yield(); }
           }
                MEAS__Capture_Post_Master_Lock_Point;
        }
@@ -198,6 +215,53 @@
  }
 
 
+/*Used by the backoff to pick a random amount of busy-wait.  The system
+ * rand is not used because it takes much too long.
+ *The seeds are passed by pointer and both are updated in place on every
+ * call; the result is the new seed1 shifted left 16 bits, plus new seed2 */
+inline uint32_t
+randomNumber(uint32_t* seed1, uint32_t* seed2)
+ {
+	*seed1 = 36969 * (*seed1 & 65535) + (*seed1 >> 16);
+	*seed2 = 18000 * (*seed2 & 65535) + (*seed2 >> 16);
+	return (*seed1 << 16) + *seed2;
+ }
+
+/*Busy-waits for a random number of loop iterations, drawn below
+ * numRepsWithNoWork^2 * NUM_CORES; advances both seeds via randomNumber.
+ *Differs from the too-many-tries-to-get-lock backoff only in that bound */
+inline void
+doBackoff_for_TooLongWithNoWork( int32   numRepsWithNoWork, uint32 *seed1, 
+                                 uint32 *seed2 )
+ { int32 i, waitIterations;
+   volatile double fakeWorkVar = 0.0; //init, else loop reads garbage
+   if( numRepsWithNoWork <= 0 ) return; //avoid modulo by zero below
+   waitIterations = 
+    randomNumber(seed1, seed2) % 
+    (numRepsWithNoWork * numRepsWithNoWork * NUM_CORES);
+   for( i = 0; i < waitIterations; i++ )
+    { fakeWorkVar += (fakeWorkVar + 32.0) / 2.0; //busy-wait fake work
+    }
+ }
+
+/*Busy-waits for a random number of loop iterations, drawn below
+ * numTriesToGetLock * NUM_TRIES_TO_GET_LOCK_BACKOFF_WEIGHT; advances both
+ * seeds via randomNumber.  Bound differs from that of the no-work backoff */
+inline void
+doBackoff_for_TooLongToGetLock( int32 numTriesToGetLock, uint32 *seed1, 
+                                uint32 *seed2 )
+ { int32 i, waitIterations;
+   volatile double fakeWorkVar = 0.0; //init, else loop reads garbage
+   if( numTriesToGetLock <= 0 ) return; //avoid modulo by zero below
+   waitIterations = 
+    randomNumber(seed1, seed2) % 
+    (numTriesToGetLock * NUM_TRIES_TO_GET_LOCK_BACKOFF_WEIGHT);   
+   //addToHist( wait_iterations, coreLoopThdParams->wait_iterations_hist );
+   for( i = 0; i < waitIterations; i++ )
+    { fakeWorkVar += (fakeWorkVar + 32.0) / 2.0; //busy-wait fake work
+    }
+ }
+
 
 #ifdef DEBUG__TURN_ON_SEQUENTIAL_MODE
 
diff -r ecbdb74cad97 -r 82f7defac851 Defines/VMS_defs__HW_constants.h
--- a/Defines/VMS_defs__HW_constants.h	Sat Mar 10 20:38:25 2012 -0800
+++ b/Defines/VMS_defs__HW_constants.h	Sat Mar 10 21:48:53 2012 -0800
@@ -20,23 +20,25 @@
    // when work-stealing, can make bigger, at risk of losing cache affinity
 #define NUM_SCHED_SLOTS  3
 
-#define MIN_WORK_UNIT_CYCLES 20000
-
-#define NUM_REPS_W_NO_WORK_BEFORE_YIELD 10
-#define MASTERLOCK_RETRIES_BEFORE_YIELD 100
-
+   //These are for backoff inside core-loop, which reduces lock contention
+#define NUM_REPS_W_NO_WORK_BEFORE_YIELD      10  //no-work reps above this: reset count, pthread_yield
+#define NUM_REPS_W_NO_WORK_BEFORE_BACKOFF    2   //no-work reps above this: busy-wait before trying lock
+#define MASTERLOCK_RETRIES_BEFORE_YIELD      100 //failed lock tries above this: reset count, pthread_yield
+#define NUM_TRIES_BEFORE_DO_BACKOFF          10  //failed lock tries above this: busy-wait backoff
+#define NUM_TRIES_TO_GET_LOCK_BACKOFF_WEIGHT 100 //times num tries = bound on lock-backoff iterations
+   
    // stack size in virtual processors created
 #define VIRT_PROCR_STACK_SIZE 0x8000 /* 32K */
 
-   // memory for VMS_WL__malloc
+   // memory for VMS_int__malloc
 #define MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE 0x40000000 /* 1G */
 
    //Frequency of TS counts -- have to do tests to verify
    //NOTE: turn off (in BIOS)  TURBO-BOOST and SPEED-STEP else won't be const
 #define TSCOUNT_FREQ 3180000000
 
-#define CACHE_LINE_SZ 256
-#define PAGE_SIZE 4096
+#define CACHE_LINE_SZ  256
+#define PAGE_SIZE     4096
 
 //To prevent false-sharing, aligns a variable to a cache-line boundary.
 //No need to use for local vars because those are never shared between cores