changeset 13:85f55731f6cd false_sharing

Padded variables to avoid false sharing in the application
author Merten Sach <msach@mailbox.tu-berlin.de>
date Tue, 20 Dec 2011 15:00:07 +0100
parents 1320dd56673a
children c3561dbac1dc
files src/Application/main.c
diffstat 1 files changed, 35 insertions(+), 35 deletions(-) [+]
line diff
     1.1 --- a/src/Application/main.c	Fri Dec 16 16:40:07 2011 +0100
     1.2 +++ b/src/Application/main.c	Tue Dec 20 15:00:07 2011 +0100
     1.3 @@ -64,8 +64,7 @@
     1.4  typedef struct perfData measurement_t;
     1.5  struct perfData{
     1.6      uint64 cycles;
     1.7 -    uint64 instructions;
     1.8 -};
     1.9 +} __align_to_cacheline__;
    1.10  
    1.11  const char *usage = {
    1.12  	"Usage: malloc_test [options]\n"
    1.13 @@ -85,7 +84,7 @@
    1.14      int32 cond;
    1.15      measurement_t endBarrierCycles;
    1.16  
    1.17 -};
    1.18 +} __align_to_cacheline__;
    1.19  typedef struct barrier_t barrier;
    1.20  
    1.21  void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr)
    1.22 @@ -104,7 +103,7 @@
    1.23     barr->counter++;
    1.24     if(barr->counter == barr->nthreads)
    1.25      { 
    1.26 -      read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \
    1.27 +        read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \
    1.28                  sizeof(barr->endBarrierCycles.cycles));
    1.29         
    1.30        barr->counter = 0;
    1.31 @@ -119,7 +118,7 @@
    1.32  
    1.33  
    1.34  
    1.35 -typedef struct
    1.36 +struct WorkerParams_t
    1.37   { struct barrier_t* barrier;
    1.38     uint64_t  totalWorkCycles;
    1.39     uint64_t  totalBadCycles;
    1.40 @@ -127,15 +126,18 @@
    1.41     uint64_t  totalBadSyncCycles;
    1.42     uint64     numGoodSyncs;
    1.43     uint64     numGoodTasks;
    1.44 - }
    1.45 -WorkerParams;
    1.46 -
    1.47 + };
    1.48 + 
    1.49 + typedef union
    1.50 + {
    1.51 +     struct WorkerParams_t data;
    1.52 +     char padding[CACHELINE_SIZE];
    1.53 + } WorkerParams __align_to_cacheline__;
    1.54  
    1.55  typedef struct
    1.56   { measurement_t *startExeCycles;
    1.57     measurement_t *endExeCycles;
    1.58 - }
    1.59 -BenchParams;
    1.60 + } BenchParams __align_to_cacheline__;
    1.61  
    1.62  //======================== Globals =========================
    1.63  char __ProgrammName[] = "overhead_test";
    1.64 @@ -151,7 +153,7 @@
    1.65  
    1.66  //======================== App Code =========================
    1.67  /*
    1.68 - * Workload
    1.69 + p* Workload
    1.70   */
    1.71  
    1.72  #define saveCyclesAndInstrs(core,cycles) do{     \
    1.73 @@ -179,14 +181,14 @@
    1.74     
    1.75     int cpuid = sched_getcpu();
    1.76     
    1.77 -   measurement_t startWorkload, endWorkload, startWorkload2, endWorkload2;
    1.78 +   measurement_t startWorkload, endWorkload;
    1.79     uint64 numCycles;
    1.80     for(o=0; o < outer_iters; o++)
    1.81      {
    1.82         
    1.83            saveCyclesAndInstrs(cpuid,startWorkload.cycles);
    1.84         
    1.85 -      //workltask
    1.86 +      //task
    1.87        for(i=0; i < inner_iters; i++)
    1.88         {
    1.89           workspace1 += (workspace1 + 32)/2;
    1.90 @@ -221,12 +223,12 @@
    1.91        VPThread__mutex_unlock(privateMutex, animatingPr);
    1.92      }
    1.93  
    1.94 -   params->totalWorkCycles = totalWorkCycles;
    1.95 -   params->totalBadCycles = totalBadCycles;
    1.96 -   params->numGoodTasks   = numGoodTasks;
    1.97 -   params->totalSyncCycles = totalSyncCycles;
    1.98 -   params->totalBadSyncCycles = totalBadSyncCycles;
    1.99 -   params->numGoodSyncs = numGoodSyncs;
   1.100 +   params->data.totalWorkCycles = totalWorkCycles;
   1.101 +   params->data.totalBadCycles = totalBadCycles;
   1.102 +   params->data.numGoodTasks   = numGoodTasks;
   1.103 +   params->data.totalSyncCycles = totalSyncCycles;
   1.104 +   params->data.totalBadSyncCycles = totalBadSyncCycles;
   1.105 +   params->data.numGoodSyncs = numGoodSyncs;
   1.106  /*
   1.107     params->totalSyncCycles = VMS__give_num_plugin_cycles();
   1.108     params->totalBadSyncCycles = 0;
   1.109 @@ -235,7 +237,7 @@
   1.110     
   1.111     
   1.112     //Wait for all threads to end
   1.113 -   barrier_wait(params->barrier, animatingPr);
   1.114 +   barrier_wait(params->data.barrier, animatingPr);
   1.115     
   1.116     //Shutdown worker
   1.117     VPThread__dissipate_thread(animatingPr);
   1.118 @@ -244,14 +246,15 @@
   1.119     return (workspace1 + workspace2);  //to prevent gcc from optimizing work out
   1.120   }
   1.121  
   1.122 +//local variables of benchmark, made global for alignment
   1.123 +struct barrier_t  barr __align_to_cacheline__;
   1.124 +BenchParams      *params __align_to_cacheline__;
   1.125  
   1.126  /* this is run after the VMS is set up*/
   1.127  void benchmark(void *_params, VirtProcr *animatingPr)
   1.128   {
   1.129 -   int i, cpuID;
   1.130 -   struct barrier_t  barr;
   1.131 -   BenchParams      *params;
   1.132 -   
   1.133 +   int i;
   1.134 +
   1.135     params = (BenchParams *)_params;
   1.136  
   1.137     barrier_init(&barr, num_threads+1, animatingPr);
   1.138 @@ -259,15 +262,12 @@
   1.139     //prepare input
   1.140     for(i=0; i<num_threads; i++)
   1.141      { 
   1.142 -       workerParamsArray[i].barrier = &barr;
   1.143 +       workerParamsArray[i].data.barrier = &barr;
   1.144      }
   1.145       
   1.146     //save cycles before execution of threads, to get total exe cycles
   1.147 -   measurement_t *startExeCycles, *endExeCycles;
   1.148 -   startExeCycles = params->startExeCycles;
   1.149 -   
   1.150 -   int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles),
   1.151 -                sizeof(startExeCycles->cycles));
   1.152 +   int nread = read(cycles_counter_main_fd, &(params->startExeCycles->cycles),
   1.153 +                sizeof(params->startExeCycles->cycles));
   1.154     if(nread<0) perror("Error reading cycles counter");
   1.155     
   1.156     //create (which starts running) all threads
   1.157 @@ -438,14 +438,14 @@
   1.158     uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
   1.159     uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
   1.160     for(i=0; i<num_threads; i++){ 
   1.161 -       printf("WorkCycles: %lu\n",workerParamsArray[i].totalWorkCycles);
   1.162 +       printf("WorkCycles: %lu\n",workerParamsArray[i].data.totalWorkCycles);
   1.163  //       printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
   1.164  //       printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
   1.165  //       printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
   1.166 -       totalWorkCyclesAcrossCores += workerParamsArray[i].totalWorkCycles;
   1.167 -       totalBadCyclesAcrossCores  += workerParamsArray[i].totalBadCycles;
   1.168 -       totalSyncCyclesAcrossCores += workerParamsArray[i].totalSyncCycles;
   1.169 -       totalBadSyncCyclesAcrossCores  += workerParamsArray[i].totalBadSyncCycles;
   1.170 +       totalWorkCyclesAcrossCores += workerParamsArray[i].data.totalWorkCycles;
   1.171 +       totalBadCyclesAcrossCores  += workerParamsArray[i].data.totalBadCycles;
   1.172 +       totalSyncCyclesAcrossCores += workerParamsArray[i].data.totalSyncCycles;
   1.173 +       totalBadSyncCyclesAcrossCores  += workerParamsArray[i].data.totalBadSyncCycles;
   1.174      }
   1.175  
   1.176     uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;