changeset 14:c3561dbac1dc false_sharing

added cache misses counter
author Merten Sach <msach@mailbox.tu-berlin.de>
date Tue, 20 Dec 2011 17:21:27 +0100
parents 85f55731f6cd
children 281cadcbb796
files src/Application/main.c
diffstat 1 files changed, 55 insertions(+), 16 deletions(-) [+]
line diff
     1.1 --- a/src/Application/main.c	Tue Dec 20 15:00:07 2011 +0100
     1.2 +++ b/src/Application/main.c	Tue Dec 20 17:21:27 2011 +0100
     1.3 @@ -60,6 +60,21 @@
     1.4  //#error "I don't know how to measure time on your platform"
     1.5  #endif
     1.6  
     1.7 +//======================== Globals =========================
     1.8 +char __ProgrammName[] = "overhead_test";
     1.9 +char __DataSet[255];
    1.10 +
    1.11 +int outer_iters, inner_iters, num_threads;
    1.12 +size_t chunk_size = 0;
    1.13 +
    1.14 +int cycles_counter_main_fd;
    1.15 +int misses_counter_fd;
    1.16 +
    1.17 +uint64_t cache_misses;
    1.18 +
    1.19 +int cycles_counter_fd[NUM_CORES];
    1.20 +struct perf_event_attr* hw_event;
    1.21 +
    1.22  //======================== Defines =========================
    1.23  typedef struct perfData measurement_t;
    1.24  struct perfData{
    1.25 @@ -95,7 +110,6 @@
    1.26     barr->cond    = VPThread__make_cond(barr->mutex, animatingPr);
    1.27   }
    1.28  
    1.29 -int cycles_counter_main_fd;
    1.30  void inline barrier_wait(barrier *barr, VirtProcr *animatingPr)
    1.31   { int i;
    1.32  
    1.33 @@ -133,24 +147,14 @@
    1.34       struct WorkerParams_t data;
    1.35       char padding[CACHELINE_SIZE];
    1.36   } WorkerParams __align_to_cacheline__;
    1.37 + 
    1.38 +WorkerParams *workerParamsArray;
    1.39  
    1.40  typedef struct
    1.41   { measurement_t *startExeCycles;
    1.42     measurement_t *endExeCycles;
    1.43   } BenchParams __align_to_cacheline__;
    1.44  
    1.45 -//======================== Globals =========================
    1.46 -char __ProgrammName[] = "overhead_test";
    1.47 -char __DataSet[255];
    1.48 -
    1.49 -int outer_iters, inner_iters, num_threads;
    1.50 -size_t chunk_size = 0;
    1.51 -
    1.52 -int cycles_counter_fd[NUM_CORES];
    1.53 -struct perf_event_attr* hw_event;
    1.54 -
    1.55 -WorkerParams *workerParamsArray;
    1.56 -
    1.57  //======================== App Code =========================
    1.58  /*
    1.59   p* Workload
    1.60 @@ -166,6 +170,16 @@
    1.61         cycles = 0;                                      \
    1.62     }                                                    \
    1.63  } while (0) //macro magic for scoping
    1.64 + 
    1.65 +#define saveMisses(misses) do{     \
    1.66 +   int nread;                                           \
    1.67 +                                                        \
    1.68 +   nread = read(misses_counter_fd,&(misses),sizeof(misses));    \
    1.69 +   if(nread<0){                                         \
    1.70 +       perror("Error reading misses counter");          \
    1.71 +       misses = 0;                                      \
    1.72 +   }                                                    \
    1.73 +} while (0) //macro magic for scoping
    1.74  
    1.75  
    1.76  double
    1.77 @@ -264,7 +278,9 @@
    1.78      { 
    1.79         workerParamsArray[i].data.barrier = &barr;
    1.80      }
    1.81 -     
    1.82 +    
    1.83 +   uint64_t cache_misses_at_start, cache_misses_at_end;
    1.84 +   saveMisses(cache_misses_at_start);
    1.85     //save cycles before execution of threads, to get total exe cycles
    1.86     int nread = read(cycles_counter_main_fd, &(params->startExeCycles->cycles),
    1.87                  sizeof(params->startExeCycles->cycles));
    1.88 @@ -279,8 +295,8 @@
    1.89    
    1.90     //endBarrierCycles read in barrier_wait()!  Merten, email me if want to chg
    1.91     params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
    1.92 -   
    1.93 -
    1.94 +   saveMisses(cache_misses_at_end);
    1.95 +   cache_misses = cache_misses_at_end-cache_misses_at_start;
    1.96  /*
    1.97     uint64_t overallWorkCycles = 0;
    1.98     for(i=0; i<num_threads; i++){ 
    1.99 @@ -420,6 +436,28 @@
   1.100        perror("Failed to open cycles counter");
   1.101      }
   1.102     
   1.103 +   //Set up counters to count cache misses
   1.104 +    hw_event->type = PERF_TYPE_HARDWARE;
   1.105 +    hw_event->config = PERF_COUNT_HW_CACHE_MISSES; //misses
   1.106 +    
   1.107 +   retries = 0;
   1.108 +   do
   1.109 +    { retries += 1;
   1.110 +      misses_counter_fd = 
   1.111 +       syscall(__NR_perf_event_open, hw_event,
   1.112 +               0,//pid_t: 0 is "pid of calling process" 
   1.113 +               -1,//int: cpu, -1 means accumulate from all cores
   1.114 +               -1,//int: group_fd, -1 is "leader" == independent
   1.115 +               0//unsigned long: flags
   1.116 +              );
   1.117 +    }
   1.118 +   while(misses_counter_fd<0 && retries < 100);
   1.119 +   if(retries >= 100)
   1.120 +    {
   1.121 +      fprintf(stderr,"in main ");
    1.122 +      perror("Failed to open misses counter");
   1.123 +    }
   1.124 +   
   1.125     measurement_t startExeCycles, endExeCycles;
   1.126     BenchParams *benchParams;
   1.127     
   1.128 @@ -453,6 +491,7 @@
   1.129     uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores;
   1.130     int32  numSyncs = outer_iters * num_threads * 2;
   1.131     printf("Total Execution Cycles: %lu\n", totalExeCycles);
   1.132 +   printf("Total number of cache misses: %lu\n", cache_misses);
   1.133     printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
   1.134     printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores);
   1.135  //   printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);