changeset 17:281cadcbb796 false_sharing

changed directory structure, added .hgeol file
author Merten Sach <msach@mailbox.tu-berlin.de>
date Mon, 13 Feb 2012 16:12:20 +0100
parents c3561dbac1dc
children e7277df4460e
files .hgeol main.c src/Application/main.c
diffstat 3 files changed, 516 insertions(+), 502 deletions(-) [+]
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/.hgeol	Mon Feb 13 16:12:20 2012 +0100
     1.3 @@ -0,0 +1,14 @@
     1.4 +
     1.5 +[patterns]
     1.6 +**.py = native
     1.7 +**.txt = native
     1.8 +**.c = native
     1.9 +**.h = native
    1.10 +**.cpp = native
    1.11 +**.java = native
    1.12 +**.class = bin
    1.13 +**.jar = bin
    1.14 +**.sh = native
    1.15 +**.pl = native
    1.16 +**.jpg = bin
    1.17 +**.gif = bin
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/main.c	Mon Feb 13 16:12:20 2012 +0100
     2.3 @@ -0,0 +1,502 @@
     2.4 +/* 
     2.5 + * 
     2.6 + */
     2.7 +#include <stdio.h>
     2.8 +#include <stdlib.h>
     2.9 +#include <string.h>
    2.10 +#include <math.h>
    2.11 +#include <ctype.h>
    2.12 +#include <errno.h>
    2.13 +#include <pthread.h>
    2.14 +#include <unistd.h>
    2.15 +#include "VPThread_lib/VPThread.h"
    2.16 +#include "VPThread_lib/VMS/Queue_impl/PrivateQueue.h"
    2.17 +
    2.18 +#include <linux/perf_event.h>
    2.19 +#include <linux/prctl.h>
    2.20 +#include <sys/syscall.h>
    2.21 +
    2.22 +#undef DEBUG
    2.23 +//#define DEBUG
    2.24 +
    2.25 +#if !defined(unix) && !defined(__unix__)
    2.26 +#ifdef __MACH__
    2.27 +#define unix		1
    2.28 +#define __unix__	1
    2.29 +#endif	/* __MACH__ */
    2.30 +#endif	/* unix */
    2.31 +
    2.32 +/* find the appropriate way to define explicitly sized types */
    2.33 +/* for C99 or GNU libc (also mach's libc) we can use stdint.h */
    2.34 +#if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__)
    2.35 +#include <stdint.h>
    2.36 +#elif defined(unix) || defined(__unix__)	/* some UNIX systems have them in sys/types.h */
    2.37 +#include <sys/types.h>
    2.38 +#elif defined(__WIN32__) || defined(WIN32)	/* the nameless one */
    2.39 +typedef unsigned __int8 uint8_t;
    2.40 +typedef unsigned __int32 uint32_t;
    2.41 +#endif	/* sized type detection */
    2.42 +
    2.43 +/* provide a millisecond-resolution timer for each system */
    2.44 +#if defined(unix) || defined(__unix__)
    2.45 +#include <time.h>
    2.46 +#include <sys/time.h>
    2.47 +unsigned long get_msec(void) {
    2.48 +	static struct timeval timeval, first_timeval;
    2.49 +
    2.50 +	gettimeofday(&timeval, 0);
    2.51 +	if(first_timeval.tv_sec == 0) {
    2.52 +		first_timeval = timeval;
    2.53 +		return 0;
    2.54 +	}
    2.55 +	return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000;
    2.56 +}
    2.57 +#elif defined(__WIN32__) || defined(WIN32)
    2.58 +#include <windows.h>
    2.59 +unsigned long get_msec(void) {
    2.60 +	return GetTickCount();
    2.61 +}
    2.62 +#else
    2.63 +//#error "I don't know how to measure time on your platform"
    2.64 +#endif
    2.65 +
    2.66 +//======================== Globals =========================
    2.67 +char __ProgrammName[] = "overhead_test";
    2.68 +char __DataSet[255];
    2.69 +
    2.70 +int outer_iters, inner_iters, num_threads;
    2.71 +size_t chunk_size = 0;
    2.72 +
    2.73 +int cycles_counter_main_fd;
    2.74 +int misses_counter_fd;
    2.75 +
    2.76 +uint64_t cache_misses;
    2.77 +
    2.78 +int cycles_counter_fd[NUM_CORES];
    2.79 +struct perf_event_attr* hw_event;
    2.80 +
    2.81 +//======================== Defines =========================
    2.82 +typedef struct perfData measurement_t;
    2.83 +struct perfData{
    2.84 +    uint64 cycles;
    2.85 +} __align_to_cacheline__;
    2.86 +
    2.87 +const char *usage = {
    2.88 +	"Usage: malloc_test [options]\n"
     2.89 +	"  Spawns a number of threads and allocates memory.\n\n"
    2.90 +	"Options:\n"
	"  -t <num>   how many threads to use (default: 1). This is internally multiplied by the number of cores.\n"
    2.92 +	"  -o <num>   repeat workload and sync operation <m> times\n"
    2.93 +        "  -i <num>   size of workload, repeat <n> times\n"     
    2.94 +	"  -h         this help screen\n\n"
    2.95 +};
    2.96 +
    2.97 +struct barrier_t
    2.98 +{
    2.99 +    int counter;
   2.100 +    int nthreads;
   2.101 +    int32 mutex;
   2.102 +    int32 cond;
   2.103 +    measurement_t endBarrierCycles;
   2.104 +
   2.105 +} __align_to_cacheline__;
   2.106 +typedef struct barrier_t barrier;
   2.107 +
   2.108 +void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr)
   2.109 + {
   2.110 +   barr->counter = 0;
   2.111 +   barr->nthreads = nthreads;
   2.112 +   barr->mutex   = VPThread__make_mutex(animatingPr);
   2.113 +   barr->cond    = VPThread__make_cond(barr->mutex, animatingPr);
   2.114 + }
   2.115 +
   2.116 +void inline barrier_wait(barrier *barr, VirtProcr *animatingPr)
   2.117 + { int i;
   2.118 +
   2.119 +   VPThread__mutex_lock(barr->mutex, animatingPr);
   2.120 +   barr->counter++;
   2.121 +   if(barr->counter == barr->nthreads)
   2.122 +    { 
   2.123 +        read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \
   2.124 +                sizeof(barr->endBarrierCycles.cycles));
   2.125 +       
   2.126 +      barr->counter = 0;
   2.127 +      for(i=0; i < barr->nthreads; i++)
   2.128 +         VPThread__cond_signal(barr->cond, animatingPr);
   2.129 +    }
   2.130 +   else
   2.131 +    { VPThread__cond_wait(barr->cond, animatingPr);
   2.132 +    }
   2.133 +   VPThread__mutex_unlock(barr->mutex, animatingPr);
   2.134 + }
   2.135 +
   2.136 +
   2.137 +
   2.138 +struct WorkerParams_t
   2.139 + { struct barrier_t* barrier;
   2.140 +   uint64_t  totalWorkCycles;
   2.141 +   uint64_t  totalBadCycles;
   2.142 +   uint64_t  totalSyncCycles;
   2.143 +   uint64_t  totalBadSyncCycles;
   2.144 +   uint64     numGoodSyncs;
   2.145 +   uint64     numGoodTasks;
   2.146 + };
   2.147 + 
   2.148 + typedef union
   2.149 + {
   2.150 +     struct WorkerParams_t data;
   2.151 +     char padding[CACHELINE_SIZE];
   2.152 + } WorkerParams __align_to_cacheline__;
   2.153 + 
   2.154 +WorkerParams *workerParamsArray;
   2.155 +
   2.156 +typedef struct
   2.157 + { measurement_t *startExeCycles;
   2.158 +   measurement_t *endExeCycles;
   2.159 + } BenchParams __align_to_cacheline__;
   2.160 +
   2.161 +//======================== App Code =========================
   2.162 +/*
    2.163 + * Workload
   2.164 + */
   2.165 +
   2.166 +#define saveCyclesAndInstrs(core,cycles) do{     \
   2.167 +   int cycles_fd = cycles_counter_fd[core];             \
   2.168 +   int nread;                                           \
   2.169 +                                                        \
   2.170 +   nread = read(cycles_fd,&(cycles),sizeof(cycles));    \
   2.171 +   if(nread<0){                                         \
   2.172 +       perror("Error reading cycles counter");          \
   2.173 +       cycles = 0;                                      \
   2.174 +   }                                                    \
   2.175 +} while (0) //macro magic for scoping
   2.176 + 
   2.177 +#define saveMisses(misses) do{     \
   2.178 +   int nread;                                           \
   2.179 +                                                        \
   2.180 +   nread = read(misses_counter_fd,&(misses),sizeof(misses));    \
   2.181 +   if(nread<0){                                         \
   2.182 +       perror("Error reading misses counter");          \
   2.183 +       misses = 0;                                      \
   2.184 +   }                                                    \
   2.185 +} while (0) //macro magic for scoping
   2.186 +
   2.187 +
   2.188 +double
   2.189 +worker_TLF(void* _params, VirtProcr* animatingPr)
   2.190 + {
   2.191 +   int i,o;
   2.192 +   WorkerParams* params = (WorkerParams*)_params;
   2.193 +   unsigned int totalWorkCycles = 0, totalBadCycles = 0;
   2.194 +   unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0;
   2.195 +   unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0;
   2.196 +   double workspace2=0.0;
   2.197 +   int32 privateMutex = VPThread__make_mutex(animatingPr);
   2.198 +   
   2.199 +   int cpuid = sched_getcpu();
   2.200 +   
   2.201 +   measurement_t startWorkload, endWorkload;
   2.202 +   uint64 numCycles;
   2.203 +   for(o=0; o < outer_iters; o++)
   2.204 +    {
   2.205 +       
   2.206 +          saveCyclesAndInstrs(cpuid,startWorkload.cycles);
   2.207 +       
   2.208 +      //task
   2.209 +      for(i=0; i < inner_iters; i++)
   2.210 +       {
   2.211 +         workspace1 += (workspace1 + 32)/2;
   2.212 +         workspace2 += (workspace2 + 23.2)/1.4;
   2.213 +       }
   2.214 +      
   2.215 +          saveCyclesAndInstrs(cpuid,endWorkload.cycles);
   2.216 +          numCycles = endWorkload.cycles - startWorkload.cycles;
   2.217 +          //sanity check (400K is about 20K iters)
   2.218 +          if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
   2.219 +          else                     {totalBadCycles  += numCycles; }
   2.220 +
   2.221 +      //mutex access often causes switch to different Slave VP
   2.222 +      VPThread__mutex_lock(privateMutex, animatingPr);
   2.223 +      
   2.224 +/*
   2.225 +          saveCyclesAndInstrs(cpuid,startWorkload2.cycles);
   2.226 +      //Task
   2.227 +      for(i=0; i < inner_iters; i++)
   2.228 +       {
   2.229 +         workspace1 += (workspace1 + 32)/2;
   2.230 +         workspace2 += (workspace2 + 23.2)/1.4;
   2.231 +       }
   2.232 +      
   2.233 +          saveCyclesAndInstrs(cpuid,endWorkload2.cycles);
   2.234 +          numCycles = endWorkload2.cycles - startWorkload2.cycles;
   2.235 +          //sanity check (400K is about 20K iters)
   2.236 +          if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
   2.237 +          else                     {totalBadCycles  += numCycles; }
   2.238 +      
   2.239 +*/
   2.240 +      VPThread__mutex_unlock(privateMutex, animatingPr);
   2.241 +    }
   2.242 +
   2.243 +   params->data.totalWorkCycles = totalWorkCycles;
   2.244 +   params->data.totalBadCycles = totalBadCycles;
   2.245 +   params->data.numGoodTasks   = numGoodTasks;
   2.246 +   params->data.totalSyncCycles = totalSyncCycles;
   2.247 +   params->data.totalBadSyncCycles = totalBadSyncCycles;
   2.248 +   params->data.numGoodSyncs = numGoodSyncs;
   2.249 +/*
   2.250 +   params->totalSyncCycles = VMS__give_num_plugin_cycles();
   2.251 +   params->totalBadSyncCycles = 0;
   2.252 +   params->numGoodSyncs = VMS__give_num_plugin_animations();
   2.253 +*/
   2.254 +   
   2.255 +   
   2.256 +   //Wait for all threads to end
   2.257 +   barrier_wait(params->data.barrier, animatingPr);
   2.258 +   
   2.259 +   //Shutdown worker
   2.260 +   VPThread__dissipate_thread(animatingPr);
   2.261 +   
   2.262 +     //below return never reached --> there for gcc
   2.263 +   return (workspace1 + workspace2);  //to prevent gcc from optimizing work out
   2.264 + }
   2.265 +
   2.266 +//local variables of benchmark, made global for alignment
   2.267 +struct barrier_t  barr __align_to_cacheline__;
   2.268 +BenchParams      *params __align_to_cacheline__;
   2.269 +
   2.270 +/* this is run after the VMS is set up*/
   2.271 +void benchmark(void *_params, VirtProcr *animatingPr)
   2.272 + {
   2.273 +   int i;
   2.274 +
   2.275 +   params = (BenchParams *)_params;
   2.276 +
   2.277 +   barrier_init(&barr, num_threads+1, animatingPr);
   2.278 +      
   2.279 +   //prepare input
   2.280 +   for(i=0; i<num_threads; i++)
   2.281 +    { 
   2.282 +       workerParamsArray[i].data.barrier = &barr;
   2.283 +    }
   2.284 +    
   2.285 +   uint64_t cache_misses_at_start, cache_misses_at_end;
   2.286 +   saveMisses(cache_misses_at_start);
   2.287 +   //save cycles before execution of threads, to get total exe cycles
   2.288 +   int nread = read(cycles_counter_main_fd, &(params->startExeCycles->cycles),
   2.289 +                sizeof(params->startExeCycles->cycles));
   2.290 +   if(nread<0) perror("Error reading cycles counter");
   2.291 +   
   2.292 +   //create (which starts running) all threads
   2.293 +   for(i=0; i<num_threads; i++)
   2.294 +    { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
   2.295 +    }
   2.296 +   //wait for all threads to finish
   2.297 +   barrier_wait(&barr, animatingPr);
   2.298 +  
   2.299 +   //endBarrierCycles read in barrier_wait()!  Merten, email me if want to chg
   2.300 +   params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
   2.301 +   saveMisses(cache_misses_at_end);
   2.302 +   cache_misses = cache_misses_at_end-cache_misses_at_start;
   2.303 +/*
   2.304 +   uint64_t overallWorkCycles = 0;
   2.305 +   for(i=0; i<num_threads; i++){ 
   2.306 +       printf("WorkCycles: %lu\n",input[i].totalWorkCycles);
   2.307 +       overallWorkCycles += input[i].totalWorkCycles;
   2.308 +    }
   2.309 +   
   2.310 +   printf("Sum across threads of work cycles: %lu\n", overallWorkCycles);
   2.311 +   printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles);
   2.312 +   printf("Runtime/Workcycle Ratio %lu\n", 
   2.313 +   ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles);
   2.314 +*/
   2.315 +
   2.316 +   //======================================================
   2.317 +
   2.318 +   VPThread__dissipate_thread(animatingPr);
   2.319 + }
   2.320 +
   2.321 +int main(int argc, char **argv)
   2.322 + {
   2.323 +   int i;
   2.324 +
   2.325 +   //set global static variables, based on cmd-line args
   2.326 +   for(i=1; i<argc; i++)
   2.327 +    {
   2.328 +      if(argv[i][0] == '-' && argv[i][2] == 0)
   2.329 +       {
   2.330 +         switch(argv[i][1])
   2.331 +          {
   2.332 +            case 't':
   2.333 +               if(!isdigit(argv[++i][0]))
   2.334 +                {
   2.335 +                  fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n");
   2.336 +                  return EXIT_FAILURE;
   2.337 +                }
   2.338 +               num_threads = atoi(argv[i]);
   2.339 +               if(!num_threads)
   2.340 +                {
   2.341 +                  fprintf(stderr, "invalid number of threads specified: %d\n", num_threads);
   2.342 +                  return EXIT_FAILURE;
   2.343 +                }
   2.344 +            break;
   2.345 +            case 'o':
   2.346 +               if(!isdigit(argv[++i][0]))
   2.347 +                {
    2.348 +                  fputs("-o must be followed by a number\n", stderr);
   2.349 +                  return EXIT_FAILURE;
   2.350 +                }
   2.351 +               outer_iters = atoi(argv[i]);
   2.352 +				break;
   2.353 +            case 'i':
   2.354 +               if(!isdigit(argv[++i][0]))
   2.355 +                {
    2.356 +                  fputs("-i must be followed by a number (workload size)\n", stderr);
   2.357 +                  return EXIT_FAILURE;
   2.358 +                }
   2.359 +               inner_iters = atoi(argv[i]);
   2.360 +				break;
   2.361 +            case 'h':
   2.362 +               fputs(usage, stdout);
   2.363 +               return 0;
   2.364 +				
   2.365 +            default:
   2.366 +               fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
   2.367 +               fputs(usage, stderr);
   2.368 +               return EXIT_FAILURE;
   2.369 +          }//switch
   2.370 +       }//if arg
   2.371 +      else
   2.372 +       {
   2.373 +			fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
   2.374 +			fputs(usage, stderr);
   2.375 +			return EXIT_FAILURE;
   2.376 +       }
   2.377 +    }//for
   2.378 +   
   2.379 +   
   2.380 +   //setup performance counters
   2.381 +    hw_event = malloc(sizeof(struct perf_event_attr));
   2.382 +    memset(hw_event,0,sizeof(struct perf_event_attr));
   2.383 +    
   2.384 +    hw_event->type = PERF_TYPE_HARDWARE;
   2.385 +    hw_event->size = sizeof(hw_event);
   2.386 +    hw_event->disabled = 0;
   2.387 +    hw_event->freq = 0;
   2.388 +    hw_event->inherit = 1; /* children inherit it   */
   2.389 +    hw_event->pinned = 1; /* says this virt counter must always be on HW */
   2.390 +    hw_event->exclusive = 0; /* only group on PMU     */
   2.391 +    hw_event->exclude_user = 0; /* don't count user      */
   2.392 +    hw_event->exclude_kernel = 1; /* don't count kernel  */
   2.393 +    hw_event->exclude_hv = 1; /* ditto hypervisor      */
   2.394 +    hw_event->exclude_idle = 1; /* don't count when idle */
   2.395 +    hw_event->mmap = 0; /* include mmap data     */
   2.396 +    hw_event->comm = 0; /* include comm data     */
   2.397 +
   2.398 +    hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles
   2.399 +    
   2.400 +    int cpuID, retries;
   2.401 +
   2.402 +   for( cpuID = 0; cpuID < NUM_CORES; cpuID++ )
   2.403 +    { retries = 0;
   2.404 +      do
   2.405 +       { retries += 1;
   2.406 +         cycles_counter_fd[cpuID] = 
   2.407 +          syscall(__NR_perf_event_open, hw_event,
   2.408 +                  0,//pid_t: 0 is "pid of calling process" 
   2.409 +                  cpuID,//int: cpu, the value returned by "CPUID" instr(?)
   2.410 +                  -1,//int: group_fd, -1 is "leader" or independent
   2.411 +                  0//unsigned long: flags
   2.412 +                 );
   2.413 +       }
   2.414 +      while(cycles_counter_fd[cpuID]<0 && retries < 100);
   2.415 +      if(retries >= 100)
   2.416 +       {
   2.417 +         fprintf(stderr,"On core %d: ",cpuID);
   2.418 +         perror("Failed to open cycles counter");
   2.419 +       }
   2.420 +    }
   2.421 +
   2.422 +   //Set up counter to accumulate total cycles to process, across all CPUs
   2.423 +
   2.424 +   retries = 0;
   2.425 +   do
   2.426 +    { retries += 1;
   2.427 +      cycles_counter_main_fd = 
   2.428 +       syscall(__NR_perf_event_open, hw_event,
   2.429 +               0,//pid_t: 0 is "pid of calling process" 
   2.430 +               -1,//int: cpu, -1 means accumulate from all cores
   2.431 +               -1,//int: group_fd, -1 is "leader" == independent
   2.432 +               0//unsigned long: flags
   2.433 +              );
   2.434 +    }
   2.435 +   while(cycles_counter_main_fd<0 && retries < 100);
   2.436 +   if(retries >= 100)
   2.437 +    {
   2.438 +      fprintf(stderr,"in main ");
   2.439 +      perror("Failed to open cycles counter");
   2.440 +    }
   2.441 +   
   2.442 +   //Set up counters to count cache misses
   2.443 +    hw_event->type = PERF_TYPE_HARDWARE;
   2.444 +    hw_event->config = PERF_COUNT_HW_CACHE_MISSES; //misses
   2.445 +    
   2.446 +   retries = 0;
   2.447 +   do
   2.448 +    { retries += 1;
   2.449 +      misses_counter_fd = 
   2.450 +       syscall(__NR_perf_event_open, hw_event,
   2.451 +               0,//pid_t: 0 is "pid of calling process" 
   2.452 +               -1,//int: cpu, -1 means accumulate from all cores
   2.453 +               -1,//int: group_fd, -1 is "leader" == independent
   2.454 +               0//unsigned long: flags
   2.455 +              );
   2.456 +    }
   2.457 +   while(misses_counter_fd<0 && retries < 100);
   2.458 +   if(retries >= 100)
   2.459 +    {
   2.460 +      fprintf(stderr,"in main ");
    2.461 +      perror("Failed to open misses counter");
   2.462 +    }
   2.463 +   
   2.464 +   measurement_t startExeCycles, endExeCycles;
   2.465 +   BenchParams *benchParams;
   2.466 +   
   2.467 +   benchParams = malloc(sizeof(BenchParams)); 
   2.468 +   
   2.469 +   benchParams->startExeCycles = &startExeCycles;
   2.470 +   benchParams->endExeCycles   = &endExeCycles;
   2.471 +   
   2.472 +   workerParamsArray =  (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) );
   2.473 +   if(workerParamsArray == NULL ) printf("error mallocing worker params array\n");
   2.474 +   
   2.475 + 
   2.476 +   //This is the transition to the VMS runtime
   2.477 +   VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );
   2.478 +   
   2.479 +   uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
   2.480 +   uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
   2.481 +   for(i=0; i<num_threads; i++){ 
   2.482 +       printf("WorkCycles: %lu\n",workerParamsArray[i].data.totalWorkCycles);
   2.483 +//       printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
   2.484 +//       printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
   2.485 +//       printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
   2.486 +       totalWorkCyclesAcrossCores += workerParamsArray[i].data.totalWorkCycles;
   2.487 +       totalBadCyclesAcrossCores  += workerParamsArray[i].data.totalBadCycles;
   2.488 +       totalSyncCyclesAcrossCores += workerParamsArray[i].data.totalSyncCycles;
   2.489 +       totalBadSyncCyclesAcrossCores  += workerParamsArray[i].data.totalBadSyncCycles;
   2.490 +    }
   2.491 +
   2.492 +   uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
   2.493 +   totalExeCycles -= totalBadCyclesAcrossCores;
   2.494 +   uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores;
   2.495 +   int32  numSyncs = outer_iters * num_threads * 2;
   2.496 +   printf("Total Execution Cycles: %lu\n", totalExeCycles);
   2.497 +   printf("Total number of cache misses: %lu\n", cache_misses);
   2.498 +   printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
   2.499 +   printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores);
   2.500 +//   printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);
   2.501 +   printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs );
   2.502 +   printf("ExeCycles/WorkCycles Ratio %f\n", 
   2.503 +          (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
   2.504 +   return 0;
   2.505 + }
     3.1 --- a/src/Application/main.c	Tue Dec 20 17:21:27 2011 +0100
     3.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.3 @@ -1,502 +0,0 @@
     3.4 -/* 
     3.5 - * 
     3.6 - */
     3.7 -#include <stdio.h>
     3.8 -#include <stdlib.h>
     3.9 -#include <string.h>
    3.10 -#include <math.h>
    3.11 -#include <ctype.h>
    3.12 -#include <errno.h>
    3.13 -#include <pthread.h>
    3.14 -#include <unistd.h>
    3.15 -#include "VPThread_lib/VPThread.h"
    3.16 -#include "VPThread_lib/VMS/Queue_impl/PrivateQueue.h"
    3.17 -
    3.18 -#include <linux/perf_event.h>
    3.19 -#include <linux/prctl.h>
    3.20 -#include <sys/syscall.h>
    3.21 -
    3.22 -#undef DEBUG
    3.23 -//#define DEBUG
    3.24 -
    3.25 -#if !defined(unix) && !defined(__unix__)
    3.26 -#ifdef __MACH__
    3.27 -#define unix		1
    3.28 -#define __unix__	1
    3.29 -#endif	/* __MACH__ */
    3.30 -#endif	/* unix */
    3.31 -
    3.32 -/* find the appropriate way to define explicitly sized types */
    3.33 -/* for C99 or GNU libc (also mach's libc) we can use stdint.h */
    3.34 -#if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__)
    3.35 -#include <stdint.h>
    3.36 -#elif defined(unix) || defined(__unix__)	/* some UNIX systems have them in sys/types.h */
    3.37 -#include <sys/types.h>
    3.38 -#elif defined(__WIN32__) || defined(WIN32)	/* the nameless one */
    3.39 -typedef unsigned __int8 uint8_t;
    3.40 -typedef unsigned __int32 uint32_t;
    3.41 -#endif	/* sized type detection */
    3.42 -
    3.43 -/* provide a millisecond-resolution timer for each system */
    3.44 -#if defined(unix) || defined(__unix__)
    3.45 -#include <time.h>
    3.46 -#include <sys/time.h>
    3.47 -unsigned long get_msec(void) {
    3.48 -	static struct timeval timeval, first_timeval;
    3.49 -
    3.50 -	gettimeofday(&timeval, 0);
    3.51 -	if(first_timeval.tv_sec == 0) {
    3.52 -		first_timeval = timeval;
    3.53 -		return 0;
    3.54 -	}
    3.55 -	return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000;
    3.56 -}
    3.57 -#elif defined(__WIN32__) || defined(WIN32)
    3.58 -#include <windows.h>
    3.59 -unsigned long get_msec(void) {
    3.60 -	return GetTickCount();
    3.61 -}
    3.62 -#else
    3.63 -//#error "I don't know how to measure time on your platform"
    3.64 -#endif
    3.65 -
    3.66 -//======================== Globals =========================
    3.67 -char __ProgrammName[] = "overhead_test";
    3.68 -char __DataSet[255];
    3.69 -
    3.70 -int outer_iters, inner_iters, num_threads;
    3.71 -size_t chunk_size = 0;
    3.72 -
    3.73 -int cycles_counter_main_fd;
    3.74 -int misses_counter_fd;
    3.75 -
    3.76 -uint64_t cache_misses;
    3.77 -
    3.78 -int cycles_counter_fd[NUM_CORES];
    3.79 -struct perf_event_attr* hw_event;
    3.80 -
    3.81 -//======================== Defines =========================
    3.82 -typedef struct perfData measurement_t;
    3.83 -struct perfData{
    3.84 -    uint64 cycles;
    3.85 -} __align_to_cacheline__;
    3.86 -
    3.87 -const char *usage = {
    3.88 -	"Usage: malloc_test [options]\n"
    3.89 -	"  Spwans a number of threads and allocates memory.\n\n"
    3.90 -	"Options:\n"
    3.91 -	"  -t <num>   how many threads to use (default: 1). This is internaly multiplied by the number of cores.\n"
    3.92 -	"  -o <num>   repeat workload and sync operation <m> times\n"
    3.93 -        "  -i <num>   size of workload, repeat <n> times\n"     
    3.94 -	"  -h         this help screen\n\n"
    3.95 -};
    3.96 -
    3.97 -struct barrier_t
    3.98 -{
    3.99 -    int counter;
   3.100 -    int nthreads;
   3.101 -    int32 mutex;
   3.102 -    int32 cond;
   3.103 -    measurement_t endBarrierCycles;
   3.104 -
   3.105 -} __align_to_cacheline__;
   3.106 -typedef struct barrier_t barrier;
   3.107 -
   3.108 -void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr)
   3.109 - {
   3.110 -   barr->counter = 0;
   3.111 -   barr->nthreads = nthreads;
   3.112 -   barr->mutex   = VPThread__make_mutex(animatingPr);
   3.113 -   barr->cond    = VPThread__make_cond(barr->mutex, animatingPr);
   3.114 - }
   3.115 -
   3.116 -void inline barrier_wait(barrier *barr, VirtProcr *animatingPr)
   3.117 - { int i;
   3.118 -
   3.119 -   VPThread__mutex_lock(barr->mutex, animatingPr);
   3.120 -   barr->counter++;
   3.121 -   if(barr->counter == barr->nthreads)
   3.122 -    { 
   3.123 -        read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \
   3.124 -                sizeof(barr->endBarrierCycles.cycles));
   3.125 -       
   3.126 -      barr->counter = 0;
   3.127 -      for(i=0; i < barr->nthreads; i++)
   3.128 -         VPThread__cond_signal(barr->cond, animatingPr);
   3.129 -    }
   3.130 -   else
   3.131 -    { VPThread__cond_wait(barr->cond, animatingPr);
   3.132 -    }
   3.133 -   VPThread__mutex_unlock(barr->mutex, animatingPr);
   3.134 - }
   3.135 -
   3.136 -
   3.137 -
   3.138 -struct WorkerParams_t
   3.139 - { struct barrier_t* barrier;
   3.140 -   uint64_t  totalWorkCycles;
   3.141 -   uint64_t  totalBadCycles;
   3.142 -   uint64_t  totalSyncCycles;
   3.143 -   uint64_t  totalBadSyncCycles;
   3.144 -   uint64     numGoodSyncs;
   3.145 -   uint64     numGoodTasks;
   3.146 - };
   3.147 - 
   3.148 - typedef union
   3.149 - {
   3.150 -     struct WorkerParams_t data;
   3.151 -     char padding[CACHELINE_SIZE];
   3.152 - } WorkerParams __align_to_cacheline__;
   3.153 - 
   3.154 -WorkerParams *workerParamsArray;
   3.155 -
   3.156 -typedef struct
   3.157 - { measurement_t *startExeCycles;
   3.158 -   measurement_t *endExeCycles;
   3.159 - } BenchParams __align_to_cacheline__;
   3.160 -
   3.161 -//======================== App Code =========================
   3.162 -/*
   3.163 - p* Workload
   3.164 - */
   3.165 -
   3.166 -#define saveCyclesAndInstrs(core,cycles) do{     \
   3.167 -   int cycles_fd = cycles_counter_fd[core];             \
   3.168 -   int nread;                                           \
   3.169 -                                                        \
   3.170 -   nread = read(cycles_fd,&(cycles),sizeof(cycles));    \
   3.171 -   if(nread<0){                                         \
   3.172 -       perror("Error reading cycles counter");          \
   3.173 -       cycles = 0;                                      \
   3.174 -   }                                                    \
   3.175 -} while (0) //macro magic for scoping
   3.176 - 
   3.177 -#define saveMisses(misses) do{     \
   3.178 -   int nread;                                           \
   3.179 -                                                        \
   3.180 -   nread = read(misses_counter_fd,&(misses),sizeof(misses));    \
   3.181 -   if(nread<0){                                         \
   3.182 -       perror("Error reading misses counter");          \
   3.183 -       misses = 0;                                      \
   3.184 -   }                                                    \
   3.185 -} while (0) //macro magic for scoping
   3.186 -
   3.187 -
   3.188 -double
   3.189 -worker_TLF(void* _params, VirtProcr* animatingPr)
   3.190 - {
   3.191 -   int i,o;
   3.192 -   WorkerParams* params = (WorkerParams*)_params;
   3.193 -   unsigned int totalWorkCycles = 0, totalBadCycles = 0;
   3.194 -   unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0;
   3.195 -   unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0;
   3.196 -   double workspace2=0.0;
   3.197 -   int32 privateMutex = VPThread__make_mutex(animatingPr);
   3.198 -   
   3.199 -   int cpuid = sched_getcpu();
   3.200 -   
   3.201 -   measurement_t startWorkload, endWorkload;
   3.202 -   uint64 numCycles;
   3.203 -   for(o=0; o < outer_iters; o++)
   3.204 -    {
   3.205 -       
   3.206 -          saveCyclesAndInstrs(cpuid,startWorkload.cycles);
   3.207 -       
   3.208 -      //task
   3.209 -      for(i=0; i < inner_iters; i++)
   3.210 -       {
   3.211 -         workspace1 += (workspace1 + 32)/2;
   3.212 -         workspace2 += (workspace2 + 23.2)/1.4;
   3.213 -       }
   3.214 -      
   3.215 -          saveCyclesAndInstrs(cpuid,endWorkload.cycles);
   3.216 -          numCycles = endWorkload.cycles - startWorkload.cycles;
   3.217 -          //sanity check (400K is about 20K iters)
   3.218 -          if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
   3.219 -          else                     {totalBadCycles  += numCycles; }
   3.220 -
   3.221 -      //mutex access often causes switch to different Slave VP
   3.222 -      VPThread__mutex_lock(privateMutex, animatingPr);
   3.223 -      
   3.224 -/*
   3.225 -          saveCyclesAndInstrs(cpuid,startWorkload2.cycles);
   3.226 -      //Task
   3.227 -      for(i=0; i < inner_iters; i++)
   3.228 -       {
   3.229 -         workspace1 += (workspace1 + 32)/2;
   3.230 -         workspace2 += (workspace2 + 23.2)/1.4;
   3.231 -       }
   3.232 -      
   3.233 -          saveCyclesAndInstrs(cpuid,endWorkload2.cycles);
   3.234 -          numCycles = endWorkload2.cycles - startWorkload2.cycles;
   3.235 -          //sanity check (400K is about 20K iters)
   3.236 -          if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
   3.237 -          else                     {totalBadCycles  += numCycles; }
   3.238 -      
   3.239 -*/
   3.240 -      VPThread__mutex_unlock(privateMutex, animatingPr);
   3.241 -    }
   3.242 -
   3.243 -   params->data.totalWorkCycles = totalWorkCycles;
   3.244 -   params->data.totalBadCycles = totalBadCycles;
   3.245 -   params->data.numGoodTasks   = numGoodTasks;
   3.246 -   params->data.totalSyncCycles = totalSyncCycles;
   3.247 -   params->data.totalBadSyncCycles = totalBadSyncCycles;
   3.248 -   params->data.numGoodSyncs = numGoodSyncs;
   3.249 -/*
   3.250 -   params->totalSyncCycles = VMS__give_num_plugin_cycles();
   3.251 -   params->totalBadSyncCycles = 0;
   3.252 -   params->numGoodSyncs = VMS__give_num_plugin_animations();
   3.253 -*/
   3.254 -   
   3.255 -   
   3.256 -   //Wait for all threads to end
   3.257 -   barrier_wait(params->data.barrier, animatingPr);
   3.258 -   
   3.259 -   //Shutdown worker
   3.260 -   VPThread__dissipate_thread(animatingPr);
   3.261 -   
   3.262 -     //below return never reached --> there for gcc
   3.263 -   return (workspace1 + workspace2);  //to prevent gcc from optimizing work out
   3.264 - }
   3.265 -
//Local variables of the benchmark, made global so each can be aligned
// to its own cache line (avoids false sharing between the barrier and
// other hot data — this file benchmarks exactly that effect).
struct barrier_t  barr __align_to_cacheline__;    //barrier shared by all worker VPs
BenchParams      *params __align_to_cacheline__;  //measurement slots handed to benchmark()

   3.270 -/* this is run after the VMS is set up*/
   3.271 -void benchmark(void *_params, VirtProcr *animatingPr)
   3.272 - {
   3.273 -   int i;
   3.274 -
   3.275 -   params = (BenchParams *)_params;
   3.276 -
   3.277 -   barrier_init(&barr, num_threads+1, animatingPr);
   3.278 -      
   3.279 -   //prepare input
   3.280 -   for(i=0; i<num_threads; i++)
   3.281 -    { 
   3.282 -       workerParamsArray[i].data.barrier = &barr;
   3.283 -    }
   3.284 -    
   3.285 -   uint64_t cache_misses_at_start, cache_misses_at_end;
   3.286 -   saveMisses(cache_misses_at_start);
   3.287 -   //save cycles before execution of threads, to get total exe cycles
   3.288 -   int nread = read(cycles_counter_main_fd, &(params->startExeCycles->cycles),
   3.289 -                sizeof(params->startExeCycles->cycles));
   3.290 -   if(nread<0) perror("Error reading cycles counter");
   3.291 -   
   3.292 -   //create (which starts running) all threads
   3.293 -   for(i=0; i<num_threads; i++)
   3.294 -    { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
   3.295 -    }
   3.296 -   //wait for all threads to finish
   3.297 -   barrier_wait(&barr, animatingPr);
   3.298 -  
   3.299 -   //endBarrierCycles read in barrier_wait()!  Merten, email me if want to chg
   3.300 -   params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
   3.301 -   saveMisses(cache_misses_at_end);
   3.302 -   cache_misses = cache_misses_at_end-cache_misses_at_start;
   3.303 -/*
   3.304 -   uint64_t overallWorkCycles = 0;
   3.305 -   for(i=0; i<num_threads; i++){ 
   3.306 -       printf("WorkCycles: %lu\n",input[i].totalWorkCycles);
   3.307 -       overallWorkCycles += input[i].totalWorkCycles;
   3.308 -    }
   3.309 -   
   3.310 -   printf("Sum across threads of work cycles: %lu\n", overallWorkCycles);
   3.311 -   printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles);
   3.312 -   printf("Runtime/Workcycle Ratio %lu\n", 
   3.313 -   ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles);
   3.314 -*/
   3.315 -
   3.316 -   //======================================================
   3.317 -
   3.318 -   VPThread__dissipate_thread(animatingPr);
   3.319 - }
   3.320 -
   3.321 -int main(int argc, char **argv)
   3.322 - {
   3.323 -   int i;
   3.324 -
   3.325 -   //set global static variables, based on cmd-line args
   3.326 -   for(i=1; i<argc; i++)
   3.327 -    {
   3.328 -      if(argv[i][0] == '-' && argv[i][2] == 0)
   3.329 -       {
   3.330 -         switch(argv[i][1])
   3.331 -          {
   3.332 -            case 't':
   3.333 -               if(!isdigit(argv[++i][0]))
   3.334 -                {
   3.335 -                  fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n");
   3.336 -                  return EXIT_FAILURE;
   3.337 -                }
   3.338 -               num_threads = atoi(argv[i]);
   3.339 -               if(!num_threads)
   3.340 -                {
   3.341 -                  fprintf(stderr, "invalid number of threads specified: %d\n", num_threads);
   3.342 -                  return EXIT_FAILURE;
   3.343 -                }
   3.344 -            break;
   3.345 -            case 'o':
   3.346 -               if(!isdigit(argv[++i][0]))
   3.347 -                {
   3.348 -                  fputs("-i must be followed by a number\n", stderr);
   3.349 -                  return EXIT_FAILURE;
   3.350 -                }
   3.351 -               outer_iters = atoi(argv[i]);
   3.352 -				break;
   3.353 -            case 'i':
   3.354 -               if(!isdigit(argv[++i][0]))
   3.355 -                {
   3.356 -                  fputs("-o must be followed by a number (workload size)\n", stderr);
   3.357 -                  return EXIT_FAILURE;
   3.358 -                }
   3.359 -               inner_iters = atoi(argv[i]);
   3.360 -				break;
   3.361 -            case 'h':
   3.362 -               fputs(usage, stdout);
   3.363 -               return 0;
   3.364 -				
   3.365 -            default:
   3.366 -               fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
   3.367 -               fputs(usage, stderr);
   3.368 -               return EXIT_FAILURE;
   3.369 -          }//switch
   3.370 -       }//if arg
   3.371 -      else
   3.372 -       {
   3.373 -			fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
   3.374 -			fputs(usage, stderr);
   3.375 -			return EXIT_FAILURE;
   3.376 -       }
   3.377 -    }//for
   3.378 -   
   3.379 -   
   3.380 -   //setup performance counters
   3.381 -    hw_event = malloc(sizeof(struct perf_event_attr));
   3.382 -    memset(hw_event,0,sizeof(struct perf_event_attr));
   3.383 -    
   3.384 -    hw_event->type = PERF_TYPE_HARDWARE;
   3.385 -    hw_event->size = sizeof(hw_event);
   3.386 -    hw_event->disabled = 0;
   3.387 -    hw_event->freq = 0;
   3.388 -    hw_event->inherit = 1; /* children inherit it   */
   3.389 -    hw_event->pinned = 1; /* says this virt counter must always be on HW */
   3.390 -    hw_event->exclusive = 0; /* only group on PMU     */
   3.391 -    hw_event->exclude_user = 0; /* don't count user      */
   3.392 -    hw_event->exclude_kernel = 1; /* don't count kernel  */
   3.393 -    hw_event->exclude_hv = 1; /* ditto hypervisor      */
   3.394 -    hw_event->exclude_idle = 1; /* don't count when idle */
   3.395 -    hw_event->mmap = 0; /* include mmap data     */
   3.396 -    hw_event->comm = 0; /* include comm data     */
   3.397 -
   3.398 -    hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles
   3.399 -    
   3.400 -    int cpuID, retries;
   3.401 -
   3.402 -   for( cpuID = 0; cpuID < NUM_CORES; cpuID++ )
   3.403 -    { retries = 0;
   3.404 -      do
   3.405 -       { retries += 1;
   3.406 -         cycles_counter_fd[cpuID] = 
   3.407 -          syscall(__NR_perf_event_open, hw_event,
   3.408 -                  0,//pid_t: 0 is "pid of calling process" 
   3.409 -                  cpuID,//int: cpu, the value returned by "CPUID" instr(?)
   3.410 -                  -1,//int: group_fd, -1 is "leader" or independent
   3.411 -                  0//unsigned long: flags
   3.412 -                 );
   3.413 -       }
   3.414 -      while(cycles_counter_fd[cpuID]<0 && retries < 100);
   3.415 -      if(retries >= 100)
   3.416 -       {
   3.417 -         fprintf(stderr,"On core %d: ",cpuID);
   3.418 -         perror("Failed to open cycles counter");
   3.419 -       }
   3.420 -    }
   3.421 -
   3.422 -   //Set up counter to accumulate total cycles to process, across all CPUs
   3.423 -
   3.424 -   retries = 0;
   3.425 -   do
   3.426 -    { retries += 1;
   3.427 -      cycles_counter_main_fd = 
   3.428 -       syscall(__NR_perf_event_open, hw_event,
   3.429 -               0,//pid_t: 0 is "pid of calling process" 
   3.430 -               -1,//int: cpu, -1 means accumulate from all cores
   3.431 -               -1,//int: group_fd, -1 is "leader" == independent
   3.432 -               0//unsigned long: flags
   3.433 -              );
   3.434 -    }
   3.435 -   while(cycles_counter_main_fd<0 && retries < 100);
   3.436 -   if(retries >= 100)
   3.437 -    {
   3.438 -      fprintf(stderr,"in main ");
   3.439 -      perror("Failed to open cycles counter");
   3.440 -    }
   3.441 -   
   3.442 -   //Set up counters to count cache misses
   3.443 -    hw_event->type = PERF_TYPE_HARDWARE;
   3.444 -    hw_event->config = PERF_COUNT_HW_CACHE_MISSES; //misses
   3.445 -    
   3.446 -   retries = 0;
   3.447 -   do
   3.448 -    { retries += 1;
   3.449 -      misses_counter_fd = 
   3.450 -       syscall(__NR_perf_event_open, hw_event,
   3.451 -               0,//pid_t: 0 is "pid of calling process" 
   3.452 -               -1,//int: cpu, -1 means accumulate from all cores
   3.453 -               -1,//int: group_fd, -1 is "leader" == independent
   3.454 -               0//unsigned long: flags
   3.455 -              );
   3.456 -    }
   3.457 -   while(misses_counter_fd<0 && retries < 100);
   3.458 -   if(retries >= 100)
   3.459 -    {
   3.460 -      fprintf(stderr,"in main ");
   3.461 -      perror("Failed to misses counter");
   3.462 -    }
   3.463 -   
   3.464 -   measurement_t startExeCycles, endExeCycles;
   3.465 -   BenchParams *benchParams;
   3.466 -   
   3.467 -   benchParams = malloc(sizeof(BenchParams)); 
   3.468 -   
   3.469 -   benchParams->startExeCycles = &startExeCycles;
   3.470 -   benchParams->endExeCycles   = &endExeCycles;
   3.471 -   
   3.472 -   workerParamsArray =  (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) );
   3.473 -   if(workerParamsArray == NULL ) printf("error mallocing worker params array\n");
   3.474 -   
   3.475 - 
   3.476 -   //This is the transition to the VMS runtime
   3.477 -   VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );
   3.478 -   
   3.479 -   uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
   3.480 -   uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
   3.481 -   for(i=0; i<num_threads; i++){ 
   3.482 -       printf("WorkCycles: %lu\n",workerParamsArray[i].data.totalWorkCycles);
   3.483 -//       printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
   3.484 -//       printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
   3.485 -//       printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
   3.486 -       totalWorkCyclesAcrossCores += workerParamsArray[i].data.totalWorkCycles;
   3.487 -       totalBadCyclesAcrossCores  += workerParamsArray[i].data.totalBadCycles;
   3.488 -       totalSyncCyclesAcrossCores += workerParamsArray[i].data.totalSyncCycles;
   3.489 -       totalBadSyncCyclesAcrossCores  += workerParamsArray[i].data.totalBadSyncCycles;
   3.490 -    }
   3.491 -
   3.492 -   uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
   3.493 -   totalExeCycles -= totalBadCyclesAcrossCores;
   3.494 -   uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores;
   3.495 -   int32  numSyncs = outer_iters * num_threads * 2;
   3.496 -   printf("Total Execution Cycles: %lu\n", totalExeCycles);
   3.497 -   printf("Total number of cache misses: %lu\n", cache_misses);
   3.498 -   printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
   3.499 -   printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores);
   3.500 -//   printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);
   3.501 -   printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs );
   3.502 -   printf("ExeCycles/WorkCycles Ratio %f\n", 
   3.503 -          (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
   3.504 -   return 0;
   3.505 - }