diff main.c @ 17:281cadcbb796

changed directory structure, added .hgeol file
author Merten Sach <msach@mailbox.tu-berlin.de>
date Mon, 13 Feb 2012 16:12:20 +0100
parents src/Application/main.c@c3561dbac1dc
children e7277df4460e
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/main.c	Mon Feb 13 16:12:20 2012 +0100
     1.3 @@ -0,0 +1,502 @@
     1.4 +/* 
     1.5 + * 
     1.6 + */
     1.7 +#include <stdio.h>
     1.8 +#include <stdlib.h>
     1.9 +#include <string.h>
    1.10 +#include <math.h>
    1.11 +#include <ctype.h>
    1.12 +#include <errno.h>
    1.13 +#include <pthread.h>
    1.14 +#include <unistd.h>
    1.15 +#include "VPThread_lib/VPThread.h"
    1.16 +#include "VPThread_lib/VMS/Queue_impl/PrivateQueue.h"
    1.17 +
    1.18 +#include <linux/perf_event.h>
    1.19 +#include <linux/prctl.h>
    1.20 +#include <sys/syscall.h>
    1.21 +
    1.22 +#undef DEBUG
    1.23 +//#define DEBUG
    1.24 +
    1.25 +#if !defined(unix) && !defined(__unix__)
    1.26 +#ifdef __MACH__
    1.27 +#define unix		1
    1.28 +#define __unix__	1
    1.29 +#endif	/* __MACH__ */
    1.30 +#endif	/* unix */
    1.31 +
    1.32 +/* find the appropriate way to define explicitly sized types */
    1.33 +/* for C99 or GNU libc (also mach's libc) we can use stdint.h */
    1.34 +#if (__STDC_VERSION__ >= 199901L) || defined(__GLIBC__) || defined(__MACH__)
    1.35 +#include <stdint.h>
    1.36 +#elif defined(unix) || defined(__unix__)	/* some UNIX systems have them in sys/types.h */
    1.37 +#include <sys/types.h>
    1.38 +#elif defined(__WIN32__) || defined(WIN32)	/* the nameless one */
    1.39 +typedef unsigned __int8 uint8_t;
    1.40 +typedef unsigned __int32 uint32_t;
    1.41 +#endif	/* sized type detection */
    1.42 +
    1.43 +/* provide a millisecond-resolution timer for each system */
    1.44 +#if defined(unix) || defined(__unix__)
    1.45 +#include <time.h>
    1.46 +#include <sys/time.h>
    1.47 +unsigned long get_msec(void) {
    1.48 +	static struct timeval timeval, first_timeval;
    1.49 +
    1.50 +	gettimeofday(&timeval, 0);
    1.51 +	if(first_timeval.tv_sec == 0) {
    1.52 +		first_timeval = timeval;
    1.53 +		return 0;
    1.54 +	}
    1.55 +	return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000;
    1.56 +}
    1.57 +#elif defined(__WIN32__) || defined(WIN32)
    1.58 +#include <windows.h>
    1.59 +unsigned long get_msec(void) {
    1.60 +	return GetTickCount();
    1.61 +}
    1.62 +#else
    1.63 +//#error "I don't know how to measure time on your platform"
    1.64 +#endif
    1.65 +
    1.66 +//======================== Globals =========================
    1.67 +char __ProgrammName[] = "overhead_test";
    1.68 +char __DataSet[255];
    1.69 +
    1.70 +int outer_iters, inner_iters, num_threads;
    1.71 +size_t chunk_size = 0;
    1.72 +
    1.73 +int cycles_counter_main_fd;
    1.74 +int misses_counter_fd;
    1.75 +
    1.76 +uint64_t cache_misses;
    1.77 +
    1.78 +int cycles_counter_fd[NUM_CORES];
    1.79 +struct perf_event_attr* hw_event;
    1.80 +
    1.81 +//======================== Defines =========================
    1.82 +typedef struct perfData measurement_t;
    1.83 +struct perfData{
    1.84 +    uint64 cycles;
    1.85 +} __align_to_cacheline__;
    1.86 +
    1.87 +const char *usage = {
    1.88 +	"Usage: malloc_test [options]\n"
    1.89 +	"  Spawns a number of threads and allocates memory.\n\n"
    1.90 +	"Options:\n"
    1.91 +	"  -t <num>   how many threads to use (default: 1). This is internally multiplied by the number of cores.\n"
    1.92 +	"  -o <num>   repeat workload and sync operation <m> times\n"
    1.93 +        "  -i <num>   size of workload, repeat <n> times\n"     
    1.94 +	"  -h         this help screen\n\n"
    1.95 +};
    1.96 +
    1.97 +struct barrier_t
    1.98 +{
    1.99 +    int counter;
   1.100 +    int nthreads;
   1.101 +    int32 mutex;
   1.102 +    int32 cond;
   1.103 +    measurement_t endBarrierCycles;
   1.104 +
   1.105 +} __align_to_cacheline__;
   1.106 +typedef struct barrier_t barrier;
   1.107 +
   1.108 +void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr)
   1.109 + {
   1.110 +   barr->counter = 0;
   1.111 +   barr->nthreads = nthreads;
   1.112 +   barr->mutex   = VPThread__make_mutex(animatingPr);
   1.113 +   barr->cond    = VPThread__make_cond(barr->mutex, animatingPr);
   1.114 + }
   1.115 +
   1.116 +void inline barrier_wait(barrier *barr, VirtProcr *animatingPr)
   1.117 + { int i;
   1.118 +
   1.119 +   VPThread__mutex_lock(barr->mutex, animatingPr);
   1.120 +   barr->counter++;
   1.121 +   if(barr->counter == barr->nthreads)
   1.122 +    { 
   1.123 +        read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \
   1.124 +                sizeof(barr->endBarrierCycles.cycles));
   1.125 +       
   1.126 +      barr->counter = 0;
   1.127 +      for(i=0; i < barr->nthreads; i++)
   1.128 +         VPThread__cond_signal(barr->cond, animatingPr);
   1.129 +    }
   1.130 +   else
   1.131 +    { VPThread__cond_wait(barr->cond, animatingPr);
   1.132 +    }
   1.133 +   VPThread__mutex_unlock(barr->mutex, animatingPr);
   1.134 + }
   1.135 +
   1.136 +
   1.137 +
   1.138 +struct WorkerParams_t
   1.139 + { struct barrier_t* barrier;
   1.140 +   uint64_t  totalWorkCycles;
   1.141 +   uint64_t  totalBadCycles;
   1.142 +   uint64_t  totalSyncCycles;
   1.143 +   uint64_t  totalBadSyncCycles;
   1.144 +   uint64     numGoodSyncs;
   1.145 +   uint64     numGoodTasks;
   1.146 + };
   1.147 + 
   1.148 + typedef union
   1.149 + {
   1.150 +     struct WorkerParams_t data;
   1.151 +     char padding[CACHELINE_SIZE];
   1.152 + } WorkerParams __align_to_cacheline__;
   1.153 + 
   1.154 +WorkerParams *workerParamsArray;
   1.155 +
   1.156 +typedef struct
   1.157 + { measurement_t *startExeCycles;
   1.158 +   measurement_t *endExeCycles;
   1.159 + } BenchParams __align_to_cacheline__;
   1.160 +
   1.161 +//======================== App Code =========================
   1.162 +/*
    1.163 + * Workload
   1.164 + */
   1.165 +
   1.166 +#define saveCyclesAndInstrs(core,cycles) do{     \
   1.167 +   int cycles_fd = cycles_counter_fd[core];             \
   1.168 +   int nread;                                           \
   1.169 +                                                        \
   1.170 +   nread = read(cycles_fd,&(cycles),sizeof(cycles));    \
   1.171 +   if(nread<0){                                         \
   1.172 +       perror("Error reading cycles counter");          \
   1.173 +       cycles = 0;                                      \
   1.174 +   }                                                    \
   1.175 +} while (0) //macro magic for scoping
   1.176 + 
   1.177 +#define saveMisses(misses) do{     \
   1.178 +   int nread;                                           \
   1.179 +                                                        \
   1.180 +   nread = read(misses_counter_fd,&(misses),sizeof(misses));    \
   1.181 +   if(nread<0){                                         \
   1.182 +       perror("Error reading misses counter");          \
   1.183 +       misses = 0;                                      \
   1.184 +   }                                                    \
   1.185 +} while (0) //macro magic for scoping
   1.186 +
   1.187 +
   1.188 +double
   1.189 +worker_TLF(void* _params, VirtProcr* animatingPr)
   1.190 + {
   1.191 +   int i,o;
   1.192 +   WorkerParams* params = (WorkerParams*)_params;
   1.193 +   unsigned int totalWorkCycles = 0, totalBadCycles = 0;
   1.194 +   unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0;
   1.195 +   unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0;
   1.196 +   double workspace2=0.0;
   1.197 +   int32 privateMutex = VPThread__make_mutex(animatingPr);
   1.198 +   
   1.199 +   int cpuid = sched_getcpu();
   1.200 +   
   1.201 +   measurement_t startWorkload, endWorkload;
   1.202 +   uint64 numCycles;
   1.203 +   for(o=0; o < outer_iters; o++)
   1.204 +    {
   1.205 +       
   1.206 +          saveCyclesAndInstrs(cpuid,startWorkload.cycles);
   1.207 +       
   1.208 +      //task
   1.209 +      for(i=0; i < inner_iters; i++)
   1.210 +       {
   1.211 +         workspace1 += (workspace1 + 32)/2;
   1.212 +         workspace2 += (workspace2 + 23.2)/1.4;
   1.213 +       }
   1.214 +      
   1.215 +          saveCyclesAndInstrs(cpuid,endWorkload.cycles);
   1.216 +          numCycles = endWorkload.cycles - startWorkload.cycles;
   1.217 +          //sanity check (400K is about 20K iters)
   1.218 +          if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
   1.219 +          else                     {totalBadCycles  += numCycles; }
   1.220 +
   1.221 +      //mutex access often causes switch to different Slave VP
   1.222 +      VPThread__mutex_lock(privateMutex, animatingPr);
   1.223 +      
   1.224 +/*
   1.225 +          saveCyclesAndInstrs(cpuid,startWorkload2.cycles);
   1.226 +      //Task
   1.227 +      for(i=0; i < inner_iters; i++)
   1.228 +       {
   1.229 +         workspace1 += (workspace1 + 32)/2;
   1.230 +         workspace2 += (workspace2 + 23.2)/1.4;
   1.231 +       }
   1.232 +      
   1.233 +          saveCyclesAndInstrs(cpuid,endWorkload2.cycles);
   1.234 +          numCycles = endWorkload2.cycles - startWorkload2.cycles;
   1.235 +          //sanity check (400K is about 20K iters)
   1.236 +          if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
   1.237 +          else                     {totalBadCycles  += numCycles; }
   1.238 +      
   1.239 +*/
   1.240 +      VPThread__mutex_unlock(privateMutex, animatingPr);
   1.241 +    }
   1.242 +
   1.243 +   params->data.totalWorkCycles = totalWorkCycles;
   1.244 +   params->data.totalBadCycles = totalBadCycles;
   1.245 +   params->data.numGoodTasks   = numGoodTasks;
   1.246 +   params->data.totalSyncCycles = totalSyncCycles;
   1.247 +   params->data.totalBadSyncCycles = totalBadSyncCycles;
   1.248 +   params->data.numGoodSyncs = numGoodSyncs;
   1.249 +/*
   1.250 +   params->totalSyncCycles = VMS__give_num_plugin_cycles();
   1.251 +   params->totalBadSyncCycles = 0;
   1.252 +   params->numGoodSyncs = VMS__give_num_plugin_animations();
   1.253 +*/
   1.254 +   
   1.255 +   
   1.256 +   //Wait for all threads to end
   1.257 +   barrier_wait(params->data.barrier, animatingPr);
   1.258 +   
   1.259 +   //Shutdown worker
   1.260 +   VPThread__dissipate_thread(animatingPr);
   1.261 +   
   1.262 +     //below return never reached --> there for gcc
   1.263 +   return (workspace1 + workspace2);  //to prevent gcc from optimizing work out
   1.264 + }
   1.265 +
   1.266 +//local variables of benchmark, made global for alignment
   1.267 +struct barrier_t  barr __align_to_cacheline__;
   1.268 +BenchParams      *params __align_to_cacheline__;
   1.269 +
   1.270 +/* this is run after the VMS is set up*/
   1.271 +void benchmark(void *_params, VirtProcr *animatingPr)
   1.272 + {
   1.273 +   int i;
   1.274 +
   1.275 +   params = (BenchParams *)_params;
   1.276 +
   1.277 +   barrier_init(&barr, num_threads+1, animatingPr);
   1.278 +      
   1.279 +   //prepare input
   1.280 +   for(i=0; i<num_threads; i++)
   1.281 +    { 
   1.282 +       workerParamsArray[i].data.barrier = &barr;
   1.283 +    }
   1.284 +    
   1.285 +   uint64_t cache_misses_at_start, cache_misses_at_end;
   1.286 +   saveMisses(cache_misses_at_start);
   1.287 +   //save cycles before execution of threads, to get total exe cycles
   1.288 +   int nread = read(cycles_counter_main_fd, &(params->startExeCycles->cycles),
   1.289 +                sizeof(params->startExeCycles->cycles));
   1.290 +   if(nread<0) perror("Error reading cycles counter");
   1.291 +   
   1.292 +   //create (which starts running) all threads
   1.293 +   for(i=0; i<num_threads; i++)
   1.294 +    { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
   1.295 +    }
   1.296 +   //wait for all threads to finish
   1.297 +   barrier_wait(&barr, animatingPr);
   1.298 +  
   1.299 +   //endBarrierCycles read in barrier_wait()!  Merten, email me if want to chg
   1.300 +   params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
   1.301 +   saveMisses(cache_misses_at_end);
   1.302 +   cache_misses = cache_misses_at_end-cache_misses_at_start;
   1.303 +/*
   1.304 +   uint64_t overallWorkCycles = 0;
   1.305 +   for(i=0; i<num_threads; i++){ 
   1.306 +       printf("WorkCycles: %lu\n",input[i].totalWorkCycles);
   1.307 +       overallWorkCycles += input[i].totalWorkCycles;
   1.308 +    }
   1.309 +   
   1.310 +   printf("Sum across threads of work cycles: %lu\n", overallWorkCycles);
   1.311 +   printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles);
   1.312 +   printf("Runtime/Workcycle Ratio %lu\n", 
   1.313 +   ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles);
   1.314 +*/
   1.315 +
   1.316 +   //======================================================
   1.317 +
   1.318 +   VPThread__dissipate_thread(animatingPr);
   1.319 + }
   1.320 +
   1.321 +int main(int argc, char **argv)
   1.322 + {
   1.323 +   int i;
   1.324 +
   1.325 +   //set global static variables, based on cmd-line args
   1.326 +   for(i=1; i<argc; i++)
   1.327 +    {
   1.328 +      if(argv[i][0] == '-' && argv[i][2] == 0)
   1.329 +       {
   1.330 +         switch(argv[i][1])
   1.331 +          {
   1.332 +            case 't':
   1.333 +               if(!isdigit(argv[++i][0]))
   1.334 +                {
   1.335 +                  fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n");
   1.336 +                  return EXIT_FAILURE;
   1.337 +                }
   1.338 +               num_threads = atoi(argv[i]);
   1.339 +               if(!num_threads)
   1.340 +                {
   1.341 +                  fprintf(stderr, "invalid number of threads specified: %d\n", num_threads);
   1.342 +                  return EXIT_FAILURE;
   1.343 +                }
   1.344 +            break;
   1.345 +            case 'o':
   1.346 +               if(!isdigit(argv[++i][0]))
   1.347 +                {
    1.348 +                  fputs("-o must be followed by a number\n", stderr);
   1.349 +                  return EXIT_FAILURE;
   1.350 +                }
   1.351 +               outer_iters = atoi(argv[i]);
   1.352 +				break;
   1.353 +            case 'i':
   1.354 +               if(!isdigit(argv[++i][0]))
   1.355 +                {
    1.356 +                  fputs("-i must be followed by a number (workload size)\n", stderr);
   1.357 +                  return EXIT_FAILURE;
   1.358 +                }
   1.359 +               inner_iters = atoi(argv[i]);
   1.360 +				break;
   1.361 +            case 'h':
   1.362 +               fputs(usage, stdout);
   1.363 +               return 0;
   1.364 +				
   1.365 +            default:
   1.366 +               fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
   1.367 +               fputs(usage, stderr);
   1.368 +               return EXIT_FAILURE;
   1.369 +          }//switch
   1.370 +       }//if arg
   1.371 +      else
   1.372 +       {
   1.373 +			fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
   1.374 +			fputs(usage, stderr);
   1.375 +			return EXIT_FAILURE;
   1.376 +       }
   1.377 +    }//for
   1.378 +   
   1.379 +   
   1.380 +   //setup performance counters
   1.381 +    hw_event = malloc(sizeof(struct perf_event_attr));
   1.382 +    memset(hw_event,0,sizeof(struct perf_event_attr));
   1.383 +    
   1.384 +    hw_event->type = PERF_TYPE_HARDWARE;
    1.385 +    hw_event->size = sizeof(*hw_event);
   1.386 +    hw_event->disabled = 0;
   1.387 +    hw_event->freq = 0;
   1.388 +    hw_event->inherit = 1; /* children inherit it   */
   1.389 +    hw_event->pinned = 1; /* says this virt counter must always be on HW */
   1.390 +    hw_event->exclusive = 0; /* only group on PMU     */
    1.391 +    hw_event->exclude_user = 0; /* count user-space  */
   1.392 +    hw_event->exclude_kernel = 1; /* don't count kernel  */
   1.393 +    hw_event->exclude_hv = 1; /* ditto hypervisor      */
   1.394 +    hw_event->exclude_idle = 1; /* don't count when idle */
   1.395 +    hw_event->mmap = 0; /* include mmap data     */
   1.396 +    hw_event->comm = 0; /* include comm data     */
   1.397 +
   1.398 +    hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles
   1.399 +    
   1.400 +    int cpuID, retries;
   1.401 +
   1.402 +   for( cpuID = 0; cpuID < NUM_CORES; cpuID++ )
   1.403 +    { retries = 0;
   1.404 +      do
   1.405 +       { retries += 1;
   1.406 +         cycles_counter_fd[cpuID] = 
   1.407 +          syscall(__NR_perf_event_open, hw_event,
   1.408 +                  0,//pid_t: 0 is "pid of calling process" 
   1.409 +                  cpuID,//int: cpu, the value returned by "CPUID" instr(?)
   1.410 +                  -1,//int: group_fd, -1 is "leader" or independent
   1.411 +                  0//unsigned long: flags
   1.412 +                 );
   1.413 +       }
   1.414 +      while(cycles_counter_fd[cpuID]<0 && retries < 100);
   1.415 +      if(retries >= 100)
   1.416 +       {
   1.417 +         fprintf(stderr,"On core %d: ",cpuID);
   1.418 +         perror("Failed to open cycles counter");
   1.419 +       }
   1.420 +    }
   1.421 +
   1.422 +   //Set up counter to accumulate total cycles to process, across all CPUs
   1.423 +
   1.424 +   retries = 0;
   1.425 +   do
   1.426 +    { retries += 1;
   1.427 +      cycles_counter_main_fd = 
   1.428 +       syscall(__NR_perf_event_open, hw_event,
   1.429 +               0,//pid_t: 0 is "pid of calling process" 
   1.430 +               -1,//int: cpu, -1 means accumulate from all cores
   1.431 +               -1,//int: group_fd, -1 is "leader" == independent
   1.432 +               0//unsigned long: flags
   1.433 +              );
   1.434 +    }
   1.435 +   while(cycles_counter_main_fd<0 && retries < 100);
   1.436 +   if(retries >= 100)
   1.437 +    {
   1.438 +      fprintf(stderr,"in main ");
   1.439 +      perror("Failed to open cycles counter");
   1.440 +    }
   1.441 +   
   1.442 +   //Set up counters to count cache misses
   1.443 +    hw_event->type = PERF_TYPE_HARDWARE;
   1.444 +    hw_event->config = PERF_COUNT_HW_CACHE_MISSES; //misses
   1.445 +    
   1.446 +   retries = 0;
   1.447 +   do
   1.448 +    { retries += 1;
   1.449 +      misses_counter_fd = 
   1.450 +       syscall(__NR_perf_event_open, hw_event,
   1.451 +               0,//pid_t: 0 is "pid of calling process" 
   1.452 +               -1,//int: cpu, -1 means accumulate from all cores
   1.453 +               -1,//int: group_fd, -1 is "leader" == independent
   1.454 +               0//unsigned long: flags
   1.455 +              );
   1.456 +    }
   1.457 +   while(misses_counter_fd<0 && retries < 100);
   1.458 +   if(retries >= 100)
   1.459 +    {
   1.460 +      fprintf(stderr,"in main ");
    1.461 +      perror("Failed to open misses counter");
   1.462 +    }
   1.463 +   
   1.464 +   measurement_t startExeCycles, endExeCycles;
   1.465 +   BenchParams *benchParams;
   1.466 +   
   1.467 +   benchParams = malloc(sizeof(BenchParams)); 
   1.468 +   
   1.469 +   benchParams->startExeCycles = &startExeCycles;
   1.470 +   benchParams->endExeCycles   = &endExeCycles;
   1.471 +   
   1.472 +   workerParamsArray =  (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) );
   1.473 +   if(workerParamsArray == NULL ) printf("error mallocing worker params array\n");
   1.474 +   
   1.475 + 
   1.476 +   //This is the transition to the VMS runtime
   1.477 +   VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );
   1.478 +   
   1.479 +   uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
   1.480 +   uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
   1.481 +   for(i=0; i<num_threads; i++){ 
   1.482 +       printf("WorkCycles: %lu\n",workerParamsArray[i].data.totalWorkCycles);
   1.483 +//       printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
   1.484 +//       printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
   1.485 +//       printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
   1.486 +       totalWorkCyclesAcrossCores += workerParamsArray[i].data.totalWorkCycles;
   1.487 +       totalBadCyclesAcrossCores  += workerParamsArray[i].data.totalBadCycles;
   1.488 +       totalSyncCyclesAcrossCores += workerParamsArray[i].data.totalSyncCycles;
   1.489 +       totalBadSyncCyclesAcrossCores  += workerParamsArray[i].data.totalBadSyncCycles;
   1.490 +    }
   1.491 +
   1.492 +   uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
   1.493 +   totalExeCycles -= totalBadCyclesAcrossCores;
   1.494 +   uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores;
   1.495 +   int32  numSyncs = outer_iters * num_threads * 2;
   1.496 +   printf("Total Execution Cycles: %lu\n", totalExeCycles);
   1.497 +   printf("Total number of cache misses: %lu\n", cache_misses);
   1.498 +   printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
   1.499 +   printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores);
   1.500 +//   printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);
   1.501 +   printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs );
   1.502 +   printf("ExeCycles/WorkCycles Ratio %f\n", 
   1.503 +          (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
   1.504 +   return 0;
   1.505 + }