annotate main.c @ 18:e7277df4460e

New include paths because of new project structure
author Merten Sach <msach@mailbox.tu-berlin.de>
date Mon, 13 Feb 2012 19:34:46 +0100
parents 281cadcbb796
children
rev   line source
Me@4 1 /*
Me@4 2 *
Me@4 3 */
Me@4 4 #include <stdio.h>
Me@4 5 #include <stdlib.h>
Me@4 6 #include <string.h>
Me@4 7 #include <math.h>
Me@4 8 #include <ctype.h>
Me@4 9 #include <errno.h>
Me@4 10 #include <pthread.h>
msach@6 11 #include <unistd.h>
msach@18 12 #include "VMS_Implementations/Vthread_impl/VPThread.h"
msach@18 13 #include "C_Libraries/Queue_impl/PrivateQueue.h"
Me@4 14
msach@6 15 #include <linux/perf_event.h>
msach@6 16 #include <linux/prctl.h>
msach@6 17 #include <sys/syscall.h>
msach@6 18
#undef DEBUG
//#define DEBUG

/* Normalize the "unix" feature macros: Mach (OS X) does not define
 * unix/__unix__ itself, so define them there to select the unix code
 * paths below (timer, sized types). */
#if !defined(unix) && !defined(__unix__)
#ifdef __MACH__
#define unix 1
#define __unix__ 1
#endif /* __MACH__ */
#endif /* unix */

/* find the appropriate way to define explicitly sized types */
/* for C99 or GNU libc (also mach's libc) we can use stdint.h */
#if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__)
#include <stdint.h>
#elif defined(unix) || defined(__unix__) /* some UNIX systems have them in sys/types.h */
#include <sys/types.h>
#elif defined(__WIN32__) || defined(WIN32) /* the nameless one */
typedef unsigned __int8 uint8_t;
typedef unsigned __int32 uint32_t;
#endif /* sized type detection */
Me@4 39
Me@4 40 /* provide a millisecond-resolution timer for each system */
Me@4 41 #if defined(unix) || defined(__unix__)
Me@4 42 #include <time.h>
Me@4 43 #include <sys/time.h>
/* Millisecond timer: returns ms elapsed since the FIRST call (which
 * itself returns 0 and latches the origin timestamp).
 * NOTE(review): uses origin.tv_sec == 0 as the "uninitialized" sentinel,
 * so it is not reentrant/thread-safe — confirm single-threaded use. */
unsigned long get_msec(void) {
    static struct timeval now, origin;

    gettimeofday(&now, 0);
    if (origin.tv_sec == 0) {
        origin = now;
        return 0;
    }
    long sec_delta  = now.tv_sec  - origin.tv_sec;
    long usec_delta = now.tv_usec - origin.tv_usec;
    return sec_delta * 1000 + usec_delta / 1000;
}
#elif defined(__WIN32__) || defined(WIN32)
#include <windows.h>
/* Millisecond timer for Windows. NOTE: GetTickCount() counts ms since
 * system boot, not since the first call (unlike the unix variant),
 * and wraps after ~49.7 days. */
unsigned long get_msec(void) {
return GetTickCount();
}
#else
//#error "I don't know how to measure time on your platform"
#endif
Me@4 62
//======================== Globals =========================
char __ProgrammName[] = "overhead_test"; /* NOTE(review): leading "__" is a reserved identifier */
char __DataSet[255];                     /* dataset label buffer; appears unused in this file */

int outer_iters, inner_iters, num_threads; /* set from -o / -i / -t command-line options */
size_t chunk_size = 0;

int cycles_counter_main_fd; /* perf fd: cycles accumulated across all cores */
int misses_counter_fd;      /* perf fd: cache misses accumulated across all cores */

uint64_t cache_misses;      /* total cache misses over the benchmark run (set in benchmark()) */

int cycles_counter_fd[NUM_CORES];   /* per-core pinned cycles-counter fds */
struct perf_event_attr* hw_event;   /* attr struct reused for every perf_event_open call */
msach@14 77
Me@4 78 //======================== Defines =========================
/* One sample of a cycles perf counter; padded/aligned to a cache line
 * (project macro __align_to_cacheline__) to avoid false sharing. */
typedef struct perfData measurement_t;
struct perfData{
    uint64 cycles;   /* raw cycle count read() from a perf_event fd */
} __align_to_cacheline__;
Me@4 83
/* Command-line help text, printed for -h and on argument errors.
 * Fixed: program name now matches __ProgrammName ("overhead_test",
 * was "malloc_test"), plus typos "Spwans"/"internaly". */
const char *usage = {
    "Usage: overhead_test [options]\n"
    " Spawns a number of threads and allocates memory.\n\n"
    "Options:\n"
    " -t <num> how many threads to use (default: 1). This is internally multiplied by the number of cores.\n"
    " -o <num> repeat workload and sync operation <m> times\n"
    " -i <num> size of workload, repeat <n> times\n"
    " -h this help screen\n\n"
};
Me@4 93
/* Counting barrier built on VPThread mutex/cond handles (int32 ids).
 * The last thread to arrive also snapshots the global cycles counter
 * into endBarrierCycles (see barrier_wait), so the end-of-run
 * timestamp is taken exactly when the final worker finishes. */
struct barrier_t
{
    int counter;                     /* threads arrived so far */
    int nthreads;                    /* release threshold */
    int32 mutex;
    int32 cond;
    measurement_t endBarrierCycles;  /* cycles snapshot taken by the last arriver */

} __align_to_cacheline__;
typedef struct barrier_t barrier;
Me@4 104
Me@4 105 void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr)
Me@4 106 {
Me@4 107 barr->counter = 0;
Me@4 108 barr->nthreads = nthreads;
Me@4 109 barr->mutex = VPThread__make_mutex(animatingPr);
Me@4 110 barr->cond = VPThread__make_cond(barr->mutex, animatingPr);
Me@4 111 }
Me@4 112
Me@4 113 void inline barrier_wait(barrier *barr, VirtProcr *animatingPr)
Me@4 114 { int i;
Me@4 115
Me@4 116 VPThread__mutex_lock(barr->mutex, animatingPr);
Me@4 117 barr->counter++;
Me@4 118 if(barr->counter == barr->nthreads)
kshalle@8 119 {
msach@13 120 read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \
kshalle@8 121 sizeof(barr->endBarrierCycles.cycles));
kshalle@8 122
kshalle@8 123 barr->counter = 0;
Me@4 124 for(i=0; i < barr->nthreads; i++)
Me@4 125 VPThread__cond_signal(barr->cond, animatingPr);
Me@4 126 }
Me@4 127 else
Me@4 128 { VPThread__cond_wait(barr->cond, animatingPr);
Me@4 129 }
Me@4 130 VPThread__mutex_unlock(barr->mutex, animatingPr);
Me@4 131 }
Me@4 132
kshalle@8 133
kshalle@8 134
/* Per-worker result record, filled in by worker_TLF before it exits
 * and summed up in main(). */
struct WorkerParams_t
{   struct barrier_t* barrier;   /* end-of-run barrier shared by all workers */
    uint64_t totalWorkCycles;    /* cycles from workload passes under the sanity bound */
    uint64_t totalBadCycles;     /* cycles from outlier passes (>= 400K, see worker_TLF) */
    uint64_t totalSyncCycles;
    uint64_t totalBadSyncCycles;
    uint64 numGoodSyncs;
    uint64 numGoodTasks;         /* number of workload passes under the sanity bound */
};
msach@13 144
/* Pad each worker's record to a full cache line so writers on
 * different cores don't false-share (CACHELINE_SIZE and
 * __align_to_cacheline__ are project-provided). */
typedef union
{
    struct WorkerParams_t data;
    char padding[CACHELINE_SIZE];
} WorkerParams __align_to_cacheline__;

/* One slot per worker thread; allocated in main(). */
WorkerParams *workerParamsArray;
Me@4 152
/* Parameters handed from main() to benchmark(): where to store the
 * cycle-counter samples that bracket the whole threaded run. */
typedef struct
{   measurement_t *startExeCycles;   /* counter value before threads are created */
    measurement_t *endExeCycles;     /* counter value taken at the end barrier */
} BenchParams __align_to_cacheline__;
Me@4 157
Me@4 158 //======================== App Code =========================
Me@4 159 /*
msach@13 160  * Workload
Me@4 161 */
msach@6 162
/* Read the per-core cycles counter for <core> into <cycles>.
 * On a failed read() reports the error and zeroes <cycles>.
 * NOTE(review): a short (partial) read is not detected, only nread<0. */
#define saveCyclesAndInstrs(core,cycles) do{ \
        int cycles_fd = cycles_counter_fd[core]; \
        int nread; \
        \
        nread = read(cycles_fd,&(cycles),sizeof(cycles)); \
        if(nread<0){ \
            perror("Error reading cycles counter"); \
            cycles = 0; \
        } \
    } while (0) //macro magic for scoping
msach@14 173
/* Read the system-wide cache-miss counter into <misses>.
 * On a failed read() reports the error and zeroes <misses>.
 * NOTE(review): a short (partial) read is not detected, only nread<0. */
#define saveMisses(misses) do{ \
        int nread; \
        \
        nread = read(misses_counter_fd,&(misses),sizeof(misses)); \
        if(nread<0){ \
            perror("Error reading misses counter"); \
            misses = 0; \
        } \
    } while (0) //macro magic for scoping
msach@6 183
msach@7 184
msach@9 185 double
msach@9 186 worker_TLF(void* _params, VirtProcr* animatingPr)
Me@5 187 {
msach@7 188 int i,o;
msach@9 189 WorkerParams* params = (WorkerParams*)_params;
msach@9 190 unsigned int totalWorkCycles = 0, totalBadCycles = 0;
msach@9 191 unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0;
msach@9 192 unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0;
kshalle@8 193 double workspace2=0.0;
Me@5 194 int32 privateMutex = VPThread__make_mutex(animatingPr);
msach@6 195
msach@6 196 int cpuid = sched_getcpu();
msach@9 197
msach@13 198 measurement_t startWorkload, endWorkload;
msach@9 199 uint64 numCycles;
msach@9 200 for(o=0; o < outer_iters; o++)
Me@4 201 {
msach@6 202
msach@10 203 saveCyclesAndInstrs(cpuid,startWorkload.cycles);
msach@9 204
msach@13 205 //task
msach@9 206 for(i=0; i < inner_iters; i++)
Me@5 207 {
Me@5 208 workspace1 += (workspace1 + 32)/2;
Me@5 209 workspace2 += (workspace2 + 23.2)/1.4;
Me@5 210 }
msach@6 211
msach@10 212 saveCyclesAndInstrs(cpuid,endWorkload.cycles);
msach@10 213 numCycles = endWorkload.cycles - startWorkload.cycles;
msach@9 214 //sanity check (400K is about 20K iters)
msach@9 215 if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
msach@9 216 else {totalBadCycles += numCycles; }
msach@9 217
msach@9 218 //mutex access often causes switch to different Slave VP
msach@9 219 VPThread__mutex_lock(privateMutex, animatingPr);
msach@11 220
msach@10 221 /*
msach@11 222 saveCyclesAndInstrs(cpuid,startWorkload2.cycles);
msach@11 223 //Task
msach@11 224 for(i=0; i < inner_iters; i++)
msach@11 225 {
msach@11 226 workspace1 += (workspace1 + 32)/2;
msach@11 227 workspace2 += (workspace2 + 23.2)/1.4;
msach@11 228 }
msach@11 229
msach@11 230 saveCyclesAndInstrs(cpuid,endWorkload2.cycles);
msach@11 231 numCycles = endWorkload2.cycles - startWorkload2.cycles;
msach@9 232 //sanity check (400K is about 20K iters)
msach@11 233 if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
msach@11 234 else {totalBadCycles += numCycles; }
msach@11 235
msach@10 236 */
msach@9 237 VPThread__mutex_unlock(privateMutex, animatingPr);
Me@4 238 }
Me@5 239
msach@13 240 params->data.totalWorkCycles = totalWorkCycles;
msach@13 241 params->data.totalBadCycles = totalBadCycles;
msach@13 242 params->data.numGoodTasks = numGoodTasks;
msach@13 243 params->data.totalSyncCycles = totalSyncCycles;
msach@13 244 params->data.totalBadSyncCycles = totalBadSyncCycles;
msach@13 245 params->data.numGoodSyncs = numGoodSyncs;
msach@9 246 /*
msach@9 247 params->totalSyncCycles = VMS__give_num_plugin_cycles();
msach@9 248 params->totalBadSyncCycles = 0;
msach@9 249 params->numGoodSyncs = VMS__give_num_plugin_animations();
msach@9 250 */
msach@6 251
msach@6 252
msach@6 253 //Wait for all threads to end
msach@13 254 barrier_wait(params->data.barrier, animatingPr);
Me@5 255
Me@5 256 //Shutdown worker
Me@5 257 VPThread__dissipate_thread(animatingPr);
msach@9 258
msach@9 259 //below return never reached --> there for gcc
msach@9 260 return (workspace1 + workspace2); //to prevent gcc from optimizing work out
Me@5 261 }
Me@4 262
//local variables of benchmark, made global for alignment
/* (cache-line aligned so the barrier and result pointers don't
 *  false-share with neighbouring globals) */
struct barrier_t barr __align_to_cacheline__;
BenchParams *params __align_to_cacheline__;
kshalle@8 266
/* this is run after the VMS is set up */
/* Benchmark driver VP: wires each worker's slot to the shared barrier,
 * samples the cache-miss and cycles counters, spawns num_threads
 * workers, waits for them at the barrier, then records total misses
 * and the end-of-run cycle count. Counter reads deliberately bracket
 * thread creation/join — do not reorder these statements. */
void benchmark(void *_params, VirtProcr *animatingPr)
{
    int i;

    params = (BenchParams *)_params;

    /* +1 participant: this benchmark VP also waits at the barrier. */
    barrier_init(&barr, num_threads+1, animatingPr);

    //prepare input
    for(i=0; i<num_threads; i++)
    {
        workerParamsArray[i].data.barrier = &barr;
    }

    uint64_t cache_misses_at_start, cache_misses_at_end;
    saveMisses(cache_misses_at_start);
    //save cycles before execution of threads, to get total exe cycles
    int nread = read(cycles_counter_main_fd, &(params->startExeCycles->cycles),
                     sizeof(params->startExeCycles->cycles));
    if(nread<0) perror("Error reading cycles counter");

    //create (which starts running) all threads
    for(i=0; i<num_threads; i++)
    {   VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
    }
    //wait for all threads to finish
    barrier_wait(&barr, animatingPr);

    //endBarrierCycles read in barrier_wait()! Merten, email me if want to chg
    params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
    saveMisses(cache_misses_at_end);
    cache_misses = cache_misses_at_end-cache_misses_at_start;
    /*
    uint64_t overallWorkCycles = 0;
    for(i=0; i<num_threads; i++){
        printf("WorkCycles: %lu\n",input[i].totalWorkCycles);
        overallWorkCycles += input[i].totalWorkCycles;
    }

    printf("Sum across threads of work cycles: %lu\n", overallWorkCycles);
    printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles);
    printf("Runtime/Workcycle Ratio %lu\n",
           ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles);
    */

    //======================================================

    VPThread__dissipate_thread(animatingPr);
}
Me@4 317
Me@4 318 int main(int argc, char **argv)
Me@4 319 {
Me@4 320 int i;
Me@4 321
Me@4 322 //set global static variables, based on cmd-line args
Me@4 323 for(i=1; i<argc; i++)
Me@4 324 {
Me@4 325 if(argv[i][0] == '-' && argv[i][2] == 0)
Me@4 326 {
Me@4 327 switch(argv[i][1])
Me@4 328 {
Me@4 329 case 't':
Me@4 330 if(!isdigit(argv[++i][0]))
Me@4 331 {
kshalle@8 332 fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n");
Me@4 333 return EXIT_FAILURE;
Me@4 334 }
Me@4 335 num_threads = atoi(argv[i]);
Me@4 336 if(!num_threads)
Me@4 337 {
Me@4 338 fprintf(stderr, "invalid number of threads specified: %d\n", num_threads);
Me@4 339 return EXIT_FAILURE;
Me@4 340 }
Me@4 341 break;
msach@9 342 case 'o':
Me@4 343 if(!isdigit(argv[++i][0]))
Me@4 344 {
msach@6 345 fputs("-i must be followed by a number\n", stderr);
Me@4 346 return EXIT_FAILURE;
Me@4 347 }
msach@9 348 outer_iters = atoi(argv[i]);
Me@4 349 break;
msach@9 350 case 'i':
Me@4 351 if(!isdigit(argv[++i][0]))
Me@4 352 {
msach@6 353 fputs("-o must be followed by a number (workload size)\n", stderr);
Me@4 354 return EXIT_FAILURE;
Me@4 355 }
msach@9 356 inner_iters = atoi(argv[i]);
Me@4 357 break;
Me@4 358 case 'h':
Me@4 359 fputs(usage, stdout);
Me@4 360 return 0;
Me@4 361
Me@4 362 default:
Me@4 363 fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
Me@4 364 fputs(usage, stderr);
Me@4 365 return EXIT_FAILURE;
Me@4 366 }//switch
Me@4 367 }//if arg
Me@4 368 else
Me@4 369 {
Me@4 370 fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
Me@4 371 fputs(usage, stderr);
Me@4 372 return EXIT_FAILURE;
Me@4 373 }
Me@4 374 }//for
msach@7 375
kshalle@8 376
msach@7 377 //setup performance counters
msach@7 378 hw_event = malloc(sizeof(struct perf_event_attr));
msach@7 379 memset(hw_event,0,sizeof(struct perf_event_attr));
msach@7 380
msach@7 381 hw_event->type = PERF_TYPE_HARDWARE;
msach@7 382 hw_event->size = sizeof(hw_event);
msach@7 383 hw_event->disabled = 0;
msach@7 384 hw_event->freq = 0;
msach@7 385 hw_event->inherit = 1; /* children inherit it */
msach@7 386 hw_event->pinned = 1; /* says this virt counter must always be on HW */
msach@7 387 hw_event->exclusive = 0; /* only group on PMU */
msach@7 388 hw_event->exclude_user = 0; /* don't count user */
msach@7 389 hw_event->exclude_kernel = 1; /* don't count kernel */
msach@7 390 hw_event->exclude_hv = 1; /* ditto hypervisor */
msach@7 391 hw_event->exclude_idle = 1; /* don't count when idle */
msach@7 392 hw_event->mmap = 0; /* include mmap data */
msach@7 393 hw_event->comm = 0; /* include comm data */
msach@7 394
msach@7 395 hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles
msach@7 396
msach@7 397 int cpuID, retries;
msach@7 398
msach@7 399 for( cpuID = 0; cpuID < NUM_CORES; cpuID++ )
msach@7 400 { retries = 0;
msach@7 401 do
msach@7 402 { retries += 1;
msach@7 403 cycles_counter_fd[cpuID] =
msach@7 404 syscall(__NR_perf_event_open, hw_event,
msach@7 405 0,//pid_t: 0 is "pid of calling process"
msach@7 406 cpuID,//int: cpu, the value returned by "CPUID" instr(?)
msach@7 407 -1,//int: group_fd, -1 is "leader" or independent
msach@7 408 0//unsigned long: flags
msach@7 409 );
msach@7 410 }
msach@7 411 while(cycles_counter_fd[cpuID]<0 && retries < 100);
msach@7 412 if(retries >= 100)
msach@7 413 {
msach@7 414 fprintf(stderr,"On core %d: ",cpuID);
msach@7 415 perror("Failed to open cycles counter");
msach@7 416 }
msach@7 417 }
msach@7 418
msach@7 419 //Set up counter to accumulate total cycles to process, across all CPUs
msach@7 420
msach@7 421 retries = 0;
msach@7 422 do
msach@7 423 { retries += 1;
msach@7 424 cycles_counter_main_fd =
msach@7 425 syscall(__NR_perf_event_open, hw_event,
msach@7 426 0,//pid_t: 0 is "pid of calling process"
msach@7 427 -1,//int: cpu, -1 means accumulate from all cores
msach@7 428 -1,//int: group_fd, -1 is "leader" == independent
msach@7 429 0//unsigned long: flags
msach@7 430 );
msach@7 431 }
msach@7 432 while(cycles_counter_main_fd<0 && retries < 100);
msach@7 433 if(retries >= 100)
msach@7 434 {
msach@7 435 fprintf(stderr,"in main ");
msach@7 436 perror("Failed to open cycles counter");
msach@7 437 }
kshalle@8 438
msach@14 439 //Set up counters to count cache misses
msach@14 440 hw_event->type = PERF_TYPE_HARDWARE;
msach@14 441 hw_event->config = PERF_COUNT_HW_CACHE_MISSES; //misses
msach@14 442
msach@14 443 retries = 0;
msach@14 444 do
msach@14 445 { retries += 1;
msach@14 446 misses_counter_fd =
msach@14 447 syscall(__NR_perf_event_open, hw_event,
msach@14 448 0,//pid_t: 0 is "pid of calling process"
msach@14 449 -1,//int: cpu, -1 means accumulate from all cores
msach@14 450 -1,//int: group_fd, -1 is "leader" == independent
msach@14 451 0//unsigned long: flags
msach@14 452 );
msach@14 453 }
msach@14 454 while(misses_counter_fd<0 && retries < 100);
msach@14 455 if(retries >= 100)
msach@14 456 {
msach@14 457 fprintf(stderr,"in main ");
msach@14 458 perror("Failed to misses counter");
msach@14 459 }
msach@14 460
msach@9 461 measurement_t startExeCycles, endExeCycles;
msach@9 462 BenchParams *benchParams;
msach@9 463
msach@9 464 benchParams = malloc(sizeof(BenchParams));
msach@9 465
msach@9 466 benchParams->startExeCycles = &startExeCycles;
msach@9 467 benchParams->endExeCycles = &endExeCycles;
msach@9 468
kshalle@8 469 workerParamsArray = (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) );
kshalle@8 470 if(workerParamsArray == NULL ) printf("error mallocing worker params array\n");
kshalle@8 471
msach@9 472
kshalle@8 473 //This is the transition to the VMS runtime
kshalle@8 474 VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );
kshalle@8 475
msach@9 476 uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
msach@9 477 uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
kshalle@8 478 for(i=0; i<num_threads; i++){
msach@13 479 printf("WorkCycles: %lu\n",workerParamsArray[i].data.totalWorkCycles);
msach@9 480 // printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
msach@9 481 // printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
msach@9 482 // printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
msach@13 483 totalWorkCyclesAcrossCores += workerParamsArray[i].data.totalWorkCycles;
msach@13 484 totalBadCyclesAcrossCores += workerParamsArray[i].data.totalBadCycles;
msach@13 485 totalSyncCyclesAcrossCores += workerParamsArray[i].data.totalSyncCycles;
msach@13 486 totalBadSyncCyclesAcrossCores += workerParamsArray[i].data.totalBadSyncCycles;
kshalle@8 487 }
msach@7 488
kshalle@8 489 uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
msach@9 490 totalExeCycles -= totalBadCyclesAcrossCores;
msach@10 491 uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores;
msach@10 492 int32 numSyncs = outer_iters * num_threads * 2;
msach@10 493 printf("Total Execution Cycles: %lu\n", totalExeCycles);
msach@14 494 printf("Total number of cache misses: %lu\n", cache_misses);
kshalle@8 495 printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
msach@10 496 printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores);
msach@10 497 // printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);
msach@10 498 printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs );
kshalle@8 499 printf("ExeCycles/WorkCycles Ratio %f\n",
kshalle@8 500 (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
Me@4 501 return 0;
msach@7 502 }