annotate main.c @ 16:5887fbce425f

changed directory structure, added .hgeol file
author Merten Sach <msach@mailbox.tu-berlin.de>
date Mon, 13 Feb 2012 16:11:00 +0100
parents src/Application/main.c@a1269b1549fc
children fdc2f264f3d6
rev   line source
Me@4 1 /*
Me@4 2 *
Me@4 3 */
Me@4 4 #include <stdio.h>
Me@4 5 #include <stdlib.h>
Me@4 6 #include <string.h>
Me@4 7 #include <math.h>
Me@4 8 #include <ctype.h>
Me@4 9 #include <errno.h>
Me@4 10 #include <pthread.h>
msach@6 11 #include <unistd.h>
Me@4 12 #include "VPThread_lib/VPThread.h"
Me@4 13 #include "VPThread_lib/VMS/Queue_impl/PrivateQueue.h"
Me@4 14
msach@6 15 #include <linux/perf_event.h>
msach@6 16 #include <linux/prctl.h>
msach@6 17 #include <sys/syscall.h>
msach@6 18
Me@4 19 #undef DEBUG
Me@4 20 //#define DEBUG
Me@4 21
msach@15 22 #define MEASURE_PERF
msach@15 23
Me@4 24 #if !defined(unix) && !defined(__unix__)
Me@4 25 #ifdef __MACH__
Me@4 26 #define unix 1
Me@4 27 #define __unix__ 1
Me@4 28 #endif /* __MACH__ */
Me@4 29 #endif /* unix */
Me@4 30
Me@4 31 /* find the appropriate way to define explicitly sized types */
Me@4 32 /* for C99 or GNU libc (also mach's libc) we can use stdint.h */
Me@4 33 #if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__)
Me@4 34 #include <stdint.h>
Me@4 35 #elif defined(unix) || defined(__unix__) /* some UNIX systems have them in sys/types.h */
Me@4 36 #include <sys/types.h>
Me@4 37 #elif defined(__WIN32__) || defined(WIN32) /* the nameless one */
Me@4 38 typedef unsigned __int8 uint8_t;
Me@4 39 typedef unsigned __int32 uint32_t;
Me@4 40 #endif /* sized type detection */
Me@4 41
Me@4 42 /* provide a millisecond-resolution timer for each system */
Me@4 43 #if defined(unix) || defined(__unix__)
Me@4 44 #include <time.h>
Me@4 45 #include <sys/time.h>
/* Millisecond-resolution timer: returns ms elapsed since the first call
   (the first call itself records the reference point and returns 0). */
unsigned long get_msec(void) {
    static struct timeval first;   /* zero-initialized at program start */
    struct timeval now;

    gettimeofday(&now, 0);
    if (first.tv_sec == 0) {       /* first invocation: latch the epoch */
        first = now;
        return 0;
    }
    return (now.tv_sec - first.tv_sec) * 1000
         + (now.tv_usec - first.tv_usec) / 1000;
}
Me@4 56 #elif defined(__WIN32__) || defined(WIN32)
Me@4 57 #include <windows.h>
Me@4 58 unsigned long get_msec(void) {
Me@4 59 return GetTickCount();
Me@4 60 }
Me@4 61 #else
Me@4 62 //#error "I don't know how to measure time on your platform"
Me@4 63 #endif
Me@4 64
Me@4 65 //======================== Defines =========================
kshalle@8 66 typedef struct perfData measurement_t;
kshalle@8 67 struct perfData{
kshalle@8 68 uint64 cycles;
kshalle@8 69 uint64 instructions;
kshalle@8 70 };
Me@4 71
/* Help text printed for -h or on argument errors.
   (Fixed user-facing typos: "Spwans" -> "Spawns", "internaly" -> "internally".) */
const char *usage =
    "Usage: malloc_test [options]\n"
    " Spawns a number of threads and allocates memory.\n\n"
    "Options:\n"
    " -t <num> how many threads to use (default: 1). This is internally multiplied by the number of cores.\n"
    " -o <num> repeat workload and sync operation <m> times\n"
    " -i <num> size of workload, repeat <n> times\n"
    " -h this help screen\n\n";
Me@4 81
Me@4 82 struct barrier_t
Me@4 83 {
Me@4 84 int counter;
Me@4 85 int nthreads;
Me@4 86 int32 mutex;
Me@4 87 int32 cond;
kshalle@8 88 measurement_t endBarrierCycles;
kshalle@8 89
Me@4 90 };
Me@4 91 typedef struct barrier_t barrier;
Me@4 92
Me@4 93 void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr)
Me@4 94 {
Me@4 95 barr->counter = 0;
Me@4 96 barr->nthreads = nthreads;
Me@4 97 barr->mutex = VPThread__make_mutex(animatingPr);
Me@4 98 barr->cond = VPThread__make_cond(barr->mutex, animatingPr);
Me@4 99 }
Me@4 100
kshalle@8 101 int cycles_counter_main_fd;
Me@4 102 void inline barrier_wait(barrier *barr, VirtProcr *animatingPr)
Me@4 103 { int i;
Me@4 104
Me@4 105 VPThread__mutex_lock(barr->mutex, animatingPr);
Me@4 106 barr->counter++;
Me@4 107 if(barr->counter == barr->nthreads)
kshalle@8 108 {
msach@15 109 #ifdef MEASURE_PERF
kshalle@8 110 read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \
kshalle@8 111 sizeof(barr->endBarrierCycles.cycles));
msach@15 112 #endif
kshalle@8 113
kshalle@8 114 barr->counter = 0;
Me@4 115 for(i=0; i < barr->nthreads; i++)
Me@4 116 VPThread__cond_signal(barr->cond, animatingPr);
Me@4 117 }
Me@4 118 else
Me@4 119 { VPThread__cond_wait(barr->cond, animatingPr);
Me@4 120 }
Me@4 121 VPThread__mutex_unlock(barr->mutex, animatingPr);
Me@4 122 }
Me@4 123
kshalle@8 124
kshalle@8 125
msach@9 126 typedef struct
msach@9 127 { struct barrier_t* barrier;
msach@9 128 uint64_t totalWorkCycles;
msach@9 129 uint64_t totalBadCycles;
msach@9 130 uint64_t totalSyncCycles;
msach@9 131 uint64_t totalBadSyncCycles;
msach@9 132 uint64 numGoodSyncs;
msach@9 133 uint64 numGoodTasks;
msach@9 134 }
msach@9 135 WorkerParams;
Me@4 136
Me@4 137
kshalle@8 138 typedef struct
kshalle@8 139 { measurement_t *startExeCycles;
kshalle@8 140 measurement_t *endExeCycles;
kshalle@8 141 }
kshalle@8 142 BenchParams;
Me@4 143
Me@4 144 //======================== Globals =========================
Me@4 145 char __ProgrammName[] = "overhead_test";
Me@4 146 char __DataSet[255];
Me@4 147
msach@9 148 int outer_iters, inner_iters, num_threads;
Me@4 149 size_t chunk_size = 0;
Me@4 150
msach@6 151 int cycles_counter_fd[NUM_CORES];
msach@7 152 struct perf_event_attr* hw_event;
Me@4 153
kshalle@8 154 WorkerParams *workerParamsArray;
kshalle@8 155
Me@4 156 //======================== App Code =========================
Me@4 157 /*
Me@4 158 * Workload
Me@4 159 */
msach@6 160
/* Read the current cycle count for CPU `core` into `cycles` (an lvalue).
   On read failure, reports via perror and zeroes `cycles`.
   do { } while (0) makes the macro safe as a single statement and scopes
   the temporaries. */
#define saveCyclesAndInstrs(core, cycles) \
    do { \
        int fd_ = cycles_counter_fd[(core)]; \
        int nread_ = read(fd_, &(cycles), sizeof(cycles)); \
        if (nread_ < 0) { \
            perror("Error reading cycles counter"); \
            (cycles) = 0; \
        } \
    } while (0)
msach@6 171
msach@7 172
msach@9 173 double
msach@9 174 worker_TLF(void* _params, VirtProcr* animatingPr)
Me@5 175 {
msach@7 176 int i,o;
msach@9 177 WorkerParams* params = (WorkerParams*)_params;
msach@9 178 unsigned int totalWorkCycles = 0, totalBadCycles = 0;
msach@9 179 unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0;
msach@9 180 unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0;
kshalle@8 181 double workspace2=0.0;
Me@5 182 int32 privateMutex = VPThread__make_mutex(animatingPr);
msach@6 183
msach@6 184 int cpuid = sched_getcpu();
msach@9 185
msach@11 186 measurement_t startWorkload, endWorkload, startWorkload2, endWorkload2;
msach@9 187 uint64 numCycles;
msach@9 188 for(o=0; o < outer_iters; o++)
Me@4 189 {
msach@15 190 #ifdef MEASURE_PERF
msach@10 191 saveCyclesAndInstrs(cpuid,startWorkload.cycles);
msach@15 192 #endif
msach@9 193
msach@11 194 //workltask
msach@9 195 for(i=0; i < inner_iters; i++)
Me@5 196 {
Me@5 197 workspace1 += (workspace1 + 32)/2;
Me@5 198 workspace2 += (workspace2 + 23.2)/1.4;
Me@5 199 }
msach@15 200
msach@15 201 #ifdef MEASURE_PERF
msach@10 202 saveCyclesAndInstrs(cpuid,endWorkload.cycles);
msach@10 203 numCycles = endWorkload.cycles - startWorkload.cycles;
msach@9 204 //sanity check (400K is about 20K iters)
msach@9 205 if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
msach@9 206 else {totalBadCycles += numCycles; }
msach@15 207 #endif
msach@9 208
msach@9 209 //mutex access often causes switch to different Slave VP
msach@9 210 VPThread__mutex_lock(privateMutex, animatingPr);
msach@11 211
msach@10 212 /*
msach@11 213 saveCyclesAndInstrs(cpuid,startWorkload2.cycles);
msach@11 214 //Task
msach@11 215 for(i=0; i < inner_iters; i++)
msach@11 216 {
msach@11 217 workspace1 += (workspace1 + 32)/2;
msach@11 218 workspace2 += (workspace2 + 23.2)/1.4;
msach@11 219 }
msach@11 220
msach@11 221 saveCyclesAndInstrs(cpuid,endWorkload2.cycles);
msach@11 222 numCycles = endWorkload2.cycles - startWorkload2.cycles;
msach@9 223 //sanity check (400K is about 20K iters)
msach@11 224 if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
msach@11 225 else {totalBadCycles += numCycles; }
msach@11 226
msach@10 227 */
msach@9 228 VPThread__mutex_unlock(privateMutex, animatingPr);
Me@4 229 }
Me@5 230
kshalle@8 231 params->totalWorkCycles = totalWorkCycles;
msach@9 232 params->totalBadCycles = totalBadCycles;
msach@9 233 params->numGoodTasks = numGoodTasks;
msach@9 234 params->totalSyncCycles = totalSyncCycles;
msach@9 235 params->totalBadSyncCycles = totalBadSyncCycles;
msach@9 236 params->numGoodSyncs = numGoodSyncs;
msach@9 237 /*
msach@9 238 params->totalSyncCycles = VMS__give_num_plugin_cycles();
msach@9 239 params->totalBadSyncCycles = 0;
msach@9 240 params->numGoodSyncs = VMS__give_num_plugin_animations();
msach@9 241 */
msach@6 242
msach@6 243
msach@6 244 //Wait for all threads to end
kshalle@8 245 barrier_wait(params->barrier, animatingPr);
Me@5 246
Me@5 247 //Shutdown worker
Me@5 248 VPThread__dissipate_thread(animatingPr);
msach@9 249
msach@9 250 //below return never reached --> there for gcc
msach@9 251 return (workspace1 + workspace2); //to prevent gcc from optimizing work out
Me@5 252 }
Me@4 253
kshalle@8 254
Me@4 255 /* this is run after the VMS is set up*/
kshalle@8 256 void benchmark(void *_params, VirtProcr *animatingPr)
Me@4 257 {
msach@7 258 int i, cpuID;
kshalle@8 259 struct barrier_t barr;
kshalle@8 260 BenchParams *params;
kshalle@8 261
kshalle@8 262 params = (BenchParams *)_params;
kshalle@8 263
Me@4 264 barrier_init(&barr, num_threads+1, animatingPr);
kshalle@8 265
msach@6 266 //prepare input
Me@4 267 for(i=0; i<num_threads; i++)
msach@6 268 {
kshalle@8 269 workerParamsArray[i].barrier = &barr;
Me@4 270 }
msach@7 271
kshalle@8 272 //save cycles before execution of threads, to get total exe cycles
kshalle@8 273 measurement_t *startExeCycles, *endExeCycles;
kshalle@8 274 startExeCycles = params->startExeCycles;
kshalle@8 275
msach@15 276 #ifdef MEASURE_PERF
kshalle@8 277 int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles),
kshalle@8 278 sizeof(startExeCycles->cycles));
msach@9 279 if(nread<0) perror("Error reading cycles counter");
msach@15 280 #endif
msach@9 281
msach@9 282 //create (which starts running) all threads
msach@9 283 for(i=0; i<num_threads; i++)
msach@9 284 { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
kshalle@8 285 }
msach@6 286 //wait for all threads to finish
Me@4 287 barrier_wait(&barr, animatingPr);
msach@6 288
msach@15 289 #ifdef MEASURE_PERF
msach@9 290 //endBarrierCycles read in barrier_wait()! Merten, email me if want to chg
kshalle@8 291 params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
msach@15 292 #endif
kshalle@8 293
Me@4 294
kshalle@8 295 /*
msach@6 296 uint64_t overallWorkCycles = 0;
msach@6 297 for(i=0; i<num_threads; i++){
msach@7 298 printf("WorkCycles: %lu\n",input[i].totalWorkCycles);
msach@6 299 overallWorkCycles += input[i].totalWorkCycles;
Me@4 300 }
msach@6 301
msach@6 302 printf("Sum across threads of work cycles: %lu\n", overallWorkCycles);
msach@6 303 printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles);
kshalle@8 304 printf("Runtime/Workcycle Ratio %lu\n",
kshalle@8 305 ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles);
kshalle@8 306 */
Me@4 307
Me@4 308 //======================================================
Me@4 309
Me@4 310 VPThread__dissipate_thread(animatingPr);
Me@4 311 }
Me@4 312
Me@4 313 int main(int argc, char **argv)
Me@4 314 {
Me@4 315 int i;
Me@4 316
Me@4 317 //set global static variables, based on cmd-line args
Me@4 318 for(i=1; i<argc; i++)
Me@4 319 {
Me@4 320 if(argv[i][0] == '-' && argv[i][2] == 0)
Me@4 321 {
Me@4 322 switch(argv[i][1])
Me@4 323 {
Me@4 324 case 't':
Me@4 325 if(!isdigit(argv[++i][0]))
Me@4 326 {
kshalle@8 327 fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n");
Me@4 328 return EXIT_FAILURE;
Me@4 329 }
Me@4 330 num_threads = atoi(argv[i]);
Me@4 331 if(!num_threads)
Me@4 332 {
Me@4 333 fprintf(stderr, "invalid number of threads specified: %d\n", num_threads);
Me@4 334 return EXIT_FAILURE;
Me@4 335 }
Me@4 336 break;
msach@9 337 case 'o':
Me@4 338 if(!isdigit(argv[++i][0]))
Me@4 339 {
msach@6 340 fputs("-i must be followed by a number\n", stderr);
Me@4 341 return EXIT_FAILURE;
Me@4 342 }
msach@9 343 outer_iters = atoi(argv[i]);
Me@4 344 break;
msach@9 345 case 'i':
Me@4 346 if(!isdigit(argv[++i][0]))
Me@4 347 {
msach@6 348 fputs("-o must be followed by a number (workload size)\n", stderr);
Me@4 349 return EXIT_FAILURE;
Me@4 350 }
msach@9 351 inner_iters = atoi(argv[i]);
Me@4 352 break;
Me@4 353 case 'h':
Me@4 354 fputs(usage, stdout);
Me@4 355 return 0;
Me@4 356
Me@4 357 default:
Me@4 358 fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
Me@4 359 fputs(usage, stderr);
Me@4 360 return EXIT_FAILURE;
Me@4 361 }//switch
Me@4 362 }//if arg
Me@4 363 else
Me@4 364 {
Me@4 365 fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
Me@4 366 fputs(usage, stderr);
Me@4 367 return EXIT_FAILURE;
Me@4 368 }
Me@4 369 }//for
msach@7 370
kshalle@8 371
msach@15 372 #ifdef MEASURE_PERF
msach@7 373 //setup performance counters
msach@7 374 hw_event = malloc(sizeof(struct perf_event_attr));
msach@7 375 memset(hw_event,0,sizeof(struct perf_event_attr));
msach@7 376
msach@7 377 hw_event->type = PERF_TYPE_HARDWARE;
msach@7 378 hw_event->size = sizeof(hw_event);
msach@7 379 hw_event->disabled = 0;
msach@7 380 hw_event->freq = 0;
msach@7 381 hw_event->inherit = 1; /* children inherit it */
msach@7 382 hw_event->pinned = 1; /* says this virt counter must always be on HW */
msach@7 383 hw_event->exclusive = 0; /* only group on PMU */
msach@7 384 hw_event->exclude_user = 0; /* don't count user */
msach@7 385 hw_event->exclude_kernel = 1; /* don't count kernel */
msach@7 386 hw_event->exclude_hv = 1; /* ditto hypervisor */
msach@7 387 hw_event->exclude_idle = 1; /* don't count when idle */
msach@7 388 hw_event->mmap = 0; /* include mmap data */
msach@7 389 hw_event->comm = 0; /* include comm data */
msach@7 390
msach@7 391 hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles
msach@7 392
msach@7 393 int cpuID, retries;
msach@7 394
msach@7 395 for( cpuID = 0; cpuID < NUM_CORES; cpuID++ )
msach@7 396 { retries = 0;
msach@7 397 do
msach@7 398 { retries += 1;
msach@7 399 cycles_counter_fd[cpuID] =
msach@7 400 syscall(__NR_perf_event_open, hw_event,
msach@7 401 0,//pid_t: 0 is "pid of calling process"
msach@7 402 cpuID,//int: cpu, the value returned by "CPUID" instr(?)
msach@7 403 -1,//int: group_fd, -1 is "leader" or independent
msach@7 404 0//unsigned long: flags
msach@7 405 );
msach@7 406 }
msach@7 407 while(cycles_counter_fd[cpuID]<0 && retries < 100);
msach@7 408 if(retries >= 100)
msach@7 409 {
msach@7 410 fprintf(stderr,"On core %d: ",cpuID);
msach@7 411 perror("Failed to open cycles counter");
msach@7 412 }
msach@7 413 }
msach@7 414
msach@7 415 //Set up counter to accumulate total cycles to process, across all CPUs
msach@7 416
msach@7 417 retries = 0;
msach@7 418 do
msach@7 419 { retries += 1;
msach@7 420 cycles_counter_main_fd =
msach@7 421 syscall(__NR_perf_event_open, hw_event,
msach@7 422 0,//pid_t: 0 is "pid of calling process"
msach@7 423 -1,//int: cpu, -1 means accumulate from all cores
msach@7 424 -1,//int: group_fd, -1 is "leader" == independent
msach@7 425 0//unsigned long: flags
msach@7 426 );
msach@7 427 }
msach@7 428 while(cycles_counter_main_fd<0 && retries < 100);
msach@7 429 if(retries >= 100)
msach@7 430 {
msach@7 431 fprintf(stderr,"in main ");
msach@7 432 perror("Failed to open cycles counter");
msach@7 433 }
msach@15 434 #endif
kshalle@8 435
msach@9 436 measurement_t startExeCycles, endExeCycles;
msach@9 437 BenchParams *benchParams;
msach@9 438
msach@9 439 benchParams = malloc(sizeof(BenchParams));
msach@9 440
msach@9 441 benchParams->startExeCycles = &startExeCycles;
msach@9 442 benchParams->endExeCycles = &endExeCycles;
msach@9 443
kshalle@8 444 workerParamsArray = (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) );
kshalle@8 445 if(workerParamsArray == NULL ) printf("error mallocing worker params array\n");
kshalle@8 446
msach@9 447
kshalle@8 448 //This is the transition to the VMS runtime
kshalle@8 449 VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );
kshalle@8 450
msach@15 451 #ifdef MEASURE_PERF
msach@9 452 uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
msach@9 453 uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
kshalle@8 454 for(i=0; i<num_threads; i++){
kshalle@8 455 printf("WorkCycles: %lu\n",workerParamsArray[i].totalWorkCycles);
msach@9 456 // printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
msach@9 457 // printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
msach@9 458 // printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
kshalle@8 459 totalWorkCyclesAcrossCores += workerParamsArray[i].totalWorkCycles;
msach@9 460 totalBadCyclesAcrossCores += workerParamsArray[i].totalBadCycles;
msach@9 461 totalSyncCyclesAcrossCores += workerParamsArray[i].totalSyncCycles;
msach@9 462 totalBadSyncCyclesAcrossCores += workerParamsArray[i].totalBadSyncCycles;
kshalle@8 463 }
msach@7 464
kshalle@8 465 uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
msach@9 466 totalExeCycles -= totalBadCyclesAcrossCores;
msach@10 467 uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores;
msach@10 468 int32 numSyncs = outer_iters * num_threads * 2;
msach@10 469 printf("Total Execution Cycles: %lu\n", totalExeCycles);
kshalle@8 470 printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
msach@10 471 printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores);
msach@10 472 // printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);
msach@10 473 printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs );
kshalle@8 474 printf("ExeCycles/WorkCycles Ratio %f\n",
kshalle@8 475 (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
msach@15 476 #else
msach@15 477 printf("No measurement done!\n");
msach@15 478 #endif
Me@4 479 return 0;
msach@7 480 }