PR / Applications / Vthread / Vthread__Blocked_Matrix_Mult__Bench

comparison of main.c @ 15:69928a38d5af
updating to most recent repository structure -- not working yet
| author | Sean Halle <seanhalle@yahoo.com> |
|---|---|
| date | Mon, 17 Sep 2012 18:19:52 -0700 |
| parents | |
| children | |
comparison: -1:000000000000 → 0:a055e2bc7420

```c
/*
 * Overhead benchmark for the Vthread (VPThread) runtime: spawns worker VPs
 * that alternate a synthetic workload with a mutex-protected section, and
 * measures cycle counts via Linux perf_event counters.
 */
#define _GNU_SOURCE        /* needed by glibc for sched_getcpu() */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <ctype.h>
#include <errno.h>
#include <pthread.h>
#include <unistd.h>
#include <sched.h>         /* sched_getcpu() */
#include "VMS_Implementations/Vthread_impl/VPThread.h"
#include "C_Libraries/Queue_impl/PrivateQueue.h"

#include <linux/perf_event.h>
#include <linux/prctl.h>
#include <sys/syscall.h>

#undef DEBUG
//#define DEBUG

#define MEASURE_PERF

#if !defined(unix) && !defined(__unix__)
#ifdef __MACH__
#define unix 1
#define __unix__ 1
#endif /* __MACH__ */
#endif /* unix */

/* find the appropriate way to define explicitly sized types */
/* for C99 or GNU libc (also mach's libc) we can use stdint.h */
#if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__)
#include <stdint.h>
#elif defined(unix) || defined(__unix__)   /* some UNIX systems have them in sys/types.h */
#include <sys/types.h>
#elif defined(__WIN32__) || defined(WIN32) /* the nameless one */
typedef unsigned __int8  uint8_t;
typedef unsigned __int32 uint32_t;
#endif /* sized type detection */

/* provide a millisecond-resolution timer for each system */
#if defined(unix) || defined(__unix__)
#include <time.h>
#include <sys/time.h>
unsigned long get_msec(void) {
    static struct timeval timeval, first_timeval;

    gettimeofday(&timeval, 0);
    if(first_timeval.tv_sec == 0) {
        first_timeval = timeval;
        return 0;
    }
    return (timeval.tv_sec - first_timeval.tv_sec) * 1000 +
           (timeval.tv_usec - first_timeval.tv_usec) / 1000;
}
#elif defined(__WIN32__) || defined(WIN32)
#include <windows.h>
unsigned long get_msec(void) {
    return GetTickCount();
}
#else
//#error "I don't know how to measure time on your platform"
#endif

//======================== Defines =========================
typedef struct perfData measurement_t;
struct perfData {
    uint64 cycles;
    uint64 instructions;
};

const char *usage = {
    "Usage: overhead_test [options]\n"
    "  Spawns a number of worker threads and measures work and synchronization overhead.\n\n"
    "Options:\n"
    "  -t <num>  how many threads to use (default: 1). This is internally multiplied by the number of cores.\n"
    "  -o <num>  repeat workload and sync operation <m> times\n"
    "  -i <num>  size of workload, repeat <n> times\n"
    "  -h        this help screen\n\n"
};

struct barrier_t
{
    int counter;
    int nthreads;
    int32 mutex;
    int32 cond;
    measurement_t endBarrierCycles;
};
typedef struct barrier_t barrier;

void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr)
{
    barr->counter  = 0;
    barr->nthreads = nthreads;
    barr->mutex    = VPThread__make_mutex(animatingPr);
    barr->cond     = VPThread__make_cond(barr->mutex, animatingPr);
}

int cycles_counter_main_fd;
void inline barrier_wait(barrier *barr, VirtProcr *animatingPr)
{   int i;

    VPThread__mutex_lock(barr->mutex, animatingPr);
    barr->counter++;
    if(barr->counter == barr->nthreads)
    {
#ifdef MEASURE_PERF
        read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles),
             sizeof(barr->endBarrierCycles.cycles));
#endif

        barr->counter = 0;
        for(i=0; i < barr->nthreads; i++)
            VPThread__cond_signal(barr->cond, animatingPr);
    }
    else
    {   VPThread__cond_wait(barr->cond, animatingPr);
    }
    VPThread__mutex_unlock(barr->mutex, animatingPr);
}
```
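
For readers who have not seen the VPThread primitives before, `barrier_init`/`barrier_wait` above follow the classic counting-barrier pattern, with `VPThread__mutex_*` and `VPThread__cond_*` standing in for their pthread counterparts. A minimal plain-pthreads equivalent is sketched below purely for comparison; it is not part of the benchmark, the `pt_barrier*` names are invented here, and it broadcasts once with a generation counter instead of signalling each waiter in a loop.

```c
#include <pthread.h>

typedef struct {
    pthread_mutex_t mutex;
    pthread_cond_t  cond;
    int      nthreads; /* number of participants */
    int      counter;  /* arrivals in the current cycle */
    unsigned cycle;    /* generation counter, guards against spurious wakeups */
} pt_barrier;

void pt_barrier_init(pt_barrier *b, int nthreads)
{
    pthread_mutex_init(&b->mutex, NULL);
    pthread_cond_init(&b->cond, NULL);
    b->nthreads = nthreads;
    b->counter  = 0;
    b->cycle    = 0;
}

void pt_barrier_wait(pt_barrier *b)
{
    pthread_mutex_lock(&b->mutex);
    unsigned my_cycle = b->cycle;
    if(++b->counter == b->nthreads)
    {   /* last arrival releases the whole generation */
        b->counter = 0;
        b->cycle++;
        pthread_cond_broadcast(&b->cond);
    }
    else
    {   /* re-check the generation: pthread_cond_wait may wake spuriously */
        while(my_cycle == b->cycle)
            pthread_cond_wait(&b->cond, &b->mutex);
    }
    pthread_mutex_unlock(&b->mutex);
}
```

The one thing this sketch leaves out is the `MEASURE_PERF` read in `barrier_wait` above, which snapshots the main cycles counter at the moment the last worker arrives. The listing continues below.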

```c
typedef struct
{   struct barrier_t* barrier;
    uint64_t totalWorkCycles;
    uint64_t totalBadCycles;
    uint64_t totalSyncCycles;
    uint64_t totalBadSyncCycles;
    uint64 numGoodSyncs;
    uint64 numGoodTasks;
}
WorkerParams;

typedef struct
{   measurement_t *startExeCycles;
    measurement_t *endExeCycles;
}
BenchParams;

//======================== Globals =========================
char __ProgrammName[] = "overhead_test";
char __DataSet[255];

int outer_iters, inner_iters, num_threads;
size_t chunk_size = 0;

int cycles_counter_fd[NUM_CORES];
struct perf_event_attr* hw_event;

WorkerParams *workerParamsArray;

//======================== App Code =========================
/*
 * Workload
 */

#define saveCyclesAndInstrs(core,cycles) do{ \
    int cycles_fd = cycles_counter_fd[core]; \
    int nread;                               \
                                             \
    nread = read(cycles_fd,&(cycles),sizeof(cycles)); \
    if(nread<0){                             \
        perror("Error reading cycles counter"); \
        cycles = 0;                          \
    }                                        \
} while (0) //macro magic for scoping

double
worker_TLF(void* _params, VirtProcr* animatingPr)
{
    int i,o;
    WorkerParams* params = (WorkerParams*)_params;
    unsigned int totalWorkCycles = 0, totalBadCycles = 0;
    unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0;
    unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0;
    double workspace2=0.0;
    int32 privateMutex = VPThread__make_mutex(animatingPr);

    int cpuid = sched_getcpu();

    measurement_t startWorkload, endWorkload, startWorkload2, endWorkload2;
    uint64 numCycles;
    for(o=0; o < outer_iters; o++)
    {
#ifdef MEASURE_PERF
        saveCyclesAndInstrs(cpuid,startWorkload.cycles);
#endif

        //work task
        for(i=0; i < inner_iters; i++)
        {
            workspace1 += (workspace1 + 32)/2;
            workspace2 += (workspace2 + 23.2)/1.4;
        }

#ifdef MEASURE_PERF
        saveCyclesAndInstrs(cpuid,endWorkload.cycles);
        numCycles = endWorkload.cycles - startWorkload.cycles;
        //sanity check: 400K cycles corresponds to roughly 20K inner iterations
        if( numCycles < 400000 ) { totalWorkCycles += numCycles; numGoodTasks++; }
        else                     { totalBadCycles  += numCycles; }
#endif

        //mutex access often causes switch to different Slave VP
        VPThread__mutex_lock(privateMutex, animatingPr);

        /*
        saveCyclesAndInstrs(cpuid,startWorkload2.cycles);
        //Task
        for(i=0; i < inner_iters; i++)
        {
            workspace1 += (workspace1 + 32)/2;
            workspace2 += (workspace2 + 23.2)/1.4;
        }

        saveCyclesAndInstrs(cpuid,endWorkload2.cycles);
        numCycles = endWorkload2.cycles - startWorkload2.cycles;
        //sanity check (400K is about 20K iters)
        if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
        else {totalBadCycles += numCycles; }
        */
        VPThread__mutex_unlock(privateMutex, animatingPr);
    }

    params->totalWorkCycles    = totalWorkCycles;
    params->totalBadCycles     = totalBadCycles;
    params->numGoodTasks       = numGoodTasks;
    params->totalSyncCycles    = totalSyncCycles;
    params->totalBadSyncCycles = totalBadSyncCycles;
    params->numGoodSyncs       = numGoodSyncs;
    /*
    params->totalSyncCycles = VMS__give_num_plugin_cycles();
    params->totalBadSyncCycles = 0;
    params->numGoodSyncs = VMS__give_num_plugin_animations();
    */

    //Wait for all threads to end
    barrier_wait(params->barrier, animatingPr);

    //Shutdown worker
    VPThread__dissipate_thread(animatingPr);

    //return never reached; present only so gcc can't optimize the work away
    return (workspace1 + workspace2);
}


/* this is run after the VMS is set up */
void benchmark(void *_params, VirtProcr *animatingPr)
{
    int i, cpuID;
    struct barrier_t barr;
    BenchParams *params;

    params = (BenchParams *)_params;

    barrier_init(&barr, num_threads+1, animatingPr);

    //prepare input
    for(i=0; i<num_threads; i++)
    {
        workerParamsArray[i].barrier = &barr;
    }

    //save cycles before execution of threads, to get total exe cycles
    measurement_t *startExeCycles, *endExeCycles;
    startExeCycles = params->startExeCycles;

#ifdef MEASURE_PERF
    int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles),
                     sizeof(startExeCycles->cycles));
    if(nread<0) perror("Error reading cycles counter");
#endif

    //create (which starts running) all threads
    for(i=0; i<num_threads; i++)
    {   VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
    }
    //wait for all threads to finish
    barrier_wait(&barr, animatingPr);

#ifdef MEASURE_PERF
    //endBarrierCycles read in barrier_wait()!  Merten, email me if want to chg
    params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
#endif

    /*
    uint64_t overallWorkCycles = 0;
    for(i=0; i<num_threads; i++){
        printf("WorkCycles: %lu\n",input[i].totalWorkCycles);
        overallWorkCycles += input[i].totalWorkCycles;
    }

    printf("Sum across threads of work cycles: %lu\n", overallWorkCycles);
    printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles);
    printf("Runtime/Workcycle Ratio %lu\n",
           ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles);
    */

    //======================================================

    VPThread__dissipate_thread(animatingPr);
}

int main(int argc, char **argv)
{
    int i;

    //set global static variables, based on cmd-line args
    for(i=1; i<argc; i++)
    {
        if(argv[i][0] == '-' && argv[i][2] == 0)
        {
            switch(argv[i][1])
            {
             case 't':
                if(++i >= argc || !isdigit(argv[i][0])) /* guard against a missing value after the flag */
                {
                    fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n");
                    return EXIT_FAILURE;
                }
                num_threads = atoi(argv[i]);
                if(!num_threads)
                {
                    fprintf(stderr, "invalid number of threads specified: %d\n", num_threads);
                    return EXIT_FAILURE;
                }
                break;
             case 'o':
                if(++i >= argc || !isdigit(argv[i][0]))
                {
                    fputs("-o must be followed by a number (outer repetitions)\n", stderr);
                    return EXIT_FAILURE;
                }
                outer_iters = atoi(argv[i]);
                break;
             case 'i':
                if(++i >= argc || !isdigit(argv[i][0]))
                {
                    fputs("-i must be followed by a number (workload size)\n", stderr);
                    return EXIT_FAILURE;
                }
                inner_iters = atoi(argv[i]);
                break;
             case 'h':
                fputs(usage, stdout);
                return 0;

             default:
                fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
                fputs(usage, stderr);
                return EXIT_FAILURE;
            }//switch
        }//if arg
        else
        {
            fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
            fputs(usage, stderr);
            return EXIT_FAILURE;
        }
    }//for


#ifdef MEASURE_PERF
    //setup performance counters
    hw_event = malloc(sizeof(struct perf_event_attr));
    memset(hw_event, 0, sizeof(struct perf_event_attr));

    hw_event->type = PERF_TYPE_HARDWARE;
    hw_event->size = sizeof(struct perf_event_attr); /* must be the size of the attr struct */
    hw_event->disabled = 0;       /* counter starts enabled */
    hw_event->freq = 0;
    hw_event->inherit = 1;        /* children inherit it */
    hw_event->pinned = 1;         /* says this virt counter must always be on HW */
    hw_event->exclusive = 0;      /* don't demand exclusive use of the PMU */
    hw_event->exclude_user = 0;   /* do count user-mode cycles */
    hw_event->exclude_kernel = 1; /* don't count kernel */
    hw_event->exclude_hv = 1;     /* ditto hypervisor */
    hw_event->exclude_idle = 1;   /* don't count when idle */
    hw_event->mmap = 0;           /* no mmap tracking */
    hw_event->comm = 0;           /* no comm tracking */

    hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles

    int cpuID, retries;

    for( cpuID = 0; cpuID < NUM_CORES; cpuID++ )
    {   retries = 0;
        do
        {   retries += 1;
            cycles_counter_fd[cpuID] =
                syscall(__NR_perf_event_open, hw_event,
                        0,     //pid_t: 0 is "pid of calling process"
                        cpuID, //int: cpu index; counts this process only while it runs on this CPU
                        -1,    //int: group_fd, -1 is "leader" or independent
                        0      //unsigned long: flags
                       );
        }
        while(cycles_counter_fd[cpuID]<0 && retries < 100);
        if(retries >= 100)
        {
            fprintf(stderr,"On core %d: ",cpuID);
            perror("Failed to open cycles counter");
        }
    }

    //Set up a counter that accumulates this process's cycles across all CPUs
    retries = 0;
    do
    {   retries += 1;
        cycles_counter_main_fd =
            syscall(__NR_perf_event_open, hw_event,
                    0,  //pid_t: 0 is "pid of calling process"
                    -1, //int: cpu, -1 means accumulate from all cores
                    -1, //int: group_fd, -1 is "leader" == independent
                    0   //unsigned long: flags
                   );
    }
    while(cycles_counter_main_fd<0 && retries < 100);
    if(retries >= 100)
    {
        fprintf(stderr,"in main ");
        perror("Failed to open cycles counter");
    }
#endif
```
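
glibc ships no `perf_event_open()` wrapper, which is why both call sites above go through `syscall()` directly. The helper below is the conventional wrapper suggested by the perf_event_open(2) man page; it is shown only as a readability sketch and is not defined anywhere in this file.

```c
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Conventional wrapper: forwards straight to the raw syscall. */
static long perf_event_open(struct perf_event_attr *attr,
                            pid_t pid, int cpu, int group_fd,
                            unsigned long flags)
{
    return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

/* With it, the per-core open above would read:
 *   cycles_counter_fd[cpuID] = perf_event_open(hw_event, 0, cpuID, -1, 0);
 * and the process-wide one:
 *   cycles_counter_main_fd   = perf_event_open(hw_event, 0, -1, -1, 0);
 */
```

The remainder of `main()` continues below.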

```c
    measurement_t startExeCycles, endExeCycles;
    BenchParams *benchParams;

    benchParams = malloc(sizeof(BenchParams));

    benchParams->startExeCycles = &startExeCycles;
    benchParams->endExeCycles   = &endExeCycles;

    workerParamsArray = (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) );
    if(workerParamsArray == NULL ) printf("error mallocing worker params array\n");


    //This is the transition to the VMS runtime
    VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );

#ifdef MEASURE_PERF
    uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
    uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
    for(i=0; i<num_threads; i++){
        printf("WorkCycles: %lu\n",workerParamsArray[i].totalWorkCycles);
//      printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
//      printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
//      printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
        totalWorkCyclesAcrossCores    += workerParamsArray[i].totalWorkCycles;
        totalBadCyclesAcrossCores     += workerParamsArray[i].totalBadCycles;
        totalSyncCyclesAcrossCores    += workerParamsArray[i].totalSyncCycles;
        totalBadSyncCyclesAcrossCores += workerParamsArray[i].totalBadSyncCycles;
    }

    uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
    totalExeCycles -= totalBadCyclesAcrossCores;
    uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores;
    int32 numSyncs = outer_iters * num_threads * 2;
    printf("Total Execution Cycles: %lu\n", totalExeCycles);
    printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
    printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores);
//  printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);
    printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs );
    printf("ExeCycles/WorkCycles Ratio %f\n",
           (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
#else
    printf("No measurement done!\n");
#endif
    return 0;
}
```
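
Usage note, for orientation only: given the option parser above and the `__ProgrammName` of "overhead_test", a run such as `./overhead_test -t 4 -o 1000 -i 10000` would request 4 worker threads, 1000 outer workload-plus-sync repetitions, and an inner workload of 10000 iterations. The binary name and the specific values are illustrative; they are not taken from a build script in this changeset.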
