Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > Vthread > Vthread__Best_Effort_Msg__Bench
changeset 17:281cadcbb796 false_sharing
changed directory structure, added .hgeol file
| author | Merten Sach <msach@mailbox.tu-berlin.de> |
|---|---|
| date | Mon, 13 Feb 2012 16:12:20 +0100 |
| parents | c3561dbac1dc |
| children | e7277df4460e |
| files | .hgeol main.c src/Application/main.c |
| diffstat | 3 files changed, 516 insertions(+), 502 deletions(-) [+] |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/.hgeol Mon Feb 13 16:12:20 2012 +0100 1.3 @@ -0,0 +1,14 @@ 1.4 + 1.5 +[patterns] 1.6 +**.py = native 1.7 +**.txt = native 1.8 +**.c = native 1.9 +**.h = native 1.10 +**.cpp = native 1.11 +**.java = native 1.12 +**.class = bin 1.13 +**.jar = bin 1.14 +**.sh = native 1.15 +**.pl = native 1.16 +**.jpg = bin 1.17 +**.gif = bin
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 2.2 +++ b/main.c Mon Feb 13 16:12:20 2012 +0100 2.3 @@ -0,0 +1,502 @@ 2.4 +/* 2.5 + * 2.6 + */ 2.7 +#include <stdio.h> 2.8 +#include <stdlib.h> 2.9 +#include <string.h> 2.10 +#include <math.h> 2.11 +#include <ctype.h> 2.12 +#include <errno.h> 2.13 +#include <pthread.h> 2.14 +#include <unistd.h> 2.15 +#include "VPThread_lib/VPThread.h" 2.16 +#include "VPThread_lib/VMS/Queue_impl/PrivateQueue.h" 2.17 + 2.18 +#include <linux/perf_event.h> 2.19 +#include <linux/prctl.h> 2.20 +#include <sys/syscall.h> 2.21 + 2.22 +#undef DEBUG 2.23 +//#define DEBUG 2.24 + 2.25 +#if !defined(unix) && !defined(__unix__) 2.26 +#ifdef __MACH__ 2.27 +#define unix 1 2.28 +#define __unix__ 1 2.29 +#endif /* __MACH__ */ 2.30 +#endif /* unix */ 2.31 + 2.32 +/* find the appropriate way to define explicitly sized types */ 2.33 +/* for C99 or GNU libc (also mach's libc) we can use stdint.h */ 2.34 +#if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__) 2.35 +#include <stdint.h> 2.36 +#elif defined(unix) || defined(__unix__) /* some UNIX systems have them in sys/types.h */ 2.37 +#include <sys/types.h> 2.38 +#elif defined(__WIN32__) || defined(WIN32) /* the nameless one */ 2.39 +typedef unsigned __int8 uint8_t; 2.40 +typedef unsigned __int32 uint32_t; 2.41 +#endif /* sized type detection */ 2.42 + 2.43 +/* provide a millisecond-resolution timer for each system */ 2.44 +#if defined(unix) || defined(__unix__) 2.45 +#include <time.h> 2.46 +#include <sys/time.h> 2.47 +unsigned long get_msec(void) { 2.48 + static struct timeval timeval, first_timeval; 2.49 + 2.50 + gettimeofday(&timeval, 0); 2.51 + if(first_timeval.tv_sec == 0) { 2.52 + first_timeval = timeval; 2.53 + return 0; 2.54 + } 2.55 + return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000; 2.56 +} 2.57 +#elif defined(__WIN32__) || defined(WIN32) 2.58 +#include <windows.h> 2.59 +unsigned long get_msec(void) { 2.60 + return GetTickCount(); 2.61 +} 2.62 +#else 2.63 +//#error "I don't know how to measure time on your platform" 2.64 +#endif 2.65 + 2.66 +//======================== Globals ========================= 2.67 +char __ProgrammName[] = "overhead_test"; 2.68 +char __DataSet[255]; 2.69 + 2.70 +int outer_iters, inner_iters, num_threads; 2.71 +size_t chunk_size = 0; 2.72 + 2.73 +int cycles_counter_main_fd; 2.74 +int misses_counter_fd; 2.75 + 2.76 +uint64_t cache_misses; 2.77 + 2.78 +int cycles_counter_fd[NUM_CORES]; 2.79 +struct perf_event_attr* hw_event; 2.80 + 2.81 +//======================== Defines ========================= 2.82 +typedef struct perfData measurement_t; 2.83 +struct perfData{ 2.84 + uint64 cycles; 2.85 +} __align_to_cacheline__; 2.86 + 2.87 +const char *usage = { 2.88 + "Usage: malloc_test [options]\n" 2.89 + " Spwans a number of threads and allocates memory.\n\n" 2.90 + "Options:\n" 2.91 + " -t <num> how many threads to use (default: 1). This is internaly multiplied by the number of cores.\n" 2.92 + " -o <num> repeat workload and sync operation <m> times\n" 2.93 + " -i <num> size of workload, repeat <n> times\n" 2.94 + " -h this help screen\n\n" 2.95 +}; 2.96 + 2.97 +struct barrier_t 2.98 +{ 2.99 + int counter; 2.100 + int nthreads; 2.101 + int32 mutex; 2.102 + int32 cond; 2.103 + measurement_t endBarrierCycles; 2.104 + 2.105 +} __align_to_cacheline__; 2.106 +typedef struct barrier_t barrier; 2.107 + 2.108 +void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr) 2.109 + { 2.110 + barr->counter = 0; 2.111 + barr->nthreads = nthreads; 2.112 + barr->mutex = VPThread__make_mutex(animatingPr); 2.113 + barr->cond = VPThread__make_cond(barr->mutex, animatingPr); 2.114 + } 2.115 + 2.116 +void inline barrier_wait(barrier *barr, VirtProcr *animatingPr) 2.117 + { int i; 2.118 + 2.119 + VPThread__mutex_lock(barr->mutex, animatingPr); 2.120 + barr->counter++; 2.121 + if(barr->counter == barr->nthreads) 2.122 + { 2.123 + read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \ 2.124 + sizeof(barr->endBarrierCycles.cycles)); 2.125 + 2.126 + barr->counter = 0; 2.127 + for(i=0; i < barr->nthreads; i++) 2.128 + VPThread__cond_signal(barr->cond, animatingPr); 2.129 + } 2.130 + else 2.131 + { VPThread__cond_wait(barr->cond, animatingPr); 2.132 + } 2.133 + VPThread__mutex_unlock(barr->mutex, animatingPr); 2.134 + } 2.135 + 2.136 + 2.137 + 2.138 +struct WorkerParams_t 2.139 + { struct barrier_t* barrier; 2.140 + uint64_t totalWorkCycles; 2.141 + uint64_t totalBadCycles; 2.142 + uint64_t totalSyncCycles; 2.143 + uint64_t totalBadSyncCycles; 2.144 + uint64 numGoodSyncs; 2.145 + uint64 numGoodTasks; 2.146 + }; 2.147 + 2.148 + typedef union 2.149 + { 2.150 + struct WorkerParams_t data; 2.151 + char padding[CACHELINE_SIZE]; 2.152 + } WorkerParams __align_to_cacheline__; 2.153 + 2.154 +WorkerParams *workerParamsArray; 2.155 + 2.156 +typedef struct 2.157 + { measurement_t *startExeCycles; 2.158 + measurement_t *endExeCycles; 2.159 + } BenchParams __align_to_cacheline__; 2.160 + 2.161 +//======================== App Code ========================= 2.162 +/* 2.163 + p* Workload 2.164 + */ 2.165 + 2.166 +#define saveCyclesAndInstrs(core,cycles) do{ \ 2.167 + int cycles_fd = cycles_counter_fd[core]; \ 2.168 + int nread; \ 2.169 + \ 2.170 + nread = read(cycles_fd,&(cycles),sizeof(cycles)); \ 2.171 + if(nread<0){ \ 2.172 + perror("Error reading cycles counter"); \ 2.173 + cycles = 0; \ 2.174 + } \ 2.175 +} while (0) //macro magic for scoping 2.176 + 2.177 +#define saveMisses(misses) do{ \ 2.178 + int nread; \ 2.179 + \ 2.180 + nread = read(misses_counter_fd,&(misses),sizeof(misses)); \ 2.181 + if(nread<0){ \ 2.182 + perror("Error reading misses counter"); \ 2.183 + misses = 0; \ 2.184 + } \ 2.185 +} while (0) //macro magic for scoping 2.186 + 2.187 + 2.188 +double 2.189 +worker_TLF(void* _params, VirtProcr* animatingPr) 2.190 + { 2.191 + int i,o; 2.192 + WorkerParams* params = (WorkerParams*)_params; 2.193 + unsigned int totalWorkCycles = 0, totalBadCycles = 0; 2.194 + unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0; 2.195 + unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0; 2.196 + double workspace2=0.0; 2.197 + int32 privateMutex = VPThread__make_mutex(animatingPr); 2.198 + 2.199 + int cpuid = sched_getcpu(); 2.200 + 2.201 + measurement_t startWorkload, endWorkload; 2.202 + uint64 numCycles; 2.203 + for(o=0; o < outer_iters; o++) 2.204 + { 2.205 + 2.206 + saveCyclesAndInstrs(cpuid,startWorkload.cycles); 2.207 + 2.208 + //task 2.209 + for(i=0; i < inner_iters; i++) 2.210 + { 2.211 + workspace1 += (workspace1 + 32)/2; 2.212 + workspace2 += (workspace2 + 23.2)/1.4; 2.213 + } 2.214 + 2.215 + saveCyclesAndInstrs(cpuid,endWorkload.cycles); 2.216 + numCycles = endWorkload.cycles - startWorkload.cycles; 2.217 + //sanity check (400K is about 20K iters) 2.218 + if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;} 2.219 + else {totalBadCycles += numCycles; } 2.220 + 2.221 + //mutex access often causes switch to different Slave VP 2.222 + VPThread__mutex_lock(privateMutex, animatingPr); 2.223 + 2.224 +/* 2.225 + saveCyclesAndInstrs(cpuid,startWorkload2.cycles); 2.226 + //Task 2.227 + for(i=0; i < inner_iters; i++) 2.228 + { 2.229 + workspace1 += (workspace1 + 32)/2; 2.230 + workspace2 += (workspace2 + 23.2)/1.4; 2.231 + } 2.232 + 2.233 + saveCyclesAndInstrs(cpuid,endWorkload2.cycles); 2.234 + numCycles = endWorkload2.cycles - startWorkload2.cycles; 2.235 + //sanity check (400K is about 20K iters) 2.236 + if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;} 2.237 + else {totalBadCycles += numCycles; } 2.238 + 2.239 +*/ 2.240 + VPThread__mutex_unlock(privateMutex, animatingPr); 2.241 + } 2.242 + 2.243 + params->data.totalWorkCycles = totalWorkCycles; 2.244 + params->data.totalBadCycles = totalBadCycles; 2.245 + params->data.numGoodTasks = numGoodTasks; 2.246 + params->data.totalSyncCycles = totalSyncCycles; 2.247 + params->data.totalBadSyncCycles = totalBadSyncCycles; 2.248 + params->data.numGoodSyncs = numGoodSyncs; 2.249 +/* 2.250 + params->totalSyncCycles = VMS__give_num_plugin_cycles(); 2.251 + params->totalBadSyncCycles = 0; 2.252 + params->numGoodSyncs = VMS__give_num_plugin_animations(); 2.253 +*/ 2.254 + 2.255 + 2.256 + //Wait for all threads to end 2.257 + barrier_wait(params->data.barrier, animatingPr); 2.258 + 2.259 + //Shutdown worker 2.260 + VPThread__dissipate_thread(animatingPr); 2.261 + 2.262 + //below return never reached --> there for gcc 2.263 + return (workspace1 + workspace2); //to prevent gcc from optimizing work out 2.264 + } 2.265 + 2.266 +//local variables of benchmark, made global for alignment 2.267 +struct barrier_t barr __align_to_cacheline__; 2.268 +BenchParams *params __align_to_cacheline__; 2.269 + 2.270 +/* this is run after the VMS is set up*/ 2.271 +void benchmark(void *_params, VirtProcr *animatingPr) 2.272 + { 2.273 + int i; 2.274 + 2.275 + params = (BenchParams *)_params; 2.276 + 2.277 + barrier_init(&barr, num_threads+1, animatingPr); 2.278 + 2.279 + //prepare input 2.280 + for(i=0; i<num_threads; i++) 2.281 + { 2.282 + workerParamsArray[i].data.barrier = &barr; 2.283 + } 2.284 + 2.285 + uint64_t cache_misses_at_start, cache_misses_at_end; 2.286 + saveMisses(cache_misses_at_start); 2.287 + //save cycles before execution of threads, to get total exe cycles 2.288 + int nread = read(cycles_counter_main_fd, &(params->startExeCycles->cycles), 2.289 + sizeof(params->startExeCycles->cycles)); 2.290 + if(nread<0) perror("Error reading cycles counter"); 2.291 + 2.292 + //create (which starts running) all threads 2.293 + for(i=0; i<num_threads; i++) 2.294 + { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr); 2.295 + } 2.296 + //wait for all threads to finish 2.297 + barrier_wait(&barr, animatingPr); 2.298 + 2.299 + //endBarrierCycles read in barrier_wait()! Merten, email me if want to chg 2.300 + params->endExeCycles->cycles = barr.endBarrierCycles.cycles; 2.301 + saveMisses(cache_misses_at_end); 2.302 + cache_misses = cache_misses_at_end-cache_misses_at_start; 2.303 +/* 2.304 + uint64_t overallWorkCycles = 0; 2.305 + for(i=0; i<num_threads; i++){ 2.306 + printf("WorkCycles: %lu\n",input[i].totalWorkCycles); 2.307 + overallWorkCycles += input[i].totalWorkCycles; 2.308 + } 2.309 + 2.310 + printf("Sum across threads of work cycles: %lu\n", overallWorkCycles); 2.311 + printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles); 2.312 + printf("Runtime/Workcycle Ratio %lu\n", 2.313 + ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles); 2.314 +*/ 2.315 + 2.316 + //====================================================== 2.317 + 2.318 + VPThread__dissipate_thread(animatingPr); 2.319 + } 2.320 + 2.321 +int main(int argc, char **argv) 2.322 + { 2.323 + int i; 2.324 + 2.325 + //set global static variables, based on cmd-line args 2.326 + for(i=1; i<argc; i++) 2.327 + { 2.328 + if(argv[i][0] == '-' && argv[i][2] == 0) 2.329 + { 2.330 + switch(argv[i][1]) 2.331 + { 2.332 + case 't': 2.333 + if(!isdigit(argv[++i][0])) 2.334 + { 2.335 + fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n"); 2.336 + return EXIT_FAILURE; 2.337 + } 2.338 + num_threads = atoi(argv[i]); 2.339 + if(!num_threads) 2.340 + { 2.341 + fprintf(stderr, "invalid number of threads specified: %d\n", num_threads); 2.342 + return EXIT_FAILURE; 2.343 + } 2.344 + break; 2.345 + case 'o': 2.346 + if(!isdigit(argv[++i][0])) 2.347 + { 2.348 + fputs("-i must be followed by a number\n", stderr); 2.349 + return EXIT_FAILURE; 2.350 + } 2.351 + outer_iters = atoi(argv[i]); 2.352 + break; 2.353 + case 'i': 2.354 + if(!isdigit(argv[++i][0])) 2.355 + { 2.356 + fputs("-o must be followed by a number (workload size)\n", stderr); 2.357 + return EXIT_FAILURE; 2.358 + } 2.359 + inner_iters = atoi(argv[i]); 2.360 + break; 2.361 + case 'h': 2.362 + fputs(usage, stdout); 2.363 + return 0; 2.364 + 2.365 + default: 2.366 + fprintf(stderr, "unrecognized argument: %s\n", argv[i]); 2.367 + fputs(usage, stderr); 2.368 + return EXIT_FAILURE; 2.369 + }//switch 2.370 + }//if arg 2.371 + else 2.372 + { 2.373 + fprintf(stderr, "unrecognized argument: %s\n", argv[i]); 2.374 + fputs(usage, stderr); 2.375 + return EXIT_FAILURE; 2.376 + } 2.377 + }//for 2.378 + 2.379 + 2.380 + //setup performance counters 2.381 + hw_event = malloc(sizeof(struct perf_event_attr)); 2.382 + memset(hw_event,0,sizeof(struct perf_event_attr)); 2.383 + 2.384 + hw_event->type = PERF_TYPE_HARDWARE; 2.385 + hw_event->size = sizeof(hw_event); 2.386 + hw_event->disabled = 0; 2.387 + hw_event->freq = 0; 2.388 + hw_event->inherit = 1; /* children inherit it */ 2.389 + hw_event->pinned = 1; /* says this virt counter must always be on HW */ 2.390 + hw_event->exclusive = 0; /* only group on PMU */ 2.391 + hw_event->exclude_user = 0; /* don't count user */ 2.392 + hw_event->exclude_kernel = 1; /* don't count kernel */ 2.393 + hw_event->exclude_hv = 1; /* ditto hypervisor */ 2.394 + hw_event->exclude_idle = 1; /* don't count when idle */ 2.395 + hw_event->mmap = 0; /* include mmap data */ 2.396 + hw_event->comm = 0; /* include comm data */ 2.397 + 2.398 + hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles 2.399 + 2.400 + int cpuID, retries; 2.401 + 2.402 + for( cpuID = 0; cpuID < NUM_CORES; cpuID++ ) 2.403 + { retries = 0; 2.404 + do 2.405 + { retries += 1; 2.406 + cycles_counter_fd[cpuID] = 2.407 + syscall(__NR_perf_event_open, hw_event, 2.408 + 0,//pid_t: 0 is "pid of calling process" 2.409 + cpuID,//int: cpu, the value returned by "CPUID" instr(?) 2.410 + -1,//int: group_fd, -1 is "leader" or independent 2.411 + 0//unsigned long: flags 2.412 + ); 2.413 + } 2.414 + while(cycles_counter_fd[cpuID]<0 && retries < 100); 2.415 + if(retries >= 100) 2.416 + { 2.417 + fprintf(stderr,"On core %d: ",cpuID); 2.418 + perror("Failed to open cycles counter"); 2.419 + } 2.420 + } 2.421 + 2.422 + //Set up counter to accumulate total cycles to process, across all CPUs 2.423 + 2.424 + retries = 0; 2.425 + do 2.426 + { retries += 1; 2.427 + cycles_counter_main_fd = 2.428 + syscall(__NR_perf_event_open, hw_event, 2.429 + 0,//pid_t: 0 is "pid of calling process" 2.430 + -1,//int: cpu, -1 means accumulate from all cores 2.431 + -1,//int: group_fd, -1 is "leader" == independent 2.432 + 0//unsigned long: flags 2.433 + ); 2.434 + } 2.435 + while(cycles_counter_main_fd<0 && retries < 100); 2.436 + if(retries >= 100) 2.437 + { 2.438 + fprintf(stderr,"in main "); 2.439 + perror("Failed to open cycles counter"); 2.440 + } 2.441 + 2.442 + //Set up counters to count cache misses 2.443 + hw_event->type = PERF_TYPE_HARDWARE; 2.444 + hw_event->config = PERF_COUNT_HW_CACHE_MISSES; //misses 2.445 + 2.446 + retries = 0; 2.447 + do 2.448 + { retries += 1; 2.449 + misses_counter_fd = 2.450 + syscall(__NR_perf_event_open, hw_event, 2.451 + 0,//pid_t: 0 is "pid of calling process" 2.452 + -1,//int: cpu, -1 means accumulate from all cores 2.453 + -1,//int: group_fd, -1 is "leader" == independent 2.454 + 0//unsigned long: flags 2.455 + ); 2.456 + } 2.457 + while(misses_counter_fd<0 && retries < 100); 2.458 + if(retries >= 100) 2.459 + { 2.460 + fprintf(stderr,"in main "); 2.461 + perror("Failed to misses counter"); 2.462 + } 2.463 + 2.464 + measurement_t startExeCycles, endExeCycles; 2.465 + BenchParams *benchParams; 2.466 + 2.467 + benchParams = malloc(sizeof(BenchParams)); 2.468 + 2.469 + benchParams->startExeCycles = &startExeCycles; 2.470 + benchParams->endExeCycles = &endExeCycles; 2.471 + 2.472 + workerParamsArray = (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) ); 2.473 + if(workerParamsArray == NULL ) printf("error mallocing worker params array\n"); 2.474 + 2.475 + 2.476 + //This is the transition to the VMS runtime 2.477 + VPThread__create_seed_procr_and_do_work( &benchmark, benchParams ); 2.478 + 2.479 + uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0; 2.480 + uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0; 2.481 + for(i=0; i<num_threads; i++){ 2.482 + printf("WorkCycles: %lu\n",workerParamsArray[i].data.totalWorkCycles); 2.483 +// printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks); 2.484 +// printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles); 2.485 +// printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs); 2.486 + totalWorkCyclesAcrossCores += workerParamsArray[i].data.totalWorkCycles; 2.487 + totalBadCyclesAcrossCores += workerParamsArray[i].data.totalBadCycles; 2.488 + totalSyncCyclesAcrossCores += workerParamsArray[i].data.totalSyncCycles; 2.489 + totalBadSyncCyclesAcrossCores += workerParamsArray[i].data.totalBadSyncCycles; 2.490 + } 2.491 + 2.492 + uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles; 2.493 + totalExeCycles -= totalBadCyclesAcrossCores; 2.494 + uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores; 2.495 + int32 numSyncs = outer_iters * num_threads * 2; 2.496 + printf("Total Execution Cycles: %lu\n", totalExeCycles); 2.497 + printf("Total number of cache misses: %lu\n", cache_misses); 2.498 + printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores); 2.499 + printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores); 2.500 +// printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores); 2.501 + printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs ); 2.502 + printf("ExeCycles/WorkCycles Ratio %f\n", 2.503 + (double)totalExeCycles / (double)totalWorkCyclesAcrossCores); 2.504 + return 0; 2.505 + }
3.1 --- a/src/Application/main.c Tue Dec 20 17:21:27 2011 +0100 3.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 3.3 @@ -1,502 +0,0 @@ 3.4 -/* 3.5 - * 3.6 - */ 3.7 -#include <stdio.h> 3.8 -#include <stdlib.h> 3.9 -#include <string.h> 3.10 -#include <math.h> 3.11 -#include <ctype.h> 3.12 -#include <errno.h> 3.13 -#include <pthread.h> 3.14 -#include <unistd.h> 3.15 -#include "VPThread_lib/VPThread.h" 3.16 -#include "VPThread_lib/VMS/Queue_impl/PrivateQueue.h" 3.17 - 3.18 -#include <linux/perf_event.h> 3.19 -#include <linux/prctl.h> 3.20 -#include <sys/syscall.h> 3.21 - 3.22 -#undef DEBUG 3.23 -//#define DEBUG 3.24 - 3.25 -#if !defined(unix) && !defined(__unix__) 3.26 -#ifdef __MACH__ 3.27 -#define unix 1 3.28 -#define __unix__ 1 3.29 -#endif /* __MACH__ */ 3.30 -#endif /* unix */ 3.31 - 3.32 -/* find the appropriate way to define explicitly sized types */ 3.33 -/* for C99 or GNU libc (also mach's libc) we can use stdint.h */ 3.34 -#if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__) 3.35 -#include <stdint.h> 3.36 -#elif defined(unix) || defined(__unix__) /* some UNIX systems have them in sys/types.h */ 3.37 -#include <sys/types.h> 3.38 -#elif defined(__WIN32__) || defined(WIN32) /* the nameless one */ 3.39 -typedef unsigned __int8 uint8_t; 3.40 -typedef unsigned __int32 uint32_t; 3.41 -#endif /* sized type detection */ 3.42 - 3.43 -/* provide a millisecond-resolution timer for each system */ 3.44 -#if defined(unix) || defined(__unix__) 3.45 -#include <time.h> 3.46 -#include <sys/time.h> 3.47 -unsigned long get_msec(void) { 3.48 - static struct timeval timeval, first_timeval; 3.49 - 3.50 - gettimeofday(&timeval, 0); 3.51 - if(first_timeval.tv_sec == 0) { 3.52 - first_timeval = timeval; 3.53 - return 0; 3.54 - } 3.55 - return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000; 3.56 -} 3.57 -#elif defined(__WIN32__) || defined(WIN32) 3.58 -#include <windows.h> 3.59 -unsigned long get_msec(void) { 3.60 - return GetTickCount(); 3.61 -} 3.62 -#else 3.63 -//#error "I don't know how to measure time on your platform" 3.64 -#endif 3.65 - 3.66 -//======================== Globals ========================= 3.67 -char __ProgrammName[] = "overhead_test"; 3.68 -char __DataSet[255]; 3.69 - 3.70 -int outer_iters, inner_iters, num_threads; 3.71 -size_t chunk_size = 0; 3.72 - 3.73 -int cycles_counter_main_fd; 3.74 -int misses_counter_fd; 3.75 - 3.76 -uint64_t cache_misses; 3.77 - 3.78 -int cycles_counter_fd[NUM_CORES]; 3.79 -struct perf_event_attr* hw_event; 3.80 - 3.81 -//======================== Defines ========================= 3.82 -typedef struct perfData measurement_t; 3.83 -struct perfData{ 3.84 - uint64 cycles; 3.85 -} __align_to_cacheline__; 3.86 - 3.87 -const char *usage = { 3.88 - "Usage: malloc_test [options]\n" 3.89 - " Spwans a number of threads and allocates memory.\n\n" 3.90 - "Options:\n" 3.91 - " -t <num> how many threads to use (default: 1). This is internaly multiplied by the number of cores.\n" 3.92 - " -o <num> repeat workload and sync operation <m> times\n" 3.93 - " -i <num> size of workload, repeat <n> times\n" 3.94 - " -h this help screen\n\n" 3.95 -}; 3.96 - 3.97 -struct barrier_t 3.98 -{ 3.99 - int counter; 3.100 - int nthreads; 3.101 - int32 mutex; 3.102 - int32 cond; 3.103 - measurement_t endBarrierCycles; 3.104 - 3.105 -} __align_to_cacheline__; 3.106 -typedef struct barrier_t barrier; 3.107 - 3.108 -void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr) 3.109 - { 3.110 - barr->counter = 0; 3.111 - barr->nthreads = nthreads; 3.112 - barr->mutex = VPThread__make_mutex(animatingPr); 3.113 - barr->cond = VPThread__make_cond(barr->mutex, animatingPr); 3.114 - } 3.115 - 3.116 -void inline barrier_wait(barrier *barr, VirtProcr *animatingPr) 3.117 - { int i; 3.118 - 3.119 - VPThread__mutex_lock(barr->mutex, animatingPr); 3.120 - barr->counter++; 3.121 - if(barr->counter == barr->nthreads) 3.122 - { 3.123 - read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \ 3.124 - sizeof(barr->endBarrierCycles.cycles)); 3.125 - 3.126 - barr->counter = 0; 3.127 - for(i=0; i < barr->nthreads; i++) 3.128 - VPThread__cond_signal(barr->cond, animatingPr); 3.129 - } 3.130 - else 3.131 - { VPThread__cond_wait(barr->cond, animatingPr); 3.132 - } 3.133 - VPThread__mutex_unlock(barr->mutex, animatingPr); 3.134 - } 3.135 - 3.136 - 3.137 - 3.138 -struct WorkerParams_t 3.139 - { struct barrier_t* barrier; 3.140 - uint64_t totalWorkCycles; 3.141 - uint64_t totalBadCycles; 3.142 - uint64_t totalSyncCycles; 3.143 - uint64_t totalBadSyncCycles; 3.144 - uint64 numGoodSyncs; 3.145 - uint64 numGoodTasks; 3.146 - }; 3.147 - 3.148 - typedef union 3.149 - { 3.150 - struct WorkerParams_t data; 3.151 - char padding[CACHELINE_SIZE]; 3.152 - } WorkerParams __align_to_cacheline__; 3.153 - 3.154 -WorkerParams *workerParamsArray; 3.155 - 3.156 -typedef struct 3.157 - { measurement_t *startExeCycles; 3.158 - measurement_t *endExeCycles; 3.159 - } BenchParams __align_to_cacheline__; 3.160 - 3.161 -//======================== App Code ========================= 3.162 -/* 3.163 - p* Workload 3.164 - */ 3.165 - 3.166 -#define saveCyclesAndInstrs(core,cycles) do{ \ 3.167 - int cycles_fd = cycles_counter_fd[core]; \ 3.168 - int nread; \ 3.169 - \ 3.170 - nread = read(cycles_fd,&(cycles),sizeof(cycles)); \ 3.171 - if(nread<0){ \ 3.172 - perror("Error reading cycles counter"); \ 3.173 - cycles = 0; \ 3.174 - } \ 3.175 -} while (0) //macro magic for scoping 3.176 - 3.177 -#define saveMisses(misses) do{ \ 3.178 - int nread; \ 3.179 - \ 3.180 - nread = read(misses_counter_fd,&(misses),sizeof(misses)); \ 3.181 - if(nread<0){ \ 3.182 - perror("Error reading misses counter"); \ 3.183 - misses = 0; \ 3.184 - } \ 3.185 -} while (0) //macro magic for scoping 3.186 - 3.187 - 3.188 -double 3.189 -worker_TLF(void* _params, VirtProcr* animatingPr) 3.190 - { 3.191 - int i,o; 3.192 - WorkerParams* params = (WorkerParams*)_params; 3.193 - unsigned int totalWorkCycles = 0, totalBadCycles = 0; 3.194 - unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0; 3.195 - unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0; 3.196 - double workspace2=0.0; 3.197 - int32 privateMutex = VPThread__make_mutex(animatingPr); 3.198 - 3.199 - int cpuid = sched_getcpu(); 3.200 - 3.201 - measurement_t startWorkload, endWorkload; 3.202 - uint64 numCycles; 3.203 - for(o=0; o < outer_iters; o++) 3.204 - { 3.205 - 3.206 - saveCyclesAndInstrs(cpuid,startWorkload.cycles); 3.207 - 3.208 - //task 3.209 - for(i=0; i < inner_iters; i++) 3.210 - { 3.211 - workspace1 += (workspace1 + 32)/2; 3.212 - workspace2 += (workspace2 + 23.2)/1.4; 3.213 - } 3.214 - 3.215 - saveCyclesAndInstrs(cpuid,endWorkload.cycles); 3.216 - numCycles = endWorkload.cycles - startWorkload.cycles; 3.217 - //sanity check (400K is about 20K iters) 3.218 - if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;} 3.219 - else {totalBadCycles += numCycles; } 3.220 - 3.221 - //mutex access often causes switch to different Slave VP 3.222 - VPThread__mutex_lock(privateMutex, animatingPr); 3.223 - 3.224 -/* 3.225 - saveCyclesAndInstrs(cpuid,startWorkload2.cycles); 3.226 - //Task 3.227 - for(i=0; i < inner_iters; i++) 3.228 - { 3.229 - workspace1 += (workspace1 + 32)/2; 3.230 - workspace2 += (workspace2 + 23.2)/1.4; 3.231 - } 3.232 - 3.233 - saveCyclesAndInstrs(cpuid,endWorkload2.cycles); 3.234 - numCycles = endWorkload2.cycles - startWorkload2.cycles; 3.235 - //sanity check (400K is about 20K iters) 3.236 - if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;} 3.237 - else {totalBadCycles += numCycles; } 3.238 - 3.239 -*/ 3.240 - VPThread__mutex_unlock(privateMutex, animatingPr); 3.241 - } 3.242 - 3.243 - params->data.totalWorkCycles = totalWorkCycles; 3.244 - params->data.totalBadCycles = totalBadCycles; 3.245 - params->data.numGoodTasks = numGoodTasks; 3.246 - params->data.totalSyncCycles = totalSyncCycles; 3.247 - params->data.totalBadSyncCycles = totalBadSyncCycles; 3.248 - params->data.numGoodSyncs = numGoodSyncs; 3.249 -/* 3.250 - params->totalSyncCycles = VMS__give_num_plugin_cycles(); 3.251 - params->totalBadSyncCycles = 0; 3.252 - params->numGoodSyncs = VMS__give_num_plugin_animations(); 3.253 -*/ 3.254 - 3.255 - 3.256 - //Wait for all threads to end 3.257 - barrier_wait(params->data.barrier, animatingPr); 3.258 - 3.259 - //Shutdown worker 3.260 - VPThread__dissipate_thread(animatingPr); 3.261 - 3.262 - //below return never reached --> there for gcc 3.263 - return (workspace1 + workspace2); //to prevent gcc from optimizing work out 3.264 - } 3.265 - 3.266 -//local variables of benchmark, made global for alignment 3.267 -struct barrier_t barr __align_to_cacheline__; 3.268 -BenchParams *params __align_to_cacheline__; 3.269 - 3.270 -/* this is run after the VMS is set up*/ 3.271 -void benchmark(void *_params, VirtProcr *animatingPr) 3.272 - { 3.273 - int i; 3.274 - 3.275 - params = (BenchParams *)_params; 3.276 - 3.277 - barrier_init(&barr, num_threads+1, animatingPr); 3.278 - 3.279 - //prepare input 3.280 - for(i=0; i<num_threads; i++) 3.281 - { 3.282 - workerParamsArray[i].data.barrier = &barr; 3.283 - } 3.284 - 3.285 - uint64_t cache_misses_at_start, cache_misses_at_end; 3.286 - saveMisses(cache_misses_at_start); 3.287 - //save cycles before execution of threads, to get total exe cycles 3.288 - int nread = read(cycles_counter_main_fd, &(params->startExeCycles->cycles), 3.289 - sizeof(params->startExeCycles->cycles)); 3.290 - if(nread<0) perror("Error reading cycles counter"); 3.291 - 3.292 - //create (which starts running) all threads 3.293 - for(i=0; i<num_threads; i++) 3.294 - { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr); 3.295 - } 3.296 - //wait for all threads to finish 3.297 - barrier_wait(&barr, animatingPr); 3.298 - 3.299 - //endBarrierCycles read in barrier_wait()! Merten, email me if want to chg 3.300 - params->endExeCycles->cycles = barr.endBarrierCycles.cycles; 3.301 - saveMisses(cache_misses_at_end); 3.302 - cache_misses = cache_misses_at_end-cache_misses_at_start; 3.303 -/* 3.304 - uint64_t overallWorkCycles = 0; 3.305 - for(i=0; i<num_threads; i++){ 3.306 - printf("WorkCycles: %lu\n",input[i].totalWorkCycles); 3.307 - overallWorkCycles += input[i].totalWorkCycles; 3.308 - } 3.309 - 3.310 - printf("Sum across threads of work cycles: %lu\n", overallWorkCycles); 3.311 - printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles); 3.312 - printf("Runtime/Workcycle Ratio %lu\n", 3.313 - ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles); 3.314 -*/ 3.315 - 3.316 - //====================================================== 3.317 - 3.318 - VPThread__dissipate_thread(animatingPr); 3.319 - } 3.320 - 3.321 -int main(int argc, char **argv) 3.322 - { 3.323 - int i; 3.324 - 3.325 - //set global static variables, based on cmd-line args 3.326 - for(i=1; i<argc; i++) 3.327 - { 3.328 - if(argv[i][0] == '-' && argv[i][2] == 0) 3.329 - { 3.330 - switch(argv[i][1]) 3.331 - { 3.332 - case 't': 3.333 - if(!isdigit(argv[++i][0])) 3.334 - { 3.335 - fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n"); 3.336 - return EXIT_FAILURE; 3.337 - } 3.338 - num_threads = atoi(argv[i]); 3.339 - if(!num_threads) 3.340 - { 3.341 - fprintf(stderr, "invalid number of threads specified: %d\n", num_threads); 3.342 - return EXIT_FAILURE; 3.343 - } 3.344 - break; 3.345 - case 'o': 3.346 - if(!isdigit(argv[++i][0])) 3.347 - { 3.348 - fputs("-i must be followed by a number\n", stderr); 3.349 - return EXIT_FAILURE; 3.350 - } 3.351 - outer_iters = atoi(argv[i]); 3.352 - break; 3.353 - case 'i': 3.354 - if(!isdigit(argv[++i][0])) 3.355 - { 3.356 - fputs("-o must be followed by a number (workload size)\n", stderr); 3.357 - return EXIT_FAILURE; 3.358 - } 3.359 - inner_iters = atoi(argv[i]); 3.360 - break; 3.361 - case 'h': 3.362 - fputs(usage, stdout); 3.363 - return 0; 3.364 - 3.365 - default: 3.366 - fprintf(stderr, "unrecognized argument: %s\n", argv[i]); 3.367 - fputs(usage, stderr); 3.368 - return EXIT_FAILURE; 3.369 - }//switch 3.370 - }//if arg 3.371 - else 3.372 - { 3.373 - fprintf(stderr, "unrecognized argument: %s\n", argv[i]); 3.374 - fputs(usage, stderr); 3.375 - return EXIT_FAILURE; 3.376 - } 3.377 - }//for 3.378 - 3.379 - 3.380 - //setup performance counters 3.381 - hw_event = malloc(sizeof(struct perf_event_attr)); 3.382 - memset(hw_event,0,sizeof(struct perf_event_attr)); 3.383 - 3.384 - hw_event->type = PERF_TYPE_HARDWARE; 3.385 - hw_event->size = sizeof(hw_event); 3.386 - hw_event->disabled = 0; 3.387 - hw_event->freq = 0; 3.388 - hw_event->inherit = 1; /* children inherit it */ 3.389 - hw_event->pinned = 1; /* says this virt counter must always be on HW */ 3.390 - hw_event->exclusive = 0; /* only group on PMU */ 3.391 - hw_event->exclude_user = 0; /* don't count user */ 3.392 - hw_event->exclude_kernel = 1; /* don't count kernel */ 3.393 - hw_event->exclude_hv = 1; /* ditto hypervisor */ 3.394 - hw_event->exclude_idle = 1; /* don't count when idle */ 3.395 - hw_event->mmap = 0; /* include mmap data */ 3.396 - hw_event->comm = 0; /* include comm data */ 3.397 - 3.398 - hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles 3.399 - 3.400 - int cpuID, retries; 3.401 - 3.402 - for( cpuID = 0; cpuID < NUM_CORES; cpuID++ ) 3.403 - { retries = 0; 3.404 - do 3.405 - { retries += 1; 3.406 - cycles_counter_fd[cpuID] = 3.407 - syscall(__NR_perf_event_open, hw_event, 3.408 - 0,//pid_t: 0 is "pid of calling process" 3.409 - cpuID,//int: cpu, the value returned by "CPUID" instr(?) 3.410 - -1,//int: group_fd, -1 is "leader" or independent 3.411 - 0//unsigned long: flags 3.412 - ); 3.413 - } 3.414 - while(cycles_counter_fd[cpuID]<0 && retries < 100); 3.415 - if(retries >= 100) 3.416 - { 3.417 - fprintf(stderr,"On core %d: ",cpuID); 3.418 - perror("Failed to open cycles counter"); 3.419 - } 3.420 - } 3.421 - 3.422 - //Set up counter to accumulate total cycles to process, across all CPUs 3.423 - 3.424 - retries = 0; 3.425 - do 3.426 - { retries += 1; 3.427 - cycles_counter_main_fd = 3.428 - syscall(__NR_perf_event_open, hw_event, 3.429 - 0,//pid_t: 0 is "pid of calling process" 3.430 - -1,//int: cpu, -1 means accumulate from all cores 3.431 - -1,//int: group_fd, -1 is "leader" == independent 3.432 - 0//unsigned long: flags 3.433 - ); 3.434 - } 3.435 - while(cycles_counter_main_fd<0 && retries < 100); 3.436 - if(retries >= 100) 3.437 - { 3.438 - fprintf(stderr,"in main "); 3.439 - perror("Failed to open cycles counter"); 3.440 - } 3.441 - 3.442 - //Set up counters to count cache misses 3.443 - hw_event->type = PERF_TYPE_HARDWARE; 3.444 - hw_event->config = PERF_COUNT_HW_CACHE_MISSES; //misses 3.445 - 3.446 - retries = 0; 3.447 - do 3.448 - { retries += 1; 3.449 - misses_counter_fd = 3.450 - syscall(__NR_perf_event_open, hw_event, 3.451 - 0,//pid_t: 0 is "pid of calling process" 3.452 - -1,//int: cpu, -1 means accumulate from all cores 3.453 - -1,//int: group_fd, -1 is "leader" == independent 3.454 - 0//unsigned long: flags 3.455 - ); 3.456 - } 3.457 - while(misses_counter_fd<0 && retries < 100); 3.458 - if(retries >= 100) 3.459 - { 3.460 - fprintf(stderr,"in main "); 3.461 - perror("Failed to misses counter"); 3.462 - } 3.463 - 3.464 - measurement_t startExeCycles, endExeCycles; 3.465 - BenchParams *benchParams; 3.466 - 3.467 - benchParams = malloc(sizeof(BenchParams)); 3.468 - 3.469 - benchParams->startExeCycles = &startExeCycles; 3.470 - benchParams->endExeCycles = &endExeCycles; 3.471 - 3.472 - workerParamsArray = (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) ); 3.473 - if(workerParamsArray == NULL ) printf("error mallocing worker params array\n"); 3.474 - 3.475 - 3.476 - //This is the transition to the VMS runtime 3.477 - VPThread__create_seed_procr_and_do_work( &benchmark, benchParams ); 3.478 - 3.479 - uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0; 3.480 - uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0; 3.481 - for(i=0; i<num_threads; i++){ 3.482 - printf("WorkCycles: %lu\n",workerParamsArray[i].data.totalWorkCycles); 3.483 -// printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks); 3.484 -// printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles); 3.485 -// printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs); 3.486 - totalWorkCyclesAcrossCores += workerParamsArray[i].data.totalWorkCycles; 3.487 - totalBadCyclesAcrossCores += workerParamsArray[i].data.totalBadCycles; 3.488 - totalSyncCyclesAcrossCores += workerParamsArray[i].data.totalSyncCycles; 3.489 - totalBadSyncCyclesAcrossCores += workerParamsArray[i].data.totalBadSyncCycles; 3.490 - } 3.491 - 3.492 - uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles; 3.493 - totalExeCycles -= totalBadCyclesAcrossCores; 3.494 - uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores; 3.495 - int32 numSyncs = outer_iters * num_threads * 2; 3.496 - printf("Total Execution Cycles: %lu\n", totalExeCycles); 3.497 - printf("Total number of cache misses: %lu\n", cache_misses); 3.498 - printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores); 3.499 - printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores); 3.500 -// printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores); 3.501 - printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs ); 3.502 - printf("ExeCycles/WorkCycles Ratio %f\n", 3.503 - (double)totalExeCycles / (double)totalWorkCyclesAcrossCores); 3.504 - return 0; 3.505 - }
