Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > Vthread > Vthread__Best_Effort_Msg__Bench
diff main.c @ 17:281cadcbb796
changed directory structure, added .hgeol file
| author | Merten Sach <msach@mailbox.tu-berlin.de> |
|---|---|
| date | Mon, 13 Feb 2012 16:12:20 +0100 |
| parents | src/Application/main.c@c3561dbac1dc |
| children | e7277df4460e |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/main.c Mon Feb 13 16:12:20 2012 +0100 1.3 @@ -0,0 +1,502 @@ 1.4 +/* 1.5 + * 1.6 + */ 1.7 +#include <stdio.h> 1.8 +#include <stdlib.h> 1.9 +#include <string.h> 1.10 +#include <math.h> 1.11 +#include <ctype.h> 1.12 +#include <errno.h> 1.13 +#include <pthread.h> 1.14 +#include <unistd.h> 1.15 +#include "VPThread_lib/VPThread.h" 1.16 +#include "VPThread_lib/VMS/Queue_impl/PrivateQueue.h" 1.17 + 1.18 +#include <linux/perf_event.h> 1.19 +#include <linux/prctl.h> 1.20 +#include <sys/syscall.h> 1.21 + 1.22 +#undef DEBUG 1.23 +//#define DEBUG 1.24 + 1.25 +#if !defined(unix) && !defined(__unix__) 1.26 +#ifdef __MACH__ 1.27 +#define unix 1 1.28 +#define __unix__ 1 1.29 +#endif /* __MACH__ */ 1.30 +#endif /* unix */ 1.31 + 1.32 +/* find the appropriate way to define explicitly sized types */ 1.33 +/* for C99 or GNU libc (also mach's libc) we can use stdint.h */ 1.34 +#if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__) 1.35 +#include <stdint.h> 1.36 +#elif defined(unix) || defined(__unix__) /* some UNIX systems have them in sys/types.h */ 1.37 +#include <sys/types.h> 1.38 +#elif defined(__WIN32__) || defined(WIN32) /* the nameless one */ 1.39 +typedef unsigned __int8 uint8_t; 1.40 +typedef unsigned __int32 uint32_t; 1.41 +#endif /* sized type detection */ 1.42 + 1.43 +/* provide a millisecond-resolution timer for each system */ 1.44 +#if defined(unix) || defined(__unix__) 1.45 +#include <time.h> 1.46 +#include <sys/time.h> 1.47 +unsigned long get_msec(void) { 1.48 + static struct timeval timeval, first_timeval; 1.49 + 1.50 + gettimeofday(&timeval, 0); 1.51 + if(first_timeval.tv_sec == 0) { 1.52 + first_timeval = timeval; 1.53 + return 0; 1.54 + } 1.55 + return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000; 1.56 +} 1.57 +#elif defined(__WIN32__) || defined(WIN32) 1.58 +#include <windows.h> 1.59 +unsigned long get_msec(void) { 1.60 + return GetTickCount(); 1.61 +} 1.62 +#else 1.63 +//#error "I don't know how to measure time on your platform" 1.64 +#endif 1.65 + 1.66 +//======================== Globals ========================= 1.67 +char __ProgrammName[] = "overhead_test"; 1.68 +char __DataSet[255]; 1.69 + 1.70 +int outer_iters, inner_iters, num_threads; 1.71 +size_t chunk_size = 0; 1.72 + 1.73 +int cycles_counter_main_fd; 1.74 +int misses_counter_fd; 1.75 + 1.76 +uint64_t cache_misses; 1.77 + 1.78 +int cycles_counter_fd[NUM_CORES]; 1.79 +struct perf_event_attr* hw_event; 1.80 + 1.81 +//======================== Defines ========================= 1.82 +typedef struct perfData measurement_t; 1.83 +struct perfData{ 1.84 + uint64 cycles; 1.85 +} __align_to_cacheline__; 1.86 + 1.87 +const char *usage = { 1.88 + "Usage: malloc_test [options]\n" 1.89 + " Spwans a number of threads and allocates memory.\n\n" 1.90 + "Options:\n" 1.91 + " -t <num> how many threads to use (default: 1). This is internaly multiplied by the number of cores.\n" 1.92 + " -o <num> repeat workload and sync operation <m> times\n" 1.93 + " -i <num> size of workload, repeat <n> times\n" 1.94 + " -h this help screen\n\n" 1.95 +}; 1.96 + 1.97 +struct barrier_t 1.98 +{ 1.99 + int counter; 1.100 + int nthreads; 1.101 + int32 mutex; 1.102 + int32 cond; 1.103 + measurement_t endBarrierCycles; 1.104 + 1.105 +} __align_to_cacheline__; 1.106 +typedef struct barrier_t barrier; 1.107 + 1.108 +void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr) 1.109 + { 1.110 + barr->counter = 0; 1.111 + barr->nthreads = nthreads; 1.112 + barr->mutex = VPThread__make_mutex(animatingPr); 1.113 + barr->cond = VPThread__make_cond(barr->mutex, animatingPr); 1.114 + } 1.115 + 1.116 +void inline barrier_wait(barrier *barr, VirtProcr *animatingPr) 1.117 + { int i; 1.118 + 1.119 + VPThread__mutex_lock(barr->mutex, animatingPr); 1.120 + barr->counter++; 1.121 + if(barr->counter == barr->nthreads) 1.122 + { 1.123 + read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \ 1.124 + sizeof(barr->endBarrierCycles.cycles)); 1.125 + 1.126 + barr->counter = 0; 1.127 + for(i=0; i < barr->nthreads; i++) 1.128 + VPThread__cond_signal(barr->cond, animatingPr); 1.129 + } 1.130 + else 1.131 + { VPThread__cond_wait(barr->cond, animatingPr); 1.132 + } 1.133 + VPThread__mutex_unlock(barr->mutex, animatingPr); 1.134 + } 1.135 + 1.136 + 1.137 + 1.138 +struct WorkerParams_t 1.139 + { struct barrier_t* barrier; 1.140 + uint64_t totalWorkCycles; 1.141 + uint64_t totalBadCycles; 1.142 + uint64_t totalSyncCycles; 1.143 + uint64_t totalBadSyncCycles; 1.144 + uint64 numGoodSyncs; 1.145 + uint64 numGoodTasks; 1.146 + }; 1.147 + 1.148 + typedef union 1.149 + { 1.150 + struct WorkerParams_t data; 1.151 + char padding[CACHELINE_SIZE]; 1.152 + } WorkerParams __align_to_cacheline__; 1.153 + 1.154 +WorkerParams *workerParamsArray; 1.155 + 1.156 +typedef struct 1.157 + { measurement_t *startExeCycles; 1.158 + measurement_t *endExeCycles; 1.159 + } BenchParams __align_to_cacheline__; 1.160 + 1.161 +//======================== App Code ========================= 1.162 +/* 1.163 + p* Workload 1.164 + */ 1.165 + 1.166 +#define saveCyclesAndInstrs(core,cycles) do{ \ 1.167 + int cycles_fd = cycles_counter_fd[core]; \ 1.168 + int nread; \ 1.169 + \ 1.170 + nread = read(cycles_fd,&(cycles),sizeof(cycles)); \ 1.171 + if(nread<0){ \ 1.172 + perror("Error reading cycles counter"); \ 1.173 + cycles = 0; \ 1.174 + } \ 1.175 +} while (0) //macro magic for scoping 1.176 + 1.177 +#define saveMisses(misses) do{ \ 1.178 + int nread; \ 1.179 + \ 1.180 + nread = read(misses_counter_fd,&(misses),sizeof(misses)); \ 1.181 + if(nread<0){ \ 1.182 + perror("Error reading misses counter"); \ 1.183 + misses = 0; \ 1.184 + } \ 1.185 +} while (0) //macro magic for scoping 1.186 + 1.187 + 1.188 +double 1.189 +worker_TLF(void* _params, VirtProcr* animatingPr) 1.190 + { 1.191 + int i,o; 1.192 + WorkerParams* params = (WorkerParams*)_params; 1.193 + unsigned int totalWorkCycles = 0, totalBadCycles = 0; 1.194 + unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0; 1.195 + unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0; 1.196 + double workspace2=0.0; 1.197 + int32 privateMutex = VPThread__make_mutex(animatingPr); 1.198 + 1.199 + int cpuid = sched_getcpu(); 1.200 + 1.201 + measurement_t startWorkload, endWorkload; 1.202 + uint64 numCycles; 1.203 + for(o=0; o < outer_iters; o++) 1.204 + { 1.205 + 1.206 + saveCyclesAndInstrs(cpuid,startWorkload.cycles); 1.207 + 1.208 + //task 1.209 + for(i=0; i < inner_iters; i++) 1.210 + { 1.211 + workspace1 += (workspace1 + 32)/2; 1.212 + workspace2 += (workspace2 + 23.2)/1.4; 1.213 + } 1.214 + 1.215 + saveCyclesAndInstrs(cpuid,endWorkload.cycles); 1.216 + numCycles = endWorkload.cycles - startWorkload.cycles; 1.217 + //sanity check (400K is about 20K iters) 1.218 + if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;} 1.219 + else {totalBadCycles += numCycles; } 1.220 + 1.221 + //mutex access often causes switch to different Slave VP 1.222 + VPThread__mutex_lock(privateMutex, animatingPr); 1.223 + 1.224 +/* 1.225 + saveCyclesAndInstrs(cpuid,startWorkload2.cycles); 1.226 + //Task 1.227 + for(i=0; i < inner_iters; i++) 1.228 + { 1.229 + workspace1 += (workspace1 + 32)/2; 1.230 + workspace2 += (workspace2 + 23.2)/1.4; 1.231 + } 1.232 + 1.233 + saveCyclesAndInstrs(cpuid,endWorkload2.cycles); 1.234 + numCycles = endWorkload2.cycles - startWorkload2.cycles; 1.235 + //sanity check (400K is about 20K iters) 1.236 + if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;} 1.237 + else {totalBadCycles += numCycles; } 1.238 + 1.239 +*/ 1.240 + VPThread__mutex_unlock(privateMutex, animatingPr); 1.241 + } 1.242 + 1.243 + params->data.totalWorkCycles = totalWorkCycles; 1.244 + params->data.totalBadCycles = totalBadCycles; 1.245 + params->data.numGoodTasks = numGoodTasks; 1.246 + params->data.totalSyncCycles = totalSyncCycles; 1.247 + params->data.totalBadSyncCycles = totalBadSyncCycles; 1.248 + params->data.numGoodSyncs = numGoodSyncs; 1.249 +/* 1.250 + params->totalSyncCycles = VMS__give_num_plugin_cycles(); 1.251 + params->totalBadSyncCycles = 0; 1.252 + params->numGoodSyncs = VMS__give_num_plugin_animations(); 1.253 +*/ 1.254 + 1.255 + 1.256 + //Wait for all threads to end 1.257 + barrier_wait(params->data.barrier, animatingPr); 1.258 + 1.259 + //Shutdown worker 1.260 + VPThread__dissipate_thread(animatingPr); 1.261 + 1.262 + //below return never reached --> there for gcc 1.263 + return (workspace1 + workspace2); //to prevent gcc from optimizing work out 1.264 + } 1.265 + 1.266 +//local variables of benchmark, made global for alignment 1.267 +struct barrier_t barr __align_to_cacheline__; 1.268 +BenchParams *params __align_to_cacheline__; 1.269 + 1.270 +/* this is run after the VMS is set up*/ 1.271 +void benchmark(void *_params, VirtProcr *animatingPr) 1.272 + { 1.273 + int i; 1.274 + 1.275 + params = (BenchParams *)_params; 1.276 + 1.277 + barrier_init(&barr, num_threads+1, animatingPr); 1.278 + 1.279 + //prepare input 1.280 + for(i=0; i<num_threads; i++) 1.281 + { 1.282 + workerParamsArray[i].data.barrier = &barr; 1.283 + } 1.284 + 1.285 + uint64_t cache_misses_at_start, cache_misses_at_end; 1.286 + saveMisses(cache_misses_at_start); 1.287 + //save cycles before execution of threads, to get total exe cycles 1.288 + int nread = read(cycles_counter_main_fd, &(params->startExeCycles->cycles), 1.289 + sizeof(params->startExeCycles->cycles)); 1.290 + if(nread<0) perror("Error reading cycles counter"); 1.291 + 1.292 + //create (which starts running) all threads 1.293 + for(i=0; i<num_threads; i++) 1.294 + { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr); 1.295 + } 1.296 + //wait for all threads to finish 1.297 + barrier_wait(&barr, animatingPr); 1.298 + 1.299 + //endBarrierCycles read in barrier_wait()! Merten, email me if want to chg 1.300 + params->endExeCycles->cycles = barr.endBarrierCycles.cycles; 1.301 + saveMisses(cache_misses_at_end); 1.302 + cache_misses = cache_misses_at_end-cache_misses_at_start; 1.303 +/* 1.304 + uint64_t overallWorkCycles = 0; 1.305 + for(i=0; i<num_threads; i++){ 1.306 + printf("WorkCycles: %lu\n",input[i].totalWorkCycles); 1.307 + overallWorkCycles += input[i].totalWorkCycles; 1.308 + } 1.309 + 1.310 + printf("Sum across threads of work cycles: %lu\n", overallWorkCycles); 1.311 + printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles); 1.312 + printf("Runtime/Workcycle Ratio %lu\n", 1.313 + ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles); 1.314 +*/ 1.315 + 1.316 + //====================================================== 1.317 + 1.318 + VPThread__dissipate_thread(animatingPr); 1.319 + } 1.320 + 1.321 +int main(int argc, char **argv) 1.322 + { 1.323 + int i; 1.324 + 1.325 + //set global static variables, based on cmd-line args 1.326 + for(i=1; i<argc; i++) 1.327 + { 1.328 + if(argv[i][0] == '-' && argv[i][2] == 0) 1.329 + { 1.330 + switch(argv[i][1]) 1.331 + { 1.332 + case 't': 1.333 + if(!isdigit(argv[++i][0])) 1.334 + { 1.335 + fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n"); 1.336 + return EXIT_FAILURE; 1.337 + } 1.338 + num_threads = atoi(argv[i]); 1.339 + if(!num_threads) 1.340 + { 1.341 + fprintf(stderr, "invalid number of threads specified: %d\n", num_threads); 1.342 + return EXIT_FAILURE; 1.343 + } 1.344 + break; 1.345 + case 'o': 1.346 + if(!isdigit(argv[++i][0])) 1.347 + { 1.348 + fputs("-i must be followed by a number\n", stderr); 1.349 + return EXIT_FAILURE; 1.350 + } 1.351 + outer_iters = atoi(argv[i]); 1.352 + break; 1.353 + case 'i': 1.354 + if(!isdigit(argv[++i][0])) 1.355 + { 1.356 + fputs("-o must be followed by a number (workload size)\n", stderr); 1.357 + return EXIT_FAILURE; 1.358 + } 1.359 + inner_iters = atoi(argv[i]); 1.360 + break; 1.361 + case 'h': 1.362 + fputs(usage, stdout); 1.363 + return 0; 1.364 + 1.365 + default: 1.366 + fprintf(stderr, "unrecognized argument: %s\n", argv[i]); 1.367 + fputs(usage, stderr); 1.368 + return EXIT_FAILURE; 1.369 + }//switch 1.370 + }//if arg 1.371 + else 1.372 + { 1.373 + fprintf(stderr, "unrecognized argument: %s\n", argv[i]); 1.374 + fputs(usage, stderr); 1.375 + return EXIT_FAILURE; 1.376 + } 1.377 + }//for 1.378 + 1.379 + 1.380 + //setup performance counters 1.381 + hw_event = malloc(sizeof(struct perf_event_attr)); 1.382 + memset(hw_event,0,sizeof(struct perf_event_attr)); 1.383 + 1.384 + hw_event->type = PERF_TYPE_HARDWARE; 1.385 + hw_event->size = sizeof(hw_event); 1.386 + hw_event->disabled = 0; 1.387 + hw_event->freq = 0; 1.388 + hw_event->inherit = 1; /* children inherit it */ 1.389 + hw_event->pinned = 1; /* says this virt counter must always be on HW */ 1.390 + hw_event->exclusive = 0; /* only group on PMU */ 1.391 + hw_event->exclude_user = 0; /* don't count user */ 1.392 + hw_event->exclude_kernel = 1; /* don't count kernel */ 1.393 + hw_event->exclude_hv = 1; /* ditto hypervisor */ 1.394 + hw_event->exclude_idle = 1; /* don't count when idle */ 1.395 + hw_event->mmap = 0; /* include mmap data */ 1.396 + hw_event->comm = 0; /* include comm data */ 1.397 + 1.398 + hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles 1.399 + 1.400 + int cpuID, retries; 1.401 + 1.402 + for( cpuID = 0; cpuID < NUM_CORES; cpuID++ ) 1.403 + { retries = 0; 1.404 + do 1.405 + { retries += 1; 1.406 + cycles_counter_fd[cpuID] = 1.407 + syscall(__NR_perf_event_open, hw_event, 1.408 + 0,//pid_t: 0 is "pid of calling process" 1.409 + cpuID,//int: cpu, the value returned by "CPUID" instr(?) 1.410 + -1,//int: group_fd, -1 is "leader" or independent 1.411 + 0//unsigned long: flags 1.412 + ); 1.413 + } 1.414 + while(cycles_counter_fd[cpuID]<0 && retries < 100); 1.415 + if(retries >= 100) 1.416 + { 1.417 + fprintf(stderr,"On core %d: ",cpuID); 1.418 + perror("Failed to open cycles counter"); 1.419 + } 1.420 + } 1.421 + 1.422 + //Set up counter to accumulate total cycles to process, across all CPUs 1.423 + 1.424 + retries = 0; 1.425 + do 1.426 + { retries += 1; 1.427 + cycles_counter_main_fd = 1.428 + syscall(__NR_perf_event_open, hw_event, 1.429 + 0,//pid_t: 0 is "pid of calling process" 1.430 + -1,//int: cpu, -1 means accumulate from all cores 1.431 + -1,//int: group_fd, -1 is "leader" == independent 1.432 + 0//unsigned long: flags 1.433 + ); 1.434 + } 1.435 + while(cycles_counter_main_fd<0 && retries < 100); 1.436 + if(retries >= 100) 1.437 + { 1.438 + fprintf(stderr,"in main "); 1.439 + perror("Failed to open cycles counter"); 1.440 + } 1.441 + 1.442 + //Set up counters to count cache misses 1.443 + hw_event->type = PERF_TYPE_HARDWARE; 1.444 + hw_event->config = PERF_COUNT_HW_CACHE_MISSES; //misses 1.445 + 1.446 + retries = 0; 1.447 + do 1.448 + { retries += 1; 1.449 + misses_counter_fd = 1.450 + syscall(__NR_perf_event_open, hw_event, 1.451 + 0,//pid_t: 0 is "pid of calling process" 1.452 + -1,//int: cpu, -1 means accumulate from all cores 1.453 + -1,//int: group_fd, -1 is "leader" == independent 1.454 + 0//unsigned long: flags 1.455 + ); 1.456 + } 1.457 + while(misses_counter_fd<0 && retries < 100); 1.458 + if(retries >= 100) 1.459 + { 1.460 + fprintf(stderr,"in main "); 1.461 + perror("Failed to misses counter"); 1.462 + } 1.463 + 1.464 + measurement_t startExeCycles, endExeCycles; 1.465 + BenchParams *benchParams; 1.466 + 1.467 + benchParams = malloc(sizeof(BenchParams)); 1.468 + 1.469 + benchParams->startExeCycles = &startExeCycles; 1.470 + benchParams->endExeCycles = &endExeCycles; 1.471 + 1.472 + workerParamsArray = (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) ); 1.473 + if(workerParamsArray == NULL ) printf("error mallocing worker params array\n"); 1.474 + 1.475 + 1.476 + //This is the transition to the VMS runtime 1.477 + VPThread__create_seed_procr_and_do_work( &benchmark, benchParams ); 1.478 + 1.479 + uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0; 1.480 + uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0; 1.481 + for(i=0; i<num_threads; i++){ 1.482 + printf("WorkCycles: %lu\n",workerParamsArray[i].data.totalWorkCycles); 1.483 +// printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks); 1.484 +// printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles); 1.485 +// printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs); 1.486 + totalWorkCyclesAcrossCores += workerParamsArray[i].data.totalWorkCycles; 1.487 + totalBadCyclesAcrossCores += workerParamsArray[i].data.totalBadCycles; 1.488 + totalSyncCyclesAcrossCores += workerParamsArray[i].data.totalSyncCycles; 1.489 + totalBadSyncCyclesAcrossCores += workerParamsArray[i].data.totalBadSyncCycles; 1.490 + } 1.491 + 1.492 + uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles; 1.493 + totalExeCycles -= totalBadCyclesAcrossCores; 1.494 + uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores; 1.495 + int32 numSyncs = outer_iters * num_threads * 2; 1.496 + printf("Total Execution Cycles: %lu\n", totalExeCycles); 1.497 + printf("Total number of cache misses: %lu\n", cache_misses); 1.498 + printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores); 1.499 + printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores); 1.500 +// printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores); 1.501 + printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs ); 1.502 + printf("ExeCycles/WorkCycles Ratio %f\n", 1.503 + (double)totalExeCycles / (double)totalWorkCyclesAcrossCores); 1.504 + return 0; 1.505 + }
