Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > Vthread > Vthread__Best_Effort_Msg__Bench
diff main.c @ 16:5887fbce425f
changed directory structure, added .hgeol file
| author | Merten Sach <msach@mailbox.tu-berlin.de> |
|---|---|
| date | Mon, 13 Feb 2012 16:11:00 +0100 |
| parents | src/Application/main.c@a1269b1549fc |
| children | fdc2f264f3d6 |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/main.c Mon Feb 13 16:11:00 2012 +0100 1.3 @@ -0,0 +1,480 @@ 1.4 +/* 1.5 + * 1.6 + */ 1.7 +#include <stdio.h> 1.8 +#include <stdlib.h> 1.9 +#include <string.h> 1.10 +#include <math.h> 1.11 +#include <ctype.h> 1.12 +#include <errno.h> 1.13 +#include <pthread.h> 1.14 +#include <unistd.h> 1.15 +#include "VPThread_lib/VPThread.h" 1.16 +#include "VPThread_lib/VMS/Queue_impl/PrivateQueue.h" 1.17 + 1.18 +#include <linux/perf_event.h> 1.19 +#include <linux/prctl.h> 1.20 +#include <sys/syscall.h> 1.21 + 1.22 +#undef DEBUG 1.23 +//#define DEBUG 1.24 + 1.25 +#define MEASURE_PERF 1.26 + 1.27 +#if !defined(unix) && !defined(__unix__) 1.28 +#ifdef __MACH__ 1.29 +#define unix 1 1.30 +#define __unix__ 1 1.31 +#endif /* __MACH__ */ 1.32 +#endif /* unix */ 1.33 + 1.34 +/* find the appropriate way to define explicitly sized types */ 1.35 +/* for C99 or GNU libc (also mach's libc) we can use stdint.h */ 1.36 +#if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__) 1.37 +#include <stdint.h> 1.38 +#elif defined(unix) || defined(__unix__) /* some UNIX systems have them in sys/types.h */ 1.39 +#include <sys/types.h> 1.40 +#elif defined(__WIN32__) || defined(WIN32) /* the nameless one */ 1.41 +typedef unsigned __int8 uint8_t; 1.42 +typedef unsigned __int32 uint32_t; 1.43 +#endif /* sized type detection */ 1.44 + 1.45 +/* provide a millisecond-resolution timer for each system */ 1.46 +#if defined(unix) || defined(__unix__) 1.47 +#include <time.h> 1.48 +#include <sys/time.h> 1.49 +unsigned long get_msec(void) { 1.50 + static struct timeval timeval, first_timeval; 1.51 + 1.52 + gettimeofday(&timeval, 0); 1.53 + if(first_timeval.tv_sec == 0) { 1.54 + first_timeval = timeval; 1.55 + return 0; 1.56 + } 1.57 + return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000; 1.58 +} 1.59 +#elif defined(__WIN32__) || defined(WIN32) 1.60 +#include <windows.h> 1.61 +unsigned long get_msec(void) { 1.62 + return GetTickCount(); 1.63 +} 1.64 +#else 1.65 +//#error "I don't know how to measure time on your platform" 1.66 +#endif 1.67 + 1.68 +//======================== Defines ========================= 1.69 +typedef struct perfData measurement_t; 1.70 +struct perfData{ 1.71 + uint64 cycles; 1.72 + uint64 instructions; 1.73 +}; 1.74 + 1.75 +const char *usage = { 1.76 + "Usage: malloc_test [options]\n" 1.77 + " Spwans a number of threads and allocates memory.\n\n" 1.78 + "Options:\n" 1.79 + " -t <num> how many threads to use (default: 1). This is internaly multiplied by the number of cores.\n" 1.80 + " -o <num> repeat workload and sync operation <m> times\n" 1.81 + " -i <num> size of workload, repeat <n> times\n" 1.82 + " -h this help screen\n\n" 1.83 +}; 1.84 + 1.85 +struct barrier_t 1.86 +{ 1.87 + int counter; 1.88 + int nthreads; 1.89 + int32 mutex; 1.90 + int32 cond; 1.91 + measurement_t endBarrierCycles; 1.92 + 1.93 +}; 1.94 +typedef struct barrier_t barrier; 1.95 + 1.96 +void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr) 1.97 + { 1.98 + barr->counter = 0; 1.99 + barr->nthreads = nthreads; 1.100 + barr->mutex = VPThread__make_mutex(animatingPr); 1.101 + barr->cond = VPThread__make_cond(barr->mutex, animatingPr); 1.102 + } 1.103 + 1.104 +int cycles_counter_main_fd; 1.105 +void inline barrier_wait(barrier *barr, VirtProcr *animatingPr) 1.106 + { int i; 1.107 + 1.108 + VPThread__mutex_lock(barr->mutex, animatingPr); 1.109 + barr->counter++; 1.110 + if(barr->counter == barr->nthreads) 1.111 + { 1.112 +#ifdef MEASURE_PERF 1.113 + read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \ 1.114 + sizeof(barr->endBarrierCycles.cycles)); 1.115 +#endif 1.116 + 1.117 + barr->counter = 0; 1.118 + for(i=0; i < barr->nthreads; i++) 1.119 + VPThread__cond_signal(barr->cond, animatingPr); 1.120 + } 1.121 + else 1.122 + { VPThread__cond_wait(barr->cond, animatingPr); 1.123 + } 1.124 + VPThread__mutex_unlock(barr->mutex, animatingPr); 1.125 + } 1.126 + 1.127 + 1.128 + 1.129 +typedef struct 1.130 + { struct barrier_t* barrier; 1.131 + uint64_t totalWorkCycles; 1.132 + uint64_t totalBadCycles; 1.133 + uint64_t totalSyncCycles; 1.134 + uint64_t totalBadSyncCycles; 1.135 + uint64 numGoodSyncs; 1.136 + uint64 numGoodTasks; 1.137 + } 1.138 +WorkerParams; 1.139 + 1.140 + 1.141 +typedef struct 1.142 + { measurement_t *startExeCycles; 1.143 + measurement_t *endExeCycles; 1.144 + } 1.145 +BenchParams; 1.146 + 1.147 +//======================== Globals ========================= 1.148 +char __ProgrammName[] = "overhead_test"; 1.149 +char __DataSet[255]; 1.150 + 1.151 +int outer_iters, inner_iters, num_threads; 1.152 +size_t chunk_size = 0; 1.153 + 1.154 +int cycles_counter_fd[NUM_CORES]; 1.155 +struct perf_event_attr* hw_event; 1.156 + 1.157 +WorkerParams *workerParamsArray; 1.158 + 1.159 +//======================== App Code ========================= 1.160 +/* 1.161 + * Workload 1.162 + */ 1.163 + 1.164 +#define saveCyclesAndInstrs(core,cycles) do{ \ 1.165 + int cycles_fd = cycles_counter_fd[core]; \ 1.166 + int nread; \ 1.167 + \ 1.168 + nread = read(cycles_fd,&(cycles),sizeof(cycles)); \ 1.169 + if(nread<0){ \ 1.170 + perror("Error reading cycles counter"); \ 1.171 + cycles = 0; \ 1.172 + } \ 1.173 +} while (0) //macro magic for scoping 1.174 + 1.175 + 1.176 +double 1.177 +worker_TLF(void* _params, VirtProcr* animatingPr) 1.178 + { 1.179 + int i,o; 1.180 + WorkerParams* params = (WorkerParams*)_params; 1.181 + unsigned int totalWorkCycles = 0, totalBadCycles = 0; 1.182 + unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0; 1.183 + unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0; 1.184 + double workspace2=0.0; 1.185 + int32 privateMutex = VPThread__make_mutex(animatingPr); 1.186 + 1.187 + int cpuid = sched_getcpu(); 1.188 + 1.189 + measurement_t startWorkload, endWorkload, startWorkload2, endWorkload2; 1.190 + uint64 numCycles; 1.191 + for(o=0; o < outer_iters; o++) 1.192 + { 1.193 +#ifdef MEASURE_PERF 1.194 + saveCyclesAndInstrs(cpuid,startWorkload.cycles); 1.195 +#endif 1.196 + 1.197 + //workltask 1.198 + for(i=0; i < inner_iters; i++) 1.199 + { 1.200 + workspace1 += (workspace1 + 32)/2; 1.201 + workspace2 += (workspace2 + 23.2)/1.4; 1.202 + } 1.203 + 1.204 +#ifdef MEASURE_PERF 1.205 + saveCyclesAndInstrs(cpuid,endWorkload.cycles); 1.206 + numCycles = endWorkload.cycles - startWorkload.cycles; 1.207 + //sanity check (400K is about 20K iters) 1.208 + if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;} 1.209 + else {totalBadCycles += numCycles; } 1.210 +#endif 1.211 + 1.212 + //mutex access often causes switch to different Slave VP 1.213 + VPThread__mutex_lock(privateMutex, animatingPr); 1.214 + 1.215 +/* 1.216 + saveCyclesAndInstrs(cpuid,startWorkload2.cycles); 1.217 + //Task 1.218 + for(i=0; i < inner_iters; i++) 1.219 + { 1.220 + workspace1 += (workspace1 + 32)/2; 1.221 + workspace2 += (workspace2 + 23.2)/1.4; 1.222 + } 1.223 + 1.224 + saveCyclesAndInstrs(cpuid,endWorkload2.cycles); 1.225 + numCycles = endWorkload2.cycles - startWorkload2.cycles; 1.226 + //sanity check (400K is about 20K iters) 1.227 + if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;} 1.228 + else {totalBadCycles += numCycles; } 1.229 + 1.230 +*/ 1.231 + VPThread__mutex_unlock(privateMutex, animatingPr); 1.232 + } 1.233 + 1.234 + params->totalWorkCycles = totalWorkCycles; 1.235 + params->totalBadCycles = totalBadCycles; 1.236 + params->numGoodTasks = numGoodTasks; 1.237 + params->totalSyncCycles = totalSyncCycles; 1.238 + params->totalBadSyncCycles = totalBadSyncCycles; 1.239 + params->numGoodSyncs = numGoodSyncs; 1.240 +/* 1.241 + params->totalSyncCycles = VMS__give_num_plugin_cycles(); 1.242 + params->totalBadSyncCycles = 0; 1.243 + params->numGoodSyncs = VMS__give_num_plugin_animations(); 1.244 +*/ 1.245 + 1.246 + 1.247 + //Wait for all threads to end 1.248 + barrier_wait(params->barrier, animatingPr); 1.249 + 1.250 + //Shutdown worker 1.251 + VPThread__dissipate_thread(animatingPr); 1.252 + 1.253 + //below return never reached --> there for gcc 1.254 + return (workspace1 + workspace2); //to prevent gcc from optimizing work out 1.255 + } 1.256 + 1.257 + 1.258 +/* this is run after the VMS is set up*/ 1.259 +void benchmark(void *_params, VirtProcr *animatingPr) 1.260 + { 1.261 + int i, cpuID; 1.262 + struct barrier_t barr; 1.263 + BenchParams *params; 1.264 + 1.265 + params = (BenchParams *)_params; 1.266 + 1.267 + barrier_init(&barr, num_threads+1, animatingPr); 1.268 + 1.269 + //prepare input 1.270 + for(i=0; i<num_threads; i++) 1.271 + { 1.272 + workerParamsArray[i].barrier = &barr; 1.273 + } 1.274 + 1.275 + //save cycles before execution of threads, to get total exe cycles 1.276 + measurement_t *startExeCycles, *endExeCycles; 1.277 + startExeCycles = params->startExeCycles; 1.278 + 1.279 +#ifdef MEASURE_PERF 1.280 + int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles), 1.281 + sizeof(startExeCycles->cycles)); 1.282 + if(nread<0) perror("Error reading cycles counter"); 1.283 +#endif 1.284 + 1.285 + //create (which starts running) all threads 1.286 + for(i=0; i<num_threads; i++) 1.287 + { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr); 1.288 + } 1.289 + //wait for all threads to finish 1.290 + barrier_wait(&barr, animatingPr); 1.291 + 1.292 +#ifdef MEASURE_PERF 1.293 + //endBarrierCycles read in barrier_wait()! Merten, email me if want to chg 1.294 + params->endExeCycles->cycles = barr.endBarrierCycles.cycles; 1.295 +#endif 1.296 + 1.297 + 1.298 +/* 1.299 + uint64_t overallWorkCycles = 0; 1.300 + for(i=0; i<num_threads; i++){ 1.301 + printf("WorkCycles: %lu\n",input[i].totalWorkCycles); 1.302 + overallWorkCycles += input[i].totalWorkCycles; 1.303 + } 1.304 + 1.305 + printf("Sum across threads of work cycles: %lu\n", overallWorkCycles); 1.306 + printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles); 1.307 + printf("Runtime/Workcycle Ratio %lu\n", 1.308 + ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles); 1.309 +*/ 1.310 + 1.311 + //====================================================== 1.312 + 1.313 + VPThread__dissipate_thread(animatingPr); 1.314 + } 1.315 + 1.316 +int main(int argc, char **argv) 1.317 + { 1.318 + int i; 1.319 + 1.320 + //set global static variables, based on cmd-line args 1.321 + for(i=1; i<argc; i++) 1.322 + { 1.323 + if(argv[i][0] == '-' && argv[i][2] == 0) 1.324 + { 1.325 + switch(argv[i][1]) 1.326 + { 1.327 + case 't': 1.328 + if(!isdigit(argv[++i][0])) 1.329 + { 1.330 + fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n"); 1.331 + return EXIT_FAILURE; 1.332 + } 1.333 + num_threads = atoi(argv[i]); 1.334 + if(!num_threads) 1.335 + { 1.336 + fprintf(stderr, "invalid number of threads specified: %d\n", num_threads); 1.337 + return EXIT_FAILURE; 1.338 + } 1.339 + break; 1.340 + case 'o': 1.341 + if(!isdigit(argv[++i][0])) 1.342 + { 1.343 + fputs("-i must be followed by a number\n", stderr); 1.344 + return EXIT_FAILURE; 1.345 + } 1.346 + outer_iters = atoi(argv[i]); 1.347 + break; 1.348 + case 'i': 1.349 + if(!isdigit(argv[++i][0])) 1.350 + { 1.351 + fputs("-o must be followed by a number (workload size)\n", stderr); 1.352 + return EXIT_FAILURE; 1.353 + } 1.354 + inner_iters = atoi(argv[i]); 1.355 + break; 1.356 + case 'h': 1.357 + fputs(usage, stdout); 1.358 + return 0; 1.359 + 1.360 + default: 1.361 + fprintf(stderr, "unrecognized argument: %s\n", argv[i]); 1.362 + fputs(usage, stderr); 1.363 + return EXIT_FAILURE; 1.364 + }//switch 1.365 + }//if arg 1.366 + else 1.367 + { 1.368 + fprintf(stderr, "unrecognized argument: %s\n", argv[i]); 1.369 + fputs(usage, stderr); 1.370 + return EXIT_FAILURE; 1.371 + } 1.372 + }//for 1.373 + 1.374 + 1.375 +#ifdef MEASURE_PERF 1.376 + //setup performance counters 1.377 + hw_event = malloc(sizeof(struct perf_event_attr)); 1.378 + memset(hw_event,0,sizeof(struct perf_event_attr)); 1.379 + 1.380 + hw_event->type = PERF_TYPE_HARDWARE; 1.381 + hw_event->size = sizeof(hw_event); 1.382 + hw_event->disabled = 0; 1.383 + hw_event->freq = 0; 1.384 + hw_event->inherit = 1; /* children inherit it */ 1.385 + hw_event->pinned = 1; /* says this virt counter must always be on HW */ 1.386 + hw_event->exclusive = 0; /* only group on PMU */ 1.387 + hw_event->exclude_user = 0; /* don't count user */ 1.388 + hw_event->exclude_kernel = 1; /* don't count kernel */ 1.389 + hw_event->exclude_hv = 1; /* ditto hypervisor */ 1.390 + hw_event->exclude_idle = 1; /* don't count when idle */ 1.391 + hw_event->mmap = 0; /* include mmap data */ 1.392 + hw_event->comm = 0; /* include comm data */ 1.393 + 1.394 + hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles 1.395 + 1.396 + int cpuID, retries; 1.397 + 1.398 + for( cpuID = 0; cpuID < NUM_CORES; cpuID++ ) 1.399 + { retries = 0; 1.400 + do 1.401 + { retries += 1; 1.402 + cycles_counter_fd[cpuID] = 1.403 + syscall(__NR_perf_event_open, hw_event, 1.404 + 0,//pid_t: 0 is "pid of calling process" 1.405 + cpuID,//int: cpu, the value returned by "CPUID" instr(?) 1.406 + -1,//int: group_fd, -1 is "leader" or independent 1.407 + 0//unsigned long: flags 1.408 + ); 1.409 + } 1.410 + while(cycles_counter_fd[cpuID]<0 && retries < 100); 1.411 + if(retries >= 100) 1.412 + { 1.413 + fprintf(stderr,"On core %d: ",cpuID); 1.414 + perror("Failed to open cycles counter"); 1.415 + } 1.416 + } 1.417 + 1.418 + //Set up counter to accumulate total cycles to process, across all CPUs 1.419 + 1.420 + retries = 0; 1.421 + do 1.422 + { retries += 1; 1.423 + cycles_counter_main_fd = 1.424 + syscall(__NR_perf_event_open, hw_event, 1.425 + 0,//pid_t: 0 is "pid of calling process" 1.426 + -1,//int: cpu, -1 means accumulate from all cores 1.427 + -1,//int: group_fd, -1 is "leader" == independent 1.428 + 0//unsigned long: flags 1.429 + ); 1.430 + } 1.431 + while(cycles_counter_main_fd<0 && retries < 100); 1.432 + if(retries >= 100) 1.433 + { 1.434 + fprintf(stderr,"in main "); 1.435 + perror("Failed to open cycles counter"); 1.436 + } 1.437 +#endif 1.438 + 1.439 + measurement_t startExeCycles, endExeCycles; 1.440 + BenchParams *benchParams; 1.441 + 1.442 + benchParams = malloc(sizeof(BenchParams)); 1.443 + 1.444 + benchParams->startExeCycles = &startExeCycles; 1.445 + benchParams->endExeCycles = &endExeCycles; 1.446 + 1.447 + workerParamsArray = (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) ); 1.448 + if(workerParamsArray == NULL ) printf("error mallocing worker params array\n"); 1.449 + 1.450 + 1.451 + //This is the transition to the VMS runtime 1.452 + VPThread__create_seed_procr_and_do_work( &benchmark, benchParams ); 1.453 + 1.454 +#ifdef MEASURE_PERF 1.455 + uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0; 1.456 + uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0; 1.457 + for(i=0; i<num_threads; i++){ 1.458 + printf("WorkCycles: %lu\n",workerParamsArray[i].totalWorkCycles); 1.459 +// printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks); 1.460 +// printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles); 1.461 +// printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs); 1.462 + totalWorkCyclesAcrossCores += workerParamsArray[i].totalWorkCycles; 1.463 + totalBadCyclesAcrossCores += workerParamsArray[i].totalBadCycles; 1.464 + totalSyncCyclesAcrossCores += workerParamsArray[i].totalSyncCycles; 1.465 + totalBadSyncCyclesAcrossCores += workerParamsArray[i].totalBadSyncCycles; 1.466 + } 1.467 + 1.468 + uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles; 1.469 + totalExeCycles -= totalBadCyclesAcrossCores; 1.470 + uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores; 1.471 + int32 numSyncs = outer_iters * num_threads * 2; 1.472 + printf("Total Execution Cycles: %lu\n", totalExeCycles); 1.473 + printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores); 1.474 + printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores); 1.475 +// printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores); 1.476 + printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs ); 1.477 + printf("ExeCycles/WorkCycles Ratio %f\n", 1.478 + (double)totalExeCycles / (double)totalWorkCyclesAcrossCores); 1.479 +#else 1.480 + printf("No measurement done!\n"); 1.481 +#endif 1.482 + return 0; 1.483 + }
