repository: PR / Applications / Vthread / Vthread__Best_Effort_Msg__Bench
changeset 16:5887fbce425f
changed directory structure, added .hgeol file
| field | value |
|---|---|
| author | Merten Sach <msach@mailbox.tu-berlin.de> |
| date | Mon, 13 Feb 2012 16:11:00 +0100 |
| parents | a1269b1549fc |
| children | fdc2f264f3d6 |
| files | .hgeol, main.c, src/Application/main.c |
| diffstat | 3 files changed, 494 insertions(+), 480 deletions(-) |
line diff
new file b/.hgeol — Mon Feb 13 16:11:00 2012 +0100 — 14 lines added:

```

[patterns]
**.py = native
**.txt = native
**.c = native
**.h = native
**.cpp = native
**.java = native
**.class = bin
**.jar = bin
**.sh = native
**.pl = native
**.jpg = bin
**.gif = bin
```
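The `.hgeol` file above is read by Mercurial's bundled `eol` extension, which each clone has to enable in its configuration before the rules take effect: files matching a `native` pattern are stored in the repository with LF line endings and checked out with the platform's native endings, while `bin` patterns are exempt from any conversion.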
new file b/main.c — Mon Feb 13 16:11:00 2012 +0100 — 480 lines added:

```c
/*
 *
 */
#define _GNU_SOURCE   /* for sched_getcpu() */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <ctype.h>
#include <errno.h>
#include <pthread.h>
#include <unistd.h>
#include <sched.h>    /* sched_getcpu() */
#include "VPThread_lib/VPThread.h"
#include "VPThread_lib/VMS/Queue_impl/PrivateQueue.h"

#include <linux/perf_event.h>
#include <linux/prctl.h>
#include <sys/syscall.h>

#undef DEBUG
//#define DEBUG

#define MEASURE_PERF

#if !defined(unix) && !defined(__unix__)
#ifdef __MACH__
#define unix 1
#define __unix__ 1
#endif /* __MACH__ */
#endif /* unix */

/* find the appropriate way to define explicitly sized types */
/* for C99 or GNU libc (also mach's libc) we can use stdint.h */
#if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__)
#include <stdint.h>
#elif defined(unix) || defined(__unix__)   /* some UNIX systems have them in sys/types.h */
#include <sys/types.h>
#elif defined(__WIN32__) || defined(WIN32) /* the nameless one */
typedef unsigned __int8  uint8_t;
typedef unsigned __int32 uint32_t;
#endif /* sized type detection */

/* provide a millisecond-resolution timer for each system */
#if defined(unix) || defined(__unix__)
#include <time.h>
#include <sys/time.h>
unsigned long get_msec(void) {
    static struct timeval timeval, first_timeval;

    gettimeofday(&timeval, 0);
    if(first_timeval.tv_sec == 0) {
        first_timeval = timeval;
        return 0;
    }
    return (timeval.tv_sec - first_timeval.tv_sec) * 1000 +
           (timeval.tv_usec - first_timeval.tv_usec) / 1000;
}
#elif defined(__WIN32__) || defined(WIN32)
#include <windows.h>
unsigned long get_msec(void) {
    return GetTickCount();
}
#else
//#error "I don't know how to measure time on your platform"
#endif

//======================== Defines =========================
typedef struct perfData measurement_t;
struct perfData{
    uint64 cycles;
    uint64 instructions;
};

const char *usage = {
    "Usage: malloc_test [options]\n"
    "  Spawns a number of threads and allocates memory.\n\n"
    "Options:\n"
    "  -t <num>  how many threads to use (default: 1). This is internally multiplied by the number of cores.\n"
    "  -o <num>  repeat workload and sync operation <num> times\n"
    "  -i <num>  size of workload, repeat <num> times\n"
    "  -h        this help screen\n\n"
};

struct barrier_t
{
    int           counter;
    int           nthreads;
    int32         mutex;
    int32         cond;
    measurement_t endBarrierCycles;
};
typedef struct barrier_t barrier;

void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr)
 {
   barr->counter  = 0;
   barr->nthreads = nthreads;
   barr->mutex    = VPThread__make_mutex(animatingPr);
   barr->cond     = VPThread__make_cond(barr->mutex, animatingPr);
 }

int cycles_counter_main_fd;
void inline barrier_wait(barrier *barr, VirtProcr *animatingPr)
 { int i;

   VPThread__mutex_lock(barr->mutex, animatingPr);
   barr->counter++;
   if(barr->counter == barr->nthreads)
    {
#ifdef MEASURE_PERF
      read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles),
           sizeof(barr->endBarrierCycles.cycles));
#endif

      barr->counter = 0;
      for(i=0; i < barr->nthreads; i++)
         VPThread__cond_signal(barr->cond, animatingPr);
    }
   else
    { VPThread__cond_wait(barr->cond, animatingPr);
    }
   VPThread__mutex_unlock(barr->mutex, animatingPr);
 }



typedef struct
 { struct barrier_t* barrier;
   uint64_t totalWorkCycles;
   uint64_t totalBadCycles;
   uint64_t totalSyncCycles;
   uint64_t totalBadSyncCycles;
   uint64   numGoodSyncs;
   uint64   numGoodTasks;
 }
WorkerParams;


typedef struct
 { measurement_t *startExeCycles;
   measurement_t *endExeCycles;
 }
BenchParams;

//======================== Globals =========================
char __ProgrammName[] = "overhead_test";
char __DataSet[255];

int outer_iters, inner_iters, num_threads;
size_t chunk_size = 0;

int cycles_counter_fd[NUM_CORES];  /* NUM_CORES is expected from the VPThread/VMS headers or build flags */
struct perf_event_attr* hw_event;

WorkerParams *workerParamsArray;

//======================== App Code =========================
/*
 * Workload
 */

#define saveCyclesAndInstrs(core,cycles) do{ \
    int cycles_fd = cycles_counter_fd[core]; \
    int nread; \
    \
    nread = read(cycles_fd,&(cycles),sizeof(cycles)); \
    if(nread<0){ \
        perror("Error reading cycles counter"); \
        cycles = 0; \
    } \
} while (0) //the do-while gives the macro its own scope


double
worker_TLF(void* _params, VirtProcr* animatingPr)
 {
   int i,o;
   WorkerParams* params = (WorkerParams*)_params;
   unsigned int totalWorkCycles = 0, totalBadCycles = 0;
   unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0;
   unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0;
   double workspace2=0.0;
   int32 privateMutex = VPThread__make_mutex(animatingPr);

   int cpuid = sched_getcpu();

   measurement_t startWorkload, endWorkload, startWorkload2, endWorkload2;
   uint64 numCycles;
   for(o=0; o < outer_iters; o++)
    {
#ifdef MEASURE_PERF
      saveCyclesAndInstrs(cpuid,startWorkload.cycles);
#endif

      //work task
      for(i=0; i < inner_iters; i++)
       {
         workspace1 += (workspace1 + 32)/2;
         workspace2 += (workspace2 + 23.2)/1.4;
       }

#ifdef MEASURE_PERF
      saveCyclesAndInstrs(cpuid,endWorkload.cycles);
      numCycles = endWorkload.cycles - startWorkload.cycles;
      //sanity check (400K cycles is about 20K iterations)
      if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
      else                     {totalBadCycles  += numCycles; }
#endif

      //mutex access often causes a switch to a different Slave VP
      VPThread__mutex_lock(privateMutex, animatingPr);

/*
      saveCyclesAndInstrs(cpuid,startWorkload2.cycles);
      //Task
      for(i=0; i < inner_iters; i++)
       {
         workspace1 += (workspace1 + 32)/2;
         workspace2 += (workspace2 + 23.2)/1.4;
       }

      saveCyclesAndInstrs(cpuid,endWorkload2.cycles);
      numCycles = endWorkload2.cycles - startWorkload2.cycles;
      //sanity check (400K cycles is about 20K iterations)
      if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
      else                     {totalBadCycles  += numCycles; }
*/
      VPThread__mutex_unlock(privateMutex, animatingPr);
    }

   params->totalWorkCycles    = totalWorkCycles;
   params->totalBadCycles     = totalBadCycles;
   params->numGoodTasks       = numGoodTasks;
   params->totalSyncCycles    = totalSyncCycles;
   params->totalBadSyncCycles = totalBadSyncCycles;
   params->numGoodSyncs       = numGoodSyncs;
/*
   params->totalSyncCycles = VMS__give_num_plugin_cycles();
   params->totalBadSyncCycles = 0;
   params->numGoodSyncs = VMS__give_num_plugin_animations();
*/


   //Wait for all threads to end
   barrier_wait(params->barrier, animatingPr);

   //Shutdown worker
   VPThread__dissipate_thread(animatingPr);

   //never reached; returning the workspace values keeps gcc from optimizing the work away
   return (workspace1 + workspace2);
 }


/* this is run after the VMS is set up */
void benchmark(void *_params, VirtProcr *animatingPr)
 {
   int i, cpuID;
   struct barrier_t barr;
   BenchParams *params;

   params = (BenchParams *)_params;

   barrier_init(&barr, num_threads+1, animatingPr);

   //prepare input
   for(i=0; i<num_threads; i++)
    {
      workerParamsArray[i].barrier = &barr;
    }

   //save cycles before execution of threads, to get total exe cycles
   measurement_t *startExeCycles, *endExeCycles;
   startExeCycles = params->startExeCycles;

#ifdef MEASURE_PERF
   int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles),
                    sizeof(startExeCycles->cycles));
   if(nread<0) perror("Error reading cycles counter");
#endif

   //create (which starts running) all threads
   for(i=0; i<num_threads; i++)
    { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
    }
   //wait for all threads to finish
   barrier_wait(&barr, animatingPr);

#ifdef MEASURE_PERF
   //endBarrierCycles is read inside barrier_wait() -- Merten, email me if you want to change this
   params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
#endif


/*
   uint64_t overallWorkCycles = 0;
   for(i=0; i<num_threads; i++){
      printf("WorkCycles: %lu\n",input[i].totalWorkCycles);
      overallWorkCycles += input[i].totalWorkCycles;
   }

   printf("Sum across threads of work cycles: %lu\n", overallWorkCycles);
   printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles);
   printf("Runtime/Workcycle Ratio %lu\n",
          ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles);
*/

   //======================================================

   VPThread__dissipate_thread(animatingPr);
 }

int main(int argc, char **argv)
 {
   int i;

   //set global static variables, based on cmd-line args
   for(i=1; i<argc; i++)
    {
      if(argv[i][0] == '-' && argv[i][2] == 0)
       {
         switch(argv[i][1])
          {
            case 't':
               if(!isdigit(argv[++i][0]))
                {
                  fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n");
                  return EXIT_FAILURE;
                }
               num_threads = atoi(argv[i]);
               if(!num_threads)
                {
                  fprintf(stderr, "invalid number of threads specified: %d\n", num_threads);
                  return EXIT_FAILURE;
                }
               break;
            case 'o':
               if(!isdigit(argv[++i][0]))
                {
                  fputs("-o must be followed by a number (repeat count)\n", stderr);
                  return EXIT_FAILURE;
                }
               outer_iters = atoi(argv[i]);
               break;
            case 'i':
               if(!isdigit(argv[++i][0]))
                {
                  fputs("-i must be followed by a number (workload size)\n", stderr);
                  return EXIT_FAILURE;
                }
               inner_iters = atoi(argv[i]);
               break;
            case 'h':
               fputs(usage, stdout);
               return 0;

            default:
               fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
               fputs(usage, stderr);
               return EXIT_FAILURE;
          }//switch
       }//if arg
      else
       {
         fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
         fputs(usage, stderr);
         return EXIT_FAILURE;
       }
    }//for


#ifdef MEASURE_PERF
   //setup performance counters
   hw_event = malloc(sizeof(struct perf_event_attr));
   memset(hw_event,0,sizeof(struct perf_event_attr));

   hw_event->type = PERF_TYPE_HARDWARE;
   hw_event->size = sizeof(*hw_event);  /* size of the attr struct, not of the pointer */
   hw_event->disabled = 0;
   hw_event->freq = 0;
   hw_event->inherit = 1;         /* children inherit it */
   hw_event->pinned = 1;          /* this virtual counter must always be on the HW */
   hw_event->exclusive = 0;       /* not the only group on the PMU */
   hw_event->exclude_user = 0;    /* do count user space */
   hw_event->exclude_kernel = 1;  /* don't count kernel */
   hw_event->exclude_hv = 1;      /* ditto hypervisor */
   hw_event->exclude_idle = 1;    /* don't count when idle */
   hw_event->mmap = 0;            /* no mmap side-band data */
   hw_event->comm = 0;            /* no comm side-band data */

   hw_event->config = PERF_COUNT_HW_CPU_CYCLES;  //cycles

   int cpuID, retries;

   for( cpuID = 0; cpuID < NUM_CORES; cpuID++ )
    { retries = 0;
      do
       { retries += 1;
         cycles_counter_fd[cpuID] =
             syscall(__NR_perf_event_open, hw_event,
                     0,      //pid_t: 0 is "pid of calling process"
                     cpuID,  //int: index of the cpu to count on
                     -1,     //int: group_fd, -1 is "leader" or independent
                     0       //unsigned long: flags
                    );
       }
      while(cycles_counter_fd[cpuID]<0 && retries < 100);
      if(retries >= 100)
       {
         fprintf(stderr,"On core %d: ",cpuID);
         perror("Failed to open cycles counter");
       }
    }

   //Set up a counter that accumulates the process's total cycles across all CPUs

   retries = 0;
   do
    { retries += 1;
      cycles_counter_main_fd =
          syscall(__NR_perf_event_open, hw_event,
                  0,   //pid_t: 0 is "pid of calling process"
                  -1,  //int: cpu, -1 means accumulate from all cores
                  -1,  //int: group_fd, -1 is "leader" == independent
                  0    //unsigned long: flags
                 );
    }
   while(cycles_counter_main_fd<0 && retries < 100);
   if(retries >= 100)
    {
      fprintf(stderr,"in main ");
      perror("Failed to open cycles counter");
    }
#endif

   measurement_t startExeCycles, endExeCycles;
   BenchParams *benchParams;

   benchParams = malloc(sizeof(BenchParams));

   benchParams->startExeCycles = &startExeCycles;
   benchParams->endExeCycles   = &endExeCycles;

   workerParamsArray = (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) );
   if(workerParamsArray == NULL ) printf("error mallocing worker params array\n");


   //This is the transition to the VMS runtime
   VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );

#ifdef MEASURE_PERF
   uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
   uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
   for(i=0; i<num_threads; i++){
      printf("WorkCycles: %lu\n",workerParamsArray[i].totalWorkCycles);
//    printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
//    printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
//    printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
      totalWorkCyclesAcrossCores    += workerParamsArray[i].totalWorkCycles;
      totalBadCyclesAcrossCores     += workerParamsArray[i].totalBadCycles;
      totalSyncCyclesAcrossCores    += workerParamsArray[i].totalSyncCycles;
      totalBadSyncCyclesAcrossCores += workerParamsArray[i].totalBadSyncCycles;
   }

   uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
   totalExeCycles -= totalBadCyclesAcrossCores;
   uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores;
   int32 numSyncs = outer_iters * num_threads * 2;
   printf("Total Execution Cycles: %lu\n", totalExeCycles);
   printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
   printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores);
// printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);
   printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs );
   printf("ExeCycles/WorkCycles Ratio %f\n",
          (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
#else
   printf("No measurement done!\n");
#endif
   return 0;
 }
```
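Every cycle figure the benchmark prints comes from the perf_event counters opened in main() above. As a quick reference, here is a minimal, self-contained sketch of that mechanism in isolation: one PERF_COUNT_HW_CPU_CYCLES counter opened with the perf_event_open syscall and read as a 64-bit count before and after a piece of work. This sketch is not part of the changeset; the file, the stand-in workload loop, and the output text are invented for illustration, and only the syscall usage mirrors what main.c does per core and for the whole process.

```c
/* cycles_sketch.c (hypothetical file name): count user-space CPU cycles
 * for one piece of work, the same way the benchmark above does. */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int open_cycles_counter(void)
{
    struct perf_event_attr attr;
    memset(&attr, 0, sizeof(attr));
    attr.type           = PERF_TYPE_HARDWARE;
    attr.size           = sizeof(attr);            /* size of the attr struct itself */
    attr.config         = PERF_COUNT_HW_CPU_CYCLES;
    attr.exclude_kernel = 1;                       /* count user space only */
    attr.exclude_hv     = 1;

    /* pid = 0: this process, cpu = -1: any CPU, group_fd = -1: standalone, flags = 0 */
    return (int)syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}

int main(void)
{
    int fd = open_cycles_counter();
    if (fd < 0) { perror("perf_event_open"); return 1; }

    uint64_t start = 0, end = 0;
    volatile double sink = 0.0;                    /* volatile keeps the loop from being optimized away */

    if (read(fd, &start, sizeof(start)) < 0) perror("read start");
    for (int i = 0; i < 1000000; i++)              /* stand-in workload */
        sink += (sink + 23.2) / 1.4;
    if (read(fd, &end, sizeof(end)) < 0) perror("read end");

    printf("cycles for workload: %llu\n", (unsigned long long)(end - start));
    close(fd);
    return 0;
}
```

On systems where /proc/sys/kernel/perf_event_paranoid is set restrictively, opening even a user-space-only counter can fail with EACCES for unprivileged users.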
removed file a/src/Application/main.c — Fri Jan 06 19:09:38 2012 +0100 — 480 lines removed, identical to the content added as b/main.c above (the changeset moves the file from src/Application/ to the repository root).
