Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > Vthread > Vthread__Best_Effort_Msg__Bench
view main.c @ 17:281cadcbb796
changed directory structure, added .hgeol file
| author | Merten Sach <msach@mailbox.tu-berlin.de> |
|---|---|
| date | Mon, 13 Feb 2012 16:12:20 +0100 |
| parents | src/Application/main.c@c3561dbac1dc |
| children | e7277df4460e |
line source
1 /*
2 *
3 */
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <string.h>
7 #include <math.h>
8 #include <ctype.h>
9 #include <errno.h>
10 #include <pthread.h>
11 #include <unistd.h>
12 #include "VPThread_lib/VPThread.h"
13 #include "VPThread_lib/VMS/Queue_impl/PrivateQueue.h"
15 #include <linux/perf_event.h>
16 #include <linux/prctl.h>
17 #include <sys/syscall.h>
19 #undef DEBUG
20 //#define DEBUG
22 #if !defined(unix) && !defined(__unix__)
23 #ifdef __MACH__
24 #define unix 1
25 #define __unix__ 1
26 #endif /* __MACH__ */
27 #endif /* unix */
29 /* find the appropriate way to define explicitly sized types */
30 /* for C99 or GNU libc (also mach's libc) we can use stdint.h */
31 #if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__)
32 #include <stdint.h>
33 #elif defined(unix) || defined(__unix__) /* some UNIX systems have them in sys/types.h */
34 #include <sys/types.h>
35 #elif defined(__WIN32__) || defined(WIN32) /* the nameless one */
36 typedef unsigned __int8 uint8_t;
37 typedef unsigned __int32 uint32_t;
38 #endif /* sized type detection */
/* provide a millisecond-resolution timer for each system */
#if defined(unix) || defined(__unix__)
#include <time.h>
#include <sys/time.h>
/* Milliseconds elapsed since the first call; the first call records the
 * baseline and returns 0.  NOTE(review): relies on static zero-init of
 * first_timeval; a first call exactly at tv_sec==0 would re-arm the
 * baseline — harmless in practice. */
unsigned long get_msec(void) {
/* the variable 'timeval' shadows the struct tag of the same name */
static struct timeval timeval, first_timeval;

gettimeofday(&timeval, 0);
if(first_timeval.tv_sec == 0) {
first_timeval = timeval;
return 0;
}
return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000;
}
#elif defined(__WIN32__) || defined(WIN32)
#include <windows.h>
/* Windows: milliseconds since system start (GetTickCount wraps at ~49 days). */
unsigned long get_msec(void) {
return GetTickCount();
}
#else
//#error "I don't know how to measure time on your platform"
#endif
//======================== Globals =========================
char __ProgrammName[] = "overhead_test";  /* NOTE(review): leading-__ names are reserved for the implementation */
char __DataSet[255];

int outer_iters, inner_iters, num_threads;  /* set from the command line in main() */
size_t chunk_size = 0;

int cycles_counter_main_fd;  /* perf fd: process-wide cycles, accumulated across all cores */
int misses_counter_fd;       /* perf fd: process-wide cache misses, all cores */

uint64_t cache_misses;       /* misses measured across the benchmark run (set in benchmark()) */

int cycles_counter_fd[NUM_CORES];  /* one per-core cycles counter; NUM_CORES from project headers */
struct perf_event_attr* hw_event;  /* shared attr struct, reused for each perf_event_open */
//======================== Defines =========================
/* One cycle-counter sample, padded/aligned to a cache line to avoid false
 * sharing (uint64 and __align_to_cacheline__ come from project headers). */
typedef struct perfData measurement_t;
struct perfData{
uint64 cycles;
} __align_to_cacheline__;
/* Command-line help text, printed for -h and on argument errors.
 * Fix: corrected "Spwans" -> "Spawns" and "internaly" -> "internally";
 * dropped the stray braces around the scalar initializer. */
const char *usage =
    "Usage: malloc_test [options]\n"
    " Spawns a number of threads and allocates memory.\n\n"
    "Options:\n"
    " -t <num> how many threads to use (default: 1). This is internally multiplied by the number of cores.\n"
    " -o <num> repeat workload and sync operation <m> times\n"
    " -i <num> size of workload, repeat <n> times\n"
    " -h this help screen\n\n";
/* Counting barrier built on VPThread mutex/cond handles.
 * The last thread to arrive also snapshots the process-wide cycle counter
 * into endBarrierCycles (see barrier_wait), so the benchmark's end time is
 * taken at the instant the barrier fills. */
struct barrier_t
{
int counter;    /* threads arrived so far in the current round */
int nthreads;   /* threads required to release the barrier */
int32 mutex;    /* VPThread mutex handle */
int32 cond;     /* VPThread condition handle, tied to mutex */
measurement_t endBarrierCycles;  /* cycle sample taken when the barrier fills */

} __align_to_cacheline__;
typedef struct barrier_t barrier;
105 void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr)
106 {
107 barr->counter = 0;
108 barr->nthreads = nthreads;
109 barr->mutex = VPThread__make_mutex(animatingPr);
110 barr->cond = VPThread__make_cond(barr->mutex, animatingPr);
111 }
113 void inline barrier_wait(barrier *barr, VirtProcr *animatingPr)
114 { int i;
116 VPThread__mutex_lock(barr->mutex, animatingPr);
117 barr->counter++;
118 if(barr->counter == barr->nthreads)
119 {
120 read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \
121 sizeof(barr->endBarrierCycles.cycles));
123 barr->counter = 0;
124 for(i=0; i < barr->nthreads; i++)
125 VPThread__cond_signal(barr->cond, animatingPr);
126 }
127 else
128 { VPThread__cond_wait(barr->cond, animatingPr);
129 }
130 VPThread__mutex_unlock(barr->mutex, animatingPr);
131 }
/* Per-thread result record, filled in by worker_TLF before it exits. */
struct WorkerParams_t
{ struct barrier_t* barrier;    /* shared end-of-run barrier, set by benchmark() */
uint64_t totalWorkCycles;       /* cycles of task rounds that passed the sanity check */
uint64_t totalBadCycles;        /* cycles of rounds rejected as outliers */
uint64_t totalSyncCycles;
uint64_t totalBadSyncCycles;
uint64 numGoodSyncs;
uint64 numGoodTasks;
};

/* Pad each record to a full cache line so worker threads writing their own
 * slot of workerParamsArray do not false-share. */
typedef union
{
struct WorkerParams_t data;
char padding[CACHELINE_SIZE];
} WorkerParams __align_to_cacheline__;

WorkerParams *workerParamsArray;  /* one slot per worker, allocated in main() */
/* Pointers to the start/end cycle samples (both live on main's stack);
 * passed to benchmark() as the seed VP's parameter block. */
typedef struct
{ measurement_t *startExeCycles;
measurement_t *endExeCycles;
} BenchParams __align_to_cacheline__;
158 //======================== App Code =========================
159 /*
 * Workload
161 */
/* Read the per-core cycle counter for 'core' into 'cycles' (an lvalue).
 * On read failure, report via perror and zero 'cycles'.  The do/while(0)
 * wrapper gives the temporaries their own scope and makes the macro a
 * single statement. */
#define saveCyclesAndInstrs(core,cycles) do{ \
int cycles_fd = cycles_counter_fd[core]; \
int nread; \
\
nread = read(cycles_fd,&(cycles),sizeof(cycles)); \
if(nread<0){ \
perror("Error reading cycles counter"); \
cycles = 0; \
} \
} while (0) //macro magic for scoping
/* Read the process-wide cache-miss counter into 'misses' (an lvalue).
 * On read failure, report via perror and zero 'misses'. */
#define saveMisses(misses) do{ \
int nread; \
\
nread = read(misses_counter_fd,&(misses),sizeof(misses)); \
if(nread<0){ \
perror("Error reading misses counter"); \
misses = 0; \
} \
} while (0) //macro magic for scoping
185 double
186 worker_TLF(void* _params, VirtProcr* animatingPr)
187 {
188 int i,o;
189 WorkerParams* params = (WorkerParams*)_params;
190 unsigned int totalWorkCycles = 0, totalBadCycles = 0;
191 unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0;
192 unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0;
193 double workspace2=0.0;
194 int32 privateMutex = VPThread__make_mutex(animatingPr);
196 int cpuid = sched_getcpu();
198 measurement_t startWorkload, endWorkload;
199 uint64 numCycles;
200 for(o=0; o < outer_iters; o++)
201 {
203 saveCyclesAndInstrs(cpuid,startWorkload.cycles);
205 //task
206 for(i=0; i < inner_iters; i++)
207 {
208 workspace1 += (workspace1 + 32)/2;
209 workspace2 += (workspace2 + 23.2)/1.4;
210 }
212 saveCyclesAndInstrs(cpuid,endWorkload.cycles);
213 numCycles = endWorkload.cycles - startWorkload.cycles;
214 //sanity check (400K is about 20K iters)
215 if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
216 else {totalBadCycles += numCycles; }
218 //mutex access often causes switch to different Slave VP
219 VPThread__mutex_lock(privateMutex, animatingPr);
221 /*
222 saveCyclesAndInstrs(cpuid,startWorkload2.cycles);
223 //Task
224 for(i=0; i < inner_iters; i++)
225 {
226 workspace1 += (workspace1 + 32)/2;
227 workspace2 += (workspace2 + 23.2)/1.4;
228 }
230 saveCyclesAndInstrs(cpuid,endWorkload2.cycles);
231 numCycles = endWorkload2.cycles - startWorkload2.cycles;
232 //sanity check (400K is about 20K iters)
233 if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
234 else {totalBadCycles += numCycles; }
236 */
237 VPThread__mutex_unlock(privateMutex, animatingPr);
238 }
240 params->data.totalWorkCycles = totalWorkCycles;
241 params->data.totalBadCycles = totalBadCycles;
242 params->data.numGoodTasks = numGoodTasks;
243 params->data.totalSyncCycles = totalSyncCycles;
244 params->data.totalBadSyncCycles = totalBadSyncCycles;
245 params->data.numGoodSyncs = numGoodSyncs;
246 /*
247 params->totalSyncCycles = VMS__give_num_plugin_cycles();
248 params->totalBadSyncCycles = 0;
249 params->numGoodSyncs = VMS__give_num_plugin_animations();
250 */
253 //Wait for all threads to end
254 barrier_wait(params->data.barrier, animatingPr);
256 //Shutdown worker
257 VPThread__dissipate_thread(animatingPr);
259 //below return never reached --> there for gcc
260 return (workspace1 + workspace2); //to prevent gcc from optimizing work out
261 }
//local variables of benchmark, made global for alignment
struct barrier_t barr __align_to_cacheline__;  /* end-of-run barrier: num_threads workers + the benchmark VP */
BenchParams *params __align_to_cacheline__;    /* NOTE(review): aligns the pointer variable itself, not the pointee */
/* this is run after the VMS is set up*/
/* Seed VP body: wires every worker's slot to the shared barrier, samples the
 * process-wide cycle and cache-miss counters, spawns num_threads workers,
 * then waits on the barrier (sized num_threads+1 so this VP participates).
 * The end-of-run cycle sample is taken inside barrier_wait by the last
 * arriver and copied out of barr.endBarrierCycles here; cache_misses (a
 * global read by main after the runtime returns) gets the miss delta. */
void benchmark(void *_params, VirtProcr *animatingPr)
{
int i;

params = (BenchParams *)_params;

barrier_init(&barr, num_threads+1, animatingPr);

//prepare input
for(i=0; i<num_threads; i++)
{
workerParamsArray[i].data.barrier = &barr;
}

uint64_t cache_misses_at_start, cache_misses_at_end;
saveMisses(cache_misses_at_start);
//save cycles before execution of threads, to get total exe cycles
int nread = read(cycles_counter_main_fd, &(params->startExeCycles->cycles),
sizeof(params->startExeCycles->cycles));
if(nread<0) perror("Error reading cycles counter");

//create (which starts running) all threads
for(i=0; i<num_threads; i++)
{ VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
}
//wait for all threads to finish
barrier_wait(&barr, animatingPr);

//endBarrierCycles read in barrier_wait()! Merten, email me if want to chg
params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
saveMisses(cache_misses_at_end);
cache_misses = cache_misses_at_end-cache_misses_at_start;
/*
uint64_t overallWorkCycles = 0;
for(i=0; i<num_threads; i++){
printf("WorkCycles: %lu\n",input[i].totalWorkCycles);
overallWorkCycles += input[i].totalWorkCycles;
}

printf("Sum across threads of work cycles: %lu\n", overallWorkCycles);
printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles);
printf("Runtime/Workcycle Ratio %lu\n",
((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles);
*/

//======================================================
VPThread__dissipate_thread(animatingPr);
}
318 int main(int argc, char **argv)
319 {
320 int i;
322 //set global static variables, based on cmd-line args
323 for(i=1; i<argc; i++)
324 {
325 if(argv[i][0] == '-' && argv[i][2] == 0)
326 {
327 switch(argv[i][1])
328 {
329 case 't':
330 if(!isdigit(argv[++i][0]))
331 {
332 fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n");
333 return EXIT_FAILURE;
334 }
335 num_threads = atoi(argv[i]);
336 if(!num_threads)
337 {
338 fprintf(stderr, "invalid number of threads specified: %d\n", num_threads);
339 return EXIT_FAILURE;
340 }
341 break;
342 case 'o':
343 if(!isdigit(argv[++i][0]))
344 {
345 fputs("-i must be followed by a number\n", stderr);
346 return EXIT_FAILURE;
347 }
348 outer_iters = atoi(argv[i]);
349 break;
350 case 'i':
351 if(!isdigit(argv[++i][0]))
352 {
353 fputs("-o must be followed by a number (workload size)\n", stderr);
354 return EXIT_FAILURE;
355 }
356 inner_iters = atoi(argv[i]);
357 break;
358 case 'h':
359 fputs(usage, stdout);
360 return 0;
362 default:
363 fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
364 fputs(usage, stderr);
365 return EXIT_FAILURE;
366 }//switch
367 }//if arg
368 else
369 {
370 fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
371 fputs(usage, stderr);
372 return EXIT_FAILURE;
373 }
374 }//for
377 //setup performance counters
378 hw_event = malloc(sizeof(struct perf_event_attr));
379 memset(hw_event,0,sizeof(struct perf_event_attr));
381 hw_event->type = PERF_TYPE_HARDWARE;
382 hw_event->size = sizeof(hw_event);
383 hw_event->disabled = 0;
384 hw_event->freq = 0;
385 hw_event->inherit = 1; /* children inherit it */
386 hw_event->pinned = 1; /* says this virt counter must always be on HW */
387 hw_event->exclusive = 0; /* only group on PMU */
388 hw_event->exclude_user = 0; /* don't count user */
389 hw_event->exclude_kernel = 1; /* don't count kernel */
390 hw_event->exclude_hv = 1; /* ditto hypervisor */
391 hw_event->exclude_idle = 1; /* don't count when idle */
392 hw_event->mmap = 0; /* include mmap data */
393 hw_event->comm = 0; /* include comm data */
395 hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles
397 int cpuID, retries;
399 for( cpuID = 0; cpuID < NUM_CORES; cpuID++ )
400 { retries = 0;
401 do
402 { retries += 1;
403 cycles_counter_fd[cpuID] =
404 syscall(__NR_perf_event_open, hw_event,
405 0,//pid_t: 0 is "pid of calling process"
406 cpuID,//int: cpu, the value returned by "CPUID" instr(?)
407 -1,//int: group_fd, -1 is "leader" or independent
408 0//unsigned long: flags
409 );
410 }
411 while(cycles_counter_fd[cpuID]<0 && retries < 100);
412 if(retries >= 100)
413 {
414 fprintf(stderr,"On core %d: ",cpuID);
415 perror("Failed to open cycles counter");
416 }
417 }
419 //Set up counter to accumulate total cycles to process, across all CPUs
421 retries = 0;
422 do
423 { retries += 1;
424 cycles_counter_main_fd =
425 syscall(__NR_perf_event_open, hw_event,
426 0,//pid_t: 0 is "pid of calling process"
427 -1,//int: cpu, -1 means accumulate from all cores
428 -1,//int: group_fd, -1 is "leader" == independent
429 0//unsigned long: flags
430 );
431 }
432 while(cycles_counter_main_fd<0 && retries < 100);
433 if(retries >= 100)
434 {
435 fprintf(stderr,"in main ");
436 perror("Failed to open cycles counter");
437 }
439 //Set up counters to count cache misses
440 hw_event->type = PERF_TYPE_HARDWARE;
441 hw_event->config = PERF_COUNT_HW_CACHE_MISSES; //misses
443 retries = 0;
444 do
445 { retries += 1;
446 misses_counter_fd =
447 syscall(__NR_perf_event_open, hw_event,
448 0,//pid_t: 0 is "pid of calling process"
449 -1,//int: cpu, -1 means accumulate from all cores
450 -1,//int: group_fd, -1 is "leader" == independent
451 0//unsigned long: flags
452 );
453 }
454 while(misses_counter_fd<0 && retries < 100);
455 if(retries >= 100)
456 {
457 fprintf(stderr,"in main ");
458 perror("Failed to misses counter");
459 }
461 measurement_t startExeCycles, endExeCycles;
462 BenchParams *benchParams;
464 benchParams = malloc(sizeof(BenchParams));
466 benchParams->startExeCycles = &startExeCycles;
467 benchParams->endExeCycles = &endExeCycles;
469 workerParamsArray = (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) );
470 if(workerParamsArray == NULL ) printf("error mallocing worker params array\n");
473 //This is the transition to the VMS runtime
474 VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );
476 uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
477 uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
478 for(i=0; i<num_threads; i++){
479 printf("WorkCycles: %lu\n",workerParamsArray[i].data.totalWorkCycles);
480 // printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
481 // printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
482 // printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
483 totalWorkCyclesAcrossCores += workerParamsArray[i].data.totalWorkCycles;
484 totalBadCyclesAcrossCores += workerParamsArray[i].data.totalBadCycles;
485 totalSyncCyclesAcrossCores += workerParamsArray[i].data.totalSyncCycles;
486 totalBadSyncCyclesAcrossCores += workerParamsArray[i].data.totalBadSyncCycles;
487 }
489 uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
490 totalExeCycles -= totalBadCyclesAcrossCores;
491 uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores;
492 int32 numSyncs = outer_iters * num_threads * 2;
493 printf("Total Execution Cycles: %lu\n", totalExeCycles);
494 printf("Total number of cache misses: %lu\n", cache_misses);
495 printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
496 printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores);
497 // printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);
498 printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs );
499 printf("ExeCycles/WorkCycles Ratio %f\n",
500 (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
501 return 0;
502 }
