| rev |
line source |
|
Me@4
|
1 /*
|
|
Me@4
|
2 *
|
|
Me@4
|
3 */
|
|
Me@4
|
4 #include <stdio.h>
|
|
Me@4
|
5 #include <stdlib.h>
|
|
Me@4
|
6 #include <string.h>
|
|
Me@4
|
7 #include <math.h>
|
|
Me@4
|
8 #include <ctype.h>
|
|
Me@4
|
9 #include <errno.h>
|
|
Me@4
|
10 #include <pthread.h>
|
|
msach@6
|
11 #include <unistd.h>
|
|
Me@4
|
12 #include "VPThread_lib/VPThread.h"
|
|
Me@4
|
13 #include "VPThread_lib/VMS/Queue_impl/PrivateQueue.h"
|
|
Me@4
|
14
|
|
msach@6
|
15 #include <linux/perf_event.h>
|
|
msach@6
|
16 #include <linux/prctl.h>
|
|
msach@6
|
17 #include <sys/syscall.h>
|
|
msach@6
|
18
|
|
Me@4
|
19 #undef DEBUG
|
|
Me@4
|
20 //#define DEBUG
|
|
Me@4
|
21
|
|
msach@15
|
22 #define MEASURE_PERF
|
|
msach@15
|
23
|
|
Me@4
|
24 #if !defined(unix) && !defined(__unix__)
|
|
Me@4
|
25 #ifdef __MACH__
|
|
Me@4
|
26 #define unix 1
|
|
Me@4
|
27 #define __unix__ 1
|
|
Me@4
|
28 #endif /* __MACH__ */
|
|
Me@4
|
29 #endif /* unix */
|
|
Me@4
|
30
|
|
Me@4
|
31 /* find the appropriate way to define explicitly sized types */
|
|
Me@4
|
32 /* for C99 or GNU libc (also mach's libc) we can use stdint.h */
|
|
Me@4
|
33 #if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__)
|
|
Me@4
|
34 #include <stdint.h>
|
|
Me@4
|
35 #elif defined(unix) || defined(__unix__) /* some UNIX systems have them in sys/types.h */
|
|
Me@4
|
36 #include <sys/types.h>
|
|
Me@4
|
37 #elif defined(__WIN32__) || defined(WIN32) /* the nameless one */
|
|
Me@4
|
38 typedef unsigned __int8 uint8_t;
|
|
Me@4
|
39 typedef unsigned __int32 uint32_t;
|
|
Me@4
|
40 #endif /* sized type detection */
|
|
Me@4
|
41
|
|
/* provide a millisecond-resolution timer for each system */
#if defined(unix) || defined(__unix__)
#include <time.h>
#include <sys/time.h>
/* Milliseconds elapsed since the first call; the first call itself
 * returns 0 and latches the reference time.  Not thread-safe: the
 * reference lives in unsynchronized static storage. */
unsigned long get_msec(void) {
    static struct timeval now, epoch;

    gettimeofday(&now, 0);
    if(epoch.tv_sec == 0) {
        epoch = now;
        return 0;
    }
    return (now.tv_sec - epoch.tv_sec) * 1000
         + (now.tv_usec - epoch.tv_usec) / 1000;
}
#elif defined(__WIN32__) || defined(WIN32)
#include <windows.h>
/* Milliseconds since system start (absolute, unlike the unix variant). */
unsigned long get_msec(void) {
    return GetTickCount();
}
#else
//#error "I don't know how to measure time on your platform"
#endif
Me@4
|
64
|
|
Me@4
|
65 //======================== Defines =========================
|
|
/* One sample of a hardware counter read via perf_event_open.
 * uint64 is a project typedef (from the VPThread headers) -- presumably a
 * 64-bit unsigned matching the 8 bytes that the read() calls below store
 * into these fields; TODO confirm against VPThread.h. */
typedef struct perfData measurement_t;
struct perfData{
    uint64 cycles;        /* PERF_COUNT_HW_CPU_CYCLES reading */
    uint64 instructions;  /* instruction-count slot; never written in this file */
};
Me@4
|
71
|
|
/* Help text printed for -h and on argument errors.
 * (Typos fixed: "Spwans" -> "Spawns", "internaly" -> "internally";
 * redundant braces around the scalar initializer dropped.) */
const char *usage =
    "Usage: malloc_test [options]\n"
    " Spawns a number of threads and allocates memory.\n\n"
    "Options:\n"
    " -t <num> how many threads to use (default: 1). This is internally multiplied by the number of cores.\n"
    " -o <num> repeat workload and sync operation <m> times\n"
    " -i <num> size of workload, repeat <n> times\n"
    " -h this help screen\n\n";
Me@4
|
81
|
|
/* Counting barrier built on VPThread mutex/cond handles (int32 ids from
 * VPThread__make_mutex/make_cond, not pthread objects). */
struct barrier_t
{
    int counter;    /* threads currently arrived in barrier_wait() */
    int nthreads;   /* arrivals required to release the barrier */
    int32 mutex;    /* VPThread mutex handle guarding counter */
    int32 cond;     /* VPThread condition handle bound to mutex */
    measurement_t endBarrierCycles;  /* cycle sample taken by the last arriver */

};
typedef struct barrier_t barrier;
Me@4
|
92
|
|
Me@4
|
93 void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr)
|
|
Me@4
|
94 {
|
|
Me@4
|
95 barr->counter = 0;
|
|
Me@4
|
96 barr->nthreads = nthreads;
|
|
Me@4
|
97 barr->mutex = VPThread__make_mutex(animatingPr);
|
|
Me@4
|
98 barr->cond = VPThread__make_cond(barr->mutex, animatingPr);
|
|
Me@4
|
99 }
|
|
Me@4
|
100
|
|
/* fd of the process-wide (cpu = -1) cycle counter opened in main();
 * read by barrier_wait() to timestamp the moment the last thread arrives. */
int cycles_counter_main_fd;
Me@4
|
102 void inline barrier_wait(barrier *barr, VirtProcr *animatingPr)
|
|
Me@4
|
103 { int i;
|
|
Me@4
|
104
|
|
Me@4
|
105 VPThread__mutex_lock(barr->mutex, animatingPr);
|
|
Me@4
|
106 barr->counter++;
|
|
Me@4
|
107 if(barr->counter == barr->nthreads)
|
|
kshalle@8
|
108 {
|
|
msach@15
|
109 #ifdef MEASURE_PERF
|
|
kshalle@8
|
110 read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \
|
|
kshalle@8
|
111 sizeof(barr->endBarrierCycles.cycles));
|
|
msach@15
|
112 #endif
|
|
kshalle@8
|
113
|
|
kshalle@8
|
114 barr->counter = 0;
|
|
Me@4
|
115 for(i=0; i < barr->nthreads; i++)
|
|
Me@4
|
116 VPThread__cond_signal(barr->cond, animatingPr);
|
|
Me@4
|
117 }
|
|
Me@4
|
118 else
|
|
Me@4
|
119 { VPThread__cond_wait(barr->cond, animatingPr);
|
|
Me@4
|
120 }
|
|
Me@4
|
121 VPThread__mutex_unlock(barr->mutex, animatingPr);
|
|
Me@4
|
122 }
|
|
Me@4
|
123
|
|
kshalle@8
|
124
|
|
kshalle@8
|
125
|
|
/* Per-worker in/out record: benchmark() fills barrier before the worker
 * starts; worker_TLF() writes its measurement totals back before exiting. */
typedef struct
{   struct barrier_t* barrier;      /* end-of-run barrier shared with benchmark() */
    uint64_t totalWorkCycles;       /* cycles from workload passes that passed the <400K sanity check */
    uint64_t totalBadCycles;        /* cycles from outlier passes (>= 400K), kept separate */
    uint64_t totalSyncCycles;       /* always 0 here -- sync measurement is commented out in worker_TLF */
    uint64_t totalBadSyncCycles;    /* always 0 here (see above) */
    uint64 numGoodSyncs;            /* always 0 here; uint64 is a project typedef */
    uint64 numGoodTasks;            /* number of workload passes below the sanity threshold */
}
WorkerParams;
Me@4
|
136
|
|
Me@4
|
137
|
|
/* Handed from main() to benchmark(): where to store the cycle-counter
 * samples taken just before worker creation and at the final barrier. */
typedef struct
{   measurement_t *startExeCycles;  /* sampled in benchmark() before workers start */
    measurement_t *endExeCycles;    /* copied from the barrier's last-arrival sample */
}
BenchParams;
Me@4
|
143
|
|
//======================== Globals =========================
/* NOTE(review): identifiers starting with a double underscore are reserved
 * for the implementation -- renaming would be cleaner but may break other
 * translation units that reference these. */
char __ProgrammName[] = "overhead_test";
char __DataSet[255];

int outer_iters, inner_iters, num_threads;  /* set from -o, -i, -t in main() */
size_t chunk_size = 0;  /* unused in this file -- presumably kept for tooling; verify */

int cycles_counter_fd[NUM_CORES];  /* per-core cycle-counter fds; NUM_CORES comes from the VPThread headers */
struct perf_event_attr* hw_event;  /* perf attr template shared by all counter opens in main() */

WorkerParams *workerParamsArray;   /* one slot per worker thread, allocated in main() */
Me@4
|
156 //======================== App Code =========================
|
|
Me@4
|
157 /*
|
|
Me@4
|
158 * Workload
|
|
Me@4
|
159 */
|
|
msach@6
|
160
|
|
/* Read the current cycle count of the counter opened for `core` into
 * `cycles`.  On a failed read(), reports via perror and stores 0.
 * do{...}while(0) makes the macro a single statement and gives the locals
 * their own scope.  `cycles` is evaluated more than once -- pass only a
 * plain lvalue, never an expression with side effects. */
#define saveCyclesAndInstrs(core,cycles) do{ \
    int cycles_fd = cycles_counter_fd[core]; \
    int nread; \
    \
    nread = read(cycles_fd,&(cycles),sizeof(cycles)); \
    if(nread<0){ \
        perror("Error reading cycles counter"); \
        cycles = 0; \
    } \
} while (0) //macro magic for scoping
msach@6
|
171
|
|
msach@7
|
172
|
|
msach@9
|
173 double
|
|
msach@9
|
174 worker_TLF(void* _params, VirtProcr* animatingPr)
|
|
Me@5
|
175 {
|
|
msach@7
|
176 int i,o;
|
|
msach@9
|
177 WorkerParams* params = (WorkerParams*)_params;
|
|
msach@9
|
178 unsigned int totalWorkCycles = 0, totalBadCycles = 0;
|
|
msach@9
|
179 unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0;
|
|
msach@9
|
180 unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0;
|
|
kshalle@8
|
181 double workspace2=0.0;
|
|
Me@5
|
182 int32 privateMutex = VPThread__make_mutex(animatingPr);
|
|
msach@6
|
183
|
|
msach@6
|
184 int cpuid = sched_getcpu();
|
|
msach@9
|
185
|
|
msach@11
|
186 measurement_t startWorkload, endWorkload, startWorkload2, endWorkload2;
|
|
msach@9
|
187 uint64 numCycles;
|
|
msach@9
|
188 for(o=0; o < outer_iters; o++)
|
|
Me@4
|
189 {
|
|
msach@15
|
190 #ifdef MEASURE_PERF
|
|
msach@10
|
191 saveCyclesAndInstrs(cpuid,startWorkload.cycles);
|
|
msach@15
|
192 #endif
|
|
msach@9
|
193
|
|
msach@11
|
194 //workltask
|
|
msach@9
|
195 for(i=0; i < inner_iters; i++)
|
|
Me@5
|
196 {
|
|
Me@5
|
197 workspace1 += (workspace1 + 32)/2;
|
|
Me@5
|
198 workspace2 += (workspace2 + 23.2)/1.4;
|
|
Me@5
|
199 }
|
|
msach@15
|
200
|
|
msach@15
|
201 #ifdef MEASURE_PERF
|
|
msach@10
|
202 saveCyclesAndInstrs(cpuid,endWorkload.cycles);
|
|
msach@10
|
203 numCycles = endWorkload.cycles - startWorkload.cycles;
|
|
msach@9
|
204 //sanity check (400K is about 20K iters)
|
|
msach@9
|
205 if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
|
|
msach@9
|
206 else {totalBadCycles += numCycles; }
|
|
msach@15
|
207 #endif
|
|
msach@9
|
208
|
|
msach@9
|
209 //mutex access often causes switch to different Slave VP
|
|
msach@9
|
210 VPThread__mutex_lock(privateMutex, animatingPr);
|
|
msach@11
|
211
|
|
msach@10
|
212 /*
|
|
msach@11
|
213 saveCyclesAndInstrs(cpuid,startWorkload2.cycles);
|
|
msach@11
|
214 //Task
|
|
msach@11
|
215 for(i=0; i < inner_iters; i++)
|
|
msach@11
|
216 {
|
|
msach@11
|
217 workspace1 += (workspace1 + 32)/2;
|
|
msach@11
|
218 workspace2 += (workspace2 + 23.2)/1.4;
|
|
msach@11
|
219 }
|
|
msach@11
|
220
|
|
msach@11
|
221 saveCyclesAndInstrs(cpuid,endWorkload2.cycles);
|
|
msach@11
|
222 numCycles = endWorkload2.cycles - startWorkload2.cycles;
|
|
msach@9
|
223 //sanity check (400K is about 20K iters)
|
|
msach@11
|
224 if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
|
|
msach@11
|
225 else {totalBadCycles += numCycles; }
|
|
msach@11
|
226
|
|
msach@10
|
227 */
|
|
msach@9
|
228 VPThread__mutex_unlock(privateMutex, animatingPr);
|
|
Me@4
|
229 }
|
|
Me@5
|
230
|
|
kshalle@8
|
231 params->totalWorkCycles = totalWorkCycles;
|
|
msach@9
|
232 params->totalBadCycles = totalBadCycles;
|
|
msach@9
|
233 params->numGoodTasks = numGoodTasks;
|
|
msach@9
|
234 params->totalSyncCycles = totalSyncCycles;
|
|
msach@9
|
235 params->totalBadSyncCycles = totalBadSyncCycles;
|
|
msach@9
|
236 params->numGoodSyncs = numGoodSyncs;
|
|
msach@9
|
237 /*
|
|
msach@9
|
238 params->totalSyncCycles = VMS__give_num_plugin_cycles();
|
|
msach@9
|
239 params->totalBadSyncCycles = 0;
|
|
msach@9
|
240 params->numGoodSyncs = VMS__give_num_plugin_animations();
|
|
msach@9
|
241 */
|
|
msach@6
|
242
|
|
msach@6
|
243
|
|
msach@6
|
244 //Wait for all threads to end
|
|
kshalle@8
|
245 barrier_wait(params->barrier, animatingPr);
|
|
Me@5
|
246
|
|
Me@5
|
247 //Shutdown worker
|
|
Me@5
|
248 VPThread__dissipate_thread(animatingPr);
|
|
msach@9
|
249
|
|
msach@9
|
250 //below return never reached --> there for gcc
|
|
msach@9
|
251 return (workspace1 + workspace2); //to prevent gcc from optimizing work out
|
|
Me@5
|
252 }
|
|
Me@4
|
253
|
|
kshalle@8
|
254
|
|
Me@4
|
255 /* this is run after the VMS is set up*/
|
|
kshalle@8
|
256 void benchmark(void *_params, VirtProcr *animatingPr)
|
|
Me@4
|
257 {
|
|
msach@7
|
258 int i, cpuID;
|
|
kshalle@8
|
259 struct barrier_t barr;
|
|
kshalle@8
|
260 BenchParams *params;
|
|
kshalle@8
|
261
|
|
kshalle@8
|
262 params = (BenchParams *)_params;
|
|
kshalle@8
|
263
|
|
Me@4
|
264 barrier_init(&barr, num_threads+1, animatingPr);
|
|
kshalle@8
|
265
|
|
msach@6
|
266 //prepare input
|
|
Me@4
|
267 for(i=0; i<num_threads; i++)
|
|
msach@6
|
268 {
|
|
kshalle@8
|
269 workerParamsArray[i].barrier = &barr;
|
|
Me@4
|
270 }
|
|
msach@7
|
271
|
|
kshalle@8
|
272 //save cycles before execution of threads, to get total exe cycles
|
|
kshalle@8
|
273 measurement_t *startExeCycles, *endExeCycles;
|
|
kshalle@8
|
274 startExeCycles = params->startExeCycles;
|
|
kshalle@8
|
275
|
|
msach@15
|
276 #ifdef MEASURE_PERF
|
|
kshalle@8
|
277 int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles),
|
|
kshalle@8
|
278 sizeof(startExeCycles->cycles));
|
|
msach@9
|
279 if(nread<0) perror("Error reading cycles counter");
|
|
msach@15
|
280 #endif
|
|
msach@9
|
281
|
|
msach@9
|
282 //create (which starts running) all threads
|
|
msach@9
|
283 for(i=0; i<num_threads; i++)
|
|
msach@9
|
284 { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
|
|
kshalle@8
|
285 }
|
|
msach@6
|
286 //wait for all threads to finish
|
|
Me@4
|
287 barrier_wait(&barr, animatingPr);
|
|
msach@6
|
288
|
|
msach@15
|
289 #ifdef MEASURE_PERF
|
|
msach@9
|
290 //endBarrierCycles read in barrier_wait()! Merten, email me if want to chg
|
|
kshalle@8
|
291 params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
|
|
msach@15
|
292 #endif
|
|
kshalle@8
|
293
|
|
Me@4
|
294
|
|
kshalle@8
|
295 /*
|
|
msach@6
|
296 uint64_t overallWorkCycles = 0;
|
|
msach@6
|
297 for(i=0; i<num_threads; i++){
|
|
msach@7
|
298 printf("WorkCycles: %lu\n",input[i].totalWorkCycles);
|
|
msach@6
|
299 overallWorkCycles += input[i].totalWorkCycles;
|
|
Me@4
|
300 }
|
|
msach@6
|
301
|
|
msach@6
|
302 printf("Sum across threads of work cycles: %lu\n", overallWorkCycles);
|
|
msach@6
|
303 printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles);
|
|
kshalle@8
|
304 printf("Runtime/Workcycle Ratio %lu\n",
|
|
kshalle@8
|
305 ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles);
|
|
kshalle@8
|
306 */
|
|
Me@4
|
307
|
|
Me@4
|
308 //======================================================
|
|
Me@4
|
309
|
|
Me@4
|
310 VPThread__dissipate_thread(animatingPr);
|
|
Me@4
|
311 }
|
|
Me@4
|
312
|
|
Me@4
|
313 int main(int argc, char **argv)
|
|
Me@4
|
314 {
|
|
Me@4
|
315 int i;
|
|
Me@4
|
316
|
|
Me@4
|
317 //set global static variables, based on cmd-line args
|
|
Me@4
|
318 for(i=1; i<argc; i++)
|
|
Me@4
|
319 {
|
|
Me@4
|
320 if(argv[i][0] == '-' && argv[i][2] == 0)
|
|
Me@4
|
321 {
|
|
Me@4
|
322 switch(argv[i][1])
|
|
Me@4
|
323 {
|
|
Me@4
|
324 case 't':
|
|
Me@4
|
325 if(!isdigit(argv[++i][0]))
|
|
Me@4
|
326 {
|
|
kshalle@8
|
327 fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n");
|
|
Me@4
|
328 return EXIT_FAILURE;
|
|
Me@4
|
329 }
|
|
Me@4
|
330 num_threads = atoi(argv[i]);
|
|
Me@4
|
331 if(!num_threads)
|
|
Me@4
|
332 {
|
|
Me@4
|
333 fprintf(stderr, "invalid number of threads specified: %d\n", num_threads);
|
|
Me@4
|
334 return EXIT_FAILURE;
|
|
Me@4
|
335 }
|
|
Me@4
|
336 break;
|
|
msach@9
|
337 case 'o':
|
|
Me@4
|
338 if(!isdigit(argv[++i][0]))
|
|
Me@4
|
339 {
|
|
msach@6
|
340 fputs("-i must be followed by a number\n", stderr);
|
|
Me@4
|
341 return EXIT_FAILURE;
|
|
Me@4
|
342 }
|
|
msach@9
|
343 outer_iters = atoi(argv[i]);
|
|
Me@4
|
344 break;
|
|
msach@9
|
345 case 'i':
|
|
Me@4
|
346 if(!isdigit(argv[++i][0]))
|
|
Me@4
|
347 {
|
|
msach@6
|
348 fputs("-o must be followed by a number (workload size)\n", stderr);
|
|
Me@4
|
349 return EXIT_FAILURE;
|
|
Me@4
|
350 }
|
|
msach@9
|
351 inner_iters = atoi(argv[i]);
|
|
Me@4
|
352 break;
|
|
Me@4
|
353 case 'h':
|
|
Me@4
|
354 fputs(usage, stdout);
|
|
Me@4
|
355 return 0;
|
|
Me@4
|
356
|
|
Me@4
|
357 default:
|
|
Me@4
|
358 fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
|
|
Me@4
|
359 fputs(usage, stderr);
|
|
Me@4
|
360 return EXIT_FAILURE;
|
|
Me@4
|
361 }//switch
|
|
Me@4
|
362 }//if arg
|
|
Me@4
|
363 else
|
|
Me@4
|
364 {
|
|
Me@4
|
365 fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
|
|
Me@4
|
366 fputs(usage, stderr);
|
|
Me@4
|
367 return EXIT_FAILURE;
|
|
Me@4
|
368 }
|
|
Me@4
|
369 }//for
|
|
msach@7
|
370
|
|
kshalle@8
|
371
|
|
msach@15
|
372 #ifdef MEASURE_PERF
|
|
msach@7
|
373 //setup performance counters
|
|
msach@7
|
374 hw_event = malloc(sizeof(struct perf_event_attr));
|
|
msach@7
|
375 memset(hw_event,0,sizeof(struct perf_event_attr));
|
|
msach@7
|
376
|
|
msach@7
|
377 hw_event->type = PERF_TYPE_HARDWARE;
|
|
msach@7
|
378 hw_event->size = sizeof(hw_event);
|
|
msach@7
|
379 hw_event->disabled = 0;
|
|
msach@7
|
380 hw_event->freq = 0;
|
|
msach@7
|
381 hw_event->inherit = 1; /* children inherit it */
|
|
msach@7
|
382 hw_event->pinned = 1; /* says this virt counter must always be on HW */
|
|
msach@7
|
383 hw_event->exclusive = 0; /* only group on PMU */
|
|
msach@7
|
384 hw_event->exclude_user = 0; /* don't count user */
|
|
msach@7
|
385 hw_event->exclude_kernel = 1; /* don't count kernel */
|
|
msach@7
|
386 hw_event->exclude_hv = 1; /* ditto hypervisor */
|
|
msach@7
|
387 hw_event->exclude_idle = 1; /* don't count when idle */
|
|
msach@7
|
388 hw_event->mmap = 0; /* include mmap data */
|
|
msach@7
|
389 hw_event->comm = 0; /* include comm data */
|
|
msach@7
|
390
|
|
msach@7
|
391 hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles
|
|
msach@7
|
392
|
|
msach@7
|
393 int cpuID, retries;
|
|
msach@7
|
394
|
|
msach@7
|
395 for( cpuID = 0; cpuID < NUM_CORES; cpuID++ )
|
|
msach@7
|
396 { retries = 0;
|
|
msach@7
|
397 do
|
|
msach@7
|
398 { retries += 1;
|
|
msach@7
|
399 cycles_counter_fd[cpuID] =
|
|
msach@7
|
400 syscall(__NR_perf_event_open, hw_event,
|
|
msach@7
|
401 0,//pid_t: 0 is "pid of calling process"
|
|
msach@7
|
402 cpuID,//int: cpu, the value returned by "CPUID" instr(?)
|
|
msach@7
|
403 -1,//int: group_fd, -1 is "leader" or independent
|
|
msach@7
|
404 0//unsigned long: flags
|
|
msach@7
|
405 );
|
|
msach@7
|
406 }
|
|
msach@7
|
407 while(cycles_counter_fd[cpuID]<0 && retries < 100);
|
|
msach@7
|
408 if(retries >= 100)
|
|
msach@7
|
409 {
|
|
msach@7
|
410 fprintf(stderr,"On core %d: ",cpuID);
|
|
msach@7
|
411 perror("Failed to open cycles counter");
|
|
msach@7
|
412 }
|
|
msach@7
|
413 }
|
|
msach@7
|
414
|
|
msach@7
|
415 //Set up counter to accumulate total cycles to process, across all CPUs
|
|
msach@7
|
416
|
|
msach@7
|
417 retries = 0;
|
|
msach@7
|
418 do
|
|
msach@7
|
419 { retries += 1;
|
|
msach@7
|
420 cycles_counter_main_fd =
|
|
msach@7
|
421 syscall(__NR_perf_event_open, hw_event,
|
|
msach@7
|
422 0,//pid_t: 0 is "pid of calling process"
|
|
msach@7
|
423 -1,//int: cpu, -1 means accumulate from all cores
|
|
msach@7
|
424 -1,//int: group_fd, -1 is "leader" == independent
|
|
msach@7
|
425 0//unsigned long: flags
|
|
msach@7
|
426 );
|
|
msach@7
|
427 }
|
|
msach@7
|
428 while(cycles_counter_main_fd<0 && retries < 100);
|
|
msach@7
|
429 if(retries >= 100)
|
|
msach@7
|
430 {
|
|
msach@7
|
431 fprintf(stderr,"in main ");
|
|
msach@7
|
432 perror("Failed to open cycles counter");
|
|
msach@7
|
433 }
|
|
msach@15
|
434 #endif
|
|
kshalle@8
|
435
|
|
msach@9
|
436 measurement_t startExeCycles, endExeCycles;
|
|
msach@9
|
437 BenchParams *benchParams;
|
|
msach@9
|
438
|
|
msach@9
|
439 benchParams = malloc(sizeof(BenchParams));
|
|
msach@9
|
440
|
|
msach@9
|
441 benchParams->startExeCycles = &startExeCycles;
|
|
msach@9
|
442 benchParams->endExeCycles = &endExeCycles;
|
|
msach@9
|
443
|
|
kshalle@8
|
444 workerParamsArray = (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) );
|
|
kshalle@8
|
445 if(workerParamsArray == NULL ) printf("error mallocing worker params array\n");
|
|
kshalle@8
|
446
|
|
msach@9
|
447
|
|
kshalle@8
|
448 //This is the transition to the VMS runtime
|
|
kshalle@8
|
449 VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );
|
|
kshalle@8
|
450
|
|
msach@15
|
451 #ifdef MEASURE_PERF
|
|
msach@9
|
452 uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
|
|
msach@9
|
453 uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
|
|
kshalle@8
|
454 for(i=0; i<num_threads; i++){
|
|
kshalle@8
|
455 printf("WorkCycles: %lu\n",workerParamsArray[i].totalWorkCycles);
|
|
msach@9
|
456 // printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
|
|
msach@9
|
457 // printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
|
|
msach@9
|
458 // printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
|
|
kshalle@8
|
459 totalWorkCyclesAcrossCores += workerParamsArray[i].totalWorkCycles;
|
|
msach@9
|
460 totalBadCyclesAcrossCores += workerParamsArray[i].totalBadCycles;
|
|
msach@9
|
461 totalSyncCyclesAcrossCores += workerParamsArray[i].totalSyncCycles;
|
|
msach@9
|
462 totalBadSyncCyclesAcrossCores += workerParamsArray[i].totalBadSyncCycles;
|
|
kshalle@8
|
463 }
|
|
msach@7
|
464
|
|
kshalle@8
|
465 uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
|
|
msach@9
|
466 totalExeCycles -= totalBadCyclesAcrossCores;
|
|
msach@10
|
467 uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores;
|
|
msach@10
|
468 int32 numSyncs = outer_iters * num_threads * 2;
|
|
msach@10
|
469 printf("Total Execution Cycles: %lu\n", totalExeCycles);
|
|
kshalle@8
|
470 printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
|
|
msach@10
|
471 printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores);
|
|
msach@10
|
472 // printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);
|
|
msach@10
|
473 printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs );
|
|
kshalle@8
|
474 printf("ExeCycles/WorkCycles Ratio %f\n",
|
|
kshalle@8
|
475 (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
|
|
msach@15
|
476 #else
|
|
msach@15
|
477 printf("No measurement done!\n");
|
|
msach@15
|
478 #endif
|
|
Me@4
|
479 return 0;
|
|
msach@7
|
480 }
|