| rev |
line source |
|
Me@4
|
1 /*
|
|
Me@4
|
2 *
|
|
Me@4
|
3 */
|
|
Me@4
|
4 #include <stdio.h>
|
|
Me@4
|
5 #include <stdlib.h>
|
|
Me@4
|
6 #include <string.h>
|
|
Me@4
|
7 #include <math.h>
|
|
Me@4
|
8 #include <ctype.h>
|
|
Me@4
|
9 #include <errno.h>
|
|
Me@4
|
10 #include <pthread.h>
|
|
msach@6
|
11 #include <unistd.h>
|
|
msach@18
|
12 #include "VMS_Implementations/Vthread_impl/VPThread.h"
|
|
msach@18
|
13 #include "C_Libraries/Queue_impl/PrivateQueue.h"
|
|
Me@4
|
14
|
|
msach@6
|
15 #include <linux/perf_event.h>
|
|
msach@6
|
16 #include <linux/prctl.h>
|
|
msach@6
|
17 #include <sys/syscall.h>
|
|
msach@6
|
18
|
|
Me@4
|
19 #undef DEBUG
|
|
Me@4
|
20 //#define DEBUG
|
|
Me@4
|
21
|
|
Me@4
|
22 #if !defined(unix) && !defined(__unix__)
|
|
Me@4
|
23 #ifdef __MACH__
|
|
Me@4
|
24 #define unix 1
|
|
Me@4
|
25 #define __unix__ 1
|
|
Me@4
|
26 #endif /* __MACH__ */
|
|
Me@4
|
27 #endif /* unix */
|
|
Me@4
|
28
|
|
Me@4
|
29 /* find the appropriate way to define explicitly sized types */
|
|
Me@4
|
30 /* for C99 or GNU libc (also mach's libc) we can use stdint.h */
|
|
Me@4
|
31 #if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__)
|
|
Me@4
|
32 #include <stdint.h>
|
|
Me@4
|
33 #elif defined(unix) || defined(__unix__) /* some UNIX systems have them in sys/types.h */
|
|
Me@4
|
34 #include <sys/types.h>
|
|
Me@4
|
35 #elif defined(__WIN32__) || defined(WIN32) /* the nameless one */
|
|
Me@4
|
36 typedef unsigned __int8 uint8_t;
|
|
Me@4
|
37 typedef unsigned __int32 uint32_t;
|
|
Me@4
|
38 #endif /* sized type detection */
|
|
Me@4
|
39
|
|
Me@4
|
40 /* provide a millisecond-resolution timer for each system */
|
|
Me@4
|
41 #if defined(unix) || defined(__unix__)
|
|
Me@4
|
42 #include <time.h>
|
|
Me@4
|
43 #include <sys/time.h>
|
|
Me@4
|
/* Millisecond timer (unix): elapsed wall-clock ms since the first call.
 * The first call establishes the origin and returns 0. */
unsigned long get_msec(void)
{
    static struct timeval now, origin;

    gettimeofday(&now, 0);
    if(origin.tv_sec == 0)
    {   /* first call: remember the origin */
        origin = now;
        return 0;
    }
    return (now.tv_sec  - origin.tv_sec)  * 1000 +
           (now.tv_usec - origin.tv_usec) / 1000;
}
|
|
Me@4
|
54 #elif defined(__WIN32__) || defined(WIN32)
|
|
Me@4
|
55 #include <windows.h>
|
|
Me@4
|
/* Millisecond timer (Windows): GetTickCount already reports ms since boot. */
unsigned long get_msec(void) { return GetTickCount(); }
|
|
Me@4
|
59 #else
|
|
Me@4
|
60 //#error "I don't know how to measure time on your platform"
|
|
Me@4
|
61 #endif
|
|
Me@4
|
62
|
|
msach@14
|
//======================== Globals =========================
char __ProgrammName[] = "overhead_test"; // benchmark name (NOTE(review): double-underscore identifiers are reserved; kept for compatibility)
char __DataSet[255];                     // dataset label; writer not visible in this file -- TODO confirm

int outer_iters, inner_iters, num_threads; // set from the -o / -i / -t command-line options in main()
size_t chunk_size = 0;

int cycles_counter_main_fd; // perf fd: CPU cycles accumulated across all cores (whole process)
int misses_counter_fd;      // perf fd: cache misses accumulated across all cores

uint64_t cache_misses;      // total misses over the threaded run, computed in benchmark()

int cycles_counter_fd[NUM_CORES];   // per-core cycle counter fds, indexed by sched_getcpu()
struct perf_event_attr* hw_event;   // attr struct reused for every perf_event_open call in main()
|
|
Me@4
|
78 //======================== Defines =========================
|
|
kshalle@8
|
/* One hardware-counter sample.  Aligned/padded to a cache line (project
 * macro __align_to_cacheline__) so samples written by different threads
 * do not false-share. */
typedef struct perfData measurement_t;
struct perfData{
    uint64 cycles; // raw cycle count as read() from a perf_event fd
} __align_to_cacheline__;
|
|
Me@4
|
83
|
|
Me@4
|
/* Command-line help text, printed for -h and on argument errors.
 * Typos fixed: "Spwans" -> "Spawns", "internaly" -> "internally". */
const char *usage = {
"Usage: malloc_test [options]\n"
" Spawns a number of threads and allocates memory.\n\n"
"Options:\n"
" -t <num> how many threads to use (default: 1). This is internally multiplied by the number of cores.\n"
" -o <num> repeat workload and sync operation <m> times\n"
" -i <num> size of workload, repeat <n> times\n"
" -h this help screen\n\n"
};
|
|
Me@4
|
93
|
|
Me@4
|
/* Counting barrier built on VPThread mutex/cond handles (int32 ids).
 * The last arriver also snapshots the process-wide cycle counter into
 * endBarrierCycles (see barrier_wait()), which benchmark() uses as the
 * end-of-run timestamp.  Cache-line aligned to avoid false sharing. */
struct barrier_t
{
    int counter;    // number of threads that have arrived so far
    int nthreads;   // arrivals required to release the barrier
    int32 mutex;    // VPThread mutex handle protecting counter
    int32 cond;     // VPThread condition handle, tied to mutex
    measurement_t endBarrierCycles; // cycle count at the instant the barrier released

} __align_to_cacheline__;
typedef struct barrier_t barrier;
|
|
Me@4
|
104
|
|
Me@4
|
105 void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr)
|
|
Me@4
|
106 {
|
|
Me@4
|
107 barr->counter = 0;
|
|
Me@4
|
108 barr->nthreads = nthreads;
|
|
Me@4
|
109 barr->mutex = VPThread__make_mutex(animatingPr);
|
|
Me@4
|
110 barr->cond = VPThread__make_cond(barr->mutex, animatingPr);
|
|
Me@4
|
111 }
|
|
Me@4
|
112
|
|
Me@4
|
/* Block until all barr->nthreads participants have arrived.
 * The last arriver snapshots the process-wide cycle counter into
 * barr->endBarrierCycles (benchmark() reads it as the end timestamp),
 * resets the count, and signals once per waiter.
 * NOTE(review): the read() return value is not checked here -- a failed
 * or short read leaves endBarrierCycles stale; confirm acceptable. */
void inline barrier_wait(barrier *barr, VirtProcr *animatingPr)
{ int i;

    VPThread__mutex_lock(barr->mutex, animatingPr);
    barr->counter++;
    if(barr->counter == barr->nthreads)
    {
        // last thread in: record the "all work done" cycle count
        read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \
        sizeof(barr->endBarrierCycles.cycles));

        barr->counter = 0;
        // wake every waiter (one signal per waiting thread)
        for(i=0; i < barr->nthreads; i++)
            VPThread__cond_signal(barr->cond, animatingPr);
    }
    else
    { VPThread__cond_wait(barr->cond, animatingPr);
    }
    VPThread__mutex_unlock(barr->mutex, animatingPr);
}
|
|
Me@4
|
132
|
|
kshalle@8
|
133
|
|
kshalle@8
|
134
|
|
msach@13
|
/* Per-worker in/out record.  `barrier` is filled in by benchmark()
 * before the workers start; the remaining fields are written once by
 * each worker just before it reaches the barrier. */
struct WorkerParams_t
{ struct barrier_t* barrier;     // shared end-of-run barrier
    uint64_t totalWorkCycles;    // cycles in task samples accepted by the sanity check
    uint64_t totalBadCycles;     // cycles in samples the sanity check rejected
    uint64_t totalSyncCycles;    // reserved; worker_TLF currently always writes 0
    uint64_t totalBadSyncCycles; // reserved; currently always 0
    uint64 numGoodSyncs;         // reserved; currently always 0
    uint64 numGoodTasks;         // count of accepted task samples
};

/* Pad each worker's record to a full cache line so adjacent array
 * slots never false-share between cores. */
typedef union
{
    struct WorkerParams_t data;
    char padding[CACHELINE_SIZE];
} WorkerParams __align_to_cacheline__;

// one slot per worker thread; allocated in main(), filled by workers
WorkerParams *workerParamsArray;
|
|
Me@4
|
152
|
|
kshalle@8
|
/* Parameters handed from main() to benchmark(): where to store the
 * cycle samples that bracket the whole threaded run. */
typedef struct
{ measurement_t *startExeCycles; // written by benchmark() just before thread creation
    measurement_t *endExeCycles; // copied from the barrier's end snapshot
} BenchParams __align_to_cacheline__;
|
|
Me@4
|
157
|
|
Me@4
|
158 //======================== App Code =========================
|
|
Me@4
|
/*
 * Workload
 */
|
|
msach@6
|
162
|
|
msach@6
|
/* Read the current cycle count of `core`'s perf counter into `cycles`
 * (must be an lvalue).  On a failed read() the error is reported and
 * `cycles` is zeroed.
 * NOTE(review): only nread<0 is detected -- a short (partial) read would
 * leave `cycles` partially written; confirm acceptable for this use. */
#define saveCyclesAndInstrs(core,cycles) do{ \
    int cycles_fd = cycles_counter_fd[core]; \
    int nread; \
    \
    nread = read(cycles_fd,&(cycles),sizeof(cycles)); \
    if(nread<0){ \
        perror("Error reading cycles counter"); \
        cycles = 0; \
    } \
} while (0) //macro magic for scoping
|
|
msach@14
|
173
|
|
msach@14
|
/* Read the process-wide cache-miss counter into `misses` (an lvalue).
 * On a failed read() the error is reported and `misses` is zeroed.
 * NOTE(review): short reads are not detected, same as saveCyclesAndInstrs. */
#define saveMisses(misses) do{ \
    int nread; \
    \
    nread = read(misses_counter_fd,&(misses),sizeof(misses)); \
    if(nread<0){ \
        perror("Error reading misses counter"); \
        misses = 0; \
    } \
} while (0) //macro magic for scoping
|
|
msach@6
|
183
|
|
msach@7
|
184
|
|
msach@9
|
185 double
|
|
msach@9
|
186 worker_TLF(void* _params, VirtProcr* animatingPr)
|
|
Me@5
|
187 {
|
|
msach@7
|
188 int i,o;
|
|
msach@9
|
189 WorkerParams* params = (WorkerParams*)_params;
|
|
msach@9
|
190 unsigned int totalWorkCycles = 0, totalBadCycles = 0;
|
|
msach@9
|
191 unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0;
|
|
msach@9
|
192 unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0;
|
|
kshalle@8
|
193 double workspace2=0.0;
|
|
Me@5
|
194 int32 privateMutex = VPThread__make_mutex(animatingPr);
|
|
msach@6
|
195
|
|
msach@6
|
196 int cpuid = sched_getcpu();
|
|
msach@9
|
197
|
|
msach@13
|
198 measurement_t startWorkload, endWorkload;
|
|
msach@9
|
199 uint64 numCycles;
|
|
msach@9
|
200 for(o=0; o < outer_iters; o++)
|
|
Me@4
|
201 {
|
|
msach@6
|
202
|
|
msach@10
|
203 saveCyclesAndInstrs(cpuid,startWorkload.cycles);
|
|
msach@9
|
204
|
|
msach@13
|
205 //task
|
|
msach@9
|
206 for(i=0; i < inner_iters; i++)
|
|
Me@5
|
207 {
|
|
Me@5
|
208 workspace1 += (workspace1 + 32)/2;
|
|
Me@5
|
209 workspace2 += (workspace2 + 23.2)/1.4;
|
|
Me@5
|
210 }
|
|
msach@6
|
211
|
|
msach@10
|
212 saveCyclesAndInstrs(cpuid,endWorkload.cycles);
|
|
msach@10
|
213 numCycles = endWorkload.cycles - startWorkload.cycles;
|
|
msach@9
|
214 //sanity check (400K is about 20K iters)
|
|
msach@9
|
215 if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
|
|
msach@9
|
216 else {totalBadCycles += numCycles; }
|
|
msach@9
|
217
|
|
msach@9
|
218 //mutex access often causes switch to different Slave VP
|
|
msach@9
|
219 VPThread__mutex_lock(privateMutex, animatingPr);
|
|
msach@11
|
220
|
|
msach@10
|
221 /*
|
|
msach@11
|
222 saveCyclesAndInstrs(cpuid,startWorkload2.cycles);
|
|
msach@11
|
223 //Task
|
|
msach@11
|
224 for(i=0; i < inner_iters; i++)
|
|
msach@11
|
225 {
|
|
msach@11
|
226 workspace1 += (workspace1 + 32)/2;
|
|
msach@11
|
227 workspace2 += (workspace2 + 23.2)/1.4;
|
|
msach@11
|
228 }
|
|
msach@11
|
229
|
|
msach@11
|
230 saveCyclesAndInstrs(cpuid,endWorkload2.cycles);
|
|
msach@11
|
231 numCycles = endWorkload2.cycles - startWorkload2.cycles;
|
|
msach@9
|
232 //sanity check (400K is about 20K iters)
|
|
msach@11
|
233 if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
|
|
msach@11
|
234 else {totalBadCycles += numCycles; }
|
|
msach@11
|
235
|
|
msach@10
|
236 */
|
|
msach@9
|
237 VPThread__mutex_unlock(privateMutex, animatingPr);
|
|
Me@4
|
238 }
|
|
Me@5
|
239
|
|
msach@13
|
240 params->data.totalWorkCycles = totalWorkCycles;
|
|
msach@13
|
241 params->data.totalBadCycles = totalBadCycles;
|
|
msach@13
|
242 params->data.numGoodTasks = numGoodTasks;
|
|
msach@13
|
243 params->data.totalSyncCycles = totalSyncCycles;
|
|
msach@13
|
244 params->data.totalBadSyncCycles = totalBadSyncCycles;
|
|
msach@13
|
245 params->data.numGoodSyncs = numGoodSyncs;
|
|
msach@9
|
246 /*
|
|
msach@9
|
247 params->totalSyncCycles = VMS__give_num_plugin_cycles();
|
|
msach@9
|
248 params->totalBadSyncCycles = 0;
|
|
msach@9
|
249 params->numGoodSyncs = VMS__give_num_plugin_animations();
|
|
msach@9
|
250 */
|
|
msach@6
|
251
|
|
msach@6
|
252
|
|
msach@6
|
253 //Wait for all threads to end
|
|
msach@13
|
254 barrier_wait(params->data.barrier, animatingPr);
|
|
Me@5
|
255
|
|
Me@5
|
256 //Shutdown worker
|
|
Me@5
|
257 VPThread__dissipate_thread(animatingPr);
|
|
msach@9
|
258
|
|
msach@9
|
259 //below return never reached --> there for gcc
|
|
msach@9
|
260 return (workspace1 + workspace2); //to prevent gcc from optimizing work out
|
|
Me@5
|
261 }
|
|
Me@4
|
262
|
|
msach@13
|
//local variables of benchmark, made global for alignment
struct barrier_t barr __align_to_cacheline__;  // end-of-run barrier shared by benchmark() and all workers
BenchParams *params __align_to_cacheline__;    // benchmark()'s copy of the BenchParams pointer from main()
|
|
kshalle@8
|
266
|
|
Me@4
|
/* Seed VP function: runs after the VMS runtime is set up (see main()).
 * Initializes the end barrier, samples the cycle and cache-miss counters,
 * spawns num_threads workers, waits for them all at the barrier, and
 * leaves the bracketing measurements in `params` plus the cache_misses
 * global for main() to print. */
void benchmark(void *_params, VirtProcr *animatingPr)
{
    int i;

    params = (BenchParams *)_params;

    // num_threads workers + this seed VP all meet at the barrier
    barrier_init(&barr, num_threads+1, animatingPr);

    //prepare input
    for(i=0; i<num_threads; i++)
    {
        workerParamsArray[i].data.barrier = &barr;
    }

    uint64_t cache_misses_at_start, cache_misses_at_end;
    saveMisses(cache_misses_at_start);
    //save cycles before execution of threads, to get total exe cycles
    int nread = read(cycles_counter_main_fd, &(params->startExeCycles->cycles),
        sizeof(params->startExeCycles->cycles));
    if(nread<0) perror("Error reading cycles counter");

    //create (which starts running) all threads
    for(i=0; i<num_threads; i++)
    { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
    }
    //wait for all threads to finish
    barrier_wait(&barr, animatingPr);

    //endBarrierCycles read in barrier_wait()! Merten, email me if want to chg
    params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
    saveMisses(cache_misses_at_end);
    cache_misses = cache_misses_at_end-cache_misses_at_start;
    /*
    uint64_t overallWorkCycles = 0;
    for(i=0; i<num_threads; i++){
        printf("WorkCycles: %lu\n",input[i].totalWorkCycles);
        overallWorkCycles += input[i].totalWorkCycles;
    }

    printf("Sum across threads of work cycles: %lu\n", overallWorkCycles);
    printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles);
    printf("Runtime/Workcycle Ratio %lu\n",
        ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles);
    */

    //======================================================

    VPThread__dissipate_thread(animatingPr);
}
|
|
Me@4
|
317
|
|
Me@4
|
318 int main(int argc, char **argv)
|
|
Me@4
|
319 {
|
|
Me@4
|
320 int i;
|
|
Me@4
|
321
|
|
Me@4
|
322 //set global static variables, based on cmd-line args
|
|
Me@4
|
323 for(i=1; i<argc; i++)
|
|
Me@4
|
324 {
|
|
Me@4
|
325 if(argv[i][0] == '-' && argv[i][2] == 0)
|
|
Me@4
|
326 {
|
|
Me@4
|
327 switch(argv[i][1])
|
|
Me@4
|
328 {
|
|
Me@4
|
329 case 't':
|
|
Me@4
|
330 if(!isdigit(argv[++i][0]))
|
|
Me@4
|
331 {
|
|
kshalle@8
|
332 fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n");
|
|
Me@4
|
333 return EXIT_FAILURE;
|
|
Me@4
|
334 }
|
|
Me@4
|
335 num_threads = atoi(argv[i]);
|
|
Me@4
|
336 if(!num_threads)
|
|
Me@4
|
337 {
|
|
Me@4
|
338 fprintf(stderr, "invalid number of threads specified: %d\n", num_threads);
|
|
Me@4
|
339 return EXIT_FAILURE;
|
|
Me@4
|
340 }
|
|
Me@4
|
341 break;
|
|
msach@9
|
342 case 'o':
|
|
Me@4
|
343 if(!isdigit(argv[++i][0]))
|
|
Me@4
|
344 {
|
|
msach@6
|
345 fputs("-i must be followed by a number\n", stderr);
|
|
Me@4
|
346 return EXIT_FAILURE;
|
|
Me@4
|
347 }
|
|
msach@9
|
348 outer_iters = atoi(argv[i]);
|
|
Me@4
|
349 break;
|
|
msach@9
|
350 case 'i':
|
|
Me@4
|
351 if(!isdigit(argv[++i][0]))
|
|
Me@4
|
352 {
|
|
msach@6
|
353 fputs("-o must be followed by a number (workload size)\n", stderr);
|
|
Me@4
|
354 return EXIT_FAILURE;
|
|
Me@4
|
355 }
|
|
msach@9
|
356 inner_iters = atoi(argv[i]);
|
|
Me@4
|
357 break;
|
|
Me@4
|
358 case 'h':
|
|
Me@4
|
359 fputs(usage, stdout);
|
|
Me@4
|
360 return 0;
|
|
Me@4
|
361
|
|
Me@4
|
362 default:
|
|
Me@4
|
363 fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
|
|
Me@4
|
364 fputs(usage, stderr);
|
|
Me@4
|
365 return EXIT_FAILURE;
|
|
Me@4
|
366 }//switch
|
|
Me@4
|
367 }//if arg
|
|
Me@4
|
368 else
|
|
Me@4
|
369 {
|
|
Me@4
|
370 fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
|
|
Me@4
|
371 fputs(usage, stderr);
|
|
Me@4
|
372 return EXIT_FAILURE;
|
|
Me@4
|
373 }
|
|
Me@4
|
374 }//for
|
|
msach@7
|
375
|
|
kshalle@8
|
376
|
|
msach@7
|
377 //setup performance counters
|
|
msach@7
|
378 hw_event = malloc(sizeof(struct perf_event_attr));
|
|
msach@7
|
379 memset(hw_event,0,sizeof(struct perf_event_attr));
|
|
msach@7
|
380
|
|
msach@7
|
381 hw_event->type = PERF_TYPE_HARDWARE;
|
|
msach@7
|
382 hw_event->size = sizeof(hw_event);
|
|
msach@7
|
383 hw_event->disabled = 0;
|
|
msach@7
|
384 hw_event->freq = 0;
|
|
msach@7
|
385 hw_event->inherit = 1; /* children inherit it */
|
|
msach@7
|
386 hw_event->pinned = 1; /* says this virt counter must always be on HW */
|
|
msach@7
|
387 hw_event->exclusive = 0; /* only group on PMU */
|
|
msach@7
|
388 hw_event->exclude_user = 0; /* don't count user */
|
|
msach@7
|
389 hw_event->exclude_kernel = 1; /* don't count kernel */
|
|
msach@7
|
390 hw_event->exclude_hv = 1; /* ditto hypervisor */
|
|
msach@7
|
391 hw_event->exclude_idle = 1; /* don't count when idle */
|
|
msach@7
|
392 hw_event->mmap = 0; /* include mmap data */
|
|
msach@7
|
393 hw_event->comm = 0; /* include comm data */
|
|
msach@7
|
394
|
|
msach@7
|
395 hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles
|
|
msach@7
|
396
|
|
msach@7
|
397 int cpuID, retries;
|
|
msach@7
|
398
|
|
msach@7
|
399 for( cpuID = 0; cpuID < NUM_CORES; cpuID++ )
|
|
msach@7
|
400 { retries = 0;
|
|
msach@7
|
401 do
|
|
msach@7
|
402 { retries += 1;
|
|
msach@7
|
403 cycles_counter_fd[cpuID] =
|
|
msach@7
|
404 syscall(__NR_perf_event_open, hw_event,
|
|
msach@7
|
405 0,//pid_t: 0 is "pid of calling process"
|
|
msach@7
|
406 cpuID,//int: cpu, the value returned by "CPUID" instr(?)
|
|
msach@7
|
407 -1,//int: group_fd, -1 is "leader" or independent
|
|
msach@7
|
408 0//unsigned long: flags
|
|
msach@7
|
409 );
|
|
msach@7
|
410 }
|
|
msach@7
|
411 while(cycles_counter_fd[cpuID]<0 && retries < 100);
|
|
msach@7
|
412 if(retries >= 100)
|
|
msach@7
|
413 {
|
|
msach@7
|
414 fprintf(stderr,"On core %d: ",cpuID);
|
|
msach@7
|
415 perror("Failed to open cycles counter");
|
|
msach@7
|
416 }
|
|
msach@7
|
417 }
|
|
msach@7
|
418
|
|
msach@7
|
419 //Set up counter to accumulate total cycles to process, across all CPUs
|
|
msach@7
|
420
|
|
msach@7
|
421 retries = 0;
|
|
msach@7
|
422 do
|
|
msach@7
|
423 { retries += 1;
|
|
msach@7
|
424 cycles_counter_main_fd =
|
|
msach@7
|
425 syscall(__NR_perf_event_open, hw_event,
|
|
msach@7
|
426 0,//pid_t: 0 is "pid of calling process"
|
|
msach@7
|
427 -1,//int: cpu, -1 means accumulate from all cores
|
|
msach@7
|
428 -1,//int: group_fd, -1 is "leader" == independent
|
|
msach@7
|
429 0//unsigned long: flags
|
|
msach@7
|
430 );
|
|
msach@7
|
431 }
|
|
msach@7
|
432 while(cycles_counter_main_fd<0 && retries < 100);
|
|
msach@7
|
433 if(retries >= 100)
|
|
msach@7
|
434 {
|
|
msach@7
|
435 fprintf(stderr,"in main ");
|
|
msach@7
|
436 perror("Failed to open cycles counter");
|
|
msach@7
|
437 }
|
|
kshalle@8
|
438
|
|
msach@14
|
439 //Set up counters to count cache misses
|
|
msach@14
|
440 hw_event->type = PERF_TYPE_HARDWARE;
|
|
msach@14
|
441 hw_event->config = PERF_COUNT_HW_CACHE_MISSES; //misses
|
|
msach@14
|
442
|
|
msach@14
|
443 retries = 0;
|
|
msach@14
|
444 do
|
|
msach@14
|
445 { retries += 1;
|
|
msach@14
|
446 misses_counter_fd =
|
|
msach@14
|
447 syscall(__NR_perf_event_open, hw_event,
|
|
msach@14
|
448 0,//pid_t: 0 is "pid of calling process"
|
|
msach@14
|
449 -1,//int: cpu, -1 means accumulate from all cores
|
|
msach@14
|
450 -1,//int: group_fd, -1 is "leader" == independent
|
|
msach@14
|
451 0//unsigned long: flags
|
|
msach@14
|
452 );
|
|
msach@14
|
453 }
|
|
msach@14
|
454 while(misses_counter_fd<0 && retries < 100);
|
|
msach@14
|
455 if(retries >= 100)
|
|
msach@14
|
456 {
|
|
msach@14
|
457 fprintf(stderr,"in main ");
|
|
msach@14
|
458 perror("Failed to misses counter");
|
|
msach@14
|
459 }
|
|
msach@14
|
460
|
|
msach@9
|
461 measurement_t startExeCycles, endExeCycles;
|
|
msach@9
|
462 BenchParams *benchParams;
|
|
msach@9
|
463
|
|
msach@9
|
464 benchParams = malloc(sizeof(BenchParams));
|
|
msach@9
|
465
|
|
msach@9
|
466 benchParams->startExeCycles = &startExeCycles;
|
|
msach@9
|
467 benchParams->endExeCycles = &endExeCycles;
|
|
msach@9
|
468
|
|
kshalle@8
|
469 workerParamsArray = (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) );
|
|
kshalle@8
|
470 if(workerParamsArray == NULL ) printf("error mallocing worker params array\n");
|
|
kshalle@8
|
471
|
|
msach@9
|
472
|
|
kshalle@8
|
473 //This is the transition to the VMS runtime
|
|
kshalle@8
|
474 VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );
|
|
kshalle@8
|
475
|
|
msach@9
|
476 uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
|
|
msach@9
|
477 uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
|
|
kshalle@8
|
478 for(i=0; i<num_threads; i++){
|
|
msach@13
|
479 printf("WorkCycles: %lu\n",workerParamsArray[i].data.totalWorkCycles);
|
|
msach@9
|
480 // printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
|
|
msach@9
|
481 // printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
|
|
msach@9
|
482 // printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
|
|
msach@13
|
483 totalWorkCyclesAcrossCores += workerParamsArray[i].data.totalWorkCycles;
|
|
msach@13
|
484 totalBadCyclesAcrossCores += workerParamsArray[i].data.totalBadCycles;
|
|
msach@13
|
485 totalSyncCyclesAcrossCores += workerParamsArray[i].data.totalSyncCycles;
|
|
msach@13
|
486 totalBadSyncCyclesAcrossCores += workerParamsArray[i].data.totalBadSyncCycles;
|
|
kshalle@8
|
487 }
|
|
msach@7
|
488
|
|
kshalle@8
|
489 uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
|
|
msach@9
|
490 totalExeCycles -= totalBadCyclesAcrossCores;
|
|
msach@10
|
491 uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores;
|
|
msach@10
|
492 int32 numSyncs = outer_iters * num_threads * 2;
|
|
msach@10
|
493 printf("Total Execution Cycles: %lu\n", totalExeCycles);
|
|
msach@14
|
494 printf("Total number of cache misses: %lu\n", cache_misses);
|
|
kshalle@8
|
495 printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
|
|
msach@10
|
496 printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores);
|
|
msach@10
|
497 // printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);
|
|
msach@10
|
498 printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs );
|
|
kshalle@8
|
499 printf("ExeCycles/WorkCycles Ratio %f\n",
|
|
kshalle@8
|
500 (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
|
|
Me@4
|
501 return 0;
|
|
msach@7
|
502 }
|