Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > Vthread > Vthread__Best_Effort_Msg__Bench
view main.c @ 17:281cadcbb796
changed directory structure, added .hgeol file
| author | Merten Sach <msach@mailbox.tu-berlin.de> |
|---|---|
| date | Mon, 13 Feb 2012 16:12:20 +0100 |
| parents | src/Application/main.c@c3561dbac1dc |
| children | e7277df4460e |
line source
1 /*
2 *
3 */
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <string.h>
7 #include <math.h>
8 #include <ctype.h>
9 #include <errno.h>
10 #include <pthread.h>
11 #include <unistd.h>
12 #include "VPThread_lib/VPThread.h"
13 #include "VPThread_lib/VMS/Queue_impl/PrivateQueue.h"
15 #include <linux/perf_event.h>
16 #include <linux/prctl.h>
17 #include <sys/syscall.h>
19 #undef DEBUG
20 //#define DEBUG
22 #if !defined(unix) && !defined(__unix__)
23 #ifdef __MACH__
24 #define unix 1
25 #define __unix__ 1
26 #endif /* __MACH__ */
27 #endif /* unix */
29 /* find the appropriate way to define explicitly sized types */
30 /* for C99 or GNU libc (also mach's libc) we can use stdint.h */
31 #if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__)
32 #include <stdint.h>
33 #elif defined(unix) || defined(__unix__) /* some UNIX systems have them in sys/types.h */
34 #include <sys/types.h>
35 #elif defined(__WIN32__) || defined(WIN32) /* the nameless one */
36 typedef unsigned __int8 uint8_t;
37 typedef unsigned __int32 uint32_t;
38 #endif /* sized type detection */
/* provide a millisecond-resolution timer for each system */
#if defined(unix) || defined(__unix__)
#include <time.h>
#include <sys/time.h>
/* Milliseconds elapsed since the first call; the first call records the
 * baseline and returns 0.  NOTE(review): relies on static zero-init of
 * first_timeval; a first call exactly at tv_sec==0 would re-arm the
 * baseline — harmless in practice. */
unsigned long get_msec(void) {
/* the variable 'timeval' shadows the struct tag of the same name */
static struct timeval timeval, first_timeval;

gettimeofday(&timeval, 0);
if(first_timeval.tv_sec == 0) {
first_timeval = timeval;
return 0;
}
return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000;
}
#elif defined(__WIN32__) || defined(WIN32)
#include <windows.h>
/* Windows: milliseconds since system start (GetTickCount wraps at ~49 days). */
unsigned long get_msec(void) {
return GetTickCount();
}
#else
//#error "I don't know how to measure time on your platform"
#endif
//======================== Globals =========================
char __ProgrammName[] = "overhead_test";  /* NOTE(review): leading-__ names are reserved for the implementation */
char __DataSet[255];

int outer_iters, inner_iters, num_threads;  /* set from the command line in main() */
size_t chunk_size = 0;

int cycles_counter_main_fd;  /* perf fd: process-wide cycles, accumulated across all cores */
int misses_counter_fd;       /* perf fd: process-wide cache misses, all cores */

uint64_t cache_misses;       /* misses measured across the benchmark run (set in benchmark()) */

int cycles_counter_fd[NUM_CORES];  /* one per-core cycles counter; NUM_CORES from project headers */
struct perf_event_attr* hw_event;  /* shared attr struct, reused for each perf_event_open */
//======================== Defines =========================
/* One cycle-counter sample, padded/aligned to a cache line to avoid false
 * sharing (uint64 and __align_to_cacheline__ come from project headers). */
typedef struct perfData measurement_t;
struct perfData{
uint64 cycles;
} __align_to_cacheline__;
/* Command-line help text, printed for -h and on argument errors.
 * Fix: corrected "Spwans" -> "Spawns" and "internaly" -> "internally";
 * dropped the stray braces around the scalar initializer. */
const char *usage =
    "Usage: malloc_test [options]\n"
    " Spawns a number of threads and allocates memory.\n\n"
    "Options:\n"
    " -t <num> how many threads to use (default: 1). This is internally multiplied by the number of cores.\n"
    " -o <num> repeat workload and sync operation <m> times\n"
    " -i <num> size of workload, repeat <n> times\n"
    " -h this help screen\n\n";
/* Counting barrier built on VPThread mutex/cond handles.
 * The last thread to arrive also snapshots the process-wide cycle counter
 * into endBarrierCycles (see barrier_wait), so the benchmark's end time is
 * taken at the instant the barrier fills. */
struct barrier_t
{
int counter;    /* threads arrived so far in the current round */
int nthreads;   /* threads required to release the barrier */
int32 mutex;    /* VPThread mutex handle */
int32 cond;     /* VPThread condition handle, tied to mutex */
measurement_t endBarrierCycles;  /* cycle sample taken when the barrier fills */

} __align_to_cacheline__;
typedef struct barrier_t barrier;
105 void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr)
106 {
107 barr->counter = 0;
108 barr->nthreads = nthreads;
109 barr->mutex = VPThread__make_mutex(animatingPr);
110 barr->cond = VPThread__make_cond(barr->mutex, animatingPr);
111 }
113 void inline barrier_wait(barrier *barr, VirtProcr *animatingPr)
114 { int i;
116 VPThread__mutex_lock(barr->mutex, animatingPr);
117 barr->counter++;
118 if(barr->counter == barr->nthreads)
119 {
120 read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \
121 sizeof(barr->endBarrierCycles.cycles));
123 barr->counter = 0;
124 for(i=0; i < barr->nthreads; i++)
125 VPThread__cond_signal(barr->cond, animatingPr);
126 }
127 else
128 { VPThread__cond_wait(barr->cond, animatingPr);
129 }
130 VPThread__mutex_unlock(barr->mutex, animatingPr);
131 }
/* Per-thread result record, filled in by worker_TLF before it exits. */
struct WorkerParams_t
{ struct barrier_t* barrier;    /* shared end-of-run barrier, set by benchmark() */
uint64_t totalWorkCycles;       /* cycles of task rounds that passed the sanity check */
uint64_t totalBadCycles;        /* cycles of rounds rejected as outliers */
uint64_t totalSyncCycles;
uint64_t totalBadSyncCycles;
uint64 numGoodSyncs;
uint64 numGoodTasks;
};

/* Pad each record to a full cache line so worker threads writing their own
 * slot of workerParamsArray do not false-share. */
typedef union
{
struct WorkerParams_t data;
char padding[CACHELINE_SIZE];
} WorkerParams __align_to_cacheline__;

WorkerParams *workerParamsArray;  /* one slot per worker, allocated in main() */
/* Pointers to the start/end cycle samples (both live on main's stack);
 * passed to benchmark() as the seed VP's parameter block. */
typedef struct
{ measurement_t *startExeCycles;
measurement_t *endExeCycles;
} BenchParams __align_to_cacheline__;
158 //======================== App Code =========================
159 /*
 * Workload
161 */
/* Read the per-core cycle counter for 'core' into 'cycles' (an lvalue).
 * On read failure, report via perror and zero 'cycles'.  The do/while(0)
 * wrapper gives the temporaries their own scope and makes the macro a
 * single statement. */
#define saveCyclesAndInstrs(core,cycles) do{ \
int cycles_fd = cycles_counter_fd[core]; \
int nread; \
\
nread = read(cycles_fd,&(cycles),sizeof(cycles)); \
if(nread<0){ \
perror("Error reading cycles counter"); \
cycles = 0; \
} \
} while (0) //macro magic for scoping
/* Read the process-wide cache-miss counter into 'misses' (an lvalue).
 * On read failure, report via perror and zero 'misses'. */
#define saveMisses(misses) do{ \
int nread; \
\
nread = read(misses_counter_fd,&(misses),sizeof(misses)); \
if(nread<0){ \
perror("Error reading misses counter"); \
misses = 0; \
} \
} while (0) //macro magic for scoping
185 double
186 worker_TLF(void* _params, VirtProcr* animatingPr)
187 {
188 int i,o;
189 WorkerParams* params = (WorkerParams*)_params;
190 unsigned int totalWorkCycles = 0, totalBadCycles = 0;
191 unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0;
192 unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0;
193 double workspace2=0.0;
194 int32 privateMutex = VPThread__make_mutex(animatingPr);
196 int cpuid = sched_getcpu();
198 measurement_t startWorkload, endWorkload;
199 uint64 numCycles;
200 for(o=0; o < outer_iters; o++)
201 {
203 saveCyclesAndInstrs(cpuid,startWorkload.cycles);
205 //task
206 for(i=0; i < inner_iters; i++)
207 {
208 workspace1 += (workspace1 + 32)/2;
209 workspace2 += (workspace2 + 23.2)/1.4;
210 }
212 saveCyclesAndInstrs(cpuid,endWorkload.cycles);
213 numCycles = endWorkload.cycles - startWorkload.cycles;
214 //sanity check (400K is about 20K iters)
215 if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
216 else {totalBadCycles += numCycles; }
218 //mutex access often causes switch to different Slave VP
219 VPThread__mutex_lock(privateMutex, animatingPr);
221 /*
222 saveCyclesAndInstrs(cpuid,startWorkload2.cycles);
223 //Task
224 for(i=0; i < inner_iters; i++)
225 {
226 workspace1 += (workspace1 + 32)/2;
227 workspace2 += (workspace2 + 23.2)/1.4;
228 }
230 saveCyclesAndInstrs(cpuid,endWorkload2.cycles);
231 numCycles = endWorkload2.cycles - startWorkload2.cycles;
232 //sanity check (400K is about 20K iters)
233 if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
234 else {totalBadCycles += numCycles; }
236 */
237 VPThread__mutex_unlock(privateMutex, animatingPr);
238 }
240 params->data.totalWorkCycles = totalWorkCycles;
241 params->data.totalBadCycles = totalBadCycles;
242 params->data.numGoodTasks = numGoodTasks;
243 params->data.totalSyncCycles = totalSyncCycles;
244 params->data.totalBadSyncCycles = totalBadSyncCycles;
245 params->data.numGoodSyncs = numGoodSyncs;
246 /*
247 params->totalSyncCycles = VMS__give_num_plugin_cycles();
248 params->totalBadSyncCycles = 0;
249 params->numGoodSyncs = VMS__give_num_plugin_animations();
250 */
253 //Wait for all threads to end
254 barrier_wait(params->data.barrier, animatingPr);
256 //Shutdown worker
257 VPThread__dissipate_thread(animatingPr);
259 //below return never reached --> there for gcc
260 return (workspace1 + workspace2); //to prevent gcc from optimizing work out
261 }
//local variables of benchmark, made global for alignment
struct barrier_t barr __align_to_cacheline__;  /* end-of-run barrier: num_threads workers + the benchmark VP */
BenchParams *params __align_to_cacheline__;    /* NOTE(review): aligns the pointer variable itself, not the pointee */
/* this is run after the VMS is set up*/
/* Seed VP body: wires every worker's slot to the shared barrier, samples the
 * process-wide cycle and cache-miss counters, spawns num_threads workers,
 * then waits on the barrier (sized num_threads+1 so this VP participates).
 * The end-of-run cycle sample is taken inside barrier_wait by the last
 * arriver and copied out of barr.endBarrierCycles here; cache_misses (a
 * global read by main after the runtime returns) gets the miss delta. */
void benchmark(void *_params, VirtProcr *animatingPr)
{
int i;

params = (BenchParams *)_params;

barrier_init(&barr, num_threads+1, animatingPr);

//prepare input
for(i=0; i<num_threads; i++)
{
workerParamsArray[i].data.barrier = &barr;
}

uint64_t cache_misses_at_start, cache_misses_at_end;
saveMisses(cache_misses_at_start);
//save cycles before execution of threads, to get total exe cycles
int nread = read(cycles_counter_main_fd, &(params->startExeCycles->cycles),
sizeof(params->startExeCycles->cycles));
if(nread<0) perror("Error reading cycles counter");

//create (which starts running) all threads
for(i=0; i<num_threads; i++)
{ VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
}
//wait for all threads to finish
barrier_wait(&barr, animatingPr);

//endBarrierCycles read in barrier_wait()! Merten, email me if want to chg
params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
saveMisses(cache_misses_at_end);
cache_misses = cache_misses_at_end-cache_misses_at_start;
/*
uint64_t overallWorkCycles = 0;
for(i=0; i<num_threads; i++){
printf("WorkCycles: %lu\n",input[i].totalWorkCycles);
overallWorkCycles += input[i].totalWorkCycles;
}

printf("Sum across threads of work cycles: %lu\n", overallWorkCycles);
printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles);
printf("Runtime/Workcycle Ratio %lu\n",
((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles);
*/

//======================================================
VPThread__dissipate_thread(animatingPr);
}
318 int main(int argc, char **argv)
319 {
320 int i;
322 //set global static variables, based on cmd-line args
323 for(i=1; i<argc; i++)
324 {
325 if(argv[i][0] == '-' && argv[i][2] == 0)
326 {
327 switch(argv[i][1])
328 {
329 case 't':
330 if(!isdigit(argv[++i][0]))
331 {
332 fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n");
333 return EXIT_FAILURE;
334 }
335 num_threads = atoi(argv[i]);
336 if(!num_threads)
337 {
338 fprintf(stderr, "invalid number of threads specified: %d\n", num_threads);
339 return EXIT_FAILURE;
340 }
341 break;
342 case 'o':
343 if(!isdigit(argv[++i][0]))
344 {
345 fputs("-i must be followed by a number\n", stderr);
346 return EXIT_FAILURE;
347 }
348 outer_iters = atoi(argv[i]);
349 break;
350 case 'i':
351 if(!isdigit(argv[++i][0]))
352 {
353 fputs("-o must be followed by a number (workload size)\n", stderr);
354 return EXIT_FAILURE;
355 }
356 inner_iters = atoi(argv[i]);
357 break;
358 case 'h':
359 fputs(usage, stdout);
360 return 0;
362 default:
363 fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
364 fputs(usage, stderr);
365 return EXIT_FAILURE;
366 }//switch
367 }//if arg
368 else
369 {
370 fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
371 fputs(usage, stderr);
372 return EXIT_FAILURE;
373 }
374 }//for
377 //setup performance counters
378 hw_event = malloc(sizeof(struct perf_event_attr));
379 memset(hw_event,0,sizeof(struct perf_event_attr));
381 hw_event->type = PERF_TYPE_HARDWARE;
382 hw_event->size = sizeof(hw_event);
383 hw_event->disabled = 0;
384 hw_event->freq = 0;
385 hw_event->inherit = 1; /* children inherit it */
386 hw_event->pinned = 1; /* says this virt counter must always be on HW */
387 hw_event->exclusive = 0; /* only group on PMU */
388 hw_event->exclude_user = 0; /* don't count user */
389 hw_event->exclude_kernel = 1; /* don't count kernel */
390 hw_event->exclude_hv = 1; /* ditto hypervisor */
391 hw_event->exclude_idle = 1; /* don't count when idle */
392 hw_event->mmap = 0; /* include mmap data */
393 hw_event->comm = 0; /* include comm data */
395 hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles
397 int cpuID, retries;
399 for( cpuID = 0; cpuID < NUM_CORES; cpuID++ )
400 { retries = 0;
401 do
402 { retries += 1;
403 cycles_counter_fd[cpuID] =
404 syscall(__NR_perf_event_open, hw_event,
405 0,//pid_t: 0 is "pid of calling process"
406 cpuID,//int: cpu, the value returned by "CPUID" instr(?)
407 -1,//int: group_fd, -1 is "leader" or independent
408 0//unsigned long: flags
409 );
410 }
411 while(cycles_counter_fd[cpuID]<0 && retries < 100);
412 if(retries >= 100)
413 {
414 fprintf(stderr,"On core %d: ",cpuID);
415 perror("Failed to open cycles counter");
416 }
417 }
419 //Set up counter to accumulate total cycles to process, across all CPUs
421 retries = 0;
422 do
423 { retries += 1;
424 cycles_counter_main_fd =
425 syscall(__NR_perf_event_open, hw_event,
426 0,//pid_t: 0 is "pid of calling process"
427 -1,//int: cpu, -1 means accumulate from all cores
428 -1,//int: group_fd, -1 is "leader" == independent
429 0//unsigned long: flags
430 );
431 }
432 while(cycles_counter_main_fd<0 && retries < 100);
433 if(retries >= 100)
434 {
435 fprintf(stderr,"in main ");
436 perror("Failed to open cycles counter");
437 }
439 //Set up counters to count cache misses
440 hw_event->type = PERF_TYPE_HARDWARE;
441 hw_event->config = PERF_COUNT_HW_CACHE_MISSES; //misses
443 retries = 0;
444 do
445 { retries += 1;
446 misses_counter_fd =
447 syscall(__NR_perf_event_open, hw_event,
448 0,//pid_t: 0 is "pid of calling process"
449 -1,//int: cpu, -1 means accumulate from all cores
450 -1,//int: group_fd, -1 is "leader" == independent
451 0//unsigned long: flags
452 );
453 }
454 while(misses_counter_fd<0 && retries < 100);
455 if(retries >= 100)
456 {
457 fprintf(stderr,"in main ");
458 perror("Failed to misses counter");
459 }
461 measurement_t startExeCycles, endExeCycles;
462 BenchParams *benchParams;
464 benchParams = malloc(sizeof(BenchParams));
466 benchParams->startExeCycles = &startExeCycles;
467 benchParams->endExeCycles = &endExeCycles;
469 workerParamsArray = (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) );
470 if(workerParamsArray == NULL ) printf("error mallocing worker params array\n");
473 //This is the transition to the VMS runtime
474 VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );
476 uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
477 uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
478 for(i=0; i<num_threads; i++){
479 printf("WorkCycles: %lu\n",workerParamsArray[i].data.totalWorkCycles);
480 // printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
481 // printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
482 // printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
483 totalWorkCyclesAcrossCores += workerParamsArray[i].data.totalWorkCycles;
484 totalBadCyclesAcrossCores += workerParamsArray[i].data.totalBadCycles;
485 totalSyncCyclesAcrossCores += workerParamsArray[i].data.totalSyncCycles;
486 totalBadSyncCyclesAcrossCores += workerParamsArray[i].data.totalBadSyncCycles;
487 }
489 uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
490 totalExeCycles -= totalBadCyclesAcrossCores;
491 uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores;
492 int32 numSyncs = outer_iters * num_threads * 2;
493 printf("Total Execution Cycles: %lu\n", totalExeCycles);
494 printf("Total number of cache misses: %lu\n", cache_misses);
495 printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
496 printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores);
497 // printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);
498 printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs );
499 printf("ExeCycles/WorkCycles Ratio %f\n",
500 (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
501 return 0;
502 }
