Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > Vthread > Vthread__Best_Effort_Msg__Bench
changeset 13:85f55731f6cd false_sharing
Padded variables to avoid false sharing in the application
| author | Merten Sach <msach@mailbox.tu-berlin.de> |
|---|---|
| date | Tue, 20 Dec 2011 15:00:07 +0100 |
| parents | 1320dd56673a |
| children | c3561dbac1dc |
| files | src/Application/main.c |
| diffstat | 1 files changed, 35 insertions(+), 35 deletions(-) [+] |
line diff
```diff
--- a/src/Application/main.c Fri Dec 16 16:40:07 2011 +0100
+++ b/src/Application/main.c Tue Dec 20 15:00:07 2011 +0100
@@ -64,8 +64,7 @@
 typedef struct perfData measurement_t;
 struct perfData{
 uint64 cycles;
- uint64 instructions;
-};
+} __align_to_cacheline__;
 
 const char *usage = {
 "Usage: malloc_test [options]\n"
@@ -85,7 +84,7 @@
 int32 cond;
 measurement_t endBarrierCycles;
 
-};
+} __align_to_cacheline__;
 typedef struct barrier_t barrier;
 
 void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr)
@@ -104,7 +103,7 @@
 barr->counter++;
 if(barr->counter == barr->nthreads)
 {
- read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \
+ read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \
 sizeof(barr->endBarrierCycles.cycles));
 
 barr->counter = 0;
@@ -119,7 +118,7 @@
 
 
 
-typedef struct
+struct WorkerParams_t
 { struct barrier_t* barrier;
 uint64_t totalWorkCycles;
 uint64_t totalBadCycles;
@@ -127,15 +126,18 @@
 uint64_t totalBadSyncCycles;
 uint64 numGoodSyncs;
 uint64 numGoodTasks;
- }
-WorkerParams;
-
+ };
+
+ typedef union
+ {
+ struct WorkerParams_t data;
+ char padding[CACHELINE_SIZE];
+ } WorkerParams __align_to_cacheline__;
 
 typedef struct
 { measurement_t *startExeCycles;
 measurement_t *endExeCycles;
- }
-BenchParams;
+ } BenchParams __align_to_cacheline__;
 
 //======================== Globals =========================
 char __ProgrammName[] = "overhead_test";
@@ -151,7 +153,7 @@
 
 //======================== App Code =========================
 /*
- * Workload
+ p* Workload
 */
 
 #define saveCyclesAndInstrs(core,cycles) do{ \
@@ -179,14 +181,14 @@
 
 int cpuid = sched_getcpu();
 
- measurement_t startWorkload, endWorkload, startWorkload2, endWorkload2;
+ measurement_t startWorkload, endWorkload;
 uint64 numCycles;
 for(o=0; o < outer_iters; o++)
 {
 
 saveCyclesAndInstrs(cpuid,startWorkload.cycles);
 
- //workltask
+ //task
 for(i=0; i < inner_iters; i++)
 {
 workspace1 += (workspace1 + 32)/2;
@@ -221,12 +223,12 @@
 VPThread__mutex_unlock(privateMutex, animatingPr);
 }
 
- params->totalWorkCycles = totalWorkCycles;
- params->totalBadCycles = totalBadCycles;
- params->numGoodTasks = numGoodTasks;
- params->totalSyncCycles = totalSyncCycles;
- params->totalBadSyncCycles = totalBadSyncCycles;
- params->numGoodSyncs = numGoodSyncs;
+ params->data.totalWorkCycles = totalWorkCycles;
+ params->data.totalBadCycles = totalBadCycles;
+ params->data.numGoodTasks = numGoodTasks;
+ params->data.totalSyncCycles = totalSyncCycles;
+ params->data.totalBadSyncCycles = totalBadSyncCycles;
+ params->data.numGoodSyncs = numGoodSyncs;
 /*
 params->totalSyncCycles = VMS__give_num_plugin_cycles();
 params->totalBadSyncCycles = 0;
@@ -235,7 +237,7 @@
 
 
 //Wait for all threads to end
- barrier_wait(params->barrier, animatingPr);
+ barrier_wait(params->data.barrier, animatingPr);
 
 //Shutdown worker
 VPThread__dissipate_thread(animatingPr);
@@ -244,14 +246,15 @@
 return (workspace1 + workspace2); //to prevent gcc from optimizing work out
 }
 
+//local variables of benchmark, made global for alignment
+struct barrier_t barr __align_to_cacheline__;
+BenchParams *params __align_to_cacheline__;
 
 /* this is run after the VMS is set up*/
 void benchmark(void *_params, VirtProcr *animatingPr)
 {
- int i, cpuID;
- struct barrier_t barr;
- BenchParams *params;
-
+ int i;
+
 params = (BenchParams *)_params;
 
 barrier_init(&barr, num_threads+1, animatingPr);
@@ -259,15 +262,12 @@
 //prepare input
 for(i=0; i<num_threads; i++)
 {
- workerParamsArray[i].barrier = &barr;
+ workerParamsArray[i].data.barrier = &barr;
 }
 
 //save cycles before execution of threads, to get total exe cycles
- measurement_t *startExeCycles, *endExeCycles;
- startExeCycles = params->startExeCycles;
-
- int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles),
- sizeof(startExeCycles->cycles));
+ int nread = read(cycles_counter_main_fd, &(params->startExeCycles->cycles),
+ sizeof(params->startExeCycles->cycles));
 if(nread<0) perror("Error reading cycles counter");
 
 //create (which starts running) all threads
@@ -438,14 +438,14 @@
 uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
 uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
 for(i=0; i<num_threads; i++){
- printf("WorkCycles: %lu\n",workerParamsArray[i].totalWorkCycles);
+ printf("WorkCycles: %lu\n",workerParamsArray[i].data.totalWorkCycles);
 // printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
 // printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
 // printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
- totalWorkCyclesAcrossCores += workerParamsArray[i].totalWorkCycles;
- totalBadCyclesAcrossCores += workerParamsArray[i].totalBadCycles;
- totalSyncCyclesAcrossCores += workerParamsArray[i].totalSyncCycles;
- totalBadSyncCyclesAcrossCores += workerParamsArray[i].totalBadSyncCycles;
+ totalWorkCyclesAcrossCores += workerParamsArray[i].data.totalWorkCycles;
+ totalBadCyclesAcrossCores += workerParamsArray[i].data.totalBadCycles;
+ totalSyncCyclesAcrossCores += workerParamsArray[i].data.totalSyncCycles;
+ totalBadSyncCyclesAcrossCores += workerParamsArray[i].data.totalBadSyncCycles;
 }
 
 uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
```
