PR/Applications/Vthread/Vthread__Best_Effort_Msg__Bench
changeset 9:5d3b5e58456e
Working version, uses TSC for inner loop -- appears more stable
| author | Merten Sach <msach@mailbox.tu-berlin.de> |
|---|---|
| date | Wed, 07 Dec 2011 06:17:46 +0100 |
| parents | b2a84bc2b274 |
| children | 662089f010bb |
| files | src/Application/main.c |
| diffstat | 1 files changed, 92 insertions(+), 63 deletions(-) |
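
This revision switches the inner-loop timing from per-iteration reads of a perf-events cycle-counter fd (`saveCyclesAndInstrs`, a `read()` on the `cycles_counter_fd[]` descriptors) to direct time-stamp-counter reads (`saveTSCLowHigh`). Reading the TSC stays entirely in user space, so it is far cheaper than a counter-fd `read()`, which is consistent with the "appears more stable" in the commit message. `TSCountLowHigh` and `saveTSCLowHigh` are defined elsewhere in the repo, not in this changeset; the following is only a minimal sketch of what an `rdtsc`-based version could look like, assuming a union whose 64-bit `.longVal` overlays the two 32-bit halves that `rdtsc` returns (the `half` member name and the `uint64` typedef are guesses):

```c
#include <stdint.h>

typedef uint64_t uint64;   /* assumption: mirrors the repo's uint64 typedef */

/* Hypothetical reconstruction -- rdtsc leaves the low 32 bits of the
 * time-stamp counter in EAX and the high 32 bits in EDX; on (little-
 * endian) x86 the union lets the pair be read back as the single
 * 64-bit .longVal that the diff below subtracts. */
typedef union
 { struct { uint32_t low, high; } half;
   uint64 longVal;
 }
TSCountLowHigh;

#define saveTSCLowHigh( ts ) \
   __asm__ volatile( "rdtsc" : "=a" ((ts).half.low), "=d" ((ts).half.high) )
```

(`rdtsc` is not a serializing instruction; benchmarks that need stricter ordering pair it with `cpuid` or use `rdtscp`, but nothing in this diff indicates the repo does so.)
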
line diff
```diff
--- a/src/Application/main.c	Mon Nov 28 23:58:58 2011 +0100
+++ b/src/Application/main.c	Wed Dec 07 06:17:46 2011 +0100
@@ -119,12 +119,17 @@
 
 
 
-struct WorkerParamsStr{
-   struct barrier_t* barrier;
-   uint64_t totalWorkCycles;
-};
+typedef struct
+ { struct barrier_t* barrier;
+   uint64_t totalWorkCycles;
+   uint64_t totalBadCycles;
+   uint64_t totalSyncCycles;
+   uint64_t totalBadSyncCycles;
+   uint64   numGoodSyncs;
+   uint64   numGoodTasks;
+ }
+WorkerParams;
 
-typedef struct WorkerParamsStr WorkerParams;
 
 typedef struct
  { measurement_t *startExeCycles;
@@ -136,7 +141,7 @@
 char __ProgrammName[] = "overhead_test";
 char __DataSet[255];
 
-int repetitions, workload_size, num_threads;
+int outer_iters, inner_iters, num_threads;
 size_t chunk_size = 0;
 
 int cycles_counter_fd[NUM_CORES];
@@ -161,47 +166,72 @@
    } while (0) //macro magic for scoping
 
 
-void worker_TLF(void* _params, VirtProcr* animatingPr)
+double
+worker_TLF(void* _params, VirtProcr* animatingPr)
 {
    int i,o;
-   WorkerParams* params = (struct WorkerParamsStr*)_params;
-   unsigned int totalWorkCycles = 0;
-   unsigned int workspace1=0;
+   WorkerParams* params = (WorkerParams*)_params;
+   unsigned int totalWorkCycles = 0, totalBadCycles = 0;
+   unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0;
+   unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0;
   double workspace2=0.0;
   int32 privateMutex = VPThread__make_mutex(animatingPr);
 
   int cpuid = sched_getcpu();
-
-   for(o=0; o<repetitions; o++)
+
+   measurement_t startWorkload, endWorkload;
+   uint64 numCycles;
+   TSCountLowHigh startTask, endTask, endSync1, endSync2;
+   for(o=0; o < outer_iters; o++)
    {
 
-      //measure inner workload to determine synchronisation overhead by subtraction
-      measurement_t startWorkload, endWorkload;
-      saveCyclesAndInstrs(cpuid,startWorkload.cycles);
-
+//    saveCyclesAndInstrs(cpuid,startWorkload.cycles);
+      saveTSCLowHigh(startTask);
+
      //workload
-      for(i=0; i<workload_size; i++)
+      for(i=0; i < inner_iters; i++)
      {
         workspace1 += (workspace1 + 32)/2;
         workspace2 += (workspace2 + 23.2)/1.4;
      }
 
-
-      //measure end of inner workload
-      saveCyclesAndInstrs(cpuid,endWorkload.cycles);
-      uint64 numCycles = endWorkload.cycles - startWorkload.cycles;
+      saveTSCLowHigh(endTask);
+      numCycles = endTask.longVal - startTask.longVal;
+//    saveCyclesAndInstrs(cpuid,endWorkload.cycles);
+//    numCycles = endWorkload.cycles - startWorkload.cycles;
+
+      //sanity check (400K is about 20K iters)
+      if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
+      else                     {totalBadCycles  += numCycles; }
+
+      //mutex access often causes switch to different Slave VP
+      VPThread__mutex_lock(privateMutex, animatingPr);
+      saveTSCLowHigh(endSync1);
+      numCycles = endSync1.longVal - endTask.longVal;
+      //sanity check (400K is about 20K iters)
+      if( numCycles < 400000 ) {totalSyncCycles += numCycles; numGoodSyncs++;}
+      else totalBadSyncCycles += numCycles;
 
-
-      if( numCycles < 4000000 ) //sanity check (4M is about 200K iters)
-         totalWorkCycles += numCycles;
+      VPThread__mutex_unlock(privateMutex, animatingPr);
+      saveTSCLowHigh(endSync2);
+      numCycles = endSync2.longVal - endSync1.longVal;
+      //sanity check (400K is about 20K iters)
+      if( numCycles < 400000 ) {totalSyncCycles += numCycles; numGoodSyncs++;}
+      else totalBadSyncCycles += numCycles;
 
-      //mutex access causes switch to different Slave VP
-      VPThread__mutex_lock(privateMutex, animatingPr);
-      VPThread__mutex_unlock(privateMutex, animatingPr);
    }
 
   params->totalWorkCycles = totalWorkCycles;
-   //printf("Cycles: %lu on CPU %lu\n", totalWorkCycles, cpuid);
+   params->totalBadCycles = totalBadCycles;
+   params->numGoodTasks = numGoodTasks;
+   params->totalSyncCycles = totalSyncCycles;
+   params->totalBadSyncCycles = totalBadSyncCycles;
+   params->numGoodSyncs = numGoodSyncs;
+/*
+   params->totalSyncCycles = VMS__give_num_plugin_cycles();
+   params->totalBadSyncCycles = 0;
+   params->numGoodSyncs = VMS__give_num_plugin_animations();
+*/
 
 
    //Wait for all threads to end
@@ -209,9 +239,9 @@
 
    //Shutdown worker
    VPThread__dissipate_thread(animatingPr);
-   //below printfs never reached --> there for gcc
-   printf("%d", workspace1); //This is to prevent gcc from optimizing out the
-   printf("%f", workspace2); //two workspace variables
+
+   //below return never reached --> there for gcc
+   return (workspace1 + workspace2); //to prevent gcc from optimizing work out
 }
 
 
@@ -232,33 +262,22 @@
       workerParamsArray[i].barrier = &barr;
    }
 
-   //printf("just before first counter read, inside benchmark\n");
-
    //save cycles before execution of threads, to get total exe cycles
    measurement_t *startExeCycles, *endExeCycles;
    startExeCycles = params->startExeCycles;
-   //endExeCycles = params->endExeCycles;
-
-   //printf("finished first counter read, inside benchmark\n");
-   //create all threads
-   for(i=0; i<num_threads; i++)
-    { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
-    }
 
    int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles),
                     sizeof(startExeCycles->cycles));
-   if(nread<0)
-    { perror("Error reading cycles counter");
+   if(nread<0) perror("Error reading cycles counter");
+
+   //create (which starts running) all threads
+   for(i=0; i<num_threads; i++)
+    { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
    }
   //wait for all threads to finish
   barrier_wait(&barr, animatingPr);
 
-
-   //accumulated cycles of all cores
-   // nread = read(cycles_counter_main_fd, &(endExeCycles->cycles), \
-                   sizeof(endExeCycles->cycles));
-   // if(nread<0){
-   //    perror("Error reading cycles counter");
+   //endBarrierCycles read in barrier_wait()!  Merten, email me if want to chg
   params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
 
 
@@ -283,10 +302,6 @@
 int main(int argc, char **argv)
 {
    int i;
-   measurement_t startExeCycles, endExeCycles;
-   BenchParams *benchParams;
-
-   benchParams = malloc(sizeof(BenchParams));
 
    //set global static variables, based on cmd-line args
    for(i=1; i<argc; i++)
@@ -309,21 +324,21 @@
          }
          num_threads *= NUM_CORES;
         break;
-      case 'i':
+      case 'o':
        if(!isdigit(argv[++i][0]))
        {
           fputs("-i must be followed by a number\n", stderr);
           return EXIT_FAILURE;
        }
-        repetitions = atoi(argv[i]);
+        outer_iters = atoi(argv[i]);
        break;
-      case 'o':
+      case 'i':
        if(!isdigit(argv[++i][0]))
        {
           fputs("-o must be followed by a number (workload size)\n", stderr);
           return EXIT_FAILURE;
        }
-        workload_size = atoi(argv[i]);
+        inner_iters = atoi(argv[i]);
        break;
      case 'h':
        fputs(usage, stdout);
@@ -406,27 +421,41 @@
       perror("Failed to open cycles counter");
    }
 
-   //printf("counters now set up\n");
+   measurement_t startExeCycles, endExeCycles;
+   BenchParams *benchParams;
+
+   benchParams = malloc(sizeof(BenchParams));
+
+   benchParams->startExeCycles = &startExeCycles;
+   benchParams->endExeCycles = &endExeCycles;
+
   workerParamsArray = (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) );
   if(workerParamsArray == NULL ) printf("error mallocing worker params array\n");
 
-   workerParamsArray[0].totalWorkCycles = 0;
-
-   benchParams->startExeCycles = &startExeCycles;
-   benchParams->endExeCycles = &endExeCycles;
-
+
   //This is the transition to the VMS runtime
   VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );
 
-   uint64_t totalWorkCyclesAcrossCores = 0;
+   uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
+   uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
   for(i=0; i<num_threads; i++){
     printf("WorkCycles: %lu\n",workerParamsArray[i].totalWorkCycles);
+//  printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
+//  printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
+//  printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
     totalWorkCyclesAcrossCores += workerParamsArray[i].totalWorkCycles;
+    totalBadCyclesAcrossCores += workerParamsArray[i].totalBadCycles;
+    totalSyncCyclesAcrossCores += workerParamsArray[i].totalSyncCycles;
+    totalBadSyncCyclesAcrossCores += workerParamsArray[i].totalBadSyncCycles;
  }
 
  uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
+ totalExeCycles -= totalBadCyclesAcrossCores;
+
  printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
 printf("Total Execution Cycles: %lu\n", totalExeCycles);
+ printf("Sum across threads of Sync cycles: %lu\n", totalSyncCyclesAcrossCores);
+ printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);
 printf("ExeCycles/WorkCycles Ratio %f\n",
        (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
```
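
The new cutoff's comment, "400K is about 20K iters", matches the arithmetic of the two-statement workload body costing roughly 20 cycles per iteration: 20,000 iterations × ~20 cycles ≈ 400,000 cycles (the old 4M/200K-iteration cutoff implied the same per-iteration cost). Samples above the cutoff are taken to include a preemption or Slave-VP switch and accumulated separately as "bad" cycles, which `main()` now subtracts from the total execution cycles before printing the ratio. Note also that the swapped `case 'i'`/`case 'o'` handlers kept each other's error strings, so a malformed `-o` still reports "-i must be followed by a number" and vice versa.

The total-execution measurement still comes from `read()` on `cycles_counter_main_fd`, which is the usage pattern of a perf-events counter fd. The counter setup is outside this changeset, so the helper below is only a hedged sketch of how such an fd could be opened; the `open_cycles_counter` name and the attribute choices are assumptions, and `perf_event_open` has no glibc wrapper, hence the raw syscall:

```c
#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Hypothetical helper -- opens a hardware cycles counter for the
 * calling process, counted only while it runs on the given cpu.
 * The resulting fd yields one 64-bit count per read(). */
static int
open_cycles_counter( int cpu )
{
   struct perf_event_attr attr;
   memset( &attr, 0, sizeof(attr) );
   attr.type   = PERF_TYPE_HARDWARE;
   attr.size   = sizeof(attr);
   attr.config = PERF_COUNT_HW_CPU_CYCLES;
   attr.disabled = 0;   /* start counting immediately */

   return syscall( __NR_perf_event_open, &attr, 0, cpu, -1, 0 );
}
```

A counter opened this way returns a single `uint64` per `read()`, which is exactly the shape of `read(cycles_counter_main_fd, &(startExeCycles->cycles), sizeof(startExeCycles->cycles))` in the diff above.
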
