changeset 9:5d3b5e58456e

Working version; uses the TSC to time the inner loop -- appears more stable
author Merten Sach <msach@mailbox.tu-berlin.de>
date Wed, 07 Dec 2011 06:17:46 +0100
parents b2a84bc2b274
children 662089f010bb
files src/Application/main.c
diffstat 1 files changed, 92 insertions(+), 63 deletions(-) [+]
line diff
     1.1 --- a/src/Application/main.c	Mon Nov 28 23:58:58 2011 +0100
     1.2 +++ b/src/Application/main.c	Wed Dec 07 06:17:46 2011 +0100
     1.3 @@ -119,12 +119,17 @@
     1.4  
     1.5  
     1.6  
     1.7 -struct WorkerParamsStr{
     1.8 -    struct barrier_t* barrier;
     1.9 -    uint64_t  totalWorkCycles;
    1.10 -};
    1.11 +typedef struct
    1.12 + { struct barrier_t* barrier;
    1.13 +   uint64_t  totalWorkCycles;
    1.14 +   uint64_t  totalBadCycles;
    1.15 +   uint64_t  totalSyncCycles;
    1.16 +   uint64_t  totalBadSyncCycles;
    1.17 +   uint64     numGoodSyncs;
    1.18 +   uint64     numGoodTasks;
    1.19 + }
    1.20 +WorkerParams;
    1.21  
    1.22 -typedef struct WorkerParamsStr WorkerParams;
    1.23  
    1.24  typedef struct
    1.25   { measurement_t *startExeCycles;
    1.26 @@ -136,7 +141,7 @@
    1.27  char __ProgrammName[] = "overhead_test";
    1.28  char __DataSet[255];
    1.29  
    1.30 -int repetitions, workload_size, num_threads;
    1.31 +int outer_iters, inner_iters, num_threads;
    1.32  size_t chunk_size = 0;
    1.33  
    1.34  int cycles_counter_fd[NUM_CORES];
    1.35 @@ -161,47 +166,72 @@
    1.36  } while (0) //macro magic for scoping
    1.37  
    1.38  
    1.39 -void worker_TLF(void* _params, VirtProcr* animatingPr)
    1.40 +double
    1.41 +worker_TLF(void* _params, VirtProcr* animatingPr)
    1.42   {
    1.43     int i,o;
    1.44 -   WorkerParams* params = (struct WorkerParamsStr*)_params;
    1.45 -   unsigned int totalWorkCycles = 0;
    1.46 -   unsigned int workspace1=0;
    1.47 +   WorkerParams* params = (WorkerParams*)_params;
    1.48 +   unsigned int totalWorkCycles = 0, totalBadCycles = 0;
    1.49 +   unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0;
    1.50 +   unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0;
    1.51     double workspace2=0.0;
    1.52     int32 privateMutex = VPThread__make_mutex(animatingPr);
    1.53     
    1.54     int cpuid = sched_getcpu();
    1.55 -    
    1.56 -   for(o=0; o<repetitions; o++)
    1.57 +   
    1.58 +   measurement_t startWorkload, endWorkload;
    1.59 +   uint64 numCycles;
    1.60 +   TSCountLowHigh startTask, endTask, endSync1, endSync2;
    1.61 +   for(o=0; o < outer_iters; o++)
    1.62      {
    1.63         
    1.64 -      //measure inner workload to determine synchronisation overhead by subtraction
    1.65 -      measurement_t startWorkload, endWorkload;
    1.66 -      saveCyclesAndInstrs(cpuid,startWorkload.cycles);
    1.67 -
    1.68 +//          saveCyclesAndInstrs(cpuid,startWorkload.cycles);
    1.69 +          saveTSCLowHigh(startTask);
    1.70 +       
    1.71        //workload
    1.72 -      for(i=0; i<workload_size; i++)
    1.73 +      for(i=0; i < inner_iters; i++)
    1.74         {
    1.75           workspace1 += (workspace1 + 32)/2;
    1.76           workspace2 += (workspace2 + 23.2)/1.4;
    1.77         }
    1.78        
    1.79 -    
    1.80 -      //measure end of inner workload
    1.81 -      saveCyclesAndInstrs(cpuid,endWorkload.cycles);
    1.82 -      uint64 numCycles = endWorkload.cycles - startWorkload.cycles;
    1.83 +          saveTSCLowHigh(endTask);
    1.84 +          numCycles = endTask.longVal - startTask.longVal;
    1.85 +//          saveCyclesAndInstrs(cpuid,endWorkload.cycles);
    1.86 +//          numCycles = endWorkload.cycles - startWorkload.cycles;
    1.87 +
    1.88 +          //sanity check (400K is about 20K iters)
    1.89 +          if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
    1.90 +          else                     {totalBadCycles  += numCycles; }
    1.91 +
    1.92 +      //mutex access often causes switch to different Slave VP
    1.93 +      VPThread__mutex_lock(privateMutex, animatingPr);
    1.94 +          saveTSCLowHigh(endSync1);
    1.95 +          numCycles = endSync1.longVal - endTask.longVal;
    1.96 +          //sanity check (400K is about 20K iters)
    1.97 +          if( numCycles < 400000 ) {totalSyncCycles += numCycles; numGoodSyncs++;}
    1.98 +          else                     totalBadSyncCycles  += numCycles;
    1.99        
   1.100 -      
   1.101 -     if( numCycles < 4000000 ) //sanity check (4M is about 200K iters)
   1.102 -         totalWorkCycles += numCycles;
   1.103 +      VPThread__mutex_unlock(privateMutex, animatingPr);
   1.104 +          saveTSCLowHigh(endSync2);
   1.105 +          numCycles = endSync2.longVal - endSync1.longVal;
   1.106 +          //sanity check (400K is about 20K iters)
   1.107 +          if( numCycles < 400000 ) {totalSyncCycles += numCycles; numGoodSyncs++;}
   1.108 +          else                     totalBadSyncCycles  += numCycles;
   1.109  
   1.110 -      //mutex access causes switch to different Slave VP
   1.111 -      VPThread__mutex_lock(privateMutex, animatingPr);
   1.112 -      VPThread__mutex_unlock(privateMutex, animatingPr);
   1.113      }
   1.114  
   1.115     params->totalWorkCycles = totalWorkCycles;
   1.116 -   //printf("Cycles: %lu on CPU %lu\n", totalWorkCycles, cpuid);
   1.117 +   params->totalBadCycles = totalBadCycles;
   1.118 +   params->numGoodTasks   = numGoodTasks;
   1.119 +   params->totalSyncCycles = totalSyncCycles;
   1.120 +   params->totalBadSyncCycles = totalBadSyncCycles;
   1.121 +   params->numGoodSyncs = numGoodSyncs;
   1.122 +/*
   1.123 +   params->totalSyncCycles = VMS__give_num_plugin_cycles();
   1.124 +   params->totalBadSyncCycles = 0;
   1.125 +   params->numGoodSyncs = VMS__give_num_plugin_animations();
   1.126 +*/
   1.127     
   1.128     
   1.129     //Wait for all threads to end
   1.130 @@ -209,9 +239,9 @@
   1.131     
   1.132     //Shutdown worker
   1.133     VPThread__dissipate_thread(animatingPr);
   1.134 -     //below printfs never reached --> there for gcc
   1.135 -   printf("%d", workspace1);  //This is to prevent gcc from optimizing out the
   1.136 -   printf("%f", workspace2);  //two workspace variables
   1.137 +   
   1.138 +     //below return never reached --> there for gcc
   1.139 +   return (workspace1 + workspace2);  //to prevent gcc from optimizing work out
   1.140   }
   1.141  
   1.142  
   1.143 @@ -232,33 +262,22 @@
   1.144         workerParamsArray[i].barrier = &barr;
   1.145      }
   1.146       
   1.147 -         //printf("just before first counter read, inside benchmark\n");
   1.148 -  
   1.149     //save cycles before execution of threads, to get total exe cycles
   1.150     measurement_t *startExeCycles, *endExeCycles;
   1.151     startExeCycles = params->startExeCycles;
   1.152 -   //endExeCycles   = params->endExeCycles;
   1.153 -   
   1.154 -    //printf("finished first counter read, inside benchmark\n");
   1.155 -   //create all threads
   1.156 -   for(i=0; i<num_threads; i++)
   1.157 -    { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
   1.158 -    }
   1.159     
   1.160     int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles),
   1.161                  sizeof(startExeCycles->cycles));
   1.162 -   if(nread<0)
   1.163 -    { perror("Error reading cycles counter");
   1.164 +   if(nread<0) perror("Error reading cycles counter");
   1.165 +   
   1.166 +   //create (which starts running) all threads
   1.167 +   for(i=0; i<num_threads; i++)
   1.168 +    { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
   1.169      }
   1.170     //wait for all threads to finish
   1.171     barrier_wait(&barr, animatingPr);
   1.172    
   1.173 -   
   1.174 -   //accumulated cycles of all cores
   1.175 - //  nread = read(cycles_counter_main_fd, &(endExeCycles->cycles), \
   1.176 -                sizeof(endExeCycles->cycles));
   1.177 - //  if(nread<0){                                         
   1.178 - //      perror("Error reading cycles counter");
   1.179 +   //endBarrierCycles read in barrier_wait()!  Merten, email me if want to chg
   1.180     params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
   1.181     
   1.182  
   1.183 @@ -283,10 +302,6 @@
   1.184  int main(int argc, char **argv)
   1.185   {
   1.186     int i;
   1.187 -   measurement_t startExeCycles, endExeCycles;
   1.188 -   BenchParams *benchParams;
   1.189 -   
   1.190 -   benchParams = malloc(sizeof(BenchParams)); 
   1.191  
   1.192     //set global static variables, based on cmd-line args
   1.193     for(i=1; i<argc; i++)
   1.194 @@ -309,21 +324,21 @@
   1.195                  }
   1.196                 num_threads *= NUM_CORES;
   1.197              break;
   1.198 -            case 'i':
   1.199 +            case 'o':
   1.200                 if(!isdigit(argv[++i][0]))
   1.201                  {
   1.202                    fputs("-i must be followed by a number\n", stderr);
   1.203                    return EXIT_FAILURE;
   1.204                  }
   1.205 -               repetitions = atoi(argv[i]);
   1.206 +               outer_iters = atoi(argv[i]);
   1.207  				break;
   1.208 -            case 'o':
   1.209 +            case 'i':
   1.210                 if(!isdigit(argv[++i][0]))
   1.211                  {
   1.212                    fputs("-o must be followed by a number (workload size)\n", stderr);
   1.213                    return EXIT_FAILURE;
   1.214                  }
   1.215 -               workload_size = atoi(argv[i]);
   1.216 +               inner_iters = atoi(argv[i]);
   1.217  				break;
   1.218              case 'h':
   1.219                 fputs(usage, stdout);
   1.220 @@ -406,27 +421,41 @@
   1.221        perror("Failed to open cycles counter");
   1.222      }
   1.223     
   1.224 -   //printf("counters now set up\n");
   1.225 +   measurement_t startExeCycles, endExeCycles;
   1.226 +   BenchParams *benchParams;
   1.227 +   
   1.228 +   benchParams = malloc(sizeof(BenchParams)); 
   1.229 +   
   1.230 +   benchParams->startExeCycles = &startExeCycles;
   1.231 +   benchParams->endExeCycles   = &endExeCycles;
   1.232 +   
   1.233     workerParamsArray =  (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) );
   1.234     if(workerParamsArray == NULL ) printf("error mallocing worker params array\n");
   1.235     
   1.236 -   workerParamsArray[0].totalWorkCycles = 0;
   1.237 -   
   1.238 -   benchParams->startExeCycles = &startExeCycles;
   1.239 -   benchParams->endExeCycles   = &endExeCycles;
   1.240 -
   1.241 + 
   1.242     //This is the transition to the VMS runtime
   1.243     VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );
   1.244     
   1.245 -   uint64_t totalWorkCyclesAcrossCores = 0;
   1.246 +   uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
   1.247 +   uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
   1.248     for(i=0; i<num_threads; i++){ 
   1.249         printf("WorkCycles: %lu\n",workerParamsArray[i].totalWorkCycles);
   1.250 +//       printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
   1.251 +//       printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
   1.252 +//       printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
   1.253         totalWorkCyclesAcrossCores += workerParamsArray[i].totalWorkCycles;
   1.254 +       totalBadCyclesAcrossCores  += workerParamsArray[i].totalBadCycles;
   1.255 +       totalSyncCyclesAcrossCores += workerParamsArray[i].totalSyncCycles;
   1.256 +       totalBadSyncCyclesAcrossCores  += workerParamsArray[i].totalBadSyncCycles;
   1.257      }
   1.258  
   1.259     uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
   1.260 +   totalExeCycles -= totalBadCyclesAcrossCores;
   1.261 +   
   1.262     printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
   1.263     printf("Total Execution Cycles: %lu\n", totalExeCycles);
   1.264 +   printf("Sum across threads of Sync cycles: %lu\n", totalSyncCyclesAcrossCores);
   1.265 +   printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);
   1.266     printf("ExeCycles/WorkCycles Ratio %f\n", 
   1.267            (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
   1.268