changeset 8:b2a84bc2b274

Working version of the execution-time vs. task-size measurement
author kshalle
date Mon, 28 Nov 2011 23:58:58 +0100
parents 28650a4df2b9
children 5d3b5e58456e
files src/Application/main.c
diffstat 1 files changed, 95 insertions(+), 44 deletions(-) [+]
line diff
     1.1 --- a/src/Application/main.c	Mon Nov 21 21:39:03 2011 +0100
     1.2 +++ b/src/Application/main.c	Mon Nov 28 23:58:58 2011 +0100
     1.3 @@ -61,6 +61,11 @@
     1.4  #endif
     1.5  
     1.6  //======================== Defines =========================
     1.7 +typedef struct perfData measurement_t;
     1.8 +struct perfData{
     1.9 +    uint64 cycles;
    1.10 +    uint64 instructions;
    1.11 +};
    1.12  
    1.13  const char *usage = {
    1.14  	"Usage: malloc_test [options]\n"
    1.15 @@ -78,6 +83,8 @@
    1.16      int nthreads;
    1.17      int32 mutex;
    1.18      int32 cond;
    1.19 +    measurement_t endBarrierCycles;
    1.20 +
    1.21  };
    1.22  typedef struct barrier_t barrier;
    1.23  
    1.24 @@ -89,13 +96,18 @@
    1.25     barr->cond    = VPThread__make_cond(barr->mutex, animatingPr);
    1.26   }
    1.27  
    1.28 +int cycles_counter_main_fd;
    1.29  void inline barrier_wait(barrier *barr, VirtProcr *animatingPr)
    1.30   { int i;
    1.31  
    1.32     VPThread__mutex_lock(barr->mutex, animatingPr);
    1.33     barr->counter++;
    1.34     if(barr->counter == barr->nthreads)
    1.35 -    { barr->counter = 0;
    1.36 +    { 
    1.37 +      read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \
    1.38 +                sizeof(barr->endBarrierCycles.cycles));
    1.39 +       
    1.40 +      barr->counter = 0;
    1.41        for(i=0; i < barr->nthreads; i++)
    1.42           VPThread__cond_signal(barr->cond, animatingPr);
    1.43      }
    1.44 @@ -105,18 +117,20 @@
    1.45     VPThread__mutex_unlock(barr->mutex, animatingPr);
    1.46   }
    1.47  
    1.48 -struct perfData{
    1.49 -    uint64 cycles;
    1.50 -    uint64 instructions;
    1.51 +
    1.52 +
    1.53 +struct WorkerParamsStr{
    1.54 +    struct barrier_t* barrier;
    1.55 +    uint64_t  totalWorkCycles;
    1.56  };
    1.57  
    1.58 -typedef struct perfData measurement_t;
    1.59 +typedef struct WorkerParamsStr WorkerParams;
    1.60  
    1.61 -struct input_t{
    1.62 -    struct barrier_t* barrier;
    1.63 -    uint64  totalWorkCycles;
    1.64 -};
    1.65 -
    1.66 +typedef struct
    1.67 + { measurement_t *startExeCycles;
    1.68 +   measurement_t *endExeCycles;
    1.69 + }
    1.70 +BenchParams;
    1.71  
    1.72  //======================== Globals =========================
    1.73  char __ProgrammName[] = "overhead_test";
    1.74 @@ -126,9 +140,10 @@
    1.75  size_t chunk_size = 0;
    1.76  
    1.77  int cycles_counter_fd[NUM_CORES];
    1.78 -int cycles_counter_main_fd;
    1.79  struct perf_event_attr* hw_event;
    1.80  
    1.81 +WorkerParams *workerParamsArray;
    1.82 +
    1.83  //======================== App Code =========================
    1.84  /*
    1.85   * Workload
    1.86 @@ -146,13 +161,13 @@
    1.87  } while (0) //macro magic for scoping
    1.88  
    1.89  
    1.90 -void work(void* input, VirtProcr* animatingPr)
    1.91 +void worker_TLF(void* _params, VirtProcr* animatingPr)
    1.92   {
    1.93     int i,o;
    1.94 -   struct input_t* in = (struct input_t*)input;
    1.95 +   WorkerParams* params = (struct WorkerParamsStr*)_params;
    1.96     unsigned int totalWorkCycles = 0;
    1.97 -   unsigned int workspace1;
    1.98 -   double workspace2;
    1.99 +   unsigned int workspace1=0;
   1.100 +   double workspace2=0.0;
   1.101     int32 privateMutex = VPThread__make_mutex(animatingPr);
   1.102     
   1.103     int cpuid = sched_getcpu();
   1.104 @@ -185,12 +200,12 @@
   1.105        VPThread__mutex_unlock(privateMutex, animatingPr);
   1.106      }
   1.107  
   1.108 -   in->totalWorkCycles = totalWorkCycles;
   1.109 -   printf("Cycles: %lu on CPU %lu\n", totalWorkCycles, cpuid);
   1.110 +   params->totalWorkCycles = totalWorkCycles;
   1.111 +   //printf("Cycles: %lu on CPU %lu\n", totalWorkCycles, cpuid);
   1.112     
   1.113     
   1.114     //Wait for all threads to end
   1.115 -   barrier_wait(in->barrier, animatingPr);
   1.116 +   barrier_wait(params->barrier, animatingPr);
   1.117     
   1.118     //Shutdown worker
   1.119     VPThread__dissipate_thread(animatingPr);
   1.120 @@ -199,46 +214,55 @@
   1.121     printf("%f", workspace2);  //two workspace variables
   1.122   }
   1.123  
   1.124 +
   1.125  /* this is run after the VMS is set up*/
   1.126 -void benchmark(void *in, VirtProcr *animatingPr)
   1.127 +void benchmark(void *_params, VirtProcr *animatingPr)
   1.128   {
   1.129     int i, cpuID;
   1.130 -   struct input_t input[num_threads];
   1.131 -   struct barrier_t barr;
   1.132 +   struct barrier_t  barr;
   1.133 +   BenchParams      *params;
   1.134 +   
   1.135 +   params = (BenchParams *)_params;
   1.136 +
   1.137     barrier_init(&barr, num_threads+1, animatingPr);
   1.138 -   
   1.139 -   
   1.140 -   
   1.141 +      
   1.142     //prepare input
   1.143     for(i=0; i<num_threads; i++)
   1.144      { 
   1.145 -       input[i].barrier = &barr;
   1.146 +       workerParamsArray[i].barrier = &barr;
   1.147      }
   1.148       
   1.149 -   printf("just before first counter read, inside benchmark\n");
   1.150 +         //printf("just before first counter read, inside benchmark\n");
   1.151    
   1.152 -   //save cycles before execution of threads to get longest runtime
   1.153 -   measurement_t startBenchTime, endBenchTime;
   1.154 -   int nread = read(cycles_counter_main_fd,&(startBenchTime.cycles),
   1.155 -                sizeof(startBenchTime.cycles));
   1.156 -   if(nread<0){                                         
   1.157 -       perror("Error reading cycles counter");
   1.158 -   }
   1.159 -   printf("finished first counter read, inside benchmark\n");
   1.160 +   //save cycles before execution of threads, to get total exe cycles
   1.161 +   measurement_t *startExeCycles, *endExeCycles;
   1.162 +   startExeCycles = params->startExeCycles;
   1.163 +   //endExeCycles   = params->endExeCycles;
   1.164 +   
   1.165 +    //printf("finished first counter read, inside benchmark\n");
   1.166     //create all threads
   1.167     for(i=0; i<num_threads; i++)
   1.168 -    { VPThread__create_thread((VirtProcrFnPtr)work, (void*)&input[i], animatingPr);}
   1.169 +    { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
   1.170 +    }
   1.171 +   
   1.172 +   int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles),
   1.173 +                sizeof(startExeCycles->cycles));
   1.174 +   if(nread<0)
   1.175 +    { perror("Error reading cycles counter");
   1.176 +    }
   1.177     //wait for all threads to finish
   1.178     barrier_wait(&barr, animatingPr);
   1.179    
   1.180     
   1.181     //accumulated cycles of all cores
   1.182 -   nread = read(cycles_counter_main_fd,&(endBenchTime.cycles),
   1.183 -                sizeof(endBenchTime.cycles));
   1.184 -   if(nread<0){                                         
   1.185 -       perror("Error reading cycles counter");
   1.186 -   }
   1.187 + //  nread = read(cycles_counter_main_fd, &(endExeCycles->cycles), \
   1.188 +                sizeof(endExeCycles->cycles));
   1.189 + //  if(nread<0){                                         
   1.190 + //      perror("Error reading cycles counter");
   1.191 +   params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
   1.192 +   
   1.193  
   1.194 +/*
   1.195     uint64_t overallWorkCycles = 0;
   1.196     for(i=0; i<num_threads; i++){ 
   1.197         printf("WorkCycles: %lu\n",input[i].totalWorkCycles);
   1.198 @@ -247,6 +271,9 @@
   1.199     
   1.200     printf("Sum across threads of work cycles: %lu\n", overallWorkCycles);
   1.201     printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles);
   1.202 +   printf("Runtime/Workcycle Ratio %lu\n", 
   1.203 +   ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles);
   1.204 +*/
   1.205  
   1.206     //======================================================
   1.207  
   1.208 @@ -256,6 +283,10 @@
   1.209  int main(int argc, char **argv)
   1.210   {
   1.211     int i;
   1.212 +   measurement_t startExeCycles, endExeCycles;
   1.213 +   BenchParams *benchParams;
   1.214 +   
   1.215 +   benchParams = malloc(sizeof(BenchParams)); 
   1.216  
   1.217     //set global static variables, based on cmd-line args
   1.218     for(i=1; i<argc; i++)
   1.219 @@ -267,7 +298,7 @@
   1.220              case 't':
   1.221                 if(!isdigit(argv[++i][0]))
   1.222                  {
   1.223 -                  fprintf(stderr, "-t mus be followed by the number of worker threads to spawn\n");
   1.224 +                  fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n");
   1.225                    return EXIT_FAILURE;
   1.226                  }
   1.227                 num_threads = atoi(argv[i]);
   1.228 @@ -312,6 +343,7 @@
   1.229         }
   1.230      }//for
   1.231     
   1.232 +   
   1.233     //setup performance counters
   1.234      hw_event = malloc(sizeof(struct perf_event_attr));
   1.235      memset(hw_event,0,sizeof(struct perf_event_attr));
   1.236 @@ -353,7 +385,6 @@
   1.237           perror("Failed to open cycles counter");
   1.238         }
   1.239      }
   1.240 -   printf("counters now set up\n");
   1.241  
   1.242     //Set up counter to accumulate total cycles to process, across all CPUs
   1.243  
   1.244 @@ -374,10 +405,30 @@
   1.245        fprintf(stderr,"in main ");
   1.246        perror("Failed to open cycles counter");
   1.247      }
   1.248 +   
   1.249 +   //printf("counters now set up\n");
   1.250 +   workerParamsArray =  (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) );
   1.251 +   if(workerParamsArray == NULL ) printf("error mallocing worker params array\n");
   1.252 +   
   1.253 +   workerParamsArray[0].totalWorkCycles = 0;
   1.254 +   
   1.255 +   benchParams->startExeCycles = &startExeCycles;
   1.256 +   benchParams->endExeCycles   = &endExeCycles;
   1.257  
   1.258 +   //This is the transition to the VMS runtime
   1.259 +   VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );
   1.260 +   
   1.261 +   uint64_t totalWorkCyclesAcrossCores = 0;
   1.262 +   for(i=0; i<num_threads; i++){ 
   1.263 +       printf("WorkCycles: %lu\n",workerParamsArray[i].totalWorkCycles);
   1.264 +       totalWorkCyclesAcrossCores += workerParamsArray[i].totalWorkCycles;
   1.265 +    }
   1.266  
   1.267 -    //This is the transition to the VMS runtime
   1.268 -   VPThread__create_seed_procr_and_do_work(benchmark, NULL);
   1.269 +   uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
   1.270 +   printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
   1.271 +   printf("Total Execution Cycles: %lu\n", totalExeCycles);
   1.272 +   printf("ExeCycles/WorkCycles Ratio %f\n", 
   1.273 +          (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
   1.274  
   1.275     return 0;
   1.276   }