changeset 16:5887fbce425f

changed directory structure, added .hgeol file
author Merten Sach <msach@mailbox.tu-berlin.de>
date Mon, 13 Feb 2012 16:11:00 +0100
parents a1269b1549fc
children fdc2f264f3d6
files .hgeol main.c src/Application/main.c
diffstat 3 files changed, 494 insertions(+), 480 deletions(-) [+]
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/.hgeol	Mon Feb 13 16:11:00 2012 +0100
     1.3 @@ -0,0 +1,14 @@
     1.4 +
     1.5 +[patterns]
     1.6 +**.py = native
     1.7 +**.txt = native
     1.8 +**.c = native
     1.9 +**.h = native
    1.10 +**.cpp = native
    1.11 +**.java = native
    1.12 +**.class = bin
    1.13 +**.jar = bin
    1.14 +**.sh = native
    1.15 +**.pl = native
    1.16 +**.jpg = bin
    1.17 +**.gif = bin
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/main.c	Mon Feb 13 16:11:00 2012 +0100
     2.3 @@ -0,0 +1,480 @@
     2.4 +/* 
     2.5 + * 
     2.6 + */
     2.7 +#include <stdio.h>
     2.8 +#include <stdlib.h>
     2.9 +#include <string.h>
    2.10 +#include <math.h>
    2.11 +#include <ctype.h>
    2.12 +#include <errno.h>
    2.13 +#include <pthread.h>
    2.14 +#include <unistd.h>
    2.15 +#include "VPThread_lib/VPThread.h"
    2.16 +#include "VPThread_lib/VMS/Queue_impl/PrivateQueue.h"
    2.17 +
    2.18 +#include <linux/perf_event.h>
    2.19 +#include <linux/prctl.h>
    2.20 +#include <sys/syscall.h>
    2.21 +
    2.22 +#undef DEBUG
    2.23 +//#define DEBUG
    2.24 +
    2.25 +#define MEASURE_PERF
    2.26 +
    2.27 +#if !defined(unix) && !defined(__unix__)
    2.28 +#ifdef __MACH__
    2.29 +#define unix		1
    2.30 +#define __unix__	1
    2.31 +#endif	/* __MACH__ */
    2.32 +#endif	/* unix */
    2.33 +
    2.34 +/* find the appropriate way to define explicitly sized types */
    2.35 +/* for C99 or GNU libc (also mach's libc) we can use stdint.h */
    2.36 +#if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__)
    2.37 +#include <stdint.h>
    2.38 +#elif defined(unix) || defined(__unix__)	/* some UNIX systems have them in sys/types.h */
    2.39 +#include <sys/types.h>
    2.40 +#elif defined(__WIN32__) || defined(WIN32)	/* the nameless one */
    2.41 +typedef unsigned __int8 uint8_t;
    2.42 +typedef unsigned __int32 uint32_t;
    2.43 +#endif	/* sized type detection */
    2.44 +
    2.45 +/* provide a millisecond-resolution timer for each system */
    2.46 +#if defined(unix) || defined(__unix__)
    2.47 +#include <time.h>
    2.48 +#include <sys/time.h>
    2.49 +unsigned long get_msec(void) {	/* milliseconds elapsed since the first call to this function */
    2.50 +	static struct timeval timeval, first_timeval;
    2.51 +	/* static storage: first_timeval starts zeroed and keeps the reference time across calls */
    2.52 +	gettimeofday(&timeval, 0);
    2.53 +	if(first_timeval.tv_sec == 0) {	/* no reference recorded yet: treat this as the first call */
    2.54 +		first_timeval = timeval;
    2.55 +		return 0;
    2.56 +	}
    2.57 +	return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000;
    2.58 +}
    2.59 +#elif defined(__WIN32__) || defined(WIN32)
    2.60 +#include <windows.h>
    2.61 +unsigned long get_msec(void) {	/* Windows variant: returns the GetTickCount() value unchanged */
    2.62 +	return GetTickCount();
    2.63 +}
    2.64 +#else
    2.65 +//#error "I don't know how to measure time on your platform"
    2.66 +#endif
    2.67 +
    2.68 +//======================== Defines =========================
    2.69 +typedef struct perfData measurement_t;	/* one sample of the performance counters */
    2.70 +struct perfData{
    2.71 +    uint64 cycles;        /* cycle count, filled by read() on a perf counter fd */
    2.72 +    uint64 instructions;  /* not written by any code visible in this file */
    2.73 +};
    2.74 +
    2.75 +const char *usage = {	/* help text printed for -h and after an unrecognized argument */
    2.76 +	"Usage: overhead_test [options]\n"
    2.77 +	"  Spawns a number of threads that repeat a workload and a sync operation.\n\n"
    2.78 +	"Options:\n"
    2.79 +	"  -t <num>   how many threads to use (default: 1). This is internally multiplied by the number of cores.\n"
    2.80 +	"  -o <num>   repeat workload and sync operation <num> times\n"
    2.81 +	"  -i <num>   size of workload, repeat <num> times\n"
    2.82 +	"  -h         this help screen\n\n"
    2.83 +};
    2.84 +
    2.85 +struct barrier_t	/* barrier state shared by all threads that call barrier_wait() */
    2.86 +{
    2.87 +    int counter;     /* arrivals so far; reset to 0 by the last arriver */
    2.88 +    int nthreads;    /* number of arrivals that releases the barrier */
    2.89 +    int32 mutex;     /* handle returned by VPThread__make_mutex() */
    2.90 +    int32 cond;      /* handle returned by VPThread__make_cond() */
    2.91 +    measurement_t endBarrierCycles;  /* .cycles is read from cycles_counter_main_fd by the last arriver (MEASURE_PERF only) */
    2.92 +
    2.93 +};
    2.94 +typedef struct barrier_t barrier;
    2.95 +
    2.96 +void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr)	/* set up barr for nthreads arrivals; endBarrierCycles is left untouched */
    2.97 + {
    2.98 +   barr->counter = 0;	/* nobody has arrived yet */
    2.99 +   barr->nthreads = nthreads;
   2.100 +   barr->mutex   = VPThread__make_mutex(animatingPr);
   2.101 +   barr->cond    = VPThread__make_cond(barr->mutex, animatingPr);	/* condition is created against the barrier's own mutex */
   2.102 + }
   2.103 +
   2.104 +int cycles_counter_main_fd;	/* perf counter fd opened in main() with cpu == -1 */
   2.105 +void inline barrier_wait(barrier *barr, VirtProcr *animatingPr)	/* count this caller in; the barr->nthreads-th caller releases the others */
   2.106 + { int i;
   2.107 +   /* the whole arrival protocol runs while holding the barrier's mutex */
   2.108 +   VPThread__mutex_lock(barr->mutex, animatingPr);
   2.109 +   barr->counter++;
   2.110 +   if(barr->counter == barr->nthreads)
   2.111 +    { /* last arriver: sample the counter, re-arm the barrier, signal the waiters */
   2.112 +#ifdef MEASURE_PERF
   2.113 +      if(read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles),
   2.114 +                sizeof(barr->endBarrierCycles.cycles)) < 0) { perror("Error reading cycles counter"); barr->endBarrierCycles.cycles = 0; } /* never leave the sample uninitialized */
   2.115 +#endif
   2.116 +       
   2.117 +      barr->counter = 0;
   2.118 +      for(i=0; i < barr->nthreads; i++)
   2.119 +         VPThread__cond_signal(barr->cond, animatingPr);
   2.120 +    }
   2.121 +   else
   2.122 +    { VPThread__cond_wait(barr->cond, animatingPr);
   2.123 +    }
   2.124 +   VPThread__mutex_unlock(barr->mutex, animatingPr);
   2.125 + }
   2.126 +
   2.127 +
   2.128 +
   2.129 +typedef struct	/* per-worker arguments and results; worker_TLF() receives one of these */
   2.130 + { struct barrier_t* barrier;          /* shared end-of-run barrier, assigned by benchmark() */
   2.131 +   uint64_t  totalWorkCycles;          /* sum of workload cycle counts that passed the sanity check */
   2.132 +   uint64_t  totalBadCycles;           /* sum of workload cycle counts that failed the sanity check */
   2.133 +   uint64_t  totalSyncCycles;          /* stored by worker_TLF(); always 0 with the code as it stands */
   2.134 +   uint64_t  totalBadSyncCycles;       /* stored by worker_TLF(); always 0 with the code as it stands */
   2.135 +   uint64     numGoodSyncs;            /* stored by worker_TLF(); always 0 with the code as it stands */
   2.136 +   uint64     numGoodTasks;            /* number of workload runs that passed the sanity check */
   2.137 + }
   2.138 +WorkerParams;
   2.139 +
   2.140 +
   2.141 +typedef struct	/* argument block that main() hands to benchmark() */
   2.142 + { measurement_t *startExeCycles;  /* benchmark() stores the counter value here before creating the workers */
   2.143 +   measurement_t *endExeCycles;    /* benchmark() stores the barrier's end sample here */
   2.144 + }
   2.145 +BenchParams;
   2.146 +
   2.147 +//======================== Globals =========================
   2.148 +char __ProgrammName[] = "overhead_test";	/* not referenced in this file; presumably read by the VMS library -- TODO confirm */
   2.149 +char __DataSet[255];	/* not referenced in this file; presumably read by the VMS library -- TODO confirm */
   2.150 +
   2.151 +int outer_iters, inner_iters, num_threads;	/* set from the -o, -i and -t options in main() */
   2.152 +size_t chunk_size = 0;	/* not referenced elsewhere in this file */
   2.153 +
   2.154 +int cycles_counter_fd[NUM_CORES];	/* one perf counter fd per core, opened in main() */
   2.155 +struct perf_event_attr* hw_event;	/* counter configuration, allocated and filled in main() */
   2.156 +
   2.157 +WorkerParams *workerParamsArray;	/* num_threads + 1 entries, allocated in main() */
   2.158 +
   2.159 +//======================== App Code =========================
   2.160 +/*
   2.161 + * Workload
   2.162 + */
   2.163 +
   2.164 +#define saveCyclesAndInstrs(core,cycles) do{     \
   2.165 +   int cycles_fd = cycles_counter_fd[(core)];           \
   2.166 +   ssize_t nread;                                       \
   2.167 +   /* read core's counter into lvalue `cycles`; 0 on error */ \
   2.168 +   nread = read(cycles_fd,&(cycles),sizeof(cycles));    \
   2.169 +   if(nread<0){                                         \
   2.170 +       perror("Error reading cycles counter");          \
   2.171 +       (cycles) = 0;                                    \
   2.172 +   }                                                    \
   2.173 +} while (0) //macro magic for scoping
   2.174 +
   2.175 +
   2.176 +double  /* worker entry point: times outer_iters workload runs, publishes totals in *_params, waits at the barrier */
   2.177 +worker_TLF(void* _params, VirtProcr* animatingPr)
   2.178 + {
   2.179 +   int i,o;
   2.180 +   WorkerParams* params = (WorkerParams*)_params;
   2.181 +   uint64_t totalWorkCycles = 0, totalBadCycles = 0;   //64-bit: these sum uint64 counter deltas
   2.182 +   uint64_t totalSyncCycles = 0, totalBadSyncCycles = 0;
   2.183 +   unsigned int workspace1=0; uint64_t numGoodSyncs = 0, numGoodTasks = 0;
   2.184 +   double workspace2=0.0;
   2.185 +   int32 privateMutex = VPThread__make_mutex(animatingPr);
   2.186 +   
   2.187 +   int cpuid = sched_getcpu();
   2.188 +   if(cpuid < 0 || cpuid >= NUM_CORES) cpuid = 0; //sched_getcpu() returns -1 on error; keep the index inside cycles_counter_fd[]
   2.189 +   measurement_t startWorkload, endWorkload, startWorkload2, endWorkload2;
   2.190 +   uint64 numCycles;
   2.191 +   for(o=0; o < outer_iters; o++)
   2.192 +    {
   2.193 +#ifdef MEASURE_PERF
   2.194 +          saveCyclesAndInstrs(cpuid,startWorkload.cycles);
   2.195 +#endif
   2.196 +       
   2.197 +      //workload task
   2.198 +      for(i=0; i < inner_iters; i++)
   2.199 +       {
   2.200 +         workspace1 += (workspace1 + 32)/2;
   2.201 +         workspace2 += (workspace2 + 23.2)/1.4;
   2.202 +       }
   2.203 +  
   2.204 +#ifdef MEASURE_PERF
   2.205 +          saveCyclesAndInstrs(cpuid,endWorkload.cycles);
   2.206 +          numCycles = endWorkload.cycles - startWorkload.cycles;
   2.207 +          //sanity check (400K is about 20K iters)
   2.208 +          if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
   2.209 +          else                     {totalBadCycles  += numCycles; }
   2.210 +#endif
   2.211 +
   2.212 +      //mutex access often causes switch to different Slave VP
   2.213 +      VPThread__mutex_lock(privateMutex, animatingPr);
   2.214 +      
   2.215 +/*
   2.216 +          saveCyclesAndInstrs(cpuid,startWorkload2.cycles);
   2.217 +      //Task
   2.218 +      for(i=0; i < inner_iters; i++)
   2.219 +       {
   2.220 +         workspace1 += (workspace1 + 32)/2;
   2.221 +         workspace2 += (workspace2 + 23.2)/1.4;
   2.222 +       }
   2.223 +      
   2.224 +          saveCyclesAndInstrs(cpuid,endWorkload2.cycles);
   2.225 +          numCycles = endWorkload2.cycles - startWorkload2.cycles;
   2.226 +          //sanity check (400K is about 20K iters)
   2.227 +          if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
   2.228 +          else                     {totalBadCycles  += numCycles; }
   2.229 +      
   2.230 +*/
   2.231 +      VPThread__mutex_unlock(privateMutex, animatingPr);
   2.232 +    }
   2.233 +   //publish this worker's totals; main() sums them after the run
   2.234 +   params->totalWorkCycles = totalWorkCycles;
   2.235 +   params->totalBadCycles = totalBadCycles;
   2.236 +   params->numGoodTasks   = numGoodTasks;
   2.237 +   params->totalSyncCycles = totalSyncCycles;
   2.238 +   params->totalBadSyncCycles = totalBadSyncCycles;
   2.239 +   params->numGoodSyncs = numGoodSyncs;
   2.240 +/*
   2.241 +   params->totalSyncCycles = VMS__give_num_plugin_cycles();
   2.242 +   params->totalBadSyncCycles = 0;
   2.243 +   params->numGoodSyncs = VMS__give_num_plugin_animations();
   2.244 +*/
   2.245 +   
   2.246 +   
   2.247 +   //Wait for all threads to end
   2.248 +   barrier_wait(params->barrier, animatingPr);
   2.249 +   
   2.250 +   //Shutdown worker
   2.251 +   VPThread__dissipate_thread(animatingPr);
   2.252 +   
   2.253 +     //below return never reached --> there for gcc
   2.254 +   return (workspace1 + workspace2);  //to prevent gcc from optimizing work out
   2.255 + }
   2.256 +
   2.257 +
   2.258 +/* seed entry point passed to VPThread__create_seed_procr_and_do_work(); _params is a BenchParams* */
   2.259 +void benchmark(void *_params, VirtProcr *animatingPr)
   2.260 + {
   2.261 +   int i, cpuID;
   2.262 +   struct barrier_t  barr;
   2.263 +   BenchParams      *params;
   2.264 +   
   2.265 +   params = (BenchParams *)_params;
   2.266 +   //the barrier counts every worker plus this thread, hence num_threads+1
   2.267 +   barrier_init(&barr, num_threads+1, animatingPr);
   2.268 +      
   2.269 +   //point every worker's parameter block at the shared barrier
   2.270 +   for(i=0; i<num_threads; i++)
   2.271 +    { 
   2.272 +       workerParamsArray[i].barrier = &barr;
   2.273 +    }
   2.274 +     
   2.275 +   //save cycles before execution of threads, to get total exe cycles
   2.276 +   measurement_t *startExeCycles, *endExeCycles;
   2.277 +   startExeCycles = params->startExeCycles;
   2.278 +   
   2.279 +#ifdef MEASURE_PERF
   2.280 +   int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles),
   2.281 +                sizeof(startExeCycles->cycles));
   2.282 +   if(nread<0) perror("Error reading cycles counter");
   2.283 +#endif
   2.284 +   
   2.285 +   //create (which starts running) all threads
   2.286 +   for(i=0; i<num_threads; i++)
   2.287 +    { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
   2.288 +    }
   2.289 +   //wait for all threads to finish
   2.290 +   barrier_wait(&barr, animatingPr);
   2.291 +  
   2.292 +#ifdef MEASURE_PERF
   2.293 +   //the end sample was taken inside barrier_wait() by the last thread to arrive; copy it out for main()
   2.294 +   params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
   2.295 +#endif
   2.296 +   
   2.297 +
   2.298 +/*
   2.299 +   uint64_t overallWorkCycles = 0;
   2.300 +   for(i=0; i<num_threads; i++){ 
   2.301 +       printf("WorkCycles: %lu\n",input[i].totalWorkCycles);
   2.302 +       overallWorkCycles += input[i].totalWorkCycles;
   2.303 +    }
   2.304 +   
   2.305 +   printf("Sum across threads of work cycles: %lu\n", overallWorkCycles);
   2.306 +   printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles);
   2.307 +   printf("Runtime/Workcycle Ratio %lu\n", 
   2.308 +   ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles);
   2.309 +*/
   2.310 +
   2.311 +   //======================================================
   2.312 +
   2.313 +   VPThread__dissipate_thread(animatingPr);
   2.314 + }
   2.315 +
   2.316 +int main(int argc, char **argv)  /* parse options, open the perf counters, run benchmark() under VMS, print the results */
   2.317 + {
   2.318 +   int i;
   2.319 +
   2.320 +   //set global static variables, based on cmd-line args
   2.321 +   for(i=1; i<argc; i++)
   2.322 +    {
   2.323 +      if(argv[i][0] == '-' && argv[i][1] != 0 && argv[i][2] == 0) //exactly "-x"; a bare "-" must not be indexed past its terminator
   2.324 +       {
   2.325 +         switch(argv[i][1])
   2.326 +          {
   2.327 +            case 't':
   2.328 +               if(++i >= argc || !isdigit((unsigned char)argv[i][0])) //the value may be missing at the end of argv
   2.329 +                {
   2.330 +                  fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n");
   2.331 +                  return EXIT_FAILURE;
   2.332 +                }
   2.333 +               num_threads = atoi(argv[i]);
   2.334 +               if(!num_threads)
   2.335 +                {
   2.336 +                  fprintf(stderr, "invalid number of threads specified: %d\n", num_threads);
   2.337 +                  return EXIT_FAILURE;
   2.338 +                }
   2.339 +            break;
   2.340 +            case 'o':
   2.341 +               if(++i >= argc || !isdigit((unsigned char)argv[i][0]))
   2.342 +                {
   2.343 +                  fputs("-o must be followed by a number\n", stderr);
   2.344 +                  return EXIT_FAILURE;
   2.345 +                }
   2.346 +               outer_iters = atoi(argv[i]);
   2.347 +				break;
   2.348 +            case 'i':
   2.349 +               if(++i >= argc || !isdigit((unsigned char)argv[i][0]))
   2.350 +                {
   2.351 +                  fputs("-i must be followed by a number (workload size)\n", stderr);
   2.352 +                  return EXIT_FAILURE;
   2.353 +                }
   2.354 +               inner_iters = atoi(argv[i]);
   2.355 +				break;
   2.356 +            case 'h':
   2.357 +               fputs(usage, stdout);
   2.358 +               return 0;
   2.359 +				
   2.360 +            default:
   2.361 +               fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
   2.362 +               fputs(usage, stderr);
   2.363 +               return EXIT_FAILURE;
   2.364 +          }//switch
   2.365 +       }//if arg
   2.366 +      else
   2.367 +       {
   2.368 +			fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
   2.369 +			fputs(usage, stderr);
   2.370 +			return EXIT_FAILURE;
   2.371 +       }
   2.372 +    }//for
   2.373 +   
   2.374 +   
   2.375 +#ifdef MEASURE_PERF
   2.376 +   //setup performance counters
   2.377 +    hw_event = calloc(1, sizeof(*hw_event)); /* zero-filled, replaces malloc + memset */
   2.378 +    if(hw_event == NULL) { perror("Failed to allocate perf_event_attr"); return EXIT_FAILURE; }
   2.379 +    
   2.380 +    hw_event->type = PERF_TYPE_HARDWARE;
   2.381 +    hw_event->size = sizeof(*hw_event); /* size of the struct, not of the pointer */
   2.382 +    hw_event->disabled = 0;
   2.383 +    hw_event->freq = 0;
   2.384 +    hw_event->inherit = 1; /* children inherit it   */
   2.385 +    hw_event->pinned = 1; /* says this virt counter must always be on HW */
   2.386 +    hw_event->exclusive = 0; /* 0: not the only group on the PMU */
   2.387 +    hw_event->exclude_user = 0; /* 0: user-space cycles are counted */
   2.388 +    hw_event->exclude_kernel = 1; /* don't count kernel  */
   2.389 +    hw_event->exclude_hv = 1; /* ditto hypervisor      */
   2.390 +    hw_event->exclude_idle = 1; /* don't count when idle */
   2.391 +    hw_event->mmap = 0; /* 0: mmap data not included */
   2.392 +    hw_event->comm = 0; /* 0: comm data not included */
   2.393 +
   2.394 +    hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles
   2.395 +    
   2.396 +    int cpuID, retries;
   2.397 +
   2.398 +   for( cpuID = 0; cpuID < NUM_CORES; cpuID++ )
   2.399 +    { retries = 0;
   2.400 +      do
   2.401 +       { retries += 1;
   2.402 +         cycles_counter_fd[cpuID] = 
   2.403 +          syscall(__NR_perf_event_open, hw_event,
   2.404 +                  0,//pid_t: 0 is "pid of calling process" 
   2.405 +                  cpuID,//int: cpu, the value returned by "CPUID" instr(?)
   2.406 +                  -1,//int: group_fd, -1 is "leader" or independent
   2.407 +                  0//unsigned long: flags
   2.408 +                 );
   2.409 +       }
   2.410 +      while(cycles_counter_fd[cpuID]<0 && retries < 100);
   2.411 +      if(cycles_counter_fd[cpuID] < 0) //report by result: a success on the last retry is not a failure
   2.412 +       {
   2.413 +         fprintf(stderr,"On core %d: ",cpuID);
   2.414 +         perror("Failed to open cycles counter");
   2.415 +       }
   2.416 +    }
   2.417 +
   2.418 +   //Set up counter to accumulate total cycles to process, across all CPUs
   2.419 +
   2.420 +   retries = 0;
   2.421 +   do
   2.422 +    { retries += 1;
   2.423 +      cycles_counter_main_fd = 
   2.424 +       syscall(__NR_perf_event_open, hw_event,
   2.425 +               0,//pid_t: 0 is "pid of calling process" 
   2.426 +               -1,//int: cpu, -1 means accumulate from all cores
   2.427 +               -1,//int: group_fd, -1 is "leader" == independent
   2.428 +               0//unsigned long: flags
   2.429 +              );
   2.430 +    }
   2.431 +   while(cycles_counter_main_fd<0 && retries < 100);
   2.432 +   if(cycles_counter_main_fd < 0) //report by result, as for the per-core counters
   2.433 +    {
   2.434 +      fprintf(stderr,"in main ");
   2.435 +      perror("Failed to open cycles counter");
   2.436 +    }
   2.437 +#endif
   2.438 +   
   2.439 +   measurement_t startExeCycles = {0}, endExeCycles = {0}; //defined values even if a counter read fails
   2.440 +   BenchParams *benchParams;
   2.441 +   
   2.442 +   benchParams = malloc(sizeof(BenchParams)); 
   2.443 +   if(benchParams == NULL) { fprintf(stderr, "error mallocing bench params\n"); return EXIT_FAILURE; }
   2.444 +   benchParams->startExeCycles = &startExeCycles;
   2.445 +   benchParams->endExeCycles   = &endExeCycles;
   2.446 +   
   2.447 +   workerParamsArray =  (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) );
   2.448 +   if(workerParamsArray == NULL ) { fprintf(stderr, "error mallocing worker params array\n"); return EXIT_FAILURE; }
   2.449 +   
   2.450 + 
   2.451 +   //This is the transition to the VMS runtime
   2.452 +   VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );
   2.453 +   
   2.454 +#ifdef MEASURE_PERF
   2.455 +   uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
   2.456 +   uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
   2.457 +   for(i=0; i<num_threads; i++){ 
   2.458 +       printf("WorkCycles: %lu\n",workerParamsArray[i].totalWorkCycles);
   2.459 +//       printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
   2.460 +//       printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
   2.461 +//       printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
   2.462 +       totalWorkCyclesAcrossCores += workerParamsArray[i].totalWorkCycles;
   2.463 +       totalBadCyclesAcrossCores  += workerParamsArray[i].totalBadCycles;
   2.464 +       totalSyncCyclesAcrossCores += workerParamsArray[i].totalSyncCycles;
   2.465 +       totalBadSyncCyclesAcrossCores  += workerParamsArray[i].totalBadSyncCycles;
   2.466 +    }
   2.467 +   //overhead = execution cycles, minus rejected samples, minus accepted work cycles
   2.468 +   uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
   2.469 +   totalExeCycles -= totalBadCyclesAcrossCores;
   2.470 +   uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores;
   2.471 +   int32  numSyncs = outer_iters * num_threads * 2;
   2.472 +   printf("Total Execution Cycles: %lu\n", totalExeCycles);
   2.473 +   printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
   2.474 +   printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores);
   2.475 +//   printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);
   2.476 +   printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs );
   2.477 +   printf("ExeCycles/WorkCycles Ratio %f\n", 
   2.478 +          (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
   2.479 +#else
   2.480 +   printf("No measurement done!\n");
   2.481 +#endif
   2.482 +   return 0;
   2.483 + }
     3.1 --- a/src/Application/main.c	Fri Jan 06 19:09:38 2012 +0100
     3.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.3 @@ -1,480 +0,0 @@
     3.4 -/* 
     3.5 - * 
     3.6 - */
     3.7 -#include <stdio.h>
     3.8 -#include <stdlib.h>
     3.9 -#include <string.h>
    3.10 -#include <math.h>
    3.11 -#include <ctype.h>
    3.12 -#include <errno.h>
    3.13 -#include <pthread.h>
    3.14 -#include <unistd.h>
    3.15 -#include "VPThread_lib/VPThread.h"
    3.16 -#include "VPThread_lib/VMS/Queue_impl/PrivateQueue.h"
    3.17 -
    3.18 -#include <linux/perf_event.h>
    3.19 -#include <linux/prctl.h>
    3.20 -#include <sys/syscall.h>
    3.21 -
    3.22 -#undef DEBUG
    3.23 -//#define DEBUG
    3.24 -
    3.25 -#define MEASURE_PERF
    3.26 -
    3.27 -#if !defined(unix) && !defined(__unix__)
    3.28 -#ifdef __MACH__
    3.29 -#define unix		1
    3.30 -#define __unix__	1
    3.31 -#endif	/* __MACH__ */
    3.32 -#endif	/* unix */
    3.33 -
    3.34 -/* find the appropriate way to define explicitly sized types */
    3.35 -/* for C99 or GNU libc (also mach's libc) we can use stdint.h */
    3.36 -#if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__)
    3.37 -#include <stdint.h>
    3.38 -#elif defined(unix) || defined(__unix__)	/* some UNIX systems have them in sys/types.h */
    3.39 -#include <sys/types.h>
    3.40 -#elif defined(__WIN32__) || defined(WIN32)	/* the nameless one */
    3.41 -typedef unsigned __int8 uint8_t;
    3.42 -typedef unsigned __int32 uint32_t;
    3.43 -#endif	/* sized type detection */
    3.44 -
    3.45 -/* provide a millisecond-resolution timer for each system */
    3.46 -#if defined(unix) || defined(__unix__)
    3.47 -#include <time.h>
    3.48 -#include <sys/time.h>
    3.49 -unsigned long get_msec(void) {
    3.50 -	static struct timeval timeval, first_timeval;
    3.51 -
    3.52 -	gettimeofday(&timeval, 0);
    3.53 -	if(first_timeval.tv_sec == 0) {
    3.54 -		first_timeval = timeval;
    3.55 -		return 0;
    3.56 -	}
    3.57 -	return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000;
    3.58 -}
    3.59 -#elif defined(__WIN32__) || defined(WIN32)
    3.60 -#include <windows.h>
    3.61 -unsigned long get_msec(void) {
    3.62 -	return GetTickCount();
    3.63 -}
    3.64 -#else
    3.65 -//#error "I don't know how to measure time on your platform"
    3.66 -#endif
    3.67 -
    3.68 -//======================== Defines =========================
    3.69 -typedef struct perfData measurement_t;
    3.70 -struct perfData{
    3.71 -    uint64 cycles;
    3.72 -    uint64 instructions;
    3.73 -};
    3.74 -
    3.75 -const char *usage = {
    3.76 -	"Usage: malloc_test [options]\n"
    3.77 -	"  Spwans a number of threads and allocates memory.\n\n"
    3.78 -	"Options:\n"
    3.79 -	"  -t <num>   how many threads to use (default: 1). This is internaly multiplied by the number of cores.\n"
    3.80 -	"  -o <num>   repeat workload and sync operation <m> times\n"
    3.81 -        "  -i <num>   size of workload, repeat <n> times\n"     
    3.82 -	"  -h         this help screen\n\n"
    3.83 -};
    3.84 -
    3.85 -struct barrier_t
    3.86 -{
    3.87 -    int counter;
    3.88 -    int nthreads;
    3.89 -    int32 mutex;
    3.90 -    int32 cond;
    3.91 -    measurement_t endBarrierCycles;
    3.92 -
    3.93 -};
    3.94 -typedef struct barrier_t barrier;
    3.95 -
    3.96 -void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr)
    3.97 - {
    3.98 -   barr->counter = 0;
    3.99 -   barr->nthreads = nthreads;
   3.100 -   barr->mutex   = VPThread__make_mutex(animatingPr);
   3.101 -   barr->cond    = VPThread__make_cond(barr->mutex, animatingPr);
   3.102 - }
   3.103 -
   3.104 -int cycles_counter_main_fd;
   3.105 -void inline barrier_wait(barrier *barr, VirtProcr *animatingPr)
   3.106 - { int i;
   3.107 -
   3.108 -   VPThread__mutex_lock(barr->mutex, animatingPr);
   3.109 -   barr->counter++;
   3.110 -   if(barr->counter == barr->nthreads)
   3.111 -    { 
   3.112 -#ifdef MEASURE_PERF
   3.113 -      read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \
   3.114 -                sizeof(barr->endBarrierCycles.cycles));
   3.115 -#endif
   3.116 -       
   3.117 -      barr->counter = 0;
   3.118 -      for(i=0; i < barr->nthreads; i++)
   3.119 -         VPThread__cond_signal(barr->cond, animatingPr);
   3.120 -    }
   3.121 -   else
   3.122 -    { VPThread__cond_wait(barr->cond, animatingPr);
   3.123 -    }
   3.124 -   VPThread__mutex_unlock(barr->mutex, animatingPr);
   3.125 - }
   3.126 -
   3.127 -
   3.128 -
   3.129 -typedef struct
   3.130 - { struct barrier_t* barrier;
   3.131 -   uint64_t  totalWorkCycles;
   3.132 -   uint64_t  totalBadCycles;
   3.133 -   uint64_t  totalSyncCycles;
   3.134 -   uint64_t  totalBadSyncCycles;
   3.135 -   uint64     numGoodSyncs;
   3.136 -   uint64     numGoodTasks;
   3.137 - }
   3.138 -WorkerParams;
   3.139 -
   3.140 -
   3.141 -typedef struct
   3.142 - { measurement_t *startExeCycles;
   3.143 -   measurement_t *endExeCycles;
   3.144 - }
   3.145 -BenchParams;
   3.146 -
   3.147 -//======================== Globals =========================
   3.148 -char __ProgrammName[] = "overhead_test";
   3.149 -char __DataSet[255];
   3.150 -
   3.151 -int outer_iters, inner_iters, num_threads;
   3.152 -size_t chunk_size = 0;
   3.153 -
   3.154 -int cycles_counter_fd[NUM_CORES];
   3.155 -struct perf_event_attr* hw_event;
   3.156 -
   3.157 -WorkerParams *workerParamsArray;
   3.158 -
   3.159 -//======================== App Code =========================
   3.160 -/*
   3.161 - * Workload
   3.162 - */
   3.163 -
   3.164 -#define saveCyclesAndInstrs(core,cycles) do{     \
   3.165 -   int cycles_fd = cycles_counter_fd[core];             \
   3.166 -   int nread;                                           \
   3.167 -                                                        \
   3.168 -   nread = read(cycles_fd,&(cycles),sizeof(cycles));    \
   3.169 -   if(nread<0){                                         \
   3.170 -       perror("Error reading cycles counter");          \
   3.171 -       cycles = 0;                                      \
   3.172 -   }                                                    \
   3.173 -} while (0) //macro magic for scoping
   3.174 -
   3.175 -
   3.176 -double
   3.177 -worker_TLF(void* _params, VirtProcr* animatingPr)
   3.178 - {
   3.179 -   int i,o;
   3.180 -   WorkerParams* params = (WorkerParams*)_params;
   3.181 -   unsigned int totalWorkCycles = 0, totalBadCycles = 0;
   3.182 -   unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0;
   3.183 -   unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0;
   3.184 -   double workspace2=0.0;
   3.185 -   int32 privateMutex = VPThread__make_mutex(animatingPr);
   3.186 -   
   3.187 -   int cpuid = sched_getcpu();
   3.188 -   
   3.189 -   measurement_t startWorkload, endWorkload, startWorkload2, endWorkload2;
   3.190 -   uint64 numCycles;
   3.191 -   for(o=0; o < outer_iters; o++)
   3.192 -    {
   3.193 -#ifdef MEASURE_PERF
   3.194 -          saveCyclesAndInstrs(cpuid,startWorkload.cycles);
   3.195 -#endif
   3.196 -       
   3.197 -      //workltask
   3.198 -      for(i=0; i < inner_iters; i++)
   3.199 -       {
   3.200 -         workspace1 += (workspace1 + 32)/2;
   3.201 -         workspace2 += (workspace2 + 23.2)/1.4;
   3.202 -       }
   3.203 -  
   3.204 -#ifdef MEASURE_PERF
   3.205 -          saveCyclesAndInstrs(cpuid,endWorkload.cycles);
   3.206 -          numCycles = endWorkload.cycles - startWorkload.cycles;
   3.207 -          //sanity check (400K is about 20K iters)
   3.208 -          if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
   3.209 -          else                     {totalBadCycles  += numCycles; }
   3.210 -#endif
   3.211 -
   3.212 -      //mutex access often causes switch to different Slave VP
   3.213 -      VPThread__mutex_lock(privateMutex, animatingPr);
   3.214 -      
   3.215 -/*
   3.216 -          saveCyclesAndInstrs(cpuid,startWorkload2.cycles);
   3.217 -      //Task
   3.218 -      for(i=0; i < inner_iters; i++)
   3.219 -       {
   3.220 -         workspace1 += (workspace1 + 32)/2;
   3.221 -         workspace2 += (workspace2 + 23.2)/1.4;
   3.222 -       }
   3.223 -      
   3.224 -          saveCyclesAndInstrs(cpuid,endWorkload2.cycles);
   3.225 -          numCycles = endWorkload2.cycles - startWorkload2.cycles;
   3.226 -          //sanity check (400K is about 20K iters)
   3.227 -          if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
   3.228 -          else                     {totalBadCycles  += numCycles; }
   3.229 -      
   3.230 -*/
   3.231 -      VPThread__mutex_unlock(privateMutex, animatingPr);
   3.232 -    }
   3.233 -
   3.234 -   params->totalWorkCycles = totalWorkCycles;
   3.235 -   params->totalBadCycles = totalBadCycles;
   3.236 -   params->numGoodTasks   = numGoodTasks;
   3.237 -   params->totalSyncCycles = totalSyncCycles;
   3.238 -   params->totalBadSyncCycles = totalBadSyncCycles;
   3.239 -   params->numGoodSyncs = numGoodSyncs;
   3.240 -/*
   3.241 -   params->totalSyncCycles = VMS__give_num_plugin_cycles();
   3.242 -   params->totalBadSyncCycles = 0;
   3.243 -   params->numGoodSyncs = VMS__give_num_plugin_animations();
   3.244 -*/
   3.245 -   
   3.246 -   
   3.247 -   //Wait for all threads to end
   3.248 -   barrier_wait(params->barrier, animatingPr);
   3.249 -   
   3.250 -   //Shutdown worker
   3.251 -   VPThread__dissipate_thread(animatingPr);
   3.252 -   
   3.253 -     //below return never reached --> there for gcc
   3.254 -   return (workspace1 + workspace2);  //to prevent gcc from optimizing work out
   3.255 - }
   3.256 -
   3.257 -
   3.258 -/* Seed function that main() hands to VPThread__create_seed_procr_and_do_work(); per the original note it runs after the VMS is set up. Creates num_threads workers, waits for them on a barrier, and stores start/end cycle counts through the BenchParams in _params. */
   3.259 -void benchmark(void *_params, VirtProcr *animatingPr)
   3.260 - {
   3.261 -   int i, cpuID;  //NOTE(review): cpuID is never used in this function
   3.262 -   struct barrier_t  barr;
   3.263 -   BenchParams      *params;
   3.264 -   
   3.265 -   params = (BenchParams *)_params;
   3.266 -
   3.267 -   barrier_init(&barr, num_threads+1, animatingPr);  //num_threads+1: presumably one party per worker plus this function, which also calls barrier_wait() below
   3.268 -      
   3.269 -   //prepare input: point every worker's parameter slot at the shared barrier
   3.270 -   for(i=0; i<num_threads; i++)
   3.271 -    { 
   3.272 -       workerParamsArray[i].barrier = &barr;
   3.273 -    }
   3.274 -     
   3.275 -   //save cycles before execution of threads, to get total exe cycles
   3.276 -   measurement_t *startExeCycles, *endExeCycles;  //NOTE(review): the local endExeCycles is never assigned or read; params->endExeCycles is used directly below
   3.277 -   startExeCycles = params->startExeCycles;
   3.278 -   
   3.279 -#ifdef MEASURE_PERF
   3.280 -   int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles),
   3.281 -                sizeof(startExeCycles->cycles));
   3.282 -   if(nread<0) perror("Error reading cycles counter");  //a failed read is only reported; execution continues
   3.283 -#endif
   3.284 -   
   3.285 -   //create (which starts running) all threads
   3.286 -   for(i=0; i<num_threads; i++)
   3.287 -    { VPThread__create_thread((VirtProcrFnPtr)worker_TLF, &(workerParamsArray[i]), animatingPr);
   3.288 -    }
   3.289 -   //wait for all threads to finish
   3.290 -   barrier_wait(&barr, animatingPr);
   3.291 -  
   3.292 -#ifdef MEASURE_PERF
   3.293 -   //endBarrierCycles read in barrier_wait()!  Merten, email me if want to chg
   3.294 -   params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
   3.295 -#endif
   3.296 -   
   3.297 -
   3.298 -/* Disabled reporting code; it refers to input[], endBenchTime and startBenchTime, none of which are declared in this function.
   3.299 -   uint64_t overallWorkCycles = 0;
   3.300 -   for(i=0; i<num_threads; i++){ 
   3.301 -       printf("WorkCycles: %lu\n",input[i].totalWorkCycles);
   3.302 -       overallWorkCycles += input[i].totalWorkCycles;
   3.303 -    }
   3.304 -   
   3.305 -   printf("Sum across threads of work cycles: %lu\n", overallWorkCycles);
   3.306 -   printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles);
   3.307 -   printf("Runtime/Workcycle Ratio %lu\n", 
   3.308 -   ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles);
   3.309 -*/
   3.310 -
   3.311 -   //======================================================
   3.312 -
   3.313 -   VPThread__dissipate_thread(animatingPr);  //shuts this virtual processor down; the worker code earlier in the file notes that statements after this call are never reached
   3.314 - }
   3.315 -
   3.316 -int main(int argc, char **argv)  //Parses -t/-o/-i/-h, opens the cycle counters (when MEASURE_PERF is defined), runs benchmark() through the VMS runtime, then prints cycle statistics. Returns 0 on success or after -h, EXIT_FAILURE on a bad argument.
   3.317 - {
   3.318 -   int i;
   3.319 -
   3.320 -   //set global static variables, based on cmd-line args
   3.321 -   for(i=1; i<argc; i++)
   3.322 -    {
   3.323 -      if(argv[i][0] == '-' && argv[i][2] == 0)  //accepts only two-character options "-x". NOTE(review): for a bare "-" argument, argv[i][2] indexes past the string terminator
   3.324 -       {
   3.325 -         switch(argv[i][1])
   3.326 -          {
   3.327 -            case 't':
   3.328 -               if(!isdigit(argv[++i][0]))  //NOTE(review): no check that i+1 < argc; when the option is the last argument, argv[++i] is argv[argc] (NULL). Same pattern in the 'o' and 'i' cases
   3.329 -                {
   3.330 -                  fprintf(stderr, "-t must be followed by the number of worker threads to spawn\n");
   3.331 -                  return EXIT_FAILURE;
   3.332 -                }
   3.333 -               num_threads = atoi(argv[i]);
   3.334 -               if(!num_threads)  //a thread count of 0 is rejected
   3.335 -                {
   3.336 -                  fprintf(stderr, "invalid number of threads specified: %d\n", num_threads);
   3.337 -                  return EXIT_FAILURE;
   3.338 -                }
   3.339 -            break;
   3.340 -            case 'o':
   3.341 -               if(!isdigit(argv[++i][0]))
   3.342 -                {
   3.343 -                  fputs("-i must be followed by a number\n", stderr);  //NOTE(review): this is the -o branch but the message names -i
   3.344 -                  return EXIT_FAILURE;
   3.345 -                }
   3.346 -               outer_iters = atoi(argv[i]);
   3.347 -				break;
   3.348 -            case 'i':
   3.349 -               if(!isdigit(argv[++i][0]))
   3.350 -                {
   3.351 -                  fputs("-o must be followed by a number (workload size)\n", stderr);  //NOTE(review): this is the -i branch but the message names -o
   3.352 -                  return EXIT_FAILURE;
   3.353 -                }
   3.354 -               inner_iters = atoi(argv[i]);
   3.355 -				break;
   3.356 -            case 'h':
   3.357 -               fputs(usage, stdout);
   3.358 -               return 0;
   3.359 -				
   3.360 -            default:
   3.361 -               fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
   3.362 -               fputs(usage, stderr);
   3.363 -               return EXIT_FAILURE;
   3.364 -          }//switch
   3.365 -       }//if arg
   3.366 -      else
   3.367 -       {
   3.368 -			fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
   3.369 -			fputs(usage, stderr);
   3.370 -			return EXIT_FAILURE;
   3.371 -       }
   3.372 -    }//for
   3.373 -   
   3.374 -   
   3.375 -#ifdef MEASURE_PERF
   3.376 -   //setup performance counters: one attribute struct, reused for every perf_event_open call below
   3.377 -    hw_event = malloc(sizeof(struct perf_event_attr));  //NOTE(review): the result is not checked before the memset below
   3.378 -    memset(hw_event,0,sizeof(struct perf_event_attr));
   3.379 -    
   3.380 -    hw_event->type = PERF_TYPE_HARDWARE;
   3.381 -    hw_event->size = sizeof(hw_event);  //NOTE(review): hw_event is a pointer (malloc'd above, dereferenced with ->), so this is the pointer size, not sizeof(struct perf_event_attr) -- confirm intent
   3.382 -    hw_event->disabled = 0;
   3.383 -    hw_event->freq = 0;
   3.384 -    hw_event->inherit = 1; /* children inherit it   */
   3.385 -    hw_event->pinned = 1; /* says this virt counter must always be on HW */
   3.386 -    hw_event->exclusive = 0; /* 0: the "only group on PMU" option is off */
   3.387 -    hw_event->exclude_user = 0; /* 0: user mode is not excluded */
   3.388 -    hw_event->exclude_kernel = 1; /* don't count kernel  */
   3.389 -    hw_event->exclude_hv = 1; /* ditto hypervisor      */
   3.390 -    hw_event->exclude_idle = 1; /* don't count when idle */
   3.391 -    hw_event->mmap = 0; /* 0: mmap data not requested */
   3.392 -    hw_event->comm = 0; /* 0: comm data not requested */
   3.393 -
   3.394 -    hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles
   3.395 -    
   3.396 -    int cpuID, retries;
   3.397 -
   3.398 -   for( cpuID = 0; cpuID < NUM_CORES; cpuID++ )  //one counter fd per core; NUM_CORES is defined outside this chunk
   3.399 -    { retries = 0;
   3.400 -      do
   3.401 -       { retries += 1;
   3.402 -         cycles_counter_fd[cpuID] = 
   3.403 -          syscall(__NR_perf_event_open, hw_event,
   3.404 -                  0,//pid_t: 0 is "pid of calling process" 
   3.405 -                  cpuID,//int: cpu, the value returned by "CPUID" instr(?)
   3.406 -                  -1,//int: group_fd, -1 is "leader" or independent
   3.407 -                  0//unsigned long: flags
   3.408 -                 );
   3.409 -       }
   3.410 -      while(cycles_counter_fd[cpuID]<0 && retries < 100);  //retry until the syscall returns a non-negative fd or 100 attempts have been made
   3.411 -      if(retries >= 100)  //NOTE(review): also true when the 100th attempt succeeded; the failure is only reported and setup continues
   3.412 -       {
   3.413 -         fprintf(stderr,"On core %d: ",cpuID);
   3.414 -         perror("Failed to open cycles counter");
   3.415 -       }
   3.416 -    }
   3.417 -
   3.418 -   //Set up counter to accumulate total cycles to process, across all CPUs
   3.419 -
   3.420 -   retries = 0;
   3.421 -   do
   3.422 -    { retries += 1;
   3.423 -      cycles_counter_main_fd = 
   3.424 -       syscall(__NR_perf_event_open, hw_event,
   3.425 -               0,//pid_t: 0 is "pid of calling process" 
   3.426 -               -1,//int: cpu, -1 means accumulate from all cores
   3.427 -               -1,//int: group_fd, -1 is "leader" == independent
   3.428 -               0//unsigned long: flags
   3.429 -              );
   3.430 -    }
   3.431 -   while(cycles_counter_main_fd<0 && retries < 100);
   3.432 -   if(retries >= 100)  //NOTE(review): same retry-count test as the per-core loop above; reported only, execution continues
   3.433 -    {
   3.434 -      fprintf(stderr,"in main ");
   3.435 -      perror("Failed to open cycles counter");
   3.436 -    }
   3.437 -#endif
   3.438 -   
   3.439 -   measurement_t startExeCycles, endExeCycles;  //locals of main; benchmark() writes to them through the pointers stored in benchParams
   3.440 -   BenchParams *benchParams;
   3.441 -   
   3.442 -   benchParams = malloc(sizeof(BenchParams)); //NOTE(review): result is not checked before the stores below
   3.443 -   
   3.444 -   benchParams->startExeCycles = &startExeCycles;
   3.445 -   benchParams->endExeCycles   = &endExeCycles;
   3.446 -   
   3.447 -   workerParamsArray =  (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) );  //num_threads+1 entries are allocated; the code visible here only indexes 0..num_threads-1
   3.448 -   if(workerParamsArray == NULL ) printf("error mallocing worker params array\n");  //NOTE(review): failure is only printed; the NULL pointer is still used afterwards
   3.449 -   
   3.450 - 
   3.451 -   //This is the transition to the VMS runtime
   3.452 -   VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );
   3.453 -   
   3.454 -#ifdef MEASURE_PERF
   3.455 -   uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
   3.456 -   uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;  //accumulated below but not printed by any active statement
   3.457 -   for(i=0; i<num_threads; i++){ 
   3.458 -       printf("WorkCycles: %lu\n",workerParamsArray[i].totalWorkCycles);
   3.459 -//       printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
   3.460 -//       printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
   3.461 -//       printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
   3.462 -       totalWorkCyclesAcrossCores += workerParamsArray[i].totalWorkCycles;
   3.463 -       totalBadCyclesAcrossCores  += workerParamsArray[i].totalBadCycles;
   3.464 -       totalSyncCyclesAcrossCores += workerParamsArray[i].totalSyncCycles;
   3.465 -       totalBadSyncCyclesAcrossCores  += workerParamsArray[i].totalBadSyncCycles;
   3.466 -    }
   3.467 -
   3.468 -   uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;  //end-of-barrier count minus the count read before the workers were created
   3.469 -   totalExeCycles -= totalBadCyclesAcrossCores;  //remove the cycles the workers reported as bad
   3.470 -   uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores;  //execution cycles not accounted for as work
   3.471 -   int32  numSyncs = outer_iters * num_threads * 2;
   3.472 -   printf("Total Execution Cycles: %lu\n", totalExeCycles);  //NOTE(review): %lu matches uint64_t only where unsigned long is 64 bits wide; PRIu64 from <inttypes.h> is the portable form
   3.473 -   printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
   3.474 -   printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores);
   3.475 -//   printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);
   3.476 -   printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs );
   3.477 -   printf("ExeCycles/WorkCycles Ratio %f\n", 
   3.478 -          (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
   3.479 -#else
   3.480 -   printf("No measurement done!\n");
   3.481 -#endif
   3.482 -   return 0;
   3.483 - }