changeset 6:c8995a602b46

mallocing hw_event
author Merten Sach <msach@mailbox.tu-berlin.de>
date Mon, 21 Nov 2011 19:16:03 +0100
parents 535c119ba090
children 28650a4df2b9
files .hgignore Makefile src/Application/main.c
diffstat 3 files changed, 132 insertions(+), 93 deletions(-) [+]
line diff
     1.1 --- a/.hgignore	Fri Oct 28 06:56:35 2011 -0700
     1.2 +++ b/.hgignore	Mon Nov 21 19:16:03 2011 +0100
     1.3 @@ -7,3 +7,4 @@
     1.4  c-ray-mt
     1.5  *.ppm
     1.6  *.o
     1.7 +*~
     2.1 --- a/Makefile	Fri Oct 28 06:56:35 2011 -0700
     2.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.3 @@ -1,51 +0,0 @@
     2.4 -obj = 	\
     2.5 -	src/VPThread_lib/VMS/Histogram/Histogram.o \
     2.6 -	src/VPThread_lib/VMS/Histogram/FloatHist.o \
     2.7 -	src/VPThread_lib/VMS/CoreLoop.o \
     2.8 -	src/VPThread_lib/VMS/VMS.o \
     2.9 -	src/VPThread_lib/VMS/MasterLoop.o \
    2.10 -	src/VPThread_lib/VMS/Queue_impl/PrivateQueue.o \
    2.11 -	src/VPThread_lib/VMS/Hash_impl/PrivateHash.o \
    2.12 -	src/VPThread_lib/VMS/DynArray/DynArray.o \
    2.13 -	src/VPThread_lib/VPThread_PluginFns.o \
    2.14 -	src/VPThread_lib/VPThread_lib.o \
    2.15 -	src/VPThread_lib/VMS/Histogram/DblHist.o \
    2.16 -	src/VPThread_lib/VPThread.o \
    2.17 -	src/VPThread_lib/VMS/probes.o \
    2.18 -	src/VPThread_lib/VMS/ProcrContext.o \
    2.19 -	src/VPThread_lib/VPThread_Request_Handlers.o \
    2.20 -	src/VPThread_lib/VPThread_helper.o \
    2.21 -	src/VPThread_lib/VMS/Hash_impl/MurmurHash2.o \
    2.22 -	src/VPThread_lib/VMS/vmalloc.o \
    2.23 -	src/VPThread_lib/VMS/contextSwitch.o \
    2.24 -	src/VPThread_lib/VMS/Queue_impl/BlockingQueue.o \
    2.25 -	src/VPThread_lib/VMS/vutilities.o \
    2.26 -	src/Application/main.o
    2.27 -
    2.28 -bin = task_size_vs_exe_time
    2.29 -
    2.30 -NUM_CORES=4
    2.31 -
    2.32 -CC = gcc
    2.33 -CFLAGS = -m64 -ffast-math -fwrapv -fno-omit-frame-pointer -O3 -D VPTHREAD -D APPLICATION=C-RAY -D NUM_CORES=$(NUM_CORES) -g -Wall
    2.34 -
    2.35 -$(bin): $(obj)
    2.36 -	$(CC) -o $@ $(obj) -lm -lpthread
    2.37 -	
    2.38 -%.o : %.c
    2.39 -	$(CC) -c $(CFLAGS) -o $@ $<
    2.40 -
    2.41 -.PHONY: clean
    2.42 -clean:
    2.43 -	rm -f $(obj) $(bin)
    2.44 -
    2.45 -.PHONY: install
    2.46 -install:
    2.47 -	cp $(bin) /usr/local/bin/$(bin)
    2.48 -
    2.49 -.PHONY: uninstall
    2.50 -uninstall:
    2.51 -	rm -f /usr/local/bin/$(bin)
    2.52 -
    2.53 -
    2.54 -# $@ Name des Targets
     3.1 --- a/src/Application/main.c	Fri Oct 28 06:56:35 2011 -0700
     3.2 +++ b/src/Application/main.c	Mon Nov 21 19:16:03 2011 +0100
     3.3 @@ -8,9 +8,14 @@
     3.4  #include <ctype.h>
     3.5  #include <errno.h>
     3.6  #include <pthread.h>
     3.7 +#include <unistd.h>
     3.8  #include "VPThread_lib/VPThread.h"
     3.9  #include "VPThread_lib/VMS/Queue_impl/PrivateQueue.h"
    3.10  
    3.11 +#include <linux/perf_event.h>
    3.12 +#include <linux/prctl.h>
    3.13 +#include <sys/syscall.h>
    3.14 +
    3.15  #undef DEBUG
    3.16  //#define DEBUG
    3.17  
    3.18 @@ -61,9 +66,9 @@
    3.19  	"Usage: malloc_test [options]\n"
    3.20  	"  Spwans a number of threads and allocates memory.\n\n"
    3.21  	"Options:\n"
    3.22 -	"  -t <num>   how many threads to use (default: 1)\n"
    3.23 -	"  -m <num>   repeat workload and sync operation <m> times\n"
    3.24 -        "  -n <num>   size of workload, repeat <n> times\n"     
    3.25 +	"  -t <num>   how many threads to use (default: 1). This is internaly multiplied by the number of cores.\n"
    3.26 +	"  -o <num>   repeat workload and sync operation <m> times\n"
    3.27 +        "  -i <num>   size of workload, repeat <n> times\n"     
    3.28  	"  -h         this help screen\n\n"
    3.29  };
    3.30  
    3.31 @@ -100,19 +105,16 @@
    3.32     VPThread__mutex_unlock(barr->mutex, animatingPr);
    3.33   }
    3.34  
    3.35 +struct perfData{
    3.36 +    uint64 cycles;
    3.37 +    uint64 instructions;
    3.38 +};
    3.39  
    3.40 -
    3.41 -union timeStamp{
    3.42 -    uint32 highLow[2];
    3.43 -    uint64 total;
    3.44 -};
    3.45 +typedef struct perfData measurement_t;
    3.46  
    3.47  struct input_t{
    3.48      struct barrier_t* barrier;
    3.49      uint64  totalWorkCycles;
    3.50 -    uint64  workPlusMutexCycles;
    3.51 -    union timeStamp startTime;
    3.52 -    union timeStamp endTime;
    3.53  };
    3.54  
    3.55  
    3.56 @@ -123,11 +125,24 @@
    3.57  int repetitions, workload_size, num_threads;
    3.58  size_t chunk_size = 0;
    3.59  
    3.60 +int cycles_counter_fd[NUM_CORES];
    3.61  
    3.62  //======================== App Code =========================
    3.63  /*
    3.64   * Workload
    3.65   */
    3.66 +
    3.67 +#define saveCyclesAndInstrs(core,cycles) do{     \
    3.68 +   int cycles_fd = cycles_counter_fd[core];             \
    3.69 +   int nread;                                           \
    3.70 +                                                        \
    3.71 +   nread = read(cycles_fd,&(cycles),sizeof(cycles));    \
    3.72 +   if(nread<=0){                                         \
    3.73 +       perror("Error reading cycles counter");          \
    3.74 +       cycles = 0;                                      \
    3.75 +   }                                                    \
    3.76 +} while (0) //do/while(0) wrapper: scopes the locals and makes the macro expand to one statement
    3.77 +
    3.78  void work(void* input, VirtProcr* animatingPr)
    3.79   {
    3.80     int n,m;
    3.81 @@ -136,35 +151,48 @@
    3.82     unsigned int workspace1;
    3.83     double workspace2;
    3.84     int32 privateMutex = VPThread__make_mutex(animatingPr);
    3.85 -
    3.86 -   saveTimeStampCountInto(in->startTime.highLow[0], in->startTime.highLow[1]);
    3.87 +   
    3.88 +   int cpuid = sched_getcpu();
    3.89 +    
    3.90     for(m=0; m<repetitions; m++)
    3.91      {
    3.92 -      int32 stamp_startWorkload, stamp_endWorkload;
    3.93 -      saveLowTimeStampCountInto( stamp_startWorkload );
    3.94 +       
    3.95 +      //measure inner workload to determine synchronisation overhead by subtraction
    3.96 +      measurement_t startWorkload, endWorkload;
    3.97 +      saveCyclesAndInstrs(cpuid,startWorkload.cycles);
    3.98 +
    3.99 +      //workload
   3.100        for(n=0; n<workload_size; n++)
   3.101         {
   3.102           workspace1 += (workspace1 + 32)/2;
   3.103           workspace2 += (workspace2 + 23.2)/1.4;
   3.104         }
   3.105 -      saveLowTimeStampCountInto( stamp_endWorkload );
   3.106 -      int32 numCycles = stamp_endWorkload-stamp_startWorkload;
   3.107 -      if( numCycles < 100000000 ) totalWorkCycles += numCycles;//sanity check
   3.108 +      
   3.109 +    
   3.110 +      //measure end of inner workload
   3.111 +      saveCyclesAndInstrs(cpuid,endWorkload.cycles);
   3.112 +      uint64 numCycles = endWorkload.cycles - startWorkload.cycles;
   3.113 +      
   3.114 +      
   3.115 +     if( numCycles < 100000000 ) totalWorkCycles += numCycles;//sanity check
   3.116  
   3.117        VPThread__mutex_lock(privateMutex, animatingPr);
   3.118 -      //access queue
   3.119 +      //lock access to switch to different task
   3.120        VPThread__mutex_unlock(privateMutex, animatingPr);
   3.121      }
   3.122  
   3.123 -   saveTimeStampCountInto( in->endTime.highLow[0], in->endTime.highLow[1] );
   3.124     in->totalWorkCycles = totalWorkCycles;
   3.125 -   in->workPlusMutexCycles = in->endTime.total - in->startTime.total;
   3.126 +   printf("Cycles: %lu on CPU %lu\n", totalWorkCycles, cpuid);
   3.127 +   
   3.128 +   
   3.129 +   //Wait for all threads to end
   3.130     barrier_wait(in->barrier, animatingPr);
   3.131     
   3.132     //Shutdown worker
   3.133     VPThread__dissipate_thread(animatingPr);
   3.134 -//   printf("%d", workspace1);  //Should never execute!  VMS bug if does
   3.135 -//   printf("%f", workspace2);
   3.136 +   
   3.137 +   printf("%d", workspace1);  //This is to prevent gcc from optimizing out the
   3.138 +   printf("%f", workspace2);  //two workspace variables
   3.139   }
   3.140  
   3.141  /* this is run after the VMS is set up*/
   3.142 @@ -174,29 +202,89 @@
   3.143     struct input_t input[num_threads];
   3.144     struct barrier_t barr;
   3.145     barrier_init(&barr, num_threads+1, animatingPr);
   3.146 +   
   3.147 +    //setup performance counters
   3.148 +    struct perf_event_attr* hw_event;
   3.149 +    hw_event = VMS__malloc(sizeof(struct perf_event_attr));
   3.150 +    memset(hw_event,0,sizeof(hw_event));
   3.151 +        hw_event->type = PERF_TYPE_HARDWARE;
   3.152 +        hw_event->size = sizeof(hw_event);
   3.153 +        hw_event->disabled = 0;
   3.154 +        hw_event->freq = 0;
   3.155 +        hw_event->inherit = 1; /* children inherit it   */
   3.156 +        hw_event->pinned = 1; /* must always be on PMU */
   3.157 +        hw_event->exclusive = 0; /* only group on PMU     */
   3.158 +        hw_event->exclude_user = 0; /* don't count user      */
   3.159 +        hw_event->exclude_kernel = 1; /* ditto kernel          */
   3.160 +        hw_event->exclude_hv = 1; /* ditto hypervisor      */
   3.161 +        hw_event->exclude_idle = 1; /* don't count when idle */
   3.162 +        hw_event->mmap = 0; /* include mmap data     */
   3.163 +        hw_event->comm = 0; /* include comm data     */
   3.164  
   3.165 -   for(i=0; i<num_threads; i++)
   3.166 -    { input[i].barrier = &barr;
   3.167 +
   3.168 +    for( i = 0; i < NUM_CORES; i++ )
   3.169 +    {
   3.170 +        hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles
   3.171 +        cycles_counter_fd[i] = syscall(__NR_perf_event_open, hw_event,
   3.172 +                0,//pid_t pid, 
   3.173 +                i,//int cpu, 
   3.174 +                -1,//int group_fd,
   3.175 +                0//unsigned long flags
   3.176 +        );
   3.177 +        if (cycles_counter_fd[i]<0){
   3.178 +            fprintf(stderr,"On core %d: ",i);
   3.179 +            perror("Failed to open cycles counter");
   3.180 +        }
   3.181      }
   3.182  
   3.183 -   union timeStamp startBenchTime, endBenchTime;
   3.184 -   uint64 lastThreadFinishTime = 0L;
   3.185 -   saveTimeStampCountInto(startBenchTime.highLow[0], startBenchTime.highLow[1]);
   3.186 +   //Count on all CPUs
   3.187 +   int cycles_counter_main_fd = syscall(__NR_perf_event_open, hw_event,
   3.188 +                0,//pid_t pid, 
   3.189 +                -1,//int cpu, 
   3.190 +                -1,//int group_fd,
   3.191 +                0//unsigned long flags
   3.192 +        );
   3.193 +    if (cycles_counter_main_fd<0){
   3.194 +        fprintf(stderr,"On core %d: ",i);
   3.195 +        perror("Failed to open cycles counter");
   3.196 +    } 
   3.197 +   
   3.198 +   //prepare input
   3.199     for(i=0; i<num_threads; i++)
   3.200 -    { VPThread__create_thread((VirtProcrFnPtr)work, (void*)&input[i], animatingPr);
   3.201 +    { 
   3.202 +       input[i].barrier = &barr;
   3.203      }
   3.204 +    
   3.205 +   //save cycles before execution of threads to get longest runtime
   3.206 +   measurement_t startBenchTime, endBenchTime;
   3.207 +   int nread = read(cycles_counter_main_fd,&(startBenchTime.cycles),
   3.208 +                sizeof(startBenchTime.cycles));
   3.209 +   if(nread<0){                                         
   3.210 +       perror("Error reading cycles counter");
   3.211 +   }
   3.212 +   
   3.213 +   //create all threads
   3.214 +   for(i=0; i<num_threads; i++)
   3.215 +    { VPThread__create_thread((VirtProcrFnPtr)work, (void*)&input[i], animatingPr);}
   3.216 +   //wait for all threads to finish
   3.217     barrier_wait(&barr, animatingPr);
   3.218 -   saveTimeStampCountInto(endBenchTime.highLow[0], endBenchTime.highLow[1]);
   3.219 +  
   3.220 +   
   3.221 +   //longest thread  measurement
   3.222 +   nread = read(cycles_counter_main_fd,&(endBenchTime.cycles),
   3.223 +                sizeof(endBenchTime.cycles));
   3.224 +   if(nread<0){                                         
   3.225 +       perror("Error reading cycles counter");
   3.226 +   }
   3.227  
   3.228 -
   3.229 -   for(i=0; i<num_threads; i++)
   3.230 -    { printf("WorkCycles: %d\n",input[i].totalWorkCycles);
   3.231 -      printf("Work + Sync Cycles: %lu\n", input[i].workPlusMutexCycles);
   3.232 -      if(input[i].endTime.total > lastThreadFinishTime)
   3.233 -         lastThreadFinishTime = input[i].endTime.total;
   3.234 +   uint64_t overallWorkCycles = 0;
   3.235 +   for(i=0; i<num_threads; i++){ 
   3.236 +       printf("WorkCycles: %d\n",input[i].totalWorkCycles);
   3.237 +       overallWorkCycles += input[i].totalWorkCycles;
   3.238      }
   3.239 -   printf("Time inside Barrier: %lu\n", endBenchTime.total-startBenchTime.total);
   3.240 -   printf("Longest Span: %lu\n", lastThreadFinishTime-startBenchTime.total);
   3.241 +   
   3.242 +   printf("Sum across threads of work cycles: %lu\n", overallWorkCycles);
   3.243 +   printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles);
   3.244  
   3.245     //======================================================
   3.246  
   3.247 @@ -226,19 +314,20 @@
   3.248                    fprintf(stderr, "invalid number of threads specified: %d\n", num_threads);
   3.249                    return EXIT_FAILURE;
   3.250                  }
   3.251 +               num_threads *= NUM_CORES;
   3.252              break;
   3.253 -            case 'n':
   3.254 +            case 'i':
   3.255                 if(!isdigit(argv[++i][0]))
   3.256                  {
   3.257 -                  fputs("-n must be followed by a number\n", stderr);
   3.258 +                  fputs("-i must be followed by a number\n", stderr);
   3.259                    return EXIT_FAILURE;
   3.260                  }
   3.261                 repetitions = atoi(argv[i]);
   3.262  				break;
   3.263 -            case 'm':
   3.264 +            case 'o':
   3.265                 if(!isdigit(argv[++i][0]))
   3.266                  {
   3.267 -                  fputs("-m must be followed by a number (workload size)\n", stderr);
   3.268 +                  fputs("-o must be followed by a number (workload size)\n", stderr);
   3.269                    return EXIT_FAILURE;
   3.270                  }
   3.271                 workload_size = atoi(argv[i]);