Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > Vthread > Vthread__Best_Effort_Msg__Bench
changeset 6:c8995a602b46
mallocing hw_event
| author | Merten Sach <msach@mailbox.tu-berlin.de> |
|---|---|
| date | Mon, 21 Nov 2011 19:16:03 +0100 |
| parents | 535c119ba090 |
| children | 28650a4df2b9 |
| files | .hgignore Makefile src/Application/main.c |
| diffstat | 3 files changed, 132 insertions(+), 93 deletions(-) [+] |
line diff
1.1 --- a/.hgignore Fri Oct 28 06:56:35 2011 -0700 1.2 +++ b/.hgignore Mon Nov 21 19:16:03 2011 +0100 1.3 @@ -7,3 +7,4 @@ 1.4 c-ray-mt 1.5 *.ppm 1.6 *.o 1.7 +*~
2.1 --- a/Makefile Fri Oct 28 06:56:35 2011 -0700 2.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 2.3 @@ -1,51 +0,0 @@ 2.4 -obj = \ 2.5 - src/VPThread_lib/VMS/Histogram/Histogram.o \ 2.6 - src/VPThread_lib/VMS/Histogram/FloatHist.o \ 2.7 - src/VPThread_lib/VMS/CoreLoop.o \ 2.8 - src/VPThread_lib/VMS/VMS.o \ 2.9 - src/VPThread_lib/VMS/MasterLoop.o \ 2.10 - src/VPThread_lib/VMS/Queue_impl/PrivateQueue.o \ 2.11 - src/VPThread_lib/VMS/Hash_impl/PrivateHash.o \ 2.12 - src/VPThread_lib/VMS/DynArray/DynArray.o \ 2.13 - src/VPThread_lib/VPThread_PluginFns.o \ 2.14 - src/VPThread_lib/VPThread_lib.o \ 2.15 - src/VPThread_lib/VMS/Histogram/DblHist.o \ 2.16 - src/VPThread_lib/VPThread.o \ 2.17 - src/VPThread_lib/VMS/probes.o \ 2.18 - src/VPThread_lib/VMS/ProcrContext.o \ 2.19 - src/VPThread_lib/VPThread_Request_Handlers.o \ 2.20 - src/VPThread_lib/VPThread_helper.o \ 2.21 - src/VPThread_lib/VMS/Hash_impl/MurmurHash2.o \ 2.22 - src/VPThread_lib/VMS/vmalloc.o \ 2.23 - src/VPThread_lib/VMS/contextSwitch.o \ 2.24 - src/VPThread_lib/VMS/Queue_impl/BlockingQueue.o \ 2.25 - src/VPThread_lib/VMS/vutilities.o \ 2.26 - src/Application/main.o 2.27 - 2.28 -bin = task_size_vs_exe_time 2.29 - 2.30 -NUM_CORES=4 2.31 - 2.32 -CC = gcc 2.33 -CFLAGS = -m64 -ffast-math -fwrapv -fno-omit-frame-pointer -O3 -D VPTHREAD -D APPLICATION=C-RAY -D NUM_CORES=$(NUM_CORES) -g -Wall 2.34 - 2.35 -$(bin): $(obj) 2.36 - $(CC) -o $@ $(obj) -lm -lpthread 2.37 - 2.38 -%.o : %.c 2.39 - $(CC) -c $(CFLAGS) -o $@ $< 2.40 - 2.41 -.PHONY: clean 2.42 -clean: 2.43 - rm -f $(obj) $(bin) 2.44 - 2.45 -.PHONY: install 2.46 -install: 2.47 - cp $(bin) /usr/local/bin/$(bin) 2.48 - 2.49 -.PHONY: uninstall 2.50 -uninstall: 2.51 - rm -f /usr/local/bin/$(bin) 2.52 - 2.53 - 2.54 -# $@ Name des Targets
3.1 --- a/src/Application/main.c Fri Oct 28 06:56:35 2011 -0700 3.2 +++ b/src/Application/main.c Mon Nov 21 19:16:03 2011 +0100 3.3 @@ -8,9 +8,14 @@ 3.4 #include <ctype.h> 3.5 #include <errno.h> 3.6 #include <pthread.h> 3.7 +#include <unistd.h> 3.8 #include "VPThread_lib/VPThread.h" 3.9 #include "VPThread_lib/VMS/Queue_impl/PrivateQueue.h" 3.10 3.11 +#include <linux/perf_event.h> 3.12 +#include <linux/prctl.h> 3.13 +#include <sys/syscall.h> 3.14 + 3.15 #undef DEBUG 3.16 //#define DEBUG 3.17 3.18 @@ -61,9 +66,9 @@ 3.19 "Usage: malloc_test [options]\n" 3.20 " Spwans a number of threads and allocates memory.\n\n" 3.21 "Options:\n" 3.22 - " -t <num> how many threads to use (default: 1)\n" 3.23 - " -m <num> repeat workload and sync operation <m> times\n" 3.24 - " -n <num> size of workload, repeat <n> times\n" 3.25 + " -t <num> how many threads to use (default: 1). This is internaly multiplied by the number of cores.\n" 3.26 + " -o <num> repeat workload and sync operation <m> times\n" 3.27 + " -i <num> size of workload, repeat <n> times\n" 3.28 " -h this help screen\n\n" 3.29 }; 3.30 3.31 @@ -100,19 +105,16 @@ 3.32 VPThread__mutex_unlock(barr->mutex, animatingPr); 3.33 } 3.34 3.35 +struct perfData{ 3.36 + uint64 cycles; 3.37 + uint64 instructions; 3.38 +}; 3.39 3.40 - 3.41 -union timeStamp{ 3.42 - uint32 highLow[2]; 3.43 - uint64 total; 3.44 -}; 3.45 +typedef struct perfData measurement_t; 3.46 3.47 struct input_t{ 3.48 struct barrier_t* barrier; 3.49 uint64 totalWorkCycles; 3.50 - uint64 workPlusMutexCycles; 3.51 - union timeStamp startTime; 3.52 - union timeStamp endTime; 3.53 }; 3.54 3.55 3.56 @@ -123,11 +125,24 @@ 3.57 int repetitions, workload_size, num_threads; 3.58 size_t chunk_size = 0; 3.59 3.60 +int cycles_counter_fd[NUM_CORES]; 3.61 3.62 //======================== App Code ========================= 3.63 /* 3.64 * Workload 3.65 */ 3.66 + 3.67 +#define saveCyclesAndInstrs(core,cycles) do{ \ 3.68 + int cycles_fd = cycles_counter_fd[core]; \ 3.69 + int nread; \ 3.70 + \ 3.71 + nread = read(cycles_fd,&(cycles),sizeof(cycles)); \ 3.72 + if(nread<=0){ \ 3.73 + perror("Error reading cycles counter"); \ 3.74 + cycles = 0; \ 3.75 + } \ 3.76 +} while (0) //macro magic for scoping 3.77 + 3.78 void work(void* input, VirtProcr* animatingPr) 3.79 { 3.80 int n,m; 3.81 @@ -136,35 +151,48 @@ 3.82 unsigned int workspace1; 3.83 double workspace2; 3.84 int32 privateMutex = VPThread__make_mutex(animatingPr); 3.85 - 3.86 - saveTimeStampCountInto(in->startTime.highLow[0], in->startTime.highLow[1]); 3.87 + 3.88 + int cpuid = sched_getcpu(); 3.89 + 3.90 for(m=0; m<repetitions; m++) 3.91 { 3.92 - int32 stamp_startWorkload, stamp_endWorkload; 3.93 - saveLowTimeStampCountInto( stamp_startWorkload ); 3.94 + 3.95 + //measure inner workload to determine synchronisation overhead by subtraction 3.96 + measurement_t startWorkload, endWorkload; 3.97 + saveCyclesAndInstrs(cpuid,startWorkload.cycles); 3.98 + 3.99 + //workload 3.100 for(n=0; n<workload_size; n++) 3.101 { 3.102 workspace1 += (workspace1 + 32)/2; 3.103 workspace2 += (workspace2 + 23.2)/1.4; 3.104 } 3.105 - saveLowTimeStampCountInto( stamp_endWorkload ); 3.106 - int32 numCycles = stamp_endWorkload-stamp_startWorkload; 3.107 - if( numCycles < 100000000 ) totalWorkCycles += numCycles;//sanity check 3.108 + 3.109 + 3.110 + //measure end of inner workload 3.111 + saveCyclesAndInstrs(cpuid,endWorkload.cycles); 3.112 + uint64 numCycles = endWorkload.cycles - startWorkload.cycles; 3.113 + 3.114 + 3.115 + if( numCycles < 100000000 ) totalWorkCycles += numCycles;//sanity check 3.116 3.117 VPThread__mutex_lock(privateMutex, animatingPr); 3.118 - //access queue 3.119 + //lock access to switch to different tast 3.120 VPThread__mutex_unlock(privateMutex, animatingPr); 3.121 } 3.122 3.123 - saveTimeStampCountInto( in->endTime.highLow[0], in->endTime.highLow[1] ); 3.124 in->totalWorkCycles = totalWorkCycles; 3.125 - in->workPlusMutexCycles = in->endTime.total - in->startTime.total; 3.126 + printf("Cycles: %lu on CPU %lu\n", totalWorkCycles, cpuid); 3.127 + 3.128 + 3.129 + //Wait for all threads to end 3.130 barrier_wait(in->barrier, animatingPr); 3.131 3.132 //Shutdown worker 3.133 VPThread__dissipate_thread(animatingPr); 3.134 -// printf("%d", workspace1); //Should never execute! VMS bug if does 3.135 -// printf("%f", workspace2); 3.136 + 3.137 + printf("%d", workspace1); //This is to prevent gcc from optimizing out the 3.138 + printf("%f", workspace2); //two workspace variables 3.139 } 3.140 3.141 /* this is run after the VMS is set up*/ 3.142 @@ -174,29 +202,89 @@ 3.143 struct input_t input[num_threads]; 3.144 struct barrier_t barr; 3.145 barrier_init(&barr, num_threads+1, animatingPr); 3.146 + 3.147 + //setup performance counters 3.148 + struct perf_event_attr* hw_event; 3.149 + hw_event = VMS__malloc(sizeof(struct perf_event_attr)); 3.150 + memset(hw_event,0,sizeof(hw_event)); 3.151 + hw_event->type = PERF_TYPE_HARDWARE; 3.152 + hw_event->size = sizeof(hw_event); 3.153 + hw_event->disabled = 0; 3.154 + hw_event->freq = 0; 3.155 + hw_event->inherit = 1; /* children inherit it */ 3.156 + hw_event->pinned = 1; /* must always be on PMU */ 3.157 + hw_event->exclusive = 0; /* only group on PMU */ 3.158 + hw_event->exclude_user = 0; /* don't count user */ 3.159 + hw_event->exclude_kernel = 1; /* ditto kernel */ 3.160 + hw_event->exclude_hv = 1; /* ditto hypervisor */ 3.161 + hw_event->exclude_idle = 1; /* don't count when idle */ 3.162 + hw_event->mmap = 0; /* include mmap data */ 3.163 + hw_event->comm = 0; /* include comm data */ 3.164 3.165 - for(i=0; i<num_threads; i++) 3.166 - { input[i].barrier = &barr; 3.167 + 3.168 + for( i = 0; i < NUM_CORES; i++ ) 3.169 + { 3.170 + hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles 3.171 + cycles_counter_fd[i] = syscall(__NR_perf_event_open, hw_event, 3.172 + 0,//pid_t pid, 3.173 + i,//int cpu, 3.174 + -1,//int group_fd, 3.175 + 0//unsigned long flags 3.176 + ); 3.177 + if (cycles_counter_fd[i]<0){ 3.178 + fprintf(stderr,"On core %d: ",i); 3.179 + perror("Failed to open cycles counter"); 3.180 + } 3.181 } 3.182 3.183 - union timeStamp startBenchTime, endBenchTime; 3.184 - uint64 lastThreadFinishTime = 0L; 3.185 - saveTimeStampCountInto(startBenchTime.highLow[0], startBenchTime.highLow[1]); 3.186 + //Count on all CPUs 3.187 + int cycles_counter_main_fd = syscall(__NR_perf_event_open, hw_event, 3.188 + 0,//pid_t pid, 3.189 + -1,//int cpu, 3.190 + -1,//int group_fd, 3.191 + 0//unsigned long flags 3.192 + ); 3.193 + if (cycles_counter_main_fd<0){ 3.194 + fprintf(stderr,"On core %d: ",i); 3.195 + perror("Failed to open cycles counter"); 3.196 + } 3.197 + 3.198 + //prepare input 3.199 for(i=0; i<num_threads; i++) 3.200 - { VPThread__create_thread((VirtProcrFnPtr)work, (void*)&input[i], animatingPr); 3.201 + { 3.202 + input[i].barrier = &barr; 3.203 } 3.204 + 3.205 + //save cycles before execution of threads to get longest runtime 3.206 + measurement_t startBenchTime, endBenchTime; 3.207 + int nread = read(cycles_counter_main_fd,&(startBenchTime.cycles), 3.208 + sizeof(startBenchTime.cycles)); 3.209 + if(nread<0){ 3.210 + perror("Error reading cycles counter"); 3.211 + } 3.212 + 3.213 + //create all threads 3.214 + for(i=0; i<num_threads; i++) 3.215 + { VPThread__create_thread((VirtProcrFnPtr)work, (void*)&input[i], animatingPr);} 3.216 + //wait for all threads to finish 3.217 barrier_wait(&barr, animatingPr); 3.218 - saveTimeStampCountInto(endBenchTime.highLow[0], endBenchTime.highLow[1]); 3.219 + 3.220 + 3.221 + //longest thread measurement 3.222 + nread = read(cycles_counter_main_fd,&(endBenchTime.cycles), 3.223 + sizeof(endBenchTime.cycles)); 3.224 + if(nread<0){ 3.225 + perror("Error reading cycles counter"); 3.226 + } 3.227 3.228 - 3.229 - for(i=0; i<num_threads; i++) 3.230 - { printf("WorkCycles: %d\n",input[i].totalWorkCycles); 3.231 - printf("Work + Sync Cycles: %lu\n", input[i].workPlusMutexCycles); 3.232 - if(input[i].endTime.total > lastThreadFinishTime) 3.233 - lastThreadFinishTime = input[i].endTime.total; 3.234 + uint64_t overallWorkCycles = 0; 3.235 + for(i=0; i<num_threads; i++){ 3.236 + printf("WorkCycles: %d\n",input[i].totalWorkCycles); 3.237 + overallWorkCycles += input[i].totalWorkCycles; 3.238 } 3.239 - printf("Time inside Barrier: %lu\n", endBenchTime.total-startBenchTime.total); 3.240 - printf("Longest Span: %lu\n", lastThreadFinishTime-startBenchTime.total); 3.241 + 3.242 + printf("Sum across threads of work cycles: %lu\n", overallWorkCycles); 3.243 + printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles); 3.244 3.245 //====================================================== 3.246 3.247 @@ -226,19 +314,20 @@ 3.248 fprintf(stderr, "invalid number of threads specified: %d\n", num_threads); 3.249 return EXIT_FAILURE; 3.250 } 3.251 + num_threads *= NUM_CORES; 3.252 break; 3.253 - case 'n': 3.254 + case 'i': 3.255 if(!isdigit(argv[++i][0])) 3.256 { 3.257 - fputs("-n must be followed by a number\n", stderr); 3.258 + fputs("-i must be followed by a number\n", stderr); 3.259 return EXIT_FAILURE; 3.260 } 3.261 repetitions = atoi(argv[i]); 3.262 break; 3.263 - case 'm': 3.264 + case 'o': 3.265 if(!isdigit(argv[++i][0])) 3.266 { 3.267 - fputs("-m must be followed by a number (workload size)\n", stderr); 3.268 + fputs("-o must be followed by a number (workload size)\n", stderr); 3.269 return EXIT_FAILURE; 3.270 } 3.271 workload_size = atoi(argv[i]);
