# HG changeset patch # User Merten Sach # Date 1329145860 -3600 # Node ID 5887fbce425fc37feb1969fe6639929f7204afa9 # Parent a1269b1549fc2cb4180809e8df40fb948e6c38e5 changed directory structure, added .hgeol file diff -r a1269b1549fc -r 5887fbce425f .hgeol --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.hgeol Mon Feb 13 16:11:00 2012 +0100 @@ -0,0 +1,14 @@ + +[patterns] +**.py = native +**.txt = native +**.c = native +**.h = native +**.cpp = native +**.java = native +**.class = bin +**.jar = bin +**.sh = native +**.pl = native +**.jpg = bin +**.gif = bin diff -r a1269b1549fc -r 5887fbce425f main.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/main.c Mon Feb 13 16:11:00 2012 +0100 @@ -0,0 +1,480 @@ +/* + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "VPThread_lib/VPThread.h" +#include "VPThread_lib/VMS/Queue_impl/PrivateQueue.h" + +#include +#include +#include + +#undef DEBUG +//#define DEBUG + +#define MEASURE_PERF + +#if !defined(unix) && !defined(__unix__) +#ifdef __MACH__ +#define unix 1 +#define __unix__ 1 +#endif /* __MACH__ */ +#endif /* unix */ + +/* find the appropriate way to define explicitly sized types */ +/* for C99 or GNU libc (also mach's libc) we can use stdint.h */ +#if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__) +#include +#elif defined(unix) || defined(__unix__) /* some UNIX systems have them in sys/types.h */ +#include +#elif defined(__WIN32__) || defined(WIN32) /* the nameless one */ +typedef unsigned __int8 uint8_t; +typedef unsigned __int32 uint32_t; +#endif /* sized type detection */ + +/* provide a millisecond-resolution timer for each system */ +#if defined(unix) || defined(__unix__) +#include +#include +unsigned long get_msec(void) { + static struct timeval timeval, first_timeval; + + gettimeofday(&timeval, 0); + if(first_timeval.tv_sec == 0) { + first_timeval = timeval; + return 0; + } + return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000; +} +#elif defined(__WIN32__) || defined(WIN32) +#include +unsigned long get_msec(void) { + return GetTickCount(); +} +#else +//#error "I don't know how to measure time on your platform" +#endif + +//======================== Defines ========================= +typedef struct perfData measurement_t; +struct perfData{ + uint64 cycles; + uint64 instructions; +}; + +const char *usage = { + "Usage: malloc_test [options]\n" + " Spwans a number of threads and allocates memory.\n\n" + "Options:\n" + " -t how many threads to use (default: 1). This is internaly multiplied by the number of cores.\n" + " -o repeat workload and sync operation times\n" + " -i size of workload, repeat times\n" + " -h this help screen\n\n" +}; + +struct barrier_t +{ + int counter; + int nthreads; + int32 mutex; + int32 cond; + measurement_t endBarrierCycles; + +}; +typedef struct barrier_t barrier; + +void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr) + { + barr->counter = 0; + barr->nthreads = nthreads; + barr->mutex = VPThread__make_mutex(animatingPr); + barr->cond = VPThread__make_cond(barr->mutex, animatingPr); + } + +int cycles_counter_main_fd; +void inline barrier_wait(barrier *barr, VirtProcr *animatingPr) + { int i; + + VPThread__mutex_lock(barr->mutex, animatingPr); + barr->counter++; + if(barr->counter == barr->nthreads) + { +#ifdef MEASURE_PERF + read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \ + sizeof(barr->endBarrierCycles.cycles)); +#endif + + barr->counter = 0; + for(i=0; i < barr->nthreads; i++) + VPThread__cond_signal(barr->cond, animatingPr); + } + else + { VPThread__cond_wait(barr->cond, animatingPr); + } + VPThread__mutex_unlock(barr->mutex, animatingPr); + } + + + +typedef struct + { struct barrier_t* barrier; + uint64_t totalWorkCycles; + uint64_t totalBadCycles; + uint64_t totalSyncCycles; + uint64_t totalBadSyncCycles; + uint64 numGoodSyncs; + uint64 numGoodTasks; + } +WorkerParams; + + +typedef struct + { measurement_t *startExeCycles; + measurement_t *endExeCycles; + } +BenchParams; + +//======================== Globals ========================= +char __ProgrammName[] = "overhead_test"; +char __DataSet[255]; + +int outer_iters, inner_iters, num_threads; +size_t chunk_size = 0; + +int cycles_counter_fd[NUM_CORES]; +struct perf_event_attr* hw_event; + +WorkerParams *workerParamsArray; + +//======================== App Code ========================= +/* + * Workload + */ + +#define saveCyclesAndInstrs(core,cycles) do{ \ + int cycles_fd = cycles_counter_fd[core]; \ + int nread; \ + \ + nread = read(cycles_fd,&(cycles),sizeof(cycles)); \ + if(nread<0){ \ + perror("Error reading cycles counter"); \ + cycles = 0; \ + } \ +} while (0) //macro magic for scoping + + +double +worker_TLF(void* _params, VirtProcr* animatingPr) + { + int i,o; + WorkerParams* params = (WorkerParams*)_params; + unsigned int totalWorkCycles = 0, totalBadCycles = 0; + unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0; + unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0; + double workspace2=0.0; + int32 privateMutex = VPThread__make_mutex(animatingPr); + + int cpuid = sched_getcpu(); + + measurement_t startWorkload, endWorkload, startWorkload2, endWorkload2; + uint64 numCycles; + for(o=0; o < outer_iters; o++) + { +#ifdef MEASURE_PERF + saveCyclesAndInstrs(cpuid,startWorkload.cycles); +#endif + + //workltask + for(i=0; i < inner_iters; i++) + { + workspace1 += (workspace1 + 32)/2; + workspace2 += (workspace2 + 23.2)/1.4; + } + +#ifdef MEASURE_PERF + saveCyclesAndInstrs(cpuid,endWorkload.cycles); + numCycles = endWorkload.cycles - startWorkload.cycles; + //sanity check (400K is about 20K iters) + if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;} + else {totalBadCycles += numCycles; } +#endif + + //mutex access often causes switch to different Slave VP + VPThread__mutex_lock(privateMutex, animatingPr); + +/* + saveCyclesAndInstrs(cpuid,startWorkload2.cycles); + //Task + for(i=0; i < inner_iters; i++) + { + workspace1 += (workspace1 + 32)/2; + workspace2 += (workspace2 + 23.2)/1.4; + } + + saveCyclesAndInstrs(cpuid,endWorkload2.cycles); + numCycles = endWorkload2.cycles - startWorkload2.cycles; + //sanity check (400K is about 20K iters) + if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;} + else {totalBadCycles += numCycles; } + +*/ + VPThread__mutex_unlock(privateMutex, animatingPr); + } + + params->totalWorkCycles = totalWorkCycles; + params->totalBadCycles = totalBadCycles; + params->numGoodTasks = numGoodTasks; + params->totalSyncCycles = totalSyncCycles; + params->totalBadSyncCycles = totalBadSyncCycles; + params->numGoodSyncs = numGoodSyncs; +/* + params->totalSyncCycles = VMS__give_num_plugin_cycles(); + params->totalBadSyncCycles = 0; + params->numGoodSyncs = VMS__give_num_plugin_animations(); +*/ + + + //Wait for all threads to end + barrier_wait(params->barrier, animatingPr); + + //Shutdown worker + VPThread__dissipate_thread(animatingPr); + + //below return never reached --> there for gcc + return (workspace1 + workspace2); //to prevent gcc from optimizing work out + } + + +/* this is run after the VMS is set up*/ +void benchmark(void *_params, VirtProcr *animatingPr) + { + int i, cpuID; + struct barrier_t barr; + BenchParams *params; + + params = (BenchParams *)_params; + + barrier_init(&barr, num_threads+1, animatingPr); + + //prepare input + for(i=0; istartExeCycles; + +#ifdef MEASURE_PERF + int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles), + sizeof(startExeCycles->cycles)); + if(nread<0) perror("Error reading cycles counter"); +#endif + + //create (which starts running) all threads + for(i=0; iendExeCycles->cycles = barr.endBarrierCycles.cycles; +#endif + + +/* + uint64_t overallWorkCycles = 0; + for(i=0; itype = PERF_TYPE_HARDWARE; + hw_event->size = sizeof(hw_event); + hw_event->disabled = 0; + hw_event->freq = 0; + hw_event->inherit = 1; /* children inherit it */ + hw_event->pinned = 1; /* says this virt counter must always be on HW */ + hw_event->exclusive = 0; /* only group on PMU */ + hw_event->exclude_user = 0; /* don't count user */ + hw_event->exclude_kernel = 1; /* don't count kernel */ + hw_event->exclude_hv = 1; /* ditto hypervisor */ + hw_event->exclude_idle = 1; /* don't count when idle */ + hw_event->mmap = 0; /* include mmap data */ + hw_event->comm = 0; /* include comm data */ + + hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles + + int cpuID, retries; + + for( cpuID = 0; cpuID < NUM_CORES; cpuID++ ) + { retries = 0; + do + { retries += 1; + cycles_counter_fd[cpuID] = + syscall(__NR_perf_event_open, hw_event, + 0,//pid_t: 0 is "pid of calling process" + cpuID,//int: cpu, the value returned by "CPUID" instr(?) + -1,//int: group_fd, -1 is "leader" or independent + 0//unsigned long: flags + ); + } + while(cycles_counter_fd[cpuID]<0 && retries < 100); + if(retries >= 100) + { + fprintf(stderr,"On core %d: ",cpuID); + perror("Failed to open cycles counter"); + } + } + + //Set up counter to accumulate total cycles to process, across all CPUs + + retries = 0; + do + { retries += 1; + cycles_counter_main_fd = + syscall(__NR_perf_event_open, hw_event, + 0,//pid_t: 0 is "pid of calling process" + -1,//int: cpu, -1 means accumulate from all cores + -1,//int: group_fd, -1 is "leader" == independent + 0//unsigned long: flags + ); + } + while(cycles_counter_main_fd<0 && retries < 100); + if(retries >= 100) + { + fprintf(stderr,"in main "); + perror("Failed to open cycles counter"); + } +#endif + + measurement_t startExeCycles, endExeCycles; + BenchParams *benchParams; + + benchParams = malloc(sizeof(BenchParams)); + + benchParams->startExeCycles = &startExeCycles; + benchParams->endExeCycles = &endExeCycles; + + workerParamsArray = (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) ); + if(workerParamsArray == NULL ) printf("error mallocing worker params array\n"); + + + //This is the transition to the VMS runtime + VPThread__create_seed_procr_and_do_work( &benchmark, benchParams ); + +#ifdef MEASURE_PERF + uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0; + uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0; + for(i=0; i -#include -#include -#include -#include -#include -#include -#include -#include "VPThread_lib/VPThread.h" -#include "VPThread_lib/VMS/Queue_impl/PrivateQueue.h" - -#include -#include -#include - -#undef DEBUG -//#define DEBUG - -#define MEASURE_PERF - -#if !defined(unix) && !defined(__unix__) -#ifdef __MACH__ -#define unix 1 -#define __unix__ 1 -#endif /* __MACH__ */ -#endif /* unix */ - -/* find the appropriate way to define explicitly sized types */ -/* for C99 or GNU libc (also mach's libc) we can use stdint.h */ -#if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__) -#include -#elif defined(unix) || defined(__unix__) /* some UNIX systems have them in sys/types.h */ -#include -#elif defined(__WIN32__) || defined(WIN32) /* the nameless one */ -typedef unsigned __int8 uint8_t; -typedef unsigned __int32 uint32_t; -#endif /* sized type detection */ - -/* provide a millisecond-resolution timer for each system */ -#if defined(unix) || defined(__unix__) -#include -#include -unsigned long get_msec(void) { - static struct timeval timeval, first_timeval; - - gettimeofday(&timeval, 0); - if(first_timeval.tv_sec == 0) { - first_timeval = timeval; - return 0; - } - return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000; -} -#elif defined(__WIN32__) || defined(WIN32) -#include -unsigned long get_msec(void) { - return GetTickCount(); -} -#else -//#error "I don't know how to measure time on your platform" -#endif - -//======================== Defines ========================= -typedef struct perfData measurement_t; -struct perfData{ - uint64 cycles; - uint64 instructions; -}; - -const char *usage = { - "Usage: malloc_test [options]\n" - " Spwans a number of threads and allocates memory.\n\n" - "Options:\n" - " -t how many threads to use (default: 1). This is internaly multiplied by the number of cores.\n" - " -o repeat workload and sync operation times\n" - " -i size of workload, repeat times\n" - " -h this help screen\n\n" -}; - -struct barrier_t -{ - int counter; - int nthreads; - int32 mutex; - int32 cond; - measurement_t endBarrierCycles; - -}; -typedef struct barrier_t barrier; - -void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr) - { - barr->counter = 0; - barr->nthreads = nthreads; - barr->mutex = VPThread__make_mutex(animatingPr); - barr->cond = VPThread__make_cond(barr->mutex, animatingPr); - } - -int cycles_counter_main_fd; -void inline barrier_wait(barrier *barr, VirtProcr *animatingPr) - { int i; - - VPThread__mutex_lock(barr->mutex, animatingPr); - barr->counter++; - if(barr->counter == barr->nthreads) - { -#ifdef MEASURE_PERF - read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \ - sizeof(barr->endBarrierCycles.cycles)); -#endif - - barr->counter = 0; - for(i=0; i < barr->nthreads; i++) - VPThread__cond_signal(barr->cond, animatingPr); - } - else - { VPThread__cond_wait(barr->cond, animatingPr); - } - VPThread__mutex_unlock(barr->mutex, animatingPr); - } - - - -typedef struct - { struct barrier_t* barrier; - uint64_t totalWorkCycles; - uint64_t totalBadCycles; - uint64_t totalSyncCycles; - uint64_t totalBadSyncCycles; - uint64 numGoodSyncs; - uint64 numGoodTasks; - } -WorkerParams; - - -typedef struct - { measurement_t *startExeCycles; - measurement_t *endExeCycles; - } -BenchParams; - -//======================== Globals ========================= -char __ProgrammName[] = "overhead_test"; -char __DataSet[255]; - -int outer_iters, inner_iters, num_threads; -size_t chunk_size = 0; - -int cycles_counter_fd[NUM_CORES]; -struct perf_event_attr* hw_event; - -WorkerParams *workerParamsArray; - -//======================== App Code ========================= -/* - * Workload - */ - -#define saveCyclesAndInstrs(core,cycles) do{ \ - int cycles_fd = cycles_counter_fd[core]; \ - int nread; \ - \ - nread = read(cycles_fd,&(cycles),sizeof(cycles)); \ - if(nread<0){ \ - perror("Error reading cycles counter"); \ - cycles = 0; \ - } \ -} while (0) //macro magic for scoping - - -double -worker_TLF(void* _params, VirtProcr* animatingPr) - { - int i,o; - WorkerParams* params = (WorkerParams*)_params; - unsigned int totalWorkCycles = 0, totalBadCycles = 0; - unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0; - unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0; - double workspace2=0.0; - int32 privateMutex = VPThread__make_mutex(animatingPr); - - int cpuid = sched_getcpu(); - - measurement_t startWorkload, endWorkload, startWorkload2, endWorkload2; - uint64 numCycles; - for(o=0; o < outer_iters; o++) - { -#ifdef MEASURE_PERF - saveCyclesAndInstrs(cpuid,startWorkload.cycles); -#endif - - //workltask - for(i=0; i < inner_iters; i++) - { - workspace1 += (workspace1 + 32)/2; - workspace2 += (workspace2 + 23.2)/1.4; - } - -#ifdef MEASURE_PERF - saveCyclesAndInstrs(cpuid,endWorkload.cycles); - numCycles = endWorkload.cycles - startWorkload.cycles; - //sanity check (400K is about 20K iters) - if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;} - else {totalBadCycles += numCycles; } -#endif - - //mutex access often causes switch to different Slave VP - VPThread__mutex_lock(privateMutex, animatingPr); - -/* - saveCyclesAndInstrs(cpuid,startWorkload2.cycles); - //Task - for(i=0; i < inner_iters; i++) - { - workspace1 += (workspace1 + 32)/2; - workspace2 += (workspace2 + 23.2)/1.4; - } - - saveCyclesAndInstrs(cpuid,endWorkload2.cycles); - numCycles = endWorkload2.cycles - startWorkload2.cycles; - //sanity check (400K is about 20K iters) - if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;} - else {totalBadCycles += numCycles; } - -*/ - VPThread__mutex_unlock(privateMutex, animatingPr); - } - - params->totalWorkCycles = totalWorkCycles; - params->totalBadCycles = totalBadCycles; - params->numGoodTasks = numGoodTasks; - params->totalSyncCycles = totalSyncCycles; - params->totalBadSyncCycles = totalBadSyncCycles; - params->numGoodSyncs = numGoodSyncs; -/* - params->totalSyncCycles = VMS__give_num_plugin_cycles(); - params->totalBadSyncCycles = 0; - params->numGoodSyncs = VMS__give_num_plugin_animations(); -*/ - - - //Wait for all threads to end - barrier_wait(params->barrier, animatingPr); - - //Shutdown worker - VPThread__dissipate_thread(animatingPr); - - //below return never reached --> there for gcc - return (workspace1 + workspace2); //to prevent gcc from optimizing work out - } - - -/* this is run after the VMS is set up*/ -void benchmark(void *_params, VirtProcr *animatingPr) - { - int i, cpuID; - struct barrier_t barr; - BenchParams *params; - - params = (BenchParams *)_params; - - barrier_init(&barr, num_threads+1, animatingPr); - - //prepare input - for(i=0; istartExeCycles; - -#ifdef MEASURE_PERF - int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles), - sizeof(startExeCycles->cycles)); - if(nread<0) perror("Error reading cycles counter"); -#endif - - //create (which starts running) all threads - for(i=0; iendExeCycles->cycles = barr.endBarrierCycles.cycles; -#endif - - -/* - uint64_t overallWorkCycles = 0; - for(i=0; itype = PERF_TYPE_HARDWARE; - hw_event->size = sizeof(hw_event); - hw_event->disabled = 0; - hw_event->freq = 0; - hw_event->inherit = 1; /* children inherit it */ - hw_event->pinned = 1; /* says this virt counter must always be on HW */ - hw_event->exclusive = 0; /* only group on PMU */ - hw_event->exclude_user = 0; /* don't count user */ - hw_event->exclude_kernel = 1; /* don't count kernel */ - hw_event->exclude_hv = 1; /* ditto hypervisor */ - hw_event->exclude_idle = 1; /* don't count when idle */ - hw_event->mmap = 0; /* include mmap data */ - hw_event->comm = 0; /* include comm data */ - - hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles - - int cpuID, retries; - - for( cpuID = 0; cpuID < NUM_CORES; cpuID++ ) - { retries = 0; - do - { retries += 1; - cycles_counter_fd[cpuID] = - syscall(__NR_perf_event_open, hw_event, - 0,//pid_t: 0 is "pid of calling process" - cpuID,//int: cpu, the value returned by "CPUID" instr(?) - -1,//int: group_fd, -1 is "leader" or independent - 0//unsigned long: flags - ); - } - while(cycles_counter_fd[cpuID]<0 && retries < 100); - if(retries >= 100) - { - fprintf(stderr,"On core %d: ",cpuID); - perror("Failed to open cycles counter"); - } - } - - //Set up counter to accumulate total cycles to process, across all CPUs - - retries = 0; - do - { retries += 1; - cycles_counter_main_fd = - syscall(__NR_perf_event_open, hw_event, - 0,//pid_t: 0 is "pid of calling process" - -1,//int: cpu, -1 means accumulate from all cores - -1,//int: group_fd, -1 is "leader" == independent - 0//unsigned long: flags - ); - } - while(cycles_counter_main_fd<0 && retries < 100); - if(retries >= 100) - { - fprintf(stderr,"in main "); - perror("Failed to open cycles counter"); - } -#endif - - measurement_t startExeCycles, endExeCycles; - BenchParams *benchParams; - - benchParams = malloc(sizeof(BenchParams)); - - benchParams->startExeCycles = &startExeCycles; - benchParams->endExeCycles = &endExeCycles; - - workerParamsArray = (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) ); - if(workerParamsArray == NULL ) printf("error mallocing worker params array\n"); - - - //This is the transition to the VMS runtime - VPThread__create_seed_procr_and_do_work( &benchmark, benchParams ); - -#ifdef MEASURE_PERF - uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0; - uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0; - for(i=0; i