Me@4: /* Me@4: * Me@4: */ Me@4: #include Me@4: #include Me@4: #include Me@4: #include Me@4: #include Me@4: #include Me@4: #include msach@6: #include msach@18: #include "VMS_Implementations/Vthread_impl/VPThread.h" msach@18: #include "C_Libraries/Queue_impl/PrivateQueue.h" Me@4: msach@6: #include msach@6: #include msach@6: #include msach@6: Me@4: #undef DEBUG Me@4: //#define DEBUG Me@4: Me@4: #if !defined(unix) && !defined(__unix__) Me@4: #ifdef __MACH__ Me@4: #define unix 1 Me@4: #define __unix__ 1 Me@4: #endif /* __MACH__ */ Me@4: #endif /* unix */ Me@4: Me@4: /* find the appropriate way to define explicitly sized types */ Me@4: /* for C99 or GNU libc (also mach's libc) we can use stdint.h */ Me@4: #if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__) Me@4: #include Me@4: #elif defined(unix) || defined(__unix__) /* some UNIX systems have them in sys/types.h */ Me@4: #include Me@4: #elif defined(__WIN32__) || defined(WIN32) /* the nameless one */ Me@4: typedef unsigned __int8 uint8_t; Me@4: typedef unsigned __int32 uint32_t; Me@4: #endif /* sized type detection */ Me@4: Me@4: /* provide a millisecond-resolution timer for each system */ Me@4: #if defined(unix) || defined(__unix__) Me@4: #include Me@4: #include Me@4: unsigned long get_msec(void) { Me@4: static struct timeval timeval, first_timeval; Me@4: Me@4: gettimeofday(&timeval, 0); Me@4: if(first_timeval.tv_sec == 0) { Me@4: first_timeval = timeval; Me@4: return 0; Me@4: } Me@4: return (timeval.tv_sec - first_timeval.tv_sec) * 1000 + (timeval.tv_usec - first_timeval.tv_usec) / 1000; Me@4: } Me@4: #elif defined(__WIN32__) || defined(WIN32) Me@4: #include Me@4: unsigned long get_msec(void) { Me@4: return GetTickCount(); Me@4: } Me@4: #else Me@4: //#error "I don't know how to measure time on your platform" Me@4: #endif Me@4: msach@14: //======================== Globals ========================= msach@14: char __ProgrammName[] = "overhead_test"; msach@14: char __DataSet[255]; msach@14: msach@14: int outer_iters, inner_iters, num_threads; msach@14: size_t chunk_size = 0; msach@14: msach@14: int cycles_counter_main_fd; msach@14: int misses_counter_fd; msach@14: msach@14: uint64_t cache_misses; msach@14: msach@14: int cycles_counter_fd[NUM_CORES]; msach@14: struct perf_event_attr* hw_event; msach@14: Me@4: //======================== Defines ========================= kshalle@8: typedef struct perfData measurement_t; kshalle@8: struct perfData{ kshalle@8: uint64 cycles; msach@13: } __align_to_cacheline__; Me@4: Me@4: const char *usage = { Me@4: "Usage: malloc_test [options]\n" Me@4: " Spwans a number of threads and allocates memory.\n\n" Me@4: "Options:\n" msach@6: " -t how many threads to use (default: 1). This is internaly multiplied by the number of cores.\n" msach@6: " -o repeat workload and sync operation times\n" msach@6: " -i size of workload, repeat times\n" Me@4: " -h this help screen\n\n" Me@4: }; Me@4: Me@4: struct barrier_t Me@4: { Me@4: int counter; Me@4: int nthreads; Me@4: int32 mutex; Me@4: int32 cond; kshalle@8: measurement_t endBarrierCycles; kshalle@8: msach@13: } __align_to_cacheline__; Me@4: typedef struct barrier_t barrier; Me@4: Me@4: void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr) Me@4: { Me@4: barr->counter = 0; Me@4: barr->nthreads = nthreads; Me@4: barr->mutex = VPThread__make_mutex(animatingPr); Me@4: barr->cond = VPThread__make_cond(barr->mutex, animatingPr); Me@4: } Me@4: Me@4: void inline barrier_wait(barrier *barr, VirtProcr *animatingPr) Me@4: { int i; Me@4: Me@4: VPThread__mutex_lock(barr->mutex, animatingPr); Me@4: barr->counter++; Me@4: if(barr->counter == barr->nthreads) kshalle@8: { msach@13: read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \ kshalle@8: sizeof(barr->endBarrierCycles.cycles)); kshalle@8: kshalle@8: barr->counter = 0; Me@4: for(i=0; i < barr->nthreads; i++) Me@4: VPThread__cond_signal(barr->cond, animatingPr); Me@4: } Me@4: else Me@4: { VPThread__cond_wait(barr->cond, animatingPr); Me@4: } Me@4: VPThread__mutex_unlock(barr->mutex, animatingPr); Me@4: } Me@4: kshalle@8: kshalle@8: msach@13: struct WorkerParams_t msach@9: { struct barrier_t* barrier; msach@9: uint64_t totalWorkCycles; msach@9: uint64_t totalBadCycles; msach@9: uint64_t totalSyncCycles; msach@9: uint64_t totalBadSyncCycles; msach@9: uint64 numGoodSyncs; msach@9: uint64 numGoodTasks; msach@13: }; msach@13: msach@13: typedef union msach@13: { msach@13: struct WorkerParams_t data; msach@13: char padding[CACHELINE_SIZE]; msach@13: } WorkerParams __align_to_cacheline__; msach@14: msach@14: WorkerParams *workerParamsArray; Me@4: kshalle@8: typedef struct kshalle@8: { measurement_t *startExeCycles; kshalle@8: measurement_t *endExeCycles; msach@13: } BenchParams __align_to_cacheline__; Me@4: Me@4: //======================== App Code ========================= Me@4: /* msach@13: p* Workload Me@4: */ msach@6: msach@6: #define saveCyclesAndInstrs(core,cycles) do{ \ msach@6: int cycles_fd = cycles_counter_fd[core]; \ msach@6: int nread; \ msach@6: \ msach@6: nread = read(cycles_fd,&(cycles),sizeof(cycles)); \ msach@7: if(nread<0){ \ msach@6: perror("Error reading cycles counter"); \ msach@6: cycles = 0; \ msach@6: } \ msach@6: } while (0) //macro magic for scoping msach@14: msach@14: #define saveMisses(misses) do{ \ msach@14: int nread; \ msach@14: \ msach@14: nread = read(misses_counter_fd,&(misses),sizeof(misses)); \ msach@14: if(nread<0){ \ msach@14: perror("Error reading misses counter"); \ msach@14: misses = 0; \ msach@14: } \ msach@14: } while (0) //macro magic for scoping msach@6: msach@7: msach@9: double msach@9: worker_TLF(void* _params, VirtProcr* animatingPr) Me@5: { msach@7: int i,o; msach@9: WorkerParams* params = (WorkerParams*)_params; msach@9: unsigned int totalWorkCycles = 0, totalBadCycles = 0; msach@9: unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0; msach@9: unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0; kshalle@8: double workspace2=0.0; Me@5: int32 privateMutex = VPThread__make_mutex(animatingPr); msach@6: msach@6: int cpuid = sched_getcpu(); msach@9: msach@13: measurement_t startWorkload, endWorkload; msach@9: uint64 numCycles; msach@9: for(o=0; o < outer_iters; o++) Me@4: { msach@6: msach@10: saveCyclesAndInstrs(cpuid,startWorkload.cycles); msach@9: msach@13: //task msach@9: for(i=0; i < inner_iters; i++) Me@5: { Me@5: workspace1 += (workspace1 + 32)/2; Me@5: workspace2 += (workspace2 + 23.2)/1.4; Me@5: } msach@6: msach@10: saveCyclesAndInstrs(cpuid,endWorkload.cycles); msach@10: numCycles = endWorkload.cycles - startWorkload.cycles; msach@9: //sanity check (400K is about 20K iters) msach@9: if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;} msach@9: else {totalBadCycles += numCycles; } msach@9: msach@9: //mutex access often causes switch to different Slave VP msach@9: VPThread__mutex_lock(privateMutex, animatingPr); msach@11: msach@10: /* msach@11: saveCyclesAndInstrs(cpuid,startWorkload2.cycles); msach@11: //Task msach@11: for(i=0; i < inner_iters; i++) msach@11: { msach@11: workspace1 += (workspace1 + 32)/2; msach@11: workspace2 += (workspace2 + 23.2)/1.4; msach@11: } msach@11: msach@11: saveCyclesAndInstrs(cpuid,endWorkload2.cycles); msach@11: numCycles = endWorkload2.cycles - startWorkload2.cycles; msach@9: //sanity check (400K is about 20K iters) msach@11: if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;} msach@11: else {totalBadCycles += numCycles; } msach@11: msach@10: */ msach@9: VPThread__mutex_unlock(privateMutex, animatingPr); Me@4: } Me@5: msach@13: params->data.totalWorkCycles = totalWorkCycles; msach@13: params->data.totalBadCycles = totalBadCycles; msach@13: params->data.numGoodTasks = numGoodTasks; msach@13: params->data.totalSyncCycles = totalSyncCycles; msach@13: params->data.totalBadSyncCycles = totalBadSyncCycles; msach@13: params->data.numGoodSyncs = numGoodSyncs; msach@9: /* msach@9: params->totalSyncCycles = VMS__give_num_plugin_cycles(); msach@9: params->totalBadSyncCycles = 0; msach@9: params->numGoodSyncs = VMS__give_num_plugin_animations(); msach@9: */ msach@6: msach@6: msach@6: //Wait for all threads to end msach@13: barrier_wait(params->data.barrier, animatingPr); Me@5: Me@5: //Shutdown worker Me@5: VPThread__dissipate_thread(animatingPr); msach@9: msach@9: //below return never reached --> there for gcc msach@9: return (workspace1 + workspace2); //to prevent gcc from optimizing work out Me@5: } Me@4: msach@13: //local variables of benchmark, made global for alignment msach@13: struct barrier_t barr __align_to_cacheline__; msach@13: BenchParams *params __align_to_cacheline__; kshalle@8: Me@4: /* this is run after the VMS is set up*/ kshalle@8: void benchmark(void *_params, VirtProcr *animatingPr) Me@4: { msach@13: int i; msach@13: kshalle@8: params = (BenchParams *)_params; kshalle@8: Me@4: barrier_init(&barr, num_threads+1, animatingPr); kshalle@8: msach@6: //prepare input Me@4: for(i=0; istartExeCycles->cycles), msach@13: sizeof(params->startExeCycles->cycles)); msach@9: if(nread<0) perror("Error reading cycles counter"); msach@9: msach@9: //create (which starts running) all threads msach@9: for(i=0; iendExeCycles->cycles = barr.endBarrierCycles.cycles; msach@14: saveMisses(cache_misses_at_end); msach@14: cache_misses = cache_misses_at_end-cache_misses_at_start; kshalle@8: /* msach@6: uint64_t overallWorkCycles = 0; msach@6: for(i=0; itype = PERF_TYPE_HARDWARE; msach@7: hw_event->size = sizeof(hw_event); msach@7: hw_event->disabled = 0; msach@7: hw_event->freq = 0; msach@7: hw_event->inherit = 1; /* children inherit it */ msach@7: hw_event->pinned = 1; /* says this virt counter must always be on HW */ msach@7: hw_event->exclusive = 0; /* only group on PMU */ msach@7: hw_event->exclude_user = 0; /* don't count user */ msach@7: hw_event->exclude_kernel = 1; /* don't count kernel */ msach@7: hw_event->exclude_hv = 1; /* ditto hypervisor */ msach@7: hw_event->exclude_idle = 1; /* don't count when idle */ msach@7: hw_event->mmap = 0; /* include mmap data */ msach@7: hw_event->comm = 0; /* include comm data */ msach@7: msach@7: hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles msach@7: msach@7: int cpuID, retries; msach@7: msach@7: for( cpuID = 0; cpuID < NUM_CORES; cpuID++ ) msach@7: { retries = 0; msach@7: do msach@7: { retries += 1; msach@7: cycles_counter_fd[cpuID] = msach@7: syscall(__NR_perf_event_open, hw_event, msach@7: 0,//pid_t: 0 is "pid of calling process" msach@7: cpuID,//int: cpu, the value returned by "CPUID" instr(?) msach@7: -1,//int: group_fd, -1 is "leader" or independent msach@7: 0//unsigned long: flags msach@7: ); msach@7: } msach@7: while(cycles_counter_fd[cpuID]<0 && retries < 100); msach@7: if(retries >= 100) msach@7: { msach@7: fprintf(stderr,"On core %d: ",cpuID); msach@7: perror("Failed to open cycles counter"); msach@7: } msach@7: } msach@7: msach@7: //Set up counter to accumulate total cycles to process, across all CPUs msach@7: msach@7: retries = 0; msach@7: do msach@7: { retries += 1; msach@7: cycles_counter_main_fd = msach@7: syscall(__NR_perf_event_open, hw_event, msach@7: 0,//pid_t: 0 is "pid of calling process" msach@7: -1,//int: cpu, -1 means accumulate from all cores msach@7: -1,//int: group_fd, -1 is "leader" == independent msach@7: 0//unsigned long: flags msach@7: ); msach@7: } msach@7: while(cycles_counter_main_fd<0 && retries < 100); msach@7: if(retries >= 100) msach@7: { msach@7: fprintf(stderr,"in main "); msach@7: perror("Failed to open cycles counter"); msach@7: } kshalle@8: msach@14: //Set up counters to count cache misses msach@14: hw_event->type = PERF_TYPE_HARDWARE; msach@14: hw_event->config = PERF_COUNT_HW_CACHE_MISSES; //misses msach@14: msach@14: retries = 0; msach@14: do msach@14: { retries += 1; msach@14: misses_counter_fd = msach@14: syscall(__NR_perf_event_open, hw_event, msach@14: 0,//pid_t: 0 is "pid of calling process" msach@14: -1,//int: cpu, -1 means accumulate from all cores msach@14: -1,//int: group_fd, -1 is "leader" == independent msach@14: 0//unsigned long: flags msach@14: ); msach@14: } msach@14: while(misses_counter_fd<0 && retries < 100); msach@14: if(retries >= 100) msach@14: { msach@14: fprintf(stderr,"in main "); msach@14: perror("Failed to misses counter"); msach@14: } msach@14: msach@9: measurement_t startExeCycles, endExeCycles; msach@9: BenchParams *benchParams; msach@9: msach@9: benchParams = malloc(sizeof(BenchParams)); msach@9: msach@9: benchParams->startExeCycles = &startExeCycles; msach@9: benchParams->endExeCycles = &endExeCycles; msach@9: kshalle@8: workerParamsArray = (WorkerParams *)malloc( (num_threads + 1) * sizeof(WorkerParams) ); kshalle@8: if(workerParamsArray == NULL ) printf("error mallocing worker params array\n"); kshalle@8: msach@9: kshalle@8: //This is the transition to the VMS runtime kshalle@8: VPThread__create_seed_procr_and_do_work( &benchmark, benchParams ); kshalle@8: msach@9: uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0; msach@9: uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0; kshalle@8: for(i=0; i