changeset 242:b4f684e98d0b Common_Ancestor

add cache miss counter
author Nina Engelhardt <nengel@mailbox.tu-berlin.de>
date Tue, 08 May 2012 18:58:41 +0200
parents 1cfcf49dc7ab
children 227cd4d33d94
files Defines/VMS_defs__HW_constants.h HW_Dependent_Primitives/VMS__HW_measurement.c Services_Offered_by_VMS/Measurement_and_Stats/MEAS__macros.h
diffstat 3 files changed, 60 insertions(+), 41 deletions(-) [+]
line diff
     1.1 --- a/Defines/VMS_defs__HW_constants.h	Sun Apr 01 13:53:46 2012 -0700
     1.2 +++ b/Defines/VMS_defs__HW_constants.h	Tue May 08 18:58:41 2012 +0200
     1.3 @@ -14,7 +14,7 @@
     1.4  //=========================  Hardware related Constants =====================
     1.5     //This value is the number of hardware threads in the shared memory
     1.6     // machine
     1.7 -#define NUM_CORES        40
     1.8 +#define NUM_CORES        4
     1.9  
    1.10     // tradeoff amortizing master fixed overhead vs imbalance potential
    1.11     // when work-stealing, can make bigger, at risk of losing cache affinity
     2.1 --- a/HW_Dependent_Primitives/VMS__HW_measurement.c	Sun Apr 01 13:53:46 2012 -0700
     2.2 +++ b/HW_Dependent_Primitives/VMS__HW_measurement.c	Tue May 08 18:58:41 2012 +0200
     2.3 @@ -12,10 +12,8 @@
     2.4  #ifdef HOLISTIC__TURN_ON_PERF_COUNTERS
     2.5      struct perf_event_attr hw_event;
     2.6     memset(&hw_event,0,sizeof(hw_event));
     2.7 -   	hw_event.type = PERF_TYPE_HARDWARE;
     2.8 -	hw_event.size = sizeof(hw_event);
     2.9 +	hw_event.size = sizeof(struct perf_event_attr);
    2.10  	hw_event.disabled = 1;
    2.11 -        hw_event.freq = 0;
    2.12  	hw_event.inherit = 1; /* children inherit it   */
    2.13  	hw_event.pinned = 1; /* must always be on PMU */
    2.14  	hw_event.exclusive = 0; /* only group on PMU     */
    2.15 @@ -23,13 +21,12 @@
    2.16  	hw_event.exclude_kernel = 0; /* ditto kernel          */
    2.17  	hw_event.exclude_hv = 0; /* ditto hypervisor      */
    2.18  	hw_event.exclude_idle = 0; /* don't count when idle */
    2.19 -	hw_event.mmap = 0; /* include mmap data     */
    2.20 -	hw_event.comm = 0; /* include comm data     */
    2.21  
    2.22          int coreIdx;
    2.23     for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
    2.24      {
    2.25 -       	hw_event.config = 0x0000000000000000; //cycles
    2.26 +       hw_event.type = PERF_TYPE_HARDWARE;	
    2.27 +       hw_event.config = PERF_COUNT_HW_CPU_CYCLES; //cycles
    2.28          _VMSMasterEnv->cycles_counter_fd[coreIdx] = syscall(__NR_perf_event_open, &hw_event,
    2.29   		0,//pid_t pid, 
    2.30  		coreIdx,//int cpu, 
    2.31 @@ -40,7 +37,8 @@
    2.32              fprintf(stderr,"On core %d: ",coreIdx);
    2.33              perror("Failed to open cycles counter");
    2.34          }
    2.35 -        hw_event.config = 0x0000000000000001; //instrs
    2.36 +        hw_event.type = PERF_TYPE_HARDWARE;
    2.37 +        hw_event.config = PERF_COUNT_HW_INSTRUCTIONS; //instrs
    2.38          _VMSMasterEnv->instrs_counter_fd[coreIdx] = syscall(__NR_perf_event_open, &hw_event,
    2.39   		0,//pid_t pid, 
    2.40  		coreIdx,//int cpu, 
    2.41 @@ -51,6 +49,21 @@
    2.42              fprintf(stderr,"On core %d: ",coreIdx);
    2.43              perror("Failed to open instrs counter");
    2.44          }
    2.45 +        hw_event.type = PERF_TYPE_HW_CACHE;
    2.46 +        hw_event.config = PERF_COUNT_HW_CACHE_L1D <<  0  |
    2.47 +	(PERF_COUNT_HW_CACHE_OP_READ		<<  8) |
    2.48 +	(PERF_COUNT_HW_CACHE_RESULT_MISS	<< 16); //cache misses
    2.49 +        _VMSMasterEnv->cachem_counter_fd[coreIdx] = syscall(__NR_perf_event_open, &hw_event,
    2.50 + 		0,//pid_t pid, 
    2.51 +		coreIdx,//int cpu, 
    2.52 +		-1,//int group_fd,
    2.53 +		0//unsigned long flags
    2.54 +	);
    2.55 +        if (_VMSMasterEnv->cachem_counter_fd[coreIdx]<0){
    2.56 +            fprintf(stderr,"On core %d: ",coreIdx);
    2.57 +            perror("Failed to open cache miss counter");
    2.58 +            exit(1);
    2.59 +        }
    2.60     }
    2.61          
    2.62     prctl(PR_TASK_PERF_EVENTS_ENABLE);
     3.1 --- a/Services_Offered_by_VMS/Measurement_and_Stats/MEAS__macros.h	Sun Apr 01 13:53:46 2012 -0700
     3.2 +++ b/Services_Offered_by_VMS/Measurement_and_Stats/MEAS__macros.h	Tue May 08 18:58:41 2012 +0200
     3.3 @@ -295,7 +295,7 @@
     3.4  #ifdef HOLISTIC__TURN_ON_PERF_COUNTERS
     3.5     
     3.6  #define MEAS__Insert_Counter_Handler \
     3.7 -   typedef void (*CounterHandler) (int,int,int,SlaveVP*,uint64,uint64);
     3.8 +   typedef void (*CounterHandler) (int,int,int,SlaveVP*,uint64,uint64,uint64);
     3.9   
    3.10     enum eventType {
    3.11      DebugEvt = 0,
    3.12 @@ -313,9 +313,10 @@
    3.13      Timestamp_end
    3.14     };
    3.15     
    3.16 -   #define saveCyclesAndInstrs(core,cycles,instrs) do{ \
    3.17 +   #define saveCyclesAndInstrs(core,cycles,instrs,cachem) do{ \
    3.18     int cycles_fd = _VMSMasterEnv->cycles_counter_fd[core]; \
    3.19     int instrs_fd = _VMSMasterEnv->instrs_counter_fd[core]; \
    3.20 +   int cachem_fd = _VMSMasterEnv->cachem_counter_fd[core]; \
    3.21     int nread;                                           \
    3.22                                                          \
    3.23     nread = read(cycles_fd,&(cycles),sizeof(cycles));    \
    3.24 @@ -329,12 +330,18 @@
    3.25         perror("Error reading cycles counter");          \
    3.26         instrs = 0;                                      \
    3.27     }                                                    \
    3.28 +   nread = read(cachem_fd,&(cachem),sizeof(cachem));    \
    3.29 +   if(nread<0){                                         \
     3.30 +       perror("Error reading L1 data cache miss counter");          \
    3.31 +       cachem = 0;                                      \
    3.32 +   }                                                    \
    3.33     } while (0) 
    3.34  
    3.35     #define MEAS__Insert_Counter_Meas_Fields_into_MasterEnv \
    3.36       int cycles_counter_fd[NUM_CORES]; \
    3.37       int instrs_counter_fd[NUM_CORES]; \
    3.38 -     uint64 start_master_lock[NUM_CORES][2]; \
    3.39 +     int cachem_counter_fd[NUM_CORES]; \
    3.40 +     uint64 start_master_lock[NUM_CORES][3]; \
    3.41       CounterHandler counterHandler;
    3.42  
    3.43     #define HOLISTIC__Setup_Perf_Counters setup_perf_counters();
    3.44 @@ -365,14 +372,15 @@
    3.45          CounterHandler counterHandler = masterEnv->counterHandler;
    3.46     
    3.47     #define HOLISTIC__Record_AppResponderInvocation_start \
    3.48 -      uint64 cycles,instrs; \
    3.49 -      saveCyclesAndInstrs(thisCoresIdx,cycles, instrs); \
    3.50 +      uint64 cycles,instrs,cachem; \
    3.51 +      saveCyclesAndInstrs(thisCoresIdx,cycles, instrs,cachem); \
    3.52        if(lastVPBeforeMaster){ \
    3.53 -        (*counterHandler)(AppResponderInvocation_start,lastVPBeforeMaster->slaveID,lastVPBeforeMaster->assignCount,lastVPBeforeMaster,cycles,instrs); \
    3.54 +        (*counterHandler)(AppResponderInvocation_start,lastVPBeforeMaster->slaveID,lastVPBeforeMaster->assignCount,lastVPBeforeMaster,cycles,instrs,cachem); \
    3.55          lastVPBeforeMaster = NULL; \
    3.56        } else { \
    3.57            _VMSMasterEnv->start_master_lock[thisCoresIdx][0] = cycles; \
    3.58            _VMSMasterEnv->start_master_lock[thisCoresIdx][1] = instrs; \
    3.59 +          _VMSMasterEnv->start_master_lock[thisCoresIdx][2] = cachem; \
    3.60        }
    3.61   
    3.62             /* Request Handler may call resume() on the VP, but we want to 
    3.63 @@ -388,15 +396,15 @@
    3.64     #define HOLISTIC__Record_AppResponder_start \
    3.65                 vpid = currSlot->slaveAssignedToSlot->slaveID; \
    3.66                 task = currSlot->slaveAssignedToSlot->assignCount; \
    3.67 -               uint64 cycles, instrs; \
    3.68 -               saveCyclesAndInstrs(thisCoresIdx,cycles, instrs); \
    3.69 -               (*counterHandler)(AppResponder_start,vpid,task,currSlot->slaveAssignedToSlot,cycles,instrs);
    3.70 +               uint64 cycles, instrs, cachem; \
    3.71 +               saveCyclesAndInstrs(thisCoresIdx,cycles, instrs,cachem); \
    3.72 +               (*counterHandler)(AppResponder_start,vpid,task,currSlot->slaveAssignedToSlot,cycles,instrs,cachem);
    3.73  
    3.74     #define HOLISTIC__Record_AppResponder_end \
    3.75 -        uint64 cycles2,instrs2; \
    3.76 -        saveCyclesAndInstrs(thisCoresIdx,cycles2, instrs2); \
    3.77 -        (*counterHandler)(AppResponder_end,vpid,task,currSlot->slaveAssignedToSlot,cycles2,instrs2); \
    3.78 -        (*counterHandler)(Timestamp_end,vpid,task,currSlot->slaveAssignedToSlot,rdtsc(),0);
    3.79 +        uint64 cycles2,instrs2,cachem2; \
    3.80 +        saveCyclesAndInstrs(thisCoresIdx,cycles2, instrs2,cachem2); \
    3.81 +        (*counterHandler)(AppResponder_end,vpid,task,currSlot->slaveAssignedToSlot,cycles2,instrs2,cachem2); \
    3.82 +        (*counterHandler)(Timestamp_end,vpid,task,currSlot->slaveAssignedToSlot,rdtsc(),0,0);
    3.83  
    3.84     
    3.85     /* Don't know who to account time to yet - goes to assigned VP
    3.86 @@ -407,45 +415,43 @@
    3.87         if(currSlot->slaveAssignedToSlot == NULL){ \
    3.88             empty= TRUE; \
    3.89         } \
    3.90 -       uint64 tmp_cycles; \
    3.91 -       uint64 tmp_instrs; \
    3.92 -       saveCyclesAndInstrs(thisCoresIdx,tmp_cycles,tmp_instrs); \
    3.93 +       uint64 tmp_cycles, tmp_instrs, tmp_cachem; \
    3.94 +       saveCyclesAndInstrs(thisCoresIdx,tmp_cycles,tmp_instrs,tmp_cachem); \
    3.95         uint64 tsc = rdtsc(); \
    3.96         if(vpid > 0) { \
    3.97 -           (*counterHandler)(NextAssigner_start,vpid,task,currSlot->slaveAssignedToSlot,tmp_cycles,tmp_instrs); \
    3.98 +           (*counterHandler)(NextAssigner_start,vpid,task,currSlot->slaveAssignedToSlot,tmp_cycles,tmp_instrs,tmp_cachem); \
    3.99             vpid = 0; \
   3.100             task = 0; \
   3.101          }
   3.102  
   3.103     #define HOLISTIC__Record_Assigner_end \
   3.104 -        uint64 cycles; \
   3.105 -        uint64 instrs; \
   3.106 -        saveCyclesAndInstrs(thisCoresIdx,cycles,instrs); \
   3.107 +        uint64 cycles,instrs,cachem; \
   3.108 +        saveCyclesAndInstrs(thisCoresIdx,cycles,instrs,cachem); \
   3.109          if(empty){ \
   3.110 -            (*counterHandler)(AssignerInvocation_start,assignedSlaveVP->slaveID,assignedSlaveVP->assignCount,assignedSlaveVP,masterEnv->start_master_lock[thisCoresIdx][0],masterEnv->start_master_lock[thisCoresIdx][1]); \
   3.111 +            (*counterHandler)(AssignerInvocation_start,assignedSlaveVP->slaveID,assignedSlaveVP->assignCount,assignedSlaveVP,masterEnv->start_master_lock[thisCoresIdx][0],masterEnv->start_master_lock[thisCoresIdx][1],masterEnv->start_master_lock[thisCoresIdx][2]); \
   3.112          } \
   3.113 -        (*counterHandler)(Timestamp_start,assignedSlaveVP->slaveID,assignedSlaveVP->assignCount,assignedSlaveVP,tsc,0); \
   3.114 -        (*counterHandler)(Assigner_start,assignedSlaveVP->slaveID,assignedSlaveVP->assignCount,assignedSlaveVP,tmp_cycles,tmp_instrs); \
   3.115 -        (*counterHandler)(Assigner_end,assignedSlaveVP->slaveID,assignedSlaveVP->assignCount,assignedSlaveVP,cycles,instrs);
   3.116 +        (*counterHandler)(Timestamp_start,assignedSlaveVP->slaveID,assignedSlaveVP->assignCount,assignedSlaveVP,tsc,0,0); \
   3.117 +        (*counterHandler)(Assigner_start,assignedSlaveVP->slaveID,assignedSlaveVP->assignCount,assignedSlaveVP,tmp_cycles,tmp_instrs,tmp_cachem); \
    3.118 +        (*counterHandler)(Assigner_end,assignedSlaveVP->slaveID,assignedSlaveVP->assignCount,assignedSlaveVP,cycles,instrs,cachem);
   3.119  
   3.120     #define HOLISTIC__Record_Work_start \
   3.121          if(currVP){ \
   3.122 -                uint64 cycles,instrs; \
   3.123 -                saveCyclesAndInstrs(thisCoresIdx,cycles, instrs); \
   3.124 -                (*counterHandler)(Work_start,currVP->slaveID,currVP->assignCount,currVP,cycles,instrs); \
   3.125 +                uint64 cycles,instrs,cachem; \
   3.126 +                saveCyclesAndInstrs(thisCoresIdx,cycles, instrs,cachem); \
   3.127 +                (*counterHandler)(Work_start,currVP->slaveID,currVP->assignCount,currVP,cycles,instrs,cachem); \
   3.128          }
   3.129     
   3.130     #define HOLISTIC__Record_Work_end \
   3.131         if(currVP){ \
   3.132 -               uint64 cycles,instrs; \
   3.133 -               saveCyclesAndInstrs(thisCoresIdx,cycles, instrs); \
   3.134 -               (*counterHandler)(Work_end,currVP->slaveID,currVP->assignCount,currVP,cycles,instrs); \
   3.135 +               uint64 cycles,instrs,cachem; \
   3.136 +               saveCyclesAndInstrs(thisCoresIdx,cycles, instrs,cachem); \
   3.137 +               (*counterHandler)(Work_end,currVP->slaveID,currVP->assignCount,currVP,cycles,instrs,cachem); \
   3.138         }
   3.139  
   3.140     #define HOLISTIC__Record_HwResponderInvocation_start \
   3.141 -        uint64 cycles,instrs; \
   3.142 -        saveCyclesAndInstrs(animatingSlv->coreAnimatedBy,cycles, instrs); \
   3.143 -        (*(_VMSMasterEnv->counterHandler))(HwResponderInvocation_start,animatingSlv->slaveID,animatingSlv->assignCount,animatingSlv,cycles,instrs); 
   3.144 +        uint64 cycles,instrs,cachem; \
   3.145 +        saveCyclesAndInstrs(animatingSlv->coreAnimatedBy,cycles, instrs,cachem); \
   3.146 +        (*(_VMSMasterEnv->counterHandler))(HwResponderInvocation_start,animatingSlv->slaveID,animatingSlv->assignCount,animatingSlv,cycles,instrs,cachem); 
   3.147          
   3.148  
   3.149     #define getReturnAddressBeforeLibraryCall(vp_ptr, res_ptr) do{     \