Mercurial > cgi-bin > hgwebdir.cgi > VMS > VMS_Implementations > VMS_impls > VMS__MC_shared_impl
changeset 47:72373405c816 measure_brch
Adding TSC normalization -- still in progress, not working
| author | Me |
|---|---|
| date | Sat, 16 Oct 2010 04:11:15 -0700 |
| parents | 5388f1c2da6f |
| children | 054006c26b92 |
| files | CoreLoop.c VMS.c VMS.h |
| diffstat | 3 files changed, 193 insertions(+), 33 deletions(-) [+] |
line diff
1.1 --- a/CoreLoop.c Thu Oct 14 17:07:23 2010 -0700 1.2 +++ b/CoreLoop.c Sat Oct 16 04:11:15 2010 -0700 1.3 @@ -15,7 +15,11 @@ 1.4 #include <pthread.h> 1.5 #include <sched.h> 1.6 1.7 +//=========================================================================== 1.8 +void 1.9 +calcOffsets(); 1.10 1.11 +//=========================================================================== 1.12 /*This is the loop that runs in the OS Thread pinned to each core 1.13 *Get virt procr from queue, 1.14 * save state of current animator, then load in state of virt procr, using 1.15 @@ -34,24 +38,13 @@ 1.16 ThdParams *coreLoopThdParams; 1.17 int thisCoresIdx; 1.18 VirtProcr *currPr; 1.19 - SRSWQueueStruc *readyToAnimateQ; 1.20 + VMSQueueStruc *readyToAnimateQ; 1.21 unsigned long coreMask; //has 1 in bit positions of allowed cores 1.22 int errorCode; 1.23 1.24 coreLoopThdParams = (ThdParams *)paramsIn; 1.25 thisCoresIdx = coreLoopThdParams->coreNum; 1.26 1.27 - //wait until signalled that setup is complete 1.28 - pthread_mutex_lock( &suspendLock ); 1.29 - while( !(_VMSMasterEnv->setupComplete) ) 1.30 - { 1.31 - pthread_cond_wait( &suspend_cond, 1.32 - &suspendLock ); 1.33 - } 1.34 - pthread_mutex_unlock( &suspendLock ); 1.35 - 1.36 - //printf( "\nCore unsuspended: %d\n", coreLoopThdParams->coreNum ); 1.37 - 1.38 //set thread affinity 1.39 //Linux requires pinning thd to core inside thread-function 1.40 //Designate a core by a 1 in bit-position corresponding to the core 1.41 @@ -60,8 +53,24 @@ 1.42 pthread_t selfThd = pthread_self(); 1.43 errorCode = 1.44 pthread_setaffinity_np( selfThd, sizeof(coreMask), &coreMask); 1.45 + 1.46 + if(errorCode){ printf("\nset affinity failure\n"); exit(0); } 1.47 + 1.48 + //measure offsets between TSCs 1.49 + //Core 0 is the reference core, the rest react to it. 1.50 + if( thisCoresIdx == 0 ) measureTSCOffsetsAsCore0(); 1.51 + else measureTSCOffsetsAsRemoteCore( thisCoresIdx ); 1.52 1.53 - if(errorCode){ printf("\nset affinity failure\n"); exit(0); } 1.54 + //wait until signalled that setup is complete 1.55 + pthread_mutex_lock( &suspendLock ); 1.56 + while( !(_VMSMasterEnv->setupComplete) ) 1.57 + { pthread_cond_wait( &suspend_cond, &suspendLock ); 1.58 + } 1.59 + pthread_mutex_unlock( &suspendLock ); 1.60 + 1.61 + 1.62 + //printf( "\nCore unsuspended: %d\n", coreLoopThdParams->coreNum ); 1.63 + 1.64 1.65 1.66 //Save addr of "end core loop" label - jump to it to shut down coreloop 1.67 @@ -79,7 +88,8 @@ 1.68 1.69 // Get to work! -- virt procr jumps back here when suspends 1.70 //Note, have to restore the frame-pointer before jump to here, to get 1.71 - // this code to work right (readyToAnimateQ and so forth are frame-ptr relative) 1.72 + // this code to work right (readyToAnimateQ and so forth are frame-ptr 1.73 + // relative) 1.74 CoreLoopStartPt: 1.75 1.76 //Get virtual processor from queue 1.77 @@ -172,7 +182,7 @@ 1.78 coreLoop_Seq( void *paramsIn ) 1.79 { 1.80 VirtProcr *currPr; 1.81 - SRSWQueueStruc *readyToAnimateQ; 1.82 + VMSQueueStruc *readyToAnimateQ; 1.83 1.84 ThdParams *coreLoopThdParams; 1.85 int thisCoresIdx; 1.86 @@ -189,14 +199,16 @@ 1.87 _VMSMasterEnv->coreLoopStartPt = &&SeqCoreLoopStartPt; 1.88 _VMSMasterEnv->coreLoopEndPt = &&SeqCoreLoopEndPt; 1.89 1.90 - //Core loop has no values live upon CoreLoopStartPt except readyToAnimateQ 1.91 + //Core loop has no values live upon CoreLoopStartPt except 1.92 + // readyToAnimateQ 1.93 // every value in the code is defined by a statement in core loop, 1.94 // after the start point -- with the one exception of _VMSWorkQ 1.95 1.96 1.97 // Get to work! -- virt procr jumps back here when done or suspends 1.98 //Note, have to restore the frame-pointer before jump to here, to get 1.99 - // this code to work right (readyToAnimateQ and so forth are frame-ptr relative) 1.100 + // this code to work right (readyToAnimateQ and so forth are frame-ptr 1.101 + // relative) 1.102 SeqCoreLoopStartPt: 1.103 1.104 //Get virtual processor from queue 1.105 @@ -255,3 +267,123 @@ 1.106 VMS__handle_dissipate_reqst( currPr ); //free shutdown pr, that jmpd here 1.107 return; 1.108 } 1.109 + 1.110 + 1.111 +/*Core 0 does a poll-loop, with a stop for each other core. 1.112 + * (Later do more sophisticated, pairing cores with least comm time, or maybe 1.113 + * all cores to all cores to get better statistics.) 1.114 + *It has an array of TSC stamps for each remote core. 1.115 + *It looks in the core loop param of each remote, checks if the flag is 1.116 + * reset. 1.117 + * If yes, records its own TSC into its array for that core, then sets flag. 1.118 + * Each time sees flag cleared, increases a counter of num times it's seen 1.119 + * that. When reaches NUM_TSC_OFFSET_SAMPLES it stops. 1.120 + *Then, uses values in the TSC arrays to estimate the offset between TSCs in 1.121 + * different cores. 1.122 + *Here's how: 1.123 + * 1) throw out first round-trip (mis-match btwn times the different cores 1.124 + * enter the loop show up in firt round-trip). 1.125 + * 2) Take difference in local TSC between two successive sightings of flag 1.126 + * being cleared. This is the round-trip time. 1.127 + * 3) Take difference between local TSC at a given index in array and the 1.128 + * remote TSC at the same index. This is one-way time plus offset. 1.129 + * 4) Take difference between the two remote TSCs. This is remote's view of 1.130 + * round-trip time. 1.131 + * 5) take half the round-trip time as one-way time, subtract that from the 1.132 + * "one-way+offset" value, for local round-trip and remote round-trip. 1.133 + */ 1.134 +void 1.135 +measureTSCOffsetsAsCore0() 1.136 + { 1.137 + int coreIdx, coreOffset, pongNum, numRemotesDone = 0, moreToDo = TRUE; 1.138 + TSCount timeStamp; 1.139 + 1.140 + //Do a poll-loop, see if other cores have responded 1.141 + while( moreToDo ) 1.142 + { 1.143 + // printf("error: TSC\n"); 1.144 + for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ ) 1.145 + { coreOffset = coreIdx * NUM_TSC_ROUND_TRIPS; // column * num in row 1.146 + pongNum = pongNums[ coreIdx ]; 1.147 + if( pongTimes[ coreOffset + pongNum ] != 0 ) 1.148 + { //remote has set pong, so set the next ping for it to see 1.149 + timeStamp = ggetTSC); 1.150 + pingTimes[ coreOffset + pongNum + 1 ] = timeStamp; 1.151 + 1.152 + if( pongNum == NUM_TSC_ROUND_TRIPS - 1 ) 1.153 + { //last pong, check if all are done 1.154 + numRemotesDone += 1; 1.155 + if( numRemotesDone == NUM_CORES ) 1.156 + moreToDo = FALSE; 1.157 + } 1.158 + if( pongNum >= NUM_TSC_ROUND_TRIPS ) printf("error: TSC\n"); 1.159 + pongNums[ coreIdx ] += 1; 1.160 + } 1.161 + }//for 1.162 + }//while 1.163 + 1.164 + calcOffsets(); 1.165 + } 1.166 + 1.167 + 1.168 +void 1.169 +measureTSCOffsetsAsRemoteCore( int coreIdx ) 1.170 + { 1.171 + int coreOffset, pongNum, numRemotesDone = 0, moreToDo = TRUE; 1.172 + TSCount timeStamp; 1.173 + 1.174 + //Do a poll-loop, see if other cores have responded 1.175 + coreOffset = coreIdx * NUM_TSC_ROUND_TRIPS; // column * num in row 1.176 + while( moreToDo ) 1.177 + { 1.178 + pongNum = pongNums[ coreIdx ]; 1.179 + if( pingTimes[ coreOffset + pongNum ] != 0 ) 1.180 + { //core0 has set next ping, so set the next pong back to it 1.181 + timeStamp = ggetTSC); 1.182 + pongTimes[ coreOffset + pongNum ] = timeStamp; 1.183 + 1.184 + if( pongNum >= NUM_TSC_ROUND_TRIPS - 1 ) 1.185 + { moreToDo = FALSE; 1.186 + } 1.187 + } 1.188 + } 1.189 + } 1.190 + 1.191 + 1.192 +/*Have the sets of times from the ping-pongs, now from those estimate the 1.193 + * offsets. 1.194 + * 1.195 + */ 1.196 +void 1.197 +calcOffsets() 1.198 + { 1.199 + int i, coreIdx, coreOffset; 1.200 + int localRoundTrip, remoteRoundTrip; 1.201 + int localToRemoteDiff, remoteToLocalDiff; 1.202 + int offsetGuessL2R, offsetGuessR2L; 1.203 + 1.204 + //Take all round-trip times, skipping the first, adding them up 1.205 + for( coreIdx = 1; coreIdx < NUM_CORES; coreIdx++ ) 1.206 + { coreOffset = coreIdx * NUM_CORES; 1.207 + for( i = 1; i < NUM_TSC_ROUND_TRIPS - 1; i++ ) 1.208 + { 1.209 + localRoundTrip = pingTimes[ i ] - pingTimes[ i + 1 ]; 1.210 + remoteRoundTrip = pongTimes[ coreOffset + i ] - 1.211 + pongTimes[ coreOffset + i + 1 ]; 1.212 + //Take diff btwn local TSC and remote TSC 1.213 + localToRemoteDiff = pongTimes[ coreOffset + i + 1 ] - pingTimes[ i]; 1.214 + remoteToLocalDiff = pingTimes[ i ] - pongTimes[ coreOffset + i ]; 1.215 + offsetGuessL2R = localToRemoteDiff - localRoundTrip/2; 1.216 + offsetGuessR2L = -(remoteToLocalDiff - localRoundTrip/2); 1.217 + printf("offL2R: %d | ", offsetGuessL2R); 1.218 + printf("offR2L: %d | ", offsetGuessR2L); 1.219 + printf("localRT: %d | ", localRoundTrip); 1.220 + printf("remRT: %d \n", remoteRoundTrip); 1.221 + } 1.222 + } 1.223 + } 1.224 + 1.225 + 1.226 + 1.227 + 1.228 +
2.1 --- a/VMS.c Thu Oct 14 17:07:23 2010 -0700 2.2 +++ b/VMS.c Sat Oct 16 04:11:15 2010 -0700 2.3 @@ -79,7 +79,7 @@ 2.4 void 2.5 create_masterEnv() 2.6 { MasterEnv *masterEnv; 2.7 - SRSWQueueStruc **readyToAnimateQs; 2.8 + VMSQueueStruc **readyToAnimateQs; 2.9 int coreIdx; 2.10 VirtProcr **masterVPs; 2.11 SchedSlot ***allSchedSlots; //ptr to array of ptrs 2.12 @@ -93,7 +93,7 @@ 2.13 // masterEnv->coreLoopEndPt = ; 2.14 2.15 //Make a readyToAnimateQ for each core loop 2.16 - readyToAnimateQs = malloc( NUM_CORES * sizeof(SRSWQueueStruc *) ); 2.17 + readyToAnimateQs = malloc( NUM_CORES * sizeof(VMSQueueStruc *) ); 2.18 masterVPs = malloc( NUM_CORES * sizeof(VirtProcr *) ); 2.19 2.20 //One array for each core, 3 in array, core's masterVP scheds all 2.21 @@ -196,7 +196,22 @@ 2.22 { 2.23 //======================================================================== 2.24 // Create the Threads 2.25 - int coreIdx, retCode; 2.26 + int coreIdx, retCode, i; 2.27 + 2.28 + //create the arrays used to measure TSC offsets between cores 2.29 + pongNums = malloc( NUM_CORES * sizeof( int ) ); 2.30 + pingTimes = malloc( NUM_CORES * NUM_TSC_ROUND_TRIPS * sizeof( TSCount ) ); 2.31 + pongTimes = malloc( NUM_CORES * NUM_TSC_ROUND_TRIPS * sizeof( TSCount ) ); 2.32 + 2.33 + for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ ) 2.34 + { 2.35 + pongNums[ coreIdx ] = 0; 2.36 + for( i = 0; i < NUM_TSC_ROUND_TRIPS; i++ ) 2.37 + { 2.38 + pingTimes[ coreIdx * NUM_TSC_ROUND_TRIPS + i ] = (TSCount) 0; 2.39 + pingTimes[ coreIdx * NUM_TSC_ROUND_TRIPS + i ] = (TSCount) 0; 2.40 + } 2.41 + } 2.42 2.43 //Need the threads to be created suspended, and wait for a signal 2.44 // before proceeding -- gives time after creating to initialize other 2.45 @@ -230,7 +245,7 @@ 2.46 unsigned long long count = 0, freq = 0; 2.47 double runTime; 2.48 2.49 - startCount = getTSCount(); 2.50 + startCount = getTSC(); 2.51 2.52 //tell the core loop threads that setup is complete 2.53 //get lock, to lock out any threads still starting up -- they'll see 2.54 @@ -253,7 +268,7 @@ 2.55 // the Master env and rest of VMS locations 2.56 2.57 2.58 - endCount = getTSCount(); 2.59 + endCount = getTSC(); 2.60 count = endCount - startCount; 2.61 2.62 runTime = (double)count / (double)TSCOUNT_FREQ; 2.63 @@ -303,8 +318,7 @@ 2.64 // for 2 params + return addr. Return addr (NULL) is in loc pointed to 2.65 // by stackPtr, initData at stackPtr + 4 bytes, animatingPr just above 2.66 stackLocs = malloc( VIRT_PROCR_STACK_SIZE ); 2.67 - if(stackLocs == 0) 2.68 - {perror("malloc stack"); exit(1);} 2.69 + if(stackLocs == 0) {perror("error: malloc stack"); exit(1);} 2.70 newPr->startOfStack = stackLocs; 2.71 stackPtr = ( (char *)stackLocs + VIRT_PROCR_STACK_SIZE - 0x10 ); 2.72 //setup __cdecl on stack -- coreloop will switch to stackPtr before jmp 2.73 @@ -652,7 +666,7 @@ 2.74 void 2.75 VMS__cleanup_after_shutdown() 2.76 { 2.77 - SRSWQueueStruc **readyToAnimateQs; 2.78 + VMSQueueStruc **readyToAnimateQs; 2.79 int coreIdx; 2.80 VirtProcr **masterVPs; 2.81 SchedSlot ***allSchedSlots; //ptr to array of ptrs 2.82 @@ -680,7 +694,7 @@ 2.83 2.84 //=========================================================================== 2.85 2.86 -inline TSCount getTSCount() 2.87 +inline TSCount getTSC() 2.88 { unsigned int low, high; 2.89 TSCount out; 2.90
3.1 --- a/VMS.h Thu Oct 14 17:07:23 2010 -0700 3.2 +++ b/VMS.h Sat Oct 16 04:11:15 2010 -0700 3.3 @@ -7,7 +7,7 @@ 3.4 */ 3.5 3.6 #ifndef _VMS_H 3.7 -#define _VMS_H 3.8 +#define _VMS_H 3.9 #define __USE_GNU 3.10 3.11 #include "VMS_primitive_data_types.h" 3.12 @@ -56,10 +56,10 @@ 3.13 3.14 #define SUCCESS 0 3.15 3.16 -#define writeVMSQ writeCASQ 3.17 -#define readVMSQ readCASQ 3.18 -#define makeVMSQ makeCASQ 3.19 -#define VMSQueueStruc CASQueueStruc 3.20 +#define writeVMSQ writeSRSWQ 3.21 +#define readVMSQ readSRSWQ 3.22 +#define makeVMSQ makeSRSWQ 3.23 +#define VMSQueueStruc SRSWQueueStruc 3.24 3.25 //#define thdAttrs NULL //For PThreads 3.26 3.27 @@ -146,7 +146,7 @@ 3.28 RequestHandler requestHandler; 3.29 3.30 SchedSlot ***allSchedSlots; 3.31 - SRSWQueueStruc **readyToAnimateQs; 3.32 + VMSQueueStruc **readyToAnimateQs; 3.33 VirtProcr **masterVPs; 3.34 3.35 void *semanticEnv; 3.36 @@ -179,6 +179,7 @@ 3.37 3.38 volatile MasterEnv *_VMSMasterEnv; 3.39 3.40 + 3.41 //========================== 3.42 void 3.43 VMS__init(); 3.44 @@ -244,6 +245,13 @@ 3.45 void 3.46 VMS__cleanup_after_shutdown(); 3.47 3.48 +//========================== 3.49 +void 3.50 +measureTSCOffsetsAsCore0(); 3.51 + 3.52 +void 3.53 +measureTSCOffsetsAsRemoteCore( int coreIdx ); 3.54 + 3.55 //============================= Statistics ================================== 3.56 3.57 typedef unsigned long long TSCount; 3.58 @@ -269,11 +277,17 @@ 3.59 /* clobber */ : "%eax", "%edx" \ 3.60 ); 3.61 3.62 -inline TSCount getTSCount(); 3.63 +inline TSCount getTSC(); 3.64 + 3.65 +inline TSCount getTSC(); 3.66 3.67 //===================== Debug ========================== 3.68 int numProcrsCreated; 3.69 3.70 3.71 +int *pongNums; 3.72 +TSCount *pongTimes; 3.73 +TSCount *pingTimes; 3.74 + 3.75 #endif /* _VMS_H */ 3.76
