Mercurial > cgi-bin > hgwebdir.cgi > VMS > VMS_Implementations > VMS_impls > VMS__MC_shared_impl
changeset 55:3bac84e4e56e
Works with correct matrix mult (Nov 4) -- switch-animators macros, many updates
Changed all queue uses back to the VMSQ variant #defines
correct, protected, work-stealing, with compiler switch in and out
| author | Me |
|---|---|
| date | Thu, 04 Nov 2010 18:13:18 -0700 |
| parents | f8508572f3de |
| children | 26d53313a8f2 |
| files | CoreLoop.c MasterLoop.c SwitchAnimators.h VMS.c VMS.h probes.c vmalloc.c |
| diffstat | 7 files changed, 442 insertions(+), 183 deletions(-) [+] |
line diff
1.1 --- a/CoreLoop.c Tue Nov 02 16:43:01 2010 -0700 1.2 +++ b/CoreLoop.c Thu Nov 04 18:13:18 2010 -0700 1.3 @@ -34,13 +34,24 @@ 1.4 ThdParams *coreLoopThdParams; 1.5 int thisCoresIdx; 1.6 VirtProcr *currPr; 1.7 - SRSWQueueStruc *readyToAnimateQ; 1.8 + VMSQueueStruc *readyToAnimateQ; 1.9 unsigned long coreMask; //has 1 in bit positions of allowed cores 1.10 int errorCode; 1.11 - 1.12 + 1.13 + //work-stealing struc on stack to prevent false-sharing in cache-line 1.14 + volatile GateStruc gate; 1.15 + //preGateProgress, waitProgress, exitProgress, gateClosed; 1.16 + 1.17 + 1.18 coreLoopThdParams = (ThdParams *)paramsIn; 1.19 thisCoresIdx = coreLoopThdParams->coreNum; 1.20 1.21 + gate.gateClosed = FALSE; 1.22 + gate.preGateProgress = 0; 1.23 + gate.waitProgress = 0; 1.24 + gate.exitProgress = 0; 1.25 + _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = &gate;//race @startup 1.26 + 1.27 //wait until signalled that setup is complete 1.28 pthread_mutex_lock( &suspendLock ); 1.29 while( !(_VMSMasterEnv->setupComplete) ) 1.30 @@ -87,32 +98,38 @@ 1.31 // which forces reloading the pointer after each jmp to this point 1.32 readyToAnimateQ = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx]; 1.33 1.34 - currPr = (VirtProcr *) readSRSWQ_NonBlocking( readyToAnimateQ ); 1.35 - 1.36 + #ifdef USE_WORK_STEALING 1.37 + //Alg for work-stealing designed to make common case fast. Comment 1.38 + // in stealer code explains. 1.39 + gate.preGateProgress++; 1.40 + if( gate.gateClosed ) 1.41 + { //now, set coreloop's progress, so stealer can see that core loop 1.42 + // has made it into the waiting area. 
1.43 + gate.waitProgress = gate.preGateProgress; 1.44 + while( gate.gateClosed ) /*busy wait*/; 1.45 + } 1.46 + 1.47 + currPr = (VirtProcr *) readVMSQ( readyToAnimateQ ); 1.48 + 1.49 + //Set the coreloop's progress, so stealer can see it has made it out 1.50 + // of the protected area 1.51 + gate.exitProgress = gate.preGateProgress; 1.52 + #else 1.53 + currPr = (VirtProcr *) readVMSQ( readyToAnimateQ ); 1.54 + #endif 1.55 + 1.56 if( currPr != NULL ) _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0; 1.57 1.58 int tries = 0; int gotLock = 0; 1.59 - while( currPr == NULL ) 1.60 - { //no VPs ready to animate, so run MasterVP --later make "try Master" 1.61 - // VPs & put one in every queue at strategic point -- so have work 1.62 - // avail if don't get lock & short-circuit out of it if master has 1.63 - // recently run on another core 1.64 - //TODO: perf -- "try Master" VP that checks if should run Master Fn 1.65 - //But just letting queue run empty is quickest to see if pinning VP 1.66 - // to core will solve the bizarre random seg-faults in system stack. 
1.67 - 1.68 - //check if get the MasterLock 1.69 + while( currPr == NULL ) //if queue was empty, enter get masterLock loop 1.70 + { //queue was empty, so get master lock 1.71 gotLock = __sync_bool_compare_and_swap( &(_VMSMasterEnv->masterLock), \ 1.72 - UNLOCKED, LOCKED ); 1.73 + UNLOCKED, LOCKED ); 1.74 if( gotLock ) 1.75 - { //run own MasterVP -- when its done, unlocks MasterLock and 1.76 - // jumps back to coreLoops's startPt 1.77 + { //run own MasterVP -- jmps to coreLoops startPt when done 1.78 currPr = _VMSMasterEnv->masterVPs[thisCoresIdx]; 1.79 - if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 100 ) 1.80 - { //printf("1000 back to back MasterVP\n"); 1.81 - //TODO: turn this into work-stealing from another core 1.82 - //only yield if no work to steal -- and count consecutive yields 1.83 - // if too many of those, then sleep for 10ms or whatever 1.84 + if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 ) 1.85 + { DEBUG( dbgB2BMaster,"Many back to back MasterVPs\n"); 1.86 pthread_yield(); 1.87 } 1.88 _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1; 1.89 @@ -124,38 +141,7 @@ 1.90 } 1.91 1.92 1.93 - //switch to virt procr's stack and frame ptr then jump to virt procr fn 1.94 - void *stackPtr, *framePtr, *jmpPt, *coreLoopFramePtrAddr, \ 1.95 - *coreLoopStackPtrAddr; 1.96 - 1.97 - stackPtr = currPr->stackPtr; 1.98 - framePtr = currPr->framePtr; 1.99 - jmpPt = currPr->nextInstrPt; 1.100 - coreLoopFramePtrAddr = &(currPr->coreLoopFramePtr); 1.101 - coreLoopStackPtrAddr = &(currPr->coreLoopStackPtr); 1.102 - 1.103 - //Save the core loop's stack and frame pointers into virt procr struct 1.104 - // then switch to stack ptr and frame ptr of virt procr & jmp to it 1.105 - //This was a pain to get right because GCC converts the "(jmpPt)" to 1.106 - // frame-relative mem-op -- so generated machine code first changed the 1.107 - // frame pointer, then tried to jump to an addr stored on stack, which 1.108 - // it accessed as an offset from frame-ptr! 
(wrong frame-ptr now) 1.109 - //Explicitly loading into eax before changing frame-ptr fixed it 1.110 - //Also, it turns "(currPr->coreLoopFramePtr)" into a temporary on the 1.111 - // stack, so "movl %%ebp, %0" saves to the temp, NOT the data-struc! 1.112 - asm volatile("movl %0, %%eax; \ 1.113 - movl %%esp, (%%eax); \ 1.114 - movl %1, %%eax; \ 1.115 - movl %%ebp, (%%eax); \ 1.116 - movl %2, %%eax; \ 1.117 - movl %3, %%esp; \ 1.118 - movl %4, %%ebp; \ 1.119 - jmp %%eax" \ 1.120 - /* outputs */ : "=g"(coreLoopStackPtrAddr), \ 1.121 - "=g"(coreLoopFramePtrAddr) \ 1.122 - /* inputs */ : "g" (jmpPt), "g" (stackPtr), "g" (framePtr) \ 1.123 - /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \ 1.124 - ); 1.125 + SwitchToVP( currPr ) 1.126 1.127 //=========== jmp to here when want to shut down the VMS system ========== 1.128 CoreLoopEndPt: 1.129 @@ -176,7 +162,7 @@ 1.130 coreLoop_Seq( void *paramsIn ) 1.131 { 1.132 VirtProcr *currPr; 1.133 - SRSWQueueStruc *readyToAnimateQ; 1.134 + VMSQueueStruc *readyToAnimateQ; 1.135 1.136 ThdParams *coreLoopThdParams; 1.137 int thisCoresIdx; 1.138 @@ -207,7 +193,7 @@ 1.139 //_VMSWorkQ must be a global, static volatile var, so not kept in reg, 1.140 // which forces reloading the pointer after each jmp to this point 1.141 readyToAnimateQ = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx]; 1.142 - currPr = (VirtProcr *) readSRSWQ_NonBlocking( readyToAnimateQ ); 1.143 + currPr = (VirtProcr *) readVMSQ( readyToAnimateQ ); 1.144 if( currPr == NULL ) 1.145 { if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 ) 1.146 { printf("too many back to back MasterVP\n"); exit(1); } 1.147 @@ -219,38 +205,7 @@ 1.148 _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0; 1.149 1.150 1.151 - //switch to virt procr's stack and frame ptr then jump to virt procr 1.152 - void *stackPtr, *framePtr, *jmpPt, *coreLoopFramePtrAddr, \ 1.153 - *coreLoopStackPtrAddr; 1.154 - 1.155 - stackPtr = currPr->stackPtr; 1.156 - framePtr = currPr->framePtr; 
1.157 - jmpPt = currPr->nextInstrPt; 1.158 - coreLoopFramePtrAddr = &(currPr->coreLoopFramePtr); 1.159 - coreLoopStackPtrAddr = &(currPr->coreLoopStackPtr); 1.160 - 1.161 - //Save the core loop's stack and frame pointers into virt procr struct 1.162 - // then switch to stack ptr and frame ptr of virt procr & jmp to it 1.163 - //This was a pain to get right because GCC converts the "(jmpPt)" to 1.164 - // frame-relative mem-op -- so generated machine code first changed the 1.165 - // frame pointer, then tried to jump to an addr stored on stack, which 1.166 - // it accessed as an offset from frame-ptr! (wrong frame-ptr now) 1.167 - //Explicitly loading into eax before changing frame-ptr fixed it 1.168 - //Also, it turns "(currPr->coreLoopFramePtr)" into a temporary on the 1.169 - // stack, so "movl %%ebp, %0" saves to the temp, NOT the data-struc! 1.170 - asm volatile("movl %0, %%eax; \ 1.171 - movl %%esp, (%%eax); \ 1.172 - movl %1, %%eax; \ 1.173 - movl %%ebp, (%%eax); \ 1.174 - movl %2, %%eax; \ 1.175 - movl %3, %%esp; \ 1.176 - movl %4, %%ebp; \ 1.177 - jmp %%eax" \ 1.178 - /* outputs */ : "=g"(coreLoopStackPtrAddr), \ 1.179 - "=g"(coreLoopFramePtrAddr) \ 1.180 - /* inputs */ : "g" (jmpPt), "g" (stackPtr), "g" (framePtr) \ 1.181 - /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \ 1.182 - ); 1.183 + SwitchToVP( currPr ) 1.184 1.185 //======================================================================== 1.186 //jmp to here when want to shut down the VMS system. A shutdown VP is
2.1 --- a/MasterLoop.c Tue Nov 02 16:43:01 2010 -0700 2.2 +++ b/MasterLoop.c Thu Nov 04 18:13:18 2010 -0700 2.3 @@ -12,6 +12,14 @@ 2.4 #include "VMS.h" 2.5 2.6 2.7 +//=========================================================================== 2.8 +void inline 2.9 +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ, 2.10 + VirtProcr *masterPr ); 2.11 + 2.12 +//=========================================================================== 2.13 + 2.14 + 2.15 2.16 /*This code is animated by the virtual Master processor. 2.17 * 2.18 @@ -64,7 +72,7 @@ 2.19 */ 2.20 void masterLoop( void *initData, VirtProcr *animatingPr ) 2.21 { 2.22 - int slotIdx; 2.23 + int32 slotIdx, numSlotsFilled; 2.24 VirtProcr *schedVirtPr; 2.25 SchedSlot *currSlot, **schedSlots; 2.26 MasterEnv *masterEnv; 2.27 @@ -74,7 +82,7 @@ 2.28 RequestHandler requestHandler; 2.29 void *semanticEnv; 2.30 2.31 - int thisCoresIdx; 2.32 + int32 thisCoresIdx; 2.33 VirtProcr *masterPr; 2.34 volatile VirtProcr *volatileMasterPr; 2.35 2.36 @@ -108,7 +116,7 @@ 2.37 2.38 masterEnv = _VMSMasterEnv; 2.39 2.40 -//TODO: check that compiles so that always re-define from frame-storage 2.41 + //GCC may optimize so doesn't always re-define from frame-storage 2.42 masterPr = volatileMasterPr; //just to make sure after jmp 2.43 thisCoresIdx = masterPr->coreAnimatedBy; 2.44 readyToAnimateQ = masterEnv->readyToAnimateQs[thisCoresIdx]; 2.45 @@ -120,6 +128,7 @@ 2.46 2.47 2.48 //Poll each slot's Done flag 2.49 + numSlotsFilled = 0; 2.50 for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++) 2.51 { 2.52 currSlot = schedSlots[ slotIdx ]; 2.53 @@ -141,46 +150,203 @@ 2.54 { currSlot->procrAssignedToSlot = schedVirtPr; 2.55 schedVirtPr->schedSlot = currSlot; 2.56 currSlot->needsProcrAssigned = FALSE; 2.57 - 2.58 - writeSRSWQ( schedVirtPr, readyToAnimateQ ); 2.59 + numSlotsFilled += 1; 2.60 + 2.61 + writeVMSQ( schedVirtPr, readyToAnimateQ ); 2.62 } 2.63 } 2.64 } 2.65 2.66 + 2.67 + #ifdef USE_WORK_STEALING 2.68 + //If no slots 
filled, means no more work, look for work to steal. 2.69 + if( numSlotsFilled == 0 ) 2.70 + { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterPr ); 2.71 + } 2.72 + #endif 2.73 2.74 - //Save stack ptr and frame, restore CoreLoop's stack and frame, 2.75 - // and clear the MasterLock 2.76 - //TODO: cafefully verify don't need to force saving anything to stack 2.77 - // before jumping back to core loop. 2.78 - void *stackPtrAddr, *framePtrAddr, *masterLockAddr; 2.79 - void *jmpPt, *coreLoopFramePtr, *coreLoopStackPtr; 2.80 - 2.81 - stackPtrAddr = &(masterPr->stackPtr); 2.82 - framePtrAddr = &(masterPr->framePtr); 2.83 - masterLockAddr = &(_VMSMasterEnv->masterLock); 2.84 - 2.85 - jmpPt = _VMSMasterEnv->coreLoopStartPt; 2.86 - coreLoopFramePtr = masterPr->coreLoopFramePtr;//need this only 2.87 - coreLoopStackPtr = masterPr->coreLoopStackPtr;//shouldn't need -- safety 2.88 2.89 #ifdef MEAS__TIME_MASTER 2.90 saveLowTimeStampCountInto( masterPr->endMasterTSCLow ); 2.91 #endif 2.92 2.93 - asm volatile("movl %0, %%eax; \ 2.94 - movl %%esp, (%%eax); \ 2.95 - movl %1, %%eax; \ 2.96 - movl %%ebp, (%%eax); \ 2.97 - movl %2, %%ebx; \ 2.98 - movl %3, %%eax; \ 2.99 - movl %4, %%esp; \ 2.100 - movl %5, %%ebp; \ 2.101 - movl $0x0, (%%ebx); \ 2.102 - jmp %%eax;" \ 2.103 - /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr), \ 2.104 - "=g"(masterLockAddr) \ 2.105 - /* inputs */ : "g" (jmpPt), "g"(coreLoopStackPtr), "g"(coreLoopFramePtr)\ 2.106 - /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \ 2.107 - );//can probably make clobber list empty -- but safe for now 2.108 + 2.109 + masterSwitchToCoreLoop( masterPr ) 2.110 } 2.111 2.112 + 2.113 + 2.114 +/*This has a race condition -- the coreloops are accessing their own queues 2.115 + * at the same time that this work-stealer on a different core is trying to 2.116 + */ 2.117 +void inline 2.118 +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ, 2.119 + VirtProcr *masterPr ) 2.120 
+ { 2.121 + VirtProcr *stolenPr; 2.122 + int32 coreIdx, i; 2.123 + VMSQueueStruc *currQ; 2.124 + 2.125 + stolenPr = NULL; 2.126 + coreIdx = masterPr->coreAnimatedBy; 2.127 + for( i = 0; i < NUM_CORES -1; i++ ) 2.128 + { 2.129 + if( coreIdx >= NUM_CORES -1 ) 2.130 + { coreIdx = 0; 2.131 + } 2.132 + else 2.133 + { coreIdx++; 2.134 + } 2.135 + currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx]; 2.136 + if( numInVMSQ( currQ ) > 0 ) 2.137 + { stolenPr = readVMSQ (currQ ); 2.138 + break; 2.139 + } 2.140 + } 2.141 + 2.142 + if( stolenPr != NULL ) 2.143 + { currSlot->procrAssignedToSlot = stolenPr; 2.144 + stolenPr->schedSlot = currSlot; 2.145 + currSlot->needsProcrAssigned = FALSE; 2.146 + 2.147 + writeVMSQ( stolenPr, readyToAnimateQ ); 2.148 + } 2.149 + } 2.150 + 2.151 +/*This algorithm makes the common case fast. Make the coreloop passive, 2.152 + * and show its progress. Make the stealer control a gate that coreloop 2.153 + * has to pass. 2.154 + *To avoid interference, only one stealer at a time. Use a global 2.155 + * stealer-lock. 2.156 + * 2.157 + *The pattern is based on a gate -- stealer shuts the gate, then monitors 2.158 + * to be sure any already past make it all the way out, before starting. 2.159 + *So, have a "progress" measure just before the gate, then have two after it, 2.160 + * one is in a "waiting room" outside the gate, the other is at the exit. 2.161 + *Then, the stealer first shuts the gate, then checks the progress measure 2.162 + * outside it, then looks to see if the progress measure at the exit is the 2.163 + * same. If yes, it knows the protected area is empty 'cause no other way 2.164 + * to get in and the last to get in also exited. 2.165 + *If the progress measure at the exit is not the same, then the stealer goes 2.166 + * into a loop checking both the waiting-area and the exit progress-measures 2.167 + * until one of them shows the same as the measure outside the gate. 
Might 2.168 + * as well re-read the measure outside the gate each go around, just to be 2.169 + * sure. It is guaranteed that one of the two will eventually match the one 2.170 + * outside the gate. 2.171 + * 2.172 + *Here's an informal proof of correctness: 2.173 + *The gate can be closed at any point, and have only four cases: 2.174 + * 1) coreloop made it past the gate-closing but not yet past the exit 2.175 + * 2) coreloop made it past the pre-gate progress update but not yet past 2.176 + * the gate, 2.177 + * 3) coreloop is right before the pre-gate update 2.178 + * 4) coreloop is past the exit and far from the pre-gate update. 2.179 + * 2.180 + * Covering the cases in reverse order, 2.181 + * 4) is not a problem -- stealer will read pre-gate progress, see that it 2.182 + * matches exit progress, and the gate is closed, so stealer can proceed. 2.183 + * 3) stealer will read pre-gate progress just after coreloop updates it.. 2.184 + * so stealer goes into a loop until the coreloop causes wait-progress 2.185 + * to match pre-gate progress, so then stealer can proceed 2.186 + * 2) same as 3.. 2.187 + * 1) stealer reads pre-gate progress, sees that it's different than exit, 2.188 + * so goes into loop until exit matches pre-gate, now it knows coreloop 2.189 + * is not in protected and cannot get back in, so can proceed. 2.190 + * 2.191 + *Implementation for the stealer: 2.192 + * 2.193 + *First, acquire the stealer lock -- only cores with no work to do will 2.194 + * compete to steal, so not a big performance penalty having only one -- 2.195 + * will rarely have multiple stealers in a system with plenty of work -- and 2.196 + * in a system with little work, it doesn't matter. 2.197 + * 2.198 + *Note, have single-reader, single-writer pattern for all variables used to 2.199 + * communicate between stealer and victims 2.200 + * 2.201 + *So, scan the queues of the core loops, until find non-empty. Each core 2.202 + * has its own list that it scans. 
The list goes in order from closest to 2.203 + * furthest core, so it steals first from close cores. Later can add 2.204 + * taking info from the app about overlapping footprints, and scan all the 2.205 + * others then choose work with the most footprint overlap with the contents 2.206 + * of this core's cache. 2.207 + * 2.208 + *Now, have a victim want to take work from. So, shut the gate in that 2.209 + * coreloop, by setting the "gate closed" var on its stack to TRUE. 2.210 + *Then, read the core's pre-gate progress and compare to the core's exit 2.211 + * progress. 2.212 + *If same, can proceed to take work from the coreloop's queue. When done, 2.213 + * write FALSE to gate closed var. 2.214 + *If different, then enter a loop that reads the pre-gate progress, then 2.215 + * compares to exit progress then to wait progress. When one of two 2.216 + * matches, proceed. Take work from the coreloop's queue. When done, 2.217 + * write FALSE to the gate closed var. 2.218 + * 2.219 + */ 2.220 +void inline 2.221 +gateProtected_stealWorkInto( SchedSlot *currSlot, 2.222 + VMSQueueStruc *myReadyToAnimateQ, 2.223 + VirtProcr *masterPr ) 2.224 + { 2.225 + VirtProcr *stolenPr; 2.226 + int32 coreIdx, i, haveAVictim, gotLock; 2.227 + VMSQueueStruc *victimsQ; 2.228 + 2.229 + volatile GateStruc *vicGate; 2.230 + int32 coreMightBeInProtected; 2.231 + 2.232 + 2.233 + 2.234 + //see if any other cores have work available to steal 2.235 + haveAVictim = FALSE; 2.236 + coreIdx = masterPr->coreAnimatedBy; 2.237 + for( i = 0; i < NUM_CORES -1; i++ ) 2.238 + { 2.239 + if( coreIdx >= NUM_CORES -1 ) 2.240 + { coreIdx = 0; 2.241 + } 2.242 + else 2.243 + { coreIdx++; 2.244 + } 2.245 + victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx]; 2.246 + if( numInVMSQ( victimsQ ) > 0 ) 2.247 + { haveAVictim = TRUE; 2.248 + vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ]; 2.249 + break; 2.250 + } 2.251 + } 2.252 + if( !haveAVictim ) return; //no work to steal, exit 2.253 + 2.254 + //have a victim 
core, now get the stealer-lock 2.255 + gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock), 2.256 + UNLOCKED, LOCKED ); 2.257 + if( !gotLock ) return; //go back to core loop, which will re-start master 2.258 + 2.259 + 2.260 + //====== Start Gate-protection ======= 2.261 + vicGate->gateClosed = TRUE; 2.262 + coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress; 2.263 + while( coreMightBeInProtected ) 2.264 + { //wait until sure 2.265 + if( vicGate->preGateProgress == vicGate->waitProgress ) 2.266 + coreMightBeInProtected = FALSE; 2.267 + if( vicGate->preGateProgress == vicGate->exitProgress ) 2.268 + coreMightBeInProtected = FALSE; 2.269 + } 2.270 + 2.271 + stolenPr = readVMSQ ( victimsQ ); 2.272 + 2.273 + vicGate->gateClosed = FALSE; 2.274 + //======= End Gate-protection ======= 2.275 + 2.276 + 2.277 + if( stolenPr != NULL ) //victim could have been in protected and taken 2.278 + { currSlot->procrAssignedToSlot = stolenPr; 2.279 + stolenPr->schedSlot = currSlot; 2.280 + currSlot->needsProcrAssigned = FALSE; 2.281 + 2.282 + writeVMSQ( stolenPr, myReadyToAnimateQ ); 2.283 + } 2.284 + 2.285 + //unlock the work stealing lock 2.286 + _VMSMasterEnv->workStealingLock = UNLOCKED; 2.287 + }
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 3.2 +++ b/SwitchAnimators.h Thu Nov 04 18:13:18 2010 -0700 3.3 @@ -0,0 +1,138 @@ 3.4 +/* 3.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 3.6 + * Licensed under GNU General Public License version 2 3.7 + * 3.8 + * Author: seanhalle@yahoo.com 3.9 + * 3.10 + */ 3.11 + 3.12 +#ifndef _SwitchAnimators_H 3.13 +#define _SwitchAnimators_H 3.14 +#define __USE_GNU 3.15 + 3.16 +/*Isolating code for switching between animators within these macros -- at 3.17 + * some point will make switches to compile for 32 bit or for 64 bit, which 3.18 + * having these isolated will make cleaner 3.19 + * 3.20 + *This also makes it easier to change architectures, at some point 3.21 + *And it cleans the code up, having the ugly assembly out of the way 3.22 + */ 3.23 + 3.24 +//=========================== MasterVP to CoreLoop ========================== 3.25 +// 3.26 + //Save stack ptr and frame, restore CoreLoop's stack and frame, 3.27 + // and clear the MasterLock 3.28 + //GCC's -O3 messes with this -- go through generated -- protect somehow 3.29 + // 3.30 +#define masterSwitchToCoreLoop( masterPr ) \ 3.31 + void *stackPtrAddr, *framePtrAddr, *masterLockAddr; \ 3.32 + void *jmpPt, *coreLoopFramePtr, *coreLoopStackPtr; \ 3.33 +\ 3.34 + stackPtrAddr = &(masterPr->stackPtr); \ 3.35 + framePtrAddr = &(masterPr->framePtr); \ 3.36 + masterLockAddr = &(_VMSMasterEnv->masterLock); \ 3.37 +\ 3.38 + jmpPt = _VMSMasterEnv->coreLoopStartPt; \ 3.39 + coreLoopFramePtr = masterPr->coreLoopFramePtr; \ 3.40 + coreLoopStackPtr = masterPr->coreLoopStackPtr; \ 3.41 +\ 3.42 + asm volatile("movl %0, %%eax; \ 3.43 + movl %%esp, (%%eax); \ 3.44 + movl %1, %%eax; \ 3.45 + movl %%ebp, (%%eax); \ 3.46 + movl %2, %%ebx; \ 3.47 + movl %3, %%eax; \ 3.48 + movl %4, %%esp; \ 3.49 + movl %5, %%ebp; \ 3.50 + movl $0x0, (%%ebx); \ 3.51 + jmp %%eax;" \ 3.52 + /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr), \ 3.53 + "=g"(masterLockAddr) \ 3.54 + /* inputs */ : 
"g" (jmpPt), "g"(coreLoopStackPtr), "g"(coreLoopFramePtr)\ 3.55 + /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \ 3.56 + );//can probably make clobber list empty -- but safe for now 3.57 + 3.58 + 3.59 +//=========================== SlaveVP to CoreLoop =========================== 3.60 +// 3.61 + 3.62 +#define SwitchToCoreLoop( animatingPr ) \ 3.63 + void *jmpPt, *stackPtrAddr, *framePtrAddr, *coreLoopStackPtr; \ 3.64 + void *coreLoopFramePtr; \ 3.65 +\ 3.66 + stackPtrAddr = &(animatingPr->stackPtr); \ 3.67 + framePtrAddr = &(animatingPr->framePtr); \ 3.68 +\ 3.69 + jmpPt = _VMSMasterEnv->coreLoopStartPt; \ 3.70 + coreLoopFramePtr = animatingPr->coreLoopFramePtr; \ 3.71 + coreLoopStackPtr = animatingPr->coreLoopStackPtr; \ 3.72 +\ 3.73 + /*Save the virt procr's stack and frame ptrs*/ \ 3.74 + asm volatile("movl %0, %%eax; \ 3.75 + movl %%esp, (%%eax); \ 3.76 + movl %1, %%eax; \ 3.77 + movl %%ebp, (%%eax) "\ 3.78 + /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr) \ 3.79 + /* inputs */ : \ 3.80 + /* clobber */ : "%eax" \ 3.81 + ); \ 3.82 +\ 3.83 + /*restore coreloop's frame ptr, then jump back to "start" of core loop*/\ 3.84 + /*Note, GCC compiles to assembly that saves esp and ebp in the stack*/ \ 3.85 + /* frame -- so have to explicitly do assembly that saves to memory*/ \ 3.86 + asm volatile("movl %0, %%eax; \ 3.87 + movl %1, %%esp; \ 3.88 + movl %2, %%ebp; \ 3.89 + jmp %%eax " \ 3.90 + /* outputs */ : \ 3.91 + /* inputs */ : "m" (jmpPt), "m"(coreLoopStackPtr), "m"(coreLoopFramePtr)\ 3.92 + /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi","%esi" \ 3.93 + ); 3.94 + //list everything as clobbered to force GCC to save all 3.95 + // live vars that are in regs on stack before this 3.96 + // assembly, so that stack pointer is correct, before jmp 3.97 + 3.98 + 3.99 + 3.100 +//============================== CoreLoop to VP ============================= 3.101 +// 3.102 + //Save the core loop's stack and frame pointers into 
virt procr struct 3.103 + // then switch to stack ptr and frame ptr of virt procr & jmp to it 3.104 + //This was a pain to get right because GCC converts the "(jmpPt)" to 3.105 + // frame-relative mem-op -- so generated machine code first changed the 3.106 + // frame pointer, then tried to jump to an addr stored on stack, which 3.107 + // it accessed as an offset from frame-ptr! (wrong frame-ptr now) 3.108 + //Explicitly loading into eax before changing frame-ptr fixed it 3.109 + //Also, it turns "(currPr->coreLoopFramePtr)" into a temporary on the 3.110 + // stack, so "movl %%ebp, %0" saves to the temp, NOT the data-struc! 3.111 + 3.112 + 3.113 + //switch to virt procr's stack and frame ptr then jump to virt procr fn 3.114 + 3.115 +#define SwitchToVP( currPr ) \ 3.116 + void *stackPtr, *framePtr, *jmpPt, *coreLoopFramePtrAddr, \ 3.117 + *coreLoopStackPtrAddr; \ 3.118 +\ 3.119 + stackPtr = currPr->stackPtr; \ 3.120 + framePtr = currPr->framePtr; \ 3.121 + jmpPt = currPr->nextInstrPt; \ 3.122 + coreLoopFramePtrAddr = &(currPr->coreLoopFramePtr); \ 3.123 + coreLoopStackPtrAddr = &(currPr->coreLoopStackPtr); \ 3.124 +\ 3.125 + asm volatile("movl %0, %%eax; \ 3.126 + movl %%esp, (%%eax); \ 3.127 + movl %1, %%eax; \ 3.128 + movl %%ebp, (%%eax); \ 3.129 + movl %2, %%eax; \ 3.130 + movl %3, %%esp; \ 3.131 + movl %4, %%ebp; \ 3.132 + jmp %%eax" \ 3.133 + /* outputs */ : "=g"(coreLoopStackPtrAddr), \ 3.134 + "=g"(coreLoopFramePtrAddr) \ 3.135 + /* inputs */ : "g" (jmpPt), "g" (stackPtr), "g" (framePtr) \ 3.136 + /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \ 3.137 + ); 3.138 + 3.139 + 3.140 +#endif /* _SwitchAnimators_H */ 3.141 +
4.1 --- a/VMS.c Tue Nov 02 16:43:01 2010 -0700 4.2 +++ b/VMS.c Thu Nov 04 18:13:18 2010 -0700 4.3 @@ -87,7 +87,7 @@ 4.4 void 4.5 create_masterEnv() 4.6 { MasterEnv *masterEnv; 4.7 - SRSWQueueStruc **readyToAnimateQs; 4.8 + VMSQueueStruc **readyToAnimateQs; 4.9 int coreIdx; 4.10 VirtProcr **masterVPs; 4.11 SchedSlot ***allSchedSlots; //ptr to array of ptrs 4.12 @@ -105,7 +105,7 @@ 4.13 masterEnv = _VMSMasterEnv; 4.14 4.15 //Make a readyToAnimateQ for each core loop 4.16 - readyToAnimateQs = VMS__malloc( NUM_CORES * sizeof(SRSWQueueStruc *) ); 4.17 + readyToAnimateQs = VMS__malloc( NUM_CORES * sizeof(VMSQueueStruc *) ); 4.18 masterVPs = VMS__malloc( NUM_CORES * sizeof(VirtProcr *) ); 4.19 4.20 //One array for each core, 3 in array, core's masterVP scheds all 4.21 @@ -114,18 +114,20 @@ 4.22 _VMSMasterEnv->numProcrsCreated = 0; //used by create procr 4.23 for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ ) 4.24 { 4.25 - readyToAnimateQs[ coreIdx ] = makeSRSWQ(); 4.26 + readyToAnimateQs[ coreIdx ] = makeVMSQ(); 4.27 4.28 //Q: should give masterVP core-specific info as its init data? 
4.29 masterVPs[ coreIdx ] = VMS__create_procr( &masterLoop, masterEnv ); 4.30 masterVPs[ coreIdx ]->coreAnimatedBy = coreIdx; 4.31 allSchedSlots[ coreIdx ] = create_sched_slots(); //makes for one core 4.32 _VMSMasterEnv->numMasterInARow[ coreIdx ] = 0; 4.33 + _VMSMasterEnv->workStealingGates[ coreIdx ] = NULL; 4.34 } 4.35 _VMSMasterEnv->readyToAnimateQs = readyToAnimateQs; 4.36 _VMSMasterEnv->masterVPs = masterVPs; 4.37 _VMSMasterEnv->masterLock = UNLOCKED; 4.38 _VMSMasterEnv->allSchedSlots = allSchedSlots; 4.39 + _VMSMasterEnv->workStealingLock = UNLOCKED; 4.40 4.41 4.42 //Aug 19, 2010: no longer need to place initial masterVP into queue 4.43 @@ -338,8 +340,7 @@ 4.44 */ 4.45 void 4.46 VMS__suspend_procr( VirtProcr *animatingPr ) 4.47 - { void *jmpPt, *stackPtrAddr, *framePtrAddr, *coreLoopStackPtr; 4.48 - void *coreLoopFramePtr; 4.49 + { 4.50 4.51 //The request to master will cause this suspended virt procr to get 4.52 // scheduled again at some future point -- to resume, core loop jumps 4.53 @@ -350,23 +351,6 @@ 4.54 //return ownership of the virt procr and sched slot to Master virt pr 4.55 animatingPr->schedSlot->workIsDone = TRUE; 4.56 4.57 - stackPtrAddr = &(animatingPr->stackPtr); 4.58 - framePtrAddr = &(animatingPr->framePtr); 4.59 - 4.60 - jmpPt = _VMSMasterEnv->coreLoopStartPt; 4.61 - coreLoopFramePtr = animatingPr->coreLoopFramePtr;//need this only 4.62 - coreLoopStackPtr = animatingPr->coreLoopStackPtr;//safety 4.63 - 4.64 - //Save the virt procr's stack and frame ptrs, 4.65 - asm volatile("movl %0, %%eax; \ 4.66 - movl %%esp, (%%eax); \ 4.67 - movl %1, %%eax; \ 4.68 - movl %%ebp, (%%eax) "\ 4.69 - /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr) \ 4.70 - /* inputs */ : \ 4.71 - /* clobber */ : "%eax" \ 4.72 - ); 4.73 - 4.74 //=========================== Measurement stuff ======================== 4.75 #ifdef MEAS__TIME_STAMP_SUSP 4.76 //record time stamp: compare to time-stamp recorded below 4.77 @@ -374,20 +358,10 @@ 4.78 #endif 4.79 
//======================================================================= 4.80 4.81 - //restore coreloop's frame ptr, then jump back to "start" of core loop 4.82 - //Note, GCC compiles to assembly that saves esp and ebp in the stack 4.83 - // frame -- so have to explicitly do assembly that saves to memory 4.84 - asm volatile("movl %0, %%eax; \ 4.85 - movl %1, %%esp; \ 4.86 - movl %2, %%ebp; \ 4.87 - jmp %%eax " \ 4.88 - /* outputs */ : \ 4.89 - /* inputs */ : "m" (jmpPt), "m"(coreLoopStackPtr), "m"(coreLoopFramePtr)\ 4.90 - /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi","%esi" \ 4.91 - ); //list everything as clobbered to force GCC to save all 4.92 - // live vars that are in regs on stack before this 4.93 - // assembly, so that stack pointer is correct, before jmp 4.94 4.95 + SwitchToCoreLoop( animatingPr ) 4.96 + 4.97 + //======================================================================= 4.98 ResumePt: 4.99 #ifdef MEAS__TIME_STAMP_SUSP 4.100 //NOTE: only take low part of count -- do sanity check when take diff 4.101 @@ -673,7 +647,7 @@ 4.102 for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ ) 4.103 { //Note, this is running in the master 4.104 shutDownPr = VMS__create_procr( &endOSThreadFn, NULL ); 4.105 - writeSRSWQ( shutDownPr, _VMSMasterEnv->readyToAnimateQs[coreIdx] ); 4.106 + writeVMSQ( shutDownPr, _VMSMasterEnv->readyToAnimateQs[coreIdx] ); 4.107 } 4.108 4.109 } 4.110 @@ -717,7 +691,7 @@ 4.111 void 4.112 VMS__cleanup_at_end_of_shutdown() 4.113 { 4.114 - SRSWQueueStruc **readyToAnimateQs; 4.115 + VMSQueueStruc **readyToAnimateQs; 4.116 int coreIdx; 4.117 VirtProcr **masterVPs; 4.118 SchedSlot ***allSchedSlots; //ptr to array of ptrs 4.119 @@ -731,7 +705,7 @@ 4.120 4.121 for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ ) 4.122 { 4.123 - freeSRSWQ( readyToAnimateQs[ coreIdx ] ); 4.124 + freeVMSQ( readyToAnimateQs[ coreIdx ] ); 4.125 //master VPs were created external to VMS, so use external free 4.126 VMS__dissipate_procr( masterVPs[ coreIdx 
] ); 4.127
5.1 --- a/VMS.h Tue Nov 02 16:43:01 2010 -0700 5.2 +++ b/VMS.h Thu Nov 04 18:13:18 2010 -0700 5.3 @@ -11,7 +11,7 @@ 5.4 #define __USE_GNU 5.5 5.6 #include "VMS_primitive_data_types.h" 5.7 -#include "Queue_impl/BlockingQueue.h" 5.8 +#include "Queue_impl/PrivateQueue.h" 5.9 #include "Histogram/Histogram.h" 5.10 #include "DynArray/DynArray.h" 5.11 #include "Hash_impl/PrivateHash.h" 5.12 @@ -22,28 +22,36 @@ 5.13 5.14 5.15 //=============================== Debug =================================== 5.16 - //These defines turn types of bug messages on and off 5.17 -#define dbgProbes FALSE 5.18 -#define dbgAppFlow FALSE 5.19 - 5.20 +// 5.21 //When SEQUENTIAL is defined, VMS does sequential exe in the main thread 5.22 // It still does co-routines and all the mechanisms are the same, it just 5.23 // has only a single thread and animates VPs one at a time 5.24 //#define SEQUENTIAL 5.25 5.26 +//#define USE_WORK_STEALING 5.27 + 5.28 //turns on the probe-instrumentation in the application -- when not 5.29 // defined, the calls to the probe functions turn into comments 5.30 #define STATS__ENABLE_PROBES 5.31 5.32 + //These defines turn types of bug messages on and off 5.33 + // be sure debug messages are un-commented (next block of defines) 5.34 +#define dbgProbes FALSE /* for issues inside probes themselves*/ 5.35 +#define dbgAppFlow TRUE /* Top level flow of application code -- general*/ 5.36 +#define dbgB2BMaster FALSE/* in coreloop, back to back master VPs*/ 5.37 +#define dbgRqstHdlr FALSE /* in request handler code*/ 5.38 5.39 -#define DEBUG(msg)// printf(msg); fflush(stdin); 5.40 -#define DEBUG_MSG( bool, msg) //if( bool){ printf(msg); fflush(stdin);} 5.41 -#define PRINT1_DEBUG(msg, param) //printf(msg, param); fflush(stdin); 5.42 -#define PRINT2_DEBUG(msg, p1, p2) //printf(msg, p1, p2); fflush(stdin); 5.43 + //Comment or un- the substitute half to turn on/off types of debug message 5.44 +#define DEBUG( bool, msg) \ 5.45 + if( bool){ printf(msg); fflush(stdin);} 5.46 
+#define DEBUG1( bool, msg, param) \ 5.47 + if(bool){printf(msg, param); fflush(stdin);} 5.48 +#define DEBUG2( bool, msg, p1, p2) \ 5.49 + //if(bool) {printf(msg, p1, p2); fflush(stdin);} 5.50 5.51 -#define PRINT_ERROR(msg) printf(msg); fflush(stdin); 5.52 -#define PRINT1_ERROR(msg, param) printf(msg, param); fflush(stdin); 5.53 -#define PRINT2_ERROR(msg, p1, p2) printf(msg, p1, p2); fflush(stdin); 5.54 +#define ERROR(msg) printf(msg); fflush(stdin); 5.55 +#define ERROR1(msg, param) printf(msg, param); fflush(stdin); 5.56 +#define ERROR2(msg, p1, p2) printf(msg, p1, p2); fflush(stdin); 5.57 5.58 //=========================== STATS ======================= 5.59 5.60 @@ -56,6 +64,8 @@ 5.61 #define MEAS__TIME_MASTER 5.62 #define MEAS__NUM_TIMES_TO_RUN 100000 5.63 5.64 + //For code that calculates normalization-offset between TSC counts of 5.65 + // different cores. 5.66 #define NUM_TSC_ROUND_TRIPS 10 5.67 5.68 5.69 @@ -64,8 +74,9 @@ 5.70 // machine 5.71 #define NUM_CORES 4 5.72 5.73 - // balance amortizing master fixed overhead vs imbalance potential 5.74 -#define NUM_SCHED_SLOTS 3 5.75 + // tradeoff amortizing master fixed overhead vs imbalance potential 5.76 + // when work-stealing, can make bigger, at risk of losing cache affinity 5.77 +#define NUM_SCHED_SLOTS 5 5.78 5.79 #define MIN_WORK_UNIT_CYCLES 20000 5.80 5.81 @@ -82,10 +93,11 @@ 5.82 5.83 #define SUCCESS 0 5.84 5.85 -#define writeVMSQ writeSRSWQ 5.86 -#define readVMSQ readSRSWQ 5.87 -#define makeVMSQ makeSRSWQ 5.88 -#define VMSQueueStruc SRSWQueueStruc 5.89 +#define writeVMSQ writePrivQ 5.90 +#define readVMSQ readPrivQ 5.91 +#define makeVMSQ makePrivQ 5.92 +#define numInVMSQ numInPrivQ 5.93 +#define VMSQueueStruc PrivQueueStruc 5.94 5.95 5.96 5.97 @@ -96,6 +108,8 @@ 5.98 typedef struct _VMSReqst VMSReqst; 5.99 typedef struct _VirtProcr VirtProcr; 5.100 typedef struct _IntervalProbe IntervalProbe; 5.101 +typedef struct _GateStruc GateStruc; 5.102 + 5.103 5.104 typedef VirtProcr * (*SlaveScheduler) ( void *, 
int ); //semEnv, coreIdx 5.105 typedef void (*RequestHandler) ( VirtProcr *, void * ); //prWReqst, semEnv 5.106 @@ -190,7 +204,7 @@ 5.107 RequestHandler requestHandler; 5.108 5.109 SchedSlot ***allSchedSlots; 5.110 - SRSWQueueStruc **readyToAnimateQs; 5.111 + VMSQueueStruc **readyToAnimateQs; 5.112 VirtProcr **masterVPs; 5.113 5.114 void *semanticEnv; 5.115 @@ -205,6 +219,9 @@ 5.116 int32 masterLock; 5.117 5.118 int32 numMasterInARow[NUM_CORES];//detect back-to-back masterVP 5.119 + GateStruc **workStealingGates[ NUM_CORES ]; //concurrent work-steal 5.120 + int32 workStealingLock; 5.121 + 5.122 int32 numProcrsCreated; //gives ordering to processor creation 5.123 5.124 //=========== MEASUREMENT STUFF ============= 5.125 @@ -216,13 +233,21 @@ 5.126 } 5.127 MasterEnv; 5.128 5.129 -//============================= 5.130 +//========================= Extra Stuff Data Strucs ======================= 5.131 typedef struct 5.132 { 5.133 5.134 } 5.135 VMSExcp; 5.136 5.137 +struct _GateStruc 5.138 + { 5.139 + int32 gateClosed; 5.140 + int32 preGateProgress; 5.141 + int32 waitProgress; 5.142 + int32 exitProgress; 5.143 + }; 5.144 +//GateStruc 5.145 5.146 //======================= OS Thread related =============================== 5.147 5.148 @@ -342,6 +367,7 @@ 5.149 ); 5.150 //===== 5.151 5.152 +#include "SwitchAnimators.h" 5.153 #include "probes.h" 5.154 5.155 #endif /* _VMS_H */
6.1 --- a/probes.c Tue Nov 02 16:43:01 2010 -0700 6.2 +++ b/probes.c Thu Nov 04 18:13:18 2010 -0700 6.3 @@ -253,7 +253,7 @@ 6.4 VMS_impl__record_interval_start_in_probe( int32 probeID ) 6.5 { IntervalProbe *probe; 6.6 6.7 - DEBUG_MSG( dbgProbes, "record start of interval\n" ) 6.8 + DEBUG( dbgProbes, "record start of interval\n" ) 6.9 probe = _VMSMasterEnv->intervalProbes[ probeID ]; 6.10 gettimeofday( &(probe->startStamp), NULL ); 6.11 } 6.12 @@ -268,7 +268,7 @@ 6.13 struct timeval *endStamp, *startStamp; 6.14 float64 startSecs, endSecs; 6.15 6.16 - DEBUG_MSG( dbgProbes, "record end of interval\n" ) 6.17 + DEBUG( dbgProbes, "record end of interval\n" ) 6.18 //possible seg-fault if array resized by diff core right after this 6.19 // one gets probe..? Something like that? Might be safe.. don't care 6.20 probe = _VMSMasterEnv->intervalProbes[ probeID ];
7.1 --- a/vmalloc.c Tue Nov 02 16:43:01 2010 -0700 7.2 +++ b/vmalloc.c Thu Nov 04 18:13:18 2010 -0700 7.3 @@ -67,7 +67,7 @@ 7.4 } 7.5 7.6 if( foundElem == NULL ) 7.7 - { PRINT_ERROR("\nmalloc failed\n") 7.8 + { ERROR("\nmalloc failed\n") 7.9 return NULL; //indicates malloc failed 7.10 } 7.11 //Using a kludge to identify the element that is the top chunk in the
