Mercurial > cgi-bin > hgwebdir.cgi > VMS > VMS_Implementations > VMS_impls > VMS__MC_shared_impl
diff MasterLoop.c @ 178:c1784868dcea
testing hgeol -- see if it fixes line-ending issues -- commit line endings
| author | Me@portablequad |
|---|---|
| date | Wed, 04 Jan 2012 16:10:11 -0800 |
| parents | efb55f1b5fb9 |
| children | 7523ee70d66c 7cff4e13d5c4 |
line diff
1.1 --- a/MasterLoop.c Thu Oct 06 16:24:17 2011 +0200 1.2 +++ b/MasterLoop.c Wed Jan 04 16:10:11 2012 -0800 1.3 @@ -1,373 +1,373 @@ 1.4 -/* 1.5 - * Copyright 2010 OpenSourceStewardshipFoundation 1.6 - * 1.7 - * Licensed under BSD 1.8 - */ 1.9 - 1.10 - 1.11 - 1.12 -#include <stdio.h> 1.13 -#include <stddef.h> 1.14 - 1.15 -#include "VMS.h" 1.16 -#include "ProcrContext.h" 1.17 - 1.18 - 1.19 -//=========================================================================== 1.20 -void inline 1.21 -stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ, 1.22 - VirtProcr *masterPr ); 1.23 - 1.24 -//=========================================================================== 1.25 - 1.26 - 1.27 - 1.28 -/*This code is animated by the virtual Master processor. 1.29 - * 1.30 - *Polls each sched slot exactly once, hands any requests made by a newly 1.31 - * done slave to the "request handler" plug-in function 1.32 - * 1.33 - *Any slots that need a virt procr assigned are given to the "schedule" 1.34 - * plug-in function, which tries to assign a virt procr (slave) to it. 1.35 - * 1.36 - *When all slots needing a processor have been given to the schedule plug-in, 1.37 - * a fraction of the procrs successfully scheduled are put into the 1.38 - * work queue, then a continuation of this function is put in, then the rest 1.39 - * of the virt procrs that were successfully scheduled. 1.40 - * 1.41 - *The first thing the continuation does is busy-wait until the previous 1.42 - * animation completes. This is because an (unlikely) continuation may 1.43 - * sneak through queue before previous continuation is done putting second 1.44 - * part of scheduled slaves in, which is the only race condition. 1.45 - * 1.46 - */ 1.47 - 1.48 -/*May 29, 2010 -- birth a Master during init so that first core loop to 1.49 - * start running gets it and does all the stuff for a newly born -- 1.50 - * from then on, will be doing continuation, but do suspension self 1.51 - * directly at end of master loop 1.52 - *So VMS__init just births the master virtual processor same way it births 1.53 - * all the others -- then does any extra setup needed and puts it into the 1.54 - * work queue. 1.55 - *However means have to make masterEnv a global static volatile the same way 1.56 - * did with readyToAnimateQ in core loop. -- for performance, put the 1.57 - * jump to the core loop directly in here, and have it directly jump back. 1.58 - * 1.59 - * 1.60 - *Aug 18, 2010 -- Going to a separate MasterVP for each core, to see if this 1.61 - * avoids the suspected bug in the system stack that causes bizarre faults 1.62 - * at random places in the system code. 1.63 - * 1.64 - *So, this function is coupled to each of the MasterVPs, -- meaning this 1.65 - * function can't rely on a particular stack and frame -- each MasterVP that 1.66 - * animates this function has a different one. 1.67 - * 1.68 - *At this point, the masterLoop does not write itself into the queue anymore, 1.69 - * instead, the coreLoop acquires the masterLock when it has nothing to 1.70 - * animate, and then animates its own masterLoop. However, still try to put 1.71 - * several AppVPs into the queue to amortize the startup cost of switching 1.72 - * to the MasterVP. Note, don't have to worry about latency of requests much 1.73 - * because most requests generate work for same core -- only latency issue 1.74 - * is case when other cores starved and one core's requests generate work 1.75 - * for them -- so keep max in queue to 3 or 4.. 1.76 - */ 1.77 -void masterLoop( void *initData, VirtProcr *animatingPr ) 1.78 - { 1.79 - int32 slotIdx, numSlotsFilled; 1.80 - VirtProcr *schedVirtPr; 1.81 - SchedSlot *currSlot, **schedSlots; 1.82 - MasterEnv *masterEnv; 1.83 - VMSQueueStruc *readyToAnimateQ; 1.84 - 1.85 - SlaveScheduler slaveScheduler; 1.86 - RequestHandler requestHandler; 1.87 - void *semanticEnv; 1.88 - 1.89 - int32 thisCoresIdx; 1.90 - VirtProcr *masterPr; 1.91 - volatile VirtProcr *volatileMasterPr; 1.92 - 1.93 - volatileMasterPr = animatingPr; 1.94 - masterPr = (VirtProcr*)volatileMasterPr; //used to force re-define after jmp 1.95 - 1.96 - //First animation of each MasterVP will in turn animate this part 1.97 - // of setup code.. (VP creator sets up the stack as if this function 1.98 - // was called normally, but actually get here by jmp) 1.99 - //So, setup values about stack ptr, jmp pt and all that 1.100 - //masterPr->nextInstrPt = &&masterLoopStartPt; 1.101 - 1.102 - 1.103 - //Note, got rid of writing the stack and frame ptr up here, because 1.104 - // only one 1.105 - // core can ever animate a given MasterVP, so don't need to communicate 1.106 - // new frame and stack ptr to the MasterVP storage before a second 1.107 - // version of that MasterVP can get animated on a different core. 1.108 - //Also got rid of the busy-wait. 1.109 - 1.110 - 1.111 - //masterLoopStartPt: 1.112 - while(1){ 1.113 - 1.114 - //============================= MEASUREMENT STUFF ======================== 1.115 - #ifdef MEAS__TIME_MASTER 1.116 - //Total Master time includes one coreloop time -- just assume the core 1.117 - // loop time is same for Master as for AppVPs, even though it may be 1.118 - // smaller due to higher predictability of the fixed jmp. 1.119 - saveLowTimeStampCountInto( masterPr->startMasterTSCLow ); 1.120 - #endif 1.121 - //======================================================================== 1.122 - 1.123 - masterEnv = (MasterEnv*)_VMSMasterEnv; 1.124 - 1.125 - //GCC may optimize so doesn't always re-define from frame-storage 1.126 - masterPr = (VirtProcr*)volatileMasterPr; //just to make sure after jmp 1.127 - thisCoresIdx = masterPr->coreAnimatedBy; 1.128 - readyToAnimateQ = masterEnv->readyToAnimateQs[thisCoresIdx]; 1.129 - schedSlots = masterEnv->allSchedSlots[thisCoresIdx]; 1.130 - 1.131 - requestHandler = masterEnv->requestHandler; 1.132 - slaveScheduler = masterEnv->slaveScheduler; 1.133 - semanticEnv = masterEnv->semanticEnv; 1.134 - 1.135 - 1.136 - //Poll each slot's Done flag 1.137 - numSlotsFilled = 0; 1.138 - for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++) 1.139 - { 1.140 - currSlot = schedSlots[ slotIdx ]; 1.141 - 1.142 - if( currSlot->workIsDone ) 1.143 - { 1.144 - currSlot->workIsDone = FALSE; 1.145 - currSlot->needsProcrAssigned = TRUE; 1.146 - 1.147 - //process requests from slave to master 1.148 - //====================== MEASUREMENT STUFF =================== 1.149 - #ifdef MEAS__TIME_PLUGIN 1.150 - int32 startStamp1, endStamp1; 1.151 - saveLowTimeStampCountInto( startStamp1 ); 1.152 - #endif 1.153 - //============================================================ 1.154 - (*requestHandler)( currSlot->procrAssignedToSlot, semanticEnv ); 1.155 - //====================== MEASUREMENT STUFF =================== 1.156 - #ifdef MEAS__TIME_PLUGIN 1.157 - saveLowTimeStampCountInto( endStamp1 ); 1.158 - addIntervalToHist( startStamp1, endStamp1, 1.159 - _VMSMasterEnv->reqHdlrLowTimeHist ); 1.160 - addIntervalToHist( startStamp1, endStamp1, 1.161 - _VMSMasterEnv->reqHdlrHighTimeHist ); 1.162 - #endif 1.163 - //============================================================ 1.164 - } 1.165 - if( currSlot->needsProcrAssigned ) 1.166 - { //give slot a new virt procr 1.167 - schedVirtPr = 1.168 - (*slaveScheduler)( semanticEnv, thisCoresIdx ); 1.169 - 1.170 - if( schedVirtPr != NULL ) 1.171 - { currSlot->procrAssignedToSlot = schedVirtPr; 1.172 - schedVirtPr->schedSlot = currSlot; 1.173 - currSlot->needsProcrAssigned = FALSE; 1.174 - numSlotsFilled += 1; 1.175 - 1.176 - writeVMSQ( schedVirtPr, readyToAnimateQ ); 1.177 - } 1.178 - } 1.179 - } 1.180 - 1.181 - 1.182 - #ifdef USE_WORK_STEALING 1.183 - //If no slots filled, means no more work, look for work to steal. 1.184 - if( numSlotsFilled == 0 ) 1.185 - { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterPr ); 1.186 - } 1.187 - #endif 1.188 - 1.189 - 1.190 - #ifdef MEAS__TIME_MASTER 1.191 - saveLowTimeStampCountInto( masterPr->endMasterTSCLow ); 1.192 - #endif 1.193 - 1.194 - masterSwitchToCoreLoop(animatingPr); 1.195 - flushRegisters(); 1.196 - }//MasterLoop 1.197 - 1.198 - 1.199 - } 1.200 - 1.201 - 1.202 - 1.203 -/*This has a race condition -- the coreloops are accessing their own queues 1.204 - * at the same time that this work-stealer on a different core is trying to 1.205 - */ 1.206 -void inline 1.207 -stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ, 1.208 - VirtProcr *masterPr ) 1.209 - { 1.210 - VirtProcr *stolenPr; 1.211 - int32 coreIdx, i; 1.212 - VMSQueueStruc *currQ; 1.213 - 1.214 - stolenPr = NULL; 1.215 - coreIdx = masterPr->coreAnimatedBy; 1.216 - for( i = 0; i < NUM_CORES -1; i++ ) 1.217 - { 1.218 - if( coreIdx >= NUM_CORES -1 ) 1.219 - { coreIdx = 0; 1.220 - } 1.221 - else 1.222 - { coreIdx++; 1.223 - } 1.224 - currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx]; 1.225 - if( numInVMSQ( currQ ) > 0 ) 1.226 - { stolenPr = readVMSQ (currQ ); 1.227 - break; 1.228 - } 1.229 - } 1.230 - 1.231 - if( stolenPr != NULL ) 1.232 - { currSlot->procrAssignedToSlot = stolenPr; 1.233 - stolenPr->schedSlot = currSlot; 1.234 - currSlot->needsProcrAssigned = FALSE; 1.235 - 1.236 - writeVMSQ( stolenPr, readyToAnimateQ ); 1.237 - } 1.238 - } 1.239 - 1.240 -/*This algorithm makes the common case fast. Make the coreloop passive, 1.241 - * and show its progress. Make the stealer control a gate that coreloop 1.242 - * has to pass. 1.243 - *To avoid interference, only one stealer at a time. Use a global 1.244 - * stealer-lock. 1.245 - * 1.246 - *The pattern is based on a gate -- stealer shuts the gate, then monitors 1.247 - * to be sure any already past make it all the way out, before starting. 1.248 - *So, have a "progress" measure just before the gate, then have two after it, 1.249 - * one is in a "waiting room" outside the gate, the other is at the exit. 1.250 - *Then, the stealer first shuts the gate, then checks the progress measure 1.251 - * outside it, then looks to see if the progress measure at the exit is the 1.252 - * same. If yes, it knows the protected area is empty 'cause no other way 1.253 - * to get in and the last to get in also exited. 1.254 - *If the progress measure at the exit is not the same, then the stealer goes 1.255 - * into a loop checking both the waiting-area and the exit progress-measures 1.256 - * until one of them shows the same as the measure outside the gate. Might 1.257 - * as well re-read the measure outside the gate each go around, just to be 1.258 - * sure. It is guaranteed that one of the two will eventually match the one 1.259 - * outside the gate. 1.260 - * 1.261 - *Here's an informal proof of correctness: 1.262 - *The gate can be closed at any point, and have only four cases: 1.263 - * 1) coreloop made it past the gate-closing but not yet past the exit 1.264 - * 2) coreloop made it past the pre-gate progress update but not yet past 1.265 - * the gate, 1.266 - * 3) coreloop is right before the pre-gate update 1.267 - * 4) coreloop is past the exit and far from the pre-gate update. 1.268 - * 1.269 - * Covering the cases in reverse order, 1.270 - * 4) is not a problem -- stealer will read pre-gate progress, see that it 1.271 - * matches exit progress, and the gate is closed, so stealer can proceed. 1.272 - * 3) stealer will read pre-gate progress just after coreloop updates it.. 1.273 - * so stealer goes into a loop until the coreloop causes wait-progress 1.274 - * to match pre-gate progress, so then stealer can proceed 1.275 - * 2) same as 3.. 1.276 - * 1) stealer reads pre-gate progress, sees that it's different than exit, 1.277 - * so goes into loop until exit matches pre-gate, now it knows coreloop 1.278 - * is not in protected and cannot get back in, so can proceed. 1.279 - * 1.280 - *Implementation for the stealer: 1.281 - * 1.282 - *First, acquire the stealer lock -- only cores with no work to do will 1.283 - * compete to steal, so not a big performance penalty having only one -- 1.284 - * will rarely have multiple stealers in a system with plenty of work -- and 1.285 - * in a system with little work, it doesn't matter. 1.286 - * 1.287 - *Note, have single-reader, single-writer pattern for all variables used to 1.288 - * communicate between stealer and victims 1.289 - * 1.290 - *So, scan the queues of the core loops, until find non-empty. Each core 1.291 - * has its own list that it scans. The list goes in order from closest to 1.292 - * furthest core, so it steals first from close cores. Later can add 1.293 - * taking info from the app about overlapping footprints, and scan all the 1.294 - * others then choose work with the most footprint overlap with the contents 1.295 - * of this core's cache. 1.296 - * 1.297 - *Now, have a victim want to take work from. So, shut the gate in that 1.298 - * coreloop, by setting the "gate closed" var on its stack to TRUE. 1.299 - *Then, read the core's pre-gate progress and compare to the core's exit 1.300 - * progress. 1.301 - *If same, can proceed to take work from the coreloop's queue. When done, 1.302 - * write FALSE to gate closed var. 1.303 - *If different, then enter a loop that reads the pre-gate progress, then 1.304 - * compares to exit progress then to wait progress. When one of two 1.305 - * matches, proceed. Take work from the coreloop's queue. When done, 1.306 - * write FALSE to the gate closed var. 1.307 - * 1.308 - */ 1.309 -void inline 1.310 -gateProtected_stealWorkInto( SchedSlot *currSlot, 1.311 - VMSQueueStruc *myReadyToAnimateQ, 1.312 - VirtProcr *masterPr ) 1.313 - { 1.314 - VirtProcr *stolenPr; 1.315 - int32 coreIdx, i, haveAVictim, gotLock; 1.316 - VMSQueueStruc *victimsQ; 1.317 - 1.318 - volatile GateStruc *vicGate; 1.319 - int32 coreMightBeInProtected; 1.320 - 1.321 - 1.322 - 1.323 - //see if any other cores have work available to steal 1.324 - haveAVictim = FALSE; 1.325 - coreIdx = masterPr->coreAnimatedBy; 1.326 - for( i = 0; i < NUM_CORES -1; i++ ) 1.327 - { 1.328 - if( coreIdx >= NUM_CORES -1 ) 1.329 - { coreIdx = 0; 1.330 - } 1.331 - else 1.332 - { coreIdx++; 1.333 - } 1.334 - victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx]; 1.335 - if( numInVMSQ( victimsQ ) > 0 ) 1.336 - { haveAVictim = TRUE; 1.337 - vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ]; 1.338 - break; 1.339 - } 1.340 - } 1.341 - if( !haveAVictim ) return; //no work to steal, exit 1.342 - 1.343 - //have a victim core, now get the stealer-lock 1.344 - gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock), 1.345 - UNLOCKED, LOCKED ); 1.346 - if( !gotLock ) return; //go back to core loop, which will re-start master 1.347 - 1.348 - 1.349 - //====== Start Gate-protection ======= 1.350 - vicGate->gateClosed = TRUE; 1.351 - coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress; 1.352 - while( coreMightBeInProtected ) 1.353 - { //wait until sure 1.354 - if( vicGate->preGateProgress == vicGate->waitProgress ) 1.355 - coreMightBeInProtected = FALSE; 1.356 - if( vicGate->preGateProgress == vicGate->exitProgress ) 1.357 - coreMightBeInProtected = FALSE; 1.358 - } 1.359 - 1.360 - stolenPr = readVMSQ ( victimsQ ); 1.361 - 1.362 - vicGate->gateClosed = FALSE; 1.363 - //======= End Gate-protection ======= 1.364 - 1.365 - 1.366 - if( stolenPr != NULL ) //victim could have been in protected and taken 1.367 - { currSlot->procrAssignedToSlot = stolenPr; 1.368 - stolenPr->schedSlot = currSlot; 1.369 - currSlot->needsProcrAssigned = FALSE; 1.370 - 1.371 - writeVMSQ( stolenPr, myReadyToAnimateQ ); 1.372 - } 1.373 - 1.374 - //unlock the work stealing lock 1.375 - _VMSMasterEnv->workStealingLock = UNLOCKED; 1.376 - } 1.377 +/* 1.378 + * Copyright 2010 OpenSourceStewardshipFoundation 1.379 + * 1.380 + * Licensed under BSD 1.381 + */ 1.382 + 1.383 + 1.384 + 1.385 +#include <stdio.h> 1.386 +#include <stddef.h> 1.387 + 1.388 +#include "VMS.h" 1.389 +#include "ProcrContext.h" 1.390 + 1.391 + 1.392 +//=========================================================================== 1.393 +void inline 1.394 +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ, 1.395 + VirtProcr *masterPr ); 1.396 + 1.397 +//=========================================================================== 1.398 + 1.399 + 1.400 + 1.401 +/*This code is animated by the virtual Master processor. 1.402 + * 1.403 + *Polls each sched slot exactly once, hands any requests made by a newly 1.404 + * done slave to the "request handler" plug-in function 1.405 + * 1.406 + *Any slots that need a virt procr assigned are given to the "schedule" 1.407 + * plug-in function, which tries to assign a virt procr (slave) to it. 1.408 + * 1.409 + *When all slots needing a processor have been given to the schedule plug-in, 1.410 + * a fraction of the procrs successfully scheduled are put into the 1.411 + * work queue, then a continuation of this function is put in, then the rest 1.412 + * of the virt procrs that were successfully scheduled. 1.413 + * 1.414 + *The first thing the continuation does is busy-wait until the previous 1.415 + * animation completes. This is because an (unlikely) continuation may 1.416 + * sneak through queue before previous continuation is done putting second 1.417 + * part of scheduled slaves in, which is the only race condition. 1.418 + * 1.419 + */ 1.420 + 1.421 +/*May 29, 2010 -- birth a Master during init so that first core loop to 1.422 + * start running gets it and does all the stuff for a newly born -- 1.423 + * from then on, will be doing continuation, but do suspension self 1.424 + * directly at end of master loop 1.425 + *So VMS__init just births the master virtual processor same way it births 1.426 + * all the others -- then does any extra setup needed and puts it into the 1.427 + * work queue. 1.428 + *However means have to make masterEnv a global static volatile the same way 1.429 + * did with readyToAnimateQ in core loop. -- for performance, put the 1.430 + * jump to the core loop directly in here, and have it directly jump back. 1.431 + * 1.432 + * 1.433 + *Aug 18, 2010 -- Going to a separate MasterVP for each core, to see if this 1.434 + * avoids the suspected bug in the system stack that causes bizarre faults 1.435 + * at random places in the system code. 1.436 + * 1.437 + *So, this function is coupled to each of the MasterVPs, -- meaning this 1.438 + * function can't rely on a particular stack and frame -- each MasterVP that 1.439 + * animates this function has a different one. 1.440 + * 1.441 + *At this point, the masterLoop does not write itself into the queue anymore, 1.442 + * instead, the coreLoop acquires the masterLock when it has nothing to 1.443 + * animate, and then animates its own masterLoop. However, still try to put 1.444 + * several AppVPs into the queue to amortize the startup cost of switching 1.445 + * to the MasterVP. Note, don't have to worry about latency of requests much 1.446 + * because most requests generate work for same core -- only latency issue 1.447 + * is case when other cores starved and one core's requests generate work 1.448 + * for them -- so keep max in queue to 3 or 4.. 1.449 + */ 1.450 +void masterLoop( void *initData, VirtProcr *animatingPr ) 1.451 + { 1.452 + int32 slotIdx, numSlotsFilled; 1.453 + VirtProcr *schedVirtPr; 1.454 + SchedSlot *currSlot, **schedSlots; 1.455 + MasterEnv *masterEnv; 1.456 + VMSQueueStruc *readyToAnimateQ; 1.457 + 1.458 + SlaveScheduler slaveScheduler; 1.459 + RequestHandler requestHandler; 1.460 + void *semanticEnv; 1.461 + 1.462 + int32 thisCoresIdx; 1.463 + VirtProcr *masterPr; 1.464 + volatile VirtProcr *volatileMasterPr; 1.465 + 1.466 + volatileMasterPr = animatingPr; 1.467 + masterPr = (VirtProcr*)volatileMasterPr; //used to force re-define after jmp 1.468 + 1.469 + //First animation of each MasterVP will in turn animate this part 1.470 + // of setup code.. (VP creator sets up the stack as if this function 1.471 + // was called normally, but actually get here by jmp) 1.472 + //So, setup values about stack ptr, jmp pt and all that 1.473 + //masterPr->nextInstrPt = &&masterLoopStartPt; 1.474 + 1.475 + 1.476 + //Note, got rid of writing the stack and frame ptr up here, because 1.477 + // only one 1.478 + // core can ever animate a given MasterVP, so don't need to communicate 1.479 + // new frame and stack ptr to the MasterVP storage before a second 1.480 + // version of that MasterVP can get animated on a different core. 1.481 + //Also got rid of the busy-wait. 1.482 + 1.483 + 1.484 + //masterLoopStartPt: 1.485 + while(1){ 1.486 + 1.487 + //============================= MEASUREMENT STUFF ======================== 1.488 + #ifdef MEAS__TIME_MASTER 1.489 + //Total Master time includes one coreloop time -- just assume the core 1.490 + // loop time is same for Master as for AppVPs, even though it may be 1.491 + // smaller due to higher predictability of the fixed jmp. 1.492 + saveLowTimeStampCountInto( masterPr->startMasterTSCLow ); 1.493 + #endif 1.494 + //======================================================================== 1.495 + 1.496 + masterEnv = (MasterEnv*)_VMSMasterEnv; 1.497 + 1.498 + //GCC may optimize so doesn't always re-define from frame-storage 1.499 + masterPr = (VirtProcr*)volatileMasterPr; //just to make sure after jmp 1.500 + thisCoresIdx = masterPr->coreAnimatedBy; 1.501 + readyToAnimateQ = masterEnv->readyToAnimateQs[thisCoresIdx]; 1.502 + schedSlots = masterEnv->allSchedSlots[thisCoresIdx]; 1.503 + 1.504 + requestHandler = masterEnv->requestHandler; 1.505 + slaveScheduler = masterEnv->slaveScheduler; 1.506 + semanticEnv = masterEnv->semanticEnv; 1.507 + 1.508 + 1.509 + //Poll each slot's Done flag 1.510 + numSlotsFilled = 0; 1.511 + for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++) 1.512 + { 1.513 + currSlot = schedSlots[ slotIdx ]; 1.514 + 1.515 + if( currSlot->workIsDone ) 1.516 + { 1.517 + currSlot->workIsDone = FALSE; 1.518 + currSlot->needsProcrAssigned = TRUE; 1.519 + 1.520 + //process requests from slave to master 1.521 + //====================== MEASUREMENT STUFF =================== 1.522 + #ifdef MEAS__TIME_PLUGIN 1.523 + int32 startStamp1, endStamp1; 1.524 + saveLowTimeStampCountInto( startStamp1 ); 1.525 + #endif 1.526 + //============================================================ 1.527 + (*requestHandler)( currSlot->procrAssignedToSlot, semanticEnv ); 1.528 + //====================== MEASUREMENT STUFF =================== 1.529 + #ifdef MEAS__TIME_PLUGIN 1.530 + saveLowTimeStampCountInto( endStamp1 ); 1.531 + addIntervalToHist( startStamp1, endStamp1, 1.532 + _VMSMasterEnv->reqHdlrLowTimeHist ); 1.533 + addIntervalToHist( startStamp1, endStamp1, 1.534 + _VMSMasterEnv->reqHdlrHighTimeHist ); 1.535 + #endif 1.536 + //============================================================ 1.537 + } 1.538 + if( currSlot->needsProcrAssigned ) 1.539 + { //give slot a new virt procr 1.540 + schedVirtPr = 1.541 + (*slaveScheduler)( semanticEnv, thisCoresIdx ); 1.542 + 1.543 + if( schedVirtPr != NULL ) 1.544 + { currSlot->procrAssignedToSlot = schedVirtPr; 1.545 + schedVirtPr->schedSlot = currSlot; 1.546 + currSlot->needsProcrAssigned = FALSE; 1.547 + numSlotsFilled += 1; 1.548 + 1.549 + writeVMSQ( schedVirtPr, readyToAnimateQ ); 1.550 + } 1.551 + } 1.552 + } 1.553 + 1.554 + 1.555 + #ifdef USE_WORK_STEALING 1.556 + //If no slots filled, means no more work, look for work to steal. 1.557 + if( numSlotsFilled == 0 ) 1.558 + { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterPr ); 1.559 + } 1.560 + #endif 1.561 + 1.562 + 1.563 + #ifdef MEAS__TIME_MASTER 1.564 + saveLowTimeStampCountInto( masterPr->endMasterTSCLow ); 1.565 + #endif 1.566 + 1.567 + masterSwitchToCoreLoop(animatingPr); 1.568 + flushRegisters(); 1.569 + }//MasterLoop 1.570 + 1.571 + 1.572 + } 1.573 + 1.574 + 1.575 + 1.576 +/*This has a race condition -- the coreloops are accessing their own queues 1.577 + * at the same time that this work-stealer on a different core is trying to 1.578 + */ 1.579 +void inline 1.580 +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ, 1.581 + VirtProcr *masterPr ) 1.582 + { 1.583 + VirtProcr *stolenPr; 1.584 + int32 coreIdx, i; 1.585 + VMSQueueStruc *currQ; 1.586 + 1.587 + stolenPr = NULL; 1.588 + coreIdx = masterPr->coreAnimatedBy; 1.589 + for( i = 0; i < NUM_CORES -1; i++ ) 1.590 + { 1.591 + if( coreIdx >= NUM_CORES -1 ) 1.592 + { coreIdx = 0; 1.593 + } 1.594 + else 1.595 + { coreIdx++; 1.596 + } 1.597 + currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx]; 1.598 + if( numInVMSQ( currQ ) > 0 ) 1.599 + { stolenPr = readVMSQ (currQ ); 1.600 + break; 1.601 + } 1.602 + } 1.603 + 1.604 + if( stolenPr != NULL ) 1.605 + { currSlot->procrAssignedToSlot = stolenPr; 1.606 + stolenPr->schedSlot = currSlot; 1.607 + currSlot->needsProcrAssigned = FALSE; 1.608 + 1.609 + writeVMSQ( stolenPr, readyToAnimateQ ); 1.610 + } 1.611 + } 1.612 + 1.613 +/*This algorithm makes the common case fast. Make the coreloop passive, 1.614 + * and show its progress. Make the stealer control a gate that coreloop 1.615 + * has to pass. 1.616 + *To avoid interference, only one stealer at a time. Use a global 1.617 + * stealer-lock. 1.618 + * 1.619 + *The pattern is based on a gate -- stealer shuts the gate, then monitors 1.620 + * to be sure any already past make it all the way out, before starting. 1.621 + *So, have a "progress" measure just before the gate, then have two after it, 1.622 + * one is in a "waiting room" outside the gate, the other is at the exit. 1.623 + *Then, the stealer first shuts the gate, then checks the progress measure 1.624 + * outside it, then looks to see if the progress measure at the exit is the 1.625 + * same. If yes, it knows the protected area is empty 'cause no other way 1.626 + * to get in and the last to get in also exited. 1.627 + *If the progress measure at the exit is not the same, then the stealer goes 1.628 + * into a loop checking both the waiting-area and the exit progress-measures 1.629 + * until one of them shows the same as the measure outside the gate. Might 1.630 + * as well re-read the measure outside the gate each go around, just to be 1.631 + * sure. It is guaranteed that one of the two will eventually match the one 1.632 + * outside the gate. 1.633 + * 1.634 + *Here's an informal proof of correctness: 1.635 + *The gate can be closed at any point, and have only four cases: 1.636 + * 1) coreloop made it past the gate-closing but not yet past the exit 1.637 + * 2) coreloop made it past the pre-gate progress update but not yet past 1.638 + * the gate, 1.639 + * 3) coreloop is right before the pre-gate update 1.640 + * 4) coreloop is past the exit and far from the pre-gate update. 1.641 + * 1.642 + * Covering the cases in reverse order, 1.643 + * 4) is not a problem -- stealer will read pre-gate progress, see that it 1.644 + * matches exit progress, and the gate is closed, so stealer can proceed. 1.645 + * 3) stealer will read pre-gate progress just after coreloop updates it.. 1.646 + * so stealer goes into a loop until the coreloop causes wait-progress 1.647 + * to match pre-gate progress, so then stealer can proceed 1.648 + * 2) same as 3.. 1.649 + * 1) stealer reads pre-gate progress, sees that it's different than exit, 1.650 + * so goes into loop until exit matches pre-gate, now it knows coreloop 1.651 + * is not in protected and cannot get back in, so can proceed. 1.652 + * 1.653 + *Implementation for the stealer: 1.654 + * 1.655 + *First, acquire the stealer lock -- only cores with no work to do will 1.656 + * compete to steal, so not a big performance penalty having only one -- 1.657 + * will rarely have multiple stealers in a system with plenty of work -- and 1.658 + * in a system with little work, it doesn't matter. 1.659 + * 1.660 + *Note, have single-reader, single-writer pattern for all variables used to 1.661 + * communicate between stealer and victims 1.662 + * 1.663 + *So, scan the queues of the core loops, until find non-empty. Each core 1.664 + * has its own list that it scans. The list goes in order from closest to 1.665 + * furthest core, so it steals first from close cores. Later can add 1.666 + * taking info from the app about overlapping footprints, and scan all the 1.667 + * others then choose work with the most footprint overlap with the contents 1.668 + * of this core's cache. 1.669 + * 1.670 + *Now, have a victim want to take work from. So, shut the gate in that 1.671 + * coreloop, by setting the "gate closed" var on its stack to TRUE. 1.672 + *Then, read the core's pre-gate progress and compare to the core's exit 1.673 + * progress. 1.674 + *If same, can proceed to take work from the coreloop's queue. When done, 1.675 + * write FALSE to gate closed var. 1.676 + *If different, then enter a loop that reads the pre-gate progress, then 1.677 + * compares to exit progress then to wait progress. When one of two 1.678 + * matches, proceed. Take work from the coreloop's queue. When done, 1.679 + * write FALSE to the gate closed var. 1.680 + * 1.681 + */ 1.682 +void inline 1.683 +gateProtected_stealWorkInto( SchedSlot *currSlot, 1.684 + VMSQueueStruc *myReadyToAnimateQ, 1.685 + VirtProcr *masterPr ) 1.686 + { 1.687 + VirtProcr *stolenPr; 1.688 + int32 coreIdx, i, haveAVictim, gotLock; 1.689 + VMSQueueStruc *victimsQ; 1.690 + 1.691 + volatile GateStruc *vicGate; 1.692 + int32 coreMightBeInProtected; 1.693 + 1.694 + 1.695 + 1.696 + //see if any other cores have work available to steal 1.697 + haveAVictim = FALSE; 1.698 + coreIdx = masterPr->coreAnimatedBy; 1.699 + for( i = 0; i < NUM_CORES -1; i++ ) 1.700 + { 1.701 + if( coreIdx >= NUM_CORES -1 ) 1.702 + { coreIdx = 0; 1.703 + } 1.704 + else 1.705 + { coreIdx++; 1.706 + } 1.707 + victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx]; 1.708 + if( numInVMSQ( victimsQ ) > 0 ) 1.709 + { haveAVictim = TRUE; 1.710 + vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ]; 1.711 + break; 1.712 + } 1.713 + } 1.714 + if( !haveAVictim ) return; //no work to steal, exit 1.715 + 1.716 + //have a victim core, now get the stealer-lock 1.717 + gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock), 1.718 + UNLOCKED, LOCKED ); 1.719 + if( !gotLock ) return; //go back to core loop, which will re-start master 1.720 + 1.721 + 1.722 + //====== Start Gate-protection ======= 1.723 + vicGate->gateClosed = TRUE; 1.724 + coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress; 1.725 + while( coreMightBeInProtected ) 1.726 + { //wait until sure 1.727 + if( vicGate->preGateProgress == vicGate->waitProgress ) 1.728 + coreMightBeInProtected = FALSE; 1.729 + if( vicGate->preGateProgress == vicGate->exitProgress ) 1.730 + coreMightBeInProtected = FALSE; 1.731 + } 1.732 + 1.733 + stolenPr = readVMSQ ( victimsQ ); 1.734 + 1.735 + vicGate->gateClosed = FALSE; 1.736 + //======= End Gate-protection ======= 1.737 + 1.738 + 1.739 + if( stolenPr != NULL ) //victim could have been in protected and taken 1.740 + { currSlot->procrAssignedToSlot = stolenPr; 1.741 + stolenPr->schedSlot = currSlot; 1.742 + currSlot->needsProcrAssigned = FALSE; 1.743 + 1.744 + writeVMSQ( stolenPr, myReadyToAnimateQ ); 1.745 + } 1.746 + 1.747 + //unlock the work stealing lock 1.748 + _VMSMasterEnv->workStealingLock = UNLOCKED; 1.749 + }
