# HG changeset patch # User Me # Date 1288919598 25200 # Node ID 3bac84e4e56e3e88a9e4977cee9946143551f863 # Parent f8508572f3de9080da6148d30219271c7ecff065 Works with correct matrix mult Nov 4 -- switch animators macros, many updates Changed all queues back to VMSQ variants #defines correct, protected, work-stealing, with compiler switch in and out diff -r f8508572f3de -r 3bac84e4e56e CoreLoop.c --- a/CoreLoop.c Tue Nov 02 16:43:01 2010 -0700 +++ b/CoreLoop.c Thu Nov 04 18:13:18 2010 -0700 @@ -34,13 +34,24 @@ ThdParams *coreLoopThdParams; int thisCoresIdx; VirtProcr *currPr; - SRSWQueueStruc *readyToAnimateQ; + VMSQueueStruc *readyToAnimateQ; unsigned long coreMask; //has 1 in bit positions of allowed cores int errorCode; - + + //work-stealing struc on stack to prevent false-sharing in cache-line + volatile GateStruc gate; + //preGateProgress, waitProgress, exitProgress, gateClosed; + + coreLoopThdParams = (ThdParams *)paramsIn; thisCoresIdx = coreLoopThdParams->coreNum; + gate.gateClosed = FALSE; + gate.preGateProgress = 0; + gate.waitProgress = 0; + gate.exitProgress = 0; + _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = &gate;//race @startup + //wait until signalled that setup is complete pthread_mutex_lock( &suspendLock ); while( !(_VMSMasterEnv->setupComplete) ) @@ -87,32 +98,38 @@ // which forces reloading the pointer after each jmp to this point readyToAnimateQ = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx]; - currPr = (VirtProcr *) readSRSWQ_NonBlocking( readyToAnimateQ ); - + #ifdef USE_WORK_STEALING + //Alg for work-stealing designed to make common case fast. Comment + // in stealer code explains. + gate.preGateProgress++; + if( gate.gateClosed ) + { //now, set coreloop's progress, so stealer can see that core loop + // has made it into the waiting area. 
+ gate.waitProgress = gate.preGateProgress; + while( gate.gateClosed ) /*busy wait*/; + } + + currPr = (VirtProcr *) readVMSQ( readyToAnimateQ ); + + //Set the coreloop's progress, so stealer can see it has made it out + // of the protected area + gate.exitProgress = gate.preGateProgress; + #else + currPr = (VirtProcr *) readVMSQ( readyToAnimateQ ); + #endif + if( currPr != NULL ) _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0; int tries = 0; int gotLock = 0; - while( currPr == NULL ) - { //no VPs ready to animate, so run MasterVP --later make "try Master" - // VPs & put one in every queue at strategic point -- so have work - // avail if don't get lock & short-circuit out of it if master has - // recently run on another core - //TODO: perf -- "try Master" VP that checks if should run Master Fn - //But just letting queue run empty is quickest to see if pinning VP - // to core will solve the bizarre random seg-faults in system stack. - - //check if get the MasterLock + while( currPr == NULL ) //if queue was empty, enter get masterLock loop + { //queue was empty, so get master lock gotLock = __sync_bool_compare_and_swap( &(_VMSMasterEnv->masterLock), \ - UNLOCKED, LOCKED ); + UNLOCKED, LOCKED ); if( gotLock ) - { //run own MasterVP -- when its done, unlocks MasterLock and - // jumps back to coreLoops's startPt + { //run own MasterVP -- jmps to coreLoops startPt when done currPr = _VMSMasterEnv->masterVPs[thisCoresIdx]; - if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 100 ) - { //printf("1000 back to back MasterVP\n"); - //TODO: turn this into work-stealing from another core - //only yield if no work to steal -- and count consecutive yields - // if too many of those, then sleep for 10ms or whatever + if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 ) + { DEBUG( dbgB2BMaster,"Many back to back MasterVPs\n"); pthread_yield(); } _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1; @@ -124,38 +141,7 @@ } - //switch to virt procr's stack and frame ptr then jump 
to virt procr fn - void *stackPtr, *framePtr, *jmpPt, *coreLoopFramePtrAddr, \ - *coreLoopStackPtrAddr; - - stackPtr = currPr->stackPtr; - framePtr = currPr->framePtr; - jmpPt = currPr->nextInstrPt; - coreLoopFramePtrAddr = &(currPr->coreLoopFramePtr); - coreLoopStackPtrAddr = &(currPr->coreLoopStackPtr); - - //Save the core loop's stack and frame pointers into virt procr struct - // then switch to stack ptr and frame ptr of virt procr & jmp to it - //This was a pain to get right because GCC converts the "(jmpPt)" to - // frame-relative mem-op -- so generated machine code first changed the - // frame pointer, then tried to jump to an addr stored on stack, which - // it accessed as an offset from frame-ptr! (wrong frame-ptr now) - //Explicitly loading into eax before changing frame-ptr fixed it - //Also, it turns "(currPr->coreLoopFramePtr)" into a temporary on the - // stack, so "movl %%ebp, %0" saves to the temp, NOT the data-struc! - asm volatile("movl %0, %%eax; \ - movl %%esp, (%%eax); \ - movl %1, %%eax; \ - movl %%ebp, (%%eax); \ - movl %2, %%eax; \ - movl %3, %%esp; \ - movl %4, %%ebp; \ - jmp %%eax" \ - /* outputs */ : "=g"(coreLoopStackPtrAddr), \ - "=g"(coreLoopFramePtrAddr) \ - /* inputs */ : "g" (jmpPt), "g" (stackPtr), "g" (framePtr) \ - /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \ - ); + SwitchToVP( currPr ) //=========== jmp to here when want to shut down the VMS system ========== CoreLoopEndPt: @@ -176,7 +162,7 @@ coreLoop_Seq( void *paramsIn ) { VirtProcr *currPr; - SRSWQueueStruc *readyToAnimateQ; + VMSQueueStruc *readyToAnimateQ; ThdParams *coreLoopThdParams; int thisCoresIdx; @@ -207,7 +193,7 @@ //_VMSWorkQ must be a global, static volatile var, so not kept in reg, // which forces reloading the pointer after each jmp to this point readyToAnimateQ = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx]; - currPr = (VirtProcr *) readSRSWQ_NonBlocking( readyToAnimateQ ); + currPr = (VirtProcr *) readVMSQ( readyToAnimateQ ); 
if( currPr == NULL ) { if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 ) { printf("too many back to back MasterVP\n"); exit(1); } @@ -219,38 +205,7 @@ _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0; - //switch to virt procr's stack and frame ptr then jump to virt procr - void *stackPtr, *framePtr, *jmpPt, *coreLoopFramePtrAddr, \ - *coreLoopStackPtrAddr; - - stackPtr = currPr->stackPtr; - framePtr = currPr->framePtr; - jmpPt = currPr->nextInstrPt; - coreLoopFramePtrAddr = &(currPr->coreLoopFramePtr); - coreLoopStackPtrAddr = &(currPr->coreLoopStackPtr); - - //Save the core loop's stack and frame pointers into virt procr struct - // then switch to stack ptr and frame ptr of virt procr & jmp to it - //This was a pain to get right because GCC converts the "(jmpPt)" to - // frame-relative mem-op -- so generated machine code first changed the - // frame pointer, then tried to jump to an addr stored on stack, which - // it accessed as an offset from frame-ptr! (wrong frame-ptr now) - //Explicitly loading into eax before changing frame-ptr fixed it - //Also, it turns "(currPr->coreLoopFramePtr)" into a temporary on the - // stack, so "movl %%ebp, %0" saves to the temp, NOT the data-struc! - asm volatile("movl %0, %%eax; \ - movl %%esp, (%%eax); \ - movl %1, %%eax; \ - movl %%ebp, (%%eax); \ - movl %2, %%eax; \ - movl %3, %%esp; \ - movl %4, %%ebp; \ - jmp %%eax" \ - /* outputs */ : "=g"(coreLoopStackPtrAddr), \ - "=g"(coreLoopFramePtrAddr) \ - /* inputs */ : "g" (jmpPt), "g" (stackPtr), "g" (framePtr) \ - /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \ - ); + SwitchToVP( currPr ) //======================================================================== //jmp to here when want to shut down the VMS system. 
A shutdown VP is diff -r f8508572f3de -r 3bac84e4e56e MasterLoop.c --- a/MasterLoop.c Tue Nov 02 16:43:01 2010 -0700 +++ b/MasterLoop.c Thu Nov 04 18:13:18 2010 -0700 @@ -12,6 +12,14 @@ #include "VMS.h" +//=========================================================================== +void inline +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ, + VirtProcr *masterPr ); + +//=========================================================================== + + /*This code is animated by the virtual Master processor. * @@ -64,7 +72,7 @@ */ void masterLoop( void *initData, VirtProcr *animatingPr ) { - int slotIdx; + int32 slotIdx, numSlotsFilled; VirtProcr *schedVirtPr; SchedSlot *currSlot, **schedSlots; MasterEnv *masterEnv; @@ -74,7 +82,7 @@ RequestHandler requestHandler; void *semanticEnv; - int thisCoresIdx; + int32 thisCoresIdx; VirtProcr *masterPr; volatile VirtProcr *volatileMasterPr; @@ -108,7 +116,7 @@ masterEnv = _VMSMasterEnv; -//TODO: check that compiles so that always re-define from frame-storage + //GCC may optimize so doesn't always re-define from frame-storage masterPr = volatileMasterPr; //just to make sure after jmp thisCoresIdx = masterPr->coreAnimatedBy; readyToAnimateQ = masterEnv->readyToAnimateQs[thisCoresIdx]; @@ -120,6 +128,7 @@ //Poll each slot's Done flag + numSlotsFilled = 0; for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++) { currSlot = schedSlots[ slotIdx ]; @@ -141,46 +150,203 @@ { currSlot->procrAssignedToSlot = schedVirtPr; schedVirtPr->schedSlot = currSlot; currSlot->needsProcrAssigned = FALSE; - - writeSRSWQ( schedVirtPr, readyToAnimateQ ); + numSlotsFilled += 1; + + writeVMSQ( schedVirtPr, readyToAnimateQ ); } } } + + #ifdef USE_WORK_STEALING + //If no slots filled, means no more work, look for work to steal. 
+ if( numSlotsFilled == 0 ) + { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterPr ); + } + #endif - //Save stack ptr and frame, restore CoreLoop's stack and frame, - // and clear the MasterLock - //TODO: cafefully verify don't need to force saving anything to stack - // before jumping back to core loop. - void *stackPtrAddr, *framePtrAddr, *masterLockAddr; - void *jmpPt, *coreLoopFramePtr, *coreLoopStackPtr; - - stackPtrAddr = &(masterPr->stackPtr); - framePtrAddr = &(masterPr->framePtr); - masterLockAddr = &(_VMSMasterEnv->masterLock); - - jmpPt = _VMSMasterEnv->coreLoopStartPt; - coreLoopFramePtr = masterPr->coreLoopFramePtr;//need this only - coreLoopStackPtr = masterPr->coreLoopStackPtr;//shouldn't need -- safety #ifdef MEAS__TIME_MASTER saveLowTimeStampCountInto( masterPr->endMasterTSCLow ); #endif - asm volatile("movl %0, %%eax; \ - movl %%esp, (%%eax); \ - movl %1, %%eax; \ - movl %%ebp, (%%eax); \ - movl %2, %%ebx; \ - movl %3, %%eax; \ - movl %4, %%esp; \ - movl %5, %%ebp; \ - movl $0x0, (%%ebx); \ - jmp %%eax;" \ - /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr), \ - "=g"(masterLockAddr) \ - /* inputs */ : "g" (jmpPt), "g"(coreLoopStackPtr), "g"(coreLoopFramePtr)\ - /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \ - );//can probably make clobber list empty -- but safe for now + + masterSwitchToCoreLoop( masterPr ) } + + +/*This has a race condition -- the coreloops are accessing their own queues + * at the same time that this work-stealer on a different core is trying to read from them + */ +void inline +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ, + VirtProcr *masterPr ) + { + VirtProcr *stolenPr; + int32 coreIdx, i; + VMSQueueStruc *currQ; + + stolenPr = NULL; + coreIdx = masterPr->coreAnimatedBy; + for( i = 0; i < NUM_CORES -1; i++ ) + { + if( coreIdx >= NUM_CORES -1 ) + { coreIdx = 0; + } + else + { coreIdx++; + } + currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx]; + if( numInVMSQ( currQ ) > 0
) + { stolenPr = readVMSQ (currQ ); + break; + } + } + + if( stolenPr != NULL ) + { currSlot->procrAssignedToSlot = stolenPr; + stolenPr->schedSlot = currSlot; + currSlot->needsProcrAssigned = FALSE; + + writeVMSQ( stolenPr, readyToAnimateQ ); + } + } + +/*This algorithm makes the common case fast. Make the coreloop passive, + * and show its progress. Make the stealer control a gate that coreloop + * has to pass. + *To avoid interference, only one stealer at a time. Use a global + * stealer-lock. + * + *The pattern is based on a gate -- stealer shuts the gate, then monitors + * to be sure any already past make it all the way out, before starting. + *So, have a "progress" measure just before the gate, then have two after it, + * one is in a "waiting room" outside the gate, the other is at the exit. + *Then, the stealer first shuts the gate, then checks the progress measure + * outside it, then looks to see if the progress measure at the exit is the + * same. If yes, it knows the protected area is empty 'cause no other way + * to get in and the last to get in also exited. + *If the progress measure at the exit is not the same, then the stealer goes + * into a loop checking both the waiting-area and the exit progress-measures + * until one of them shows the same as the measure outside the gate. Might + * as well re-read the measure outside the gate each go around, just to be + * sure. It is guaranteed that one of the two will eventually match the one + * outside the gate. + * + *Here's an informal proof of correctness: + *The gate can be closed at any point, and have only four cases: + * 1) coreloop made it past the gate-closing but not yet past the exit + * 2) coreloop made it past the pre-gate progress update but not yet past + * the gate, + * 3) coreloop is right before the pre-gate update + * 4) coreloop is past the exit and far from the pre-gate update. 
+ * + * Covering the cases in reverse order, + * 4) is not a problem -- stealer will read pre-gate progress, see that it + * matches exit progress, and the gate is closed, so stealer can proceed. + * 3) stealer will read pre-gate progress just after coreloop updates it.. + * so stealer goes into a loop until the coreloop causes wait-progress + * to match pre-gate progress, so then stealer can proceed + * 2) same as 3.. + * 1) stealer reads pre-gate progress, sees that it's different than exit, + * so goes into loop until exit matches pre-gate, now it knows coreloop + * is not in protected and cannot get back in, so can proceed. + * + *Implementation for the stealer: + * + *First, acquire the stealer lock -- only cores with no work to do will + * compete to steal, so not a big performance penalty having only one -- + * will rarely have multiple stealers in a system with plenty of work -- and + * in a system with little work, it doesn't matter. + * + *Note, have single-reader, single-writer pattern for all variables used to + * communicate between stealer and victims + * + *So, scan the queues of the core loops, until find non-empty. Each core + * has its own list that it scans. The list goes in order from closest to + * furthest core, so it steals first from close cores. Later can add + * taking info from the app about overlapping footprints, and scan all the + * others then choose work with the most footprint overlap with the contents + * of this core's cache. + * + *Now, have a victim want to take work from. So, shut the gate in that + * coreloop, by setting the "gate closed" var on its stack to TRUE. + *Then, read the core's pre-gate progress and compare to the core's exit + * progress. + *If same, can proceed to take work from the coreloop's queue. When done, + * write FALSE to gate closed var. + *If different, then enter a loop that reads the pre-gate progress, then + * compares to exit progress then to wait progress. When one of two + * matches, proceed. 
Take work from the coreloop's queue. When done, + * write FALSE to the gate closed var. + * + */ +void inline +gateProtected_stealWorkInto( SchedSlot *currSlot, + VMSQueueStruc *myReadyToAnimateQ, + VirtProcr *masterPr ) + { + VirtProcr *stolenPr; + int32 coreIdx, i, haveAVictim, gotLock; + VMSQueueStruc *victimsQ; + + volatile GateStruc *vicGate; + int32 coreMightBeInProtected; + + + + //see if any other cores have work available to steal + haveAVictim = FALSE; + coreIdx = masterPr->coreAnimatedBy; + for( i = 0; i < NUM_CORES -1; i++ ) + { + if( coreIdx >= NUM_CORES -1 ) + { coreIdx = 0; + } + else + { coreIdx++; + } + victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx]; + if( numInVMSQ( victimsQ ) > 0 ) + { haveAVictim = TRUE; + vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ]; + break; + } + } + if( !haveAVictim ) return; //no work to steal, exit + + //have a victim core, now get the stealer-lock + gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock), + UNLOCKED, LOCKED ); + if( !gotLock ) return; //go back to core loop, which will re-start master + + + //====== Start Gate-protection ======= + vicGate->gateClosed = TRUE; + coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress; + while( coreMightBeInProtected ) + { //wait until sure + if( vicGate->preGateProgress == vicGate->waitProgress ) + coreMightBeInProtected = FALSE; + if( vicGate->preGateProgress == vicGate->exitProgress ) + coreMightBeInProtected = FALSE; + } + + stolenPr = readVMSQ ( victimsQ ); + + vicGate->gateClosed = FALSE; + //======= End Gate-protection ======= + + + if( stolenPr != NULL ) //victim could have been in protected and taken + { currSlot->procrAssignedToSlot = stolenPr; + stolenPr->schedSlot = currSlot; + currSlot->needsProcrAssigned = FALSE; + + writeVMSQ( stolenPr, myReadyToAnimateQ ); + } + + //unlock the work stealing lock + _VMSMasterEnv->workStealingLock = UNLOCKED; + } diff -r f8508572f3de -r 3bac84e4e56e SwitchAnimators.h --- 
/dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SwitchAnimators.h Thu Nov 04 18:13:18 2010 -0700 @@ -0,0 +1,138 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#ifndef _SwitchAnimators_H +#define _SwitchAnimators_H +#define __USE_GNU + +/*Isolating code for switching between animators within these macros -- at + * some point will make switches to compile for 32 bit or for 64 bit, which + * having these isolated will make cleaner + * + *This also makes it easier to change architectures, at some point + *And it cleans the code up, having the ugly assembly out of the way + */ + +//=========================== MasterVP to CoreLoop ========================== +// + //Save stack ptr and frame, restore CoreLoop's stack and frame, + // and clear the MasterLock + //GCC's -O3 messes with this -- go through generated -- protect somehow + // +#define masterSwitchToCoreLoop( masterPr ) \ + void *stackPtrAddr, *framePtrAddr, *masterLockAddr; \ + void *jmpPt, *coreLoopFramePtr, *coreLoopStackPtr; \ +\ + stackPtrAddr = &(masterPr->stackPtr); \ + framePtrAddr = &(masterPr->framePtr); \ + masterLockAddr = &(_VMSMasterEnv->masterLock); \ +\ + jmpPt = _VMSMasterEnv->coreLoopStartPt; \ + coreLoopFramePtr = masterPr->coreLoopFramePtr; \ + coreLoopStackPtr = masterPr->coreLoopStackPtr; \ +\ + asm volatile("movl %0, %%eax; \ + movl %%esp, (%%eax); \ + movl %1, %%eax; \ + movl %%ebp, (%%eax); \ + movl %2, %%ebx; \ + movl %3, %%eax; \ + movl %4, %%esp; \ + movl %5, %%ebp; \ + movl $0x0, (%%ebx); \ + jmp %%eax;" \ + /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr), \ + "=g"(masterLockAddr) \ + /* inputs */ : "g" (jmpPt), "g"(coreLoopStackPtr), "g"(coreLoopFramePtr)\ + /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \ + );//can probably make clobber list empty -- but safe for now + + +//=========================== SlaveVP to CoreLoop 
=========================== +// + +#define SwitchToCoreLoop( animatingPr ) \ + void *jmpPt, *stackPtrAddr, *framePtrAddr, *coreLoopStackPtr; \ + void *coreLoopFramePtr; \ +\ + stackPtrAddr = &(animatingPr->stackPtr); \ + framePtrAddr = &(animatingPr->framePtr); \ +\ + jmpPt = _VMSMasterEnv->coreLoopStartPt; \ + coreLoopFramePtr = animatingPr->coreLoopFramePtr; \ + coreLoopStackPtr = animatingPr->coreLoopStackPtr; \ +\ + /*Save the virt procr's stack and frame ptrs*/ \ + asm volatile("movl %0, %%eax; \ + movl %%esp, (%%eax); \ + movl %1, %%eax; \ + movl %%ebp, (%%eax) "\ + /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr) \ + /* inputs */ : \ + /* clobber */ : "%eax" \ + ); \ +\ + /*restore coreloop's frame ptr, then jump back to "start" of core loop*/\ + /*Note, GCC compiles to assembly that saves esp and ebp in the stack*/ \ + /* frame -- so have to explicitly do assembly that saves to memory*/ \ + asm volatile("movl %0, %%eax; \ + movl %1, %%esp; \ + movl %2, %%ebp; \ + jmp %%eax " \ + /* outputs */ : \ + /* inputs */ : "m" (jmpPt), "m"(coreLoopStackPtr), "m"(coreLoopFramePtr)\ + /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi","%esi" \ + ); + //list everything as clobbered to force GCC to save all + // live vars that are in regs on stack before this + // assembly, so that stack pointer is correct, before jmp + + + +//============================== CoreLoop to VP ============================= +// + //Save the core loop's stack and frame pointers into virt procr struct + // then switch to stack ptr and frame ptr of virt procr & jmp to it + //This was a pain to get right because GCC converts the "(jmpPt)" to + // frame-relative mem-op -- so generated machine code first changed the + // frame pointer, then tried to jump to an addr stored on stack, which + // it accessed as an offset from frame-ptr! 
(wrong frame-ptr now) + //Explicitly loading into eax before changing frame-ptr fixed it + //Also, it turns "(currPr->coreLoopFramePtr)" into a temporary on the + // stack, so "movl %%ebp, %0" saves to the temp, NOT the data-struc! + + + //switch to virt procr's stack and frame ptr then jump to virt procr fn + +#define SwitchToVP( currPr ) \ + void *stackPtr, *framePtr, *jmpPt, *coreLoopFramePtrAddr, \ + *coreLoopStackPtrAddr; \ +\ + stackPtr = currPr->stackPtr; \ + framePtr = currPr->framePtr; \ + jmpPt = currPr->nextInstrPt; \ + coreLoopFramePtrAddr = &(currPr->coreLoopFramePtr); \ + coreLoopStackPtrAddr = &(currPr->coreLoopStackPtr); \ +\ + asm volatile("movl %0, %%eax; \ + movl %%esp, (%%eax); \ + movl %1, %%eax; \ + movl %%ebp, (%%eax); \ + movl %2, %%eax; \ + movl %3, %%esp; \ + movl %4, %%ebp; \ + jmp %%eax" \ + /* outputs */ : "=g"(coreLoopStackPtrAddr), \ + "=g"(coreLoopFramePtrAddr) \ + /* inputs */ : "g" (jmpPt), "g" (stackPtr), "g" (framePtr) \ + /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" \ + ); + + +#endif /* _SwitchAnimators_H */ + diff -r f8508572f3de -r 3bac84e4e56e VMS.c --- a/VMS.c Tue Nov 02 16:43:01 2010 -0700 +++ b/VMS.c Thu Nov 04 18:13:18 2010 -0700 @@ -87,7 +87,7 @@ void create_masterEnv() { MasterEnv *masterEnv; - SRSWQueueStruc **readyToAnimateQs; + VMSQueueStruc **readyToAnimateQs; int coreIdx; VirtProcr **masterVPs; SchedSlot ***allSchedSlots; //ptr to array of ptrs @@ -105,7 +105,7 @@ masterEnv = _VMSMasterEnv; //Make a readyToAnimateQ for each core loop - readyToAnimateQs = VMS__malloc( NUM_CORES * sizeof(SRSWQueueStruc *) ); + readyToAnimateQs = VMS__malloc( NUM_CORES * sizeof(VMSQueueStruc *) ); masterVPs = VMS__malloc( NUM_CORES * sizeof(VirtProcr *) ); //One array for each core, 3 in array, core's masterVP scheds all @@ -114,18 +114,20 @@ _VMSMasterEnv->numProcrsCreated = 0; //used by create procr for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ ) { - readyToAnimateQs[ coreIdx ] = makeSRSWQ(); + 
readyToAnimateQs[ coreIdx ] = makeVMSQ(); //Q: should give masterVP core-specific info as its init data? masterVPs[ coreIdx ] = VMS__create_procr( &masterLoop, masterEnv ); masterVPs[ coreIdx ]->coreAnimatedBy = coreIdx; allSchedSlots[ coreIdx ] = create_sched_slots(); //makes for one core _VMSMasterEnv->numMasterInARow[ coreIdx ] = 0; + _VMSMasterEnv->workStealingGates[ coreIdx ] = NULL; } _VMSMasterEnv->readyToAnimateQs = readyToAnimateQs; _VMSMasterEnv->masterVPs = masterVPs; _VMSMasterEnv->masterLock = UNLOCKED; _VMSMasterEnv->allSchedSlots = allSchedSlots; + _VMSMasterEnv->workStealingLock = UNLOCKED; //Aug 19, 2010: no longer need to place initial masterVP into queue @@ -338,8 +340,7 @@ */ void VMS__suspend_procr( VirtProcr *animatingPr ) - { void *jmpPt, *stackPtrAddr, *framePtrAddr, *coreLoopStackPtr; - void *coreLoopFramePtr; + { //The request to master will cause this suspended virt procr to get // scheduled again at some future point -- to resume, core loop jumps @@ -350,23 +351,6 @@ //return ownership of the virt procr and sched slot to Master virt pr animatingPr->schedSlot->workIsDone = TRUE; - stackPtrAddr = &(animatingPr->stackPtr); - framePtrAddr = &(animatingPr->framePtr); - - jmpPt = _VMSMasterEnv->coreLoopStartPt; - coreLoopFramePtr = animatingPr->coreLoopFramePtr;//need this only - coreLoopStackPtr = animatingPr->coreLoopStackPtr;//safety - - //Save the virt procr's stack and frame ptrs, - asm volatile("movl %0, %%eax; \ - movl %%esp, (%%eax); \ - movl %1, %%eax; \ - movl %%ebp, (%%eax) "\ - /* outputs */ : "=g" (stackPtrAddr), "=g" (framePtrAddr) \ - /* inputs */ : \ - /* clobber */ : "%eax" \ - ); - //=========================== Measurement stuff ======================== #ifdef MEAS__TIME_STAMP_SUSP //record time stamp: compare to time-stamp recorded below @@ -374,20 +358,10 @@ #endif //======================================================================= - //restore coreloop's frame ptr, then jump back to "start" of core loop - //Note, GCC 
compiles to assembly that saves esp and ebp in the stack - // frame -- so have to explicitly do assembly that saves to memory - asm volatile("movl %0, %%eax; \ - movl %1, %%esp; \ - movl %2, %%ebp; \ - jmp %%eax " \ - /* outputs */ : \ - /* inputs */ : "m" (jmpPt), "m"(coreLoopStackPtr), "m"(coreLoopFramePtr)\ - /* clobber */ : "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi","%esi" \ - ); //list everything as clobbered to force GCC to save all - // live vars that are in regs on stack before this - // assembly, so that stack pointer is correct, before jmp + SwitchToCoreLoop( animatingPr ) + + //======================================================================= ResumePt: #ifdef MEAS__TIME_STAMP_SUSP //NOTE: only take low part of count -- do sanity check when take diff @@ -673,7 +647,7 @@ for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ ) { //Note, this is running in the master shutDownPr = VMS__create_procr( &endOSThreadFn, NULL ); - writeSRSWQ( shutDownPr, _VMSMasterEnv->readyToAnimateQs[coreIdx] ); + writeVMSQ( shutDownPr, _VMSMasterEnv->readyToAnimateQs[coreIdx] ); } } @@ -717,7 +691,7 @@ void VMS__cleanup_at_end_of_shutdown() { - SRSWQueueStruc **readyToAnimateQs; + VMSQueueStruc **readyToAnimateQs; int coreIdx; VirtProcr **masterVPs; SchedSlot ***allSchedSlots; //ptr to array of ptrs @@ -731,7 +705,7 @@ for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ ) { - freeSRSWQ( readyToAnimateQs[ coreIdx ] ); + freeVMSQ( readyToAnimateQs[ coreIdx ] ); //master VPs were created external to VMS, so use external free VMS__dissipate_procr( masterVPs[ coreIdx ] ); diff -r f8508572f3de -r 3bac84e4e56e VMS.h --- a/VMS.h Tue Nov 02 16:43:01 2010 -0700 +++ b/VMS.h Thu Nov 04 18:13:18 2010 -0700 @@ -11,7 +11,7 @@ #define __USE_GNU #include "VMS_primitive_data_types.h" -#include "Queue_impl/BlockingQueue.h" +#include "Queue_impl/PrivateQueue.h" #include "Histogram/Histogram.h" #include "DynArray/DynArray.h" #include "Hash_impl/PrivateHash.h" @@ -22,28 +22,36 @@ 
//=============================== Debug =================================== - //These defines turn types of bug messages on and off -#define dbgProbes FALSE -#define dbgAppFlow FALSE - +// //When SEQUENTIAL is defined, VMS does sequential exe in the main thread // It still does co-routines and all the mechanisms are the same, it just // has only a single thread and animates VPs one at a time //#define SEQUENTIAL +//#define USE_WORK_STEALING + //turns on the probe-instrumentation in the application -- when not // defined, the calls to the probe functions turn into comments #define STATS__ENABLE_PROBES + //These defines turn types of bug messages on and off + // be sure debug messages are un-commented (next block of defines) +#define dbgProbes FALSE /* for issues inside probes themselves*/ +#define dbgAppFlow TRUE /* Top level flow of application code -- general*/ +#define dbgB2BMaster FALSE/* in coreloop, back to back master VPs*/ +#define dbgRqstHdlr FALSE /* in request handler code*/ -#define DEBUG(msg)// printf(msg); fflush(stdin); -#define DEBUG_MSG( bool, msg) //if( bool){ printf(msg); fflush(stdin);} -#define PRINT1_DEBUG(msg, param) //printf(msg, param); fflush(stdin); -#define PRINT2_DEBUG(msg, p1, p2) //printf(msg, p1, p2); fflush(stdin); + //Comment or un- the substitute half to turn on/off types of debug message +#define DEBUG( bool, msg) \ + if( bool){ printf(msg); fflush(stdin);} +#define DEBUG1( bool, msg, param) \ + if(bool){printf(msg, param); fflush(stdin);} +#define DEBUG2( bool, msg, p1, p2) \ + //if(bool) {printf(msg, p1, p2); fflush(stdin);} -#define PRINT_ERROR(msg) printf(msg); fflush(stdin); -#define PRINT1_ERROR(msg, param) printf(msg, param); fflush(stdin); -#define PRINT2_ERROR(msg, p1, p2) printf(msg, p1, p2); fflush(stdin); +#define ERROR(msg) printf(msg); fflush(stdin); +#define ERROR1(msg, param) printf(msg, param); fflush(stdin); +#define ERROR2(msg, p1, p2) printf(msg, p1, p2); fflush(stdin); //=========================== STATS 
======================= @@ -56,6 +64,8 @@ #define MEAS__TIME_MASTER #define MEAS__NUM_TIMES_TO_RUN 100000 + //For code that calculates normalization-offset between TSC counts of + // different cores. #define NUM_TSC_ROUND_TRIPS 10 @@ -64,8 +74,9 @@ // machine #define NUM_CORES 4 - // balance amortizing master fixed overhead vs imbalance potential -#define NUM_SCHED_SLOTS 3 + // tradeoff amortizing master fixed overhead vs imbalance potential + // when work-stealing, can make bigger, at risk of losing cache affinity +#define NUM_SCHED_SLOTS 5 #define MIN_WORK_UNIT_CYCLES 20000 @@ -82,10 +93,11 @@ #define SUCCESS 0 -#define writeVMSQ writeSRSWQ -#define readVMSQ readSRSWQ -#define makeVMSQ makeSRSWQ -#define VMSQueueStruc SRSWQueueStruc +#define writeVMSQ writePrivQ +#define readVMSQ readPrivQ +#define makeVMSQ makePrivQ +#define numInVMSQ numInPrivQ +#define VMSQueueStruc PrivQueueStruc @@ -96,6 +108,8 @@ typedef struct _VMSReqst VMSReqst; typedef struct _VirtProcr VirtProcr; typedef struct _IntervalProbe IntervalProbe; +typedef struct _GateStruc GateStruc; + typedef VirtProcr * (*SlaveScheduler) ( void *, int ); //semEnv, coreIdx typedef void (*RequestHandler) ( VirtProcr *, void * ); //prWReqst, semEnv @@ -190,7 +204,7 @@ RequestHandler requestHandler; SchedSlot ***allSchedSlots; - SRSWQueueStruc **readyToAnimateQs; + VMSQueueStruc **readyToAnimateQs; VirtProcr **masterVPs; void *semanticEnv; @@ -205,6 +219,9 @@ int32 masterLock; int32 numMasterInARow[NUM_CORES];//detect back-to-back masterVP + GateStruc **workStealingGates[ NUM_CORES ]; //concurrent work-steal + int32 workStealingLock; + int32 numProcrsCreated; //gives ordering to processor creation //=========== MEASUREMENT STUFF ============= @@ -216,13 +233,21 @@ } MasterEnv; -//============================= +//========================= Extra Stuff Data Strucs ======================= typedef struct { } VMSExcp; +struct _GateStruc + { + int32 gateClosed; + int32 preGateProgress; + int32 waitProgress; + int32 
exitProgress; + }; +//GateStruc //======================= OS Thread related =============================== @@ -342,6 +367,7 @@ ); //===== +#include "SwitchAnimators.h" #include "probes.h" #endif /* _VMS_H */ diff -r f8508572f3de -r 3bac84e4e56e probes.c --- a/probes.c Tue Nov 02 16:43:01 2010 -0700 +++ b/probes.c Thu Nov 04 18:13:18 2010 -0700 @@ -253,7 +253,7 @@ VMS_impl__record_interval_start_in_probe( int32 probeID ) { IntervalProbe *probe; - DEBUG_MSG( dbgProbes, "record start of interval\n" ) + DEBUG( dbgProbes, "record start of interval\n" ) probe = _VMSMasterEnv->intervalProbes[ probeID ]; gettimeofday( &(probe->startStamp), NULL ); } @@ -268,7 +268,7 @@ struct timeval *endStamp, *startStamp; float64 startSecs, endSecs; - DEBUG_MSG( dbgProbes, "record end of interval\n" ) + DEBUG( dbgProbes, "record end of interval\n" ) //possible seg-fault if array resized by diff core right after this // one gets probe..? Something like that? Might be safe.. don't care probe = _VMSMasterEnv->intervalProbes[ probeID ]; diff -r f8508572f3de -r 3bac84e4e56e vmalloc.c --- a/vmalloc.c Tue Nov 02 16:43:01 2010 -0700 +++ b/vmalloc.c Thu Nov 04 18:13:18 2010 -0700 @@ -67,7 +67,7 @@ } if( foundElem == NULL ) - { PRINT_ERROR("\nmalloc failed\n") + { ERROR("\nmalloc failed\n") return NULL; //indicates malloc failed } //Using a kludge to identify the element that is the top chunk in the