# HG changeset patch # User Sean # Date 1337279984 -7200 # Node ID 227cd4d33d9423cf38320fec90b87f3ed4af5d90 # Parent b4f684e98d0be2d6477262725ac20d56b7a4427a fixed pointing slave to a TLF, added ones that don't trash stack fixed names diff -r b4f684e98d0b -r 227cd4d33d94 Defines/VMS_defs__turn_on_and_off.h --- a/Defines/VMS_defs__turn_on_and_off.h Tue May 08 18:58:41 2012 +0200 +++ b/Defines/VMS_defs__turn_on_and_off.h Thu May 17 20:39:44 2012 +0200 @@ -15,7 +15,7 @@ * It still does co-routines and all the mechanisms are the same, it just * has only a single thread and animates Slvs one at a time */ -//#define DEBUG__TURN_ON_SEQUENTIAL_MODE +#define DEBUG__TURN_ON_SEQUENTIAL_MODE /*turns on the probe-instrumentation in the application -- when not @@ -29,6 +29,7 @@ #define dbgProbes FALSE /* for issues inside probes themselves*/ #define dbgB2BMaster FALSE /* in coreloop, back to back master Slvs*/ #define dbgRqstHdlr FALSE /* in request handler code*/ +#define dbgSS FALSE /* in request handler code*/ //#define DEBUG__TURN_ON_ERROR_MSGS @@ -60,9 +61,9 @@ /*turn on/off subtraction of create measurements from plugin meas*/ //#define MEAS__TURN_ON_EXCLUDE_CREATION_TIME -#define HOLISTIC__TURN_ON_PERF_COUNTERS -#define HOLISTIC__TURN_ON_OBSERVE_UCC -#define DETECT_LOOP_GRAPH +//#define HOLISTIC__TURN_ON_PERF_COUNTERS +//#define HOLISTIC__TURN_ON_OBSERVE_UCC +//#define DETECT_LOOP_GRAPH //=================== Turn on or off system options ======================= // diff -r b4f684e98d0b -r 227cd4d33d94 HW_Dependent_Primitives/VMS__primitives.c --- a/HW_Dependent_Primitives/VMS__primitives.c Tue May 08 18:58:41 2012 +0200 +++ b/HW_Dependent_Primitives/VMS__primitives.c Thu May 17 20:39:44 2012 +0200 @@ -5,19 +5,20 @@ #include "../VMS.h" -/*Set up the stack with __cdecl structure on it - * Except doing a trick for 64 bits, where put top-level fn pointer on - * stack, then call an assembly helper that copies it into a reg and +/*Reset the stack then set it up with __cdecl 
structure on it + * Except doing a trick for 64 bits, where point slave to helper assembly + * that copies the function pointer off stack and into a reg, then * jumps to it. So, set the resumeInstrPtr to the helper-assembly. - *No need to save registers on old stack frame, because there's no old - * animator state to return to + *This is for first-time startup of slave.. it trashes the stack. + *No registers saved into old stack frame, and no animator state to + * return to * *This was factored into separate function because it's used stand-alone in * some wrapper-libraries (but only "int" version, to warn users to check * carefully that it's safe) */ inline void -VMS_int__point_slaveVP_to_Fn( SlaveVP *slaveVP, TopLevelFnPtr fnPtr, +VMS_int__reset_slaveVP_to_TopLvlFn( SlaveVP *slaveVP, TopLevelFnPtr fnPtr, void *dataParam) { void *stackPtr; @@ -28,9 +29,9 @@ //fnPtr takes two params -- void *dataParam & void *animSlv // Stack grows *down*, so start it at highest stack addr, minus room - // for 2 params + return addr. + // for 2 params + return addr. Do ptr arith in terms of bytes.. stackPtr = - (void *)slaveVP->startOfStack + VIRT_PROCR_STACK_SIZE - 4*sizeof(void*); + (uint8 *)slaveVP->startOfStack + VIRT_PROCR_STACK_SIZE - 4*sizeof(void*); //setup __cdecl on stack //Normally, return Addr is in loc pointed to by stackPtr, but doing a @@ -38,6 +39,7 @@ // and set resumeInstrPtr to a helper-fn that copies the top-level // fn ptr and params into registers. 
//Then, dataParam is at stackPtr + 8 bytes, & animating SlaveVP above + //Do ptr arith in terms of pointers *((SlaveVP**)stackPtr + 2 ) = slaveVP; //rightmost param *((void**)stackPtr + 1 ) = dataParam; //next param to left *((void**)stackPtr) = (void*)fnPtr; //copied to reg by helper Fn @@ -46,8 +48,90 @@ // end of Hardware dependent part //core controller will switch to stack & frame pointers stored in slave, - // suspend will save processor's stack and frame into slave + // can't use this fn if have state on stack that needs preserving. slaveVP->stackPtr = stackPtr; slaveVP->framePtr = stackPtr; } + +/*Preserve the stack, pushing the __cdecl structure onto it + * For 64 bits, params passed in regs, so point slave to helper assembly + * that copies the arguments off stack and into regs, then + * jumps to Fn. So, set the resumeInstrPtr to the helper-assembly. + * + *This preserves the stack state existed at time slave was suspended. + */ +inline void +VMS_int__point_slaveVP_to_OneParamFn( SlaveVP *slaveVP, void *fnPtr, + void *param) + { void *stackPtr; + +// Start of Hardware dependent part + + // Get the slave's current stack ptr, and make room for param + ret addr + stackPtr = ((void **)slaveVP->stackPtr - 2); + + //save slave's current instr ptr as the return addr, so stack looks + // just like it does after a call instr. 
+ //Put argument plus fn addr onto stack -- helper will copy into regs + // then jump to the fn + //fnPtr is just below top of stack, param is above at stackPtr + 8 bytes + *((void**)stackPtr + 1 ) = param; + *((void**)stackPtr) = slaveVP->resumeInstrPtr; //acts as return addr + *((void**)stackPtr - 1) = (void*)fnPtr; //what helper jmps to + + //Set slave's instr pointer to a helper Fn that copies params from stack + slaveVP->resumeInstrPtr = (TopLevelFnPtr)&jmpToOneParamFn; + +// end of Hardware dependent part + + //core controller will switch to stack & frame pointers stored in slave, + // then jmp to helper Fn, which will then move param to register used + // to pass argument and jmp to fnPtr saved on stack. + //That fn should save the framePtr on stack and make room + // for its own frame, as normal. So don't modify framePtr, only stack + slaveVP->stackPtr = stackPtr; + } + + +/*Same as for one-parameter function, but puts two arguments on stack + *Preserve the stack, pushing the __cdecl structure onto it + * For 64 bits, params passed in regs, so point slave to helper assembly + * that copies the arguments off stack and into regs, then + * jumps to Fn. So, set the resumeInstrPtr to the helper-assembly. + * + *This preserves the stack state that existed at the time the slave was + * suspended. + */ +inline void +VMS_int__point_slaveVP_to_TwoParamFn( SlaveVP *slaveVP, void *fnPtr, + void *param1, void *param2) + { void *stackPtr; + +// Start of Hardware dependent part + + // Get the slave's current stack ptr, and make room for two params + + // ret addr (three pointer-sized slots) + //Cast to void** so the subtraction counts pointer-sized slots, not bytes + // (same form as the one-param version) + stackPtr = ((void **)slaveVP->stackPtr - 3); + + //save slave's current instr ptr as the return addr, so stack looks + // just like it does after a call instr.
+ //Put argument plus fn addr onto stack -- helper will copy into regs + // then jump to the fn + //fnPtr is just below top of stack, param1 is above at stackPtr + 8 bytes + *((void**)stackPtr + 2 ) = param2; + *((void**)stackPtr + 1 ) = param1; + *((void**)stackPtr) = slaveVP->resumeInstrPtr; //acts as return addr + *((void**)stackPtr - 1) = (void*)fnPtr; //what helper jmps to + + //Set slave's instr pointer to a helper Fn that copies params from stack + slaveVP->resumeInstrPtr = (TopLevelFnPtr)&jmpToTwoParamFn; + +// end of Hardware dependent part + + //core controller will switch to stack & frame pointers stored in slave, + // then jmp to helper Fn, which will then move param to register used + // to pass argument and jmp to fnPtr saved on stack. + //That fn should save the framePtr on stack and make room + // for its own frame, as normal. So don't modify framePtr, only stack + slaveVP->stackPtr = stackPtr; + } + diff -r b4f684e98d0b -r 227cd4d33d94 HW_Dependent_Primitives/VMS__primitives.h --- a/HW_Dependent_Primitives/VMS__primitives.h Tue May 08 18:58:41 2012 +0200 +++ b/HW_Dependent_Primitives/VMS__primitives.h Thu May 17 20:39:44 2012 +0200 @@ -25,6 +25,12 @@ void startUpTopLevelFn(); +void +jmpToOneParamFn(); + +void +jmpToTwoParamFn(); + void * asmTerminateCoreCtlr(SlaveVP *currSlv); @@ -37,5 +43,13 @@ void VMS_int__return_to_addr_in_ptd_to_loc(void *ptdToLoc); +inline void +VMS_int__point_slaveVP_to_OneParamFn( SlaveVP *slaveVP, void *fnPtr, + void *param); + +inline void +VMS_int__point_slaveVP_to_TwoParamFn( SlaveVP *slaveVP, void *fnPtr, + void *param1, void *param2); + #endif /* _VMS__HW_DEPENDENT_H */ diff -r b4f684e98d0b -r 227cd4d33d94 HW_Dependent_Primitives/VMS__primitives_asm.s --- a/HW_Dependent_Primitives/VMS__primitives_asm.s Tue May 08 18:58:41 2012 +0200 +++ b/HW_Dependent_Primitives/VMS__primitives_asm.s Thu May 17 20:39:44 2012 +0200 @@ -21,6 +21,27 @@ movq (%rsp) , %rax #get top-level function's addr from stack jmp *%rax #jump to the 
top-level function + +//Args passed in regs in 64 bit arch. This copies args from stack into regs, +// then does jmp to the function, whose addr is on stack. +//For 64bit, %rdi is first arg, %rsi is second arg to function +//The top of stack is a valid return addr (old value of slaveVP's instrPtr), +// and the fnPtr is just below the top of stack (will be overwritten when +// fn saves the frame ptr) +.globl jmpToOneParamFn +jmpToOneParamFn: + movq 0x08(%rsp), %rdi #get the argument from stack + movq -0x08(%rsp), %rax #get function's addr from stack + jmp *%rax #jump to the function + +.globl jmpToTwoParamFn +jmpToTwoParamFn: + movq 0x10(%rsp), %rsi #get the second argument from stack + movq 0x08(%rsp), %rdi #get the first argument from stack + movq -0x08(%rsp), %rax #get function's addr from stack + jmp *%rax #jump to the function + + //Switches form CoreCtlr to either a normal Slv VP or the Master VP //switch to VP's stack and frame ptr then jump to VP's next-instr-ptr /* SlaveVP offsets: diff -r b4f684e98d0b -r 227cd4d33d94 Services_Offered_by_VMS/Measurement_and_Stats/MEAS__macros.h --- a/Services_Offered_by_VMS/Measurement_and_Stats/MEAS__macros.h Tue May 08 18:58:41 2012 +0200 +++ b/Services_Offered_by_VMS/Measurement_and_Stats/MEAS__macros.h Thu May 17 20:39:44 2012 +0200 @@ -294,7 +294,7 @@ #ifdef HOLISTIC__TURN_ON_PERF_COUNTERS -#define MEAS__Insert_Counter_Handler \ + #define MEAS__Insert_Counter_Handler \ typedef void (*CounterHandler) (int,int,int,SlaveVP*,uint64,uint64,uint64); enum eventType { @@ -365,12 +365,13 @@ int sndctr = tsc_offset_resp(sendCoresThdParams,0); \ }*/ -#define HOLISTIC__Record_last_work lastVPBeforeMaster = currVP; #define HOLISTIC__Insert_Master_Global_Vars \ int vpid,task; \ CounterHandler counterHandler = masterEnv->counterHandler; + #define HOLISTIC__Record_last_work lastVPBeforeMaster = currVP; + #define HOLISTIC__Record_AppResponderInvocation_start \ uint64 cycles,instrs,cachem; \ saveCyclesAndInstrs(thisCoresIdx,cycles, 
instrs,cachem); \ @@ -467,6 +468,9 @@ #define MEAS__Insert_Counter_Handler #define MEAS__Insert_Counter_Meas_Fields_into_MasterEnv #define HOLISTIC__Setup_Perf_Counters + #define HOLISTIC__CoreCtrl_Setup + #define HOLISTIC__Insert_Master_Global_Vars + #define HOLISTIC__Record_last_work #define HOLISTIC__Record_AppResponderInvocation_start #define HOLISTIC__Record_AppResponder_start #define HOLISTIC__Record_AppResponder_end diff -r b4f684e98d0b -r 227cd4d33d94 Services_Offered_by_VMS/Memory_Handling/vmalloc.c --- a/Services_Offered_by_VMS/Memory_Handling/vmalloc.c Tue May 08 18:58:41 2012 +0200 +++ b/Services_Offered_by_VMS/Memory_Handling/vmalloc.c Thu May 17 20:39:44 2012 +0200 @@ -202,6 +202,47 @@ /* * This is sequential code, meant to only be called from the Master, not from * any slave Slvs. + * + *ToDo: Improve speed, by using built-in leading 1 detector to calc free-list + * index. + *Change to two separate arrays, one for free-lists of small fixed-size chunks + * other for free lists of exponentially growing chunk sizes + *Do simple compare to decide which array of lists to use + *For small chunks, size the lists in increments of 16, up to, say, 128 (1024 + * is max if want less than 64 lists, which allows searching for first + * occupied free-list using leading-1 detector on a bit-vector) + *To find index, right-shift by 4 bits, and that's the index! (works because + * compare says no 1's above 128 position ((bit 7)), and sizes are every 16, + * so dividing by 16 equals exactly the position) + *For large chunks, have 63 free lists, but split into even and odd indexes. + *For even indexes, each list starts with chunks twice the size of previous + * even index. + *For odd indexes, each list starts with chunks of size half-way between those + * of the even indexes on either side. + * + *To calc the free-list position of a requested size, get pos of leading 1 + * of the size, call this msbsP (most-significant-bit-set-position). 
Then + * check bit to right of it (one-less-significant) + *If it's 0 then use the even index: msbsP * 2, which is msbsP << 1. + *If it's 1, then use the odd-index, which is (msbsP << 1) + 1 (parens + * needed in C, because + binds tighter than <<) + * + *To find msbsP, use GCC builtin: "int __builtin_clzll (unsigned long long)" + * which returns the number of zeros above (left of) msb set, so for a 64 bit + * value msbsP = 63 - that count. Note, result is undefined if + * give it zero, but the compare used to choose between arrays makes sure + * requested size given to it is not zero. + * + *This scheme keeps wastage small, while finding free element is O(1), and a + * fast constant. + *For large chunk sizes, if don't shave excess, then it ensures worst-case + * wastage due to mis-match in size of chunk vs requested size is 33% + * (invariant: take any even list.. it starts at a power of 2, and next list + * up starts at 50% larger, so biggest chunk is 1.5 x smallest request, that's + * 33% of total memory wasted. Then, for the odd index above, smallest chunk + * is 2x for smallest request of 1.5x, for 25% total wasted memory) + *For smallest size chunks, the pre-amble wastes quite a bit, but above that, + * sizing in increments of 16 keeps wastage small. And, if always shave, then + * wastage due to size mis-match is maximum 16 bytes for the large chunks. 
+ * */ void * VMS_int__malloc( size_t sizeRequested ) @@ -213,17 +254,17 @@ //Return a small chunk if the requested size is smaller than 128B if(sizeRequested <= LOWER_BOUND) - { - uint32 freeListIdx = (sizeRequested-1)/SMALL_CHUNK_SIZE; - if(freeLists->smallChunks[freeListIdx] == NULL) - foundChunk = searchChunk(freeLists, SMALL_CHUNK_SIZE*(freeListIdx+1), 0); - else - foundChunk = removeSmallChunk(freeLists, freeListIdx); + { + uint32 freeListIdx = (sizeRequested-1)/SMALL_CHUNK_SIZE; + if(freeLists->smallChunks[freeListIdx] == NULL) + foundChunk = searchChunk(freeLists, SMALL_CHUNK_SIZE*(freeListIdx+1), 0); + else + foundChunk = removeSmallChunk(freeLists, freeListIdx); - //Mark as allocated - foundChunk->prevChunkInFreeList = NULL; - return foundChunk + 1; - } + //Mark as allocated + foundChunk->prevChunkInFreeList = NULL; + return foundChunk + 1; + } //Calculate the expected container. Start one higher to have a Chunk that's //always big enough. diff -r b4f684e98d0b -r 227cd4d33d94 Services_Offered_by_VMS/Memory_Handling/vmalloc.h --- a/Services_Offered_by_VMS/Memory_Handling/vmalloc.h Tue May 08 18:58:41 2012 +0200 +++ b/Services_Offered_by_VMS/Memory_Handling/vmalloc.h Thu May 17 20:39:44 2012 +0200 @@ -55,8 +55,8 @@ void * VMS_int__malloc( size_t sizeRequested ); #define VMS_PI__malloc VMS_int__malloc -#define VMS_WL__malloc VMS_int__malloc /*TODO: Bug -- Not protected!! */ -#define VMS_App__malloc VMS_int__malloc /*TODO: Bug -- Not protected!! 
*/ +#define VMS_WL__malloc VMS_int__malloc /*TODO: Bug -- get master lock */ +#define VMS_App__malloc VMS_int__malloc /*TODO: Bug -- get master lock */ void * VMS_int__malloc_aligned( size_t sizeRequested ); diff -r b4f684e98d0b -r 227cd4d33d94 VMS.h --- a/VMS.h Tue May 08 18:58:41 2012 +0200 +++ b/VMS.h Thu May 17 20:39:44 2012 +0200 @@ -10,11 +10,12 @@ #define _VMS_H #define _GNU_SOURCE -#include "VMS_primitive_data_types.h" #include "DynArray/DynArray.h" #include "Hash_impl/PrivateHash.h" #include "Histogram/Histogram.h" #include "Queue_impl/PrivateQueue.h" + +#include "VMS_primitive_data_types.h" #include "Services_Offered_by_VMS/Memory_Handling/vmalloc.h" #include @@ -49,8 +50,8 @@ //============================ HW Dependent Fns ================================ -#include "Hardware_Dependent/VMS__HW_measurement.h" -#include "Hardware_Dependent/VMS__primitives.h" +#include "HW_Dependent_Primitives/VMS__HW_measurement.h" +#include "HW_Dependent_Primitives/VMS__primitives.h" //============= Request Related =========== @@ -156,9 +157,9 @@ */ typedef struct { //The offsets of these fields are hard-coded into assembly - void *coreCtlrReturnPt; //offset of field used in asm + void *coreCtlrReturnPt; //offset to this field used in asm int8 falseSharePad1[256 - sizeof(void*)]; - int32 masterLock; //offset of field used in asm + int32 masterLock; //offset to this field used in asm int8 falseSharePad2[256 - sizeof(int32)]; //============ below this, no fields are used in asm ============= @@ -289,9 +290,17 @@ void *dataParam, void *stackLocs ); inline void -VMS_int__point_slaveVP_to_Fn( SlaveVP *slaveVP, TopLevelFnPtr fnPtr, +VMS_int__reset_slaveVP_to_TopLvlFn( SlaveVP *slaveVP, TopLevelFnPtr fnPtr, void *dataParam); +inline void +VMS_int__point_slaveVP_to_OneParamFn( SlaveVP *slaveVP, void *fnPtr, + void *param); + +inline void +VMS_int__point_slaveVP_to_TwoParamFn( SlaveVP *slaveVP, void *fnPtr, + void *param1, void *param2); + void VMS_int__dissipate_slaveVP( SlaveVP 
*slaveToDissipate ); #define VMS_PI__dissipate_slaveVP VMS_int__dissipate_slaveVP diff -r b4f684e98d0b -r 227cd4d33d94 VMS__int.c --- a/VMS__int.c Tue May 08 18:58:41 2012 +0200 +++ b/VMS__int.c Thu May 17 20:39:44 2012 +0200 @@ -72,17 +72,22 @@ VMS_int__suspend_slaveVP_and_send_req( SlaveVP *animatingSlv ) { - //The request to master will cause this suspended Slv to get - // assigned again at some future point -- to resume, core ctlr jumps - // to the resume point (below), which causes restore of saved regs and - // "return" from this call. - //animatingSlv->resumeInstrPtr = &&ResumePt; + //This suspended Slv will get assigned by Master again at some + // future point //return ownership of the Slv and anim slot to Master virt pr animatingSlv->animSlotAssignedTo->workIsDone = TRUE; HOLISTIC__Record_HwResponderInvocation_start; MEAS__Capture_Pre_Susp_Point; + //This assembly function is a VMS primitive that first saves the + // stack and frame pointer, plus an addr inside this assembly code. + //When core ctlr later gets this slave out of a sched slot, it + // restores the stack and frame and then jumps to the addr.. that + // jmp causes return from this function. + //So, in effect, this function takes a variable amount of wall-clock + // time to complete -- the amount of time is determined by the + // Master, which makes sure the memory is in a consistent state first. 
switchToCoreCtlr(animatingSlv); flushRegisters(); MEAS__Capture_Post_Susp_Point; @@ -176,10 +181,10 @@ newSlv->slaveID = _VMSMasterEnv->numSlavesCreated++; newSlv->requests = NULL; newSlv->animSlotAssignedTo = NULL; - newSlv->typeOfVP = Slave; - newSlv->assignCount = 0; + newSlv->typeOfVP = Slave; + newSlv->assignCount = 0; - VMS_int__point_slaveVP_to_Fn( newSlv, fnPtr, dataParam ); + VMS_int__reset_slaveVP_to_TopLvlFn( newSlv, fnPtr, dataParam ); //============================= MEASUREMENT STUFF ======================== #ifdef PROBES__TURN_ON_STATS_PROBES @@ -199,7 +204,7 @@ VMS_int__strDup( char *str ) { char *retStr; - if( str == NULL ) return NULL; + if( str == NULL ) return (char *)NULL; retStr = (char *)VMS_int__malloc( strlen(str) + 1 ); strcpy( retStr, str ); diff -r b4f684e98d0b -r 227cd4d33d94 VMS__startup_and_shutdown.c --- a/VMS__startup_and_shutdown.c Tue May 08 18:58:41 2012 +0200 +++ b/VMS__startup_and_shutdown.c Thu May 17 20:39:44 2012 +0200 @@ -157,7 +157,7 @@ /* void -VMS__start_VMS_running() +VMS_App__start_VMS_running() { create_masterEnv(); @@ -176,7 +176,7 @@ */ /* VMSProcess * -VMS__spawn_program_on_data_in_Lang( TopLevelFnPtr prog_seed_fn, void *data, +VMS_App__spawn_program_on_data_in_Lang( TopLevelFnPtr prog_seed_fn, void *data, LangInitFnPtr langInitFnPtr ) { VMSProcess *newProcess; newProcess = malloc( sizeof(VMSProcess) ); @@ -217,7 +217,7 @@ */ /* void * -VMS__give_results_when_done_for( VMSProcess *process ) +VMS_App__give_results_when_done_for( VMSProcess *process ) { void *result; pthread_mutex_lock( process->doneLock ); @@ -247,7 +247,7 @@ VMS_SS__shutdown(); //already defined -- look at it void -VMS__shutdown() +VMS_App__shutdown() { for( cores ) { slave = VMS_int__create_new_SlaveVP( endOSThreadFn, NULL ); @@ -256,16 +256,16 @@ } */ -/* VMS__start_VMS_running(); +/* VMS_App__start_VMS_running(); VMSProcess matrixMultProcess; matrixMultProcess = - VMS__spawn_program_on_data_in_Lang( &prog_seed_fn, data, Vthread_lang ); + 
VMS_App__spawn_program_on_data_in_Lang( &prog_seed_fn, data, Vthread_lang ); - resMatrix = VMS__give_results_when_done_for( matrixMultProcess ); + resMatrix = VMS_App__give_results_when_done_for( matrixMultProcess ); - VMS__shutdown(); + VMS_App__shutdown(); */ void