# HG changeset patch # User Some Random Person # Date 1329939552 28800 # Node ID eaf7e4c58c9e0b9279d9cbf16e3a17c18b4a0bc1 # Parent bc4cb994f11451921a3c960a34de1747560a672b Create common_ancestor brch -- all branches will be closed, then new ones created with this as the common ancestor of all branches -- it is incomplete! only code that is common to all HW and Feat and FeatDev branches is in here diff -r bc4cb994f114 -r eaf7e4c58c9e .hgignore --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.hgignore Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,3 @@ +syntax: glob + +*.o diff -r bc4cb994f114 -r eaf7e4c58c9e .hgtags --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.hgtags Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,1 @@ +9c3107044f86c36fea3a8f72f64910b1363555be Dec27_2010_about_to_add_sched_record diff -r bc4cb994f114 -r eaf7e4c58c9e CoreLoop.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CoreLoop.c Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,214 @@ +/* + * Copyright 2010 OpenSourceStewardshipFoundation + * + * Licensed under BSD + */ + + +#include "VMS.h" +#include "ProcrContext.h" + +#include +#include +#include + +#include +#include + +void *terminateCoreLoop(SlaveVP *currPr); + +/*This is the loop that runs in the OS Thread pinned to each core + *Get virt procr from queue, + * save state of current animator, then load in state of virt procr, using + * jmp instr to switch the program-counter state -- making the virt procr + * the new animator. + *At some point, the virt procr will suspend itself by saving out its + * animator state (stack ptr, frame ptr, program counter) and switching + * back to the OS Thread's animator state, which means restoring the + * stack and frame and jumping to the core loop start point. + *This cycle then repeats, until a special shutdown virtual processor is + * animated, which jumps to the end point at the bottom of core loop. 
+ */ +void * +coreLoop( void *paramsIn ) + { + ThdParams *coreLoopThdParams; + int thisCoresIdx; + SlaveVP *currPr; + VMSQueueStruc *readyToAnimateQ; + cpu_set_t coreMask; //has 1 in bit positions of allowed cores + int errorCode; + + //work-stealing struc on stack to prevent false-sharing in cache-line + volatile GateStruc gate; + //preGateProgress, waitProgress, exitProgress, gateClosed; + + + coreLoopThdParams = (ThdParams *)paramsIn; + thisCoresIdx = coreLoopThdParams->coreNum; + + gate.gateClosed = FALSE; + gate.preGateProgress = 0; + gate.waitProgress = 0; + gate.exitProgress = 0; + _VMSMasterEnv->workStealingGates[ thisCoresIdx ] = (GateStruc*)&gate;//race @startup + + //wait until signalled that setup is complete + pthread_mutex_lock( &suspendLock ); + while( !(_VMSMasterEnv->setupComplete) ) + { + pthread_cond_wait( &suspend_cond, + &suspendLock ); + } + pthread_mutex_unlock( &suspendLock ); + + //printf( "\nCore unsuspended: %d\n", coreLoopThdParams->coreNum ); + + //set thread affinity + //Linux requires pinning thd to core inside thread-function + //Designate a core by a 1 in bit-position corresponding to the core + CPU_ZERO(&coreMask); + CPU_SET(coreLoopThdParams->coreNum,&coreMask); + //coreMask = 1L << coreLoopThdParams->coreNum; + + pthread_t selfThd = pthread_self(); + errorCode = + pthread_setaffinity_np( selfThd, sizeof(coreMask), &coreMask); + + if(errorCode){ printf("\nset affinity failure\n"); exit(0); } + + + //Save the return address in the SwitchVP function + saveCoreLoopReturnAddr((void**)&(_VMSMasterEnv->coreLoopReturnPt)); + + + while(1){ + + //Get virtual processor from queue + //The Q must be a global, static volatile var, so not kept in reg, + // which forces reloading the pointer after each jmp to this point + readyToAnimateQ = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx]; + + #ifdef USE_WORK_STEALING + //Alg for work-stealing designed to make common case fast. Comment + // in stealer code explains. 
+ gate.preGateProgress++; + if( gate.gateClosed ) + { //now, set coreloop's progress, so stealer can see that core loop + // has made it into the waiting area. + gate.waitProgress = gate.preGateProgress; + while( gate.gateClosed ) /*busy wait*/; + } + + currPr = (SlaveVP *) readVMSQ( readyToAnimateQ ); + + //Set the coreloop's progress, so stealer can see it has made it out + // of the protected area + gate.exitProgress = gate.preGateProgress; + #else + currPr = (SlaveVP *) readVMSQ( readyToAnimateQ ); + #endif + + if( currPr != NULL ) _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0; + else + { + //============================= MEASUREMENT STUFF ===================== + #ifdef MEAS__TIME_MASTER_LOCK + int32 startStamp, endStamp; + saveLowTimeStampCountInto( startStamp ); + #endif + //===================================================================== + int tries = 0; int gotLock = 0; + while( currPr == NULL ) //if queue was empty, enter get masterLock loop + { //queue was empty, so get master lock + + gotLock = __sync_bool_compare_and_swap(&(_VMSMasterEnv->masterLock), + UNLOCKED, LOCKED ); + if( gotLock ) + { //run own MasterVP -- jmps to coreLoops startPt when done + currPr = _VMSMasterEnv->masterVPs[thisCoresIdx]; + if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 ) + { DEBUG( dbgB2BMaster,"Many back to back MasterVPs\n"); + pthread_yield(); + } + _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1; + break; //end while -- have a VP to animate now + } + + tries++; //if too many, means master on other core taking too long + if( tries > MASTERLOCK_RETRIES ) { tries = 0; pthread_yield(); } + } + //============================= MEASUREMENT STUFF ===================== + #ifdef MEAS__TIME_MASTER_LOCK + saveLowTimeStampCountInto( endStamp ); + addIntervalToHist( startStamp, endStamp, + _VMSMasterEnv->masterLockLowTimeHist ); + addIntervalToHist( startStamp, endStamp, + _VMSMasterEnv->masterLockHighTimeHist ); + #endif + 
//===================================================================== + + } + + + switchToVP(currPr); //The VPs return in here + flushRegisters(); + }//CoreLoop + } + + +void * +terminateCoreLoop(SlaveVP *currPr){ + //first free shutdown VP that jumped here -- it first restores the + // coreloop's stack, so addr of currPr in stack frame is still correct + VMS_int__dissipate_procr( currPr ); + pthread_exit( NULL ); +} + + + +#ifdef SEQUENTIAL + +//=========================================================================== +/*This sequential version is exact same as threaded, except doesn't do the + * pin-threads part, nor the wait until setup complete part. + */ +void * +coreLoop_Seq( void *paramsIn ) + { + SlaveVP *currPr; + VMSQueueStruc *readyToAnimateQ; + + ThdParams *coreLoopThdParams; + int thisCoresIdx; + + coreLoopThdParams = (ThdParams *)paramsIn; +// thisCoresIdx = coreLoopThdParams->coreNum; + thisCoresIdx = 0; + + //Save the return address in the SwitchVP function + saveCoreLoopReturnAddr(&(_VMSMasterEnv->coreLoopReturnPt)); + + + while(1){ + //Get virtual processor from queue + //_VMSWorkQ must be a global, static volatile var, so not kept in reg, + // which forces reloading the pointer after each jmp to this point + readyToAnimateQ = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx]; + currPr = (SlaveVP *) readVMSQ( readyToAnimateQ ); + if( currPr == NULL ) + { if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 ) + { printf("too many back to back MasterVP\n"); exit(1); } + _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1; + + currPr = _VMSMasterEnv->masterVPs[thisCoresIdx]; + } + else + _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0; + + + switchToVP( currPr ); + flushRegisters(); + } + } +#endif diff -r bc4cb994f114 -r eaf7e4c58c9e MasterLoop.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MasterLoop.c Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,373 @@ +/* + * Copyright 2010 OpenSourceStewardshipFoundation + * + * Licensed under BSD + */ + + 
+ +#include +#include + +#include "VMS.h" +#include "ProcrContext.h" + + +//=========================================================================== +void inline +stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ, + SlaveVP *masterPr ); + +//=========================================================================== + + + +/*This code is animated by the virtual Master processor. + * + *Polls each sched slot exactly once, hands any requests made by a newly + * done slave to the "request handler" plug-in function + * + *Any slots that need a virt procr assigned are given to the "schedule" + * plug-in function, which tries to assign a virt procr (slave) to it. + * + *When all slots needing a processor have been given to the schedule plug-in, + * a fraction of the procrs successfully scheduled are put into the + * work queue, then a continuation of this function is put in, then the rest + * of the virt procrs that were successfully scheduled. + * + *The first thing the continuation does is busy-wait until the previous + * animation completes. This is because an (unlikely) continuation may + * sneak through queue before previous continuation is done putting second + * part of scheduled slaves in, which is the only race condition. + * + */ + +/*May 29, 2010 -- birth a Master during init so that first core loop to + * start running gets it and does all the stuff for a newly born -- + * from then on, will be doing continuation, but do suspension self + * directly at end of master loop + *So VMS__init just births the master virtual processor same way it births + * all the others -- then does any extra setup needed and puts it into the + * work queue. + *However means have to make masterEnv a global static volatile the same way + * did with readyToAnimateQ in core loop. -- for performance, put the + * jump to the core loop directly in here, and have it directly jump back. 
+ *
+ *
+ *Aug 18, 2010 -- Going to a separate MasterVP for each core, to see if this
+ * avoids the suspected bug in the system stack that causes bizarre faults
+ * at random places in the system code.
+ *
+ *So, this function is coupled to each of the MasterVPs, -- meaning this
+ * function can't rely on a particular stack and frame -- each MasterVP that
+ * animates this function has a different one.
+ *
+ *At this point, the masterLoop does not write itself into the queue anymore,
+ * instead, the coreLoop acquires the masterLock when it has nothing to
+ * animate, and then animates its own masterLoop.  However, still try to put
+ * several AppVPs into the queue to amortize the startup cost of switching
+ * to the MasterVP.  Note, don't have to worry about latency of requests much
+ * because most requests generate work for same core -- only latency issue
+ * is case when other cores starved and one core's requests generate work
+ * for them -- so keep max in queue to 3 or 4..
+ */
+void masterLoop( void *initData, SlaveVP *animatingPr )
+ {
+   int32 slotIdx, numSlotsFilled;
+   SlaveVP *schedVirtPr;
+   SchedSlot *currSlot, **schedSlots;
+   MasterEnv *masterEnv;
+   VMSQueueStruc *readyToAnimateQ;
+
+   Sched_Assigner slaveScheduler;
+   RequestHandler requestHandler;
+   void *semanticEnv;
+
+   int32 thisCoresIdx;
+   SlaveVP *masterPr;
+   volatile SlaveVP *volatileMasterPr;
+      //animatingPr is the MasterVP running this loop; initData is never read
+   volatileMasterPr = animatingPr;
+   masterPr = (SlaveVP*)volatileMasterPr; //used to force re-define after jmp
+
+      //First animation of each MasterVP will in turn animate this part
+      // of setup code.. (VP creator sets up the stack as if this function
+      // was called normally, but actually get here by jmp)
+      //So, setup values about stack ptr, jmp pt and all that
+   //masterPr->resumeInstrPtr = &&masterLoopStartPt;
+
+
+      //Note, got rid of writing the stack and frame ptr up here, because
+      // only one
+      // core can ever animate a given MasterVP, so don't need to communicate
+      // new frame and stack ptr to the MasterVP storage before a second
+      // version of that MasterVP can get animated on a different core.
+      //Also got rid of the busy-wait.
+
+
+   //masterLoopStartPt:
+   while(1){
+
+   //============================= MEASUREMENT STUFF ========================
+   #ifdef MEAS__TIME_MASTER
+      //Total Master time includes one coreloop time -- just assume the core
+      // loop time is same for Master as for AppVPs, even though it may be
+      // smaller due to higher predictability of the fixed jmp.
+   saveLowTimeStampCountInto( masterPr->startMasterTSCLow );
+   #endif
+   //========================================================================
+
+   masterEnv = (MasterEnv*)_VMSMasterEnv;
+
+      //GCC may optimize so doesn't always re-define from frame-storage
+   masterPr = (SlaveVP*)volatileMasterPr; //just to make sure after jmp
+   thisCoresIdx = masterPr->coreAnimatedBy;
+   readyToAnimateQ = masterEnv->readyToAnimateQs[thisCoresIdx];
+   schedSlots = masterEnv->allSchedSlots[thisCoresIdx];
+
+   requestHandler = masterEnv->requestHandler;
+   slaveScheduler = masterEnv->slaveSchedAssigner;
+   semanticEnv = masterEnv->semanticEnv;
+
+
+      //Poll each slot's Done flag
+   numSlotsFilled = 0;
+   for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++)
+    {
+      currSlot = schedSlots[ slotIdx ];
+
+      if( currSlot->workIsDone )
+       {
+         currSlot->workIsDone = FALSE;
+         currSlot->needsProcrAssigned = TRUE;
+
+            //process requests from slave to master
+            //====================== MEASUREMENT STUFF ===================
+            #ifdef MEAS__TIME_PLUGIN
+            int32 startStamp1, endStamp1;
+            saveLowTimeStampCountInto( startStamp1 );
+            #endif
+            //============================================================
+         (*requestHandler)( currSlot->procrAssignedToSlot, semanticEnv );
+            //====================== MEASUREMENT STUFF ===================
+            #ifdef MEAS__TIME_PLUGIN
+            saveLowTimeStampCountInto( endStamp1 );
+            addIntervalToHist( startStamp1, endStamp1,
+                               _VMSMasterEnv->reqHdlrLowTimeHist );
+            addIntervalToHist( startStamp1, endStamp1,
+                               _VMSMasterEnv->reqHdlrHighTimeHist );
+            #endif
+            //============================================================
+       }
+      if( currSlot->needsProcrAssigned )
+       {    //give slot a new virt procr
+         schedVirtPr =
+          (*slaveScheduler)( semanticEnv, thisCoresIdx );
+
+         if( schedVirtPr != NULL )
+          { currSlot->procrAssignedToSlot = schedVirtPr;
+            schedVirtPr->schedSlot = currSlot;
+            currSlot->needsProcrAssigned = FALSE;
+            numSlotsFilled += 1;
+
+            writeVMSQ( schedVirtPr, readyToAnimateQ );
+          }
+       }
+    }
+      //NOTE(review): currSlot is just the last slot the loop above visited;
+      // nothing here checks its needsProcrAssigned before a steal overwrites it
+   #ifdef USE_WORK_STEALING
+      //If no slots filled, means no more work, look for work to steal.
+   if( numSlotsFilled == 0 )
+    { gateProtected_stealWorkInto( currSlot, readyToAnimateQ, masterPr );
+    }
+   #endif
+
+
+   #ifdef MEAS__TIME_MASTER
+   saveLowTimeStampCountInto( masterPr->endMasterTSCLow );
+   #endif
+      //asm helper: saves this VP's stack/frame, clears the lock, jumps to core loop
+   masterSwitchToCoreLoop(animatingPr);
+   flushRegisters();
+   }//MasterLoop
+
+
+ }
+
+
+
+/*This has a race condition -- the coreloops are accessing their own queues
+ * at the same time that this work-stealer on a different core is trying to
+ * take work out of those same queues, and nothing here locks them. */
+void inline
+stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
+               SlaveVP *masterPr )
+ {
+   SlaveVP *stolenPr;
+   int32 coreIdx, i;
+   VMSQueueStruc *currQ;
+      //visit the other NUM_CORES-1 queues, starting after this core's, wrapping
+   stolenPr = NULL;
+   coreIdx = masterPr->coreAnimatedBy;
+   for( i = 0; i < NUM_CORES -1; i++ )
+    {
+      if( coreIdx >= NUM_CORES -1 )
+       { coreIdx = 0;
+       }
+      else
+       { coreIdx++;
+       }
+      currQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
+      if( numInVMSQ( currQ ) > 0 )
+       { stolenPr = readVMSQ (currQ );
+         break;
+       }
+    }
+
+   if( stolenPr != NULL )
+    { currSlot->procrAssignedToSlot = stolenPr;
+      stolenPr->schedSlot = currSlot;
+      currSlot->needsProcrAssigned = FALSE;
+
+      writeVMSQ( stolenPr, readyToAnimateQ );
+    }
+ }
+
+/*This algorithm makes the common case fast.  Make the coreloop passive,
+ * and show its progress.  Make the stealer control a gate that coreloop
+ * has to pass.
+ *To avoid interference, only one stealer at a time.  Use a global
+ * stealer-lock.
+ *
+ *The pattern is based on a gate -- stealer shuts the gate, then monitors
+ * to be sure any already past make it all the way out, before starting.
+ *So, have a "progress" measure just before the gate, then have two after it,
+ * one is in a "waiting room" outside the gate, the other is at the exit.
+ *Then, the stealer first shuts the gate, then checks the progress measure
+ * outside it, then looks to see if the progress measure at the exit is the
+ * same.  If yes, it knows the protected area is empty 'cause no other way
+ * to get in and the last to get in also exited.
+ *If the progress measure at the exit is not the same, then the stealer goes
+ * into a loop checking both the waiting-area and the exit progress-measures
+ * until one of them shows the same as the measure outside the gate.  Might
+ * as well re-read the measure outside the gate each go around, just to be
+ * sure.  It is guaranteed that one of the two will eventually match the one
+ * outside the gate.
+ *
+ *Here's an informal proof of correctness:
+ *The gate can be closed at any point, and have only four cases:
+ *  1) coreloop made it past the gate-closing but not yet past the exit
+ *  2) coreloop made it past the pre-gate progress update but not yet past
+ *     the gate,
+ *  3) coreloop is right before the pre-gate update
+ *  4) coreloop is past the exit and far from the pre-gate update.
+ *
+ * Covering the cases in reverse order,
+ *  4) is not a problem -- stealer will read pre-gate progress, see that it
+ *     matches exit progress, and the gate is closed, so stealer can proceed.
+ * 3) stealer will read pre-gate progress just after coreloop updates it.. + * so stealer goes into a loop until the coreloop causes wait-progress + * to match pre-gate progress, so then stealer can proceed + * 2) same as 3.. + * 1) stealer reads pre-gate progress, sees that it's different than exit, + * so goes into loop until exit matches pre-gate, now it knows coreloop + * is not in protected and cannot get back in, so can proceed. + * + *Implementation for the stealer: + * + *First, acquire the stealer lock -- only cores with no work to do will + * compete to steal, so not a big performance penalty having only one -- + * will rarely have multiple stealers in a system with plenty of work -- and + * in a system with little work, it doesn't matter. + * + *Note, have single-reader, single-writer pattern for all variables used to + * communicate between stealer and victims + * + *So, scan the queues of the core loops, until find non-empty. Each core + * has its own list that it scans. The list goes in order from closest to + * furthest core, so it steals first from close cores. Later can add + * taking info from the app about overlapping footprints, and scan all the + * others then choose work with the most footprint overlap with the contents + * of this core's cache. + * + *Now, have a victim want to take work from. So, shut the gate in that + * coreloop, by setting the "gate closed" var on its stack to TRUE. + *Then, read the core's pre-gate progress and compare to the core's exit + * progress. + *If same, can proceed to take work from the coreloop's queue. When done, + * write FALSE to gate closed var. + *If different, then enter a loop that reads the pre-gate progress, then + * compares to exit progress then to wait progress. When one of two + * matches, proceed. Take work from the coreloop's queue. When done, + * write FALSE to the gate closed var. 
+ *Returns early if no other queue has work or the stealer lock is held.
+ */
+void inline
+gateProtected_stealWorkInto( SchedSlot *currSlot,
+                             VMSQueueStruc *myReadyToAnimateQ,
+                             SlaveVP *masterPr )
+ {
+   SlaveVP *stolenPr;
+   int32 coreIdx, i, haveAVictim, gotLock;
+   VMSQueueStruc *victimsQ;
+
+   volatile GateStruc *vicGate;
+   int32 coreMightBeInProtected;
+
+      //currSlot and myReadyToAnimateQ receive the stolen VP; masterPr only
+      // supplies coreAnimatedBy, the index the victim scan starts after
+      //see if any other cores have work available to steal
+   haveAVictim = FALSE;
+   coreIdx = masterPr->coreAnimatedBy;
+   for( i = 0; i < NUM_CORES -1; i++ )
+    {
+      if( coreIdx >= NUM_CORES -1 )
+       { coreIdx = 0;
+       }
+      else
+       { coreIdx++;
+       }
+      victimsQ = _VMSMasterEnv->readyToAnimateQs[coreIdx];
+      if( numInVMSQ( victimsQ ) > 0 )
+       { haveAVictim = TRUE;
+         vicGate = _VMSMasterEnv->workStealingGates[ coreIdx ];
+         break;
+       }
+    }
+   if( !haveAVictim ) return; //no work to steal, exit
+
+      //have a victim core, now get the stealer-lock
+   gotLock =__sync_bool_compare_and_swap( &(_VMSMasterEnv->workStealingLock),
+                                          UNLOCKED, LOCKED );
+   if( !gotLock ) return; //go back to core loop, which will re-start master
+   //NOTE(review): nothing fences the gateClosed store below from the progress
+   // loads after it, nor the victim's matching store/load pair -- confirm
+   //====== Start Gate-protection =======
+   vicGate->gateClosed = TRUE;
+   coreMightBeInProtected= vicGate->preGateProgress != vicGate->exitProgress;
+   while( coreMightBeInProtected )
+    {    //wait until sure
+      if( vicGate->preGateProgress == vicGate->waitProgress )
+         coreMightBeInProtected = FALSE;
+      if( vicGate->preGateProgress == vicGate->exitProgress )
+         coreMightBeInProtected = FALSE;
+    }
+
+   stolenPr = readVMSQ ( victimsQ );
+
+   vicGate->gateClosed = FALSE;
+   //======= End Gate-protection =======
+
+
+   if( stolenPr != NULL ) //victim could have been in protected and taken
+    { currSlot->procrAssignedToSlot = stolenPr;
+      stolenPr->schedSlot = currSlot;
+      currSlot->needsProcrAssigned = FALSE;
+
+      writeVMSQ( stolenPr, myReadyToAnimateQ );
+    }
+      //NOTE(review): released by a plain store; the acquire above is a __sync CAS
+      //unlock the work stealing lock
+   _VMSMasterEnv->workStealingLock = UNLOCKED;
+ }
diff -r bc4cb994f114 -r eaf7e4c58c9e VMS.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/VMS.h Wed Feb 22 11:39:12 2012 -0800
@@ -0,0 +1,377 @@
+/*
+ *
Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#ifndef _VMS_H +#define _VMS_H +#define _GNU_SOURCE + +#include "VMS_primitive_data_types.h" +#include "C_Libraries/DynArray/DynArray.h" +#include "C_Libraries/Hash_impl/PrivateHash.h" +#include "C_Libraries/Histogram/Histogram.h" +#include "C_Libraries/Queue_impl/PrivateQueue.h" +#include "vmalloc.h" + +#include +#include + +//================= Defines: included from separate files ================= +// +// Note: ALL defines are in other files, none are in here +// +#include "VMS_defs__main.h" + + +//================================ Typedefs ================================= +// +typedef unsigned long long TSCount; +typedef union + { uint32 lowHigh[2]; + uint64 longVal; + } +TSCountLowHigh; + +typedef struct _SchedSlot SchedSlot; +typedef struct _VMSReqst VMSReqst; +typedef struct _SlaveVP SlaveVP; +typedef struct _MasterVP MasterVP; +typedef struct _IntervalProbe IntervalProbe; +typedef struct _GateStruc GateStruc; + + +typedef SlaveVP * (*Sched_Assigner) ( void *, int ); //semEnv, coreIdx +typedef void (*RequestHandler) ( SlaveVP *, void * ); //prWReqst, semEnv +typedef void (*TopLevelFnPtr) ( void *, SlaveVP * ); //initData, animPr +typedef void TopLevelFn ( void *, SlaveVP * ); //initData, animPr +typedef void (*ResumeVPFnPtr) ( SlaveVP *, void * ); + +//============================= Statistics ================================== + +inline TSCount getTSCount(); + +//============= Request Related =========== +// + +enum VMSReqstType //avoid starting enums at 0, for debug reasons + { + semantic = 1, + createReq, + dissipate, + VMSSemantic //goes with VMSSemReqst below + }; + +struct _VMSReqst + { + enum VMSReqstType reqType;//used for dissipate and in future for IO requests + void *semReqData; + + VMSReqst *nextReqst; + }; +//VMSReqst + +enum VMSSemReqstType //These are equivalent to semantic requests, but for 
+ { // VMS's services available directly to app, like OS + createProbe = 1, // and probe services -- like a VMS-wide built-in lang + openFile, + otherIO + }; + +typedef struct + { enum VMSSemReqstType reqType; + SlaveVP *requestingPr; + char *nameStr; //for create probe + } + VMSSemReq; + + +//==================== Core data structures =================== + +struct _SchedSlot + { + int workIsDone; + int needsProcrAssigned; + SlaveVP *procrAssignedToSlot; + }; +//SchedSlot + +/*WARNING: re-arranging this data structure could cause VP switching + * assembly code to fail -- hard-codes offsets of fields + */ +struct _SlaveVP + { int procrID; //for debugging -- count up each time create + int coreAnimatedBy; + void *startOfStack; + void *stackPtr; + void *framePtr; + void *resumeInstrPtr; + + void *coreLoopStartPt; //allows proto-runtime to be linked later + void *coreLoopFramePtr; //restore before jmp back to core loop + void *coreLoopStackPtr; //restore before jmp back to core loop + + SchedSlot *schedSlot; + VMSReqst *requests; + + void *semanticData; //this livesUSE_GNU here for the life of VP + void *dataRetFromReq;//values returned from plugin to VP go here + + //=========== MEASUREMENT STUFF ========== + #ifdef MEAS__TIME_STAMP_SUSP + uint32 preSuspTSCLow; + uint32 postSuspTSCLow; + #endif + #ifdef MEAS__TIME_MASTER /* in SlaveVP because multiple masterVPs*/ + uint32 startMasterTSCLow;USE_GNU + uint32 endMasterTSCLow; + #endif + #ifdef MEAS__TIME_2011_SYS + TSCountLowHigh startSusp; + uint64 totalSuspCycles; + uint32 numGoodSusp; + #endif + //======================================== + + float64 createPtInSecs; //have space but don't use on some configs + }; +//SlaveVP + + +/*WARNING: re-arranging this data structure could cause VP-switching + * assembly code to fail -- hard-codes offsets of fields + * (because -O3 messes with things otherwise) + */ +typedef struct + { + union{ //adds padding to put masterLock on its own cache-line to elim + // false sharing 
(masterLock is most-accessed var in VMS) + volatile int32 masterLock; + char padding[CACHE_LINE_SZ]; + } masterLockUnion; + Sched_Assigner slaveSchedAssigner; + RequestHandler requestHandler; + + SchedSlot ***allSchedSlots; + VMSQueueStruc **readyToAnimateQs; + SlaveVP **masterVPs; + + void *semanticEnv; + void *OSEventStruc; //for future, when add I/O to BLIS + MallocArrays *freeLists; + int32 amtOfOutstandingMem; //total currently allocated + + void *coreLoopReturnPt;//addr to jump to to re-enter coreLoop + + int32 setupComplete; + //int32 numMasterInARow[NUM_CORES];//detect back-to-back masterVP + GateStruc *workStealingGates[ NUM_CORES ]; //concurrent work-steal + int32 workStealingLock; + + int32 numVPsCreated; //gives ordering to processor creation + + //=========== MEASUREMENT STUFF ============= + IntervalProbe **intervalProbes; + PrivDynArrayInfo *dynIntervalProbesInfo; + HashTable *probeNameHashTbl; + int32 masterCreateProbeID; + float64 createPtInSecs; + Histogram **measHists; + PrivDynArrayInfo *measHistsInfo; + #ifdef MEAS__TIME_PLUGIN + Histogram *reqHdlrLowTimeHist; + Histogram *reqHdlrHighTimeHist; + #endif + #ifdef MEAS__TIME_MALLOC + Histogram *mallocTimeHist; + Histogram *freeTimeHist; + #endif + #ifdef MEAS__TIME_MASTER_LOCK + Histogram *masterLockLowTimeHist; + Histogram *masterLockHighTimeHist; + #endif + #ifdef MEAS__TIME_2011_SYS + TSCountLowHigh startMaster; + uint64 totalMasterCycles; + uint32 numMasterAnimations; + TSCountLowHigh startReqHdlr; + uint64 totalPluginCycles; + uint32 numPluginAnimations; + uint64 cyclesTillStartMasterLoop; + TSCountLowHigh endMasterLoop; + #endif + //========================================== + } +MasterEnv; + +//========================= Extra Stuff Data Strucs ======================= +typedef struct + { + + } +VMSExcp; + +struct _GateStruc + { + int32 gateClosed; + int32 preGateProgress; + int32 waitProgress; + int32 exitProgress; + }; +//GateStruc + +//======================= OS Thread related 
+//===============================
+
+void * coreLoop( void *paramsIn ); //standard PThreads fn prototype
+void * coreLoop_Seq( void *paramsIn ); //standard PThreads fn prototype
+void masterLoop( void *initData, SlaveVP *masterVP );
+
+
+typedef struct
+ {
+   void *endThdPt;
+   unsigned int coreNum;
+ }
+ThdParams;
+
+pthread_t coreLoopThdHandles[ NUM_CORES ]; //pthread's virt-procr state
+ThdParams *coreLoopThdParams [ NUM_CORES ];
+pthread_mutex_t suspendLock;
+pthread_cond_t suspend_cond;
+
+//NOTE(review): the objects above and _VMSMasterEnv below are defined, not
+// declared extern, in this header -- confirm the build tolerates that
+//============================= Global Vars ================================
+
+volatile MasterEnv *_VMSMasterEnv __align_to_cacheline__;
+
+
+
+
+//========================= Function Prototypes ===========================
+
+
+//========== Setup and shutdown ==========
+void
+VMS_int__init();
+//Design notes for a planned rework -- kept as a comment so the header compiles
+/*Fix seed-procr creation -- put box around language, have lang register stuff
+ with VMS.
+ have main program explicitly INIT Lang! -- makes more sense to
+ C programmers -- makes it clear that there's a transition.
+(might need to have the pthreads remain waiting for
+ cond until work is scheduled)
+Have main do call to tell language to perform work -- like did with DKU
+
+Ex: "HWSim__run_a_simulation(netlist, paramBag);"
+ "processID = SSR__run_program(seed_fn, seedData); "
+ "SSR__Wait_for_program_to_end(processID);"
+ "SSR__run_program_and_wait_till_it_ends(seed_fn, seedData);"
+
+ allows multiple languages to be started, and programs run in several,
+ overlapped, or one program to be run that uses multiple langs..?
+ So, each program is in separate directory:
+ "HWSim_ArchDef__PingPong" "SSR_Program__Blocked_Matrix_Mult"
+
+ Those programs can talk to each other, via VMS, by handles they each
+ return
+ "processIDs[0] = SSR__run_program(seed_fn1, seedData1);"
+ "processIDs[1] = SSR__run_program(seed_fn2, seedData2);"
+ "SSR__link_programs(processIDs, 2);"
+or even
+ "processIDs[0] = Vthread__run_program(seed_fn1, seedData1);"
+ "processIDs[1] = SSR__run_program(seed_fn2, seedData2);"
+ "VMS__link_programs(processIDs, 2);"
+ Then, the programs just know they sync with other prog, but use own
+ lang's sync constructs -- VMS uses message system to establish tie-pt,
+ each lang defines what a tie-point means to it.. (work with the
+ diff semantics?) */
+void
+VMS_WL__start_the_work_then_wait_until_done();
+
+void
+VMS_int__shutdown();
+
+void
+VMS_int__cleanup_at_end_of_shutdown();
+
+
+//============== ===============
+
+inline SlaveVP *
+VMS_int__create_procr( TopLevelFnPtr fnPtr, void *dataParam );
+
+inline void
+VMS_int__point_slave_to_Fn( SlaveVP *slaveVP, TopLevelFnPtr fnPtr,
+                            void *dataParam);
+
+void
+VMS_int__save_return_addr_into_ptd_to_loc(void *ptrToReturnAddrHoldingLoc);
+
+void
+VMS_int__write_return_addr_from_ptd_to_loc(void *ptrToReturnAddrHoldingLoc);
+
+void
+VMS_int__dissipate_procr( SlaveVP *procrToDissipate );
+
+   //Use this to create processor inside entry point & other places outside
+   // the VMS system boundary (IE, not run in slave nor Master)
+SlaveVP *
+VMS_ext__create_procr( TopLevelFnPtr fnPtr, void *dataParam );
+
+void
+VMS_ext__dissipate_procr( SlaveVP *procrToDissipate );
+
+void
+VMS_PI__throw_exception( char *msgStr, SlaveVP *reqstPr, VMSExcp *excpData );
+
+void *
+VMS_WL__give_sem_env_for( SlaveVP *animPr );
+
+//============== Request Related ===============
+
+void
+VMS_int__suspend_procr( SlaveVP *callingPr );
+
+inline void
+VMS_WL__add_sem_request_in_mallocd_VMSReqst( void *semReqData, SlaveVP *callingPr );
+
+inline void
+VMS_WL__send_sem_request( void *semReqData, SlaveVP *callingPr ); + +void +VMS_WL__send_create_procr_req( void *semReqData, SlaveVP *reqstingPr ); + +void inline +VMS_WL__send_dissipate_req( SlaveVP *prToDissipate ); + +inline void +VMS_WL__send_VMSSem_request( void *semReqData, SlaveVP *callingPr ); + +VMSReqst * +VMS_PI__take_next_request_out_of( SlaveVP *procrWithReq ); + +inline void * +VMS_PI__take_sem_reqst_from( VMSReqst *req ); + +void inline +VMS_PI__handle_VMSSemReq( VMSReqst *req, SlaveVP *requestingPr, void *semEnv, + ResumeVPFnPtr resumePrFnPtr ); + +//======================== MEASUREMENT ====================== +uint64 +VMS_WL__give_num_plugin_cycles(); +uint32 +VMS_WL__give_num_plugin_animations(); + + + +#include "VMS__HW_dependent.h" +#include "probes.h" +#include "vutilities.h" + +#endif /* _VMS_H */ + diff -r bc4cb994f114 -r eaf7e4c58c9e VMS__HW_dependent.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VMS__HW_dependent.c Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,47 @@ +/* + * This File contains all hardware dependent C code. + */ + + +#include "VMS.h" + +/*Set up the stack with __cdecl structure on it + * Except doing a trick for 64 bits, where put top-level fn pointer on + * stack, then call an assembly helper that copies it into a reg and + * jumps to it. So, set the resumeInstrPtr to the helper-assembly. + *No need to save registers on old stack frame, because there's no old + * animator state to return to + */ +VMS_int__point_slave_to_Fn( SlaveVP *slaveVP, TopLevelFnPtr fnPtr, + void *dataParam) + { void *stackPtr; + +// Start of Hardware dependent part + + //Set slave's instr pointer to a helper Fn that copies params from stack + slaveVP->resumeInstrPtr = (TopLevelFnPtr)&startUpTopLevelFn; + + //fnPtr takes two params -- void *dataParam & void *animProcr + // Stack grows *down*, so start it at highest stack addr, minus room + // for 2 params + return addr. 
+ stackPtr = + (void *)slaveVP->startOfStack + VIRT_PROCR_STACK_SIZE - 4*sizeof(void*); + + //setup __cdecl on stack + //Normally, return Addr is in loc pointed to by stackPtr, but doing a + // trick for 64 bit arch, where put ptr to top-level fn there instead, + // and set resumeInstrPtr to a helper-fn that copies the top-level + // fn ptr and params into registers. + //Then, dataParam is at stackPtr + 8 bytes, & animating SlaveVP above + *((SlaveVP**)stackPtr + 2 ) = slaveVP; //rightmost param + *((void**)stackPtr + 1 ) = dataParam; //next param to left + *((void**)stackPtr) = (void*)fnPtr; //copied to reg by helper Fn + + +// end of Hardware dependent part + + //core loop will switch to stack & frame pointers stored in slave, + // suspend will save processor's stack and frame into slave + slaveVP->stackPtr = slaveVP->startOfStack; + slaveVP->framePtr = slaveVP->startOfStack; + } \ No newline at end of file diff -r bc4cb994f114 -r eaf7e4c58c9e VMS__HW_dependent.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VMS__HW_dependent.h Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,33 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#ifndef _ProcrContext_H +#define _ProcrContext_H +#define _GNU_SOURCE + +void saveCoreLoopReturnAddr(void **returnAddress); + +void switchToVP(SlaveVP *nextProcr); + +void switchToCoreLoop(SlaveVP *nextProcr); + +void masterSwitchToCoreLoop(SlaveVP *nextProcr); + +void startUpTopLevelFn(); + +void *asmTerminateCoreLoop(SlaveVP *currPr); + +#define flushRegisters() \ + asm volatile ("":::"%rbx", "%r12", "%r13","%r14","%r15") + +inline SlaveVP * +create_procr_helper( SlaveVP *newPr, TopLevelFnPtr fnPtr, + void *dataParam, void *stackLocs ); + +#endif /* _ProcrContext_H */ + diff -r bc4cb994f114 -r eaf7e4c58c9e VMS__HW_dependent.s --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VMS__HW_dependent.s Wed Feb 22 11:39:12 2012 -0800 
@@ -0,0 +1,167 @@ +.data + + +.text + +//Save the address of the coreLoopReturn label into the variable pointed to +//Arguments: Pointer to variable holding address +.globl saveCoreLoopReturnAddr +saveCoreLoopReturnAddr: + movq $coreLoopReturn, %rcx #load label address + movq %rcx, (%rdi) #save address to pointer + ret + + +//Trick for 64 bit arch -- copies args from stack into regs, then does jmp to +// the top-level function, which was pointed to by the stack-ptr +.globl startUpTopLevelFn +startUpTopLevelFn: + movq %rdi , %rsi #get second argument from first argument of switchVP + movq 0x08(%rsp), %rdi #get first argument from stack + movq (%rsp) , %rax #get top-level function's addr from stack + jmp *%rax #jump to the top-level function + +//Switches from CoreLoop to a VP, either a normal VP or the Master Loop +//switch to virt procr's stack and frame ptr then jump to virt procr fn +/* SlaveVP offsets: + * 0x10 stackPtr + * 0x18 framePtr + * 0x20 resumeInstrPtr + * 0x30 coreLoopFramePtr + * 0x38 coreLoopStackPtr + * + * _VMSMasterEnv offsets: + * 0x48 coreLoopReturnPt + * 0x54 masterLock + */ +.globl switchToVP +switchToVP: + #SlaveVP in %rdi + movq %rsp , 0x38(%rdi) #save core loop stack pointer + movq %rbp , 0x30(%rdi) #save core loop frame pointer + movq 0x10(%rdi), %rsp #restore stack pointer + movq 0x18(%rdi), %rbp #restore frame pointer + movq 0x20(%rdi), %rax #get jmp pointer + jmp *%rax #jmp to VP +coreLoopReturn: + ret #address saved by saveCoreLoopReturnAddr; returns to switchToVP's caller + + +//switches to core loop.
saves return address +/* SlaveVP offsets: + * 0x10 stackPtr + * 0x18 framePtr + * 0x20 resumeInstrPtr + * 0x30 coreLoopFramePtr + * 0x38 coreLoopStackPtr + * + * _VMSMasterEnv offsets: + * 0x48 coreLoopReturnPt + * 0x54 masterLock + */ +.globl switchToCoreLoop +switchToCoreLoop: + #SlaveVP in %rdi + movq $VPReturn , 0x20(%rdi) #store return address + movq %rsp , 0x10(%rdi) #save stack pointer + movq %rbp , 0x18(%rdi) #save frame pointer + movq 0x38(%rdi), %rsp #restore stack pointer + movq 0x30(%rdi), %rbp #restore frame pointer + movq $_VMSMasterEnv, %rcx #address of the _VMSMasterEnv pointer variable + movq (%rcx) , %rcx #load the master env pointer itself + movq 0x48(%rcx), %rax #get coreLoopReturnPt + jmp *%rax #jmp to CoreLoop +VPReturn: + ret + + + +//switches to core loop from master. saves return address +//Releases masterLock so the next MasterLoop can be executed +/* SlaveVP offsets: + * 0x10 stackPtr + * 0x18 framePtr + * 0x20 resumeInstrPtr + * 0x30 coreLoopFramePtr + * 0x38 coreLoopStackPtr + * + * _VMSMasterEnv offsets: + * 0x48 coreLoopReturnPt + * 0x54 masterLock + */ +.globl masterSwitchToCoreLoop +masterSwitchToCoreLoop: + #SlaveVP in %rdi + movq $MasterReturn, 0x20(%rdi) #store return address + movq %rsp , 0x10(%rdi) #save stack pointer + movq %rbp , 0x18(%rdi) #save frame pointer + movq 0x38(%rdi), %rsp #restore stack pointer + movq 0x30(%rdi), %rbp #restore frame pointer + movq $_VMSMasterEnv, %rcx #address of the _VMSMasterEnv pointer variable + movq (%rcx) , %rcx #load the master env pointer itself + movq 0x48(%rcx), %rax #get coreLoopReturnPt + movl $0x0 , 0x54(%rcx) #release lock + jmp *%rax #jmp to CoreLoop +MasterReturn: + ret + + +//Switch to terminateCoreLoop +//therefore switch to coreLoop context from master context +// no need to call because the stack is already set up for switchVP +// and virtPr is in %rdi +// and both functions have the same argument.
+// do not save register of VP because this function will never return +/* SlaveVP offsets: + * 0x10 stackPtr + * 0x18 framePtr + * 0x20 resumeInstrPtr + * 0x30 coreLoopFramePtr + * 0x38 coreLoopStackPtr + * + * _VMSMasterEnv offsets: + * 0x48 coreLoopReturnPt + * 0x54 masterLock + */ +.globl asmTerminateCoreLoop +asmTerminateCoreLoop: + #SlaveVP in %rdi + movq 0x38(%rdi), %rsp #restore stack pointer + movq 0x30(%rdi), %rbp #restore frame pointer + movq $terminateCoreLoop, %rax + jmp *%rax #jmp to terminateCoreLoop + + +/* + * This one for the sequential version is special. It discards the current stack + * and returns directly from the coreLoop after VMS__dissipate_procr was called + */ +.globl asmTerminateCoreLoopSeq +asmTerminateCoreLoopSeq: + #SlaveVP in %rdi + movq 0x38(%rdi), %rsp #restore stack pointer + movq 0x30(%rdi), %rbp #restore frame pointer + #argument is in %rdi + call VMS__dissipate_procr + movq %rbp , %rsp #goto the coreLoops stack + pop %rbp #restore the old framepointer + ret #return from core loop + + +//Assembly code takes the return addr off the stack and saves +// into the loc pointed to by rdi. The return addr is at 0x8(%rbp) for 64bit +.globl VMS_int__save_return_addr_into_ptd_to_loc +VMS_int__save_return_addr_into_ptd_to_loc: + movq 0x8(%rbp), %rax #get ret address, rbp is the same as in the calling function + movq %rax, (%rdi) #write ret addr to endInstrAddr field + ret + + +//Assembly code changes the return addr on the stack to the one +// pointed to by the parameter.
The stack's return addr is at 0x8(%rbp) +.globl VMS_int__write_return_addr_from_ptd_to_loc +VMS_int__write_return_addr_from_ptd_to_loc: + movq (%rdi), %rax #get return addr + movq %rax, 0x8(%rbp) #write return addr to the stack of the caller + ret + diff -r bc4cb994f114 -r eaf7e4c58c9e VMS__PI.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VMS__PI.c Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,87 @@ +/* + * Copyright 2010 OpenSourceStewardshipFoundation + * + * Licensed under BSD + */ + +#include +#include +#include +#include +#include +#include + +#include "VMS.h" + + +/*Unlinks and returns the head of procrWithReq's request list, + * or NULL when that list is empty. */ +VMSReqst * +VMS_PI__take_next_request_out_of( SlaveVP *procrWithReq ) + { VMSReqst *req; + + req = procrWithReq->requests; + if( req == NULL ) return NULL; + + procrWithReq->requests = procrWithReq->requests->nextReqst; + return req; + } + + +inline void * +VMS_PI__take_sem_reqst_from( VMSReqst *req ) + { + return req->semReqData; + } + + + +/* This is for OS requests and VMS infrastructure requests, such as to create + * a probe -- a probe is inside the heart of VMS-core, it's not part of any + * language -- but it's also a semantic thing that's triggered from and used + * in the application.. so it crosses abstractions.. so, need some special + * pattern here for handling such requests. + * Doing this just like it were a second language sharing VMS-core. + * + * This is called from the language's request handler when it sees a request + * of type VMSSemReq + * + * TODO: Later change this, to give probes their own separate plugin & have + * VMS-core steer the request to appropriate plugin + * Do the same for OS calls -- look later at it..
+ */ +void inline +VMS_PI__handle_VMSSemReq( VMSReqst *req, SlaveVP *requestingPr, void *semEnv, + ResumeVPFnPtr resumePrFnPtr ) + { VMSSemReq *semReq; + IntervalProbe *newProbe; + + semReq = req->semReqData; + + newProbe = VMS_int__malloc( sizeof(IntervalProbe) ); + newProbe->nameStr = VMS_int__strDup( semReq->nameStr ); + newProbe->hist = NULL; + newProbe->schedChoiceWasRecorded = FALSE; + + //This runs in masterVP, so no race-condition worries + newProbe->probeID = + addToDynArray( newProbe, _VMSMasterEnv->dynIntervalProbesInfo ); + + requestingPr->dataRetFromReq = newProbe; + + (*resumePrFnPtr)( requestingPr, semEnv ); + } + + +/*Later, improve this -- for now, just exits the application after printing + * the error message. + */ +void +VMS_PI__throw_exception( char *msgStr, SlaveVP *reqstPr, VMSExcp *excpData ) + { + printf("%s",msgStr); + fflush(stdout); //flush the stream printf wrote to; fflush on stdin is undefined + exit(1); + } + + diff -r bc4cb994f114 -r eaf7e4c58c9e VMS__WL.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VMS__WL.c Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,138 @@ +/* + * Copyright 2010 OpenSourceStewardshipFoundation + * + * Licensed under BSD + */ + +#include +#include +#include +#include +#include +#include + +#include "VMS.h" + + +/*Anticipating multi-tasking + */ +void * +VMS_WL__give_sem_env_for( SlaveVP *animPr ) + { + return _VMSMasterEnv->semanticEnv; + } + + +/*For this implementation of VMS, it may not make much sense to have the + * system of requests for creating a new processor done this way.. but over + * the scope of single-master, multi-master, multi-tasking, OS-implementing, + * distributed-memory, and so on, this gives VMS implementation a chance to + * do stuff before suspend, in the AppVP, and in the Master before the plugin + * is called, as well as in the lang-lib before this is called, and in the + * plugin. So, this gives both VMS and language implementations a chance to + * intercept at various points and do order-dependent stuff.
+ *Having a standard VMSNewPrReqData struc allows the language to create and + * free the struc, while VMS knows how to get the newPr if it wants it, and + * it lets the lang have lang-specific data related to creation transported + * to the plugin. + */ +void +VMS_WL__send_create_procr_req( void *semReqData, SlaveVP *reqstingPr ) + { VMSReqst req; + + req.reqType = createReq; + req.semReqData = semReqData; + req.nextReqst = reqstingPr->requests; + reqstingPr->requests = &req; + + VMS_int__suspend_procr( reqstingPr ); + } + + +/* + *This adds a request to dissipate, then suspends the processor so that the + * request handler will receive the request. The request handler is what + * does the work of freeing memory and removing the processor from the + * semantic environment's data structures. + *The request handler also is what figures out when to shutdown the VMS + * system -- which causes all the core loop threads to die, and returns from + * the call that started up VMS to perform the work. + * + *This form is a bit misleading to understand if one is trying to figure out + * how VMS works -- it looks like a normal function call, but inside it + * sends a request to the request handler and suspends the processor, which + * jumps out of the VMS__dissipate_procr function, and out of all nestings + * above it, transferring the work of dissipating to the request handler, + * which then does the actual work -- causing the processor that animated + * the call of this function to disappear and the "hanging" state of this + * function to just poof into thin air -- the virtual processor's trace + * never returns from this call, but instead the virtual processor's trace + * gets suspended in this call and all the virt processor's state disap- + * pears -- making that suspend the last thing in the virt procr's trace. 
+ */ +void +VMS_WL__send_dissipate_req( SlaveVP *procrToDissipate ) + { VMSReqst req; + + req.reqType = dissipate; + req.nextReqst = procrToDissipate->requests; + procrToDissipate->requests = &req; + + VMS_int__suspend_procr( procrToDissipate ); + } + + + +/*This call's name indicates that request is malloc'd -- so req handler + * has to free any extra requests tacked on before a send, using this. + * + * This inserts the semantic-layer's request data into standard VMS carrier + * request data-struct that is mallocd. The sem request doesn't need to + * be malloc'd if this is called inside the same call chain before the + * send of the last request is called. + * + *The request handler has to call VMS__free_VMSReq for any of these + */ +inline void +VMS_WL__add_sem_request_in_mallocd_VMSReqst( void *semReqData, + SlaveVP *callingPr ) + { VMSReqst *req; + + req = VMS_int__malloc( sizeof(VMSReqst) ); + req->reqType = semantic; + req->semReqData = semReqData; + req->nextReqst = callingPr->requests; + callingPr->requests = req; + } + +/*This inserts the semantic-layer's request data into standard VMS carrier + * request data-struct is allocated on stack of this call & ptr to it sent + * to plugin + *Then it does suspend, to cause request to be sent. 
+ */ +inline void +VMS_WL__send_sem_request( void *semReqData, SlaveVP *callingPr ) + { VMSReqst req; + + req.reqType = semantic; + req.semReqData = semReqData; + req.nextReqst = callingPr->requests; + callingPr->requests = &req; + + VMS_int__suspend_procr( callingPr ); + } + + +inline void +VMS_WL__send_VMSSem_request( void *semReqData, SlaveVP *callingPr ) + { VMSReqst req; + + req.reqType = VMSSemantic; + req.semReqData = semReqData; + req.nextReqst = callingPr->requests; //grab any other preceding requests + callingPr->requests = &req; + + VMS_int__suspend_procr( callingPr ); + } + + + diff -r bc4cb994f114 -r eaf7e4c58c9e VMS__int.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VMS__int.c Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,155 @@ +/* + * Copyright 2010 OpenSourceStewardshipFoundation + * + * Licensed under BSD + */ + +#include +#include +#include +#include +#include +#include + +#include "VMS.h" + + +inline SlaveVP * +VMS_int__create_procr( TopLevelFnPtr fnPtr, void *dataParam ) + { SlaveVP *newPr; + void *stackLocs; + + newPr = VMS_int__malloc( sizeof(SlaveVP) ); + stackLocs = VMS_int__malloc( VIRT_PROCR_STACK_SIZE ); + if( newPr == 0 || stackLocs == 0 ) //either allocation failing is fatal + { perror("VMS__malloc stack"); exit(1); } + + _VMSMasterEnv->numSlaves += 1; + + return create_procr_helper( newPr, fnPtr, dataParam, stackLocs ); + } + +/* "ext" designates that it's for use outside the VMS system -- should only + * be called from main thread or other thread -- never from code animated by + * a VMS virtual processor.
+ */ +inline SlaveVP * +VMS_ext__create_procr( TopLevelFnPtr fnPtr, void *dataParam ) + { SlaveVP *newPr; + char *stackLocs; + + newPr = malloc( sizeof(SlaveVP) ); + stackLocs = malloc( VIRT_PROCR_STACK_SIZE ); + if( newPr == 0 || stackLocs == 0 ) //either allocation failing is fatal + { perror("malloc stack"); exit(1); } + + return create_procr_helper( newPr, fnPtr, dataParam, stackLocs ); + } + + +//=========================================================================== +/*there is a label inside this function -- save the addr of this label in + * the callingPr struc, as the pick-up point from which to start the next + * work-unit for that procr. If turns out have to save registers, then + * save them in the procr struc too. Then do assembly jump to the CoreLoop's + * "done with work-unit" label. The procr struc is in the request in the + * slave that animated the just-ended work-unit, so all the state is saved + * there, and will get passed along, inside the request handler, to the + * next work-unit for that procr. + */ +void +VMS_int__suspend_procr( SlaveVP *animatingPr ) + { + + //The request to master will cause this suspended virt procr to get + // scheduled again at some future point -- to resume, core loop jumps + // to the resume point (below), which causes restore of saved regs and + // "return" from this call.
+ //animatingPr->resumeInstrPtr = &&ResumePt; + + //return ownership of the virt procr and sched slot to Master virt pr + animatingPr->schedSlot->workIsDone = TRUE; + + //=========================== Measurement stuff ======================== + #ifdef MEAS__TIME_STAMP_SUSP + //record time stamp: compare to time-stamp recorded below + saveLowTimeStampCountInto( animatingPr->preSuspTSCLow ); + #endif + //======================================================================= + + switchToCoreLoop(animatingPr); + flushRegisters(); + + //======================================================================= + + #ifdef MEAS__TIME_STAMP_SUSP + //NOTE: only take low part of count -- do sanity check when take diff + saveLowTimeStampCountInto( animatingPr->postSuspTSCLow ); + #endif + + return; + } + + +/* "ext" designates that it's for use outside the VMS system -- should only + * be called from main thread or other thread -- never from code animated by + * a SlaveVP, nor from a masterVP. + * + *Use this version to dissipate VPs created outside the VMS system. + */ +void +VMS_ext__dissipate_procr( SlaveVP *procrToDissipate ) + { + //NOTE: dataParam was given to the processor, so should either have + // been alloc'd with VMS__malloc, or freed by the level above animPr. + //So, all that's left to free here is the stack and the SlaveVP struc + // itself + //Note, should not stack-allocate the data param -- no guarantee, in + // general that creating processor will outlive ones it creates. + free( procrToDissipate->startOfStack ); + free( procrToDissipate ); + } + + + +/*This must be called by the request handler plugin -- it cannot be called + * from the semantic library "dissipate processor" function -- instead, the + * semantic layer has to generate a request, and the plug-in calls this + * function. + *The reason is that this frees the virtual processor's stack -- which is + * still in use inside semantic library calls! 
+ * + *This frees or recycles all the state owned by and comprising the VMS + * portion of the animating virtual procr. The request handler must first + * free any semantic data created for the processor that didn't use the + * VMS_malloc mechanism. Then it calls this, which first asks the malloc + * system to disown any state that did use VMS_malloc, and then frees the + * stack and the processor-struct itself. + *If the dissipated processor is the sole (remaining) owner of VMS__malloc'd + * state, then that state gets freed (or sent to recycling) as a side-effect + * of dis-owning it. + */ +void +VMS_int__dissipate_procr( SlaveVP *animatingPr ) + { + //dis-own all locations owned by this processor, causing to be freed + // any locations that it is (was) sole owner of +//TODO: implement VMS__malloc system, including "give up ownership" + + _VMSMasterEnv->numSlaves -= 1; + if( _VMSMasterEnv->numSlaves == 0 ) + { //no more work, so shutdown + VMS_int__shutdown(); //note, creates NUM_CORES shut-down processors + } + + //NOTE: dataParam was given to the processor, so should either have + // been alloc'd with VMS__malloc, or freed by the level above animPr. + //So, all that's left to free here is the stack and the SlaveVP struc + // itself + //Note, should not stack-allocate initial data -- no guarantee, in + // general that creating processor will outlive ones it creates.
+ VMS_int__free( animatingPr->startOfStack ); + VMS_int__free( animatingPr ); + } + + diff -r bc4cb994f114 -r eaf7e4c58c9e VMS__startup_and_shutdown.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VMS__startup_and_shutdown.c Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,458 @@ +/* + * Copyright 2010 OpenSourceStewardshipFoundation + * + * Licensed under BSD + */ + +#include +#include +#include +#include +#include +#include + +#include "VMS.h" +#include "VMS__HW_dependent.h" + + +#define thdAttrs NULL + +//=========================================================================== +void +shutdownFn( void *dummy, SlaveVP *dummy2 ); + +SchedSlot ** +create_sched_slots(); + +void +create_masterEnv(); + +void +create_the_coreLoop_OS_threads(); + +MallocProlog * +create_free_list(); + +void +endOSThreadFn( void *initData, SlaveVP *animatingPr ); + +pthread_mutex_t suspendLock = PTHREAD_MUTEX_INITIALIZER; +pthread_cond_t suspend_cond = PTHREAD_COND_INITIALIZER; + +//=========================================================================== + +/*Setup has two phases: + * 1) Semantic layer first calls init_VMS, which creates masterEnv, and puts + * the master virt procr into the work-queue, ready for first "call" + * 2) Semantic layer then does its own init, which creates the seed virt + * procr inside the semantic layer, ready to schedule it when + * asked by the first run of the masterLoop. + * + *This part is bit weird because VMS really wants to be "always there", and + * have applications attach and detach.. for now, this VMS is part of + * the app, so the VMS system starts up as part of running the app. + * + *The semantic layer is isolated from the VMS internals by making the + * semantic layer do setup to a state that it's ready with its + * initial virt procrs, ready to schedule them to slots when the masterLoop + * asks. 
Without this pattern, the semantic layer's setup would + * have to modify slots directly to assign the initial virt-procrs, and put + * them into the readyToAnimateQ itself, breaking the isolation completely. + * + * + *The semantic layer creates the initial virt procr(s), and adds its + * own environment to masterEnv, and fills in the pointers to + * the requestHandler and slaveScheduler plug-in functions + */ + +/*This allocates VMS data structures, populates the master VMSProc, + * and master environment, and returns the master environment to the semantic + * layer. + */ +void +VMS_int__init() + { + +#ifdef SEQUENTIAL + create_masterEnv(); + flushRegisters(); //? not sure why here -- merten added it..? +#else + create_masterEnv(); + create_the_coreLoop_OS_threads(); +#endif + } + +void +create_masterEnv() + { MasterEnv *masterEnv; + VMSQueueStruc **readyToAnimateQs; + int coreIdx; + SlaveVP **masterVPs; + SchedSlot ***allSchedSlots; //ptr to array of ptrs + + + //Make the master env, which holds everything else + _VMSMasterEnv = malloc( sizeof(MasterEnv) ); + + //Very first thing put into the master env is the free-list, seeded + // with a massive initial chunk of memory. + //After this, all other mallocs are VMS__malloc. 
+ _VMSMasterEnv->freeListHead = VMS_ext__create_free_list(); + + + //============================= MEASUREMENT STUFF ======================== + #ifdef MEAS__TIME_MALLOC + _VMSMasterEnv->mallocTimeHist = makeFixedBinHistExt( 100, 0, 30, + "malloc_time_hist"); + _VMSMasterEnv->freeTimeHist = makeFixedBinHistExt( 100, 0, 30, + "free_time_hist"); + #endif + #ifdef MEAS__TIME_PLUGIN + _VMSMasterEnv->reqHdlrLowTimeHist = makeFixedBinHistExt( 100, 0, 200, + "plugin_low_time_hist"); + _VMSMasterEnv->reqHdlrHighTimeHist = makeFixedBinHistExt( 100, 0, 200, + "plugin_high_time_hist"); + #endif + //======================================================================== + + //===================== Only VMS__malloc after this ==================== + masterEnv = (MasterEnv*)_VMSMasterEnv; + + //Make a readyToAnimateQ for each core loop + readyToAnimateQs = VMS_int__malloc( NUM_CORES * sizeof(VMSQueueStruc *) ); + masterVPs = VMS_int__malloc( NUM_CORES * sizeof(SlaveVP *) ); + + //One array for each core, 3 in array, core's masterVP scheds all + allSchedSlots = VMS_int__malloc( NUM_CORES * sizeof(SchedSlot *) ); + + _VMSMasterEnv->numSlaves = 0; //used to detect shut-down condition + + _VMSMasterEnv->numVPsCreated = 0; //used by create procr to set ID + for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ ) + { + readyToAnimateQs[ coreIdx ] = makeVMSQ(); + + //Q: should give masterVP core-specific info as its init data? 
+ masterVPs[ coreIdx ] = VMS_int__create_procr( (TopLevelFnPtr)&masterLoop, (void*)masterEnv ); + masterVPs[ coreIdx ]->coreAnimatedBy = coreIdx; + allSchedSlots[ coreIdx ] = create_sched_slots(); //makes for one core + _VMSMasterEnv->numMasterInARow[ coreIdx ] = 0; + _VMSMasterEnv->workStealingGates[ coreIdx ] = NULL; + } + _VMSMasterEnv->readyToAnimateQs = readyToAnimateQs; + _VMSMasterEnv->masterVPs = masterVPs; + _VMSMasterEnv->masterLock = UNLOCKED; + _VMSMasterEnv->allSchedSlots = allSchedSlots; + _VMSMasterEnv->workStealingLock = UNLOCKED; + + + //Aug 19, 2010: no longer need to place initial masterVP into queue + // because coreLoop now controls -- animates its masterVP when no work + + + //============================= MEASUREMENT STUFF ======================== + #ifdef STATS__TURN_ON_PROBES + _VMSMasterEnv->dynIntervalProbesInfo = + makePrivDynArrayOfSize( (void***)&(_VMSMasterEnv->intervalProbes), 200); + + _VMSMasterEnv->probeNameHashTbl = makeHashTable( 1000, &VMS_int__free ); + + //put creation time directly into master env, for fast retrieval + struct timeval timeStamp; + gettimeofday( &(timeStamp), NULL); + _VMSMasterEnv->createPtInSecs = + timeStamp.tv_sec +(timeStamp.tv_usec/1000000.0); + #endif + #ifdef MEAS__TIME_MASTER_LOCK + _VMSMasterEnv->masterLockLowTimeHist = makeFixedBinHist( 50, 0, 2, + "master lock low time hist"); + _VMSMasterEnv->masterLockHighTimeHist = makeFixedBinHist( 50, 0, 100, + "master lock high time hist"); + #endif + + MakeTheMeasHists(); + //======================================================================== + } + +SchedSlot ** +create_sched_slots() + { SchedSlot **schedSlots; + int i; + + schedSlots = VMS_int__malloc( NUM_SCHED_SLOTS * sizeof(SchedSlot *) ); + + for( i = 0; i < NUM_SCHED_SLOTS; i++ ) + { + schedSlots[i] = VMS_int__malloc( sizeof(SchedSlot) ); + + //Set state to mean "handling requests done, slot needs filling" + schedSlots[i]->workIsDone = FALSE; + schedSlots[i]->needsProcrAssigned = TRUE; + } + 
return schedSlots; + } + + +void +freeSchedSlots( SchedSlot **schedSlots ) + { int i; + for( i = 0; i < NUM_SCHED_SLOTS; i++ ) + { + VMS_int__free( schedSlots[i] ); + } + VMS_int__free( schedSlots ); + } + + +void +create_the_coreLoop_OS_threads() + { + //======================================================================== + // Create the Threads + int coreIdx, retCode; + + //Need the threads to be created suspended, and wait for a signal + // before proceeding -- gives time after creating to initialize other + // stuff before the coreLoops set off. + _VMSMasterEnv->setupComplete = 0; + + //Make the threads that animate the core loops + for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ ) + { coreLoopThdParams[coreIdx] = VMS_int__malloc( sizeof(ThdParams) ); + coreLoopThdParams[coreIdx]->coreNum = coreIdx; + + retCode = + pthread_create( &(coreLoopThdHandles[coreIdx]), + thdAttrs, + &coreLoop, + (void *)(coreLoopThdParams[coreIdx]) ); + if(retCode){printf("ERROR creating thread: %d\n", retCode); exit(1);} + } + } + + + +void +VMS_WL__register_request_handler( RequestHandler requestHandler ) + { _VMSMasterEnv->requestHandler = requestHandler; + } + + +void +VMS_WL__register_sched_assigner( Sched_Assigner schedAssigner ) + { _VMSMasterEnv->slaveSchedAssigner = schedAssigner; + } + +void VMS_WL__register_semantic_env( void *semanticEnv ) + { _VMSMasterEnv->semanticEnv = semanticEnv; + } + + +/*This is what causes the VMS system to initialize.. then waits for it to + * exit. + * + *Wrapper lib layer calls this when it wants the system to start running.. + */ +void +VMS_WL__start_the_work_then_wait_until_done() + { +#ifdef SEQUENTIAL + /*Only difference between version with an OS thread pinned to each core and + * the sequential version of VMS is VMS__init_Seq, this, and coreLoop_Seq. + */ + //Instead of un-suspending threads, just call the one and only + // core loop (sequential version), in the main thread.
+ coreLoop_Seq( NULL ); + flushRegisters(); +#else + int coreIdx; + //Start the core loops running + + //tell the core loop threads that setup is complete + //get lock, to lock out any threads still starting up -- they'll see + // that setupComplete is true before entering while loop, and so never + // wait on the condition + pthread_mutex_lock( &suspendLock ); + _VMSMasterEnv->setupComplete = 1; + pthread_mutex_unlock( &suspendLock ); + pthread_cond_broadcast( &suspend_cond ); + + + //wait for all to complete + for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ ) + { + pthread_join( coreLoopThdHandles[coreIdx], NULL ); + } + + //NOTE: do not clean up VMS env here -- semantic layer has to have + // a chance to clean up its environment first, then do a call to free + // the Master env and rest of VMS locations +#endif + } + + +//TODO: look at architecting cleanest separation between request handler +// and master loop, for dissipate, create, shutdown, and other non-semantic +// requests. Issue is chain: one removes requests from AppVP, one dispatches +// on type of request, and one handles each type.. but some types require +// action from both request handler and master loop -- maybe just give the +// request handler calls like: VMS__handle_X_request_type + + +/*This is called by the semantic layer's request handler when it decides its + * time to shut down the VMS system. Calling this causes the core loop OS + * threads to exit, which unblocks the entry-point function that started up + * VMS, and allows it to grab the result and return to the original single- + * threaded application. + * + *The _VMSMasterEnv is needed by this shut down function, so the create-seed- + * and-wait function has to free a bunch of stuff after it detects the + * threads have all died: the masterEnv, the thread-related locations, + * masterVP any AppVPs that might still be allocated and sitting in the + * semantic environment, or have been orphaned in the _VMSWorkQ. 
+ * + *NOTE: the semantic plug-in is expected to use VMS__malloc to get all the + * locations it needs, and give ownership to masterVP. Then, they will be + * automatically freed. + * + *In here,create one core-loop shut-down processor for each core loop and put + * them all directly into the readyToAnimateQ. + *Note, this function can ONLY be called after the semantic environment no + * longer cares if AppVPs get animated after the point this is called. In + * other words, this can be used as an abort, or else it should only be + * called when all AppVPs have finished dissipate requests -- only at that + * point is it sure that all results have completed. + */ +void +VMS_int__shutdown() + { int coreIdx; + SlaveVP *shutDownPr; + + //create the shutdown processors, one for each core loop -- put them + // directly into the Q -- each core will die when gets one + for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ ) + { //Note, this is running in the master + shutDownPr = VMS_int__create_procr( &endOSThreadFn, NULL ); + writeVMSQ( shutDownPr, _VMSMasterEnv->readyToAnimateQs[coreIdx] ); + } + + } + + +/*Am trying to be cute, avoiding IF statement in coreLoop that checks for + * a special shutdown procr. Ended up with extra-complex shutdown sequence. + *This function has the sole purpose of setting the stack and framePtr + * to the coreLoop's stack and framePtr.. it does that then jumps to the + * core loop's shutdown point -- might be able to just call Pthread_exit + * from here, but am going back to the pthread's stack and setting everything + * up just as if it never jumped out, before calling pthread_exit. + *The end-point of core loop will free the stack and so forth of the + * processor that animates this function, (this fn is transfering the + * animator of the AppVP that is in turn animating this function over + * to core loop function -- note that this slices out a level of virtual + * processors). 
+ */ +void +endOSThreadFn( void *initData, SlaveVP *animatingPr ) + { +#ifdef SEQUENTIAL + asmTerminateCoreLoopSeq(animatingPr); +#else + asmTerminateCoreLoop(animatingPr); +#endif + } + + +/*This is called from the startup & shutdown + */ +void +VMS_int__cleanup_at_end_of_shutdown() + { + //unused -- NOTE(review): the MEAS__TIME_MASTER block below still uses these names, so it cannot compile while they stay commented out + //VMSQueueStruc **readyToAnimateQs; + //int coreIdx; + //SlaveVP **masterVPs; + //SchedSlot ***allSchedSlots; //ptr to array of ptrs + + //Before getting rid of everything, print out any measurements made + forAllInDynArrayDo( _VMSMasterEnv->measHistsInfo, (DynArrayFnPtr)&printHist ); + forAllInDynArrayDo( _VMSMasterEnv->measHistsInfo, (DynArrayFnPtr)&saveHistToFile); + forAllInDynArrayDo( _VMSMasterEnv->measHistsInfo, &freeHist ); + #ifdef MEAS__TIME_PLUGIN + printHist( _VMSMasterEnv->reqHdlrLowTimeHist ); + saveHistToFile( _VMSMasterEnv->reqHdlrLowTimeHist ); + printHist( _VMSMasterEnv->reqHdlrHighTimeHist ); + saveHistToFile( _VMSMasterEnv->reqHdlrHighTimeHist ); + freeHistExt( _VMSMasterEnv->reqHdlrLowTimeHist ); + freeHistExt( _VMSMasterEnv->reqHdlrHighTimeHist ); + #endif + #ifdef MEAS__TIME_MALLOC + printHist( _VMSMasterEnv->mallocTimeHist ); + saveHistToFile( _VMSMasterEnv->mallocTimeHist ); + printHist( _VMSMasterEnv->freeTimeHist ); + saveHistToFile( _VMSMasterEnv->freeTimeHist ); + freeHistExt( _VMSMasterEnv->mallocTimeHist ); + freeHistExt( _VMSMasterEnv->freeTimeHist ); + #endif + #ifdef MEAS__TIME_MASTER_LOCK + printHist( _VMSMasterEnv->masterLockLowTimeHist ); + printHist( _VMSMasterEnv->masterLockHighTimeHist ); + #endif + #ifdef MEAS__TIME_MASTER + printHist( _VMSMasterEnv->pluginTimeHist ); + for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ ) + { + freeVMSQ( readyToAnimateQs[ coreIdx ] ); + //NOTE(review): original comment said "use external free", but the call below is the internal VMS_int__ variant -- confirm which is intended + VMS_int__dissipate_procr( masterVPs[ coreIdx ] ); + + freeSchedSlots( allSchedSlots[ coreIdx ] ); + } + #endif + #ifdef MEAS__TIME_STAMP_SUSP + printHist( _VMSMasterEnv->pluginTimeHist ); + for( 
coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ ) + { + freeVMSQ( readyToAnimateQs[ coreIdx ] ); + //master VPs were created external to VMS, so use external free + VMS_int__dissipate_procr( masterVPs[ coreIdx ] ); + + freeSchedSlots( allSchedSlots[ coreIdx ] ); + } + #endif + + //All the environment data has been allocated with VMS__malloc, so just + // free its internal big-chunk and all inside it disappear. +/* + readyToAnimateQs = _VMSMasterEnv->readyToAnimateQs; + masterVPs = _VMSMasterEnv->masterVPs; + allSchedSlots = _VMSMasterEnv->allSchedSlots; + + for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ ) + { + freeVMSQ( readyToAnimateQs[ coreIdx ] ); + //master VPs were created external to VMS, so use external free + VMS__dissipate_procr( masterVPs[ coreIdx ] ); + + freeSchedSlots( allSchedSlots[ coreIdx ] ); + } + + VMS__free( _VMSMasterEnv->readyToAnimateQs ); + VMS__free( _VMSMasterEnv->masterVPs ); + VMS__free( _VMSMasterEnv->allSchedSlots ); + + //============================= MEASUREMENT STUFF ======================== + #ifdef STATS__TURN_ON_PROBES + freeDynArrayDeep( _VMSMasterEnv->dynIntervalProbesInfo, &VMS__free_probe); + #endif + //======================================================================== +*/ + //These are the only two that use system free + VMS_ext__free_free_list( _VMSMasterEnv->freeListHead ); + free( (void *)_VMSMasterEnv ); + } + + +//================================ + + diff -r bc4cb994f114 -r eaf7e4c58c9e VMS_defs__HW_specific.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VMS_defs__HW_specific.h Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,51 @@ +/* + * Copyright 2012 OpenSourceStewardshipFoundation + * Licensed under BSD + * + * Author: seanhalle@yahoo.com + * + */ + +#ifndef _VMS_HW_SPEC_DEFS_H +#define _VMS_HW_SPEC_DEFS_H +#define _GNU_SOURCE + + +//========================= Hardware related Constants ===================== + //This value is the number of hardware threads in the shared memory + // machine +#define NUM_CORES 4 + 
+ // tradeoff amortizing master fixed overhead vs imbalance potential + // when work-stealing, can make bigger, at risk of losing cache affinity +#define NUM_SCHED_SLOTS 3 + +#define MIN_WORK_UNIT_CYCLES 20000 + +#define MASTERLOCK_RETRIES 10000 + + // stack size in virtual processors created +#define VIRT_PROCR_STACK_SIZE 0x8000 /* 32K */ + + // memory for VMS__malloc +#define MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE 0x40000000 /* 1G */ + + //Frequency of TS counts -- have to do tests to verify + //NOTE: turn off (in BIOS) TURBO-BOOST and SPEED-STEP else won't be const +#define TSCOUNT_FREQ 3180000000 + +#define CACHE_LINE_SZ 256 +#define PAGE_SIZE 4096 + +//To prevent false-sharing, aligns a variable to a cache-line boundary. +//No need to use for local vars because those are never shared between cores +#define __align_to_cacheline__ __attribute__ ((aligned(CACHE_LINE_SZ))) + +//aligns a pointer to cacheline. The memory area has to contain at least +//CACHE_LINE_SZ bytes more than needed; mask derives from CACHE_LINE_SZ (power of 2) +#define __align_address(ptr) ((void*)(((uintptr_t)(ptr))&~((uintptr_t)(CACHE_LINE_SZ-1)))) + +//=========================================================================== + +#endif /* _VMS_HW_SPEC_DEFS_H */ + diff -r bc4cb994f114 -r eaf7e4c58c9e VMS_defs__lang_specific.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VMS_defs__lang_specific.h Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,182 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#ifndef _VMS_LANG_SPEC_DEFS_H +#define _VMS_LANG_SPEC_DEFS_H + + + +//=================== Language-specific Measurement Stuff =================== +// +//TODO: Figure out way to move these into language dir.. +// wrap them in #ifdef MEAS__... 
+// +#ifndef MAKE_HISTS_FOR_MEASUREMENTS +#define MakeTheMeasHists() +#endif + +//=========================================================================== +//VPThread +#ifdef VTHREAD + +#define createHistIdx 1 //note: starts at 1 +#define mutexLockHistIdx 2 +#define mutexUnlockHistIdx 3 +#define condWaitHistIdx 4 +#define condSignalHistIdx 5 + +#define MakeTheMeasHists() \ + _VMSMasterEnv->measHistsInfo = \ + makePrivDynArrayOfSize( (void***)&(_VMSMasterEnv->measHists), 200); \ + makeAMeasHist( createHistIdx, "create", 250, 0, 100 ) \ + makeAMeasHist( mutexLockHistIdx, "mutex_lock", 50, 0, 100 ) \ + makeAMeasHist( mutexUnlockHistIdx, "mutex_unlock", 50, 0, 100 ) \ + makeAMeasHist( condWaitHistIdx, "cond_wait", 50, 0, 100 ) \ + makeAMeasHist( condSignalHistIdx, "cond_signal", 50, 0, 100 ) + + +#define Meas_startCreate \ + int32 startStamp, endStamp; \ + saveLowTimeStampCountInto( startStamp ); \ + +#define Meas_endCreate \ + saveLowTimeStampCountInto( endStamp ); \ + addIntervalToHist( startStamp, endStamp, \ + _VMSMasterEnv->measHists[ createHistIdx ] ); + +#define Meas_startMutexLock \ + int32 startStamp, endStamp; \ + saveLowTimeStampCountInto( startStamp ); \ + +#define Meas_endMutexLock \ + saveLowTimeStampCountInto( endStamp ); \ + addIntervalToHist( startStamp, endStamp, \ + _VMSMasterEnv->measHists[ mutexLockHistIdx ] ); + +#define Meas_startMutexUnlock \ + int32 startStamp, endStamp; \ + saveLowTimeStampCountInto( startStamp ); \ + +#define Meas_endMutexUnlock \ + saveLowTimeStampCountInto( endStamp ); \ + addIntervalToHist( startStamp, endStamp, \ + _VMSMasterEnv->measHists[ mutexUnlockHistIdx ] ); + +#define Meas_startCondWait \ + int32 startStamp, endStamp; \ + saveLowTimeStampCountInto( startStamp ); \ + +#define Meas_endCondWait \ + saveLowTimeStampCountInto( endStamp ); \ + addIntervalToHist( startStamp, endStamp, \ + _VMSMasterEnv->measHists[ condWaitHistIdx ] ); + +#define Meas_startCondSignal \ + int32 startStamp, endStamp; \ + 
saveLowTimeStampCountInto( startStamp ); \ + +#define Meas_endCondSignal \ + saveLowTimeStampCountInto( endStamp ); \ + addIntervalToHist( startStamp, endStamp, \ + _VMSMasterEnv->measHists[ condSignalHistIdx ] ); + +#endif + + + +//=========================================================================== +//VCilk + +#ifdef VCILK + +#define spawnHistIdx 1 //note: starts at 1 +#define syncHistIdx 2 + +#define MakeTheMeasHists() \ + _VMSMasterEnv->measHistsInfo = \ + makePrivDynArrayOfSize( (void***)&(_VMSMasterEnv->measHists), 200); \ + makeAMeasHist( spawnHistIdx, "Spawn", 50, 0, 200 ) \ + makeAMeasHist( syncHistIdx, "Sync", 50, 0, 200 ) + + +#define Meas_startSpawn \ + int32 startStamp, endStamp; \ + saveLowTimeStampCountInto( startStamp ); \ + +#define Meas_endSpawn \ + saveLowTimeStampCountInto( endStamp ); \ + addIntervalToHist( startStamp, endStamp, \ + _VMSMasterEnv->measHists[ spawnHistIdx ] ); + +#define Meas_startSync \ + int32 startStamp, endStamp; \ + saveLowTimeStampCountInto( startStamp ); \ + +#define Meas_endSync \ + saveLowTimeStampCountInto( endStamp ); \ + addIntervalToHist( startStamp, endStamp, \ + _VMSMasterEnv->measHists[ syncHistIdx ] ); +#endif + +//=========================================================================== +// SSR + +#ifdef SSR + +#define SendFromToHistIdx 1 //note: starts at 1 +#define SendOfTypeHistIdx 2 +#define ReceiveFromToHistIdx 3 +#define ReceiveOfTypeHistIdx 4 + +#define MakeTheMeasHists() \ + _VMSMasterEnv->measHistsInfo = \ + makePrivDynArrayOfSize( (void***)&(_VMSMasterEnv->measHists), 200); \ + makeAMeasHist( SendFromToHistIdx, "SendFromTo", 50, 0, 100 ) \ + makeAMeasHist( SendOfTypeHistIdx, "SendOfType", 50, 0, 100 ) \ + makeAMeasHist( ReceiveFromToHistIdx,"ReceiveFromTo", 50, 0, 100 ) \ + makeAMeasHist( ReceiveOfTypeHistIdx,"ReceiveOfType", 50, 0, 100 ) + +#define Meas_startSendFromTo \ + int32 startStamp, endStamp; \ + saveLowTimeStampCountInto( startStamp ); \ + +#define Meas_endSendFromTo \ + 
saveLowTimeStampCountInto( endStamp ); \ + addIntervalToHist( startStamp, endStamp, \ + _VMSMasterEnv->measHists[ SendFromToHistIdx ] ); + +#define Meas_startSendOfType \ + int32 startStamp, endStamp; \ + saveLowTimeStampCountInto( startStamp ); \ + +#define Meas_endSendOfType \ + saveLowTimeStampCountInto( endStamp ); \ + addIntervalToHist( startStamp, endStamp, \ + _VMSMasterEnv->measHists[ SendOfTypeHistIdx ] ); + +#define Meas_startReceiveFromTo \ + int32 startStamp, endStamp; \ + saveLowTimeStampCountInto( startStamp ); \ + +#define Meas_endReceiveFromTo \ + saveLowTimeStampCountInto( endStamp ); \ + addIntervalToHist( startStamp, endStamp, \ + _VMSMasterEnv->measHists[ ReceiveFromToHistIdx ] ); + +#define Meas_startReceiveOfType \ + int32 startStamp, endStamp; \ + saveLowTimeStampCountInto( startStamp ); \ + +#define Meas_endReceiveOfType \ + saveLowTimeStampCountInto( endStamp ); \ + addIntervalToHist( startStamp, endStamp, \ + _VMSMasterEnv->measHists[ReceiveOfTypeHistIdx ] ); +#endif /* SSR */ + +#endif /* _VMS_DEFS_H */ + diff -r bc4cb994f114 -r eaf7e4c58c9e VMS_defs__main.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VMS_defs__main.h Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,185 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#ifndef _VMS_DEFS_H +#define _VMS_DEFS_H +#define _GNU_SOURCE + +//=========================== VMS-wide defs =============================== +#include "VMS_primitive_data_types.h" + +#define SUCCESS 0 + + //only after macro-expansion are the defs of writePrivQ, aso looked up + // so these defs can be at the top, and writePrivQ defined later on.. 
+#define writeVMSQ writePrivQ +#define readVMSQ readPrivQ +#define makeVMSQ makeVMSPrivQ +#define numInVMSQ numInPrivQ +#define VMSQueueStruc PrivQueueStruc + + +//====================== Hardware Specific Defs ============================ +#include "VMS_defs__HW_specific.h" + +//========================= Debug Related Defs ============================= +// +//When SEQUENTIAL is defined, VMS does sequential exe in the main thread +// It still does co-routines and all the mechanisms are the same, it just +// has only a single thread and animates VPs one at a time +//#define SEQUENTIAL + +//#define USE_WORK_STEALING + +//turns on the probe-instrumentation in the application -- when not +// defined, the calls to the probe functions turn into comments +#define STATS__ENABLE_PROBES +//#define TURN_ON_DEBUG_PROBES + +//These defines turn types of bug messages on and off +// be sure debug messages are un-commented (next block of defines) +#define dbgAppFlow TRUE /* Top level flow of application code -- general*/ +#define dbgProbes FALSE /* for issues inside probes themselves*/ +#define dbgB2BMaster FALSE /* in coreloop, back to back master VPs*/ +#define dbgRqstHdlr FALSE /* in request handler code*/ + +//Comment or un- the substitute half to turn on/off types of debug message +#define DEBUG( bool, msg) \ +// if( bool){ printf(msg); fflush(stdin);} +#define DEBUG1( bool, msg, param) \ +// if(bool){printf(msg, param); fflush(stdin);} +#define DEBUG2( bool, msg, p1, p2) \ +// if(bool) {printf(msg, p1, p2); fflush(stdin);} + +#define ERROR(msg) printf(msg); +#define ERROR1(msg, param) printf(msg, param); +#define ERROR2(msg, p1, p2) printf(msg, p1, p2); + +//====================== Measurement Related Defs ========================== +// +// + //when STATS__TURN_ON_PROBES is defined allows using probes to measure + // time intervals. The probes are macros that only compile to something + // when STATS__TURN_ON_PROBES is defined. 
The probes are saved in the + // master env -- but only when this is defined. + //The TSC probes use RDTSC instr, can be unreliable, Dbl uses gettimeofday +#define STATS__TURN_ON_PROBES +//#define STATS__USE_TSC_PROBES +#define STATS__USE_DBL_PROBES + +//================== Turn Measurement Things on and off ==================== + +//#define MEAS__TIME_2011_SYS + +//define this if any MEAS__... below are +//#define MAKE_HISTS_FOR_MEASUREMENTS + //when MEAS__TIME_STAMP_SUSP is defined, causes code to be inserted and + // compiled-in that saves the low part of the time stamp count just before + // suspending a processor and just after resuming that processor. It is + // saved into a field added to VirtProcr. Have to sanity-check for + // rollover of low portion into high portion. +//#define MEAS__TIME_STAMP_SUSP +//#define MEAS__TIME_MASTER +//#define MEAS__TIME_PLUGIN +//#define MEAS__TIME_MALLOC +//#define MEAS__TIME_MASTER_LOCK + + //For code that calculates normalization-offset between TSC counts of + // different cores. 
+//#define NUM_TSC_ROUND_TRIPS 10 + + + +//=================== Macros to Capture Measurements ====================== +// +//===== RDTSC wrapper ===== +//Also runs with x86_64 code +#define saveTSCLowHigh(lowHighIn) \ + asm volatile("RDTSC; \ + movl %%eax, %0; \ + movl %%edx, %1;" \ + /* outputs */ : "=m" (lowHighIn.lowHigh[0]), "=m" (lowHighIn.lowHigh[1])\ + /* inputs */ : \ + /* clobber */ : "%eax", "%edx" \ + ); + +#define saveTimeStampCountInto(low, high) \ + asm volatile("RDTSC; \ + movl %%eax, %0; \ + movl %%edx, %1;" \ + /* outputs */ : "=m" (low), "=m" (high)\ + /* inputs */ : \ + /* clobber */ : "%eax", "%edx" \ + ); + +#define saveLowTimeStampCountInto(low) \ + asm volatile("RDTSC; \ + movl %%eax, %0;" \ + /* outputs */ : "=m" (low) \ + /* inputs */ : \ + /* clobber */ : "%eax", "%edx" \ + ); + + +//================== Macros define types of meas want ===================== +#ifdef MEAS__TIME_PLUGIN + +#define Meas_startReqHdlr \ + int32 startStamp1, endStamp1; \ + saveLowTimeStampCountInto( startStamp1 ); + +#define Meas_endReqHdlr \ + saveLowTimeStampCountInto( endStamp1 ); \ + addIntervalToHist( startStamp1, endStamp1, \ + _VMSMasterEnv->reqHdlrLowTimeHist ); \ + addIntervalToHist( startStamp1, endStamp1, \ + _VMSMasterEnv->reqHdlrHighTimeHist ); + +#elif defined MEAS__TIME_2011_SYS +#define Meas_startMasterLoop \ + TSCountLowHigh startStamp1, endStamp1; \ + saveTSCLowHigh( endStamp1 ); \ + _VMSMasterEnv->cyclesTillStartMasterLoop = \ + endStamp1.longVal - masterVP->startSusp.longVal; + +#define Meas_startReqHdlr \ + saveTSCLowHigh( startStamp1 ); \ + _VMSMasterEnv->startReqHdlr.longVal = startStamp1.longVal; + +#define Meas_endReqHdlr + +#define Meas_endMasterLoop \ + saveTSCLowHigh( startStamp1 ); \ + _VMSMasterEnv->endMasterLoop.longVal = startStamp1.longVal; + +#else +#define Meas_startMasterLoop +#define Meas_startReqHdlr +#define Meas_endReqHdlr +#define Meas_endMasterLoop +#endif + +//====================== Histogram Macros -- Create 
======================== +// +// +#ifdef MAKE_HISTS_FOR_MEASUREMENTS +#define makeAMeasHist( idx, name, numBins, startVal, binWidth ) \ + makeHighestDynArrayIndexBeAtLeast( _VMSMasterEnv->measHistsInfo, idx ); \ + _VMSMasterEnv->measHists[idx] = \ + makeFixedBinHist( numBins, startVal, binWidth, name ); +#else +#define makeAMeasHist( idx, name, numBins, startVal, binWidth ) +#endif + + +#define MEAS__SUB_CREATE /*turn on/off subtraction of create from plugin*/ + +#include "VMS_defs__lang_specific.h" + +#endif /* _VMS_DEFS_H */ + diff -r bc4cb994f114 -r eaf7e4c58c9e VMS_primitive_data_types.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VMS_primitive_data_types.h Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,53 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + + */ + +#ifndef _BLIS_PRIMITIVE_DATA_TYPES_H +#define _BLIS_PRIMITIVE_DATA_TYPES_H + + +/*For portability, need primitive data types that have a well defined + * size, and well-defined layout into bytes + *To do this, provide BLIS standard aliases for all primitive data types + *These aliases must be used in all BLIS functions instead of the ANSI types + * + *These definitions will be replaced inside each specialization module + * according to the compiler used in that module and the hardware being + * specialized to. 
+ */ +/* +#define int8 char +#define uint8 char +#define int16 short +#define uint16 unsigned short +#define int32 int +#define uint32 unsigned int +#define int64 long long +#define uint64 unsigned long long +#define float32 float +#define float64 double +*/ +typedef char bool8; +typedef char int8; +typedef char uint8; +typedef short int16; +typedef unsigned short uint16; +typedef int int32; +typedef unsigned int uint32; +typedef long long int64; +typedef unsigned long long uint64; +typedef float float32; +typedef double float64; +//typedef double double float128; +#define float128 double double + +#define TRUE 1 +#define FALSE 0 + +#endif /* _BLIS_PRIMITIVE_DATA_TYPES_H */ + diff -r bc4cb994f114 -r eaf7e4c58c9e __brch__Common_ancestor --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/__brch__Common_ancestor Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,33 @@ +A HW branch for: + +generic MultiCore machines with x86 64bit instruction set + +This branch shouldn't be used, except as a lazy fall-back. Instead, try out other branches tuned to specific hardware platforms to find the one that performs best on your machine. Use the "exe_time_vs_task_size" project to generate curves of overhead, and compare results from various branches. + +Note, if this branch is used, then NUM_CORES in the VMS_defs__HW_specific.h file has to be updated with the number of cores in your machine + +======== Background on branch naming ========= + +There are two kinds of branches: ones used to develop features, and ones tuned to particular hardware. A given HW branch may combine features from several feature-branches, picking and choosing among them. 
+ +After Feb 2012, branches are named by the scheme: + +feat____ + +HW__ + +where and follow the pattern: + + x __ + +Examples: + +feat__exp_array_malloc + +feat__rand_backoff__4x10_Intel_WestmereEx + +HW__1x4_Intel_SandyBridge + +HW__4x10_Intel_WestmereEx + +HW__1x4_AMD_mobile diff -r bc4cb994f114 -r eaf7e4c58c9e __brch__DEPRECATED_README --- a/__brch__DEPRECATED_README Mon Feb 13 13:34:13 2012 -0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ -*DEPRECATED* as of Feb 2012, this branch should not be used. Too many variations of VMS for MC_shared exist. - -Instead, choose a branch that has the best implementation for the machine being run on. For example, single-socket with 2 cores, or with 4 cores, or with 8 cores all have their own branches with code tuned to that number of cores. AMD processors require different low-level tweaking than Intel, and so on. - -============== Background on Branch Naming ============ - -There are two kinds of branchs: ones used to develop features, and ones tuned to particular hardware. A given HW branch may combine features from several feature-branches, picking and choosing among them. - -Legacy branches, from before Feb 2012 have random names. 
After Feb 2012, they're named by the scheme: - -feat____ - -HW__ - -where and follow the pattern: - - x __ - -Examples: - -feat__exp_array_malloc__generic_MC - -feat__rand_backoff__4x10_WestmereEx - -HW__1x4_SandyBridge - -HW__4x10_WestmereEx - -HW__1x4_AMD_mobile \ No newline at end of file diff -r bc4cb994f114 -r eaf7e4c58c9e probes.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/probes.c Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,339 @@ +/* + * Copyright 2010 OpenSourceStewardshipFoundation + * + * Licensed under BSD + */ + +#include +#include +#include + +#include "VMS.h" + + + +//==================== Probes ================= +#ifdef STATS__USE_TSC_PROBES + +int32 +VMS__create_histogram_probe( int32 numBins, float32 startValue, + float32 binWidth, char *nameStr ) + { IntervalProbe *newProbe; + int32 idx; + FloatHist *hist; + + idx = VMS__create_single_interval_probe( nameStr ); + newProbe = _VMSMasterEnv->intervalProbes[ idx ]; + + hist = makeFloatHistogram( numBins, startValue, binWidth ); + newProbe->hist = hist; + return idx; + } + +void +VMS_impl__record_interval_start_in_probe( int32 probeID ) + { IntervalProbe *probe; + + probe = _VMSMasterEnv->intervalProbes[ probeID ]; + probe->startStamp = getTSCount(); + } + +void +VMS_impl__record_interval_end_in_probe( int32 probeID ) + { IntervalProbe *probe; + TSCount endStamp; + + endStamp = getTSCount(); + + probe = _VMSMasterEnv->intervalProbes[ probeID ]; + probe->endStamp = endStamp; + + if( probe->hist != NULL ) + { TSCount interval = probe->endStamp - probe->startStamp; + //if the interval is sane, then add to histogram + if( interval < probe->hist->endOfRange * 10 ) + addToFloatHist( interval, probe->hist ); + } + } + +void +VMS_impl__print_stats_of_probe( int32 probeID ) + { IntervalProbe *probe; + + probe = _VMSMasterEnv->intervalProbes[ probeID ]; + + if( probe->hist == NULL ) + { + printf("probe: %s, interval: %.6lf\n", probe->nameStr,probe->interval); + } + + else + { + printf( "probe: %s\n", 
probe->nameStr ); + printFloatHist( probe->hist ); + } + } +#else + +/* + * In practice, probe operations are called from the app, from inside slaves + * -- so have to be sure each probe is single-VP owned, and be sure that + * any place common structures are modified it's done inside the master. + * So -- the only place common structures are modified is during creation. + * after that, all mods are to individual instances. + * + * Thinking perhaps should change the semantics to be that probes are + * attached to the virtual processor -- and then everything is guaranteed + * to be isolated -- except then can't take any intervals that span VPs, + * and would have to transfer the probes to Master env when VP dissipates.. + * gets messy.. + * + * For now, just making so that probe creation causes a suspend, so that + * the dynamic array in the master env is only modified from the master + * + */ +IntervalProbe * +create_generic_probe( char *nameStr, SlaveVP *animPr ) +{ + VMSSemReq reqData; + + reqData.reqType = createProbe; + reqData.nameStr = nameStr; + + VMS_WL__send_VMSSem_request( &reqData, animPr ); + + return animPr->dataRetFromReq; + } + +/*Use this version from outside VMS -- it uses external malloc, and modifies + * dynamic array, so can't be animated in a slave VP + */ +IntervalProbe * +ext__create_generic_probe( char *nameStr ) + { IntervalProbe *newProbe; + int32 nameLen; + + newProbe = malloc( sizeof(IntervalProbe) ); + nameLen = strlen( nameStr ); //length without the terminating NUL + newProbe->nameStr = malloc( nameLen + 1 ); //+1: room for the NUL + memcpy( newProbe->nameStr, nameStr, nameLen + 1 ); //copy NUL too, name is printed with %s + newProbe->hist = NULL; + newProbe->schedChoiceWasRecorded = FALSE; + newProbe->probeID = + addToDynArray( newProbe, _VMSMasterEnv->dynIntervalProbesInfo ); + + return newProbe; + } + + +/*Only call from inside master or main startup/shutdown thread + */ +void +VMS_impl__free_probe( IntervalProbe *probe ) + { if( probe->hist != NULL ) freeDblHist( probe->hist ); + if( probe->nameStr != NULL) VMS_int__free( probe->nameStr 
); + VMS_int__free( probe ); + } + + +int32 +VMS_impl__record_time_point_into_new_probe( char *nameStr, SlaveVP *animPr) + { IntervalProbe *newProbe; + struct timeval *startStamp; + float64 startSecs; + + newProbe = create_generic_probe( nameStr, animPr ); + newProbe->endSecs = 0; + + gettimeofday( &(newProbe->startStamp), NULL); + + //turn into a double + startStamp = &(newProbe->startStamp); + startSecs = startStamp->tv_sec + ( startStamp->tv_usec / 1000000.0 ); + newProbe->startSecs = startSecs; + + return newProbe->probeID; + } + +int32 +VMS_ext_impl__record_time_point_into_new_probe( char *nameStr ) + { IntervalProbe *newProbe; + struct timeval *startStamp; + float64 startSecs; + + newProbe = ext__create_generic_probe( nameStr ); + newProbe->endSecs = 0; + + gettimeofday( &(newProbe->startStamp), NULL); + + //turn into a double + startStamp = &(newProbe->startStamp); + startSecs = startStamp->tv_sec + ( startStamp->tv_usec / 1000000.0 ); + newProbe->startSecs = startSecs; + + return newProbe->probeID; + } + +int32 +VMS_impl__create_single_interval_probe( char *nameStr, SlaveVP *animPr ) + { IntervalProbe *newProbe; + + newProbe = create_generic_probe( nameStr, animPr ); + + return newProbe->probeID; + } + +int32 +VMS_impl__create_histogram_probe( int32 numBins, float64 startValue, + float64 binWidth, char *nameStr, SlaveVP *animPr ) + { IntervalProbe *newProbe; + DblHist *hist; + + newProbe = create_generic_probe( nameStr, animPr ); + + hist = makeDblHistogram( numBins, startValue, binWidth ); + newProbe->hist = hist; + return newProbe->probeID; + } + +void +VMS_impl__index_probe_by_its_name( int32 probeID, SlaveVP *animPr ) + { IntervalProbe *probe; + + //TODO: fix this To be in Master -- race condition + probe = _VMSMasterEnv->intervalProbes[ probeID ]; + + addValueIntoTable(probe->nameStr, probe, _VMSMasterEnv->probeNameHashTbl); + } + +IntervalProbe * +VMS_impl__get_probe_by_name( char *probeName, SlaveVP *animPr ) + { + //TODO: fix this To be in Master -- 
race condition + return getValueFromTable( probeName, _VMSMasterEnv->probeNameHashTbl ); + } + + +/*Everything is local to the animating procr, so no need for request, do + * work locally, in the anim Pr + */ +void +VMS_impl__record_sched_choice_into_probe( int32 probeID, SlaveVP *animatingPr ) + { IntervalProbe *probe; + + probe = _VMSMasterEnv->intervalProbes[ probeID ]; + probe->schedChoiceWasRecorded = TRUE; + probe->coreNum = animatingPr->coreAnimatedBy; + probe->procrID = animatingPr->procrID; + probe->procrCreateSecs = animatingPr->createPtInSecs; + } + +/*Everything is local to the animating procr, so no need for request, do + * work locally, in the anim Pr + */ +void +VMS_impl__record_interval_start_in_probe( int32 probeID ) + { IntervalProbe *probe; + + DEBUG( dbgProbes, "record start of interval\n" ) + probe = _VMSMasterEnv->intervalProbes[ probeID ]; + gettimeofday( &(probe->startStamp), NULL ); + } + + +/*Everything is local to the animating procr, so no need for request, do + * work locally, in the anim Pr + */ +void +VMS_impl__record_interval_end_in_probe( int32 probeID ) + { IntervalProbe *probe; + struct timeval *endStamp, *startStamp; + float64 startSecs, endSecs; + + DEBUG( dbgProbes, "record end of interval\n" ) + //possible seg-fault if array resized by diff core right after this + // one gets probe..? Something like that? Might be safe.. 
don't care + probe = _VMSMasterEnv->intervalProbes[ probeID ]; + gettimeofday( &(probe->endStamp), NULL); + + //now turn into an interval held in a double + startStamp = &(probe->startStamp); + endStamp = &(probe->endStamp); + + startSecs = startStamp->tv_sec + ( startStamp->tv_usec / 1000000.0 ); + endSecs = endStamp->tv_sec + ( endStamp->tv_usec / 1000000.0 ); + + probe->interval = endSecs - startSecs; + probe->startSecs = startSecs; + probe->endSecs = endSecs; + + if( probe->hist != NULL ) + { + //if the interval is sane, then add to histogram + if( probe->interval < probe->hist->endOfRange * 10 ) + addToDblHist( probe->interval, probe->hist ); + } + } + +void +print_probe_helper( IntervalProbe *probe ) + { + printf( "\nprobe: %s, ", probe->nameStr ); + + + if( probe->schedChoiceWasRecorded ) + { printf( "coreNum: %d, procrID: %d, procrCreated: %0.6f | ", + probe->coreNum, probe->procrID, probe->procrCreateSecs ); + } + + if( probe->endSecs == 0 ) //just a single point in time + { + printf( " time point: %.6f\n", + probe->startSecs - _VMSMasterEnv->createPtInSecs ); + } + else if( probe->hist == NULL ) //just an interval + { + printf( " startSecs: %.6f interval: %.6f\n", + (probe->startSecs - _VMSMasterEnv->createPtInSecs), probe->interval); + } + else //a full histogram of intervals + { + printDblHist( probe->hist ); + } + } + +//TODO: change so pass around pointer to probe instead of its array-index.. +// will eliminate chance for timing of resize to cause problems with the +// lookup -- even though don't think it actually can cause problems.. +// there's no need to pass index around -- have hash table for names, and +// only need it once, then have ptr to probe.. the thing about enum the +// index and use that as name is clunky in practice -- just hash. 
+void +VMS_impl__print_stats_of_probe( int32 probeID ) + { IntervalProbe *probe; + + probe = _VMSMasterEnv->intervalProbes[ probeID ]; + + print_probe_helper( probe ); + } + + +inline void doNothing(){}; + +void +generic_print_probe( void *_probe ) + { + IntervalProbe *probe = (IntervalProbe *)_probe; + + //TODO segfault in printf + //print_probe_helper( probe ); + } + +void +VMS_impl__print_stats_of_all_probes() + { + forAllInDynArrayDo( _VMSMasterEnv->dynIntervalProbesInfo, + &generic_print_probe ); + fflush( stdout ); + } +#endif diff -r bc4cb994f114 -r eaf7e4c58c9e probes.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/probes.h Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,182 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#ifndef _PROBES_H +#define _PROBES_H +#define _GNU_SOURCE + +#include "VMS_primitive_data_types.h" + +#include + +/*Note on order of include files: + * This file relies on #defines that appear in other files.. 
+ */ + + +//typedef struct _IntervalProbe IntervalProbe; //in VMS.h + +struct _IntervalProbe + { + char *nameStr; + int32 probeID; + + int32 schedChoiceWasRecorded; + int32 coreNum; + int32 procrID; + float64 procrCreateSecs; + + #ifdef STATS__USE_TSC_PROBES + TSCount startStamp; + TSCount endStamp; + #else + struct timeval startStamp; + struct timeval endStamp; + #endif + float64 startSecs; + float64 endSecs; + float64 interval; + DblHist *hist;//if NULL, then is single interval probe + }; + + + +//======================== Probes ============================= +// +// Use macros to allow turning probes off with a #define switch +#ifdef STATS__ENABLE_PROBES +int32 +VMS_impl__record_time_point_into_new_probe( char *nameStr,SlaveVP *animPr); +#define VMS__record_time_point_into_new_probe( nameStr, animPr ) \ + VMS_impl__record_time_point_into_new_probe( nameStr, animPr ) + +int32 +VMS_ext_impl__record_time_point_into_new_probe( char *nameStr ); +#define VMS_ext__record_time_point_into_new_probe( nameStr ) \ + VMS_ext_impl__record_time_point_into_new_probe( nameStr ) + + +int32 +VMS_impl__create_single_interval_probe( char *nameStr, SlaveVP *animPr ); +#define VMS__create_single_interval_probe( nameStr, animPr ) \ + VMS_impl__create_single_interval_probe( nameStr, animPr ) + + +int32 +VMS_impl__create_histogram_probe( int32 numBins, float64 startValue, + float64 binWidth, char *nameStr, SlaveVP *animPr ); +#define VMS__create_histogram_probe( numBins, startValue, \ + binWidth, nameStr, animPr ) \ + VMS_impl__create_histogram_probe( numBins, startValue, \ + binWidth, nameStr, animPr ) +void +VMS_impl__free_probe( IntervalProbe *probe ); +#define VMS__free_probe( probe ) \ + VMS_impl__free_probe( probe ) + +void +VMS_impl__index_probe_by_its_name( int32 probeID, SlaveVP *animPr ); +#define VMS__index_probe_by_its_name( probeID, animPr ) \ + VMS_impl__index_probe_by_its_name( probeID, animPr ) + +IntervalProbe * +VMS_impl__get_probe_by_name( char *probeName, SlaveVP *animPr 
); +#define VMS__get_probe_by_name( probeName, animPr ) \ + VMS_impl__get_probe_by_name( probeName, animPr ) + +void +VMS_impl__record_sched_choice_into_probe( int32 probeID, SlaveVP *animPr ); +#define VMS__record_sched_choice_into_probe( probeID, animPr ) \ + VMS_impl__record_sched_choice_into_probe( probeID, animPr ) + +void +VMS_impl__record_interval_start_in_probe( int32 probeID ); +#define VMS__record_interval_start_in_probe( probeID ) \ + VMS_impl__record_interval_start_in_probe( probeID ) + +void +VMS_impl__record_interval_end_in_probe( int32 probeID ); +#define VMS__record_interval_end_in_probe( probeID ) \ + VMS_impl__record_interval_end_in_probe( probeID ) + +void +VMS_impl__print_stats_of_probe( int32 probeID ); +#define VMS__print_stats_of_probe( probeID ) \ + VMS_impl__print_stats_of_probe( probeID ) + +void +VMS_impl__print_stats_of_all_probes(); +#define VMS__print_stats_of_all_probes() \ + VMS_impl__print_stats_of_all_probes() + + +#else +int32 +VMS_impl__record_time_point_into_new_probe( char *nameStr,SlaveVP *animPr); +#define VMS__record_time_point_into_new_probe( nameStr, animPr ) \ + 0 /* do nothing */ + +int32 +VMS_ext_impl__record_time_point_into_new_probe( char *nameStr ); +#define VMS_ext__record_time_point_into_new_probe( nameStr ) \ + 0 /* do nothing */ + + +int32 +VMS_impl__create_single_interval_probe( char *nameStr, SlaveVP *animPr ); +#define VMS__create_single_interval_probe( nameStr, animPr ) \ + 0 /* do nothing */ + + +int32 +VMS_impl__create_histogram_probe( int32 numBins, float64 startValue, + float64 binWidth, char *nameStr, SlaveVP *animPr ); +#define VMS__create_histogram_probe( numBins, startValue, \ + binWidth, nameStr, animPr ) \ + 0 /* do nothing */ + +void +VMS_impl__index_probe_by_its_name( int32 probeID, SlaveVP *animPr ); +#define VMS__index_probe_by_its_name( probeID, animPr ) \ + /* do nothing */ + +IntervalProbe * +VMS_impl__get_probe_by_name( char *probeName, SlaveVP *animPr ); +#define VMS__get_probe_by_name( 
probeID, animPr ) \ + NULL /* do nothing */ + +void +VMS_impl__record_sched_choice_into_probe( int32 probeID, SlaveVP *animPr ); +#define VMS__record_sched_choice_into_probe( probeID, animPr ) \ + /* do nothing */ + +void +VMS_impl__record_interval_start_in_probe( int32 probeID ); +#define VMS__record_interval_start_in_probe( probeID ) \ + /* do nothing */ + +void +VMS_impl__record_interval_end_in_probe( int32 probeID ); +#define VMS__record_interval_end_in_probe( probeID ) \ + /* do nothing */ + +inline void doNothing(); +void +VMS_impl__print_stats_of_probe( int32 probeID ); +#define VMS__print_stats_of_probe( probeID ) \ + doNothing/* do nothing */ + +void +VMS_impl__print_stats_of_all_probes(); +#define VMS__print_stats_of_all_probes \ + doNothing/* do nothing */ + +#endif /* defined STATS__ENABLE_PROBES */ + +#endif /* _PROBES_H */ + diff -r bc4cb994f114 -r eaf7e4c58c9e vmalloc.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vmalloc.c Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,494 @@ +/* + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + * Created on November 14, 2009, 9:07 PM + */ + +#include +#include +#include +#include + +#include "VMS.h" +#include "C_Libraries/Histogram/Histogram.h" + +/*Helper function + *Insert a newly generated free chunk into the first spot on the free list. + * The chunk is cast as a MallocProlog, so the various pointers in it are + * accessed with C's help -- and the size of the prolog is easily added to + * the pointer when a chunk is returned to the app -- so C handles changes + * in pointer sizes among machines. + * + *The list head is a normal MallocProlog struct -- identified by its + * prevChunkInFreeList being NULL -- the only one. + * + *The end of the list is identified by next chunk being NULL, as usual. 
+ *
+ *After the call, chunk sits directly behind listHead, and whatever used to
+ * be first in the list (if anything) follows chunk.
+ */
+void inline
+add_chunk_to_free_list( MallocProlog *chunk, MallocProlog *listHead )
+ {
+   chunk->nextChunkInFreeList = listHead->nextChunkInFreeList;
+   if( chunk->nextChunkInFreeList != NULL ) //if not last in free list
+      chunk->nextChunkInFreeList->prevChunkInFreeList = chunk;
+   chunk->prevChunkInFreeList = listHead;
+   listHead->nextChunkInFreeList = chunk;
+ }
+
+
+/*This is sequential code, meant to only be called from the Master, not from
+ * any slave VPs.
+ *Search down list, checking size by the nextHigherInMem pointer, to find
+ * first chunk bigger than size needed.
+ *Shave off the extra and make it into a new free-list element, hook it in
+ * then return the address of the found element plus size of prolog.
+ *
+ *The request is stepped up with (sizeRequested + 16) & ~15, so the size
+ * actually searched for is a multiple of 16 that is larger than the request.
+ *A chunk qualifies only when it is larger than the stepped-up size plus one
+ * MallocProlog.  It is split only when more than 64 bytes would be left
+ * over; otherwise the whole chunk is handed out.
+ *Returns NULL, after invoking ERROR, when no chunk in the list qualifies.
+ *A chunk that is handed out has prevChunkInFreeList set to NULL -- that is
+ * how the rest of this file tells allocated chunks from free ones.
+ */
+void *VMS_int__malloc( size_t sizeRequested )
+ { MallocProlog *foundElem = NULL, *currElem, *newElem;
+   ssize_t        amountExtra, sizeConsumed,sizeOfFound;
+   uint32         foundElemIsTopOfHeap;
+
+   //============================= MEASUREMENT STUFF ========================
+   #ifdef MEAS__TIME_MALLOC
+   int32 startStamp, endStamp;
+   saveLowTimeStampCountInto( startStamp );
+   #endif
+   //========================================================================
+
+   //step up the size to be aligned at 16-byte boundary, prob better ways
+   sizeRequested = (sizeRequested + 16) & ~15;
+   currElem = (_VMSMasterEnv->freeListHead)->nextChunkInFreeList;
+
+   while( currElem != NULL )
+    {    //check if size of currElem is big enough
+      sizeOfFound=(size_t)((uintptr_t)currElem->nextHigherInMem -(uintptr_t)currElem);
+      amountExtra = sizeOfFound - sizeRequested - sizeof(MallocProlog);
+      if( amountExtra > 0 )
+       {    //found it, get out of loop
+         foundElem = currElem;
+         currElem = NULL;
+       }
+      else
+         currElem = currElem->nextChunkInFreeList;
+    }
+
+   if( foundElem == NULL )
+    { ERROR("\nmalloc failed\n")
+      return (void *)NULL;  //indicates malloc failed
+    }
+      //Using a kludge to identify the element that is the top chunk in the
+      // heap -- saving top-of-heap addr in head's nextHigherInMem -- and
+      // save addr of start of heap in head's nextLowerInMem
+      //Will handle top of Heap specially
+   foundElemIsTopOfHeap = foundElem->nextHigherInMem ==
+                          _VMSMasterEnv->freeListHead->nextHigherInMem;
+
+      //before shave off and try to insert new elem, remove found elem
+      //note, foundElem will never be the head, so always has valid prevChunk
+   foundElem->prevChunkInFreeList->nextChunkInFreeList =
+                                              foundElem->nextChunkInFreeList;
+   if( foundElem->nextChunkInFreeList != NULL )
+    { foundElem->nextChunkInFreeList->prevChunkInFreeList =
+                                              foundElem->prevChunkInFreeList;
+    }
+   foundElem->prevChunkInFreeList = NULL;//indicates elem currently allocated
+
+      //if enough, turn extra into new elem & insert it
+   if( amountExtra > 64 )
+    {    //make new elem by adding to addr of curr elem then casting
+      sizeConsumed = sizeof(MallocProlog) + sizeRequested;
+      newElem = (MallocProlog *)( (uintptr_t)foundElem + sizeConsumed );
+      newElem->nextLowerInMem    = foundElem; //This is evil (but why?)
+      newElem->nextHigherInMem   = foundElem->nextHigherInMem; //This is evil (but why?)
+      foundElem->nextHigherInMem = newElem;
+      if( ! foundElemIsTopOfHeap )
+       {    //there is no next higher for top of heap, so can't write to it
+         newElem->nextHigherInMem->nextLowerInMem = newElem;
+       }
+      add_chunk_to_free_list( newElem, _VMSMasterEnv->freeListHead );
+    }
+   else
+    {
+      sizeConsumed = sizeOfFound; //not split, so the whole chunk is charged
+    }
+   _VMSMasterEnv->amtOfOutstandingMem += sizeConsumed;
+
+   //============================= MEASUREMENT STUFF ========================
+   #ifdef MEAS__TIME_MALLOC
+   saveLowTimeStampCountInto( endStamp );
+   addIntervalToHist( startStamp, endStamp, _VMSMasterEnv->mallocTimeHist );
+   #endif
+   //========================================================================
+
+      //skip over the prolog by adding its size to the pointer return
+   return (void*)((uintptr_t)foundElem + sizeof(MallocProlog));
+ }
+
+/*This is sequential code, meant to only be called from the Master, not from
+ * any slave VPs.
+ *Search down list, checking size by the nextHigherInMem pointer, to find
+ * first chunk bigger than size needed.
+ *Shave off the extra and make it into a new free-list element, hook it in
+ * then return the address of the found element plus size of prolog.
+ *
+ * The difference to the regular malloc is, that all the allocated chunks are
+ * aligned and padded to the size of a CACHE_LINE_SZ. Thus creating a new chunk
+ * before the aligned chunk.
+ *
+ *A free chunk whose payload address (chunk address plus prolog size) is
+ * already a multiple of CACHE_LINE_SZ is used as it is.  Otherwise the chunk
+ * is cut in two at an aligned address further up: the lower part stays in
+ * the free list and the upper part, whose payload starts at that aligned
+ * address, is the one handed out.  The cut is made only when the lower part
+ * is bigger than two prologs and the upper part still fits the request.
+ *Returns NULL, after invoking ERROR, when no chunk in the list qualifies.
+ */
+void *VMS_int__malloc_aligned( size_t sizeRequested )
+ { MallocProlog *foundElem = NULL, *currElem, *newElem;
+   ssize_t        amountExtra, sizeConsumed,sizeOfFound,prevAmount;
+   uint32         foundElemIsTopOfHeap;
+
+   //============================= MEASUREMENT STUFF ========================
+   #ifdef MEAS__TIME_MALLOC
+   uint32 startStamp, endStamp;
+   saveLowTimeStampCountInto( startStamp );
+   #endif
+   //========================================================================
+
+   //step up the size to be multiple of the cache line size
+   sizeRequested = (sizeRequested + CACHE_LINE_SZ) & ~(CACHE_LINE_SZ-1);
+   currElem = (_VMSMasterEnv->freeListHead)->nextChunkInFreeList;
+
+   while( currElem != NULL )
+    {    //check if size of currElem is big enough
+      sizeOfFound=(size_t)((uintptr_t)currElem->nextHigherInMem -(uintptr_t)currElem);
+      amountExtra = sizeOfFound - sizeRequested - sizeof(MallocProlog);
+      if( amountExtra > 0 )
+       {
+         //look if the found element is already aligned
+         if((((uintptr_t)currElem+sizeof(MallocProlog)) & (uintptr_t)(CACHE_LINE_SZ-1)) == 0){
+            //found it, get out of loop
+            foundElem = currElem;
+            break;
+         }else{
+            //find first aligned address and check if it's still big enough
+            //check also if the space before the aligned address is big enough
+            //for a new element
+            void *firstAlignedAddr = (void*)(((uintptr_t)currElem + 2*CACHE_LINE_SZ) & ~((uintptr_t)(CACHE_LINE_SZ-1)));
+            prevAmount = (uintptr_t)firstAlignedAddr - (uintptr_t)currElem;
+            sizeOfFound=(uintptr_t)currElem->nextHigherInMem -(uintptr_t)firstAlignedAddr + sizeof(MallocProlog); //size of the upper part, its prolog included
+            amountExtra= sizeOfFound - sizeRequested - sizeof(MallocProlog);
+            if(prevAmount > 2*sizeof(MallocProlog) && amountExtra > 0 ){
+               //found suitable element
+               //create new previous element and exit loop
+               MallocProlog *newAlignedElem = (MallocProlog*)firstAlignedAddr - 1; //prolog sits right below the aligned payload
+
+               //insert new element into free list
+               if(currElem->nextChunkInFreeList != NULL)
+                  currElem->nextChunkInFreeList->prevChunkInFreeList = newAlignedElem;
+               newAlignedElem->prevChunkInFreeList = currElem;
+               newAlignedElem->nextChunkInFreeList = currElem->nextChunkInFreeList;
+               currElem->nextChunkInFreeList = newAlignedElem;
+
+               //set higherInMem and lowerInMem
+               newAlignedElem->nextHigherInMem = currElem->nextHigherInMem;
+               foundElemIsTopOfHeap = currElem->nextHigherInMem ==
+                                      _VMSMasterEnv->freeListHead->nextHigherInMem;
+               if(!foundElemIsTopOfHeap)
+                  currElem->nextHigherInMem->nextLowerInMem = newAlignedElem;
+               currElem->nextHigherInMem = newAlignedElem;
+               newAlignedElem->nextLowerInMem = currElem;
+
+               //Found new element leaving loop
+               foundElem = newAlignedElem;
+               break;
+            }
+         }
+
+       }
+       currElem = currElem->nextChunkInFreeList;
+    }
+
+   if( foundElem == NULL )
+    { ERROR("\nmalloc failed\n")
+      return (void *)NULL;  //indicates malloc failed
+    }
+      //Using a kludge to identify the element that is the top chunk in the
+      // heap -- saving top-of-heap addr in head's nextHigherInMem -- and
+      // save addr of start of heap in head's nextLowerInMem
+      //Will handle top of Heap specially
+   foundElemIsTopOfHeap = foundElem->nextHigherInMem ==
+                          _VMSMasterEnv->freeListHead->nextHigherInMem;
+
+      //before shave off and try to insert new elem, remove found elem
+      //note, foundElem will never be the head, so always has valid prevChunk
+   foundElem->prevChunkInFreeList->nextChunkInFreeList =
+                                              foundElem->nextChunkInFreeList;
+   if( foundElem->nextChunkInFreeList != NULL )
+    { foundElem->nextChunkInFreeList->prevChunkInFreeList =
+                                              foundElem->prevChunkInFreeList;
+    }
+   foundElem->prevChunkInFreeList = NULL;//indicates elem currently allocated
+
+      //if enough, turn extra into new elem & insert it
+   if( amountExtra > 64 )
+    {    //make new elem by adding to addr of curr elem then casting
+      sizeConsumed = sizeof(MallocProlog) + sizeRequested;
+      newElem = (MallocProlog *)( (uintptr_t)foundElem + sizeConsumed );
+      newElem->nextHigherInMem   = foundElem->nextHigherInMem;
+      newElem->nextLowerInMem    = foundElem;
+      foundElem->nextHigherInMem = newElem;
+
+      if( ! foundElemIsTopOfHeap )
+       {    //there is no next higher for top of heap, so can't write to it
+         newElem->nextHigherInMem->nextLowerInMem = newElem;
+       }
+      add_chunk_to_free_list( newElem, _VMSMasterEnv->freeListHead );
+    }
+   else
+    {
+      sizeConsumed = sizeOfFound; //not split, so the whole chunk is charged
+    }
+   _VMSMasterEnv->amtOfOutstandingMem += sizeConsumed;
+
+   //============================= MEASUREMENT STUFF ========================
+   #ifdef MEAS__TIME_MALLOC
+   saveLowTimeStampCountInto( endStamp );
+   addIntervalToHist( startStamp, endStamp, _VMSMasterEnv->mallocTimeHist );
+   #endif
+   //========================================================================
+
+      //skip over the prolog by adding its size to the pointer return
+   return (void*)((uintptr_t)foundElem + sizeof(MallocProlog));
+ }
+
+
+/*This is sequential code -- only to be called from the Master
+ * When free, subtract the size of prolog from pointer, then cast it to a
+ * MallocProlog.  Then check the nextLower and nextHigher chunks to see if
+ * one or both are also free, and coalesce if so, and if neither free, then
+ * add this one to free-list.
+ */ +void +VMS_int__free( void *ptrToFree ) + { MallocProlog *elemToFree, *nextLowerElem, *nextHigherElem; + size_t sizeOfElem; + uint32 lowerExistsAndIsFree, higherExistsAndIsFree; + + //============================= MEASUREMENT STUFF ======================== + #ifdef MEAS__TIME_MALLOC + int32 startStamp, endStamp; + saveLowTimeStampCountInto( startStamp ); + #endif + //======================================================================== + + if( ptrToFree < (void*)_VMSMasterEnv->freeListHead->nextLowerInMem || + ptrToFree > (void*)_VMSMasterEnv->freeListHead->nextHigherInMem ) + { //outside the range of data owned by VMS's malloc, so do nothing + return; + } + //subtract size of prolog to get pointer to prolog, then cast + elemToFree = (MallocProlog *)((uintptr_t)ptrToFree - sizeof(MallocProlog)); + sizeOfElem =(size_t)((uintptr_t)elemToFree->nextHigherInMem-(uintptr_t)elemToFree); + + if( elemToFree->prevChunkInFreeList != NULL ) + { printf( "error: freeing same element twice!" ); exit(1); + } + + _VMSMasterEnv->amtOfOutstandingMem -= sizeOfElem; + + nextLowerElem = elemToFree->nextLowerInMem; + nextHigherElem = elemToFree->nextHigherInMem; + + if( nextHigherElem == NULL ) + higherExistsAndIsFree = FALSE; + else //okay exists, now check if in the free-list by checking back ptr + higherExistsAndIsFree = (nextHigherElem->prevChunkInFreeList != NULL); + + if( nextLowerElem == NULL ) + lowerExistsAndIsFree = FALSE; + else //okay, it exists, now check if it's free + lowerExistsAndIsFree = (nextLowerElem->prevChunkInFreeList != NULL); + + + //now, know what exists and what's free + if( lowerExistsAndIsFree ) + { if( higherExistsAndIsFree ) + { //both exist and are free, so coalesce all three + //First, remove higher from free-list + nextHigherElem->prevChunkInFreeList->nextChunkInFreeList = + nextHigherElem->nextChunkInFreeList; + if( nextHigherElem->nextChunkInFreeList != NULL ) //end-of-list? 
+ nextHigherElem->nextChunkInFreeList->prevChunkInFreeList = + nextHigherElem->prevChunkInFreeList; + //Now, fix-up sequence-in-mem list -- by side-effect, this also + // changes size of the lower elem, which is still in free-list + nextLowerElem->nextHigherInMem = nextHigherElem->nextHigherInMem; + if( nextHigherElem->nextHigherInMem != + _VMSMasterEnv->freeListHead->nextHigherInMem ) + nextHigherElem->nextHigherInMem->nextLowerInMem = nextLowerElem; + //notice didn't do anything to elemToFree -- it simply is no + // longer reachable from any of the lists. Wonder if could be a + // security leak because left valid addresses in it, + // but don't care for now. + } + else + { //lower is the only of the two that exists and is free, + //In this case, no adjustment to free-list, just change mem-list. + // By side-effect, changes size of the lower elem + nextLowerElem->nextHigherInMem = elemToFree->nextHigherInMem; + if( elemToFree->nextHigherInMem != + _VMSMasterEnv->freeListHead->nextHigherInMem ) + elemToFree->nextHigherInMem->nextLowerInMem = nextLowerElem; + } + } + else + { //lower either doesn't exist or isn't free, so check higher + if( higherExistsAndIsFree ) + { //higher exists and is the only of the two free + //First, in free-list, replace higher elem with the one to free + elemToFree->nextChunkInFreeList=nextHigherElem->nextChunkInFreeList; + elemToFree->prevChunkInFreeList=nextHigherElem->prevChunkInFreeList; + elemToFree->prevChunkInFreeList->nextChunkInFreeList = elemToFree; + if( elemToFree->nextChunkInFreeList != NULL ) // end-of-list? + elemToFree->nextChunkInFreeList->prevChunkInFreeList =elemToFree; + //Now chg mem-list. 
By side-effect, changes size of elemToFree + elemToFree->nextHigherInMem = nextHigherElem->nextHigherInMem; + if( elemToFree->nextHigherInMem != + _VMSMasterEnv->freeListHead->nextHigherInMem ) + elemToFree->nextHigherInMem->nextLowerInMem = elemToFree; + } + else + { //neither lower nor higher is availabe to coalesce so add to list + // this makes prev chunk ptr non-null, which indicates it's free + elemToFree->nextChunkInFreeList = + _VMSMasterEnv->freeListHead->nextChunkInFreeList; + _VMSMasterEnv->freeListHead->nextChunkInFreeList = elemToFree; + if( elemToFree->nextChunkInFreeList != NULL ) // end-of-list? + elemToFree->nextChunkInFreeList->prevChunkInFreeList =elemToFree; + elemToFree->prevChunkInFreeList = _VMSMasterEnv->freeListHead; + } + } + //============================= MEASUREMENT STUFF ======================== + #ifdef MEAS__TIME_MALLOC + saveLowTimeStampCountInto( endStamp ); + addIntervalToHist( startStamp, endStamp, _VMSMasterEnv->freeTimeHist ); + #endif + //======================================================================== + + } + + +/*Allocates memory from the external system -- higher overhead + * + *Because of Linux's malloc throwing bizarre random faults when malloc is + * used inside a VMS virtual processor, have to pass this as a request and + * have the core loop do it when it gets around to it -- will look for these + * chores leftover from the previous animation of masterVP the next time it + * goes to animate the masterVP -- so it takes two separate masterVP + * animations, separated by work, to complete an external malloc or + * external free request. + * + *Thinking core loop accepts signals -- just looks if signal-location is + * empty or not -- + */ +void * +VMS__malloc_in_ext( size_t sizeRequested ) + { + /* + //This is running in the master, so no chance for multiple cores to be + // competing for the core's flag. 
+ if( *(_VMSMasterEnv->coreLoopSignalAddr[ 0 ]) != 0 ) + { //something has already signalled to core loop, so save the signal + // and look, next time master animated, to see if can send it. + //Note, the addr to put a signal is in the coreloop's frame, so just + // checks it each time through -- make it volatile to avoid GCC + // optimizations -- it's a coreloop local var that only changes + // after jumping away. The signal includes the addr to send the + //return to -- even if just empty return completion-signal + // + //save the signal in some queue that the master looks at each time + // it starts up -- one loc says if empty for fast common case -- + //something like that -- want to hide this inside this call -- but + // think this has to come as a request -- req handler gives procr + // back to master loop, which gives it back to req handler at point + // it sees that core loop has sent return signal. Something like + // that. + saveTheSignal + + } + coreSigData->type = malloc; + coreSigData->sizeToMalloc = sizeRequested; + coreSigData->locToSignalCompletion = &figureOut; + _VMSMasterEnv->coreLoopSignals[ 0 ] = coreSigData; + */ + //just risk system-stack faults until get this figured out + return malloc( sizeRequested ); + } + + +/*Frees memory that was allocated in the external system -- higher overhead + * + *As noted in external malloc comment, this is clunky 'cause the free has + * to be called in the core loop. 
+ */ +void +VMS__free_in_ext( void *ptrToFree ) + { + //just risk system-stack faults until get this figured out + free( ptrToFree ); + + //TODO: fix this -- so + } + + +/*Designed to be called from the main thread outside of VMS, during init + */ +MallocProlog * +VMS_ext__create_free_list() + { MallocProlog *freeListHead, *firstChunk; + + //Note, this is running in the main thread -- all increases in malloc + // mem and all frees of it must be done in this thread, with the + // thread's original stack available + freeListHead = malloc( sizeof(MallocProlog) ); + firstChunk = malloc( MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE ); + if( firstChunk == NULL ) {printf("malloc error\n"); exit(1);} + + //Touch memory to avoid page faults + void *ptr,*endPtr; + endPtr = (void*)firstChunk+MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE; + for(ptr = firstChunk; ptr < endPtr; ptr+=PAGE_SIZE) + { + *(char*)ptr = 0; + } + + freeListHead->prevChunkInFreeList = NULL; + //Use this addr to free the heap when cleanup + freeListHead->nextLowerInMem = firstChunk; + //to identify top-of-heap elem, compare this addr to elem's next higher + freeListHead->nextHigherInMem = (void*)( (uintptr_t)firstChunk + + MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE); + freeListHead->nextChunkInFreeList = firstChunk; + + firstChunk->nextChunkInFreeList = NULL; + firstChunk->prevChunkInFreeList = freeListHead; + //next Higher has to be set to top of chunk, so can calc size in malloc + firstChunk->nextHigherInMem = (void*)( (uintptr_t)firstChunk + + MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE); + firstChunk->nextLowerInMem = NULL; //identifies as bott of heap + + _VMSMasterEnv->amtOfOutstandingMem = 0; //none allocated yet + + return freeListHead; + } + + +/*Designed to be called from the main thread outside of VMS, during cleanup + */ +void +VMS_ext__free_free_list( MallocProlog *freeListHead ) + { + //stashed a ptr to the one and only bug chunk malloc'd from OS in the + // free list head's next lower in mem pointer + free( 
freeListHead->nextLowerInMem ); + + //don't free the head -- it'll be in an array eventually -- free whole + // array when all the free lists linked from it have already been freed + } + diff -r bc4cb994f114 -r eaf7e4c58c9e vmalloc.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vmalloc.h Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,90 @@ +/* + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + * Created on November 14, 2009, 9:07 PM + */ + +#ifndef _VMALLOC_H +#define _VMALLOC_H + +#include +#include +#include "VMS_primitive_data_types.h" + +typedef struct _MallocProlog MallocProlog; + +struct _MallocProlog + { + MallocProlog *nextChunkInFreeList; + MallocProlog *prevChunkInFreeList; + MallocProlog *nextHigherInMem; + MallocProlog *nextLowerInMem; + }; +//MallocProlog + +typedef struct + { + MallocProlog *firstChunkInFreeList; + int32 numInList; //TODO not used + } +FreeListHead; + +void * +VMS_int__malloc( size_t sizeRequested ); + +void * +VMS_int__malloc_aligned( size_t sizeRequested ); + +void +VMS_int__free( void *ptrToFree ); + +#define VMS_PI__malloc VMS_int__malloc +#define VMS_PI__malloc_aligned VMS_int__malloc_aligned +#define VMS_PI__free VMS_int__free +/* For now, the PI is protected by master lock, so int malloc fine +void * +VMS_PI__malloc( size_t sizeRequested ); + +void * +VMS_PI__malloc_aligned( size_t sizeRequested ); + +void +VMS_PI__free( void *ptrToFree ); +*/ + +//TODO: protect WL malloc from concurrency!! 
shared freelist can be corrupted +#define VMS_WL__malloc VMS_int__malloc +#define VMS_WL__malloc_aligned VMS_int__malloc_aligned +#define VMS_WL__free VMS_int__free +/* +void * +VMS_WL__malloc( size_t sizeRequested ); + +void * +VMS_WL__malloc_aligned( size_t sizeRequested ); + +void +VMS_WL__free( void *ptrToFree ); +*/ + +/*Allocates memory from the external system -- higher overhead + */ +void * +VMS__malloc_in_ext( size_t sizeRequested ); + +/*Frees memory that was allocated in the external system -- higher overhead + */ +void +VMS__free_in_ext( void *ptrToFree ); + + +MallocProlog * +VMS_ext__create_free_list(); + +void +VMS_ext__free_free_list( MallocProlog *freeListHead ); + +#endif \ No newline at end of file diff -r bc4cb994f114 -r eaf7e4c58c9e vutilities.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vutilities.c Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,25 @@ +/* + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + * Created on November 14, 2009, 9:07 PM + */ + +#include +#include + +#include "VMS.h" + + +inline char * +VMS_int__strDup( char *str ) + { char *retStr; + + retStr = VMS_int__malloc( strlen(str) + 1 ); + if( str == NULL ) return str; + strcpy( retStr, str ); + + return retStr; + } diff -r bc4cb994f114 -r eaf7e4c58c9e vutilities.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vutilities.h Wed Feb 22 11:39:12 2012 -0800 @@ -0,0 +1,20 @@ +/* + * Copyright 2009 OpenSourceCodeStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + * Created on November 14, 2009, 9:07 PM + */ + + +#ifndef _UTILITIES_H +#define _UTILITIES_H + +#include +#include "VMS_primitive_data_types.h" + +inline char * +VMS_int__strDup( char *str ); + +#endif