# HG changeset patch
# User Merten Sach <msach@mailbox.tu-berlin.de>
# Date 1316441521 -7200
# Node ID 99798e4438a628b15153dc950cb4a589f419f0b5
# Parent  90cbb7b803eeefa8fa5c88a2f38a933d9381fccf
# Parent  99343ffe1918508ceef38f7fb35444a14048955f
Merge of Malloc2 and inter master requests

diff -r 90cbb7b803ee -r 99798e4438a6 .hgignore
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgignore	Mon Sep 19 16:12:01 2011 +0200
@@ -0,0 +1,3 @@
+syntax: glob
+
+*.o
diff -r 90cbb7b803ee -r 99798e4438a6 CoreLoop.c
--- a/CoreLoop.c	Wed Sep 07 19:36:46 2011 +0200
+++ b/CoreLoop.c	Mon Sep 19 16:12:01 2011 +0200
@@ -70,7 +70,6 @@
       //Designate a core by a 1 in bit-position corresponding to the core
    CPU_ZERO(&coreMask);
    CPU_SET(coreLoopThdParams->coreNum,&coreMask);
-   //coreMask = 1L << coreLoopThdParams->coreNum;
 
    pthread_t selfThd = pthread_self();
    errorCode =
@@ -91,8 +90,8 @@
    readyToAnimateQ  = _VMSMasterEnv->readyToAnimateQs[thisCoresIdx];
 
    #ifdef USE_WORK_STEALING
-      //Alg for work-stealing designed to make common case fast.  Comment
-      // in stealer code explains.
+      //protect access to readyToAnimateQ -- other cores also want access!
+      //Alg makes common case fast.  Comment in stealer code explains.
    gate.preGateProgress++;
    if( gate.gateClosed )
     {    //now, set coreloop's progress, so stealer can see that core loop
@@ -103,7 +102,7 @@
 
    currPr = (VirtProcr *) readVMSQ( readyToAnimateQ );
 
-      //Set the coreloop's progress, so stealer can see it has made it out
+      //Set coreloop's progress, so stealer can see this has made it out
       // of the protected area
    gate.exitProgress = gate.preGateProgress;
    #else
@@ -112,7 +111,8 @@
 
    if( currPr != NULL ) _VMSMasterEnv->numMasterInARow[thisCoresIdx] = 0;
    else
-    {
+    { //no more Slaves, get master lock and switch to master Pr
+	
       //============================= MEASUREMENT STUFF =====================
       #ifdef MEAS__TIME_MASTER_LOCK
       int32 startStamp, endStamp;
@@ -120,17 +120,17 @@
       #endif
       //=====================================================================
       int tries = 0; int gotLock = 0;
-      while( currPr == NULL ) //if queue was empty, enter get masterLock loop
-       {    //queue was empty, so get master lock
+      while( currPr == NULL ) 
+       {    //didn't get lock, so keep trying
 
          gotLock = __sync_bool_compare_and_swap(&(_VMSMasterEnv->masterLock),
                                                           UNLOCKED, LOCKED );
          if( gotLock )
-          {    //run own MasterVP -- jmps to coreLoops startPt when done
+          {    //run own MasterVP
             currPr = _VMSMasterEnv->masterVPs[thisCoresIdx];
             if( _VMSMasterEnv->numMasterInARow[thisCoresIdx] > 1000 )
              {       DEBUG( dbgB2BMaster,"Many back to back MasterVPs\n");
-               pthread_yield();
+               pthread_yield();   //this core has no slaves to schedule..
              }
             _VMSMasterEnv->numMasterInARow[thisCoresIdx] += 1;
             break;  //end while -- have a VP to animate now
@@ -152,7 +152,7 @@
     }
 
    
-   switchToVP(currPr); //The VPs return in here
+   switchToVP(currPr); //The VPs all return back to here
    flushRegisters();
    }//CoreLoop      
  }
@@ -160,16 +160,11 @@
 
 void *
 terminateCoreLoop(VirtProcr *currPr){
-   //first free shutdown VP that jumped here -- it first restores the
-   // coreloop's stack, so addr of currPr in stack frame is still correct
-   VMS__dissipate_procr( currPr );
-   pthread_exit( NULL );
+    pthread_exit( NULL );
 }
 
 
 
-#ifdef SEQUENTIAL
-
 //===========================================================================
 /*This sequential version is exact same as threaded, except doesn't do the
  * pin-threads part, nor the wait until setup complete part.
@@ -188,7 +183,7 @@
    thisCoresIdx = 0;
 
    //Save the return address in the SwitchVP function
-   saveCoreLoopReturnAddr(&(_VMSMasterEnv->coreLoopReturnPt));
+   saveCoreLoopReturnAddr((void**)&(_VMSMasterEnv->coreLoopReturnPt));
 
    
    while(1){
@@ -212,4 +207,3 @@
    flushRegisters();
    }
  }
-#endif
diff -r 90cbb7b803ee -r 99798e4438a6 MasterLoop.c
--- a/MasterLoop.c	Wed Sep 07 19:36:46 2011 +0200
+++ b/MasterLoop.c	Mon Sep 19 16:12:01 2011 +0200
@@ -11,12 +11,21 @@
 
 #include "VMS.h"
 #include "ProcrContext.h"
-
+#include "scheduling.h"
+#include "inter_VMS_requests.h"
+#include "inter_VMS_requests_handler.h"
 
 //===========================================================================
 void inline
 stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
-               VirtProcr *masterPr );
+               VirtProcr *masterPr);
+
+void inline
+handleInterMasterReq( InterMasterReqst *currReq, void *_semEnv,
+                                                    VirtProcr *masterPr);
+
+void inline
+handleInterVMSCoreReq( InterVMSCoreReqst *currReq, VirtProcr *masterPr);
 
 //===========================================================================
 
@@ -49,9 +58,7 @@
  *So VMS__init just births the master virtual processor same way it births
  * all the others -- then does any extra setup needed and puts it into the
  * work queue.
- *However means have to make masterEnv a global static volatile the same way
- * did with readyToAnimateQ in core loop.  -- for performance, put the
- * jump to the core loop directly in here, and have it directly jump back.
+ *However means have to make masterEnv a global static volatile.
  *
  *
  *Aug 18, 2010 -- Going to a separate MasterVP for each core, to see if this
@@ -60,7 +67,7 @@
  *
  *So, this function is coupled to each of the MasterVPs, -- meaning this
  * function can't rely on a particular stack and frame -- each MasterVP that
- * animates this function has a different one.
+ * animates this function has a different stack.
  *
  *At this point, the masterLoop does not write itself into the queue anymore,
  * instead, the coreLoop acquires the masterLock when it has nothing to
@@ -89,39 +96,30 @@
    
    volatileMasterPr = animatingPr;
    masterPr         = (VirtProcr*)volatileMasterPr; //used to force re-define after jmp
+   masterEnv        = (MasterEnv*)_VMSMasterEnv;
 
       //First animation of each MasterVP will in turn animate this part
       // of setup code.. (VP creator sets up the stack as if this function
       // was called normally, but actually get here by jmp)
-      //So, setup values about stack ptr, jmp pt and all that
-   //masterPr->nextInstrPt = &&masterLoopStartPt;
 
-
-      //Note, got rid of writing the stack and frame ptr up here, because
-      // only one
-      // core can ever animate a given MasterVP, so don't need to communicate
-      // new frame and stack ptr to the MasterVP storage before a second
-      // version of that MasterVP can get animated on a different core.
-      //Also got rid of the busy-wait.
-
-   
-   //masterLoopStartPt:
-   while(1){
-       
+      //Sept 2011
+      //Old code jumped directly to this point, but doesn't work on x64
+	  // So, just make this an endless loop, and do assembly function at end
+	  // that saves its own return addr, then jumps to core_loop.
+   while(1)
+   {       
    //============================= MEASUREMENT STUFF ========================
    #ifdef MEAS__TIME_MASTER
       //Total Master time includes one coreloop time -- just assume the core
-      // loop time is same for Master as for AppVPs, even though it may be
+      // loop time is same for Master as is for AppVPs, even though it may be
       // smaller due to higher predictability of the fixed jmp.
    saveLowTimeStampCountInto( masterPr->startMasterTSCLow );
    #endif
    //========================================================================
 
-   masterEnv        = (MasterEnv*)_VMSMasterEnv;
-   
-      //GCC may optimize so doesn't always re-define from frame-storage
-   masterPr         = (VirtProcr*)volatileMasterPr;  //just to make sure after jmp
+   //GCC may optimize so doesn't always re-define from frame-storage
    thisCoresIdx     = masterPr->coreAnimatedBy;
+   masterEnv->currentMasterProcrID = thisCoresIdx;
    readyToAnimateQ  = masterEnv->readyToAnimateQs[thisCoresIdx];
    schedSlots       = masterEnv->allSchedSlots[thisCoresIdx];
 
@@ -129,8 +127,28 @@
    slaveScheduler   = masterEnv->slaveScheduler;
    semanticEnv      = masterEnv->semanticEnv;
 
-
-      //Poll each slot's Done flag
+      //First, check for requests from other MasterVPs, and handle them
+   InterMasterReqst* currReqst = masterEnv->interMasterRequestsFor[thisCoresIdx];
+   while(currReqst)
+   {
+       handleInterMasterReq( currReqst, semanticEnv, masterPr );
+       currReqst = currReqst->nextReqst;
+   }
+   masterEnv->interMasterRequestsFor[thisCoresIdx] = NULL;
+   
+   //Second, free own sent requests that their receivers have marked obsolete
+   currReqst = masterEnv->interMasterRequestsSentBy[thisCoresIdx];
+   while(currReqst && currReqst->obsolete)
+   {
+       InterMasterReqst *nextReqst = currReqst->nextSentReqst;
+       VMS__free(currReqst);
+       currReqst = nextReqst;
+   }
+   masterEnv->interMasterRequestsSentBy[thisCoresIdx] = currReqst;
+   
+      //Now, take care of the SlaveVPs
+      //Go through the slots -- if Slave there newly suspended, handle its request
+      // then, either way, ask assigner to fill each slot
    numSlotsFilled = 0;
    for( slotIdx = 0; slotIdx < NUM_SCHED_SLOTS; slotIdx++)
     {
@@ -167,6 +185,7 @@
          if( schedVirtPr != NULL )
           { currSlot->procrAssignedToSlot = schedVirtPr;
             schedVirtPr->schedSlot        = currSlot;
+            schedVirtPr->coreAnimatedBy   = thisCoresIdx;
             currSlot->needsProcrAssigned  = FALSE;
             numSlotsFilled               += 1;
             
@@ -190,15 +209,56 @@
 
    masterSwitchToCoreLoop(animatingPr);
    flushRegisters();
-   }//MasterLoop
-
-
+   }//while(1)   MasterLoop
  }
 
+/*Dispatches one inter-master request by currReq->reqType: destVMSCore is
+ * handled here in the master via handleInterVMSCoreReq; destPlugin hands the
+ * embedded pluginReq plus _semEnv to interPluginReqHdlr; others are ignored.
+ */
+void inline
+handleInterMasterReq( InterMasterReqst *currReq, void *_semEnv,
+                                                    VirtProcr *masterPr )
+ { 
+    
+    switch( currReq->reqType )
+    {  
+      case destVMSCore:   //request is for the VMS core itself
+         handleInterVMSCoreReq( (InterVMSCoreReqst *)currReq, masterPr);
+         break;
+      case destPlugin:    //request is for the plugin -- pass it through
+         _VMSMasterEnv->interPluginReqHdlr( ((InterPluginReqst  *)currReq)->pluginReq,
+                                                                    _semEnv );
+          break;
+      default:            //unknown request types are silently dropped
+         break;
+    }
+ }
 
+void inline
+handleInterVMSCoreReq( InterVMSCoreReqst *currReq, VirtProcr *masterPr )
+ { 
+   switch( currReq->secondReqType )
+    {
+      case transfer_free_ptr:
+          handleTransferFree( currReq, masterPr );
+          currReq->obsolete = 1; //now the sender can free the structure
+          break;
+       case shutdownVP:
+           currReq->obsolete = 1;
+           handleShutdown(currReq, masterPr); 
+           //The Execution of the MasterLoop ends here
+           break;
+      default:
+          break;
+    }
+}
 
-/*This has a race condition -- the coreloops are accessing their own queues
- * at the same time that this work-stealer on a different core is trying to
+/*Work Stealing Alg -- racy one
+ *This algorithm has a race condition -- the coreloops are accessing their
+ * own queues at the same time that this work-stealer on a different core
+ * is trying to.
+ *The second stealing alg, below, protects against this.
  */
 void inline
 stealWorkInto( SchedSlot *currSlot, VMSQueueStruc *readyToAnimateQ,
@@ -234,7 +294,8 @@
     }
  }
 
-/*This algorithm makes the common case fast.  Make the coreloop passive,
+/*Work Stealing alg -- protected one
+ *This algorithm makes the common case fast.  Make the coreloop passive,
  * and show its progress.  Make the stealer control a gate that coreloop
  * has to pass.
  *To avoid interference, only one stealer at a time.  Use a global
@@ -360,7 +421,7 @@
    //======= End Gate-protection  =======
 
 
-   if( stolenPr != NULL )  //victim could have been in protected and taken
+   if( stolenPr != NULL )  //victim could have been in protected and took it
     { currSlot->procrAssignedToSlot = stolenPr;
       stolenPr->schedSlot           = currSlot;
       currSlot->needsProcrAssigned  = FALSE;
diff -r 90cbb7b803ee -r 99798e4438a6 ProcrContext.c
--- a/ProcrContext.c	Wed Sep 07 19:36:46 2011 +0200
+++ b/ProcrContext.c	Mon Sep 19 16:12:01 2011 +0200
@@ -4,6 +4,7 @@
 
 
 #include "VMS.h"
+#include "ProcrContext.h"
 
 /*Create stack, then create __cdecl structure on it and put initialData and
  * pointer to the new structure instance into the parameter positions on
diff -r 90cbb7b803ee -r 99798e4438a6 ProcrContext.h
--- a/ProcrContext.h	Wed Sep 07 19:36:46 2011 +0200
+++ b/ProcrContext.h	Mon Sep 19 16:12:01 2011 +0200
@@ -5,11 +5,56 @@
  * Author: seanhalle@yahoo.com
  * 
  */
-
 #ifndef _ProcrContext_H
 #define	_ProcrContext_H
 #define _GNU_SOURCE
 
+typedef struct _VirtProcr VirtProcr;
+typedef struct _VMSReqst  VMSReqst;
+typedef void  (*VirtProcrFnPtr)  ( void *, VirtProcr * ); //initData, animPr
+
+
+#include "VMS_primitive_data_types.h"
+#include "scheduling.h"
+
+/*WARNING: re-arranging this data structure could cause VP switching
+ *         assembly code (contextSwitch.s) to fail -- it hard-codes offsets
+ */
+struct _VirtProcr
+ { int         procrID;  //for debugging -- count up each time create
+   int         coreAnimatedBy;   //index of the core animating this VP
+   void       *startOfStack;     //VMS__free'd when the VP is dissipated
+   void       *stackPtr;         //asm offset 0x10
+   void       *framePtr;         //asm offset 0x18
+   void       *nextInstrPt;      //asm offset 0x20
+   
+   void       *coreLoopStartPt;  //allows proto-runtime to be linked later
+   void       *coreLoopFramePtr; //asm offset 0x30 -- restore before jmp back to core loop
+   void       *coreLoopStackPtr; //asm offset 0x38 -- restore before jmp back to core loop
+
+   void       *initialData;
+   
+   SchedSlot  *schedSlot;
+   VMSReqst   *requests;
+
+   void       *semanticData;
+   void       *dataRetFromReq; //values returned from plugin to VP go here
+
+      //=========== MEASUREMENT STUFF ==========
+   #ifdef MEAS__TIME_STAMP_SUSP
+   unsigned int preSuspTSCLow;
+   unsigned int postSuspTSCLow;
+   #endif
+   #ifdef MEAS__TIME_MASTER /* in VirtProcr because multiple masterVPs*/
+   unsigned int startMasterTSCLow;
+   unsigned int endMasterTSCLow;
+   #endif
+      //========================================
+   
+   float64      createPtInSecs;  //have space but don't use on some configs
+ };
+//VirtProcr
+
 void saveCoreLoopReturnAddr(void **returnAddress);
 
 void switchToVP(VirtProcr *nextProcr);
@@ -20,10 +65,12 @@
 
 void startVirtProcrFn();
 
-void *asmTerminateCoreLoop(VirtProcr *currPr);
+void asmTerminateCoreLoop(VirtProcr *currPr);
+
+void asmTerminateCoreLoopSeq(VirtProcr *currPr);
 
 #define flushRegisters() \
-        asm volatile ("":::"%rbx", "%r12", "%r13","%r14","%r15")
+        asm volatile ("":::"%rbx", "%r12", "%r13","%r14","%r15");
 
 inline VirtProcr *
 create_procr_helper( VirtProcr *newPr,       VirtProcrFnPtr  fnPtr,
diff -r 90cbb7b803ee -r 99798e4438a6 VMS.c
--- a/VMS.c	Wed Sep 07 19:36:46 2011 +0200
+++ b/VMS.c	Mon Sep 19 16:12:01 2011 +0200
@@ -13,6 +13,7 @@
 
 #include "VMS.h"
 #include "ProcrContext.h"
+#include "scheduling.h"
 #include "Queue_impl/BlockingQueue.h"
 #include "Histogram/Histogram.h"
 
@@ -105,8 +106,14 @@
         //Very first thing put into the master env is the free-list, seeded
         // with a massive initial chunk of memory.
         //After this, all other mallocs are VMS__malloc.
-   _VMSMasterEnv->freeLists        = VMS_ext__create_free_list();
-
+   int i;
+   for(i=0; i<NUM_CORES; i++)
+   {
+       _VMSMasterEnv->freeLists[i]        = VMS_ext__create_free_list();
+       _VMSMasterEnv->interMasterRequestsFor[i] = NULL;
+       _VMSMasterEnv->interMasterRequestsSentBy[i] = NULL;
+   }
+   _VMSMasterEnv->currentMasterProcrID = 0;
 
    //============================= MEASUREMENT STUFF ========================
    #ifdef MEAS__TIME_MALLOC
@@ -497,6 +504,19 @@
    VMS__suspend_procr( callingPr );
  }
 
+void inline    //makes reqData the head of targetMaster's incoming request list
+VMS__send_inter_plugin_req( void *reqData, int32 targetMaster, 
+                                            VirtProcr *requestingMaster ) //requestingMaster is unused
+ { _VMSMasterEnv->interMasterRequestsFor[targetMaster] = 
+                                            (InterMasterReqst *) reqData;
+ }  //NOTE(review): old list head is overwritten, not linked -- confirm intent
+/*Makes reqData the head of targetMaster's incoming inter-master request list*/
+void inline
+VMS__send_inter_VMSCore_req( InterVMSCoreReqst *reqData,
+                        int32 targetMaster, VirtProcr *requestingMaster ) //requestingMaster is unused
+ { _VMSMasterEnv->interMasterRequestsFor[targetMaster] = 
+                                            (InterMasterReqst *) reqData;
+ }  //NOTE(review): old list head is overwritten, not linked -- confirm intent
 
 /*
  */
@@ -542,18 +562,27 @@
 
    semReq = req->semReqData;
 
-   newProbe          = VMS__malloc( sizeof(IntervalProbe) );
-   newProbe->nameStr = VMS__strDup( semReq->nameStr );
-   newProbe->hist    = NULL;
-   newProbe->schedChoiceWasRecorded = FALSE;
+   switch(semReq->reqType){
+       case createProbe:
+           newProbe          = VMS__malloc( sizeof(IntervalProbe) );
+           newProbe->nameStr = VMS__strDup( (char*)semReq->data );
+           newProbe->hist    = NULL;
+           newProbe->schedChoiceWasRecorded = FALSE;
 
-      //This runs in masterVP, so no race-condition worries
-   newProbe->probeID =
-             addToDynArray( newProbe, _VMSMasterEnv->dynIntervalProbesInfo );
-
-   requestingPr->dataRetFromReq = newProbe;
-
-   (*resumePrFnPtr)( requestingPr, semEnv );
+           //This runs in masterVP, so no race-condition worries
+           newProbe->probeID =
+                   addToDynArray( newProbe, _VMSMasterEnv->dynIntervalProbesInfo );
+           requestingPr->dataRetFromReq = newProbe;
+           break;
+       case interMasterReqst:
+           sendInterMasterReqst(semReq->receiverID,
+                   (InterMasterReqst*)semReq->data);
+           break;
+       default:
+           break;       
+   }
+   
+   resumePrFnPtr( requestingPr, semEnv );
  }
 
 
@@ -589,8 +618,9 @@
       // itself
       //Note, should not stack-allocate initial data -- no guarantee, in
       // general that creating processor will outlive ones it creates.
-   VMS__free( animatingPr->startOfStack );
-   VMS__free( animatingPr );
+     
+   VMS__free( animatingPr->startOfStack);
+   VMS__free( animatingPr);
  }
 
 
@@ -629,14 +659,12 @@
 void
 VMS__shutdown()
  { int coreIdx;
-   VirtProcr *shutDownPr;
-
-      //create the shutdown processors, one for each core loop -- put them
-      // directly into the Q -- each core will die when gets one
+   //Send a shutdown Request to all MasterLoops.
    for( coreIdx=0; coreIdx < NUM_CORES; coreIdx++ )
     {    //Note, this is running in the master
-      shutDownPr = VMS__create_procr( &endOSThreadFn, NULL );
-      writeVMSQ( shutDownPr, _VMSMasterEnv->readyToAnimateQs[coreIdx] );
+       InterVMSCoreReqst *shutdownReqst = VMS__malloc(sizeof(InterVMSCoreReqst));
+       shutdownReqst->secondReqType = shutdownVP;
+       sendInterMasterReqst(coreIdx, (InterMasterReqst*)shutdownReqst);
     }
 
  }
@@ -655,6 +683,7 @@
  * to core loop function -- note that this slices out a level of virtual
  * processors).
  */
+/*
 void
 endOSThreadFn( void *initData, VirtProcr *animatingPr )
  { 
@@ -664,6 +693,7 @@
     asmTerminateCoreLoop(animatingPr);
 #endif
  }
+ */
 
 
 /*This is called from the startup & shutdown
@@ -671,6 +701,9 @@
 void
 VMS__cleanup_at_end_of_shutdown()
  { 
+   // Set to zero so that all data structures are freed correctly 
+   _VMSMasterEnv->currentMasterProcrID = 0;
+   
    //unused
    //VMSQueueStruc **readyToAnimateQs;
    //int              coreIdx;
@@ -751,7 +784,9 @@
    //========================================================================
 */
       //These are the only two that use system free 
-   VMS_ext__free_free_list( _VMSMasterEnv->freeLists );
+   int i;
+   for(i=0; i<NUM_CORES; i++)
+        VMS_ext__free_free_list( _VMSMasterEnv->freeLists[i]);
    free( (void *)_VMSMasterEnv );
  }
 
diff -r 90cbb7b803ee -r 99798e4438a6 VMS.h
--- a/VMS.h	Wed Sep 07 19:36:46 2011 +0200
+++ b/VMS.h	Mon Sep 19 16:12:01 2011 +0200
@@ -5,21 +5,20 @@
  * Author: seanhalle@yahoo.com
  * 
  */
-
 #ifndef _VMS_H
 #define	_VMS_H
 #define _GNU_SOURCE
 
+#include <pthread.h>
+#include <sys/time.h>
+
 #include "VMS_primitive_data_types.h"
 #include "Queue_impl/PrivateQueue.h"
 #include "Histogram/Histogram.h"
 #include "DynArray/DynArray.h"
 #include "Hash_impl/PrivateHash.h"
 #include "vmalloc.h"
-
-#include <pthread.h>
-#include <sys/time.h>
-
+#include "inter_VMS_requests.h"
 
 //===============================  Debug  ===================================
 //
@@ -50,9 +49,9 @@
 #define DEBUG2( bool, msg, p1, p2) \
 //   if(bool) {printf(msg, p1, p2); fflush(stdin);}
 
-#define ERROR(msg) printf(msg);
-#define ERROR1(msg, param) printf(msg, param); 
-#define ERROR2(msg, p1, p2) printf(msg, p1, p2);
+#define ERROR(msg) printf(msg)
+#define ERROR1(msg, param) printf(msg, param);
+#define ERROR2(msg, p1, p2) printf(msg, p1, p2)
 
 //===========================  STATS =======================
 
@@ -90,7 +89,7 @@
 #define VIRT_PROCR_STACK_SIZE 0x8000 /* 32K */
 
    // memory for VMS__malloc
-#define MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE 0x10000000 /* 256M */
+#define MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE 0x4000000 /* 64M */
 
 #define CACHE_LINE 64
 #define PAGE_SIZE 4096
@@ -111,26 +110,23 @@
 //===========================================================================
 typedef unsigned long long TSCount;
 
-typedef struct _SchedSlot     SchedSlot;
-typedef struct _VMSReqst      VMSReqst;
-typedef struct _VirtProcr     VirtProcr;
 typedef struct _IntervalProbe IntervalProbe;
 typedef struct _GateStruc     GateStruc;
 
 
 typedef VirtProcr * (*SlaveScheduler)  ( void *, int );   //semEnv, coreIdx
 typedef void  (*RequestHandler)  ( VirtProcr *, void * ); //prWReqst, semEnv
-typedef void  (*VirtProcrFnPtr)  ( void *, VirtProcr * ); //initData, animPr
-typedef void    VirtProcrFn      ( void *, VirtProcr * ); //initData, animPr
 typedef void  (*ResumePrFnPtr)   ( VirtProcr *, void * );
 
-
 //============= Requests ===========
 //
 
-enum VMSReqstType   //avoid starting enums at 0, for debug reasons
- {
-   semantic = 1,
+//VMS Request is the carrier for Slave to Master requests
+// it has an embedded sub-type request that is pulled out
+// inside the plugin's request handler
+enum VMSReqstType   //For Slave->Master requests
+ { 
+   semantic = 1,    //avoid starting enums at 0, for debug reasons
    createReq,
    dissipate,
    VMSSemantic      //goes with VMSSemReqst below
@@ -145,73 +141,30 @@
  };
 //VMSReqst
 
+//This is a sub-type of Slave->Master requests.
+// It's for Slaves to invoke built-in VMS-core functions that have language-like
+// behavior.
 enum VMSSemReqstType   //These are equivalent to semantic requests, but for
  {                     // VMS's services available directly to app, like OS
    createProbe = 1,    // and probe services -- like a VMS-wide built-in lang
    openFile,
-   otherIO
+   otherIO,
+   interMasterReqst
  };
 
 typedef struct
  { enum VMSSemReqstType reqType;
-   VirtProcr           *requestingPr;
-   char                *nameStr;  //for create probe
+   //VirtProcr           *requestingPr;
+   int                  receiverID; //for inter master requests
+   void                *data;
  }
- VMSSemReq;
+VMSSemReq;
 
 
 //====================  Core data structures  ===================
 
-struct _SchedSlot
- {
-   int         workIsDone;
-   int         needsProcrAssigned;
-   VirtProcr  *procrAssignedToSlot;
- };
-//SchedSlot
-
-/*WARNING: re-arranging this data structure could cause VP switching
- *         assembly code to fail -- hard-codes offsets of fields
- */
-struct _VirtProcr
- { int         procrID;  //for debugging -- count up each time create
-   int         coreAnimatedBy;
-   void       *startOfStack;
-   void       *stackPtr;
-   void       *framePtr;
-   void       *nextInstrPt;
-   
-   void       *coreLoopStartPt;  //allows proto-runtime to be linked later
-   void       *coreLoopFramePtr; //restore before jmp back to core loop
-   void       *coreLoopStackPtr; //restore before jmp back to core loop
-
-   void       *initialData;
-
-   SchedSlot  *schedSlot;
-   VMSReqst   *requests;
-
-   void       *semanticData; //this livesUSE_GNU here for the life of VP
-   void       *dataRetFromReq;//values returned from plugin to VP go here
-
-      //=========== MEASUREMENT STUFF ==========
-   #ifdef MEAS__TIME_STAMP_SUSP
-   unsigned int preSuspTSCLow;
-   unsigned int postSuspTSCLow;
-   #endif
-   #ifdef MEAS__TIME_MASTER /* in VirtProcr because multiple masterVPs*/
-   unsigned int startMasterTSCLow;USE_GNU
-   unsigned int endMasterTSCLow;
-   #endif
-      //========================================
-   
-   float64      createPtInSecs;  //have space but don't use on some configs
- };
-//VirtProcr
-
-
-/*WARNING: re-arranging this data structure could cause VP-switching
- *         assembly code to fail -- hard-codes offsets of fields
- *         (because -O3 messes with things otherwise)
+/*Master Env is the only global variable -- has entry points for any other
+ * data needed.  WARNING: asm hard-codes offsets of coreLoopReturnPt, masterLock
  */
 typedef struct
  {
@@ -219,44 +172,51 @@
    RequestHandler   requestHandler;
    
    SchedSlot     ***allSchedSlots;
-   VMSQueueStruc **readyToAnimateQs;
+   VMSQueueStruc  **readyToAnimateQs;
    VirtProcr      **masterVPs;
 
    void            *semanticEnv;
    void            *OSEventStruc;   //for future, when add I/O to BLIS
-   MallocArrays    *freeLists;
-   int32            amtOfOutstandingMem; //total currently allocated
 
    void            *coreLoopReturnPt;//addr to jump to to re-enter coreLoop
 
    int32            setupComplete;
    volatile int32   masterLock;
+   
+   MallocArrays    *freeLists[NUM_CORES];
+   int32            amtOfOutstandingMem; //total currently allocated
 
    int32            numMasterInARow[NUM_CORES];//detect back-to-back masterVP
-   GateStruc       *workStealingGates[ NUM_CORES ]; //concurrent work-steal
+   GateStruc       *workStealingGates[NUM_CORES]; //concurrent work-steal
    int32            workStealingLock;
    
-   int32            numProcrsCreated; //gives ordering to processor creation
+   InterMasterReqst*  interMasterRequestsFor[NUM_CORES];
+   InterMasterReqst*  interMasterRequestsSentBy[NUM_CORES];
+   RequestHandler     interPluginReqHdlr;
+   
+   int32              numProcrsCreated; //gives ordering to processor creation
+   
+   int32              currentMasterProcrID;
 
       //=========== MEASUREMENT STUFF =============
-   IntervalProbe  **intervalProbes;
-   PrivDynArrayInfo    *dynIntervalProbesInfo;
-   HashTable       *probeNameHashTbl;
-   int32            masterCreateProbeID;
-   float64          createPtInSecs;
-   Histogram      **measHists;
-   PrivDynArrayInfo *measHistsInfo;
+   IntervalProbe    **intervalProbes;
+   PrivDynArrayInfo  *dynIntervalProbesInfo;
+   HashTable         *probeNameHashTbl;
+   int32              masterCreateProbeID;
+   float64            createPtInSecs;
+   Histogram        **measHists;
+   PrivDynArrayInfo  *measHistsInfo;
    #ifdef MEAS__TIME_PLUGIN
-   Histogram       *reqHdlrLowTimeHist;
-   Histogram       *reqHdlrHighTimeHist;
+   Histogram         *reqHdlrLowTimeHist;
+   Histogram         *reqHdlrHighTimeHist;
    #endif
    #ifdef MEAS__TIME_MALLOC
-   Histogram       *mallocTimeHist;
-   Histogram       *freeTimeHist;
+   Histogram         *mallocTimeHist;
+   Histogram         *freeTimeHist;
    #endif
    #ifdef MEAS__TIME_MASTER_LOCK
-   Histogram       *masterLockLowTimeHist;
-   Histogram       *masterLockHighTimeHist;
+   Histogram         *masterLockLowTimeHist;
+   Histogram         *masterLockHighTimeHist;
    #endif
  }
 MasterEnv;
@@ -303,8 +263,6 @@
 volatile MasterEnv      *_VMSMasterEnv;
 
 
-
-
 //===========================  Function Prototypes  =========================
 
 
@@ -368,6 +326,14 @@
 inline void
 VMS__send_VMSSem_request( void *semReqData, VirtProcr *callingPr );
 
+
+void inline
+VMS__send_inter_plugin_req( void *reqData, int32 targetMaster, 
+                                            VirtProcr *requestingMaster );
+void inline
+VMS__send_inter_VMSCore_req( InterVMSCoreReqst *reqData, int32 targetMaster,
+                                           VirtProcr *requestingMaster );
+
 VMSReqst *
 VMS__take_next_request_out_of( VirtProcr *procrWithReq );
 
@@ -571,7 +537,6 @@
 
 //=====
 
-#include "ProcrContext.h"
 #include "probes.h"
 #include "vutilities.h"
 
diff -r 90cbb7b803ee -r 99798e4438a6 contextSwitch.s
--- a/contextSwitch.s	Wed Sep 07 19:36:46 2011 +0200
+++ b/contextSwitch.s	Mon Sep 19 16:12:01 2011 +0200
@@ -2,7 +2,17 @@
 
 
 .text
-
+/* VirtProcr  offsets:
+ * 0x10  stackPtr
+ * 0x18 framePtr
+ * 0x20 nextInstrPt
+ * 0x30 coreLoopFramePtr
+ * 0x38 coreLoopStackPtr
+ *
+ * _VMSMasterEnv  offsets:
+ * 0x38 coreLoopReturnPt
+ * 0x44 masterLock
+ */
 //Save return label address for the coreLoop to pointer
 //Arguments: Pointer to variable holding address
 .globl saveCoreLoopReturnAddr
@@ -23,17 +33,6 @@
 
 //Switches form CoreLoop to VP ether a normal VP or the Master Loop
 //switch to virt procr's stack and frame ptr then jump to virt procr fn
-/* VirtProcr  offsets:
- * 0x10  stackPtr
- * 0x18 framePtr
- * 0x20 nextInstrPt
- * 0x30 coreLoopFramePtr
- * 0x38 coreLoopStackPtr
- *
- * _VMSMasterEnv  offsets:
- * 0x48 coreLoopReturnPt
- * 0x54 masterLock
- */
 .globl switchToVP
 switchToVP:
     #VirtProcr in %rdi
@@ -48,17 +47,6 @@
 
     
 //switches to core loop. saves return address
-/* VirtProcr  offsets:
- * 0x10  stackPtr
- * 0x18 framePtr
- * 0x20 nextInstrPt
- * 0x30 coreLoopFramePtr
- * 0x38 coreLoopStackPtr
- *
- * _VMSMasterEnv  offsets:
- * 0x48 coreLoopReturnPt
- * 0x54 masterLock
- */
 .globl switchToCoreLoop
 switchToCoreLoop:
     #VirtProcr in %rdi
@@ -69,7 +57,7 @@
     movq    0x30(%rdi), %rbp         #restore frame pointer
     movq    $_VMSMasterEnv, %rcx
     movq    (%rcx)    , %rcx
-    movq    0x48(%rcx), %rax         #get CoreLoopStartPt
+    movq    0x38(%rcx), %rax         #get CoreLoopStartPt
     jmp     *%rax                    #jmp to CoreLoop
 VPReturn:
     ret
@@ -78,17 +66,6 @@
 
 //switches to core loop from master. saves return address
 //Releases masterLock so the next MasterLoop can be executed
-/* VirtProcr  offsets:
- * 0x10  stackPtr
- * 0x18 framePtr
- * 0x20 nextInstrPt
- * 0x30 coreLoopFramePtr
- * 0x38 coreLoopStackPtr
- *
- * _VMSMasterEnv  offsets:
- * 0x48 coreLoopReturnPt
- * 0x54 masterLock
- */
 .globl masterSwitchToCoreLoop
 masterSwitchToCoreLoop:
     #VirtProcr in %rdi
@@ -99,8 +76,8 @@
     movq    0x30(%rdi), %rbp         #restore frame pointer
     movq    $_VMSMasterEnv, %rcx
     movq    (%rcx)    , %rcx
-    movq    0x48(%rcx), %rax         #get CoreLoopStartPt
-    movl    $0x0      , 0x54(%rcx)   #release lock
+    movq    0x38(%rcx), %rax         #get CoreLoopStartPt
+    movl    $0x0      , 0x44(%rcx)   #release lock
     jmp     *%rax                    #jmp to CoreLoop
 MasterReturn:
     ret
@@ -112,22 +89,14 @@
 // and virtPr is in %rdi
 // and both functions have the same argument.
 // do not save register of VP because this function will never return
-/* VirtProcr  offsets:
- * 0x10  stackPtr
- * 0x18 framePtr
- * 0x20 nextInstrPt
- * 0x30 coreLoopFramePtr
- * 0x38 coreLoopStackPtr
- *
- * _VMSMasterEnv  offsets:
- * 0x48 coreLoopReturnPt
- * 0x58 masterLock
- */
 .globl asmTerminateCoreLoop
 asmTerminateCoreLoop:
     #VirtProcr in %rdi
     movq    0x38(%rdi), %rsp         #restore stack pointer
     movq    0x30(%rdi), %rbp         #restore frame pointer
+    movq    $_VMSMasterEnv, %rcx
+    movq    (%rcx)    , %rcx
+    movl    $0x0      , 0x44(%rcx)   #release lock
     movq    $terminateCoreLoop, %rax
     jmp     *%rax                    #jmp to CoreLoop
 
@@ -142,7 +111,6 @@
     movq    0x38(%rdi), %rsp         #restore stack pointer
     movq    0x30(%rdi), %rbp         #restore frame pointer
     #argument is in %rdi
-    call    VMS__dissipate_procr
     movq    %rbp      , %rsp        #goto the coreLoops stack
     pop     %rbp        #restore the old framepointer
     ret                 #return from core loop
diff -r 90cbb7b803ee -r 99798e4438a6 inter_VMS_requests.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/inter_VMS_requests.c	Mon Sep 19 16:12:01 2011 +0200
@@ -0,0 +1,15 @@
+#include "VMS.h"
+#include "inter_VMS_requests.h"
+
+
+//Push request onto the receiver's incoming list and this master's sent list
+void sendInterMasterReqst(int receiverID, InterMasterReqst* request)
+{
+    int senderID = _VMSMasterEnv->currentMasterProcrID;
+    request->reqType = destVMSCore;
+    request->obsolete = 0;
+    request->nextReqst = _VMSMasterEnv->interMasterRequestsFor[receiverID];
+    _VMSMasterEnv->interMasterRequestsFor[receiverID] = request;
+    request->nextSentReqst = _VMSMasterEnv->interMasterRequestsSentBy[senderID];
+    _VMSMasterEnv->interMasterRequestsSentBy[senderID] = request;
+}
\ No newline at end of file
diff -r 90cbb7b803ee -r 99798e4438a6 inter_VMS_requests.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/inter_VMS_requests.h	Mon Sep 19 16:12:01 2011 +0200
@@ -0,0 +1,74 @@
+/*
+ *  Copyright 2011 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author(s): seanhalle@yahoo.com
+ *
+ */
+
+#ifndef _MASTER_REQ_H
+#define	_MASTER_REQ_H
+
+typedef struct _InterMasterReqst InterMasterReqst;
+
+//Top-level tag of a Master-to-Master request.  The request gets re-cast to
+// the matching sub-type of request (InterVMSCoreReqst or InterPluginReqst)
+enum InterMasterReqstType    //For Master->Master
+ {
+   destVMSCore = 1,          //avoid starting enums at 0, for debug reasons
+   destPlugin                //request between plugins on different cores
+ };
+
+struct _InterMasterReqst //Doing a trick to save space & time -- allocate
+ {  // space for a sub-type then cast first as InterMaster then as sub-type
+   enum InterMasterReqstType  reqType;  //tells which sub-type to cast to
+   InterMasterReqst *nextReqst;      //link in interMasterRequestsFor[receiver]
+   InterMasterReqst *nextSentReqst;  //link in interMasterRequestsSentBy[sender]
+   int32 obsolete;                   //set to 0 by sendInterMasterReqst
+ };
+//InterMasterReqst  (defined above in typedef block)
+
+
+//These are a sub-type of InterMaster requests.  The inter-master req gets
+// re-cast to be of this type, after checking
+//This one is for requests between internals of VMS-core, such as malloc
+enum InterVMSCoreReqType   
+ {
+   transfer_free_ptr = 1,     //avoid starting enums at 0, for debug reasons
+   shutdownVP                 //NOTE(review): presumably handled by handleShutdown -- confirm
+ };
+
+//VMS-core sub-type of an inter-master request.  Trick to save space & time:
+// allocate space for this, cast first as InterMaster then as this
+typedef struct  
+ {
+   enum InterMasterReqstType  reqType;  //duplicate InterMasterReqst at top
+   InterMasterReqst *nextReqst;         // -- these four fields have to stay
+   InterMasterReqst *nextSentReqst;     //    in the same order as in
+   int32 obsolete;                      //    struct _InterMasterReqst
+   
+   enum InterVMSCoreReqType  secondReqType; //which VMS-core request this is
+   void                     *freePtr;  //pile up fields, add as needed
+ } InterVMSCoreReqst;
+
+//This is for requests between plugins on different cores.  After casting,
+// the pluginReq is extracted and handed to plugin.  The leading fields must
+// mirror struct _InterMasterReqst: sendInterMasterReqst writes all of them
+typedef struct
+ {
+   enum InterMasterReqstType  reqType;  //copy InterMasterReqst at top
+   InterMasterReqst          *nextReqst;
+   InterMasterReqst          *nextSentReqst;
+   int32                      obsolete;
+   void                      *pluginReq; //plugin will cast to approp type
+ } InterPluginReqst;
+ 
+ /*
+  * Queues an inter master request for the master identified by receiverID.
+  * This has to be called from the MasterLoop!  The caller has to malloc the
+  * request structure; the sending VP frees it when the request is handled.
+  */
+ void sendInterMasterReqst(int receiverID, InterMasterReqst* request);
+
+#endif	/* _MASTER_REQ_H */
+
diff -r 90cbb7b803ee -r 99798e4438a6 inter_VMS_requests_handler.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/inter_VMS_requests_handler.c	Mon Sep 19 16:12:01 2011 +0200
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2011  OpenSourceCodeStewardshipFoundation
+ *
+ * Licensed under GNU GPL version 2
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "VMS.h"
+#include "ProcrContext.h"
+#include "inter_VMS_requests.h"
+#include "vmalloc.h"
+
+
+
+//==================================================================
+/* Frees a chunk that the VMS__free in a different masterVP sent over because
+ * the chunk was originally allocated by this masterVP.  masterPr is unused.
+ * Not `inline`: a C99 inline-only definition emits no symbol for other files.
+ */
+void
+handleTransferFree( InterVMSCoreReqst *masterReq, VirtProcr *masterPr )
+ {
+    VMS__free( masterReq->freePtr );
+ }
+
+
+/* This starts the shutdown procedure: hands masterPr to asmTerminateCoreLoop
+ * (asmTerminateCoreLoopSeq when SEQUENTIAL is defined).  masterReq is unused.
+ * Not `inline`: a C99 inline-only definition emits no symbol for other files. */
+void
+handleShutdown( InterVMSCoreReqst *masterReq, VirtProcr *masterPr )
+{
+#ifdef SEQUENTIAL
+    asmTerminateCoreLoopSeq(masterPr);
+#else
+    asmTerminateCoreLoop(masterPr);
+#endif
+}
+
diff -r 90cbb7b803ee -r 99798e4438a6 inter_VMS_requests_handler.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/inter_VMS_requests_handler.h	Mon Sep 19 16:12:01 2011 +0200
@@ -0,0 +1,26 @@
+/*
+ *  Copyright 2011 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author(s): seanhalle@yahoo.com
+ *
+ */
+
+#ifndef _MASTER_REQ_HANDLER_H
+#define	_MASTER_REQ_HANDLER_H
+
+/*Defines everything specific to inter-master requests that
+ * are internal to VMS.
+ *The plugin has its own handlers for inter-master requests
+ * sent between plugin instances.
+ */
+
+void   //plain external prototype: `inline` without a body here has no use
+handleTransferFree( InterVMSCoreReqst *masterReq, VirtProcr *masterPr );
+
+void   //plain external prototype: `inline` without a body here has no use
+handleShutdown( InterVMSCoreReqst *masterReq, VirtProcr *masterPr );
+
+
+#endif	/* _MASTER_REQ_HANDLER_H */
+
diff -r 90cbb7b803ee -r 99798e4438a6 probes.c
--- a/probes.c	Wed Sep 07 19:36:46 2011 +0200
+++ b/probes.c	Mon Sep 19 16:12:01 2011 +0200
@@ -113,7 +113,7 @@
    VMSSemReq reqData;
 
    reqData.reqType  = createProbe;
-   reqData.nameStr  = nameStr;
+   reqData.data  = (void*)nameStr;
 
    VMS__send_VMSSem_request( &reqData, animPr );
 
@@ -338,7 +338,7 @@
 void
 generic_print_probe( void *_probe )
  { 
-   IntervalProbe *probe = (IntervalProbe *)_probe;
+   //IntervalProbe *probe = (IntervalProbe *)_probe;
    
    //TODO segfault in printf
    //print_probe_helper( probe );
diff -r 90cbb7b803ee -r 99798e4438a6 scheduling.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scheduling.h	Mon Sep 19 16:12:01 2011 +0200
@@ -0,0 +1,23 @@
+/* 
+ * File:   scheduling.h
+ * Author: Merten Sach
+ *
+ * Created on September 16, 2011, 2:28 PM
+ */
+
+#ifndef SCHEDULING_H
+#define	SCHEDULING_H
+
+#include "ProcrContext.h"
+
+typedef struct _SchedSlot SchedSlot;   //lets code say SchedSlot without "struct"
+
+struct _SchedSlot   //NOTE(review): field notes below are read off the names -- confirm
+ {
+   int         workIsDone;            //presumably used as a boolean flag
+   int         needsProcrAssigned;    //presumably used as a boolean flag
+   VirtProcr  *procrAssignedToSlot;   //VirtProcr presumably comes from ProcrContext.h
+ };
+ 
+#endif	/* SCHEDULING_H */
+
diff -r 90cbb7b803ee -r 99798e4438a6 vmalloc.c
--- a/vmalloc.c	Wed Sep 07 19:36:46 2011 +0200
+++ b/vmalloc.c	Mon Sep 19 16:12:01 2011 +0200
@@ -19,6 +19,12 @@
 
 #define MAX_UINT64 0xFFFFFFFFFFFFFFFF
 
+static inline void   //file-private helper; static so a definition always exists
+sendFreeReqst_lib(int receiverID, void *ptrToFree, VirtProcr *animPr);
+
+static inline void   //file-private helper; static so a definition always exists
+sendFreeReqst_master(int receiverID, void *ptrToFree);
+
 //A MallocProlog is a head element if the HigherInMem variable is NULL
 //A Chunk is free if the prevChunkInFreeList variable is NULL
 
@@ -198,12 +204,32 @@
     return foundChunk;
 }
 
+/*
+ * Allocator entry point for code that is part of the master loop: reads the
+ * ID of the currently animating master from the MasterEnv and allocates there.
+ */
+void *
+VMS__malloc( size_t sizeRequested)
+{
+    int masterProcrID = _VMSMasterEnv->currentMasterProcrID;
+    return VMS__malloc_on_core(sizeRequested, masterProcrID);
+}
+
+/* Allocator entry point for the plugin.  VMS__malloc_on_core is run directly
+ * on the slave VP's stack, so there is no switch to the VMS runtime; the core
+ * is the one recorded in VProcr->coreAnimatedBy. */
+void *
+VMS__malloc_in_lib(size_t sizeRequested, VirtProcr *VProcr)
+{
+    int animatingCore = VProcr->coreAnimatedBy;
+    return VMS__malloc_on_core(sizeRequested, animatingCore);
+}
 
 /*
  * This is sequential code, meant to only be called from the Master, not from
  * any slave VPs.
  */
-void *VMS__malloc( size_t sizeRequested )
+void *VMS__malloc_on_core( size_t sizeRequested, int procrID )
  {     
    //============================= MEASUREMENT STUFF ========================
    #ifdef MEAS__TIME_MALLOC
@@ -212,8 +238,9 @@
    #endif
    //========================================================================
    
-   MallocArrays* freeLists = _VMSMasterEnv->freeLists;
+   MallocArrays* freeLists = _VMSMasterEnv->freeLists[procrID];
    MallocProlog* foundChunk;
+   MallocPrologAllocated* returnChunk;
    
    //Return a small chunk if the requested size is smaller than 128B
    if(sizeRequested <= LOWER_BOUND)
@@ -224,9 +251,10 @@
        else
            foundChunk = removeSmallChunk(freeLists, freeListIdx);
        
-       //Mark as allocated
-       foundChunk->prevChunkInFreeList = NULL;      
-       return foundChunk + 1;
+       returnChunk = (MallocPrologAllocated*)foundChunk;
+       returnChunk->prevChunkInFreeList = NULL;//indicates elem currently allocated
+       returnChunk->procrID = procrID;  
+       return returnChunk + 1;
    }
    
    //Calculate the expected container. Start one higher to have a Chunk that's
@@ -239,7 +267,9 @@
        foundChunk = removeChunk(freeLists, containerIdx); 
    
    //Mark as allocated
-   foundChunk->prevChunkInFreeList = NULL;      
+   returnChunk = (MallocPrologAllocated*)foundChunk;
+   returnChunk->prevChunkInFreeList = NULL;//indicates elem currently allocated
+   returnChunk->procrID = procrID;
    
    //============================= MEASUREMENT STUFF ========================
    #ifdef MEAS__TIME_MALLOC
@@ -249,7 +279,79 @@
    //========================================================================
    
    //skip over the prolog by adding its size to the pointer return
-   return foundChunk + 1;
+   return returnChunk + 1;
+ }
+ 
+/*
+ * Free for code running in a master loop.  The prolog in front of the chunk
+ * records which core allocated it: a chunk of the current master is freed
+ * right here, any other chunk is sent to the responsible core.
+ */
+void
+VMS__free(void *ptrToFree)
+{
+    MallocPrologAllocated *prolog = (MallocPrologAllocated*)ptrToFree - 1;
+
+    if(prolog->procrID != _VMSMasterEnv->currentMasterProcrID)
+    {
+        //allocated on a different core: ask that core to free it
+        sendFreeReqst_master(prolog->procrID, ptrToFree);
+        return;
+    }
+
+    VMS__free_on_core(ptrToFree, _VMSMasterEnv->currentMasterProcrID);
+}
+
+/*
+ * Free for the plugins.  A chunk allocated on the core that animates VProc
+ * is freed directly with VMS__free_on_core; for any other chunk a message
+ * is sent to the core recorded in the chunk's prolog.
+ */
+void
+VMS__free_in_lib(void *ptrToFree, VirtProcr *VProc)
+{
+    MallocPrologAllocated *prolog = (MallocPrologAllocated*)ptrToFree - 1;
+
+    if(prolog->procrID != VProc->coreAnimatedBy)
+    {
+        sendFreeReqst_lib(prolog->procrID, ptrToFree, VProc);
+        return;
+    }
+
+    VMS__free_on_core(ptrToFree, VProc->coreAnimatedBy);
+}
+
+/*
+ * This is called from a masterVP and requests a free from a different
+ * masterVP (receiverID).  The request structure allocated here is freed
+ * after the request is handled.
+ */
+inline void
+sendFreeReqst_master(int receiverID, void *ptrToFree)
+{
+   InterVMSCoreReqst *transferReq = VMS__malloc(sizeof *transferReq);
+   transferReq->secondReqType = transfer_free_ptr;
+   transferReq->freePtr       = ptrToFree;
+   sendInterMasterReqst(receiverID, (InterMasterReqst*)transferReq);
+ }
+
+/*
+ * This is called if the free is called from the plugin.  It asks animPr's
+ * master to issue an inter master free request to receiverID.  Runs on the
+ * slave VP, so the request comes from VMS__malloc_in_lib, not VMS__malloc.
+ */
+inline void
+sendFreeReqst_lib(int receiverID, void *ptrToFree, VirtProcr *animPr )
+{
+   VMSSemReq reqData;
+   InterVMSCoreReqst *freeReqst =
+           VMS__malloc_in_lib(sizeof(InterVMSCoreReqst), animPr);
+   freeReqst->freePtr = ptrToFree;
+   freeReqst->secondReqType = transfer_free_ptr;
+   reqData.reqType  = interMasterReqst;
+   reqData.receiverID   = receiverID;
+   reqData.data  = (void*)freeReqst;
+   VMS__send_VMSSem_request( (void*)&reqData, animPr );
  }
 
 /*
@@ -257,7 +359,7 @@
  * any slave VPs.
  */
 void
-VMS__free( void *ptrToFree )
+VMS__free_on_core( void *ptrToFree, int procrID )
  {
     
    //============================= MEASUREMENT STUFF ========================
@@ -267,7 +369,7 @@
    #endif
    //========================================================================
    
-   MallocArrays* freeLists = _VMSMasterEnv->freeLists;
+   MallocArrays* freeLists = _VMSMasterEnv->freeLists[procrID];
    MallocProlog *chunkToFree = (MallocProlog*)ptrToFree - 1;
    uint32 containerIdx;
    
@@ -323,8 +425,7 @@
 VMS_ext__create_free_list()
 {     
    //Initialize containers for small chunks and fill with zeros
-   _VMSMasterEnv->freeLists = (MallocArrays*)malloc( sizeof(MallocArrays) );
-   MallocArrays *freeLists = _VMSMasterEnv->freeLists;
+   MallocArrays *freeLists = (MallocArrays*)malloc( sizeof(MallocArrays) );
    
    freeLists->smallChunks = 
            (MallocProlog**)malloc(SMALL_CHUNK_COUNT*sizeof(MallocProlog*));
@@ -355,14 +456,14 @@
                         MALLOC_ADDITIONAL_MEM_FROM_OS_SIZE - sizeof(MallocProlog*));
    firstChunk->nextChunkInFreeList = NULL;
    //previous element in the queue is the container
-   firstChunk->prevChunkInFreeList = &freeLists->bigChunks[container-2];
+   firstChunk->prevChunkInFreeList = (MallocProlog*)&freeLists->bigChunks[container-2];
    
    freeLists->bigChunks[container-2] = firstChunk;
    //Insert into bit search list
    if(container <= 65)
-       freeLists->bigChunksSearchVector[0] |= ((uint64)1 << (container-2));
+       freeLists->bigChunksSearchVector[0] = ((uint64)1 << (container-2));
    else
-       freeLists->bigChunksSearchVector[1] |= ((uint64)1 << (container-66));
+       freeLists->bigChunksSearchVector[1] = ((uint64)1 << (container-66));
    
    //Create dummy chunk to mark the top of stack this is of course
    //never freed
@@ -384,6 +485,6 @@
    free(freeLists->memSpace);
    free(freeLists->bigChunks);
    free(freeLists->smallChunks);
-   
+   free(freeLists);   
  }
 
diff -r 90cbb7b803ee -r 99798e4438a6 vmalloc.h
--- a/vmalloc.h	Wed Sep 07 19:36:46 2011 +0200
+++ b/vmalloc.h	Mon Sep 19 16:12:01 2011 +0200
@@ -6,13 +6,13 @@
  *
  * Created on November 14, 2009, 9:07 PM
  */
-
 #ifndef _VMALLOC_H
 #define	_VMALLOC_H
 
 #include <malloc.h>
 #include <inttypes.h>
 #include "VMS_primitive_data_types.h"
+#include "ProcrContext.h"
 
 #define SMALL_CHUNK_SIZE 32
 #define SMALL_CHUNK_COUNT 4
@@ -30,12 +30,12 @@
    MallocProlog *prevChunkInFreeList;
    MallocProlog *nextHigherInMem;
    MallocProlog *nextLowerInMem;
- };
+ }; 
 //MallocProlog
  
  typedef struct MallocArrays MallocArrays;
 
- struct MallocArrays
+ struct MallocArrays 
  {
      MallocProlog **smallChunks;
      MallocProlog **bigChunks;
@@ -43,24 +43,42 @@
      void         *memSpace;
      uint32       containerCount;
  };
- //MallocArrays
+ //MallocArrays
+
+
+ typedef struct   //view of a chunk's prolog while the chunk is allocated
+{
+     uintptr_t procrID;                 //ID the chunk was allocated under
+     MallocProlog *prevChunkInFreeList; //set to NULL while allocated
+     MallocProlog *nextHigherInMem;     //NOTE(review): presumably same slots as
+     MallocProlog *nextLowerInMem;      // in MallocProlog -- confirm layout
+ } MallocPrologAllocated;
+
 
 typedef struct
  {
    MallocProlog *firstChunkInFreeList;
    int32         numInList; //TODO not used
- }
-FreeListHead;
+ } FreeListHead;
 
 void *
-VMS__malloc( size_t sizeRequested );
+VMS__malloc_on_core(size_t sizeRequested, int procrID);
 
 void *
-VMS__malloc_aligned( size_t sizeRequested );
+VMS__malloc(size_t sizeRequested);
+
+void *
+VMS__malloc_in_lib(size_t sizeRequested, VirtProcr *VProc);
 
 void
 VMS__free( void *ptrToFree );
 
+void
+VMS__free_in_lib(void *ptrToFree, VirtProcr *VProc);
+
+void
+VMS__free_on_core(void *ptrToFree, int procrID);
+
 /*Allocates memory from the external system -- higher overhead
  */
 void *
@@ -71,7 +89,6 @@
 void
 VMS__free_in_ext( void *ptrToFree );
 
-
 MallocArrays *
 VMS_ext__create_free_list();
 
diff -r 90cbb7b803ee -r 99798e4438a6 vutilities.c
--- a/vutilities.c	Wed Sep 07 19:36:46 2011 +0200
+++ b/vutilities.c	Mon Sep 19 16:12:01 2011 +0200
@@ -9,6 +9,7 @@
 
 #include <malloc.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include "VMS.h"