# HG changeset patch
# User Merten Sach <msach@mailbox.tu-berlin.de>
# Date 1316686504 -7200
# Node ID e4de34fd220bae91d3cb6f1954e5993b6b7c0b3f
# Parent  b549ad140f18665e9183569a801f5c6aa5d19b38# Parent  2c146b6b38900f26a1581df4c6179b1b39458978
merged useless branch test_without_inline

diff -r b549ad140f18 -r e4de34fd220b DESIGN_NOTES.txt
--- a/DESIGN_NOTES.txt	Thu Nov 11 04:59:48 2010 -0800
+++ b/DESIGN_NOTES.txt	Thu Sep 22 12:15:04 2011 +0200
@@ -1,212 +1,212 @@
-
-From e-mail to Albert, on design of app-virt-procr to core-loop animation
-switch and back.
-
-====================
-General warnings about this code:
-It only compiles in GCC 4.x  (label addr and computed goto)
-Has assembly for x86  32bit
-
-
-====================
-AVProcr data-struc has: stack-ptr, jump-ptr, data-ptr, slotNum, coreloop-ptr
- and semantic-custom-ptr
-
-The VMS Creator: takes ptr to function and ptr to initial data
--- creates a new AVProcr struc
--- sets the jmp-ptr field to the ptr-to-function passed in
--- sets the data-ptr to ptr to initial data passed in
--- if this is for a suspendable virt  processor, then create a stack and set
-   the stack-ptr
-
-VMS__create_procr( AVProcrFnPtr fnPtr, void *initialData )
-{
-AVProcr   newPr = malloc( sizeof(AVProcr) );
-newPr->jmpPtr = fnPtr;
-newPr->coreLoopDonePt = &CoreLoopDonePt; //label is in coreLoop
-newPr->data = initialData;
-newPr->stackPtr = createNewStack();
-return newPr;
-}
-
-The semantic layer can then add its own state in the cusom-ptr field
-
-The Scheduler plug-in:
--- Sets slave-ptr in AVProcr, and points the slave to AVProcr
--- if non-suspendable, sets the AVProcr's stack-ptr to the slave's stack-ptr
-
-MasterLoop:
--- puts AVProcr structures onto the workQ
-
-CoreLoop:
--- gets stack-ptr out of AVProcr and sets the core's stack-ptr to that
--- gets data-ptr out of AVProcr and puts it into reg GCC uses for that param
--- puts AVProcr's addr into reg GCC uses for the AVProcr-pointer param
--- jumps to the addr in AVProcr's jmp-ptr field
-CoreLoop()
-{ while( FOREVER )
- { nextPr = readQ( workQ );  //workQ is static (global) var declared volatile
-   <dataPtr-param-register>       = nextPr->data;
-   <AVProcrPtr-param-register> = nextPr;
-   <stack-pointer register>          = nextPr->stackPtr;
-   jmp nextPr->jmpPtr;
-CoreLoopDonePt:   //label's addr put into AVProcr when create new one
- }
-}
-(Note, for suspendable processors coming back from suspension, there is no
- need to fill the parameter registers -- they will be discarded)
-
-Suspend an application-level virtual processor:
-VMS__AVPSuspend( AVProcr *pr )
-{
-pr->jmpPtr = &ResumePt;  //label defined a few lines below
-pr->slave->doneFlag = TRUE;
-pr->stackPtr = <current SP reg value>;
-jmp pr->coreLoopDonePt;
-ResumePt: return;
-}
-
-This works because the core loop will have switched back to this stack
- before jumping to ResumePt..    also, the core loop never modifies the
- stack pointer, it simply switches to whatever stack pointer is in the
- next AVProcr it gets off the workQ.
-
-
-
-=============================================================================
-As it is now, there's only one major unknown about GCC (first thing below
-  the line),  and there are a few restrictions, the most intrusive being
-  that the functions the application gives to the semantic layer have a
-  pre-defined prototype -- return nothing, take a pointer to initial data
-  and a pointer to an AVProcr struc, which they're not allowed to modify
-  -- only pass it to semantic-lib calls.
-
-So, here are the assumptions, restrictions, and so forth:
-===========================
-Major assumption:  that GCC will do the following the same way every time:
-  say the application defines a function that fits this typedef:
-typedef void (*AVProcrFnPtr)  ( void *, AVProcr * );
-
-and let's say somewhere in the code they do this:
-AVProcrFnPtr   fnPtr = &someFunc;
-
-then they do this:
-(*fnPtr)( dataPtr, animatingVirtProcrPtr );
-
-Can the registers that GCC uses to pass the two pointers be predicted?
- Will they always be the same registers, in every program that has the
- same typedef?
-If that typedef fixes, guaranteed, the registers (on x86) that GCC will use
- to send the two pointers, then the rest of this solution works.
-
-Change in model: Instead of a virtual processor whose execution trace is
- divided into work-units, replacing that with the pattern that a virtual
- processor is suspended.  Which means, no more "work unit" data structure
- -- instead, it's now an "Application Virtual Processor" structure
- -- AVProcr -- which is given directly to the application function!
-
-   -- You were right, don't need slaves to be virtual processors, only need
-      "scheduling buckets" -- just a way to keep track of things..
-
-Restrictions:
--- the  "virtual entities"  created by the semantic layer must be virtual
-   processors, created with a function-to-execute and initial data -- the
-   function is restricted to return nothing and only take a pointer to the
-   initial data plus a pointer to an AVProcr structure, which represents
-   "self", the virtual processor created.  (This is the interface I showed
-   you for "Hello World" semantic layer).
-What this means for synchronous dataflow, is that the nodes in the graph
-  are virtual processors that in turn spawn a new virtual processor for
-  every "firing" of the node.  This should be fine because the function
-  that the node itself is created with is a "canned" function that is part
-  of the semantic layer -- the function that is spawned is the user-provided
-  function.  The restriction only means that the values from the inputs to
-  the node are packaged as the "initial data" given to the spawned virtual
-  processor -- so the user-function has to cast a void * to the
-  semantic-layer-defined structure by which it gets the inputs to the node.
-
--- Second restriction is that the semantic layer has to use VMS supplied
-   stuff -- for example, the data structure that represents the
-   application-level virtual processor is defined in VMS, and the semantic
-   layer has to call a VMS function in order to suspend a virtual processor.
-
--- Third restriction is that the application code never do anything with
-   the AVProcr structure except pass it to semantic-layer lib calls.
-
--- Fourth restriction is that every virtual processor must call a
-   "dissipate" function as its last act -- the user-supplied
-   virtual-processor function can't just end -- it has to call
-   SemLib__dissipate( AVProcr ) before the closing brace.. and after the
-   semantic layer is done cleaning up its own data, it has to in turn call
-   VMS__disspate( AVProcr ).
-
--- For performance reasons, I think I want to have two different kinds of
-   app-virtual processor -- suspendable ones and non-suspendable -- where
-   non-suspendable are not allowed to perform any communication with other
-   virtual processors, except at birth and death.  Suspendable ones, of
-   course can perform communications, create other processors, and so forth
-   -- all of which cause it to suspend.
-The performance difference is that I need a separate stack for each
-  suspendable, but non-suspendable can re-use a fixed number of stacks
-  (one for each slave).
-
-
-==================== May 29
-
-Qs:
---1 how to safely jump between virt processor's trace and coreloop
---2 how to set up __cdecl style stack + frame for just-born virtual processor
---3 how to switch stack-pointers + frame-pointers
-
-
---1:
-Not sure if GCC's computed goto is safe, because modify the stack pointer
-without GCC's knowledge -- although, don't use the stack in the coreloop
-segment, so, actually, that should be safe!
-
-So, GCC has its own special C extensions, one of which gets address of label:
-
-void *labelAddr;
-labelAddr = &&label;
-goto *labelAddr;
-
---2
-In CoreLoop, will check whether VirtProc just born, or was suspended.
-If just born, do bit of code that sets up the virtual processor's stack
-and frame according to the __cdecl convention for the standard virt proc
-fn typedef -- save the pointer to data and pointer to virt proc struc into
-correct places in the frame
-   __cdecl says, according to:
-http://unixwiz.net/techtips/win32-callconv-asm.html
-To do this:
-push the parameters onto the stack, right most first, working backwards to
- the left.
-Then perform call instr, which pushes return addr onto stack.
-Then callee first pushes the frame pointer, %EBP followed by placing the
-then-current value of stack pointer into %EBP
-push ebp
-mov  ebp, esp    // ebp « esp
-
-Once %ebp has been changed, it can now refer directly to the function's
- arguments as 8(%ebp), 12(%ebp). Note that 0(%ebp) is the old base pointer
- and 4(%ebp) is the old instruction pointer.
-
-Then callee pushes regs it will use then adds to stack pointer the size of
- its local vars.
-
-Stack in callee looks like this:
-16(%ebp)	 - third function parameter
-12(%ebp)	 - second function parameter
-8(%ebp)	 - first function parameter
-4(%ebp)	 - old %EIP (the function's "return address")
-----------^^ State seen at first instr of callee ^^-----------
-0(%ebp)	- old %EBP (previous function's base pointer)
--4(%ebp)	 - save of EAX, the only reg used in function
--8(%ebp)	 - first local variable
--12(%ebp)	 - second local variable
--16(%ebp)	 - third local variable
-
-
---3
-It might be just as simple as two mov instrs, one for %ESP, one for %EBP..
- the stack and frame pointer regs
+
+From e-mail to Albert, on design of app-virt-procr to core-loop animation
+switch and back.
+
+====================
+General warnings about this code:
+It only compiles in GCC 4.x  (label addr and computed goto)
+Has assembly for x86  32bit
+
+
+====================
+AVProcr data-struc has: stack-ptr, jump-ptr, data-ptr, slotNum, coreloop-ptr
+ and semantic-custom-ptr
+
+The VMS Creator: takes ptr to function and ptr to initial data
+-- creates a new AVProcr struc
+-- sets the jmp-ptr field to the ptr-to-function passed in
+-- sets the data-ptr to ptr to initial data passed in
+-- if this is for a suspendable virt  processor, then create a stack and set
+   the stack-ptr
+
+VMS__create_procr( AVProcrFnPtr fnPtr, void *initialData )
+{
+AVProcr   newPr = malloc( sizeof(AVProcr) );
+newPr->jmpPtr = fnPtr;
+newPr->coreLoopDonePt = &CoreLoopDonePt; //label is in coreLoop
+newPr->data = initialData;
+newPr->stackPtr = createNewStack();
+return newPr;
+}
+
+The semantic layer can then add its own state in the custom-ptr field
+
+The Scheduler plug-in:
+-- Sets slave-ptr in AVProcr, and points the slave to AVProcr
+-- if non-suspendable, sets the AVProcr's stack-ptr to the slave's stack-ptr
+
+MasterLoop:
+-- puts AVProcr structures onto the workQ
+
+CoreLoop:
+-- gets stack-ptr out of AVProcr and sets the core's stack-ptr to that
+-- gets data-ptr out of AVProcr and puts it into reg GCC uses for that param
+-- puts AVProcr's addr into reg GCC uses for the AVProcr-pointer param
+-- jumps to the addr in AVProcr's jmp-ptr field
+CoreLoop()
+{ while( FOREVER )
+ { nextPr = readQ( workQ );  //workQ is static (global) var declared volatile
+   <dataPtr-param-register>       = nextPr->data;
+   <AVProcrPtr-param-register> = nextPr;
+   <stack-pointer register>          = nextPr->stackPtr;
+   jmp nextPr->jmpPtr;
+CoreLoopDonePt:   //label's addr put into AVProcr when create new one
+ }
+}
+(Note, for suspendable processors coming back from suspension, there is no
+ need to fill the parameter registers -- they will be discarded)
+
+Suspend an application-level virtual processor:
+VMS__AVPSuspend( AVProcr *pr )
+{
+pr->jmpPtr = &ResumePt;  //label defined a few lines below
+pr->slave->doneFlag = TRUE;
+pr->stackPtr = <current SP reg value>;
+jmp pr->coreLoopDonePt;
+ResumePt: return;
+}
+
+This works because the core loop will have switched back to this stack
+ before jumping to ResumePt..    also, the core loop never modifies the
+ stack pointer, it simply switches to whatever stack pointer is in the
+ next AVProcr it gets off the workQ.
+
+
+
+=============================================================================
+As it is now, there's only one major unknown about GCC (first thing below
+  the line),  and there are a few restrictions, the most intrusive being
+  that the functions the application gives to the semantic layer have a
+  pre-defined prototype -- return nothing, take a pointer to initial data
+  and a pointer to an AVProcr struc, which they're not allowed to modify
+  -- only pass it to semantic-lib calls.
+
+So, here are the assumptions, restrictions, and so forth:
+===========================
+Major assumption:  that GCC will do the following the same way every time:
+  say the application defines a function that fits this typedef:
+typedef void (*AVProcrFnPtr)  ( void *, AVProcr * );
+
+and let's say somewhere in the code they do this:
+AVProcrFnPtr   fnPtr = &someFunc;
+
+then they do this:
+(*fnPtr)( dataPtr, animatingVirtProcrPtr );
+
+Can the registers that GCC uses to pass the two pointers be predicted?
+ Will they always be the same registers, in every program that has the
+ same typedef?
+If that typedef fixes, guaranteed, the registers (on x86) that GCC will use
+ to send the two pointers, then the rest of this solution works.
+
+Change in model: Instead of a virtual processor whose execution trace is
+ divided into work-units, replacing that with the pattern that a virtual
+ processor is suspended.  Which means, no more "work unit" data structure
+ -- instead, it's now an "Application Virtual Processor" structure
+ -- AVProcr -- which is given directly to the application function!
+
+   -- You were right, don't need slaves to be virtual processors, only need
+      "scheduling buckets" -- just a way to keep track of things..
+
+Restrictions:
+-- the  "virtual entities"  created by the semantic layer must be virtual
+   processors, created with a function-to-execute and initial data -- the
+   function is restricted to return nothing and only take a pointer to the
+   initial data plus a pointer to an AVProcr structure, which represents
+   "self", the virtual processor created.  (This is the interface I showed
+   you for "Hello World" semantic layer).
+What this means for synchronous dataflow, is that the nodes in the graph
+  are virtual processors that in turn spawn a new virtual processor for
+  every "firing" of the node.  This should be fine because the function
+  that the node itself is created with is a "canned" function that is part
+  of the semantic layer -- the function that is spawned is the user-provided
+  function.  The restriction only means that the values from the inputs to
+  the node are packaged as the "initial data" given to the spawned virtual
+  processor -- so the user-function has to cast a void * to the
+  semantic-layer-defined structure by which it gets the inputs to the node.
+
+-- Second restriction is that the semantic layer has to use VMS supplied
+   stuff -- for example, the data structure that represents the
+   application-level virtual processor is defined in VMS, and the semantic
+   layer has to call a VMS function in order to suspend a virtual processor.
+
+-- Third restriction is that the application code never do anything with
+   the AVProcr structure except pass it to semantic-layer lib calls.
+
+-- Fourth restriction is that every virtual processor must call a
+   "dissipate" function as its last act -- the user-supplied
+   virtual-processor function can't just end -- it has to call
+   SemLib__dissipate( AVProcr ) before the closing brace.. and after the
+   semantic layer is done cleaning up its own data, it has to in turn call
+   VMS__dissipate( AVProcr ).
+
+-- For performance reasons, I think I want to have two different kinds of
+   app-virtual processor -- suspendable ones and non-suspendable -- where
+   non-suspendable are not allowed to perform any communication with other
+   virtual processors, except at birth and death.  Suspendable ones, of
+   course can perform communications, create other processors, and so forth
+   -- all of which cause it to suspend.
+The performance difference is that I need a separate stack for each
+  suspendable, but non-suspendable can re-use a fixed number of stacks
+  (one for each slave).
+
+
+==================== May 29
+
+Qs:
+--1 how to safely jump between virt processor's trace and coreloop
+--2 how to set up __cdecl style stack + frame for just-born virtual processor
+--3 how to switch stack-pointers + frame-pointers
+
+
+--1:
+Not sure if GCC's computed goto is safe, because modify the stack pointer
+without GCC's knowledge -- although, don't use the stack in the coreloop
+segment, so, actually, that should be safe!
+
+So, GCC has its own special C extensions, one of which gets address of label:
+
+void *labelAddr;
+labelAddr = &&label;
+goto *labelAddr;
+
+--2
+In CoreLoop, will check whether VirtProc just born, or was suspended.
+If just born, do bit of code that sets up the virtual processor's stack
+and frame according to the __cdecl convention for the standard virt proc
+fn typedef -- save the pointer to data and pointer to virt proc struc into
+correct places in the frame
+   __cdecl says, according to:
+http://unixwiz.net/techtips/win32-callconv-asm.html
+To do this:
+push the parameters onto the stack, right most first, working backwards to
+ the left.
+Then perform call instr, which pushes return addr onto stack.
+Then callee first pushes the frame pointer, %EBP followed by placing the
+then-current value of stack pointer into %EBP
+push ebp
+mov  ebp, esp    // ebp « esp
+
+Once %ebp has been changed, it can now refer directly to the function's
+ arguments as 8(%ebp), 12(%ebp). Note that 0(%ebp) is the old base pointer
+ and 4(%ebp) is the old instruction pointer.
+
+Then callee pushes regs it will use then adds to stack pointer the size of
+ its local vars.
+
+Stack in callee looks like this:
+16(%ebp)	 - third function parameter
+12(%ebp)	 - second function parameter
+8(%ebp)	 - first function parameter
+4(%ebp)	 - old %EIP (the function's "return address")
+----------^^ State seen at first instr of callee ^^-----------
+0(%ebp)	- old %EBP (previous function's base pointer)
+-4(%ebp)	 - save of EAX, the only reg used in function
+-8(%ebp)	 - first local variable
+-12(%ebp)	 - second local variable
+-16(%ebp)	 - third local variable
+
+
+--3
+It might be just as simple as two mov instrs, one for %ESP, one for %EBP..
+ the stack and frame pointer regs
diff -r b549ad140f18 -r e4de34fd220b SSR.h
--- a/SSR.h	Thu Nov 11 04:59:48 2010 -0800
+++ b/SSR.h	Thu Sep 22 12:15:04 2011 +0200
@@ -27,6 +27,25 @@
 /*Semantic layer-specific data sent inside a request from lib called in app
  * to request handler called in MasterLoop
  */
+
+typedef struct
+ {
+   VirtProcr      *VPCurrentlyExecuting; //presumably VP now inside the trans -- confirm
+   PrivQueueStruc *waitingVPQ;           //presumably VPs waiting to enter -- confirm
+ }
+SSRTrans;
+
+/*WARNING: assembly hard-codes position of endInstrAddr as first field
+ */
+typedef struct
+ {
+   void           *endInstrAddr;   //must stay at offset 0 -- see WARNING above
+   int32           hasBeenStarted; //set TRUE by the first start request handled
+   int32           hasFinished;    //set TRUE by the end-singleton handler
+   PrivQueueStruc *waitQ;          //VPs that requested start while in progress
+ }
+SSRSingleton;
+
 enum SSRReqType
  {
    send_type = 1,
@@ -38,7 +57,10 @@
    transfer_out,
    malloc_req,
    free_req,
-   singleton,
+   singleton_fn_start,
+   singleton_fn_end,
+   singleton_data_start,
+   singleton_data_end,
    atomic,
    trans_start,
    trans_end
@@ -60,7 +82,7 @@
    void              *ptrToFree;
 
    int32              singletonID;
-   void              *endJumpPt;
+   SSRSingleton     **singletonPtrAddr;
 
    PtrToAtomicFn      fnToExecInMaster;
    void              *dataForFn;
@@ -72,13 +94,6 @@
 
 typedef struct
  {
-   VirtProcr      *VPCurrentlyExecuting;
-   PrivQueueStruc *waitingVPQ;
- }
-SSRTrans;
-
-typedef struct
- {
    PrivQueueStruc **readyVPQs;
    HashTable       *commHashTbl;
    int32            numVirtPr;
@@ -86,7 +101,7 @@
    int32            primitiveStartTime;
 
                        //fix limit on num with dynArray
-   int32            singletonHasBeenExecutedFlags[NUM_STRUCS_IN_SEM_ENV];
+   SSRSingleton     fnSingletons[NUM_STRUCS_IN_SEM_ENV];
    SSRTrans         transactionStrucs[NUM_STRUCS_IN_SEM_ENV];
  }
 SSRSemEnv;
@@ -115,10 +130,10 @@
 int32
 SSR__giveMinWorkUnitCycles( float32 percentOverhead );
 
-void inline
+void
 SSR__start_primitive();
 
-int32 inline
+int32
 SSR__end_primitive_and_give_cycles();
 
 int32
@@ -137,11 +152,11 @@
 
 //=======================
 
-inline VirtProcr *
+  VirtProcr *
 SSR__create_procr_with( VirtProcrFnPtr fnPtr, void *initData,
                           VirtProcr *creatingPr );
 
-inline VirtProcr *
+  VirtProcr *
 SSR__create_procr_with_affinity( VirtProcrFnPtr fnPtr,    void *initData,
                             VirtProcr *creatingPr, int32 coreToScheduleOnto);
 
@@ -187,8 +202,16 @@
 
 //======================= Concurrency Stuff ======================
 void
-SSR__start_singleton( int32 singletonID, void *endSingletonLabelAddr,
-                      VirtProcr *animPr );
+SSR__start_fn_singleton( int32 singletonID, VirtProcr *animPr );
+
+void  //fn singletons are named by an int ID
+SSR__end_fn_singleton( int32 singletonID, VirtProcr *animPr );
+
+void  //data singletons are named by the address of a singleton pointer
+SSR__start_data_singleton( SSRSingleton **singletonAddr, VirtProcr *animPr );
+
+void
+SSR__end_data_singleton( SSRSingleton **singletonAddr, VirtProcr *animPr );
 
 void
 SSR__animate_short_fn_in_isolation( PtrToAtomicFn ptrToFnToExecInMaster,
@@ -208,6 +231,9 @@
 VirtProcr *
 SSR__schedule_virt_procr( void *_semEnv, int coreNum );
 
+VirtProcr*   //defined in SSR_PluginFns.c
+SSR__create_procr_helper( VirtProcrFnPtr fnPtr, void *initData,
+                          SSRSemEnv *semEnv,    int32 coreToScheduleOnto );
 
 #endif	/* _SSR_H */
 
diff -r b549ad140f18 -r e4de34fd220b SSR.s
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SSR.s	Thu Sep 22 12:15:04 2011 +0200
@@ -0,0 +1,21 @@
+
+//Assembly code copies the return addr found on the stack and saves it
+// into the singleton.  The first field in the singleton is the
+// "endInstrAddr" field, and the return addr is at 0x8(%rbp)
+.globl asm_save_ret_to_singleton
+asm_save_ret_to_singleton:
+    movq 0x8(%rbp),     %rax   #get ret address, rbp is the same as in the calling function
+    movq     %rax,     (%rdi) #write ret addr to endInstrAddr field (offset 0 of struct at %rdi)
+    ret
+
+
+//Assembly code changes the return addr on the stack to the one
+// saved into the singleton by the end-singleton-fn
+//The stack's return addr is at 0x8(%rbp)
+.globl asm_write_ret_from_singleton
+asm_write_ret_from_singleton:
+    movq    (%rdi),    %rax  #get endInstrAddr field (offset 0 of struct at %rdi)
+    movq      %rax,    0x8(%rbp) #write return addr to the stack of the caller
+    ret
+
+
diff -r b549ad140f18 -r e4de34fd220b SSR_PluginFns.c
--- a/SSR_PluginFns.c	Thu Nov 11 04:59:48 2010 -0800
+++ b/SSR_PluginFns.c	Thu Sep 22 12:15:04 2011 +0200
@@ -12,10 +12,10 @@
 #include "SSR_Request_Handlers.h"
 
 //=========================== Local Fn Prototypes ===========================
-void inline
+void
 resume_procr( VirtProcr *procr, SSRSemEnv *semEnv );
 
-void inline
+void
 handleSemReq( VMSReqst *req, VirtProcr *requestingPr, SSRSemEnv *semEnv );
 
 void
@@ -33,6 +33,8 @@
  * to the slave -- return FALSE to let Master loop know scheduling that
  * slave failed.
  */
+char __Scheduler[] = "FIFO Scheduler"; //Global variable for name in saved histogram
+
 VirtProcr *
 SSR__schedule_virt_procr( void *_semEnv, int coreNum )
  { VirtProcr   *schedPr;
@@ -63,7 +65,7 @@
 SSR__Request_Handler( VirtProcr *requestingPr, void *_semEnv )
  { SSRSemEnv *semEnv;
    VMSReqst    *req;
- 
+   
    semEnv = (SSRSemEnv *)_semEnv;
 
    req    = VMS__take_next_request_out_of( requestingPr );
@@ -90,7 +92,7 @@
  }
 
 
-void inline
+void
 handleSemReq( VMSReqst *req, VirtProcr *reqPr, SSRSemEnv *semEnv )
  { SSRSemReq *semReq;
 
@@ -114,7 +116,13 @@
          break;
       case free_req:        handleFree(         semReq, reqPr, semEnv);
          break;
-      case singleton:       handleSingleton(    semReq, reqPr, semEnv);
+      case singleton_fn_start:  handleStartFnSingleton(semReq, reqPr, semEnv);
+         break;
+      case singleton_fn_end:    handleEndFnSingleton(  semReq, reqPr, semEnv);
+         break;
+      case singleton_data_start:handleStartDataSingleton(semReq,reqPr,semEnv);
+         break;
+      case singleton_data_end:  handleEndDataSingleton(semReq, reqPr, semEnv);
          break;
       case atomic:          handleAtomic(       semReq, reqPr, semEnv);
          break;
@@ -147,11 +155,11 @@
 
 /*Re-use this in the entry-point fn
  */
-inline VirtProcr *
+  VirtProcr *
 SSR__create_procr_helper( VirtProcrFnPtr fnPtr, void *initData,
                           SSRSemEnv *semEnv,    int32 coreToScheduleOnto )
  { VirtProcr    *newPr;
-   SSRSemData    semData;
+   SSRSemData   *semData;
 
       //This is running in master, so use internal version
    newPr = VMS__create_procr( fnPtr, initData );
@@ -205,7 +213,7 @@
 
 
 //=========================== Helper ==============================
-void inline
+void
 resume_procr( VirtProcr *procr, SSRSemEnv *semEnv )
  {
    writePrivQ( procr, semEnv->readyVPQs[ procr->coreAnimatedBy] );
diff -r b549ad140f18 -r e4de34fd220b SSR_Request_Handlers.c
--- a/SSR_Request_Handlers.c	Thu Nov 11 04:59:48 2010 -0800
+++ b/SSR_Request_Handlers.c	Thu Sep 22 12:15:04 2011 +0200
@@ -15,7 +15,7 @@
 
 
 //=========================== Local Fn Prototypes ===========================
-void inline
+void
 resume_procr( VirtProcr *procr, SSRSemEnv *semEnv );
 
 
@@ -25,7 +25,7 @@
 
 /*Only clone the elements of req used in these reqst handlers
  */
-inline SSRSemReq *
+  SSRSemReq *
 cloneReq( SSRSemReq *semReq )
  { SSRSemReq *clonedReq;
 
@@ -81,7 +81,7 @@
  * separate processors can send to the same receiver, and hashing on the
  * receive processor, so they will stack up.
  */
-void inline
+void
 handleSendType( SSRSemReq *semReq, SSRSemEnv *semEnv )
  { VirtProcr   *sendPr, *receivePr;
    int          key[] = {0,0,0};
@@ -150,7 +150,7 @@
 /*Looks like can make single handler for both sends..
  */
 //TODO: combine both send handlers into single handler
-void inline
+void
 handleSendFromTo( SSRSemReq *semReq, SSRSemEnv *semEnv)
  { VirtProcr   *sendPr, *receivePr;
    int          key[] = {0,0,0};
@@ -229,14 +229,14 @@
  * If ever add receive_any, looking like this second option easier and even
  * less costly.
  */
-void inline
+void
 handleReceiveAny( SSRSemReq *semReq, SSRSemEnv *semEnv)
  {
  
  }
 
 
-void inline
+void
 handleReceiveType( SSRSemReq *semReq, SSRSemEnv *semEnv)
  { VirtProcr   *sendPr, *receivePr;
    int          key[] = {0,0,0};
@@ -284,7 +284,7 @@
 
 /*
  */
-void inline
+void
 handleReceiveFromTo( SSRSemReq *semReq, SSRSemEnv *semEnv)
  { VirtProcr   *sendPr, *receivePr;
    int          key[] = {0,0,0};
@@ -332,13 +332,13 @@
 
 
 //===============================================
-void inline
+void
 handleTransferTo( SSRSemReq *semReq, SSRSemEnv *semEnv)
  {
 
  }
 
-void inline
+void
 handleTransferOut( SSRSemReq *semReq, SSRSemEnv *semEnv)
  {
 
@@ -347,7 +347,7 @@
 
 /*
  */
-void inline
+void
 handleMalloc( SSRSemReq *semReq, VirtProcr *requestingPr, SSRSemEnv *semEnv )
  { void *ptr;
 
@@ -358,7 +358,7 @@
 
 /*
  */
-void inline
+void
 handleFree( SSRSemReq *semReq, VirtProcr *requestingPr, SSRSemEnv *semEnv )
  {
    VMS__free( semReq->ptrToFree );
@@ -372,22 +372,105 @@
  * end-label.  Else, sets flag and resumes normally.
  */
 void inline
-handleSingleton( SSRSemReq *semReq, VirtProcr *requestingPr,
-                 SSRSemEnv *semEnv )
+handleStartSingleton_helper( SSRSingleton *singleton, VirtProcr *reqstingPr,
+                             SSRSemEnv    *semEnv )
  {
-   if( semEnv->singletonHasBeenExecutedFlags[ semReq->singletonID ] )
-      requestingPr->nextInstrPt = semReq->endJumpPt;
+   if( singleton->hasFinished )
+    {    //the code that sets the flag to true first sets the end instr addr
+      reqstingPr->dataRetFromReq = singleton->endInstrAddr; //hand back saved addr
+      resume_procr( reqstingPr, semEnv );
+      return;
+    }
+   else if( singleton->hasBeenStarted )
+    {    //singleton is in-progress in a diff slave, so wait for it to finish
+      writePrivQ(reqstingPr, singleton->waitQ ); //not resumed here; end handler drains waitQ
+      return;
+    }
    else
-      semEnv->singletonHasBeenExecutedFlags[ semReq->singletonID ] = TRUE;
+    {    //hasn't been started, so this is the first attempt at the singleton
+      singleton->hasBeenStarted = TRUE;
+      reqstingPr->dataRetFromReq = 0x0; //no end addr; presumably means "run the body" -- confirm
+      resume_procr( reqstingPr, semEnv );
+      return;
+    }
+ }
+void inline   //start handler for fn singletons, which are stored inside semEnv
+handleStartFnSingleton( SSRSemReq *semReq, VirtProcr *requestingPr,
+                      SSRSemEnv *semEnv )
+ { SSRSingleton *singleton;
+      //singletonID indexes semEnv->fnSingletons directly; no range check here
+   singleton = &(semEnv->fnSingletons[ semReq->singletonID ]);
+   handleStartSingleton_helper( singleton, requestingPr, semEnv );
+ }
+void inline   //start handler for data singletons, reached via a ptr in the request
+handleStartDataSingleton( SSRSemReq *semReq, VirtProcr *requestingPr,
+                      SSRSemEnv *semEnv )
+ { SSRSingleton *singleton;
+      //NULL at the given address means no singleton struc exists yet: make one
+   if( *(semReq->singletonPtrAddr) == NULL )
+    { singleton                 = VMS__malloc( sizeof(SSRSingleton) );
+      singleton->waitQ          = makeVMSPrivQ(); //NOTE(review): malloc result not checked
+      singleton->endInstrAddr   = 0x0;
+      singleton->hasBeenStarted = FALSE;
+      singleton->hasFinished    = FALSE;
+      *(semReq->singletonPtrAddr)  = singleton; //store so later requests reuse it
+    }
+   else
+      singleton = *(semReq->singletonPtrAddr);
+   handleStartSingleton_helper( singleton, requestingPr, semEnv );
+ }
+
+
+void inline   //shared end-of-singleton logic for both the fn and data variants
+handleEndSingleton_helper( SSRSingleton *singleton, VirtProcr *requestingPr,
+                           SSRSemEnv    *semEnv )
+ { PrivQueueStruc *waitQ;
+   int32           numWaiting, i;
+   VirtProcr      *resumingPr;
+      //NOTE: check below has an empty body for now, so a repeat end falls through
+   if( singleton->hasFinished )
+    { //by definition, only one slave should ever be able to run end singleton
+      // so if this is true, is an error
+      //VMS__throw_exception( "singleton code ran twice", requestingPr, NULL);
+    }
+      //mark done first, then resume every VP found in the singleton's waitQ
+   singleton->hasFinished = TRUE;
+   waitQ = singleton->waitQ;
+   numWaiting = numInPrivQ( waitQ );
+   for( i = 0; i < numWaiting; i++ )
+    {    //they will resume inside start singleton, then jmp to end singleton
+      resumingPr = readPrivQ( waitQ );
+      resumingPr->dataRetFromReq = singleton->endInstrAddr;
+      resume_procr( resumingPr, semEnv );
+    }
 
    resume_procr( requestingPr, semEnv );
- }
+
+}
+void inline   //end handler for fn singletons, which are stored inside semEnv
+handleEndFnSingleton( SSRSemReq *semReq, VirtProcr *requestingPr,
+                        SSRSemEnv *semEnv )
+ {
+   SSRSingleton   *singleton;
+      //singletonID indexes semEnv->fnSingletons directly; no range check here
+   singleton = &(semEnv->fnSingletons[ semReq->singletonID ]);
+   handleEndSingleton_helper( singleton, requestingPr, semEnv );
+  }
+void inline   //end handler for data singletons, reached via a ptr in the request
+handleEndDataSingleton( SSRSemReq *semReq, VirtProcr *requestingPr,
+                        SSRSemEnv *semEnv )
+ {
+   SSRSingleton   *singleton;
+      //NOTE(review): no NULL check on the loaded ptr -- assumes a start already ran
+   singleton = *(semReq->singletonPtrAddr);
+   handleEndSingleton_helper( singleton, requestingPr, semEnv );
+  }
 
 
 /*This executes the function in the masterVP, take the function
  * pointer out of the request and call it, then resume the VP.
  */
-void inline
+void
 handleAtomic( SSRSemReq *semReq, VirtProcr *requestingPr, SSRSemEnv *semEnv )
  {
    semReq->fnToExecInMaster( semReq->dataForFn );
@@ -408,7 +491,7 @@
  * end-transaction, which will take this VP from the queue and resume it.)
  *If NULL, then write requesting into the field and resume.
  */
-void inline
+void
 handleTransStart( SSRSemReq *semReq, VirtProcr *requestingPr,
                   SSRSemEnv *semEnv )
  { SSRSemData *semData;
@@ -459,7 +542,7 @@
  *If get somethine, set VP_currently_executing to the VP from the queue, then
  * resume both.
  */
-void inline
+void
 handleTransEnd(SSRSemReq *semReq, VirtProcr *requestingPr, SSRSemEnv *semEnv)
  { SSRSemData    *semData;
    VirtProcr     *waitingPr;
diff -r b549ad140f18 -r e4de34fd220b SSR_Request_Handlers.h
--- a/SSR_Request_Handlers.h	Thu Nov 11 04:59:48 2010 -0800
+++ b/SSR_Request_Handlers.h	Thu Sep 22 12:15:04 2011 +0200
@@ -14,34 +14,43 @@
 /*This header defines everything specific to the SSR semantic plug-in
  */
 
-void inline
+inline void
 handleSendType( SSRSemReq *semReq, SSRSemEnv *semEnv);
-void inline
+inline void
 handleSendFromTo( SSRSemReq *semReq, SSRSemEnv *semEnv);
-void inline
+inline void
 handleReceiveAny( SSRSemReq *semReq, SSRSemEnv *semEnv);
-void inline
+inline void
 handleReceiveType( SSRSemReq *semReq, SSRSemEnv *semEnv);
-void inline
+inline void
 handleReceiveFromTo( SSRSemReq *semReq, SSRSemEnv *semEnv);
-void inline
+inline void
 handleTransferTo( SSRSemReq *semReq, SSRSemEnv *semEnv);
-void inline
+inline void
 handleTransferOut( SSRSemReq *semReq, SSRSemEnv *semEnv);
-void inline
+inline void
 handleMalloc( SSRSemReq *semReq, VirtProcr *requestingPr, SSRSemEnv *semEnv);
-void inline
+inline void
 handleFree( SSRSemReq *semReq, VirtProcr *requestingPr, SSRSemEnv *semEnv );
-void inline
+inline void
 handleTransEnd(SSRSemReq *semReq, VirtProcr *requestingPr, SSRSemEnv*semEnv);
-void inline
+inline void
 handleTransStart( SSRSemReq *semReq, VirtProcr *requestingPr,
                   SSRSemEnv *semEnv );
-void inline
+inline void
 handleAtomic( SSRSemReq *semReq, VirtProcr *requestingPr, SSRSemEnv *semEnv);
-void inline
-handleSingleton( SSRSemReq *semReq, VirtProcr *requestingPr,
-                 SSRSemEnv *semEnv );
+inline void
+handleStartFnSingleton( SSRSemReq *semReq, VirtProcr *reqstingPr,
+                      SSRSemEnv *semEnv );
+inline void
+handleEndFnSingleton( SSRSemReq *semReq, VirtProcr *requestingPr,
+                    SSRSemEnv *semEnv );
+inline void
+handleStartDataSingleton( SSRSemReq *semReq, VirtProcr *reqstingPr,
+                      SSRSemEnv *semEnv );
+inline void
+handleEndDataSingleton( SSRSemReq *semReq, VirtProcr *requestingPr,
+                    SSRSemEnv *semEnv );
 
 #endif	/* _SSR_REQ_H */
 
diff -r b549ad140f18 -r e4de34fd220b SSR_lib.c
--- a/SSR_lib.c	Thu Nov 11 04:59:48 2010 -0800
+++ b/SSR_lib.c	Thu Sep 22 12:15:04 2011 +0200
@@ -12,6 +12,7 @@
 #include "SSR.h"
 #include "VMS/Queue_impl/PrivateQueue.h"
 #include "VMS/Hash_impl/PrivateHash.h"
+#include "SSR.h"
 
 
 //==========================================================================
@@ -118,7 +119,7 @@
  }
 
 
-int32 inline
+int32
 SSR__giveMinWorkUnitCycles( float32 percentOverhead )
  {
    return MIN_WORK_UNIT_CYCLES;
@@ -139,7 +140,7 @@
 /*For now, use TSC -- later, make these two macros with assembly that first
  * saves jump point, and second jumps back several times to get reliable time
  */
-void inline
+void
 SSR__start_primitive()
  { saveLowTimeStampCountInto( ((SSRSemEnv *)(_VMSMasterEnv->semanticEnv))->
                               primitiveStartTime );
@@ -150,7 +151,7 @@
  * because don't want comm time included in calc-time measurement -- and
  * also to throw out any "weird" values due to OS interrupt or TSC rollover
  */
-int32 inline
+int32
 SSR__end_primitive_and_give_cycles()
  { int32 endTime, startTime;
    //TODO: fix by repeating time-measurement
@@ -181,14 +182,17 @@
    SSR__init_Helper();
  }
 
+#ifdef SEQUENTIAL
 void
 SSR__init_Seq()
  {
    VMS__init_Seq();
+   flushRegisters();
       //masterEnv, a global var, now is partially set up by init_VMS
 
    SSR__init_Helper();
  }
+#endif
 
 void
 SSR__init_Helper()
@@ -213,7 +217,7 @@
 
    for( coreIdx = 0; coreIdx < NUM_CORES; coreIdx++ )
     {
-      readyVPQs[ coreIdx ] = makePrivQ();
+      readyVPQs[ coreIdx ] = makeVMSPrivQ();
     }
    
    semanticEnv->readyVPQs = readyVPQs;
@@ -228,8 +232,11 @@
    //semanticEnv->transactionStrucs = makeDynArrayInfo( );
    for( i = 0; i < NUM_STRUCS_IN_SEM_ENV; i++ )
     {
-      semanticEnv->singletonHasBeenExecutedFlags[i] = FALSE;
-      semanticEnv->transactionStrucs[i].waitingVPQ = makePrivQ();
+      semanticEnv->fnSingletons[i].endInstrAddr      = NULL;
+      semanticEnv->fnSingletons[i].hasBeenStarted    = FALSE;
+      semanticEnv->fnSingletons[i].hasFinished       = FALSE;
+      semanticEnv->fnSingletons[i].waitQ             = makeVMSPrivQ();
+      semanticEnv->transactionStrucs[i].waitingVPQ   = makeVMSPrivQ();
     }
  }
 
@@ -263,7 +270,7 @@
 
 /*
  */
-inline VirtProcr *
+  VirtProcr *
 SSR__create_procr_with( VirtProcrFnPtr fnPtr,   void *initData,
                         VirtProcr *creatingPr )
  { SSRSemReq reqData;
@@ -282,10 +289,10 @@
    return creatingPr->dataRetFromReq;
  }
 
-inline VirtProcr *
+  VirtProcr *
 SSR__create_procr_with_affinity( VirtProcrFnPtr fnPtr, void *initData,
                         VirtProcr *creatingPr,  int32  coreToScheduleOnto )
- { SSRSemReq reqData;
+ { SSRSemReq  reqData;
 
       //the semantic request data is on the stack and disappears when this
       // call returns -- it's guaranteed to remain in the VP's stack for as
@@ -297,10 +304,12 @@
    reqData.sendPr             = creatingPr;
 
    VMS__send_create_procr_req( &reqData, creatingPr );
+
+   return creatingPr->dataRetFromReq;
  }
 
 
-inline void
+  void
 SSR__dissipate_procr( VirtProcr *procrToDissipate )
  {
    VMS__send_dissipate_req( procrToDissipate );
@@ -475,20 +484,111 @@
 
 
 //===========================================================================
+//
+/*A function singleton is a function whose body executes exactly once, on a
+ * single core, no matter how many times the function is called and no
+ * matter how many cores or the timing of cores calling it.
+ *
+ *A data singleton is a ticket attached to data.  That ticket can be used
+ * to get the data through the function exactly once, no matter how many
+ * times the data is given to the function, and no matter the timing of
+ * trying to get the data through from different cores.
+ */
 
-/*Uses ID as index into array of flags.  If flag already set, resumes from
- * end-label.  Else, sets flag and resumes normally.
+/*asm function declarations*/
+void asm_save_ret_to_singleton(SSRSingleton *singletonPtrAddr);
+void asm_write_ret_from_singleton(SSRSingleton *singletonPtrAddr);
+
+/*Fn singleton uses ID as index into array of singleton structs held in the
+ * semantic environment.
  */
 void
-SSR__start_singleton( int32 singletonID, void *endSingletonLabelAddr,
-                      VirtProcr *animPr )
+SSR__start_fn_singleton( int32 singletonID,   VirtProcr *animPr )
  {
    SSRSemReq  reqData;
 
       //
-   reqData.reqType     = singleton;
+   reqData.reqType     = singleton_fn_start;
    reqData.singletonID = singletonID;
-   reqData.endJumpPt   = endSingletonLabelAddr;
+
+   VMS__send_sem_request( &reqData, animPr );
+   if( animPr->dataRetFromReq ) //will be 0 or addr of label in end singleton
+    {
+       SSRSemEnv *semEnv = VMS__give_sem_env_for( animPr );
+       asm_write_ret_from_singleton(&(semEnv->fnSingletons[ singletonID]));
+    }
+ }
+
+/*Data singleton hands addr of loc holding a pointer to a singleton struct.
+ * The start_data_singleton makes the structure and puts its addr into the
+ * location.
+ */
+void
+SSR__start_data_singleton( SSRSingleton **singletonAddr,  VirtProcr *animPr )
+ {
+   SSRSemReq  reqData;
+
+   if( *singletonAddr && (*singletonAddr)->hasFinished )
+       goto JmpToEndSingleton;
+   
+   reqData.reqType          = singleton_data_start;
+   reqData.singletonPtrAddr = singletonAddr;
+
+   VMS__send_sem_request( &reqData, animPr );
+   if( animPr->dataRetFromReq ) //either 0 or end singleton's return addr
+    {    //Assembly code changes the return addr on the stack to the one
+         // saved into the singleton by the end-singleton-fn
+         //The return addr is at 0x4(%%ebp)
+        JmpToEndSingleton:
+          asm_write_ret_from_singleton(*singletonAddr);
+    }
+   //now, simply return
+   //will exit either from the start singleton call or the end-singleton call
+ }
+
+/*Uses ID as index into array of flags.  If flag already set, resumes from
+ * end-label.  Else, sets flag and resumes normally.
+ *
+ *Note, this call cannot be inlined because the instr addr at the label
+ * inside is shared by all invocations of a given singleton ID.
+ */
+void
+SSR__end_fn_singleton( int32 singletonID, VirtProcr *animPr )
+ {
+   SSRSemReq  reqData;
+
+      //don't need this addr until after at least one singleton has reached
+      // this function
+   SSRSemEnv *semEnv = VMS__give_sem_env_for( animPr );
+   asm_write_ret_from_singleton(&(semEnv->fnSingletons[ singletonID]));
+
+   reqData.reqType     = singleton_fn_end;
+   reqData.singletonID = singletonID;
+
+   VMS__send_sem_request( &reqData, animPr );
+
+EndSingletonInstrAddr:
+   return;
+ }
+
+void
+SSR__end_data_singleton(  SSRSingleton **singletonPtrAddr, VirtProcr *animPr )
+ {
+   SSRSemReq  reqData;
+
+      //don't need this addr until after singleton struct has reached
+      // this function for first time
+      //do assembly that saves the return addr of this fn call into the
+      // data singleton -- that data-singleton can only be given to exactly
+      // one instance in the code of this function.  However, can use this
+      // function in different places for different data-singletons.
+//   (*(singletonAddr))->endInstrAddr =  &&EndDataSingletonInstrAddr;
+
+
+   asm_save_ret_to_singleton(*singletonPtrAddr);
+
+   reqData.reqType          = singleton_data_end;
+   reqData.singletonPtrAddr = singletonPtrAddr;
 
    VMS__send_sem_request( &reqData, animPr );
  }