# HG changeset patch
# User Me@portablequad
# Date 1320710581 28800
# Node ID 8ea476474093d250e710c700e568d91d7e7f9ac7

Initial add -- gobbeldegook

diff -r 000000000000 -r 8ea476474093 .hgignore
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgignore	Mon Nov 07 16:03:01 2011 -0800
@@ -0,0 +1,6 @@
+syntax: glob
+
+nbproject
+build
+dist
+*.o
\ No newline at end of file
diff -r 000000000000 -r 8ea476474093 src/Application/HWSim__Hello_World_HW/Communicator_TimeLine.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Application/HWSim__Hello_World_HW/Communicator_TimeLine.c	Mon Nov 07 16:03:01 2011 -0800
@@ -0,0 +1,297 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ */
+
+#include "HWSim_TeraFlux.h"
+
+//=====================
+
+//===========================================================================
+/*This is the collection of spans for the Communication Processor TimeLine
+ *
+ *This TimeLine does acquire-start, acquire-send, release, acquire-receive,
+ * all triggered by communications coming from other TimeLines.
+ * 
+ *  Sometime after Feb 2011 it will also have the control-
+ * communications that Albert wants for Erbium (the broad-cast of counter
+ * updates to registered listener TNodes).  And, it will probably get some
+ * form of control communication for implementing a fast chip-wide VMS.
+ *
+ *
+ *
+ *The kinds of span the TimeLine can have are:
+-] acquire-start -- is triggered by the reception of a "start-acquire"
+   communication from the CPU TimeLine. It generates an "acquire send"
+   communication to the TNode that currently owns the data.
+-] acquire-send -- when triggered, checks if the Frame is free.  If yes,
+   sends an "acquire-receive" communication back to the requester.  If not
+   free, places data representing the acquire-request into a queue of waiting
+   ones in the Communicator TimeLine.  When the Communicator receives a
+   "release" communication from the CPU TimeLine, it runs the release span.
+-] release -- takes the data of the next request waiting for the released
+   Frame out of the queue and then sends an "acquire-receive" communication
+   to the TNode requesting.
+-] acquire-receive -- when triggered, runs firmware, which writes memory
+   shared with the CPU, modifying data-structures.  This firmware for
+   TeraFlux works with the System Code that runs on the CPU TimeLine to
+   notify it that the acquire is complete.
+ *
+ *
+ *Background on acquire:
+ *The request handler invokes acquire by placing the calling VP into
+ * a holding list, then executing the hardware instruction that starts the
+ * acquire.
+ *This instruction sends a communication from the CPU TimeLine to the
+ * Communicator TimeLine.  The instruction passes a pointer to the
+ * list element, and also passes a pointer to the queue of
+ * ready VPs.  When acquire comm is done, the Communicator takes the list
+ * element out and recycles it, and places the pointer to the VP into the
+ * queue of ready VPs.
+ * 
+ *In the actual hardware, each node has an ultra-simple communication
+ * processor -- like an 8 bit control data-path and a physical-addr-width
+ * addr-data-path (simple -- only power-of-2 shift, add, and maybe mask).
+ *The acquire-instruction parameters are placed into a data
+ * structure and only the pointer to them is in a register.  The CPU performs
+ * a write of the ptr to a particular physical addr, which the comm hardware
+ * catches and queues.  The comm-processor is driven by the queue -- stalls
+ * when queue is empty -- returns after finishes to get next from queue.
+ *Firmware in the comm-processor then fetches the params out of the data
+ * structure and starts the communication.  The comm involves the
+ * BIU, which is asked to grant acquire on the frame. BIU sends back
+ * the node that currently has the frame, or else main-mem physical address
+ * range.
+ *When comm is complete, the comm processor performs list-element removal,
+ * recycling, and pointer movement.
+ *End Background on acquire
+ *
+ *So, the HWInstr_acquire() is the hardware instruction called by the
+ * GuestSystemCode. This instruction's job is to perform the communication
+ * protocol that gets data from wherever it is and brings it onto the TNode
+ * executing this instr.
+ *In the simulator, this happens in a separate time-line than the CPU, which
+ * is animating another virtual TNode while the communication happens.
+ *There may be several acquires started while one is in progress.  But not
+ * going to model queueing of them.  This time-line treats them as
+ * zero width.  They each start a new span, but the span ends and reports
+ * its end-time as the same as its start time, then starts the idle-span.
+ *
+ *HWInstr_acquire:   CPU TimeLine
+ * when executes on the CPU TimeLine, sends a communication to the
+ * Communicator TimeLine, which equals inserting an acquire-start span into
+ * the consistent-time trigger-priority-queue that's driven by the TNode's
+ * ConsistentTime.  It will stop ConsistentTime advancement at its target
+ * arrival time, ending the Idle-span and starting the acquire-start span.
+ *
+ *Acquire-start:  requesting TNode's Communicator TimeLine
+ * this span looks inside the data-struct to find the Frame start addr.  It
+ * looks this up in a hash table to see which TNode owns the Frame.  If none,
+ * means hasn't been allocated yet, so puts acquire-info into the wait-queue
+ * that's in the hash entry.
+ *
+ *Acquire-send: Dwelling TNode's Communicator TimeLine
+ * When advancement of ConsistentTime stopped by the acquire-send reception,
+ * check to see
+ * if the desired Frame is still owned (hash table keyed by Frame start
+ * addr).  If desired Frame is owned, places the data of the request into
+ * the queue of waiting acquires that's in the hash-entry.
+ * If no longer owned, then set new Dwelling TNode to be requester and
+ * place Acquire-receive into waiting-comm queue of requesting TNode
+ * TimeDomain.
+ *
+ *Release: CPUTimeLine of Owning TNode
+ * put Release into Communicator TimeLine, with the Frame start addr in it
+ *
+ *Release: Communicator TimeLine of Owning TNode
+ * look up the hash entry for released Frame, set to Not Owned.  If waitQ
+ * in the Frame's hash entry is not empty, take next entry, set as Owner of
+ * the Frame, and as new Dwelling TNode, then place Acquire-receive into
+ * waiting-comm queue of requesting TNode-TimeDomain.
+ *
+ *Acquire-receive: Communicator TimeLine of requesting TNode
+ * When advancement of ConsistentTime stopped by the acquire-receive
+ * reception, run the Firmware that sets the shared state, so that the CPU
+ * TimeLine will see the acquire is complete -- also do the data-structure
+ * rearrangement stuff.  Finally, do the thing of checking if all VTNodes
+ * are suspended waiting for acquires -- if so, the receive will have to be
+ * pro-active in restarting the CPU TimeLine, kicking it out of the
+ * LightSleep span. THIS SPECIAL SPAN IS KEY BENEFIT OF HW SUPPORT FOR VMS --
+ * can put hardware into light sleep mode when nothing to do -- power/energy
+ * saver.
+ * 
+ *
+ *So, a time-line is the virtual processor itself.  It animates the
+ * "current span" as the top-level function.  When that span executes
+ * span-end, the next span's function replaces the old one as top-level function.
+ * The stack is also reset, and the parameters for the new span are placed
+ * on the stack, and the pointer to the span's function is set as the resume-
+ * ptr.  When the time-line VP is resumed, it's equivalent to that span's
+ * function being called.
+ *The time-line VP has app-specific data that says whether the current-span
+ * is the Idle span, and the simulated-time of the last span-end.
+ *
+ *So, rather than having a single top-level function, a timeLine VP has many,
+ * a different TLF for each kind of span.
+ *
+ *This file holds all the spans == TLFs for the comm-procr TimeLine.
+ */
+
+
+//===========================================================================
+/*
+ *Acquire-start:  requesting TNode's Communicator TimeLine
+ * this span looks inside the data-struct to find the Frame start addr.  It
+ * looks this up in a hash table to see which TNode owns the Frame.  If none,
+ * means hasn't been allocated yet, so puts acquire-info into the wait-queue
+ * that's in the hash entry.
+ *Q: want to put global data-structs into HWSim, with some protection
+ *  mechanism (like the transactions have already implemented), or want to
+ *  make general request-handler extension thingie?  Request-handler
+ *  extensions would be done by registering a handler function during
+ *  architecture-definition code.
+ *A: transactions awkward..  make all state be either contained in the
+ *  TimeLine, or else global.  If in TimeLine, has to be allocated during
+ *  architecture definition, and if global, has to be declared, and a
+ *  pointer to the functions that access it, along with an ID for that kind
+ *  of access is registered.  Then, in the span-code, call
+ *  HWSim__access_global_var( params, ACCESSID )  the ACCESSID determines
+ *  which function-pointer is called, and the function code determines
+ *  which global var is accessed, and the params hold all the data the
+ *  function needs to do whatever is to be done.
+ */
+void
+commSpan_acquire_start( void *_params, VirtProcr *animTimeLine )
+ { 
+   AcquireParams  *params;
+      //NOTE(review): residingTNode, targetTL, duration lack local declarations
+   params    = (AcquireParams *)_params;
+
+         DEBUG( dbgAppFlow, "acquire_start\n", cloneAcquireParams( params ));
+
+      //invoke global-var-accessor to get the TNode that owns the Frame
+   residingTNode =
+      HWSim__access_global_var( params, GET_OWNING_TNODE, animTimeLine );
+      //NOTE(review): result is dereferenced unchecked; "no owner" case unhandled?
+      //send a communication to that TNode's Communicator
+   targetTL = residingTNode->communicatorTL;
+
+      //params are: amount of simulated time the communication takes,
+      // the TimeLine receiving, the span-function to run when consistent-
+      // time reaches the reception time, params for that span, and animTL
+   HWSim__send_comm( calcNetworkTime(), targetTL, targetTL->sendAcquireSpan,
+                     params, animTimeLine );
+
+   duration = 0; //starting an acquire modeled as taking zero time
+
+      //every span function ends with this call -- duration of this span,
+      // pointer to next span-function to run, params for it, and animTL
+   HWSim__transition_to_new_span( duration, IDLE_SPAN, NULL, animTimeLine );
+ }
+
+
+/*Acquire-send: Dwelling TNode's Communicator TimeLine
+ * Runs when advancement of ConsistentTime stopped by the acquire-send
+ * reception.  Check to see
+ * if the desired Frame is still owned (hash table keyed by Frame start
+ * addr).  If desired Frame is owned, place the data of the request into
+ * the queue of waiting acquires that's in the hash-entry.
+ * If already released, then set new Dwelling TNode to be requester and
+ * do an Acquire-receive back to the requesting TNode, which has the effect
+ * of placing an acquire-receive span to wait in its consistent-time-arrest
+ * queue.
+ *The release span will take waiting requests out of the waiting-acquires Q
+ */
+void
+commSpan_acquire_send( void *_params, VirtProcr *animTimeLine )
+ {
+   AcquireParams  *params;
+      //NOTE(review): notCurrentlyOwned and targetTL lack local declarations
+   params    = (AcquireParams *)_params;
+
+         DEBUG( dbgAppFlow, "acquire_send\n", cloneAcquireParams( params ));
+
+
+      //invoke global-var-accessor to lookup hash entry and see if Frame is
+      // still owned, and if so, add this acquire to queue of waiting ones.
+      // note, this is non-physical behavior -- any use of global vars is
+      // non-physical.  To make this function physical, implement a TimeLine
+      // that holds the hash table and all other TimeLines communicate to.
+      //Caveat there is that collisions can happen unless also impl protocol.
+      // So leave that for later improvement.
+   notCurrentlyOwned =
+      HWSim__access_global_var( params, DO_ACQUIRE_SEND, animTimeLine );
+
+   if( notCurrentlyOwned )
+    {
+         //send a communication to requesting TNode's Communicator
+      targetTL = params->requestingTNode->communicatorTL;
+         //NOTE(review): call below passes 6 args (COMM_INPORT) but lists 5 -- confirm
+         //params are: amount of simulated time the communication takes,
+         // the TimeLine receiving, the span-function to run when consistent-
+         // time reaches the reception time, params for that span, and animTL
+      HWSim__send_comm( calcNetworkTime(), targetTL, COMM_INPORT,
+                        &commSpan_acquire_receive, params, animTimeLine );
+    }
+      //NOTE(review): acquire_start passes a duration as first arg here -- confirm
+   HWSim__transition_to_new_span( IDLE_SPAN, NULL, animTimeLine );
+ }
+
+
+/*Acquire-receive: Communicator TimeLine of requesting TNode
+ * When advancement of ConsistentTime stopped by the acquire-receive
+ * reception, run the "Firmware" that sets the shared state, so that the CPU
+ * TimeLine will see the acquire is complete -- also do the data-structure
+ * rearrangement stuff.  Finally, do the thing of checking if all VTNodes
+ * are suspended waiting for acquires -- if so, the receive will have to be
+ * pro-active in restarting the CPU TimeLine, kicking it out of the
+ * LightSleep span. THIS SPECIAL SPAN IS KEY BENEFIT OF HW SUPPORT FOR VMS --
+ * can put hardware into light sleep mode when nothing to do -- power/energy
+ * saver.
+ */
+void
+commSpan_acquire_receive( void *_params, VirtProcr *animTimeLine )
+ {
+      //span params arrive untyped; view them as the acquire request
+   AcquireParams  *acquireReq = (AcquireParams *)_params;
+
+         DEBUG( dbgAppFlow, "acquire_receive\n", cloneAcquireParams(acquireReq));
+
+      //no firmware work is done in this body; the only action is the
+      // hand-off to the idle span
+   HWSim__transition_to_new_span( IDLE_SPAN, NULL, animTimeLine );
+ }
+
+
+/*Release: Communicator TimeLine of Owning TNode
+ * look up the hash entry for released Frame, set to Not Owned.  If waitQ
+ * in the Frame's hash entry is not empty, take next entry, set as Owner of
+ * the Frame, and as new Dwelling TNode, then place Acquire-receive into
+ * waiting-comm queue of requesting TNode-TimeDomain.
+ */
+void
+commSpan_release( void *_params, VirtProcr *animTimeLine )
+ {
+   AcquireParams  *params;
+
+   params    = (AcquireParams *)_params;
+      //trace under this span's own name so logs distinguish it from acquire_start
+         DEBUG( dbgAppFlow, "release\n", cloneAcquireParams( params ));
+
+      //no wait-queue handling is done in this body; it only goes to idle
+   HWSim__transition_to_new_span( IDLE_SPAN, NULL, animTimeLine );
+ }
+
+/*At reset only starts the Idle span in the communicator.
+ */
+void
+commSpan_at_reset( void *_params, VirtProcr *animTimeLine )
+ {
+      //_params is not read by this span
+         DEBUG( dbgAppFlow, "commSpan at_reset\n", NULL );
+      //the only action at reset: hand the TimeLine over to the idle span
+   HWSim__transition_to_new_span( IDLE_SPAN, NULL, animTimeLine );
+ }
diff -r 000000000000 -r 8ea476474093 src/Application/HWSim__Hello_World_HW/DESIGN_NOTES.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Application/HWSim__Hello_World_HW/DESIGN_NOTES.txt	Mon Nov 07 16:03:01 2011 -0800
@@ -0,0 +1,369 @@
+
+
+TeraFlux Hardware Model  (impl of this model in terms of HWSim is below)
+
+As of Feb 2011, the hardware being simulated is:
+
+A number of TNodes on a chip, which are connected by a network, with a
+ main-memory.
+
+The address space is divided into a local address-space and a shared
+ address-space.  Both address spaces are divided among the TNodes -- each
+ has its own unique range of local virtual addresses and own range of
+ global virtual addresses.  A given TNode may only allocate virtual
+ addresses within its own two ranges.  The contents of *local* addresses it
+ allocates may only ever be seen by the allocating TNode.  The contents of
+ shared addresses may be seen by any TNode after executing an acquire
+ hardware instruction and being granted the contents.
+
+Each TNode has a single CPU core, a local memory, and a
+ communication processor with its own network hardware.
+
+The local memory size is not modelled, so is considered unlimited.
+The network has undefined topology and is modelled as having constant latency
+ from any TNode to any other TNode, with a fixed BW between any two TNodes.
+
+The only modelled communication is movement of data, which can only be
+ triggered by the "acquire" hardware instruction.
+
+The behavior of acquire is defined as:
+-] Only one TNode at a time owns a given shared-memory "Frame", which has a
+   start address and a size.
+-] A given shared virtual address is in at most one Frame for the duration
+   of a program run (IE, no overlap of Frames, implying no change of size).
+-] The hardware mechanism by which single-ownership is enforced is not
+   modelled.  Control communication is considered infinitely fast.
+-] The simulated time between a given TNode's CPU executing the acquire
+   instruction and the simulated time the data of the Frame appears in the
+   local memory of that TNode is determined by both a queue of waiting
+   acquire requests and the network time required to move the data.
+
+The CPU in a TNode executes the standard x86 instruction set.
+
+The communication processor performs the acquire instruction and the 
+ release instruction.  The exact hardware mechanism by which these
+ instructions get from the instruction stream fetched by the CPU to the
+ communication processor is not defined.
+
+As an aside: for simulation, the acquire and release instructions are
+ implemented in the lightweight simulator as library-calls that trigger
+ the simulation infrastructure. In the application they are stated as
+ macro-calls.  These macros can then be defined in the tool-chain.
+When the COTSon simulator is targeted, the
+ macro is implemented as an in-line assembly custom op-code.  When the
+ lightweight simulator is targeted, the macro is implemented as a call to
+ a library, which invokes the acquire or release functionality in the
+ lightweight simulator.
+
+The effect of transferring data between TNode local memory and the chip's
+ main memory is not modelled.
+
+The portions of the hardware left undefined or not modelled may all be
+ filled-in in future versions, according to research needs.
+
+
+=============================================================================
+Implementing the hardware model in terms of HWSim
+
+HWSim is used to implement the TeraFlux hardware model for simulation.
+
+The implementation consists of four elements:
+1) A CPU TimeLine, which executes the Guest code
+2) A Communicator TimeLine, which performs all inter-node communication
+3) A TNode TimeDomain, which has one CPU TimeLine and one Communicator
+   TimeLine in it
+4) A Chip TimeDomain, which has a number of TNode TimeDomains in it.
+
+For the first version, there is no local memory element in a TNode, there is
+ no Main Memory node, and there is no address translation mechanism modelled.
+ These may be added later, and so might a TSU mechanism, and experimental
+ alternative memory models.
+
+The CPU TimeLine has as spans:
+-] at_reset -- standard span that runs when the hardware is reset.  Runs the
+   TeraFlux System Code bootstrap function.
+-] guestCode span -- runs whatever Guest code is pointed to.. each span of
+   this type is created with a pointer to code to run
+-] lightSleep -- implemented as the built-in IDLE span.  When no virtual
+    nodes are ready to animate, the CPU enters light sleep until the
+    Communicator wakes it up
+
+The CPU TimeLine generates two kinds of communication-spans:
+  acquire-start in Communicator
+  release in Communicator
+
+For now, these communications are considered to be performed by dedicated
+ hardware in the CPU, so they take exactly one simulated instruction, and
+ their simulated time is thus included in the measured time of the span.  The
+ HWSim__send_comm takes a time-stamp before suspending the TimeLine, then
+ another just after resume, and accumulates -- the end-span adds this in.
+
+  All TimeLines begin by running the at_reset span defined for that TimeLine.
+  The CPU's at_reset is hard-coded to start a GuestCode span that runs the
+  TeraFlux System Code's boot sequence.
+
+The Communicator TimeLine has a number of spans, all related to acquire and
+  release:
+-] acquire-start -- is triggered by the reception of a "start-acquire"
+   communication from the CPU TimeLine. It generates an "acquire send"
+   communication to the TNode that currently owns the data.
+-] acquire-send -- when triggered, checks if the Frame is free.  If yes,
+   sends an "acquire-receive" communication back to the requester.  If not
+   free, places data representing the acquire-request into a queue of waiting
+   ones in the Communicator TimeLine.  When the Communicator receives a
+   "release" communication from the CPU TimeLine, it runs the release span.
+-] release -- takes the data of the next request waiting for the released
+   Frame out of the queue and then sends an "acquire-receive" communication
+   to the TNode requesting.
+-] acquire-receive -- when triggered, runs firmware, which writes memory
+   shared with the CPU, modifying data-structures.  This firmware for
+   TeraFlux works with the System Code that runs on the CPU TimeLine to
+   notify it that the acquire is complete.
+
+Note, nothing checks whether a Frame's addresses are accessed from outside an
+ Acquire-Release block, which could be a source of difficult to find bugs in
+ the application.
+
+
+=============================================================================
+Earlier versions of notes:
+
+A time-line is a virtual-processor, and has a sequence of spans -- each
+ span performs one hardware-function, and has a start-time and an end-time
+ -- those are simulated-time, not physical time.
+
+Each time-line is created with a start-span that initializes it, then every
+ span ends with an "end span" sem-lib call.
+
+There are three kinds of span -- fixed-function spans, which represent
+ hard-wired hardware behavior, processing-core spans, which represent
+ processing elements that execute code, and communication-spans, which
+ cross time-lines.
+
+a fixed-function span has a fixed function-pointer that it is created with
+ and jumps to when the time-line is resumed.  Fixed-function spans also have
+ a pointer to a function that calculates the width of the span.  The
+ width-calculating function is defined in the application directory.
+
+a processing-core span has a function-pointer that is assigned to it by the
+ end-span call of the preceding span.  The width is also determined by a
+ pointer to a width-calculating function.  The width-calculating function
+ for these spans is also defined in the application directory (In first
+ teraflux impl, this function uses RDTSC to measure physical execution time,
+ and makes that the simulated execution time too -- but with a "BS" detector
+ that sees when the time is significantly larger than the previous
+ invocation of the same function-pointer).
+
+Communication spans are special because they cross time-lines.  So, a
+ communication span has zero width in the time-line it's created in, and
+ goes onto the queue as a new span in the target time-line (which also has
+ zero-width).  When the target span runs, it changes the state available to
+ the target time-line, to represent the reception of the communication.
+
+=============================
+Span-end is the only semantic-library call implemented.  Inside the
+ request-handler, it causes new spans to be created.
+
+So, have to have a separate receive time-line, that modifies hardware shared
+ with other time-lines.  The send span causes a receive-span to be inserted
+ into the target receive time-line.  
+
+Receive-spans are zero-width -- they update the hardware-state atomically,
+ so don't have to worry about conflicts between different receive spans in
+ the simulator.  The hardware-application that uses the simulator-library
+ must model the receive hardware and implement the send-hardware function
+ to work out any physical conflicts among receives targeted to the same
+ receive time-line.
+
+
+=============================
+
+Time-lines are specialized to specific hardware functions inside the
+ Application directory -- that's where the main creates all the time-lines,
+ and where the spans are implemented that have the behavior of a given type
+ of time-line.
+
+For example, if the hardware is a communication-unit, then span-types are
+ created that have the behavior that does all the setup of a communication
+ span and then does an end-span that creates as its follow-on the
+ communication-span.
+
+Communication spans are special because they cross time-lines.  So, a
+ communication span has zero width in the time-line it's created in, and
+ goes onto the queue in the target time-line, where it creates a new span
+ that also has zero-width.  The target span's function updates the hardware
+ state available to the target time-line, which may be shared with other
+ time-lines and that update may cause new spans to be spawned in those.
+
+if the hardware is a processing-core, then the function points to
+ Guest-application-code.  This function-pointer is what core_loop jumps to
+ when it reanimates the time-line virtual processor.
+
+================  Albert e-mail =================
+
+
+Hi Albert,
+
+   the simulator is a thing of beauty.  I'm getting goose bumps as I put the
+ last pieces of the design into place.  It's a sweet thing.
+
+Just in case you're curious, attached are my design notes <this file>.  The
+ thing that makes it nice is the clean decomposition and reusability -- the
+ core simulator only has three things: time-lines, spans, and a
+ priority-queue with an associated "certain-time" or "consistent-time".
+
+   A time-line is a VMS virtual-processor, which executes a sequence of
+ spans.  Each span has a function that represents the behavior performed
+ during that span, plus a function that calculates the simulated-time width
+ of that span.  The certain-time represents the advancement of global
+ simulated-time.  At all points in physical time during the simulation, it
+ is guaranteed that no spans are waiting to execute that have a
+ simulated-time older than the certain-time.
+   In other words, at any physical moment, there are lots of spans sitting
+ in queues waiting to run.  So, a given time-line finishes a span at a given
+ simulated-time point in global simulated-time.  But there may be spans from
+ other time-lines in the queue that finish at a preceding simulated-time.
+ But there can never be any waiting spans that finish before the
+ certain-time.  This is important for communications, which cross
+ time-lines.
+
+
+   Time-lines, spans, and certain-time are implemented in a semantic-library.
+
+This is, in essence a new parallel language for writing hardware-simulators
+ with.  (this just turns out to be the most natural and most simple way to
+ write the TeraFlux simulator)
+
+The behavior of particular hardware is defined as a simulator-application
+ that makes calls to that semantic-library.
+
+The main of that simulator-application creates the pieces of the hardware
+ -- for Teraflux, that means it creates the nodes, and the pieces inside
+ each node.
+
+Running this simulator-application equals turning on the power-switch of
+ the Guest hardware.
+
+The Guest application code is retrieved by the main of the
+ simulator-application, and starts running when the "power-switch" is
+ turned on (running the simulator-application causes the Guest-application
+ to be retrieved and start running on the hardware created by main).  This
+ is equivalent to the boot sequence of the BIOS, which happens at power-on.
+
+So, the end-effect is that full Linux is available to the Guest hardware as
+ a sort of "escape".  The Guest hardware can use the Host's disk access,
+ debugging, and everything else, but these usages are "outside" the
+ simulated time -- they are essentially magic-spells that the Guest
+ hardware can perform that take place outside of time as far as the Guest
+ application-code is concerned.  Hence, the main is able to use the Host
+ Linux to retrieve-from-disk the Guest application (but the disk-access
+ takes place outside the measurements reported by the simulator).
+
+Which maybe seems trivial, but I consider it a very cool trick  : )
+
+The part I like best is the fact that the simulator itself is such a simple
+ semantic-library, and the behavior of the hardware is written all as
+ sequential code.  This makes it easily customizable to any architecture
+ someone might want to investigate, and still run on parallel hardware  : D
+
+The only caveat is the thing I noted in earlier e-mails about communication
+ updates -- memory images and other hardware state  atomically  update at
+ the ends of spans.  So, Guest-code-execution that overlaps the
+ simulated-time at which reception happens, on a node, will not see that
+ reception until the end of the span.
+   For TeraFlux hardware, the only natural span-endings are acquire
+ instructions and release instructions.
+   What this means for you is that you may want to insert artificial
+ end-span calls into the Guest application code you generate (I'll provide
+ a "dummy" call).  Communication-receptions on a node will only become
+ visible to Guest application code after one of these dummy end-span calls.
+ So, the granularity of time in the simulation is related to the frequency
+ of end-span calls in the application code.  If you don't insert any
+ artificial ones, then acquire and release instructions will be the only
+ span-ending events, and will define the granularity of
+ communication-receptions being seen by Guest application code.
+   In particular, this means that no Guest application code is allowed to
+ monitor memory to see when it changes -- so, programming techniques that
+ would work on real hardware, to detect acquire-updates early won't work
+ in the simulator -- all Guest code to run on the simulator must use the
+ acquire-library call, which will suspend the virtual-node the code is
+ running on until the acquire is complete, then re-start that node at some
+ simulated-time after the acquire completes.
+   
+ 
+No idea when something will start running, but the design is getting quite
+ detailed, so progress is being made,
+
+Sean
+
+
+=============================================================================
+ Notes from before figured out how to do ConsistentTime for each TimeDomain
+ and have a priority-queue of waiting communication receptions for the
+ TimeDomain.
+
+This illustrates the tortured logic that would otherwise be needed.
+ *
+ * that acquire's release point in
+ * simulated time is either already known, or not yet known.  If already known,
+ * then recursively check if another TNode has already been granted acquire,
+ * until reach the end of the chain. If the end is already known without any
+ * other acquires, then set state that this TNode gets the acquire at the
+ * release-time of the last in the chain.  This acquire will have an unknown
+ * release time.
+ *If the release time of the end of the chain is unknown, then put the
+ * requesting acquire into a queue of acquires waiting for that Frame. When
+ * the release happens, it will check if any acquires are in the queue for
+ * the released Frame.  If so, it will do the same as is done when the end
+ * of a chain is known -- set the Frame's state to acquired, with release-
+ * time as the start-time of the new acquire-grant, and unknown end-time.
+ *
+ *When a HWInstr_release() is executed, it has a simulated-time at which the
+ * span starts -- the span will also have zero width and start the idle span
+ * when it ends, just like the acquire-start span, and the triggered
+ * spans.
+ *
+ *The release span will check the wait queue for the Frame it is releasing,
+ * and either mark the Frame as free, or else fire off the grant-acquire
+ * function for the first waiting acquire.
+ *
+ *The grant-acquire function checks the ConsistentTime, and if the
+ * simulated time of the grant precedes the ConsistentTime, then the
+ * acquire is put into the readyQ as an acquire-send span in the comm
+ * TimeLine of the TNode that owns the acquired data, or else the MainMem
+ * Node.
+ *
+ *The comm TimeLine that executes an acquire-send creates a comm span between
+ * the sending TNode or MMNode and the receiving TNode.  The width of the
+ * span is set by hardware model.  This will be a parameter for experiments.
+ * It's the main phenomenon affecting performance and scalability.  A fixed
+ * latency plus Frame-size / fixed-BW to start.
+ *
+ *An acquire can only be granted when the Consistent time reaches the
+ * Release-time of the previous acquire.  That's when know for certain that
+ * the memory image being acquired is correct, and the acquire order is
+ * correct (all acquires that want a given Frame will have been queued up
+ * for that Frame, so can't grant to one, in Host time, then a different
+ * acquire arrives that SHOULD have been the one given the grant).
+ *Each Frame has a priority-queue of acquires waiting for it, ordered by the
+ * simulated time the acquire-request was made.
+ *When ConsistentTime advances past the last Release of a Frame, then check
+ * the priority queue of waiting acquires -- if the top is older than the
+ * ConsistentTime, then grant to that one -- otherwise, move the acquire to
+ * the TriggerByConsistentTime queue.
+ *
+ *Each time ConsistentTime *wants* to advance, check the Trigger priority-
+ * queue to see if any triggers are older than the proposed new Consistent-
+ * Time.  If yes, then ConsistentTime is only advanced to that trigger's
+ * time, and the trigger is performed.
+ *
+ *Triggers are spans that have a start-time that depends on actions in other
+ * time-lines.  So, the span is created in one TimeLine, either ending an
+ * idle-span, or being queued up to run in that TimeLine's sequence -- all
+ * dependencies have been satisfied except access to the TimeLine resource.
+ *
+ *Each Frame has a hash-entry key'd by the Frame's start-addr.  This entry
+ * has a priority queue holding acquires waiting for the Frame, sorted
+ * by sim-time the acquire was executed.
+
+
diff -r 000000000000 -r 8ea476474093 src/Application/HWSim__Hello_World_HW/EntryPoint.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Application/HWSim__Hello_World_HW/EntryPoint.c	Mon Nov 07 16:03:01 2011 -0800
@@ -0,0 +1,40 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ */
+
+#include <math.h>
+
+#include "HWSim_TeraFlux.h"
+
+
+
+/*Every HWSim system has an "entry point" function that creates the first
+ * virtual processor, which is the seed processor.
+ *
+ *The seed processor will construct the system to be simulated.
+ *The other files in this directory define the components the system is
+ * constructed from.
+ *
+ *
+ *This entry-point function follows the same pattern as all entry-point
+ * functions do:
+ *1) it creates the params for the seed processor, from the
+ *    parameters passed into the entry-point function
+ *2) it calls HWSim__create_seed_procr_and_do_work
+ *3) it gets the return value from the params struc, frees the params struc,
+ *    and returns the value from the function
+ *
+ */
+void
+runTheSimulation( SimulationParams *simParams )
+ {
+      //Hand the seed top-level function plus its params across the border
+      // from normal code into HWSim, which creates the seed processor
+   HWSim__create_seed_procr_and_do_work( constructAndSimulateSystem,
+                                          simParams );
+   
+ }
diff -r 000000000000 -r 8ea476474093 src/Application/HWSim__Hello_World_HW/Seed_VP.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Application/HWSim__Hello_World_HW/Seed_VP.c	Mon Nov 07 16:03:01 2011 -0800
@@ -0,0 +1,181 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ */
+
+
+#include "HWSim_TeraFlux.h"
+#include <math.h>
+#include <string.h>
+
+
+
+//===========================================================================
+/*This is the seed processor.
+ *
+ *It takes the simulation parameters that were passed in to the entry point
+ * and uses them to construct the system and start it running.
+ *
+ *The way this VP Top-level-function is written is specific to TeraFlux, so
+ * the system it constructs is specifically a TeraFlux chip.
+ *
+ *HWSim, on the other hand, expects to be handed functions that it can call
+ * itself.  The first function should perform a bunch of
+ * HWSim__create_TimeLine()  and HWSim__create_TimeDomain()  calls.  The
+ * second function should connect together the entities created in the first
+ * function.
+ *
+ *So, the job of this seed processor is to construct the parameters those
+ * two functions will take, and hand them to HWSim with the
+ * HWSim__run_creation_fn( createFnPtr, createFnParams, animVP)
+ */
+void
+TFSeedVP_TLF( void *_params, VirtProcr *animPr )
+ {    //Seed VP: registers the arch constructor, then resets and runs the sim
+   TFSimulatorParams   *params;
+
+   params    = (TFSimulatorParams *)_params;
+
+         DEBUG( dbgTFHW, "CPU Span at_reset\n", _params );
+         //NOTE(review): probe interval is started below, no end is recorded here
+         int32
+         constructProbe = VMS__create_single_interval_probe("constructProbe",
+                                                                    animPr );
+         VMS__record_sched_choice_into_probe( constructProbe, animPr );
+         VMS__record_interval_start_in_probe( constructProbe );
+
+   HWSim__register_constructor( &constructTeraFluxArch, _params, animPr);
+   HWSim__reset_and_sim( params->results, animPr );//animPr suspends til done
+
+   //=========== Setup 
+   /* for performance, want each phys core's master to have own acquire state
+    *  locally, and only read some config info that tells it whether needs
+    *  to read other data to update itself, or something..
+    * But, for now, just doing simplest thing.. can add a "comm plugin" to
+    * HWSim, so that the handler for communication-calls takes a plugin that
+    * it calls..  that lets HWSim be modified, so that acquire is done in
+    * the request handler in the master.
+    *Other alternative is making a communication-controller element, and send
+    * messages to it to do the acquires and releases -- will make that
+    * time-line be animated a lot -- will need it to be able to jump around
+    * among the physical cores -- so, something about letting a given time-
+    * line be able to be animated on whichever core needs it at the moment.
+    *Let's see..  the cores will be busy, then one will do an acquire, which
+    * will need the acquire-controller time-line -- but don't want that core
+    * to run out of work waiting for the controller -- hmmm, how about, use
+    * the affinity feature to keep each of the nodes to a particular core,
+    * but don't use it on the controller, which will let it move around..
+    *So, have separate readyQs -- one for each core, and another for free-
+    * floating..  when whatever scheduler is running has its local readyQ
+    * empty, it takes from the floating.
+    */
+   //TODO, design notes not yet written as code:
+   // make acquire-controller. (central control over acquires but no timing)
+   // make array to hold all the nodes.
+   //    loop makes each node and gives it an x and a y ID, and code-ptrs
+   //    (call make_node(), which constructs the four time-lines in a node)
+   //    loop through, send each "start" signal.
+
+ }
+
+
+/*This function is the constructor given to HWSim by the seed processor.
+ * It uses HWSim calls to create all the TimeLines and TimeDomains, and to
+ * hook them together.  Note that HWSim will start them, itself, after this
+ * constructor is done.
+ *
+ *Note, timelines don't have to be connected in order to communicate -- it's
+ * just one way of getting the needed info to the sending TimeLine, which
+ * consists of the pointer to the destination TimeLine, and which port to
+ * tell that target Timeline the communication is coming in on.
+ *The acquire will have the target TimeLine stored in a hash table, that's
+ * how the sending TimeLine gets the pointer to the target.  It has the port
+ * number hard-coded.
+ */
+void
+constructTeraFluxArch( void *_params, VirtProcr *animPr )
+ {
+   TFSimulatorParams   *params;
+   int nodeNum;
+   HWSimTimeDomain *node;
+   HWSimTimeLine   *cpu, *communicator;
+
+   params    = (TFSimulatorParams *)_params;
+
+   //========Define the types of TimeLine, which sets the at_reset fn========
+   //
+   HWSim__define_TimeLine_type( CPU_TIMELINE, &CPUSpan_at_reset, animPr );
+   HWSim__define_TimeLine_type( COMM_TIMELINE, &commSpan_at_reset, animPr );
+   
+   //==========Register HWSim handlers=========
+   //
+   // Handlers are special because they run inside HWSim with access to
+   //  shared global state and have the ability to start new spans, modify
+   //  TimeLine state, and so on.
+
+      //All data is local to a TimeLine, except global vars.  Those can only
+      // be accessed through a registered handler.
+   HWSim__register_global_var_accessor( DO_ACQUIRE_SEND, &do_acquire_send,
+                                        animPr );
+   HWSim__register_global_var_accessor( GET_OWNING_TNODE, &get_owning_TNode,
+                                        animPr );
+
+      //HWInstrs are able to generate communications, start new spans, and
+      // so forth -- they are considered extensions of HWSim itself, with the
+      // ability to affect the language's internal semantic and scheduling
+      // state, and so are created as handlers, which must be registered.
+   HWSim__register_HWInstr_type( ACQUIRE_INSTR,
+                                 &handle_Acquire_HWInstr_request, animPr );
+   HWSim__register_HWInstr_type( RELEASE_INSTR,
+                                 &handle_Release_HWInstr_request, animPr );
+
+   //==========Create the TimeDomains and TimeLines and connect them=========
+   //TODO(review): 1st arg of create_TimeDomain was written "params?" -- confirm
+   for( nodeNum = 0; nodeNum < params->numNodes; nodeNum++ )
+    {
+      node          = HWSim__create_TimeDomain( params, animPr );
+      cpu           = HWSim__create_TimeLine_of_type( CPU_TIMELINE, animPr );
+      communicator  = HWSim__create_TimeLine_of_type( COMM_TIMELINE, animPr);
+      HWSim__add_TimeLine_to_TimeDomain( cpu,          node );
+      HWSim__add_TimeLine_to_TimeDomain( communicator, node );
+         //This stores the target ptr + port-num in the out-port position in
+         // the cpu TimeLine -- so spans in cpu TimeLine can look up target
+      HWSim__connect_TimeLine_outPort_to_TimeLine_inPort(
+         cpu, COMMUNICATOR_OUTPORT, communicator, CPU_INPORT );
+    }
+
+   //Done -- the architecture is very simple for now -- inter-node comm
+   // happens via acquire spans, which use global vars to find the target
+   // communicator, and the comm spans have the target port hard-coded.
+   //So no inter-node communication connections
+ }
+
+/*The acquire handler uses HWSim calls to generate a communication in the
+ * Communicator TimeLine.  When the ConsistentTime reaches the appointed
+ * simulation time at which that comm arrives, it triggers the acquire-start
+ * span in the Communicator TimeLine.
+ *
+ *Q: what's with HWSim__send_comm?  Can just use that, don't need to register
+ * an instruction..  Means GuestCode span just keeps going..  pause it during
+ * an HWInstr, then resume it..  Want multiple spans for any reason?
+ */
+void
+handle_Acquire_HWInstr_request()
+ {
+      //Stub: the body is empty, so this handler currently does nothing.
+      //It is registered for ACQUIRE_INSTR in constructTeraFluxArch.
+ }
+
+/*The release handler uses HWSim calls to generate a communication in the
+ * Communicator TimeLine.  When the ConsistentTime reaches the appointed
+ * simulation time at which that comm arrives, it triggers the release
+ * span in the Communicator TimeLine.
+ */
+void
+handle_Release_HWInstr_request()
+ {
+      //Stub: empty body; registered for RELEASE_INSTR in constructTeraFluxArch
+ }
diff -r 000000000000 -r 8ea476474093 src/Application/SimParams.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Application/SimParams.c	Mon Nov 07 16:03:01 2011 -0800
@@ -0,0 +1,91 @@
+/*
+ *  Copyright 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * Author: seanhalle@yahoo.com
+ *
+ * Created on November 15, 2009, 2:35 AM
+ */
+
+#include <malloc.h>
+#include <stdlib.h>
+
+#include "SimParams.h"
+#include "ParamHelper/Param.h"
+
+
+uint8 *
+read_Machine_Code_From_File( int numBytesInFile, char *machineCodeFileName );
+
+ 
+void
+fill_sim_params_from_bag( SimulationParams *simParams, ParamBag *paramBag )
+ { ParamStruc *entry;
+   char       *codeFileName;
+   int         codeNumBytes;
+
+      //Guest application: file name and byte count come from the bag
+   entry        = getParamFromBag( "GuestApplicationFileName", paramBag );
+   codeFileName = entry->strValue;
+   entry        = getParamFromBag( "numBytesInGuestApp", paramBag );
+   codeNumBytes = entry->intValue;
+   simParams->guestApp =
+    read_Machine_Code_From_File( codeNumBytes, codeFileName );
+
+      //System code: same pattern, reusing the two scratch locals
+   entry        = getParamFromBag( "SystemCodeFileName", paramBag );
+   codeFileName = entry->strValue;
+   entry        = getParamFromBag( "numBytesInSystemCode", paramBag );
+   codeNumBytes = entry->intValue;
+   simParams->systemCode =
+    read_Machine_Code_From_File( codeNumBytes, codeFileName );
+
+      //Node count is copied straight from the bag into simParams
+   entry               = getParamFromBag( "numNodes", paramBag );
+   simParams->numNodes = entry->intValue;
+
+ }
+
+/*Reads up to numBytesInFile bytes of the named file into a heap buffer that
+ * the caller owns.  Exits on alloc or open failure; warns if file is short.
+ */
+uint8 *
+read_Machine_Code_From_File( int numBytesInFile, char *machineCodeFileName )
+ { size_t numBytesRead, numBytesWanted;
+   FILE  *file;
+   uint8 *machineCode;
+
+   numBytesWanted = numBytesInFile > 0 ? (size_t)numBytesInFile : 0;
+   machineCode = calloc( numBytesWanted + 1, 1 ); //+1 so size is never zero
+   if( machineCode == NULL ) { printf( "\nno mem for machine code\n" ); exit(1);}
+
+   file = fopen( machineCodeFileName, "rb" ); //binary: no newline translation
+   if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);}
+
+   numBytesRead = fread( machineCode, 1, numBytesWanted, file );
+   if( numBytesRead < numBytesWanted )  printf( "file ran out too soon" );
+   fclose( file );
+   return machineCode;
+ }
+ //==========================================================================
+//Prints a sampled grid of matrix->array: every rowIncr-th row, colIncr-th col
+//NOTE(review): "matrix" is not declared here and simResults is unused -- confirm
+void
+printSimResults( SimulationResults simResults )
+ { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr;
+   float32 *matrixArray;
+      //rowsToPrint and colsToPrint are assigned below but never read
+   numRows = rowsToPrint = matrix->numRows;
+   numCols = colsToPrint = matrix->numCols;
+   matrixArray = matrix->array;
+      //stride is count/20, clamped to at least 1 so the loops advance
+   rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed
+   colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed
+   for( r = 0; r < numRows; r += rowIncr )
+    { for( c = 0; c < numCols; c += colIncr )
+       { printf( "%3.1f | ", matrixArray[ r * numCols + c ] );
+       }
+      printf("\n");
+    }
+ }
+
diff -r 000000000000 -r 8ea476474093 src/Application/SimParams.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Application/SimParams.h	Mon Nov 07 16:03:01 2011 -0800
@@ -0,0 +1,48 @@
+/*
+ *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ */
+
+#ifndef MATRIX_MULT_H_
+#define MATRIX_MULT_H_
+
+#include <stdio.h>
+#include <unistd.h>
+#include <malloc.h>
+
+#include "../HWSim_lib/VMS/VMS_primitive_data_types.h"
+#include "ParamHelper/Param.h"
+
+//==============================  Structures  ==============================
+
+typedef
+struct                 //Results record; main() allocates one for HWSim to fill
+ { uint8 *guestApp;    //NOTE(review): these three fields duplicate the ones
+   uint8 *systemCode;  // in SimulationParams, and nothing visible writes
+   int32 numNodes;     // them -- confirm which results belong here
+ }
+SimulationResults;
+
+
+typedef
+struct                            //Inputs filled by fill_sim_params_from_bag
+ { uint8 *guestApp;               //bytes read from "GuestApplicationFileName"
+   uint8 *systemCode;             //bytes read from "SystemCodeFileName"
+   int32 numNodes;                //value of the "numNodes" parameter
+   SimulationResults *simResults; //results struct allocated by main()
+ }
+SimulationParams;
+
+
+
+//==============================  Functions  ================================
+
+void
+printSimResults( SimulationResults simResults );
+
+void
+fill_sim_params_from_bag( SimulationParams *simParams, ParamBag *paramBag );
+
+//===========================================================================
+
+#endif /*MATRIX_MULT_H_*/
diff -r 000000000000 -r 8ea476474093 src/Application/main.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Application/main.c	Mon Nov 07 16:03:01 2011 -0800
@@ -0,0 +1,48 @@
+/*
+ *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
+ *  Licensed under GNU General Public License version 2
+ *
+ * author seanhalle@yahoo.com
+ */
+
+#include <malloc.h>
+#include <stdlib.h>
+
+#include "SimParams.h"
+#include "HWSim_TeraFlux/HWSim_TeraFlux.h"
+
+/**
+ * Program entry point.  argv[1] names the parameter file; the params are
+ * loaded from it, the simulation is run, and the results are printed.
+ */
+int main( int argc, char **argv )
+ { SimulationParams  *simParams;
+   SimulationResults *simResults;
+   ParamBag          *paramBag;
+
+      //argv[1] is read below, so it has to be present
+   if( argc < 2 ) { printf( "usage: <program> <paramFile>\n" ); exit(1); }
+   printf( "arguments: %s | %s\n", argv[0], argv[1] );
+
+      //VMS has its own separate internal malloc, so to get results out,
+      // have to pass in empty array for it to fill up
+      //The alternative is internally telling HWSim make external space to use
+      //calloc so both structs start zeroed instead of indeterminate
+   simParams  = calloc( 1, sizeof *simParams );
+   simResults = calloc( 1, sizeof *simResults );
+   if( !simParams || !simResults ) { printf( "\nno mem for params\n" ); exit(1); }
+   simParams->simResults = simResults;
+
+   paramBag = makeParamBag();
+   readParamFileIntoBag(     argv[1],   paramBag );
+   fill_sim_params_from_bag( simParams, paramBag );
+
+      //NOTE(review): EntryPoint.c's runTheSimulation() looks like the intended
+      // entry into HWSim -- confirm whether it should be called here instead
+   constructAndSimulateSystem( simParams );
+   printSimResults( *simResults ); //prototype takes the struct by value
+   fflush( stdout );               //fflush on an input stream is undefined
+   exit(0); //cleans up
+ }
+
+