# HG changeset patch # User Me@portablequad # Date 1320710581 28800 # Node ID 8ea476474093d250e710c700e568d91d7e7f9ac7 Initial add -- gobbeldegook diff -r 000000000000 -r 8ea476474093 .hgignore --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.hgignore Mon Nov 07 16:03:01 2011 -0800 @@ -0,0 +1,6 @@ +syntax: glob + +nbproject +build +dist +*.o \ No newline at end of file diff -r 000000000000 -r 8ea476474093 src/Application/HWSim__Hello_World_HW/Communicator_TimeLine.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Application/HWSim__Hello_World_HW/Communicator_TimeLine.c Mon Nov 07 16:03:01 2011 -0800 @@ -0,0 +1,297 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#include "HWSim_TeraFlux.h" + +//===================== + +//=========================================================================== +/*This is the collection of spans for the Communication Processor TimeLine + * + *This TimeLine does acquire-start, acquire-send, release, acquire-receive, + * all triggered by communications coming from other TimeLines. + * + * Sometime after Feb 2011 it will also have the control- + * communications that Albert wants for Erbium (the broad-cast of counter + * updates to registered listener TNodes). And, it will probably get some + * form of control communication for implementing a fast chip-wide VMS. + * + * + * + *The kinds of span the TimeLine can have are: +-] acquire-start -- is triggered by the reception of a "start-acquire" + communication from the CPU TimeLine. It generates an "acquire send" + communication to the TNode that currently owns the data. +-] acquire-send -- when triggered, checks if the Frame is free. If yes, + sends an "acquire-receive" communication back to the requester. If not + free, places data representing the acquire-request into a queue of waiting + ones in the Communicator TimeLine. 
When the Communicator receives a + "release" communication from the CPU TimeLine, it runs the release span. +-] release -- takes the data of the next request waiting for the released + Frame out of the queue and then sends an "acquire-receive" communication + to the TNode requesting. +-] acquire-receive -- when triggered, runs firmware, which writes memory + shared with the CPU, modifying data-structures. This firmware for + TeraFlux works with the System Code that runs on the CPU TimeLine to + notify it that the acquire is complete. + * + * + *Background on acquire: + *The request handler invokes acquire by placing the calling VP into + * a holding list, then executing the hardware instruction that starts the + * acquire. + *This instruction sends a communication from the CPU TimeLine to the + * Communicator TimeLine. The instruction passes a pointer to the + * list element, and also passes a pointer to the queue of + * ready VPs. When acquire comm is done, the Communicator takes the list + * element out and recycles it, and places the pointer to the VP into the + * queue of ready VPs. + * + *In the actual hardware, each node has an ultra-simple communication + * processor -- like an 8 bit control data-path and a physical-addr-width + * addr-data-path (simple -- only power-of-2 shift, add, and maybe mask). + *The acquire-instruction parameters are placed into a data + * structure and only the pointer to them is in a register. The CPU performs + * a write of the ptr to a particular physical addr, which the comm hardware + * catches and queues. The comm-processor is driven by the queue -- stalls + * when queue is empty -- returns after finishes to get next from queue. + *Firmware in the comm-processor then fetches the params out of the data + * structure and starts the communication. The comm involves the + * BIU, which is asked to grant acquire on the frame. BIU sends back + * the node that currently has the frame, or else main-mem physical address + * range. 
+ *When comm is complete, the comm processor performs list-element removal, + * recycling, and pointer movement. + *End Background on acquire + * + *So, the HWInstr_acquire() is the hardware instruction called by the + * GuestSystemCode. This instruction's job is to perform the communication + * protocol that gets data from wherever it is and brings it onto the TNode + * executing this instr. + *In the simulator, this happens in a separate time-line than the CPU, which + * is animating another virtual TNode while the communication happens. + *There may be several acquires started while one is in progress. But not + * going to model queueing of them. This time-line treats them as + * zero width. They each start a new span, but the span ends and reports + * its end-time as the same as its start time, then starts the idle-span. + * + *HWInstr_acquire: CPU TimeLine + * when executes on the CPU TimeLine, sends a communication to the + * Communicator TimeLine, which equals inserting an acquire-start span into + * the consistent-time trigger-priority-queue that's driven by the TNode's + * ConsistentTime. It will stop ConsistentTime advancement at its target + * arrival time, ending the Idle-span and starting the acquire-start span. + * + *Acquire-start: requesting TNode's Communicator TimeLine + * this span looks inside the data-struct to find the Frame start addr. It + * looks this up in a hash table to see which TNode owns the Frame. If none, + * means hasn't been allocated yet, so puts acquire-info into the wait-queue + * that's in the hash entry. + * + *Acquire-send: Dwelling TNode's Communicator TimeLine + * When advancement of ConsistentTime stopped by the acquire-send reception, + * check to see + * if the desired Frame is still owned (hash table keyed by Frame start + * addr). If desired Frame is owned, places the data of the request into + * the queue of waiting acquires that's in the hash-entry. 
+ * If no longer owned, then set new Dwelling TNode to be requester and + * place Acquire-receive into waiting-comm queue of requesting TNode + * TimeDomain. + * + *Release: CPUTimeLine of Owning TNode + * put Release into Communicator TimeLine, with the Frame start addr in it + * + *Release: Communicator TimeLine of Owning TNode + * look up the hash entry for released Frame, set to Not Owned. If waitQ + * in the Frame's hash entry is not empty, take next entry, set as Owner of + * the Frame, and as new Dwelling TNode, then place Acquire-receive into + * waiting-comm queue of requesting TNode-TimeDomain. + * + *Acquire-receive: Communicator TimeLine of requesting TNode + * When advancement of ConsistentTime stopped by the acquire-receive + * reception, run the Firmware that sets the shared state, so that the CPU + * TimeLine will see the acquire is complete -- also do the data-structure + * rearrangement stuff. Finally, do the thing of checking if all VTNodes + * are suspended waiting for acquires -- if so, the receive will have to be + * pro-active in restarting the CPU TimeLine, kicking it out of the + * LightSleep span. THIS SPECIAL SPAN IS KEY BENEFIT OF HW SUPPORT FOR VMS -- + * can put hardware into light sleep mode when nothing to do -- power/energy + * saver. + * + * + *So, a time-line is the virtual processor itself. It animates the + * "current span" as the top-level function. That span executes span-end, + * the next span's function replaces the old one as the top-level function. + * The stack is also reset, and the parameters for the new span are placed + * on the stack, and the pointer to the span's function is set as the resume- + * ptr. When the time-line VP is resumed, it's equivalent to that span's + * function being called. + *The time-line VP has app-specific data that says whether the current-span + * is the Idle span, and the simulated-time of the last span-end. 
+ * + *So, rather than having a single top-level function, a timeLine VP has many, + * a different top-level function (TLF) for each kind of span. + * + *This file holds all the spans == TLFs for the comm-procr TimeLine. + */ + + +//=========================================================================== +/* + *Acquire-start: requesting TNode's Communicator TimeLine + * this span looks inside the data-struct to find the Frame start addr. It + * looks this up in a hash table to see which TNode owns the Frame. If none, + * means hasn't been allocated yet, so puts acquire-info into the wait-queue + * that's in the hash entry. + *Q: want to put global data-structs into HWSim, with some protection + * mechanism (like the transactions have already implemented), or want to + * make general request-handler extension thingie? Request-handler + * extensions would be done by registering a handler function during + * architecture-definition code. + *A: transactions awkward.. make all state be either contained in the + * TimeLine, or else global. If in TimeLine, has to be allocated during + * architecture definition, and if global, has to be declared, and a + * pointer to the functions that access it, along with an ID for that kind + * of access is registered. Then, in the span-code, call + * HWSim__access_global_var( params, ACCESSID ) the ACCESSID determines + * which function-pointer is called, and the function code determines + * which global var is accessed, and the params hold all the data the + * function needs to do whatever is to be done. 
+ */
+void
+commSpan_acquire_start( void *_params, VirtProcr *animTimeLine )
+ {
+   AcquireParams *params;
+   //NOTE(review): residingTNode, targetTL, duration have no declaration here
+   params = (AcquireParams *)_params;   //span params arrive as void*
+
+   DEBUG( dbgAppFlow, "acquire_start\n", cloneAcquireParams( params ));
+
+   //invoke global-var-accessor to get the TNode that owns the Frame
+   residingTNode =
+    HWSim__access_global_var( params, GET_OWNING_TNODE, animTimeLine );
+   //NOTE(review): the no-owner case from the header comment is not handled here
+   //send a communication to that TNode's Communicator
+   targetTL = residingTNode->communicatorTL;
+
+   //params are: amount of simulated time the communication takes,
+   // the TimeLine receiving, the span-function to run when consistent-
+   // time reaches the reception time, params for that span, and animTL
+   HWSim__send_comm( calcNetworkTime(), targetTL, targetTL->sendAcquireSpan,
+                     params, animTimeLine );
+
+   duration = 0; //starting an acquire modeled as taking zero time
+
+   //every span function ends with this call -- duration of this span,
+   // pointer to next span-function to run, params for it, and animTL
+   HWSim__transition_to_new_span( duration, IDLE_SPAN, NULL, animTimeLine );
+ }
+
+
+/*Acquire-send: Dwelling TNode's Communicator TimeLine
+ * Runs when advancement of ConsistentTime stopped by the acquire-send
+ * reception. Check to see
+ * if the desired Frame is still owned (hash table keyed by Frame start
+ * addr). If desired Frame is owned, place the data of the request into
+ * the queue of waiting acquires that's in the hash-entry.
+ * If already released, then set new Dwelling TNode to be requester and
+ * do an Acquire-receive back to the requesting TNode, which has the effect
+ * of placing an acquire-receive span to wait in its consistent-time-arrest
+ * queue.
+ *The release span will take waiting requests out of the waiting-acquires Q
+ */
+void
+commSpan_acquire_send( void *_params, VirtProcr *animTimeLine )
+ {
+   AcquireParams *params;
+
+   params = (AcquireParams *)_params;
+
+   DEBUG( dbgAppFlow, "acquire_send\n", cloneAcquireParams( params ));
+
+   //NOTE(review): notCurrentlyOwned and targetTL have no declaration here
+   //invoke global-var-accessor to lookup hash entry and see if Frame is
+   // still owned, and if so, add this acquire to queue of waiting ones.
+   // note, this is non-physical behavior -- any use of global vars is
+   // non-physical. To make this function physical, implement a TimeLine
+   // that holds the hash table and all other TimeLines communicate to.
+   //Caveat there is that collisions can happen unless also impl protocol.
+   // So leave that for later improvement.
+   notCurrentlyOwned =
+    HWSim__access_global_var( params, DO_ACQUIRE_SEND, animTimeLine );
+
+   if( notCurrentlyOwned )
+    {
+      //send a communication to requesting TNode's Communicator
+      targetTL = params->requestingTNode->communicatorTL;
+      //NOTE(review): call below also passes COMM_INPORT, not listed here
+      //params are: amount of simulated time the communication takes,
+      // the TimeLine receiving, the span-function to run when consistent-
+      // time reaches the reception time, params for that span, and animTL
+      HWSim__send_comm( calcNetworkTime(), targetTL, COMM_INPORT,
+                        &commSpan_acquire_receive, params, animTimeLine );
+    }
+   //NOTE(review): no duration arg here, unlike commSpan_acquire_start
+   HWSim__transition_to_new_span( IDLE_SPAN, NULL, animTimeLine );
+ }
+
+
+/*Acquire-receive: Communicator TimeLine of requesting TNode
+ * When advancement of ConsistentTime stopped by the acquire-receive
+ * reception, run the "Firmware" that sets the shared state, so that the CPU
+ * TimeLine will see the acquire is complete -- also do the data-structure
+ * rearrangement stuff. Finally, do the thing of checking if all VTNodes
+ * are suspended waiting for acquires -- if so, the receive will have to be
+ * pro-active in restarting the CPU TimeLine, kicking it out of the
+ * LightSleep span. 
THIS SPECIAL SPAN IS KEY BENEFIT OF HW SUPPORT FOR VMS --
+ * can put hardware into light sleep mode when nothing to do -- power/energy
+ * saver.
+ */
+void
+commSpan_acquire_receive( void *_params, VirtProcr *animTimeLine )
+ {
+   AcquireParams *params;
+
+   params = (AcquireParams *)_params;
+
+   DEBUG( dbgAppFlow, "acquire_receive\n", cloneAcquireParams(params));
+
+   //TODO: the firmware behavior described above is not implemented yet
+   HWSim__transition_to_new_span( IDLE_SPAN, NULL, animTimeLine );
+ }
+
+
+/*Release: Communicator TimeLine of Owning TNode
+ * look up the hash entry for released Frame, set to Not Owned. If waitQ
+ * in the Frame's hash entry is not empty, take next entry, set as Owner of
+ * the Frame, and as new Dwelling TNode, then place Acquire-receive into
+ * waiting-comm queue of requesting TNode-TimeDomain.
+ */
+void
+commSpan_release( void *_params, VirtProcr *animTimeLine )
+ {
+   AcquireParams *params;
+
+   params = (AcquireParams *)_params;
+
+   DEBUG( dbgAppFlow, "release\n", cloneAcquireParams( params ));
+
+   //TODO: hash-entry update and waitQ hand-off described above not implemented
+   HWSim__transition_to_new_span( IDLE_SPAN, NULL, animTimeLine );
+ }
+
+/*At reset only starts the Idle span in the communicator.
+ */
+void
+commSpan_at_reset( void *_params, VirtProcr *animTimeLine )
+ {
+
+   DEBUG( dbgAppFlow, "commSpan at_reset\n", NULL );
+
+   HWSim__transition_to_new_span( IDLE_SPAN, NULL, animTimeLine );
+ }
diff -r 000000000000 -r 8ea476474093 src/Application/HWSim__Hello_World_HW/DESIGN_NOTES.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Application/HWSim__Hello_World_HW/DESIGN_NOTES.txt Mon Nov 07 16:03:01 2011 -0800
@@ -0,0 +1,369 @@
+
+
+TeraFlux Hardware Model (impl of this model in terms of HWSim is below)
+
+As of Feb 2011, the hardware being simulated is:
+
+A number of TNodes on a chip, which are connected by a network, with a
+ main-memory.
+
+The address space is divided into a local address-space and a shared
+ address-space. 
Both address spaces are divided among the TNodes -- each + has its own unique range of local virtual addresses and own range of + global virtual addresses. A given TNode may only allocate virtual + addresses within its own two ranges. The contents of *local* addresses it + allocates may only ever be seen by the allocating TNode. The contents of + shared addresses may be seen by any TNode after executing an acquire + hardware instruction and being granted the contents. + +Each TNode has a single CPU core, a local memory, and a + communication processor with its own network hardware. + +The local memory size is not modelled, so is considered unlimited. +The network has undefined topology and is modelled as having constant latency + from any TNode to any other TNode, with a fixed BW between any two TNodes. + +The only modelled communication is movement of data, which can only be + triggered by the "acquire" hardware instruction. + +The behavior of acquire is defined as: +-] Only one TNode at a time owns a given shared-memory "Frame", which has a + start address and a size. +-] A given shared virtual address is in at most one Frame for the duration + of a program run (IE, no overlap of Frames, implying no change of size). +-] The hardware mechanism by which single-ownership is enforced is not + modelled. Control communication is considered infinitely fast. +-] The simulated time between a given TNode's CPU executing the acquire + instruction and the simulated time the data of the Frame appears in the + local memory of that TNode is determined by both a queue of waiting + acquire requests and the network time required to move the data. + +The CPU in a TNode executes the standard x86 instruction set. + +The communication processor performs the acquire instruction and the + release instruction. The exact hardware mechanism by which these + instructions get from the instruction stream fetched by the CPU to the + communication processor is not defined. + +As an aside. 
For simulation, the acquire and release instruction are + implemented in the lightweight simulator as library-calls that trigger + the simulation infrastructure. In the application they are stated as + macro-calls. These macros can then be defined in the tool-chain. +When the COTSon simulator is targeted, the + macro is implemented as an in-line assembly custom op-code. When the + lightweight simulator is targeted, the macro is implemented as a call to + a library, which invokes the acquire or release functionality in the + lightweight simulator. + +The effect of transferring data between TNode local memory and the chip's + main memory is not modelled. + +The portions of the hardware left undefined or not modelled may all be + filled-in in future versions, according to research needs. + + +============================================================================= +Implementing the hardware model in terms of HWSim + +HWSim is used to implement the TeraFlux hardware model for simulation. + +The implementation consists of four elements: +1) A CPU TimeLine, which executes the Guest code +2) A Communicator TimeLine, which performs all inter-node communication +3) A TNode TimeDomain, which has one CPU TimeLine and one Communicator + TimeLine in it +4) A Chip TimeDomain, which has a number of TNode TimeDomains in it. + +For the first version, there is no local memory element in a TNode, there is + no Main Memory node, and there is no address translation mechanism modelled. + These may be added later, and so might a TSU mechanism, and experimental + alternative memory models. + +The CPU TimeLine has as spans: +-] at_reset -- standard span that runs when the hardware is reset. Runs the + TeraFlux System Code bootstrap function. +-] guestCode span -- runs whatever Guest code is pointed to.. each span of + this type is created with a pointer to code to run +-] lightSleep -- implemented as the built-in IDLE span. 
When no virtual + nodes are ready to animate, the CPU enters light sleep until the + Communicator wakes it up + +The CPU TimeLine generates two kinds of communication-spans: + acquire-start in Communicator + release in Communicator + +For now, these communications are considered to be performed by dedicated + hardware in the CPU, so they take exactly one simulated instruction, and + their simulated time is thus included in the measured time of the span. The + HWSim__send_comm takes a time-stamp before suspending the TimeLine, then + another just after resume, and accumulates -- the end-span adds this in. + + All TimeLines begin by running the at_reset span defined for that TimeLine. + The CPU's at_reset is hard-coded to start a GuestCode span that runs the + TeraFlux System Code's boot sequence. + +The Communicator TimeLine has a number of spans, all related to acquire and + release: +-] acquire-start -- is triggered by the reception of a "start-acquire" + communication from the CPU TimeLine. It generates an "acquire send" + communication to the TNode that currently owns the data. +-] acquire-send -- when triggered, checks if the Frame is free. If yes, + sends an "acquire-receive" communication back to the requester. If not + free, places data representing the acquire-request into a queue of waiting + ones in the Communicator TimeLine. When the Communicator receives a + "release" communication from the CPU TimeLine, it runs the release span. +-] release -- takes the data of the next request waiting for the released + Frame out of the queue and then sends an "acquire-receive" communication + to the TNode requesting. +-] acquire-receive -- when triggered, runs firmware, which writes memory + shared with the CPU, modifying data-structures. This firmware for + TeraFlux works with the System Code that runs on the CPU TimeLine to + notify it that the acquire is complete. 
+ +Note, nothing checks whether a Frame's addresses are accessed from outside an + Acquire-Release block, which could be a source of difficult-to-find bugs in + the application. + + +============================================================================= +Earlier versions of notes: + +A time-line is a virtual-processor, and has a sequence of spans -- each + span performs one hardware-function, and has a start-time and an end-time + -- those are simulated-time, not physical time. + +Each time-line is created with a start-span that initializes it, then every + span ends with an "end span" sem-lib call. + +There are three kinds of span -- fixed-function spans, which represent + hard-wired hardware behavior, processing-core spans, which represent + processing elements that execute code, and communication-spans, which + cross time-lines. + +a fixed-function span has a fixed function-pointer that it is created with + and jumps to when the time-line is resumed. Fixed-function spans also have + a pointer to a function that calculates the width of the span. The + width-calculating function is defined in the application directory. + +a processing-core span has a function-pointer that is assigned to it by the + end-span call of the preceding span. The width is also determined by a + pointer to a width-calculating function. The width-calculating function + for these spans is also defined in the application directory (In first + teraflux impl, this function uses RDTSC to measure physical execution time, + and makes that the simulated execution time too -- but with a "BS" detector + that sees when the time is significantly larger than the previous + invocation of the same function-pointer). + +Communication spans are special because they cross time-lines. So, a + communication span has zero width in the time-line it's created in, and + goes onto the queue as a new span in the target time-line (which also has + zero-width). 
When the target span runs, it changes the state available to + the target time-line, to represent the reception of the communication. + +============================= +Span-end is the only semantic-library call implemented. Inside the + request-handler, it causes new spans to be created. + +So, have to have a separate receive time-line, that modifies hardware shared + with other time-lines. The send span causes a receive-span to be inserted + into the target receive time-line. + +Receive-spans are zero-width -- they update the hardware-state atomically, + so don't have to worry about conflicts between different receive spans in + the simulator. The hardware-application that uses the simulator-library + must model the receive hardware and implement the send-hardware function + to work out any physical conflicts among receives targeted to the same + receive time-line. + + +============================= + +Time-lines are specialized to specific hardware functions inside the + Application directory -- that's where the main creates all the time-lines, + and where the spans are implemented that have the behavior of a given type + of time-line. + +For example, if the hardware is a communication-unit, then span-types are + created that have the behavior that does all the setup of a communication + span and then does an end-span that creates as its follow-on the + communication-span. + +Communication spans are special because they cross time-lines. So, a + communication span has zero width in the time-line it's created in, and + goes onto the queue in the target time-line, where it creates a new span + that also has zero-width. The target span's function updates the hardware + state available to the target time-line, which may be shared with other + time-lines and that update may cause new spans to be spawned in those. + +if the hardware is a processing-core, then the function points to + Guest-application-code. 
This function-pointer is what core_loop jumps to + when it reanimates the time-line virtual processor. + +================ Albert e-mail ================= + + +Hi Albert, + + the simulator is a thing of beauty. I'm getting goose bumps as I put the + last pieces of the design into place. It's a sweet thing. + +Just in case you're curious, attached are my design notes . The + thing that makes it nice is the clean decomposition and reusability -- the + core simulator only has three things: time-lines, spans, and a + priority-queue with an associated "certain-time" or "consistent-time". + + A time-line is a VMS virtual-processor, which executes a sequence of + spans. Each span has a function that represents the behavior performed + during that span, plus a function that calculates the simulated-time width + of that span. The certain-time represents the advancement of global + simulated-time. At all points in physical time during the simulation, it + is guaranteed that no spans are waiting to execute that have a + simulated-time older than the certain-time. + In other words, at any physical moment, there are lots of spans sitting + in queues waiting to run. So, a given time-line finishes a span at a given + simulated-time point in global simulated-time. But there may be spans from + other time-lines in the queue that finish at a preceeding simulated-time. + But there can never be any waiting spans that finish before the + certain-time. This is important for communications, which cross + time-lines. + + + Time-lines, spans, and certain-time are implemented in a semantic-library. + +This is, in essence a new parallel language for writing hardware-simulators + with. (this just turns out to be the most natural and most simple way to + write the TeraFlux simulator) + +The behavior of particular hardware is defined as a simulator-application + that makes calls to that semantic-library. 
+ +The main of that simulator-application creates the pieces of the hardware + -- for Teraflux, that means it creates the nodes, and the pieces inside + each node. + +Running this simulator-application equals turning on the power-switch of + the Guest hardware. + +The Guest application code is retrieved by the main of the + simulator-application, and starts running when the "power-switch" is + turned on (running the simulator-application causes the Guest-application + to be retrieved and start running on the hardware created by main). This + is equivalent to the boot sequence of the BIOS, which happens at power-on. + +So, the end-effect is that full Linux is available to the Guest hardware as + a sort of "escape". The Guest hardware can use the Host's disk access, + debugging, and everything else, but these usages are "outside" the + simulated time -- they are essentially magic-spells that the Guest + hardware can perform that take place outside of time as far as the Guest + application-code is concerned. Hence, the main is able to use the Host + Linux to retrieve-from-disk the Guest application (but the disk-access + takes place outside the measurements reported by the simulator). + +Which maybe seems trivial, but I consider it a very cool trick : ) + +The part I like best is the fact that the simulator itself is such a simple + semantic-library, and the behavior of the hardware is written all as + sequential code. This makes it easily customizable to any architecture + someone might want to investigate, and still run on parallel hardware : D + +The only caveat is the thing I noted in earlier e-mails about communication + updates -- memory images and other hardware state atomically update at + the ends of spans. So, Guest-code-execution that overlaps the + simulated-time at which reception happens, on a node, will not see that + reception until the end of the span. 
+ For TeraFlux hardware, the only natural span-endings are acquire + instructions and release instructions. + What this means for you is that you may want to insert artificial + end-span calls into the Guest application code you generate (I'll provide + a "dummy" call). Communication-receptions on a node will only become + visible to Guest application code after one of these dummy end-span calls. + So, the granularity of time in the simulation is related to the frequency + of end-span calls in the application code. If you don't insert any + artificial ones, then acquire and release instructions will be the only + span-ending events, and will define the granularity of + communication-receptions being seen by Guest application code. + In particular, this means that no Guest application code is allowed to + monitor memory to see when it changes -- so, programming techniques that + would work on real hardware, to detect acquire-updates early won't work + in the simulator -- all Guest code to run on the simulator must use the + acquire-library call, which will suspend the virtual-node the code is + running on until the acquire is complete, then re-start that node at some + simulated-time after the acquire completes. + + +No idea when something will start running, but the design is getting quite + detailed, so progress is being made, + +Sean + + +============================================================================= + Notes from before figured out how to do ConsistentTime for each TimeDomain + and have a priority-queue of waiting communication receptions for the + TimeDomain. + +This illustrates the tortured logic would have to go through otherwise. + * + * that acquire's release point in + * simulated time is either already known, or not yet known. If already known, + * then recursively check if another TNode has already been granted acquire, + * until reach the end of the chain. 
If the end is already known without any + * other acquires, then set state that this TNode gets the acquire at the + * release-time of the last in the chain. This acquire will have an unknown + * release time. + *If the release time of the end of the chain is unknown, then put the + * requesting acquire into a queue of acquires waiting for that Frame. When + * the release happens, it will check if any acquires are in the queue for + * the released Frame. If so, it will do the same as is done when the end + * of a chain is known -- set the Frame's state to acquired, with release- + * time as the start-time of the new acquire-grant, and unknown end-time. + * + *When a HWInstr_release() is executed, it has a simulated-time at which the + * span starts -- the span will also have zero width and start the idle span + * when it ends, just like the acquire-start span, and the triggered + * spans. + * + *The release span will check the wait queue for the Frame it is releasing, + * and either mark the Frame as free, or else fire off the grant-acquire + * function for the first waiting acquire. + * + *The grant-acquire function checks the ConsistentTime, and if the + * simulated time of the grant precedes the ConsistentTime, then the + * acquire is put into the readyQ as an acquire-send span in the comm + * TimeLine of the TNode that owns the acquired data, or else the MainMem + * Node. + * + *The comm TimeLine that executes an acquire-send creates a comm span between + * the sending TNode or MMNode and the receiving TNode. The width of the + * span is set by hardware model. This will be a parameter for experiments. + * It's the main phenomenon affecting performance and scalability. A fixed + * latency plus Frame-size / fixed-BW to start. + * + *An acquire can only be granted when the Consistent time reaches the + * Release-time of the previous acquire. 
That's when know for certain that + * the memory image being acquired is correct, and the acquire order is + * correct (all acquires that want a given Frame will have been queued up + * for that Frame, so can't grant to one, in Host time, then a different + * acquire arrives that SHOULD have been the one given the grant). + *Each Frame has a priority-queue of acquires waiting for it, ordered by the + * simulated time the acquire-request was made. + *When ConsistentTime advances past the last Release of a Frame, then check + * the priority queue of waiting acquires -- if the top is older than the + * ConsistentTime, then grant to that one -- otherwise, move the acquire to + * the TriggerByConsistentTime queue. + * + *Each time ConsistentTime *wants* to advance, check the Trigger priority- + * queue to see if any triggers are older than the proposed new Consistent- + * Time. If yes, then ConsistentTime is only advanced to that trigger's + * time, and the trigger is performed. + * + *Triggers are spans that have a start-time that depends on actions in other + * time-lines. So, the span is created in one TimeLine, either ending an + * idle-span, or being queued up to run in that TimeLine's sequence -- all + * dependencies have been satisfied except access to the TimeLine resource. + * + *Each Frame has a hash-entry key'd by the Frame's start-addr. This entry + * has a priority queue holding acquires waiting for the Frame, sorted + * by sim-time the acquire was executed. 
+ + diff -r 000000000000 -r 8ea476474093 src/Application/HWSim__Hello_World_HW/EntryPoint.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Application/HWSim__Hello_World_HW/EntryPoint.c Mon Nov 07 16:03:01 2011 -0800 @@ -0,0 +1,40 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + +#include + +#include "HWSim_TeraFlux.h" + + + +/*Every HWSim system has an "entry point" function that creates the first + * virtual processor, which is the seed processor. + * + *The seed processor will construct the system to be simulated. + *The other files in this directory define the components the system is + * constructed from. + * + * + *This entry-point function follows the same pattern as all entry-point + * functions do: + *1) it creates the params for the seed processor, from the + * parameters passed into the entry-point function + *2) it calls HWSim__create_seed_procr_and_do_work + *3) it gets the return value from the params struc, frees the params struc, + * and returns the value from the function + * + */ +void +runTheSimulation( SimulationParams *simParams ) + { + //create divider processor, start doing the work, and wait till done + //This function is the "border crossing" between normal code and HWSim + HWSim__create_seed_procr_and_do_work( &constructAndSimulateSystem, + simParams ); + + } diff -r 000000000000 -r 8ea476474093 src/Application/HWSim__Hello_World_HW/Seed_VP.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Application/HWSim__Hello_World_HW/Seed_VP.c Mon Nov 07 16:03:01 2011 -0800 @@ -0,0 +1,181 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + */ + + +#include "HWSim_TeraFlux.h" +#include +#include + + + +//=========================================================================== +/*This is the seed processor. 
+ * + *It takes the simulation parameters that were passed in to the entry point + * and uses them to construct the system and start it running. + * + *The way this VP Top-level-function is written is specific to TeraFlux, so + * the system it constructs is specifically a TeraFlux chip. + * + *HWSim, on the other hand, expects to be handed functions that it can call + * itself. The first function should perform a bunch of + * HWSim__create_TimeLine() and HWSim__create_TimeDomain() calls. The + * second function should connect together the entities created in the first + * function. + * + *So, the job of this seed processor is to construct the parameters those + * two functions will take, and hand them to HWSim with the + * HWSim__run_creation_fn( createFnPtr, createFnParams, animVP) + */ +void +TFSeedVP_TLF( void *_params, VirtProcr *animPr ) + { + TFSimulatorParams *params; + + params = (TFSimulatorParams *)_params; + + DEBUG( dbgTFHW, "CPU Span at_reset\n", _params ); + + int32 + constructProbe = VMS__create_single_interval_probe("constructProbe", + animPr ); + VMS__record_sched_choice_into_probe( constructProbe, animPr ); + VMS__record_interval_start_in_probe( constructProbe ); + + HWSim__register_constructor( &constructTeraFluxArch, _params, animPr); + HWSim__reset_and_sim( params->results, animPr );//animPr suspends til done + + //=========== Setup + /* for performance, want each phys core's master to have own acquire state + * locally, and only read some config info that tells it whether needs + * to read other data to update itself, or something.. + * But, for now, just doing simplest thing.. can add a "comm plugin" to + * HWSim, so the the handler for communication-calls takes a plugin that + * it calls.. that lets HWSim be modified, so that acquire is done in + * the request handler in the master. 
+ *Other alternative is making a communication-controller element, and send + * messages to it to do the acquires and releases -- will make that + * time-line be animated a lot -- will need it to be able to jump around + * among the physical cores -- so, something about letting a given time- + * line be able to be animated on whichever core needs it at the moment. + *Let's see.. the cores will be busy, than one will do an acquire, which + * will need the acquire-controller time-line -- but don't want that core + * to run out of work waiting for the controller -- hmmm, how about, use + * the affinity feature to keep each of the nodes to a particular core, + * but don't use it on the controller, which will let it move around.. + *So, have separate readyQs -- one for each core, and another for free- + * floating.. when whatever scheduler is running has its local readyQ + * empty, it takes from the floating. + */ + make acquire-controller. (central control over acquires but no timing) + make array to hold all the nodes. + loop makes each node and gives it an x and a y ID, and code-ptrs + (call make_node(), which constructs the four time-lines in a node) + + loop through, send each "start" signal. + + } + + +/*This function is the constructor given to HWSim by the seed processor. + * It uses HWSim calls to create all the TimeLines and TimeDomains, and to + * hook them together. Note that HWSim will start them, itself, after this + * constructor is done. + * + *Note, timelines don't have to be connected in order to communicate -- it's + * just one way of getting the needed info to the sending TimeLine, which + * consists of the pointer to the destination TimeLine, and which port to + * tell that target Timeline the communication is coming in on. + *The acquire will have the target TimeLine stored in a hash table, that's + * how the sending TimeLine gets the pointer to the target. It has the port + * number hard-coded. 
+ */ +void +constructTeraFluxArch( void *_params, VirtProcr *animPr ) + { + TFSimulatorParams *params; + int nodeNum; + HWSimTimeDomain *node; + HWSimTimeLine *cpu, *communicator; + + params = (TFSimulatorParams *)_params; + + //========Define the types of TimeLine, which sets the at_reset fn======== + // + HWSim__define_TimeLine_type( CPU_TIMELINE, &CPUSpan_at_reset, animPr ); + HWSim__define_TimeLine_type( COMM_TIMELINE, &commSpan_at_reset, animPr ); + + //==========Register HWSim handlers========= + // + // Handlers are special because they run inside HWSim with access to + // shared global state and have the ability to start new spans, modify + // TimeLine state, and so on. + + //All data is local to a TimeLine, except global vars. Those can only + // be accessed through a registered handler. + HWSim__register_global_var_accessor( DO_ACQUIRE_SEND, &do_acquire_send, + animPr ); + HWSim__register_global_var_accessor( GET_OWNING_TNODE, &get_owning_TNode, + animPr ); + + //HWInstrs are able to generate communications, start new spans, and + // so forth -- they are considered extensions of HWSim itself, with the + // ability to affect the language's internal semantic and scheduling + // state, and so are created as handlers, which must be registered. 
+ HWSim__register_HWInstr_type( ACQUIRE_INSTR, + &handle_Acquire_HWInstr_request, animPr ); + HWSim__register_HWInstr_type( RELEASE_INSTR, + &handle_Release_HWInstr_request, animPr ); + + //==========Create the TimeDomains and TimeLines and connect them========= + // + for( nodeNum = 0; nodeNum < params->numNodes; nodeNum++ ) + { + node = HWSim__create_TimeDomain( params?, animPr ); + cpu = HWSim__create_TimeLine_of_type( CPU_TIMELINE, animPr ); + communicator = HWSim__create_TimeLine_of_type( COMM_TIMELINE, animPr); + HWSim__add_TimeLine_to_TimeDomain( cpu, node ); + HWSim__add_TimeLine_to_TimeDomain( communicator, node ); + //This stores the target ptr + port-num in the out-port position in + // the cpu TimeLine -- so spans in cpu TimeLine can look up target + HWSim__connect_TimeLine_outPort_to_TimeLine_inPort( + cpu, COMMUNICATOR_OUTPORT, communicator, CPU_INPORT ); + } + + //Done -- the architecture is very simple for now -- inter-node comm + // happens via acquire spans, which use global vars to find the target + // communicator, and the comm spans have the target port hard-coded. + //So no inter-node communication connections + } + +/*The acquire handler uses HWSim calls to generate a communication in the + * Communicator TimeLine. When the ConsistentTime reaches the appointed + * simulation time at which that comm arrives, it triggers the acquire-start + * span in the Communicator TimeLine. + * + *Q: what's with HWSim__send_comm? Can just use that, don't need to register + * an instruction.. Means GuestCode span just keeps going.. pause it during + * an HWInstr, then resume it.. Want multiple spans for any reason? + */ +void +handle_Acquire_HWInstr_request() + { + + + } + +/*The release handler uses HWSim calls to generate a communication in the + * Communicator TimeLine. When the ConsistentTime reaches the appointed + * simulation time at which that comm arrives, it triggers the release + * span in the Communicator TimeLine. 
+ */ +void +handle_Release_HWInstr_request() + { + + } diff -r 000000000000 -r 8ea476474093 src/Application/SimParams.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Application/SimParams.c Mon Nov 07 16:03:01 2011 -0800 @@ -0,0 +1,91 @@ +/* + * Copyright 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * Author: seanhalle@yahoo.com + * + * Created on November 15, 2009, 2:35 AM + */ + +#include +#include + +#include "SimParams.h" +#include "ParamHelper/Param.h" + + +uint8 * +read_Machine_Code_From_File( int numBytesInFile, char *machineCodeFileName ); + + +void +fill_sim_params_from_bag( SimulationParams *simParams, ParamBag *paramBag ) + { char *guestAppFileName, *systemCodeFileName; + int numBytesInGuestApp, numBytesInSystemCode; + + ParamStruc *param; + param = getParamFromBag( "GuestApplicationFileName", paramBag ); + guestAppFileName = param->strValue; + param = getParamFromBag( "numBytesInGuestApp", paramBag ); + numBytesInGuestApp = param->intValue; + + simParams->guestApp = + read_Machine_Code_From_File( numBytesInGuestApp, guestAppFileName ); + + param = getParamFromBag( "SystemCodeFileName", paramBag ); + systemCodeFileName = param->strValue; + param = getParamFromBag( "numBytesInSystemCode", paramBag ); + numBytesInSystemCode = param->intValue; + + simParams->systemCode = + read_Machine_Code_From_File( numBytesInSystemCode, systemCodeFileName ); + + + param = getParamFromBag( "numNodes", paramBag ); + simParams->numNodes = param->intValue; + + } + + + +uint8 * +read_Machine_Code_From_File( int numBytesInFile, char *machineCodeFileName ) + { int byte; + FILE *file; + char *machineCode = malloc( numBytesInFile ); + if( machineCode == NULL ) printf( "\nno mem for machine code\n" ); + + file = fopen( machineCodeFileName, "r" ); + if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} + + fseek( file, 0, SEEK_SET ); + for( byte = 0; byte < numBytesInFile; byte++ ) + { + if( feof( file 
) ) printf( "file ran out too soon" ); + machineCode[byte] = getchar( file ); + + } + return machineCode; + } + + + //========================================================================== +void +printSimResults( SimulationResults simResults ) + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; + float32 *matrixArray; + + numRows = rowsToPrint = matrix->numRows; + numCols = colsToPrint = matrix->numCols; + matrixArray = matrix->array; + + rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed + colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed + for( r = 0; r < numRows; r += rowIncr ) + { for( c = 0; c < numCols; c += colIncr ) + { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); + } + printf("\n"); + } + } + diff -r 000000000000 -r 8ea476474093 src/Application/SimParams.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Application/SimParams.h Mon Nov 07 16:03:01 2011 -0800 @@ -0,0 +1,48 @@ +/* + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + */ + +#ifndef MATRIX_MULT_H_ +#define MATRIX_MULT_H_ + +#include +#include +#include + +#include "../HWSim_lib/VMS/VMS_primitive_data_types.h" +#include "ParamHelper/Param.h" + +//============================== Structures ============================== + +typedef +struct + { uint8 *guestApp; + uint8 *systemCode; + int32 numNodes; + } +SimulationResults; + + +typedef +struct + { uint8 *guestApp; + uint8 *systemCode; + int32 numNodes; + SimulationResults *simResults; + } +SimulationParams; + + + +//============================== Functions ================================ + +void +printSimResults( SimulationResults simResults ); + +void +fill_sim_params_from_bag( SimulationParams *simParams, ParamBag *paramBag ); + +//=========================================================================== + +#endif /*MATRIX_MULT_H_*/ diff -r 000000000000 -r 8ea476474093 
src/Application/main.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Application/main.c Mon Nov 07 16:03:01 2011 -0800 @@ -0,0 +1,48 @@ +/* + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org + * Licensed under GNU General Public License version 2 + * + * author seanhalle@yahoo.com + */ + +#include +#include + +#include "SimParams.h" +#include "HWSim_TeraFlux/HWSim_TeraFlux.h" + +/** + * + */ +int main( int argc, char **argv ) + { SimulationParams *simParams; + SimulationResults *simResults; + ParamBag *paramBag; + + printf( "arguments: %s | %s\n", argv[0], argv[1] ); + + simParams = malloc( sizeof(SimulationParams) ); + + + //VMS has its own separate internal malloc, so to get results out, + // have to pass in empty array for it to fill up + //The alternative is internally telling HWSim make external space to use + simResults = malloc( sizeof(SimulationResults) ); + simParams->simResults = simResults; + + paramBag = makeParamBag(); + + readParamFileIntoBag( argv[1], paramBag ); + fill_sim_params_from_bag( simParams, paramBag ); + + + constructAndSimulateSystem( simParams ); + + printSimResults( simResults ); + + fflush(stdin); + + exit(0); //cleans up + } + +