changeset 0:8ea476474093

Initial add -- gobbeldegook
author Me@portablequad
date Mon, 07 Nov 2011 16:03:01 -0800
parents
children 7566745e812a
files .hgignore src/Application/HWSim__Hello_World_HW/Communicator_TimeLine.c src/Application/HWSim__Hello_World_HW/DESIGN_NOTES.txt src/Application/HWSim__Hello_World_HW/EntryPoint.c src/Application/HWSim__Hello_World_HW/Seed_VP.c src/Application/SimParams.c src/Application/SimParams.h src/Application/main.c
diffstat 8 files changed, 1080 insertions(+), 0 deletions(-) [+]
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/.hgignore	Mon Nov 07 16:03:01 2011 -0800
     1.3 @@ -0,0 +1,6 @@
     1.4 +syntax: glob
     1.5 +
     1.6 +nbproject
     1.7 +build
     1.8 +dist
     1.9 +*.o
    1.10 \ No newline at end of file
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/src/Application/HWSim__Hello_World_HW/Communicator_TimeLine.c	Mon Nov 07 16:03:01 2011 -0800
     2.3 @@ -0,0 +1,297 @@
     2.4 +/*
     2.5 
     2.6 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     2.7 
     2.8 + *  Licensed under GNU General Public License version 2
     2.9 
    2.10 + *
    2.11 
    2.12 + * Author: seanhalle@yahoo.com
    2.13 
    2.14 + *
    2.15 
    2.16 + */
    2.17 
    2.18 +
    2.19 
    2.20 +#include "HWSim_TeraFlux.h"
    2.21 
    2.22 +
    2.23 
    2.24 +//=====================
    2.25 
    2.26 +
    2.27 
    2.28 +//===========================================================================
    2.29 
    2.30 +/*This is the collection of spans for the Communication Processor TimeLine
    2.31 
    2.32 + *
    2.33 
    2.34 + *This TimeLine does acquire-start, acquire-send, release, acquire-receive,
    2.35 
    2.36 + * all triggered by communications coming from other TimeLines.
    2.37 
    2.38 + * 
    2.39 
    2.40 + *  Sometime after Feb 2011 it will also have the control-
    2.41 
    2.42 + * communications that Albert wants for Erbium (the broad-cast of counter
    2.43 
    2.44 + * updates to registered listener TNodes).  And, it will probably get some
    2.45 
    2.46 + * form of control communication for implementing a fast chip-wide VMS.
    2.47 
    2.48 + *
    2.49 
    2.50 + *
    2.51 
    2.52 + *
    2.53 
    2.54 + *The kinds of span the TimeLine can have are:
    2.55 
    2.56 +-] acquire-start -- is triggered by the reception of a "start-acquire"
    2.57 
    2.58 +   communication from the CPU TimeLine. It generates an "acquire send"
    2.59 
    2.60 +   communication to the TNode that currently owns the data.
    2.61 
    2.62 +-] acquire-send -- when triggered, checks if the Frame is free.  If yes,
    2.63 
    2.64 +   sends an "acquire-receive" communication back to the requester.  If not
    2.65 
    2.66 +   free, places data representing the acquire-request into a queue of waiting
    2.67 
    2.68 +   ones in the Communicator TimeLine.  When the Communicator receives a
    2.69 
    2.70 +   "release" communication from the CPU TimeLine, it runs the release span.
    2.71 
    2.72 +-] release -- takes the data of the next request waiting for the released
    2.73 
    2.74 +   Frame out of the queue and then sends an "acquire-receive" communication
    2.75 
    2.76 +   to the TNode requesting.
    2.77 
    2.78 +-] acquire-receive -- when triggered, runs firmware, which writes memory
    2.79 
    2.80 +   shared with the CPU, modifying data-structures.  This firmware for
    2.81 
    2.82 +   TeraFlux works with the System Code that runs on the CPU TimeLine to
    2.83 
    2.84 +   notify it that the acquire is complete.
    2.85 
    2.86 + *
    2.87 
    2.88 + *
    2.89 
    2.90 + *Background on acquire:
    2.91 
    2.92 + *The request handler invokes acquire by placing the calling VP into
    2.93 
    2.94 + * a holding list, then executing the hardware instruction that starts the
    2.95 
    2.96 + * acquire.
    2.97 
    2.98 + *This instruction sends a communication from the CPU TimeLine to the
    2.99 
   2.100 + * Communicator TimeLine.  The instruction passes a pointer to the
   2.101 
   2.102 + * list element, and also passes a pointer to the queue of
   2.103 
   2.104 + * ready VPs.  When acquire comm is done, the Communicator takes the list
   2.105 
   2.106 + * element out and recycles it, and places the pointer to the VP into the
   2.107 
   2.108 + * queue of ready VPs.
   2.109 
   2.110 + * 
   2.111 
   2.112 + *In the actual hardware, each node has an ultra-simple communication
   2.113 
   2.114 + * processor -- like an 8 bit control data-path and a physical-addr-width
   2.115 
   2.116 + * addr-data-path (simple -- only power-of-2 shift, add, and maybe mask).
   2.117 
   2.118 + *The acquire-instruction parameters are placed into a data
   2.119 
   2.120 + * structure and only the pointer to them is in a register.  The CPU performs
   2.121 
   2.122 + * a write of the ptr to a particular physical addr, which the comm hardware
   2.123 
   2.124 + * catches and queues.  The comm-processor is driven by the queue -- stalls
   2.125 
   2.126 + * when queue is empty -- returns after finishes to get next from queue.
   2.127 
   2.128 + *Firmware in the comm-processor then fetches the params out of the data
   2.129 
   2.130 + * structure and starts the communication.  The comm involves the
   2.131 
   2.132 + * BIU, which is asked to grant acquire on the frame. BIU sends back
   2.133 
   2.134 + * the node that currently has the frame, or else main-mem physical address
   2.135 
   2.136 + * range.
   2.137 
   2.138 + *When comm is complete, the comm processor performs list-element removal,
   2.139 
   2.140 + * recycling, and pointer movement.
   2.141 
   2.142 + *End Background on acquire
   2.143 
   2.144 + *
   2.145 
   2.146 + *So, the HWInstr_acquire() is the hardware instruction called by the
   2.147 
   2.148 + * GuestSystemCode. This instruction's job is to perform the communication
   2.149 
   2.150 + * protocol that gets data from wherever it is and brings it onto the TNode
   2.151 
   2.152 + * executing this instr.
   2.153 
   2.154 + *In the simulator, this happens in a separate time-line than the CPU, which
   2.155 
   2.156 + * is animating another virtual TNode while the communication happens.
   2.157 
   2.158 + *There may be several acquires started while one is in progress.  But not
   2.159 
   2.160 + * going to model queueing of them.  This time-line treats them as
   2.161 
   2.162 + * zero width.  They each start a new span, but the span ends and reports
   2.163 
   2.164 + * its end-time as the same as its start time, then starts the idle-span.
   2.165 
   2.166 + *
   2.167 
   2.168 + *HWInstr_acquire:   CPU TimeLine
   2.169 
   2.170 + * when executes on the CPU TimeLine, sends a communication to the
   2.171 
   2.172 + * Communicator TimeLine, which equals inserting an acquire-start span into
   2.173 
   2.174 + * the consistent-time trigger-priority-queue that's driven by the TNode's
   2.175 
   2.176 + * ConsistentTime.  It will stop ConsistentTime advancement at its target
   2.177 
   2.178 + * arrival time, ending the Idle-span and starting the acquire-start span.
   2.179 
   2.180 + *
   2.181 
   2.182 + *Acquire-start:  requesting TNode's Communicator TimeLine
   2.183 
   2.184 + * this span looks inside the data-struct to find the Frame start addr.  It
   2.185 
   2.186 + * looks this up in a hash table to see which TNode owns the Frame.  If none,
   2.187 
   2.188 + * means hasn't been allocated yet, so puts acquire-info into the wait-queue
   2.189 
   2.190 + * that's in the hash entry.
   2.191 
   2.192 + *
   2.193 
   2.194 + *Acquire-send: Dwelling TNode's Communicator TimeLine
   2.195 
   2.196 + * When advancement of ConsistentTime stopped by the acquire-send reception,
   2.197 
   2.198 + * check to see
   2.199 
   2.200 + * if the desired Frame is still owned (hash table keyed by Frame start
   2.201 
   2.202 + * addr).  If desired Frame is owned, places the data of the request into
   2.203 
   2.204 + * the queue of waiting acquires that's in the hash-entry.
   2.205 
   2.206 + * If no longer owned, then set new Dwelling TNode to be requester and
   2.207 
   2.208 + * place Acquire-receive into waiting-comm queue of requesting TNode
   2.209 
   2.210 + * TimeDomain.
   2.211 
   2.212 + *
   2.213 
   2.214 + *Release: CPUTimeLine of Owning TNode
   2.215 
   2.216 + * put Release into Communicator TimeLine, with the Frame start addr in it
   2.217 
   2.218 + *
   2.219 
   2.220 + *Release: Communicator TimeLine of Owning TNode
   2.221 
   2.222 + * look up the hash entry for released Frame, set to Not Owned.  If waitQ
   2.223 
   2.224 + * in the Frame's hash entry is not empty, take next entry, set as Owner of
   2.225 
   2.226 + * the Frame, and as new Dwelling TNode, then place Acquire-receive into
   2.227 
   2.228 + * waiting-comm queue of requesting TNode-TimeDomain.
   2.229 
   2.230 + *
   2.231 
   2.232 + *Acquire-receive: Communicator TimeLine of requesting TNode
   2.233 
   2.234 + * When advancement of ConsistentTime stopped by the acquire-receive
   2.235 
   2.236 + * reception, run the Firmware that sets the shared state, so that the CPU
   2.237 
   2.238 + * TimeLine will see the acquire is complete -- also do the data-structure
   2.239 
   2.240 + * rearrangement stuff.  Finally, do the thing of checking if all VTNodes
   2.241 
   2.242 + * are suspended waiting for acquires -- if so, the receive will have to be
   2.243 
   2.244 + * pro-active in restarting the CPU TimeLine, kicking it out of the
   2.245 
   2.246 + * LightSleep span. THIS SPECIAL SPAN IS KEY BENEFIT OF HW SUPPORT FOR VMS --
   2.247 
   2.248 + * can put hardware into light sleep mode when nothing to do -- power/energy
   2.249 
   2.250 + * saver.
   2.251 
   2.252 + * 
   2.253 
   2.254 + *
   2.255 
   2.256 + *So, a time-line is the virtual processor itself.  It animates the
   2.257 
   2.258 + * "current span" as the top-level function.  When that span executes span-end,
   2.259 
   2.260 + * the next span's function replaces the old one as the top-level function.
   2.261 
   2.262 + * The stack is also reset, and the parameters for the new span are placed
   2.263 
   2.264 + * on the stack, and the pointer to the span's function is set as the resume-
   2.265 
   2.266 + * ptr.  When the time-line VP is resumed, it's equivalent to that span's
   2.267 
   2.268 + * function being called.
   2.269 
   2.270 + *The time-line VP has app-specific data that says whether the current-span
   2.271 
   2.272 + * is the Idle span, and the simulated-time of the last span-end.
   2.273 
   2.274 + *
   2.275 
   2.276 + *So, rather than having a single top-level function, a timeLine VP has many,
   2.277 
   2.278 + * a different TLF for each kind of span.
   2.279 
   2.280 + *
   2.281 
   2.282 + *This file holds all the spans == TLFs for the comm-procr TimeLine.
   2.283 
   2.284 + */
   2.285 
   2.286 +
   2.287 
   2.288 +
   2.289 
   2.290 +//===========================================================================
   2.291 
   2.292 +/*
   2.293 
   2.294 + *Acquire-start:  requesting TNode's Communicator TimeLine
   2.295 
   2.296 + * this span looks inside the data-struct to find the Frame start addr.  It
   2.297 
   2.298 + * looks this up in a hash table to see which TNode owns the Frame.  If none,
   2.299 
   2.300 + * means hasn't been allocated yet, so puts acquire-info into the wait-queue
   2.301 
   2.302 + * that's in the hash entry.
   2.303 
   2.304 + *Q: want to put global data-structs into HWSim, with some protection
   2.305 
   2.306 + *  mechanism (like the transactions have already implemented), or want to
   2.307 
   2.308 + *  make general request-handler extension thingie?  Request-handler
   2.309 
   2.310 + *  extensions would be done by registering a handler function during
   2.311 
   2.312 + *  architecture-definition code.
   2.313 
   2.314 + *A: transactions awkward..  make all state be either contained in the
   2.315 
   2.316 + *  TimeLine, or else global.  If in TimeLine, has to be allocated during
   2.317 
   2.318 + *  architecture definition, and if global, has to be declared, and a
   2.319 
   2.320 + *  pointer to the functions that access it, along with an ID for that kind
   2.321 
   2.322 + *  of access is registered.  Then, in the span-code, call
   2.323 
   2.324 + *  HWSim__access_global_var( params, ACCESSID )  the ACCESSID determines
   2.325 
   2.326 + *  which function-pointer is called, and the function code determines
   2.327 
   2.328 + *  which global var is accessed, and the params hold all the data the
   2.329 
   2.330 + *  function needs to do whatever is to be done.
   2.331 
   2.332 + */
   2.333 
   2.334 +void
   2.335 
   2.336 +commSpan_acquire_start( void *_params, VirtProcr *animTimeLine )
   2.337 
   2.338 + { //Span: find the Frame's owning TNode, send its Communicator a comm, go idle
   2.339 
   2.340 +   AcquireParams  *params;
   2.341 
   2.342 +      //_params arrives as void*; this span treats it as AcquireParams
   2.343 
   2.344 +   params    = (AcquireParams *)_params;
   2.345 
   2.346 +
   2.347 
   2.348 +         DEBUG( dbgAppFlow, "acquire_start\n", cloneAcquireParams( params ));
   2.349 
   2.350 +      //NOTE(review): residingTNode, targetTL, duration have no declaration in this function -- confirm
   2.351 
   2.352 +      //invoke global-var-accessor to get the TNode that owns the Frame
   2.353 
   2.354 +   residingTNode =
   2.355 
   2.356 +      HWSim__access_global_var( params, GET_OWNING_TNODE, animTimeLine );
   2.357 
   2.358 +      //NOTE(review): result is dereferenced unchecked; header comment mentions a "no owner" case -- confirm accessor handles it
   2.359 
   2.360 +      //send a communication to that TNode's Communicator
   2.361 
   2.362 +   targetTL = residingTNode->communicatorTL;
   2.363 
   2.364 +
   2.365 
   2.366 +      //params are: amount of simulated time the communication takes,
   2.367 
   2.368 +      // the TimeLine receiving, the span-function to run when consistent-
   2.369 
   2.370 +      // time reaches the reception time, params for that span, and animTL
   2.371 
   2.372 +   HWSim__send_comm( calcNetworkTime(), targetTL, targetTL->sendAcquireSpan,
   2.373 
   2.374 +                     params, animTimeLine );
   2.375 
   2.376 +
   2.377 
   2.378 +   duration = 0; //starting an acquire modeled as taking zero time
   2.379 
   2.380 +
   2.381 
   2.382 +      //every span function ends with this call -- duration of this span,
   2.383 
   2.384 +      // pointer to next span-function to run, params for it, and animTL
   2.385 
   2.386 +   HWSim__transition_to_new_span( duration, IDLE_SPAN, NULL, animTimeLine );
   2.387 
   2.388 + }
   2.389 
   2.390 +
   2.391 
   2.392 +
   2.393 
   2.394 +/*Acquire-send: Dwelling TNode's Communicator TimeLine
   2.395 
   2.396 + * Runs when advancement of ConsistentTime stopped by the acquire-send
   2.397 
   2.398 + * reception.  Check to see
   2.399 
   2.400 + * if the desired Frame is still owned (hash table keyed by Frame start
   2.401 
   2.402 + * addr).  If desired Frame is owned, place the data of the request into
   2.403 
   2.404 + * the queue of waiting acquires that's in the hash-entry.
   2.405 
   2.406 + * If already released, then set new Dwelling TNode to be requester and
   2.407 
   2.408 + * do an Acquire-receive back to the requesting TNode, which has the effect
   2.409 
   2.410 + * of placing an acquire-receive span to wait in its consistent-time-arrest
   2.411 
   2.412 + * queue.
   2.413 
   2.414 + *The release span will take waiting requests out of the waiting-acquires Q
   2.415 
   2.416 + */
   2.417 
   2.418 +void
   2.419 
   2.420 +commSpan_acquire_send( void *_params, VirtProcr *animTimeLine )
   2.421 
   2.422 + { //Span: ask the global accessor if the Frame is free; if so, send acquire-receive to the requester; then go idle
   2.423 
   2.424 +   AcquireParams  *params;
   2.425 
   2.426 +      //_params arrives as void*; this span treats it as AcquireParams
   2.427 
   2.428 +   params    = (AcquireParams *)_params;
   2.429 
   2.430 +
   2.431 
   2.432 +         DEBUG( dbgAppFlow, "acquire_send\n", cloneAcquireParams( params ));
   2.433 
   2.434 +      //NOTE(review): notCurrentlyOwned and targetTL have no declaration in this function -- confirm
   2.435 
   2.436 +
   2.437 
   2.438 +      //invoke global-var-accessor to lookup hash entry and see if Frame is
   2.439 
   2.440 +      // still owned, and if so, add this acquire to queue of waiting ones.
   2.441 
   2.442 +      // note, this is non-physical behavior -- any use of global vars is
   2.443 
   2.444 +      // non-physical.  To make this function physical, implement a TimeLine
   2.445 
   2.446 +      // that holds the hash table and all other TimeLines communicate to.
   2.447 
   2.448 +      //Caveat there is that collisions can happen unless also impl protocol.
   2.449 
   2.450 +      // So leave that for later improvement.
   2.451 
   2.452 +   notCurrentlyOwned =
   2.453 
   2.454 +      HWSim__access_global_var( params, DO_ACQUIRE_SEND, animTimeLine );
   2.455 
   2.456 +
   2.457 
   2.458 +   if( notCurrentlyOwned )
   2.459 
   2.460 +    {
   2.461 
   2.462 +         //send a communication to requesting TNode's Communicator
   2.463 
   2.464 +      targetTL = params->requestingTNode->communicatorTL;
   2.465 
   2.466 +         //NOTE(review): COMM_INPORT below is a 3rd argument the list does not describe and commSpan_acquire_start does not pass -- confirm signature
   2.467 
   2.468 +         //params are: amount of simulated time the communication takes,
   2.469 
   2.470 +         // the TimeLine receiving, the span-function to run when consistent-
   2.471 
   2.472 +         // time reaches the reception time, params for that span, and animTL
   2.473 
   2.474 +      HWSim__send_comm( calcNetworkTime(), targetTL, COMM_INPORT,
   2.475 
   2.476 +                        &commSpan_acquire_receive, params, animTimeLine );
   2.477 
   2.478 +    }
   2.479 
   2.480 +      //NOTE(review): no duration argument here, unlike the call in commSpan_acquire_start -- confirm signature
   2.481 
   2.482 +   HWSim__transition_to_new_span( IDLE_SPAN, NULL, animTimeLine );
   2.483 
   2.484 + }
   2.485 
   2.486 +
   2.487 
   2.488 +
   2.489 
   2.490 +/*Acquire-receive: Communicator TimeLine of requesting TNode
   2.491 
   2.492 + * When advancement of ConsistentTime stopped by the acquire-receive
   2.493 
   2.494 + * reception, run the "Firmware" that sets the shared state, so that the CPU
   2.495 
   2.496 + * TimeLine will see the acquire is complete -- also do the data-structure
   2.497 
   2.498 + * rearrangement stuff.  Finally, do the thing of checking if all VTNodes
   2.499 
   2.500 + * are suspended waiting for acquires -- if so, the receive will have to be
   2.501 
   2.502 + * pro-active in restarting the CPU TimeLine, kicking it out of the
   2.503 
   2.504 + * LightSleep span. THIS SPECIAL SPAN IS KEY BENEFIT OF HW SUPPORT FOR VMS --
   2.505 
   2.506 + * can put hardware into light sleep mode when nothing to do -- power/energy
   2.507 
   2.508 + * saver.
   2.509 
   2.510 + */
   2.511 
   2.512 +void
   2.513 
   2.514 +commSpan_acquire_receive( void *_params, VirtProcr *animTimeLine )
   2.515 
   2.516 + { //Span: as written, only issues the DEBUG trace and returns the TimeLine to IDLE_SPAN
   2.517 
   2.518 +   AcquireParams  *params;
   2.519 
   2.520 +      //_params arrives as void*; this span treats it as AcquireParams
   2.521 
   2.522 +   params    = (AcquireParams *)_params;
   2.523 
   2.524 +
   2.525 
   2.526 +         DEBUG( dbgAppFlow, "acquire_receive\n", cloneAcquireParams(params));
   2.527 
   2.528 +      //NOTE(review): the firmware work described in the header comment is not present in this body
   2.529 
   2.530 +      //NOTE(review): no duration argument here, unlike the call in commSpan_acquire_start -- confirm signature
   2.531 
   2.532 +   HWSim__transition_to_new_span( IDLE_SPAN, NULL, animTimeLine );
   2.533 
   2.534 + }
   2.535 
   2.536 +
   2.537 
   2.538 +
   2.539 
   2.540 +/*Release: Communicator TimeLine of Owning TNode
   2.541 
   2.542 + * look up the hash entry for released Frame, set to Not Owned.  If waitQ
   2.543 
   2.544 + * in the Frame's hash entry is not empty, take next entry, set as Owner of
   2.545 
   2.546 + * the Frame, and as new Dwelling TNode, then place Acquire-receive into
   2.547 
   2.548 + * waiting-comm queue of requesting TNode-TimeDomain.
   2.549 
   2.550 + */
   2.551 
   2.552 +void
   2.553 
   2.554 +commSpan_release( void *_params, VirtProcr *animTimeLine )
   2.555 
   2.556 + { //Span: as written, only issues the DEBUG trace and returns the TimeLine to IDLE_SPAN
   2.557 
   2.558 +   AcquireParams  *params;
   2.559 
   2.560 +      //_params arrives as void*; this span treats it as AcquireParams
   2.561 
   2.562 +   params    = (AcquireParams *)_params;
   2.563 
   2.564 +      //trace label names this span, so release traces can be told apart from acquire-start traces
   2.565 
   2.566 +         DEBUG( dbgAppFlow, "release\n", cloneAcquireParams( params ));
   2.567 
   2.568 +      //NOTE(review): the wait-queue handling described in the header comment is not present in this body
   2.569 
   2.570 +      //NOTE(review): no duration argument here, unlike the call in commSpan_acquire_start -- confirm signature
   2.571 
   2.572 +   HWSim__transition_to_new_span( IDLE_SPAN, NULL, animTimeLine );
   2.573 
   2.574 + }
   2.575 
   2.576 +
   2.577 
   2.578 +/*At reset only starts the Idle span in the communicator.
   2.579 
   2.580 + */
   2.581 
   2.582 +void
   2.583 
   2.584 +commSpan_at_reset( void *_params, VirtProcr *animTimeLine )
   2.585 
   2.586 + { //Span: issues the DEBUG trace, then puts the TimeLine into IDLE_SPAN; _params is not read
   2.587 
   2.588 +
   2.589 
   2.590 +         DEBUG( dbgAppFlow, "commSpan at_reset\n", NULL );
   2.591 
   2.592 +      //NOTE(review): no duration argument here, unlike the call in commSpan_acquire_start -- confirm signature
   2.593 
   2.594 +   HWSim__transition_to_new_span( IDLE_SPAN, NULL, animTimeLine );
   2.595 
   2.596 + }
   2.597 
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/src/Application/HWSim__Hello_World_HW/DESIGN_NOTES.txt	Mon Nov 07 16:03:01 2011 -0800
     3.3 @@ -0,0 +1,369 @@
     3.4 +
     3.5 
     3.6 +
     3.7 
     3.8 +TeraFlux Hardware Model  (impl of this model in terms of HWSim is below)
     3.9 
    3.10 +
    3.11 
    3.12 +As of Feb 2011, the hardware being simulated is:
    3.13 
    3.14 +
    3.15 
    3.16 +A number of TNodes on a chip, which are connected by a network, with a
    3.17 
    3.18 + main-memory.
    3.19 
    3.20 +
    3.21 
    3.22 +The address space is divided into a local address-space and a shared
    3.23 
    3.24 + address-space.  Both address spaces are divided among the TNodes -- each
    3.25 
    3.26 + has its own unique range of local virtual addresses and own range of
    3.27 
    3.28 + global virtual addresses.  A given TNode may only allocate virtual
    3.29 
    3.30 + addresses within its own two ranges.  The contents of *local* addresses it
    3.31 
    3.32 + allocates may only ever be seen by the allocating TNode.  The contents of
    3.33 
    3.34 + shared addresses may be seen by any TNode after executing an acquire
    3.35 
    3.36 + hardware instruction and being granted the contents.
    3.37 
    3.38 +
    3.39 
    3.40 +Each TNode has a single CPU core, a local memory, and a
    3.41 
    3.42 + communication processor with its own network hardware.
    3.43 
    3.44 +
    3.45 
    3.46 +The local memory size is not modelled, so is considered unlimited.
    3.47 
    3.48 +The network has undefined topology and is modelled as having constant latency
    3.49 
    3.50 + from any TNode to any other TNode, with a fixed BW between any two TNodes.
    3.51 
    3.52 +
    3.53 
    3.54 +The only modelled communication is movement of data, which can only be
    3.55 
    3.56 + triggered by the "acquire" hardware instruction.
    3.57 
    3.58 +
    3.59 
    3.60 +The behavior of acquire is defined as:
    3.61 
    3.62 +-] Only one TNode at a time owns a given shared-memory "Frame", which has a
    3.63 
    3.64 +   start address and a size.
    3.65 
    3.66 +-] A given shared virtual address is in at most one Frame for the duration
    3.67 
    3.68 +   of a program run (IE, no overlap of Frames, implying no change of size).
    3.69 
    3.70 +-] The hardware mechanism by which single-ownership is enforced is not
    3.71 
    3.72 +   modelled.  Control communication is considered infinitely fast.
    3.73 
    3.74 +-] The simulated time between a given TNode's CPU executing the acquire
    3.75 
    3.76 +   instruction and the simulated time the data of the Frame appears in the
    3.77 
    3.78 +   local memory of that TNode is determined by both a queue of waiting
    3.79 
    3.80 +   acquire requests and the network time required to move the data.
    3.81 
    3.82 +
    3.83 
    3.84 +The CPU in a TNode executes the standard x86 instruction set.
    3.85 
    3.86 +
    3.87 
    3.88 +The communication processor performs the acquire instruction and the 
    3.89 
    3.90 + release instruction.  The exact hardware mechanism by which these
    3.91 
    3.92 + instructions get from the instruction stream fetched by the CPU to the
    3.93 
    3.94 + communication processor is not defined.
    3.95 
    3.96 +
    3.97 
    3.98 +As an aside: for simulation, the acquire and release instructions are
    3.99 
   3.100 + implemented in the lightweight simulator as library-calls that trigger
   3.101 
   3.102 + the simulation infrastructure. In the application they are stated as
   3.103 
   3.104 + macro-calls.  These macros can then be defined in the tool-chain.
   3.105 
   3.106 +When the COTSon simulator is targeted, the
   3.107 
   3.108 + macro is implemented as an in-line assembly custom op-code.  When the
   3.109 
   3.110 + lightweight simulator is targeted, the macro is implemented as a call to
   3.111 
   3.112 + a library, which invokes the acquire or release functionality in the
   3.113 
   3.114 + lightweight simulator.
   3.115 
   3.116 +
   3.117 
   3.118 +The effect of transferring data between TNode local memory and the chip's
   3.119 
   3.120 + main memory is not modelled.
   3.121 
   3.122 +
   3.123 
   3.124 +The portions of the hardware left undefined or not modelled may all be
   3.125 
   3.126 + filled-in in future versions, according to research needs.
   3.127 
   3.128 +
   3.129 
   3.130 +
   3.131 
   3.132 +=============================================================================
   3.133 
   3.134 +Implementing the hardware model in terms of HWSim
   3.135 
   3.136 +
   3.137 
   3.138 +HWSim is used to implement the TeraFlux hardware model for simulation.
   3.139 
   3.140 +
   3.141 
   3.142 +The implementation consists of four elements:
   3.143 
   3.144 +1) A CPU TimeLine, which executes the Guest code
   3.145 
   3.146 +2) A Communicator TimeLine, which performs all inter-node communication
   3.147 
   3.148 +3) A TNode TimeDomain, which has one CPU TimeLine and one Communicator
   3.149 
   3.150 +   TimeLine in it
   3.151 
   3.152 +4) A Chip TimeDomain, which has a number of TNode TimeDomains in it.
   3.153 
   3.154 +
   3.155 
   3.156 +For the first version, there is no local memory element in a TNode, there is
   3.157 
   3.158 + no Main Memory node, and there is no address translation mechanism modelled.
   3.159 
   3.160 + These may be added later, and so might a TSU mechanism, and experimental
   3.161 
   3.162 + alternative memory models.
   3.163 
   3.164 +
   3.165 
   3.166 +The CPU TimeLine has as spans:
   3.167 
   3.168 +-] at_reset -- standard span that runs when the hardware is reset.  Runs the
   3.169 
   3.170 +   TeraFlux System Code bootstrap function.
   3.171 
   3.172 +-] guestCode span -- runs whatever Guest code is pointed to.. each span of
   3.173 
   3.174 +   this type is created with a pointer to code to run
   3.175 
   3.176 +-] lightSleep -- implemented as the built-in IDLE span.  When no virtual
   3.177 
   3.178 +    nodes are ready to animate, the CPU enters light sleep until the
   3.179 
   3.180 +    Communicator wakes it up
   3.181 
   3.182 +
   3.183 
   3.184 +The CPU TimeLine generates two kinds of communication-spans:
   3.185 
   3.186 +  acquire-start in Communicator
   3.187 
   3.188 +  release in Communicator
   3.189 
   3.190 +
   3.191 
   3.192 +For now, these communications are considered to be performed by dedicated
   3.193 
   3.194 + hardware in the CPU, so they take exactly one simulated instruction, and
   3.195 
   3.196 + their simulated time is thus included in the measured time of the span.  The
   3.197 
   3.198 + HWSim__send_comm takes a time-stamp before suspending the TimeLine, then
   3.199 
   3.200 + another just after resume, and accumulates -- the end-span adds this in.
   3.201 
   3.202 +
   3.203 
   3.204 +  All TimeLines begin by running the at_reset span defined for that TimeLine.
   3.205 
   3.206 +  The CPU's at_reset is hard-coded to start a GuestCode span that runs the
   3.207 
   3.208 +  TeraFlux System Code's boot sequence.
   3.209 
   3.210 +
   3.211 
   3.212 +The Communicator TimeLine has a number of spans, all related to acquire and
   3.213 
   3.214 +  release:
   3.215 
   3.216 +-] acquire-start -- is triggered by the reception of a "start-acquire"
   3.217 
   3.218 +   communication from the CPU TimeLine. It generates an "acquire send"
   3.219 
   3.220 +   communication to the TNode that currently owns the data.
   3.221 
   3.222 +-] acquire-send -- when triggered, checks if the Frame is free.  If yes,
   3.223 
   3.224 +   sends an "acquire-receive" communication back to the requester.  If not
   3.225 
   3.226 +   free, places data representing the acquire-request into a queue of waiting
   3.227 
   3.228 +   ones in the Communicator TimeLine.  When the Communicator receives a
   3.229 
   3.230 +   "release" communication from the CPU TimeLine, it runs the release span.
   3.231 
   3.232 +-] release -- takes the data of the next request waiting for the released
   3.233 
   3.234 +   Frame out of the queue and then sends an "acquire-receive" communication
   3.235 
   3.236 +   to the TNode requesting.
   3.237 
   3.238 +-] acquire-receive -- when triggered, runs firmware, which writes memory
   3.239 
   3.240 +   shared with the CPU, modifying data-structures.  This firmware for
   3.241 
   3.242 +   TeraFlux works with the System Code that runs on the CPU TimeLine to
   3.243 
   3.244 +   notify it that the acquire is complete.
   3.245 
   3.246 +
   3.247 
   3.248 +Note, nothing checks whether a Frame's addresses are accessed from outside an
   3.249 
   3.250 + Acquire-Release block, which could be a source of difficult-to-find bugs in
   3.251 
   3.252 + the application.
   3.253 
   3.254 +
   3.255 
   3.256 +
   3.257 
   3.258 +=============================================================================
   3.259 
   3.260 +Earlier versions of notes:
   3.261 
   3.262 +
   3.263 
   3.264 +A time-line is a virtual-processor, and has a sequence of spans -- each
   3.265 
   3.266 + span performs one hardware-function, and has a start-time and an end-time
   3.267 
   3.268 + -- those are simulated-time, not physical time.
   3.269 
   3.270 +
   3.271 
   3.272 +Each time-line is created with a start-span that initializes it, then every
   3.273 
   3.274 + span ends with an "end span" sem-lib call.
   3.275 
   3.276 +
   3.277 
   3.278 +There are three kinds of span -- fixed-function spans, which represent
   3.279 
   3.280 + hard-wired hardware behavior, processing-core spans, which represent
   3.281 
   3.282 + processing elements that execute code, and communication-spans, which
   3.283 
   3.284 + cross time-lines.
   3.285 
   3.286 +
   3.287 
   3.288 +a fixed-function span has a fixed function-pointer that it is created with
   3.289 
   3.290 + and jumps to when the time-line is resumed.  Fixed-function spans also have
   3.291 
   3.292 + a pointer to a function that calculates the width of the span.  The
   3.293 
   3.294 + width-caluclating span is defined in the application directory.
   3.295 
   3.296 +
   3.297 
   3.298 +a processing-core span has a function-pointer that is assigned to it by the
   3.299 
   3.300 + end-span call of the preceeding span.  The width is also determined by a
   3.301 
   3.302 + pointer to a width-calculating function.  The width-calculating function
   3.303 
   3.304 + for these spans is also defined in the application directory (In first
   3.305 
   3.306 + teraflux impl, this function uses RDTSC to measure physical execution time,
   3.307 
   3.308 + and makes that the simulated execution time too -- but with a "BS" detector
   3.309 
   3.310 + that sees when the time is significantly larger than the previous
   3.311 
   3.312 + invocation of the same function-pointer).
   3.313 
   3.314 +
   3.315 
   3.316 +Communication spans are special because they cross time-lines.  So, a
   3.317 
   3.318 + communication span has zero width in the time-line it's created in, and
   3.319 
   3.320 + goes onto the queue as a new span in the target time-line (which also has
   3.321 
   3.322 + zero-width).  When the target span runs, it changes the state available to
   3.323 
   3.324 + the target time-line, to represent the reception of the communication.
   3.325 
   3.326 +
   3.327 
   3.328 +=============================
   3.329 
   3.330 +Span-end is the only semantic-library call implemented.  Inside the
   3.331 
   3.332 + request-handler, it causes new spans to be created.
   3.333 
   3.334 +
   3.335 
   3.336 +So, have to have a separate receive time-line, that modifies hardware shared
   3.337 
   3.338 + with other time-lines.  The send span causes a receive-span to be inserted
   3.339 
   3.340 + into the target receive time-line.  
   3.341 
   3.342 +
   3.343 
   3.344 +Receive-spans are zero-width -- they update the hardware-state atomically,
   3.345 
   3.346 + so don't have to worry about conflicts between different receive spans in
   3.347 
   3.348 + the simulator.  The hardware-application that uses the simulator-library
   3.349 
   3.350 + must model the receive hardware and implement the send-hardware function
   3.351 
   3.352 + to work out any physical conflicts among receives targeted to the same
   3.353 
   3.354 + receive time-line.
   3.355 
   3.356 +
   3.357 
   3.358 +
   3.359 
   3.360 +=============================
   3.361 
   3.362 +
   3.363 
   3.364 +Time-lines are specialized to specific hardware functions inside the
   3.365 
   3.366 + Application directory -- that's where the main creates all the time-lines,
   3.367 
   3.368 + and where the spans are implemented that have the behavior of a given type
   3.369 
   3.370 + of time-line.
   3.371 
   3.372 +
   3.373 
   3.374 +For example, if the hardware is a communication-unit, then span-types are
   3.375 
   3.376 + created that have the behavior that does all the setup of a communication
   3.377 
   3.378 + span and then does an end-span that creates as its follow-on the
   3.379 
   3.380 + communication-span.
   3.381 
   3.382 +
   3.383 
   3.384 +Communication spans are special because they cross time-lines.  So, a
   3.385 
   3.386 + communication span has zero width in the time-line it's created in, and
   3.387 
   3.388 + goes onto the queue in the target time-line, where it creates a new span
   3.389 
   3.390 + that also has zero-width.  The target span's function updates the hardware
   3.391 
   3.392 + state available to the target time-line, which may be shared with other
   3.393 
   3.394 + time-lines and that update may cause new spans to be spawned in those.
   3.395 
   3.396 +
   3.397 
   3.398 +if the hardware is a processing-core, then the function points to
   3.399 
   3.400 + Guest-application-code.  This function-pointer is what core_loop jumps to
   3.401 
   3.402 + when it reanimates the time-line virtual processor.
   3.403 
   3.404 +
   3.405 
   3.406 +================  Albert e-mail =================
   3.407 
   3.408 +
   3.409 
   3.410 +
   3.411 
   3.412 +Hi Albert,
   3.413 
   3.414 +
   3.415 
   3.416 +   the simulator is a thing of beauty.  I'm getting goose bumps as I put the
   3.417 
   3.418 + last pieces of the design into place.  It's a sweet thing.
   3.419 
   3.420 +
   3.421 
   3.422 +Just in case you're curious, attached are my design notes <this file>.  The
   3.423 
   3.424 + thing that makes it nice is the clean decomposition and reusability -- the
   3.425 
   3.426 + core simulator only has three things: time-lines, spans, and a
   3.427 
   3.428 + priority-queue with an associated "certain-time" or "consistent-time".
   3.429 
   3.430 +
   3.431 
   3.432 +   A time-line is a VMS virtual-processor, which executes a sequence of
   3.433 
   3.434 + spans.  Each span has a function that represents the behavior performed
   3.435 
   3.436 + during that span, plus a function that calculates the simulated-time width
   3.437 
   3.438 + of that span.  The certain-time represents the advancement of global
   3.439 
   3.440 + simulated-time.  At all points in physical time during the simulation, it
   3.441 
   3.442 + is guaranteed that no spans are waiting to execute that have a
   3.443 
   3.444 + simulated-time older than the certain-time.
   3.445 
   3.446 +   In other words, at any physical moment, there are lots of spans sitting
   3.447 
   3.448 + in queues waiting to run.  So, a given time-line finishes a span at a given
   3.449 
   3.450 + simulated-time point in global simulated-time.  But there may be spans from
   3.451 
   3.452 + other time-lines in the queue that finish at a preceding simulated-time.
   3.453 
   3.454 + But there can never be any waiting spans that finish before the
   3.455 
   3.456 + certain-time.  This is important for communications, which cross
   3.457 
   3.458 + time-lines.
   3.459 
   3.460 +
   3.461 
   3.462 +
   3.463 
   3.464 +   Time-lines, spans, and certain-time are implemented in a semantic-library.
   3.465 
   3.466 +
   3.467 
   3.468 +This is, in essence, a new parallel language for writing hardware-simulators
   3.469 
   3.470 + with.  (this just turns out to be the most natural and most simple way to
   3.471 
   3.472 + write the TeraFlux simulator)
   3.473 
   3.474 +
   3.475 
   3.476 +The behavior of particular hardware is defined as a simulator-application
   3.477 
   3.478 + that makes calls to that semantic-library.
   3.479 
   3.480 +
   3.481 
   3.482 +The main of that simulator-application creates the pieces of the hardware
   3.483 
   3.484 + -- for Teraflux, that means it creates the nodes, and the pieces inside
   3.485 
   3.486 + each node.
   3.487 
   3.488 +
   3.489 
   3.490 +Running this simulator-application equals turning on the power-switch of
   3.491 
   3.492 + the Guest hardware.
   3.493 
   3.494 +
   3.495 
   3.496 +The Guest application code is retrieved by the main of the
   3.497 
   3.498 + simulator-application, and starts running when the "power-switch" is
   3.499 
   3.500 + turned on (running the simulator-application causes the Guest-application
   3.501 
   3.502 + to be retrieved and start running on the hardware created by main).  This
   3.503 
   3.504 + is equivalent to the boot sequence of the BIOS, which happens at power-on.
   3.505 
   3.506 +
   3.507 
   3.508 +So, the end-effect is that full Linux is available to the Guest hardware as
   3.509 
   3.510 + a sort of "escape".  The Guest hardware can use the Host's disk access,
   3.511 
   3.512 + debugging, and everything else, but these usages are "outside" the
   3.513 
   3.514 + simulated time -- they are essentially magic-spells that the Guest
   3.515 
   3.516 + hardware can perform that take place outside of time as far as the Guest
   3.517 
   3.518 + application-code is concerned.  Hence, the main is able to use the Host
   3.519 
   3.520 + Linux to retrieve-from-disk the Guest application (but the disk-access
   3.521 
   3.522 + takes place outside the measurements reported by the simulator).
   3.523 
   3.524 +
   3.525 
   3.526 +Which maybe seems trivial, but I consider it a very cool trick  : )
   3.527 
   3.528 +
   3.529 
   3.530 +The part I like best is the fact that the simulator itself is such a simple
   3.531 
   3.532 + semantic-library, and the behavior of the hardware is written all as
   3.533 
   3.534 + sequential code.  This makes it easily customizable to any architecture
   3.535 
   3.536 + someone might want to investigate, and still run on parallel hardware  : D
   3.537 
   3.538 +
   3.539 
   3.540 +The only caveat is the thing I noted in earlier e-mails about communication
   3.541 
   3.542 + updates -- memory images and other hardware state  atomically  update at
   3.543 
   3.544 + the ends of spans.  So, Guest-code-execution that overlaps the
   3.545 
   3.546 + simulated-time at which reception happens, on a node, will not see that
   3.547 
   3.548 + reception until the end of the span.
   3.549 
   3.550 +   For TeraFlux hardware, the only natural span-endings are acquire
   3.551 
   3.552 + instructions and release instructions.
   3.553 
   3.554 +   What this means for you is that you may want to insert artificial
   3.555 
   3.556 + end-span calls into the Guest application code you generate (I'll provide
   3.557 
   3.558 + a "dummy" call).  Communication-receptions on a node will only become
   3.559 
   3.560 + visible to Guest application code after one of these dummy end-span calls.
   3.561 
   3.562 + So, the granularity of time in the simulation is related to the frequency
   3.563 
   3.564 + of end-span calls in the application code.  If you don't insert any
   3.565 
   3.566 + artificial ones, then acquire and release instructions will be the only
   3.567 
   3.568 + span-ending events, and will define the granularity of
   3.569 
   3.570 + communication-receptions being seen by Guest application code.
   3.571 
   3.572 +   In particular, this means that no Guest application code is allowed to
   3.573 
   3.574 + monitor memory to see when it changes -- so, programming techniques that
   3.575 
   3.576 + would work on real hardware, to detect acquire-updates early won't work
   3.577 
   3.578 + in the simulator -- all Guest code to run on the simulator must use the
   3.579 
   3.580 + acquire-library call, which will suspend the virtual-node the code is
   3.581 
   3.582 + running on until the acquire is complete, then re-start that node at some
   3.583 
   3.584 + simulated-time after the acquire completes.
   3.585 
   3.586 +   
   3.587 
   3.588 + 
   3.589 
   3.590 +No idea when something will start running, but the design is getting quite
   3.591 
   3.592 + detailed, so progress is being made,
   3.593 
   3.594 +
   3.595 
   3.596 +Sean
   3.597 
   3.598 +
   3.599 
   3.600 +
   3.601 
   3.602 +=============================================================================
   3.603 
   3.604 + Notes from before I figured out how to do ConsistentTime for each TimeDomain
   3.605 
   3.606 + and have a priority-queue of waiting communication receptions for the
   3.607 
   3.608 + TimeDomain.
   3.609 
   3.610 +
   3.611 
   3.612 +This illustrates the tortured logic that would have to be gone through otherwise.
   3.613 
   3.614 + *
   3.615 
   3.616 + * that acquire's release point in
   3.617 
   3.618 + * simulated time is either already known, or not yet known.  If already known,
   3.619 
   3.620 + * then recursively check if another TNode has already been granted acquire,
   3.621 
   3.622 + * until reach the end of the chain. If the end is already known without any
   3.623 
   3.624 + * other acquires, then set state that this TNode gets the acquire at the
   3.625 
   3.626 + * release-time of the last in the chain.  This acquire will have an unknown
   3.627 
   3.628 + * release time.
   3.629 
   3.630 + *If the release time of the end of the chain is unknown, then put the
   3.631 
   3.632 + * requesting acquire into a queue of acquires waiting for that Frame. When
   3.633 
   3.634 + * the release happens, it will check if any acquires are in the queue for
   3.635 
   3.636 + * the released Frame.  If so, it will do the same as is done when the end
   3.637 
   3.638 + * of a chain is known -- set the Frame's state to acquired, with release-
   3.639 
   3.640 + * time as the start-time of the new acquire-grant, and unknown end-time.
   3.641 
   3.642 + *
   3.643 
   3.644 + *When a HWInstr_release() is executed, it has a simulated-time at which the
   3.645 
   3.646 + * span starts -- the span will also have zero width and start the idle span
   3.647 
   3.648 + * when it ends, just like the acquire-start span, and the triggered
   3.649 
   3.650 + * spans.
   3.651 
   3.652 + *
   3.653 
   3.654 + *The release span will check the wait queue for the Frame it is releasing,
   3.655 
   3.656 + * and either mark the Frame as free, or else fire off the grant-acquire
   3.657 
   3.658 + * function for the first waiting acquire.
   3.659 
   3.660 + *
   3.661 
   3.662 + *The grant-acquire function checks the ConsistentTime, and if the
   3.663 
   3.664 + * simulated time of the grant precedes the ConsistentTime, then the
   3.665 
   3.666 + * acquire is put into the readyQ as an acquire-send span in the comm
   3.667 
   3.668 + * TimeLine of the TNode that owns the acquired data, or else the MainMem
   3.669 
   3.670 + * Node.
   3.671 
   3.672 + *
   3.673 
   3.674 + *The comm TimeLine that executes an acquire-send creates a comm span between
   3.675 
   3.676 + * the sending TNode or MMNode and the receiving TNode.  The width of the
   3.677 
   3.678 + * span is set by hardware model.  This will be a parameter for experiments.
   3.679 
   3.680 + * It's the main phenomenon affecting performance and scalability.  A fixed
   3.681 
   3.682 + * latency plus Frame-size / fixed-BW to start.
   3.683 
   3.684 + *
   3.685 
   3.686 + *An acquire can only be granted when the Consistent time reaches the
   3.687 
   3.688 + * Release-time of the previous acquire.  That's when know for certain that
   3.689 
   3.690 + * the memory image being acquired is correct, and the acquire order is
   3.691 
   3.692 + * correct (all acquires that want a given Frame will have been queued up
   3.693 
   3.694 + * for that Frame, so can't grant to one, in Host time, then a different
   3.695 
   3.696 + * acquire arrives that SHOULD have been the one given the grant).
   3.697 
   3.698 + *Each Frame has a priority-queue of acquires waiting for it, ordered by the
   3.699 
   3.700 + * simulated time the acquire-request was made.
   3.701 
   3.702 + *When ConsistentTime advances past the last Release of a Frame, then check
   3.703 
   3.704 + * the priority queue of waiting acquires -- if the top is older than the
   3.705 
   3.706 + * ConsistentTime, then grant to that one -- otherwise, move the acquire to
   3.707 
   3.708 + * the TriggerByConsistentTime queue.
   3.709 
   3.710 + *
   3.711 
   3.712 + *Each time ConsistentTime *wants* to advance, check the Trigger priority-
   3.713 
   3.714 + * queue to see if any triggers are older than the proposed new Consistent-
   3.715 
   3.716 + * Time.  If yes, then ConsistentTime is only advanced to that trigger's
   3.717 
   3.718 + * time, and the trigger is performed.
   3.719 
   3.720 + *
   3.721 
   3.722 + *Triggers are spans that have a start-time that depends on actions in other
   3.723 
   3.724 + * time-lines.  So, the span is created in one TimeLine, either ending an
   3.725 
   3.726 + * idle-span, or being queued up to run in that TimeLine's sequence -- all
   3.727 
   3.728 + * dependencies have been satisfied except access to the TimeLine resource.
   3.729 
   3.730 + *
   3.731 
   3.732 + *Each Frame has a hash-entry key'd by the Frame's start-addr.  This entry
   3.733 
   3.734 + * has a priority queue holding acquires waiting for the Frame, sorted
   3.735 
   3.736 + * by sim-time the acquire was executed.
   3.737 
   3.738 +
   3.739 
   3.740 +
   3.741 
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/src/Application/HWSim__Hello_World_HW/EntryPoint.c	Mon Nov 07 16:03:01 2011 -0800
     4.3 @@ -0,0 +1,40 @@
     4.4 +/*
     4.5 
     4.6 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     4.7 
     4.8 + *  Licensed under GNU General Public License version 2
     4.9 
    4.10 + *
    4.11 
    4.12 + * Author: seanhalle@yahoo.com
    4.13 
    4.14 + *
    4.15 
    4.16 + */
    4.17 
    4.18 +
    4.19 
    4.20 +#include <math.h>
    4.21 
    4.22 +
    4.23 
    4.24 +#include "HWSim_TeraFlux.h"
    4.25 
    4.26 +
    4.27 
    4.28 +
    4.29 
    4.30 +
    4.31 
    4.32 +/*Every HWSim system has an "entry point" function that creates the first
    4.33 
    4.34 + * virtual processor, which is the seed processor.
    4.35 
    4.36 + *
    4.37 
    4.38 + *The seed processor will construct the system to be simulated.
    4.39 
    4.40 + *The other files in this directory define the components the system is
    4.41 
    4.42 + * constructed from.
    4.43 
    4.44 + *
    4.45 
    4.46 + *
    4.47 
    4.48 + *This entry-point function follows the same pattern as all entry-point
    4.49 
    4.50 + * functions do:
    4.51 
    4.52 + *1) it creates the params for the seed processor, from the
    4.53 
    4.54 + *    parameters passed into the entry-point function
    4.55 
    4.56 + *2) it calls HWSim__create_seed_procr_and_do_work
    4.57 
    4.58 + *3) it gets the return value from the params struc, frees the params struc,
    4.59 
    4.60 + *    and returns the value from the function
    4.61 
    4.62 + *
    4.63 
    4.64 + */
    4.65 
    4.66 +void
    4.67 
    4.68 +runTheSimulation( SimulationParams *simParams )
    4.69 
    4.70 + {
    4.71 
    4.72 +      //create seed processor, start doing the work, and wait till done
    4.73 
    4.74 +      //This call is the "border crossing" between normal code and HWSim
    4.75 
    4.76 +   HWSim__create_seed_procr_and_do_work( &constructAndSimulateSystem,
    4.77 
    4.78 +                                          simParams );
    4.79 
    4.80 +   
    4.81 
    4.82 + }
    4.83 
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/src/Application/HWSim__Hello_World_HW/Seed_VP.c	Mon Nov 07 16:03:01 2011 -0800
     5.3 @@ -0,0 +1,181 @@
     5.4 +/*
     5.5 
     5.6 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     5.7 
     5.8 + *  Licensed under GNU General Public License version 2
     5.9 
    5.10 + *
    5.11 
    5.12 + * Author: seanhalle@yahoo.com
    5.13 
    5.14 + *
    5.15 
    5.16 + */
    5.17 
    5.18 +
    5.19 
    5.20 +
    5.21 
    5.22 +#include "HWSim_TeraFlux.h"
    5.23 
    5.24 +#include <math.h>
    5.25 
    5.26 +#include <string.h>
    5.27 
    5.28 +
    5.29 
    5.30 +
    5.31 
    5.32 +
    5.33 
    5.34 +//===========================================================================
    5.35 
    5.36 +/*This is the seed processor.
    5.37 
    5.38 + *
    5.39 
    5.40 + *It takes the simulation parameters that were passed in to the entry point
    5.41 
    5.42 + * and uses them to construct the system and start it running.
    5.43 
    5.44 + *
    5.45 
    5.46 + *The way this VP Top-level-function is written is specific to TeraFlux, so
    5.47 
    5.48 + * the system it constructs is specifically a TeraFlux chip.
    5.49 
    5.50 + *
    5.51 
    5.52 + *HWSim, on the other hand, expects to be handed functions that it can call
    5.53 
    5.54 + * itself.  The first function should perform a bunch of
    5.55 
    5.56 + * HWSim__create_TimeLine()  and HWSim__create_TimeDomain()  calls.  The
    5.57 
    5.58 + * second function should connect together the entities created in the first
    5.59 
    5.60 + * function.
    5.61 
    5.62 + *
    5.63 
    5.64 + *So, the job of this seed processor is to construct the parameters those
    5.65 
    5.66 + * two functions will take, and hand them to HWSim with the
    5.67 
    5.68 + * HWSim__run_creation_fn( createFnPtr, createFnParams, animVP)
    5.69 
    5.70 + */
    5.71 
    5.72 +void
    5.73 
    5.74 +TFSeedVP_TLF( void *_params, VirtProcr *animPr )
    5.75 
    5.76 + {
    5.77 
    5.78 +   TFSimulatorParams   *params;
    5.79 
    5.80 +      //Recover the typed view of the params handed to this top-level fn
    5.81 
    5.82 +   params    = (TFSimulatorParams *)_params;
    5.83 
    5.84 +
    5.85 
    5.86 +         DEBUG( dbgTFHW, "CPU Span at_reset\n", _params );
    5.87 
    5.88 +            //Probe is created and its interval started; no end is recorded here
    5.89 
    5.90 +         int32
    5.91 
    5.92 +         constructProbe = VMS__create_single_interval_probe("constructProbe",
    5.93 
    5.94 +                                                                    animPr );
    5.95 
    5.96 +         VMS__record_sched_choice_into_probe( constructProbe, animPr );
    5.97 
    5.98 +         VMS__record_interval_start_in_probe( constructProbe );
    5.99 
   5.100 +      //Hand HWSim the constructor fn, then ask it to reset and simulate
   5.101 
   5.102 +   HWSim__register_constructor( &constructTeraFluxArch, _params, animPr);
   5.103 
   5.104 +   HWSim__reset_and_sim( params->results, animPr );//animPr suspends til done
   5.105 
   5.106 +
   5.107 
   5.108 +   //=========== Setup 
   5.109 
   5.110 +   /* for performance, want each phys core's master to have own acquire state
   5.111 
   5.112 +    *  locally, and only read some config info that tells it whether needs
   5.113 
   5.114 +    *  to read other data to update itself, or something..
   5.115 
   5.116 +    * But, for now, just doing simplest thing.. can add a "comm plugin" to
   5.117 
   5.118 +    * HWSim, so that the handler for communication-calls takes a plugin that
   5.119 
   5.120 +    * it calls..  that lets HWSim be modified, so that acquire is done in
   5.121 
   5.122 +    * the request handler in the master.
   5.123 
   5.124 +    *Other alternative is making a communication-controller element, and send
   5.125 
   5.126 +    * messages to it to do the acquires and releases -- will make that
   5.127 
   5.128 +    * time-line be animated a lot -- will need it to be able to jump around
   5.129 
   5.130 +    * among the physical cores -- so, something about letting a given time-
   5.131 
   5.132 +    * line be able to be animated on whichever core needs it at the moment.
   5.133 
   5.134 +    *Let's see..  the cores will be busy, then one will do an acquire, which
   5.135 
   5.136 +    * will need the acquire-controller time-line -- but don't want that core
   5.137 
   5.138 +    * to run out of work waiting for the controller -- hmmm, how about, use
   5.139 
   5.140 +    * the affinity feature to keep each of the nodes to a particular core,
   5.141 
   5.142 +    * but don't use it on the controller, which will let it move around..
   5.143 
   5.144 +    *So, have separate readyQs -- one for each core, and another for free-
   5.145 
   5.146 +    * floating..  when whatever scheduler is running has its local readyQ
   5.147 
   5.148 +    * empty, it takes from the floating.
   5.149 
   5.150 +    */
   5.151 
   5.152 +   //TODO: make acquire-controller (central control over acquires, no timing)
   5.153 
   5.154 +   //TODO: make array to hold all the nodes.
   5.155 
   5.156 +   //TODO:  loop makes each node and gives it an x and a y ID, and code-ptrs
   5.157 
   5.158 +   //TODO:  (call make_node(), which constructs the four time-lines in a node)
   5.159 
   5.160 +
   5.161 
   5.162 +   //TODO:  loop through, send each "start" signal.
   5.163 
   5.164 +
   5.165 
   5.166 + }
   5.167 
   5.168 +
   5.169 
   5.170 +
   5.171 
   5.172 +/*This function is the constructor given to HWSim by the seed processor.
   5.173 
   5.174 + * It uses HWSim calls to create all the TimeLines and TimeDomains, and to
   5.175 
   5.176 + * hook them together.  Note that HWSim will start them, itself, after this
   5.177 
   5.178 + * constructor is done.
   5.179 
   5.180 + *
   5.181 
   5.182 + *Note, timelines don't have to be connected in order to communicate -- it's
   5.183 
   5.184 + * just one way of getting the needed info to the sending TimeLine, which
   5.185 
   5.186 + * consists of the pointer to the destination TimeLine, and which port to
   5.187 
   5.188 + * tell that target Timeline the communication is coming in on.
   5.189 
   5.190 + *The acquire will have the target TimeLine stored in a hash table, that's
   5.191 
   5.192 + * how the sending TimeLine gets the pointer to the target.  It has the port
   5.193 
   5.194 + * number hard-coded.
   5.195 
   5.196 + */
   5.197 
   5.198 +void
   5.199 
   5.200 +constructTeraFluxArch( void *_params, VirtProcr *animPr )
   5.201 
   5.202 + {
   5.203 
   5.204 +   TFSimulatorParams   *params;
   5.205 
   5.206 +   int nodeNum;
   5.207 
   5.208 +   HWSimTimeDomain *node;
   5.209 
   5.210 +   HWSimTimeLine   *cpu, *communicator;
   5.211 
   5.212 +      //View the untyped params as the TeraFlux simulator params
   5.213 
   5.214 +   params    = (TFSimulatorParams *)_params;
   5.215 
   5.216 +
   5.217 
   5.218 +   //========Define the types of TimeLine, which sets the at_reset fn========
   5.219 
   5.220 +   //
   5.221 
   5.222 +   HWSim__define_TimeLine_type( CPU_TIMELINE, &CPUSpan_at_reset, animPr );
   5.223 
   5.224 +   HWSim__define_TimeLine_type( COMM_TIMELINE, &commSpan_at_reset, animPr );
   5.225 
   5.226 +   
   5.227 
   5.228 +   //==========Register HWSim handlers=========
   5.229 
   5.230 +   //
   5.231 
   5.232 +   // Handlers are special because they run inside HWSim with access to
   5.233 
   5.234 +   //  shared global state and have the ability to start new spans, modify
   5.235 
   5.236 +   //  TimeLine state, and so on.
   5.237 
   5.238 +
   5.239 
   5.240 +      //All data is local to a TimeLine, except global vars.  Those can only
   5.241 
   5.242 +      // be accessed through a registered handler.
   5.243 
   5.244 +   HWSim__register_global_var_accessor( DO_ACQUIRE_SEND, &do_acquire_send,
   5.245 
   5.246 +                                        animPr );
   5.247 
   5.248 +   HWSim__register_global_var_accessor( GET_OWNING_TNODE, &get_owning_TNode,
   5.249 
   5.250 +                                        animPr );
   5.251 
   5.252 +
   5.253 
   5.254 +      //HWInstrs are able to generate communications, start new spans, and
   5.255 
   5.256 +      // so forth -- they are considered extensions of HWSim itself, with the
   5.257 
   5.258 +      // ability to affect the language's internal semantic and scheduling
   5.259 
   5.260 +      // state, and so are created as handlers, which must be registered.
   5.261 
   5.262 +   HWSim__register_HWInstr_type( ACQUIRE_INSTR,
   5.263 
   5.264 +                                 &handle_Acquire_HWInstr_request, animPr );
   5.265 
   5.266 +   HWSim__register_HWInstr_type( RELEASE_INSTR,
   5.267 
   5.268 +                                 &handle_Release_HWInstr_request, animPr );
   5.269 
   5.270 +
   5.271 
   5.272 +   //==========Create the TimeDomains and TimeLines and connect them=========
   5.273 
   5.274 +   // NOTE(review): "params?" below is an unresolved placeholder, not valid C
   5.275 
   5.276 +   for( nodeNum = 0; nodeNum < params->numNodes; nodeNum++ )
   5.277 
   5.278 +    {
   5.279 
   5.280 +      node          = HWSim__create_TimeDomain( params?, animPr );
   5.281 
   5.282 +      cpu           = HWSim__create_TimeLine_of_type( CPU_TIMELINE, animPr );
   5.283 
   5.284 +      communicator  = HWSim__create_TimeLine_of_type( COMM_TIMELINE, animPr);
   5.285 
   5.286 +      HWSim__add_TimeLine_to_TimeDomain( cpu,          node );
   5.287 
   5.288 +      HWSim__add_TimeLine_to_TimeDomain( communicator, node );
   5.289 
   5.290 +         //This stores the target ptr + port-num in the out-port position in
   5.291 
   5.292 +         // the cpu TimeLine -- so spans in cpu TimeLine can look up target
   5.293 
   5.294 +      HWSim__connect_TimeLine_outPort_to_TimeLine_inPort(
   5.295 
   5.296 +         cpu, COMMUNICATOR_OUTPORT, communicator, CPU_INPORT );
   5.297 
   5.298 +    }
   5.299 
   5.300 +
   5.301 
   5.302 +   //Done -- the architecture is very simple for now -- inter-node comm
   5.303 
   5.304 +   // happens via acquire spans, which use global vars to find the target
   5.305 
   5.306 +   // communicator, and the comm spans have the target port hard-coded.
   5.307 
   5.308 +   //So no inter-node communication connections
   5.309 
   5.310 + }
   5.311 
   5.312 +
   5.313 
   5.314 +/*The acquire handler uses HWSim calls to generate a communication in the
   5.315 
   5.316 + * Communicator TimeLine.  When the ConsistentTime reaches the appointed
   5.317 
   5.318 + * simulation time at which that comm arrives, it triggers the acquire-start
   5.319 
   5.320 + * span in the Communicator TimeLine.
   5.321 
   5.322 + *
   5.323 
   5.324 + *Q: what's with HWSim__send_comm?  Can just use that, don't need to register
   5.325 
   5.326 + * an instruction..  Means GuestCode span just keeps going..  pause it during
   5.327 
   5.328 + * an HWInstr, then resume it..  Want multiple spans for any reason?
   5.329 
   5.330 + */
   5.331 
   5.332 +void
   5.333 
   5.334 +handle_Acquire_HWInstr_request()
   5.335 
   5.336 + {
   5.337 
   5.338 +      //Stub: the body is empty, no acquire behavior is implemented yet
   5.339 
   5.340 +
   5.341 
   5.342 + }
   5.343 
   5.344 +
   5.345 
   5.346 +/*The release handler uses HWSim calls to generate a communication in the
   5.347 
   5.348 + * Communicator TimeLine.  When the ConsistentTime reaches the appointed
   5.349 
   5.350 + * simulation time at which that comm arrives, it triggers the release
   5.351 
   5.352 + * span in the Communicator TimeLine.
   5.353 
   5.354 + */
   5.355 
   5.356 +void
   5.357 
   5.358 +handle_Release_HWInstr_request()
   5.359 
   5.360 + {
   5.361 
   5.362 +      //Stub: the body is empty, no release behavior is implemented yet
   5.363 
   5.364 + }
   5.365 
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/src/Application/SimParams.c	Mon Nov 07 16:03:01 2011 -0800
     6.3 @@ -0,0 +1,91 @@
     6.4 +/*
     6.5 
     6.6 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     6.7 
     6.8 + *  Licensed under GNU General Public License version 2
     6.9 
    6.10 + *
    6.11 
    6.12 + * Author: seanhalle@yahoo.com
    6.13 
    6.14 + *
    6.15 
    6.16 + * Created on November 15, 2009, 2:35 AM
    6.17 
    6.18 + */
    6.19 
    6.20 +
    6.21 
    6.22 +#include <malloc.h>
    6.23 
    6.24 +#include <stdlib.h>
    6.25 
    6.26 +
    6.27 
    6.28 +#include "SimParams.h"
    6.29 
    6.30 +#include "ParamHelper/Param.h"
    6.31 
    6.32 +
    6.33 
    6.34 +
    6.35 
    6.36 +uint8 *
    6.37 
    6.38 +read_Machine_Code_From_File( int numBytesInFile, char *machineCodeFileName );
    6.39 
    6.40 +/*Fills simParams from paramBag: loads the guest-application and system-code
    6.41 
    6.42 + * images from the files named in the bag, and copies over the node count.  */
    6.43 
    6.44 +void
    6.45 
    6.46 +fill_sim_params_from_bag( SimulationParams *simParams, ParamBag *paramBag )
    6.47 
    6.48 + { ParamStruc *entry;           //bag entry currently being read
    6.49 
    6.50 +   char       *codeFileName;    //file name of the image being loaded
    6.51 
    6.52 +   int         numBytesOfCode;  //size given in the bag for that image
    6.53 
    6.54 +
    6.55 
    6.56 +      //Guest application image: look up its file name and size, then load it
    6.57 
    6.58 +   entry          = getParamFromBag( "GuestApplicationFileName", paramBag );
    6.59 
    6.60 +   codeFileName   = entry->strValue;
    6.61 
    6.62 +   entry          = getParamFromBag( "numBytesInGuestApp", paramBag );
    6.63 
    6.64 +   numBytesOfCode = entry->intValue;
    6.65 
    6.66 +   simParams->guestApp =
    6.67 
    6.68 +    read_Machine_Code_From_File( numBytesOfCode, codeFileName );
    6.69 
    6.70 +
    6.71 
    6.72 +      //System code image: the same two look-ups, then load it
    6.73 
    6.74 +   entry          = getParamFromBag( "SystemCodeFileName", paramBag );
    6.75 
    6.76 +   codeFileName   = entry->strValue;
    6.77 
    6.78 +   entry          = getParamFromBag( "numBytesInSystemCode", paramBag );
    6.79 
    6.80 +   numBytesOfCode = entry->intValue;
    6.81 
    6.82 +   simParams->systemCode =
    6.83 
    6.84 +    read_Machine_Code_From_File( numBytesOfCode, codeFileName );
    6.85 
    6.86 +
    6.87 
    6.88 +      //Number of nodes is copied straight from the bag
    6.89 
    6.90 +   entry               = getParamFromBag( "numNodes", paramBag );
    6.91 
    6.92 +   simParams->numNodes = entry->intValue;
    6.93 
    6.94 +
    6.95 
    6.96 + }
    6.97 
    6.98 +/*Reads up to numBytesInFile bytes of the named file into a new zero-filled
    6.99 
   6.100 + * buffer, which the caller owns.  Exits if the file can't be opened or the
   6.101 
   6.102 + * buffer can't be allocated; a too-short file only warns, leaving zeros.  */
   6.103 
   6.104 +uint8 *
   6.105 
   6.106 +read_Machine_Code_From_File( int numBytesInFile, char *machineCodeFileName )
   6.107 
   6.108 + { int    byte, ch;   //ch is an int so that EOF can be told from a data byte
   6.109 
   6.110 +   FILE  *file;
   6.111 
   6.112 +   uint8 *machineCode = calloc( numBytesInFile, 1 );
   6.113 
   6.114 +   if( machineCode == NULL  &&  numBytesInFile > 0 )
   6.115 
   6.116 +    { printf( "\nno mem for machine code\n" ); exit(1); }
   6.117 
   6.118 +   file = fopen( machineCodeFileName, "rb" ); //binary: no text translation
   6.119 
   6.120 +   if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);}
   6.121 
   6.122 +
   6.123 
   6.124 +   for( byte = 0; byte < numBytesInFile; byte++ )
   6.125 
   6.126 +    {    //fgetc reads from file -- getchar reads stdin and takes no argument
   6.127 
   6.128 +      ch = fgetc( file );
   6.129 
   6.130 +      if( ch == EOF ) { printf( "file ran out too soon" ); break; }
   6.131 
   6.132 +      machineCode[byte] = (uint8)ch;
   6.133 
   6.134 +    }
   6.135 
   6.136 +   fclose( file );
   6.137 
   6.138 +   return machineCode;
   6.139 
   6.140 + }
   6.141 
   6.142 +
   6.143 
   6.144 +
   6.145 
   6.146 + //==========================================================================
   6.147 
   6.148 +void
   6.149 
   6.150 +printSimResults( SimulationResults simResults )
   6.151 
   6.152 + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr;
   6.153 
   6.154 +   float32 *matrixArray;
   6.155 
   6.156 +
   6.157 
   6.158 +   numRows = rowsToPrint = matrix->numRows;
   6.159 
   6.160 +   numCols = colsToPrint = matrix->numCols;
   6.161 
   6.162 +   matrixArray = matrix->array;
   6.163 
   6.164 +
   6.165 
   6.166 +   rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed
   6.167 
   6.168 +   colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed
   6.169 
   6.170 +   for( r = 0; r < numRows; r += rowIncr )
   6.171 
   6.172 +    { for( c = 0; c < numCols; c += colIncr )
   6.173 
   6.174 +       { printf( "%3.1f | ", matrixArray[ r * numCols + c ] );
   6.175 
   6.176 +       }
   6.177 
   6.178 +      printf("\n");
   6.179 
   6.180 +    }
   6.181 
   6.182 + }
   6.183 
   6.184 +
   6.185 
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/src/Application/SimParams.h	Mon Nov 07 16:03:01 2011 -0800
     7.3 @@ -0,0 +1,48 @@
     7.4 +/*
     7.5 
     7.6 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     7.7 
     7.8 + *  Licensed under GNU General Public License version 2
     7.9 
    7.10 + */
    7.11 
    7.12 +
    7.13 
    7.14 +#ifndef MATRIX_MULT_H_
    7.15 
    7.16 +#define MATRIX_MULT_H_
    7.17 
    7.18 +
    7.19 
    7.20 +#include <stdio.h>
    7.21 
    7.22 +#include <unistd.h>
    7.23 
    7.24 +#include <malloc.h>
    7.25 
    7.26 +
    7.27 
    7.28 +#include "../HWSim_lib/VMS/VMS_primitive_data_types.h"
    7.29 
    7.30 +#include "ParamHelper/Param.h"
    7.31 
    7.32 +
    7.33 
    7.34 +//==============================  Structures  ==============================
    7.35 
    7.36 +
    7.37 
    7.38 +typedef
    7.39 
    7.40 +struct
    7.41 
    7.42 + { uint8 *guestApp;
    7.43 
    7.44 +   uint8 *systemCode;
    7.45 
    7.46 +   int32 numNodes;
    7.47 
    7.48 + }
    7.49 
    7.50 +SimulationResults;
    7.51 
    7.52 +
    7.53 
    7.54 +
    7.55 
    7.56 +typedef
    7.57 
    7.58 +struct
    7.59 
    7.60 + { uint8 *guestApp;
    7.61 
    7.62 +   uint8 *systemCode;
    7.63 
    7.64 +   int32 numNodes;
    7.65 
    7.66 +   SimulationResults *simResults;
    7.67 
    7.68 + }
    7.69 
    7.70 +SimulationParams;
    7.71 
    7.72 +
    7.73 
    7.74 +
    7.75 
    7.76 +
    7.77 
    7.78 +//==============================  Functions  ================================
    7.79 
    7.80 +
    7.81 
    7.82 +void
    7.83 
    7.84 +printSimResults( SimulationResults simResults );
    7.85 
    7.86 +
    7.87 
    7.88 +void
    7.89 
    7.90 +fill_sim_params_from_bag( SimulationParams *simParams, ParamBag *paramBag );
    7.91 
    7.92 +
    7.93 
    7.94 +//===========================================================================
    7.95 
    7.96 +
    7.97 
    7.98 +#endif /*MATRIX_MULT_H_*/
    7.99 
     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/src/Application/main.c	Mon Nov 07 16:03:01 2011 -0800
     8.3 @@ -0,0 +1,48 @@
     8.4 +/*
     8.5 
     8.6 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     8.7 
     8.8 + *  Licensed under GNU General Public License version 2
     8.9 
    8.10 + *
    8.11 
    8.12 + * author seanhalle@yahoo.com
    8.13 
    8.14 + */
    8.15 
    8.16 +
    8.17 
    8.18 +#include <malloc.h>
    8.19 
    8.20 +#include <stdlib.h>
    8.21 
    8.22 +
    8.23 
    8.24 +#include "SimParams.h"
    8.25 
    8.26 +#include "HWSim_TeraFlux/HWSim_TeraFlux.h"
    8.27 
    8.28 +
    8.29 
    8.30 +/**
    8.31 
    8.32 + * 
    8.33 
    8.34 + */
    8.35 
    8.36 +int main( int argc, char **argv )
    8.37 
    8.38 + { SimulationParams  *simParams;
    8.39 
    8.40 +   SimulationResults *simResults;
    8.41 
    8.42 +   ParamBag          *paramBag;
    8.43 
    8.44 +   
    8.45 
    8.46 +   printf( "arguments: %s | %s\n", argv[0], argv[1] );
    8.47 
    8.48 +
    8.49 
    8.50 +   simParams             = malloc( sizeof(SimulationParams) );
    8.51 
    8.52 +
    8.53 
    8.54 +
    8.55 
    8.56 +      //VMS has its own separate internal malloc, so to get results out,
    8.57 
    8.58 +      // have to pass in empty array for it to fill up
    8.59 
    8.60 +      //The alternative is internally telling HWSim make external space to use
    8.61 
    8.62 +   simResults            = malloc( sizeof(SimulationResults) );
    8.63 
    8.64 +   simParams->simResults = simResults;
    8.65 
    8.66 +
    8.67 
    8.68 +   paramBag              = makeParamBag();
    8.69 
    8.70 +
    8.71 
    8.72 +   readParamFileIntoBag(     argv[1],   paramBag );
    8.73 
    8.74 +   fill_sim_params_from_bag( simParams, paramBag );
    8.75 
    8.76 +
    8.77 
    8.78 +
    8.79 
    8.80 +   constructAndSimulateSystem( simParams );
    8.81 
    8.82 +
    8.83 
    8.84 +   printSimResults( simResults );
    8.85 
    8.86 +
    8.87 
    8.88 +   fflush(stdin);
    8.89 
    8.90 +   
    8.91 
    8.92 +   exit(0); //cleans up
    8.93 
    8.94 + }
    8.95 
    8.96 +
    8.97 
    8.98 +
    8.99