Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > HWSim > HWSim__PingPong__HWDef
changeset 0:8ea476474093
Initial add -- gobbeldegook
author | Me@portablequad |
---|---|
date | Mon, 07 Nov 2011 16:03:01 -0800 |
parents | |
children | 7566745e812a |
files | .hgignore src/Application/HWSim__Hello_World_HW/Communicator_TimeLine.c src/Application/HWSim__Hello_World_HW/DESIGN_NOTES.txt src/Application/HWSim__Hello_World_HW/EntryPoint.c src/Application/HWSim__Hello_World_HW/Seed_VP.c src/Application/SimParams.c src/Application/SimParams.h src/Application/main.c |
diffstat | 8 files changed, 1080 insertions(+), 0 deletions(-) [+] |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/.hgignore Mon Nov 07 16:03:01 2011 -0800 1.3 @@ -0,0 +1,6 @@ 1.4 +syntax: glob 1.5 + 1.6 +nbproject 1.7 +build 1.8 +dist 1.9 +*.o 1.10 \ No newline at end of file
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 2.2 +++ b/src/Application/HWSim__Hello_World_HW/Communicator_TimeLine.c Mon Nov 07 16:03:01 2011 -0800 2.3 @@ -0,0 +1,297 @@ 2.4 +/* 2.5 2.6 + * Copyright 2009 OpenSourceStewardshipFoundation.org 2.7 2.8 + * Licensed under GNU General Public License version 2 2.9 2.10 + * 2.11 2.12 + * Author: seanhalle@yahoo.com 2.13 2.14 + * 2.15 2.16 + */ 2.17 2.18 + 2.19 2.20 +#include "HWSim_TeraFlux.h" 2.21 2.22 + 2.23 2.24 +//===================== 2.25 2.26 + 2.27 2.28 +//=========================================================================== 2.29 2.30 +/*This is the collection of spans for the Communication Processor TimeLine 2.31 2.32 + * 2.33 2.34 + *This TimeLine does acquire-start, acquire-send, release, acquire-receive, 2.35 2.36 + * all triggered by communications coming from other TimeLines. 2.37 2.38 + * 2.39 2.40 + * Sometime after Feb 2011 it will also have the control- 2.41 2.42 + * communications that Albert wants for Erbium (the broad-cast of counter 2.43 2.44 + * updates to registered listener TNodes). And, it will probably get some 2.45 2.46 + * form of control communication for implementing a fast chip-wide VMS. 2.47 2.48 + * 2.49 2.50 + * 2.51 2.52 + * 2.53 2.54 + *The kinds of span the TimeLine can have are: 2.55 2.56 +-] acquire-start -- is triggered by the reception of a "start-acquire" 2.57 2.58 + communication from the CPU TimeLine. It generates an "acquire send" 2.59 2.60 + communication to the TNode that currently owns the data. 2.61 2.62 +-] acquire-send -- when triggered, checks if the Frame is free. If yes, 2.63 2.64 + sends an "acquire-receive" communication back to the requester. If not 2.65 2.66 + free, places data representing the acquire-request into a queue of waiting 2.67 2.68 + ones in the Communicator TimeLine. When the Communicator receives a 2.69 2.70 + "release" communication from the CPU TimeLine, it runs the release span. 
2.71 2.72 +-] release -- takes the data of the next request waiting for the released 2.73 2.74 + Frame out of the queue and then sends an "acquire-receive" communication 2.75 2.76 + to the TNode requesting. 2.77 2.78 +-] acquire-receive -- when triggered, runs firmware, which writes memory 2.79 2.80 + shared with the CPU, modifying data-structures. This firmware for 2.81 2.82 + TeraFlux works with the System Code that runs on the CPU TimeLine to 2.83 2.84 + notify it that the acquire is complete. 2.85 2.86 + * 2.87 2.88 + * 2.89 2.90 + *Background on acquire: 2.91 2.92 + *The request handler invokes acquire by placing the calling VP into 2.93 2.94 + * a holding list, then executing the hardware instruction that starts the 2.95 2.96 + * acquire. 2.97 2.98 + *This instruction sends a communication from the CPU TimeLine to the 2.99 2.100 + * Communicator TimeLine. The instruction passes a pointer to the 2.101 2.102 + * list element, and also passes a pointer to the queue of 2.103 2.104 + * ready VPs. When acquire comm is done, the Communicator takes the list 2.105 2.106 + * element out and recycles it, and places the pointer to the VP into the 2.107 2.108 + * queue of ready VPs. 2.109 2.110 + * 2.111 2.112 + *In the actual hardware, each node has an ultra-simple communication 2.113 2.114 + * processor -- like an 8 bit control data-path and a physical-addr-width 2.115 2.116 + * addr-data-path (simple -- only power-of-2 shift, add, and maybe mask). 2.117 2.118 + *The acquire-instruction parameters are placed into a data 2.119 2.120 + * structure and only the pointer to them is in a register. The CPU performs 2.121 2.122 + * a write of the ptr to a particular physical addr, which the comm hardware 2.123 2.124 + * catches and queues. The comm-processor is driven by the queue -- stalls 2.125 2.126 + * when queue is empty -- returns after finishes to get next from queue. 
2.127 2.128 + *Firmware in the comm-processor then fetches the params out of the data 2.129 2.130 + * structure and starts the communication. The comm involves the 2.131 2.132 + * BIU, which is asked to grant acquire on the frame. BIU sends back 2.133 2.134 + * the node that currently has the frame, or else main-mem physical address 2.135 2.136 + * range. 2.137 2.138 + *When comm is complete, the comm processor performs list-element removal, 2.139 2.140 + * recycling, and pointer movement. 2.141 2.142 + *End Background on acquire 2.143 2.144 + * 2.145 2.146 + *So, the HWInstr_acquire() is the hardware instruction called by the 2.147 2.148 + * GuestSystemCode. This instruction's job is to perform the communication 2.149 2.150 + * protocol that gets data from wherever it is and brings it onto the TNode 2.151 2.152 + * executing this instr. 2.153 2.154 + *In the simulator, this happens in a separate time-line than the CPU, which 2.155 2.156 + * is animating another virtual TNode while the communication happens. 2.157 2.158 + *There may be several acquires started while one is in progress. But not 2.159 2.160 + * going to model queueing of them. This time-line treats them as 2.161 2.162 + * zero width. They each start a new span, but the span ends and reports 2.163 2.164 + * its end-time as the same as its start time, then starts the idle-span. 2.165 2.166 + * 2.167 2.168 + *HWInstr_acquire: CPU TimeLine 2.169 2.170 + * when executes on the CPU TimeLine, sends a communication to the 2.171 2.172 + * Communicator TimeLine, which equals inserting an acquire-start span into 2.173 2.174 + * the consistent-time trigger-priority-queue that's driven by the TNode's 2.175 2.176 + * ConsistentTime. It will stop ConsistentTime advancement at its target 2.177 2.178 + * arrival time, ending the Idle-span and starting the acquire-start span. 
2.179 2.180 + * 2.181 2.182 + *Acquire-start: requesting TNode's Communicator TimeLine 2.183 2.184 + * this span looks inside the data-struct to find the Frame start addr. It 2.185 2.186 + * looks this up in a hash table to see which TNode owns the Frame. If none, 2.187 2.188 + * means hasn't been allocated yet, so puts acquire-info into the wait-queue 2.189 2.190 + * that's in the hash entry. 2.191 2.192 + * 2.193 2.194 + *Acquire-send: Dwelling TNode's Communicator TimeLine 2.195 2.196 + * When advancement of ConsistentTime stopped by the acquire-send reception, 2.197 2.198 + * check to see 2.199 2.200 + * if the desired Frame is still owned (hash table keyed by Frame start 2.201 2.202 + * addr). If desired Frame is owned, places the data of the request into 2.203 2.204 + * the queue of waiting acquires that's in the hash-entry. 2.205 2.206 + * If no longer owned, then set new Dwelling TNode to be requester and 2.207 2.208 + * place Acquire-receive into waiting-comm queue of requesting TNode 2.209 2.210 + * TimeDomain. 2.211 2.212 + * 2.213 2.214 + *Release: CPUTimeLine of Owning TNode 2.215 2.216 + * put Release into Communicator TimeLine, with the Frame start addr in it 2.217 2.218 + * 2.219 2.220 + *Release: Communicator TimeLine of Owning TNode 2.221 2.222 + * look up the hash entry for released Frame, set to Not Owned. If waitQ 2.223 2.224 + * in the Frame's hash entry is not empty, take next entry, set as Owner of 2.225 2.226 + * the Frame, and as new Dwelling TNode, then place Acquire-receive into 2.227 2.228 + * waiting-comm queue of requesting TNode-TimeDomain. 2.229 2.230 + * 2.231 2.232 + *Acquire-receive: Communicator TimeLine of requesting TNode 2.233 2.234 + * When advancement of ConsistentTime stopped by the acquire-receive 2.235 2.236 + * reception, run the Firmware that sets the shared state, so that the CPU 2.237 2.238 + * TimeLine will see the acquire is complete -- also do the data-structure 2.239 2.240 + * rearrangement stuff. 
Finally, do the thing of checking if all VTNodes 2.241 2.242 + * are suspended waiting for acquires -- if so, the receive will have to be 2.243 2.244 + * pro-active in restarting the CPU TimeLine, kicking it out of the 2.245 2.246 + * LightSleep span. THIS SPECIAL SPAN IS KEY BENEFIT OF HW SUPPORT FOR VMS -- 2.247 2.248 + * can put hardware into light sleep mode when nothing to do -- power/energy 2.249 2.250 + * saver. 2.251 2.252 + * 2.253 2.254 + * 2.255 2.256 + *So, a time-line is the virtual processor itself. It animates the 2.257 2.258 + * "current span" as the top-level function. When that span executes span-end, 2.259 2.260 + * the next span's function replaces the old one as the top-level function. 2.261 2.262 + * The stack is also reset, and the parameters for the new span are placed 2.263 2.264 + * on the stack, and the pointer to the span's function is set as the resume- 2.265 2.266 + * ptr. When the time-line VP is resumed, it's equivalent to that span's 2.267 2.268 + * function being called. 2.269 2.270 + *The time-line VP has app-specific data that says whether the current-span 2.271 2.272 + * is the Idle span, and the simulated-time of the last span-end. 2.273 2.274 + * 2.275 2.276 + *So, rather than having a single top-level function, a timeLine VP has many, 2.277 2.278 + * a different TLF for each kind of span. 2.279 2.280 + * 2.281 2.282 + *This file holds all the spans == TLFs for the comm-procr TimeLine. 2.283 2.284 + */ 2.285 2.286 + 2.287 2.288 + 2.289 2.290 +//=========================================================================== 2.291 2.292 +/* 2.293 2.294 + *Acquire-start: requesting TNode's Communicator TimeLine 2.295 2.296 + * this span looks inside the data-struct to find the Frame start addr. It 2.297 2.298 + * looks this up in a hash table to see which TNode owns the Frame. If none, 2.299 2.300 + * means hasn't been allocated yet, so puts acquire-info into the wait-queue 2.301 2.302 + * that's in the hash entry.
2.303 2.304 + *Q: want to put global data-structs into HWSim, with some protection 2.305 2.306 + * mechanism (like the transactions have already implemented), or want to 2.307 2.308 + * make general request-handler extension thingie? Request-handler 2.309 2.310 + * extensions would be done by registering a handler function during 2.311 2.312 + * architecture-definition code. 2.313 2.314 + *A: transactions awkward.. make all state be either contained in the 2.315 2.316 + * TimeLine, or else global. If in TimeLine, has to be allocated during 2.317 2.318 + * architecture definition, and if global, has to be declared, and a 2.319 2.320 + * pointer to the functions that access it, along with an ID for that kind 2.321 2.322 + * of access is registered. Then, in the span-code, call 2.323 2.324 + * HWSim__access_global_var( params, ACCESSID ) the ACCESSID determines 2.325 2.326 + * which function-pointer is called, and the function code determines 2.327 2.328 + * which global var is accessed, and the params hold all the data the 2.329 2.330 + * function needs to do whatever is to be done. 
2.331 2.332 + */ 2.333 2.334 +void 2.335 2.336 +commSpan_acquire_start( void *_params, VirtProcr *animTimeLine ) 2.337 2.338 + { 2.339 2.340 + AcquireParams *params; 2.341 2.342 + 2.343 2.344 + params = (AcquireParams *)_params; 2.345 2.346 + 2.347 2.348 + DEBUG( dbgAppFlow, "acquire_start\n", cloneAcquireParams( params )); 2.349 2.350 + 2.351 2.352 + //invoke global-var-accessor to get the TNode owns the Frame 2.353 2.354 + residingTNode = 2.355 2.356 + HWSim__access_global_var( params, GET_OWNING_TNODE, animTimeLine ); 2.357 2.358 + 2.359 2.360 + //send a communication to that TNode's Communicator 2.361 2.362 + targetTL = residingTNode->communicatorTL; 2.363 2.364 + 2.365 2.366 + //params are: amount of simulated time the communication takes, 2.367 2.368 + // the TimeLine receiving, the span-function to run when consistent- 2.369 2.370 + // time reaches the reception time, params for that span, and animTL 2.371 2.372 + HWSim__send_comm( calcNetworkTime(), targetTL, targetTL->sendAcquireSpan, 2.373 2.374 + params, animTimeLine ); 2.375 2.376 + 2.377 2.378 + duration = 0; //starting an acquire modeled as taking zero time 2.379 2.380 + 2.381 2.382 + //every span function ends with this call -- duration of this span, 2.383 2.384 + // pointer to next span-function to run, params for it, and animTL 2.385 2.386 + HWSim__transition_to_new_span( duration, IDLE_SPAN, NULL, animTimeLine ); 2.387 2.388 + } 2.389 2.390 + 2.391 2.392 + 2.393 2.394 +/*Acquire-send: Dwelling TNode's Communicator TimeLine 2.395 2.396 + * Runs when advancement of ConsistentTime stopped by the acquire-send 2.397 2.398 + * reception. Check to see 2.399 2.400 + * if the desired Frame is still owned (hash table keyed by Frame start 2.401 2.402 + * addr). If desired Frame is owned, place the data of the request into 2.403 2.404 + * the queue of waiting acquires that's in the hash-entry. 
2.405 2.406 + * If already released, then set new Dwelling TNode to be requester and 2.407 2.408 + * do an Acquire-receive back to the requesting TNode, which has the effect 2.409 2.410 + * of placing an acquire-receive span to wait in its consistent-time-arrest 2.411 2.412 + * queue. 2.413 2.414 + *The release span will take waiting requests out of the waiting-acquires Q 2.415 2.416 + */ 2.417 2.418 +void 2.419 2.420 +commSpan_acquire_send( void *_params, VirtProcr *animTimeLine ) 2.421 2.422 + { 2.423 2.424 + AcquireParams *params; 2.425 2.426 + 2.427 2.428 + params = (AcquireParams *)_params; 2.429 2.430 + 2.431 2.432 + DEBUG( dbgAppFlow, "acquire_send\n", cloneAcquireParams( params )); 2.433 2.434 + 2.435 2.436 + 2.437 2.438 + //invoke global-var-accessor to lookup hash entry and see if Frame is 2.439 2.440 + // still owned, and if so, add this acquire to queue of waiting ones. 2.441 2.442 + // note, this is non-physical behavior -- any use of global vars is 2.443 2.444 + // non-physical. To make this function physical, implement a TimeLine 2.445 2.446 + // that holds the hash table and all other TimeLines communicate to. 2.447 2.448 + //Caveat there is that collisions can happen unless also impl protocol. 2.449 2.450 + // So leave that for later improvement. 
2.451 2.452 + notCurrentlyOwned = 2.453 2.454 + HWSim__access_global_var( params, DO_ACQUIRE_SEND, animTimeLine ); 2.455 2.456 + 2.457 2.458 + if( notCurrentlyOwned ) 2.459 2.460 + { 2.461 2.462 + //send a communication to requesting TNode's Communicator 2.463 2.464 + targetTL = params->requestingTNode->communicatorTL; 2.465 2.466 + 2.467 2.468 + //params are: amount of simulated time the communication takes, 2.469 2.470 + // the TimeLine receiving, the span-function to run when consistent- 2.471 2.472 + // time reaches the reception time, params for that span, and animTL 2.473 2.474 + HWSim__send_comm( calcNetworkTime(), targetTL, COMM_INPORT, 2.475 2.476 + &commSpan_acquire_receive, params, animTimeLine ); 2.477 2.478 + } 2.479 2.480 + 2.481 2.482 + HWSim__transition_to_new_span( IDLE_SPAN, NULL, animTimeLine ); 2.483 2.484 + } 2.485 2.486 + 2.487 2.488 + 2.489 2.490 +/*Acquire-receive: Communicator TimeLine of requesting TNode 2.491 2.492 + * When advancement of ConsistentTime stopped by the acquire-receive 2.493 2.494 + * reception, run the "Firmware" that sets the shared state, so that the CPU 2.495 2.496 + * TimeLine will see the acquire is complete -- also do the data-structure 2.497 2.498 + * rearrangement stuff. Finally, do the thing of checking if all VTNodes 2.499 2.500 + * are suspended waiting for acquires -- if so, the receive will have to be 2.501 2.502 + * pro-active in restarting the CPU TimeLine, kicking it out of the 2.503 2.504 + * LightSleep span. THIS SPECIAL SPAN IS KEY BENEFIT OF HW SUPPORT FOR VMS -- 2.505 2.506 + * can put hardware into light sleep mode when nothing to do -- power/energy 2.507 2.508 + * saver. 
2.509 2.510 + */ 2.511 2.512 +void 2.513 2.514 +commSpan_acquire_receive( void *_params, VirtProcr *animTimeLine ) 2.515 2.516 + { 2.517 2.518 + AcquireParams *params; 2.519 2.520 + 2.521 2.522 + params = (AcquireParams *)_params; 2.523 2.524 + 2.525 2.526 + DEBUG( dbgAppFlow, "acquire_receive\n", cloneAcquireParams(params)); 2.527 2.528 + 2.529 2.530 + 2.531 2.532 + HWSim__transition_to_new_span( IDLE_SPAN, NULL, animTimeLine ); 2.533 2.534 + } 2.535 2.536 + 2.537 2.538 + 2.539 2.540 +/*Release: Communicator TimeLine of Owning TNode 2.541 2.542 + * look up the hash entry for released Frame, set to Not Owned. If waitQ 2.543 2.544 + * in the Frame's hash entry is not empty, take next entry, set as Owner of 2.545 2.546 + * the Frame, and as new Dwelling TNode, then place Acquire-receive into 2.547 2.548 + * waiting-comm queue of requesting TNode-TimeDomain. 2.549 2.550 + */ 2.551 2.552 +void 2.553 2.554 +commSpan_release( void *_params, VirtProcr *animTimeLine ) 2.555 2.556 + { 2.557 2.558 + AcquireParams *params; 2.559 2.560 + 2.561 2.562 + params = (AcquireParams *)_params; 2.563 2.564 + 2.565 2.566 + DEBUG( dbgAppFlow, "acquire_start\n", cloneAcquireParams( params )); 2.567 2.568 + 2.569 2.570 + 2.571 2.572 + HWSim__transition_to_new_span( IDLE_SPAN, NULL, animTimeLine ); 2.573 2.574 + } 2.575 2.576 + 2.577 2.578 +/*At reset only starts the Idle span in the communicator. 2.579 2.580 + */ 2.581 2.582 +void 2.583 2.584 +commSpan_at_reset( void *_params, VirtProcr *animTimeLine ) 2.585 2.586 + { 2.587 2.588 + 2.589 2.590 + DEBUG( dbgAppFlow, "commSpan at_reset\n", NULL ); 2.591 2.592 + 2.593 2.594 + HWSim__transition_to_new_span( IDLE_SPAN, NULL, animTimeLine ); 2.595 2.596 + } 2.597
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 3.2 +++ b/src/Application/HWSim__Hello_World_HW/DESIGN_NOTES.txt Mon Nov 07 16:03:01 2011 -0800 3.3 @@ -0,0 +1,369 @@ 3.4 + 3.5 3.6 + 3.7 3.8 +TeraFlux Hardware Model (impl of this model in terms of HWSim is below) 3.9 3.10 + 3.11 3.12 +As of Feb 2011, the hardware being simulated is: 3.13 3.14 + 3.15 3.16 +A number of TNodes on a chip, which are connected by a network, with a 3.17 3.18 + main-memory. 3.19 3.20 + 3.21 3.22 +The address space is divided into a local address-space and a shared 3.23 3.24 + address-space. Both address spaces are divided among the TNodes -- each 3.25 3.26 + has its own unique range of local virtual addresses and own range of 3.27 3.28 + global virtual addresses. A given TNode may only allocate virtual 3.29 3.30 + addresses within its own two ranges. The contents of *local* addresses it 3.31 3.32 + allocates may only ever be seen by the allocating TNode. The contents of 3.33 3.34 + shared addresses may be seen by any TNode after executing an acquire 3.35 3.36 + hardware instruction and being granted the contents. 3.37 3.38 + 3.39 3.40 +Each TNode has a single CPU core, a local memory, and a 3.41 3.42 + communication processor with its own network hardware. 3.43 3.44 + 3.45 3.46 +The local memory size is not modelled, so is considered unlimited. 3.47 3.48 +The network has undefined topology and is modelled as having constant latency 3.49 3.50 + from any TNode to any other TNode, with a fixed BW between any two TNodes. 3.51 3.52 + 3.53 3.54 +The only modelled communication is movement of data, which can only be 3.55 3.56 + triggered by the "acquire" hardware instruction. 3.57 3.58 + 3.59 3.60 +The behavior of acquire is defined as: 3.61 3.62 +-] Only one TNode at a time owns a given shared-memory "Frame", which has a 3.63 3.64 + start address and a size. 
3.65 3.66 +-] A given shared virtual address is in at most one Frame for the duration 3.67 3.68 + of a program run (IE, no overlap of Frames, implying no change of size). 3.69 3.70 +-] The hardware mechanism by which single-ownership is enforced is not 3.71 3.72 + modelled. Control communication is considered infinitely fast. 3.73 3.74 +-] The simulated time between a given TNode's CPU executing the acquire 3.75 3.76 + instruction and the simulated time the data of the Frame appears in the 3.77 3.78 + local memory of that TNode is determined by both a queue of waiting 3.79 3.80 + acquire requests and the network time required to move the data. 3.81 3.82 + 3.83 3.84 +The CPU in a TNode executes the standard x86 instruction set. 3.85 3.86 + 3.87 3.88 +The communication processor performs the acquire instruction and the 3.89 3.90 + release instruction. The exact hardware mechanism by which these 3.91 3.92 + instructions get from the instruction stream fetched by the CPU to the 3.93 3.94 + communication processor is not defined. 3.95 3.96 + 3.97 3.98 +As an aside. For simulation, the acquire and release instruction are 3.99 3.100 + implemented in the lightweight simulator as library-calls that trigger 3.101 3.102 + the simulation infrastructure. In the application they are stated as 3.103 3.104 + macro-calls. These macros can then be defined in the tool-chain. 3.105 3.106 +When the COTSon simulator is targeted, the 3.107 3.108 + macro is implemented as an in-line assembly custom op-code. When the 3.109 3.110 + lightweight simulator is targeted, the macro is implemented as a call to 3.111 3.112 + a library, which invokes the acquire or release functionality in the 3.113 3.114 + lightweight simulator. 3.115 3.116 + 3.117 3.118 +The effect of transferring data between TNode local memory and the chip's 3.119 3.120 + main memory is not modelled. 
3.121 3.122 + 3.123 3.124 +The portions of the hardware left undefined or not modelled may all be 3.125 3.126 + filled-in in future versions, according to research needs. 3.127 3.128 + 3.129 3.130 + 3.131 3.132 +============================================================================= 3.133 3.134 +Implementing the hardware model in terms of HWSim 3.135 3.136 + 3.137 3.138 +HWSim is used to implement the TeraFlux hardware model for simulation. 3.139 3.140 + 3.141 3.142 +The implementation consists of four elements: 3.143 3.144 +1) A CPU TimeLine, which executes the Guest code 3.145 3.146 +2) A Communicator TimeLine, which performs all inter-node communication 3.147 3.148 +3) A TNode TimeDomain, which has one CPU TimeLine and one Communicator 3.149 3.150 + TimeLine in it 3.151 3.152 +4) A Chip TimeDomain, which has a number of TNode TimeDomains in it. 3.153 3.154 + 3.155 3.156 +For the first version, there is no local memory element in a TNode, there is 3.157 3.158 + no Main Memory node, and there is no address translation mechanism modelled. 3.159 3.160 + These may be added later, and so might a TSU mechanism, and experimental 3.161 3.162 + alternative memory models. 3.163 3.164 + 3.165 3.166 +The CPU TimeLine has as spans: 3.167 3.168 +-] at_reset -- standard span that runs when the hardware is reset. Runs the 3.169 3.170 + TeraFlux System Code bootstrap function. 3.171 3.172 +-] guestCode span -- runs whatever Guest code is pointed to.. each span of 3.173 3.174 + this type is created with a pointer to code to run 3.175 3.176 +-] lightSleep -- implemented as the built-in IDLE span. 
When no virtual 3.177 3.178 + nodes are ready to animate, the CPU enters light sleep until the 3.179 3.180 + Communicator wakes it up 3.181 3.182 + 3.183 3.184 +The CPU TimeLine generates two kinds of communication-spans: 3.185 3.186 + acquire-start in Communicator 3.187 3.188 + release in Communicator 3.189 3.190 + 3.191 3.192 +For now, these communications are considered to be performed by dedicated 3.193 3.194 + hardware in the CPU, so they take exactly one simulated instruction, and 3.195 3.196 + their simulated time is thus included in the measured time of the span. The 3.197 3.198 + HWSim__send_comm takes a time-stamp before suspending the TimeLine, then 3.199 3.200 + another just after resume, and accumulates -- the end-span adds this in. 3.201 3.202 + 3.203 3.204 + All TimeLines begin by running the at_reset span defined for that TimeLine. 3.205 3.206 + The CPU's at_reset is hard-coded to start a GuestCode span that runs the 3.207 3.208 + TeraFlux System Code's boot sequence. 3.209 3.210 + 3.211 3.212 +The Communicator TimeLine has a number of spans, all related to acquire and 3.213 3.214 + release: 3.215 3.216 +-] acquire-start -- is triggered by the reception of a "start-acquire" 3.217 3.218 + communication from the CPU TimeLine. It generates an "acquire send" 3.219 3.220 + communication to the TNode that currently owns the data. 3.221 3.222 +-] acquire-send -- when triggered, checks if the Frame is free. If yes, 3.223 3.224 + sends an "acquire-receive" communication back to the requester. If not 3.225 3.226 + free, places data representing the acquire-request into a queue of waiting 3.227 3.228 + ones in the Communicator TimeLine. When the Communicator receives a 3.229 3.230 + "release" communication from the CPU TimeLine, it runs the release span. 
3.231 3.232 +-] release -- takes the data of the next request waiting for the released 3.233 3.234 + Frame out of the queue and then sends an "acquire-receive" communication 3.235 3.236 + to the TNode requesting. 3.237 3.238 +-] acquire-receive -- when triggered, runs firmware, which writes memory 3.239 3.240 + shared with the CPU, modifying data-structures. This firmware for 3.241 3.242 + TeraFlux works with the System Code that runs on the CPU TimeLine to 3.243 3.244 + notify it that the acquire is complete. 3.245 3.246 + 3.247 3.248 +Note, nothing checks whether a Frame's addresses are accessed from outside an 3.249 3.250 + Acquire-Release block, which could be a source of difficult-to-find bugs in 3.251 3.252 + the application. 3.253 3.254 + 3.255 3.256 + 3.257 3.258 +============================================================================= 3.259 3.260 +Earlier versions of notes: 3.261 3.262 + 3.263 3.264 +A time-line is a virtual-processor, and has a sequence of spans -- each 3.265 3.266 + span performs one hardware-function, and has a start-time and an end-time 3.267 3.268 + -- those are simulated-time, not physical time. 3.269 3.270 + 3.271 3.272 +Each time-line is created with a start-span that initializes it, then every 3.273 3.274 + span ends with an "end span" sem-lib call. 3.275 3.276 + 3.277 3.278 +There are three kinds of span -- fixed-function spans, which represent 3.279 3.280 + hard-wired hardware behavior, processing-core spans, which represent 3.281 3.282 + processing elements that execute code, and communication-spans, which 3.283 3.284 + cross time-lines. 3.285 3.286 + 3.287 3.288 +a fixed-function span has a fixed function-pointer that it is created with 3.289 3.290 + and jumps to when the time-line is resumed. Fixed-function spans also have 3.291 3.292 + a pointer to a function that calculates the width of the span. The 3.293 3.294 + width-calculating span is defined in the application directory.
3.295 3.296 + 3.297 3.298 +a processing-core span has a function-pointer that is assigned to it by the 3.299 3.300 + end-span call of the preceding span. The width is also determined by a 3.301 3.302 + pointer to a width-calculating function. The width-calculating function 3.303 3.304 + for these spans is also defined in the application directory (In first 3.305 3.306 + teraflux impl, this function uses RDTSC to measure physical execution time, 3.307 3.308 + and makes that the simulated execution time too -- but with a "BS" detector 3.309 3.310 + that sees when the time is significantly larger than the previous 3.311 3.312 + invocation of the same function-pointer). 3.313 3.314 + 3.315 3.316 +Communication spans are special because they cross time-lines. So, a 3.317 3.318 + communication span has zero width in the time-line it's created in, and 3.319 3.320 + goes onto the queue as a new span in the target time-line (which also has 3.321 3.322 + zero-width). When the target span runs, it changes the state available to 3.323 3.324 + the target time-line, to represent the reception of the communication. 3.325 3.326 + 3.327 3.328 +============================= 3.329 3.330 +Span-end is the only semantic-library call implemented. Inside the 3.331 3.332 + request-handler, it causes new spans to be created. 3.333 3.334 + 3.335 3.336 +So, have to have a separate receive time-line, that modifies hardware shared 3.337 3.338 + with other time-lines. The send span causes a receive-span to be inserted 3.339 3.340 + into the target receive time-line. 3.341 3.342 + 3.343 3.344 +Receive-spans are zero-width -- they update the hardware-state atomically, 3.345 3.346 + so don't have to worry about conflicts between different receive spans in 3.347 3.348 + the simulator.
The hardware-application that uses the simulator-library 3.349 3.350 + must model the receive hardware and implement the send-hardware function 3.351 3.352 + to work out any physical conflicts among receives targeted to the same 3.353 3.354 + receive time-line. 3.355 3.356 + 3.357 3.358 + 3.359 3.360 +============================= 3.361 3.362 + 3.363 3.364 +Time-lines are specialized to specific hardware functions inside the 3.365 3.366 + Application directory -- that's where the main creates all the time-lines, 3.367 3.368 + and where the spans are implemented that have the behavior of a given type 3.369 3.370 + of time-line. 3.371 3.372 + 3.373 3.374 +For example, if the hardware is a communication-unit, then span-types are 3.375 3.376 + created that have the behavior that does all the setup of a communication 3.377 3.378 + span and then does an end-span that creates as its follow-on the 3.379 3.380 + communication-span. 3.381 3.382 + 3.383 3.384 +Communication spans are special because they cross time-lines. So, a 3.385 3.386 + communication span has zero width in the time-line it's created in, and 3.387 3.388 + goes onto the queue in the target time-line, where it creates a new span 3.389 3.390 + that also has zero-width. The target span's function updates the hardware 3.391 3.392 + state available to the target time-line, which may be shared with other 3.393 3.394 + time-lines and that update may cause new spans to be spawned in those. 3.395 3.396 + 3.397 3.398 +if the hardware is a processing-core, then the function points to 3.399 3.400 + Guest-application-code. This function-pointer is what core_loop jumps to 3.401 3.402 + when it reanimates the time-line virtual processor. 3.403 3.404 + 3.405 3.406 +================ Albert e-mail ================= 3.407 3.408 + 3.409 3.410 + 3.411 3.412 +Hi Albert, 3.413 3.414 + 3.415 3.416 + the simulator is a thing of beauty. I'm getting goose bumps as I put the 3.417 3.418 + last pieces of the design into place. 
It's a sweet thing. 3.419 3.420 + 3.421 3.422 +Just in case you're curious, attached are my design notes <this file>. The 3.423 3.424 + thing that makes it nice is the clean decomposition and reusability -- the 3.425 3.426 + core simulator only has three things: time-lines, spans, and a 3.427 3.428 + priority-queue with an associated "certain-time" or "consistent-time". 3.429 3.430 + 3.431 3.432 + A time-line is a VMS virtual-processor, which executes a sequence of 3.433 3.434 + spans. Each span has a function that represents the behavior performed 3.435 3.436 + during that span, plus a function that calculates the simulated-time width 3.437 3.438 + of that span. The certain-time represents the advancement of global 3.439 3.440 + simulated-time. At all points in physical time during the simulation, it 3.441 3.442 + is guaranteed that no spans are waiting to execute that have a 3.443 3.444 + simulated-time older than the certain-time. 3.445 3.446 + In other words, at any physical moment, there are lots of spans sitting 3.447 3.448 + in queues waiting to run. So, a given time-line finishes a span at a given 3.449 3.450 + simulated-time point in global simulated-time. But there may be spans from 3.451 3.452 + other time-lines in the queue that finish at a preceding simulated-time. 3.453 3.454 + But there can never be any waiting spans that finish before the 3.455 3.456 + certain-time. This is important for communications, which cross 3.457 3.458 + time-lines. 3.459 3.460 + 3.461 3.462 + 3.463 3.464 + Time-lines, spans, and certain-time are implemented in a semantic-library. 3.465 3.466 + 3.467 3.468 +This is, in essence, a new parallel language for writing hardware-simulators 3.469 3.470 + with. (this just turns out to be the most natural and most simple way to 3.471 3.472 + write the TeraFlux simulator) 3.473 3.474 + 3.475 3.476 +The behavior of particular hardware is defined as a simulator-application 3.477 3.478 + that makes calls to that semantic-library.
3.479 3.480 + 3.481 3.482 +The main of that simulator-application creates the pieces of the hardware 3.483 3.484 + -- for Teraflux, that means it creates the nodes, and the pieces inside 3.485 3.486 + each node. 3.487 3.488 + 3.489 3.490 +Running this simulator-application equals turning on the power-switch of 3.491 3.492 + the Guest hardware. 3.493 3.494 + 3.495 3.496 +The Guest application code is retrieved by the main of the 3.497 3.498 + simulator-application, and starts running when the "power-switch" is 3.499 3.500 + turned on (running the simulator-application causes the Guest-application 3.501 3.502 + to be retrieved and start running on the hardware created by main). This 3.503 3.504 + is equivalent to the boot sequence of the BIOS, which happens at power-on. 3.505 3.506 + 3.507 3.508 +So, the end-effect is that full Linux is available to the Guest hardware as 3.509 3.510 + a sort of "escape". The Guest hardware can use the Host's disk access, 3.511 3.512 + debugging, and everything else, but these usages are "outside" the 3.513 3.514 + simulated time -- they are essentially magic-spells that the Guest 3.515 3.516 + hardware can perform that take place outside of time as far as the Guest 3.517 3.518 + application-code is concerned. Hence, the main is able to use the Host 3.519 3.520 + Linux to retrieve-from-disk the Guest application (but the disk-access 3.521 3.522 + takes place outside the measurements reported by the simulator). 3.523 3.524 + 3.525 3.526 +Which maybe seems trivial, but I consider it a very cool trick : ) 3.527 3.528 + 3.529 3.530 +The part I like best is the fact that the simulator itself is such a simple 3.531 3.532 + semantic-library, and the behavior of the hardware is written all as 3.533 3.534 + sequential code. 
This makes it easily customizable to any architecture 3.535 3.536 + someone might want to investigate, and still run on parallel hardware : D 3.537 3.538 + 3.539 3.540 +The only caveat is the thing I noted in earlier e-mails about communication 3.541 3.542 + updates -- memory images and other hardware state atomically update at 3.543 3.544 + the ends of spans. So, Guest-code-execution that overlaps the 3.545 3.546 + simulated-time at which reception happens, on a node, will not see that 3.547 3.548 + reception until the end of the span. 3.549 3.550 + For TeraFlux hardware, the only natural span-endings are acquire 3.551 3.552 + instructions and release instructions. 3.553 3.554 + What this means for you is that you may want to insert artificial 3.555 3.556 + end-span calls into the Guest application code you generate (I'll provide 3.557 3.558 + a "dummy" call). Communication-receptions on a node will only become 3.559 3.560 + visible to Guest application code after one of these dummy end-span calls. 3.561 3.562 + So, the granularity of time in the simulation is related to the frequency 3.563 3.564 + of end-span calls in the application code. If you don't insert any 3.565 3.566 + artificial ones, then acquire and release instructions will be the only 3.567 3.568 + span-ending events, and will define the granularity of 3.569 3.570 + communication-receptions being seen by Guest application code. 3.571 3.572 + In particular, this means that no Guest application code is allowed to 3.573 3.574 + monitor memory to see when it changes -- so, programming techniques that 3.575 3.576 + would work on real hardware, to detect acquire-updates early won't work 3.577 3.578 + in the simulator -- all Guest code to run on the simulator must use the 3.579 3.580 + acquire-library call, which will suspend the virtual-node the code is 3.581 3.582 + running on until the acquire is complete, then re-start that node at some 3.583 3.584 + simulated-time after the acquire completes. 
3.585 3.586 + 3.587 3.588 + 3.589 3.590 +No idea when something will start running, but the design is getting quite 3.591 3.592 + detailed, so progress is being made, 3.593 3.594 + 3.595 3.596 +Sean 3.597 3.598 + 3.599 3.600 + 3.601 3.602 +============================================================================= 3.603 3.604 + Notes from before I figured out how to do ConsistentTime for each TimeDomain 3.605 3.606 + and have a priority-queue of waiting communication receptions for the 3.607 3.608 + TimeDomain. 3.609 3.610 + 3.611 3.612 +This illustrates the tortured logic one would have to go through otherwise. 3.613 3.614 + * 3.615 3.616 + * that acquire's release point in 3.617 3.618 + * simulated time is either already known, or not yet known. If already known, 3.619 3.620 + * then recursively check if another TNode has already been granted acquire, 3.621 3.622 + * until reach the end of the chain. If the end is already known without any 3.623 3.624 + * other acquires, then set state that this TNode gets the acquire at the 3.625 3.626 + * release-time of the last in the chain. This acquire will have an unknown 3.627 3.628 + * release time. 3.629 3.630 + *If the release time of the end of the chain is unknown, then put the 3.631 3.632 + * requesting acquire into a queue of acquires waiting for that Frame. When 3.633 3.634 + * the release happens, it will check if any acquires are in the queue for 3.635 3.636 + * the released Frame. If so, it will do the same as is done when the end 3.637 3.638 + * of a chain is known -- set the Frame's state to acquired, with release- 3.639 3.640 + * time as the start-time of the new acquire-grant, and unknown end-time. 3.641 3.642 + * 3.643 3.644 + *When a HWInstr_release() is executed, it has a simulated-time at which the 3.645 3.646 + * span starts -- the span will also have zero width and start the idle span 3.647 3.648 + * when it ends, just like the acquire-start span, and the triggered 3.649 3.650 + * spans. 
3.651 3.652 + * 3.653 3.654 + *The release span will check the wait queue for the Frame it is releasing, 3.655 3.656 + * and either mark the Frame as free, or else fire off the grant-acquire 3.657 3.658 + * function for the first waiting acquire. 3.659 3.660 + * 3.661 3.662 + *The grant-acquire function checks the ConsistentTime, and if the 3.663 3.664 + * simulated time of the grant precedes the ConsistentTime, then the 3.665 3.666 + * acquire is put into the readyQ as an acquire-send span in the comm 3.667 3.668 + * TimeLine of the TNode that owns the acquired data, or else the MainMem 3.669 3.670 + * Node. 3.671 3.672 + * 3.673 3.674 + *The comm TimeLine that executes an acquire-send creates a comm span between 3.675 3.676 + * the sending TNode or MMNode and the receiving TNode. The width of the 3.677 3.678 + * span is set by hardware model. This will be a parameter for experiments. 3.679 3.680 + * It's the main phenomenon affecting performance and scalability. A fixed 3.681 3.682 + * latency plus Frame-size / fixed-BW to start. 3.683 3.684 + * 3.685 3.686 + *An acquire can only be granted when the Consistent time reaches the 3.687 3.688 + * Release-time of the previous acquire. That's when know for certain that 3.689 3.690 + * the memory image being acquired is correct, and the acquire order is 3.691 3.692 + * correct (all acquires that want a given Frame will have been queued up 3.693 3.694 + * for that Frame, so can't grant to one, in Host time, then a different 3.695 3.696 + * acquire arrives that SHOULD have been the one given the grant). 3.697 3.698 + *Each Frame has a priority-queue of acquires waiting for it, ordered by the 3.699 3.700 + * simulated time the acquire-request was made. 
3.701 3.702 + *When ConsistentTime advances past the last Release of a Frame, then check 3.703 3.704 + * the priority queue of waiting acquires -- if the top is older than the 3.705 3.706 + * ConsistentTime, then grant to that one -- otherwise, move the acquire to 3.707 3.708 + * the TriggerByConsistentTime queue. 3.709 3.710 + * 3.711 3.712 + *Each time ConsistentTime *wants* to advance, check the Trigger priority- 3.713 3.714 + * queue to see if any triggers are older than the proposed new Consistent- 3.715 3.716 + * Time. If yes, then ConsistentTime is only advanced to that trigger's 3.717 3.718 + * time, and the trigger is performed. 3.719 3.720 + * 3.721 3.722 + *Triggers are spans that have a start-time that depends on actions in other 3.723 3.724 + * time-lines. So, the span is created in one TimeLine, either ending an 3.725 3.726 + * idle-span, or being queued up to run in that TimeLine's sequence -- all 3.727 3.728 + * dependencies have been satisfied except access to the TimeLine resource. 3.729 3.730 + * 3.731 3.732 + *Each Frame has a hash-entry key'd by the Frame's start-addr. This entry 3.733 3.734 + * has a priority queue holding acquires waiting for the Frame, sorted 3.735 3.736 + * by sim-time the acquire was executed. 3.737 3.738 + 3.739 3.740 + 3.741
/*
 * File: src/Application/HWSim__Hello_World_HW/EntryPoint.c
 *
 * Copyright 2009 OpenSourceStewardshipFoundation.org
 * Licensed under GNU General Public License version 2
 *
 * Author: seanhalle@yahoo.com
 *
 */

#include <math.h>

#include "HWSim_TeraFlux.h"


/*Every HWSim system has an "entry point" function that creates the first
 * virtual processor, which is the seed processor.
 *
 *The seed processor will construct the system to be simulated.
 *The other files in this directory define the components the system is
 * constructed from.
 *
 *
 *The general pattern for entry-point functions is:
 *1) create the params for the seed processor, from the
 *    parameters passed into the entry-point function
 *2) call HWSim__create_seed_procr_and_do_work
 *3) get the return value from the params struc, free the params struc,
 *    and return the value from the function
 *
 *As written, this function only performs step 2: simParams is handed to the
 * seed processor unchanged, nothing is freed here, and nothing is returned.
 *
 *simParams -- the simulation parameters, passed through as the seed
 *             processor's parameter pointer.
 *
 *NOTE(review): constructAndSimulateSystem is not defined in any file of this
 * changeset shown here; Seed_VP.c defines TFSeedVP_TLF and describes it as
 * the seed processor -- confirm which function is meant to be the seed.
 */
void
runTheSimulation( SimulationParams *simParams )
 {
      //create the seed processor and start it on the work
      //(presumably blocks until the simulation is done -- confirm in HWSim)
      //This function is the "border crossing" between normal code and HWSim
   HWSim__create_seed_procr_and_do_work( &constructAndSimulateSystem,
                                         simParams );

 }
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 5.2 +++ b/src/Application/HWSim__Hello_World_HW/Seed_VP.c Mon Nov 07 16:03:01 2011 -0800 5.3 @@ -0,0 +1,181 @@ 5.4 +/* 5.5 5.6 + * Copyright 2009 OpenSourceStewardshipFoundation.org 5.7 5.8 + * Licensed under GNU General Public License version 2 5.9 5.10 + * 5.11 5.12 + * Author: seanhalle@yahoo.com 5.13 5.14 + * 5.15 5.16 + */ 5.17 5.18 + 5.19 5.20 + 5.21 5.22 +#include "HWSim_TeraFlux.h" 5.23 5.24 +#include <math.h> 5.25 5.26 +#include <string.h> 5.27 5.28 + 5.29 5.30 + 5.31 5.32 + 5.33 5.34 +//=========================================================================== 5.35 5.36 +/*This is the seed processor. 5.37 5.38 + * 5.39 5.40 + *It takes the simulation parameters that were passed in to the entry point 5.41 5.42 + * and uses them to construct the system and start it running. 5.43 5.44 + * 5.45 5.46 + *The way this VP Top-level-function is written is specific to TeraFlux, so 5.47 5.48 + * the system it constructs is specifically a TeraFlux chip. 5.49 5.50 + * 5.51 5.52 + *HWSim, on the other hand, expects to be handed functions that it can call 5.53 5.54 + * itself. The first function should perform a bunch of 5.55 5.56 + * HWSim__create_TimeLine() and HWSim__create_TimeDomain() calls. The 5.57 5.58 + * second function should connect together the entities created in the first 5.59 5.60 + * function. 
5.61 5.62 + * 5.63 5.64 + *So, the job of this seed processor is to construct the parameters those 5.65 5.66 + * two functions will take, and hand them to HWSim with the 5.67 5.68 + * HWSim__run_creation_fn( createFnPtr, createFnParams, animVP) 5.69 5.70 + */ 5.71 5.72 +void 5.73 5.74 +TFSeedVP_TLF( void *_params, VirtProcr *animPr ) 5.75 5.76 + { 5.77 5.78 + TFSimulatorParams *params; 5.79 5.80 + 5.81 5.82 + params = (TFSimulatorParams *)_params; 5.83 5.84 + 5.85 5.86 + DEBUG( dbgTFHW, "CPU Span at_reset\n", _params ); 5.87 5.88 + 5.89 5.90 + int32 5.91 5.92 + constructProbe = VMS__create_single_interval_probe("constructProbe", 5.93 5.94 + animPr ); 5.95 5.96 + VMS__record_sched_choice_into_probe( constructProbe, animPr ); 5.97 5.98 + VMS__record_interval_start_in_probe( constructProbe ); 5.99 5.100 + 5.101 5.102 + HWSim__register_constructor( &constructTeraFluxArch, _params, animPr); 5.103 5.104 + HWSim__reset_and_sim( params->results, animPr );//animPr suspends til done 5.105 5.106 + 5.107 5.108 + //=========== Setup 5.109 5.110 + /* for performance, want each phys core's master to have own acquire state 5.111 5.112 + * locally, and only read some config info that tells it whether needs 5.113 5.114 + * to read other data to update itself, or something.. 5.115 5.116 + * But, for now, just doing simplest thing.. can add a "comm plugin" to 5.117 5.118 + * HWSim, so the the handler for communication-calls takes a plugin that 5.119 5.120 + * it calls.. that lets HWSim be modified, so that acquire is done in 5.121 5.122 + * the request handler in the master. 
5.123 5.124 + *Other alternative is making a communication-controller element, and send 5.125 5.126 + * messages to it to do the acquires and releases -- will make that 5.127 5.128 + * time-line be animated a lot -- will need it to be able to jump around 5.129 5.130 + * among the physical cores -- so, something about letting a given time- 5.131 5.132 + * line be able to be animated on whichever core needs it at the moment. 5.133 5.134 + *Let's see.. the cores will be busy, than one will do an acquire, which 5.135 5.136 + * will need the acquire-controller time-line -- but don't want that core 5.137 5.138 + * to run out of work waiting for the controller -- hmmm, how about, use 5.139 5.140 + * the affinity feature to keep each of the nodes to a particular core, 5.141 5.142 + * but don't use it on the controller, which will let it move around.. 5.143 5.144 + *So, have separate readyQs -- one for each core, and another for free- 5.145 5.146 + * floating.. when whatever scheduler is running has its local readyQ 5.147 5.148 + * empty, it takes from the floating. 5.149 5.150 + */ 5.151 5.152 + make acquire-controller. (central control over acquires but no timing) 5.153 5.154 + make array to hold all the nodes. 5.155 5.156 + loop makes each node and gives it an x and a y ID, and code-ptrs 5.157 5.158 + (call make_node(), which constructs the four time-lines in a node) 5.159 5.160 + 5.161 5.162 + loop through, send each "start" signal. 5.163 5.164 + 5.165 5.166 + } 5.167 5.168 + 5.169 5.170 + 5.171 5.172 +/*This function is the constructor given to HWSim by the seed processor. 5.173 5.174 + * It uses HWSim calls to create all the TimeLines and TimeDomains, and to 5.175 5.176 + * hook them together. Note that HWSim will start them, itself, after this 5.177 5.178 + * constructor is done. 
5.179 5.180 + * 5.181 5.182 + *Note, timelines don't have to be connected in order to communicate -- it's 5.183 5.184 + * just one way of getting the needed info to the sending TimeLine, which 5.185 5.186 + * consists of the pointer to the destination TimeLine, and which port to 5.187 5.188 + * tell that target Timeline the communication is coming in on. 5.189 5.190 + *The acquire will have the target TimeLine stored in a hash table, that's 5.191 5.192 + * how the sending TimeLine gets the pointer to the target. It has the port 5.193 5.194 + * number hard-coded. 5.195 5.196 + */ 5.197 5.198 +void 5.199 5.200 +constructTeraFluxArch( void *_params, VirtProcr *animPr ) 5.201 5.202 + { 5.203 5.204 + TFSimulatorParams *params; 5.205 5.206 + int nodeNum; 5.207 5.208 + HWSimTimeDomain *node; 5.209 5.210 + HWSimTimeLine *cpu, *communicator; 5.211 5.212 + 5.213 5.214 + params = (TFSimulatorParams *)_params; 5.215 5.216 + 5.217 5.218 + //========Define the types of TimeLine, which sets the at_reset fn======== 5.219 5.220 + // 5.221 5.222 + HWSim__define_TimeLine_type( CPU_TIMELINE, &CPUSpan_at_reset, animPr ); 5.223 5.224 + HWSim__define_TimeLine_type( COMM_TIMELINE, &commSpan_at_reset, animPr ); 5.225 5.226 + 5.227 5.228 + //==========Register HWSim handlers========= 5.229 5.230 + // 5.231 5.232 + // Handlers are special because they run inside HWSim with access to 5.233 5.234 + // shared global state and have the ability to start new spans, modify 5.235 5.236 + // TimeLine state, and so on. 5.237 5.238 + 5.239 5.240 + //All data is local to a TimeLine, except global vars. Those can only 5.241 5.242 + // be accessed through a registered handler. 
5.243 5.244 + HWSim__register_global_var_accessor( DO_ACQUIRE_SEND, &do_acquire_send, 5.245 5.246 + animPr ); 5.247 5.248 + HWSim__register_global_var_accessor( GET_OWNING_TNODE, &get_owning_TNode, 5.249 5.250 + animPr ); 5.251 5.252 + 5.253 5.254 + //HWInstrs are able to generate communications, start new spans, and 5.255 5.256 + // so forth -- they are considered extensions of HWSim itself, with the 5.257 5.258 + // ability to affect the language's internal semantic and scheduling 5.259 5.260 + // state, and so are created as handlers, which must be registered. 5.261 5.262 + HWSim__register_HWInstr_type( ACQUIRE_INSTR, 5.263 5.264 + &handle_Acquire_HWInstr_request, animPr ); 5.265 5.266 + HWSim__register_HWInstr_type( RELEASE_INSTR, 5.267 5.268 + &handle_Release_HWInstr_request, animPr ); 5.269 5.270 + 5.271 5.272 + //==========Create the TimeDomains and TimeLines and connect them========= 5.273 5.274 + // 5.275 5.276 + for( nodeNum = 0; nodeNum < params->numNodes; nodeNum++ ) 5.277 5.278 + { 5.279 5.280 + node = HWSim__create_TimeDomain( params?, animPr ); 5.281 5.282 + cpu = HWSim__create_TimeLine_of_type( CPU_TIMELINE, animPr ); 5.283 5.284 + communicator = HWSim__create_TimeLine_of_type( COMM_TIMELINE, animPr); 5.285 5.286 + HWSim__add_TimeLine_to_TimeDomain( cpu, node ); 5.287 5.288 + HWSim__add_TimeLine_to_TimeDomain( communicator, node ); 5.289 5.290 + //This stores the target ptr + port-num in the out-port position in 5.291 5.292 + // the cpu TimeLine -- so spans in cpu TimeLine can look up target 5.293 5.294 + HWSim__connect_TimeLine_outPort_to_TimeLine_inPort( 5.295 5.296 + cpu, COMMUNICATOR_OUTPORT, communicator, CPU_INPORT ); 5.297 5.298 + } 5.299 5.300 + 5.301 5.302 + //Done -- the architecture is very simple for now -- inter-node comm 5.303 5.304 + // happens via acquire spans, which use global vars to find the target 5.305 5.306 + // communicator, and the comm spans have the target port hard-coded. 
5.307 5.308 + //So no inter-node communication connections 5.309 5.310 + } 5.311 5.312 + 5.313 5.314 +/*The acquire handler uses HWSim calls to generate a communication in the 5.315 5.316 + * Communicator TimeLine. When the ConsistentTime reaches the appointed 5.317 5.318 + * simulation time at which that comm arrives, it triggers the acquire-start 5.319 5.320 + * span in the Communicator TimeLine. 5.321 5.322 + * 5.323 5.324 + *Q: what's with HWSim__send_comm? Can just use that, don't need to register 5.325 5.326 + * an instruction.. Means GuestCode span just keeps going.. pause it during 5.327 5.328 + * an HWInstr, then resume it.. Want multiple spans for any reason? 5.329 5.330 + */ 5.331 5.332 +void 5.333 5.334 +handle_Acquire_HWInstr_request() 5.335 5.336 + { 5.337 5.338 + 5.339 5.340 + 5.341 5.342 + } 5.343 5.344 + 5.345 5.346 +/*The release handler uses HWSim calls to generate a communication in the 5.347 5.348 + * Communicator TimeLine. When the ConsistentTime reaches the appointed 5.349 5.350 + * simulation time at which that comm arrives, it triggers the release 5.351 5.352 + * span in the Communicator TimeLine. 5.353 5.354 + */ 5.355 5.356 +void 5.357 5.358 +handle_Release_HWInstr_request() 5.359 5.360 + { 5.361 5.362 + 5.363 5.364 + } 5.365
6.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 6.2 +++ b/src/Application/SimParams.c Mon Nov 07 16:03:01 2011 -0800 6.3 @@ -0,0 +1,91 @@ 6.4 +/* 6.5 6.6 + * Copyright 2009 OpenSourceStewardshipFoundation.org 6.7 6.8 + * Licensed under GNU General Public License version 2 6.9 6.10 + * 6.11 6.12 + * Author: seanhalle@yahoo.com 6.13 6.14 + * 6.15 6.16 + * Created on November 15, 2009, 2:35 AM 6.17 6.18 + */ 6.19 6.20 + 6.21 6.22 +#include <malloc.h> 6.23 6.24 +#include <stdlib.h> 6.25 6.26 + 6.27 6.28 +#include "SimParams.h" 6.29 6.30 +#include "ParamHelper/Param.h" 6.31 6.32 + 6.33 6.34 + 6.35 6.36 +uint8 * 6.37 6.38 +read_Machine_Code_From_File( int numBytesInFile, char *machineCodeFileName ); 6.39 6.40 + 6.41 6.42 + 6.43 6.44 +void 6.45 6.46 +fill_sim_params_from_bag( SimulationParams *simParams, ParamBag *paramBag ) 6.47 6.48 + { char *guestAppFileName, *systemCodeFileName; 6.49 6.50 + int numBytesInGuestApp, numBytesInSystemCode; 6.51 6.52 + 6.53 6.54 + ParamStruc *param; 6.55 6.56 + param = getParamFromBag( "GuestApplicationFileName", paramBag ); 6.57 6.58 + guestAppFileName = param->strValue; 6.59 6.60 + param = getParamFromBag( "numBytesInGuestApp", paramBag ); 6.61 6.62 + numBytesInGuestApp = param->intValue; 6.63 6.64 + 6.65 6.66 + simParams->guestApp = 6.67 6.68 + read_Machine_Code_From_File( numBytesInGuestApp, guestAppFileName ); 6.69 6.70 + 6.71 6.72 + param = getParamFromBag( "SystemCodeFileName", paramBag ); 6.73 6.74 + systemCodeFileName = param->strValue; 6.75 6.76 + param = getParamFromBag( "numBytesInSystemCode", paramBag ); 6.77 6.78 + numBytesInSystemCode = param->intValue; 6.79 6.80 + 6.81 6.82 + simParams->systemCode = 6.83 6.84 + read_Machine_Code_From_File( numBytesInSystemCode, systemCodeFileName ); 6.85 6.86 + 6.87 6.88 + 6.89 6.90 + param = getParamFromBag( "numNodes", paramBag ); 6.91 6.92 + simParams->numNodes = param->intValue; 6.93 6.94 + 6.95 6.96 + } 6.97 6.98 + 6.99 6.100 + 6.101 6.102 + 6.103 6.104 +uint8 * 6.105 6.106 
+read_Machine_Code_From_File( int numBytesInFile, char *machineCodeFileName ) 6.107 6.108 + { int byte; 6.109 6.110 + FILE *file; 6.111 6.112 + char *machineCode = malloc( numBytesInFile ); 6.113 6.114 + if( machineCode == NULL ) printf( "\nno mem for machine code\n" ); 6.115 6.116 + 6.117 6.118 + file = fopen( machineCodeFileName, "r" ); 6.119 6.120 + if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} 6.121 6.122 + 6.123 6.124 + fseek( file, 0, SEEK_SET ); 6.125 6.126 + for( byte = 0; byte < numBytesInFile; byte++ ) 6.127 6.128 + { 6.129 6.130 + if( feof( file ) ) printf( "file ran out too soon" ); 6.131 6.132 + machineCode[byte] = getchar( file ); 6.133 6.134 + 6.135 6.136 + } 6.137 6.138 + return machineCode; 6.139 6.140 + } 6.141 6.142 + 6.143 6.144 + 6.145 6.146 + //========================================================================== 6.147 6.148 +void 6.149 6.150 +printSimResults( SimulationResults simResults ) 6.151 6.152 + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; 6.153 6.154 + float32 *matrixArray; 6.155 6.156 + 6.157 6.158 + numRows = rowsToPrint = matrix->numRows; 6.159 6.160 + numCols = colsToPrint = matrix->numCols; 6.161 6.162 + matrixArray = matrix->array; 6.163 6.164 + 6.165 6.166 + rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed 6.167 6.168 + colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed 6.169 6.170 + for( r = 0; r < numRows; r += rowIncr ) 6.171 6.172 + { for( c = 0; c < numCols; c += colIncr ) 6.173 6.174 + { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); 6.175 6.176 + } 6.177 6.178 + printf("\n"); 6.179 6.180 + } 6.181 6.182 + } 6.183 6.184 + 6.185
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 7.2 +++ b/src/Application/SimParams.h Mon Nov 07 16:03:01 2011 -0800 7.3 @@ -0,0 +1,48 @@ 7.4 +/* 7.5 7.6 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 7.7 7.8 + * Licensed under GNU General Public License version 2 7.9 7.10 + */ 7.11 7.12 + 7.13 7.14 +#ifndef MATRIX_MULT_H_ 7.15 7.16 +#define MATRIX_MULT_H_ 7.17 7.18 + 7.19 7.20 +#include <stdio.h> 7.21 7.22 +#include <unistd.h> 7.23 7.24 +#include <malloc.h> 7.25 7.26 + 7.27 7.28 +#include "../HWSim_lib/VMS/VMS_primitive_data_types.h" 7.29 7.30 +#include "ParamHelper/Param.h" 7.31 7.32 + 7.33 7.34 +//============================== Structures ============================== 7.35 7.36 + 7.37 7.38 +typedef 7.39 7.40 +struct 7.41 7.42 + { uint8 *guestApp; 7.43 7.44 + uint8 *systemCode; 7.45 7.46 + int32 numNodes; 7.47 7.48 + } 7.49 7.50 +SimulationResults; 7.51 7.52 + 7.53 7.54 + 7.55 7.56 +typedef 7.57 7.58 +struct 7.59 7.60 + { uint8 *guestApp; 7.61 7.62 + uint8 *systemCode; 7.63 7.64 + int32 numNodes; 7.65 7.66 + SimulationResults *simResults; 7.67 7.68 + } 7.69 7.70 +SimulationParams; 7.71 7.72 + 7.73 7.74 + 7.75 7.76 + 7.77 7.78 +//============================== Functions ================================ 7.79 7.80 + 7.81 7.82 +void 7.83 7.84 +printSimResults( SimulationResults simResults ); 7.85 7.86 + 7.87 7.88 +void 7.89 7.90 +fill_sim_params_from_bag( SimulationParams *simParams, ParamBag *paramBag ); 7.91 7.92 + 7.93 7.94 +//=========================================================================== 7.95 7.96 + 7.97 7.98 +#endif /*MATRIX_MULT_H_*/ 7.99
8.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 8.2 +++ b/src/Application/main.c Mon Nov 07 16:03:01 2011 -0800 8.3 @@ -0,0 +1,48 @@ 8.4 +/* 8.5 8.6 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 8.7 8.8 + * Licensed under GNU General Public License version 2 8.9 8.10 + * 8.11 8.12 + * author seanhalle@yahoo.com 8.13 8.14 + */ 8.15 8.16 + 8.17 8.18 +#include <malloc.h> 8.19 8.20 +#include <stdlib.h> 8.21 8.22 + 8.23 8.24 +#include "SimParams.h" 8.25 8.26 +#include "HWSim_TeraFlux/HWSim_TeraFlux.h" 8.27 8.28 + 8.29 8.30 +/** 8.31 8.32 + * 8.33 8.34 + */ 8.35 8.36 +int main( int argc, char **argv ) 8.37 8.38 + { SimulationParams *simParams; 8.39 8.40 + SimulationResults *simResults; 8.41 8.42 + ParamBag *paramBag; 8.43 8.44 + 8.45 8.46 + printf( "arguments: %s | %s\n", argv[0], argv[1] ); 8.47 8.48 + 8.49 8.50 + simParams = malloc( sizeof(SimulationParams) ); 8.51 8.52 + 8.53 8.54 + 8.55 8.56 + //VMS has its own separate internal malloc, so to get results out, 8.57 8.58 + // have to pass in empty array for it to fill up 8.59 8.60 + //The alternative is internally telling HWSim make external space to use 8.61 8.62 + simResults = malloc( sizeof(SimulationResults) ); 8.63 8.64 + simParams->simResults = simResults; 8.65 8.66 + 8.67 8.68 + paramBag = makeParamBag(); 8.69 8.70 + 8.71 8.72 + readParamFileIntoBag( argv[1], paramBag ); 8.73 8.74 + fill_sim_params_from_bag( simParams, paramBag ); 8.75 8.76 + 8.77 8.78 + 8.79 8.80 + constructAndSimulateSystem( simParams ); 8.81 8.82 + 8.83 8.84 + printSimResults( simResults ); 8.85 8.86 + 8.87 8.88 + fflush(stdin); 8.89 8.90 + 8.91 8.92 + exit(0); //cleans up 8.93 8.94 + } 8.95 8.96 + 8.97 8.98 + 8.99