changeset 60:74775fd41630

checkpoint, so can go back to prev version
author Sean Halle <seanhalle@yahoo.com>
date Wed, 27 Jun 2012 02:49:38 -0700
parents 7bc474513431
children cce09bfd652f
files 0__Papers/Holistic_Model/Perf_Tune/latex/sigplanconf-template.tex 0__Papers/PStack/HotPar_2012/latex/11_Dc_27__HW_abstr_for_port_stack.tex 0__Papers/PStack/HotPar_2012/latex/12_Ja_20__HotPar_Abstr_for_PStack.pdf 0__Papers/PStack/HotPar_2012/latex/12_Ja_20__HotPar_Abstr_for_PStack.tex 0__Papers/VMS/VMS__Foundation_Paper/VMS__Full_conference_version/latex/VMS__Full_conf_paper.tex 0__Papers/writing-a-paper-slides.pdf
diffstat 6 files changed, 158 insertions(+), 1479 deletions(-) [+]
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/0__Papers/Holistic_Model/Perf_Tune/latex/sigplanconf-template.tex	Wed Jun 27 02:49:38 2012 -0700
     1.3 @@ -0,0 +1,93 @@
     1.4 +%-----------------------------------------------------------------------------
     1.5 +%
     1.6 +%               Template for sigplanconf LaTeX Class
     1.7 +%
     1.8 +% Name:         sigplanconf-template.tex
     1.9 +%
    1.10 +% Purpose:      A template for sigplanconf.cls, which is a LaTeX 2e class
    1.11 +%               file for SIGPLAN conference proceedings.
    1.12 +%
    1.13 +% Guide:        Refer to "Author's Guide to the ACM SIGPLAN Class,"
    1.14 +%               sigplanconf-guide.pdf
    1.15 +%
    1.16 +% Author:       Paul C. Anagnostopoulos
    1.17 +%               Windfall Software
    1.18 +%               978 371-2316
    1.19 +%               paul@windfall.com
    1.20 +%
    1.21 +% Created:      15 February 2005
    1.22 +%
    1.23 +%-----------------------------------------------------------------------------
    1.24 +
    1.25 +
    1.26 +\documentclass[preprint]{sigplanconf}
    1.27 +
    1.28 +% The following \documentclass options may be useful:
    1.29 +%
    1.30 +% 10pt          To set in 10-point type instead of 9-point.
    1.31 +% 11pt          To set in 11-point type instead of 9-point.
    1.32 +% authoryear    To obtain author/year citation style instead of numeric.
    1.33 +
    1.34 +\usepackage{amsmath}
    1.35 +
    1.36 +\begin{document}
    1.37 +
    1.38 +\conferenceinfo{WXYZ '05}{date, City.} 
    1.39 +\copyrightyear{2005} 
    1.40 +\copyrightdata{[to be supplied]} 
    1.41 +
    1.42 +\titlebanner{banner above paper title}        % These are ignored unless
    1.43 +\preprintfooter{short description of paper}   % 'preprint' option specified.
    1.44 +
    1.45 +\title{Title Text}
    1.46 +\subtitle{Subtitle Text, if any}
    1.47 +
    1.48 +\authorinfo{Name1}
    1.49 +           {Affiliation1}
    1.50 +           {Email1}
    1.51 +\authorinfo{Name2\and Name3}
    1.52 +           {Affiliation2/3}
    1.53 +           {Email2/3}
    1.54 +
    1.55 +\maketitle
    1.56 +
    1.57 +\begin{abstract}
    1.58 +This is the text of the abstract.
    1.59 +\end{abstract}
    1.60 +
    1.61 +\category{CR-number}{subcategory}{third-level}
    1.62 +
    1.63 +\terms
    1.64 +term1, term2
    1.65 +
    1.66 +\keywords
    1.67 +keyword1, keyword2
    1.68 +
    1.69 +\section{Introduction}
    1.70 +
    1.71 +The text of the paper begins here.
    1.72 +
    1.73 +\appendix
    1.74 +\section{Appendix Title}
    1.75 +
    1.76 +This is the text of the appendix, if you need one.
    1.77 +
    1.78 +\acks
    1.79 +
    1.80 +Acknowledgments, if needed.
    1.81 +
    1.82 +% We recommend abbrvnat bibliography style.
    1.83 +
    1.84 +\bibliographystyle{abbrvnat}
    1.85 +
    1.86 +% The bibliography should be embedded for final submission.
    1.87 +
    1.88 +\begin{thebibliography}{}
    1.89 +\softraggedright
    1.90 +
    1.91 +\bibitem[Smith et~al.(2009)Smith, Jones]{smith02}
    1.92 +P. Q. Smith, and X. Y. Jones. ...reference text...
    1.93 +
    1.94 +\end{thebibliography}
    1.95 +
    1.96 +\end{document}
     2.1 --- a/0__Papers/PStack/HotPar_2012/latex/11_Dc_27__HW_abstr_for_port_stack.tex	Tue Jun 26 12:48:44 2012 +0200
     2.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.3 @@ -1,1474 +0,0 @@
     2.4 -%&latex
     2.5 -
     2.6 -
     2.7 -\documentclass[conference]{../helpers/llncs}
     2.8 -%
     2.9 -
    2.10 -%\usepackage{geometry} %chgs the margins and layout!
    2.11 -\usepackage{makeidx,amssymb,graphicx,calc,ifthen}
    2.12 -%
    2.13 -
    2.14 -% *** CITATION PACKAGES ***
    2.15 -%
    2.16 -%\usepackage{cite}
    2.17 -% cite.sty was written by Donald Arseneau
    2.18 -% V1.6 and later of IEEEtran pre-defines the format of the cite.sty package
    2.19 -% \cite{} output to follow that of IEEE. Loading the cite package will
    2.20 -% result in citation numbers being automatically sorted and properly
    2.21 -% "compressed/ranged". e.g., [1], [9], [2], [7], [5], [6] without using
    2.22 -% cite.sty will become [1], [2], [5]--[7], [9] using cite.sty. cite.sty's
    2.23 -% \cite will automatically add leading space, if needed. Use cite.sty's
    2.24 -% noadjust option (cite.sty V3.8 and later) if you want to turn this off.
    2.25 -% cite.sty is already installed on most LaTeX systems. Be sure and use
    2.26 -% version 4.0 (2003-05-27) and later if using hyperref.sty. cite.sty does
    2.27 -% not currently provide for hyperlinked citations.
    2.28 -% The latest version can be obtained at:
    2.29 -% http://www.ctan.org/tex-archive/macros/latex/contrib/cite/
    2.30 -% The documentation is contained in the cite.sty file itself.
    2.31 -
    2.32 -
    2.33 -
    2.34 -
    2.35 -
    2.36 -
    2.37 -% *** GRAPHICS RELATED PACKAGES ***
    2.38 -%
    2.39 -%\ifCLASSINFOpdf
    2.40 -  % \usepackage[pdftex]{graphicx}
    2.41 -  % declare the path(s) where your graphic files are
    2.42 -  % \graphicspath{{../pdf/}{../jpeg/}}
    2.43 -  % and their extensions so you won't have to specify these with
    2.44 -  % every instance of \includegraphics
    2.45 -  % \DeclareGraphicsExtensions{.pdf,.jpeg,.png}
    2.46 -%\else
    2.47 -  % or other class option (dvipsone, dvipdf, if not using dvips). graphicx
    2.48 -  % will default to the driver specified in the system graphics.cfg if no
    2.49 -  % driver is specified.
    2.50 -  % \usepackage[dvips]{graphicx}
    2.51 -  % declare the path(s) where your graphic files are
    2.52 -  % \graphicspath{{../eps/}}
    2.53 -  % and their extensions so you won't have to specify these with
    2.54 -  % every instance of \includegraphics
    2.55 -  % \DeclareGraphicsExtensions{.eps}
    2.56 -%\fi
    2.57 -% graphicx was written by David Carlisle and Sebastian Rahtz. It is
    2.58 -% required if you want graphics, photos, etc. graphicx.sty is already
    2.59 -% installed on most LaTeX systems. The latest version and documentation can
    2.60 -% be obtained at: 
    2.61 -% http://www.ctan.org/tex-archive/macros/latex/required/graphics/
    2.62 -% Another good source of documentation is "Using Imported Graphics in
    2.63 -% LaTeX2e" by Keith Reckdahl which can be found as epslatex.ps or
    2.64 -% epslatex.pdf at: http://www.ctan.org/tex-archive/info/
    2.65 -%
    2.66 -% latex, and pdflatex in dvi mode, support graphics in encapsulated
    2.67 -% postscript (.eps) format. pdflatex in pdf mode supports graphics
    2.68 -% in .pdf, .jpeg, .png and .mps (metapost) formats. Users should ensure
    2.69 -% that all non-photo figures use a vector format (.eps, .pdf, .mps) and
    2.70 -% not a bitmapped formats (.jpeg, .png). IEEE frowns on bitmapped formats
    2.71 -% which can result in "jaggedy"/blurry rendering of lines and letters as
    2.72 -% well as large increases in file sizes.
    2.73 -%
    2.74 -% You can find documentation about the pdfTeX application at:
    2.75 -% http://www.tug.org/applications/pdftex
    2.76 -
    2.77 -
    2.78 -
    2.79 -
    2.80 -
    2.81 -% *** MATH PACKAGES ***
    2.82 -%
    2.83 -%\usepackage[cmex10]{amsmath}
    2.84 -% A popular package from the American Mathematical Society that provides
    2.85 -% many useful and powerful commands for dealing with mathematics. If using
    2.86 -% it, be sure to load this package with the cmex10 option to ensure that
    2.87 -% only type 1 fonts will be utilized at all point sizes. Without this option,
    2.88 -% it is possible that some math symbols, particularly those within
    2.89 -% footnotes, will be rendered in bitmap form which will result in a
    2.90 -% document that can not be IEEE Xplore compliant!
    2.91 -%
    2.92 -% Also, note that the amsmath package sets \interdisplaylinepenalty to 10000
    2.93 -% thus preventing page breaks from occurring within multiline equations. Use:
    2.94 -%\interdisplaylinepenalty=2500
    2.95 -% after loading amsmath to restore such page breaks as IEEEtran.cls normally
    2.96 -% does. amsmath.sty is already installed on most LaTeX systems. The latest
    2.97 -% version and documentation can be obtained at:
    2.98 -% http://www.ctan.org/tex-archive/macros/latex/required/amslatex/math/
    2.99 -
   2.100 -
   2.101 -
   2.102 -
   2.103 -
   2.104 -% *** SPECIALIZED LIST PACKAGES ***
   2.105 -%
   2.106 -%\usepackage{algorithmic}
   2.107 -% algorithmic.sty was written by Peter Williams and Rogerio Brito.
   2.108 -% This package provides an algorithmic environment for describing algorithms.
   2.109 -% You can use the algorithmic environment in-text or within a figure
   2.110 -% environment to provide for a floating algorithm. Do NOT use the algorithm
   2.111 -% floating environment provided by algorithm.sty (by the same authors) or
   2.112 -% algorithm2e.sty (by Christophe Fiorio) as IEEE does not use dedicated
   2.113 -% algorithm float types and packages that provide these will not provide
   2.114 -% correct IEEE style captions. The latest version and documentation of
   2.115 -% algorithmic.sty can be obtained at:
   2.116 -% http://www.ctan.org/tex-archive/macros/latex/contrib/algorithms/
   2.117 -% There is also a support site at:
   2.118 -% http://algorithms.berlios.de/index.html
   2.119 -% Also of interest may be the (relatively newer and more customizable)
   2.120 -% algorithmicx.sty package by Szasz Janos:
   2.121 -% http://www.ctan.org/tex-archive/macros/latex/contrib/algorithmicx/
   2.122 -
   2.123 -
   2.124 -
   2.125 -
   2.126 -% *** ALIGNMENT PACKAGES ***
   2.127 -%
   2.128 -%\usepackage{array}
   2.129 -% Frank Mittelbach's and David Carlisle's array.sty patches and improves
   2.130 -% the standard LaTeX2e array and tabular environments to provide better
   2.131 -% appearance and additional user controls. As the default LaTeX2e table
   2.132 -% generation code is lacking to the point of almost being broken with
   2.133 -% respect to the quality of the end results, all users are strongly
   2.134 -% advised to use an enhanced (at the very least that provided by array.sty)
   2.135 -% set of table tools. array.sty is already installed on most systems. The
   2.136 -% latest version and documentation can be obtained at:
   2.137 -% http://www.ctan.org/tex-archive/macros/latex/required/tools/
   2.138 -
   2.139 -
   2.140 -%\usepackage{mdwmath}
   2.141 -%\usepackage{mdwtab}
   2.142 -% Also highly recommended is Mark Wooding's extremely powerful MDW tools,
   2.143 -% especially mdwmath.sty and mdwtab.sty which are used to format equations
   2.144 -% and tables, respectively. The MDWtools set is already installed on most
   2.145 -% LaTeX systems. The latest version and documentation is available at:
   2.146 -% http://www.ctan.org/tex-archive/macros/latex/contrib/mdwtools/
   2.147 -
   2.148 -
   2.149 -% IEEEtran contains the IEEEeqnarray family of commands that can be used to
   2.150 -% generate multiline equations as well as matrices, tables, etc., of high
   2.151 -% quality.
   2.152 -
   2.153 -
   2.154 -%\usepackage{eqparbox}
   2.155 -% Also of notable interest is Scott Pakin's eqparbox package for creating
   2.156 -% (automatically sized) equal width boxes - aka "natural width parboxes".
   2.157 -% Available at:
   2.158 -% http://www.ctan.org/tex-archive/macros/latex/contrib/eqparbox/
   2.159 -
   2.160 -
   2.161 -
   2.162 -
   2.163 -
   2.164 -% *** SUBFIGURE PACKAGES ***
   2.165 -%\usepackage[tight,footnotesize]{subfigure}
   2.166 -% subfigure.sty was written by Steven Douglas Cochran. This package makes it
   2.167 -% easy to put subfigures in your figures. e.g., "Figure 1a and 1b". For IEEE
   2.168 -% work, it is a good idea to load it with the tight package option to reduce
   2.169 -% the amount of white space around the subfigures. subfigure.sty is already
   2.170 -% installed on most LaTeX systems. The latest version and documentation can
   2.171 -% be obtained at:
   2.172 -% http://www.ctan.org/tex-archive/obsolete/macros/latex/contrib/subfigure/
   2.173 -% subfigure.sty has been superseded by subfig.sty.
   2.174 -
   2.175 -
   2.176 -
   2.177 -%\usepackage[caption=false]{caption}
   2.178 -%\usepackage[font=footnotesize]{subfig}
   2.179 -% subfig.sty, also written by Steven Douglas Cochran, is the modern
   2.180 -% replacement for subfigure.sty. However, subfig.sty requires and
   2.181 -% automatically loads Axel Sommerfeldt's caption.sty which will override
   2.182 -% IEEEtran.cls handling of captions and this will result in nonIEEE style
   2.183 -% figure/table captions. To prevent this problem, be sure and preload
   2.184 -% caption.sty with its "caption=false" package option. This will preserve
   2.185 -% IEEEtran.cls handling of captions. Version 1.3 (2005/06/28) and later 
   2.186 -% (recommended due to many improvements over 1.2) of subfig.sty supports
   2.187 -% the caption=false option directly:
   2.188 -%\usepackage[caption=false,font=footnotesize]{subfig}
   2.189 -%
   2.190 -% The latest version and documentation can be obtained at:
   2.191 -% http://www.ctan.org/tex-archive/macros/latex/contrib/subfig/
   2.192 -% The latest version and documentation of caption.sty can be obtained at:
   2.193 -% http://www.ctan.org/tex-archive/macros/latex/contrib/caption/
   2.194 -
   2.195 -
   2.196 -
   2.197 -
   2.198 -% *** FLOAT PACKAGES ***
   2.199 -%
   2.200 -%\usepackage{fixltx2e}
   2.201 -% fixltx2e, the successor to the earlier fix2col.sty, was written by
   2.202 -% Frank Mittelbach and David Carlisle. This package corrects a few problems
   2.203 -% in the LaTeX2e kernel, the most notable of which is that in current
   2.204 -% LaTeX2e releases, the ordering of single and double column floats is not
   2.205 -% guaranteed to be preserved. Thus, an unpatched LaTeX2e can allow a
   2.206 -% single column figure to be placed prior to an earlier double column
   2.207 -% figure. The latest version and documentation can be found at:
   2.208 -% http://www.ctan.org/tex-archive/macros/latex/base/
   2.209 -
   2.210 -
   2.211 -
   2.212 -%\usepackage{stfloats}
   2.213 -% stfloats.sty was written by Sigitas Tolusis. This package gives LaTeX2e
   2.214 -% the ability to do double column floats at the bottom of the page as well
   2.215 -% as the top. (e.g., "\begin{figure*}[!b]" is not normally possible in
   2.216 -% LaTeX2e). It also provides a command:
   2.217 -%\fnbelowfloat
   2.218 -% to enable the placement of footnotes below bottom floats (the standard
   2.219 -% LaTeX2e kernel puts them above bottom floats). This is an invasive package
   2.220 -% which rewrites many portions of the LaTeX2e float routines. It may not work
   2.221 -% with other packages that modify the LaTeX2e float routines. The latest
   2.222 -% version and documentation can be obtained at:
   2.223 -% http://www.ctan.org/tex-archive/macros/latex/contrib/sttools/
   2.224 -% Documentation is contained in the stfloats.sty comments as well as in the
   2.225 -% presfull.pdf file. Do not use the stfloats baselinefloat ability as IEEE
   2.226 -% does not allow \baselineskip to stretch. Authors submitting work to the
   2.227 -% IEEE should note that IEEE rarely uses double column equations and
   2.228 -% that authors should try to avoid such use. Do not be tempted to use the
   2.229 -% cuted.sty or midfloat.sty packages (also by Sigitas Tolusis) as IEEE does
   2.230 -% not format its papers in such ways.
   2.231 -
   2.232 -
   2.233 -
   2.234 -
   2.235 -
   2.236 -% *** PDF, URL AND HYPERLINK PACKAGES ***
   2.237 -%
   2.238 -%\usepackage{url}
   2.239 -% url.sty was written by Donald Arseneau. It provides better support for
   2.240 -% handling and breaking URLs. url.sty is already installed on most LaTeX
   2.241 -% systems. The latest version can be obtained at:
   2.242 -% http://www.ctan.org/tex-archive/macros/latex/contrib/misc/
   2.243 -% Read the url.sty source comments for usage information. Basically,
   2.244 -% \url{my_url_here}.
   2.245 -
   2.246 -
   2.247 -
   2.248 -
   2.249 -
   2.250 -% *** Do not adjust lengths that control margins, column widths, etc. ***
   2.251 -% *** Do not use packages that alter fonts (such as pslatex).         ***
   2.252 -% There should be no need to do such things with IEEEtran.cls V1.6 and later.
   2.253 -% (Unless specifically asked to do so by the journal or conference you plan
   2.254 -% to submit to, of course. )
   2.255 -
   2.256 -
   2.257 -% correct bad hyphenation here
   2.258 -\hyphenation{op-tical net-works semi-conduc-tor}
   2.259 -
   2.260 -
   2.261 -\begin{document}
   2.262 -
   2.263 -\bibliographystyle{plain}
   2.264 -%
   2.265 -
   2.266 -\title{A Hardware Abstraction Suitable for Use at the Base of a Portability Software Stack}
   2.267 -
   2.268 -\author{Sean Halle \and Merten Sach \and BJ}
   2.269 -\institute{Technical University Berlin, Germany}
   2.270 -
   2.271 -\maketitle             
   2.272 -%
   2.273 -
   2.274 -\begin{abstract}
   2.275 -Proposals for a software stack that supports performant portability\cite{} rely on the assumption, among others, that a suitable hardware abstraction exists for the bottom layer. The abstraction must have certain properties:  it must be invisible to upper layers; it must simplify language implementation, in part by hiding details of synchronization and details of network; and at the same time enable high performance, in part by giving the  language control over task placement and exposing memory hierarchy, communication times, and other major performance-related aspects of the hardware.
   2.276 -
   2.277 -In this paper, we show that an abstraction called Virtualized Master-Slave, or VMS \cite{}
   2.278 -satisfies these criteria, and we provide recent measurements to support the case.
   2.279 -
   2.280 -\end{abstract}
   2.281 -
   2.282 -\section{Motivation}
   2.283 -
   2.284 -As stated in the call for papers, wide uptake of high efficiency parallel architectures ``requires new parallel programming paradigms, new methods of application design, new structures for system software, and new models of interaction among applications, compilers, operating systems, and hardware." In short, a new software stack, and players in research and industry organized to supply the pieces of the stack. The goal of the stack is  to make parallel programming the same order of productivity as sequential programming, and to give parallel code similar portability onto newer generations of hardware that sequential code enjoyed.
   2.285 -
   2.286 -The benefit from such a stack is  reducing the cost of parallel software, by making it just as productive as sequential coding, and by allowing  code to be written once then run performantly across hardware targets, including unknown future architectures.
   2.287 -
   2.288 -A recent proposal for achieving this, known as PStack\cite{},  calls for a software stack having a layer of languages (toolchains) at the top, a layer of language runtimes below that, and a hardware abstraction layer at the bottom\cite{}.
   2.289 -
   2.290 -The languages must be designed to capture all information required to specialize the code for high performance across hardware. A computation model, called The Holistic Model\cite{}, suggests that such a canonical set of information exists. PStack proposes to develop the constructs that gather the information, where some constructs are in the form of specialization helpers such as task-resizers and layout modifiers. The application implements the specialization helpers, thereby encoding information about data structures and how to manipulate them.  The seeds of such an approach were laid with work on DKU\cite{}, which demonstrated the success of task-resizing constructs.
   2.291 -
   2.292 -The proposal also suggests the use of the BLIS\cite{} approach for managing multiple toolchains, where each one specializes to a different target. The management includes the  install process, during which the correct toolchain output is paired to the installation target.  Further specialization can thus naturally be added during installation, when exact hardware details are known.  If required, runtime tuning and optimization also fit  within the approach.
   2.293 -
   2.294 -The top layer of the software stack is thus the combination of constructs and managed toolchains. 
   2.295 -Below that, in the middle of the stack, a collection of runtime systems acts as a sort of cross-bar switch, connecting the languages above to a standard hardware abstraction below. 
   2.296 -
   2.297 -The purpose of the bottom abstraction is to minimize the effort to create the runtimes in the middle layer.  It must both hide hardware details and uniform-ize runtime implementation. However, it must not hide performance-critical information from the runtime, which holds the scheduler that decides when tasks become free and where to execute them.  Such scheduling choices need to know the communication paths and memory pools in the hardware, along with latency, bandwidth, capacity and performance.
   2.298 -
   2.299 -A single abstraction can't both hide details and expose those required by the runtimes to attain high performance.  Instead, PStack calls for a family of abstractions, one for each major type of architecture, including a ``hierarchy'' abstraction used to glue together heterogeneous hardware.  In each, only the details critical to performance are exposed to the scheduler in the runtime, thus keeping the number of abstractions needed manageably small, on the order of tens in total.
   2.300 -
   2.301 -This paper's contribution is showing how the Virtualized Master-Slave abstraction\cite{} fits the criteria for the bottom of such a portability software stack.  This includes recent performance results for VMS on a variety of hardware platforms over a selection of benchmarks. 
   2.302 -
   2.303 -
   2.304 -\section{Background on PStack and VMS}
   2.305 -
   2.306 -Describe details of bottom of software stack and details of how cross-bar works, tie to the need for low-work runtime impl, and the need for reuse of runtime code across languages.
   2.307 -
   2.308 -Describe how VMS fulfills those needs.  Details of its interface, details of its impl on multi-core, details of differences on different machines.
   2.309 -Leave as-is for 1x4 and 1x2 -- make entirely different impl for 2x4 and another for 4x10
   2.310 -
   2.311 -\section{Description of How VMS Fits PStack's Requirements}
   2.312 -
   2.313 -\section{Experimental Setup}
   2.314 -
   2.315 -\section{Results}
   2.316 -
   2.317 -\subsection{Performance Results}
   2.318 -Figure X shows the execution time versus task-size curve for Vthread, while Figure X shows the same for pthread, both executed on the SandyBridge 1x4 machine.  The 2x point occurs near X cycles for Vthread, nearly independently from the number of threads created.  Meanwhile, it occurs near X for pthread, for a small number of threads, showing that pthreads has X times higher overhead in the best case.  However, pthreads degrades rapidly as the number of threads is increased.  The system crashes with more than X threads, at which point the overhead of pthreads is X times higher than Vthread (not shown on graph because it's too large).
   2.319 -
   2.320 -When moving to multi-socket machines, things change, as seen in figures X and X.  The central master-lock in the VMS implementation creates a bottleneck for small task sizes.  This causes Vthread to have only a X advantage over pthread for small task sizes.  However, Vthread overhead remains independent of number of threads.
   2.321 -
   2.322 -Likewise, the benchmarks for ray tracing, matrix multiply, and kmeans clustering show that Vthread outperforms pthreads.  They nearly match for large work sizes, but Vthread significantly outperforms for small work sizes, as seen in Figures X and X.
   2.323 -
   2.324 -
   2.325 -\begin{figure}[ht]
   2.326 -\mbox{\subfigure{\includegraphics[width=3in]{fig1.pdf}}\quad
   2.327 -\subfigure{\includegraphics[width=2.3in, angle = -90]{../figures/plots_exec_vs_task_size_v3/cray1_pthreads_8_32_128_512thds__o30000__perfCtrs.result.eps}}
   2.328 -
   2.329 - \caption
   2.330 - {Execution time / total work  vs  size of a task.  The smaller the task, the smaller the distance between scheduling events.  When the ratio reaches 2, the scheduling overhead exactly equals the task size.
   2.331 -  }
   2.332 -\label{figCray1Vthread}
   2.333 -\end{figure}
   2.334 -
   2.335 - \includegraphics[width=2.3in, angle = -90]{../figures/plots_exec_vs_task_size_v3/cray1_pthreads_8_32_128_512thds__o30000__perfCtrs.result.eps} 
   2.336 -\subsection{Implementation Time Results}
   2.337 -As seen in a previous paper on VMS\cite{}, VMS makes runtime implementation quick and easy.  The results are re-printed here to support the claim that VMS meets the PStack requirement of reducing runtime implementation effort. These compare to weeks or months to learn code of a pre-existing multi-threaded runtime, modify it, and debug the multi-threaded implementation.
   2.338 -
   2.339 -Implementation time is reduced by the uniform nature of VMS-based runtimes, freedom from details hidden by the VMS interface, the helper facilities VMS provides, VMS's debugging facilities, and freedom from multi-threading issues due to VMS's tie-point ordering guarantee.  When combined, they lead to the fast implementation times seen in Table X.  
   2.340 -
   2.341 -
   2.342 -
   2.343 -\section{Future Work and Conclusion}
   2.344 -A semester long 
   2.345 -
   2.346 -
   2.347 -
   2.348 -\section{Planning}
   2.349 -
   2.350 - (6.5" x 9" 10pt 2 col) 
   2.351 -
   2.352 -require new parallel programming paradigms, new methods of application design, new structures for system software, and new models of interaction among applications, compilers, operating systems, and hardware.
   2.353 -
   2.354 -what benefits performant portability will bring
   2.355 -
   2.356 -abstraction must have certain properties to fulfill its role in stack.  Contribution of paper is new results confirming suitability of VMS to be that abstraction.
   2.357 -
   2.358 -The Need: background on portability stack -- the end-goal sought, the fundamentals involved in any solution to provide it, how proposed stack from last year is updated to accord with those fundamentals in achieving end-goal.  The elements of the stack.  What function/role each element serves.
   2.359 -
   2.360 -Describe details of bottom of software stack and details of how cross-bar works, and the need for low-work runtime impl, and the need for reuse of runtime code across languages.
   2.361 -
   2.362 -what need in bottom, to support eco-system\cite{} 
   2.363 - 
   2.364 - Many runtime systems are implemented -- for each language, one runtime for each variant of the abstraction.
   2.365 -
   2.366 -Expect domain-specific languages -- these embed new behaviors into base language.  Different from library because cannot understand the behavior of the call based on the base language alone. The runtime system provides behavior beyond the base language.  This approach makes for quick  creation of languages specialized to narrow domains.
   2.367 -
   2.368 -This approach is likely to fail without performant portability for its user's benefit, and a low-cost way to implement it for the language provider. 
   2.369 -
   2.370 -In effect, just about the only purpose and criteria for judgement of the base abstraction is its ability to reduce effort, and its ability to support the portability features of the upper layers.
   2.371 -
   2.372 -Effort has two aspects: in implementing a new language, and in porting existing languages to new hardware. 
   2.373 -
   2.374 -Support for upper layers is 
   2.375 -
   2.376 -Hence, the two primary aspects of the base hardware abstraction are its ability to reduce the effort of implementing the set of runtime systems for a given language, and its ability to reduce the number of such runtime systems.
   2.377 -
   2.378 -for abstraction in bottom layer, here's VMS, detailed in LCPC paper, with  new results that show it can, indeed, fulfil the base abstraction role for the class of multi-core hardware.
   2.379 -
   2.380 -
   2.381 -
   2.382 -
   2.383 -
   2.384 -
   2.385 -
   2.386 -
   2.387 -
   2.388 -
   2.389 -
   2.390 -
   2.391 -  
   2.392 -
   2.393 -
   2.394 -
   2.395 -
   2.396 -
   2.397 -Describe details of bottom of software stack and details of how cross-bar works, tie to the need for low-work runtime impl, and the need for reuse of runtime code across languages.
   2.398 -
   2.399 -Describe how VMS fulfills those needs.  Details of its interface, details of its impl on multi-core, details of differences on different machines.
   2.400 -Leave as-is for 1x4 and 1x2 -- make entirely different impl for 2x4 and another for 4x10
   2.401 -?
   2.402 -\section{old stuff}
   2.403 -
   2.404 -A common theme among parallel language designers, parallel OS implementers and parallel hardware architects is the need for a clean, portable hardware abstraction.  Locks, with thread-constructs built above them, have failed due to interrelated issues: requiring system-wide quorum on each acquisition of a lock, limited semantics of lock-hardware, and its black-box nature which freezes out the OS and language runtimes. 
   2.405 -
   2.406 - We argue that lock-oriented hardware plays a major role in the  failure to achieve top-to-bottom integrated cooperation between application, language runtime, hardware management in the OS, and the hardware itself.  We further argue that this kind of integration is  a necessary (but not sufficient) requirement for efficient portability of applications. Both arguments are expounded upon in the dissertation of Halle[].
   2.407 -
   2.408 -We further claim that  every application-level synchronization or parallelism construct is a coupling, of semantic-behavior to  a mechanism that  orders events among application timelines.  Even the CAS instruction combines compare and swap semantics with an order-establishing hardware mechanism. The mechanism guarantees that CAS instructions from different cores receive a sequential ordering, and is implemented in the memory system, where it establishes sole access of one core to the specified address, Meanwhile, the "Compare and swap" semantics may be implemented in the core, providing the behavior of the access. When several kinds of lock-establishing instruction are available, they all use the same memory-system hardware to establish ordering, and simply implement different behavior on top of it.  The OS or language runtime code then use these instructions to establish ordering among the cores, and implement  semantics of more complex constructs  on top of that.
   2.409 -
   2.410 -What would happen, then, if the semantic-behavior were separated from the establishment of ordering? In this case, the order-establishment mechanism would have no behavior usable in a program. It would have to be coupled with code fragments that add semantic behavior.
   2.411 -
   2.412 -Such a separation is what this paper proposes, and presents an implementation in user-space for multi-core shared memory processors,
   2.413 -
   2.414 -  
   2.415 -
   2.416 -?
   2.417 -
   2.418 -a rich interface between scheduling and hardware that enables adaptation to target hardware characteristics for higher performance, and integration into a cross-language framework
   2.419 -
   2.420 -  ?
   2.421 -
   2.422 - This abstraction is not directly usable by application
   2.423 -  programmers.  Instead, application-visible behavior is implemented
   2.424 -in a semantical plugin, and then invoked via a language or library
   2.425 -  that uses the plugin.  The main benefit is that parallel language
   2.426 -  runtimes become simpler to implement, because they use sequential
   2.427 -  algorithms for the parallel semantics. This is possible because the
   2.428 -  abstraction makes available a virtual time in which events in
   2.429 -  different program time-lines are sequentialized.  The parallel
   2.430 -  semantics relate events in different time-lines via relating the
   2.431 -  sequentialized versions within the virtual time-line.
   2.432 -
   2.433 -?
   2.434 -
   2.435 -Thread parallelism constructs have been well documented to be
   2.436 -difficult to program with. They directly expose low-level concurrency
   2.437 -to the programmer. Arbitrary non-deterministic behavior and deadlocks
   2.438 -can arise from improperly synchronized code. Efficient execution
   2.439 -requires non-blocking algorithms whose correctness requires deep
   2.440 -understanding of weakly consistent memory models. In addition, the
   2.441 -operating system abstraction for threads comes with a very high
   2.442 -context-switching and synchronization overhead.
   2.443 -
   2.444 -%% AC. The following is wrong.
   2.445 -%\paragraph{\bf Problems with threads: } Thread parallelism constructs have been well documented to be difficult to program with. In addition, threads are problematic when one has multiple physical cores because the thread abstraction hides the hardware, disabling control over placement of tasks.  High performance requires such control. Hence, not only are threads  difficult to use directly in applications, but they prevent easier parallelism constructs from being built on top of them that have high performance.
   2.446 -
   2.447 -%\paragraph{\bf Partial solution: } To deal with this, in practice, a language's runtime turns off
   2.448 -
   2.449 -\paragraph*{A partial solution.}
   2.450 -
   2.451 -To deal with the last problem, a parallel language's runtime turns off
   2.452 -operating system threads by pinning one to each physical core.  This
   2.453 -way, the custom runtime is assured that the software thread is
   2.454 -one-to-one with a physical core. It then implements a user-level
   2.455 -thread package that lets it control which OS thread a computational
   2.456 -task is assigned to. Finally, the runtime then implements the
   2.457 -language's parallel semantics in terms of those user threads.
   2.458 -
   2.459 -The user-level threading approach addresses the system overhead issue,
   2.460 -but it still hides important events such as input-output or
   2.461 -node-to-node communications in a cluster.
   2.462 -%% AC. Irrelevant for a parallelism-centric paper
   2.463 -%software faults, and other hardware events
   2.464 -Hence, the more scalable runtimes need to coordinate task assignment
   2.465 -to cores with application access of input and output, memory
   2.466 -allocation over non-uniform cache and memory hierarchies, offloading
   2.467 -to hardware accelerators, power management, and inter-node
   2.468 -communication in a cluster.  The user-level threading approach also
   2.469 -makes the parallel runtime implementation cumbersome, error-prone and
   2.470 -complex, because it is still written in terms of threads.
   2.471 -
   2.472 -Overall, parallel language implementations must deal with a number of
   2.473 -challenges normally deferred to the operating system, and they still
   2.474 -suffer from the complexity of non-blocking shared memory
   2.475 -concurrency.
   2.476 -
   2.477 -Ideally, the OS would provide a mutable hardware abstraction,
   2.478 -along with a selection of customizations or mutations that provide convenient
   2.479 -parallelism constructs for applications to use. It should allow
   2.480 -languages to provide their own parallel semantics.  The language
   2.481 -runtime would then have a secure way to access
   2.482 -kernel-only hardware mechanisms. It could interact directly with the
   2.483 -kernel to manage physical resources, in a low-overhead way.
   2.484 -One benefit of this arrangement is the chance to implement a
   2.485 -language's runtime directly as a plugin, which gives it the ability to
   2.486 -control which task is assigned to which processing element at which
   2.487 -time.  This enables high performance and low-energy data affinity
   2.488 -techniques. For example, the runtime could track data within the
   2.489 -memory hierarchy and assign tasks to locations close to their consumed
   2.490 -data.
   2.491 -
   2.492 -% \paragraph{\bf Support for portability: } This separation between
   2.493 -% application executable and language runtime also supports
   2.494 -% portability. It packages the language-specific scheduler into a
   2.495 -% separate machine-specific module that is separately installed. The
   2.496 -% scheduler is thus optimized for the combination of language with
   2.497 -% hardware. This isn't a full solution to portability but it is a needed
   2.498 -% precursor.
   2.499 -
   2.500 -% An interesting portability technique enabled by a separate
   2.501 -% language-plus-hardware specific runtime is tracking which cores are
   2.502 -% likely to already have data in the cache which a new task requires as
   2.503 -% input. It could place tasks where the input data is likely to reside,
   2.504 -% reducing communication to increase performance and power efficiency.
   2.505 -
   2.506 -\paragraph*{Contribution.}
   2.507 -
   2.508 -We show in this paper the definition and implementation of such a
   2.509 -mutable hardware abstraction, albeit at user-level rather than in
   2.510 -the kernel.  It lets a language runtime be implemented as
   2.511 -a plugin, which includes parallelism constructs and assignment of
   2.512 -tasks to cores.
   2.513 -
   2.514 -We focus in this introductory paper on the definition of the
   2.515 -abstraction and its support for parallelism constructs, postponing
   2.516 -exploration of assignment of tasks onto cores and other performance
   2.517 -optimizations to following papers. This paper establishes a definition
   2.518 -of multiple time-lines in a program, and a virtual time that
   2.519 -sequentializes events within those. It demonstrates three sets of
   2.520 -parallelism constructs: synchronous \texttt{send}-\texttt{receive}
   2.521 -motivated by process calculi; \texttt{spawn} and \texttt{sync} from
   2.522 -Cilk {\cite{Fri98,CILKHome}}; and \texttt{mutex} and \texttt{condition
   2.523 -  variable} from pthreads.  The assignment policy we implemented with
   2.524 -them is simply first-come first-served.
   2.525 -
   2.526 -We call the abstraction Virtualized Master-Slave, or VMS. It is
   2.527 -essentially a definition of virtualized time on parallel hardware. It
   2.528 -exports facilities, to a plugin, to create virtual processors and
   2.529 -control how their timelines relate to each other, and relate to
   2.530 -physical time. It also exports facilities, for a library to use, to
   2.531 -suspend a virtual processor and interact with the plugin. The plugin
   2.532 -embodies most of a language's runtime, while the library is the
   2.533 -application's gateway to that runtime.
   2.534 -
   2.535 -\paragraph*{Organization of paper.}
   2.536 -
   2.537 -Section~\ref{secAbsModel} provides the original concepts and
   2.538 -definitions of VMS. Section~\ref{secInternal} focuses on the
   2.539 -implementation, describing the elements and how they interact, then
   2.540 -relating them back to the theoretical definition.
   2.541 -Section~\ref{secApp} takes the point of view of the application code,
   2.542 -studying the usage and implementation of parallel language constructs
   2.543 -as a VMS plugin. To wrap up, measurements of effectiveness appear in
   2.544 -Section~\ref{secResults} and conclusions in
   2.545 -Section~\ref{secConclusion}.
   2.546 -
   2.547 -\section{Background and Related Work}
   2.548 -
   2.549 -All synchronization constructs establish an ordering between program timelines.  We formalize this with the notion of a tie-point and provide a hardware-neutral interface for establishing tie-points. To build synchronization constructs such as locks, condition variables, send-receive, spawn-sync, and so on, we couple the tie-point interface with a virtual timeline that establishes a global ordering of events from software timelines. The combination simplifies implementing synchronization primitives by allowing sequential algorithms to be used, without a performance penalty.  
   2.550 -
   2.551 -This approach has the benefit of separating the semantics of synchronization from the mapping of software timelines onto physical-core timelines.  The hardware used to perform the mapping is hidden below the interface, and controlled by the runtime by invoking the interface. This improves portability, allows reuse, and shifts  runtimes down into the OS kernel level, providing performance benefits.
   2.552 -The language runtime gains secure access to all kernel level hardware mechanisms, and hardware resources are managed by dialog between runtime and interface, potentially alleviating a major challenge in parallel OS architecture.  The semantics of the tie-point interface makes such a dialog practical, portable, and efficient, as opposed to current issues caused by the semantics of lock hardware such as TAS and CAS instructions.
   2.553 -
   2.554 -In this paper, we introduce the approach with a user-level implementation, deferring Linux integration and hardware support to future papers.
   2.555 -We focus on the formal model of tie-points, details of our proof-of-concept implementation,
   2.556 -and connecting these to application code.
   2.557 -
   2.558 -
   2.559 -  
   2.560 -  We demonstrate
   2.561 -  the low overhead of the interface and quickness to implement a runtime on three sets
   2.562 -  of parallelism constructs: rendez-vous style \texttt{send} and
   2.563 -  \texttt{receive}; Cilk style \texttt{spawn} and \texttt{sync}, which
   2.564 -  have similar performance to Cilk 5.4; and \texttt{mutex} and
   2.565 -  \texttt{condition variable} constructs from pthreads, which have 80x
   2.566 -  lower overhead than Linux thread operations.  Development time
   2.567 -  averaged just over one day per set as opposed to weeks for equivalent lock-based implementations.
   2.568 -
   2.569 -
   2.570 -
   2.571 -User-level thread packages and most parallel language runtimes have to
   2.572 -side-step OS threads, by pinning one to each core, which effectively
   2.573 -gives the user-level package control over the core. Our VMS
   2.574 -implementation also does this. We are not claiming in this paper to
   2.575 -have the OS level implementation of VMS that is possible -- but just
   2.576 -the user-space version.
   2.577 -
   2.578 -\paragraph*{Related work.}
   2.579 -
   2.580 -The most closely related work is Scheduler Activations
   2.581 -\cite{SchedActivations}, which also allows modifying concurrency
   2.582 -constructs and controlling assignment of virtual processors onto
   2.583 -cores. However it has no virtual time to guarantee globally consistent
   2.584 -sequentialization, and no interface for plugins.
   2.585 -
   2.586 -BOM \cite{BOMinManticore}, which is used in Manticore to express
   2.587 -scheduling policies and synchronization, also bears resemblances to
   2.588 -VMS, but at a higher level of abstraction. BOM is a functional
   2.589 -language, rather than a primitive abstraction meant to sit at the
   2.590 -hardware-software boundary as VMS is.
   2.591 -
   2.592 -Coroutines are a high-performance means of switching between
   2.593 -tasks. Coroutine scheduling and stack handling techniques were well
   2.594 -suited to the user-space implementation of VMS.
   2.595 -
   2.596 -Other related work either provides an abstraction of the thread model,
   2.597 -or is a full language with specific parallelism constructs. As a
   2.598 -prototypic example of user-level threads, Cilk {\cite{Fri98,CILKHome}}
   2.599 -provides a simplified abstraction with an efficient scheduling and
   2.600 -load balancing algorithm, but limited to fork-join concurrency. OpenMP
   2.601 -{\cite{OpenMPHome}} is a typical example of a parallel extension of
   2.602 -sequential languages; it allows creating tasks and controlling their
   2.603 -execution order. We claim that both Cilk and OpenMP, as well as most
   2.604 -thread abstractions or parallel languages may be implemented via
   2.605 -plugins to VMS, with similar performance.
   2.606 -
   2.607 -In contrast to thread abstractions and parallel programming languages,
   2.608 -VMS doesn't impose its own low-level concurrency semantics as a
   2.609 -programming model, but rather takes preferred ones as plugins. This
   2.610 -makes it not a language itself, but a \emph{support} mechanism to
   2.611 -implement language runtimes. Parallelism constructs may be implemented
   2.612 -as VMS plugins, easily, quickly, and with high performance as
   2.613 -indicated in Section~\ref{secResults}.
   2.614 -
   2.615 -This work presents a first incarnation and evaluation of VMS. We plan
   2.616 -to explore the embedding into VMS of a variety of parallel languages,
   2.617 -with a special interest for coordination languages
   2.618 -\cite{Gelernter85Linda,CnCInHotPar,CnCHome}. We will also explore
   2.619 -VMS's compatibility with different concurrent semantics
   2.620 -\cite{Kah74,hoare78,milner99,Hewitt10,Actors97}. One particularly
   2.621 -important application would be to use VMS to facilitate the design and
   2.622 -implementation of the emerging hybrid programming models, such as
   2.623 -MPI+OpenMP, or OpenMP+OpenCL \cite{Car10,OpenCLHome}.
   2.624 -
   2.625 -\paragraph*{Virtual Processor (VP).}
   2.626 -
   2.627 -We want to avoid the confusion associated with the various
   2.628 -interpretations of the terms ``thread'' and ``task'', so we will use
   2.629 -the term \emph{Virtual Processor} (VP), which we define as state in
   2.630 -combination with the ability to animate code or an additional level of
   2.631 -virtual processors. The state consists of a program counter, a stack
   2.632 -with its contents, a pointer to top of stack, and a pointer to the
   2.633 -current stack frame.
   2.634 -
   2.635 -\section{Abstract Definition of VMS}
   2.636 -\label{secAbsModel}
   2.637 -
   2.638 -We start with an intuitive overview, then make the definitions and
   2.639 -properties precise in the following sub-sections.
   2.640 -
   2.641 -\paragraph*{Intuitive Overview.}
   2.642 -
   2.643 -VMS is concerned primarily with time and guarantees about it. This is
   2.644 -because parallelism constructs control how the time-lines of different
   2.645 -virtual processors intersect. They also guarantee relations of time
   2.646 -lines to hardware events.
   2.647 -
   2.648 -As an example, consider a program that writes into a data structure in
   2.649 -one time-line, then calls a \texttt{send} construct, meanwhile in a
   2.650 -different time-line it calls the \texttt{receive} construct then reads
   2.651 -the data structure. The constructs should guarantee that all data
   2.652 -written before the \texttt{send} is readable in the other time-line
   2.653 -after the \texttt{receive}.
   2.654 -
   2.655 -%% AC. Too early.
   2.656 -% VMS provides a primitive guarantee that
   2.657 -% plugin code builds upon to provide such higher-level guarantees. The
   2.658 -% primitive guarantee is defined in Section~\ref{secTime}, after the
   2.659 -% necessary vocabulary is established.
   2.660 -
   2.661 -To support parallelism constructs, VMS provides: primitive operations
   2.662 -to create and suspend VPs; a way for plugged-in code to control when
   2.663 -each VP is (re)started; and time-related guarantees. These are
   2.664 -enforced on all hardware, be it shared memory or distributed, with
   2.665 -strong memory consistency or weak.
   2.666 -
   2.667 -\paragraph*{Definition in three parts.}
   2.668 -
   2.669 -The definition we give is for VMS \emph{with plugins present}.  Hence,
   2.670 -the definition includes the behavior of any parallelism construct
   2.671 -implementable with VMS.  We give the abstract definition in three
   2.672 -parts: a definition of the elements of a VMS computation system; a
   2.673 -definition of time and the key VMS guarantee; and a definition of
   2.674 -virtual processor scheduling states and transitions between them.
   2.675 -
   2.676 -%% AC. Too early, Master has not been defined. I renamed VMS-core into
   2.677 -%% VMS afterwards, finding no ambiguity.
   2.678 -% The Master mentioned in the definition is an abstract entity, with a
   2.679 -% plugin present. In practice, this Master entity is implemented as part
   2.680 -% of a core VMS, and plugins later added. This VMS-core is the hardware
   2.681 -% abstraction. It hides the physical hardware behind an interface that
   2.682 -% creates virtual processors and enforces well-defined time-behavior.
   2.683 -
   2.684 -\subsection{The Elements of a VMS Computation System}
   2.685 -
   2.686 -\begin{itemize}
   2.687 -\item A VMS program has multiple VPs, which are Slaves, each with an
   2.688 -  independent time-line.
   2.689 -\item A schedule of Slaves is generated by a Master entity, from
   2.690 -  within separate time-line(s).
   2.691 -\item A schedule is defined as the set of points at which VPs are
   2.692 -  (re)animated.
   2.693 -\item All semantic parallelism behavior is invoked via communication
   2.694 -  with the Master.
   2.695 -\item Communication with the Master happens by using a VMS primitive,
   2.696 -  which causes \emph{voluntary} suspension of the program's VP.
   2.697 -\end{itemize}
   2.698 -
   2.699 -What is important here is that the choice of which VP is animated, at
   2.700 -which point, happens in a separate time-line; and that the VPs suspend
   2.701 -voluntarily for each parallelism construct. This means that
   2.702 -\emph{scheduling is separated from the program code}, the key point.
   2.703 -
   2.704 -The Master entity appears to be a single entity to the slaves, but may
   2.705 -be implemented by multiple Master VPs hidden inside the VMS
   2.706 -implementation.
   2.707 -
   2.708 -VPs use the Master as an intermediary to: semantically communicate
   2.709 -with each other; cause creation of new program VPs; and to influence
   2.710 -re-animation of VPs. As a subtlety, notice that hardware mechanisms,
   2.711 -such as coherent shared memory, allow communication to take place that
   2.712 -is not visible to the parallelism constructs. This is not allowed with
   2.713 -VMS: parallelism constructs of the language, and through them VMS
   2.714 -primitives, must be separately called in order to make the use of
   2.715 -shared-variable communication safe.
   2.716 -
   2.717 -%% AC. Calling memory a processing element only adds confusion.
   2.718 -% {\bf Definitions:\ }VMS is intended only for hardware systems that
   2.719 -% consist of processing elements connected by communication.  We define
   2.720 -% a memory-space to be a processing element, albeit without the ability
   2.721 -% to transform data.
   2.722 -
   2.723 -We define a \emph{physical core} to be a processing element executing
   2.724 -a sequential stream of instructions.
   2.725 -
   2.726 -We define a program-time as the sequence of instructions animated by a
   2.727 -Slave VP, which is eventually animated by a physical core. A Slave VP
   2.728 -has associated \emph{scheduling state} that, among other things,
   2.729 -relates to how its program-time progresses relative to physical time
   2.730 -on the cores.
   2.731 -
   2.732 -\subsection{Time in VMS}
   2.733 -\label{secTime}
   2.734 -
   2.735 -VMS has three levels of time: \emph{Program time}, \emph{Master time},
   2.736 -and \emph{Virtual time}.
   2.737 -
   2.738 -\begin{itemize}
   2.739 -\item Program time is local to a Slave VP, measured in instruction
   2.740 -  executions.
   2.741 -\item Master time is hidden from the program and is independent from
   2.742 -  all Program times.
   2.743 -\item Virtual time is the ordered set of changes in scheduling state
   2.744 -  of Slave VPs.
   2.745 -\end{itemize}
   2.746 -
   2.747 -What is most important here is that Virtual time defines a global
   2.748 -sequential ordering. This ordering is consistent with the key VMS
   2.749 -guarantee (given below), and each point in it is computed within
   2.750 -Master time.
   2.751 -
   2.752 -Also, the independence between program times and master time has
   2.753 -subtle advantages. It enables elegant enforcement of the VMS
   2.754 -guarantee, and implementation simplifications that become clear after
   2.755 -gaining deep implementation knowledge.
   2.756 -
   2.757 -In VMS, each event relevant to parallel semantics is tied to a
   2.758 -transition of the state of a Slave VP. This means that implementing
   2.759 -the behavior of parallel semantics is equivalent to controlling the
   2.760 -order of transitions of state of virtual processors.
   2.761 -
   2.762 -\paragraph*{Definition.}
   2.763 -
   2.764 -The stream of instructions in a given program-time is broken into a
   2.765 -number of \emph{trace-segments}, separated by suspend points. Each
   2.766 -trace-segment is animated by a single physical core, but not
   2.767 -necessarily the same core as animated the other trace segments. A
   2.768 -suspend point is created by a Slave VP executing the ``suspend''
   2.769 -primitive provided by VMS. A suspend point has no duration in program
   2.770 -time, but has distinct start and end points in virtual time. The
   2.771 -end-suspension points of two different program times can be tied
   2.772 -together within virtual time, which is called a \emph{tie point} and
   2.773 -has special significance to parallel constructs. The physical time of
   2.774 -a core has no relationship to any program time, except for the various
   2.775 -time-guarantees in this definition of VMS.
   2.776 -
   2.777 -\begin{figure}[ht]
   2.778 - \includegraphics[width=5in]{../figures/Time_in_VMS_1.eps}
   2.779 - \caption
   2.780 - {Mapping program time onto Virtual time. \ The
   2.781 -  Master controls creation of new program time lines, and ending suspend
   2.782 -  points. Here, it has ended two suspend points at a common tie-point.
   2.783 -  }
   2.784 -\label{figTimeMapping}
   2.785 -\end{figure}
   2.786 -
   2.787 -\paragraph*{Relating time-lines to each other.}
   2.788 -
   2.789 -Figure \ref{figTimeMapping} illustrates how trace-segments relate to
   2.790 -suspend points, and map onto virtual time. A trace segment starts in
   2.791 -virtual time where suspend is ended, as seen.  In fact, the two trace
   2.792 -segments shown have a common start-point within virtual time. This is
   2.793 -because the parallelism semantics chose to start them at the same
   2.794 -point -- this is what a tie point is.  A key note is that the lengths
   2.795 -in virtual time have no relation to the lengths in program-time. The
   2.796 -only defined feature is that those two trace-segments have a common
   2.797 -start-point in virtual time. This means that the two suspend points
   2.798 -are considered to be tied together.
   2.799 -
   2.800 -\paragraph*{The key VMS guarantee.}
   2.801 -
   2.802 -Being tied together means that all physical events that can be
   2.803 -observed by both program-times are covered by the key VMS guarantee:
   2.804 -any events triggered before the common suspend point in one program
   2.805 -time are guaranteed visible in the other program time after the common
   2.806 -suspend point. They {\em{might}} be visible before, but it's not
   2.807 -guaranteed. In addition, events triggered after the common suspend
   2.808 -point in one are guaranteed not visible before the common suspend
   2.809 -point in the other. This two-part guarantee is a fundamental design
   2.810 -property of VMS.
   2.811 -
   2.812 -Intuitively, a tie-point separates before it from after such that tied
   2.813 -program times agree (illustrated with code in Section \ref{secApp}
   2.814 -Figure \ref{figAnimVP}). But the subtlety is that events triggered
   2.815 -before the tie-point, {\em{might}} be visible to the other before, and
   2.816 -ones triggered after {\em{might not}} be visible to the other after --
   2.817 -physical events triggered before are only guaranteed visible
   2.818 -{\em{after}} the tie point, and events after are only guaranteed
   2.819 -{\em{not}} visible {\em{before}} the tie point.
   2.820 -
   2.821 -This is a form of bounded non-determinism. The pattern of suspension
   2.822 -end-points determines which trace-segments overlap in Virtual time,
   2.823 -and events triggered in one might be visible in overlapped ones. But
   2.824 -no guarantees cover these. If one segment tries to observe, it will
   2.825 -see events triggered by overlapped segments in non-deterministic
   2.826 -order.
   2.827 -
   2.828 -The VMS implementation defines which physical events are covered by
   2.829 -the key VMS guarantee (reads/writes, network communication, DMA, I/O).
   2.830 -
   2.831 -\paragraph*{Globally consistent sequential order.}
   2.832 -
   2.833 -VMS maps suspend-start, suspend-end, and hence tie-points, to a
   2.834 -globally-consistent sequential order in Virtual time. This enables one
   2.835 -of VMS's key benefits: sequential algorithms for parallel constructs.
   2.836 -
   2.837 -Tie points define parallel behavior, so an implementation of how to
   2.838 -choose tie points equals an implementation of parallel constructs. The
   2.839 -Master chooses tie-points, so plugging code to choose tie-points into
   2.840 -the Master equals plugging in parallel constructs.
   2.841 -
   2.842 -\subsection{Scheduling State}
   2.843 -
   2.844 -Scheduling state is used in VMS to organize internal activity, for
   2.845 -enforcing the guarantees.
   2.846 -
   2.847 -\begin{itemize}
   2.848 -\item VPs have three scheduling states: {\em{Animated}},
   2.849 -  {\em{Blocked}}, {\em{Ready}}; see Figure~\ref{figStates}.
   2.850 -\item VPs in Animated are {\em{allowed}} to advance Program time with
   2.851 -  (core-local) physical time.
   2.852 -\item VPs in Blocked and Ready do not advance their Program time.
   2.853 -\item Animated has two physical states: {\em{Progressing}} and
   2.854 -  {\em{Stalled}}.
   2.855 -\item VPs in Progressing advance Program time with (core-local)
   2.856 -  physical time; those in Stalled do not (allowing non-semantic
   2.857 -  suspend).
   2.858 -\item Scheduling states are defined in Virtual time only.
   2.859 -\item Progressing and Stalled are defined in (core-local) physical
   2.860 -  time only; the distinction is invisible in Virtual time.
   2.861 -\end{itemize}
   2.862 -
   2.863 -\begin{figure}[h!tb]
   2.864 -\begin{minipage}{.55\textwidth}
   2.865 -  \includegraphics[width=\textwidth]{../figures/Scheduling_states_2.eps}
   2.866 -  \caption{Scheduling states of a slave VP in the VMS model.}
   2.867 -  \label{figStates}
   2.868 -\end{minipage}
   2.869 -\hfill
   2.870 -\begin{minipage}{.43\textwidth}
   2.871 -  \includegraphics[width=\textwidth]{../figures/VMS-core__plugins.eps}
   2.872 -  \caption
   2.873 - {
   2.874 -  The Master, split into a generic core and a language-specific plugin.
   2.875 -  The core encapsulates the hardware and remains the same across applications.
   2.876 -  The plugin is part of the parallelism-construct implementation.
   2.877 - }
   2.878 - \label{figMasterSplit}
   2.879 -\end{minipage}
   2.880 -\end{figure}
   2.881 -
   2.882 -
   2.883 -Some important points: (1) only VPs Animated can trigger physical
   2.884 -events that are seen in other program time-lines; (2) the distinction
   2.885 -between Blocked vs Stalled is that a VP has to explicitly execute a
   2.886 -VMS primitive operation to enter Blocked, making it part of the
   2.887 -semantics of parallelism constructs. In contrast, Stalled happens
   2.888 -invisibly, with no effect on semantic behavior. It is due to hardware
   2.889 -events hidden inside VMS, such as interrupts.
   2.890 -
   2.891 -The Ready state is used to separate the parallelism-construct behavior
   2.892 -from the scheduling behavior. It acts as a ``staging area'' for
   2.893 -scheduling. VPs placed into this state are {\em{allowed}} to be
   2.894 -animated, then the scheduler decides when and where.
   2.895 -
   2.896 -An essential and illustrative point is that actions {\em{outside}} a
   2.897 -given Program time cause the VP to transition
   2.898 -Blocked$\rightarrow$Ready, which contrasts to lock algorithms where
   2.899 -the concurrency-related behavior takes place {\em{inside}} program
   2.900 -time.
   2.901 -
   2.902 -\paragraph*{Transition Between Slave Scheduling States.}
   2.903 -
   2.904 -\begin{itemize}
   2.905 -\item VPs transition states as shown in Figure \ref{figStates}.
   2.906 -\item Animated$\rightarrow$Blocked is caused by a Slave VP
   2.907 -  executing the Suspend VMS primitive.
   2.908 -\item Blocked$\rightarrow$Ready is determined by the semantics
   2.909 -  implemented in the plugin.
   2.910 -\item Ready$\rightarrow$Animated is determined by the scheduler in the
   2.911 -  plugin.
   2.912 -\item Transitions in scheduling state have a globally consistent order
   2.913 -  in Virtual time.
   2.914 -\end{itemize}
   2.915 -
   2.916 -The parallelism primitives executed by a program do not control change
   2.917 -in scheduling states. They merely communicate messages to the Master,
   2.918 -via a VMS supplied primitive. Inside the Master, the plugin's
   2.919 -parallelism construct implementation processes the message. Based on
   2.920 -that, it performs changes in state from Blocked$\rightarrow$Ready,
   2.921 -creates new VPs, and dissipates existing VPs.  Most communication from
   2.922 -Slave to Master requires the VP to suspend when it sends the
   2.923 -message. A few messages, like creating a new Slave, may be sent without
   2.924 -suspending.
   2.925 -
   2.926 -The suspend primitive decouples local physical time from Virtual time.
   2.927 -Execution causes immediate transition to Stalled in physical time,
   2.928 -then the Master performs Animated$\rightarrow$Blocked, fixing that
   2.929 -transition in Virtual time. The only relationship is causality. This
   2.930 -weak relation is what allows suspension-points to be serialized in
   2.931 -Virtual time, which in turn is what allows using sequential algorithms
   2.932 -to implement parallelism constructs.
   2.933 -
   2.934 -\subsection{Plugins}
   2.935 -
   2.936 -The Master entity has two parts, a generic core part and a plugin
   2.937 -(Figure \ref{figMasterSplit}).  The core part of the Master is
   2.938 -implemented as part of VMS itself. The plugin supplies two functions:
   2.939 -the communication-handler and the scheduler, both having a standard
   2.940 -prototype. The communication-handler implements the parallelism
   2.941 -constructs, while the scheduler assigns VPs to cores.
   2.942 -
   2.943 -An \emph{instance} of a plugin is created as part of initializing an
   2.944 -application, and the instance holds the semantic and scheduling state
   2.945 -for that run of the application. This state, combined with the virtual
   2.946 -processor states of the slaves created during that application run,
   2.947 -represents progress of the work of the application.  For example,
   2.948 -multi-tasking is performed simply by the Master switching among
   2.949 -plugin instances when it has a resource to offer to a scheduler. The
   2.950 -parallelism-semantic state holds all information needed to resume
   2.951 -(hardware state, such as TLB and cache-tags is inside VMS).
   2.952 -
   2.953 -\section{Internal Workings of Our Implementation}
   2.954 -\label{secInternal}
   2.955 -
   2.956 -We name the elements of our example implementation and describe their
   2.957 -logical function, then relate them to the abstract model. We then step
   2.958 -through the operation of the elements.
   2.959 -
   2.960 -\paragraph*{Elements and their logical function.}
   2.961 -
   2.962 -As illustrated in Figure~\ref{figInternals}, our VMS implementation is
   2.963 -organized around physical cores.  Each core has its own {\em{master
   2.964 -    virtual-processor}}, \texttt{masterVP}, and a {\em{physical-core
   2.965 -    controller}}, which communicate via a set of scheduling slots,
   2.966 -\texttt{schedSlot}. The Master in the abstract definition is
   2.967 -implemented by the multiple \texttt{masterVP}s plus a particular
   2.968 -plugin instance with its shared parallelism-semantic state (seen at
   2.969 -the top).
   2.970 -
   2.971 -On a given core, only one of: the core-controller, \texttt{masterVP},
   2.972 -or a slave VP, is animated at any point in local physical time. Each
   2.973 -\texttt{masterVP} animates the same function, called
   2.974 -\texttt{master\_loop}, and each slave VP animates a function from the
   2.975 -application, starting with the top-level function the slave is created
   2.976 -with, and following its call sequence. The core controller is
   2.977 -implemented here as a Linux pthread that runs the \texttt{core\_loop}
   2.978 -function.
   2.979 -
   2.980 -Switching between VPs is done by executing a VMS primitive that
   2.981 -suspends the VP. This switches the physical core over to the
   2.982 -controller, by jumping to the start of the \texttt{core\_loop}
   2.983 -function, which chooses the next VP and switches to that (switching is
   2.984 -detailed in Section \ref{secApp} Figure \ref{figAssembly}).
   2.985 -
   2.986 -\paragraph*{Relation to abstract model.}
   2.987 -
   2.988 -We chose to implement the Master entity of the model by a set of
   2.989 -\texttt{masterVP}s, plus plugin functions and shared
   2.990 -parallelism-semantic state. VMS consists of this implementation of the
   2.991 -Master, plus the core-controllers, plus the VMS primitive libraries,
   2.992 -for creating new VPs and dissipating existing VPs, suspending VPs, and
   2.993 -communicating from slave VP to Master. In Figure~\ref{figInternals},
   2.994 -everything in green is part of VMS, while the plugin is in red, and
   2.995 -application code appears as blue, inside the slave VP.
   2.996 -
   2.997 -Virtual time in the model is implemented via a combination of four
   2.998 -things: a \texttt{masterLock} (not shown) that guarantees non-overlap
   2.999 -of \texttt{masterVP} trace-segments; the \texttt{master\_loop} which
  2.1000 -performs transition Animated$\rightarrow$Blocked; the
  2.1001 -\texttt{comm\_handler\_fn} which performs Blocked$\rightarrow$Ready;
  2.1002 -and the \texttt{scheduler\_fn} which performs
  2.1003 -Ready$\rightarrow$Animated. \ Each state transition is one step of
  2.1004 -Virtual time; is guaranteed sequential by the non-overlap of
  2.1005 -\texttt{masterVP} trace segments; and is global due to being in
  2.1006 -parallelism-semantic state that is shared (top of
  2.1007 -Figure~\ref{figInternals}).
  2.1008 -
  2.1009 -Transitions Progressing$\rightleftarrows$Stalled within the Animated
  2.1010 -state are invisible to the parallelism semantics, the Master, and
  2.1011 -Virtual time, and so have no effect on the elements seen.
  2.1012 -
  2.1013 -\begin{figure*}[h!tb]
  2.1014 -  \centerline{\includegraphics[width=5in]{../figures/VMS-core__internal_workings.eps}}
  2.1015 -  \caption
  2.1016 -  { Internal elements of our example VMS implementation
  2.1017 -  }
  2.1018 -  \label{figInternals}
  2.1019 -\end{figure*}
  2.1020 -
  2.1021 -\paragraph*{Steps of operation.}
  2.1022 -
  2.1023 -The steps of operation are numbered, in Figure \ref{figInternals}.
  2.1024 -Taking them in order:
  2.1025 -
  2.1026 -\begin{enumerate}
  2.1027 -\item \texttt{master\_loop} scans the scheduling slots to see which
  2.1028 -  ones' slaves have suspended since the previous scan.
  2.1029 -\item It hands these to the \texttt{comm\_handler\_fn} plugged in
  2.1030 -(which equals transition Animated$\rightarrow$Blocked).
  2.1031 -\item The VP has a request attached, and data in it causes the
  2.1032 -  \texttt{comm\_handler\_fn} to manipulate data structures in the
  2.1033 -  shared parallelism-semantic state.  These structures hold all the
  2.1034 -  slaves in the blocked state (code-level detail and example will
  2.1035 -  come in Figure~\ref{figReqHdlr}, Section~\ref{secApp}).
  2.1036 -\item Some requests cause slaves to be moved to a \texttt{readyQ} on
  2.1037 -  one of the cores (Blocked$\rightarrow$Ready). Which core's
  2.1038 -  \texttt{readyQ} receives the slave is under plugin control,
  2.1039 -  determined by a combination of request contents, semantic state and
  2.1040 -  physical machine state.
  2.1041 -\item During the scan, the \texttt{master\_loop} also looks for empty
  2.1042 -  slots, and for each calls the \texttt{scheduler\_fn} plugged in. It
  2.1043 -  chooses a slave from the \texttt{readyQ} on the core animating
  2.1044 -  \texttt{master\_loop}.
  2.1045 -\item The \texttt{master\_loop} then places the slave VP's pointer
  2.1046 -  into the scheduling slot (Ready$\rightarrow$Animated), making it
  2.1047 -  available to the \texttt{core\_loop}.
  2.1048 -\item When done with the scan, \texttt{masterVP} suspends, switching
  2.1049 -  animation back to the \texttt{core\_loop}.
  2.1050 -\item \texttt{core\_loop} takes slave VPs out of the slots.
  2.1051 -\item Then \texttt{core\_loop} switches animation to these slave VPs.
  2.1052 -\item When a slave self-suspends, animation returns to the
  2.1053 -  \texttt{core\_loop} (detail in code in Figure~\ref{figAssembly}), which picks
  2.1054 -  another.
  2.1055 -\item Until all slots are empty and the \texttt{core\_loop} switches
  2.1056 -  animation to the \texttt{masterVP}.
  2.1057 -\end{enumerate}
  2.1058 -
  2.1059 -\paragraph*{Enabling sequential implementation of parallelism semantics.}
  2.1060 -
  2.1061 -All these steps happen on each core separately, but
  2.1062 -%% AC. Cannot be both an essential property of VMS and "un this particular..."
  2.1063 -% in this particular implementation
  2.1064 -we use a central \texttt{masterLock} to ensure that
  2.1065 -only one core's \texttt{masterVP} can be active at any time.  This
  2.1066 -guarantees non-overlap of trace-segments from different
  2.1067 -\texttt{masterVP}s, allowing the plugins to use sequential algorithms,
  2.1068 -without a performance penalty, as verified in
  2.1069 -Section~\ref{secResults}.
  2.1070 -
  2.1071 -Relating this to the abstract model: the parallelism-semantic behavior
  2.1072 -of the Master is implemented by the communication handler, in the
  2.1073 -plugin. It thus runs in the Master time referred to, in the model, in
  2.1074 -Section \ref{secAbsModel}. Requests are sent to the Master by
  2.1075 -self-suspension of the slaves, but sit idle until the other slaves in
  2.1076 -the scheduling slots have also run. This is the passive behavior of
  2.1077 -requests that was noted in Section~\ref{secAbsModel}, which allows the
  2.1078 -\texttt{masterVP}s to remain suspended until needed.  This in turn
  2.1079 -enables the \texttt{masterVP}s from different cores to be
  2.1080 -non-overlapped. It is the non-overlap that enables the algorithms for
  2.1081 -the parallelism semantics to be sequential.
  2.1082 -
  2.1083 -\section{Code Level View}
  2.1084 -\label{secApp}
  2.1085 -
  2.1086 -To relate the abstract model and the internal elements to application
  2.1087 -code and parallelism-library code, we give code snippets that
  2.1088 -illustrate key features.  We start with the application then work down
  2.1089 -through the sequence of calls, to the plugin, using our SSR
  2.1090 -{\cite{VMSHome}} parallelism-library as an example.
  2.1091 -
  2.1092 -In general, applications are either written in terms of a parallel
  2.1093 -language that has its own syntax, or a base language with a
  2.1094 -parallelism library, which is often called an {\em{embedded
  2.1095 -    language}}.  Our demonstrators, VCilk {\cite{VMSHome}}, Vthread,
  2.1096 -and SSR, are all parallelism libraries. A parallel language would
  2.1097 -follow the standard practice of performing source-to-source transform,
  2.1098 -from custom syntax into C plus parallelism-library calls.
  2.1099 -
  2.1100 -\paragraph*{SSR.}
  2.1101 -
  2.1102 -SSR stands for Synchronous Send-Receive, and details of its calls and
  2.1103 -internal implementation will be given throughout this section. It has
  2.1104 -two types of construct. The first, called {\em{from-to}} has two
  2.1105 -calls: \texttt{SSR\_\_send\_from\_to} and
  2.1106 -\texttt{SSR\_\_receive\_from\_to}, both of which specify the sending VP
  2.1107 -as well as the receiving VP. \ The other, called {\em{of-type}} also
  2.1108 -has two calls: \texttt{SSR\_\_send\_of\_type\_to} and
  2.1109 -\texttt{SSR\_\_receive\_of\_type}, which allow a receiver to accept
  2.1110 -from anonymous senders, but select according to type of message.
  2.1111 -
  2.1112 -% An example of a double column floating figure using two subfigures.
  2.1113 -% (The subfig.sty package must be loaded for this to work.)
  2.1114 -% The subfigure \label commands are set within each subfloat command, the
  2.1115 -% \label for the overall figure must come after \caption.
  2.1116 -% \hfil must be used as a separator to get equal spacing.
  2.1117 -% The subfigure.sty package works much the same way, except \subfigure is
  2.1118 -% used instead of \subfloat.
  2.1119 -%
  2.1120 -%\begin{figure*}[!t]
  2.1121 -%\centerline{\subfloat[Case I]\includegraphics[width=2.5in]{subfigcase1}%
  2.1122 -%\label{fig_first_case}}
  2.1123 -%\hfil
  2.1124 -%\subfloat[Case II]{\includegraphics[width=2.5in]{subfigcase2}%
  2.1125 -%\label{fig_second_case}}}
  2.1126 -%\caption{Simulation results}
  2.1127 -%\label{fig_sim}
  2.1128 -%\end{figure*}
  2.1129 -%
  2.1130 -% Note that often IEEE papers with subfigures do not employ subfigure
  2.1131 -% captions (using the optional argument to \subfloat), but instead will
  2.1132 -% reference/describe all of them (a), (b), etc., within the main caption.
  2.1133 -
  2.1134 -\paragraph*{Application view.}
  2.1135 -
  2.1136 -Figure~\ref{figAnimVP} shows snippets of application code, which use
  2.1137 -the SSR parallelism library. The most important feature is that all
  2.1138 -calls take a pointer to the VP that is animating the call. This is
  2.1139 -seen at the top of the figure where slave VP creation takes a pointer
  2.1140 -to the VP asking for creation. Below that is the standard prototype
  2.1141 -for top level functions, showing that the function receives a pointer
  2.1142 -to the VP it is the top level function for.
  2.1143 -
  2.1144 -The pointer is placed on the stack by VMS when it creates the VP, and
  2.1145 -is the means by which the application comes into possession of the
  2.1146 -pointer. This animating VP is passed to all library calls made from
  2.1147 -there. For example, the bottom shows a pointer to the animating VP
  2.1148 -placed in the position of sender in the \texttt{send} construct call.
  2.1149 -Correspondingly, for the \texttt{receive} construct, the position of
  2.1150 -receiving VP is filled by the VP animating the call.
  2.1151 -
  2.1152 -\begin{figure}[h!tb]
  2.1153 -{\noindent
  2.1154 -{\footnotesize
  2.1155 -{\small Creating a new processor:}
  2.1156 -\begin{verbatim}
  2.1157 -newProcessor = SSR__create_procr( &top_VP_fn, paramsPtr, animatingVP );
  2.1158 -\end{verbatim}
  2.1159 -
  2.1160 -{\small prototype for the top level function:}
  2.1161 -\begin{verbatim}
  2.1162 -top_VP_fn( void *parameterStrucPtr, VirtProcr *animatingVP );
  2.1163 -\end{verbatim}
  2.1164 -
  2.1165 -{\small handing animating VP to parallelism constructs:}
  2.1166 -\begin{verbatim}
  2.1167 -SSR__send_from_to( messagePtr, animatingVP, receivingVP );
  2.1168 -messagePtr = SSR__receive_from_to( sendingVP, animatingVP );
  2.1169 -\end{verbatim}
  2.1170 -}
  2.1171 -}
  2.1172 -\caption
  2.1173 -{Application code snippets showing that all calls to the parallelism library
  2.1174 -take the VP animating that call as a parameter.
  2.1175 -}
  2.1176 -\label{figAnimVP}
  2.1177 -\end{figure}
  2.1178 -
  2.1179 -Relating these to the internal elements of our implementation, the
  2.1180 -\texttt{animatingVP} suspends inside each of these calls, passing a
  2.1181 -request (generated in the library) to one of the \texttt{masterVP}s.
  2.1182 -The \texttt{masterVP} then calls the \texttt{comm-handler} plugin, and
  2.1183 -so on, as described in Section \ref{secInternal}.
  2.1184 -
  2.1185 -For the \texttt{SSR\_\_create\_procr} call, the comm-handler in
  2.1186 -turn calls a VMS primitive to perform the creation.  The primitive
  2.1187 -places a pointer to the newly created VP onto its stack, so that when
  2.1188 -\texttt{top\_VP\_fn} is later animated, it sees the VP-pointer as a
  2.1189 -parameter passed to it. \ All application code is either such a
  2.1190 -top-level function, or has one at the root of the call-stack.
  2.1191 -
  2.1192 -The send and receive calls both suspend their animating VP. When both
  2.1193 -have been called, the communication handler pairs them up and resumes
  2.1194 -both. This ties time-lines together, invoking the VMS guarantee. Both
  2.1195 -application-functions know, because of the VMS guarantee
  2.1196 -(Section~\ref{secAbsModel}), that writes to shared variables made
  2.1197 -before the send call by the sender are visible to the receiver after
  2.1198 -the receive call. This is the programmer's view of tying together the
  2.1199 -local time-lines of two different VPs, as defined in
  2.1200 -Section~\ref{secAbsModel}.
  2.1201 -
  2.1202 -\paragraph*{Concurrency-library view.}
  2.1203 -
  2.1204 -A parallelism library function, in general, only creates a request,
  2.1205 -sends it, and returns, as seen below.  To send a request, it uses the
  2.1206 -combined request-and-suspend VMS primitive that attaches the request
  2.1207 -then suspends the VP.  The primitive requires the pointer to the VP,
  2.1208 -to attach the request and to suspend it.
  2.1209 -
  2.1210 -In Figure~\ref{figImplLib}, notice that the request's data is on the
  2.1211 -stack of the virtual processor that's animating the call, which is the
  2.1212 -\texttt{receiveVP}.  The \texttt{VMS\_\_send\_sem\_request} suspends
  2.1213 -this VP, which changes the physical core's stack pointer to a
  2.1214 -different stack. So the request data is guaranteed to remain
  2.1215 -undisturbed while the VP is suspended.
  2.1216 -
  2.1217 -Figure~\ref{figAssembly} shows the implementation of the VMS suspend
  2.1218 -primitive. As seen in Figure \ref{figInternals}, suspending the
  2.1219 -\texttt{receiveVP} involves switching to the \texttt{core\_loop}. In
  2.1220 -our implementation, this is done by switching to the stack of the
  2.1221 -pthread pinned to the physical core and then jumping to the
  2.1222 -start-point of \texttt{core\_loop}.
  2.1223 -
  2.1224 -This code uses standard techniques commonly employed in co-routine
  2.1225 -implementations. Tuning effort spent in \texttt{core\_loop} is
  2.1226 -inherited by all applications.
  2.1227 -
  2.1228 -\begin{figure}[h!tb]
  2.1229 -{\noindent
  2.1230 -{\footnotesize
  2.1231 -\begin{verbatim}
  2.1232 -void * SSR__receive_from_to( VirtProcr *sendVP, VirtProcr *receiveVP )
  2.1233 - { SSRSemReq  reqData;
  2.1234 -   reqData.receiveVP = receiveVP;
  2.1235 -   reqData.sendVP    = sendVP;
  2.1236 -   reqData.reqType   = receive_from_to;
  2.1237 -   VMS__send_sem_request( &reqData, receiveVP );
  2.1238 -   return receiveVP->dataReturnedFromRequest;
  2.1239 - }
  2.1240 -\end{verbatim}
  2.1241 -}
  2.1242 -}
  2.1243 -\caption{Implementation of SSR's receive\_from\_to library function.}
  2.1244 -\label{figImplLib}
  2.1245 -
  2.1246 -{\noindent
  2.1247 -{\footnotesize
  2.1248 -\begin{verbatim}
  2.1249 -VMS__suspend_procr( VirtProcr *animatingVP )
  2.1250 - { animatingVP->resumeInstrAddr = &&ResumePt; //GCC takes addr of label
  2.1251 -   animatingVP->schedSlotAssignedTo->isNewlySuspended = TRUE; //for master_loop to see
  2.1252 -   <assembly code stores current physical core's stack reg into animatingVP struct>
  2.1253 -   <assembly code loads stack reg with core_loop stackPtr, which was saved into animatingVP>
  2.1254 -   <assembly code jmps to core_loop start instr addr, which was also saved into animatingVP>
  2.1255 - ResumePt:
  2.1256 -   return;
  2.1257 - }
  2.1258 -\end{verbatim}
  2.1259 -}
  2.1260 -}
  2.1261 -  \caption
  2.1262 -{Implementation of VMS suspend processor.
  2.1263 -Re-animating the virtual processor reverses this sequence. \ It saves the
  2.1264 -\texttt{core\_loop}'s resume instr-addr and stack ptr into the VP structure,
  2.1265 -then loads the VP's stack ptr and jmps to its \texttt{resumeInstrAddr}.
  2.1266 -}
  2.1267 -\label{figAssembly}
  2.1268 -
  2.1269 -{\noindent
  2.1270 -{\footnotesize
  2.1271 -\begin{verbatim}
  2.1272 -handle_receive_from_to( VirtProcr *requestingVP, SSRSemReq *reqData, SSRSemEnv *semEnv )
  2.1273 - { commHashTbl = semEnv->communicatingVPHashTable;
  2.1274 -   key[0] = reqData->receiveVP;   key[1] = reqData->sendVP; //send uses same key
  2.1275 -   waitingReqData = lookup_and_remove( key, commHashTbl );  //get waiting request
  2.1276 -   if( waitingReqData != NULL )
  2.1277 -    { resume_virt_procr( waitingReqData->sendVP );
  2.1278 -      resume_virt_procr( waitingReqData->receiveVP );
  2.1279 -    }
  2.1280 -   else
  2.1281 -      insert( key, reqData, commHashTbl ); //receive is first to arrive, make it wait
  2.1282 - }
  2.1283 -\end{verbatim}
  2.1284 -}
  2.1285 -}
  2.1286 -  \caption
  2.1287 -{Pseudo-code of communication-handler for
  2.1288 -\texttt{receive\_from\_to} request type. The \texttt{semEnv} is a pointer
  2.1289 -to the shared parallel semantic state seen at the top of
  2.1290 -Figure~\ref{figInternals}.
  2.1291 -}
  2.1292 -\label{figReqHdlr}
  2.1293 -\end{figure}
  2.1294 -
  2.1295 -\paragraph*{Plugin view.}
  2.1296 -
  2.1297 -SSR's communication handler dispatches on the \texttt{reqType} field
  2.1298 -of the request data, as set by the \texttt{SSR\_\_receive\_from\_to}
  2.1299 -code. It calls the handler code in Figure \ref{figReqHdlr}. This
  2.1300 -constructs a hash-key, by concatenating the from-VP's pointer with the
  2.1301 -to-VP's pointer. Then it looks-up that key in the hash-table that SSR
  2.1302 -uses to match sends with receives, which is in the shared semantic
  2.1303 -state seen at the top of Figure \ref{figInternals} in
  2.1304 -Section~\ref{secInternal}.
  2.1305 -
  2.1306 -The most important feature in Figure~\ref{figReqHdlr} is that both
  2.1307 -send and receive will construct the same key, so will find the same
  2.1308 -hash entry.  Whichever request is handled first in Virtual time will
  2.1309 -see the hash entry empty, and save itself in that entry.  The second
  2.1310 -to arrive sees the waiting request and then resumes both VPs, by
  2.1311 -putting them into their \texttt{readyQ}s.
  2.1312 -
  2.1313 -Access to the shared hash table can be considered private, as in a
  2.1314 -sequential algorithm.  This is because our VMS-core implementation
  2.1315 -ensures that only one handler on one core is executing at a time.
  2.1316 -
  2.1317 -\section{Results}
  2.1318 -\label{secResults}
  2.1319 -
  2.1320 -We implemented blocked dense matrix multiplication with right
  2.1321 -sub-matrices copied to transposed form, and ran it on a 4-core
  2.1322 -Core2Quad 2.4GHz processor.
  2.1323 -
  2.1324 -\paragraph*{Implementation-time.}
  2.1325 -
  2.1326 -As shown in Table~\ref{tabPersonDaysLang}, time to implement the three
  2.1327 -parallel libraries averages 2 days each. As an example of
  2.1328 -productivity, adding nested transactions, parallel singleton, and
  2.1329 -atomic function-execution to SSR required a single afternoon, totaling
  2.1330 -less than 100 lines of C code.
  2.1331 -
  2.1332 -\paragraph*{Execution Performance.}
  2.1333 -
  2.1334 -Performance of VMS is seen in Table~\ref{tabOverheadCycles}. The code
  2.1335 -is not optimized, but rather written to be easy to understand and
  2.1336 -modify. The majority of the plugin time is lost to cache misses
  2.1337 -because the shared parallelism-semantic state moves between cores on a
  2.1338 -majority of accesses. Acquisition of the master lock is slow due to
  2.1339 -the hardware implementation of the CAS instruction.
  2.1340 -
  2.1341 -Existing techniques will likely improve performance, such as
  2.1342 -localizing semantic data to cores, splitting malloc across the cores,
  2.1343 -pre-allocating slabs that are recycled, and pre-fetching. However, in
  2.1344 -many cases, an overhead of several hundred nanoseconds per task is
  2.1345 -already low enough that applications gain no further benefit.
  2.1346 -
  2.1347 -\begin{table}[h!tb]
  2.1348 -\begin{minipage}{.35\textwidth}
  2.1349 -\begin{tabular}{|l|l|l|l|}
  2.1350 -  \cline{2-4}
  2.1351 -  \multicolumn{1}{l|}{} & SSR & Vthread & VCilk\\
  2.1352 -  \cline{2-4}
  2.1353 -  \noalign{\vskip2pt}
  2.1354 -  \hline
  2.1355 -  Design & 4 & 1 & 0.5\\
  2.1356 -  Code & 2 & 0.5 & 0.5\\
  2.1357 -  Test & 1 & 0.5 & 0.5\\
  2.1358 -  L.O.C. & 470 & 290 & 310\\
  2.1359 -  \hline
  2.1360 -\end{tabular}
  2.1361 -\caption
  2.1362 -{Person-days to design, code, and
  2.1363 -  test each parallelism library. L.O.C. is lines of (original) C code,
  2.1364 -  excluding libraries and comments.
  2.1365 -}
  2.1366 -\label{tabPersonDaysLang}
  2.1367 -\end{minipage}
  2.1368 -\hspace{-.05\textwidth}
  2.1369 -\begin{minipage}[t]{.7\textwidth}
  2.1370 -\hfill
  2.1371 -\begin{tabular}{|l|l|r|r|}
  2.1372 -\cline{3-4}
  2.1373 -\multicolumn{2}{l|}{} & comp only & comp + mem\\
  2.1374 -\cline{3-4}
  2.1375 -\noalign{\vskip2pt}
  2.1376 -\hline
  2.1377 -VMS Only & \texttt{master\_loop} &  91 &  110\\
  2.1378 -& switch VPs  & 77 &  130\\
  2.1379 -& (malloc) & 160 & 2300\\
  2.1380 -& (create VP)  & 540 & 3800\\
  2.1381 -\hline
  2.1382 -Language: &  &  & \\
  2.1383 -\quad SSR & plugin -- concur & 190 & 540\\
  2.1384 -& plugin -- all & 530 & 2200\\
  2.1385 -& lock &  &  250\\
  2.1386 -\quad Vthread & plugin -- concur & 66 & 710\\
  2.1387 -& plugin -- all & 180 & 1500\\
  2.1388 -& lock &  &  250\\
  2.1389 -\quad VCilk & plugin -- concur & 65 & 260\\
  2.1390 -& plugin -- all & 330 & 1800\\
  2.1391 -& lock &  &  250\\
  2.1392 -\hline
  2.1393 -\end{tabular}
  2.1394 -\caption
  2.1395 -{Cycles of overhead, per scheduled
  2.1396 -    slave. ``comp only'' is perfect memory, ``comp + mem'' is actual cycles.
  2.1397 -    ``Plugin-concur'' only concurrency requests, ``plugin-all'' includes
  2.1398 -    create and malloc requests. Two significant digits due to variability.
  2.1399 -}
  2.1400 -\label{tabOverheadCycles}
  2.1401 -\end{minipage}
  2.1402 -
  2.1403 -\vskip-1.5cm
  2.1404 -\begin{tabular}{|l|lr|}
  2.1405 -\hline
  2.1406 -    Matrix size&Lang.&sec.\\
  2.1407 -[2pt]\hline
  2.1408 -    $81\times81$ & Cilk & 0.017\\
  2.1409 -    & VCilk & 0.008\\
  2.1410 -\hline
  2.1411 -    $324\times324$ & Cilk & 0.13\\
  2.1412 -    & VCilk & 0.13\\
  2.1413 -\hline
  2.1414 -    $648\times648$ & Cilk & 0.71\\
  2.1415 -    & VCilk & 0.85\\
  2.1416 -\hline
  2.1417 -    $1296\times1296$ & Cilk & 4.8\\
  2.1418 -    & VCilk & 6.2 \\
  2.1419 -[2pt]\hline
  2.1420 -\end{tabular}
  2.1421 -\hfill
  2.1422 -\begin{tabular}{|l|r|r|r|r|}
  2.1423 -\multicolumn{5}{l}{}\\
  2.1424 -\multicolumn{5}{l}{}\\
  2.1425 -\multicolumn{5}{l}{}\\
  2.1426 -\hline
  2.1427 -    operation & \multicolumn{2}{c|}{Vthread} & pthread & ratio\\
  2.1428 -    & comp only & total &  & \\
  2.1429 -[2pt]\hline
  2.1430 -    mutex\_lock & 85 & 1050 & 50,000 & 48:1\\
  2.1431 -    mutex\_unlock & 85 & 610 & 45,000 & 74:1\\
  2.1432 -    cond\_wait  & 85 & 850 & 60,000 & 71:1\\
  2.1433 -    cond\_signal & 90 & 650 & 60,000 & 92:1\\
  2.1434 -\hline
  2.1435 -\end{tabular}
  2.1436 -\caption
  2.1437 -{On left, exe time in seconds for MM. To the right, overhead for
  2.1438 -  pthread vs. Vthread. First column is cycles
  2.1439 -  for perfect memory and second is total measured cycles. pthread cycles are
  2.1440 -  deduced from round-trip experiments.
  2.1441 -}
  2.1442 -\label{tabHeadToHead}
  2.1443 -\end{table}
  2.1444 -
  2.1445 -\paragraph*{Head to head.}
  2.1446 -
  2.1447 -We compare our implementation of the \texttt{spawn} and \texttt{sync}
  2.1448 -constructs against Cilk 5.4, on the top in Table~\ref{tabHeadToHead},
  2.1449 -which shows that the same application code has similar
  2.1450 -performance. For large matrices, Cilk 5.4's better use of the memory
  2.1451 -hierarchy (the workstealing algorithm) achieves 23\% better
  2.1452 -performance. However, for small matrices, VCilk is better, with a
  2.1453 -factor 2 lower overhead. Cilk 5.4 does not allow controlling the
  2.1454 -number of spawn events it actually executes, and chooses to run
  2.1455 -smaller matrices sequentially, limiting our comparison.
  2.1456 -
  2.1457 -When comparing to pthreads, our VMS based implementation has more than
  2.1458 -an order of magnitude better overhead per invocation of mutex or
  2.1459 -condition variable functionality, as seen on the bottom of Table
  2.1460 -\ref{tabHeadToHead}.  Applications that inherently have short trace
  2.1461 -segments will synchronize often and benefit the most from Vthread.
  2.1462 -
  2.1463 -\section{Conclusion}
  2.1464 -\label{secConclusion}
  2.1465 -
  2.1466 -We have shown an alternative to the thread model that enables
  2.1467 -easier-to-use parallelism constructs by splitting the scheduler open,
  2.1468 -to accept new parallelism constructs in the form of plugins. This
  2.1469 -gives the language control over assigning virtual processors to
  2.1470 -physical cores, for performance, debugging, and flexibility
  2.1471 -benefits. Parallelism constructs of programming languages can be
  2.1472 -implemented using sequential algorithms, within a matter of days,
  2.1473 -while maintaining low run-time overhead.
  2.1474 -
  2.1475 -\bibliography{../helpers/bib_for_papers}
  2.1476 -
  2.1477 -\end{document}
     3.1 Binary file 0__Papers/PStack/HotPar_2012/latex/12_Ja_20__HotPar_Abstr_for_PStack.pdf has changed
     4.1 --- a/0__Papers/PStack/HotPar_2012/latex/12_Ja_20__HotPar_Abstr_for_PStack.tex	Tue Jun 26 12:48:44 2012 +0200
     4.2 +++ b/0__Papers/PStack/HotPar_2012/latex/12_Ja_20__HotPar_Abstr_for_PStack.tex	Wed Jun 27 02:49:38 2012 -0700
     4.3 @@ -288,7 +288,7 @@
     4.4  \end{figure}
     4.5  
     4.6  
     4.7 -However, things change dramatically on the 4 socket by 10 core-each Westmere machine, seen in Figure \ref{figXoanonResults}.  Here, inter-socket communication dominates, and VMS gains orders of magnitude advantage.  For one thread per hardware context, Vthread's overhead is around 2000 cycles, while pthread starts at around 50,000 and goes up from there.
     4.8 +However, things change dramatically on the 4 socket by 10 core-each Westmere machine, seen in Figure \ref{figXoanonResults}.  Here, inter-socket communication dominates, and VMS gains orders of magnitude advantage.  For one thread per hardware context, Vthread's overhead is around 1500 cycles, while pthread starts at around 50,000 and goes up from there.
     4.9  
    4.10  The implementation of VMS is different on this machine than the single-socket ones, and demonstrates the effectiveness of pulling hardware details below the abstraction.  
    4.11  
     5.1 --- a/0__Papers/VMS/VMS__Foundation_Paper/VMS__Full_conference_version/latex/VMS__Full_conf_paper.tex	Tue Jun 26 12:48:44 2012 +0200
     5.2 +++ b/0__Papers/VMS/VMS__Foundation_Paper/VMS__Full_conference_version/latex/VMS__Full_conf_paper.tex	Wed Jun 27 02:49:38 2012 -0700
     5.3 @@ -65,7 +65,7 @@
     5.4  
     5.5  \bibliographystyle{plain}
     5.6  
     5.7 -\title{A Proto-Runtime Embodiment of a Unified Approach to Parallelism Constructs for Use in Domain-Specific Languages}
     5.8 +\title{A Proto-Runtime  Approach to  Domain-Specific Languages}
     5.9  
    5.10  \docstatus{Submitted to HotPar 2012}
    5.11  
    5.12 @@ -85,11 +85,16 @@
    5.13  
    5.14  \begin{abstract}
    5.15  
    5.16 -The triple challenge. Many believe productivity and portability can be solved with domain-specific languages. But adoptability is hindered by practical problems due to small user-base. Few users to support a language means development time must be small and porting language across machines must be low effort.
    5.17 +Software has not been keeping up with new parallel hardware, which slows the economy and retards adoption of  new hardware. Many believe the productivity and portability challenges of parallel software can be solved with domain-specific languages. But adoption is hindered by practical problems due to the small user-base, which means the language development time must be small and porting it across machines must be low effort.
    5.18  
    5.19 -If buy domain-specific, then we have a toolkit to address the time-to-create and time-to-port. It is based on recognizing a pattern common to all parallel languages, and embodying that in a partial runtime, or "proto" runtime. The proto-runtime embodies most of the implementation effort, and is reused among all the domain-specific languages. Each language just adds sequential-reasoning algorithms to control the relative progress of different timelines and to control which hardware generates that progress, through a well-defined interface.
    5.20  
    5.21 -?
    5.22 +To address this, we propose the proto-runtime, which is a full runtime, but with two key pieces replaced with an interface. A new language is created by providing: 1) the behavior of language constructs and 2) the assignment of work onto hardware resources.  The pieces are simplified by keeping concurrency issues inside the proto-runtime, so they are implemented using sequential reasoning. The high reuse of the proto-runtime allows intense hardware-specific tuning, which all languages inherit, keeping overhead low.
    5.23 +
    5.24 +
    5.25 +
    5.26 +We explain the practical usage and theory, and show measurements of the implementation time of languages equivalent to OpenMP, StarSs, Cilk, and pthreads, along with two new languages, including a domain-specific language for hardware simulation. We include overhead measurements which show that the VMS versions are orders of magnitude better than the original runtimes.
    5.27 +
    5.28 +
    5.29  
    5.30  
    5.31  
    5.32 @@ -100,6 +105,39 @@
    5.33  
    5.34  
    5.35  \section{}
    5.36 +
    5.37 +Software has not been keeping up with new parallel hardware, which slows the economy and retards adoption of the new hardware. The gap is due in part to the disruption caused by moving to parallel languages, and in part to the prohibitive effort of porting application code across platforms.
    5.38 +
    5.39 +  A leading idea for solving this is domain-specific parallel languages, in which custom constructs are made to match features of the problem. However, such languages have a small number of users, which cannot support the currently large effort to create such languages and port them across hardware platforms.
    5.40 +
    5.41 +To simplify creation of domain-specific languages, we propose a ``proto'' runtime, which is a normal, full runtime, but with two key parts replaced with an interface. To create a new language, one provides an implementation of those two pieces: 1) the behavior of language constructs and 2) the assignment of work onto hardware resources. The pieces are simplified by keeping concurrency issues inside the proto-runtime, so they are implemented using sequential reasoning. 
    5.42 +
    5.43 +The proto-runtime remains the same for all languages, causing very high reuse, which gives benefit. Intense effort can be spent fine-tuning the performance of the proto-runtime, from which all languages then benefit. Such effort would be prohibitive if done separately for every language runtime on every target hardware platform. In addition, services for debugging, performance tuning, gathering portability information, and so on are centralized for use by the languages.
    5.44 +
    5.45 +Such an approach is only attractive if it delivers high application performance and low runtime overhead. We demonstrate this in this paper for multicore hardware. In the long run, the proto-runtime interface must be compatible with a much wider variety of parallel architectures and system approaches; we reserve this for future work, concentrating here only on multicore hardware.
    5.46 +
    5.47 +The performance is due to language control over hardware resources, which is lost when implementing the language runtime on top of a threading package like pthreads or TBB, which is, in effect, a lower-level runtime. The issue is that the lower runtime has the hardware control, not the language runtime on top of it. 
    5.48 +
    5.49 +Fixing this by exposing a means for the language runtime to directly control hardware is half of what a proto-runtime is. Once that much has been changed, additional benefits accrue from going all the way and replacing the pthread or TBB constructs with the second interface, which completes the transformation to a proto-runtime.
    5.50 +
    5.51 +Once a proto-runtime exists, the language plugins can be provided separately, in a trusted way, making it safe for them to access privileged hardware instructions to control hardware resource usage. This additional aspect gives more benefits to the proto-runtime approach.
    5.52 +
    5.53 +Finally, once such a highly reused component exists, many services can be placed into it and accessed via the standard interface. This way language implementers inherit valuable facilities for debugging, performance tuning (see companion paper), automated verification, capturing portability information, and so on. This saves yet more time, supplying a rich language for end-users.
    5.54 +
    5.55 +A bonus feature is the combination of constructs from different languages in meaningful ways, quickly and easily (as demonstrated).
    5.56 +
    5.57 +
    5.58 +We explain the practical usage and theory, and show measurements of implementation time of languages equivalent to OpenMP, StarSs, Cilk, and pthreads, along with two new languages, including a domain-specific language for hardware simulation. We include overhead measurements for the VMS versions, which are orders of magnitude better than the original runtimes.
    5.59 +
    5.60 +========
    5.61 +
    5.62 +We propose the proto-runtime approach to address the time-to-create and time-to-port issues.  Each language just adds sequentially-reasoned algorithms to control the relative progress of different timelines and to control which hardware generates that progress, through a well-defined interface.
    5.63 +
    5.64 +It is based on recognizing a pattern common to all parallel languages, which embodies most of the runtime implementation effort, and is reused among all the domain-specific languages.
    5.65 +
    5.66 +======== 
    5.67 +
    5.68 +
    5.69  Current parallel programming is blocked from the mainstream for several reasons: it has lower productivity than sequential programming, forces a rewrite of source for each new target to get good performance, and disrupts the ways programmers think and their workflow. These cause high expense and slow adoption.
    5.70  
    5.71  Many believe a solution to the productivity issue is domain-specific languages. 
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/0__Papers/writing-a-paper-slides.pdf	Wed Jun 27 02:49:38 2012 -0700
     6.3 @@ -0,0 +1,22 @@
     6.4 +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
     6.5 +<html>
     6.6 +<head>
     6.7 +<meta http-equiv="Pragma" content="no-cache">
     6.8 +<meta http-equiv="expires" content="0">
     6.9 +<META HTTP-EQUIV="CACHE-CONTROL" CONTENT="NO-CACHE">
    6.10 +<title>Microsoft Research - Site Down</title>
    6.11 +<style type="text/css">
    6.12 +* {margin:0;}
    6.13 +body {background: #dfecec url(bg.png) repeat-x; }
    6.14 +</style>
    6.15 +</head>
    6.16 +<body>
    6.17 +<div style="margin:0 auto 0 auto; width:1002px; position:relative;">
    6.18 +	<map id="FPMap0" name="FPMap0">
    6.19 +	<area coords="383, 329, 619, 450" href="http://twitter.com/msftresearch" shape="rect">
    6.20 +	<area coords="675, 325, 911, 450" href="http://www.facebook.com/microsoftresearch?ref=search&amp;sid=623616584.2939282704..1" shape="rect">
    6.21 +	</map>
    6.22 +<img alt="alt text" height="896" src="http://research.microsoft.com/Error-page.png" width="1002" usemap="#FPMap0" border="0">
    6.23 +</div>
    6.24 +</body>
    6.25 +</html>