### changeset 82:8508a9dd967a

author Nina Engelhardt Mon, 13 Aug 2012 18:48:19 +0200 b5e314f7cdb9 78a1ee9b06f1 0__Papers/Holistic_Model/Perf_Tune/latex/Holistic_Perf_Tuning.pdf 0__Papers/Holistic_Model/Perf_Tune/latex/Holistic_Perf_Tuning.tex 0__Papers/Holistic_Model/Perf_Tune/latex/bib_for_papers_12_Jy_15.bib 3 files changed, 41 insertions(+), 17 deletions(-) [+]
line diff
     1.1 Binary file 0__Papers/Holistic_Model/Perf_Tune/latex/Holistic_Perf_Tuning.pdf has changed

     2.1 --- a/0__Papers/Holistic_Model/Perf_Tune/latex/Holistic_Perf_Tuning.tex	Mon Aug 13 02:13:02 2012 -0700
2.2 +++ b/0__Papers/Holistic_Model/Perf_Tune/latex/Holistic_Perf_Tuning.tex	Mon Aug 13 18:48:19 2012 +0200
2.3 @@ -41,11 +41,11 @@
2.4  %
2.5
2.6  \conferenceinfo{WXYZ '05}{date, City.}
2.10
2.11 -\titlebanner{banner above paper title}        % These are ignored unless
2.12 -\preprintfooter{short description of paper}   % 'preprint' option specified.
2.13 +\titlebanner{}        % These are ignored unless
2.14 +\preprintfooter{Performance Tuning Using Semantic Information from the Language Runtime}   % 'preprint' option specified.
2.15
2.16
2.17  %MOIRAI: MOdel for Integrated Runtime Analysis through Instrumentation
2.18 @@ -70,8 +70,8 @@
2.19  \begin{abstract}
2.20  Performance tuning is an important aspect of parallel programming. Yet when trying to pinpoint the causes of performance loss, often times insufficient knowledge  of the internal structure of the application and the runtime is available to understand how the observed patterns of performance have come to pass.
2.21  A trend in parallel programming languages is towards models that capture more structural information about the application, in an effort to increase both performance and ease of programming. We propose using this structural information in  performance tuning tools to make the causes of performance loss more readily apparent.
2.22 -Our work produces a universal, adaptable set of performance visualizations that integrates this extra application structure,via a new model of parallel computation. The visualizations clearly identify idle cores, and tie the idleness to causal interactions within the runtime and hardware, and from there to the parallelism constructs that constrained the runtime and hardware behavior, thereby eliminating guesswork.
2.23 -This approach can be used to instrument the runtime of any parallel programming model without modifying the application. As a case study, we applied it to a message-passing model, and we walk through a tuning session on a large multi-core machine to illustrate how performance loss is identified and how hypotheses for the cause are generated.
2.24 +Our work produces a universal, adaptable set of performance visualizations that integrates this extra application structure, via a new model of parallel computation. The visualizations clearly identify idle cores, and tie the idleness to causal interactions within the runtime and hardware, and from there to the parallelism constructs that constrained the runtime and hardware behavior, thereby eliminating guesswork.
2.25 +This approach can be used to instrument the runtime of any parallel programming model without modifying the application. As a case study, we applied it to the SSR message-passing model, and we walk through a tuning session on a large multi-core machine to illustrate how performance loss is identified and how hypotheses for the cause are generated.
2.26  \end{abstract}
2.27
2.28
2.29 @@ -106,7 +106,7 @@
2.30   We believe that the three primary factors to consider when choosing a performance tuning tool are:
2.31
2.32  \begin{itemize}
2.33 -\item the ease  of recognizing that performance is less than optimal
2.34 +\item the ease of recognizing that performance is less than optimal
2.35  \item the ease of forming a hypothesis to explain the difference
2.36  \item the ease of linking the hypothesis to changes to make in application code
2.37  \end{itemize}
2.38 @@ -124,16 +124,14 @@
2.39
2.40  One fix to these issues is to allow the users to introduce measuring points into their own code, at the cost of increased programmer effort.  Because instrumentation code is written in the source language, it has access to application concepts. This advantage can be kept with automated instrumentation, by providing an instrumenting compiler, like the Tau \cite{PerfToolTau} project does.
2.41
2.42 -However, as long as the underlying execution model is still threads,  there is no meaningful structure common to all applications to use to generate expressive measurement quantities. Function boundaries and the call graph are not sufficient when parallel execution is performed. The sequence and frequency of function calls tell little about parallel performance impacts because they have no bearing on synchronization events.  Unfortunately, pthreads does not capture even hints as to \emph{why} a given function call ends up blocking or not blocking, and what the effects on other threads are.
2.43 +However, as long as the underlying execution model is still threads, there is no meaningful structure common to all applications to use to generate expressive measurement quantities. Function boundaries and the call graph are not sufficient when parallel execution is performed. The sequence and frequency of function calls tell little about parallel performance impacts because they have no bearing on synchronization events.  Unfortunately, pthreads does not capture even hints as to \emph{why} a given function call ends up blocking or not blocking, and what the effects on other threads are.
2.44
2.45  Higher level parallel abstractions help alleviate that particular problem. For instance, in an application with MPI message passing \cite{MPI}, the information ``thread 2 spends little time waiting for messages from thread 0 but a lot of time waiting for messages from thread 1'' can be recorded, where in pthreads only ``thread 2 spends a lot of time waiting for a signal'' would be visible. It is much easier to reach the conclusion that the bottleneck is the slow rate at which thread 1 produces data.
2.46 -Leveraging the MPI library is a well-trodden path, with many tools available \cite{PMPI} \cite{MPICL} (the above-cited Tau and Paradyn also added this feature). A frequent complaint of users of these tools is that in complex programs, the large number of messages sent makes it difficult to see problems. As there is little that distinguishes messages beside size, it is difficult to implement good filtering. Once again, we find that the limitations come from a parallel model that captures insufficient information.
2.47 +Leveraging the MPI library is a well-trodden path, with many tools available \cite{PerfToolParaver} \cite{PerfToolVampir} \cite{PerfToolParadyn} \cite{PerfToolTau}. A frequent complaint of users of these tools is that in complex programs, the large number of messages sent makes it difficult to see problems. As there is little that distinguishes messages besides size, it is difficult to implement good filtering. Once again, we find that the limitations come from a parallel model that captures insufficient information.
2.48
2.49 -Fortunately, parallel languages are evolving in the same direction, and it is widely believed that semantically rich parallel constructs, when they match the structure of the application well, are key to improved productivity. With languages such as \cite{CnCInHotPar} and StarSs \cite{StarSs}, the information available to the language runtime offers a much clearer picture of the constraints  placed on the execution of tasks and where they come from. Consequently, the application structure can be well reflected in visualizations, as the Temanejo debugger  \cite{PerfToolStarSs} for StarSs demonstrates.
2.50 +Fortunately, parallel languages are evolving in the same direction, and it is widely believed that semantically rich parallel constructs, when they match the structure of the application well, are key to improved productivity. With languages such as CnC \cite{CnCInHotPar} and StarSs \cite{StarSs}, the information available to the language runtime offers a much clearer picture of the constraints  placed on the execution of tasks and where they come from. Consequently, the application structure can be well reflected in visualizations, as the Temanejo debugger  \cite{PerfToolStarSs} for StarSs demonstrates.
2.51
2.52 -
2.53 -Our approach differs in  the following ways:
2.54 -
2.55 +In the following section, we will demonstrate how our approach overcomes these challenges through a combination of the following features:
2.56  \begin{itemize}
2.57  \item  all the factors influencing scheduling decisions will be covered, given sufficient language and runtime pre-requisites
2.58  \item works for any parallel programming library or language
2.59 @@ -1284,4 +1282,4 @@
2.60
2.61
2.62
2.63 - trying to pinpoint the causes of performance loss,  internal structure of
2.64 \ No newline at end of file
2.65 + trying to pinpoint the causes of performance loss,  internal structure of

     3.1 --- a/0__Papers/Holistic_Model/Perf_Tune/latex/bib_for_papers_12_Jy_15.bib	Mon Aug 13 02:13:02 2012 -0700
3.2 +++ b/0__Papers/Holistic_Model/Perf_Tune/latex/bib_for_papers_12_Jy_15.bib	Mon Aug 13 18:48:19 2012 +0200
3.3 @@ -1,6 +1,32 @@
3.4 +@misc{PerfToolVTune,
3.5 +    author = {Intel},
3.6 +    howpublished = {http://software.intel.com/en-us/articles/intel-vtune-amplifier-xe/},
3.7 +    title = {{Intel VTune Amplifier XE Performance Analyzer}},
3.8 +    year = {2011}
3.9 +}
3.10 +@ARTICLE{PerfToolVampir,
3.11 +    author = {W. E. Nagel and A. Arnold and M. Weber and H.-Ch. Hoppe and K. Solchenbach},
3.12 +    title = {{VAMPIR: Visualization and Analysis of MPI Resources}},
3.13 +    journal = {Supercomputer},
3.14 +    year = {1996},
3.15 +    volume = {12},
3.16 +    pages = {69--80}
3.17 +}
3.18 +@TECHREPORT{PerfToolParaver,
3.19 +    author = {Vincent Pillet and Jes{\'u}s Labarta and Toni Cortes and Sergi Girona},
3.20 +    title = {PARAVER: A Tool to Visualize and Analyze Parallel Code},
3.21 +    institution = {In WoTUG-18},
3.22 +    year = {1995}
3.23 +}
3.24 +@Misc{MPI,
3.25 +	author = {{Message Passing Interface Forum}},
3.26 +	howpublished = {http://www.mpi-forum.org},
3.27 +	title = {{MPI}: A Message-Passing Interface Standard Version 2.2},
3.28 +	year = 2009
3.29 +}
3.31  	author = {Planas, J. and Badia, R.M. and Ayguad{\'e}, E. and Labarta, J.},
3.34  	journal = {International Journal of High Performance Computing Applications},
3.35  	year = 2009,
3.36  	volume = 23,
3.37 @@ -16,7 +42,7 @@
3.38  }
3.39  @Article{PerfToolTau,
3.40  	author = {Shende, Sameer S. and Malony, Allen D.},
3.41 -	title = {The Tau Parallel Performance System},
3.42 +	title = {The {TAU} Parallel Performance System},
3.43  	volume = 20,
3.44  	number = 2,
3.45  	pages = {287-311},
3.46 @@ -26,7 +52,7 @@
3.48  	author = {Miller, B.P. and Callaghan, M.D. and Cargille, J.M. and Hollingsworth, J.K. and Irvin, R.B. and Karavanic, K.L. and Kunchithapadam, K. and Newhall, T.},
3.49  	journal = {Computer},
3.50 -	title = {The Paradyn parallel performance measurement tool},
3.51 +	title = {The {Paradyn} parallel performance measurement tool},
3.52  	year = 1995,
3.53  	month = {nov},
3.54  	volume = 28,
3.55 @@ -45,7 +71,7 @@
3.56  }