# HG changeset patch
# User Nina Engelhardt
# Date 1344876499 -7200
# Node ID 8508a9dd967a91926e6f583b2e446b2ceff7d269
# Parent  b5e314f7cdb91a60fcb0f439734285b3f1b3285c
perf tune: Add citations

diff -r b5e314f7cdb9 -r 8508a9dd967a 0__Papers/Holistic_Model/Perf_Tune/latex/Holistic_Perf_Tuning.pdf
Binary file 0__Papers/Holistic_Model/Perf_Tune/latex/Holistic_Perf_Tuning.pdf has changed
diff -r b5e314f7cdb9 -r 8508a9dd967a 0__Papers/Holistic_Model/Perf_Tune/latex/Holistic_Perf_Tuning.tex
--- a/0__Papers/Holistic_Model/Perf_Tune/latex/Holistic_Perf_Tuning.tex	Mon Aug 13 02:13:02 2012 -0700
+++ b/0__Papers/Holistic_Model/Perf_Tune/latex/Holistic_Perf_Tuning.tex	Mon Aug 13 18:48:19 2012 +0200
@@ -41,11 +41,11 @@
 
 % \conferenceinfo{WXYZ '05}{date, City.}
 
-\copyrightyear{2005}
+\copyrightyear{2012}
 \copyrightdata{[to be supplied]}
 
-\titlebanner{banner above paper title} % These are ignored unless
-\preprintfooter{short description of paper} % 'preprint' option specified.
+\titlebanner{} % These are ignored unless
+\preprintfooter{Performance Tuning Using Semantic Information from the Language Runtime} % 'preprint' option specified.
 
 
 %MOIRAI: MOdel for Integrated Runtime Analysis through Instrumentation
@@ -70,8 +70,8 @@
 
 \begin{abstract}
 Performance tuning is an important aspect of parallel programming. Yet when trying to pinpoint the causes of performance loss, oftentimes insufficient knowledge of the internal structure of the application and the runtime is available to understand how the observed patterns of performance have come to pass. A trend in parallel programming languages is towards models that capture more structural information about the application, in an effort to increase both performance and ease of programming. We propose using this structural information in performance tuning tools to make the causes of performance loss more readily apparent.
-Our work produces a universal, adaptable set of performance visualizations that integrates this extra application structure,via a new model of parallel computation. The visualizations clearly identify idle cores, and tie the idleness to causal interactions within the runtime and hardware, and from there to the parallelism constructs that constrained the runtime and hardware behavior, thereby eliminating guesswork.
-This approach can be used to instrument the runtime of any parallel programming model without modifying the application. As a case study, we applied it to a message-passing model, and we walk through a tuning session on a large multi-core machine to illustrate how performance loss is identified and how hypotheses for the cause are generated.
+Our work produces a universal, adaptable set of performance visualizations that integrates this extra application structure, via a new model of parallel computation. The visualizations clearly identify idle cores, and tie the idleness to causal interactions within the runtime and hardware, and from there to the parallelism constructs that constrained the runtime and hardware behavior, thereby eliminating guesswork.
+This approach can be used to instrument the runtime of any parallel programming model without modifying the application. As a case study, we applied it to the SSR message-passing model, and we walk through a tuning session on a large multi-core machine to illustrate how performance loss is identified and how hypotheses for the cause are generated.
 \end{abstract}
 
@@ -106,7 +106,7 @@
 
 We believe that the three primary factors to consider when choosing a performance tuning tool are:
 \begin{itemize}
-\item the ease of recognizing that performance is less than optimal 
+\item the ease of recognizing that performance is less than optimal
 \item the ease of forming a hypothesis to explain the difference
 \item the ease of linking the hypothesis to changes to make in application code
 \end{itemize}
@@ -124,16 +124,14 @@
 
 One fix to these issues is to allow the users to introduce measuring points into their own code, at the cost of increased programmer effort. Because instrumentation code is written in the source language, it has access to application concepts. This advantage can be kept with automated instrumentation, by providing an instrumenting compiler, as the Tau \cite{PerfToolTau} project does.
-However, as long as the underlying execution model is still threads, there is no meaningful structure common to all applications to use to generate expressive measurement quantities. Function boundaries and the call graph are not sufficient when parallel execution is performed. The sequence and frequency of function calls tell little about parallel performance impacts because they have no bearing on synchronization events. Unfortunately, pthreads does not capture even hints as to \emph{why} a given function call ends up blocking or not blocking, and what the effects on other threads are. 
+However, as long as the underlying execution model is still threads, there is no meaningful structure common to all applications that can be used to generate expressive measurement quantities. Function boundaries and the call graph are not sufficient when execution is parallel. The sequence and frequency of function calls tell little about parallel performance impacts because they have no bearing on synchronization events. Unfortunately, pthreads does not capture even hints as to \emph{why} a given function call ends up blocking or not blocking, and what the effects on other threads are.
 
 Higher level parallel abstractions help alleviate that particular problem. For instance, in an application with MPI message passing \cite{MPI}, the information ``thread 2 spends little time waiting for messages from thread 0 but a lot of time waiting for messages from thread 1'' can be recorded, whereas in pthreads only ``thread 2 spends a lot of time waiting for a signal'' would be visible. It is much easier to reach the conclusion that the bottleneck is the slow rate at which thread 1 produces data.
 
-Leveraging the MPI library is a well-trodden path, with many tools available \cite{PMPI} \cite{MPICL} (the above-cited Tau and Paradyn also added this feature). A frequent complaint of users of these tools is that in complex programs, the large number of messages sent makes it difficult to see problems. As there is little that distinguishes messages beside size, it is difficult to implement good filtering. Once again, we find that the limitations come from a parallel model that captures insufficient information.
+Leveraging the MPI library is a well-trodden path, with many tools available \cite{PerfToolParaver} \cite{PerfToolVampir} \cite{PerfToolParadyn} \cite{PerfToolTau}. A frequent complaint of users of these tools is that in complex programs, the large number of messages sent makes it difficult to see problems. As there is little that distinguishes messages besides size, it is difficult to implement good filtering. Once again, we find that the limitations come from a parallel model that captures insufficient information.
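
To make the contrast above concrete: a tool can use MPI's standard profiling interface (PMPI) to attribute a receiver's wait time to the specific sending rank, which is exactly the information a bare pthreads condition wait cannot expose. The following is a minimal illustrative sketch, not the instrumentation described in this paper; the names wait_time and MAX_RANKS are hypothetical.

    /* Sketch of a PMPI interposition wrapper (C): time each MPI_Recv and
     * charge the wait to the rank that sent the message.  Illustrative
     * only; wait_time[] and MAX_RANKS are hypothetical. */
    #include <mpi.h>

    #define MAX_RANKS 1024              /* assumed upper bound on ranks  */
    static double wait_time[MAX_RANKS]; /* seconds waited, per sender    */

    int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source,
                 int tag, MPI_Comm comm, MPI_Status *status)
    {
        MPI_Status local;
        if (status == MPI_STATUS_IGNORE)
            status = &local;            /* we need the actual source rank */

        double start = MPI_Wtime();
        int err = PMPI_Recv(buf, count, datatype, source, tag, comm, status);

        /* Unlike pthread_cond_wait(), the message envelope records WHO we
         * waited for, even for MPI_ANY_SOURCE receives. */
        if (err == MPI_SUCCESS &&
            status->MPI_SOURCE >= 0 && status->MPI_SOURCE < MAX_RANKS)
            wait_time[status->MPI_SOURCE] += MPI_Wtime() - start;
        return err;
    }

Linked ahead of the MPI library, such a wrapper collects per-sender wait totals without modifying the application, which is how an observation like ``thread 2 waits mostly on thread 1'' can be recorded.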
 
-Fortunately, parallel languages are evolving in the same direction, and it is widely believed that semantically rich parallel constructs, when they match the structure of the application well, are key to improved productivity. With languages such as \cite{CnCInHotPar} and StarSs \cite{StarSs}, the information available to the language runtime offers a much clearer picture of the constraints placed on the execution of tasks and where they come from. Consequently, the application structure can be well reflected in visualizations, as the Temanejo debugger \cite{PerfToolStarSs} for StarSs demonstrates.
+Fortunately, parallel languages are evolving in the same direction, and it is widely believed that semantically rich parallel constructs, when they match the structure of the application well, are key to improved productivity. With languages such as CnC \cite{CnCInHotPar} and StarSs \cite{StarSs}, the information available to the language runtime offers a much clearer picture of the constraints placed on the execution of tasks and where they come from. Consequently, the application structure can be well reflected in visualizations, as the Temanejo debugger \cite{PerfToolStarSs} for StarSs demonstrates.
 
-
-Our approach differs in the following ways:
-
+In the following section, we will demonstrate how our approach overcomes these challenges through a combination of the following features:
 \begin{itemize}
 \item covers all the factors influencing scheduling decisions, given sufficient language and runtime prerequisites
 \item works for any parallel programming library or language
@@ -1284,4 +1282,4 @@
 
 
 
- trying to pinpoint the causes of performance loss, internal structure of
\ No newline at end of file
+ trying to pinpoint the causes of performance loss, internal structure of
diff -r b5e314f7cdb9 -r 8508a9dd967a 0__Papers/Holistic_Model/Perf_Tune/latex/bib_for_papers_12_Jy_15.bib
--- a/0__Papers/Holistic_Model/Perf_Tune/latex/bib_for_papers_12_Jy_15.bib	Mon Aug 13 02:13:02 2012 -0700
+++ b/0__Papers/Holistic_Model/Perf_Tune/latex/bib_for_papers_12_Jy_15.bib	Mon Aug 13 18:48:19 2012 +0200
@@ -1,6 +1,32 @@
+@Misc{PerfToolVTune,
+  author = {Intel},
+  howpublished = {http://software.intel.com/en-us/articles/intel-vtune-amplifier-xe/},
+  title = {{Intel VTune Amplifier XE Performance Analyzer}},
+  year = {2011}
+}
+@Article{PerfToolVampir,
+  author = {W. E. Nagel and A. Arnold and M. Weber and H.-Ch. Hoppe and K. Solchenbach},
+  title = {{VAMPIR: Visualization and Analysis of MPI Resources}},
+  journal = {Supercomputer},
+  year = {1996},
+  volume = {12},
+  pages = {69--80}
+}
+@InProceedings{PerfToolParaver,
+  author = {Vincent Pillet and Jes{\'u}s Labarta and Toni Cortes and Sergi Girona},
+  title = {{PARAVER}: A Tool to Visualize and Analyze Parallel Code},
+  booktitle = {Proceedings of WoTUG-18: Transputer and occam Developments},
+  year = {1995}
+}
+@Misc{MPI,
+  author = {Message Passing Interface Forum},
+  howpublished = {http://www.mpi-forum.org},
+  title = {{MPI}: A Message-Passing Interface Standard Version 2.2},
+  year = 2009
+}
 @Article{StarSs,
   author = {Planas, J. and Badia, R.M. and Ayguad{\'e}, E.
 and Labarta, J.},
-  title = {Hierarchical task-based programming with StarSs},
+  title = {Hierarchical task-based programming with {StarSs}},
   journal = {International Journal of High Performance Computing Applications},
   year = 2009,
   volume = 23,
@@ -16,7 +42,7 @@
 }
 @Article{PerfToolTau,
   author = {Shende, Sameer S. and Malony, Allen D.},
-  title = {The Tau Parallel Performance System},
+  title = {The {TAU} Parallel Performance System},
   volume = 20,
   number = 2,
   pages = {287--311},
@@ -26,7 +52,7 @@
 @Article{PerfToolParadyn,
   author = {Miller, B.P. and Callaghan, M.D. and Cargille, J.M. and Hollingsworth, J.K. and Irvin, R.B. and Karavanic, K.L. and Kunchithapadam, K. and Newhall, T.},
   journal = {Computer},
-  title = {The Paradyn parallel performance measurement tool},
+  title = {The {Paradyn} parallel performance measurement tool},
   year = 1995,
   month = {nov},
   volume = 28,
@@ -45,7 +71,7 @@
 }
 @Article{PerfToolStarSs,
   author = {Steffen Brinkmann and Jos{\'e} Gracia and Christoph Niethammer and Rainer Keller},
-  title = {TEMANEJO - a debugger for task based parallel programming models},
+  title = {{TEMANEJO} - a debugger for task based parallel programming models},
   journal = {CoRR},
   volume = {abs/1112.4604},
   year = 2011