Mercurial > cgi-bin > hgwebdir.cgi > VMS > 0__Writings > kshalle
changeset 58:2db21884dc2e
perf-tuning -- Candidate for near-final rough draft
| author | Sean Halle <seanhalle@yahoo.com> |
|---|---|
| date | Wed, 20 Jun 2012 15:17:14 -0700 |
| parents | 959587ac4044 |
| children | 7bc474513431 |
| files | 0__Papers/Holistic_Model/Perf_Tune/figures/SCG_stylized_for_expl.pdf 0__Papers/Holistic_Model/Perf_Tune/figures/SCG_stylized_for_expl.svg 0__Papers/Holistic_Model/Perf_Tune/latex/Holistic_Perf_Tuning.tex |
| diffstat | 3 files changed, 236 insertions(+), 603 deletions(-) [+] |
line diff
1.1 Binary file 0__Papers/Holistic_Model/Perf_Tune/figures/SCG_stylized_for_expl.pdf has changed
2.1 --- a/0__Papers/Holistic_Model/Perf_Tune/figures/SCG_stylized_for_expl.svg Tue Jun 19 16:32:07 2012 -0700 2.2 +++ b/0__Papers/Holistic_Model/Perf_Tune/figures/SCG_stylized_for_expl.svg Wed Jun 20 15:17:14 2012 -0700 2.3 @@ -14,7 +14,7 @@ 2.4 id="svg2" 2.5 sodipodi:version="0.32" 2.6 inkscape:version="0.48.1 " 2.7 - sodipodi:docname="UCC_concreteness_grid.svg" 2.8 + sodipodi:docname="SCG_stylized_for_expl.svg" 2.9 inkscape:output_extension="org.inkscape.output.svg.inkscape" 2.10 version="1.1"> 2.11 <defs 2.12 @@ -344,9 +344,9 @@ 2.13 objecttolerance="10" 2.14 inkscape:pageopacity="0.0" 2.15 inkscape:pageshadow="2" 2.16 - inkscape:zoom="1.5301741" 2.17 - inkscape:cx="385.73996" 2.18 - inkscape:cy="731.0723" 2.19 + inkscape:zoom="4.6439189" 2.20 + inkscape:cx="50.001957" 2.21 + inkscape:cy="781.44647" 2.22 inkscape:document-units="px" 2.23 inkscape:current-layer="layer1" 2.24 showgrid="false" 2.25 @@ -363,7 +363,7 @@ 2.26 <dc:format>image/svg+xml</dc:format> 2.27 <dc:type 2.28 rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> 2.29 - <dc:title></dc:title> 2.30 + <dc:title /> 2.31 </cc:Work> 2.32 </rdf:RDF> 2.33 </metadata> 2.34 @@ -371,13 +371,6 @@ 2.35 inkscape:label="Layer 1" 2.36 inkscape:groupmode="layer" 2.37 id="layer1"> 2.38 - <rect 2.39 - y="182.49448" 2.40 - x="163.84431" 2.41 - height="161.39378" 2.42 - width="329.44443" 2.43 - id="rect4979" 2.44 - style="fill:none;stroke:#000000;stroke-width:1.08000004;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:4.32, 4.32;stroke-dashoffset:1.944" /> 2.45 <text 2.46 xml:space="preserve" 2.47 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;opacity:1;fill:#000000;fill-opacity:1;stroke:none;stroke-width:2;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0.60163802;stroke-opacity:1;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.48 @@ -391,30 +384,6 @@ 
2.49 style="font-size:8px" 2.50 id="tspan12172" /></text> 2.51 <text 2.52 - id="text5835" 2.53 - y="125.02016" 2.54 - x="450.34119" 2.55 - style="font-size:10px;font-style:normal;font-weight:normal;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.56 - xml:space="preserve"><tspan 2.57 - y="125.02016" 2.58 - x="450.34119" 2.59 - id="tspan5837" 2.60 - sodipodi:role="line"><tspan 2.61 - style="fill:#0303b8;fill-opacity:1" 2.62 - id="tspan5845">Blue</tspan> = application</tspan></text> 2.63 - <text 2.64 - xml:space="preserve" 2.65 - style="font-size:10px;font-style:normal;font-weight:normal;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.66 - x="368.28989" 2.67 - y="125.02016" 2.68 - id="text5839"><tspan 2.69 - sodipodi:role="line" 2.70 - id="tspan5841" 2.71 - x="368.28989" 2.72 - y="125.02016"><tspan 2.73 - style="fill:#960a0a;fill-opacity:1" 2.74 - id="tspan5847">Red</tspan> = runtime</tspan></text> 2.75 - <text 2.76 xml:space="preserve" 2.77 style="font-size:10px;font-style:normal;font-weight:normal;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.78 x="368.25397" 2.79 @@ -424,197 +393,6 @@ 2.80 id="tspan5851" 2.81 x="368.25397" 2.82 y="405.27225" /></text> 2.83 - <path 2.84 - style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:0.50000001, 0.50000001;stroke-dashoffset:0;marker-end:url(#Arrow2Mend-8)" 2.85 - d="M 289.73631,577.76803 C 263.95998,564.0618 244.9124,553.07853 244.9124,553.07853" 2.86 - id="path6066-9-4" 2.87 - sodipodi:nodetypes="cc" 2.88 - inkscape:connector-curvature="0" /> 2.89 - <rect 2.90 - y="151.42043" 2.91 - x="157.23335" 2.92 - height="194.79402" 2.93 - width="457.79532" 2.94 - id="rect18326-3" 2.95 - 
style="opacity:0.3142857;fill:none;stroke:#000000;stroke-width:0.09999995;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:0.3999998, 0.3999998;stroke-dashoffset:0.14999992" /> 2.96 - <path 2.97 - sodipodi:nodetypes="cc" 2.98 - id="path3602" 2.99 - d="m 261.4119,552.41344 c 26.46687,13.70623 46.02474,24.6895 46.02474,24.6895" 2.100 - style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:0.5, 0.5;stroke-dashoffset:0;marker-end:url(#Arrow2Mend-8)" 2.101 - inkscape:connector-curvature="0" /> 2.102 - <path 2.103 - sodipodi:nodetypes="cc" 2.104 - id="path3655" 2.105 - d="M 318.49369,577.76803 C 357.52667,563.08168 386.37044,551.313 386.37044,551.313" 2.106 - style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:0.49999999, 0.49999999;stroke-dashoffset:0;marker-end:url(#Arrow2Mend-8)" 2.107 - inkscape:connector-curvature="0" /> 2.108 - <g 2.109 - id="g3242" 2.110 - transform="translate(20,-81.133751)"> 2.111 - <rect 2.112 - y="309.504" 2.113 - x="532.18225" 2.114 - height="68.687347" 2.115 - width="78.674728" 2.116 - id="rect4725" 2.117 - style="fill:none;stroke:#000000;stroke-width:1.08018124;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" /> 2.118 - <text 2.119 - xml:space="preserve" 2.120 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.121 - x="571.67981" 2.122 - y="322.80692" 2.123 - id="text4727" 2.124 - sodipodi:linespacing="100%"><tspan 2.125 - style="font-size:9px;text-align:center;text-anchor:middle" 2.126 - sodipodi:role="line" 2.127 - x="571.67981" 2.128 - y="322.80692" 2.129 - id="tspan4731">slaveVP</tspan></text> 
2.130 - <g 2.131 - transform="translate(230.36868,-317.52206)" 2.132 - id="g4733"> 2.133 - <rect 2.134 - y="673.02618" 2.135 - x="316.63855" 2.136 - height="16.01606" 2.137 - width="47.791374" 2.138 - id="rect4737" 2.139 - style="fill:none;stroke:#0303b8;stroke-width:1.08018124;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:4.320725, 4.320725;stroke-dashoffset:0" /> 2.140 - <text 2.141 - xml:space="preserve" 2.142 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.143 - x="340.63751" 2.144 - y="684.03717" 2.145 - id="text4739" 2.146 - sodipodi:linespacing="100%"><tspan 2.147 - style="font-size:9px;text-align:center;text-anchor:middle" 2.148 - id="tspan4741" 2.149 - sodipodi:role="line" 2.150 - x="340.63751" 2.151 - y="684.03717">top_VP_fn</tspan></text> 2.152 - </g> 2.153 - </g> 2.154 - <g 2.155 - id="g5001" 2.156 - transform="translate(-104.17459,-455.52206)"> 2.157 - <text 2.158 - sodipodi:linespacing="100%" 2.159 - id="text4834" 2.160 - y="619.79431" 2.161 - x="339.56277" 2.162 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.163 - xml:space="preserve"><tspan 2.164 - y="619.79431" 2.165 - x="339.56277" 2.166 - sodipodi:role="line" 2.167 - id="tspan4836" 2.168 - style="font-size:9px;text-align:center;text-anchor:middle">Shared Parallelism-Semantic State</tspan><tspan 2.169 - id="tspan4838" 2.170 - y="628.79431" 2.171 - x="339.56277" 2.172 - sodipodi:role="line" 2.173 - style="font-size:9px;text-align:center;text-anchor:middle" /></text> 2.174 - <rect 2.175 - 
style="fill:none;stroke:#960a0a;stroke-width:1.07999992;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:4.32000008, 4.32000008;stroke-dashoffset:0" 2.176 - id="rect4840" 2.177 - width="149.25206" 2.178 - height="12.695431" 2.179 - x="264.97189" 2.180 - y="610.18365" /> 2.181 - </g> 2.182 - <path 2.183 - sodipodi:nodetypes="cc" 2.184 - id="path4883" 2.185 - d="m 205.94834,169.27344 c -0.0417,34.47629 -0.0725,62.1034 -0.0725,62.1034" 2.186 - style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:0.49999999, 0.49999999;stroke-dashoffset:0;marker-end:url(#Arrow2Mend-8)" 2.187 - inkscape:connector-curvature="0" /> 2.188 - <path 2.189 - style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:0.49999999, 0.49999999;stroke-dashoffset:0;marker-end:url(#Arrow2Mend-8)" 2.190 - d="m 193.94834,231.61326 c -0.0417,-34.47628 -0.0725,-62.10333 -0.0725,-62.10333" 2.191 - id="path4885" 2.192 - sodipodi:nodetypes="cc" 2.193 - inkscape:connector-curvature="0" /> 2.194 - <path 2.195 - style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker-end:url(#Arrow2Mend-8)" 2.196 - d="m 551.6841,281.79676 c -38.53453,0.002 -67.00988,0.004 -67.00988,0.004" 2.197 - id="path4902" 2.198 - sodipodi:nodetypes="cc" 2.199 - inkscape:connector-curvature="0" /> 2.200 - <path 2.201 - sodipodi:nodetypes="cc" 2.202 - id="path4904" 2.203 - d="m 483.44108,257.79676 c 38.5498,0.002 67.03641,0.004 67.03641,0.004" 2.204 - style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker-end:url(#Arrow2Mend-8)" 2.205 - inkscape:connector-curvature="0" /> 2.206 - <text 2.207 - 
xml:space="preserve" 2.208 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.209 - x="528.8573" 2.210 - y="255.07925" 2.211 - id="text4721-1" 2.212 - sodipodi:linespacing="100%"><tspan 2.213 - style="font-size:8px;text-align:center;text-anchor:middle;fill:#159415;fill-opacity:1" 2.214 - id="tspan4723-9" 2.215 - sodipodi:role="line" 2.216 - x="528.8573" 2.217 - y="255.07925">Switch VPs</tspan></text> 2.218 - <text 2.219 - sodipodi:linespacing="100%" 2.220 - id="text4931" 2.221 - y="279.07925" 2.222 - x="528.8573" 2.223 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.224 - xml:space="preserve"><tspan 2.225 - y="279.07925" 2.226 - x="528.8573" 2.227 - sodipodi:role="line" 2.228 - id="tspan4933" 2.229 - style="font-size:8px;text-align:center;text-anchor:middle;fill:#159415;fill-opacity:1">Switch VPs</tspan></text> 2.230 - <text 2.231 - xml:space="preserve" 2.232 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.233 - x="422.57816" 2.234 - y="193.16141" 2.235 - id="text4981" 2.236 - sodipodi:linespacing="100%"><tspan 2.237 - style="font-size:9px;text-align:center;text-anchor:middle" 2.238 - id="tspan4983" 2.239 - sodipodi:role="line" 2.240 - x="422.57816" 2.241 - y="193.16141">Repeated for each physical core</tspan></text> 2.242 - <g 2.243 - id="g3590" 2.244 - 
transform="translate(-104.75363,-387.52206)"> 2.245 - <g 2.246 - id="g18019-8" 2.247 - transform="translate(-109.01365,39.321571)"> 2.248 - <text 2.249 - sodipodi:linespacing="100%" 2.250 - id="text17967-5" 2.251 - y="590.22229" 2.252 - x="419.38776" 2.253 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.254 - xml:space="preserve"><tspan 2.255 - y="590.22229" 2.256 - x="419.38776" 2.257 - sodipodi:role="line" 2.258 - id="tspan17969-7" 2.259 - style="font-size:9px;text-align:center;text-anchor:middle">comm_</tspan><tspan 2.260 - id="tspan17971-6" 2.261 - y="599.22229" 2.262 - x="419.38776" 2.263 - sodipodi:role="line" 2.264 - style="font-size:9px;text-align:center;text-anchor:middle">handler_fn</tspan></text> 2.265 - <rect 2.266 - style="fill:none;stroke:#960a0a;stroke-width:1.08000004;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:4.32, 4.32;stroke-dashoffset:0" 2.267 - id="rect17973-1" 2.268 - width="48.927513" 2.269 - height="22.365267" 2.270 - x="395.11298" 2.271 - y="580.86206" /> 2.272 - </g> 2.273 - </g> 2.274 <text 2.275 sodipodi:linespacing="100%" 2.276 id="text17977-3-5" 2.277 @@ -627,338 +405,171 @@ 2.278 sodipodi:role="line" 2.279 id="tspan17979-3-4" 2.280 style="font-size:9px;text-align:center;text-anchor:middle" /></text> 2.281 - <g 2.282 - id="g3580" 2.283 - transform="translate(-103.63132,-399.52206)"> 2.284 - <path 2.285 - inkscape:connector-curvature="0" 2.286 - style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:0.5, 0.5;stroke-dashoffset:0;marker-end:url(#Arrow2Mend-8)" 2.287 - d="m 317.62527,671.29009 c -5.09448,-8.49068 -8.85909,-15.29455 -8.85909,-15.29455" 2.288 - id="path18292-2" 2.289 - 
sodipodi:nodetypes="cc" /> 2.290 - <g 2.291 - id="g17983-3" 2.292 - transform="translate(-7.0136518,5.3215684)"> 2.293 - <rect 2.294 - y="667.70459" 2.295 - x="317.60855" 2.296 - height="16.01606" 2.297 - width="61.691765" 2.298 - id="rect17975-2" 2.299 - style="fill:none;stroke:#159415;stroke-width:1.08018124;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:4.32072499, 4.32072499;stroke-dashoffset:0" /> 2.300 - <text 2.301 - xml:space="preserve" 2.302 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.303 - x="347.65115" 2.304 - y="678.71558" 2.305 - id="text17977-2" 2.306 - sodipodi:linespacing="100%"><tspan 2.307 - style="font-size:9px;text-align:center;text-anchor:middle" 2.308 - id="tspan17979-1" 2.309 - sodipodi:role="line" 2.310 - x="347.65115" 2.311 - y="678.71558">master_loop</tspan></text> 2.312 - </g> 2.313 - <path 2.314 - inkscape:connector-curvature="0" 2.315 - sodipodi:nodetypes="cc" 2.316 - id="path3578" 2.317 - d="m 374.32675,655.55365 c -5.09448,8.49068 -8.85909,15.29455 -8.85909,15.29455" 2.318 - style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:0.5, 0.5;stroke-dashoffset:0;marker-end:url(#Arrow2Mend-8)" /> 2.319 - </g> 2.320 - <text 2.321 - sodipodi:linespacing="100%" 2.322 - id="text4821" 2.323 - y="218.02182" 2.324 - x="269.62048" 2.325 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.326 - xml:space="preserve"><tspan 2.327 - y="218.02182" 2.328 - x="269.62048" 2.329 - sodipodi:role="line" 
2.330 - id="tspan4823" 2.331 - style="font-size:9px;text-align:center;text-anchor:middle">readyQ</tspan><tspan 2.332 - id="tspan4825" 2.333 - y="227.02182" 2.334 - x="269.62048" 2.335 - sodipodi:role="line" 2.336 - style="font-size:9px;text-align:center;text-anchor:middle" /></text> 2.337 <rect 2.338 - style="fill:none;stroke:#960a0a;stroke-width:1.08000004;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:4.32000001, 4.32000001;stroke-dashoffset:0" 2.339 - id="rect4827" 2.340 - width="48.927513" 2.341 - height="12.695431" 2.342 - x="245.3457" 2.343 - y="208.66159" /> 2.344 - <path 2.345 - inkscape:connector-curvature="0" 2.346 - style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:0.49999999, 0.49999999;stroke-dashoffset:0;marker-end:url(#Arrow2Mend-8)" 2.347 - d="m 227.16268,231.46504 c 9.78768,-7.86986 17.02037,-14.17623 17.02037,-14.17623" 2.348 - id="path3576-2" 2.349 - sodipodi:nodetypes="cc" /> 2.350 - <text 2.351 - sodipodi:linespacing="100%" 2.352 - id="text3915" 2.353 - y="268.80692" 2.354 - x="201.94568" 2.355 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.356 - xml:space="preserve"><tspan 2.357 - y="268.80692" 2.358 - x="201.94568" 2.359 - sodipodi:role="line" 2.360 - id="tspan3917" 2.361 - style="font-size:9px;text-align:center;text-anchor:middle">2</tspan></text> 2.362 + y="248.90544" 2.363 + x="12.182251" 2.364 + height="23.594439" 2.365 + width="13.322688" 2.366 + id="rect3143" 2.367 + style="fill:#0000ff;stroke:#000000;stroke-width:1.08018123999999990;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" /> 2.368 <text 2.369 xml:space="preserve" 2.370 
style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.371 - x="199.94568" 2.372 - y="214.80692" 2.373 - id="text3919" 2.374 + x="30.146273" 2.375 + y="321.8623" 2.376 + id="text3145" 2.377 sodipodi:linespacing="100%"><tspan 2.378 style="font-size:9px;text-align:center;text-anchor:middle" 2.379 - id="tspan3921" 2.380 sodipodi:role="line" 2.381 - x="199.94568" 2.382 - y="214.80692">3</tspan></text> 2.383 + x="30.146273" 2.384 + y="321.8623" 2.385 + id="tspan3147" /></text> 2.386 + <rect 2.387 + style="fill:#999999;stroke:#000000;stroke-width:1.08018123999999990;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 2.388 + id="rect3164" 2.389 + width="13.322688" 2.390 + height="10.524032" 2.391 + x="32.182251" 2.392 + y="228.37024" /> 2.393 + <rect 2.394 + y="228.37024" 2.395 + x="52.182251" 2.396 + height="5.949388" 2.397 + width="13.322688" 2.398 + id="rect3166" 2.399 + style="fill:#999999;stroke:#000000;stroke-width:1.08018123999999990;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" /> 2.400 <text 2.401 sodipodi:linespacing="100%" 2.402 - id="text3923" 2.403 - y="222.80692" 2.404 - x="231.94568" 2.405 + id="text3168" 2.406 + y="21.569902" 2.407 + x="-212.36188" 2.408 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.409 - xml:space="preserve"><tspan 2.410 - y="222.80692" 2.411 - x="231.94568" 2.412 + xml:space="preserve" 2.413 + transform="matrix(0,-1,1,0,0,0)"><tspan 2.414 + id="tspan3170" 2.415 + y="21.569902" 2.416 + x="-212.36188" 2.417 sodipodi:role="line" 
2.418 - id="tspan3925" 2.419 - style="font-size:9px;text-align:center;text-anchor:middle">4</tspan></text> 2.420 + style="font-size:9px;text-align:center;text-anchor:middle">core 0</tspan></text> 2.421 + <text 2.422 + transform="matrix(0,-1,1,0,0,0)" 2.423 + xml:space="preserve" 2.424 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.425 + x="-212.36188" 2.426 + y="41.569904" 2.427 + id="text3176" 2.428 + sodipodi:linespacing="100%"><tspan 2.429 + style="font-size:9px;text-align:center;text-anchor:middle" 2.430 + sodipodi:role="line" 2.431 + x="-212.36188" 2.432 + y="41.569904" 2.433 + id="tspan3178">core 1</tspan></text> 2.434 <text 2.435 sodipodi:linespacing="100%" 2.436 - id="text3931" 2.437 - y="264.80692" 2.438 - x="261.94568" 2.439 + id="text3180" 2.440 + y="61.569904" 2.441 + x="-212.36188" 2.442 style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.443 - xml:space="preserve"><tspan 2.444 - y="264.80692" 2.445 - x="261.94568" 2.446 + xml:space="preserve" 2.447 + transform="matrix(0,-1,1,0,0,0)"><tspan 2.448 + id="tspan3182" 2.449 + y="61.569904" 2.450 + x="-212.36188" 2.451 sodipodi:role="line" 2.452 - id="tspan3933" 2.453 - style="font-size:9px;text-align:center;text-anchor:middle">5</tspan></text> 2.454 - <text 2.455 - xml:space="preserve" 2.456 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet 
MS;-inkscape-font-specification:Trebuchet MS" 2.457 - x="527.94568" 2.458 - y="246.80692" 2.459 - id="text3951" 2.460 - sodipodi:linespacing="100%"><tspan 2.461 - style="font-size:9px;text-align:center;text-anchor:middle" 2.462 - id="tspan3953" 2.463 - sodipodi:role="line" 2.464 - x="527.94568" 2.465 - y="246.80692">9</tspan></text> 2.466 - <text 2.467 - sodipodi:linespacing="100%" 2.468 - id="text3955" 2.469 - y="270.80692" 2.470 - x="527.94568" 2.471 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.472 - xml:space="preserve"><tspan 2.473 - y="270.80692" 2.474 - x="527.94568" 2.475 - sodipodi:role="line" 2.476 - id="tspan3957" 2.477 - style="font-size:9px;text-align:center;text-anchor:middle">10</tspan></text> 2.478 - <text 2.479 - sodipodi:linespacing="100%" 2.480 - id="text3494" 2.481 - y="324.80692" 2.482 - x="243.94568" 2.483 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.484 - xml:space="preserve"><tspan 2.485 - y="324.80692" 2.486 - x="243.94568" 2.487 - sodipodi:role="line" 2.488 - id="tspan3496" 2.489 - style="font-size:9px;font-style:oblique;text-align:center;text-anchor:middle;-inkscape-font-specification:Trebuchet MS Oblique">(Animated)</tspan></text> 2.490 - <text 2.491 - xml:space="preserve" 2.492 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.493 - 
x="331.94568" 2.494 - y="162.80692" 2.495 - id="text3498" 2.496 - sodipodi:linespacing="100%"><tspan 2.497 - style="font-size:9px;font-style:oblique;text-align:center;text-anchor:middle;-inkscape-font-specification:Trebuchet MS Oblique" 2.498 - id="tspan3500" 2.499 - sodipodi:role="line" 2.500 - x="331.94568" 2.501 - y="162.80692">(Blocked)</tspan></text> 2.502 - <text 2.503 - sodipodi:linespacing="100%" 2.504 - id="text3502" 2.505 - y="216.80692" 2.506 - x="403.94568" 2.507 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.508 - xml:space="preserve"><tspan 2.509 - y="216.80692" 2.510 - x="403.94568" 2.511 - sodipodi:role="line" 2.512 - id="tspan3504" 2.513 - style="font-size:9px;font-style:oblique;text-align:center;text-anchor:middle;-inkscape-font-specification:Trebuchet MS Oblique">(Ready)</tspan></text> 2.514 - <text 2.515 - xml:space="preserve" 2.516 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.517 - x="329.62048" 2.518 - y="398.02182" 2.519 - id="text9820" 2.520 - sodipodi:linespacing="100%"><tspan 2.521 - style="font-size:9px;text-align:start;text-anchor:start" 2.522 - id="tspan9822" 2.523 - sodipodi:role="line" 2.524 - x="329.62048" 2.525 - y="398.02182">Receive_type and Send_type: which ever is last to arrive between the send and the receive,</tspan><tspan 2.526 - style="font-size:9px;text-align:start;text-anchor:start" 2.527 - sodipodi:role="line" 2.528 - x="329.62048" 2.529 - y="407.02182" 2.530 - id="tspan9862"> at that point record the unit-ID for sending unit and receiving. 
This is part of an NtoN </tspan><tspan 2.531 - style="font-size:9px;text-align:start;text-anchor:start" 2.532 - sodipodi:role="line" 2.533 - x="329.62048" 2.534 - y="416.02182" 2.535 - id="tspan9870">constraint, so add sender to senders part of NtoN group and receiver to receivers part</tspan><tspan 2.536 - style="font-size:9px;text-align:start;text-anchor:start" 2.537 - sodipodi:role="line" 2.538 - x="329.62048" 2.539 - y="425.02182" 2.540 - id="tspan9824" /></text> 2.541 - <text 2.542 - sodipodi:linespacing="100%" 2.543 - id="text9826" 2.544 - y="438.02182" 2.545 - x="329.62048" 2.546 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.547 - xml:space="preserve"><tspan 2.548 - y="438.02182" 2.549 - x="329.62048" 2.550 - sodipodi:role="line" 2.551 - id="tspan9828" 2.552 - style="font-size:9px;text-align:start;text-anchor:start">send_from_to and receive_from_to -- whichever arrives last between sending VP and receiving VP,</tspan><tspan 2.553 - y="447.02182" 2.554 - x="329.62048" 2.555 - sodipodi:role="line" 2.556 - style="font-size:9px;text-align:start;text-anchor:start" 2.557 - id="tspan9864"> record the unit-ID for sending unit and receiving unit. 
</tspan><tspan 2.558 - y="456.02182" 2.559 - x="329.62048" 2.560 - sodipodi:role="line" 2.561 - style="font-size:9px;text-align:start;text-anchor:start" 2.562 - id="tspan9876">Add as a dependency between send *unit* and receive *unit* -- static comm dependency</tspan><tspan 2.563 - id="tspan9830" 2.564 - y="465.02182" 2.565 - x="329.62048" 2.566 - sodipodi:role="line" 2.567 - style="font-size:9px;text-align:start;text-anchor:start" /></text> 2.568 - <text 2.569 - xml:space="preserve" 2.570 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.571 - x="329.62048" 2.572 - y="478.02182" 2.573 - id="text9832" 2.574 - sodipodi:linespacing="100%"><tspan 2.575 - style="font-size:9px;text-align:start;text-anchor:start" 2.576 - id="tspan9834" 2.577 - sodipodi:role="line" 2.578 - x="329.62048" 2.579 - y="478.02182">At point switch to slave, record, at point switch back, record point </tspan><tspan 2.580 - style="font-size:9px;text-align:start;text-anchor:start" 2.581 - sodipodi:role="line" 2.582 - x="329.62048" 2.583 - y="487.02182" 2.584 - id="tspan9878">-] That is the work time of the unit</tspan><tspan 2.585 - style="font-size:9px;text-align:start;text-anchor:start" 2.586 - sodipodi:role="line" 2.587 - x="329.62048" 2.588 - y="496.02182" 2.589 - id="tspan9836" /></text> 2.590 - <text 2.591 - sodipodi:linespacing="100%" 2.592 - id="text9838" 2.593 - y="498.02185" 2.594 - x="329.62048" 2.595 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.596 - xml:space="preserve"><tspan 2.597 - y="498.02185" 2.598 - x="329.62048" 2.599 
- sodipodi:role="line" 2.600 - id="tspan9840" 2.601 - style="font-size:9px;text-align:start;text-anchor:start">ctlr -- start of getting lock</tspan><tspan 2.602 - id="tspan9842" 2.603 - y="507.02185" 2.604 - x="329.62048" 2.605 - sodipodi:role="line" 2.606 - style="font-size:9px;text-align:start;text-anchor:start" /></text> 2.607 - <text 2.608 - xml:space="preserve" 2.609 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.610 - x="329.62048" 2.611 - y="518.02185" 2.612 - id="text9844" 2.613 - sodipodi:linespacing="100%"><tspan 2.614 - style="font-size:9px;text-align:start;text-anchor:start" 2.615 - id="tspan9846" 2.616 - sodipodi:role="line" 2.617 - x="329.62048" 2.618 - y="518.02185">master loop -- start and end of assigner, start and end of req hdlr</tspan><tspan 2.619 - style="font-size:9px;text-align:start;text-anchor:start" 2.620 - sodipodi:role="line" 2.621 - x="329.62048" 2.622 - y="527.02185" 2.623 - id="tspan9848" /></text> 2.624 - <text 2.625 - sodipodi:linespacing="100%" 2.626 - id="text9850" 2.627 - y="378.02182" 2.628 - x="329.62048" 2.629 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.630 - xml:space="preserve"><tspan 2.631 - y="378.02182" 2.632 - x="329.62048" 2.633 - sodipodi:role="line" 2.634 - id="tspan9852" 2.635 - style="font-size:9px;text-align:start;text-anchor:start">ResumeVP -- create unit & control dep from prev VP</tspan><tspan 2.636 - id="tspan9854" 2.637 - y="387.02182" 2.638 - x="329.62048" 2.639 - sodipodi:role="line" 2.640 - 
style="font-size:9px;text-align:start;text-anchor:start" /></text> 2.641 - <text 2.642 - xml:space="preserve" 2.643 - style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.644 - x="29.620483" 2.645 - y="398.02182" 2.646 - id="text9856" 2.647 - sodipodi:linespacing="100%"><tspan 2.648 - style="font-size:9px;text-align:start;text-anchor:start" 2.649 - id="tspan9858" 2.650 - sodipodi:role="line" 2.651 - x="29.620483" 2.652 - y="398.02182">handleCreate -- dep from creator to first task in new VP</tspan><tspan 2.653 - style="font-size:9px;text-align:start;text-anchor:start" 2.654 - sodipodi:role="line" 2.655 - x="29.620483" 2.656 - y="407.02182" 2.657 - id="tspan9860" /></text> 2.658 + style="font-size:9px;text-align:center;text-anchor:middle">core 2</tspan></text> 2.659 <path 2.660 inkscape:connector-curvature="0" 2.661 sodipodi:nodetypes="cc" 2.662 - id="path3139" 2.663 - d="m 23.849847,481.79676 c 38.53453,0.002 67.00988,0.004 67.00988,0.004" 2.664 + id="path3184" 2.665 + d="m -2.8805704,227.87366 c 0.002,38.53453 0.004,67.00988 0.004,67.00988" 2.666 style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker-end:url(#Arrow2Mend-8)" /> 2.667 + <text 2.668 + transform="matrix(0,-1,1,0,0,0)" 2.669 + xml:space="preserve" 2.670 + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS" 2.671 + x="-212.36188" 2.672 + y="0.223423" 2.673 + id="text3186" 2.674 + sodipodi:linespacing="100%"><tspan 2.675 + 
style="font-size:9px;text-align:center;text-anchor:middle" 2.676 + sodipodi:role="line" 2.677 + x="-212.36188" 2.678 + y="0.223423" 2.679 + id="tspan3188">time</tspan></text> 2.680 + <rect 2.681 + style="fill:#c83771;stroke:#000000;stroke-width:1.08018124;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 2.682 + id="rect3190" 2.683 + width="13.322688" 2.684 + height="25.555" 2.685 + x="52.182251" 2.686 + y="282.37024" /> 2.687 + <rect 2.688 + style="fill:#ff00cc;stroke:#000000;stroke-width:1.08018123999999990;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 2.689 + id="rect3192" 2.690 + width="13.322688" 2.691 + height="18.366276" 2.692 + x="12.182251" 2.693 + y="286.37024" /> 2.694 + <rect 2.695 + y="239.02373" 2.696 + x="32.182251" 2.697 + height="32.743729" 2.698 + width="13.322688" 2.699 + id="rect4264" 2.700 + style="fill:#d40000;stroke:#000000;stroke-width:1.08018123999999990;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" /> 2.701 + <rect 2.702 + style="fill:#800080;stroke:#000000;stroke-width:1.08018123999999990;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 2.703 + id="rect4266" 2.704 + width="13.322688" 2.705 + height="19.673317" 2.706 + x="52.182251" 2.707 + y="234.37024" /> 2.708 + <rect 2.709 + style="fill:#999999;stroke:#000000;stroke-width:1.08018124;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 2.710 + id="rect4268" 2.711 + width="13.322688" 2.712 + height="5.949388" 2.713 + x="52.182251" 2.714 + y="276.37024" /> 2.715 + <rect 2.716 + y="280.37024" 2.717 + x="12.182251" 2.718 + height="5.949388" 2.719 + width="13.322688" 2.720 + id="rect4270" 2.721 + style="fill:#999999;stroke:#000000;stroke-width:1.08018123999999990;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" /> 2.722 + <rect 2.723 + 
style="fill:#999999;stroke:#000000;stroke-width:1.08018123999999990;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 2.724 + id="rect4272" 2.725 + width="13.322688" 2.726 + height="15.098674" 2.727 + x="12.182251" 2.728 + y="234.37024" /> 2.729 + <rect 2.730 + y="272.37024" 2.731 + x="32.182251" 2.732 + height="5.949388" 2.733 + width="13.322688" 2.734 + id="rect4274" 2.735 + style="fill:#999999;stroke:#000000;stroke-width:1.08018123999999990;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" /> 2.736 + <rect 2.737 + style="fill:#cc00ff;stroke:#000000;stroke-width:1.08018123999999990;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" 2.738 + id="rect4276" 2.739 + width="13.322688" 2.740 + height="27.515566" 2.741 + x="32.182251" 2.742 + y="278.88849" /> 2.743 <path 2.744 - style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker-end:url(#Arrow2Mend-8)" 2.745 - d="m 17.77295,475.70791 c 0.002,-38.53453 0.004,-67.00988 0.004,-67.00988" 2.746 - id="path3141" 2.747 + inkscape:connector-curvature="0" 2.748 + style="fill:#008000;stroke:#008000;stroke-width:1.6;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker-end:none" 2.749 + d="M 58.375661,254.09357 C 35.840498,268.46406 19.187989,279.97962 19.187989,279.97962" 2.750 + id="path4278" 2.751 + sodipodi:nodetypes="cc" /> 2.752 + <path 2.753 sodipodi:nodetypes="cc" 2.754 + id="path3602" 2.755 + d="m 18.781626,272.62621 c 22.916388,1.81946 39.850606,3.27746 39.850606,3.27746" 2.756 + style="fill:#d45500;stroke:#d45500;stroke-width:1.6;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker-end:none" 2.757 inkscape:connector-curvature="0" /> 2.758 </g> 2.759 </svg>
3.1 --- a/0__Papers/Holistic_Model/Perf_Tune/latex/Holistic_Perf_Tuning.tex Tue Jun 19 16:32:07 2012 -0700 3.2 +++ b/0__Papers/Holistic_Model/Perf_Tune/latex/Holistic_Perf_Tuning.tex Wed Jun 20 15:17:14 2012 -0700 3.3 @@ -333,14 +333,13 @@ 3.4 3.5 The connections made within the views enable more quickly generating the correct hypotheses for the cause of performance losses. The views connect units to the specific segments of code that compose the units, and connect each constraint on scheduling choices to the precise source of the constraint, within the code, hardware, or runtime. 3.6 3.7 -Computation-core usage is displayed in categories: application work, idleness due to non-overlapped communication (which results from scheduling decisions), and scheduling/runtime overhead. 3.8 -In addition, the influence of parameters that affect units and constraints can be deduced. 3.9 -Visual features indicate causal connections between activities, allowing root causes to be traced, and then linked to application code. 3.10 + 3.11 + 3.12 3.13 In the following, we will describe our model of computation, which drives the instrumentation and visualization, and links the aspects of performance to the collected information. 3.14 The usage of the model and visualization is illustrated with a story line of performance tuning a standard parallel application on a large multi-core system. 3.15 3.16 -We start with background on performance tuning and an overview of previous approaches in section \ref{sec:related}. We shall introduce our model through a case study in section \ref{sec:casestudy}, and then expand on the theory behind it in section \ref{sec:theory}. Section \ref{sec:Implementation} will tie the model to implementation details. Finally, we will conclude in section \ref{sec:conclusion}. 3.17 +We start with a refresher on performance tuning and an overview of previous approaches in section \ref{sec:related}.
We shall illustrate usage of our visualizations through a case study in section \ref{sec:casestudy}, and then expand on the model behind it in section \ref{sec:theory}. Section \ref{sec:Implementation} will tie the model to implementation details. Finally, we will conclude in section \ref{sec:conclusion}. 3.18 3.19 3.20 3.21 @@ -352,19 +351,19 @@ 3.22 3.23 A quick review of the process of performance tuning will provide much needed context for the shortcomings of other tools. 3.24 3.25 - Performance tuning is an iterative process that involves a mental model. The programmer usually has some expectation of the performance of her program, and takes measurements during execution that are then compared to the desired outcome. A mental model, constructed through experience and knowledge of the mechanics of execution, is then used to generate a hypothesis explaining any discrepancies between the measurement and expectations. This hypothesis is then linked, again through a mental model, to things within the programmer's control, to suggest a change to make to the code. The modified code is then run again, and these steps are repeated until the programmer is satisfied with the performance of the program. 3.26 + Performance tuning is an iterative process that involves a mental model. The programmer takes measurements during execution that are then compared to the desired outcome. A mental model, constructed through experience and knowledge of the mechanics of execution, is used to generate a hypothesis explaining any discrepancies between the measurement and expectations. This hypothesis is then linked, again through a mental model, to things within the programmer's control, to suggest a change to make to the code. The modified code is then run again, and these steps are repeated until the programmer is satisfied with the performance of the program. 3.27 3.28 -For parallel programs, context for measurements must be gathered that relate them to the mental model.
These include identification of the units of work that are scheduled onto resources, as well as the constraints on scheduling those. It is also crucial to record during the run not only measurements of when the hardware was busy doing what, but also the scheduling decisions that affected this hardware usage. 3.29 +For parallel performance tuning, the rest of the paper will support the position that the quantities best to measure are scheduling decisions and the consequent usage of communication hardware and cores. Hence, the mental model should have, as concepts, units of scheduled work, and scheduling decisions made on them, then relate those to consequent hardware behavior. It should also relate all of those to application code, such as code boundaries that identify units, and constraints (dependencies) the application places on scheduling decisions. 3.30 3.31 With this in mind, we evaluate the advantages and shortcomings of five categories of models used for performance tuning. The first four approaches are found, sometimes in combinations, in most classic performance evaluation tools, while the fifth is starting to emerge, enabled by the growing adoption of task-based programming languages in recent years. 3.32 3.33 The commonality among the four classic approaches is either the lack of a model of parallel computation, or a poorly suited one. 3.34 3.35 -The early tools also suffered from applications that effectively implemented the runtime as part of the application code. Both MPI and threads effectively force implementing a runtime system in the application. In such a case, the units of work are implied in the code, and difficult for tools to recognize. Likewise, constraints on scheduling are enforced by the code, but never stated in any explicit form. 3.36 +The early tools also suffered from applications that, effectively, implemented the runtime as part of the application code, which hid information from the tool. 
Both MPI and threads cause the runtime to be embedded into the application. In such a case, the units of work are implied in the code, and difficult for tools to recognize. Likewise, constraints on scheduling are enforced by the code, but never stated in any explicit form. 3.37 3.38 3.39 \subsection{Thread-model based Approaches} 3.40 -Most of the older more established tools come from the threads world, and conceive of the application as a collection of virtual processors that perform actions, but don't include the concept of application-defined tasks nor constraints on them. This makes them unable to directly connect statistics they gather to specific application features. The lack of connection forces the user to guess at what aspect of the code is responsible for observed performance. 3.41 +Most of the older more established tools come from the threads world, and conceive of the application as a collection of virtual processors that perform actions, but don't include the concept of application-defined tasks nor constraints on them. This makes them unable to directly connect statistics they gather to scheduling choices and application features. The lack of connection forces the user to guess at what aspect of the code is responsible for observed performance. 3.42 3.43 For example, Tau [] is representative of the thread-centric approach, and a highly cited, older, system for performance tuning parallel applications. It integrates many data sources, and has rich displays. However it models cores and memories and thread contexts, with actions taken on or by each, with no well defined concept of unit of work. What it is missing is the concept: tasks, constraints on them, and scheduling choices. 3.44 3.45 @@ -372,13 +371,13 @@ 3.46 3.47 Systems that model parallel computation as a collection of events are well represented by Paradyn[], another highly cited classic performance tuning tool. 
Its model of computation is based on events, both the timing of events and counts of events. 3.48 3.49 - It has a system for user-supplied instrumentation to collect event information and it has a hypotheses mechanism that protects the user from having to write custom code to test their hypotheses. However, the hypotheses are in terms of the timing and counts of events, not the parallel computation relevant information of units of scheduled work and the scheduling decisions made on those. 3.50 + It has a system for user-supplied instrumentation to collect event information and it has a hypothesis mechanism that protects the user from having to write custom code to test their hypotheses. However, the hypotheses are in terms of the timing and counts of events, not the parallel computation relevant information of units of scheduled work and the scheduling decisions made on those. 3.51 3.52 \subsection{Message-centric approach} 3.53 3.54 Paragraph also follows an event-based model, but represents the large collection of simpler tools that instrument the MPI or other message-passing library. It shows whether cores are busy, and indicates communication overhead, but lacks an underlying computation model to tie the communication pattern realized to application code features, which are what is under programmer control. It also fails to show runtime overhead, and which portions of idle time are caused by runtime internal constraints. 3.55 3.56 -Paraver and Vampir are just painting tools that take event measurements and paint them on the screen. 3.57 +Note that Paraver and Vampir are just painting tools that take event measurements and paint them on the screen. 3.58 3.59 \subsection{Performance-counter approaches} 3.60 Performance-counter approaches, such as VTune, POPI, and so forth concentrate on identifying hot-spots and potential false-sharing. 
These suffer from a lack of encompassing computation model, leaving the user to guess at what might be the cause of measured numbers. They do a good job of saying that something might be wrong, but a poor job of pointing to what is causing the problem, and hence leave the user baffled as to what to change in their code to get better performance. 3.61 @@ -386,7 +385,7 @@ 3.62 \subsection{Newer approaches} 3.63 3.64 3.65 - These shortcomings are rectified by recent parallel languages such as CnC[] and StarSs[], which cleanly identify tasks and the constraints on them. The people who develop such a language also develop tools to go with it. 3.66 + These shortcomings are rectified by recent parallel languages such as CnC[] and StarSs[], which cleanly identify tasks and the constraints on them. The people who develop such a language usually also develop tools to go with it. 3.67 3.68 The StarSs tool[]\ tries to simplify the view for the user. It doesn't give performance information directly, but instead identifies tasks and tells the user whether it thinks the task size is too small, just right, or too big. Too small has too much runtime overhead, while too big has too few tasks to keep the cores busy. 3.69 3.70 @@ -406,7 +405,7 @@ 3.71 3.72 \subsection{The Application, and Target Hardware} 3.73 3.74 -In our session, we wish to tune a standard program that the reader has likely already experienced attempting to performance tune, and/or knows well. The best example is likely matrix multiply, with which the reader should be familiar, allowing concentration on the tool without distraction about the application. 3.75 +In our session, we wish to tune a standard program that the reader knows well. The best example is likely matrix multiply, with which the reader should be familiar, allowing concentration on the tool without distraction about the application. 3.76 3.77 We run it on a machine with 4 sockets by 10 cores each, for a total of 40 physical cores.
They are Intel WestmereEx cores running at 3.0GHz, with TurboBoost turned off. 3.78 3.79 @@ -416,11 +415,12 @@ 3.80 3.81 It then creates a results VP that receives a partial-result from each piece and accumulates the results. The original divider VP then waits for the results VP to indicate completion, after which the language shuts down. 3.82 3.83 +\subsection{The language} 3.84 The language used is SSR, which is based on rendez-vous style send and receive operations made between virtual processors (VPs). It has commands for creating and destroying VPs, and three kinds of send-receive paired operations. 3.85 3.86 The first, \emph{send\_from\_to} specifies a specific sender and specific receiver. It is used by the results VP to tell the divider VP that the work is complete. The second, \emph{send\_of\_type\_to}, specifies a specific receiver, but the sender is anonymous, which increases flexibility while maintaining some control over scope. This is used by the VPs doing the pieces to send their partial-result to the results processor. The third kind, \emph{send\_of\_type}, only specifies the type, and so acts as a global communication channel; this is not used in our application. 3.87 3.88 - The language also includes a \emph{singleton} construct that designates a piece of code as to be executed only once, which we use to rearrange and copy data to get better cache behavior. A given copy is shared by several virtual processors on different cores, but the copy only needs to be performed once. 3.89 + The language also includes a \emph{singleton} construct that designates a piece of code as to be executed only once, which we use to rearrange and copy data to get better cache behavior. A given copy is shared by several virtual processors on different cores, but the copy is only to be performed once. 3.90 3.91 Miscellaneous performance constructs are also available, such as one to force which core a virtual processor is assigned to. 
We use this in our example program to control scheduling. 3.92 3.93 @@ -432,28 +432,42 @@ 3.94 \label{subsec:visualization_def} 3.95 The first visualization is what we refer to as a scheduling consequence graph (SCG), or just consequence graph (CG). It depicts the scheduling operations performed by the runtime, and the consequent usage of the cores. 3.96 3.97 -The second visualization depicts constraints on those scheduling decisions that come from the application, such as dependencies in the code. These limit the choices the runtime is allowed to make. We call this the Unit \& Constraint Collection, or UCC. 3.98 +The second visualization depicts constraints on those scheduling decisions, which come from the application, such as dependencies in the code. These limit the choices the runtime is allowed to make. We call this the Unit \& Constraint Collection, or UCC. 3.99 3.100 The UCC shows only application-derived information, as opposed to the consequence graph, which combines the \textit{use} of the UCC-depicted constraints with runtime-imposed dependencies and hardware-imposed constraints. Hence, the UCC states the degrees of freedom enabled by the application, while the consequence graph states how those were made use of, by a particular runtime on particular hardware. 3.101 3.102 -Fig X shows a consequence graph (CG), stylized for purposes of explanation. It is composed of a number of columns, one for each core. A column represents time, with early at the top, increasing as one goes down, measured in clock cycles. It is broken into blocks, each representing the time accounted to one work-unit. Each block is further divided into regions, each a different color which indicates the kind of activity the core was engaged in during that region's time-span. 3.103 - 3.104 -The kinds of activities are defined by the computation model that underlies the visualization. The first is the work of a work-unit, represented by a blue-to-red region. 
It includes time stalled due to cache misses. The color indicates intensity of cache misses, with pure red representing at or above the maximum misses per instruction, and pure blue the minimum. The max and min are set in the tool that generates the visualization. 3.105 3.106 - The second kind of activity is runtime overhead, represented by a gray region. This is the overhead spent on that particular work-unit. When desired by the viewer, it is further broken into pieces representing activities inside the runtime. These may include acquisition of a lock on shared semantic state, time spent on constraints determining readiness of the work-unit, on deciding which ready one to assign to which hardware, and time spent switching from virtual processor, to the runtime, and back. In this paper, we show all runtime overhead the same way, however in other circumstances a breakdown can be key to understanding interaction between runtime and application. 3.107 + 3.108 +\begin{figure}[ht] 3.109 + \centering 3.110 + \includegraphics[width = 2in, height = 1.8in]{../figures/SCG_stylized_for_expl.pdf} 3.111 + \caption{Stylized Scheduling Consequence Graph (SCG).} 3.112 + \label{fig:SCG_expl} 3.113 +\end{figure} 3.114 + 3.115 + 3.116 + 3.117 +Fig \ref{fig:SCG_expl} shows a consequence graph, stylized for purposes of explanation. It is composed of a number of columns, one for each core. A column represents time on the core, increasing as one goes down, measured in clock cycles. It is broken into blocks, each representing the time accounted to one work-unit. Each block is further divided into regions, each a different color, which indicates the kind of activity the core was engaged in during that region's time-span. 3.118 + 3.119 +The application code executed within a block is linked to the block. In our tool, the block is labelled with a unique unitID. This ID is then linked to the code executed within that unit. 
In this way, the code of any block can be looked up, along with the parallelism constructs that mark the start and end of the block. 3.120 + 3.121 +The kinds of activities within a block are defined by the computation model that underlies the visualization. The first kind of activity is the actual work, plus waiting for cache misses. It is represented by a blue-to-red region where the color indicates intensity of cache misses, with pure red representing at or above the maximum misses per instruction, and pure blue the minimum (the max and min are set in the tool that generates the visualization). 3.122 + 3.123 + The second kind of activity is runtime overhead, represented by a gray region. This is the overhead spent on that particular work-unit. When desired by the user, it is further broken into pieces representing activities inside the runtime. The options include time spent on: constraints, when determining readiness of the work-unit; deciding which ready unit to assign to which hardware; and time spent switching from virtual processor, to the runtime, and back. In this paper, we show all runtime overhead lumped together, however in other circumstances a breakdown can be key to understanding interaction between runtime and application. 3.124 3.125 3.126 -The other type of visual feature seen in Fig X is lines. Each represents a construct that influenced scheduling. The line depicts two things: a constraint and a decision, both inside the runtime. The constraint was satisfied, which made the decision possible, choosing which core to do the work on. 3.127 +The other type of visual feature seen in Fig \ref{fig:SCG_expl} is lines. Each represents a construct that influenced scheduling, where the color indicates which construct. A line represents two things: a constraint, whose satisfaction made the lower unit ready, and a decision by the runtime to start the lower unit on that core. 
3.128 3.129 -In general, other kinds of lines may also be drawn, representing other kinds of interactions that affect core usage. For example, visualization can be turned on for the internal runtime constraint, that only one core at a time may access shared constraint and scheduling state. This appears as additional lines linking the gray runtime regions of blocks. In this paper, visualization is turned off for such minor interactions. 3.130 +In general, lines may also be drawn, representing other kinds of interactions that affect core usage. For example, our runtime implementation only allows one core at a time to access shared scheduling state. Visualization of this can be turned on, as additional lines linking the gray runtime regions of blocks (visualization of such interactions is turned off in this paper). 3.131 3.132 -Two work-unit (blue-to-red) blocks that go in sequence and have no parallelism construct constraints often have a causal dependency between them, due to the semantics of the base language. These are turned off, but can be checked via the link to the code. 3.133 +Two work-unit blocks that appear in sequence and have no lines drawn to them often have a causal dependency between them, due to the semantics of the base language. Visualization of these dependencies is also turned off, but can be inferred via the link to the code. 3.134 3.135 -Many different orderings could also have been validly chosen. Which scheduler choices are valid is determined by three kinds of constraints: the application code constraints, hardware constraints, and runtime implementation imposed constraints. 3.136 + 3.137 3.138 -Returning to Fig \ref{story}, the lines in red, orange, and green represent application-code constraints that each tie two work-units together. The color represents the parallelism construct that imposed that kind of constraint. 
Red is creation of a virtual processor, green is the many-to-one \texttt{send\_of\_type\_to}, and orange is the singleton construct. For better visibility, only constraints that cross cores are shown. 3.139 3.140 -The columns represent each core, and the vertical axis represents the progress of the program execution, starting from the top. This gives at a glance the total runtime (height), level of parallelism during the run (number of filled columns at a specific height), and processor utilization (blue-covered area vs. white). All figures are at the same scale, so they can be compared directly. 3.141 +Note that many different orderings can be validly chosen. Which scheduler choices are valid is determined by three kinds of constraints: the application code constraints, hardware constraints, and runtime implementation imposed constraints. 3.142 + 3.143 +Also note that because the vertical axis represents the progress of the program execution, the user can see at a glance the total runtime (height), level of parallelism during the run (number of filled columns at a specific height), and processor utilization (blue-covered area vs. white). All figures are at the same scale, so they can be compared directly. 3.144 3.145 3.146 3.147 @@ -510,30 +524,38 @@ 3.148 3.149 3.150 3.151 -After functional debugging, the first run produces the consequence graph seen in Figure \ref{fig:story:a}. The first thing to notice, is that it is slimmer than expected: of the 40 available cores, only 13 are being used. As the application places work on cores explicitly, this must be a bug in the dividing code. A cursory inspection reveals that a closing curly brace in the distribution loop had been misplaced. This may be a very simple bug, but it went unnoticed despite using this application as test program for development of the language runtime, including performance, for several months. 
3.152 +In this subsection, many SCGs are shown, within Fig \ref{story}, that display the measurements collected on various runs during tuning. They are all 40 columns wide, one for each core, and all are on the same scale, so relative height indicates relative execution time. They have lines in red, orange, and green, which represent application-code constructs. Red is creation of a virtual processor, green is the many-to-one \texttt{send\_of\_type\_to}, and orange is the singleton construct. For better visibility, only constraints that cross cores are shown. 3.153 + 3.154 + 3.155 +After functional debugging, the first tuning run produces the consequence graph seen in Figure \ref{fig:story:a}. The first thing to notice, is that it is slimmer than expected: of the 40 available cores, only 13 are being used. As the application places work on cores explicitly, this must be a bug in the dividing code. A cursory inspection reveals that a closing curly brace in the distribution loop had been misplaced. This may be a very simple bug, but it went unnoticed despite using this application as test program for development of the language runtime, including performance, for several months. 3.156 3.157 \subsubsection{Second Run} 3.158 - The second run (Fig \ref{story:b}) already corresponds much more to the expected execution behaviour. However, there remains a noticeable section at the beginning where only 3 cores have work and the other 37 remain idle. 3.159 + After fixing this, the next run (Fig \ref{story:b}) corresponds much more to the expected execution behaviour. However, there remains a noticeable section at the beginning where only 3 cores have work and the other 37 remain idle. 3.160 3.161 -Zooming in on those cores, we see that the task creation VP animates a chain of short tasks (each with a red edge outgoing), and is assigned to core 0. A task is the work in-between scheduling decisions, and creating a task requires switching to the runtime. 
In order to animate the next work creation task, the creator VP has to be chosen again. However, the creation VP makes the work for all the cores, and starts with itself, core 0, while the runtime animates tasks in the order they become ready. So, after the creator VP makes a work task for itself, that task is ready, and the next chained creation task is put into the queue behind it. That means the work task is chosen next, and the creation task gets left in the queue while work is done, during which task creation is suspended (the merits of work stealing or other scheduling strategies are independent from this illustration of how to use this approach to performance tuning). 3.162 +Zooming in on those cores, we see that creation code starts running on core 0, within the creation VP, and then the next block on the core is work! Creation stops, starving the other cores. Looking at the creation code, we see that the creation VP assigns the first work VP to its own core, so that work is now waiting in the queue to execute there. When it creates the second work VP, that creation call switches core 0 to the runtime. When done with creation, it takes the next VP from the queue, which is that waiting work VP. Hence core 0 does the work next instead of continuing with creation (the merits of work stealing or other scheduling strategies are independent from this illustration of how to use this approach to performance tuning). 3.163 3.164 -Two solutions come to mind: distribute work to all other cores first so that they would be busy when the creator VP gets interrupted, or dedicate a core to the creator VP. The first solution has the advantage of preserving performance of the application even when run on a machine with a single-digit number of cores, so we tried it first. 3.165 +The hypothesis was generated by looking at the code linked to each block and noting the visual pattern that creation code stopped running on core 0.
Work code started running instead, and only after it finished did creation code start again. Hence, visual cues led directly to the hypothesis. 3.166 3.167 -\subsubsection{Third run} Distributing work to other cores first gives us Fig \ref{story:c}. The section at the top with many cores idling has disappeared. A small idle period can still be observed between the first and the second set of work tasks, because the work tasks have roughly the same length (some of them are slightly longer because they perform a copy-transpose singleton, and small variations can be caused by cache misses etc.), so the work on core 0 holding up the creator VP, being last to be distributed, is also last to finish. It is also noticeable that in the second set of work units to be distributed, not enough work pieces remain to fill all cores, so that 16 out of 40 remain idle. 3.168 +Two solutions come to mind: assign work to the other cores first, so that they would be busy when the creator VP gets interrupted, or else dedicate a core to the creator VP. The first solution has the advantage of preserving performance of the application even when run on a machine with a single-digit number of cores, so we tried it first. 3.169 3.170 -\subsubsection{Fourth and fifth runs} To try to fill that empty space, we tried to modify the size of the work units. However, as figures \ref{story:d} and \ref{story:e} show, this did not help, as the time spent creating the increased number of units becomes the bottleneck, and the time lost between sets grows larger than the time lost on the cores not receiving any work. 3.171 +\subsubsection{Third run} Assigning work to the other cores first gives us Fig \ref{story:c}. The section that was at the top, with idle cores, has disappeared. A small idle period can still be observed between the first and the second set of work tasks, because the work tasks have roughly the same length and the work on core 0 starts last. 
It thus holds up creation, which re-starts after all the others have finished work (note that work on some cores takes slightly longer because that core performs the copy-transpose singleton, and also variations are caused by cache misses). 3.172 3.173 -\subsubsection{Sixth run} At this point we wanted to try whether taking the road not chosen, dedicating a core to the create VP, would improve performance more. 3.174 + It is also noticeable that in the second set of work units to be distributed, not enough work pieces remain to fill all cores. 16 out of 40 remain idle. 3.175 + 3.176 +\subsubsection{Fourth and fifth runs} To try to fill the empty columns at the end, we modified the size of the work units. However, as figures \ref{story:d} and \ref{story:e} show, this did not help, as the time spent creating the increased number of units becomes the bottleneck, and the time lost between sets grows larger than the time that previously was lost on the cores not receiving work. 3.177 + 3.178 +\subsubsection{Sixth run} At this point we wanted to try the road not chosen, dedicating a core to the creation VP. 3.179 Going back to version b of the code and implementing this solution instead leads to fig. \ref{story:f}. The delay between the two sets has disappeared, leading to a 4\% shorter execution time. 3.180 3.181 -\subsubsection{Seventh and eighth runs}As core 0 is now empty after the creation phase at the beginning, we also moved the receive VP there (fig. \ref{story:g}). This added only a minimal improvement at this size of work unit, but allows overlapping the result collection with other work, which is an advantage when cutting up the work into more pieces, requiring longer collection (fig. \ref{story:h}). 3.182 +\subsubsection{Seventh and eighth runs}As core 0 is now empty after the creation phase at the beginning, we also moved the receive VP there (fig. \ref{story:g}). 
This added only a minimal improvement at this size of work unit, but allows overlapping the result collection with other work, which is an advantage when cutting the work into more pieces, requiring longer collection (fig. \ref{story:h}). 3.183 3.184 -Overall it is also noticeable that as work units become smaller, execution becomes more irregular. Variability in task length is likely due to cache misses or page faults, but for verification of this hypothesis more data would need to be collected (for instance, the ``cycles stalled due to cache misses" counter available on most modern Intel chips could be tracked and displayed for each unit). 3.185 +Overall it is also noticeable that as work units become smaller, execution time becomes more irregular. Variability in work length correlates with the color, indicating cache behavior has worsened with smaller work size. 3.186 + 3.187 +Note that the hypothesis, that cache behavior worsened with smaller work sizes, was generated directly from visual cues. 3.188 3.189 \subsubsection{holes in the core usage} 3.190 3.191 -In Fig X, ``holes'' are noticeable. Inspecting these holes closer, we can see that the stalling tasks are waiting upon the completion of a singleton. However, the operations enclosed in the singleton take only a short time, and start much later than the idle periods. Once again, the in-order animation of queued of tasks is at fault: When the first VP reaches the singleton code portion, it sends a request to acquire the singleton. This request succeeds, but as sending a request results in suspension of the requesting VP, a new task is scheduled first. If this happens to be a long task, the singleton is suspended for a long time, but all other VPs with this singleton have to wait, because it has already been reserved. 
Because several VPs assigned to the same core share the same matrix pieces so as to increase cache locality, this can result in all VPs on a core being stalled, leading to the observed idle times. This is a property of the language runtime, so the application programmer cannot change this, but making the work units smaller helps minimize these effects. 3.192 - 3.193 +In Fig \ref{story:d}, ``holes'' are noticeable. Inspecting these holes closer, we can see that the stalled blocks are at the end of orange lines. This indicates they are waiting upon the completion of a singleton. The pattern of blocks shows that usually the singleton unit runs before the work unit, but in these cases the singleton code was delayed until after the work on that core. This is a runtime implementation fluke. The only thing an application programmer can do is change the work size to minimize the impact. (For those curious, the first VP to reach the singleton is granted control, but a ready work VP lands in the queue during the granting activity, so when the runtime finishes granting, the work VP is next, and the VP that now owns the singleton sits and waits for the work to end. All work VPs on other cores that pass through the same singleton also wait.) 3.194 3.195 3.196 3.197 @@ -543,39 +565,39 @@ 3.198 \section{The Model Behind the Visualization} 3.199 \label{sec:theory} 3.200 3.201 -Now that the usage has been seen, we expand on the model behind the visualizations, which is what ties the information together. Understanding of the model leads to quickly seeing the reason for performance-related patterns in the visualizations. Such understanding generates the hypotheses of the source of performance loss. 3.202 +Now that the usage has been seen, we expand on the model behind the visualizations. The model ties the information together, and understanding it helps in generating hypotheses from the visualization features. 
3.203 3.204 -The model has two parts, a \emph{Unit \&\ Constraint Collection (UCC)}, and a \emph{Scheduling Consequence Graph} (just consequence graph or CG). The UCC indicates the freedom of choice the application allows, encoding what the programmer has control over. The consequence graph says which of those were actually taken during the run and the consequences of that set of choices. 3.205 +As seen, the model has two parts, a \emph{Unit \&\ Constraint Collection (UCC)}, and a \emph{Scheduling Consequence Graph} (just consequence graph or CG). The UCC indicates the scheduling choices the application allows, encoding what the programmer has control over. The consequence graph says which of those were actually taken during the run and the consequences of that set of choices. 3.206 3.207 -We give a more precise description of UCC then consequence graph, in turn. 3.208 +We give a more precise description of UCC, then consequence graph, in turn. 3.209 However, space is too limited for a complete definition, which is given in a companion paper submitted to a longer format venue. 3.210 \subsection{Unit \& Constraint Collection} 3.211 -The UCC contains all units of work that get scheduled and all application-related constraints on scheduling them. That's a nice solid definition, but things aren't quite that simple. The complication is that different classes of application exist, with two degrees of freedom that determine how much of the UCC is actually defined in the application vs the input data, or even the scheduler. 3.212 +The UCC contains all the units of work that get scheduled during a run, and all constraints the application places on scheduling the units. That's a nice solid definition, but things aren't quite that simple. The complication is that different classes of application exist, with two degrees of freedom that determine how much of the UCC is actually defined in the application vs the input data, or even in the runtime. 
3.213 3.214 -Some applications have everything determined in the code, with all units fixed, and all constraints fixed. An example is matrix multiply with fixed size matrices. But for others, the shape of the UCC is only partially defined by the application code. Take matrix multiply when the size is an input parameter, the units in the UCC are different for each parameter value. An extreme example is an NP complete problem, with redividable units, for which the units are a function of both the particular input data \emph{and} decisions made by the scheduler! 3.215 +Some applications have everything determined in the code, with all units fixed, and all constraints fixed. An example is matrix multiply with fixed size matrices. But for others, the shape of the UCC is only partially defined by the application code. Take matrix multiply used in Section \ref{sec:casestudy}, where an input parameter determines the number of units created. Here, the UCC is different for each parameter value. An extreme example is an NP complete problem, with redividable units, for which the units are a function of both the input data \emph{and} decisions made by the runtime. 3.216 3.217 - We call a fully specified UCC a \emph{concrete} UCC. Every run of an application eventually winds up defining a concrete UCC, such as seen back in Fig X. But the amount of UCC made concrete by the application alone falls into a two-dimensional grid. One dimension covers the units, the other the constraints. 3.218 + We call a fully specified UCC a \emph{concrete} UCC. Every run of an application eventually winds up defining a concrete UCC, such as seen back in Fig \ref{fig:UCC_example}. But the amount of UCC made concrete by the application alone falls into a two-dimensional grid. One dimension covers the units, the other the constraints. 
3.219 3.220 3.221 \begin{figure}[ht] 3.222 -% \centering 3.223 + \centering 3.224 \includegraphics[width = 2in, height = 1.8in]{../figures/UCC_concreteness_grid.pdf} 3.225 \caption{Abstract representation of the kinds of UCC possible. The letters A, B, C, D stand for UCCs described in the text.} 3.226 \label{fig:UCC_Concreteness} 3.227 \end{figure} 3.228 3.229 -Figure \ref{fig:UCC_Concreteness} shows the two axes and the four sets of information on each, which act as the inputs that determine the units and constraints. The position a UCC lands on the grid indicates the information still needed in order to make the UCC fully concrete. The horizontal indicates what inputs are still needed to determine the units, and vertical the constraints. 0 indicates that the units (constraints) are fully determined by the application code alone; 1 means parameter values also must be known; 2 means input data values also play a role, and 3 means the units (constraints) can only become known after runtime scheduling decisions have been made. 3.230 +Figure \ref{fig:UCC_Concreteness} shows the two axes and the four sets of information on each, which act as the inputs that determine the units and constraints. The position a UCC lands on the grid indicates how far it is from being fully concrete. The horizontal indicates what inputs are still needed to determine the units, and vertical the constraints. 0 indicates that the units (constraints) are fully determined by the application code alone; 1 means parameter values also must be known; 2 means input data values also play a role, and 3 means the units (constraints) can only become known after runtime scheduling decisions have been made. 3.231 3.232 The closer the application-derived UCC is to the origin, the less additional information it needs to become concrete. The UCC labeled A in the figure is fully concrete just from the source code alone (representing for example, matrix multiply with fixed size matrices). 
The UCC labeled B requires the input data and parameters to be specified before its units are concrete, but just parameters to make its constraints fully concrete (as per ray-tracing, with bounce depth specified as a parameter). The UCC labeled C only has variability in its constraints, which require input data (for example, H.264 motion vectors). 3.233 But even the least concrete UCC, out at the end of the diagonal (D in the figure), becomes concrete during a run of the application. 3.234 3.235 -Notice, though, that a fully concrete UCC still has degrees of freedom in what hardware and order the units are assigned. These decisions interact with the hardware, to yield the communication patterns and consequent performance seen during the run. 3.236 +Notice, though, that a fully concrete UCC still has degrees of freedom: which units are assigned to which hardware, and the order of execution. The decisions fix interactions within the hardware, to yield the communication patterns and consequent performance seen during the run. 3.237 3.238 -An added twist is that an application has a life-line, spanning from code through the run, and its representation may change at the different stages of life. It starts as pristine source, then moves into specialization where code is translated into different representations than the original, and finally the specialized code is run. The UCC often changes between points in the life-line. 3.239 +An added twist is that an application has a life-line, spanning from code all the way through the run, and its representation may change at the different stages of life. It starts as pristine source, then moves into specialization where code is translated into different representations than the original, and finally the specialized code is run. The UCC often changes between these points in the life-line. 3.240 3.241 For example, specialization may perform a static scheduling, which fixes the units, moving the UCC towards the origin. 
Alternatively, the toolchain may inject manipulator code for the runtime to use, which lets it divide units during the run when it needs more. The injection of manipulator code makes the UCC less concrete, moving it further from the origin. 3.242 3.243 -The UCC still tells what is inside the application's control vs under the runtime's control, even for applications that land far out on the diagonal. It thus indicates what can be done statically. The further out on the diagonal a UCC is, the less scheduling can be done statically in the toolchain. 3.244 +The UCC still indicates what is inside the application's control vs under the runtime's control, even for applications that land far out on the diagonal. It thus indicates what can be done statically: the further out on the diagonal a UCC is, the less scheduling can be done statically in the toolchain. 3.245 3.246 In this paper, we do not suggest how to represent UCCs far out on the diagonal. One of those actually indicates a multi-verse of concrete-UCCs. Which of them materializes depends on the data that shows up and what the scheduler does. We only represent the concrete UCC that materializes during a run and leave the question of representing less concrete ones to future work. 3.247 3.248 @@ -588,27 +610,27 @@ 3.249 3.250 To distinguish from the UCC, the consequence graph shows the behavior resulting from scheduling decisions actually \emph{made}, from among those \emph{possible.} The UCC shows just the possibilities. Hence, a consequence graph shows \emph{one} of the possible choice-sets allowed by the UCC. 3.251 3.252 -A consequence graph accounts for each bit of core time. It has boxes and arcs. The boxes each represent one segment of core time, which is counted towards one work-unit, and the collection of boxes counted to the same unit make a node. An arc links two boxes and represents a causality of some kind. 3.253 +A consequence graph accounts for each bit of core time. 
It has boxes and arcs, with the boxes divided into regions. The boxes each represent all core time assigned to one work unit, with each region inside representing a segment of time that the core was engaged in a specific type of activity. An arc links regions (or boxes) and represents a causality of some kind. 3.254 3.255 -There are several kinds of boxes, one for each reason that the core is being used (or being forced idle), and several kinds of arcs, one for each type of causality between boxes. 3.256 +There is one kind of region for each reason that the core is being used (or being forced idle), and several kinds of arcs, one for each type of causality between regions. 3.257 3.258 -The box types are arranged by reason for core usage: application work, waiting for communication of work data, managing constraints, choosing assignment of work onto cores, and runtime internals. The runtime internals have sub-categories but space is limited so we skip those here. 3.259 +The core activities associated with region types are: application work, waiting for communication of work data, managing constraints, choosing assignment of work onto cores, and runtime internals. The runtime internals have sub-categories but space is limited so we skip those here. 3.260 3.261 -The arc types are arranged by source of the causal relationship: control dependency in the base language, parallel constraint that fed a choice in the runtime (IE, the choice ties together two specific work-units so the one completing causes other to start), runtime internal causality such as a global lock or a distributed quorum algorithm whose action creates causal dependencies between boxes that represent execution of internal runtime code, and arcs that represent hardware causal relationships, such as one work-unit finishing on a core causes another work-unit to start there, given the choice by the runtime. The finer details are beyond the scope of this paper. 
3.262 +The arc types, representing the type of causal relationship, are: control dependency in the base language, parallel constraint that had to be satisfied (i.e., one unit did something to satisfy a constraint on the other, causing it to be free to be scheduled), runtime internal causality such as a global lock (runtime on one core releases the lock, causing the other to acquire it), and arcs that represent hardware causal relationships (one work-unit finishing on a core causes another work-unit to start there, given the choice by the runtime). The formal details are given in the longer format companion paper. 3.263 3.264 We will now look at each source of causal relationship. 3.265 3.266 -\paragraph{Scheduling decision causality} Notice that the performance varies between choice-sets. The variations reflect differences in communication time, which result from the placement of work, chosen by the scheduler. For a particular concrete UCC, each set of scheduling choices it allows has a consequent pattern of core usages. A given choice ties the unit that completed as the cause of starting the unit chosen. 3.267 +\paragraph{Constraint causality} There is a constraint causality when two units are involved in a constraint, where action by one unit causes (or contributes to) satisfaction of the constraint blocking the other unit. This includes control dependencies from the base language. 3.268 3.269 -The consequence graph also includes control dependencies from the base language, which may add superfluous constraints that eliminate some otherwise allowed choices in the UCC. An example would be a \texttt{for} loop that creates work-units -- no parallelism constructs cause the creations to be done in sequence, but the base C language sequentializes it nonetheless. 3.270 +Control dependencies may add superfluous constraints that eliminate some otherwise allowed choices in the UCC. 
An example would be a \texttt{for} loop that creates work-units -- no parallelism constructs cause the creations to be done in sequence, but the base C language sequentializes it nonetheless. 3.271 3.272 -\paragraph{Runtime causal dependencies} are introduced by implementation details. For example, the version of VMS we instrumented to take measurements for this paper relies upon a global lock for shared semantic-constraint information. This lock introduces a causal dependency that only allows the runtime to execute on one core at any horizontal line in the consequence graph. 3.273 +\paragraph{Runtime internal causality} Runtime implementation details may introduce ``extra'' causalities between units. For example, the version of VMS we instrumented for this paper runs separately on each core and relies upon a global lock for accessing shared runtime information. This lock introduces a causal relationship when the runtime on one core is attempting to process one unit, but must wait for the runtime on a different core to finish with its unit. 3.274 3.275 - Normally, these are not displayed explicitly, due to clutter, but can be turned on when needed, say, to figure out the cause of a particular pattern of core usage. 3.276 + Normally, these are not displayed explicitly, due to clutter, but can be turned on when needed, say, to determine the cause of a particular pattern of core usage. 3.277 3.278 -\paragraph{Hardware causal dependencies} are due to the physical fact that a given resource can only be used by one work-unit at a time. A (logical) core only executes one work-unit at a time. A communication link is occupied by one packet at a time, and so on. 3.279 +\paragraph{Hardware causality} The physical fact that a given resource can only be used by one work-unit at a time introduces hardware causalities. When multiple units are free to execute, but all cores are busy, then completion of a unit on one core causes (in part) the next ready unit to run on that core. 
3.280 3.281 -These are also not normally displayed, due to clutter, and not all hardware dependencies are directly measured. Future work will focus on using the performance counters and other instrumentation to add more information about communication paths taken as a consequence of the scheduling decisions made. This takes the current linkage of application-code to runtime decisions, and adds consequent communication patterns, which are the primary free variable in resulting performance. This gives an end-to-end linkage between code choices and caused behavior on the hardware for performance. 3.282 +These are also not normally displayed, due to clutter, and not all hardware dependencies are directly measured. Future work will focus on using the performance counters and other instrumentation to add more information about communication paths taken as a consequence of the scheduling decisions made. It will start with the current linkage of application-code to runtime decisions, and add consequent usage of communication hardware. This gives an end-to-end linkage between runtime choices and caused behavior on the hardware. 3.283 3.284 Consequence graph features each tie back to features in the UCC and thence to specific segments of code or constructs. 3.285 3.286 @@ -679,13 +701,13 @@ 3.287 3.288 \subsection{Mapping model onto implementation details in runtime} 3.289 3.290 -The meta-unit and life-line aspects of the computation model map straight-forwardly to the UCC visualization. The constraints in the UCC are those stated in or implied by the application (with the complexities noted in Section \ref{sec:theory}). 3.291 +The meta-unit and unit life-line aspects of the computation model map straight-forwardly to the UCC visualization. The constraints in the UCC are those stated in or implied by the application (with the complexities noted in Section \ref{sec:theory}). 
3.292 3.293 However, the SCG is not a strict expression of the model, rather its purpose is practical. It shows usage of the cores, and relates that to the quantities in the model. Hence, the measurements for the SCG all are boundaries, where the core's time switches from one category in the model to a different one. 3.294 3.295 This differs from the model in subtle ways. Most notably, the model declares segments of time where communications take place, while the SCG doesn't measure the communication time directly, rather it captures idleness of the core caused by the non-overlapped portion of that communication. 3.296 3.297 -This difference stems from the SCG's focus on core usage, and assigning each idle period to a cause. The runtime's choice of units to cores is what determined the source and destination of communications, which caused the idling. Hence, idle periods due to non-overlapped communication are consequences of the assignment choices made by the scheduler. This, by the way, leads to the name: scheduling consequence graph. 3.298 +This difference stems from the SCG's focus on core usage, and assigning each idle period to a cause. The runtime's choice of units to cores is what determined the source and destination of communications, which caused the idling. Hence, idle periods due to non-overlapped communication are consequences of the assignment choices made by the scheduler. This supports the name: scheduling consequence graph. 3.299 3.300 3.301 What must be collected during the run differs between the two types of visualization. For the UCC it is unit boundaries and the constraints related to each unit. For the SCG, the same units must be collected, but also the time a core spends on each segment of the unit's life-line. Also, implementation details of the runtime will cause things such as idling the core during lock acquisition to be counted towards a unit's life segment. What core activities go to which life segments changes from runtime to runtime. 
For example, our implementation includes idle time due to acquiring the lock on shared runtime state as part of the state-update life-line step. 3.302 @@ -732,7 +754,7 @@ 3.303 \item Decision to animate: This is the time spent inside the language-supplied assigner function. 3.304 \item Move meta-unit to core: This is via shared variables, recorded as part of 3. 3.305 \item Move work data to core: This is via cache misses, recorded as part of 6. 3.306 -\item Do the work of the unit: This is measured by instrumenting the VMS switch-to-unit primitive and the corresponding switch-to-runtime primitive. 3.307 +\item Do the work of the unit: This is the cycles between the switch-to-unit and the following switch-to-runtime. 3.308 \item Communicate state update: This is the time between leaving the application code and starting the construct handler (which includes lock acquisition). 3.309 \item Resulting constraint updates: This is the time spent inside the construct handler, and is the same as 2. 3.310 \end{enumerate} 3.311 @@ -742,10 +764,10 @@ 3.312 \begin{itemize} 3.313 \item Construct handler: To measure 2 and 8, reading is done before and after VMS calls the language-supplied construct handler function.\item 3.314 Assigner: To measure 3 and 4, reading is done before and after VMS calls the language-supplied assigner function. 3.315 -\item Work: To measure 5 and 6, reading is done at the point VMS switches to the unit, and the point it switches back into the runtime. 3.316 +\item Work: To measure 5 and 6, reading is done by reading inside the VMS switch-to-unit operation, and the switch-to-runtime operation. 3.317 3.318 \item 3.319 -Dual-use: 1 is measured by using the reads on the create\_VP construct handler. To measure 7, the reading done upon switching into runtime is coupled to the reading done just before starting the construct handler function. 3.320 +Dual-use: 1 is measured by using the construct handler reads for the create\_VP construct handler. 
For 7, the switch-to-runtime read is subtracted from the read at the start of the construct handler function. 3.321 \end{itemize} 3.322 3.323 3.324 @@ -755,14 +777,14 @@ 3.325 3.326 \subsection{Building the Visualizations} 3.327 3.328 -Both the UCC and the SCG are represented as directed graphs, with units as nodes. 3.329 +Both the UCC and the SCG are internally represented as directed graphs, with units as nodes. 3.330 3.331 \subsubsection{UCC} 3.332 -For the UCC, units can be either unweighted or weighted. Weighted units appear as rectangles with height proportional to the weight, unweighted units as circles. We weight the units with the number of instructions in the work. This removes some of the influence of scheduling and data, such as cache misses. 3.333 +For the UCC, units can be either unweighted or weighted. Weighted units appear as rectangles with height proportional to the weight, unweighted units appear as circles. We weight the units with the number of instructions in the work. This removes some of the influence of scheduling and data, such as cache misses. 3.334 3.335 -A critical path algorithm calculates vertical position of a unit by its position in the critical path. The nodes are spread horizontally such that none overlap. 3.336 +A critical path algorithm calculates vertical position of a unit, as its position within the critical path. The nodes are spread horizontally such that none overlap. 3.337 3.338 -Simple constraints (dependencies) are represented as arcs. Complicated constraints are for now displayed as an additional node bearing information on the constraint, with incoming arcs from all units whose execution status affects the constraint and outgoing arcs to the constrained units. 3.339 +Simple constraints (dependencies) are painted as arcs. 
Complicated constraints are for now displayed as an additional node bearing information on the constraint, with incoming arcs from all units whose execution status affects the constraint and outgoing arcs to the constrained units. 3.340 3.341 3.342 \subsubsection{SCG}
