[hip] Docs: Overhaul HW implementation page (#1994)

* [hip] Docs: Overhaul HW implementation page * Update hardware implementation and glossary * Update programming model * Add performance optimization * Split into how-to and understanding --------- Signed-off-by: Jan Stephan <jan.stephan@amd.com> Co-authored-by: Jan Stephan <jan.stephan@amd.com> Co-authored-by: Julia Jiang <julia.jiang@amd.com>
2025-12-11 10:52:34 +01:00
@@ -0,0 +1,393 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   width="78.030128mm"
+   height="59.956924mm"
+   viewBox="0 0 78.030128 59.956924"
+   version="1.1"
+   id="svg5"
+   inkscape:version="1.1.2 (0a00cf5339, 2022-02-04)"
+   sodipodi:docname="lds.svg"
+   inkscape:export-filename="/home/nick/Documents/software_repos/omniperf/src/docs/images/lds.png"
+   inkscape:export-xdpi="180"
+   inkscape:export-ydpi="180"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:svg="http://www.w3.org/2000/svg">
+  <sodipodi:namedview
+     id="namedview7"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageshadow="2"
+     inkscape:pageopacity="0.0"
+     inkscape:pagecheckerboard="0"
+     inkscape:document-units="mm"
+     showgrid="false"
+     inkscape:zoom="2.8284271"
+     inkscape:cx="7.9549513"
+     inkscape:cy="93.161318"
+     inkscape:window-width="2490"
+     inkscape:window-height="1376"
+     inkscape:window-x="70"
+     inkscape:window-y="27"
+     inkscape:window-maximized="1"
+     inkscape:current-layer="layer1"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0" />
+  <defs
+     id="defs2">
+    <marker
+       style="overflow:visible"
+       id="marker35467"
+       refX="0"
+       refY="0"
+       orient="auto"
+       inkscape:stockid="Arrow1Mend"
+       inkscape:isstock="true">
+      <path
+         transform="matrix(-0.4,0,0,-0.4,-4,0)"
+         style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:1pt"
+         d="M 0,0 5,-5 -12.5,0 5,5 Z"
+         id="path35465" />
+    </marker>
+    <marker
+       style="overflow:visible"
+       id="Arrow2Mend"
+       refX="0"
+       refY="0"
+       orient="auto"
+       inkscape:stockid="Arrow2Mend"
+       inkscape:isstock="true">
+      <path
+         transform="scale(-0.6)"
+         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+         style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:0.625;stroke-linejoin:round"
+         id="path34770" />
+    </marker>
+    <marker
+       style="overflow:visible"
+       id="marker35449"
+       refX="0"
+       refY="0"
+       orient="auto"
+       inkscape:stockid="Arrow1Mend"
+       inkscape:isstock="true">
+      <path
+         transform="matrix(-0.4,0,0,-0.4,-4,0)"
+         style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:1pt"
+         d="M 0,0 5,-5 -12.5,0 5,5 Z"
+         id="path35447" />
+    </marker>
+    <marker
+       style="overflow:visible"
+       id="Arrow1Mend"
+       refX="0"
+       refY="0"
+       orient="auto"
+       inkscape:stockid="Arrow1Mend"
+       inkscape:isstock="true">
+      <path
+         transform="matrix(-0.4,0,0,-0.4,-4,0)"
+         style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:1pt"
+         d="M 0,0 5,-5 -12.5,0 5,5 Z"
+         id="path34752" />
+    </marker>
+    <marker
+       style="overflow:visible"
+       id="marker35115"
+       refX="0"
+       refY="0"
+       orient="auto"
+       inkscape:stockid="Arrow2Mstart"
+       inkscape:isstock="true">
+      <path
+         transform="scale(0.6)"
+         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+         style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:0.625;stroke-linejoin:round"
+         id="path35113" />
+    </marker>
+    <marker
+       style="overflow:visible"
+       id="Arrow2Mstart"
+       refX="0"
+       refY="0"
+       orient="auto"
+       inkscape:stockid="Arrow2Mstart"
+       inkscape:isstock="true">
+      <path
+         transform="scale(0.6)"
+         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+         style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:0.625;stroke-linejoin:round"
+         id="path34767" />
+    </marker>
+    <marker
+       style="overflow:visible"
+       id="Arrow1Lstart"
+       refX="0"
+       refY="0"
+       orient="auto"
+       inkscape:stockid="Arrow1Lstart"
+       inkscape:isstock="true">
+      <path
+         transform="matrix(0.8,0,0,0.8,10,0)"
+         style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:1pt"
+         d="M 0,0 5,-5 -12.5,0 5,5 Z"
+         id="path34743" />
+    </marker>
+    <rect
+       x="175.93893"
+       y="253.99336"
+       width="234.12074"
+       height="100.54605"
+       id="rect3930" />
+    <marker
+       style="overflow:visible"
+       id="Arrow1Mend-5"
+       refX="0"
+       refY="0"
+       orient="auto"
+       inkscape:stockid="Arrow1Mend"
+       inkscape:isstock="true">
+      <path
+         transform="matrix(-0.4,0,0,-0.4,-4,0)"
+         style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:1pt"
+         d="M 0,0 5,-5 -12.5,0 5,5 Z"
+         id="path34752-6" />
+    </marker>
+  </defs>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     transform="translate(-38.921551,-41.961155)">
+    <rect
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-miterlimit:4;stroke-dasharray:none"
+       id="rect846"
+       width="77.212677"
+       height="40.499405"
+       x="39.330276"
+       y="61.168674" />
+    <text
+       xml:space="preserve"
+       transform="scale(0.26458333)"
+       id="text3928"
+       style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;white-space:pre;shape-inside:url(#rect3930);fill:#000000;fill-opacity:1;stroke:none" />
+    <rect
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-miterlimit:4;stroke-dasharray:none"
+       id="rect846-3"
+       width="31.886179"
+       height="13.072078"
+       x="39.171551"
+       y="42.211155" />
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
+       x="47.24173"
+       y="49.892689"
+       id="text25740"><tspan
+         sodipodi:role="line"
+         id="tspan25738"
+         style="font-size:3.52778px;stroke-width:0.264583"
+         x="47.24173"
+         y="49.892689">SIMD 0/1</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
+       x="92.864143"
+       y="49.892689"
+       id="text25740-7"><tspan
+         sodipodi:role="line"
+         id="tspan25738-5"
+         style="font-size:3.52778px;stroke-width:0.264583"
+         x="92.864143"
+         y="49.892689">SIMD 2/3</tspan></text>
+    <rect
+       style="fill:none;stroke:#000000;stroke-width:0.5;stroke-miterlimit:4;stroke-dasharray:none"
+       id="rect846-3-3"
+       width="31.886179"
+       height="13.072078"
+       x="84.815498"
+       y="42.211155" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.239054px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#marker35115);marker-end:url(#Arrow2Mend)"
+       d="m 55.11464,55.888713 v 4.575516"
+       id="path35236" />
+    <path
+       style="fill:none;stroke:#000000;stroke-width:0.239437px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#marker35115);marker-end:url(#Arrow2Mend)"
+       d="m 100.75859,55.863706 v 4.584345"
+       id="path35236-2" />
+    <g
+       id="g91757"
+       transform="translate(0.09337305,-1.490623)">
+      <rect
+         style="fill:none;stroke:#000000;stroke-width:0.5;stroke-miterlimit:4;stroke-dasharray:none"
+         id="rect846-3-9"
+         width="57.741043"
+         height="6.0837841"
+         x="49.066093"
+         y="64.83651" />
+      <text
+         xml:space="preserve"
+         style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
+         x="62.166679"
+         y="69.054901"
+         id="text43239"><tspan
+           sodipodi:role="line"
+           id="tspan43237"
+           style="font-size:3.52778px;stroke-width:0.264583"
+           x="62.166679"
+           y="69.054901">Conflict Detection</tspan></text>
+    </g>
+    <g
+       id="g94249">
+      <rect
+         style="fill:none;stroke:#000000;stroke-width:0.5;stroke-miterlimit:4;stroke-dasharray:none"
+         id="rect846-3-9-7"
+         width="57.741043"
+         height="6.0837841"
+         x="49.159466"
+         y="69.429672" />
+      <text
+         xml:space="preserve"
+         style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
+         x="69.083069"
+         y="73.648064"
+         id="text43239-9"><tspan
+           sodipodi:role="line"
+           id="tspan43237-2"
+           style="font-size:3.52778px;stroke-width:0.264583"
+           x="69.083069"
+           y="73.648064">Scheduler</tspan></text>
+    </g>
+    <g
+       id="g75031">
+      <rect
+         style="fill:none;stroke:#000000;stroke-width:0.499999;stroke-miterlimit:4;stroke-dasharray:none"
+         id="rect53227"
+         width="10.155521"
+         height="20.193951"
+         x="41.817165"
+         y="78.846886" />
+      <text
+         xml:space="preserve"
+         style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
+         x="-95.113167"
+         y="48.210091"
+         id="text58467"
+         transform="rotate(-90)"><tspan
+           sodipodi:role="line"
+           id="tspan58465"
+           style="font-size:3.52778px;stroke-width:0.264583"
+           x="-95.113167"
+           y="48.210091">Bank 0</tspan></text>
+    </g>
+    <g
+       id="g75036"
+       transform="translate(-0.28890355,0.10812378)">
+      <rect
+         style="fill:none;stroke:#000000;stroke-width:0.499999;stroke-miterlimit:4;stroke-dasharray:none"
+         id="rect53227-1"
+         width="10.155521"
+         height="20.193951"
+         x="55.210358"
+         y="78.738762" />
+      <text
+         xml:space="preserve"
+         style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
+         x="-95.005043"
+         y="61.603283"
+         id="text58467-2"
+         transform="rotate(-90)"><tspan
+           sodipodi:role="line"
+           id="tspan58465-7"
+           style="font-size:3.52778px;stroke-width:0.264583"
+           x="-95.005043"
+           y="61.603283">Bank 1</tspan></text>
+    </g>
+    <g
+       id="g75041"
+       transform="translate(-0.42640324,0.12826538)">
+      <rect
+         style="fill:none;stroke:#000000;stroke-width:0.499999;stroke-miterlimit:4;stroke-dasharray:none"
+         id="rect53227-0"
+         width="10.155521"
+         height="20.193951"
+         x="68.452148"
+         y="78.71862" />
+      <text
+         xml:space="preserve"
+         style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
+         x="-94.984901"
+         y="74.845078"
+         id="text58467-9"
+         transform="rotate(-90)"><tspan
+           sodipodi:role="line"
+           id="tspan58465-3"
+           style="font-size:3.52778px;stroke-width:0.264583"
+           x="-94.984901"
+           y="74.845078">Bank 2</tspan></text>
+    </g>
+    <g
+       id="g75046"
+       transform="translate(0,-0.36049652)">
+      <rect
+         style="fill:none;stroke:#000000;stroke-width:0.499999;stroke-miterlimit:4;stroke-dasharray:none"
+         id="rect53227-6"
+         width="10.155521"
+         height="20.193951"
+         x="81.130043"
+         y="79.207382" />
+      <text
+         xml:space="preserve"
+         style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
+         x="-95.473663"
+         y="87.522972"
+         id="text58467-0"
+         transform="rotate(-90)"><tspan
+           sodipodi:role="line"
+           id="tspan58465-6"
+           style="font-size:3.52778px;stroke-width:0.264583"
+           x="-95.473663"
+           y="87.522972">Bank 3</tspan></text>
+    </g>
+    <g
+       id="g75106"
+       transform="translate(0,0.88236237)">
+      <rect
+         style="fill:none;stroke:#000000;stroke-width:0.499999;stroke-miterlimit:4;stroke-dasharray:none"
+         id="rect53227-2"
+         width="10.155521"
+         height="20.193951"
+         x="103.69362"
+         y="77.964523" />
+      <text
+         xml:space="preserve"
+         style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
+         x="-95.307404"
+         y="110.08656"
+         id="text58467-6"
+         transform="rotate(-90)"><tspan
+           sodipodi:role="line"
+           id="tspan58465-1"
+           style="font-size:3.52778px;stroke-width:0.264583"
+           x="-95.307404"
+           y="110.08656">Bank 31</tspan></text>
+    </g>
+    <text
+       xml:space="preserve"
+       style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
+       x="95.809242"
+       y="89.162628"
+       id="text80928"><tspan
+         sodipodi:role="line"
+         id="tspan80926"
+         style="font-size:3.52778px;stroke-width:0.264583"
+         x="95.809242"
+         y="89.162628">...</tspan></text>
+  </g>
+</svg>
@@ -0,0 +1,302 @@
+<svg class="typst-doc" viewBox="0 0 382.68322834645664 269.7074015748031" width="382.68322834645664pt" height="269.7074015748031pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:h5="http://www.w3.org/1999/xhtml">
+    <path class="typst-shape" fill="#ffffff" fill-rule="nonzero" d="M 0 0 L 0 269.7074 L 382.68323 269.7074 L 382.68323 0 Z "/>
+    <g>
+        <g transform="translate(14.173228346456693 14.173228346456693)">
+            <g class="typst-group">
+                <g>
+                    <g transform="translate(14.17929133858268 226.77165354330708)">
+                        <path class="typst-shape" fill="none" stroke="#000000" stroke-width="0.5" stroke-linecap="square" stroke-linejoin="miter" stroke-miterlimit="4" d="M 0 0 L 340.15747 0 "/>
+                    </g>
+                    <g transform="translate(111.55303149606299 232.44094488188975)">
+                        <path class="typst-shape" fill="none" d="M 0 0 L 145.41 0 L 145.41 8.92 L 0 8.92 L 0 0 Z "/>
+                    </g>
+                    <g transform="translate(111.5530314488189 232.4409448503937)">
+                        <g class="typst-group">
+                            <g>
+                                <g transform="translate(0 0)">
+                                    <g class="typst-group">
+                                        <g>
+                                            <g transform="translate(0 6.580000000000001)">
+                                                <g class="typst-text" transform="scale(1, -1)">
+                                                    <use xlink:href="#gF68F62BAF7023F8E796CE041842DF629" x="0" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gD022B8BB58399F5AFF249BACEC9B429C" x="7.02" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g8A55166D3146C138BB50BFE795ED0EE5" x="12.28" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g2C268D8F9C3C96139D78CE91BE4D9FB" x="16.75" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g528CFE5CB98F8F81646F93C3E83B138B" x="20.47" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g74F5E04B137C7A69C847D70E8574BAA6" x="25.04" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g55B6FB7CBDC9C1DAE949CD37F0DC16D" x="28.2" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gA11C98823DFB5A0C784EF187B0832232" x="30.91" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g9C2BE9118249AB6B38E684352DDFBC5D" x="35.95" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g528CFE5CB98F8F81646F93C3E83B138B" x="41.370000000000005" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g179036B287C688540B027B22D62B33B" x="45.940000000000005" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g6A35C718BF189FD6EC8D20440CDA771D" x="51.080000000000005" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g9C2BE9118249AB6B38E684352DDFBC5D" x="54.050000000000004" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g74F5E04B137C7A69C847D70E8574BAA6" x="59.470000000000006" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g8A55166D3146C138BB50BFE795ED0EE5" x="62.63000000000001" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g9C2BE9118249AB6B38E684352DDFBC5D" x="67.10000000000001" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g7D8F87D2CD6331112C9FE3A07C40E0F" x="72.52000000000001" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g55B6FB7CBDC9C1DAE949CD37F0DC16D" x="76.42000000000002" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g74F5E04B137C7A69C847D70E8574BAA6" x="79.13000000000001" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gB5DAD6AB9EF529853D7226A3F4588253" x="82.29" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g72F468A787EE89ADC78BA24E12FC80FE" x="89.94000000000001" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g229B6FF65CF728248AF8BE0A20FAFDFC" x="93.50000000000001" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g2D340B5AC4FE037754EF86B583EC1E8C" x="98.35000000000001" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gF68F62BAF7023F8E796CE041842DF629" x="103.63000000000001" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g29073FC46AA074D3D56D2F6B70E76DDC" x="110.65" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g62E843CADC663C7579F6A3FBC77388E" x="116.06" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g95425A100709BC27DA272BD05EE64D31" x="120.91" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gA9BC29EE3008608C91A9D8C24BBB5729" x="124.14" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gB5DAD6AB9EF529853D7226A3F4588253" x="129.07" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g74F5E04B137C7A69C847D70E8574BAA6" x="134.22" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g8A55166D3146C138BB50BFE795ED0EE5" x="137.38" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g99297E49F87FAD41528007F13950C7FE" x="141.85" fill="#000000" fill-rule="nonzero"/>
+                                                </g>
+                                            </g>
+                                        </g>
+                                    </g>
+                                </g>
+                            </g>
+                        </g>
+                    </g>
+                    <g transform="translate(14.17929133858268 0)">
+                        <path class="typst-shape" fill="none" stroke="#000000" stroke-width="0.5" stroke-linecap="square" stroke-linejoin="miter" stroke-miterlimit="4" d="M 0 0 L 340.15747 0 "/>
+                    </g>
+                    <g transform="translate(14.17929133858268 0)">
+                        <path class="typst-shape" fill="none" stroke="#000000" stroke-width="0.5" stroke-linecap="square" stroke-linejoin="miter" stroke-miterlimit="4" d="M 0 226.77165 L 0 0 "/>
+                    </g>
+                    <g transform="translate(0 65.83582677165356)">
+                        <path class="typst-shape" fill="none" d="M 0 95.1 L 0.0000000000000062941777 0 L 8.51 0 L 8.51 95.1 L 0 95.1 Z "/>
+                    </g>
+                    <g transform="translate(-43.295000047244095 109.13082675590552)">
+                        <g class="typst-group">
+                            <g transform="matrix(0.00000000000000006123233995736766 -1 1 0.00000000000000006123233995736766 43.295000031496066 51.80500006299213)">
+                                <g transform="translate(0 0)">
+                                    <g class="typst-group">
+                                        <g>
+                                            <g transform="translate(0 6.580000000000001)">
+                                                <g class="typst-text" transform="scale(1, -1)">
+                                                    <use xlink:href="#g29073FC46AA074D3D56D2F6B70E76DDC" x="0" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g8A55166D3146C138BB50BFE795ED0EE5" x="5.41" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g2C268D8F9C3C96139D78CE91BE4D9FB" x="9.88" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gE67DAEF23D7CF18A0AF2DED5954B0CBF" x="13.600000000000001" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gA11C98823DFB5A0C784EF187B0832232" x="16.700000000000003" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g2C268D8F9C3C96139D78CE91BE4D9FB" x="21.740000000000002" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g652A779657972FBC161C470ED70CA8EC" x="25.46" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g528CFE5CB98F8F81646F93C3E83B138B" x="33.36" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g9C2BE9118249AB6B38E684352DDFBC5D" x="37.93" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gD63979DFEF9CA9C9A7E89CD2DA3C3A19" x="43.35" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g8A55166D3146C138BB50BFE795ED0EE5" x="47.63" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g72F468A787EE89ADC78BA24E12FC80FE" x="54.6" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g8038720E9FD02ADAD29947BFDF50BDCC" x="58.160000000000004" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g229B6FF65CF728248AF8BE0A20FAFDFC" x="64.13000000000001" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g2D340B5AC4FE037754EF86B583EC1E8C" x="68.98" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gF68F62BAF7023F8E796CE041842DF629" x="74.26" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g29073FC46AA074D3D56D2F6B70E76DDC" x="81.28" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g62E843CADC663C7579F6A3FBC77388E" x="86.69" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g99297E49F87FAD41528007F13950C7FE" x="91.53999999999999" fill="#000000" fill-rule="nonzero"/>
+                                                </g>
+                                            </g>
+                                        </g>
+                                    </g>
+                                </g>
+                            </g>
+                        </g>
+                    </g>
+                    <g transform="translate(354.3367716535433 0)">
+                        <path class="typst-shape" fill="none" stroke="#000000" stroke-width="0.5" stroke-linecap="square" stroke-linejoin="miter" stroke-miterlimit="4" d="M 0 226.77165 L 0 0 "/>
+                    </g>
+                    <g transform="translate(14.17929133858268 63.26929133858268)">
+                        <path class="typst-shape" fill="none" stroke="#0074d9" stroke-width="1" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="4" d="M 0 163.50237 L 153.47057 0 L 340.15747 0 "/>
+                    </g>
+                    <g transform="translate(167.6498611189155 63.26929133858268)">
+                        <path class="typst-shape" fill="none" stroke="#ff4136" stroke-width="1" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="4" stroke-dashoffset="0" stroke-dasharray="3 2" d="M 0 0 L 0 163.50237 "/>
+                    </g>
+                    <g transform="translate(82.21078740157479 85.03937007874016)">
+                        <path class="typst-shape" fill="none" stroke="#000000" stroke-width="1" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="4" d="M 0 62.072063 L 0 0 "/>
+                    </g>
+                    <g transform="translate(80.08480314960629 147.11143237795272)">
+                        <path class="typst-shape" fill="none" stroke="#000000" stroke-width="1" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="4" d="M 2.1259842 5.6692915 L 0 0 L 4.2519684 0 L 2.1259842 5.6692915 Z "/>
+                    </g>
+                    <g transform="translate(23.97114173228346 70.41007874015746)">
+                        <path class="typst-shape" fill="none" d="M 0 0 L 116.479294 0 L 116.479294 14.629292 L 0 14.629292 L 0 0 Z "/>
+                    </g>
+                    <g transform="translate(23.97114166929133 70.41007872440944)">
+                        <g class="typst-group">
+                            <g>
+                                <g transform="translate(0 0)">
+                                    <g class="typst-group">
+                                        <g>
+                                            <g transform="translate(2.8346456692913384 9.414645669291339)">
+                                                <g class="typst-text" transform="scale(1, -1)">
+                                                    <use xlink:href="#gD6703645BCA3FEA845C8778091F5B88B" x="0" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g8A55166D3146C138BB50BFE795ED0EE5" x="8.39" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g652A779657972FBC161C470ED70CA8EC" x="12.860000000000001" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gA11C98823DFB5A0C784EF187B0832232" x="20.76" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g2C268D8F9C3C96139D78CE91BE4D9FB" x="25.8" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gB5DAD6AB9EF529853D7226A3F4588253" x="29.67" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gA9BC29EE3008608C91A9D8C24BBB5729" x="37.32" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g528CFE5CB98F8F81646F93C3E83B138B" x="42.25" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g9C2BE9118249AB6B38E684352DDFBC5D" x="46.82" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gD908EABB3E36470B09E6E9B440A24B90" x="52.24" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g24E326A9802E1326DD4439B59EE95868" x="57.300000000000004" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g55B6FB7CBDC9C1DAE949CD37F0DC16D" x="64.77000000000001" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gD908EABB3E36470B09E6E9B440A24B90" x="67.48" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g74F5E04B137C7A69C847D70E8574BAA6" x="72.54" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g326CA448B94C8D25A227D725FDC9AB9" x="75.7" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gD63979DFEF9CA9C9A7E89CD2DA3C3A19" x="83.58" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g8A55166D3146C138BB50BFE795ED0EE5" x="87.86" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g55B6FB7CBDC9C1DAE949CD37F0DC16D" x="92.33" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g179036B287C688540B027B22D62B33B" x="95.03999999999999" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g55B6FB7CBDC9C1DAE949CD37F0DC16D" x="97.67999999999999" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g9C2BE9118249AB6B38E684352DDFBC5D" x="100.38999999999999" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gED8CAD24521E8BF74BBCB4EAC1068E02" x="105.80999999999999" fill="#000000" fill-rule="nonzero"/>
+                                                </g>
+                                            </g>
+                                        </g>
+                                    </g>
+                                </g>
+                            </g>
+                        </g>
+                    </g>
+                    <g transform="translate(286.30527559055116 70.36258330138494)">
+                        <path class="typst-shape" fill="none" stroke="#000000" stroke-width="1" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="4" d="M 0 0 L 0 99.716156 "/>
+                    </g>
+                    <g transform="translate(284.17929133858263 64.69329203149606)">
+                        <path class="typst-shape" fill="none" stroke="#000000" stroke-width="1" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="4" d="M 2.1259842 0 L 4.2519684 5.6692915 L 0 5.6692915 L 2.1259842 0 Z "/>
+                    </g>
+                    <g transform="translate(249.8406299212598 170.07874015748033)">
+                        <path class="typst-shape" fill="none" d="M 0 0 L 72.92929 0 L 72.92929 14.629292 L 0 14.629292 L 0 0 Z "/>
+                    </g>
+                    <g transform="translate(249.84062982677165 170.07874014173228)">
+                        <g class="typst-group">
+                            <g>
+                                <g transform="translate(0 0)">
+                                    <g class="typst-group">
+                                        <g>
+                                            <g transform="translate(2.8346456692913384 9.414645669291339)">
+                                                <g class="typst-text" transform="scale(1, -1)">
+                                                    <use xlink:href="#g23EA1C753A3A6B5551EEA1A5D7FB37ED" x="0" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gA11C98823DFB5A0C784EF187B0832232" x="6.460000000000001" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g652A779657972FBC161C470ED70CA8EC" x="11.5" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gD022B8BB58399F5AFF249BACEC9B429C" x="19.4" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gA44474528B05A80D93B01AD42F22D65" x="24.59" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g74F5E04B137C7A69C847D70E8574BAA6" x="29.9" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g8A55166D3146C138BB50BFE795ED0EE5" x="33.06" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gD63979DFEF9CA9C9A7E89CD2DA3C3A19" x="40.03" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g8A55166D3146C138BB50BFE795ED0EE5" x="44.31" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g55B6FB7CBDC9C1DAE949CD37F0DC16D" x="48.78" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g179036B287C688540B027B22D62B33B" x="51.49" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g55B6FB7CBDC9C1DAE949CD37F0DC16D" x="54.13" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#g9C2BE9118249AB6B38E684352DDFBC5D" x="56.84" fill="#000000" fill-rule="nonzero"/>
+                                                    <use xlink:href="#gED8CAD24521E8BF74BBCB4EAC1068E02" x="62.260000000000005" fill="#000000" fill-rule="nonzero"/>
+                                                </g>
+                                            </g>
+                                        </g>
+                                    </g>
+                                </g>
+                            </g>
+                        </g>
+                    </g>
+                </g>
+            </g>
+        </g>
+    </g>
+    <defs id="glyph">
+        <symbol id="gF68F62BAF7023F8E796CE041842DF629" overflow="visible">
+            <path d="M 6.66 3.29 C 6.66 5.3599997 5.23 6.58 3.4399998 6.58 C 1.5 6.58 0.37 5.04 0.37 3.1 C 0.37 1.13 1.8 -0.099999994 3.5 -0.099999994 C 4.61 -0.099999994 5.49 0.38 6.04 1.16 C 6.44 1.73 6.66 2.45 6.66 3.29 Z M 3.31 6.22 C 4.63 6.22 5.7 5.12 5.7 3.1 C 5.7 1.31 4.8399997 0.26 3.6999998 0.26 C 2.48 0.26 1.3299999 1.35 1.3299999 3.27 C 1.3299999 5.37 2.32 6.22 3.31 6.22 Z "/>
+        </symbol>
+        <symbol id="gD022B8BB58399F5AFF249BACEC9B429C" overflow="visible">
+            <path d="M 1.56 3.6799998 C 1.55 3.98 1.53 4.24 1.48 4.3399997 C 1.4599999 4.39 1.4399999 4.42 1.36 4.42 C 1.0799999 4.31 0.82 4.22 0.13 4.13 C 0.11 4.0699997 0.13 3.9099998 0.14999999 3.85 C 0.69 3.8 0.79999995 3.75 0.79999995 3.1699998 L 0.79999995 -1.1 C 0.79999995 -1.93 0.69 -1.9799999 0.08 -2.01 C 0.02 -2.07 0.02 -2.28 0.08 -2.34 C 0.42999998 -2.33 0.79999995 -2.32 1.1999999 -2.32 C 1.5999999 -2.32 2.08 -2.33 2.4099998 -2.34 C 2.47 -2.28 2.47 -2.07 2.4099998 -2.01 C 1.6999999 -1.9699999 1.5899999 -1.93 1.5899999 -1.1 L 1.5899999 -0.02 C 1.5899999 0.11 1.63 0.099999994 1.73 0.06 C 1.9799999 -0.04 2.28 -0.099999994 2.6 -0.099999994 C 3.1599998 -0.099999994 3.6599998 0.07 4.0699997 0.45999998 C 4.54 0.91999996 4.81 1.54 4.81 2.35 C 4.81 3.4099998 4.06 4.39 3.04 4.39 C 2.58 4.39 2.07 4.0899997 1.67 3.6399999 C 1.61 3.58 1.5699999 3.58 1.56 3.6799998 Z M 1.75 3.31 C 2.01 3.6299999 2.47 3.9299998 2.76 3.9299998 C 3.3999999 3.9299998 3.9499998 3.21 3.9499998 2.08 C 3.9499998 1.26 3.6599998 0.24 2.54 0.24 C 2.36 0.24 2.01 0.29 1.8299999 0.45 C 1.63 0.63 1.5899999 0.69 1.5899999 1.05 L 1.5899999 2.87 C 1.5899999 3.08 1.63 3.1699998 1.75 3.31 Z "/>
+        </symbol>
+        <symbol id="g8A55166D3146C138BB50BFE795ED0EE5" overflow="visible">
+            <path d="M 3.86 0.93 C 3.49 0.55 3.1999998 0.39 2.62 0.39 C 2.26 0.39 1.8399999 0.59999996 1.53 1.11 C 1.3299999 1.4399999 1.2099999 1.9 1.2099999 2.48 L 3.87 2.46 C 3.99 2.46 4.06 2.52 4.06 2.6299999 C 4.06 3.47 3.76 4.37 2.37 4.37 C 1.5 4.37 0.37 3.54 0.37 2.02 C 0.37 1.4599999 0.51 0.91999996 0.84 0.53999996 C 1.18 0.14 1.65 -0.099999994 2.37 -0.099999994 C 3.1299999 -0.099999994 3.6699998 0.25 4.0699997 0.77 C 4.04 0.87 3.98 0.91999996 3.86 0.93 Z M 1.24 2.82 C 1.43 3.9499998 2.1299999 4.04 2.37 4.04 C 2.75 4.04 3.1999998 3.83 3.1999998 2.99 C 3.1999998 2.8999999 3.1599998 2.85 3.05 2.85 Z "/>
+        </symbol>
+        <symbol id="g2C268D8F9C3C96139D78CE91BE4D9FB" overflow="visible">
+            <path d="M 1.76 3.58 C 1.74 3.98 1.73 4.24 1.68 4.3399997 C 1.66 4.39 1.64 4.42 1.56 4.42 C 1.28 4.31 1.02 4.22 0.32999998 4.13 C 0.31 4.0699997 0.32999998 3.9099998 0.35 3.85 C 0.89 3.8 1 3.75 1 3.1699998 L 1 1.22 C 1 0.39 0.88 0.35 0.26 0.31 C 0.19999999 0.25 0.19999999 0.04 0.26 -0.02 C 0.61 -0.01 1 0 1.4 0 C 1.8 0 2.26 -0.01 2.61 -0.02 C 2.6699998 0.04 2.6699998 0.25 2.61 0.31 C 1.91 0.35999998 1.79 0.39 1.79 1.22 L 1.79 2.61 C 1.79 2.87 1.91 3.1 2.03 3.28 C 2.1399999 3.4399998 2.37 3.77 2.49 3.77 C 2.58 3.77 2.6699998 3.75 2.75 3.6399999 C 2.82 3.54 2.9399998 3.4099998 3.11 3.4099998 C 3.35 3.4099998 3.58 3.6599998 3.58 3.9099998 C 3.58 4.1 3.3999999 4.39 2.98 4.39 C 2.51 4.39 2.1 3.9499998 1.87 3.56 C 1.81 3.4499998 1.76 3.53 1.76 3.58 Z "/>
+        </symbol>
+        <symbol id="g528CFE5CB98F8F81646F93C3E83B138B" overflow="visible">
+            <path d="M 2.9299998 0.48 C 2.99 0.17 3.1 -0.099999994 3.6 -0.099999994 C 3.98 -0.099999994 4.3399997 0.07 4.5499997 0.26999998 C 4.5299997 0.39 4.49 0.48 4.38 0.53999996 C 4.31 0.48 4.14 0.38 4.0099998 0.38 C 3.72 0.38 3.7099998 0.77 3.7099998 1.23 L 3.7099998 2.7 C 3.7099998 4.12 2.9299998 4.39 2.2 4.39 C 1.38 4.39 0.55 3.85 0.55 3.28 C 0.55 3.04 0.66999996 2.9199998 0.9 2.9199998 C 1.1899999 2.9199998 1.37 3.1299999 1.37 3.26 C 1.37 3.33 1.36 3.3999999 1.3399999 3.4399998 C 1.3299999 3.47 1.3199999 3.53 1.3199999 3.6399999 C 1.3199999 3.9499998 1.74 4.06 2.12 4.06 C 2.46 4.06 2.9299998 3.8899999 2.9299998 2.76 C 2.9299998 2.69 2.8999999 2.6499999 2.87 2.6399999 L 2.01 2.4299998 C 1.05 2.19 0.35999998 1.66 0.35999998 0.97999996 C 0.35999998 0.16 0.91999996 -0.099999994 1.62 -0.099999994 C 1.9699999 -0.099999994 2.27 -0.02 2.71 0.32 L 2.9099998 0.48 Z M 2.9299998 2.33 L 2.9299998 1.01 C 2.9299998 0.88 2.87 0.81 2.79 0.75 C 2.53 0.53999996 2.19 0.31 1.91 0.31 C 1.41 0.31 1.1899999 0.71 1.1899999 1.02 C 1.1899999 1.4699999 1.4 1.93 2.1399999 2.12 Z "/>
+        </symbol>
+        <symbol id="g74F5E04B137C7A69C847D70E8574BAA6" overflow="visible">
+            <path d="M 0.42999998 4.29 C 0.29 4.29 0.25 4.17 0.25 4.0899997 L 0.25 3.9599998 C 0.25 3.9099998 0.26 3.8999999 0.29999998 3.8999999 L 0.89 3.8999999 L 0.89 0.89 C 0.89 0.17999999 1.1999999 -0.099999994 1.66 -0.099999994 C 2.12 -0.099999994 2.62 0.12 3.01 0.56 C 2.99 0.65999997 2.9299998 0.71999997 2.83 0.72999996 C 2.57 0.53 2.27 0.45 2.01 0.45 C 1.74 0.45 1.68 0.75 1.68 1.37 L 1.68 3.8999999 L 2.72 3.8999999 C 2.82 3.8999999 2.96 3.9399998 2.96 4.0299997 L 2.96 4.23 C 2.96 4.27 2.9299998 4.29 2.8799999 4.29 L 1.68 4.29 L 1.68 4.68 C 1.68 5.33 1.7199999 5.73 1.7199999 5.73 C 1.7199999 5.79 1.6899999 5.8199997 1.64 5.8199997 C 1.5999999 5.8199997 1.51 5.7799997 1.42 5.73 C 1.31 5.67 1.2099999 5.62 1.0799999 5.5899997 C 0.96 5.5499997 0.85999995 5.52 0.85999995 5.45 C 0.85999995 5.33 0.89 5.4 0.89 4.29 Z "/>
+        </symbol>
+        <symbol id="g55B6FB7CBDC9C1DAE949CD37F0DC16D" overflow="visible">
+            <path d="M 1.81 1.22 L 1.81 3.21 C 1.81 3.7099998 1.8499999 4.35 1.8499999 4.35 C 1.8499999 4.39 1.8 4.42 1.7199999 4.42 C 1.4399999 4.31 1.04 4.22 0.35 4.13 C 0.32999998 4.0699997 0.35 3.9099998 0.37 3.85 C 0.91999996 3.8 1.02 3.74 1.02 3.1699998 L 1.02 1.22 C 1.02 0.39 0.90999997 0.35999998 0.29999998 0.31 C 0.24 0.25 0.24 0.04 0.29999998 -0.02 C 0.63 -0.01 1.02 0 1.42 0 C 1.8199999 0 2.2 -0.01 2.53 -0.02 C 2.59 0.04 2.59 0.25 2.53 0.31 C 1.92 0.35 1.81 0.39 1.81 1.22 Z M 0.9 5.99 C 0.9 5.73 1.14 5.47 1.38 5.47 C 1.66 5.47 1.9 5.74 1.9 5.95 C 1.9 6.19 1.6899999 6.47 1.42 6.47 C 1.18 6.47 0.9 6.23 0.9 5.99 Z "/>
+        </symbol>
+        <symbol id="gA11C98823DFB5A0C784EF187B0832232" overflow="visible">
+            <path d="M 0.41 2.05 C 0.41 1.03 1.09 -0.099999994 2.51 -0.099999994 C 3.1499999 -0.099999994 3.6399999 0.13 3.98 0.45999998 C 4.43 0.9 4.63 1.53 4.63 2.1399999 C 4.63 3.1799998 4.06 4.39 2.53 4.39 C 1.87 4.39 1.3299999 4.12 0.96 3.6899998 C 0.59999996 3.26 0.41 2.6799998 0.41 2.05 Z M 2.3799999 4.04 C 3.24 4.04 3.77 3.26 3.77 1.8199999 C 3.77 0.56 3.12 0.25 2.6499999 0.25 C 1.61 0.25 1.27 1.51 1.27 2.28 C 1.27 3.1499999 1.48 4.04 2.3799999 4.04 Z "/>
+        </symbol>
+        <symbol id="g9C2BE9118249AB6B38E684352DDFBC5D" overflow="visible">
+            <path d="M 1.8399999 3.58 C 1.78 3.51 1.7199999 3.49 1.7199999 3.58 C 1.7099999 3.85 1.6899999 4.24 1.64 4.3399997 C 1.62 4.39 1.5999999 4.42 1.52 4.42 C 1.24 4.31 0.97999996 4.22 0.29 4.13 C 0.26999998 4.0699997 0.29 3.9099998 0.31 3.85 C 0.84999996 3.8 0.96 3.75 0.96 3.1699998 L 0.96 1.22 C 0.96 0.39999998 0.85999995 0.35999998 0.26 0.31 C 0.19999999 0.25 0.19999999 0.04 0.26 -0.02 C 0.56 -0.01 0.96 0 1.36 0 C 1.76 0 2.06 -0.01 2.36 -0.02 C 2.4199998 0.04 2.4199998 0.25 2.36 0.31 C 1.8499999 0.35999998 1.75 0.39999998 1.75 1.22 L 1.75 2.86 C 1.75 3.07 1.8399999 3.1899998 1.92 3.28 C 2.3 3.6499999 2.75 3.87 3.1399999 3.87 C 3.34 3.87 3.55 3.74 3.6699998 3.51 C 3.77 3.31 3.79 3.04 3.79 2.74 L 3.79 1.22 C 3.79 0.39999998 3.6899998 0.35999998 3.1699998 0.31 C 3.12 0.25 3.12 0.04 3.1699998 -0.02 C 3.47 -0.01 3.79 0 4.19 0 C 4.5899997 0 4.95 -0.01 5.25 -0.02 C 5.2999997 0.04 5.2999997 0.25 5.25 0.31 C 4.69 0.35999998 4.58 0.39999998 4.58 1.22 L 4.58 2.71 C 4.58 3.26 4.54 3.74 4.31 4.0499997 C 4.14 4.27 3.83 4.39 3.48 4.39 C 2.99 4.39 2.4299998 4.2599998 1.8399999 3.58 Z "/>
+        </symbol>
+        <symbol id="g179036B287C688540B027B22D62B33B" overflow="visible">
+            <path d="M 0.95 1.22 C 0.95 0.39 0.84 0.34 0.22999999 0.31 C 0.17 0.25 0.17 0.04 0.22999999 -0.02 C 0.58 -0.01 0.95 0 1.35 0 C 1.75 0 2.1299999 -0.01 2.46 -0.02 C 2.52 0.04 2.52 0.25 2.46 0.31 C 1.8499999 0.34 1.74 0.39 1.74 1.22 L 1.74 5.83 C 1.74 6.48 1.78 6.8799996 1.78 6.8799996 C 1.78 6.95 1.74 6.98 1.65 6.98 C 1.4 6.8799996 0.65 6.74 0.25 6.71 C 0.22999999 6.6299996 0.25 6.47 0.31 6.41 C 0.89 6.37 0.95 6.3399997 0.95 5.5899997 Z "/>
+        </symbol>
+        <symbol id="g6A35C718BF189FD6EC8D20440CDA771D" overflow="visible">
+            <path d="M 1.91 1.22 L 1.91 5.23 C 1.91 6.06 2.08 6.1099997 2.78 6.14 C 2.84 6.2 2.84 6.41 2.78 6.47 C 2.34 6.46 1.87 6.45 1.48 6.45 C 1.15 6.45 0.65999997 6.46 0.19 6.47 C 0.13 6.41 0.13 6.2 0.19 6.14 C 0.89 6.1099997 1.06 6.06 1.06 5.23 L 1.06 1.22 C 1.06 0.39 0.89 0.34 0.19 0.31 C 0.13 0.25 0.13 0.04 0.19 -0.02 C 0.64 -0.01 1.13 0 1.49 0 C 1.8499999 0 2.33 -0.01 2.78 -0.02 C 2.84 0.04 2.84 0.25 2.78 0.31 C 2.08 0.34 1.91 0.39 1.91 1.22 Z "/>
+        </symbol>
+        <symbol id="g7D8F87D2CD6331112C9FE3A07C40E0F" overflow="visible">
+            <path d="M 0.48 1.38 C 0.52 0.89 0.55 0.42 0.55 0 C 0.65 0.02 0.75 0.03 0.79999995 0.03 C 0.87 0.03 0.93 0.03 1 0.01 C 1.27 -0.06 1.54 -0.099999994 1.91 -0.099999994 C 2.47 -0.099999994 3.5 0.17 3.5 1.16 C 3.5 1.8399999 3.01 2.24 2.33 2.49 C 1.73 2.72 1.3299999 2.87 1.3299999 3.4199998 C 1.3299999 3.83 1.6899999 4.06 2.03 4.06 C 2.25 4.06 2.83 3.98 2.96 3.1299999 C 3.02 3.07 3.22 3.08 3.28 3.1399999 C 3.31 3.5 3.33 3.87 3.34 4.2 C 3.03 4.25 2.55 4.39 2.03 4.39 C 1.29 4.39 0.62 3.9099998 0.62 3.27 C 0.62 2.54 0.95 2.23 1.7199999 1.91 C 2.55 1.5699999 2.74 1.36 2.74 0.93 C 2.74 0.44 2.26 0.22999999 1.89 0.22999999 C 1.5 0.22999999 1.28 0.35999998 1.18 0.47 C 0.96 0.7 0.84999996 1.14 0.78999996 1.39 C 0.72999996 1.4499999 0.53999996 1.4399999 0.48 1.38 Z "/>
+        </symbol>
+        <symbol id="gB5DAD6AB9EF529853D7226A3F4588253" overflow="visible">
+            <path d="M 2.04 -1.5999999 C 2.2 -1.3199999 2.33 -1.04 2.45 -0.74 C 3.25 1.1899999 3.6999998 2.22 4.22 3.34 C 4.42 3.76 4.56 3.9199998 5.0299997 3.98 C 5.0899997 4.04 5.0899997 4.25 5.0299997 4.31 C 4.83 4.2999997 4.6 4.29 4.3199997 4.29 C 4.02 4.29 3.7099998 4.2999997 3.4099998 4.31 C 3.35 4.25 3.35 4.04 3.4099998 3.98 C 3.73 3.9499998 4.0499997 3.8899999 3.8899999 3.53 L 2.8999999 1.24 C 2.83 1.0799999 2.74 1.05 2.6599998 1.25 L 1.77 3.33 C 1.5899999 3.75 1.54 3.9399998 2.1 3.98 C 2.1599998 4.04 2.1599998 4.25 2.1 4.31 C 1.73 4.2999997 1.3299999 4.29 0.96999997 4.29 C 0.63 4.29 0.35999998 4.2999997 0.16 4.31 C 0.099999994 4.25 0.099999994 4.04 0.16 3.98 C 0.56 3.9299998 0.69 3.84 0.95 3.23 L 2.08 0.59999996 C 2.1699998 0.39999998 2.32 -0.06 2.22 -0.34 C 2.1 -0.66999996 1.9799999 -0.95 1.8299999 -1.26 C 1.7199999 -1.4599999 1.5799999 -1.55 1.3299999 -1.55 C 1.1899999 -1.55 1.15 -1.52 1.04 -1.52 C 0.75 -1.52 0.59999996 -1.8199999 0.59999996 -1.9499999 C 0.59999996 -2.1599998 0.79999995 -2.32 1.0699999 -2.32 C 1.28 -2.32 1.68 -2.24 2.04 -1.5999999 Z "/>
+        </symbol>
+        <symbol id="g72F468A787EE89ADC78BA24E12FC80FE" overflow="visible">
+            <path d="M 1.0799999 -1.93 L 3.1999998 -1.93 C 3.26 -1.87 3.26 -1.6899999 3.1999998 -1.63 C 1.8199999 -1.52 1.75 -1.37 1.75 -0.5 L 1.75 5.6099997 C 1.75 6.48 1.8299999 6.6299996 3.1999998 6.73 C 3.26 6.79 3.26 6.97 3.1999998 7.0299997 L 1.0799999 7.0299997 Z "/>
+        </symbol>
+        <symbol id="g229B6FF65CF728248AF8BE0A20FAFDFC" overflow="visible">
+            <path d="M 2.83 3.23 C 3.6799998 3.23 3.7099998 3 3.74 2.55 C 3.8 2.49 4.0099998 2.49 4.0699997 2.55 C 4.06 2.8 4.0499997 3.09 4.0499997 3.4299998 C 4.0499997 3.77 4.06 4.0299997 4.0699997 4.29 C 4.0099998 4.35 3.8 4.35 3.74 4.29 C 3.7099998 3.74 3.6799998 3.61 2.83 3.61 L 1.9 3.61 L 1.9 5.41 C 1.9 5.95 2.02 6.0499997 2.46 6.0499997 L 2.9099998 6.0499997 C 3.97 6.0499997 4.22 5.65 4.44 4.96 C 4.56 4.96 4.67 4.98 4.75 5.0099998 C 4.7 5.42 4.5499997 6.3799996 4.5299997 6.46 C 4.5299997 6.48 4.52 6.49 4.49 6.49 C 4.3199997 6.46 4.27 6.45 4.02 6.45 L 1.4699999 6.45 C 1.15 6.45 0.58 6.46 0.17999999 6.47 C 0.12 6.41 0.12 6.2 0.17999999 6.14 C 0.88 6.1099997 1.05 6.06 1.05 5.23 L 1.05 1.22 C 1.05 0.39 0.88 0.34 0.17999999 0.31 C 0.12 0.25 0.12 0.04 0.17999999 -0.02 C 0.53 -0.01 1.03 0 1.48 0 C 1.93 0 2.4199998 -0.01 2.77 -0.02 C 2.83 0.04 2.83 0.25 2.77 0.31 C 2.07 0.34 1.9 0.39 1.9 1.22 L 1.9 3.23 Z "/>
+        </symbol>
+        <symbol id="g2D340B5AC4FE037754EF86B583EC1E8C" overflow="visible">
+            <path d="M 1.49 0 L 3.62 0 C 3.8899999 0 4.81 -0.02 4.81 -0.02 C 4.91 0.48 5.0099998 1.13 5.0699997 1.65 C 4.97 1.6999999 4.8599997 1.7199999 4.74 1.6999999 C 4.54 0.97999996 4.17 0.39 3.11 0.39 L 2.48 0.39 C 2.09 0.39 1.91 0.58 1.91 1.09 L 1.91 5.23 C 1.91 6.06 2.08 6.1099997 2.78 6.14 C 2.84 6.2 2.84 6.41 2.78 6.47 C 2.33 6.46 1.86 6.45 1.48 6.45 C 1.12 6.45 0.65 6.46 0.19 6.47 C 0.13 6.41 0.13 6.2 0.19 6.14 C 0.89 6.1099997 1.06 6.06 1.06 5.23 L 1.06 1.22 C 1.06 0.39 0.89 0.34 0.19 0.31 C 0.13 0.25 0.13 0.04 0.19 -0.02 C 0.58 -0.01 1.15 0 1.49 0 Z "/>
+        </symbol>
+        <symbol id="g29073FC46AA074D3D56D2F6B70E76DDC" overflow="visible">
+            <path d="M 1.04 5.23 L 1.04 1.22 C 1.04 0.39 0.87 0.34 0.17 0.31 C 0.11 0.25 0.11 0.04 0.17 -0.02 C 0.62 -0.01 1.11 0 1.4699999 0 C 1.8199999 0 2.4099998 -0.01 2.9099998 -0.02 C 2.97 0.04 2.97 0.25 2.9099998 0.31 C 2.1299999 0.35 1.89 0.39 1.89 1.22 L 1.89 2.9199998 C 2.11 2.85 2.35 2.82 2.7 2.82 C 4.52 2.82 5.0699997 4.0099998 5.0699997 4.8599997 C 5.0699997 5.45 4.68 6.52 2.83 6.52 C 2.45 6.52 1.86 6.45 1.4599999 6.45 C 1.09 6.45 0.57 6.46 0.17 6.47 C 0.11 6.41 0.11 6.2 0.17 6.14 C 0.87 6.1099997 1.04 6.06 1.04 5.23 Z M 1.89 5.54 C 1.89 5.83 2.04 6.18 2.76 6.18 C 3.4499998 6.18 4.14 5.95 4.14 4.68 C 4.14 3.6 3.62 3.1599998 2.6499999 3.1599998 C 2.3999999 3.1599998 2 3.1799998 1.89 3.21 Z "/>
+        </symbol>
+        <symbol id="g62E843CADC663C7579F6A3FBC77388E" overflow="visible">
+            <path d="M 3.9499998 6.31 C 3.37 6.39 3.3899999 6.58 2.4099998 6.58 C 1.4 6.58 0.51 5.91 0.51 4.85 C 0.51 3.8 1.38 3.32 2.26 2.98 C 2.86 2.75 3.6 2.47 3.6 1.49 C 3.6 0.68 3.1499999 0.26 2.35 0.26 C 1.42 0.26 0.82 0.69 0.59999996 1.68 C 0.47 1.7199999 0.37 1.6999999 0.26999998 1.65 C 0.31 0.88 0.35 0.62 0.47 0.13 C 1.1 0.13 1.38 -0.099999994 2.27 -0.099999994 C 2.72 -0.099999994 3.1499999 0.01 3.5 0.22999999 C 4.08 0.59 4.44 1.1999999 4.44 1.8399999 C 4.44 2.8999999 3.6399999 3.37 2.82 3.6799998 C 2.22 3.8999999 1.22 4.2799997 1.22 5.12 C 1.22 5.68 1.73 6.24 2.32 6.24 C 3.29 6.24 3.6 5.62 3.8 4.96 C 3.9099998 4.94 4.04 4.95 4.13 5.02 C 4.0899997 5.5 4.0499997 5.7799997 3.9499998 6.31 Z "/>
+        </symbol>
+        <symbol id="g95425A100709BC27DA272BD05EE64D31" overflow="visible">
+            <path d="M 2.9299998 6.46 L 2.52 6.46 L 0.14999999 -0.59 L 0.56 -0.59 Z "/>
+        </symbol>
+        <symbol id="gA9BC29EE3008608C91A9D8C24BBB5729" overflow="visible">
+            <path d="M 1.67 3.9399998 C 1.61 3.8799999 1.5699999 3.8999999 1.5699999 3.99 L 1.5699999 5.83 C 1.5699999 6.48 1.61 6.8799996 1.61 6.8799996 C 1.61 6.95 1.5699999 6.98 1.48 6.98 C 1.23 6.8799996 0.48 6.74 0.08 6.71 C 0.06 6.6299996 0.08 6.47 0.14 6.41 C 0.17 6.41 0.19999999 6.41 0.22999999 6.41 C 0.66999996 6.3799996 0.78 6.3799996 0.78 5.5899997 L 0.78 0.71 C 0.78 0.32 0.77 0.14 0.74 0 C 0.78999996 -0.08 0.84 -0.12 0.96 -0.12 C 1.02 -0.06 1.12 0.03 1.1999999 0.12 C 1.3 0.24 1.36 0.24 1.4699999 0.14999999 C 1.6999999 -0.04 2 -0.099999994 2.35 -0.099999994 C 3.37 -0.099999994 4.56 0.83 4.56 2.4199998 C 4.56 3.6399999 3.6599998 4.39 2.79 4.39 C 2.36 4.39 1.9799999 4.22 1.67 3.9399998 Z M 1.74 3.6299999 C 2 3.86 2.3 3.9499998 2.59 3.9499998 C 3.1999998 3.9499998 3.6999998 3.21 3.6999998 2.24 C 3.6999998 1.1 3.24 0.24 2.29 0.24 C 1.9799999 0.24 1.77 0.45999998 1.5699999 0.71 L 1.5699999 3.21 C 1.5699999 3.4199998 1.61 3.52 1.74 3.6299999 Z "/>
+        </symbol>
+        <symbol id="g99297E49F87FAD41528007F13950C7FE" overflow="visible">
+            <path d="M 2.48 7.04 L 0.35999998 7.04 C 0.29999998 6.98 0.29999998 6.7999997 0.35999998 6.74 C 1.74 6.6299996 1.81 6.48 1.81 5.6099997 L 1.81 -0.5 C 1.81 -1.37 1.73 -1.52 0.35999998 -1.62 C 0.29999998 -1.68 0.29999998 -1.86 0.35999998 -1.92 L 2.48 -1.92 Z "/>
+        </symbol>
+        <symbol id="gE67DAEF23D7CF18A0AF2DED5954B0CBF" overflow="visible">
+            <path d="M 1.75 1.22 L 1.75 3.8999999 L 2.6799998 3.8999999 C 2.77 3.8999999 2.9099998 3.9399998 2.9099998 4.0299997 L 2.9099998 4.23 C 2.9099998 4.27 2.8799999 4.29 2.83 4.29 L 1.75 4.29 L 1.75 4.8599997 C 1.75 6.3999996 2.21 6.64 2.54 6.64 C 2.84 6.64 3 6.52 3.1399999 6.17 C 3.22 5.99 3.33 5.85 3.54 5.85 C 3.7099998 5.85 3.9499998 6.06 3.9499998 6.2599998 C 3.9499998 6.43 3.84 6.6099997 3.6299999 6.77 C 3.37 6.95 3.11 6.98 2.73 6.98 C 1.89 6.98 0.96 6.25 0.96 4.69 L 0.96 4.29 L 0.45 4.29 C 0.26999998 4.29 0.22 4.17 0.22 4.0899997 L 0.22 3.9599998 C 0.22 3.9099998 0.22999999 3.8999999 0.26999998 3.8999999 L 0.96 3.8999999 L 0.96 1.22 C 0.96 0.39 0.79999995 0.35 0.26 0.31 C 0.19999999 0.25 0.19999999 0.04 0.26 -0.02 C 0.61 -0.01 0.96 0 1.36 0 C 1.76 0 2.24 -0.01 2.59 -0.02 C 2.6499999 0.04 2.6499999 0.25 2.59 0.31 C 1.81 0.35 1.75 0.39 1.75 1.22 Z "/>
+        </symbol>
+        <symbol id="g652A779657972FBC161C470ED70CA8EC" overflow="visible">
+            <path d="M 1.6999999 3.58 C 1.6899999 3.8799999 1.67 4.24 1.62 4.3399997 C 1.5999999 4.39 1.5799999 4.42 1.5 4.42 C 1.22 4.31 0.96 4.22 0.26999998 4.13 C 0.25 4.0699997 0.26999998 3.9099998 0.29 3.85 C 0.83 3.8 0.94 3.75 0.94 3.1699998 L 0.94 1.22 C 0.94 0.39999998 0.81 0.35 0.26 0.31 C 0.19999999 0.25 0.19999999 0.04 0.26 -0.02 C 0.56 -0.01 0.94 0 1.3399999 0 C 1.74 0 2.05 -0.01 2.35 -0.02 C 2.4099998 0.04 2.4099998 0.25 2.35 0.31 C 1.8399999 0.35999998 1.73 0.39999998 1.73 1.22 L 1.73 2.86 C 1.73 3.07 1.8199999 3.1899998 1.9 3.28 C 2.3 3.6699998 2.6699998 3.87 2.98 3.87 C 3.36 3.87 3.6399999 3.6299999 3.6399999 2.96 L 3.6399999 1.22 C 3.6399999 0.39999998 3.56 0.35 3.02 0.31 C 2.97 0.25 2.97 0.04 3.02 -0.02 C 3.27 -0.01 3.6399999 0 4.04 0 C 4.44 0 4.77 -0.01 5.02 -0.02 C 5.0699997 0.04 5.0699997 0.25 5.02 0.31 C 4.52 0.35 4.43 0.39999998 4.43 1.22 L 4.43 2.81 C 4.43 2.95 4.43 3.09 4.42 3.21 C 4.9 3.74 5.35 3.87 5.74 3.87 C 6.12 3.87 6.3399997 3.6499999 6.3399997 2.98 L 6.3399997 1.22 C 6.3399997 0.39999998 6.24 0.35 5.72 0.31 C 5.67 0.25 5.67 0.04 5.72 -0.02 C 5.97 -0.01 6.3399997 0 6.74 0 C 7.14 0 7.49 -0.01 7.77 -0.02 C 7.8199997 0.04 7.8199997 0.25 7.77 0.31 C 7.22 0.35 7.1299996 0.39999998 7.1299996 1.22 L 7.1299996 2.8 C 7.1299996 3.6899998 6.98 4.39 6.1 4.39 C 5.5899997 4.39 4.97 4.2 4.45 3.6499999 C 4.42 3.62 4.36 3.57 4.3399997 3.6599998 C 4.25 4.0699997 3.86 4.39 3.34 4.39 C 2.76 4.39 2.24 4.0499997 1.8199999 3.58 C 1.77 3.53 1.7099999 3.46 1.6999999 3.58 Z "/>
+        </symbol>
+        <symbol id="gD63979DFEF9CA9C9A7E89CD2DA3C3A19" overflow="visible">
+            <path d="M 3.98 0.90999997 C 3.9399998 1 3.86 1.04 3.77 1.05 C 3.4299998 0.61 3 0.39 2.57 0.39 C 1.8399999 0.39 1.23 1.13 1.23 2.3 C 1.23 3.3999999 1.7099999 4.06 2.37 4.06 C 2.96 4.06 3.04 3.7099998 3.08 3.36 C 3.11 3.09 3.25 3 3.46 3 C 3.6699998 3 3.9499998 3.1299999 3.9499998 3.4399998 C 3.9499998 3.99 3.3799999 4.39 2.4199998 4.39 C 1.43 4.39 0.37 3.5 0.37 2.08 C 0.37 0.78999996 1.09 -0.099999994 2.35 -0.099999994 C 2.95 -0.099999994 3.48 0.089999996 3.98 0.90999997 Z "/>
+        </symbol>
+        <symbol id="g8038720E9FD02ADAD29947BFDF50BDCC" overflow="visible">
+            <path d="M 3.5 1.22 L 3.5 5.04 C 3.5 5.68 3.6 6.06 3.9599998 6.06 L 4.18 6.06 C 4.93 6.06 5.4 5.7999997 5.5699997 5.0299997 C 5.68 5.0299997 5.8199997 5.04 5.91 5.08 C 5.8399997 5.54 5.79 6.0499997 5.7799997 6.5 C 5.7799997 6.5099998 5.7599998 6.5299997 5.75 6.5299997 C 5.41 6.5 4.2999997 6.45 3.51 6.45 L 2.6499999 6.45 C 1.88 6.45 0.7 6.5 0.32 6.5299997 C 0.29999998 6.5299997 0.28 6.5099998 0.28 6.5 C 0.24 6.0499997 0.14 5.5299997 0.03 5.0499997 C 0.13 5.0099998 0.25 5 0.37 5 C 0.57 5.7999997 1.03 6.06 1.6899999 6.06 L 2.18 6.06 C 2.55 6.06 2.6499999 5.68 2.6499999 5.0699997 L 2.6499999 1.22 C 2.6499999 0.39 2.48 0.34 1.68 0.31 C 1.62 0.25 1.62 0.04 1.68 -0.02 C 2.1699998 -0.01 2.6799998 0 3.08 0 C 3.46 0 3.97 -0.01 4.47 -0.02 C 4.5299997 0.04 4.5299997 0.25 4.47 0.31 C 3.6699998 0.34 3.5 0.39 3.5 1.22 Z "/>
+        </symbol>
+        <symbol id="gD6703645BCA3FEA845C8778091F5B88B" overflow="visible">
+            <path d="M 6.6299996 1.16 C 6.68 0.35 6.64 0.34 5.99 0.31 C 5.93 0.25 5.93 0.04 5.99 -0.02 C 6.3599997 -0.01 6.7599998 0 7.06 0 C 7.3599997 0 7.8199997 -0.01 8.15 -0.02 C 8.21 0.04 8.21 0.25 8.15 0.31 C 7.47 0.34 7.44 0.38 7.3799996 1.2099999 L 7.08 5.42 C 7.04 5.96 7.0899997 6.1 7.77 6.14 C 7.83 6.2 7.83 6.41 7.77 6.47 L 6.46 6.45 L 4.3399997 1.38 C 4.2999997 1.28 4.27 1.24 4.25 1.24 C 4.23 1.24 4.2 1.29 4.17 1.37 L 2.12 6.45 L 0.66999996 6.47 C 0.61 6.41 0.61 6.2 0.66999996 6.14 C 1.36 6.1 1.42 6.08 1.35 5.33 L 0.95 1.2099999 C 0.89 0.61 0.77 0.35999998 0.19999999 0.31 C 0.14 0.25 0.14 0.04 0.19999999 -0.02 C 0.5 -0.01 0.82 0 1.06 0 C 1.3 0 1.6999999 -0.01 2 -0.02 C 2.06 0.04 2.06 0.25 2 0.31 C 1.3399999 0.35999998 1.29 0.66999996 1.3399999 1.23 L 1.7199999 5.24 L 1.74 5.24 L 3.81 0.049999997 C 3.84 -0.02 3.8999999 -0.099999994 3.9599998 -0.099999994 C 4.02 -0.099999994 4.06 -0.03 4.1 0.049999997 L 6.35 5.3399997 L 6.37 5.3399997 Z "/>
+        </symbol>
+        <symbol id="gD908EABB3E36470B09E6E9B440A24B90" overflow="visible">
+            <path d="M 3.34 0.5 C 3.3899999 0.53999996 3.48 0.56 3.49 0.48999998 C 3.52 0.25 3.6 -0.099999994 3.6 -0.099999994 C 3.6799998 -0.13 3.73 -0.12 3.79 -0.099999994 C 4.0099998 0.08 4.36 0.22999999 4.97 0.29999998 C 5.0299997 0.35999998 5.0299997 0.51 4.97 0.57 C 4.33 0.62 4.24 0.81 4.24 1.3 L 4.24 5.83 C 4.24 6.48 4.2799997 6.8799996 4.2799997 6.8799996 C 4.2799997 6.95 4.24 6.98 4.15 6.98 C 3.8999999 6.8799996 3.1499999 6.74 2.75 6.71 C 2.73 6.6299996 2.75 6.47 2.81 6.41 C 2.84 6.41 2.87 6.41 2.8999999 6.41 C 3.34 6.3799996 3.4499998 6.3799996 3.4499998 5.5899997 L 3.4499998 4.31 C 3.4499998 4.24 3.4299998 4.22 3.36 4.22 C 3.32 4.22 2.9099998 4.39 2.58 4.39 C 1.92 4.39 1.48 4.17 1.0799999 3.79 C 0.65 3.36 0.39 2.77 0.39 2.03 C 0.39 0.79999995 1.01 -0.099999994 2.09 -0.099999994 C 2.48 -0.099999994 2.85 0.099999994 3.34 0.5 Z M 3.4499998 1.24 C 3.4499998 1.05 3.4299998 0.96999997 3.29 0.84999996 C 2.9199998 0.53 2.6 0.37 2.35 0.37 C 1.81 0.37 1.25 0.96 1.25 2.21 C 1.25 2.9299998 1.39 3.33 1.54 3.54 C 1.8499999 4.0099998 2.27 4.04 2.47 4.04 C 2.83 4.04 3.08 3.9099998 3.28 3.6799998 C 3.4199998 3.52 3.4499998 3.4499998 3.4499998 3.1399999 Z "/>
+        </symbol>
+        <symbol id="g24E326A9802E1326DD4439B59EE95868" overflow="visible">
+            <path d="M 2.03 3.98 C 2.09 4.04 2.09 4.25 2.03 4.31 C 1.73 4.2999997 1.38 4.29 1.02 4.29 C 0.64 4.29 0.37 4.2999997 0.11 4.31 C 0.049999997 4.25 0.049999997 4.04 0.11 3.98 C 0.58 3.9299998 0.69 3.83 0.9 3.31 L 2.21 0.08 C 2.27 -0.06 2.35 -0.12 2.45 -0.12 C 2.54 -0.12 2.62 -0.06 2.69 0.099999994 L 3.6999998 2.56 L 4.73 0.08 C 4.79 -0.06 4.87 -0.12 4.97 -0.12 C 5.06 -0.12 5.14 -0.06 5.2 0.089999996 L 6.5099998 3.26 C 6.67 3.6699998 6.8399997 3.9499998 7.35 3.98 C 7.41 4.04 7.41 4.25 7.35 4.31 C 7.1499996 4.2999997 6.93 4.29 6.6299996 4.29 C 6.33 4.29 5.91 4.2999997 5.6099997 4.31 C 5.5499997 4.25 5.5499997 4.04 5.6099997 3.98 C 6.3599997 3.9399998 6.2599998 3.6499999 6.12 3.3 L 5.2799997 1.22 C 5.18 0.96999997 5.14 0.96999997 5.06 1.18 L 4.2 3.37 C 4.0099998 3.84 4.0299997 3.9399998 4.56 3.98 C 4.62 4.04 4.62 4.25 4.56 4.31 C 4.2599998 4.2999997 3.86 4.29 3.56 4.29 C 3.31 4.29 2.98 4.2999997 2.6799998 4.31 C 2.62 4.25 2.62 4.04 2.6799998 3.98 C 3.1499999 3.9399998 3.27 3.6499999 3.4499998 3.1999998 L 3.51 3.04 L 2.81 1.27 C 2.6799998 0.94 2.6599998 0.93 2.53 1.25 L 1.73 3.31 C 1.54 3.79 1.55 3.9499998 2.03 3.98 Z "/>
+        </symbol>
+        <symbol id="g326CA448B94C8D25A227D725FDC9AB9" overflow="visible">
+            <path d="M 1.67 2.86 C 1.67 3.07 1.76 3.1899998 1.8399999 3.28 C 2.22 3.6499999 2.73 3.87 3.12 3.87 C 3.32 3.87 3.53 3.74 3.6499999 3.51 C 3.75 3.31 3.77 3.04 3.77 2.74 L 3.77 1.22 C 3.77 0.39999998 3.6699998 0.35999998 3.1499999 0.31 C 3.1 0.25 3.1 0.04 3.1499999 -0.02 C 3.4299998 -0.01 3.77 0 4.17 0 C 4.5699997 0 4.9 -0.01 5.23 -0.02 C 5.2799997 0.04 5.2799997 0.25 5.23 0.31 C 4.67 0.35999998 4.56 0.39999998 4.56 1.22 L 4.56 2.71 C 4.56 3.26 4.52 3.75 4.29 4.0499997 C 4.12 4.27 3.81 4.39 3.46 4.39 C 2.97 4.39 2.37 4.2599998 1.76 3.58 C 1.76 3.57 1.75 3.57 1.74 3.56 C 1.7099999 3.52 1.66 3.46 1.66 3.58 L 1.67 5.83 C 1.67 6.48 1.7099999 6.8799996 1.7099999 6.8799996 C 1.7099999 6.95 1.67 6.98 1.5799999 6.98 C 1.3299999 6.8799996 0.58 6.74 0.17999999 6.71 C 0.16 6.6299996 0.17999999 6.47 0.24 6.41 C 0.26999998 6.41 0.29999998 6.41 0.32999998 6.41 C 0.77 6.3799996 0.88 6.3799996 0.88 5.5899997 L 0.88 1.22 C 0.88 0.39 0.76 0.35 0.17999999 0.31 C 0.12 0.25 0.12 0.04 0.17999999 -0.02 C 0.51 -0.01 0.88 0 1.28 0 C 1.66 0 1.99 -0.01 2.27 -0.02 C 2.33 0.04 2.33 0.25 2.27 0.31 C 1.76 0.35 1.67 0.39 1.67 1.22 Z "/>
+        </symbol>
+        <symbol id="gED8CAD24521E8BF74BBCB4EAC1068E02" overflow="visible">
+            <path d="M 4.44 3.87 C 4.63 3.87 4.81 4.0499997 4.81 4.25 C 4.81 4.46 4.62 4.62 4.35 4.62 C 4.0899997 4.62 3.61 4.45 3.35 4.11 C 3.23 4.19 2.8999999 4.39 2.31 4.39 C 1.42 4.39 0.59999996 3.79 0.59999996 2.87 C 0.59999996 2.33 0.84 2.02 1.12 1.75 C 0.84 1.51 0.68 1.0799999 0.68 0.74 C 0.68 0.38 0.88 0.099999994 1.14 -0.03 C 0.59999996 -0.35 0.32 -0.81 0.32 -1.24 C 0.32 -2.11 1.14 -2.3799999 1.91 -2.3799999 C 3.26 -2.3799999 4.72 -1.73 4.72 -0.65 C 4.72 -0.32999998 4.5699997 -0.08 4.2799997 0.16 C 3.8899999 0.48 3.1899998 0.48999998 2.82 0.48999998 C 2.6399999 0.48999998 2.3899999 0.47 2.1499999 0.44 C 2 0.42999998 1.9 0.42 1.8499999 0.42 C 1.56 0.42 1.2099999 0.56 1.2099999 0.98999995 C 1.2099999 1.1899999 1.27 1.4 1.39 1.5699999 C 1.63 1.43 1.91 1.36 2.3 1.36 C 3.1799998 1.36 3.99 1.91 3.99 2.8899999 C 3.99 3.36 3.85 3.6299999 3.56 3.9299998 C 3.6299999 4.0299997 3.83 4.17 3.9599998 4.17 C 4.0299997 4.17 4.1 4.14 4.16 4.0499997 C 4.2 3.97 4.33 3.87 4.44 3.87 Z M 1.3199999 -0.099999994 C 1.43 -0.13 1.5899999 -0.14999999 1.7199999 -0.14999999 C 2.03 -0.14999999 2.31 -0.12 2.45 -0.12 C 2.95 -0.12 3.37 -0.13 3.6599998 -0.29 C 4.0499997 -0.51 4.17 -0.65 4.17 -0.90999997 C 4.17 -1.63 3.11 -2.01 2.21 -2.01 C 1.8499999 -2.01 1.01 -1.7199999 1.01 -1.04 C 1.01 -0.7 1.03 -0.45 1.3199999 -0.099999994 Z M 3.1899998 2.79 C 3.1899998 1.86 2.72 1.67 2.35 1.67 C 1.51 1.67 1.42 2.3799999 1.42 3.02 C 1.42 3.72 1.67 4.08 2.22 4.08 C 2.85 4.08 3.1899998 3.62 3.1899998 2.79 Z "/>
+        </symbol>
+        <symbol id="g23EA1C753A3A6B5551EEA1A5D7FB37ED" overflow="visible">
+            <path d="M 3.57 -0.099999994 C 4.54 -0.099999994 5.43 0.35999998 6.1099997 1.25 C 6.06 1.3399999 5.99 1.4 5.8799996 1.4 C 5.17 0.62 4.49 0.31 3.56 0.31 C 2.21 0.31 1.28 1.8299999 1.28 3.29 C 1.28 4.15 1.51 4.87 1.87 5.3199997 C 2.37 5.93 2.95 6.21 3.47 6.21 C 4.85 6.21 5.4 5.39 5.66 4.5299997 C 5.7799997 4.49 5.8799996 4.52 5.99 4.58 C 5.94 5.1 5.87 5.58 5.77 6.1299996 C 5.2599998 6.18 4.81 6.58 3.58 6.58 C 2.73 6.58 2.02 6.27 1.43 5.73 C 0.77 5.12 0.37 4.14 0.37 3.1 C 0.37 1.36 1.42 -0.099999994 3.57 -0.099999994 Z "/>
+        </symbol>
+        <symbol id="gA44474528B05A80D93B01AD42F22D65" overflow="visible">
+            <path d="M 2.1699998 -0.099999994 C 2.56 -0.099999994 3.03 0.099999994 3.52 0.5 C 3.57 0.53999996 3.6599998 0.56 3.6699998 0.48999998 C 3.6999998 0.24 3.78 -0.099999994 3.78 -0.099999994 C 3.86 -0.13 3.9099998 -0.12 3.97 -0.099999994 C 4.19 0.08 4.54 0.22999999 5.15 0.29999998 C 5.21 0.35999998 5.21 0.51 5.15 0.57 C 4.5099998 0.62 4.42 0.81 4.42 1.3 L 4.42 3.22 C 4.42 3.52 4.46 4.25 4.46 4.25 C 4.46 4.2799997 4.43 4.31 4.38 4.31 C 4.33 4.2999997 4.18 4.29 4.0299997 4.29 C 3.7099998 4.29 3.35 4.2999997 3.01 4.31 C 2.95 4.25 2.95 4.04 3.01 3.98 C 3.5 3.9499998 3.6299999 3.83 3.6299999 3.1699998 L 3.6299999 1.24 C 3.6299999 1.05 3.61 0.96999997 3.47 0.84999996 C 3.1 0.53 2.72 0.37 2.47 0.37 C 2.1699998 0.37 1.67 0.51 1.67 1.4 L 1.67 3.22 C 1.67 3.52 1.7099999 4.25 1.7099999 4.25 C 1.7099999 4.2799997 1.68 4.31 1.63 4.31 C 1.5799999 4.2999997 1.43 4.29 1.28 4.29 C 0.96 4.29 0.59999996 4.2999997 0.26 4.31 C 0.19999999 4.25 0.19999999 4.04 0.26 3.98 C 0.74 3.9399998 0.88 3.83 0.88 3.1799998 L 0.88 1.26 C 0.88 0.57 1.18 -0.099999994 2.1699998 -0.099999994 Z "/>
+        </symbol>
+    </defs>
+</svg>
@@ -0,0 +1,35 @@
+#import "@preview/cetz:0.4.2"
+#import "@preview/cetz-plot:0.1.3": plot, chart
+
+#set page(width: auto, height: auto, margin: .5cm)
+#set text(size: 10pt)
+
+#cetz.canvas({
+  import cetz.draw: *
+
+  set-style(axes: (stroke: .5pt, tick: (stroke: .5pt)), legend: none)
+
+  plot.plot(
+    size: (12, 8),
+    name: "roofline",
+    x-min: 0.1, x-max: 10000, x-mode: "log",
+    y-min: 0, y-max: 200,
+    x-tick-step: none, y-tick-step: none,
+    {
+      plot.add(((0.1, 0), (18.025, 144.2), (10000, 144.2)), style: (stroke: (paint: blue)))
+      plot.add-anchor("membound", (1, 64))
+      plot.add-anchor("compbound", (1000, 144.2))
+      plot.add-anchor("ridge", (18.025, 144.2))
+    },
+    x-label: "Operational Intensity [FLOPS/byte]",
+    y-label: "Performance [TFLOPS]",
+  )
+
+  line("roofline.ridge", ((), "|-", (18.025, 0)), name: "ridge-horiz-line", stroke: (paint: red, dash: "densely-dashed"))
+
+  line("roofline.membound", ((), "|-", (1, 5)), mark: (start: ">"), name: "membound-arrow")
+  content("membound-arrow.end", [Memory bandwidth ceiling], anchor: "south", padding: .1)
+
+  line("roofline.compbound", ((), "|-", (1, 2)), mark: (start: ">"), name: "compbound-arrow")
+  content("compbound-arrow.end", [Compute ceiling], anchor: "north", padding: .1)
+})
@@ -1,7 +1,7 @@
 .. meta::
  :description: This chapter describes a set of best practices designed to help
   developers optimize the performance of HIP-capable GPU architectures.
-  :keywords: AMD, ROCm, HIP, CUDA, performance, guidelines
+  :keywords: AMD, ROCm, HIP, CUDA, performance, guidelines, optimization, how-to

 .. _how_to_performance_guidelines:

@@ -9,313 +9,619 @@
 Performance guidelines
 *******************************************************************************

-The AMD HIP performance guidelines are a set of best practices designed to help
-you optimize the application performance on AMDGPUs. The guidelines discuss
-established parallelization and optimization techniques to improve the
-application performance on HIP-capable GPU architectures.
+The AMD HIP performance guidelines provide practical, actionable techniques for
+optimizing application performance on AMD GPUs. This guide focuses on 
+step-by-step instructions and best practices for improving performance.

-Here are the four main cornerstones to help you exploit HIP's performance
-optimization potential:
+For theoretical foundations and performance concepts, see 
+:doc:`../understand/performance_optimization`.

- Parallel execution
- Memory bandwidth usage optimization
- Maximum throughput optimization
- Memory thrashing minimization
+Optimization workflow
+=====================

-This document discusses the usage and benefits of these cornerstones in detail.
+Follow this systematic approach to optimize GPU performance:
+
+1. **Profile and measure baseline**
+   
+   Use ``rocprofv3`` to identify bottlenecks:
+   
+   .. code-block:: bash
+   
+      rocprofv3 --stats ./your_application
+   
+   Collect metrics on kernel execution time, memory bandwidth, occupancy, and 
+   CU utilization.
+
+2. **Analyze metrics to identify bottlenecks**
+   
+   Determine if kernels are compute-bound or memory-bound. Check arithmetic 
+   intensity, memory bandwidth achieved vs peak, and compute throughput.
+   
+   For understanding the roofline model, see :ref:`roofline_model`.
+
+3. **Apply targeted optimizations**
+   
+   Based on identified bottlenecks, apply techniques from this guide.
+
+4. **Verify improvements**
+   
+   Re-profile to confirm performance gains.
+
+5. **Iterate**
+   
+   Repeat until performance goals are met.

 .. _parallel execution:

 Parallel execution
-================================================================================
+==================

 For optimal use and to keep all system components busy, the application must
-reveal and efficiently provide as much parallelism as possible. The parallelism
-can be performed at the application level, device level, and multiprocessor
-level.
-
-.. _application_parallel_execution:
+reveal and efficiently provide as much parallelism as possible.

 Application level
--------------------------------------------------------------------------------
+-----------------

-To enable parallel execution of the application across the host and devices, use
-:ref:`asynchronous calls and streams <asynchronous_how-to>`. Assign workloads
-based on efficiency: serial to the host or parallel to the devices.
+To enable parallel execution across the host and devices:

-For parallel workloads, when threads belonging to the same block need to
-synchronize to share data, use :cpp:func:`__syncthreads()` (see:
-:ref:`synchronization_functions`) within the same kernel invocation. For threads
-belonging to different blocks, use global memory with two separate
-kernel invocations. It is recommended to avoid the latter approach as it adds
-overhead.
+* Use :ref:`asynchronous calls and streams <asynchronous_how-to>`
+* Assign serial workloads to the host
+* Assign parallel workloads to the devices
+
+For parallel workloads:
+
+* Use :cpp:func:`__syncthreads()` (see :ref:`synchronization_functions`) for 
+  intra-block synchronization
+* Use global memory with separate kernel invocations for inter-block 
+  synchronization (has overhead, minimize when possible)

 Device level
--------------------------------------------------------------------------------
+------------

-Device level optimization primarily involves maximizing parallel execution
-across the multiprocessors on the device. You can achieve device level
-optimization by executing multiple kernels concurrently on a device. To enhance
-performance, the management of these kernels is facilitated by streams, which
-allows overlapping of computation and data transfers. This approach aims at
-keeping all multiprocessors busy by executing enough kernels concurrently.
-However, launching too many kernels can lead to resource contention, hence a
-balance must be found for optimal performance. The device level optimization
-helps in achieving maximum utilization of the device resources.
+Maximize parallel execution across multiprocessors:
+
+* Execute multiple kernels concurrently on a device
+* Use streams to overlap computation and data transfers
+* Keep all multiprocessors busy with enough concurrent kernels
+* Avoid launching too many kernels (causes resource contention)

 Multiprocessor level
--------------------------------------------------------------------------------
+--------------------

-Multiprocessor level optimization involves maximizing parallel execution within
-each multiprocessor on a device. The key to multiprocessor level optimization
-is to efficiently utilize the various functional units within a multiprocessor.
-For example, ensuring a sufficient number of resident warps, so that every clock
-cycle has an instruction from a warp is ready for execution. This instruction
-could either be another independent instruction of the same warp, which exploits
-:ref:`instruction level optimization <instruction optimization>`, or more
-commonly an instruction of another warp, which exploits thread-level parallelism.
+Maximize parallel execution within each multiprocessor:

-On the other hand, device level optimization focuses on the device as a whole,
-aiming at keeping all multiprocessors busy by executing enough kernels
-concurrently. Both multiprocessor and device levels of optimization are crucial
-for achieving maximum performance. They work together to ensure efficient
-utilization of the GPU resources, ranging from individual multiprocessors to the
-device as a whole.
+* Ensure sufficient resident warps for every clock cycle
+* Exploit instruction-level parallelism within warps
+* Exploit thread-level parallelism across warps
+* Balance resource usage for optimal occupancy

 .. _memory optimization:

 Memory throughput optimization
-================================================================================
+==============================

 The first step in maximizing memory throughput is to minimize low-bandwidth
 data transfers between the host and the device.

-Additionally, maximize the use of on-chip memory, that is, shared memory and
-caches, and minimize transfers with global memory. Shared memory acts as a
-user-managed cache explicitly allocated and accessed by the application. A
-common programming pattern is to stage data from device memory into shared
-memory. The staging of data from the device to shared memory involves the
-following steps:
-
-1. Each thread of a block loading data from device memory to shared memory.
-2. Synchronizing with all other threads of the block.
-3. Processing the data stored in shared memory.
-4. Synchronizing again if necessary.
-5. Writing the results back to the device global memory.
-
-For some applications, a traditional hardware-managed cache is more appropriate
-for exploiting data locality.
-
-In conclusion, the throughput of memory accesses by a kernel can vary
-significantly depending on the access pattern. Therefore, the next step in
-maximizing memory throughput is to organize memory accesses as optimally as
-possible. This is especially important for global memory accesses, as global
-memory bandwidth is low compared to available on-chip bandwidths and arithmetic
-instruction throughput. Thus, non-optimal global memory accesses generally have
-a high impact on performance.
-The memory throughput optimization techniques are further discussed in detail in
-the following sections.
+Additionally, maximize the use of on-chip memory (shared memory and caches) and 
+minimize transfers with global memory.

 .. _data transfer:

-Data transfer
--------------------------------------------------------------------------------
+Data transfer optimization
+--------------------------

-To minimize data transfers between the host and the device, applications should
-move more computations from the host to the device, even at the cost of running
-kernels that don't fully utilize parallelism for the device. Intermediate data
-structures should be created, used, and discarded in device memory without being
-mapped or copied to host memory.
+**Minimize host-device transfers**

-Batching small transfers into a single large transfer can improve performance
-due to the overhead associated with each transfer. On systems with a front-side
-bus, using page-locked host memory can enhance data transfer performance.
+* Move computations from host to device when possible
+* Create, use, and discard intermediate data structures on device
+* Avoid unnecessary copies to host memory

-When using mapped page-locked memory, there is no need to allocate device
-memory or explicitly copy data between device and host memory. Data transfers
-occur implicitly each time the kernel accesses the mapped memory. For optimal
-performance, these memory accesses should be coalesced, similar to global
-memory accesses. The process where threads in a warp access sequential memory
-locations is known as coalesced memory access, which can enhance memory data
-transfer efficiency.
+**Batch small transfers**

-On integrated systems where device and host memory are physically the same, no
-copy operation between host and device memory is required and hence mapped
-page-locked memory should be used instead. To check if the device is integrated,
-applications can query the integrated device property.
+Each memory transfer incurs a fixed overhead from driver calls and PCIe
+transaction setup. Consolidating many small transfers into a single large
+transfer amortizes this overhead across more data, resulting in much higher
+effective bandwidth.
+
+.. code-block:: cuda
+
+   // Instead of many small transfers
+   for (int i = 0; i < n; i++) {
+       hipMemcpy(&d_data[i], &h_data[i], sizeof(float), ...);
+   }
+   
+   // Use a single large transfer
+   hipMemcpy(d_data, h_data, n * sizeof(float), ...);
+
+**Use page-locked memory for transfers**
+
+Page-locked (pinned) memory cannot be swapped to disk by the operating system, 
+allowing the GPU to access it directly via DMA without CPU involvement. This 
+eliminates an extra copy through a staging buffer and achieves higher bandwidth.
+
+.. code-block:: cuda
+
+   float* h_pinned;
+   hipHostMalloc(&h_pinned, size);
+   // Faster transfers than pageable memory
+   hipMemcpy(d_data, h_pinned, size, hipMemcpyHostToDevice);
+
+**Use mapped memory on integrated systems**
+
+On integrated GPUs (APUs), the CPU and GPU share the same physical memory. 
+Mapped page-locked memory allows zero-copy access, where the GPU reads directly 
+from host memory without requiring an explicit transfer, eliminating redundant copies.
+
+.. code-block:: cuda
+
+   int integrated;
+   hipDeviceGetAttribute(&integrated, hipDeviceAttributeIntegrated, device);
+   if (integrated) {
+       // Use mapped page-locked memory - no explicit copy needed
+       hipHostMalloc(&ptr, size, hipHostMallocMapped);
+   }

 .. _device memory access:

 Device memory access
---------------------
+--------------------

-Memory access instructions might be repeated due to the spread of memory
-addresses across warp threads. The impact on throughput varies with memory type
-and is generally reduced when addresses are more scattered, especially in
-global memory.
+**Ensure proper alignment**

-Device memory is accessed via 32-, 64-, or 128-byte transactions that must be
-naturally aligned.
-Maximizing memory throughput involves:
+Memory hardware loads data in aligned chunks (typically 128 bytes). Using 
+naturally aligned data types ensures each access maps to a single memory 
+transaction, maximizing bandwidth and avoiding split transactions.

- Coalescing memory accesses of threads within a warp into minimal transactions.
- Following optimal access patterns.
- Using properly sized and aligned data types.
- Padding data when necessary.
+.. code-block:: cuda

-Global memory instructions support reading or writing data of specific sizes (1,
-2, 4, 8, or 16 bytes) that are naturally aligned. Not meeting the size and
-alignment requirements leads to multiple instructions, which reduces
-performance. Therefore, for correct results and optimal performance:
+   // Use naturally aligned types
+   float4 data;  // 16-byte aligned
+   float2 data;  // 8-byte aligned
+   
+   // Ensure structure alignment
+   struct __align__(16) MyStruct {
+       float4 data;
+   };

- Use data types that meet these requirements
- Ensure alignment for structures
- Maintain alignment for all values or arrays.
+**Optimize 2D array access**

-Threads often access 2D arrays at an address calculated as
-``BaseAddress + xIndex + width * yIndex``. For efficient memory access, the
-array and thread block widths should be multiples of the warp size. If the
-array width is not a multiple of the warp size, it is usually more efficient to
-allocate the array with a width rounded up to the nearest multiple and pad the
-rows accordingly.
+Padding 2D arrays to multiples of the wavefront size ensures each row starts 
+at an aligned memory boundary. This allows consecutive threads accessing the 
+same row to generate coalesced memory transactions, thereby maximizing
+bandwidth.

-Local memory is used for certain automatic variables, such as arrays with
-non-constant indices, large structures of arrays, and any variable where the
-kernel uses more registers than available. Local memory resides in device
-memory, which leads to high latency and low bandwidth, similar to global memory
-accesses. However, the local memory is organized for consecutive 32-bit words to
-be accessed by consecutive thread IDs, which allows full coalescing when all
-threads in a warp access the same relative address.
+.. code-block:: cuda

-Shared memory is located on-chip and provides higher bandwidth and lower latency
-than local or global memory. It is divided into banks that can be simultaneously
-accessed, which boosts bandwidth. However, bank conflicts, where two addresses
-fall in the same bank, lead to serialized access and decreased throughput.
-Therefore, understanding how memory addresses map to banks and scheduling
-requests to minimize conflicts is crucial for optimal performance.
+   // Ensure array width is multiple of warp size
+   int width = ((actual_width + warpSize - 1) / warpSize) * warpSize;
+   hipMalloc(&array, width * height * sizeof(float));
+   
+   // Access pattern
+   int idx = x + width * y;  // width should be warp-aligned

-Constant memory is in the device memory and cached in the constant cache.
-Requests are split based on different memory addresses and are serviced based
-either on the throughput of the constant cache for cache hits or on the
-throughput of the device memory otherwise. This splitting of requests affects
-throughput.
+**Coalesce memory accesses**

-Texture and surface memory are stored in the device memory and cached in the
-texture cache. This setup optimizes 2D spatial locality, which leads to better
-performance for threads reading close 2D addresses.
-Reading device memory through texture or surface fetching provides the following
-advantages:
+When consecutive threads in a wavefront access consecutive memory addresses, 
+the hardware combines these into a single wide transaction. Non-coalesced 
+patterns require multiple transactions, reducing effective bandwidth.

- Higher bandwidth for local texture fetches or surface reads.
- Offloading addressing calculation.
- Data broadcasting.
- Optional conversion of 8-bit and 16-bit integer input data to 32-bit
-  floating-point values on the fly.
+.. code-block:: cuda
+
+   // Good: consecutive threads access consecutive addresses
+   int idx = threadIdx.x + blockIdx.x * blockDim.x;
+   data[idx] = value;
+   
+   // Bad: strided access
+   int idx = threadIdx.x * stride;  // Non-coalesced if stride > 1
+   data[idx] = value;
+
+For understanding memory coalescing theory, see :ref:`memory_hierarchy_theory`.
+
+**Use shared memory for data reuse**
+
+Shared memory (LDS) provides low-latency on-chip storage shared across threads 
+in a block. Loading data into shared memory once and reusing it many times 
+reduces global memory traffic, particularly effective for tiled algorithms such
+as matrix multiplication.
+
+.. code-block:: cuda
+
+   __global__ void optimized_kernel(float* input, float* output) {
+       __shared__ float tile[TILE_SIZE][TILE_SIZE];
+       
+       // Load data into shared memory
+       tile[threadIdx.y][threadIdx.x] = input[...];
+       __syncthreads();
+       
+       // Reuse data from fast shared memory
+       float result = 0;
+       for (int i = 0; i < TILE_SIZE; i++) {
+           result += tile[threadIdx.y][i] * tile[i][threadIdx.x];
+       }
+       __syncthreads();
+       
+       output[...] = result;
+   }
+
+**Avoid bank conflicts in shared memory**
+
+Shared memory is organized into banks, each capable of servicing one request per
+cycle. When multiple threads in a warp access the same bank simultaneously, the
+requests are serialized, reducing throughput. Padding arrays by one element
+shifts addresses to avoid systematic conflicts.
+
+.. code-block:: cuda
+
+   // Bad: power-of-2 stride causes conflicts
+   __shared__ float data[32][32];
+   float value = data[threadIdx.x][threadIdx.y];
+   
+   // Good: padding avoids conflicts
+   __shared__ float data[32][33];  // Extra column
+   float value = data[threadIdx.x][threadIdx.y];
+
+For bank conflict theory, see :ref:`bank_conflicts_theory`.
+
+**Use texture memory for 2D spatial access**
+
+Texture memory provides hardware-accelerated 2D filtering and caching optimized 
+for spatial locality. It automatically handles boundary conditions and can 
+interpolate values, making it ideal for image processing and nearby-neighbor access patterns.
+
+.. code-block:: cuda
+
+   // Create texture object
+   hipTextureObject_t texObj;
+   hipCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+   
+   // Access in kernel
+   float value = tex2D<float>(texObj, x, y);

 .. _instruction optimization:

-Optimization for maximum instruction throughput
-================================================================================
-
-To maximize instruction throughput:
-
- Minimize low throughput arithmetic instructions.
- Minimize divergent warps inflicted by flow control instructions.
- Maximize instruction parallelism.
-
-These techniques are discussed in detail in the following sections.
+Instruction throughput optimization
+====================================

 Arithmetic instructions
--------------------------------------------------------------------------------
+-----------------------

-The type and complexity of arithmetic operations can significantly impact the
-performance of your application. We are highlighting some hints how to maximize
-it.
+**Use efficient operations**

-Use efficient operations: Some arithmetic operations are costlier than others.
-For example, multiplication is typically faster than division, and integer
-operations are usually faster than floating-point operations, especially with
-double precision.
+Division requires many more hardware cycles than multiplication. Similarly, 
+bitwise operations (shifts, AND, OR) are single-cycle instructions on integer 
+units, making them far more efficient than equivalent arithmetic for power-of-two calculations.

-Minimize low-throughput instructions: This might involve trading precision for
-speed when it does not affect the final result. For instance, consider using
-single-precision arithmetic instead of double-precision.
+.. code-block:: cuda

-Leverage intrinsic functions: Intrinsic functions are predefined functions
-available in HIP that can often be executed faster than equivalent arithmetic
-operations (subject to some input or accuracy restrictions). They can help
-optimize performance by replacing more complex arithmetic operations.
+   // Prefer multiplication over division
+   float result = value * 0.5f;     // Fast
+   float result = value / 2.0f;     // Slower
+   
+   // Use bitwise operations for powers of 2
+   int index = threadIdx.x << 2;    // Multiply by 4
+   int mask = (1 << n) - 1;         // Create bit mask

-Optimize memory access: The memory access efficiency can impact the speed of
-arithmetic operations. See: :ref:`device memory access`.
+**Use single-precision when possible**
+
+AMD GPUs have significantly higher throughput for single-precision (FP32) 
+operations compared to double-precision (FP64). Using single-precision math 
+functions can deliver substantial performance gains when FP64 accuracy is not required.
+
+.. code-block:: cuda
+
+   // Single-precision (faster)
+   float result = sinf(x);
+   float result = expf(x);
+   
+   // Double-precision (slower, use only when necessary)
+   double result = sin(x);
+   double result = exp(x);
+
+**Leverage fast math intrinsics**
+
+Hardware-specific intrinsics bypass certain accuracy checks and use lookup
+tables or polynomial approximations, trading slight precision loss for
+significantly higher throughput. These should be used when the application can
+tolerate reduced precision.
+
+.. code-block:: cuda
+
+   // Fast intrinsic versions
+   float ex = __expf(x);            // Fast exponential
+   float lg = __logf(x);            // Fast logarithm
+   float sq = __fsqrt_rn(x);        // Fast square root
+   float rc = __frcp_rn(x);         // Fast reciprocal

 .. _control flow instructions:

-Control flow instructions
--------------------------------------------------------------------------------
+Control flow optimization
+-------------------------

-Control flow instructions (``if``, ``else``, ``for``, ``do``, ``while``,
-``break``, ``continue``, ``switch``) can impact instruction throughput by
-causing threads within a warp to diverge and follow different execution paths.
-To optimize performance, write control conditions to minimize divergent warps.
-For example, when the control condition depends on ``threadIdx`` or ``warpSize``,
-warp doesn't diverge. The compiler might optimize loops, short ifs, or switch
-blocks using branch predication, which prevents warp divergence. With branch
-predication, instructions associated with a false predicate are scheduled but
-not executed, which avoids unnecessary operations. For control conditions where
-one outcome is significantly more likely than the other, use `__builtin_expect <https://clang.llvm.org/docs/LanguageExtensions.html#builtin-expect>`_
-or ``[[likely]]`` to indicate the likely condition result.
+**Minimize divergence**

-Avoiding divergent warps
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+When threads in a wavefront take different execution paths, the hardware 
+serializes both branches, executing each path with only the relevant threads 
+active. This reduces effective parallelism and wastes cycles on inactive threads.

-Warps diverge when threads within the same warp follow different execution paths.
-This is caused by conditional statements that lead to different arithmetic
-operations being performed by different threads. Divergent warps can
-significantly reduce instruction throughput, so it is advisable to structure
-your code to minimize divergence.
+.. code-block:: cuda
+
+   // Good: no divergence (condition depends on threadIdx)
+   if (threadIdx.x < 32) {
+       // All threads in first half-warp execute
+   }
+   
+   // Bad: divergence within warp
+   if (data[threadIdx.x] > threshold) {
+       // Some threads execute, others don't
+   }
+
+**Use branch hints for predictable conditions**
+
+Providing hints about branch likelihood helps the compiler generate better 
+instruction ordering and can improve the branch predictor's accuracy, reducing 
+pipeline stalls when the prediction proves correct.
+
+.. code-block:: cuda
+
+   if (__builtin_expect(rare_condition, 0)) {
+       // Unlikely branch
+   }
+   
+   // C++20 attribute
+   if (common_condition) [[likely]] {
+       // Likely branch
+   }
+
+**Avoid divergent warps**
+
+When divergence is unavoidable, restructure the code to separate divergent paths 
+into different kernel launches or use predication (branchless programming) to 
+keep all threads active, though computing unnecessary values may be acceptable 
+if it avoids the serialization penalty.
+
+.. code-block:: cuda
+
+   // Instead of:
+   if (threadIdx.x % 2 == 0) {
+       result = compute_even();
+   } else {
+       result = compute_odd();
+   }
+   
+   // Consider separating into different kernels or using predication

 Synchronization
--------------------------------------------------------------------------------
+---------------

-Synchronization ensures that all threads within a block complete their
-computations and memory accesses before moving forward, which is critical when
-threads depend on other thread results. However, synchronization can also cause
-performance overhead, as it needs the threads to wait, which might lead to idle
-GPU resources.
+**Use minimal synchronization**

-To synchronize all threads in a block, use :cpp:func:`__syncthreads()`.
-:cpp:func:`__syncthreads()` ensures that, all threads reach the same point in
-the code and can access shared memory after reaching that point.
+Each synchronization point stalls all threads in a block until the slowest one 
+reaches the barrier. Minimize synchronizations by carefully analyzing data 
+dependencies—only synchronize when threads genuinely need to exchange data 
+through shared memory.

-An alternative way to synchronize is to use streams. Different streams can
-execute commands either without following a specific order or concurrently. This
-is why streams allow more fine-grained control over the execution order of
-commands, which can be beneficial in certain scenarios.
+.. code-block:: cuda
+
+   __global__ void kernel() {
+       __shared__ float data[256];
+       
+       // Load phase
+       data[threadIdx.x] = input[...];
+       __syncthreads();  // Necessary sync
+       
+       // Compute phase - no sync needed if threads are independent
+       float result = compute(data[...]);
+       
+       // Store phase - sync only if needed
+       output[...] = result;
+   }
+
+**Use streams for async execution**
+
+Streams enable concurrent execution of independent operations. Commands in 
+different streams can overlap in time, allowing kernel execution and memory 
+transfers to run simultaneously. This maximizes GPU utilization by keeping 
+multiple execution engines busy concurrently.
+
+.. code-block:: cuda
+
+   hipStream_t stream1, stream2;
+   hipStreamCreate(&stream1);
+   hipStreamCreate(&stream2);
+   
+   // Overlap independent operations
+   kernel1<<<grid, block, 0, stream1>>>(...);
+   kernel2<<<grid, block, 0, stream2>>>(...);
+   
+   hipStreamSynchronize(stream1);
+   hipStreamSynchronize(stream2);
+
+Managing register pressure
+==========================
+
+High register usage can limit occupancy. Follow these steps:
+
+**Minimize live variables**
+
+The compiler allocates registers for every variable that must remain accessible. 
+Reducing the number of simultaneously live variables frees registers, allowing 
+more wavefronts to fit on each CU. Chaining function calls trades some redundant 
+computation for lower register usage.
+
+.. code-block:: cuda
+
+   // Instead of storing all intermediate results
+   float a = compute_a();
+   float b = compute_b();
+   float c = compute_c();
+   float result = combine(a, b, c);
+   
+   // Recompute or chain operations
+   float result = combine(compute_a(), compute_b(), compute_c());
+
+**Use shared memory for temporary storage**
+
+Per-thread arrays stored in registers consume valuable register space, limiting 
+occupancy. Moving temporary storage to shared memory trades register usage for 
+shared memory usage, often allowing higher occupancy since shared memory limits 
+are typically less restrictive.
+
+.. code-block:: cuda
+
+   // Instead of per-thread arrays (uses registers)
+   float temp[100];
+   
+   // Use shared memory
+   __shared__ float temp[blockDim.x][100];
+   float* my_temp = temp[threadIdx.x];
+
+**Adjust launch bounds**
+
+The ``__launch_bounds__`` attribute provides hints to the compiler about expected 
+thread block size and minimum blocks per CU. This guides register allocation 
+decisions, potentially trading per-thread register count for higher occupancy.
+
+.. code-block:: cuda
+
+   __global__ void 
+   __launch_bounds__(256, 4)  // 256 threads, 4 blocks per CU
+   my_kernel() {
+       // Kernel code
+   }
+
+**Check register usage during compilation**
+
+The compiler can report per-kernel register usage statistics. Monitoring this 
+output helps identify kernels consuming excessive registers, guiding optimization 
+efforts toward reducing register pressure in the most impactful areas.
+
+.. code-block:: bash
+
+   hipcc --resource-usage kernel.hip
+
+For register pressure theory, see :ref:`register_pressure_theory`.
+
+Improving occupancy
+===================
+
+Higher occupancy helps hide latency. Follow these steps:
+
+**Reduce register usage per thread**
+
+Use techniques from "Managing register pressure" above.
+
+**Reduce shared memory usage per block**
+
+Each CU has limited shared memory that must be divided among resident blocks. 
+Reducing per-block shared memory usage allows more blocks to reside simultaneously, 
+increasing occupancy and improving latency hiding through greater thread-level parallelism.
+
+.. code-block:: cuda
+
+   // Allocate only what's needed
+   __shared__ float tile[TILE_SIZE][TILE_SIZE];
+   
+   // Or use dynamic allocation
+   extern __shared__ float dynamic_shared[];
+
+**Optimize block size**
+
+AMD GPUs execute threads in wavefronts of 64. Choosing block sizes as multiples 
+of 64 prevents partial wavefronts that waste execution slots. Larger blocks 
+(128-256 threads) typically achieve better occupancy and resource utilization.
+
+.. code-block:: cuda
+
+   // Use multiples of wavefront size
+   dim3 block(64);    // Good for AMD GPUs (wavefront=64)
+   dim3 block(128);   // Common choice
+   dim3 block(256);   // Good for high-occupancy kernels
+   
+   // Avoid very small blocks
+   dim3 block(32);    // May waste resources
+
+**Profile occupancy**
+
+Profiling tools report the ratio of active wavefronts to maximum possible 
+wavefronts per CU. Low occupancy suggests resource constraints (registers or 
+shared memory) are limiting parallelism and may indicate opportunities for optimization.
+
+.. code-block:: bash
+
+   rocprofv3 --occupancy ./your_application
+
+For occupancy theory, see :ref:`occupancy`.

 Minimizing memory thrashing
-================================================================================
+============================

 Applications frequently allocating and freeing memory might experience slower
-allocation calls over time as memory is released back to the operating system.
-To optimize performance in such scenarios, follow these guidelines:
+allocation calls over time. To optimize:

- Avoid allocating all available memory with :cpp:func:`hipMalloc` or
-  :cpp:func:`hipHostMalloc`, as this immediately reserves memory and might
-  prevent other applications from using it. This behavior could strain the
-  operating system schedulers or prevent other applications from running on the
-  same GPU.
- Try to allocate memory in suitably sized blocks early in the application's
-  lifecycle and deallocate only when the application no longer needs it.
-  Minimize the number of :cpp:func:`hipMalloc` and :cpp:func:`hipFree` calls in
-  your application, particularly in performance-critical areas.
- Consider resorting to other memory types such as :cpp:func:`hipHostMalloc` or
-  :cpp:func:`hipMallocManaged`, if an application can't allocate sufficient
-  device memory. While the other memory types might not offer similar
-  performance, they allow the application to continue running.
- For supported platforms, use :cpp:func:`hipMallocManaged`, as it allows
-  oversubscription. With the right policies, :cpp:func:`hipMallocManaged` can
-  maintain most, if not all, :cpp:func:`hipMalloc` performance.
-  :cpp:func:`hipMallocManaged` doesn't require an allocation to be resident
-  until it is needed or prefetched, which eases the load on the operating
-  system's schedulers and facilitates multitenant scenarios.
+**Allocate early, deallocate late**
+
+Frequent allocation and deallocation causes memory fragmentation and increases 
+allocator overhead. Reusing allocations across iterations amortizes the cost 
+of memory management and maintains better memory locality.
+
+.. code-block:: cuda
+
+   // Bad: frequent allocation in loop
+   for (int i = 0; i < iterations; i++) {
+       float* temp;
+       hipMalloc(&temp, size);
+       // Use temp
+       hipFree(temp);
+   }
+   
+   // Good: allocate once
+   float* temp;
+   hipMalloc(&temp, size);
+   for (int i = 0; i < iterations; i++) {
+       // Reuse temp
+   }
+   hipFree(temp);
+
+**Avoid allocating all available memory**
+
+Reserving some memory headroom prevents allocation failures and system instability. 
+The driver and runtime need workspace for internal operations, and leaving a 
+safety margin ensures stable operation without unexpected out-of-memory errors.
+
+.. code-block:: cuda
+
+   size_t free, total;
+   hipMemGetInfo(&free, &total);
+   
+   // Don't allocate all free memory
+   size_t safe_size = free * 0.9;  // Leave some margin
+
+**Use managed memory for oversubscription**
+
+Managed memory automatically migrates data between host and device on demand, 
+allowing allocations larger than physical GPU memory. Prefetching hints help 
+the runtime optimize page placement, reducing migration overhead during kernel execution.
+
+.. code-block:: cuda
+
+   // Allows exceeding physical memory
+   float* data;
+   hipMallocManaged(&data, large_size);
+   
+   // Optionally prefetch to device
+   hipMemPrefetchAsync(data, size, device, stream);
+
+Summary
+=======
+
+Key optimization techniques:
+
+* **Profile first**: Use ``rocprofv3`` to identify actual bottlenecks
+* **Parallelize effectively**: Maximize work at all levels (application, device, CU)
+* **Optimize memory**: Minimize transfers, maximize coalescing, use LDS
+* **Manage resources**: Balance registers, shared memory, and occupancy
+* **Minimize divergence**: Structure control flow to keep warps coherent
+
+For understanding the theory behind these techniques, refer to 
+:doc:`../understand/performance_optimization` and :doc:`../understand/hardware_implementation`.
@@ -30,6 +30,7 @@ The HIP documentation is organized into the following categories:
 :::{grid-item-card} Programming guide

 * {doc}`./understand/programming_model`
+* {doc}`./understand/performance_optimization`
 * {doc}`./understand/hardware_implementation`
 * {doc}`./understand/compilers`
 * {doc}`./how-to/performance_guidelines`
@@ -26,6 +26,7 @@ subtrees:
 - caption: Programming guide
  entries:
  - file: understand/programming_model
+  - file: understand/performance_optimization
  - file: understand/hardware_implementation
  - file: understand/compilers
  - file: how-to/performance_guidelines
@@ -1,24 +1,110 @@
 # Glossary of terms

-* **host**, **host CPU** : Executes the HIP runtime API and is capable of initiating kernel launches to one or more devices.
-* **default device** : Each host thread maintains a default device.
-Most HIP runtime APIs (including memory allocation, copy commands, kernel launches) do not accept an explicit device
-argument but instead implicitly use the default device.
-The default device can be set with `hipSetDevice`.
+## Host and device

-* **active host thread** - the thread which is running the HIP APIs.
+* **host**, **host CPU**: Executes the HIP runtime API and is capable of initiating kernel launches to one or more devices.
+* **device**: A GPU or accelerator that executes HIP kernels. In the context of HIP, this typically refers to AMD GPUs or NVIDIA GPUs when using HIP on CUDA.
+* **default device**: Each host thread maintains a default device. Most HIP runtime APIs (including memory allocation, copy commands, kernel launches) do not accept an explicit device argument but instead implicitly use the default device. The default device can be set with `hipSetDevice`.
+* **active host thread**: The thread which is running the HIP APIs.

-* **HIP-Clang** - Heterogeneous AMDGPU Compiler, with its capability to compile HIP programs on AMD platform (https://github.com/RadeonOpenCompute/llvm-project).
+## Architecture and hardware

-* **clr** - a repository for AMD Compute Language Runtime, contains source codes for AMD's compute languages runtimes: HIP and OpenCL.
-clr (https://github.com/ROCm/clr) contains the following three parts,
+* **AMD device architecture**: The organization and execution model of AMD GPUs, defining how computation happens inside the hardware through programmable compute units organized into shader engines.
+* **compute unit (CU)**: The fundamental programmable engine of AMD GPUs. Each CU manages thousands of lightweight threads, orchestrating their execution, memory access, and synchronization. A CU consists of SIMD units, scalar units, register files, LDS, and caches.
+* **GPU cores**: The primary arithmetic engines within compute units, including SIMD lanes and matrix fused multiply-add (MFMA) units that execute mathematical and logical operations.
+* **SIMD (Single Instruction, Multiple Data)**: Hardware units that execute the same instruction across multiple data elements simultaneously. AMD GPUs typically have four SIMD units per CU.
+* **VALU (Vector Arithmetic Logic Unit)**: Executes vector instructions across entire wavefronts, with each thread potentially operating on different data.
+* **SALU (Scalar Arithmetic Logic Unit)**: Executes instructions uniformly across all threads in a wavefront, handling control flow and wavefront-uniform operations.
+* **SFU (Special Function Unit)**: Accelerates transcendental and reciprocal mathematical functions like exp, log, sin, cos, rcp, and rsqrt.
+* **LSU (Load/Store Unit)**: Handles data transfers between compute units and GPU memory subsystems, managing thousands of outstanding memory requests.
+* **MFMA (Matrix Fused Multiply-Add)**: Specialized hardware units in CDNA architectures that perform large-scale matrix operations in a single instruction, providing the primary source of peak floating-point performance.
+* **DME (Data Movement Engine)**: Specialized hardware units in CDNA3/4 that accelerate access to multi-dimensional tensor data, performing high-throughput copies between global memory and on-chip memory.
+* **shader engine**: Top-level organizational unit in AMD GPUs containing multiple shader arrays and shared resources.
+* **GFX IP**: Graphics IP version identifier (like gfx908, gfx90a, gfx942) that specifies the precise features, register layout, and machine instruction format a GPU supports.
+* **CDNA**: Compute DNA architecture specialized for high-performance computing and machine learning workloads, featuring matrix cores and enhanced memory bandwidth.
+* **RDNA**: Radeon DNA architecture optimized for graphics and lower-latency compute workloads, featuring Wave32 execution and work group processors.
+* **GCN**: Graphics Core Next, the foundational architecture for modern AMD GPUs that established key design principles still used today.

-  * `hipamd`: contains implementation of HIP language on AMD platform.
-  * `rocclr`: contains common runtime used in HIP and OpenCL, which provides virtual device interfaces that compute runtimes interact with different backends such as ROCr on Linux or PAL on Windows.
-  * `opencl`: contains implementation of OpenCL on AMD platform.
+## Memory hierarchy

-* **hipify tools** - tools to convert CUDA code to portable C++ code (https://github.com/ROCm/HIPIFY).
+* **VGPR (Vector General-Purpose Registers)**: Per-thread registers that hold data processed by SIMD lanes, such as individual elements of matrices or vectors.
+* **SGPR (Scalar General-Purpose Registers)**: Registers holding values shared across an entire wavefront, such as loop counters, constants, or addresses.
+* **AGPR (Accumulation VGPRs)**: Additional register file space in CDNA architectures dedicated to matrix accumulation, doubling available register storage for matrix operations.
+* **register file**: Primary on-chip memory that holds data between arithmetic and memory operations, built from extremely fast SRAM.
+* **LDS (Local Data Share)**: Fast on-chip scratchpad memory shared among threads in a workgroup, providing low-latency communication within a block.
+* **L1 data cache**: Private on-chip memory associated with each compute unit, providing fast access to recently used data.
+* **global memory**: General read-write accessible memory visible to all threads on a device, backed by high-bandwidth memory (HBM).
+* **constant memory**: Read-only storage visible to all threads, optimized for uniform access patterns across a wavefront.
+* **texture memory**: Special read-only memory optimized for spatial locality and 2D/3D access patterns with filtering capabilities.
+* **surface memory**: Read-write version of texture memory.
+* **HBM (High Bandwidth Memory)**: Vertically stacked DRAM technology providing terabytes per second of memory bandwidth in modern GPUs.
+* **memory coalescing**: Hardware optimization that combines memory accesses from multiple threads into fewer transactions when accessing consecutive addresses.
+* **bank conflict**: Performance penalty when multiple threads access different addresses in the same LDS bank, causing serialization.

-* **`hipconfig`** - tool to report various configuration properties of the target platform.
+## Execution model

-* **`nvcc`** - NVIDIA CUDA `nvcc` compiler, do not capitalize.
+* **work-item**: A single instance of a kernel, representing one thread of parallel execution.
+* **wavefront**: A group of threads (typically 32 or 64) that execute the same instruction simultaneously in lockstep on a SIMD unit.
+* **work-group**: A collection of wavefronts that can synchronize and share LDS memory, executing on the same compute unit.
+* **grid**: The total collection of work-groups launched for a kernel, defining the problem size.
+* **HIP kernel**: A function marked with `__global__` that executes on the GPU device in parallel across many threads.
+* **HIP thread hierarchy**: The three-level organization of threads (thread, block, grid) that defines parallel execution scope.
+* **warp size**: The number of threads in a wavefront - typically 64 for AMD CDNA/GCN architectures and 32 for RDNA architectures.
+* **occupancy**: The ratio of active wavefronts to maximum possible wavefronts on a compute unit, affecting latency hiding ability.
+* **wavefront scheduler**: Hardware component that decides which wavefront to execute each clock cycle, enabling rapid context switching.
+* **wavefront divergence**: Performance penalty when threads within a wavefront take different execution paths due to conditional statements.
+* **branch efficiency**: The ratio of non-divergent to total branches, indicating control flow uniformity.
+
+## Performance concepts
+
+* **performance bottleneck**: The limiting factor preventing higher kernel performance, typically either compute-bound or memory-bound.
+* **roofline model**: Visual performance analysis framework relating achievable performance to hardware limits based on arithmetic intensity.
+* **compute-bound**: Kernel performance limited by arithmetic throughput rather than memory bandwidth.
+* **memory-bound**: Kernel performance limited by memory bandwidth rather than compute capacity.
+* **arithmetic intensity**: Ratio of floating-point operations to memory traffic (FLOPs/byte), determining compute vs memory boundedness.
+* **overhead**: Fixed costs associated with operations like kernel launches, memory allocations, or synchronization.
+* **latency hiding**: GPU technique of switching between wavefronts to mask memory and instruction latency through massive multithreading.
+* **memory bandwidth**: Rate of data transfer between GPU compute units and memory, measured in GB/s or TB/s.
+* **arithmetic bandwidth**: Rate of arithmetic operations, measured in FLOPS (floating-point operations per second).
+* **active cycle**: Percentage of cycles where at least one instruction is executing on a compute unit.
+* **pipe utilization**: Percentage of execution cycles where the pipeline actively processes instructions.
+* **peak rate**: Theoretical maximum performance of a GPU in FLOPS or bandwidth.
+* **issue efficiency**: Ratio of issued instructions to maximum possible, indicating scheduling effectiveness.
+* **CU utilization**: Percentage of compute units actively executing work.
+* **register pressure**: Condition where high register usage limits occupancy and performance.
+* **Little's Law**: Performance principle relating concurrency, latency, and throughput: Concurrency = Latency × Throughput.
+
+## Software and tools
+
+* **ROCm programming model**: The software model for programming AMD GPUs, including the HIP API and associated runtime.
+* **ROCm software platform**: Complete software stack for AMD GPU computing, including drivers, runtime, compilers, libraries, and tools.
+* **HIP C++**: C++ dialect for writing portable GPU code that can run on both AMD and NVIDIA platforms.
+* **HIP runtime API**: Set of functions for managing GPU devices, memory, streams, and kernel execution.
+* **HIP compiler driver**: Tool that compiles HIP source code into device binaries and host code.
+* **HIP runtime compiler (hipRTC)**: API for compiling HIP kernels at runtime from source strings.
+* **HIP-Clang**: Heterogeneous AMDGPU Compiler with capability to compile HIP programs on AMD platform.
+* **clr**: Repository for AMD Compute Language Runtime, containing source codes for HIP and OpenCL runtimes.
+  * `hipamd`: Implementation of HIP language on AMD platform
+  * `rocclr`: Common runtime providing virtual device interfaces for different backends
+  * `opencl`: Implementation of OpenCL on AMD platform
+* **hipify tools**: Tools to convert CUDA code to portable C++ code.
+* **hipconfig**: Tool to report various configuration properties of the target platform.
+* **rocm-smi**: System management interface for monitoring and configuring AMD GPUs.
+* **AMD uProf**: System-wide performance analysis tool for CPU and GPU profiling.
+* **ROCm profiler**: Performance profiling tools for analyzing GPU kernel execution.
+* **ROCm binary utilities**: Tools for inspecting and manipulating GPU binary code objects.
+* **rocBLAS**: Optimized BLAS (Basic Linear Algebra Subprograms) library for AMD GPUs.
+* **MIOpen**: Machine learning primitives library optimized for AMD GPUs.
+* **AMDGPU assembly**: Low-level assembly language for AMD GPUs.
+* **AMDGPU IR**: Intermediate representation used in the AMD GPU compilation pipeline.
+* **nvcc**: NVIDIA CUDA compiler (do not capitalize).
+
+## Execution states and synchronization
+
+* **wavefront execution state**: Current status of a wavefront - active, ready, stalled, or sleeping.
+* **active wavefront**: Wavefront currently executing on a SIMD unit.
+* **stalled wavefront**: Wavefront waiting for a dependency like memory or synchronization.
+* **stream**: FIFO queue of GPU commands that execute in order, enabling asynchronous execution.
+* **event**: Synchronization primitive for coordinating execution between streams or with the host.
+* **barrier**: Synchronization point where threads wait for others to reach before proceeding.
+* **memory fence**: Operation ensuring memory consistency and ordering across threads.
@@ -1,6 +1,6 @@
 .. meta::
-  :description: This chapter describes the typical hardware implementation of GPUs supported by HIP.
-  :keywords: AMD, ROCm, HIP, Hardware, Compute Unit, ALU, VALU, Cache, Registers, LDS
+  :description: This chapter describes the hardware implementation of AMD GPUs supported by HIP.
+  :keywords: AMD, ROCm, HIP, hardware, GPU, architecture, compute unit, VALU, SALU, cache, memory hierarchy, CDNA, RDNA, GCN

 .. _hardware_implementation:

@@ -8,160 +8,701 @@
 Hardware implementation
 *******************************************************************************

-This chapter describes the typical hardware implementation of GPUs supported by
-HIP, and how the :ref:`inherent_thread_model` maps to the hardware.
+This chapter describes the hardware architecture of AMD GPUs supported by HIP,
+focusing on the internal organization and operation of GPU hardware components.
+Understanding these hardware details helps you optimize GPU applications and
+achieve maximum performance.

-Compute units
-=============
+Overall GPU architecture
+========================

-The basic building block of a GPU is a compute unit (CU), also known
-as streaming multiprocessor (SM) on NVIDIA GPUs. The thread blocks making up a
-grid are scheduled for execution on CUs. Each block is assigned to an
-individual CU, and a CU can accommodate several blocks. Depending on their
-resource usage up to thousands of threads can reside on a CU.
+AMD GPUs consist of interconnected blocks of digital circuits that work together
+to execute complex parallel computing tasks. The architecture is organized
+hierarchically to enable massive parallelism while managing resources efficiently.

-CUs contain an array of processing elements, referred to as
-vector ALU (VALU), that execute the actual instructions of the threads
-according to the :ref:`SIMT model<programming_model_simt>`, together with the
-necessary registers and caches.
+Command processor and control
+-----------------------------

-The threads are executed in groupings called warps. The amount of threads
-making up a warp is architecture dependent. On AMD GPUs the warp size is
-commonly 64 threads, except in RDNA architectures which can utilize a warp size
-of 32 or 64 respectively. The warp size of supported AMD GPUs is listed in the
-:doc:`rocm:reference/gpu-arch-specs`. NVIDIA GPUs have a warp size of 32.
+The command processor (CP) serves as the primary interface between the CPU and
+GPU, receiving and distributing commands for execution. The CP consists of two
+main components:

-In contrast to CPUs, GPUs generally do not employ complex cache structures or
-control logic, like branch prediction or out-of-order execution, but instead
-rely on massive hardware multithreading to hide latency.
+* **Command processor fetcher (CPF)**: Fetches commands from memory and passes
+  them to the CPC for processing.
+* **Command processor packet processor (CPC)**: A microcontroller that decodes the
+  fetched commands and dispatches kernels to the workgroup processors for
+  scheduling.

-Context switching between warps residing on a CU incurs no overhead,
-as the context for the warps is stored on the CU and does not need to
-be fetched from memory. If there are not enough free registers to accommodate
-all warps of a block, the block can not be scheduled to that CU and it
-has to wait until other blocks finish execution.
+The command processor handles several types of operations:

-The amount of warps that can reside concurrently on a CU, known
-as occupancy, is determined by the warp's resource usage of registers and
-shared memory.
+* Kernel launches, which are forwarded to asynchronous compute engines (ACEs)
+* Memory transfers, which are delegated to direct memory access (DMA) engines
+* Synchronization operations and memory fences

-.. _gcn_cu:
+**DMA engines** handle memory transfers between CPU and GPU memory without CPU
+involvement after initialization. Most GPUs contain two DMA engines, enabling
+concurrent bidirectional transfers to better utilize PCIe bandwidth. The DMA
+engines fetch data in small chunks and can process transfers in parallel but
+cannot handle multiple copy commands on the same engine simultaneously.

-.. figure:: ../data/understand/hardware_implementation/compute_unit.svg
-    :alt: Diagram depicting the general structure of a compute unit of an AMD
-          GPU.
+**Asynchronous compute engines (ACEs)** break down kernels into workgroups for
+distribution to shader processor input (SPI) blocks. Multiple ACEs enable
+concurrent kernel execution, with each ACE capable of dispatching one kernel
+at a time. ACEs process commands from different queues asynchronously, enabling
+overlap between different kernel executions and memory operations.

-    AMD Graphics Core Next (GCN) CU
+Hierarchical organization
+-------------------------

-On AMD GCN GPUs the basic structure of a CU is:
+The GPU organizes compute resources in a three-level hierarchy that enables
+modular design and resource sharing:

-* four Single Instruction Multiple Data units (SIMDs)
-* a vector cache
-* a local data share
-* and a scalar unit
+1. **Shader engines (SE)**: Top-level organizational units containing multiple
+   shader arrays and shared resources
+2. **Shader arrays**: Groups of compute units sharing instruction and scalar
+   caches
+3. **Compute units (CU)**: Basic execution units containing the ALUs and
+   registers for thread execution

-SIMD
----
+.. figure:: ../data/understand/hardware_implementation/selayout.png
+   :align: center
+   :alt: Diagram showing the hierarchical organization of compute units grouped
+         into shader engines on AMD GPUs
+   :width: 800

-A SIMD consists of a VALU, that executes the instruction of a warp, together
-with a register file, that provides the registers warps.
+   Hierarchical organization of compute units into shader engines

-The size of the warp is inherently related to the width of the vector ALU of
-the SIMD. On GCN compute units the width of the VALU is 16, so a warp can be
-issued to a SIMD every 4 cycles. Since a CU has 4 SIMDs it issues one
-warp per cycle. The instructions of a warp are effectively executed in
-lock-step.
+This hierarchical design allows different GPU configurations using the same
+underlying architecture. For example, the R9 Fury X contains 16 shader arrays
+with four CUs each, while the RX 480 contains 12 shader arrays with three CUs
+each, but both use the same gfx803 chip design.

-A SIMD always executes the same instruction for the whole VALU. If the control
-flow of a warp diverges, the performance is decreased, as the results for the
-threads that do not participate in that branch have to be masked out, and the
-instructions of the other branch have to be executed in the same way. The best
-performance can therefore be achieved when thread divergence is kept to a warp
-level, i.e. when all threads in a warp take the same execution path.
+Shader engine components
+========================

-Vector cache
------------
+Shader engines group multiple compute units together with shared resources that
+improve efficiency and reduce redundancy. Each shader engine contains several
+key components shared across its compute units.

-The usage of cache on a GPU differs from that on a CPU, as there is less cache
-available per thread. Its main purpose is to coalesce memory accesses of the
-warps in order to reduce the amount of accesses to device memory, and make that
-memory available for other warps that currently reside on the compute unit, that
-also need to load those values.
+Workgroup manager (SPI)
+-----------------------

-Local data share
----------------
+The workgroup manager, also called the shader processor input (SPI), bridges
+the command processor and compute units. After the CP processes a kernel
+dispatch, the SPI:

-The local data share is memory that is accessible to all threads within a block.
-Its latency and bandwidth is comparable to that of the vector cache. It can be
-used to share memory between the threads in a block, or as a software managed
-cache.
+* Receives workgroups from the ACEs
+* Schedules workgroups onto available compute units
+* Initializes registers with kernel parameters
+* Ensures all wavefronts of a workgroup execute on the same CU for
+  synchronization
+* Monitors resource availability and queues workgroups when resources are
+  exhausted

-Scalar Unit
-----------
+The SPI tracks four critical resources that limit concurrent execution:

-The scalar unit performs instructions that are uniform within a warp. It
-thereby improves efficiency and reduces the pressure on the vector ALUs and the
-vector register file.
+* Wavefront slots (execution contexts)
+* Vector general-purpose registers (VGPRs)
+* Scalar general-purpose registers (SGPRs)
+* Local data share (LDS) memory

-.. _cdna3_cu:
+Workgroup-to-CU mapping is non-deterministic and based on available resources.
+You should not assume any specific mapping pattern, as the same kernel launched
+multiple times can have different workgroup distributions.

-CDNA architecture
-=================
+Scalar L1 data cache (sL1D)
+---------------------------

-The general structure of CUs stays mostly as it is in GCN
-architectures. The most prominent change is the addition of matrix ALUs, which
-can greatly improve the performance of algorithms involving matrix
-multiply-accumulate operations for
-:doc:`int8, float16, bfloat16 or float32<rocm:reference/precision-support>`.
+The scalar L1 data cache serves scalar memory operations from multiple CUs
+within a shader array. The sL1D is shared between CUs (3 CUs in the Graphics
+Core Next (GCN) and MI100, 2 CUs in the MI200 series) and caches data that is
+uniform across a wavefront, including:

-.. figure:: ../data/understand/hardware_implementation/cdna3_cu.png
-  :alt: Block diagram showing the structure of a CDNA3 compute unit. It includes
-        Shader Cores, the Matrix Core Unit, a Local Data Share used for sharing
-        memory between threads in a block, an L1 Cache and a Scheduler. The
-        Shader Cores represent the vector ALUs and the Matrix Core Unit
-        represents the matrix ALUs. The Local Data Share is used as the shared
-        memory.
+* Kernel arguments and pointers
+* Grid and block dimensions
+* Constants accessed uniformly across threads
+* Data from ``__constant__`` memory when accessed uniformly

-  Block Diagram of a CDNA3 Compute Unit.
+Unlike the vector L1 cache, the sL1D doesn't use a "hit-on-miss" approach,
+meaning subsequent requests to the same pending cache line count as duplicated
+misses rather than hits.

-.. _rdna3_cu:
+L1 instruction cache (L1I)
+--------------------------

-RDNA architecture
-=================
+The L1 instruction cache is a read-only cache shared between multiple CUs in a
+shader array. Like the sL1D, it is backed by the L2 cache and doesn't use the
+"hit-on-miss" approach. The L1I stores kernel instructions fetched by the
+compute units, reducing instruction fetch latency and L2 cache pressure.

-RDNA makes a fundamental change to CU design, by changing the
-size of a warp to 32 threads. This is done by effectively combining two GCN5
-SIMDs, creating a VALU of width 32, so that a whole warp can be issued in one
-cycle. The CU is also replaced by the work group processor (WGP),
-which encompasses two CUs. For backwards compatibility the WGP can
-also run in wave64 mode, in which it issues a warp of size 64 in two cycles.
+Compute unit architecture
+=========================

-It also adds an extra layer of cache to the WGP, shared by the CUs
-within it. This cache is referred to as L1 cache, promoting the per-CU cache to
-an L0 cache.
+The compute unit (CU) is the fundamental execution block of AMD GPUs.
+It's responsible for executing kernels through its various specialized components
+and pipelines.

-.. figure:: ../data/understand/hardware_implementation/rdna3_cu.png
-  :alt: Block diagram showing the structure of an RDNA3 Compute Unit. It
-        consists of four SIMD units, each including a vector and scalar register
-        file, with the corresponding scalar and vector ALUs. All four SIMDs
-        share a scalar and instruction cache, as well as the shared memory. Two
-        of the SIMD units each share an L0 cache.
+.. figure:: ../data/understand/hardware_implementation/gcn_compute_unit.png
+   :align: center
+   :alt: Detailed diagram of an AMD CDNA compute unit showing internal
+         components and data flow
+   :width: 800

-  Block Diagram of an RDNA3 work group processor.
+   Internal architecture of an AMD CDNA compute unit

-Shader engines
-==============
+Sequencer and scheduling
+------------------------

-For hardware implementation's sake, multiple CUs are grouped
-together into a Shader Engine or Compute Engine, typically sharing some fixed
-function units or memory subsystem resources.
+The instruction sequencer (SQ) serves as the control center of each compute
+unit, managing instruction flow through the execution pipelines. The sequencer
+maintains wavefront state and coordinates instruction execution across different
+functional units.
+
+**Wavefront organization**: The sequencer organizes active wavefronts into four
+pools, each containing slots for up to ten wavefronts (eight on the CDNA2 MI200
+series). Each slot includes:
+
+* Wavefront-level registers (program counter, execution mask, and others)
+* Instruction buffer for prefetched instructions
+* State information for scheduling decisions
+
+This organization theoretically allows up to 40 concurrent wavefronts per CU,
+though actual occupancy is typically limited by register and LDS usage.
+
+**Instruction fetching**: The fetch arbiter selects one wavefront per cycle to
+fetch instructions from memory, prioritizing the oldest wavefronts. Each CU can
+fetch up to 32 bytes (4-8 instructions) per cycle.
+
+**Instruction issuing**: The issue arbiter determines which instructions execute
+each cycle, selecting wavefronts from one pool per cycle in round-robin fashion.
+The arbiter can issue multiple instructions per cycle to different execution
+units, with a theoretical maximum of five instructions per cycle:
+
+* One VALU instruction
+* One vector memory operation
+* One SALU/scalar memory operation
+* One LDS operation
+* One branch operation
+
+Instructions always issue at wavefront granularity, with all threads in the
+wavefront executing the same instruction in lockstep. Context switching between
+wavefronts occurs every cycle with zero overhead, as all wavefront contexts
+remain resident on the CU.
+
+Execution pipelines
+-------------------
+
+Each compute unit contains multiple specialized execution pipelines that process
+different types of instructions in parallel, enabling efficient utilization of
+the hardware resources.
+
+Vector arithmetic logic unit (VALU)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The VALU executes vector instructions across entire wavefronts, with each thread
+potentially operating on different data. The VALU consists of:
+
+* **Four SIMD processors**: Each containing 16 single-precision ALUs (or
+  equivalent), for 64 total ALUs per CU
+* **Vector register files**: 256-512 KiB of VGPR storage split across the four
+  SIMDs
+* **Instruction buffers**: Storage for up to 8-10 wavefronts per SIMD
+
+On architectures with 64-thread wavefronts and 16-instruction wide SIMD units,
+executing one instruction takes four cycles (one cycle per 16 threads). The four
+SIMD design ensures full utilization when sufficient wavefronts are available, as
+a new instruction can issue to each SIMD every cycle.
+
+The VALU serves as the primary arithmetic engine, executing the majority of 
+computation in GPU kernels. Data flows into these pipelines, undergoes arithmetic 
+transformation, and exits as results — with the goal of maximizing the number of 
+such transformations per clock cycle.
+
+For CDNA architectures with matrix operations, the VALU also dispatches
+matrix fused multiply-add (MFMA) instructions to specialized matrix units.
+
+Scalar arithmetic logic unit (SALU)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The SALU executes instructions uniformly across all threads in a wavefront,
+handling operations like:
+
+* Control flow (branches, loops)
+* Address calculations
+* Loading kernel arguments and constants
+* Managing wavefront-uniform values
+
+The SALU includes:
+
+* A scalar processor for arithmetic and logic operations
+* 12.5 KiB of SGPR storage per CU
+* A scalar memory (SMEM) unit for memory operations
+
+Scalar operations reduce pressure on vector units and registers by handling
+uniform computations efficiently.
+
+Vector memory unit (VMEM)
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The VMEM unit handles all vector memory operations, including loads, stores,
+and atomic operations. Each thread supplies its own address and data, though
+the hardware optimizes access through memory coalescing when threads access
+nearby addresses. The VMEM unit connects to the vector L1 cache and implements
+both address generation and coalescing logic.
+
+Branch unit
+^^^^^^^^^^^
+
+The branch unit executes jumps and branches for control flow changes affecting
+entire wavefronts. Note that the branch unit handles wavefront-level control
+flow, not execution mask updates for thread divergence, which are handled
+through predication.
+
+Special function unit (SFU)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The special function units accelerate certain arithmetic operations that are too 
+complex or costly to implement purely within the standard vector ALUs.
+
+SFUs are responsible for executing transcendental and reciprocal mathematical 
+functions — operations such as ``exp``, ``log``, ``sin``, ``cos``, ``rcp`` 
+(reciprocal), and ``rsqrt`` (reciprocal square root). These are heavily used in 
+scientific, physics, and machine learning workloads, particularly in activation 
+functions like GELU, sigmoid, or softmax.
+
+Each compute unit includes a set of specialized pipelines or transcendental 
+function units (TFUs) that handle these operations with dedicated hardware. 
+While their throughput is lower than that of the primary SIMD pipelines, they 
+enable these functions to execute efficiently without consuming general ALU 
+bandwidth.
+
+From the compiler's perspective, these operations map to specific AMDGPU ISA 
+instructions, such as:
+
+* ``v_exp_f32`` — compute exponential base e
+* ``v_log_f32`` — compute natural logarithm  
+* ``v_sin_f32``, ``v_cos_f32`` — compute sine or cosine
+* ``v_rsq_f32``, ``v_rcp_f32`` — compute reciprocal or reciprocal square root
+
+In CDNA3-based GPUs (like MI300), SFU throughput and latency have been tuned for 
+deep learning primitives. For instance, exponentiation (``exp``) and logarithm 
+(``log``) functions are now pipelined to complete in a few cycles per lane, 
+allowing vectorized activation functions in large-scale matrix workloads to 
+execute without significant stalls.
+
+For programmers targeting ROCm or HIP, these SFU-accelerated operations are 
+typically accessed through math intrinsics such as ``__expf``, ``__logf``, or 
+``__sinf``, which the compiler lowers to the corresponding AMDGPU ISA instructions 
+at compile time.
+
+Load/store unit (LSU)
+^^^^^^^^^^^^^^^^^^^^^
+
+The load/store units handle the transfer of data between the compute units and 
+the GPU's memory subsystems. They are responsible for issuing, tracking, and 
+retiring memory operations — including loads from and stores to global memory, 
+local shared memory, and caches — for thousands of concurrent threads.
+
+Each compute unit includes a set of LSUs tightly integrated with its vector and 
+scalar pipelines. These units handle memory instructions generated by active 
+wavefronts — such as ``buffer_load``, ``buffer_store``, and ``flat_load_dword`` 
+— and route them through the GPU's hierarchical memory system.
+
+The LSU's responsibilities include:
+
+* Managing vector memory accesses for SIMD instructions
+* Coordinating local data share (LDS) reads and writes
+* Accessing the L0/L1 caches and forwarding requests to the L2 cache and HBM
+* Handling synchronization and atomic operations between threads and workgroups
+
+LSUs manage thousands of outstanding memory requests per GPU, dynamically 
+scheduling them to hide memory latency. While arithmetic pipelines continue 
+executing other wavefronts, the LSUs maintain queues of pending transactions 
+and reorder responses as data returns from memory.
+
+On modern accelerators like MI300X, these LSUs achieve terabytes-per-second of 
+aggregate memory bandwidth, coordinating thousands of active threads performing 
+memory-intensive operations such as tensor loading, matrix tiling, and gradient 
+updates.
+
+Matrix fused multiply-add (MFMA)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+CDNA architectures (MI100 and newer) include specialized matrix acceleration
+units for high-throughput matrix operations. These units execute independently
+from other VALU operations, allowing overlap between matrix and vector
+computations. MFMA units support various data types including ``INT8``, ``FP16``,
+``BF16``, and ``FP32``, with different throughput characteristics for each.
+
+Matrix cores are GPU execution units that perform large-scale matrix operations 
+in a single instruction. In AMD architectures, these units are formally known as 
+MFMA (matrix fused multiply-add) units — the core hardware blocks responsible for 
+accelerating deep learning, HPC, and dense linear-algebra workloads on modern 
+Instinct GPUs.
+
+Operating on entire tiles of matrices per instruction allows MFMA units to deliver 
+far greater arithmetic throughput and energy efficiency than scalar or vector ALUs. 
+Rather than fetching and decoding thousands of per-element multiply-add instructions, 
+each MFMA instruction processes an entire matrix fragment — drastically reducing 
+power per operation and increasing overall throughput.
+
+An example MFMA instruction from the AMDGPU ISA is:
+
+.. code-block:: none
+
+   v_mfma_f32_16x16x4f16 v[0:15], v[16:31], v[32:47], v[0:15]
+
+This instruction performs a matrix multiplication and accumulation D=A×B+C,
+where the fragments A, B, and C are stored in VGPRs. The suffix ``16x16x4f16`` 
+indicates a tile size of 16×16, with an inner dimension of 4, operating on 
+half-precision (FP16) inputs and accumulating into 32-bit floating-point outputs.
+
+Since their introduction in CDNA1, and further expanded in CDNA2 and CDNA3, AMD's 
+matrix cores have become the primary source of peak floating-point performance in 
+datacenter GPUs. For example, an MI300X accelerator achieves its multi-petaFLOP 
+throughput primarily through MFMA units.
+
+The MFMA units use both standard VGPRs and additional accumulation VGPRs
+(AGPRs) on supported architectures, providing up to 512 KiB of combined
+register storage per CU.
+
+Local data share (LDS)
+----------------------
+
+The local data share provides fast on-CU scratchpad memory for communication
+between threads in a workgroup.
+
+.. figure:: ../data/understand/hardware_implementation/lds.svg
+   :align: center
+   :alt: Diagram showing the organization of local data share with banks and
+         connections to SIMD units
+   :width: 800
+
+   Local data share organization and SIMD connections
+
+**Organization**: The LDS contains 32 banks, each 4-bytes wide, providing
+128 bytes/cycle total bandwidth. Banks can be accessed independently each cycle
+for reads, writes, or atomic operations. The SIMDs connect to the LDS in pairs,
+with each pair sharing a 64-byte bidirectional port.
+
+**Access patterns**: A single wavefront can achieve up to 64 bytes/cycle
+throughput (16 lanes per cycle). The actual bandwidth depends on data size and
+access patterns:
+
+* 4-byte values: 8 cycles for 64 threads (50% peak bandwidth)
+* 16-byte values: 20 cycles for 64 threads (80% peak bandwidth)
+
+**Conflict resolution**: The LDS includes hardware to detect and resolve bank
+conflicts when multiple threads access different addresses in the same bank.
+Conflicts are resolved by serializing accesses across multiple cycles. Address
+conflicts (multiple threads atomically updating the same address) are similarly
+serialized. Broadcasting from the same address to multiple threads is handled
+efficiently without conflicts.
+
+Vector L1 cache
+---------------
+
+Each CU contains a dedicated vector L1 data cache (vL1D) serving vector memory
+operations. Key characteristics include:
+
+* Write-through design (writes go directly to L2)
+* Optimization for high-bandwidth streaming access patterns
+* Coherent with other CUs through software management
+* Typical size of 16 KB per CU
+
+The vector cache tags are checked for all vector memory operations, with misses
+forwarded to the L2 cache. The write-through design simplifies coherence at the
+cost of write bandwidth.
+
+Memory hierarchy and system
+===========================
+
+The GPU memory system provides the bandwidth and capacity needed for massive
+parallel computation while managing data coherence and access efficiency.
+
+Memory organization
+-------------------

 .. figure:: ../data/understand/hardware_implementation/cdna2_gcd.png
-  :alt: Block diagram showing four Compute Engines each with 28 Compute Units
-        inside. These four Compute Engines share one block of L2 Cache. Around
-        them are four Memory Controllers. To the top and bottom of all these are
-        eight blocks of Infinity Fabric Links. Two Video Core Next blocks sit in
-        the top corners. At the very bottom spans a colored section reading
-        Infinity Fabric.
+  :alt: Block diagram showing four compute engines with L2 cache, memory
+        controllers, and Infinity Fabric interconnect on CDNA2
+  :width: 800

-  Block Diagram of a CDNA2 Graphics Compute Die.
+  CDNA2 Graphics Compute Die organization showing memory subsystem
+
+AMD GPUs typically use high-bandwidth memory (HBM) for data-intensive workloads,
+providing significantly higher bandwidth than traditional GDDR memory at the
+cost of slightly higher latency. The memory system includes:
+
+* **Memory channels**: Multiple independent memory controllers (typically 8-16)
+* **L2 cache banks**: Distributed cache banks serving as the coherence point
+* **Infinity Fabric**: High-speed interconnect for data routing
+
+L2 cache architecture
+---------------------
+
+The L2 cache serves as the coherence point for all GPU memory accesses and is
+shared by all compute units. The L2 consists of multiple independent channels
+(32 on CDNA GPUs at 256-byte interleaving) that operate in parallel.
+
+.. figure:: ../data/understand/hardware_implementation/l2perf_model.png
+   :align: center
+   :alt: Diagram showing L2 cache to Infinity Fabric transaction flow with
+         request categorization and routing
+   :width: 800
+
+   L2 cache to Infinity Fabric transaction flow
+
+**Key characteristics**:
+
+* **Channel organization**: Each channel handles a portion of the address space,
+  with addresses interleaved across channels for load balancing.
+* **Hit-on-miss behavior**: If a request arrives for a pending cache line fill,
+  it counts as a hit, improving the effective hit rate.
+* **Write coalescing**: Multiple writes to the same cache line are combined.
+* **Atomic operation support**: Atomics execute directly in the L2 cache for
+  coherence.
+
+**L2-Fabric interface**: Requests missing in L2 are routed through Infinity
+Fabric to the appropriate memory location, which could be:
+
+* Local HBM on the same GPU
+* Remote GPU memory (in multi-GPU systems)
+* System memory (CPU DRAM)
+
+The interface categorizes requests by type (read/write), size (32B/64B), and
+destination for optimal routing.
+
+Memory coherence
+----------------
+
+GPU memory coherence differs significantly from CPU designs to optimize for
+throughput over latency:
+
+**Write-through L1 caches**: All writes update both L1 and L2, ensuring L2
+always has the latest data. This eliminates the need for complex coherence
+protocols between L1 caches but requires higher write bandwidth.
+
+**Software-managed coherence**: Coherence between CUs requires explicit
+synchronization through:
+
+* Memory fences for ordering
+* Cache invalidation instructions
+* Atomic operations (executed at L2 level)
+* Kernel boundaries (implicit synchronization)
+
+**Write combining**: To handle partial cache line updates from different CUs,
+the GPU uses write masks indicating which bytes to update. This prevents false
+sharing issues while maintaining correctness.
+
+Memory coalescing
+-----------------
+
+Memory coalescing combines memory accesses from multiple threads into fewer
+transactions, significantly improving bandwidth utilization. The coalescing
+hardware in the VMEM unit analyzes addresses from all threads in a wavefront
+and groups them into the minimum number of cache line requests.
+
+**Coalesced access pattern**: When consecutive threads access consecutive memory
+addresses, the hardware can combine all 64 thread requests into as few as 4-8
+cache line requests (depending on data size and alignment).
+
+**Non-coalesced access pattern**: When threads access widely separated addresses,
+each thread can generate a separate memory transaction, reducing effective
+bandwidth by up to 16x or more.
+
+To achieve optimal memory performance:
+
+* Ensure consecutive threads access consecutive memory addresses
+* Align data structures to cache line boundaries (64B or 128B)
+* Use structure-of-arrays rather than array-of-structures layouts
+* Consider padding to avoid bank conflicts
+
+Architecture variants
+=====================
+
+AMD supports multiple GPU architecture families optimized for different use
+cases while maintaining HIP compatibility.
+
+Graphics Core Next (GCN)
+------------------------
+
+GCN represents the foundational architecture for modern AMD GPUs, establishing
+key design principles still used today:
+
+* 64-thread wavefronts
+* Four SIMD units per CU with 16 lanes each
+* Scalar unit for wavefront-uniform operations
+* LDS with 32 banks
+
+Multiple GCN generations (GCN1-5) introduced incremental improvements in
+process technology, clock speeds, and instruction set features while maintaining
+the core architectural philosophy.
+
+.. _cdna_architecture:
+
+CDNA architecture
+-----------------
+
+CDNA (Compute DNA) specializes in high-performance computing and machine
+learning workloads. Building on GCN principles, CDNA adds significant compute
+enhancements:
+
+.. figure:: ../data/understand/hardware_implementation/cdna3_cu.png
+  :alt: Block diagram showing CDNA3 compute unit with matrix core unit, shader
+        cores, L1 cache, and local data share
+  :width: 800
+
+  CDNA3 compute unit with matrix acceleration
+
+**Matrix Core Unit**: Specialized hardware for matrix multiply-accumulate
+operations, providing up to 16 times more throughput than vector units for supported
+operations. Matrix cores support multiple precisions (``INT8``, ``FP16``, ``BF16``, ``FP32``)
+with varying performance characteristics.
+
+**Accumulation VGPRs (AGPRs)**: Additional register file space (up to 256 KB)
+dedicated to matrix accumulation, doubling the available register storage for
+matrix operations. Data movement between VGPRs and AGPRs uses specialized
+instructions (``v_accvgpr_*``).
+
+**Enhanced memory bandwidth**: CDNA GPUs typically use HBM2/HBM2e/HBM3 memory,
+providing up to 3.2 TB/s bandwidth on high-end models.
+
+**Multi-die designs**: CDNA2 (MI250) and CDNA3 (MI300) use chiplet
+architectures with multiple dies connected through high-speed links, scaling
+to higher compute and memory capacities.
+
+.. _rdna_architecture:
+
+RDNA architecture
+-----------------
+
+RDNA optimizes for graphics and lower-latency compute workloads through
+fundamental architectural changes:
+
+.. figure:: ../data/understand/hardware_implementation/rdna3_cu.png
+  :alt: Block diagram showing RDNA3 work group processor with dual compute
+        units, shared caches, and 32-wide SIMD units
+  :width: 800
+
+  RDNA3 work group processor architecture
+
+**Wave32 execution**: Primary execution mode uses 32-thread wavefronts,
+reducing divergence penalties and register pressure. Wave64 mode is available
+for backward compatibility.
+
+**Dual compute units**: The work group processor (WGP) replaces standalone CUs,
+containing two closely coupled compute units sharing resources:
+
+* Each CU has two 32-wide SIMD units (vs. four 16-wide in GCN)
+* Wavefronts execute in a single cycle on 32-wide SIMDs
+* Reduced instruction latency improves responsiveness
+
+**Three-level cache hierarchy**:
+
+* **L0 cache**: Per-CU cache (equivalent to GCN's L1)
+* **L1 cache**: Shared between CUs in a WGP (new intermediate level)
+* **L2 cache**: Global cache shared across all WGPs
+
+**128-byte cache lines**: Doubled from 64 bytes in GCN, aligning with Wave32
+access patterns (32 threads × 4 bytes = 128 bytes).
+
+These RDNA optimizations target gaming workloads where latency matters more
+than pure throughput, though the architecture remains capable for general
+compute tasks.
+
+Performance considerations
+==========================
+
+Understanding hardware characteristics helps you optimize GPU applications for
+maximum performance.
+
+Occupancy and resource limits
+-----------------------------
+
+Occupancy measures the ratio of active wavefronts to maximum possible
+wavefronts on a CU. Higher occupancy generally improves latency hiding but
+is limited by:
+
+* **Register usage**: Each wavefront requires VGPRs and SGPRs from finite pools
+* **LDS allocation**: Shared memory used per workgroup
+* **Wavefront slots**: Fixed number of execution contexts per CU
+* **Workgroup size**: Smaller workgroups can waste resources
+
+Balancing these resources is critical for achieving optimal occupancy. Tools
+like ``rocprof`` can help analyze occupancy and identify limiting factors.
+
+Latency hiding through multithreading
+-------------------------------------
+
+GPUs hide memory and instruction latency through massive hardware
+multithreading rather than complex CPU techniques like out-of-order execution
+or speculation. With sufficient wavefronts:
+
+* Memory latency is hidden by executing other wavefronts during waits
+* Pipeline latencies are covered by round-robin wavefront scheduling
+* No context switch overhead as all contexts remain resident
+
+The hardware can switch between wavefronts every cycle, maintaining high ALU
+utilization even with long-latency operations in flight.
+
+Memory bandwidth utilization
+----------------------------
+
+Effective memory bandwidth depends on access patterns:
+
+* **Coalesced access**: Can achieve 70-90% of peak bandwidth
+* **Random access**: Might achieve only 5-15% of peak bandwidth
+* **Bank conflicts**: Can serialize LDS access, reducing throughput
+
+Memory-bound kernels should focus on:
+
+* Maximizing coalescing through proper data layout
+* Prefetching and data reuse in LDS
+* Balancing computation with memory access
+* Using appropriate cache policies
+
+Hardware-specific optimizations
+-------------------------------
+
+Different AMD GPU architectures benefit from tailored optimizations:
+
+**For GCN/CDNA**:
+
+* Optimize for 64-thread wavefront granularity
+* Leverage matrix cores for applicable algorithms
+* Consider AGPR usage for register spilling
+
+**For RDNA**:
+
+* Design for 32-thread wavefront execution
+* Utilize improved divergence handling
+* Take advantage of additional cache level
+
+**Architecture-agnostic**:
+
+* Minimize divergent control flow
+* Ensure memory access coalescing
+* Balance resource usage for occupancy
+* Overlap computation with memory access
+
+Summary
+=======
+
+AMD GPU hardware architecture provides massive parallelism through hierarchical
+organization of compute resources, specialized execution units, and a
+sophisticated memory system. Understanding these hardware details—from the
+command processor through shader engines to individual compute units and the
+memory hierarchy—enables you to write more efficient GPU applications.
+
+Key hardware concepts for optimization include:
+
+* Workgroup scheduling and resource management by the SPI
+* Instruction scheduling and wavefront execution in compute units
+* Memory coalescing and cache behavior
+* Architecture-specific features (matrix cores, Wave32/64 modes)
+* Resource limits affecting occupancy
+
+For details on mapping parallel algorithms to this hardware, see the
+:ref:`programming_model` chapter. For specific optimization techniques, consult
+the performance optimization guides in the ROCm documentation.
@@ -0,0 +1,375 @@
+.. meta::
+  :description: This chapter explains performance concepts and theoretical models for understanding AMD GPU performance
+  :keywords: AMD, ROCm, HIP, performance, theory, roofline, occupancy, bandwidth, arithmetic intensity
+
+.. _performance_optimization:
+
+*******************************************************************************
+Understanding GPU performance
+*******************************************************************************
+
+This chapter explains the theoretical foundations of GPU performance on AMD 
+hardware. Understanding these concepts helps you analyze performance 
+characteristics, identify bottlenecks, and make informed optimization decisions.
+
+For practical optimization techniques and step-by-step guidance, see 
+:doc:`../how-to/performance_guidelines`.
+
+Performance bottlenecks
+=======================
+
+A performance bottleneck is the limiting factor that prevents a GPU kernel from 
+achieving higher performance. The two primary categories are:
+
+* **Compute-bound**: The kernel is limited by arithmetic throughput
+* **Memory-bound**: The kernel is limited by memory bandwidth
+
+Understanding which category applies helps identify the appropriate optimization 
+approach. Compute-bound kernels benefit from arithmetic optimizations, while 
+memory-bound kernels benefit from memory access improvements.
+
+.. _roofline_model:
+
+Roofline model
+==============
+
+The roofline model is a visual performance analysis framework that relates 
+achievable performance to hardware limits based on arithmetic intensity.
+
+The model plots performance (FLOPS) against arithmetic intensity (FLOPS/byte) 
+with two limiting factors:
+
+1. **Memory bandwidth ceiling**: A sloped line representing peak memory bandwidth
+2. **Compute ceiling**: A horizontal line representing peak arithmetic throughput
+
+The intersection point determines the transition between memory-bound and 
+compute-bound regions. Kernels below and to the left of the intersection are 
+memory-bound, while those to the right are compute-bound.
+
+.. figure:: ../../data/understand/performance_optimization/roofline.svg
+   :alt: Roofline model diagram showing memory bandwidth ceiling and compute
+         ceiling
+   :align: center
+   :width: 100%
+
+   Roofline model showing the relationship between arithmetic intensity and
+   achievable performance. The memory bandwidth ceiling represents the GPU's
+   memory bandwidth limit, while the compute ceiling shows the maximum
+   achievable TFLOPs. Kernels falling into the area to the left of the red
+   line are memory-bound, while they are compute-bound if they fall into the
+   right area.
+
+Key characteristics:
+
+* The roofline creates an upper bound on achievable performance
+* Real applications typically achieve a significant portion of the theoretical limit
+* The model helps guide optimization efforts based on kernel characteristics
+
+.. _compute_bound:
+
+Compute-bound performance
+=========================
+
+A kernel is compute-bound when its performance is limited by the GPU's arithmetic 
+throughput rather than memory bandwidth. These kernels have high arithmetic 
+intensity and spend most cycles executing arithmetic operations.
+
+Characteristics of compute-bound kernels:
+
+* High ratio of arithmetic operations to memory accesses
+* Performance scales with GPU compute capacity
+* Limited benefit from memory bandwidth optimization
+* Can often achieve a high percentage of peak theoretical FLOPS
+
+The theoretical maximum is determined by:
+
+* Number of compute units and SIMD lanes
+* Clock frequency
+* Instruction throughput per cycle
+* Specialized unit capabilities (matrix cores, SFUs)
+
+.. _memory_bound:
+
+Memory-bound performance
+========================
+
+A kernel is memory-bound when its performance is limited by memory bandwidth 
+rather than compute capacity. These kernels have low arithmetic intensity and 
+spend significant time waiting for memory operations.
+
+Characteristics of memory-bound kernels:
+
+* Low ratio of arithmetic operations to memory accesses
+* Performance scales with memory bandwidth
+* Sensitive to memory access patterns
+* Typically achieve lower percentage of peak FLOPS
+
+The theoretical maximum is determined by:
+
+* HBM bandwidth capacity
+* Memory controller efficiency
+* Cache hierarchy effectiveness
+* Memory access pattern efficiency
+
+.. _arithmetic_intensity:
+
+Arithmetic intensity
+====================
+
+Arithmetic intensity is the ratio of floating-point operations (FLOPs) to memory 
+traffic (bytes) for a given kernel or algorithm.
+
+.. math::
+
+   \text{Arithmetic Intensity} = \frac{\text{FLOPs}}{\text{Bytes Transferred}}
+
+This metric determines whether a kernel is compute-bound or memory-bound.
+
+Key points:
+
+* Higher arithmetic intensity indicates more computation per byte transferred
+* The balance point depends on the GPU's compute-to-bandwidth ratio
+* It can be calculated theoretically or measured empirically
+* Different precision types affect both FLOPs and bytes
+
+For modern AMD GPUs:
+
+* The compute-to-bandwidth ratio varies by GPU generation
+* Higher-end models have higher ratios
+* Kernels above the GPU's specific ratio are compute-bound
+
+.. _latency_hiding:
+
+Latency hiding mechanisms
+==========================
+
+GPUs hide memory and instruction latency through massive hardware multithreading 
+rather than complex CPU techniques like out-of-order execution.
+
+How latency hiding works:
+
+* **Wavefront switching**: Context switches occur every cycle with zero overhead
+* **Multiple wavefronts per CU**: Many concurrent wavefronts supported
+* **Instruction-level parallelism**: Multiple independent instructions in flight
+
+The hardware can completely hide memory latency if there are enough active 
+wavefronts with independent work. The number of instructions required from other 
+wavefronts to hide latency depends on the specific memory latency and instruction 
+throughput characteristics of the GPU.
+
+Requirements for effective latency hiding:
+
+* Sufficient occupancy (active wavefronts)
+* Independent instructions to overlap
+* Balanced resource usage
+* Minimal divergence
+
+.. _wavefront_execution:
+
+Wavefront execution states
+==========================
+
+A wavefront can be in one of several states during execution:
+
+* **Active**: Currently executing on a SIMD unit
+* **Ready**: Eligible for execution, waiting for scheduling
+* **Stalled**: Waiting for a dependency (memory, synchronization)
+* **Sleeping**: Blocked on a barrier or synchronization primitive
+
+Understanding these states helps explain GPU utilization metrics:
+
+* **Active cycles**: Percentage of cycles with at least one instruction executing
+* **Stall cycles**: Percentage of cycles waiting for resources
+* **Idle cycles**: No wavefronts available to execute
+
+Maximizing active cycles while minimizing stall and idle cycles improves 
+performance.
+
+.. _occupancy:
+
+Occupancy theory
+================
+
+Occupancy measures the ratio of active wavefronts to the maximum possible 
+wavefronts on a compute unit.
+
+.. math::
+
+   \text{Occupancy} = \frac{\text{Active Wavefronts}}{\text{Max Wavefronts per CU}}
+
+Why occupancy matters:
+
+* Higher occupancy improves latency hiding
+* More concurrent wavefronts mask memory and instruction latency
+* Enables better utilization of execution units
+
+Limiting factors:
+
+* **Register usage**: VGPRs and SGPRs per thread
+* **Shared memory (LDS)**: Allocation per block
+* **Wavefront slots**: Hardware limit on concurrent wavefronts
+* **Block size**: Small blocks may waste resources
+
+Trade-offs:
+
+* Higher occupancy improves latency hiding but reduces resources per thread
+* Lower occupancy allows more resources per thread but may expose latency
+* Optimal occupancy depends on kernel characteristics
+* Memory-bound kernels benefit more from high occupancy
+
+.. _memory_hierarchy_theory:
+
+Memory hierarchy impact on performance
+=======================================
+
+The GPU memory hierarchy has different bandwidths and latencies:
+
+Memory types by speed:
+
+1. **Registers**: Fastest, lowest latency (per-thread storage)
+2. **LDS (shared memory)**: Very fast, on-chip (per-block storage)
+3. **L1 cache**: Fast, on-chip (per-CU cache)
+4. **L2 cache**: Moderate, on-chip (shared across CUs)
+5. **HBM (global memory)**: Slower, off-chip but high bandwidth
+
+Memory coalescing theory
+-------------------------
+
+Memory coalescing combines memory accesses from multiple threads into fewer 
+transactions. When consecutive threads access consecutive memory addresses, the 
+hardware can merge requests into efficient cache line accesses.
+
+Why coalescing matters:
+
+* Reduces number of memory transactions
+* Improves memory bandwidth utilization
+* Decreases memory access latency
+
+**Coalesced pattern**: Consecutive threads accessing consecutive addresses 
+achieve high bandwidth utilization.
+
+**Non-coalesced pattern**: Random or strided addresses result in many separate 
+transactions and low bandwidth utilization.
+
+.. _bank_conflicts_theory:
+
+Bank conflict theory
+--------------------
+
+Shared memory (LDS) is organized into banks that can be accessed independently. 
+Bank conflicts occur when multiple threads access different addresses in the 
+same bank.
+
+Why bank conflicts matter:
+
+* Conflicts serialize accesses, reducing throughput
+* LDS bandwidth drops proportionally to conflict degree
+* Can turn parallel operations into sequential ones
+
+Common patterns:
+
+* **No conflict**: Each thread accesses a different bank (full bandwidth)
+* **Broadcast**: Multiple threads read the same address (no conflict)
+* **N-way conflict**: N threads access the same bank (1/N bandwidth)
+
+.. _register_pressure_theory:
+
+Register pressure theory
+=========================
+
+Register pressure occurs when a kernel requires more registers than optimal for 
+the target occupancy.
+
+Why register pressure matters:
+
+* Reduces maximum occupancy
+* May cause register spilling to memory
+* Decreases ability to hide latency
+* Lowers overall throughput
+
+The relationship between registers and occupancy:
+
+* More registers per thread → fewer concurrent wavefronts
+* Fewer registers per thread → higher occupancy but may need memory spills
+* Optimal balance depends on kernel memory access patterns
+
+.. _performance_metrics_theory:
+
+Performance metrics explained
+==============================
+
+Understanding performance metrics helps analyze GPU behavior:
+
+Peak rate
+---------
+
+The theoretical maximum performance of a GPU:
+
+* **Peak FLOPS**: Maximum floating-point operations per second
+* **Peak bandwidth**: Maximum memory throughput
+* **Peak instruction rate**: Maximum instructions per cycle
+
+Actual performance is always below peak due to various inefficiencies.
+
+Utilization metrics
+-------------------
+
+**Pipe utilization**: The percentage of execution cycles where the pipeline is 
+actively processing instructions. Low utilization indicates stalls or insufficient 
+work.
+
+**Issue efficiency**: The ratio of issued instructions to the maximum possible. 
+Low efficiency can indicate instruction cache misses, scheduling inefficiencies, 
+or resource conflicts.
+
+**CU utilization**: The percentage of compute units actively executing work. Low 
+utilization suggests insufficient parallelism, load imbalance, or synchronization 
+overhead.
+
+**Branch efficiency**: The ratio of non-divergent to total branches. Low 
+efficiency indicates significant divergence overhead.
+
+Theoretical performance limits
+==============================
+
+Understanding theoretical limits helps set realistic performance expectations.
+
+**Peak performance bounds**
+
+Every GPU has theoretical maximum performance determined by:
+
+* Clock frequency and number of compute units
+* Instruction throughput per clock cycle
+* Memory bandwidth capacity
+* Specialized unit capabilities (matrix cores, SFUs)
+
+**Achievable performance**
+
+Real applications typically achieve a fraction of theoretical peak due to:
+
+* Imperfect resource utilization
+* Memory access inefficiencies
+* Control flow divergence
+* Synchronization overhead
+* Launch and scheduling costs
+
+The gap between theoretical and achieved performance reveals optimization 
+opportunities. The roofline model provides a framework for understanding these 
+limits and identifying which factor (compute or memory) constrains performance.
+
+Summary
+=======
+
+Understanding GPU performance requires knowledge of several interconnected concepts:
+
+* **Performance bottlenecks**: Whether compute or memory limits performance
+* **Roofline model**: Visual framework for analyzing performance limits
+* **Arithmetic intensity**: The compute-to-memory ratio of algorithms
+* **Latency hiding**: How concurrent execution masks delays
+* **Occupancy**: How wavefront concurrency affects resource utilization
+* **Memory hierarchy**: How different memory types affect bandwidth
+* **Performance metrics**: Quantitative measures for analysis
+
+These theoretical foundations inform practical optimization decisions. For 
+step-by-step optimization techniques and practical guidance, see 
+:doc:`../how-to/performance_guidelines`.
@@ -271,10 +271,16 @@ depicted in the following figure.
 .. _wavefront:

 Warp (or Wavefront)
-  The innermost grouping of threads is called a warp. A warp is the most tightly
-  coupled groups of threads, both physically and logically. Threads inside a warp
-  are executed in lockstep, with each thread executing the same instruction. Threads
-  in a warp are also called lanes, and the value identifying them is the lane ID.
+  The innermost grouping of threads is called a warp (NVIDIA terminology) or 
+  wavefront (AMD terminology). A wavefront is the most tightly coupled group of 
+  threads, both physically and logically. Threads within a wavefront are
+  executed in lockstep, with each thread executing the same instruction
+  simultaneously on different data elements.
+  
+  A wavefront represents the fundamental execution unit of AMD GPUs. Each wavefront 
+  consists of multiple parallel threads that execute the same instruction 
+  simultaneously across the SIMD pipelines of a compute unit. Threads in a wavefront 
+  are also called lanes, and the value identifying them is the lane ID.

  .. tip::

@@ -282,22 +288,39 @@ Warp (or Wavefront)
    consequence, they are only as multidimensional as the user interprets the
    calculated values to be.

-  The size of a warp is architecture dependent and always fixed. For AMD GPUs
-  the warp is typically 64 threads, though sometimes 32 threads. Warps are
-  signified by the set of communication primitives at their disposal, as
-  discussed in :ref:`warp-cross-lane`.
+  The size of a wavefront is architecture dependent and always fixed:
+  
+  * **64 threads** for AMD GCN and CDNA architectures as well as RDNA architectures in wave64 mode
+  * **32 threads** for AMD RDNA architectures in wave32 mode
+  * **32 threads** for NVIDIA GPUs
+  
+  Wavefronts are signified by the set of communication primitives at their disposal, 
+  as discussed in :ref:`warp-cross-lane`. On modern AMD datacenter GPUs like MI300X, 
+  each CU can support up to 64 concurrent wavefronts, each containing 64 threads, 
+  for a total of over 4,000 active threads per CU.

 .. _inherent_thread_hierarchy_block:

-Block
-  The next level of the thread hierarchy is called a thread block, or block. The
-  defining feature of a block is that all threads in the block have shared memory
-  that they can use to share data or synchronize with one another, as described in
-  :ref:`memory_hierarchy`.
+Block (Work-group)
+  The next level of the thread hierarchy is called a thread block (or work-group in 
+  OpenCL terminology). A block is a collection of wavefronts that can synchronize 
+  and share local data share (LDS) memory. The defining feature of a block is that 
+  all threads in the block have shared memory that they can use to share data or 
+  synchronize with one another, as described in :ref:`memory_hierarchy`.
+  
+  All wavefronts of a block execute on the same compute unit, ensuring they 
+  can access the same LDS and synchronize efficiently. This locality is crucial 
+  for performance when threads need to cooperate on shared data.

  The size of a block, or the block dimension, is the user-configurable number of
  threads per block, but is limited by the queryable capabilities of the executing
-  hardware. The unique ID of the thread within a block can be 1, 2, or 3-dimensional
+  hardware. Common limits include:
+  
+  * Maximum threads per block: typically 1024
+  * Maximum block dimensions: 1024 x 1024 x 64 (x, y, z)
+  * Limited by available resources (registers, LDS, wavefront slots)
+  
+  The unique ID of the thread within a block can be 1, 2, or 3-dimensional
  as provided by the HIP API. You can configure the thread block to best represent
  the data associated with the kernel instruction set. 
  
@@ -308,10 +331,24 @@ Block
 .. _inherent_thread_hierarchy_grid:

 Grid
-  The top-most level of the thread hierarchy is a grid. A grid is the number of blocks
-  needed for a single launch of the kernel. The unique ID of each block within
-  a grid can be 1, 2, or 3-dimensional, as provided by the API and is queryable
-  by every thread within the block.
+  The top-most level of the thread hierarchy is a grid. A grid represents the total 
+  collection of blocks (work-groups) launched for a single kernel execution. It defines 
+  the overall problem size and how work is distributed across the GPU.
+  
+  The grid is specified when launching a kernel and determines:
+  
+  * Total number of threads: ``grid_size`` × ``block_size``
+  * Distribution of work across compute units
+  * Overall parallelism of the computation
+  
+  The unique ID of each block within a grid can be 1, 2, or 3-dimensional, as provided 
+  by the API and is queryable by every thread within the block through the ``blockIdx`` 
+  built-in variable.
+  
+  Grid dimensions are limited by hardware capabilities:
+  
+  * Maximum x-dimension: 2³¹ - 1
+  * Maximum y and z-dimensions: 2¹⁶ - 1 (65,535)

 The three-dimensional thread hierarchy available to a kernel program lends itself to solutions
 that align closely to the computational problem. The following are some examples: