[hip] Docs: Overhaul HW implementation page (#1994)

* [hip] Docs: Overhaul HW implementation page
* Update hardware implementation and glossary
* Update programming model
* Add performance optimization
* Split into how-to and understanding

---------

Signed-off-by: Jan Stephan <jan.stephan@amd.com>
Co-authored-by: Jan Stephan <jan.stephan@amd.com>
Co-authored-by: Julia Jiang <julia.jiang@amd.com>
This commit is contained in:
Adel Johar
2025-12-11 10:52:34 +01:00
کامیت شده توسط GitHub
والد 9e06ea8f79
کامیت 256dd1963a
14فایلهای تغییر یافته به همراه2466 افزوده شده و 389 حذف شده
فایل باینری نشان داده نشده است.

پس از

عرض:  |  ارتفاع:  |  اندازه: 45 KiB

فایل باینری نشان داده نشده است.

پس از

عرض:  |  ارتفاع:  |  اندازه: 50 KiB

فایل باینری نشان داده نشده است.

پس از

عرض:  |  ارتفاع:  |  اندازه: 18 KiB

@@ -0,0 +1,393 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
width="78.030128mm"
height="59.956924mm"
viewBox="0 0 78.030128 59.956924"
version="1.1"
id="svg5"
inkscape:version="1.1.2 (0a00cf5339, 2022-02-04)"
sodipodi:docname="lds.svg"
inkscape:export-filename="/home/nick/Documents/software_repos/omniperf/src/docs/images/lds.png"
inkscape:export-xdpi="180"
inkscape:export-ydpi="180"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns="http://www.w3.org/2000/svg"
xmlns:svg="http://www.w3.org/2000/svg">
<sodipodi:namedview
id="namedview7"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageshadow="2"
inkscape:pageopacity="0.0"
inkscape:pagecheckerboard="0"
inkscape:document-units="mm"
showgrid="false"
inkscape:zoom="2.8284271"
inkscape:cx="7.9549513"
inkscape:cy="93.161318"
inkscape:window-width="2490"
inkscape:window-height="1376"
inkscape:window-x="70"
inkscape:window-y="27"
inkscape:window-maximized="1"
inkscape:current-layer="layer1"
fit-margin-top="0"
fit-margin-left="0"
fit-margin-right="0"
fit-margin-bottom="0" />
<defs
id="defs2">
<marker
style="overflow:visible"
id="marker35467"
refX="0"
refY="0"
orient="auto"
inkscape:stockid="Arrow1Mend"
inkscape:isstock="true">
<path
transform="matrix(-0.4,0,0,-0.4,-4,0)"
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:1pt"
d="M 0,0 5,-5 -12.5,0 5,5 Z"
id="path35465" />
</marker>
<marker
style="overflow:visible"
id="Arrow2Mend"
refX="0"
refY="0"
orient="auto"
inkscape:stockid="Arrow2Mend"
inkscape:isstock="true">
<path
transform="scale(-0.6)"
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:0.625;stroke-linejoin:round"
id="path34770" />
</marker>
<marker
style="overflow:visible"
id="marker35449"
refX="0"
refY="0"
orient="auto"
inkscape:stockid="Arrow1Mend"
inkscape:isstock="true">
<path
transform="matrix(-0.4,0,0,-0.4,-4,0)"
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:1pt"
d="M 0,0 5,-5 -12.5,0 5,5 Z"
id="path35447" />
</marker>
<marker
style="overflow:visible"
id="Arrow1Mend"
refX="0"
refY="0"
orient="auto"
inkscape:stockid="Arrow1Mend"
inkscape:isstock="true">
<path
transform="matrix(-0.4,0,0,-0.4,-4,0)"
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:1pt"
d="M 0,0 5,-5 -12.5,0 5,5 Z"
id="path34752" />
</marker>
<marker
style="overflow:visible"
id="marker35115"
refX="0"
refY="0"
orient="auto"
inkscape:stockid="Arrow2Mstart"
inkscape:isstock="true">
<path
transform="scale(0.6)"
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:0.625;stroke-linejoin:round"
id="path35113" />
</marker>
<marker
style="overflow:visible"
id="Arrow2Mstart"
refX="0"
refY="0"
orient="auto"
inkscape:stockid="Arrow2Mstart"
inkscape:isstock="true">
<path
transform="scale(0.6)"
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:0.625;stroke-linejoin:round"
id="path34767" />
</marker>
<marker
style="overflow:visible"
id="Arrow1Lstart"
refX="0"
refY="0"
orient="auto"
inkscape:stockid="Arrow1Lstart"
inkscape:isstock="true">
<path
transform="matrix(0.8,0,0,0.8,10,0)"
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:1pt"
d="M 0,0 5,-5 -12.5,0 5,5 Z"
id="path34743" />
</marker>
<rect
x="175.93893"
y="253.99336"
width="234.12074"
height="100.54605"
id="rect3930" />
<marker
style="overflow:visible"
id="Arrow1Mend-5"
refX="0"
refY="0"
orient="auto"
inkscape:stockid="Arrow1Mend"
inkscape:isstock="true">
<path
transform="matrix(-0.4,0,0,-0.4,-4,0)"
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:1pt"
d="M 0,0 5,-5 -12.5,0 5,5 Z"
id="path34752-6" />
</marker>
</defs>
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1"
transform="translate(-38.921551,-41.961155)">
<rect
style="fill:none;stroke:#000000;stroke-width:0.5;stroke-miterlimit:4;stroke-dasharray:none"
id="rect846"
width="77.212677"
height="40.499405"
x="39.330276"
y="61.168674" />
<text
xml:space="preserve"
transform="scale(0.26458333)"
id="text3928"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;white-space:pre;shape-inside:url(#rect3930);fill:#000000;fill-opacity:1;stroke:none" />
<rect
style="fill:none;stroke:#000000;stroke-width:0.5;stroke-miterlimit:4;stroke-dasharray:none"
id="rect846-3"
width="31.886179"
height="13.072078"
x="39.171551"
y="42.211155" />
<text
xml:space="preserve"
style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
x="47.24173"
y="49.892689"
id="text25740"><tspan
sodipodi:role="line"
id="tspan25738"
style="font-size:3.52778px;stroke-width:0.264583"
x="47.24173"
y="49.892689">SIMD 0/1</tspan></text>
<text
xml:space="preserve"
style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
x="92.864143"
y="49.892689"
id="text25740-7"><tspan
sodipodi:role="line"
id="tspan25738-5"
style="font-size:3.52778px;stroke-width:0.264583"
x="92.864143"
y="49.892689">SIMD 2/3</tspan></text>
<rect
style="fill:none;stroke:#000000;stroke-width:0.5;stroke-miterlimit:4;stroke-dasharray:none"
id="rect846-3-3"
width="31.886179"
height="13.072078"
x="84.815498"
y="42.211155" />
<path
style="fill:none;stroke:#000000;stroke-width:0.239054px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#marker35115);marker-end:url(#Arrow2Mend)"
d="m 55.11464,55.888713 v 4.575516"
id="path35236" />
<path
style="fill:none;stroke:#000000;stroke-width:0.239437px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#marker35115);marker-end:url(#Arrow2Mend)"
d="m 100.75859,55.863706 v 4.584345"
id="path35236-2" />
<g
id="g91757"
transform="translate(0.09337305,-1.490623)">
<rect
style="fill:none;stroke:#000000;stroke-width:0.5;stroke-miterlimit:4;stroke-dasharray:none"
id="rect846-3-9"
width="57.741043"
height="6.0837841"
x="49.066093"
y="64.83651" />
<text
xml:space="preserve"
style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
x="62.166679"
y="69.054901"
id="text43239"><tspan
sodipodi:role="line"
id="tspan43237"
style="font-size:3.52778px;stroke-width:0.264583"
x="62.166679"
y="69.054901">Conflict Detection</tspan></text>
</g>
<g
id="g94249">
<rect
style="fill:none;stroke:#000000;stroke-width:0.5;stroke-miterlimit:4;stroke-dasharray:none"
id="rect846-3-9-7"
width="57.741043"
height="6.0837841"
x="49.159466"
y="69.429672" />
<text
xml:space="preserve"
style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
x="69.083069"
y="73.648064"
id="text43239-9"><tspan
sodipodi:role="line"
id="tspan43237-2"
style="font-size:3.52778px;stroke-width:0.264583"
x="69.083069"
y="73.648064">Scheduler</tspan></text>
</g>
<g
id="g75031">
<rect
style="fill:none;stroke:#000000;stroke-width:0.499999;stroke-miterlimit:4;stroke-dasharray:none"
id="rect53227"
width="10.155521"
height="20.193951"
x="41.817165"
y="78.846886" />
<text
xml:space="preserve"
style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
x="-95.113167"
y="48.210091"
id="text58467"
transform="rotate(-90)"><tspan
sodipodi:role="line"
id="tspan58465"
style="font-size:3.52778px;stroke-width:0.264583"
x="-95.113167"
y="48.210091">Bank 0</tspan></text>
</g>
<g
id="g75036"
transform="translate(-0.28890355,0.10812378)">
<rect
style="fill:none;stroke:#000000;stroke-width:0.499999;stroke-miterlimit:4;stroke-dasharray:none"
id="rect53227-1"
width="10.155521"
height="20.193951"
x="55.210358"
y="78.738762" />
<text
xml:space="preserve"
style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
x="-95.005043"
y="61.603283"
id="text58467-2"
transform="rotate(-90)"><tspan
sodipodi:role="line"
id="tspan58465-7"
style="font-size:3.52778px;stroke-width:0.264583"
x="-95.005043"
y="61.603283">Bank 1</tspan></text>
</g>
<g
id="g75041"
transform="translate(-0.42640324,0.12826538)">
<rect
style="fill:none;stroke:#000000;stroke-width:0.499999;stroke-miterlimit:4;stroke-dasharray:none"
id="rect53227-0"
width="10.155521"
height="20.193951"
x="68.452148"
y="78.71862" />
<text
xml:space="preserve"
style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
x="-94.984901"
y="74.845078"
id="text58467-9"
transform="rotate(-90)"><tspan
sodipodi:role="line"
id="tspan58465-3"
style="font-size:3.52778px;stroke-width:0.264583"
x="-94.984901"
y="74.845078">Bank 2</tspan></text>
</g>
<g
id="g75046"
transform="translate(0,-0.36049652)">
<rect
style="fill:none;stroke:#000000;stroke-width:0.499999;stroke-miterlimit:4;stroke-dasharray:none"
id="rect53227-6"
width="10.155521"
height="20.193951"
x="81.130043"
y="79.207382" />
<text
xml:space="preserve"
style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
x="-95.473663"
y="87.522972"
id="text58467-0"
transform="rotate(-90)"><tspan
sodipodi:role="line"
id="tspan58465-6"
style="font-size:3.52778px;stroke-width:0.264583"
x="-95.473663"
y="87.522972">Bank 3</tspan></text>
</g>
<g
id="g75106"
transform="translate(0,0.88236237)">
<rect
style="fill:none;stroke:#000000;stroke-width:0.499999;stroke-miterlimit:4;stroke-dasharray:none"
id="rect53227-2"
width="10.155521"
height="20.193951"
x="103.69362"
y="77.964523" />
<text
xml:space="preserve"
style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
x="-95.307404"
y="110.08656"
id="text58467-6"
transform="rotate(-90)"><tspan
sodipodi:role="line"
id="tspan58465-1"
style="font-size:3.52778px;stroke-width:0.264583"
x="-95.307404"
y="110.08656">Bank 31</tspan></text>
</g>
<text
xml:space="preserve"
style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
x="95.809242"
y="89.162628"
id="text80928"><tspan
sodipodi:role="line"
id="tspan80926"
style="font-size:3.52778px;stroke-width:0.264583"
x="95.809242"
y="89.162628">...</tspan></text>
</g>
</svg>

پس از

عرض:  |  ارتفاع:  |  اندازه: 14 KiB

فایل باینری نشان داده نشده است.

پس از

عرض:  |  ارتفاع:  |  اندازه: 75 KiB

@@ -0,0 +1,302 @@
<svg class="typst-doc" viewBox="0 0 382.68322834645664 269.7074015748031" width="382.68322834645664pt" height="269.7074015748031pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:h5="http://www.w3.org/1999/xhtml">
<path class="typst-shape" fill="#ffffff" fill-rule="nonzero" d="M 0 0 L 0 269.7074 L 382.68323 269.7074 L 382.68323 0 Z "/>
<g>
<g transform="translate(14.173228346456693 14.173228346456693)">
<g class="typst-group">
<g>
<g transform="translate(14.17929133858268 226.77165354330708)">
<path class="typst-shape" fill="none" stroke="#000000" stroke-width="0.5" stroke-linecap="square" stroke-linejoin="miter" stroke-miterlimit="4" d="M 0 0 L 340.15747 0 "/>
</g>
<g transform="translate(111.55303149606299 232.44094488188975)">
<path class="typst-shape" fill="none" d="M 0 0 L 145.41 0 L 145.41 8.92 L 0 8.92 L 0 0 Z "/>
</g>
<g transform="translate(111.5530314488189 232.4409448503937)">
<g class="typst-group">
<g>
<g transform="translate(0 0)">
<g class="typst-group">
<g>
<g transform="translate(0 6.580000000000001)">
<g class="typst-text" transform="scale(1, -1)">
<use xlink:href="#gF68F62BAF7023F8E796CE041842DF629" x="0" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gD022B8BB58399F5AFF249BACEC9B429C" x="7.02" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g8A55166D3146C138BB50BFE795ED0EE5" x="12.28" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g2C268D8F9C3C96139D78CE91BE4D9FB" x="16.75" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g528CFE5CB98F8F81646F93C3E83B138B" x="20.47" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g74F5E04B137C7A69C847D70E8574BAA6" x="25.04" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g55B6FB7CBDC9C1DAE949CD37F0DC16D" x="28.2" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gA11C98823DFB5A0C784EF187B0832232" x="30.91" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g9C2BE9118249AB6B38E684352DDFBC5D" x="35.95" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g528CFE5CB98F8F81646F93C3E83B138B" x="41.370000000000005" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g179036B287C688540B027B22D62B33B" x="45.940000000000005" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g6A35C718BF189FD6EC8D20440CDA771D" x="51.080000000000005" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g9C2BE9118249AB6B38E684352DDFBC5D" x="54.050000000000004" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g74F5E04B137C7A69C847D70E8574BAA6" x="59.470000000000006" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g8A55166D3146C138BB50BFE795ED0EE5" x="62.63000000000001" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g9C2BE9118249AB6B38E684352DDFBC5D" x="67.10000000000001" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g7D8F87D2CD6331112C9FE3A07C40E0F" x="72.52000000000001" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g55B6FB7CBDC9C1DAE949CD37F0DC16D" x="76.42000000000002" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g74F5E04B137C7A69C847D70E8574BAA6" x="79.13000000000001" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gB5DAD6AB9EF529853D7226A3F4588253" x="82.29" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g72F468A787EE89ADC78BA24E12FC80FE" x="89.94000000000001" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g229B6FF65CF728248AF8BE0A20FAFDFC" x="93.50000000000001" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g2D340B5AC4FE037754EF86B583EC1E8C" x="98.35000000000001" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gF68F62BAF7023F8E796CE041842DF629" x="103.63000000000001" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g29073FC46AA074D3D56D2F6B70E76DDC" x="110.65" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g62E843CADC663C7579F6A3FBC77388E" x="116.06" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g95425A100709BC27DA272BD05EE64D31" x="120.91" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gA9BC29EE3008608C91A9D8C24BBB5729" x="124.14" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gB5DAD6AB9EF529853D7226A3F4588253" x="129.07" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g74F5E04B137C7A69C847D70E8574BAA6" x="134.22" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g8A55166D3146C138BB50BFE795ED0EE5" x="137.38" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g99297E49F87FAD41528007F13950C7FE" x="141.85" fill="#000000" fill-rule="nonzero"/>
</g>
</g>
</g>
</g>
</g>
</g>
</g>
</g>
<g transform="translate(14.17929133858268 0)">
<path class="typst-shape" fill="none" stroke="#000000" stroke-width="0.5" stroke-linecap="square" stroke-linejoin="miter" stroke-miterlimit="4" d="M 0 0 L 340.15747 0 "/>
</g>
<g transform="translate(14.17929133858268 0)">
<path class="typst-shape" fill="none" stroke="#000000" stroke-width="0.5" stroke-linecap="square" stroke-linejoin="miter" stroke-miterlimit="4" d="M 0 226.77165 L 0 0 "/>
</g>
<g transform="translate(0 65.83582677165356)">
<path class="typst-shape" fill="none" d="M 0 95.1 L 0.0000000000000062941777 0 L 8.51 0 L 8.51 95.1 L 0 95.1 Z "/>
</g>
<g transform="translate(-43.295000047244095 109.13082675590552)">
<g class="typst-group">
<g transform="matrix(0.00000000000000006123233995736766 -1 1 0.00000000000000006123233995736766 43.295000031496066 51.80500006299213)">
<g transform="translate(0 0)">
<g class="typst-group">
<g>
<g transform="translate(0 6.580000000000001)">
<g class="typst-text" transform="scale(1, -1)">
<use xlink:href="#g29073FC46AA074D3D56D2F6B70E76DDC" x="0" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g8A55166D3146C138BB50BFE795ED0EE5" x="5.41" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g2C268D8F9C3C96139D78CE91BE4D9FB" x="9.88" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gE67DAEF23D7CF18A0AF2DED5954B0CBF" x="13.600000000000001" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gA11C98823DFB5A0C784EF187B0832232" x="16.700000000000003" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g2C268D8F9C3C96139D78CE91BE4D9FB" x="21.740000000000002" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g652A779657972FBC161C470ED70CA8EC" x="25.46" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g528CFE5CB98F8F81646F93C3E83B138B" x="33.36" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g9C2BE9118249AB6B38E684352DDFBC5D" x="37.93" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gD63979DFEF9CA9C9A7E89CD2DA3C3A19" x="43.35" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g8A55166D3146C138BB50BFE795ED0EE5" x="47.63" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g72F468A787EE89ADC78BA24E12FC80FE" x="54.6" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g8038720E9FD02ADAD29947BFDF50BDCC" x="58.160000000000004" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g229B6FF65CF728248AF8BE0A20FAFDFC" x="64.13000000000001" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g2D340B5AC4FE037754EF86B583EC1E8C" x="68.98" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gF68F62BAF7023F8E796CE041842DF629" x="74.26" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g29073FC46AA074D3D56D2F6B70E76DDC" x="81.28" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g62E843CADC663C7579F6A3FBC77388E" x="86.69" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g99297E49F87FAD41528007F13950C7FE" x="91.53999999999999" fill="#000000" fill-rule="nonzero"/>
</g>
</g>
</g>
</g>
</g>
</g>
</g>
</g>
<g transform="translate(354.3367716535433 0)">
<path class="typst-shape" fill="none" stroke="#000000" stroke-width="0.5" stroke-linecap="square" stroke-linejoin="miter" stroke-miterlimit="4" d="M 0 226.77165 L 0 0 "/>
</g>
<g transform="translate(14.17929133858268 63.26929133858268)">
<path class="typst-shape" fill="none" stroke="#0074d9" stroke-width="1" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="4" d="M 0 163.50237 L 153.47057 0 L 340.15747 0 "/>
</g>
<g transform="translate(167.6498611189155 63.26929133858268)">
<path class="typst-shape" fill="none" stroke="#ff4136" stroke-width="1" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="4" stroke-dashoffset="0" stroke-dasharray="3 2" d="M 0 0 L 0 163.50237 "/>
</g>
<g transform="translate(82.21078740157479 85.03937007874016)">
<path class="typst-shape" fill="none" stroke="#000000" stroke-width="1" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="4" d="M 0 62.072063 L 0 0 "/>
</g>
<g transform="translate(80.08480314960629 147.11143237795272)">
<path class="typst-shape" fill="none" stroke="#000000" stroke-width="1" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="4" d="M 2.1259842 5.6692915 L 0 0 L 4.2519684 0 L 2.1259842 5.6692915 Z "/>
</g>
<g transform="translate(23.97114173228346 70.41007874015746)">
<path class="typst-shape" fill="none" d="M 0 0 L 116.479294 0 L 116.479294 14.629292 L 0 14.629292 L 0 0 Z "/>
</g>
<g transform="translate(23.97114166929133 70.41007872440944)">
<g class="typst-group">
<g>
<g transform="translate(0 0)">
<g class="typst-group">
<g>
<g transform="translate(2.8346456692913384 9.414645669291339)">
<g class="typst-text" transform="scale(1, -1)">
<use xlink:href="#gD6703645BCA3FEA845C8778091F5B88B" x="0" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g8A55166D3146C138BB50BFE795ED0EE5" x="8.39" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g652A779657972FBC161C470ED70CA8EC" x="12.860000000000001" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gA11C98823DFB5A0C784EF187B0832232" x="20.76" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g2C268D8F9C3C96139D78CE91BE4D9FB" x="25.8" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gB5DAD6AB9EF529853D7226A3F4588253" x="29.67" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gA9BC29EE3008608C91A9D8C24BBB5729" x="37.32" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g528CFE5CB98F8F81646F93C3E83B138B" x="42.25" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g9C2BE9118249AB6B38E684352DDFBC5D" x="46.82" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gD908EABB3E36470B09E6E9B440A24B90" x="52.24" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g24E326A9802E1326DD4439B59EE95868" x="57.300000000000004" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g55B6FB7CBDC9C1DAE949CD37F0DC16D" x="64.77000000000001" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gD908EABB3E36470B09E6E9B440A24B90" x="67.48" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g74F5E04B137C7A69C847D70E8574BAA6" x="72.54" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g326CA448B94C8D25A227D725FDC9AB9" x="75.7" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gD63979DFEF9CA9C9A7E89CD2DA3C3A19" x="83.58" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g8A55166D3146C138BB50BFE795ED0EE5" x="87.86" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g55B6FB7CBDC9C1DAE949CD37F0DC16D" x="92.33" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g179036B287C688540B027B22D62B33B" x="95.03999999999999" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g55B6FB7CBDC9C1DAE949CD37F0DC16D" x="97.67999999999999" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g9C2BE9118249AB6B38E684352DDFBC5D" x="100.38999999999999" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gED8CAD24521E8BF74BBCB4EAC1068E02" x="105.80999999999999" fill="#000000" fill-rule="nonzero"/>
</g>
</g>
</g>
</g>
</g>
</g>
</g>
</g>
<g transform="translate(286.30527559055116 70.36258330138494)">
<path class="typst-shape" fill="none" stroke="#000000" stroke-width="1" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="4" d="M 0 0 L 0 99.716156 "/>
</g>
<g transform="translate(284.17929133858263 64.69329203149606)">
<path class="typst-shape" fill="none" stroke="#000000" stroke-width="1" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="4" d="M 2.1259842 0 L 4.2519684 5.6692915 L 0 5.6692915 L 2.1259842 0 Z "/>
</g>
<g transform="translate(249.8406299212598 170.07874015748033)">
<path class="typst-shape" fill="none" d="M 0 0 L 72.92929 0 L 72.92929 14.629292 L 0 14.629292 L 0 0 Z "/>
</g>
<g transform="translate(249.84062982677165 170.07874014173228)">
<g class="typst-group">
<g>
<g transform="translate(0 0)">
<g class="typst-group">
<g>
<g transform="translate(2.8346456692913384 9.414645669291339)">
<g class="typst-text" transform="scale(1, -1)">
<use xlink:href="#g23EA1C753A3A6B5551EEA1A5D7FB37ED" x="0" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gA11C98823DFB5A0C784EF187B0832232" x="6.460000000000001" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g652A779657972FBC161C470ED70CA8EC" x="11.5" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gD022B8BB58399F5AFF249BACEC9B429C" x="19.4" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gA44474528B05A80D93B01AD42F22D65" x="24.59" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g74F5E04B137C7A69C847D70E8574BAA6" x="29.9" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g8A55166D3146C138BB50BFE795ED0EE5" x="33.06" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gD63979DFEF9CA9C9A7E89CD2DA3C3A19" x="40.03" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g8A55166D3146C138BB50BFE795ED0EE5" x="44.31" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g55B6FB7CBDC9C1DAE949CD37F0DC16D" x="48.78" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g179036B287C688540B027B22D62B33B" x="51.49" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g55B6FB7CBDC9C1DAE949CD37F0DC16D" x="54.13" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#g9C2BE9118249AB6B38E684352DDFBC5D" x="56.84" fill="#000000" fill-rule="nonzero"/>
<use xlink:href="#gED8CAD24521E8BF74BBCB4EAC1068E02" x="62.260000000000005" fill="#000000" fill-rule="nonzero"/>
</g>
</g>
</g>
</g>
</g>
</g>
</g>
</g>
</g>
</g>
</g>
</g>
<defs id="glyph">
<symbol id="gF68F62BAF7023F8E796CE041842DF629" overflow="visible">
<path d="M 6.66 3.29 C 6.66 5.3599997 5.23 6.58 3.4399998 6.58 C 1.5 6.58 0.37 5.04 0.37 3.1 C 0.37 1.13 1.8 -0.099999994 3.5 -0.099999994 C 4.61 -0.099999994 5.49 0.38 6.04 1.16 C 6.44 1.73 6.66 2.45 6.66 3.29 Z M 3.31 6.22 C 4.63 6.22 5.7 5.12 5.7 3.1 C 5.7 1.31 4.8399997 0.26 3.6999998 0.26 C 2.48 0.26 1.3299999 1.35 1.3299999 3.27 C 1.3299999 5.37 2.32 6.22 3.31 6.22 Z "/>
</symbol>
<symbol id="gD022B8BB58399F5AFF249BACEC9B429C" overflow="visible">
<path d="M 1.56 3.6799998 C 1.55 3.98 1.53 4.24 1.48 4.3399997 C 1.4599999 4.39 1.4399999 4.42 1.36 4.42 C 1.0799999 4.31 0.82 4.22 0.13 4.13 C 0.11 4.0699997 0.13 3.9099998 0.14999999 3.85 C 0.69 3.8 0.79999995 3.75 0.79999995 3.1699998 L 0.79999995 -1.1 C 0.79999995 -1.93 0.69 -1.9799999 0.08 -2.01 C 0.02 -2.07 0.02 -2.28 0.08 -2.34 C 0.42999998 -2.33 0.79999995 -2.32 1.1999999 -2.32 C 1.5999999 -2.32 2.08 -2.33 2.4099998 -2.34 C 2.47 -2.28 2.47 -2.07 2.4099998 -2.01 C 1.6999999 -1.9699999 1.5899999 -1.93 1.5899999 -1.1 L 1.5899999 -0.02 C 1.5899999 0.11 1.63 0.099999994 1.73 0.06 C 1.9799999 -0.04 2.28 -0.099999994 2.6 -0.099999994 C 3.1599998 -0.099999994 3.6599998 0.07 4.0699997 0.45999998 C 4.54 0.91999996 4.81 1.54 4.81 2.35 C 4.81 3.4099998 4.06 4.39 3.04 4.39 C 2.58 4.39 2.07 4.0899997 1.67 3.6399999 C 1.61 3.58 1.5699999 3.58 1.56 3.6799998 Z M 1.75 3.31 C 2.01 3.6299999 2.47 3.9299998 2.76 3.9299998 C 3.3999999 3.9299998 3.9499998 3.21 3.9499998 2.08 C 3.9499998 1.26 3.6599998 0.24 2.54 0.24 C 2.36 0.24 2.01 0.29 1.8299999 0.45 C 1.63 0.63 1.5899999 0.69 1.5899999 1.05 L 1.5899999 2.87 C 1.5899999 3.08 1.63 3.1699998 1.75 3.31 Z "/>
</symbol>
<symbol id="g8A55166D3146C138BB50BFE795ED0EE5" overflow="visible">
<path d="M 3.86 0.93 C 3.49 0.55 3.1999998 0.39 2.62 0.39 C 2.26 0.39 1.8399999 0.59999996 1.53 1.11 C 1.3299999 1.4399999 1.2099999 1.9 1.2099999 2.48 L 3.87 2.46 C 3.99 2.46 4.06 2.52 4.06 2.6299999 C 4.06 3.47 3.76 4.37 2.37 4.37 C 1.5 4.37 0.37 3.54 0.37 2.02 C 0.37 1.4599999 0.51 0.91999996 0.84 0.53999996 C 1.18 0.14 1.65 -0.099999994 2.37 -0.099999994 C 3.1299999 -0.099999994 3.6699998 0.25 4.0699997 0.77 C 4.04 0.87 3.98 0.91999996 3.86 0.93 Z M 1.24 2.82 C 1.43 3.9499998 2.1299999 4.04 2.37 4.04 C 2.75 4.04 3.1999998 3.83 3.1999998 2.99 C 3.1999998 2.8999999 3.1599998 2.85 3.05 2.85 Z "/>
</symbol>
<symbol id="g2C268D8F9C3C96139D78CE91BE4D9FB" overflow="visible">
<path d="M 1.76 3.58 C 1.74 3.98 1.73 4.24 1.68 4.3399997 C 1.66 4.39 1.64 4.42 1.56 4.42 C 1.28 4.31 1.02 4.22 0.32999998 4.13 C 0.31 4.0699997 0.32999998 3.9099998 0.35 3.85 C 0.89 3.8 1 3.75 1 3.1699998 L 1 1.22 C 1 0.39 0.88 0.35 0.26 0.31 C 0.19999999 0.25 0.19999999 0.04 0.26 -0.02 C 0.61 -0.01 1 0 1.4 0 C 1.8 0 2.26 -0.01 2.61 -0.02 C 2.6699998 0.04 2.6699998 0.25 2.61 0.31 C 1.91 0.35999998 1.79 0.39 1.79 1.22 L 1.79 2.61 C 1.79 2.87 1.91 3.1 2.03 3.28 C 2.1399999 3.4399998 2.37 3.77 2.49 3.77 C 2.58 3.77 2.6699998 3.75 2.75 3.6399999 C 2.82 3.54 2.9399998 3.4099998 3.11 3.4099998 C 3.35 3.4099998 3.58 3.6599998 3.58 3.9099998 C 3.58 4.1 3.3999999 4.39 2.98 4.39 C 2.51 4.39 2.1 3.9499998 1.87 3.56 C 1.81 3.4499998 1.76 3.53 1.76 3.58 Z "/>
</symbol>
<symbol id="g528CFE5CB98F8F81646F93C3E83B138B" overflow="visible">
<path d="M 2.9299998 0.48 C 2.99 0.17 3.1 -0.099999994 3.6 -0.099999994 C 3.98 -0.099999994 4.3399997 0.07 4.5499997 0.26999998 C 4.5299997 0.39 4.49 0.48 4.38 0.53999996 C 4.31 0.48 4.14 0.38 4.0099998 0.38 C 3.72 0.38 3.7099998 0.77 3.7099998 1.23 L 3.7099998 2.7 C 3.7099998 4.12 2.9299998 4.39 2.2 4.39 C 1.38 4.39 0.55 3.85 0.55 3.28 C 0.55 3.04 0.66999996 2.9199998 0.9 2.9199998 C 1.1899999 2.9199998 1.37 3.1299999 1.37 3.26 C 1.37 3.33 1.36 3.3999999 1.3399999 3.4399998 C 1.3299999 3.47 1.3199999 3.53 1.3199999 3.6399999 C 1.3199999 3.9499998 1.74 4.06 2.12 4.06 C 2.46 4.06 2.9299998 3.8899999 2.9299998 2.76 C 2.9299998 2.69 2.8999999 2.6499999 2.87 2.6399999 L 2.01 2.4299998 C 1.05 2.19 0.35999998 1.66 0.35999998 0.97999996 C 0.35999998 0.16 0.91999996 -0.099999994 1.62 -0.099999994 C 1.9699999 -0.099999994 2.27 -0.02 2.71 0.32 L 2.9099998 0.48 Z M 2.9299998 2.33 L 2.9299998 1.01 C 2.9299998 0.88 2.87 0.81 2.79 0.75 C 2.53 0.53999996 2.19 0.31 1.91 0.31 C 1.41 0.31 1.1899999 0.71 1.1899999 1.02 C 1.1899999 1.4699999 1.4 1.93 2.1399999 2.12 Z "/>
</symbol>
<symbol id="g74F5E04B137C7A69C847D70E8574BAA6" overflow="visible">
<path d="M 0.42999998 4.29 C 0.29 4.29 0.25 4.17 0.25 4.0899997 L 0.25 3.9599998 C 0.25 3.9099998 0.26 3.8999999 0.29999998 3.8999999 L 0.89 3.8999999 L 0.89 0.89 C 0.89 0.17999999 1.1999999 -0.099999994 1.66 -0.099999994 C 2.12 -0.099999994 2.62 0.12 3.01 0.56 C 2.99 0.65999997 2.9299998 0.71999997 2.83 0.72999996 C 2.57 0.53 2.27 0.45 2.01 0.45 C 1.74 0.45 1.68 0.75 1.68 1.37 L 1.68 3.8999999 L 2.72 3.8999999 C 2.82 3.8999999 2.96 3.9399998 2.96 4.0299997 L 2.96 4.23 C 2.96 4.27 2.9299998 4.29 2.8799999 4.29 L 1.68 4.29 L 1.68 4.68 C 1.68 5.33 1.7199999 5.73 1.7199999 5.73 C 1.7199999 5.79 1.6899999 5.8199997 1.64 5.8199997 C 1.5999999 5.8199997 1.51 5.7799997 1.42 5.73 C 1.31 5.67 1.2099999 5.62 1.0799999 5.5899997 C 0.96 5.5499997 0.85999995 5.52 0.85999995 5.45 C 0.85999995 5.33 0.89 5.4 0.89 4.29 Z "/>
</symbol>
<symbol id="g55B6FB7CBDC9C1DAE949CD37F0DC16D" overflow="visible">
<path d="M 1.81 1.22 L 1.81 3.21 C 1.81 3.7099998 1.8499999 4.35 1.8499999 4.35 C 1.8499999 4.39 1.8 4.42 1.7199999 4.42 C 1.4399999 4.31 1.04 4.22 0.35 4.13 C 0.32999998 4.0699997 0.35 3.9099998 0.37 3.85 C 0.91999996 3.8 1.02 3.74 1.02 3.1699998 L 1.02 1.22 C 1.02 0.39 0.90999997 0.35999998 0.29999998 0.31 C 0.24 0.25 0.24 0.04 0.29999998 -0.02 C 0.63 -0.01 1.02 0 1.42 0 C 1.8199999 0 2.2 -0.01 2.53 -0.02 C 2.59 0.04 2.59 0.25 2.53 0.31 C 1.92 0.35 1.81 0.39 1.81 1.22 Z M 0.9 5.99 C 0.9 5.73 1.14 5.47 1.38 5.47 C 1.66 5.47 1.9 5.74 1.9 5.95 C 1.9 6.19 1.6899999 6.47 1.42 6.47 C 1.18 6.47 0.9 6.23 0.9 5.99 Z "/>
</symbol>
<symbol id="gA11C98823DFB5A0C784EF187B0832232" overflow="visible">
<path d="M 0.41 2.05 C 0.41 1.03 1.09 -0.099999994 2.51 -0.099999994 C 3.1499999 -0.099999994 3.6399999 0.13 3.98 0.45999998 C 4.43 0.9 4.63 1.53 4.63 2.1399999 C 4.63 3.1799998 4.06 4.39 2.53 4.39 C 1.87 4.39 1.3299999 4.12 0.96 3.6899998 C 0.59999996 3.26 0.41 2.6799998 0.41 2.05 Z M 2.3799999 4.04 C 3.24 4.04 3.77 3.26 3.77 1.8199999 C 3.77 0.56 3.12 0.25 2.6499999 0.25 C 1.61 0.25 1.27 1.51 1.27 2.28 C 1.27 3.1499999 1.48 4.04 2.3799999 4.04 Z "/>
</symbol>
<symbol id="g9C2BE9118249AB6B38E684352DDFBC5D" overflow="visible">
<path d="M 1.8399999 3.58 C 1.78 3.51 1.7199999 3.49 1.7199999 3.58 C 1.7099999 3.85 1.6899999 4.24 1.64 4.3399997 C 1.62 4.39 1.5999999 4.42 1.52 4.42 C 1.24 4.31 0.97999996 4.22 0.29 4.13 C 0.26999998 4.0699997 0.29 3.9099998 0.31 3.85 C 0.84999996 3.8 0.96 3.75 0.96 3.1699998 L 0.96 1.22 C 0.96 0.39999998 0.85999995 0.35999998 0.26 0.31 C 0.19999999 0.25 0.19999999 0.04 0.26 -0.02 C 0.56 -0.01 0.96 0 1.36 0 C 1.76 0 2.06 -0.01 2.36 -0.02 C 2.4199998 0.04 2.4199998 0.25 2.36 0.31 C 1.8499999 0.35999998 1.75 0.39999998 1.75 1.22 L 1.75 2.86 C 1.75 3.07 1.8399999 3.1899998 1.92 3.28 C 2.3 3.6499999 2.75 3.87 3.1399999 3.87 C 3.34 3.87 3.55 3.74 3.6699998 3.51 C 3.77 3.31 3.79 3.04 3.79 2.74 L 3.79 1.22 C 3.79 0.39999998 3.6899998 0.35999998 3.1699998 0.31 C 3.12 0.25 3.12 0.04 3.1699998 -0.02 C 3.47 -0.01 3.79 0 4.19 0 C 4.5899997 0 4.95 -0.01 5.25 -0.02 C 5.2999997 0.04 5.2999997 0.25 5.25 0.31 C 4.69 0.35999998 4.58 0.39999998 4.58 1.22 L 4.58 2.71 C 4.58 3.26 4.54 3.74 4.31 4.0499997 C 4.14 4.27 3.83 4.39 3.48 4.39 C 2.99 4.39 2.4299998 4.2599998 1.8399999 3.58 Z "/>
</symbol>
<symbol id="g179036B287C688540B027B22D62B33B" overflow="visible">
<path d="M 0.95 1.22 C 0.95 0.39 0.84 0.34 0.22999999 0.31 C 0.17 0.25 0.17 0.04 0.22999999 -0.02 C 0.58 -0.01 0.95 0 1.35 0 C 1.75 0 2.1299999 -0.01 2.46 -0.02 C 2.52 0.04 2.52 0.25 2.46 0.31 C 1.8499999 0.34 1.74 0.39 1.74 1.22 L 1.74 5.83 C 1.74 6.48 1.78 6.8799996 1.78 6.8799996 C 1.78 6.95 1.74 6.98 1.65 6.98 C 1.4 6.8799996 0.65 6.74 0.25 6.71 C 0.22999999 6.6299996 0.25 6.47 0.31 6.41 C 0.89 6.37 0.95 6.3399997 0.95 5.5899997 Z "/>
</symbol>
<symbol id="g6A35C718BF189FD6EC8D20440CDA771D" overflow="visible">
<path d="M 1.91 1.22 L 1.91 5.23 C 1.91 6.06 2.08 6.1099997 2.78 6.14 C 2.84 6.2 2.84 6.41 2.78 6.47 C 2.34 6.46 1.87 6.45 1.48 6.45 C 1.15 6.45 0.65999997 6.46 0.19 6.47 C 0.13 6.41 0.13 6.2 0.19 6.14 C 0.89 6.1099997 1.06 6.06 1.06 5.23 L 1.06 1.22 C 1.06 0.39 0.89 0.34 0.19 0.31 C 0.13 0.25 0.13 0.04 0.19 -0.02 C 0.64 -0.01 1.13 0 1.49 0 C 1.8499999 0 2.33 -0.01 2.78 -0.02 C 2.84 0.04 2.84 0.25 2.78 0.31 C 2.08 0.34 1.91 0.39 1.91 1.22 Z "/>
</symbol>
<symbol id="g7D8F87D2CD6331112C9FE3A07C40E0F" overflow="visible">
<path d="M 0.48 1.38 C 0.52 0.89 0.55 0.42 0.55 0 C 0.65 0.02 0.75 0.03 0.79999995 0.03 C 0.87 0.03 0.93 0.03 1 0.01 C 1.27 -0.06 1.54 -0.099999994 1.91 -0.099999994 C 2.47 -0.099999994 3.5 0.17 3.5 1.16 C 3.5 1.8399999 3.01 2.24 2.33 2.49 C 1.73 2.72 1.3299999 2.87 1.3299999 3.4199998 C 1.3299999 3.83 1.6899999 4.06 2.03 4.06 C 2.25 4.06 2.83 3.98 2.96 3.1299999 C 3.02 3.07 3.22 3.08 3.28 3.1399999 C 3.31 3.5 3.33 3.87 3.34 4.2 C 3.03 4.25 2.55 4.39 2.03 4.39 C 1.29 4.39 0.62 3.9099998 0.62 3.27 C 0.62 2.54 0.95 2.23 1.7199999 1.91 C 2.55 1.5699999 2.74 1.36 2.74 0.93 C 2.74 0.44 2.26 0.22999999 1.89 0.22999999 C 1.5 0.22999999 1.28 0.35999998 1.18 0.47 C 0.96 0.7 0.84999996 1.14 0.78999996 1.39 C 0.72999996 1.4499999 0.53999996 1.4399999 0.48 1.38 Z "/>
</symbol>
<symbol id="gB5DAD6AB9EF529853D7226A3F4588253" overflow="visible">
<path d="M 2.04 -1.5999999 C 2.2 -1.3199999 2.33 -1.04 2.45 -0.74 C 3.25 1.1899999 3.6999998 2.22 4.22 3.34 C 4.42 3.76 4.56 3.9199998 5.0299997 3.98 C 5.0899997 4.04 5.0899997 4.25 5.0299997 4.31 C 4.83 4.2999997 4.6 4.29 4.3199997 4.29 C 4.02 4.29 3.7099998 4.2999997 3.4099998 4.31 C 3.35 4.25 3.35 4.04 3.4099998 3.98 C 3.73 3.9499998 4.0499997 3.8899999 3.8899999 3.53 L 2.8999999 1.24 C 2.83 1.0799999 2.74 1.05 2.6599998 1.25 L 1.77 3.33 C 1.5899999 3.75 1.54 3.9399998 2.1 3.98 C 2.1599998 4.04 2.1599998 4.25 2.1 4.31 C 1.73 4.2999997 1.3299999 4.29 0.96999997 4.29 C 0.63 4.29 0.35999998 4.2999997 0.16 4.31 C 0.099999994 4.25 0.099999994 4.04 0.16 3.98 C 0.56 3.9299998 0.69 3.84 0.95 3.23 L 2.08 0.59999996 C 2.1699998 0.39999998 2.32 -0.06 2.22 -0.34 C 2.1 -0.66999996 1.9799999 -0.95 1.8299999 -1.26 C 1.7199999 -1.4599999 1.5799999 -1.55 1.3299999 -1.55 C 1.1899999 -1.55 1.15 -1.52 1.04 -1.52 C 0.75 -1.52 0.59999996 -1.8199999 0.59999996 -1.9499999 C 0.59999996 -2.1599998 0.79999995 -2.32 1.0699999 -2.32 C 1.28 -2.32 1.68 -2.24 2.04 -1.5999999 Z "/>
</symbol>
<symbol id="g72F468A787EE89ADC78BA24E12FC80FE" overflow="visible">
<path d="M 1.0799999 -1.93 L 3.1999998 -1.93 C 3.26 -1.87 3.26 -1.6899999 3.1999998 -1.63 C 1.8199999 -1.52 1.75 -1.37 1.75 -0.5 L 1.75 5.6099997 C 1.75 6.48 1.8299999 6.6299996 3.1999998 6.73 C 3.26 6.79 3.26 6.97 3.1999998 7.0299997 L 1.0799999 7.0299997 Z "/>
</symbol>
<symbol id="g229B6FF65CF728248AF8BE0A20FAFDFC" overflow="visible">
<path d="M 2.83 3.23 C 3.6799998 3.23 3.7099998 3 3.74 2.55 C 3.8 2.49 4.0099998 2.49 4.0699997 2.55 C 4.06 2.8 4.0499997 3.09 4.0499997 3.4299998 C 4.0499997 3.77 4.06 4.0299997 4.0699997 4.29 C 4.0099998 4.35 3.8 4.35 3.74 4.29 C 3.7099998 3.74 3.6799998 3.61 2.83 3.61 L 1.9 3.61 L 1.9 5.41 C 1.9 5.95 2.02 6.0499997 2.46 6.0499997 L 2.9099998 6.0499997 C 3.97 6.0499997 4.22 5.65 4.44 4.96 C 4.56 4.96 4.67 4.98 4.75 5.0099998 C 4.7 5.42 4.5499997 6.3799996 4.5299997 6.46 C 4.5299997 6.48 4.52 6.49 4.49 6.49 C 4.3199997 6.46 4.27 6.45 4.02 6.45 L 1.4699999 6.45 C 1.15 6.45 0.58 6.46 0.17999999 6.47 C 0.12 6.41 0.12 6.2 0.17999999 6.14 C 0.88 6.1099997 1.05 6.06 1.05 5.23 L 1.05 1.22 C 1.05 0.39 0.88 0.34 0.17999999 0.31 C 0.12 0.25 0.12 0.04 0.17999999 -0.02 C 0.53 -0.01 1.03 0 1.48 0 C 1.93 0 2.4199998 -0.01 2.77 -0.02 C 2.83 0.04 2.83 0.25 2.77 0.31 C 2.07 0.34 1.9 0.39 1.9 1.22 L 1.9 3.23 Z "/>
</symbol>
<symbol id="g2D340B5AC4FE037754EF86B583EC1E8C" overflow="visible">
<path d="M 1.49 0 L 3.62 0 C 3.8899999 0 4.81 -0.02 4.81 -0.02 C 4.91 0.48 5.0099998 1.13 5.0699997 1.65 C 4.97 1.6999999 4.8599997 1.7199999 4.74 1.6999999 C 4.54 0.97999996 4.17 0.39 3.11 0.39 L 2.48 0.39 C 2.09 0.39 1.91 0.58 1.91 1.09 L 1.91 5.23 C 1.91 6.06 2.08 6.1099997 2.78 6.14 C 2.84 6.2 2.84 6.41 2.78 6.47 C 2.33 6.46 1.86 6.45 1.48 6.45 C 1.12 6.45 0.65 6.46 0.19 6.47 C 0.13 6.41 0.13 6.2 0.19 6.14 C 0.89 6.1099997 1.06 6.06 1.06 5.23 L 1.06 1.22 C 1.06 0.39 0.89 0.34 0.19 0.31 C 0.13 0.25 0.13 0.04 0.19 -0.02 C 0.58 -0.01 1.15 0 1.49 0 Z "/>
</symbol>
<symbol id="g29073FC46AA074D3D56D2F6B70E76DDC" overflow="visible">
<path d="M 1.04 5.23 L 1.04 1.22 C 1.04 0.39 0.87 0.34 0.17 0.31 C 0.11 0.25 0.11 0.04 0.17 -0.02 C 0.62 -0.01 1.11 0 1.4699999 0 C 1.8199999 0 2.4099998 -0.01 2.9099998 -0.02 C 2.97 0.04 2.97 0.25 2.9099998 0.31 C 2.1299999 0.35 1.89 0.39 1.89 1.22 L 1.89 2.9199998 C 2.11 2.85 2.35 2.82 2.7 2.82 C 4.52 2.82 5.0699997 4.0099998 5.0699997 4.8599997 C 5.0699997 5.45 4.68 6.52 2.83 6.52 C 2.45 6.52 1.86 6.45 1.4599999 6.45 C 1.09 6.45 0.57 6.46 0.17 6.47 C 0.11 6.41 0.11 6.2 0.17 6.14 C 0.87 6.1099997 1.04 6.06 1.04 5.23 Z M 1.89 5.54 C 1.89 5.83 2.04 6.18 2.76 6.18 C 3.4499998 6.18 4.14 5.95 4.14 4.68 C 4.14 3.6 3.62 3.1599998 2.6499999 3.1599998 C 2.3999999 3.1599998 2 3.1799998 1.89 3.21 Z "/>
</symbol>
<symbol id="g62E843CADC663C7579F6A3FBC77388E" overflow="visible">
<path d="M 3.9499998 6.31 C 3.37 6.39 3.3899999 6.58 2.4099998 6.58 C 1.4 6.58 0.51 5.91 0.51 4.85 C 0.51 3.8 1.38 3.32 2.26 2.98 C 2.86 2.75 3.6 2.47 3.6 1.49 C 3.6 0.68 3.1499999 0.26 2.35 0.26 C 1.42 0.26 0.82 0.69 0.59999996 1.68 C 0.47 1.7199999 0.37 1.6999999 0.26999998 1.65 C 0.31 0.88 0.35 0.62 0.47 0.13 C 1.1 0.13 1.38 -0.099999994 2.27 -0.099999994 C 2.72 -0.099999994 3.1499999 0.01 3.5 0.22999999 C 4.08 0.59 4.44 1.1999999 4.44 1.8399999 C 4.44 2.8999999 3.6399999 3.37 2.82 3.6799998 C 2.22 3.8999999 1.22 4.2799997 1.22 5.12 C 1.22 5.68 1.73 6.24 2.32 6.24 C 3.29 6.24 3.6 5.62 3.8 4.96 C 3.9099998 4.94 4.04 4.95 4.13 5.02 C 4.0899997 5.5 4.0499997 5.7799997 3.9499998 6.31 Z "/>
</symbol>
<symbol id="g95425A100709BC27DA272BD05EE64D31" overflow="visible">
<path d="M 2.9299998 6.46 L 2.52 6.46 L 0.14999999 -0.59 L 0.56 -0.59 Z "/>
</symbol>
<symbol id="gA9BC29EE3008608C91A9D8C24BBB5729" overflow="visible">
<path d="M 1.67 3.9399998 C 1.61 3.8799999 1.5699999 3.8999999 1.5699999 3.99 L 1.5699999 5.83 C 1.5699999 6.48 1.61 6.8799996 1.61 6.8799996 C 1.61 6.95 1.5699999 6.98 1.48 6.98 C 1.23 6.8799996 0.48 6.74 0.08 6.71 C 0.06 6.6299996 0.08 6.47 0.14 6.41 C 0.17 6.41 0.19999999 6.41 0.22999999 6.41 C 0.66999996 6.3799996 0.78 6.3799996 0.78 5.5899997 L 0.78 0.71 C 0.78 0.32 0.77 0.14 0.74 0 C 0.78999996 -0.08 0.84 -0.12 0.96 -0.12 C 1.02 -0.06 1.12 0.03 1.1999999 0.12 C 1.3 0.24 1.36 0.24 1.4699999 0.14999999 C 1.6999999 -0.04 2 -0.099999994 2.35 -0.099999994 C 3.37 -0.099999994 4.56 0.83 4.56 2.4199998 C 4.56 3.6399999 3.6599998 4.39 2.79 4.39 C 2.36 4.39 1.9799999 4.22 1.67 3.9399998 Z M 1.74 3.6299999 C 2 3.86 2.3 3.9499998 2.59 3.9499998 C 3.1999998 3.9499998 3.6999998 3.21 3.6999998 2.24 C 3.6999998 1.1 3.24 0.24 2.29 0.24 C 1.9799999 0.24 1.77 0.45999998 1.5699999 0.71 L 1.5699999 3.21 C 1.5699999 3.4199998 1.61 3.52 1.74 3.6299999 Z "/>
</symbol>
<symbol id="g99297E49F87FAD41528007F13950C7FE" overflow="visible">
<path d="M 2.48 7.04 L 0.35999998 7.04 C 0.29999998 6.98 0.29999998 6.7999997 0.35999998 6.74 C 1.74 6.6299996 1.81 6.48 1.81 5.6099997 L 1.81 -0.5 C 1.81 -1.37 1.73 -1.52 0.35999998 -1.62 C 0.29999998 -1.68 0.29999998 -1.86 0.35999998 -1.92 L 2.48 -1.92 Z "/>
</symbol>
<symbol id="gE67DAEF23D7CF18A0AF2DED5954B0CBF" overflow="visible">
<path d="M 1.75 1.22 L 1.75 3.8999999 L 2.6799998 3.8999999 C 2.77 3.8999999 2.9099998 3.9399998 2.9099998 4.0299997 L 2.9099998 4.23 C 2.9099998 4.27 2.8799999 4.29 2.83 4.29 L 1.75 4.29 L 1.75 4.8599997 C 1.75 6.3999996 2.21 6.64 2.54 6.64 C 2.84 6.64 3 6.52 3.1399999 6.17 C 3.22 5.99 3.33 5.85 3.54 5.85 C 3.7099998 5.85 3.9499998 6.06 3.9499998 6.2599998 C 3.9499998 6.43 3.84 6.6099997 3.6299999 6.77 C 3.37 6.95 3.11 6.98 2.73 6.98 C 1.89 6.98 0.96 6.25 0.96 4.69 L 0.96 4.29 L 0.45 4.29 C 0.26999998 4.29 0.22 4.17 0.22 4.0899997 L 0.22 3.9599998 C 0.22 3.9099998 0.22999999 3.8999999 0.26999998 3.8999999 L 0.96 3.8999999 L 0.96 1.22 C 0.96 0.39 0.79999995 0.35 0.26 0.31 C 0.19999999 0.25 0.19999999 0.04 0.26 -0.02 C 0.61 -0.01 0.96 0 1.36 0 C 1.76 0 2.24 -0.01 2.59 -0.02 C 2.6499999 0.04 2.6499999 0.25 2.59 0.31 C 1.81 0.35 1.75 0.39 1.75 1.22 Z "/>
</symbol>
<symbol id="g652A779657972FBC161C470ED70CA8EC" overflow="visible">
<path d="M 1.6999999 3.58 C 1.6899999 3.8799999 1.67 4.24 1.62 4.3399997 C 1.5999999 4.39 1.5799999 4.42 1.5 4.42 C 1.22 4.31 0.96 4.22 0.26999998 4.13 C 0.25 4.0699997 0.26999998 3.9099998 0.29 3.85 C 0.83 3.8 0.94 3.75 0.94 3.1699998 L 0.94 1.22 C 0.94 0.39999998 0.81 0.35 0.26 0.31 C 0.19999999 0.25 0.19999999 0.04 0.26 -0.02 C 0.56 -0.01 0.94 0 1.3399999 0 C 1.74 0 2.05 -0.01 2.35 -0.02 C 2.4099998 0.04 2.4099998 0.25 2.35 0.31 C 1.8399999 0.35999998 1.73 0.39999998 1.73 1.22 L 1.73 2.86 C 1.73 3.07 1.8199999 3.1899998 1.9 3.28 C 2.3 3.6699998 2.6699998 3.87 2.98 3.87 C 3.36 3.87 3.6399999 3.6299999 3.6399999 2.96 L 3.6399999 1.22 C 3.6399999 0.39999998 3.56 0.35 3.02 0.31 C 2.97 0.25 2.97 0.04 3.02 -0.02 C 3.27 -0.01 3.6399999 0 4.04 0 C 4.44 0 4.77 -0.01 5.02 -0.02 C 5.0699997 0.04 5.0699997 0.25 5.02 0.31 C 4.52 0.35 4.43 0.39999998 4.43 1.22 L 4.43 2.81 C 4.43 2.95 4.43 3.09 4.42 3.21 C 4.9 3.74 5.35 3.87 5.74 3.87 C 6.12 3.87 6.3399997 3.6499999 6.3399997 2.98 L 6.3399997 1.22 C 6.3399997 0.39999998 6.24 0.35 5.72 0.31 C 5.67 0.25 5.67 0.04 5.72 -0.02 C 5.97 -0.01 6.3399997 0 6.74 0 C 7.14 0 7.49 -0.01 7.77 -0.02 C 7.8199997 0.04 7.8199997 0.25 7.77 0.31 C 7.22 0.35 7.1299996 0.39999998 7.1299996 1.22 L 7.1299996 2.8 C 7.1299996 3.6899998 6.98 4.39 6.1 4.39 C 5.5899997 4.39 4.97 4.2 4.45 3.6499999 C 4.42 3.62 4.36 3.57 4.3399997 3.6599998 C 4.25 4.0699997 3.86 4.39 3.34 4.39 C 2.76 4.39 2.24 4.0499997 1.8199999 3.58 C 1.77 3.53 1.7099999 3.46 1.6999999 3.58 Z "/>
</symbol>
<symbol id="gD63979DFEF9CA9C9A7E89CD2DA3C3A19" overflow="visible">
<path d="M 3.98 0.90999997 C 3.9399998 1 3.86 1.04 3.77 1.05 C 3.4299998 0.61 3 0.39 2.57 0.39 C 1.8399999 0.39 1.23 1.13 1.23 2.3 C 1.23 3.3999999 1.7099999 4.06 2.37 4.06 C 2.96 4.06 3.04 3.7099998 3.08 3.36 C 3.11 3.09 3.25 3 3.46 3 C 3.6699998 3 3.9499998 3.1299999 3.9499998 3.4399998 C 3.9499998 3.99 3.3799999 4.39 2.4199998 4.39 C 1.43 4.39 0.37 3.5 0.37 2.08 C 0.37 0.78999996 1.09 -0.099999994 2.35 -0.099999994 C 2.95 -0.099999994 3.48 0.089999996 3.98 0.90999997 Z "/>
</symbol>
<symbol id="g8038720E9FD02ADAD29947BFDF50BDCC" overflow="visible">
<path d="M 3.5 1.22 L 3.5 5.04 C 3.5 5.68 3.6 6.06 3.9599998 6.06 L 4.18 6.06 C 4.93 6.06 5.4 5.7999997 5.5699997 5.0299997 C 5.68 5.0299997 5.8199997 5.04 5.91 5.08 C 5.8399997 5.54 5.79 6.0499997 5.7799997 6.5 C 5.7799997 6.5099998 5.7599998 6.5299997 5.75 6.5299997 C 5.41 6.5 4.2999997 6.45 3.51 6.45 L 2.6499999 6.45 C 1.88 6.45 0.7 6.5 0.32 6.5299997 C 0.29999998 6.5299997 0.28 6.5099998 0.28 6.5 C 0.24 6.0499997 0.14 5.5299997 0.03 5.0499997 C 0.13 5.0099998 0.25 5 0.37 5 C 0.57 5.7999997 1.03 6.06 1.6899999 6.06 L 2.18 6.06 C 2.55 6.06 2.6499999 5.68 2.6499999 5.0699997 L 2.6499999 1.22 C 2.6499999 0.39 2.48 0.34 1.68 0.31 C 1.62 0.25 1.62 0.04 1.68 -0.02 C 2.1699998 -0.01 2.6799998 0 3.08 0 C 3.46 0 3.97 -0.01 4.47 -0.02 C 4.5299997 0.04 4.5299997 0.25 4.47 0.31 C 3.6699998 0.34 3.5 0.39 3.5 1.22 Z "/>
</symbol>
<symbol id="gD6703645BCA3FEA845C8778091F5B88B" overflow="visible">
<path d="M 6.6299996 1.16 C 6.68 0.35 6.64 0.34 5.99 0.31 C 5.93 0.25 5.93 0.04 5.99 -0.02 C 6.3599997 -0.01 6.7599998 0 7.06 0 C 7.3599997 0 7.8199997 -0.01 8.15 -0.02 C 8.21 0.04 8.21 0.25 8.15 0.31 C 7.47 0.34 7.44 0.38 7.3799996 1.2099999 L 7.08 5.42 C 7.04 5.96 7.0899997 6.1 7.77 6.14 C 7.83 6.2 7.83 6.41 7.77 6.47 L 6.46 6.45 L 4.3399997 1.38 C 4.2999997 1.28 4.27 1.24 4.25 1.24 C 4.23 1.24 4.2 1.29 4.17 1.37 L 2.12 6.45 L 0.66999996 6.47 C 0.61 6.41 0.61 6.2 0.66999996 6.14 C 1.36 6.1 1.42 6.08 1.35 5.33 L 0.95 1.2099999 C 0.89 0.61 0.77 0.35999998 0.19999999 0.31 C 0.14 0.25 0.14 0.04 0.19999999 -0.02 C 0.5 -0.01 0.82 0 1.06 0 C 1.3 0 1.6999999 -0.01 2 -0.02 C 2.06 0.04 2.06 0.25 2 0.31 C 1.3399999 0.35999998 1.29 0.66999996 1.3399999 1.23 L 1.7199999 5.24 L 1.74 5.24 L 3.81 0.049999997 C 3.84 -0.02 3.8999999 -0.099999994 3.9599998 -0.099999994 C 4.02 -0.099999994 4.06 -0.03 4.1 0.049999997 L 6.35 5.3399997 L 6.37 5.3399997 Z "/>
</symbol>
<symbol id="gD908EABB3E36470B09E6E9B440A24B90" overflow="visible">
<path d="M 3.34 0.5 C 3.3899999 0.53999996 3.48 0.56 3.49 0.48999998 C 3.52 0.25 3.6 -0.099999994 3.6 -0.099999994 C 3.6799998 -0.13 3.73 -0.12 3.79 -0.099999994 C 4.0099998 0.08 4.36 0.22999999 4.97 0.29999998 C 5.0299997 0.35999998 5.0299997 0.51 4.97 0.57 C 4.33 0.62 4.24 0.81 4.24 1.3 L 4.24 5.83 C 4.24 6.48 4.2799997 6.8799996 4.2799997 6.8799996 C 4.2799997 6.95 4.24 6.98 4.15 6.98 C 3.8999999 6.8799996 3.1499999 6.74 2.75 6.71 C 2.73 6.6299996 2.75 6.47 2.81 6.41 C 2.84 6.41 2.87 6.41 2.8999999 6.41 C 3.34 6.3799996 3.4499998 6.3799996 3.4499998 5.5899997 L 3.4499998 4.31 C 3.4499998 4.24 3.4299998 4.22 3.36 4.22 C 3.32 4.22 2.9099998 4.39 2.58 4.39 C 1.92 4.39 1.48 4.17 1.0799999 3.79 C 0.65 3.36 0.39 2.77 0.39 2.03 C 0.39 0.79999995 1.01 -0.099999994 2.09 -0.099999994 C 2.48 -0.099999994 2.85 0.099999994 3.34 0.5 Z M 3.4499998 1.24 C 3.4499998 1.05 3.4299998 0.96999997 3.29 0.84999996 C 2.9199998 0.53 2.6 0.37 2.35 0.37 C 1.81 0.37 1.25 0.96 1.25 2.21 C 1.25 2.9299998 1.39 3.33 1.54 3.54 C 1.8499999 4.0099998 2.27 4.04 2.47 4.04 C 2.83 4.04 3.08 3.9099998 3.28 3.6799998 C 3.4199998 3.52 3.4499998 3.4499998 3.4499998 3.1399999 Z "/>
</symbol>
<symbol id="g24E326A9802E1326DD4439B59EE95868" overflow="visible">
<path d="M 2.03 3.98 C 2.09 4.04 2.09 4.25 2.03 4.31 C 1.73 4.2999997 1.38 4.29 1.02 4.29 C 0.64 4.29 0.37 4.2999997 0.11 4.31 C 0.049999997 4.25 0.049999997 4.04 0.11 3.98 C 0.58 3.9299998 0.69 3.83 0.9 3.31 L 2.21 0.08 C 2.27 -0.06 2.35 -0.12 2.45 -0.12 C 2.54 -0.12 2.62 -0.06 2.69 0.099999994 L 3.6999998 2.56 L 4.73 0.08 C 4.79 -0.06 4.87 -0.12 4.97 -0.12 C 5.06 -0.12 5.14 -0.06 5.2 0.089999996 L 6.5099998 3.26 C 6.67 3.6699998 6.8399997 3.9499998 7.35 3.98 C 7.41 4.04 7.41 4.25 7.35 4.31 C 7.1499996 4.2999997 6.93 4.29 6.6299996 4.29 C 6.33 4.29 5.91 4.2999997 5.6099997 4.31 C 5.5499997 4.25 5.5499997 4.04 5.6099997 3.98 C 6.3599997 3.9399998 6.2599998 3.6499999 6.12 3.3 L 5.2799997 1.22 C 5.18 0.96999997 5.14 0.96999997 5.06 1.18 L 4.2 3.37 C 4.0099998 3.84 4.0299997 3.9399998 4.56 3.98 C 4.62 4.04 4.62 4.25 4.56 4.31 C 4.2599998 4.2999997 3.86 4.29 3.56 4.29 C 3.31 4.29 2.98 4.2999997 2.6799998 4.31 C 2.62 4.25 2.62 4.04 2.6799998 3.98 C 3.1499999 3.9399998 3.27 3.6499999 3.4499998 3.1999998 L 3.51 3.04 L 2.81 1.27 C 2.6799998 0.94 2.6599998 0.93 2.53 1.25 L 1.73 3.31 C 1.54 3.79 1.55 3.9499998 2.03 3.98 Z "/>
</symbol>
<symbol id="g326CA448B94C8D25A227D725FDC9AB9" overflow="visible">
<path d="M 1.67 2.86 C 1.67 3.07 1.76 3.1899998 1.8399999 3.28 C 2.22 3.6499999 2.73 3.87 3.12 3.87 C 3.32 3.87 3.53 3.74 3.6499999 3.51 C 3.75 3.31 3.77 3.04 3.77 2.74 L 3.77 1.22 C 3.77 0.39999998 3.6699998 0.35999998 3.1499999 0.31 C 3.1 0.25 3.1 0.04 3.1499999 -0.02 C 3.4299998 -0.01 3.77 0 4.17 0 C 4.5699997 0 4.9 -0.01 5.23 -0.02 C 5.2799997 0.04 5.2799997 0.25 5.23 0.31 C 4.67 0.35999998 4.56 0.39999998 4.56 1.22 L 4.56 2.71 C 4.56 3.26 4.52 3.75 4.29 4.0499997 C 4.12 4.27 3.81 4.39 3.46 4.39 C 2.97 4.39 2.37 4.2599998 1.76 3.58 C 1.76 3.57 1.75 3.57 1.74 3.56 C 1.7099999 3.52 1.66 3.46 1.66 3.58 L 1.67 5.83 C 1.67 6.48 1.7099999 6.8799996 1.7099999 6.8799996 C 1.7099999 6.95 1.67 6.98 1.5799999 6.98 C 1.3299999 6.8799996 0.58 6.74 0.17999999 6.71 C 0.16 6.6299996 0.17999999 6.47 0.24 6.41 C 0.26999998 6.41 0.29999998 6.41 0.32999998 6.41 C 0.77 6.3799996 0.88 6.3799996 0.88 5.5899997 L 0.88 1.22 C 0.88 0.39 0.76 0.35 0.17999999 0.31 C 0.12 0.25 0.12 0.04 0.17999999 -0.02 C 0.51 -0.01 0.88 0 1.28 0 C 1.66 0 1.99 -0.01 2.27 -0.02 C 2.33 0.04 2.33 0.25 2.27 0.31 C 1.76 0.35 1.67 0.39 1.67 1.22 Z "/>
</symbol>
<symbol id="gED8CAD24521E8BF74BBCB4EAC1068E02" overflow="visible">
<path d="M 4.44 3.87 C 4.63 3.87 4.81 4.0499997 4.81 4.25 C 4.81 4.46 4.62 4.62 4.35 4.62 C 4.0899997 4.62 3.61 4.45 3.35 4.11 C 3.23 4.19 2.8999999 4.39 2.31 4.39 C 1.42 4.39 0.59999996 3.79 0.59999996 2.87 C 0.59999996 2.33 0.84 2.02 1.12 1.75 C 0.84 1.51 0.68 1.0799999 0.68 0.74 C 0.68 0.38 0.88 0.099999994 1.14 -0.03 C 0.59999996 -0.35 0.32 -0.81 0.32 -1.24 C 0.32 -2.11 1.14 -2.3799999 1.91 -2.3799999 C 3.26 -2.3799999 4.72 -1.73 4.72 -0.65 C 4.72 -0.32999998 4.5699997 -0.08 4.2799997 0.16 C 3.8899999 0.48 3.1899998 0.48999998 2.82 0.48999998 C 2.6399999 0.48999998 2.3899999 0.47 2.1499999 0.44 C 2 0.42999998 1.9 0.42 1.8499999 0.42 C 1.56 0.42 1.2099999 0.56 1.2099999 0.98999995 C 1.2099999 1.1899999 1.27 1.4 1.39 1.5699999 C 1.63 1.43 1.91 1.36 2.3 1.36 C 3.1799998 1.36 3.99 1.91 3.99 2.8899999 C 3.99 3.36 3.85 3.6299999 3.56 3.9299998 C 3.6299999 4.0299997 3.83 4.17 3.9599998 4.17 C 4.0299997 4.17 4.1 4.14 4.16 4.0499997 C 4.2 3.97 4.33 3.87 4.44 3.87 Z M 1.3199999 -0.099999994 C 1.43 -0.13 1.5899999 -0.14999999 1.7199999 -0.14999999 C 2.03 -0.14999999 2.31 -0.12 2.45 -0.12 C 2.95 -0.12 3.37 -0.13 3.6599998 -0.29 C 4.0499997 -0.51 4.17 -0.65 4.17 -0.90999997 C 4.17 -1.63 3.11 -2.01 2.21 -2.01 C 1.8499999 -2.01 1.01 -1.7199999 1.01 -1.04 C 1.01 -0.7 1.03 -0.45 1.3199999 -0.099999994 Z M 3.1899998 2.79 C 3.1899998 1.86 2.72 1.67 2.35 1.67 C 1.51 1.67 1.42 2.3799999 1.42 3.02 C 1.42 3.72 1.67 4.08 2.22 4.08 C 2.85 4.08 3.1899998 3.62 3.1899998 2.79 Z "/>
</symbol>
<symbol id="g23EA1C753A3A6B5551EEA1A5D7FB37ED" overflow="visible">
<path d="M 3.57 -0.099999994 C 4.54 -0.099999994 5.43 0.35999998 6.1099997 1.25 C 6.06 1.3399999 5.99 1.4 5.8799996 1.4 C 5.17 0.62 4.49 0.31 3.56 0.31 C 2.21 0.31 1.28 1.8299999 1.28 3.29 C 1.28 4.15 1.51 4.87 1.87 5.3199997 C 2.37 5.93 2.95 6.21 3.47 6.21 C 4.85 6.21 5.4 5.39 5.66 4.5299997 C 5.7799997 4.49 5.8799996 4.52 5.99 4.58 C 5.94 5.1 5.87 5.58 5.77 6.1299996 C 5.2599998 6.18 4.81 6.58 3.58 6.58 C 2.73 6.58 2.02 6.27 1.43 5.73 C 0.77 5.12 0.37 4.14 0.37 3.1 C 0.37 1.36 1.42 -0.099999994 3.57 -0.099999994 Z "/>
</symbol>
<symbol id="gA44474528B05A80D93B01AD42F22D65" overflow="visible">
<path d="M 2.1699998 -0.099999994 C 2.56 -0.099999994 3.03 0.099999994 3.52 0.5 C 3.57 0.53999996 3.6599998 0.56 3.6699998 0.48999998 C 3.6999998 0.24 3.78 -0.099999994 3.78 -0.099999994 C 3.86 -0.13 3.9099998 -0.12 3.97 -0.099999994 C 4.19 0.08 4.54 0.22999999 5.15 0.29999998 C 5.21 0.35999998 5.21 0.51 5.15 0.57 C 4.5099998 0.62 4.42 0.81 4.42 1.3 L 4.42 3.22 C 4.42 3.52 4.46 4.25 4.46 4.25 C 4.46 4.2799997 4.43 4.31 4.38 4.31 C 4.33 4.2999997 4.18 4.29 4.0299997 4.29 C 3.7099998 4.29 3.35 4.2999997 3.01 4.31 C 2.95 4.25 2.95 4.04 3.01 3.98 C 3.5 3.9499998 3.6299999 3.83 3.6299999 3.1699998 L 3.6299999 1.24 C 3.6299999 1.05 3.61 0.96999997 3.47 0.84999996 C 3.1 0.53 2.72 0.37 2.47 0.37 C 2.1699998 0.37 1.67 0.51 1.67 1.4 L 1.67 3.22 C 1.67 3.52 1.7099999 4.25 1.7099999 4.25 C 1.7099999 4.2799997 1.68 4.31 1.63 4.31 C 1.5799999 4.2999997 1.43 4.29 1.28 4.29 C 0.96 4.29 0.59999996 4.2999997 0.26 4.31 C 0.19999999 4.25 0.19999999 4.04 0.26 3.98 C 0.74 3.9399998 0.88 3.83 0.88 3.1799998 L 0.88 1.26 C 0.88 0.57 1.18 -0.099999994 2.1699998 -0.099999994 Z "/>
</symbol>
</defs>
</svg>

پس از

عرض:  |  ارتفاع:  |  اندازه: 49 KiB

@@ -0,0 +1,35 @@
#import "@preview/cetz:0.4.2"
#import "@preview/cetz-plot:0.1.3": plot, chart
#set page(width: auto, height: auto, margin: .5cm)
#set text(size: 10pt)
#cetz.canvas({
import cetz.draw: *
set-style(axes: (stroke: .5pt, tick: (stroke: .5pt)), legend: none)
plot.plot(
size: (12, 8),
name: "roofline",
x-min: 0.1, x-max: 10000, x-mode: "log",
y-min: 0, y-max: 200,
x-tick-step: none, y-tick-step: none,
{
plot.add(((0.1, 0), (18.025, 144.2), (10000, 144.2)), style: (stroke: (paint: blue)))
plot.add-anchor("membound", (1, 64))
plot.add-anchor("compbound", (1000, 144.2))
plot.add-anchor("ridge", (18.025, 144.2))
},
x-label: "Operational Intensity [FLOPS/byte]",
y-label: "Performance [TFLOPS]",
)
line("roofline.ridge", ((), "|-", (18.025, 0)), name: "ridge-horiz-line", stroke: (paint: red, dash: "densely-dashed"))
line("roofline.membound", ((), "|-", (1, 5)), mark: (start: ">"), name: "membound-arrow")
content("membound-arrow.end", [Memory bandwidth ceiling], anchor: "south", padding: .1)
line("roofline.compbound", ((), "|-", (1, 2)), mark: (start: ">"), name: "compbound-arrow")
content("compbound-arrow.end", [Compute ceiling], anchor: "north", padding: .1)
})
@@ -1,7 +1,7 @@
.. meta::
:description: This chapter describes a set of best practices designed to help
developers optimize the performance of HIP-capable GPU architectures.
:keywords: AMD, ROCm, HIP, CUDA, performance, guidelines
:keywords: AMD, ROCm, HIP, CUDA, performance, guidelines, optimization, how-to
.. _how_to_performance_guidelines:
@@ -9,313 +9,619 @@
Performance guidelines
*******************************************************************************
The AMD HIP performance guidelines are a set of best practices designed to help
you optimize the application performance on AMDGPUs. The guidelines discuss
established parallelization and optimization techniques to improve the
application performance on HIP-capable GPU architectures.
The AMD HIP performance guidelines provide practical, actionable techniques for
optimizing application performance on AMD GPUs. This guide focuses on
step-by-step instructions and best practices for improving performance.
Here are the four main cornerstones to help you exploit HIP's performance
optimization potential:
For theoretical foundations and performance concepts, see
:doc:`../understand/performance_optimization`.
- Parallel execution
- Memory bandwidth usage optimization
- Maximum throughput optimization
- Memory thrashing minimization
Optimization workflow
=====================
This document discusses the usage and benefits of these cornerstones in detail.
Follow this systematic approach to optimize GPU performance:
1. **Profile and measure baseline**
Use ``rocprofv3`` to identify bottlenecks:
.. code-block:: bash
rocprofv3 --stats ./your_application
Collect metrics on kernel execution time, memory bandwidth, occupancy, and
CU utilization.
2. **Analyze metrics to identify bottlenecks**
Determine if kernels are compute-bound or memory-bound. Check arithmetic
intensity, memory bandwidth achieved vs peak, and compute throughput.
For understanding the roofline model, see :ref:`roofline_model`.
3. **Apply targeted optimizations**
Based on identified bottlenecks, apply techniques from this guide.
4. **Verify improvements**
Re-profile to confirm performance gains.
5. **Iterate**
Repeat until performance goals are met.
.. _parallel execution:
Parallel execution
================================================================================
==================
For optimal use and to keep all system components busy, the application must
reveal and efficiently provide as much parallelism as possible. The parallelism
can be performed at the application level, device level, and multiprocessor
level.
.. _application_parallel_execution:
reveal and efficiently provide as much parallelism as possible.
Application level
--------------------------------------------------------------------------------
-----------------
To enable parallel execution of the application across the host and devices, use
:ref:`asynchronous calls and streams <asynchronous_how-to>`. Assign workloads
based on efficiency: serial to the host or parallel to the devices.
To enable parallel execution across the host and devices:
For parallel workloads, when threads belonging to the same block need to
synchronize to share data, use :cpp:func:`__syncthreads()` (see:
:ref:`synchronization_functions`) within the same kernel invocation. For threads
belonging to different blocks, use global memory with two separate
kernel invocations. It is recommended to avoid the latter approach as it adds
overhead.
* Use :ref:`asynchronous calls and streams <asynchronous_how-to>`
* Assign serial workloads to the host
* Assign parallel workloads to the devices
For parallel workloads:
* Use :cpp:func:`__syncthreads()` (see :ref:`synchronization_functions`) for
intra-block synchronization
* Use global memory with separate kernel invocations for inter-block
synchronization (has overhead, minimize when possible)
Device level
--------------------------------------------------------------------------------
------------
Device level optimization primarily involves maximizing parallel execution
across the multiprocessors on the device. You can achieve device level
optimization by executing multiple kernels concurrently on a device. To enhance
performance, the management of these kernels is facilitated by streams, which
allows overlapping of computation and data transfers. This approach aims at
keeping all multiprocessors busy by executing enough kernels concurrently.
However, launching too many kernels can lead to resource contention, hence a
balance must be found for optimal performance. The device level optimization
helps in achieving maximum utilization of the device resources.
Maximize parallel execution across multiprocessors:
* Execute multiple kernels concurrently on a device
* Use streams to overlap computation and data transfers
* Keep all multiprocessors busy with enough concurrent kernels
* Avoid launching too many kernels (causes resource contention)
Multiprocessor level
--------------------------------------------------------------------------------
--------------------
Multiprocessor level optimization involves maximizing parallel execution within
each multiprocessor on a device. The key to multiprocessor level optimization
is to efficiently utilize the various functional units within a multiprocessor.
For example, ensuring a sufficient number of resident warps, so that every clock
cycle has an instruction from a warp is ready for execution. This instruction
could either be another independent instruction of the same warp, which exploits
:ref:`instruction level optimization <instruction optimization>`, or more
commonly an instruction of another warp, which exploits thread-level parallelism.
Maximize parallel execution within each multiprocessor:
On the other hand, device level optimization focuses on the device as a whole,
aiming at keeping all multiprocessors busy by executing enough kernels
concurrently. Both multiprocessor and device levels of optimization are crucial
for achieving maximum performance. They work together to ensure efficient
utilization of the GPU resources, ranging from individual multiprocessors to the
device as a whole.
* Ensure sufficient resident warps for every clock cycle
* Exploit instruction-level parallelism within warps
* Exploit thread-level parallelism across warps
* Balance resource usage for optimal occupancy
.. _memory optimization:
Memory throughput optimization
================================================================================
==============================
The first step in maximizing memory throughput is to minimize low-bandwidth
data transfers between the host and the device.
Additionally, maximize the use of on-chip memory, that is, shared memory and
caches, and minimize transfers with global memory. Shared memory acts as a
user-managed cache explicitly allocated and accessed by the application. A
common programming pattern is to stage data from device memory into shared
memory. The staging of data from the device to shared memory involves the
following steps:
1. Each thread of a block loading data from device memory to shared memory.
2. Synchronizing with all other threads of the block.
3. Processing the data stored in shared memory.
4. Synchronizing again if necessary.
5. Writing the results back to the device global memory.
For some applications, a traditional hardware-managed cache is more appropriate
for exploiting data locality.
In conclusion, the throughput of memory accesses by a kernel can vary
significantly depending on the access pattern. Therefore, the next step in
maximizing memory throughput is to organize memory accesses as optimally as
possible. This is especially important for global memory accesses, as global
memory bandwidth is low compared to available on-chip bandwidths and arithmetic
instruction throughput. Thus, non-optimal global memory accesses generally have
a high impact on performance.
The memory throughput optimization techniques are further discussed in detail in
the following sections.
Additionally, maximize the use of on-chip memory (shared memory and caches) and
minimize transfers with global memory.
.. _data transfer:
Data transfer
--------------------------------------------------------------------------------
Data transfer optimization
--------------------------
To minimize data transfers between the host and the device, applications should
move more computations from the host to the device, even at the cost of running
kernels that don't fully utilize parallelism for the device. Intermediate data
structures should be created, used, and discarded in device memory without being
mapped or copied to host memory.
**Minimize host-device transfers**
Batching small transfers into a single large transfer can improve performance
due to the overhead associated with each transfer. On systems with a front-side
bus, using page-locked host memory can enhance data transfer performance.
* Move computations from host to device when possible
* Create, use, and discard intermediate data structures on device
* Avoid unnecessary copies to host memory
When using mapped page-locked memory, there is no need to allocate device
memory or explicitly copy data between device and host memory. Data transfers
occur implicitly each time the kernel accesses the mapped memory. For optimal
performance, these memory accesses should be coalesced, similar to global
memory accesses. The process where threads in a warp access sequential memory
locations is known as coalesced memory access, which can enhance memory data
transfer efficiency.
**Batch small transfers**
On integrated systems where device and host memory are physically the same, no
copy operation between host and device memory is required and hence mapped
page-locked memory should be used instead. To check if the device is integrated,
applications can query the integrated device property.
Each memory transfer incurs a fixed overhead from driver calls and PCIe
transaction setup. Consolidating many small transfers into a single large
transfer amortizes this overhead across more data, resulting in much higher
effective bandwidth.
.. code-block:: cuda
// Instead of many small transfers
for (int i = 0; i < n; i++) {
hipMemcpy(&d_data[i], &h_data[i], sizeof(float), ...);
}
// Use a single large transfer
hipMemcpy(d_data, h_data, n * sizeof(float), ...);
**Use page-locked memory for transfers**
Page-locked (pinned) memory cannot be swapped to disk by the operating system,
allowing the GPU to access it directly via DMA without CPU involvement. This
eliminates an extra copy through a staging buffer and achieves higher bandwidth.
.. code-block:: cuda
float* h_pinned;
hipHostMalloc(&h_pinned, size);
// Faster transfers than pageable memory
hipMemcpy(d_data, h_pinned, size, hipMemcpyHostToDevice);
**Use mapped memory on integrated systems**
On integrated GPUs (APUs), the CPU and GPU share the same physical memory.
Mapped page-locked memory allows zero-copy access, where the GPU reads directly
from host memory without requiring an explicit transfer, eliminating redundant copies.
.. code-block:: cuda
int integrated;
hipDeviceGetAttribute(&integrated, hipDeviceAttributeIntegrated, device);
if (integrated) {
// Use mapped page-locked memory - no explicit copy needed
hipHostMalloc(&ptr, size, hipHostMallocMapped);
}
.. _device memory access:
Device memory access
---------------------
--------------------
Memory access instructions might be repeated due to the spread of memory
addresses across warp threads. The impact on throughput varies with memory type
and is generally reduced when addresses are more scattered, especially in
global memory.
**Ensure proper alignment**
Device memory is accessed via 32-, 64-, or 128-byte transactions that must be
naturally aligned.
Maximizing memory throughput involves:
Memory hardware loads data in aligned chunks (typically 128 bytes). Using
naturally aligned data types ensures each access maps to a single memory
transaction, maximizing bandwidth and avoiding split transactions.
- Coalescing memory accesses of threads within a warp into minimal transactions.
- Following optimal access patterns.
- Using properly sized and aligned data types.
- Padding data when necessary.
.. code-block:: cuda
Global memory instructions support reading or writing data of specific sizes (1,
2, 4, 8, or 16 bytes) that are naturally aligned. Not meeting the size and
alignment requirements leads to multiple instructions, which reduces
performance. Therefore, for correct results and optimal performance:
// Use naturally aligned types
float4 data; // 16-byte aligned
float2 data; // 8-byte aligned
// Ensure structure alignment
struct __align__(16) MyStruct {
float4 data;
};
- Use data types that meet these requirements
- Ensure alignment for structures
- Maintain alignment for all values or arrays.
**Optimize 2D array access**
Threads often access 2D arrays at an address calculated as
``BaseAddress + xIndex + width * yIndex``. For efficient memory access, the
array and thread block widths should be multiples of the warp size. If the
array width is not a multiple of the warp size, it is usually more efficient to
allocate the array with a width rounded up to the nearest multiple and pad the
rows accordingly.
Padding 2D arrays to multiples of the wavefront size ensures each row starts
at an aligned memory boundary. This allows consecutive threads accessing the
same row to generate coalesced memory transactions, thereby maximizing
bandwidth.
Local memory is used for certain automatic variables, such as arrays with
non-constant indices, large structures of arrays, and any variable where the
kernel uses more registers than available. Local memory resides in device
memory, which leads to high latency and low bandwidth, similar to global memory
accesses. However, the local memory is organized for consecutive 32-bit words to
be accessed by consecutive thread IDs, which allows full coalescing when all
threads in a warp access the same relative address.
.. code-block:: cuda
Shared memory is located on-chip and provides higher bandwidth and lower latency
than local or global memory. It is divided into banks that can be simultaneously
accessed, which boosts bandwidth. However, bank conflicts, where two addresses
fall in the same bank, lead to serialized access and decreased throughput.
Therefore, understanding how memory addresses map to banks and scheduling
requests to minimize conflicts is crucial for optimal performance.
// Ensure array width is multiple of warp size
int width = ((actual_width + warpSize - 1) / warpSize) * warpSize;
hipMalloc(&array, width * height * sizeof(float));
// Access pattern
int idx = x + width * y; // width should be warp-aligned
Constant memory is in the device memory and cached in the constant cache.
Requests are split based on different memory addresses and are serviced based
either on the throughput of the constant cache for cache hits or on the
throughput of the device memory otherwise. This splitting of requests affects
throughput.
**Coalesce memory accesses**
Texture and surface memory are stored in the device memory and cached in the
texture cache. This setup optimizes 2D spatial locality, which leads to better
performance for threads reading close 2D addresses.
Reading device memory through texture or surface fetching provides the following
advantages:
When consecutive threads in a wavefront access consecutive memory addresses,
the hardware combines these into a single wide transaction. Non-coalesced
patterns require multiple transactions, reducing effective bandwidth.
- Higher bandwidth for local texture fetches or surface reads.
- Offloading addressing calculation.
- Data broadcasting.
- Optional conversion of 8-bit and 16-bit integer input data to 32-bit
floating-point values on the fly.
.. code-block:: cuda
// Good: consecutive threads access consecutive addresses
int idx = threadIdx.x + blockIdx.x * blockDim.x;
data[idx] = value;
// Bad: strided access
int idx = threadIdx.x * stride; // Non-coalesced if stride > 1
data[idx] = value;
For understanding memory coalescing theory, see :ref:`memory_hierarchy_theory`.
**Use shared memory for data reuse**
Shared memory (LDS) provides low-latency on-chip storage shared across threads
in a block. Loading data into shared memory once and reusing it many times
reduces global memory traffic, particularly effective for tiled algorithms such
as matrix multiplication.
.. code-block:: cuda
__global__ void optimized_kernel(float* input, float* output) {
__shared__ float tile[TILE_SIZE][TILE_SIZE];
// Load data into shared memory
tile[threadIdx.y][threadIdx.x] = input[...];
__syncthreads();
// Reuse data from fast shared memory
float result = 0;
for (int i = 0; i < TILE_SIZE; i++) {
result += tile[threadIdx.y][i] * tile[i][threadIdx.x];
}
__syncthreads();
output[...] = result;
}
**Avoid bank conflicts in shared memory**
Shared memory is organized into banks, each capable of servicing one request per
cycle. When multiple threads in a warp access the same bank simultaneously, the
requests are serialized, reducing throughput. Padding arrays by one element
shifts addresses to avoid systematic conflicts.
.. code-block:: cuda
// Bad: power-of-2 stride causes conflicts
__shared__ float data[32][32];
float value = data[threadIdx.x][threadIdx.y];
// Good: padding avoids conflicts
__shared__ float data[32][33]; // Extra column
float value = data[threadIdx.x][threadIdx.y];
For bank conflict theory, see :ref:`bank_conflicts_theory`.
**Use texture memory for 2D spatial access**
Texture memory provides hardware-accelerated 2D filtering and caching optimized
for spatial locality. It automatically handles boundary conditions and can
interpolate values, making it ideal for image processing and nearby-neighbor access patterns.
.. code-block:: cuda
// Create texture object
hipTextureObject_t texObj;
hipCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
// Access in kernel
float value = tex2D<float>(texObj, x, y);
.. _instruction optimization:
Optimization for maximum instruction throughput
================================================================================
To maximize instruction throughput:
- Minimize low throughput arithmetic instructions.
- Minimize divergent warps inflicted by flow control instructions.
- Maximize instruction parallelism.
These techniques are discussed in detail in the following sections.
Instruction throughput optimization
====================================
Arithmetic instructions
--------------------------------------------------------------------------------
-----------------------
The type and complexity of arithmetic operations can significantly impact the
performance of your application. We are highlighting some hints how to maximize
it.
**Use efficient operations**
Use efficient operations: Some arithmetic operations are costlier than others.
For example, multiplication is typically faster than division, and integer
operations are usually faster than floating-point operations, especially with
double precision.
Division requires many more hardware cycles than multiplication. Similarly,
bitwise operations (shifts, AND, OR) are single-cycle instructions on integer
units, making them far more efficient than equivalent arithmetic for power-of-two calculations.
Minimize low-throughput instructions: This might involve trading precision for
speed when it does not affect the final result. For instance, consider using
single-precision arithmetic instead of double-precision.
.. code-block:: cuda
Leverage intrinsic functions: Intrinsic functions are predefined functions
available in HIP that can often be executed faster than equivalent arithmetic
operations (subject to some input or accuracy restrictions). They can help
optimize performance by replacing more complex arithmetic operations.
// Prefer multiplication over division
float result = value * 0.5f; // Fast
float result = value / 2.0f; // Slower
// Use bitwise operations for powers of 2
int index = threadIdx.x << 2; // Multiply by 4
int mask = (1 << n) - 1; // Create bit mask
Optimize memory access: The memory access efficiency can impact the speed of
arithmetic operations. See: :ref:`device memory access`.
**Use single-precision when possible**
AMD GPUs have significantly higher throughput for single-precision (FP32)
operations compared to double-precision (FP64). Using single-precision math
functions can deliver substantial performance gains when FP64 accuracy is not required.
.. code-block:: cuda
// Single-precision (faster)
float result = sinf(x);
float result = expf(x);
// Double-precision (slower, use only when necessary)
double result = sin(x);
double result = exp(x);
**Leverage fast math intrinsics**
Hardware-specific intrinsics bypass certain accuracy checks and use lookup
tables or polynomial approximations, trading slight precision loss for
significantly higher throughput. These should be used when the application can
tolerate reduced precision.
.. code-block:: cuda
// Fast intrinsic versions
float ex = __expf(x); // Fast exponential
float lg = __logf(x); // Fast logarithm
float sq = __fsqrt_rn(x); // Fast square root
float rc = __frcp_rn(x); // Fast reciprocal
.. _control flow instructions:
Control flow instructions
--------------------------------------------------------------------------------
Control flow optimization
-------------------------
Control flow instructions (``if``, ``else``, ``for``, ``do``, ``while``,
``break``, ``continue``, ``switch``) can impact instruction throughput by
causing threads within a warp to diverge and follow different execution paths.
To optimize performance, write control conditions to minimize divergent warps.
For example, when the control condition depends on ``threadIdx`` or ``warpSize``,
warp doesn't diverge. The compiler might optimize loops, short ifs, or switch
blocks using branch predication, which prevents warp divergence. With branch
predication, instructions associated with a false predicate are scheduled but
not executed, which avoids unnecessary operations. For control conditions where
one outcome is significantly more likely than the other, use `__builtin_expect <https://clang.llvm.org/docs/LanguageExtensions.html#builtin-expect>`_
or ``[[likely]]`` to indicate the likely condition result.
**Minimize divergence**
Avoiding divergent warps
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
When threads in a wavefront take different execution paths, the hardware
serializes both branches, executing each path with only the relevant threads
active. This reduces effective parallelism and wastes cycles on inactive threads.
Warps diverge when threads within the same warp follow different execution paths.
This is caused by conditional statements that lead to different arithmetic
operations being performed by different threads. Divergent warps can
significantly reduce instruction throughput, so it is advisable to structure
your code to minimize divergence.
.. code-block:: cuda
// Good: no divergence (condition depends on threadIdx)
if (threadIdx.x < 32) {
// All threads in first half-warp execute
}
// Bad: divergence within warp
if (data[threadIdx.x] > threshold) {
// Some threads execute, others don't
}
**Use branch hints for predictable conditions**
Providing hints about branch likelihood helps the compiler generate better
instruction ordering and can improve the branch predictor's accuracy, reducing
pipeline stalls when the prediction proves correct.
.. code-block:: cuda
if (__builtin_expect(rare_condition, 0)) {
// Unlikely branch
}
// C++20 attribute
if (common_condition) [[likely]] {
// Likely branch
}
**Avoid divergent warps**
When divergence is unavoidable, restructure the code to separate divergent paths
into different kernel launches or use predication (branchless programming) to
keep all threads active, though computing unnecessary values may be acceptable
if it avoids the serialization penalty.
.. code-block:: cuda
// Instead of:
if (threadIdx.x % 2 == 0) {
result = compute_even();
} else {
result = compute_odd();
}
// Consider separating into different kernels or using predication
Synchronization
--------------------------------------------------------------------------------
---------------
Synchronization ensures that all threads within a block complete their
computations and memory accesses before moving forward, which is critical when
threads depend on other thread results. However, synchronization can also cause
performance overhead, as it needs the threads to wait, which might lead to idle
GPU resources.
**Use minimal synchronization**
To synchronize all threads in a block, use :cpp:func:`__syncthreads()`.
:cpp:func:`__syncthreads()` ensures that, all threads reach the same point in
the code and can access shared memory after reaching that point.
Each synchronization point stalls all threads in a block until the slowest one
reaches the barrier. Minimize synchronizations by carefully analyzing data
dependencies—only synchronize when threads genuinely need to exchange data
through shared memory.
An alternative way to synchronize is to use streams. Different streams can
execute commands either without following a specific order or concurrently. This
is why streams allow more fine-grained control over the execution order of
commands, which can be beneficial in certain scenarios.
.. code-block:: cuda
__global__ void kernel() {
__shared__ float data[256];
// Load phase
data[threadIdx.x] = input[...];
__syncthreads(); // Necessary sync
// Compute phase - no sync needed if threads are independent
float result = compute(data[...]);
// Store phase - sync only if needed
output[...] = result;
}
**Use streams for async execution**
Streams enable concurrent execution of independent operations. Commands in
different streams can overlap in time, allowing kernel execution and memory
transfers to run simultaneously. This maximizes GPU utilization by keeping
multiple execution engines busy concurrently.
.. code-block:: cuda
hipStream_t stream1, stream2;
hipStreamCreate(&stream1);
hipStreamCreate(&stream2);
// Overlap independent operations
kernel1<<<grid, block, 0, stream1>>>(...);
kernel2<<<grid, block, 0, stream2>>>(...);
hipStreamSynchronize(stream1);
hipStreamSynchronize(stream2);
Managing register pressure
==========================
High register usage can limit occupancy. Follow these steps:
**Minimize live variables**
The compiler allocates registers for every variable that must remain accessible.
Reducing the number of simultaneously live variables frees registers, allowing
more wavefronts to fit on each CU. Chaining function calls trades some redundant
computation for lower register usage.
.. code-block:: cuda
// Instead of storing all intermediate results
float a = compute_a();
float b = compute_b();
float c = compute_c();
float result = combine(a, b, c);
// Recompute or chain operations
float result = combine(compute_a(), compute_b(), compute_c());
**Use shared memory for temporary storage**
Per-thread arrays stored in registers consume valuable register space, limiting
occupancy. Moving temporary storage to shared memory trades register usage for
shared memory usage, often allowing higher occupancy since shared memory limits
are typically less restrictive.
.. code-block:: cuda
// Instead of per-thread arrays (uses registers)
float temp[100];
// Use shared memory
__shared__ float temp[blockDim.x][100];
float* my_temp = temp[threadIdx.x];
**Adjust launch bounds**
The ``__launch_bounds__`` attribute provides hints to the compiler about expected
thread block size and minimum blocks per CU. This guides register allocation
decisions, potentially trading per-thread register count for higher occupancy.
.. code-block:: cuda
__global__ void
__launch_bounds__(256, 4) // 256 threads, 4 blocks per CU
my_kernel() {
// Kernel code
}
**Check register usage during compilation**
The compiler can report per-kernel register usage statistics. Monitoring this
output helps identify kernels consuming excessive registers, guiding optimization
efforts toward reducing register pressure in the most impactful areas.
.. code-block:: bash
hipcc --resource-usage kernel.hip
For register pressure theory, see :ref:`register_pressure_theory`.
Improving occupancy
===================
Higher occupancy helps hide latency. Follow these steps:
**Reduce register usage per thread**
Use techniques from "Managing register pressure" above.
**Reduce shared memory usage per block**
Each CU has limited shared memory that must be divided among resident blocks.
Reducing per-block shared memory usage allows more blocks to reside simultaneously,
increasing occupancy and improving latency hiding through greater thread-level parallelism.
.. code-block:: cuda
// Allocate only what's needed
__shared__ float tile[TILE_SIZE][TILE_SIZE];
// Or use dynamic allocation
extern __shared__ float dynamic_shared[];
**Optimize block size**
AMD GPUs execute threads in wavefronts of 64. Choosing block sizes as multiples
of 64 prevents partial wavefronts that waste execution slots. Larger blocks
(128-256 threads) typically achieve better occupancy and resource utilization.
.. code-block:: cuda
// Use multiples of wavefront size
dim3 block(64); // Good for AMD GPUs (wavefront=64)
dim3 block(128); // Common choice
dim3 block(256); // Good for high-occupancy kernels
// Avoid very small blocks
dim3 block(32); // May waste resources
**Profile occupancy**
Profiling tools report the ratio of active wavefronts to maximum possible
wavefronts per CU. Low occupancy suggests resource constraints (registers or
shared memory) are limiting parallelism and may indicate opportunities for optimization.
.. code-block:: bash
rocprofv3 --occupancy ./your_application
For occupancy theory, see :ref:`occupancy`.
Minimizing memory thrashing
================================================================================
============================
Applications frequently allocating and freeing memory might experience slower
allocation calls over time as memory is released back to the operating system.
To optimize performance in such scenarios, follow these guidelines:
allocation calls over time. To optimize:
- Avoid allocating all available memory with :cpp:func:`hipMalloc` or
:cpp:func:`hipHostMalloc`, as this immediately reserves memory and might
prevent other applications from using it. This behavior could strain the
operating system schedulers or prevent other applications from running on the
same GPU.
- Try to allocate memory in suitably sized blocks early in the application's
lifecycle and deallocate only when the application no longer needs it.
Minimize the number of :cpp:func:`hipMalloc` and :cpp:func:`hipFree` calls in
your application, particularly in performance-critical areas.
- Consider resorting to other memory types such as :cpp:func:`hipHostMalloc` or
:cpp:func:`hipMallocManaged`, if an application can't allocate sufficient
device memory. While the other memory types might not offer similar
performance, they allow the application to continue running.
- For supported platforms, use :cpp:func:`hipMallocManaged`, as it allows
oversubscription. With the right policies, :cpp:func:`hipMallocManaged` can
maintain most, if not all, :cpp:func:`hipMalloc` performance.
:cpp:func:`hipMallocManaged` doesn't require an allocation to be resident
until it is needed or prefetched, which eases the load on the operating
system's schedulers and facilitates multitenant scenarios.
**Allocate early, deallocate late**
Frequent allocation and deallocation causes memory fragmentation and increases
allocator overhead. Reusing allocations across iterations amortizes the cost
of memory management and maintains better memory locality.
.. code-block:: cuda
// Bad: frequent allocation in loop
for (int i = 0; i < iterations; i++) {
float* temp;
hipMalloc(&temp, size);
// Use temp
hipFree(temp);
}
// Good: allocate once
float* temp;
hipMalloc(&temp, size);
for (int i = 0; i < iterations; i++) {
// Reuse temp
}
hipFree(temp);
**Avoid allocating all available memory**
Reserving some memory headroom prevents allocation failures and system instability.
The driver and runtime need workspace for internal operations, and leaving a
safety margin ensures stable operation without unexpected out-of-memory errors.
.. code-block:: cuda
size_t free, total;
hipMemGetInfo(&free, &total);
// Don't allocate all free memory
size_t safe_size = free * 0.9; // Leave some margin
**Use managed memory for oversubscription**
Managed memory automatically migrates data between host and device on demand,
allowing allocations larger than physical GPU memory. Prefetching hints help
the runtime optimize page placement, reducing migration overhead during kernel execution.
.. code-block:: cuda
// Allows exceeding physical memory
float* data;
hipMallocManaged(&data, large_size);
// Optionally prefetch to device
hipMemPrefetchAsync(data, size, device, stream);
Summary
=======
Key optimization techniques:
* **Profile first**: Use ``rocprofv3`` to identify actual bottlenecks
* **Parallelize effectively**: Maximize work at all levels (application, device, CU)
* **Optimize memory**: Minimize transfers, maximize coalescing, use LDS
* **Manage resources**: Balance registers, shared memory, and occupancy
* **Minimize divergence**: Structure control flow to keep warps coherent
For understanding the theory behind these techniques, refer to
:doc:`../understand/performance_optimization` and :doc:`../understand/hardware_implementation`.
@@ -30,6 +30,7 @@ The HIP documentation is organized into the following categories:
:::{grid-item-card} Programming guide
* {doc}`./understand/programming_model`
* {doc}`./understand/performance_optimization`
* {doc}`./understand/hardware_implementation`
* {doc}`./understand/compilers`
* {doc}`./how-to/performance_guidelines`
@@ -26,6 +26,7 @@ subtrees:
- caption: Programming guide
entries:
- file: understand/programming_model
- file: understand/performance_optimization
- file: understand/hardware_implementation
- file: understand/compilers
- file: how-to/performance_guidelines
@@ -1,24 +1,110 @@
# Glossary of terms
* **host**, **host CPU** : Executes the HIP runtime API and is capable of initiating kernel launches to one or more devices.
* **default device** : Each host thread maintains a default device.
Most HIP runtime APIs (including memory allocation, copy commands, kernel launches) do not accept an explicit device
argument but instead implicitly use the default device.
The default device can be set with `hipSetDevice`.
## Host and device
* **active host thread** - the thread which is running the HIP APIs.
* **host**, **host CPU**: Executes the HIP runtime API and is capable of initiating kernel launches to one or more devices.
* **device**: A GPU or accelerator that executes HIP kernels. In the context of HIP, this typically refers to AMD GPUs or NVIDIA GPUs when using HIP on CUDA.
* **default device**: Each host thread maintains a default device. Most HIP runtime APIs (including memory allocation, copy commands, kernel launches) do not accept an explicit device argument but instead implicitly use the default device. The default device can be set with `hipSetDevice`.
* **active host thread**: The thread which is running the HIP APIs.
* **HIP-Clang** - Heterogeneous AMDGPU Compiler, with its capability to compile HIP programs on AMD platform (https://github.com/RadeonOpenCompute/llvm-project).
## Architecture and hardware
* **clr** - a repository for AMD Compute Language Runtime, contains source codes for AMD's compute languages runtimes: HIP and OpenCL.
clr (https://github.com/ROCm/clr) contains the following three parts,
* **AMD device architecture**: The organization and execution model of AMD GPUs, defining how computation happens inside the hardware through programmable compute units organized into shader engines.
* **compute unit (CU)**: The fundamental programmable engine of AMD GPUs. Each CU manages thousands of lightweight threads, orchestrating their execution, memory access, and synchronization. A CU consists of SIMD units, scalar units, register files, LDS, and caches.
* **GPU cores**: The primary arithmetic engines within compute units, including SIMD lanes and matrix fused multiply-add (MFMA) units that execute mathematical and logical operations.
* **SIMD (Single Instruction, Multiple Data)**: Hardware units that execute the same instruction across multiple data elements simultaneously. AMD GPUs typically have four SIMD units per CU.
* **VALU (Vector Arithmetic Logic Unit)**: Executes vector instructions across entire wavefronts, with each thread potentially operating on different data.
* **SALU (Scalar Arithmetic Logic Unit)**: Executes instructions uniformly across all threads in a wavefront, handling control flow and wavefront-uniform operations.
* **SFU (Special Function Unit)**: Accelerates transcendental and reciprocal mathematical functions like exp, log, sin, cos, rcp, and rsqrt.
* **LSU (Load/Store Unit)**: Handles data transfers between compute units and GPU memory subsystems, managing thousands of outstanding memory requests.
* **MFMA (Matrix Fused Multiply-Add)**: Specialized hardware units in CDNA architectures that perform large-scale matrix operations in a single instruction, providing the primary source of peak floating-point performance.
* **DME (Data Movement Engine)**: Specialized hardware units in CDNA3/4 that accelerate access to multi-dimensional tensor data, performing high-throughput copies between global memory and on-chip memory.
* **shader engine**: Top-level organizational unit in AMD GPUs containing multiple shader arrays and shared resources.
* **GFX IP**: Graphics IP version identifier (like gfx908, gfx90a, gfx942) that specifies the precise features, register layout, and machine instruction format a GPU supports.
* **CDNA**: Compute DNA architecture specialized for high-performance computing and machine learning workloads, featuring matrix cores and enhanced memory bandwidth.
* **RDNA**: Radeon DNA architecture optimized for graphics and lower-latency compute workloads, featuring Wave32 execution and work group processors.
* **GCN**: Graphics Core Next, the foundational architecture for modern AMD GPUs that established key design principles still used today.
* `hipamd`: contains implementation of HIP language on AMD platform.
* `rocclr`: contains common runtime used in HIP and OpenCL, which provides virtual device interfaces that compute runtimes interact with different backends such as ROCr on Linux or PAL on Windows.
* `opencl`: contains implementation of OpenCL on AMD platform.
## Memory hierarchy
* **hipify tools** - tools to convert CUDA code to portable C++ code (https://github.com/ROCm/HIPIFY).
* **VGPR (Vector General-Purpose Registers)**: Per-thread registers that hold data processed by SIMD lanes, such as individual elements of matrices or vectors.
* **SGPR (Scalar General-Purpose Registers)**: Registers holding values shared across an entire wavefront, such as loop counters, constants, or addresses.
* **AGPR (Accumulation VGPRs)**: Additional register file space in CDNA architectures dedicated to matrix accumulation, doubling available register storage for matrix operations.
* **register file**: Primary on-chip memory that holds data between arithmetic and memory operations, built from extremely fast SRAM.
* **LDS (Local Data Share)**: Fast on-chip scratchpad memory shared among threads in a workgroup, providing low-latency communication within a block.
* **L1 data cache**: Private on-chip memory associated with each compute unit, providing fast access to recently used data.
* **global memory**: General read-write accessible memory visible to all threads on a device, backed by high-bandwidth memory (HBM).
* **constant memory**: Read-only storage visible to all threads, optimized for uniform access patterns across a wavefront.
* **texture memory**: Special read-only memory optimized for spatial locality and 2D/3D access patterns with filtering capabilities.
* **surface memory**: Read-write version of texture memory.
* **HBM (High Bandwidth Memory)**: Vertically stacked DRAM technology providing terabytes per second of memory bandwidth in modern GPUs.
* **memory coalescing**: Hardware optimization that combines memory accesses from multiple threads into fewer transactions when accessing consecutive addresses.
* **bank conflict**: Performance penalty when multiple threads access different addresses in the same LDS bank, causing serialization.
* **`hipconfig`** - tool to report various configuration properties of the target platform.
## Execution model
* **`nvcc`** - NVIDIA CUDA `nvcc` compiler, do not capitalize.
* **work-item**: A single instance of a kernel, representing one thread of parallel execution.
* **wavefront**: A group of threads (typically 32 or 64) that execute the same instruction simultaneously in lockstep on a SIMD unit.
* **work-group**: A collection of wavefronts that can synchronize and share LDS memory, executing on the same compute unit.
* **grid**: The total collection of work-groups launched for a kernel, defining the problem size.
* **HIP kernel**: A function marked with `__global__` that executes on the GPU device in parallel across many threads.
* **HIP thread hierarchy**: The three-level organization of threads (thread, block, grid) that defines parallel execution scope.
* **warp size**: The number of threads in a wavefront - typically 64 for AMD CDNA/GCN architectures and 32 for RDNA architectures.
* **occupancy**: The ratio of active wavefronts to maximum possible wavefronts on a compute unit, affecting latency hiding ability.
* **wavefront scheduler**: Hardware component that decides which wavefront to execute each clock cycle, enabling rapid context switching.
* **wavefront divergence**: Performance penalty when threads within a wavefront take different execution paths due to conditional statements.
* **branch efficiency**: The ratio of non-divergent to total branches, indicating control flow uniformity.
## Performance concepts
* **performance bottleneck**: The limiting factor preventing higher kernel performance, typically either compute-bound or memory-bound.
* **roofline model**: Visual performance analysis framework relating achievable performance to hardware limits based on arithmetic intensity.
* **compute-bound**: Kernel performance limited by arithmetic throughput rather than memory bandwidth.
* **memory-bound**: Kernel performance limited by memory bandwidth rather than compute capacity.
* **arithmetic intensity**: Ratio of floating-point operations to memory traffic (FLOPs/byte), determining compute vs memory boundedness.
* **overhead**: Fixed costs associated with operations like kernel launches, memory allocations, or synchronization.
* **latency hiding**: GPU technique of switching between wavefronts to mask memory and instruction latency through massive multithreading.
* **memory bandwidth**: Rate of data transfer between GPU compute units and memory, measured in GB/s or TB/s.
* **arithmetic bandwidth**: Rate of arithmetic operations, measured in FLOPS (floating-point operations per second).
* **active cycle**: Percentage of cycles where at least one instruction is executing on a compute unit.
* **pipe utilization**: Percentage of execution cycles where the pipeline actively processes instructions.
* **peak rate**: Theoretical maximum performance of a GPU in FLOPS or bandwidth.
* **issue efficiency**: Ratio of issued instructions to maximum possible, indicating scheduling effectiveness.
* **CU utilization**: Percentage of compute units actively executing work.
* **register pressure**: Condition where high register usage limits occupancy and performance.
* **Little's Law**: Performance principle relating concurrency, latency, and throughput: Concurrency = Latency × Throughput.
## Software and tools
* **ROCm programming model**: The software model for programming AMD GPUs, including the HIP API and associated runtime.
* **ROCm software platform**: Complete software stack for AMD GPU computing, including drivers, runtime, compilers, libraries, and tools.
* **HIP C++**: C++ dialect for writing portable GPU code that can run on both AMD and NVIDIA platforms.
* **HIP runtime API**: Set of functions for managing GPU devices, memory, streams, and kernel execution.
* **HIP compiler driver**: Tool that compiles HIP source code into device binaries and host code.
* **HIP runtime compiler (hipRTC)**: API for compiling HIP kernels at runtime from source strings.
* **HIP-Clang**: Heterogeneous AMDGPU Compiler with capability to compile HIP programs on AMD platform.
* **clr**: Repository for AMD Compute Language Runtime, containing source codes for HIP and OpenCL runtimes.
* `hipamd`: Implementation of HIP language on AMD platform
* `rocclr`: Common runtime providing virtual device interfaces for different backends
* `opencl`: Implementation of OpenCL on AMD platform
* **hipify tools**: Tools to convert CUDA code to portable C++ code.
* **hipconfig**: Tool to report various configuration properties of the target platform.
* **rocm-smi**: System management interface for monitoring and configuring AMD GPUs.
* **AMD uProf**: System-wide performance analysis tool for CPU and GPU profiling.
* **ROCm profiler**: Performance profiling tools for analyzing GPU kernel execution.
* **ROCm binary utilities**: Tools for inspecting and manipulating GPU binary code objects.
* **rocBLAS**: Optimized BLAS (Basic Linear Algebra Subprograms) library for AMD GPUs.
* **MIOpen**: Machine learning primitives library optimized for AMD GPUs.
* **AMDGPU assembly**: Low-level assembly language for AMD GPUs.
* **AMDGPU IR**: Intermediate representation used in the AMD GPU compilation pipeline.
* **nvcc**: NVIDIA CUDA compiler (do not capitalize).
## Execution states and synchronization
* **wavefront execution state**: Current status of a wavefront - active, ready, stalled, or sleeping.
* **active wavefront**: Wavefront currently executing on a SIMD unit.
* **stalled wavefront**: Wavefront waiting for a dependency like memory or synchronization.
* **stream**: FIFO queue of GPU commands that execute in order, enabling asynchronous execution.
* **event**: Synchronization primitive for coordinating execution between streams or with the host.
* **barrier**: Synchronization point where threads wait for others to reach before proceeding.
* **memory fence**: Operation ensuring memory consistency and ordering across threads.
@@ -1,6 +1,6 @@
.. meta::
:description: This chapter describes the typical hardware implementation of GPUs supported by HIP.
:keywords: AMD, ROCm, HIP, Hardware, Compute Unit, ALU, VALU, Cache, Registers, LDS
:description: This chapter describes the hardware implementation of AMD GPUs supported by HIP.
:keywords: AMD, ROCm, HIP, hardware, GPU, architecture, compute unit, VALU, SALU, cache, memory hierarchy, CDNA, RDNA, GCN
.. _hardware_implementation:
@@ -8,160 +8,701 @@
Hardware implementation
*******************************************************************************
This chapter describes the typical hardware implementation of GPUs supported by
HIP, and how the :ref:`inherent_thread_model` maps to the hardware.
This chapter describes the hardware architecture of AMD GPUs supported by HIP,
focusing on the internal organization and operation of GPU hardware components.
Understanding these hardware details helps you optimize GPU applications and
achieve maximum performance.
Compute units
=============
Overall GPU architecture
========================
The basic building block of a GPU is a compute unit (CU), also known
as streaming multiprocessor (SM) on NVIDIA GPUs. The thread blocks making up a
grid are scheduled for execution on CUs. Each block is assigned to an
individual CU, and a CU can accommodate several blocks. Depending on their
resource usage up to thousands of threads can reside on a CU.
AMD GPUs consist of interconnected blocks of digital circuits that work together
to execute complex parallel computing tasks. The architecture is organized
hierarchically to enable massive parallelism while managing resources efficiently.
CUs contain an array of processing elements, referred to as
vector ALU (VALU), that execute the actual instructions of the threads
according to the :ref:`SIMT model<programming_model_simt>`, together with the
necessary registers and caches.
Command processor and control
-----------------------------
The threads are executed in groupings called warps. The amount of threads
making up a warp is architecture dependent. On AMD GPUs the warp size is
commonly 64 threads, except in RDNA architectures which can utilize a warp size
of 32 or 64 respectively. The warp size of supported AMD GPUs is listed in the
:doc:`rocm:reference/gpu-arch-specs`. NVIDIA GPUs have a warp size of 32.
The command processor (CP) serves as the primary interface between the CPU and
GPU, receiving and distributing commands for execution. The CP consists of two
main components:
In contrast to CPUs, GPUs generally do not employ complex cache structures or
control logic, like branch prediction or out-of-order execution, but instead
rely on massive hardware multithreading to hide latency.
* **Command processor fetcher (CPF)**: Fetches commands from memory and passes
them to the CPC for processing.
* **Command processor packet processor (CPC)**: A microcontroller that decodes the
fetched commands and dispatches kernels to the workgroup processors for
scheduling.
Context switching between warps residing on a CU incurs no overhead,
as the context for the warps is stored on the CU and does not need to
be fetched from memory. If there are not enough free registers to accommodate
all warps of a block, the block can not be scheduled to that CU and it
has to wait until other blocks finish execution.
The command processor handles several types of operations:
The amount of warps that can reside concurrently on a CU, known
as occupancy, is determined by the warp's resource usage of registers and
shared memory.
* Kernel launches, which are forwarded to asynchronous compute engines (ACEs)
* Memory transfers, which are delegated to direct memory access (DMA) engines
* Synchronization operations and memory fences
.. _gcn_cu:
**DMA engines** handle memory transfers between CPU and GPU memory without CPU
involvement after initialization. Most GPUs contain two DMA engines, enabling
concurrent bidirectional transfers to better utilize PCIe bandwidth. The DMA
engines fetch data in small chunks and can process transfers in parallel but
cannot handle multiple copy commands on the same engine simultaneously.
.. figure:: ../data/understand/hardware_implementation/compute_unit.svg
:alt: Diagram depicting the general structure of a compute unit of an AMD
GPU.
**Asynchronous compute engines (ACEs)** break down kernels into workgroups for
distribution to shader processor input (SPI) blocks. Multiple ACEs enable
concurrent kernel execution, with each ACE capable of dispatching one kernel
at a time. ACEs process commands from different queues asynchronously, enabling
overlap between different kernel executions and memory operations.
AMD Graphics Core Next (GCN) CU
Hierarchical organization
-------------------------
On AMD GCN GPUs the basic structure of a CU is:
The GPU organizes compute resources in a three-level hierarchy that enables
modular design and resource sharing:
* four Single Instruction Multiple Data units (SIMDs)
* a vector cache
* a local data share
* and a scalar unit
1. **Shader engines (SE)**: Top-level organizational units containing multiple
shader arrays and shared resources
2. **Shader arrays**: Groups of compute units sharing instruction and scalar
caches
3. **Compute units (CU)**: Basic execution units containing the ALUs and
registers for thread execution
SIMD
----
.. figure:: ../data/understand/hardware_implementation/selayout.png
:align: center
:alt: Diagram showing the hierarchical organization of compute units grouped
into shader engines on AMD GPUs
:width: 800
A SIMD consists of a VALU, that executes the instruction of a warp, together
with a register file, that provides the registers warps.
Hierarchical organization of compute units into shader engines
The size of the warp is inherently related to the width of the vector ALU of
the SIMD. On GCN compute units the width of the VALU is 16, so a warp can be
issued to a SIMD every 4 cycles. Since a CU has 4 SIMDs it issues one
warp per cycle. The instructions of a warp are effectively executed in
lock-step.
This hierarchical design allows different GPU configurations using the same
underlying architecture. For example, the R9 Fury X contains 16 shader arrays
with four CUs each, while the RX 480 contains 12 shader arrays with three CUs
each, but both use the same gfx803 chip design.
A SIMD always executes the same instruction for the whole VALU. If the control
flow of a warp diverges, the performance is decreased, as the results for the
threads that do not participate in that branch have to be masked out, and the
instructions of the other branch have to be executed in the same way. The best
performance can therefore be achieved when thread divergence is kept to a warp
level, i.e. when all threads in a warp take the same execution path.
Shader engine components
========================
Vector cache
------------
Shader engines group multiple compute units together with shared resources that
improve efficiency and reduce redundancy. Each shader engine contains several
key components shared across its compute units.
The usage of cache on a GPU differs from that on a CPU, as there is less cache
available per thread. Its main purpose is to coalesce memory accesses of the
warps in order to reduce the amount of accesses to device memory, and make that
memory available for other warps that currently reside on the compute unit, that
also need to load those values.
Workgroup manager (SPI)
-----------------------
Local data share
----------------
The workgroup manager, also called the shader processor input (SPI), bridges
the command processor and compute units. After the CP processes a kernel
dispatch, the SPI:
The local data share is memory that is accessible to all threads within a block.
Its latency and bandwidth is comparable to that of the vector cache. It can be
used to share memory between the threads in a block, or as a software managed
cache.
* Receives workgroups from the ACEs
* Schedules workgroups onto available compute units
* Initializes registers with kernel parameters
* Ensures all wavefronts of a workgroup execute on the same CU for
synchronization
* Monitors resource availability and queues workgroups when resources are
exhausted
Scalar Unit
-----------
The SPI tracks four critical resources that limit concurrent execution:
The scalar unit performs instructions that are uniform within a warp. It
thereby improves efficiency and reduces the pressure on the vector ALUs and the
vector register file.
* Wavefront slots (execution contexts)
* Vector general-purpose registers (VGPRs)
* Scalar general-purpose registers (SGPRs)
* Local data share (LDS) memory
.. _cdna3_cu:
Workgroup-to-CU mapping is non-deterministic and based on available resources.
You should not assume any specific mapping pattern, as the same kernel launched
multiple times can have different workgroup distributions.
CDNA architecture
=================
Scalar L1 data cache (sL1D)
---------------------------
The general structure of CUs stays mostly as it is in GCN
architectures. The most prominent change is the addition of matrix ALUs, which
can greatly improve the performance of algorithms involving matrix
multiply-accumulate operations for
:doc:`int8, float16, bfloat16 or float32<rocm:reference/precision-support>`.
The scalar L1 data cache serves scalar memory operations from multiple CUs
within a shader array. The sL1D is shared between CUs (3 CUs in the Graphics
Core Next (GCN) and MI100, 2 CUs in the MI200 series) and caches data that is
uniform across a wavefront, including:
.. figure:: ../data/understand/hardware_implementation/cdna3_cu.png
:alt: Block diagram showing the structure of a CDNA3 compute unit. It includes
Shader Cores, the Matrix Core Unit, a Local Data Share used for sharing
memory between threads in a block, an L1 Cache and a Scheduler. The
Shader Cores represent the vector ALUs and the Matrix Core Unit
represents the matrix ALUs. The Local Data Share is used as the shared
memory.
* Kernel arguments and pointers
* Grid and block dimensions
* Constants accessed uniformly across threads
* Data from ``__constant__`` memory when accessed uniformly
Block Diagram of a CDNA3 Compute Unit.
Unlike the vector L1 cache, the sL1D doesn't use a "hit-on-miss" approach,
meaning subsequent requests to the same pending cache line count as duplicated
misses rather than hits.
.. _rdna3_cu:
L1 instruction cache (L1I)
--------------------------
RDNA architecture
=================
The L1 instruction cache is a read-only cache shared between multiple CUs in a
shader array. Like the sL1D, it is backed by the L2 cache and doesn't use the
"hit-on-miss" approach. The L1I stores kernel instructions fetched by the
compute units, reducing instruction fetch latency and L2 cache pressure.
RDNA makes a fundamental change to CU design, by changing the
size of a warp to 32 threads. This is done by effectively combining two GCN5
SIMDs, creating a VALU of width 32, so that a whole warp can be issued in one
cycle. The CU is also replaced by the work group processor (WGP),
which encompasses two CUs. For backwards compatibility the WGP can
also run in wave64 mode, in which it issues a warp of size 64 in two cycles.
Compute unit architecture
=========================
It also adds an extra layer of cache to the WGP, shared by the CUs
within it. This cache is referred to as L1 cache, promoting the per-CU cache to
an L0 cache.
The compute unit (CU) is the fundamental execution block of AMD GPUs.
It's responsible for executing kernels through its various specialized components
and pipelines.
.. figure:: ../data/understand/hardware_implementation/rdna3_cu.png
:alt: Block diagram showing the structure of an RDNA3 Compute Unit. It
consists of four SIMD units, each including a vector and scalar register
file, with the corresponding scalar and vector ALUs. All four SIMDs
share a scalar and instruction cache, as well as the shared memory. Two
of the SIMD units each share an L0 cache.
.. figure:: ../data/understand/hardware_implementation/gcn_compute_unit.png
:align: center
:alt: Detailed diagram of an AMD CDNA compute unit showing internal
components and data flow
:width: 800
Block Diagram of an RDNA3 work group processor.
Internal architecture of an AMD CDNA compute unit
Shader engines
==============
Sequencer and scheduling
------------------------
For hardware implementation's sake, multiple CUs are grouped
together into a Shader Engine or Compute Engine, typically sharing some fixed
function units or memory subsystem resources.
The instruction sequencer (SQ) serves as the control center of each compute
unit, managing instruction flow through the execution pipelines. The sequencer
maintains wavefront state and coordinates instruction execution across different
functional units.
**Wavefront organization**: The sequencer organizes active wavefronts into four
pools, each containing slots for up to ten wavefronts (eight on the CDNA2 MI200
series). Each slot includes:
* Wavefront-level registers (program counter, execution mask, and others)
* Instruction buffer for prefetched instructions
* State information for scheduling decisions
This organization theoretically allows up to 40 concurrent wavefronts per CU,
though actual occupancy is typically limited by register and LDS usage.
**Instruction fetching**: The fetch arbiter selects one wavefront per cycle to
fetch instructions from memory, prioritizing the oldest wavefronts. Each CU can
fetch up to 32 bytes (4-8 instructions) per cycle.
**Instruction issuing**: The issue arbiter determines which instructions execute
each cycle, selecting wavefronts from one pool per cycle in round-robin fashion.
The arbiter can issue multiple instructions per cycle to different execution
units, with a theoretical maximum of five instructions per cycle:
* One VALU instruction
* One vector memory operation
* One SALU/scalar memory operation
* One LDS operation
* One branch operation
Instructions always issue at wavefront granularity, with all threads in the
wavefront executing the same instruction in lockstep. Context switching between
wavefronts occurs every cycle with zero overhead, as all wavefront contexts
remain resident on the CU.
Execution pipelines
-------------------
Each compute unit contains multiple specialized execution pipelines that process
different types of instructions in parallel, enabling efficient utilization of
the hardware resources.
Vector arithmetic logic unit (VALU)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The VALU executes vector instructions across entire wavefronts, with each thread
potentially operating on different data. The VALU consists of:
* **Four SIMD processors**: Each containing 16 single-precision ALUs (or
equivalent), for 64 total ALUs per CU
* **Vector register files**: 256-512 KiB of VGPR storage split across the four
SIMDs
* **Instruction buffers**: Storage for up to 8-10 wavefronts per SIMD
On architectures with 64-thread wavefronts and 16-instruction wide SIMD units,
executing one instruction takes four cycles (one cycle per 16 threads). The four
SIMD design ensures full utilization when sufficient wavefronts are available, as
a new instruction can issue to each SIMD every cycle.
The VALU serves as the primary arithmetic engine, executing the majority of
computation in GPU kernels. Data flows into these pipelines, undergoes arithmetic
transformation, and exits as results — with the goal of maximizing the number of
such transformations per clock cycle.
For CDNA architectures with matrix operations, the VALU also dispatches
matrix fused multiply-add (MFMA) instructions to specialized matrix units.
Scalar arithmetic logic unit (SALU)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The SALU executes instructions uniformly across all threads in a wavefront,
handling operations like:
* Control flow (branches, loops)
* Address calculations
* Loading kernel arguments and constants
* Managing wavefront-uniform values
The SALU includes:
* A scalar processor for arithmetic and logic operations
* 12.5 KiB of SGPR storage per CU
* A scalar memory (SMEM) unit for memory operations
Scalar operations reduce pressure on vector units and registers by handling
uniform computations efficiently.
Vector memory unit (VMEM)
^^^^^^^^^^^^^^^^^^^^^^^^^
The VMEM unit handles all vector memory operations, including loads, stores,
and atomic operations. Each thread supplies its own address and data, though
the hardware optimizes access through memory coalescing when threads access
nearby addresses. The VMEM unit connects to the vector L1 cache and implements
both address generation and coalescing logic.
Branch unit
^^^^^^^^^^^
The branch unit executes jumps and branches for control flow changes affecting
entire wavefronts. Note that the branch unit handles wavefront-level control
flow, not execution mask updates for thread divergence, which are handled
through predication.
Special function unit (SFU)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The special function units accelerate certain arithmetic operations that are too
complex or costly to implement purely within the standard vector ALUs.
SFUs are responsible for executing transcendental and reciprocal mathematical
functions — operations such as ``exp``, ``log``, ``sin``, ``cos``, ``rcp``
(reciprocal), and ``rsqrt`` (reciprocal square root). These are heavily used in
scientific, physics, and machine learning workloads, particularly in activation
functions like GELU, sigmoid, or softmax.
Each compute unit includes a set of specialized pipelines or transcendental
function units (TFUs) that handle these operations with dedicated hardware.
While their throughput is lower than that of the primary SIMD pipelines, they
enable these functions to execute efficiently without consuming general ALU
bandwidth.
From the compiler's perspective, these operations map to specific AMDGPU ISA
instructions, such as:
* ``v_exp_f32`` — compute exponential base e
* ``v_log_f32`` — compute natural logarithm
* ``v_sin_f32``, ``v_cos_f32`` — compute sine or cosine
* ``v_rsq_f32``, ``v_rcp_f32`` — compute reciprocal or reciprocal square root
In CDNA3-based GPUs (like MI300), SFU throughput and latency have been tuned for
deep learning primitives. For instance, exponentiation (``exp``) and logarithm
(``log``) functions are now pipelined to complete in a few cycles per lane,
allowing vectorized activation functions in large-scale matrix workloads to
execute without significant stalls.
For programmers targeting ROCm or HIP, these SFU-accelerated operations are
typically accessed through math intrinsics such as ``__expf``, ``__logf``, or
``__sinf``, which the compiler lowers to the corresponding AMDGPU ISA instructions
at compile time.
Load/store unit (LSU)
^^^^^^^^^^^^^^^^^^^^^
The load/store units handle the transfer of data between the compute units and
the GPU's memory subsystems. They are responsible for issuing, tracking, and
retiring memory operations — including loads from and stores to global memory,
local shared memory, and caches — for thousands of concurrent threads.
Each compute unit includes a set of LSUs tightly integrated with its vector and
scalar pipelines. These units handle memory instructions generated by active
wavefronts — such as ``buffer_load``, ``buffer_store``, and ``flat_load_dword``
— and route them through the GPU's hierarchical memory system.
The LSU's responsibilities include:
* Managing vector memory accesses for SIMD instructions
* Coordinating local data share (LDS) reads and writes
* Accessing the L0/L1 caches and forwarding requests to the L2 cache and HBM
* Handling synchronization and atomic operations between threads and workgroups
LSUs manage thousands of outstanding memory requests per GPU, dynamically
scheduling them to hide memory latency. While arithmetic pipelines continue
executing other wavefronts, the LSUs maintain queues of pending transactions
and reorder responses as data returns from memory.
On modern accelerators like MI300X, these LSUs achieve terabytes-per-second of
aggregate memory bandwidth, coordinating thousands of active threads performing
memory-intensive operations such as tensor loading, matrix tiling, and gradient
updates.
Matrix fused multiply-add (MFMA)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
CDNA architectures (MI100 and newer) include specialized matrix acceleration
units for high-throughput matrix operations. These units execute independently
from other VALU operations, allowing overlap between matrix and vector
computations. MFMA units support various data types including ``INT8``, ``FP16``,
``BF16``, and ``FP32``, with different throughput characteristics for each.
Matrix cores are GPU execution units that perform large-scale matrix operations
in a single instruction. In AMD architectures, these units are formally known as
MFMA (matrix fused multiply-add) units — the core hardware blocks responsible for
accelerating deep learning, HPC, and dense linear-algebra workloads on modern
Instinct GPUs.
Operating on entire tiles of matrices per instruction allows MFMA units to deliver
far greater arithmetic throughput and energy efficiency than scalar or vector ALUs.
Rather than fetching and decoding thousands of per-element multiply-add instructions,
each MFMA instruction processes an entire matrix fragment — drastically reducing
power per operation and increasing overall throughput.
An example MFMA instruction from the AMDGPU ISA is:
.. code-block:: none
v_mfma_f32_16x16x4f16 v[0:15], v[16:31], v[32:47], v[0:15]
This instruction performs a matrix multiplication and accumulation D=A×B+C,
where the fragments A, B, and C are stored in VGPRs. The suffix ``16x16x4f16``
indicates a tile size of 16×16, with an inner dimension of 4, operating on
half-precision (FP16) inputs and accumulating into 32-bit floating-point outputs.
Since their introduction in CDNA1, and further expanded in CDNA2 and CDNA3, AMD's
matrix cores have become the primary source of peak floating-point performance in
datacenter GPUs. For example, an MI300X accelerator achieves its multi-petaFLOP
throughput primarily through MFMA units.
The MFMA units use both standard VGPRs and additional accumulation VGPRs
(AGPRs) on supported architectures, providing up to 512 KiB of combined
register storage per CU.
Local data share (LDS)
----------------------
The local data share provides fast on-CU scratchpad memory for communication
between threads in a workgroup.
.. figure:: ../data/understand/hardware_implementation/lds.svg
:align: center
:alt: Diagram showing the organization of local data share with banks and
connections to SIMD units
:width: 800
Local data share organization and SIMD connections
**Organization**: The LDS contains 32 banks, each 4-bytes wide, providing
128 bytes/cycle total bandwidth. Banks can be accessed independently each cycle
for reads, writes, or atomic operations. The SIMDs connect to the LDS in pairs,
with each pair sharing a 64-byte bidirectional port.
**Access patterns**: A single wavefront can achieve up to 64 bytes/cycle
throughput (16 lanes per cycle). The actual bandwidth depends on data size and
access patterns:
* 4-byte values: 8 cycles for 64 threads (50% peak bandwidth)
* 16-byte values: 20 cycles for 64 threads (80% peak bandwidth)
**Conflict resolution**: The LDS includes hardware to detect and resolve bank
conflicts when multiple threads access different addresses in the same bank.
Conflicts are resolved by serializing accesses across multiple cycles. Address
conflicts (multiple threads atomically updating the same address) are similarly
serialized. Broadcasting from the same address to multiple threads is handled
efficiently without conflicts.
Vector L1 cache
---------------
Each CU contains a dedicated vector L1 data cache (vL1D) serving vector memory
operations. Key characteristics include:
* Write-through design (writes go directly to L2)
* Optimization for high-bandwidth streaming access patterns
* Coherent with other CUs through software management
* Typical size of 16 KB per CU
The vector cache tags are checked for all vector memory operations, with misses
forwarded to the L2 cache. The write-through design simplifies coherence at the
cost of write bandwidth.
Memory hierarchy and system
===========================
The GPU memory system provides the bandwidth and capacity needed for massive
parallel computation while managing data coherence and access efficiency.
Memory organization
-------------------
.. figure:: ../data/understand/hardware_implementation/cdna2_gcd.png
:alt: Block diagram showing four Compute Engines each with 28 Compute Units
inside. These four Compute Engines share one block of L2 Cache. Around
them are four Memory Controllers. To the top and bottom of all these are
eight blocks of Infinity Fabric Links. Two Video Core Next blocks sit in
the top corners. At the very bottom spans a colored section reading
Infinity Fabric.
:alt: Block diagram showing four compute engines with L2 cache, memory
controllers, and Infinity Fabric interconnect on CDNA2
:width: 800
Block Diagram of a CDNA2 Graphics Compute Die.
CDNA2 Graphics Compute Die organization showing memory subsystem
AMD GPUs typically use high-bandwidth memory (HBM) for data-intensive workloads,
providing significantly higher bandwidth than traditional GDDR memory at the
cost of slightly higher latency. The memory system includes:
* **Memory channels**: Multiple independent memory controllers (typically 8-16)
* **L2 cache banks**: Distributed cache banks serving as the coherence point
* **Infinity Fabric**: High-speed interconnect for data routing
L2 cache architecture
---------------------
The L2 cache serves as the coherence point for all GPU memory accesses and is
shared by all compute units. The L2 consists of multiple independent channels
(32 on CDNA GPUs at 256-byte interleaving) that operate in parallel.
.. figure:: ../data/understand/hardware_implementation/l2perf_model.png
:align: center
:alt: Diagram showing L2 cache to Infinity Fabric transaction flow with
request categorization and routing
:width: 800
L2 cache to Infinity Fabric transaction flow
**Key characteristics**:
* **Channel organization**: Each channel handles a portion of the address space,
with addresses interleaved across channels for load balancing.
* **Hit-on-miss behavior**: If a request arrives for a pending cache line fill,
it counts as a hit, improving the effective hit rate.
* **Write coalescing**: Multiple writes to the same cache line are combined.
* **Atomic operation support**: Atomics execute directly in the L2 cache for
coherence.
**L2-Fabric interface**: Requests missing in L2 are routed through Infinity
Fabric to the appropriate memory location, which could be:
* Local HBM on the same GPU
* Remote GPU memory (in multi-GPU systems)
* System memory (CPU DRAM)
The interface categorizes requests by type (read/write), size (32B/64B), and
destination for optimal routing.
Memory coherence
----------------
GPU memory coherence differs significantly from CPU designs to optimize for
throughput over latency:
**Write-through L1 caches**: All writes update both L1 and L2, ensuring L2
always has the latest data. This eliminates the need for complex coherence
protocols between L1 caches but requires higher write bandwidth.
**Software-managed coherence**: Coherence between CUs requires explicit
synchronization through:
* Memory fences for ordering
* Cache invalidation instructions
* Atomic operations (executed at L2 level)
* Kernel boundaries (implicit synchronization)
**Write combining**: To handle partial cache line updates from different CUs,
the GPU uses write masks indicating which bytes to update. This prevents false
sharing issues while maintaining correctness.
Memory coalescing
-----------------
Memory coalescing combines memory accesses from multiple threads into fewer
transactions, significantly improving bandwidth utilization. The coalescing
hardware in the VMEM unit analyzes addresses from all threads in a wavefront
and groups them into the minimum number of cache line requests.
**Coalesced access pattern**: When consecutive threads access consecutive memory
addresses, the hardware can combine all 64 thread requests into as few as 4-8
cache line requests (depending on data size and alignment).
**Non-coalesced access pattern**: When threads access widely separated addresses,
each thread can generate a separate memory transaction, reducing effective
bandwidth by up to 16x or more.
To achieve optimal memory performance:
* Ensure consecutive threads access consecutive memory addresses
* Align data structures to cache line boundaries (64B or 128B)
* Use structure-of-arrays rather than array-of-structures layouts
* Consider padding to avoid bank conflicts
Architecture variants
=====================
AMD supports multiple GPU architecture families optimized for different use
cases while maintaining HIP compatibility.
Graphics Core Next (GCN)
------------------------
GCN represents the foundational architecture for modern AMD GPUs, establishing
key design principles still used today:
* 64-thread wavefronts
* Four SIMD units per CU with 16 lanes each
* Scalar unit for wavefront-uniform operations
* LDS with 32 banks
Multiple GCN generations (GCN1-5) introduced incremental improvements in
process technology, clock speeds, and instruction set features while maintaining
the core architectural philosophy.
.. _cdna_architecture:
CDNA architecture
-----------------
CDNA (Compute DNA) specializes in high-performance computing and machine
learning workloads. Building on GCN principles, CDNA adds significant compute
enhancements:
.. figure:: ../data/understand/hardware_implementation/cdna3_cu.png
:alt: Block diagram showing CDNA3 compute unit with matrix core unit, shader
cores, L1 cache, and local data share
:width: 800
CDNA3 compute unit with matrix acceleration
**Matrix Core Unit**: Specialized hardware for matrix multiply-accumulate
operations, providing up to 16 times more throughput than vector units for supported
operations. Matrix cores support multiple precisions (``INT8``, ``FP16``, ``BF16``, ``FP32``)
with varying performance characteristics.
**Accumulation VGPRs (AGPRs)**: Additional register file space (up to 256 KB)
dedicated to matrix accumulation, doubling the available register storage for
matrix operations. Data movement between VGPRs and AGPRs uses specialized
instructions (``v_accvgpr_*``).
**Enhanced memory bandwidth**: CDNA GPUs typically use HBM2/HBM2e/HBM3 memory,
providing up to 3.2 TB/s bandwidth on high-end models.
**Multi-die designs**: CDNA2 (MI250) and CDNA3 (MI300) use chiplet
architectures with multiple dies connected through high-speed links, scaling
to higher compute and memory capacities.
.. _rdna_architecture:
RDNA architecture
-----------------
RDNA optimizes for graphics and lower-latency compute workloads through
fundamental architectural changes:
.. figure:: ../data/understand/hardware_implementation/rdna3_cu.png
:alt: Block diagram showing RDNA3 work group processor with dual compute
units, shared caches, and 32-wide SIMD units
:width: 800
RDNA3 work group processor architecture
**Wave32 execution**: Primary execution mode uses 32-thread wavefronts,
reducing divergence penalties and register pressure. Wave64 mode is available
for backward compatibility.
**Dual compute units**: The work group processor (WGP) replaces standalone CUs,
containing two closely coupled compute units sharing resources:
* Each CU has two 32-wide SIMD units (vs. four 16-wide in GCN)
* Wavefronts execute in a single cycle on 32-wide SIMDs
* Reduced instruction latency improves responsiveness
**Three-level cache hierarchy**:
* **L0 cache**: Per-CU cache (equivalent to GCN's L1)
* **L1 cache**: Shared between CUs in a WGP (new intermediate level)
* **L2 cache**: Global cache shared across all WGPs
**128-byte cache lines**: Doubled from 64 bytes in GCN, aligning with Wave32
access patterns (32 threads × 4 bytes = 128 bytes).
These RDNA optimizations target gaming workloads where latency matters more
than pure throughput, though the architecture remains capable for general
compute tasks.
Performance considerations
==========================
Understanding hardware characteristics helps you optimize GPU applications for
maximum performance.
Occupancy and resource limits
-----------------------------
Occupancy measures the ratio of active wavefronts to maximum possible
wavefronts on a CU. Higher occupancy generally improves latency hiding but
is limited by:
* **Register usage**: Each wavefront requires VGPRs and SGPRs from finite pools
* **LDS allocation**: Shared memory used per workgroup
* **Wavefront slots**: Fixed number of execution contexts per CU
* **Workgroup size**: Smaller workgroups can waste resources
Balancing these resources is critical for achieving optimal occupancy. Tools
like ``rocprof`` can help analyze occupancy and identify limiting factors.
Latency hiding through multithreading
-------------------------------------
GPUs hide memory and instruction latency through massive hardware
multithreading rather than complex CPU techniques like out-of-order execution
or speculation. With sufficient wavefronts:
* Memory latency is hidden by executing other wavefronts during waits
* Pipeline latencies are covered by round-robin wavefront scheduling
* No context switch overhead as all contexts remain resident
The hardware can switch between wavefronts every cycle, maintaining high ALU
utilization even with long-latency operations in flight.
Memory bandwidth utilization
----------------------------
Effective memory bandwidth depends on access patterns:
* **Coalesced access**: Can achieve 70-90% of peak bandwidth
* **Random access**: Might achieve only 5-15% of peak bandwidth
* **Bank conflicts**: Can serialize LDS access, reducing throughput
Memory-bound kernels should focus on:
* Maximizing coalescing through proper data layout
* Prefetching and data reuse in LDS
* Balancing computation with memory access
* Using appropriate cache policies
Hardware-specific optimizations
-------------------------------
Different AMD GPU architectures benefit from tailored optimizations:
**For GCN/CDNA**:
* Optimize for 64-thread wavefront granularity
* Leverage matrix cores for applicable algorithms
* Consider AGPR usage for register spilling
**For RDNA**:
* Design for 32-thread wavefront execution
* Utilize improved divergence handling
* Take advantage of additional cache level
**Architecture-agnostic**:
* Minimize divergent control flow
* Ensure memory access coalescing
* Balance resource usage for occupancy
* Overlap computation with memory access
Summary
=======
AMD GPU hardware architecture provides massive parallelism through hierarchical
organization of compute resources, specialized execution units, and a
sophisticated memory system. Understanding these hardware details—from the
command processor through shader engines to individual compute units and the
memory hierarchy—enables you to write more efficient GPU applications.
Key hardware concepts for optimization include:
* Workgroup scheduling and resource management by the SPI
* Instruction scheduling and wavefront execution in compute units
* Memory coalescing and cache behavior
* Architecture-specific features (matrix cores, Wave32/64 modes)
* Resource limits affecting occupancy
For details on mapping parallel algorithms to this hardware, see the
:ref:`programming_model` chapter. For specific optimization techniques, consult
the performance optimization guides in the ROCm documentation.
@@ -0,0 +1,375 @@
.. meta::
:description: This chapter explains performance concepts and theoretical models for understanding AMD GPU performance
:keywords: AMD, ROCm, HIP, performance, theory, roofline, occupancy, bandwidth, arithmetic intensity
.. _performance_optimization:
*******************************************************************************
Understanding GPU performance
*******************************************************************************
This chapter explains the theoretical foundations of GPU performance on AMD
hardware. Understanding these concepts helps you analyze performance
characteristics, identify bottlenecks, and make informed optimization decisions.
For practical optimization techniques and step-by-step guidance, see
:doc:`../how-to/performance_guidelines`.
Performance bottlenecks
=======================
A performance bottleneck is the limiting factor that prevents a GPU kernel from
achieving higher performance. The two primary categories are:
* **Compute-bound**: The kernel is limited by arithmetic throughput
* **Memory-bound**: The kernel is limited by memory bandwidth
Understanding which category applies helps identify the appropriate optimization
approach. Compute-bound kernels benefit from arithmetic optimizations, while
memory-bound kernels benefit from memory access improvements.
.. _roofline_model:
Roofline model
==============
The roofline model is a visual performance analysis framework that relates
achievable performance to hardware limits based on arithmetic intensity.
The model plots performance (FLOPS) against arithmetic intensity (FLOPS/byte)
with two limiting factors:
1. **Memory bandwidth ceiling**: A sloped line representing peak memory bandwidth
2. **Compute ceiling**: A horizontal line representing peak arithmetic throughput
The intersection point determines the transition between memory-bound and
compute-bound regions. Kernels below and to the left of the intersection are
memory-bound, while those to the right are compute-bound.
.. figure:: ../../data/understand/performance_optimization/roofline.svg
:alt: Roofline model diagram showing memory bandwidth ceiling and compute
ceiling
:align: center
:width: 100%
Roofline model showing the relationship between arithmetic intensity and
achievable performance. The memory bandwidth ceiling represents the GPU's
memory bandwidth limit, while the compute ceiling shows the maximum
achievable TFLOPs. Kernels falling into the area to the left of the red
line are memory-bound, while they are compute-bound if they fall into the
right area.
Key characteristics:
* The roofline creates an upper bound on achievable performance
* Real applications typically achieve a significant portion of the theoretical limit
* The model helps guide optimization efforts based on kernel characteristics
.. _compute_bound:
Compute-bound performance
=========================
A kernel is compute-bound when its performance is limited by the GPU's arithmetic
throughput rather than memory bandwidth. These kernels have high arithmetic
intensity and spend most cycles executing arithmetic operations.
Characteristics of compute-bound kernels:
* High ratio of arithmetic operations to memory accesses
* Performance scales with GPU compute capacity
* Limited benefit from memory bandwidth optimization
* Can often achieve a high percentage of peak theoretical FLOPS
The theoretical maximum is determined by:
* Number of compute units and SIMD lanes
* Clock frequency
* Instruction throughput per cycle
* Specialized unit capabilities (matrix cores, SFUs)
.. _memory_bound:
Memory-bound performance
========================
A kernel is memory-bound when its performance is limited by memory bandwidth
rather than compute capacity. These kernels have low arithmetic intensity and
spend significant time waiting for memory operations.
Characteristics of memory-bound kernels:
* Low ratio of arithmetic operations to memory accesses
* Performance scales with memory bandwidth
* Sensitive to memory access patterns
* Typically achieve lower percentage of peak FLOPS
The theoretical maximum is determined by:
* HBM bandwidth capacity
* Memory controller efficiency
* Cache hierarchy effectiveness
* Memory access pattern efficiency
.. _arithmetic_intensity:
Arithmetic intensity
====================
Arithmetic intensity is the ratio of floating-point operations (FLOPs) to memory
traffic (bytes) for a given kernel or algorithm.
.. math::
\text{Arithmetic Intensity} = \frac{\text{FLOPs}}{\text{Bytes Transferred}}
This metric determines whether a kernel is compute-bound or memory-bound.
Key points:
* Higher arithmetic intensity indicates more computation per byte transferred
* The balance point depends on the GPU's compute-to-bandwidth ratio
* It can be calculated theoretically or measured empirically
* Different precision types affect both FLOPs and bytes
For modern AMD GPUs:
* The compute-to-bandwidth ratio varies by GPU generation
* Higher-end models have higher ratios
* Kernels above the GPU's specific ratio are compute-bound
.. _latency_hiding:
Latency hiding mechanisms
==========================
GPUs hide memory and instruction latency through massive hardware multithreading
rather than complex CPU techniques like out-of-order execution.
How latency hiding works:
* **Wavefront switching**: Context switches occur every cycle with zero overhead
* **Multiple wavefronts per CU**: Many concurrent wavefronts supported
* **Instruction-level parallelism**: Multiple independent instructions in flight
The hardware can completely hide memory latency if there are enough active
wavefronts with independent work. The number of instructions required from other
wavefronts to hide latency depends on the specific memory latency and instruction
throughput characteristics of the GPU.
Requirements for effective latency hiding:
* Sufficient occupancy (active wavefronts)
* Independent instructions to overlap
* Balanced resource usage
* Minimal divergence
.. _wavefront_execution:
Wavefront execution states
==========================
A wavefront can be in one of several states during execution:
* **Active**: Currently executing on a SIMD unit
* **Ready**: Eligible for execution, waiting for scheduling
* **Stalled**: Waiting for a dependency (memory, synchronization)
* **Sleeping**: Blocked on a barrier or synchronization primitive
Understanding these states helps explain GPU utilization metrics:
* **Active cycles**: Percentage of cycles with at least one instruction executing
* **Stall cycles**: Percentage of cycles waiting for resources
* **Idle cycles**: No wavefronts available to execute
Maximizing active cycles while minimizing stall and idle cycles improves
performance.
.. _occupancy:
Occupancy theory
================
Occupancy measures the ratio of active wavefronts to the maximum possible
wavefronts on a compute unit.
.. math::
\text{Occupancy} = \frac{\text{Active Wavefronts}}{\text{Max Wavefronts per CU}}
Why occupancy matters:
* Higher occupancy improves latency hiding
* More concurrent wavefronts mask memory and instruction latency
* Enables better utilization of execution units
Limiting factors:
* **Register usage**: VGPRs and SGPRs per thread
* **Shared memory (LDS)**: Allocation per block
* **Wavefront slots**: Hardware limit on concurrent wavefronts
* **Block size**: Small blocks may waste resources
Trade-offs:
* Higher occupancy improves latency hiding but reduces resources per thread
* Lower occupancy allows more resources per thread but may expose latency
* Optimal occupancy depends on kernel characteristics
* Memory-bound kernels benefit more from high occupancy
.. _memory_hierarchy_theory:
Memory hierarchy impact on performance
=======================================
The GPU memory hierarchy has different bandwidths and latencies:
Memory types by speed:
1. **Registers**: Fastest, lowest latency (per-thread storage)
2. **LDS (shared memory)**: Very fast, on-chip (per-block storage)
3. **L1 cache**: Fast, on-chip (per-CU cache)
4. **L2 cache**: Moderate, on-chip (shared across CUs)
5. **HBM (global memory)**: Slower, off-chip but high bandwidth
Memory coalescing theory
-------------------------
Memory coalescing combines memory accesses from multiple threads into fewer
transactions. When consecutive threads access consecutive memory addresses, the
hardware can merge requests into efficient cache line accesses.
Why coalescing matters:
* Reduces number of memory transactions
* Improves memory bandwidth utilization
* Decreases memory access latency
**Coalesced pattern**: Consecutive threads accessing consecutive addresses
achieve high bandwidth utilization.
**Non-coalesced pattern**: Random or strided addresses result in many separate
transactions and low bandwidth utilization.
.. _bank_conflicts_theory:
Bank conflict theory
--------------------
Shared memory (LDS) is organized into banks that can be accessed independently.
Bank conflicts occur when multiple threads access different addresses in the
same bank.
Why bank conflicts matter:
* Conflicts serialize accesses, reducing throughput
* LDS bandwidth drops proportionally to conflict degree
* Can turn parallel operations into sequential ones
Common patterns:
* **No conflict**: Each thread accesses a different bank (full bandwidth)
* **Broadcast**: Multiple threads read the same address (no conflict)
* **N-way conflict**: N threads access the same bank (1/N bandwidth)
.. _register_pressure_theory:
Register pressure theory
=========================
Register pressure occurs when a kernel requires more registers than optimal for
the target occupancy.
Why register pressure matters:
* Reduces maximum occupancy
* May cause register spilling to memory
* Decreases ability to hide latency
* Lowers overall throughput
The relationship between registers and occupancy:
* More registers per thread → fewer concurrent wavefronts
* Fewer registers per thread → higher occupancy but may need memory spills
* Optimal balance depends on kernel memory access patterns
.. _performance_metrics_theory:
Performance metrics explained
==============================
Understanding performance metrics helps analyze GPU behavior:
Peak rate
---------
The theoretical maximum performance of a GPU:
* **Peak FLOPS**: Maximum floating-point operations per second
* **Peak bandwidth**: Maximum memory throughput
* **Peak instruction rate**: Maximum instructions per cycle
Actual performance is always below peak due to various inefficiencies.
Utilization metrics
-------------------
**Pipe utilization**: The percentage of execution cycles where the pipeline is
actively processing instructions. Low utilization indicates stalls or insufficient
work.
**Issue efficiency**: The ratio of issued instructions to the maximum possible.
Low efficiency can indicate instruction cache misses, scheduling inefficiencies,
or resource conflicts.
**CU utilization**: The percentage of compute units actively executing work. Low
utilization suggests insufficient parallelism, load imbalance, or synchronization
overhead.
**Branch efficiency**: The ratio of non-divergent to total branches. Low
efficiency indicates significant divergence overhead.
Theoretical performance limits
==============================
Understanding theoretical limits helps set realistic performance expectations.
**Peak performance bounds**
Every GPU has theoretical maximum performance determined by:
* Clock frequency and number of compute units
* Instruction throughput per clock cycle
* Memory bandwidth capacity
* Specialized unit capabilities (matrix cores, SFUs)
**Achievable performance**
Real applications typically achieve a fraction of theoretical peak due to:
* Imperfect resource utilization
* Memory access inefficiencies
* Control flow divergence
* Synchronization overhead
* Launch and scheduling costs
The gap between theoretical and achieved performance reveals optimization
opportunities. The roofline model provides a framework for understanding these
limits and identifying which factor (compute or memory) constrains performance.
Summary
=======
Understanding GPU performance requires knowledge of several interconnected concepts:
* **Performance bottlenecks**: Whether compute or memory limits performance
* **Roofline model**: Visual framework for analyzing performance limits
* **Arithmetic intensity**: The compute-to-memory ratio of algorithms
* **Latency hiding**: How concurrent execution masks delays
* **Occupancy**: How wavefront concurrency affects resource utilization
* **Memory hierarchy**: How different memory types affect bandwidth
* **Performance metrics**: Quantitative measures for analysis
These theoretical foundations inform practical optimization decisions. For
step-by-step optimization techniques and practical guidance, see
:doc:`../how-to/performance_guidelines`.
@@ -271,10 +271,16 @@ depicted in the following figure.
.. _wavefront:
Warp (or Wavefront)
The innermost grouping of threads is called a warp. A warp is the most tightly
coupled groups of threads, both physically and logically. Threads inside a warp
are executed in lockstep, with each thread executing the same instruction. Threads
in a warp are also called lanes, and the value identifying them is the lane ID.
The innermost grouping of threads is called a warp (NVIDIA terminology) or
wavefront (AMD terminology). A wavefront is the most tightly coupled group of
threads, both physically and logically. Threads within a wavefront are
executed in lockstep, with each thread executing the same instruction
simultaneously on different data elements.
A wavefront represents the fundamental execution unit of AMD GPUs. Each wavefront
consists of multiple parallel threads that execute the same instruction
simultaneously across the SIMD pipelines of a compute unit. Threads in a wavefront
are also called lanes, and the value identifying them is the lane ID.
.. tip::
@@ -282,22 +288,39 @@ Warp (or Wavefront)
consequence, they are only as multidimensional as the user interprets the
calculated values to be.
The size of a warp is architecture dependent and always fixed. For AMD GPUs
the warp is typically 64 threads, though sometimes 32 threads. Warps are
signified by the set of communication primitives at their disposal, as
discussed in :ref:`warp-cross-lane`.
The size of a wavefront is architecture dependent and always fixed:
* **64 threads** for AMD GCN and CDNA architectures as well as RDNA architectures in wave64 mode
* **32 threads** for AMD RDNA architectures in wave32 mode
* **32 threads** for NVIDIA GPUs
Wavefronts are signified by the set of communication primitives at their disposal,
as discussed in :ref:`warp-cross-lane`. On modern AMD datacenter GPUs like MI300X,
each CU can support up to 64 concurrent wavefronts, each containing 64 threads,
for a total of over 4,000 active threads per CU.
.. _inherent_thread_hierarchy_block:
Block
The next level of the thread hierarchy is called a thread block, or block. The
defining feature of a block is that all threads in the block have shared memory
that they can use to share data or synchronize with one another, as described in
:ref:`memory_hierarchy`.
Block (Work-group)
The next level of the thread hierarchy is called a thread block (or work-group in
OpenCL terminology). A block is a collection of wavefronts that can synchronize
and share local data share (LDS) memory. The defining feature of a block is that
all threads in the block have shared memory that they can use to share data or
synchronize with one another, as described in :ref:`memory_hierarchy`.
All wavefronts of a block execute on the same compute unit, ensuring they
can access the same LDS and synchronize efficiently. This locality is crucial
for performance when threads need to cooperate on shared data.
The size of a block, or the block dimension, is the user-configurable number of
threads per block, but is limited by the queryable capabilities of the executing
hardware. The unique ID of the thread within a block can be 1, 2, or 3-dimensional
hardware. Common limits include:
* Maximum threads per block: typically 1024
* Maximum block dimensions: 1024 x 1024 x 64 (x, y, z)
* Limited by available resources (registers, LDS, wavefront slots)
The unique ID of the thread within a block can be 1, 2, or 3-dimensional
as provided by the HIP API. You can configure the thread block to best represent
the data associated with the kernel instruction set.
@@ -308,10 +331,24 @@ Block
.. _inherent_thread_hierarchy_grid:
Grid
The top-most level of the thread hierarchy is a grid. A grid is the number of blocks
needed for a single launch of the kernel. The unique ID of each block within
a grid can be 1, 2, or 3-dimensional, as provided by the API and is queryable
by every thread within the block.
The top-most level of the thread hierarchy is a grid. A grid represents the total
collection of blocks (work-groups) launched for a single kernel execution. It defines
the overall problem size and how work is distributed across the GPU.
The grid is specified when launching a kernel and determines:
* Total number of threads: ``grid_size`` × ``block_size``
* Distribution of work across compute units
* Overall parallelism of the computation
The unique ID of each block within a grid can be 1, 2, or 3-dimensional, as provided
by the API and is queryable by every thread within the block through the ``blockIdx``
built-in variable.
Grid dimensions are limited by hardware capabilities:
* Maximum x-dimension: 2³¹ - 1
* Maximum y and z-dimensions: 2¹⁶ - 1 (65,535)
The three-dimensional thread hierarchy available to a kernel program lends itself to solutions
that align closely to the computational problem. The following are some examples: