SWDEV-502480 - Update documentation from GitHub 2024-12-05

Change-Id: I179814351b77935aff55e8ae47dd322a3e15a868


[ROCm/hip commit: f39c7a3150]
This commit is contained in:
Istvan Kiss
2024-12-15 19:31:35 +01:00
والد 11e6f3ce2d
کامیت b271963c51
85فایلهای تغییر یافته به همراه8331 افزوده شده و 3512 حذف شده
+24 -2
مشاهده پرونده
@@ -6,6 +6,7 @@ APU
APUs
AQL
AXPY
asm
Asynchrony
backtrace
Bitcode
@@ -15,6 +16,7 @@ builtins
Builtins
CAS
clr
compilable
coroutines
Ctx
cuBLASLt
@@ -42,12 +44,14 @@ extern
fatbin
fatbinary
foundationally
framebuffer
frontends
fnuz
FNUZ
fp
gedit
GPGPU
GROMACS
GWS
hardcoded
HC
@@ -58,6 +62,7 @@ hipcc
hipCtx
hipexamine
hipified
HIPify
hipModule
hipModuleLaunchKernel
hipother
@@ -65,9 +70,12 @@ HIPRTC
icc
IILE
iGPU
inlined
inplace
Interoperation
interop
interoperation
interoperate
interoperation
Interprocess
interprocess
Intrinsics
@@ -75,6 +83,7 @@ intrinsics
IPC
IPs
isa
iteratively
Lapack
latencies
libc
@@ -87,6 +96,8 @@ ltrace
makefile
Malloc
malloc
MALU
MiB
memset
multicore
multigrid
@@ -101,9 +112,12 @@ NOP
Numa
Nsight
ocp
omnitrace
overindex
overindexing
oversubscription
overutilized
parallelizable
pixelated
pragmas
preallocated
@@ -111,6 +125,7 @@ preconditioners
predefining
prefetched
preprocessor
profilers
PTX
PyHIP
queryable
@@ -118,6 +133,7 @@ prefetching
quad
representable
RMW
rocgdb
ROCm's
rocTX
roundtrip
@@ -129,6 +145,7 @@ scalarizing
sceneries
shaders
SIMT
sinewave
SOMA
SPMV
structs
@@ -139,11 +156,16 @@ texels
tradeoffs
templated
toolkits
transfering
typedefs
unintuitive
UMM
unmap
unmapped
unmapping
unregister
upscaled
variadic
vulkan
WinGDB
zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
zc
+1 -1
مشاهده پرونده
@@ -36,7 +36,7 @@ HIP releases are typically naming convention for each ROCM release to help diffe
## More Info
* [Installation](docs/install/install.rst)
* [HIP FAQ](docs/how-to/faq.md)
* [HIP FAQ](docs/faq.rst)
* [HIP C++ Language Extensions](docs/reference/cpp_language_extensions.rst)
* [HIP Porting Guide](docs/how-to/hip_porting_guide.md)
* [HIP Porting Driver Guide](docs/how-to/hip_porting_driver_api.md)
+2 -2
مشاهده پرونده
@@ -47,8 +47,8 @@ suppress_warnings = ["etoc.toctree"]
numfig = False
exclude_patterns = [
"doxygen/mainpage.md",
"understand/glossary.md"
"understand/glossary.md",
'how-to/debugging_env.rst'
]

قبل از

عرض:  |  ارتفاع:  |  اندازه: 64 KiB

پس از

عرض:  |  ارتفاع:  |  اندازه: 64 KiB

قبل از

عرض:  |  ارتفاع:  |  اندازه: 308 KiB

پس از

عرض:  |  ارتفاع:  |  اندازه: 308 KiB

قبل از

عرض:  |  ارتفاع:  |  اندازه: 25 KiB

پس از

عرض:  |  ارتفاع:  |  اندازه: 25 KiB

قبل از

عرض:  |  ارتفاع:  |  اندازه: 203 KiB

پس از

عرض:  |  ارتفاع:  |  اندازه: 203 KiB

@@ -0,0 +1,106 @@
<mxfile host="65bd71144e">
<diagram id="zBbb_w2fufU70cdOGtND" name="1 oldal">
<mxGraphModel dx="1547" dy="1302" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="0" pageScale="1" pageWidth="660" pageHeight="610" background="none" math="0" shadow="0">
<root>
<mxCell id="0"/>
<mxCell id="1" parent="0"/>
<mxCell id="5927" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#5E5B61;fontColor=#FFFFFF;strokeColor=none;spacing=0;" parent="1" vertex="1">
<mxGeometry y="-10" width="740" height="290" as="geometry"/>
</mxCell>
<mxCell id="5928" value="Pageable data transfer" style="text;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;rounded=0;fontFamily=Helvetica;fontSize=17;fontColor=#FFFFFF;" parent="1" vertex="1">
<mxGeometry x="20" width="340" height="30" as="geometry"/>
</mxCell>
<mxCell id="UvHuP5o6jSuoLTm0AUZA-5955" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4F1623;fontColor=#FFFFFF;strokeColor=none;" parent="1" vertex="1">
<mxGeometry x="20" y="160" width="340" height="100" as="geometry"/>
</mxCell>
<mxCell id="UvHuP5o6jSuoLTm0AUZA-5959" value="" style="group" parent="1" vertex="1" connectable="0">
<mxGeometry x="230" y="170" width="120" height="80" as="geometry"/>
</mxCell>
<mxCell id="UvHuP5o6jSuoLTm0AUZA-5960" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=none;" parent="UvHuP5o6jSuoLTm0AUZA-5959" vertex="1">
<mxGeometry width="120" height="80" as="geometry"/>
</mxCell>
<mxCell id="UvHuP5o6jSuoLTm0AUZA-5961" value="&lt;div&gt;Pinned memory&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="UvHuP5o6jSuoLTm0AUZA-5959" vertex="1">
<mxGeometry width="120" height="80" as="geometry"/>
</mxCell>
<mxCell id="LV0FwBpydXXZrUbya0PG-5946" value="Pinned data transfer" style="text;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;rounded=0;fontFamily=Helvetica;fontSize=17;fontColor=#FFFFFF;" parent="1" vertex="1">
<mxGeometry x="380" width="340" height="30" as="geometry"/>
</mxCell>
<mxCell id="UvHuP5o6jSuoLTm0AUZA-5952" value="" style="group;fillColor=#9C2A44;" parent="1" vertex="1" connectable="0">
<mxGeometry x="70" y="170" width="120" height="80" as="geometry"/>
</mxCell>
<mxCell id="UvHuP5o6jSuoLTm0AUZA-5950" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#9C2A44;fontColor=#FFFFFF;strokeColor=none;" parent="UvHuP5o6jSuoLTm0AUZA-5952" vertex="1">
<mxGeometry width="120" height="80" as="geometry"/>
</mxCell>
<mxCell id="UvHuP5o6jSuoLTm0AUZA-5951" value="Pageable memory" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="UvHuP5o6jSuoLTm0AUZA-5952" vertex="1">
<mxGeometry width="120" height="80" as="geometry"/>
</mxCell>
<mxCell id="LV0FwBpydXXZrUbya0PG-5974" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;strokeWidth=2;exitX=0;exitY=0.5;exitDx=0;exitDy=0;" parent="1" target="UvHuP5o6jSuoLTm0AUZA-5950" edge="1" source="UvHuP5o6jSuoLTm0AUZA-5961">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="220" y="250" as="sourcePoint"/>
<mxPoint x="109.5" y="201" as="targetPoint"/>
</mxGeometry>
</mxCell>
<mxCell id="5929" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4F1623;fontColor=#FFFFFF;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="20" y="40" width="340" height="100" as="geometry"/>
</mxCell>
<mxCell id="5930" value="" style="group" vertex="1" connectable="0" parent="1">
<mxGeometry x="230" y="50" width="120" height="80" as="geometry"/>
</mxCell>
<mxCell id="5931" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=none;" vertex="1" parent="5930">
<mxGeometry width="120" height="80" as="geometry"/>
</mxCell>
<mxCell id="5932" value="&lt;div&gt;Device memory&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" vertex="1" parent="5930">
<mxGeometry width="120" height="80" as="geometry"/>
</mxCell>
<mxCell id="LV0FwBpydXXZrUbya0PG-5968" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;strokeWidth=2;strokeColor=#ffffff;exitX=0.5;exitY=1;exitDx=0;exitDy=0;" parent="1" source="5932" target="UvHuP5o6jSuoLTm0AUZA-5960" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="290" y="120" as="sourcePoint"/>
<mxPoint x="289.5" y="160" as="targetPoint"/>
</mxGeometry>
</mxCell>
<mxCell id="5944" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4F1623;fontColor=#FFFFFF;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="380" y="160" width="340" height="100" as="geometry"/>
</mxCell>
<mxCell id="5945" value="" style="group" vertex="1" connectable="0" parent="1">
<mxGeometry x="590" y="170" width="120" height="80" as="geometry"/>
</mxCell>
<mxCell id="5946" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=none;" vertex="1" parent="5945">
<mxGeometry width="120" height="80" as="geometry"/>
</mxCell>
<mxCell id="5947" value="&lt;div&gt;Pinned memory&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" vertex="1" parent="5945">
<mxGeometry width="120" height="80" as="geometry"/>
</mxCell>
<mxCell id="5948" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4F1623;fontColor=#FFFFFF;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="380" y="40" width="340" height="100" as="geometry"/>
</mxCell>
<mxCell id="5949" value="" style="group" vertex="1" connectable="0" parent="1">
<mxGeometry x="590" y="50" width="120" height="80" as="geometry"/>
</mxCell>
<mxCell id="5950" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=none;" vertex="1" parent="5949">
<mxGeometry width="120" height="80" as="geometry"/>
</mxCell>
<mxCell id="5951" value="&lt;div&gt;Device memory&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" vertex="1" parent="5949">
<mxGeometry width="120" height="80" as="geometry"/>
</mxCell>
<mxCell id="5952" style="edgeStyle=none;html=1;exitX=0.5;exitY=0;exitDx=0;exitDy=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;startArrow=classic;startFill=1;strokeWidth=2;strokeColor=#FFFFFF;" edge="1" parent="1" source="5947" target="5951">
<mxGeometry relative="1" as="geometry"/>
</mxCell>
<mxCell id="5958" value="&lt;div&gt;Host&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;direction=west;" vertex="1" parent="1">
<mxGeometry x="20" y="195" width="50" height="30" as="geometry"/>
</mxCell>
<mxCell id="5960" value="&lt;div&gt;Device&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" vertex="1" parent="1">
<mxGeometry x="20" y="75" width="70" height="30" as="geometry"/>
</mxCell>
<mxCell id="5961" value="&lt;div&gt;Device&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" vertex="1" parent="1">
<mxGeometry x="380" y="75" width="70" height="30" as="geometry"/>
</mxCell>
<mxCell id="5962" value="&lt;div&gt;Host&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" vertex="1" parent="1">
<mxGeometry x="380" y="195" width="60" height="30" as="geometry"/>
</mxCell>
<mxCell id="5964" value="" style="edgeStyle=none;html=1;strokeWidth=2;startArrow=classic;startFill=1;strokeColor=#FFFFFF;" edge="1" parent="1" source="UvHuP5o6jSuoLTm0AUZA-5951" target="UvHuP5o6jSuoLTm0AUZA-5961">
<mxGeometry relative="1" as="geometry"/>
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>
تفاوت فایل به دلیل طولانی بودن یک یا چند خط حذف شد

پس از

عرض:  |  ارتفاع:  |  اندازه: 13 KiB

قبل از

عرض:  |  ارتفاع:  |  اندازه: 1.1 KiB

پس از

عرض:  |  ارتفاع:  |  اندازه: 1.1 KiB

قبل از

عرض:  |  ارتفاع:  |  اندازه: 1.1 KiB

پس از

عرض:  |  ارتفاع:  |  اندازه: 1.1 KiB

قبل از

عرض:  |  ارتفاع:  |  اندازه: 2.9 KiB

پس از

عرض:  |  ارتفاع:  |  اندازه: 2.9 KiB

قبل از

عرض:  |  ارتفاع:  |  اندازه: 1.1 KiB

پس از

عرض:  |  ارتفاع:  |  اندازه: 1.1 KiB

قبل از

عرض:  |  ارتفاع:  |  اندازه: 1.3 KiB

پس از

عرض:  |  ارتفاع:  |  اندازه: 1.3 KiB

قبل از

عرض:  |  ارتفاع:  |  اندازه: 401 B

پس از

عرض:  |  ارتفاع:  |  اندازه: 401 B

قبل از

عرض:  |  ارتفاع:  |  اندازه: 1.1 KiB

پس از

عرض:  |  ارتفاع:  |  اندازه: 1.1 KiB

تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است Diff را بارگزاری کن
تفاوت فایل به دلیل طولانی بودن یک یا چند خط حذف شد

پس از

عرض:  |  ارتفاع:  |  اندازه: 83 KiB

@@ -0,0 +1,127 @@
<mxfile host="65bd71144e">
<diagram id="zBbb_w2fufU70cdOGtND" name="1 oldal">
<mxGraphModel dx="1584" dy="1200" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="0" pageScale="1" pageWidth="660" pageHeight="610" background="none" math="0" shadow="0">
<root>
<mxCell id="0"/>
<mxCell id="1" parent="0"/>
<mxCell id="5927" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#5E5B61;fontColor=#FFFFFF;strokeColor=none;spacing=0;" parent="1" vertex="1">
<mxGeometry y="-30" width="680" height="380" as="geometry"/>
</mxCell>
<mxCell id="5945" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=#A20025;" parent="1" vertex="1">
<mxGeometry x="10" y="-10" width="660" height="30" as="geometry"/>
</mxCell>
<mxCell id="5946" value="&lt;font face=&quot;Helvetica&quot;&gt;HIP Runtime API&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="1" vertex="1">
<mxGeometry x="75" y="-10" width="530" height="30" as="geometry"/>
</mxCell>
<mxCell id="UvHuP5o6jSuoLTm0AUZA-5953" value="" style="group" parent="1" vertex="1" connectable="0">
<mxGeometry x="10" y="80" width="330" height="260" as="geometry"/>
</mxCell>
<mxCell id="5925" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#333333;fontColor=#FFFFFF;strokeColor=none;" parent="UvHuP5o6jSuoLTm0AUZA-5953" vertex="1">
<mxGeometry width="330" height="260" as="geometry"/>
</mxCell>
<mxCell id="UvHuP5o6jSuoLTm0AUZA-5952" value="" style="group" parent="UvHuP5o6jSuoLTm0AUZA-5953" vertex="1" connectable="0">
<mxGeometry x="16.67" y="190.00279999999998" width="293.33" height="45" as="geometry"/>
</mxCell>
<mxCell id="UvHuP5o6jSuoLTm0AUZA-5950" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#60a917;fontColor=#ffffff;strokeColor=#2D7600;" parent="UvHuP5o6jSuoLTm0AUZA-5952" vertex="1">
<mxGeometry width="293.33000000000004" height="45" as="geometry"/>
</mxCell>
<mxCell id="UvHuP5o6jSuoLTm0AUZA-5951" value="&lt;div&gt;CUDA Driver API&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="UvHuP5o6jSuoLTm0AUZA-5952" vertex="1">
<mxGeometry x="10.9643478387712" y="7.500000000000001" width="266.79913074343256" height="30.000000000000004" as="geometry"/>
</mxCell>
<mxCell id="5948" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.358;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;strokeColor=#FFFFFF;" parent="UvHuP5o6jSuoLTm0AUZA-5953" source="5967" target="UvHuP5o6jSuoLTm0AUZA-5950" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="103.33500000000004" y="108.22000000000003" as="sourcePoint"/>
<mxPoint x="85" y="145.6" as="targetPoint"/>
</mxGeometry>
</mxCell>
<mxCell id="5966" value="" style="group" parent="UvHuP5o6jSuoLTm0AUZA-5953" vertex="1" connectable="0">
<mxGeometry x="16.670000000000016" y="64" width="210" height="45" as="geometry"/>
</mxCell>
<mxCell id="5967" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#60a917;fontColor=#ffffff;strokeColor=#2D7600;" parent="5966" vertex="1">
<mxGeometry width="210.00000000000003" height="45" as="geometry"/>
</mxCell>
<mxCell id="5968" value="&lt;div&gt;CUDA runtime&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="5966" vertex="1">
<mxGeometry x="9.499565493273565" y="7.499999999999974" width="191.0060936696582" height="29.999999999999996" as="geometry"/>
</mxCell>
<mxCell id="UvHuP5o6jSuoLTm0AUZA-5982" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;strokeColor=#FFFFFF;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="270" y="60" as="sourcePoint"/>
<mxPoint x="270" y="270" as="targetPoint"/>
</mxGeometry>
</mxCell>
<mxCell id="UvHuP5o6jSuoLTm0AUZA-5955" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#333333;fontColor=#FFFFFF;strokeColor=none;" parent="1" vertex="1">
<mxGeometry x="350" y="80" width="320" height="260" as="geometry"/>
</mxCell>
<mxCell id="5955" value="" style="group" parent="1" vertex="1" connectable="0">
<mxGeometry x="360" y="270" width="140" height="45" as="geometry"/>
</mxCell>
<mxCell id="5956" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=#A20025;" parent="5955" vertex="1">
<mxGeometry width="140.00000000000003" height="45" as="geometry"/>
</mxCell>
<mxCell id="5957" value="&lt;div&gt;ROCr runtime&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="5955" vertex="1">
<mxGeometry x="2.51" y="8.75" width="134.99" height="27.5" as="geometry"/>
</mxCell>
<mxCell id="5958" value="" style="group" parent="1" vertex="1" connectable="0">
<mxGeometry x="520" y="270" width="140" height="45" as="geometry"/>
</mxCell>
<mxCell id="5959" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=#A20025;" parent="5958" vertex="1">
<mxGeometry width="140.00000000000003" height="45" as="geometry"/>
</mxCell>
<mxCell id="5960" value="&lt;div&gt;PAL&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="5958" vertex="1">
<mxGeometry x="5.233043662182416" y="7.499999999999999" width="127.33739577977217" height="29.999999999999996" as="geometry"/>
</mxCell>
<mxCell id="5962" value="" style="group" parent="1" vertex="1" connectable="0">
<mxGeometry x="405" y="144.91" width="210" height="45" as="geometry"/>
</mxCell>
<mxCell id="5963" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=#A20025;" parent="5962" vertex="1">
<mxGeometry width="210.00000000000003" height="45" as="geometry"/>
</mxCell>
<mxCell id="5964" value="&lt;div&gt;CLR&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="5962" vertex="1">
<mxGeometry x="7.849565493273624" y="7.499999999999999" width="191.0060936696582" height="29.999999999999996" as="geometry"/>
</mxCell>
<mxCell id="5965" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;strokeColor=#FFFFFF;exitX=0.823;exitY=1.047;exitDx=0;exitDy=0;exitPerimeter=0;" parent="1" target="5963" edge="1" source="5946">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="510" y="60" as="sourcePoint"/>
<mxPoint x="640" y="290" as="targetPoint"/>
</mxGeometry>
</mxCell>
<mxCell id="5969" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;strokeColor=#FFFFFF;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="570" y="190" as="sourcePoint"/>
<mxPoint x="570" y="270" as="targetPoint"/>
</mxGeometry>
</mxCell>
<mxCell id="5971" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;strokeColor=#FFFFFF;entryX=0.661;entryY=0.007;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" target="5956" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="453" y="190" as="sourcePoint"/>
<mxPoint x="450" y="270" as="targetPoint"/>
</mxGeometry>
</mxCell>
<mxCell id="UvHuP5o6jSuoLTm0AUZA-5981" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;strokeColor=#FFFFFF;" parent="1" target="5967" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="132" y="60" as="sourcePoint"/>
<mxPoint x="95" y="140" as="targetPoint"/>
</mxGeometry>
</mxCell>
<mxCell id="UvHuP5o6jSuoLTm0AUZA-5957" value="&lt;font face=&quot;Helvetica&quot;&gt;&lt;span style=&quot;background-color: rgb(77, 77, 77);&quot;&gt;AMD Platform&lt;/span&gt;&lt;br&gt;&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="1" vertex="1">
<mxGeometry x="440" y="84" width="140" height="30" as="geometry"/>
</mxCell>
<mxCell id="5926" value="&lt;font style=&quot;background-color: rgb(77, 77, 77);&quot;&gt;NVIDIA Platform&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="1" vertex="1">
<mxGeometry x="10" y="80" width="330" height="34.0392" as="geometry"/>
</mxCell>
<mxCell id="5973" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=#A20025;" vertex="1" parent="1">
<mxGeometry x="10" y="40" width="330" height="30" as="geometry"/>
</mxCell>
<mxCell id="5975" value="&lt;font face=&quot;Helvetica&quot;&gt;hipother&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" vertex="1" parent="1">
<mxGeometry x="10" y="40" width="330" height="30" as="geometry"/>
</mxCell>
<mxCell id="5976" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;strokeColor=#FFFFFF;exitX=0.823;exitY=1.047;exitDx=0;exitDy=0;exitPerimeter=0;" edge="1" parent="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="175.59000000000003" y="20.00000000000008" as="sourcePoint"/>
<mxPoint x="176" y="40" as="targetPoint"/>
</mxGeometry>
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>
تفاوت فایل به دلیل طولانی بودن یک یا چند خط حذف شد

پس از

عرض:  |  ارتفاع:  |  اندازه: 14 KiB

@@ -0,0 +1,46 @@
<mxfile host="65bd71144e">
<diagram id="zBbb_w2fufU70cdOGtND" name="1 oldal">
<mxGraphModel dx="438" dy="902" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1200" pageHeight="1600" background="none" math="0" shadow="0">
<root>
<mxCell id="0"/>
<mxCell id="1" parent="0"/>
<mxCell id="5536" value="" style="rounded=0;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=none;spacing=0;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
<mxGeometry x="340" y="10" width="280" height="540" as="geometry"/>
</mxCell>
<mxCell id="1Txoek2s6jAQB3cqoh21-5821" value="" style="rounded=0;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=none;spacing=0;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
<mxGeometry x="10" y="10" width="280" height="540" as="geometry"/>
</mxCell>
<mxCell id="5401" value="Stream 1" style="text;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;rounded=0;fontFamily=Segoe UI;fontSize=18;fontColor=#FFFFFF;" parent="1" vertex="1">
<mxGeometry y="10" width="320" height="30" as="geometry"/>
</mxCell>
<mxCell id="1Txoek2s6jAQB3cqoh21-5820" value="Kernel A" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#A20025;fontColor=#FFFFFF;strokeColor=none;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
<mxGeometry x="30" y="130" width="240" height="100" as="geometry"/>
</mxCell>
<mxCell id="1Txoek2s6jAQB3cqoh21-5819" value="Stream 2" style="text;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;rounded=0;fontFamily=Segoe UI;fontSize=18;fontColor=#FFFFFF;" parent="1" vertex="1">
<mxGeometry x="320" y="10" width="320" height="30" as="geometry"/>
</mxCell>
<mxCell id="1Txoek2s6jAQB3cqoh21-5822" value="Memory Copy" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#A20025;fontColor=#FFFFFF;strokeColor=none;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
<mxGeometry x="30" y="50" width="240" height="60" as="geometry"/>
</mxCell>
<mxCell id="1Txoek2s6jAQB3cqoh21-5825" value="hipDeviceSynchronize" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#A20025;fontColor=#FFFFFF;strokeColor=none;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
<mxGeometry x="30" y="410" width="570" height="40" as="geometry"/>
</mxCell>
<mxCell id="1Txoek2s6jAQB3cqoh21-5826" value="Kernel B" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#A20025;fontColor=#FFFFFF;strokeColor=none;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
<mxGeometry x="360" y="130" width="240" height="150" as="geometry"/>
</mxCell>
<mxCell id="1Txoek2s6jAQB3cqoh21-5828" value="Kernel C" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#A20025;fontColor=#FFFFFF;strokeColor=none;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
<mxGeometry x="30" y="250" width="240" height="140" as="geometry"/>
</mxCell>
<mxCell id="5537" value="Memory Copy" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#A20025;fontColor=#FFFFFF;strokeColor=none;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
<mxGeometry x="360" y="50" width="240" height="60" as="geometry"/>
</mxCell>
<mxCell id="5538" value="Memory Copy" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#A20025;fontColor=#FFFFFF;strokeColor=none;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
<mxGeometry x="30" y="470" width="240" height="60" as="geometry"/>
</mxCell>
<mxCell id="5539" value="Memory Copy" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#A20025;fontColor=#FFFFFF;strokeColor=none;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
<mxGeometry x="360" y="470" width="240" height="60" as="geometry"/>
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>
تفاوت فایل به دلیل طولانی بودن یک یا چند خط حذف شد

پس از

عرض:  |  ارتفاع:  |  اندازه: 9.8 KiB

تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است Diff را بارگزاری کن
تفاوت فایل به دلیل طولانی بودن یک یا چند خط حذف شد

قبل از

عرض:  |  ارتفاع:  |  اندازه: 338 KiB

@@ -0,0 +1,157 @@
<mxfile host="65bd71144e">
<diagram id="zBbb_w2fufU70cdOGtND" name="1 oldal">
<mxGraphModel dx="1547" dy="1302" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="0" pageScale="1" pageWidth="660" pageHeight="610" background="none" math="0" shadow="0">
<root>
<mxCell id="0"/>
<mxCell id="1" parent="0"/>
<mxCell id="6033" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#5E5B61;fontColor=#FFFFFF;strokeColor=none;spacing=0;" parent="1" vertex="1">
<mxGeometry x="110" y="-320" width="480" height="490" as="geometry"/>
</mxCell>
<mxCell id="5981" value="" style="group" parent="1" vertex="1" connectable="0">
<mxGeometry x="130" y="60" width="210" height="90" as="geometry"/>
</mxCell>
<mxCell id="5982" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#333333;fontColor=#FFFFFF;strokeColor=none;" parent="5981" vertex="1">
<mxGeometry width="210" height="90" as="geometry"/>
</mxCell>
<mxCell id="5983" value="" style="group" parent="5981" vertex="1" connectable="0">
<mxGeometry x="7.7419872652362365" y="8" width="192.50000000000003" height="45" as="geometry"/>
</mxCell>
<mxCell id="5984" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#60a917;fontColor=#ffffff;strokeColor=#2D7600;" parent="5983" vertex="1">
<mxGeometry y="2" width="192.50000000000003" height="45" as="geometry"/>
</mxCell>
<mxCell id="5985" value="&lt;div&gt;NVIDIA runtime&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="5983" vertex="1">
<mxGeometry x="11.998194444444442" y="13.01" width="168.50166666666664" height="18.99" as="geometry"/>
</mxCell>
<mxCell id="5986" value="&lt;font style=&quot;&quot;&gt;NVIDIA Platform&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="5981" vertex="1">
<mxGeometry x="40" y="63" width="130" height="20" as="geometry"/>
</mxCell>
<mxCell id="5987" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;strokeColor=#FFFFFF;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="315" y="45" as="sourcePoint"/>
<mxPoint x="315" y="70" as="targetPoint"/>
</mxGeometry>
</mxCell>
<mxCell id="5988" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#9C2A44;fontColor=#FFFFFF;strokeColor=#4c1523;strokeWidth=5;" parent="1" vertex="1">
<mxGeometry x="300" y="-17" width="260" height="60" as="geometry"/>
</mxCell>
<mxCell id="5989" value="&lt;font style=&quot;font-size: 14px;&quot; face=&quot;Helvetica&quot;&gt;HIP&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;strokeWidth=2;" parent="1" vertex="1">
<mxGeometry x="300" y="-17" width="260" height="20" as="geometry"/>
</mxCell>
<mxCell id="5990" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#333333;fontColor=#FFFFFF;strokeColor=none;" parent="1" vertex="1">
<mxGeometry x="350" y="60" width="210" height="90" as="geometry"/>
</mxCell>
<mxCell id="5991" value="" style="group;fillColor=#A50040;fontColor=#ffffff;strokeColor=none;" parent="1" vertex="1" connectable="0">
<mxGeometry x="360" y="70" width="192" height="45" as="geometry"/>
</mxCell>
<mxCell id="5992" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#962744;fontColor=#FFFFFF;strokeColor=none;" parent="5991" vertex="1">
<mxGeometry width="192.00000000000003" height="45" as="geometry"/>
</mxCell>
<mxCell id="5993" value="&lt;div&gt;AMD runtime&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#ffffff;" parent="5991" vertex="1">
<mxGeometry x="8.638736842105262" y="7.497" width="174.72" height="29.996999999999993" as="geometry"/>
</mxCell>
<mxCell id="5994" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;strokeColor=#FFFFFF;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" target="5992" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="456" y="44" as="sourcePoint"/>
<mxPoint x="470" y="70" as="targetPoint"/>
</mxGeometry>
</mxCell>
<mxCell id="5995" value="&lt;font face=&quot;Helvetica&quot;&gt;AMD Platform&lt;br&gt;&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="1" vertex="1">
<mxGeometry x="365" y="123" width="180" height="20" as="geometry"/>
</mxCell>
<mxCell id="6003" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;strokeColor=#FFFFFF;exitX=0.25;exitY=1;exitDx=0;exitDy=0;entryX=0.855;entryY=-0.018;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" target="6000" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="309.5" y="-104" as="sourcePoint"/>
<mxPoint x="309.55999999999995" y="-60.975106382978765" as="targetPoint"/>
</mxGeometry>
</mxCell>
<mxCell id="6004" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#9C2A44;fontColor=#FFFFFF;strokeColor=none;" parent="1" vertex="1">
<mxGeometry x="274.5" y="-150" width="140" height="45" as="geometry"/>
</mxCell>
<mxCell id="6005" value="&lt;font face=&quot;Helvetica&quot;&gt;hipLibrary&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#ffffff;dashed=1;strokeWidth=2;" parent="1" vertex="1">
<mxGeometry x="274.5" y="-150" width="140" height="45" as="geometry"/>
</mxCell>
<mxCell id="6007" value="" style="group;dashed=1;strokeWidth=2;strokeColor=none;" parent="1" vertex="1" connectable="0">
<mxGeometry x="360" y="-80" width="140" height="45" as="geometry"/>
</mxCell>
<mxCell id="6008" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#962744;fontColor=#FFFFFF;strokeColor=none;dashed=1;strokeWidth=2;" parent="6007" vertex="1">
<mxGeometry width="140.00000000000003" height="45" as="geometry"/>
</mxCell>
<mxCell id="6009" value="&lt;div&gt;rocLibrary&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="6007" vertex="1">
<mxGeometry x="6.299078947368418" y="7.497" width="127.39999999999998" height="29.996999999999993" as="geometry"/>
</mxCell>
<mxCell id="6010" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;strokeColor=#FFFFFF;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="386" y="-105" as="sourcePoint"/>
<mxPoint x="386" y="-80" as="targetPoint"/>
</mxGeometry>
</mxCell>
<mxCell id="5999" value="" style="group" parent="1" vertex="1" connectable="0">
<mxGeometry x="190" y="-80" width="140" height="45" as="geometry"/>
</mxCell>
<mxCell id="6000" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#60a917;fontColor=#ffffff;strokeColor=#2D7600;" parent="5999" vertex="1">
<mxGeometry y="-0.005106382978723234" width="140" height="45" as="geometry"/>
</mxCell>
<mxCell id="6001" value="&lt;div&gt;cuLibrary&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="5999" vertex="1">
<mxGeometry x="10.019288676236041" y="13.404255319148938" width="119.9667368421052" height="18.18191489361702" as="geometry"/>
</mxCell>
<mxCell id="6013" style="edgeStyle=none;html=1;strokeWidth=2;strokeColor=#FFFFFF;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" target="5984" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="234" y="-35" as="sourcePoint"/>
<mxPoint x="220" y="60" as="targetPoint"/>
</mxGeometry>
</mxCell>
<mxCell id="6014" style="edgeStyle=none;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;strokeWidth=2;strokeColor=#FFFFFF;" parent="1" source="6008" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="430" y="-19" as="targetPoint"/>
</mxGeometry>
</mxCell>
<mxCell id="6025" value="" style="group;strokeColor=none;dashed=1;strokeWidth=2;" parent="1" vertex="1" connectable="0">
<mxGeometry x="129.5" y="-290" width="430" height="100" as="geometry"/>
</mxCell>
<mxCell id="6023" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#333333;fontColor=#FFFFFF;strokeColor=none;spacing=0;" parent="6025" vertex="1">
<mxGeometry width="430" height="100" as="geometry"/>
</mxCell>
<mxCell id="6024" value="&lt;div&gt;Application Implementation&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="6025" vertex="1">
<mxGeometry x="97.50999999999999" y="43.93999999999998" width="234.99" height="12.120000000000001" as="geometry"/>
</mxCell>
<mxCell id="6026" style="edgeStyle=none;html=1;entryX=0.148;entryY=0.008;entryDx=0;entryDy=0;strokeWidth=2;entryPerimeter=0;strokeColor=#FFFFFF;" parent="1" target="5984" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="166" y="-190" as="sourcePoint"/>
<mxPoint x="159.99598908448831" y="-94.12" as="targetPoint"/>
</mxGeometry>
</mxCell>
<mxCell id="6027" style="edgeStyle=none;html=1;strokeWidth=2;entryX=0.5;entryY=0;entryDx=0;entryDy=0;strokeColor=#FFFFFF;" parent="1" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="260" y="-190" as="sourcePoint"/>
<mxPoint x="260" y="-80.00510638297874" as="targetPoint"/>
</mxGeometry>
</mxCell>
<mxCell id="6029" style="edgeStyle=none;html=1;entryX=0.5;entryY=0;entryDx=0;entryDy=0;strokeWidth=2;exitX=0.5;exitY=1;exitDx=0;exitDy=0;strokeColor=#FFFFFF;" parent="1" source="6023" target="6005" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="320" y="-190" as="sourcePoint"/>
</mxGeometry>
</mxCell>
<mxCell id="6030" style="edgeStyle=none;html=1;strokeWidth=2;strokeColor=#FFFFFF;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" target="6008" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="452" y="-80" as="targetPoint"/>
<mxPoint x="430" y="-190" as="sourcePoint"/>
</mxGeometry>
</mxCell>
<mxCell id="6031" style="edgeStyle=none;html=1;exitX=0.912;exitY=1.013;exitDx=0;exitDy=0;strokeWidth=2;exitPerimeter=0;strokeColor=#FFFFFF;" parent="1" source="6023" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="520" y="-19" as="targetPoint"/>
</mxGeometry>
</mxCell>
<mxCell id="6034" value="&lt;div&gt;Application&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="1" vertex="1">
<mxGeometry x="232.5" y="-310" width="234.99" height="12.120000000000001" as="geometry"/>
</mxCell>
<mxCell id="6035" value="runtime API" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4F1623;fontColor=#FFFFFF;strokeColor=none;" parent="1" vertex="1">
<mxGeometry x="310" y="3" width="115" height="30" as="geometry"/>
</mxCell>
<mxCell id="6036" value="kernel language" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4F1623;fontColor=#FFFFFF;strokeColor=none;" parent="1" vertex="1">
<mxGeometry x="437" y="3" width="115" height="30" as="geometry"/>
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>
تفاوت فایل به دلیل طولانی بودن یک یا چند خط حذف شد

پس از

عرض:  |  ارتفاع:  |  اندازه: 18 KiB

+242
مشاهده پرونده
@@ -0,0 +1,242 @@
.. meta::
:description: This page lists frequently asked questions about HIP
:keywords: AMD, ROCm, HIP, FAQ, frequently asked questions
*******************************************************************************
Frequently asked questions
*******************************************************************************
This topic provides answers to frequently asked questions from new HIP users and
users familiar with NVIDIA CUDA.
HIP Support
===========
What hardware does HIP support?
-------------------------------
HIP supports AMD and NVIDIA GPUs. See
:ref:`prerequisites of the install guide<install_prerequisites>` for detailed
information.
What operating systems does HIP support?
----------------------------------------
Linux as well as Windows are supported by ROCm. The exact versions are listed in
the system requirements for :ref:`rocm-install-on-linux:supported_distributions`
and :ref:`rocm-install-on-windows:supported-skus-win`.
.. note::
Not all HIP runtime API functions are yet supported on Windows.
A note is added to those functions' documentation in the
:ref:`HIP runtime API reference<runtime_api_reference>`.
What libraries does HIP provide?
--------------------------------
HIP provides key math and AI libraries. See :doc:`rocm:reference/api-libraries`
for the full list.
What NVIDIA CUDA features does HIP support?
-------------------------------------------
The :doc:`NVIDIA CUDA runtime API supported by HIP<hipify:tables/CUDA_Runtime_API_functions_supported_by_HIP>`
and :doc:`NVIDIA CUDA driver API supported by HIP<hipify:tables/CUDA_Driver_API_functions_supported_by_HIP>`
pages describe which NVIDIA CUDA APIs are supported and what the equivalents are.
The :doc:`HIP API documentation <doxygen/html/index>` describes each API and
its limitations, if any, compared with the equivalent CUDA API.
The kernel language features are documented in the
:doc:`/reference/cpp_language_extensions` page.
Relation to other GPGPU frameworks
==================================
Is HIP a drop-in replacement for CUDA?
--------------------------------------
The `HIPIFY <https://github.com/ROCm/HIPIFY>`_ tools can automatically convert
almost all CUDA runtime code to HIP. Most device code needs no additional
conversion because HIP and CUDA have the same signatures for math and built-in
functions except for the name. HIP code provides performance similar to native
CUDA code on NVIDIA platforms, plus the benefits of being compilable for AMD
platforms.
Additional porting might be required to deal with architecture feature
queries or CUDA capabilities that HIP doesn't support.
How does HIP compare with OpenCL?
---------------------------------
HIP offers several benefits over OpenCL:
* Device code can be written in modern C++, including templates, lambdas,
classes and so on.
* Host and device code can be mixed in the source files.
* The HIP API is less verbose than OpenCL and is familiar to CUDA developers.
* Porting from CUDA to HIP is significantly easier than from CUDA to OpenCL.
* HIP uses development tools specialized for each platform: :doc:`amdclang++ <llvm-project:index>`
for AMD GPUs or `nvcc <https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html>`_
for NVIDIA GPUs, and profilers like :doc:`ROCm Compute Profiler <rocprofiler-compute:index>` or
`Nsight Systems <https://developer.nvidia.com/nsight-systems>`_.
* HIP provides
* pointers and host-side pointer arithmetic.
* device-level control over memory allocation and placement.
* an offline compilation model.
How does porting CUDA to HIP compare to porting CUDA to OpenCL?
---------------------------------------------------------------
OpenCL differs from HIP and CUDA when considering the host runtime,
but even more so when considering the kernel code.
The HIP device code is a C++ dialect, while OpenCL is C99-based.
OpenCL does not support single-source compilation.
As a result, the OpenCL syntax differs significantly from HIP, and porting tools
must perform complex transformations, especially regarding templates or other
C++ features in kernels.
To better understand the syntax differences, see :doc:`here<reference/terms>` or
the :doc:`HIP porting guide <how-to/hip_porting_guide>`.
Can I install CUDA and ROCm on the same machine?
------------------------------------------------
Yes, but you require a compatible GPU to run the compiled code.
On NVIDIA platforms, can I mix HIP code with CUDA code?
-------------------------------------------------------
Yes. Most HIP types and data structures are ``typedef``\ s to CUDA equivalents and
can be used interchangeably. This can be useful for iteratively porting CUDA code.
See :doc:`how-to/hip_porting_guide` for more details.
Can a HIP binary run on both AMD and NVIDIA platforms?
------------------------------------------------------
HIP is a source-portable language that can be compiled to run on AMD or NVIDIA
platforms. However, the HIP tools don't create a "fat binary" that can run on
both platforms.
Compiler related questions
==========================
hipcc detected my platform incorrectly. What should I do?
---------------------------------------------------------
The environment variable ``HIP_PLATFORM`` can be used to specify the platform
for which the code is going to be compiled with ``hipcc``. See the
:doc:`hipcc environment variables<hipcc:env>` for more information.
.. warning::
If you specify ``HIP_PLATFORM=nvidia`` with ``hipcc``, you also need to pass ``-x cu``
to hipcc when compiling files with the ``.hip`` file extension. Otherwise,
nvcc will not recognize the ``.hip`` file extension and will fail with
``nvcc fatal : Don't know what to do with <file>.hip``.
How to use HIP-Clang to build HIP programs?
------------------------------------------------------
:doc:`hipcc <hipcc:index>` is a compiler driver. This means it is not a compiler
but calls the appropriate compilers and sets some options.
The underlying compilers are :doc:`amdclang++ <llvm-project:index>` or
`nvcc <https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html>`_,
depending on the platform, and can be called directly.
What is HIP-Clang?
------------------
HIP-Clang is a Clang/LLVM-based compiler used to compile HIP programs for AMD
platforms. The executable is named :doc:`amdclang++ <llvm-project:index>` on
Linux and ``clang++`` on Windows.
Can I link HIP device code with host code compiled with another compiler such as gcc, icc, or clang?
-----------------------------------------------------------------------------------------------------------
Yes. HIP generates object code that conforms to the GCC ABI, and links with libstdc++.
This means you can compile host code with the compiler of your choice and link the
generated host object code with device code.
Can HIP applications be compiled with a C compiler?
---------------------------------------------------
HIP is a C/C++ API that can be used with C compilers. However, this applies only
to the API itself. Device code and the syntax for calling kernels must be
compiled with a supported compiler like :doc:`hipcc <hipcc:index>`. The code
objects that are generated with ``hipcc`` can, however, be used with a C
compiler, as shown in the code examples below.
The following is the HIP device code, assumed to be saved in ``device.hip``:
.. code-block:: c++
#include <hip/hip_runtime.h>
// Writes each element's own index into ``array``. Threads whose global index
// is not below ``size`` leave the array untouched.
__global__ void kernel(double* array, size_t size){
    // Build the global index in size_t so the bounds check below compares two
    // unsigned values instead of mixing a signed int with a size_t.
    const size_t x = threadIdx.x + static_cast<size_t>(blockIdx.x) * blockDim.x;
    if(x < size){array[x] = static_cast<double>(x);}
}
// Launches ``kernel`` on the default stream with the given grid and block
// sizes and reports the launch status. Declared with C linkage, matching the
// plain-C forward declaration used by the host example.
extern "C" hipError_t callKernel(int blocks, int threadsPerBlock, double* array, size_t size){
    kernel<<<blocks, threadsPerBlock, 0, hipStreamDefault>>>(array, size);
    const hipError_t launchStatus = hipGetLastError();
    return launchStatus;
}
The following is the host code, written in C, saved in ``host.c``:
.. code-block:: c
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <stdlib.h>
// Checks the result of a HIP call. On failure, prints the HIP error string
// together with the file and line of the check, then exits with the error code.
// The argument is evaluated exactly once and stored, so a failing call is not
// re-executed while the error is being reported. The do/while(0) wrapper makes
// the macro behave as a single statement, including in unbraced if/else.
#define HIP_CHECK(c)                                                        \
    do {                                                                    \
        const hipError_t hip_check_status_ = (c);                           \
        if (hip_check_status_ != hipSuccess) {                              \
            printf("HIP Error : %s", hipGetErrorString(hip_check_status_)); \
            printf(" %s %d\n", __FILE__, __LINE__);                         \
            exit(hip_check_status_);                                        \
        }                                                                   \
    } while (0)
// Forward declaration - the implementation needs to be compiled with
// a device compiler like hipcc or amdclang++
hipError_t callKernel(int blocks, int threadsPerBlock, double* array, size_t size);
// Allocates a host buffer and a device buffer of blocks * threadsPerBlock
// doubles, runs the kernel through callKernel(), copies the device buffer back
// to the host, and releases both buffers.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;
    int blocks = 1024;
    int threadsPerBlock = 256;
    // Widen before multiplying so the element count is computed in size_t.
    size_t arraySize = (size_t)blocks * (size_t)threadsPerBlock;
    double* d_array;
    double* h_array;
    h_array = malloc(arraySize * sizeof *h_array);
    // hipMemcpy below writes through h_array, so stop if the allocation failed.
    if (h_array == NULL) {
        fprintf(stderr, "Failed to allocate %zu bytes of host memory\n", arraySize * sizeof *h_array);
        return EXIT_FAILURE;
    }
    HIP_CHECK(hipMalloc((void**)&d_array, arraySize * sizeof(double)));
    HIP_CHECK(callKernel(blocks, threadsPerBlock, d_array, arraySize));
    HIP_CHECK(hipMemcpy(h_array, d_array, arraySize * sizeof(double), hipMemcpyDeviceToHost));
    HIP_CHECK(hipFree(d_array));
    free(h_array);
    return 0;
}
These files are then compiled and linked using
.. code-block:: shell
hipcc -c device.hip
gcc host.c device.o $(hipconfig --cpp_config) -L/opt/rocm/lib -lamdhip64
assuming the default installation of ROCm in ``/opt/rocm``.
How to guard code specific to the host or the GPU?
--------------------------------------------------
The compiler defines the ``__HIP_DEVICE_COMPILE__`` macro only when compiling
device code.
Refer to the :doc:`how-to/hip_porting_guide` for more information.
@@ -2,12 +2,13 @@
:description: How to debug using HIP.
:keywords: AMD, ROCm, HIP, debugging, ltrace, ROCgdb, WinGDB
.. _debugging_with_hip:
*************************************************************************
Debugging with HIP
*************************************************************************
AMD debugging tools include *ltrace* and *ROCgdb*. External tools are available and can be found
online. For example, if you're using Windows, you can use *Microsoft Visual Studio* and *WinGDB*.
HIP debugging tools include `ltrace <https://ltrace.org/>`_ and :doc:`ROCgdb <rocgdb:index>`. External tools are available and can be found online. For example, if you're using Windows, you can use Microsoft Visual Studio and WinGDB.
You can trace and debug your code using the following tools and techniques.
@@ -272,110 +273,7 @@ HIP environment variable summary
Here are some of the more commonly used environment variables:
.. <!-- spellcheck-disable -->
.. # COMMENT: The following lines define a break for use in the table below.
.. |break| raw:: html
<br />
.. <!-- spellcheck-enable -->
.. list-table::
* - **Environment variable**
- **Default value**
- **Usage**
* - AMD_LOG_LEVEL
|break| Enable HIP log on different Level
- 0
- 0: Disable log.
|break| 1: Enable log on error level
|break| 2: Enable log on warning and below levels
|break| 0x3: Enable log on information and below levels
|break| 0x4: Decode and display AQL packets
* - AMD_LOG_MASK
|break| Enable HIP log on different Level
- 0x7FFFFFFF
- 0x1: Log API calls
|break| 0x02: Kernel and Copy Commands and Barriers
|break| 0x4: Synchronization and waiting for commands to finish
|break| 0x8: Enable log on information and below levels
|break| 0x20: Queue commands and queue contents
|break| 0x40: Signal creation, allocation, pool
|break| 0x80: Locks and thread-safety code
|break| 0x100: Copy debug
|break| 0x200: Detailed copy debug
|break| 0x400: Resource allocation, performance-impacting events
|break| 0x800: Initialization and shutdown
|break| 0x1000: Misc debug, not yet classified
|break| 0x2000: Show raw bytes of AQL packet
|break| 0x4000: Show code creation debug
|break| 0x8000: More detailed command info, including barrier commands
|break| 0x10000: Log message location
|break| 0xFFFFFFFF: Log always even mask flag is zero
* - HIP_LAUNCH_BLOCKING
|break| Used for serialization on kernel execution.
- 0
- 0: Disable. Kernel executes normally.
|break| 1: Enable. Serializes kernel enqueue, behaves the same as AMD_SERIALIZE_KERNEL.
* - HIP_VISIBLE_DEVICES (or CUDA_VISIBLE_DEVICES)
|break| Only devices whose index is present in the sequence are visible to HIP
-
- 0,1,2: Depending on the number of devices on the system
* - GPU_DUMP_CODE_OBJECT
|break| Dump code object
- 0
- 0: Disable
|break| 1: Enable
* - AMD_SERIALIZE_KERNEL
|break| Serialize kernel enqueue
- 0
- 1: Wait for completion before enqueue
|break| 2: Wait for completion after enqueue
|break| 3: Both
* - AMD_SERIALIZE_COPY
|break| Serialize copies
- 0
- 1: Wait for completion before enqueue
|break| 2: Wait for completion after enqueue
|break| 3: Both
* - HIP_HOST_COHERENT
|break| Coherent memory in hipHostMalloc
- 0
- 0: memory is not coherent between host and GPU
|break| 1: memory is coherent with host
* - AMD_DIRECT_DISPATCH
|break| Enable direct kernel dispatch (Currently for Linux; under development for Windows)
- 1
- 0: Disable
|break| 1: Enable
* - GPU_MAX_HW_QUEUES
|break| The maximum number of hardware queues allocated per device
- 4
- The variable controls how many independent hardware queues HIP runtime can create per process,
per device. If an application allocates more HIP streams than this number, then HIP runtime reuses
the same hardware queues for the new streams in a round-robin manner. Note that this maximum
number does not apply to hardware queues that are created for CU-masked HIP streams, or
cooperative queues for HIP Cooperative Groups (single queue per device).
* - DEBUG_HIP_7_PREVIEW
|break| Enable preview of upcoming runtime changes that break backward compatibility.
These changes might require updating existing application code to support the new behavior.
The new behavior will become default in a future major release and this environment
variable will no longer be needed.
- 0
- 0x1: Match the behavior of hipGetLastError with its corresponding CUDA API
.. include:: ../how-to/debugging_env.rst
General debugging tips
======================================================
@@ -0,0 +1,110 @@
.. list-table::
:header-rows: 1
:widths: 35,14,51
* - **Environment variable**
- **Default value**
- **Value**
* - | ``AMD_LOG_LEVEL``
| Enables HIP logging at various levels.
- ``0``
- | 0: Disable log.
| 1: Enables error logs.
| 2: Enables warning logs next to lower-level logs.
| 3: Enables information logs next to lower-level logs.
| 4: Enables debug logs next to lower-level logs.
| 5: Enables debug extra logs next to lower-level logs.
* - | ``AMD_LOG_LEVEL_FILE``
| Sets output file for ``AMD_LOG_LEVEL``.
- stderr output
-
* - | ``AMD_LOG_MASK``
| Specifies HIP log filters. Here is the `complete list of log masks <https://github.com/ROCm/clr/blob/develop/rocclr/utils/debug.hpp#L40>`_.
- ``0x7FFFFFFF``
- | 0x1: Log API calls.
| 0x2: Kernel and copy commands and barriers.
| 0x4: Synchronization and waiting for commands to finish.
| 0x8: Decode and display AQL packets.
| 0x10: Queue commands and queue contents.
| 0x20: Signal creation, allocation, pool.
| 0x40: Locks and thread-safety code.
| 0x80: Kernel creations and arguments, etc.
| 0x100: Copy debug.
| 0x200: Detailed copy debug.
| 0x400: Resource allocation, performance-impacting events.
| 0x800: Initialization and shutdown.
| 0x1000: Misc debug, not yet classified.
| 0x2000: Show raw bytes of AQL packet.
| 0x4000: Show code creation debug.
| 0x8000: More detailed command info, including barrier commands.
| 0x10000: Log message location.
| 0x20000: Memory allocation.
| 0x40000: Memory pool allocation, including memory in graphs.
| 0x80000: Timestamp details.
| 0xFFFFFFFF: Always log, even if the mask flag is zero.
* - | ``HIP_LAUNCH_BLOCKING``
| Used for serialization on kernel execution.
- ``0``
- | 0: Disable. Kernel executes normally.
| 1: Enable. Serializes kernel enqueue, behaves the same as ``AMD_SERIALIZE_KERNEL``.
* - | ``HIP_VISIBLE_DEVICES`` (or ``CUDA_VISIBLE_DEVICES``)
| Only devices whose index is present in the sequence are visible to HIP.
- Unset by default.
- 0,1,2: Depending on the number of devices on the system.
* - | ``GPU_DUMP_CODE_OBJECT``
| Dump code object.
- ``0``
- | 0: Disable
| 1: Enable
* - | ``AMD_SERIALIZE_KERNEL``
| Serialize kernel enqueue.
- ``0``
- | 0: Disable
| 1: Wait for completion before enqueue.
| 2: Wait for completion after enqueue.
| 3: Both
* - | ``AMD_SERIALIZE_COPY``
| Serialize copies.
- ``0``
- | 0: Disable
| 1: Wait for completion before enqueue.
| 2: Wait for completion after enqueue.
| 3: Both
* - | ``AMD_DIRECT_DISPATCH``
| Enable direct kernel dispatch (Currently for Linux; under development for Windows).
- ``1``
- | 0: Disable
| 1: Enable
* - | ``GPU_MAX_HW_QUEUES``
| The maximum number of hardware queues allocated per device.
- ``4``
- The variable controls how many independent hardware queues HIP runtime can create per process,
per device. If an application allocates more HIP streams than this number, then HIP runtime reuses
the same hardware queues for the new streams in a round-robin manner. Note that this maximum
number does not apply to hardware queues that are created for CU-masked HIP streams, or
cooperative queues for HIP Cooperative Groups (single queue per device).
* - | ``DEBUG_HIP_7_PREVIEW``
| Enable preview of upcoming
| runtime changes that break
| backward compatibility.
| These changes might require
| updating existing application
| code to support the new
| behavior. The new behavior
| will become default in a
| future major release and this
| environment variable will
| no longer be needed.
- ``0``
- 0x1: Match the behavior of hipGetLastError with its corresponding CUDA API
@@ -1,386 +0,0 @@
# Frequently asked questions
## What APIs and features does HIP support?
HIP provides the following:
* Devices (`hipSetDevice()`, `hipGetDeviceProperties()`, etc.)
* Memory management (`hipMalloc()`, `hipMemcpy()`, `hipFree()`, etc.)
* Streams (`hipStreamCreate()`, `hipStreamSynchronize()`, `hipStreamWaitEvent()`, etc.)
* Events (`hipEventRecord()`, `hipEventElapsedTime()`, etc.)
* Kernel launching (`hipLaunchKernel`/`hipLaunchKernelGGL` is the preferred way of launching kernels. `hipLaunchKernelGGL` is a standard C/C++ macro that can serve as an alternative way to launch kernels, replacing the CUDA triple-chevron (`<<< >>>`) syntax).
* HIP Module API to control when and how code is loaded.
* CUDA-style kernel coordinate functions (`threadIdx`, `blockIdx`, `blockDim`, `gridDim`)
* Cross-lane instructions including `shfl`, `ballot`, `any`, `all`
* Most device-side math built-ins
* Error reporting (`hipGetLastError()`, `hipGetErrorString()`)
The HIP API documentation describes each API and its limitations, if any, compared with the equivalent CUDA API.
## What is not supported?
### Runtime/Driver API features
At a high-level, the following features are not supported:
* Textures (partial support available)
* Dynamic parallelism (CUDA 5.0)
* Graphics interoperability with OpenGL or Direct3D
* CUDA IPC Functions (Under Development)
* CUDA array, `mipmappedArray` and pitched memory
* Queue priority controls
See the [API Support Table](https://github.com/ROCm/HIPIFY/blob/amd-staging/docs/tables/CUDA_Runtime_API_functions_supported_by_HIP.md) for more detailed information.
### Kernel language features
* C++-style device-side dynamic memory allocations (free, new, delete) (CUDA 4.0)
* Virtual functions, indirect functions and try/catch (CUDA 4.0)
* `__prof_trigger`
* PTX assembly (CUDA 4.0). HIP-Clang supports inline GCN assembly.
* Several kernel features are under development. See the {doc}`/reference/cpp_language_extensions` for more information.
## Is HIP a drop-in replacement for CUDA?
No. HIP provides porting tools which do most of the work to convert CUDA code into portable C++ code that uses the HIP APIs.
Most developers will port their code from CUDA to HIP and then maintain the HIP version.
HIP code provides the same performance as native CUDA code, plus the benefits of running on AMD platforms.
## What specific version of CUDA does HIP support?
HIP APIs and features do not map to a specific CUDA version. HIP provides a strong subset of the functionality provided in CUDA, and the hipify tools can scan code to identify any unsupported CUDA functions - this is useful for identifying the specific features required by a given application.
However, we can provide a rough summary of the features included in each CUDA SDK and the support level in HIP. Each bullet below lists the major new language features in each CUDA release and then indicates which are supported/not supported in HIP:
* CUDA 4.0 and earlier :
* HIP supports CUDA 4.0 except for the limitations described above.
* CUDA 5.0 :
* Dynamic Parallelism (not supported)
* `cuIpc` functions (under development).
* CUDA 6.0 :
* Managed memory (under development)
* CUDA 6.5 :
* `__shfl` intrinsic (supported)
* CUDA 7.0 :
* Per-thread default streams (supported)
* C++11 (Hip-Clang supports all of C++11, all of C++14 and some C++17 features)
* CUDA 7.5 :
* float16 (supported)
* CUDA 8.0 :
* Page Migration including `cudaMemAdvise`, `cudaMemPrefetch`, other `cudaMem*` APIs (not supported)
* CUDA 9.0 :
* Cooperative Launch, Surface Object Management, Version Management
## What libraries does HIP support?
HIP includes growing support for the four key math libraries using hipBLAS, hipFFT, hipRAND and hipSPARSE, as well as MIOpen for machine intelligence applications.
These offer pointer-based memory interfaces (as opposed to opaque buffers) and can be easily interfaced with other HIP applications.
The hip interfaces support both ROCm and CUDA paths, with familiar library interfaces.
* [hipBLAS](https://github.com/ROCmSoftwarePlatform/hipBLAS), which utilizes [rocBLAS](https://github.com/ROCmSoftwarePlatform/rocBLAS).
* [hipFFT](https://github.com/ROCmSoftwarePlatform/hipfft)
* [hipSPARSE](https://github.com/ROCmSoftwarePlatform/hipsparse)
* [hipRAND](https://github.com/ROCmSoftwarePlatform/hipRAND)
* [MIOpen](https://github.com/ROCmSoftwarePlatform/MIOpen)
Additionally, some of the cuBLAS routines are automatically converted to hipblas equivalents by the HIPIFY tools. These APIs use cuBLAS or hcBLAS depending on the platform and replace the need to use conditional compilation.
## How does HIP compare with OpenCL?
Both AMD and NVIDIA support OpenCL 1.2 on their devices so that developers can write portable code.
HIP offers several benefits over OpenCL:
* Developers can code in C++ as well as mix host and device C++ code in their source files. HIP C++ code can use templates, lambdas, classes and so on.
* The HIP API is less verbose than OpenCL and is familiar to CUDA developers.
* Because both CUDA and HIP are C++ languages, porting from CUDA to HIP is significantly easier than porting from CUDA to OpenCL.
* HIP uses the best available development tools on each platform: on NVIDIA GPUs, HIP code compiles using NVCC and can employ the Nsight profiler and debugger (unlike OpenCL on NVIDIA GPUs).
* HIP provides pointers and host-side pointer arithmetic.
* HIP provides device-level control over memory allocation and placement.
* HIP offers an offline compilation model.
## How does porting CUDA to HIP compare to porting CUDA to OpenCL?
Both HIP and CUDA are dialects of C++, and thus porting between them is relatively straightforward.
Both dialects support templates, classes, lambdas, and other C++ constructs.
As one example, the hipify-perl tool was originally a Perl script that used simple text conversions from CUDA to HIP.
HIP and CUDA provide similar math library calls as well. In summary, the HIP philosophy was to make the HIP language close enough to CUDA that the porting effort is relatively simple.
This reduces the potential for error, and also makes it easy to automate the translation. HIP's goal is to quickly get the ported program running on both platforms with little manual intervention, so that the programmer can focus on performance optimizations.
There have been several tools that have attempted to convert CUDA into OpenCL, such as CU2CL. OpenCL is a C99-based kernel language (rather than C++) and also does not support single-source compilation.
As a result, the OpenCL syntax is different from CUDA, and the porting tools have to perform some heroic transformations to bridge this gap.
The tools also struggle with more complex CUDA applications, in particular, those that use templates, classes, or other C++ features inside the kernel.
## What hardware does HIP support?
* For AMD platforms, see the [ROCm documentation](https://github.com/RadeonOpenCompute/ROCm#supported-gpus) for the list of supported platforms.
* For NVIDIA platforms, HIP requires unified memory and should run on any device supporting CUDA SDK 6.0 or newer. We have tested the NVIDIA Titan and Tesla K40.
## Do HIPIFY tools automatically convert all source code?
Typically, HIPIFY tools can automatically convert almost all run-time code.
Most device code needs no additional conversion since HIP and CUDA have similar names for math and built-in functions.
The hipify-clang tool will automatically modify the kernel signature as needed (automating a step that used to be done manually).
Additional porting may be required to deal with architecture feature queries or with CUDA capabilities that HIP doesn't support.
In general, developers should always expect to perform some platform-specific tuning and optimization.
## What is NVCC?
NVCC is NVIDIA's compiler driver for compiling "CUDA C++" code into PTX or device code for NVIDIA GPUs. It's a closed-source binary compiler that is provided by the CUDA SDK.
## What is HIP-Clang?
HIP-Clang is a Clang/LLVM-based compiler that compiles HIP programs to run on AMD platforms.
## Why use HIP rather than supporting CUDA directly?
While HIP is a strong subset of CUDA, it is a subset. The HIP layer allows that subset to be clearly defined and documented.
Developers who code to the HIP API can be assured their code will remain portable across NVIDIA and AMD platforms.
In addition, HIP defines portable mechanisms to query architectural features and supports a larger 64-bit `WaveSize` which expands the return type for cross-lane functions like ballot and shuffle from 32-bit integers to 64-bit integers.
## Can I develop HIP code on an NVIDIA CUDA platform?
Yes. HIP's CUDA path only exposes the APIs and functionality that work on both NVCC and AMDGPU back-ends.
"Extra" APIs, parameters, and features which exist in CUDA but not in HIP-Clang will typically result in compile-time or run-time errors.
Developers need to use the HIP API for most accelerator code and bracket any CUDA-specific code with preprocessor conditionals.
Developers concerned about portability should, of course, run on both platforms, and should expect to tune for performance.
In some cases, CUDA has a richer set of modes for some APIs, and some C++ capabilities such as virtual functions - see the HIP API documentation for more details.
## Can I develop HIP code on an AMD HIP-Clang platform?
Yes. HIP's HIP-Clang path only exposes the APIs and functions that work on AMD runtime back ends. "Extra" APIs, parameters and features that appear in HIP-Clang but not CUDA will typically cause compile- or run-time errors. Developers must use the HIP API for most accelerator code and bracket any HIP-Clang specific code with preprocessor conditionals. Those concerned about portability should, of course, test their code on both platforms and should tune it for performance. Typically, HIP-Clang supports a more modern set of C++11/C++14/C++17 features, so HIP developers who want portability should be careful when using advanced C++ features on the HIP-Clang path.
## How to use HIP-Clang to build HIP programs?
The following environment variable can be used to set the compiler path:
* HIP_CLANG_PATH: path to hip-clang. When set, this variable lets hipcc use hip-clang for compilation/linking.
There is an alternative environment variable to set the compiler path:
* HIP_ROCCLR_HOME: path to the root directory of the HIP-ROCclr runtime. When set, this variable lets hipcc use hip-clang from the ROCclr distribution.
NOTE: If HIP_ROCCLR_HOME is set, there is no need to set HIP_CLANG_PATH since hipcc will deduce it from HIP_ROCCLR_HOME.
## What is AMD clr?
AMD [Compute Language Runtime (CLR)](https://github.com/ROCm/clr) is a repository for the AMD platform, which contains the source code for AMD's compute language runtimes, as follows:
* hipamd - contains implementation of HIP language for AMD GPU.
* rocclr - contains virtual device interfaces through which compute runtimes interact with backends, such as ROCr on Linux and PAL on Windows.
* opencl - contains implementation of OpenCL™ on the AMD platform.
## What is hipother?
A new repository ['hipother'](https://github.com/ROCm/hipother) is added in the ROCm 6.1 release, which is branched out from HIP.
hipother supports the HIP back-end implementation on some non-AMD platforms, like NVIDIA.
## Can I get HIP open source repository for Windows?
No, there is no HIP repository open publicly on Windows.
## Can a HIP binary run on both AMD and NVIDIA platforms?
HIP is a source-portable language that can be compiled to run on either AMD or NVIDIA platform. HIP tools don't create a "fat binary" that can run on either platform, however.
## On HIP-Clang, can I link HIP code with host code compiled with another compiler such as gcc, icc, or clang?
Yes. HIP generates the object code which conforms to the GCC ABI, and also links with libstdc++. This means you can compile host code with the compiler of your choice and link the generated object code
with GPU code compiled with HIP. Larger projects often contain a mixture of accelerator code (initially written in CUDA with NVCC) and host code (compiled with gcc, icc, or clang). These projects
can convert the accelerator code to HIP, compile that code with hipcc, and link with object code from their preferred compiler.
## Can HIP API support C style application? What is the difference between C and C++?
HIP is a C++ runtime API that supports C style applications as well.
Some C style applications (and interfaces to other languages (FORTRAN, Python)) would call certain HIP APIs but not use kernel programming.
They can be compiled with a C compiler and run correctly, however, small details must be considered in the code. For example, initialization, as shown in the simple application below, uses HIP structs dim3 with the file name "test.hip.cpp"
```cpp
#include "hip/hip_runtime_api.h"
#include "stdio.h"
int main(int argc, char** argv) {
dim3 grid1;
printf("dim3 grid1; x=%d, y=%d, z=%d\n",grid1.x,grid1.y,grid1.z);
dim3 grid2 = {1,1,1};
printf("dim3 grid2 = {1,1,1}; x=%d, y=%d, z=%d\n",grid2.x,grid2.y,grid2.z);
return 0;
}
```
When using a C++ compiler,
```shell
$ gcc -x c++ $(hipconfig --cpp_config) test.hip.cpp -o test
$ ./test
dim3 grid1; x=1, y=1, z=1
dim3 grid2 = {1,1,1}; x=1, y=1, z=1
```
In which "dim3 grid1;" will yield a dim3 grid with all dimensional members x,y,z initialized to 1, as the default constructor behaves that way.
Further, if written:
```cpp
dim3 grid(2); // yields {2,1,1}
dim3 grid(2,3); // yields {2,3,1}
```
In comparison, when using the C compiler,
```shell
$ gcc -x c $(hipconfig --cpp_config) test.hip.cpp -o test
$ ./test
dim3 grid1; x=646881376, y=21975, z=1517277280
dim3 grid2 = {1,1,1}; x=1, y=1, z=1
```
In which "dim3 grid;" does not imply any initialization, no constructor is called, and dimensional values x,y,z of grid are undefined.
NOTE: To get the C++ default behavior, C programmers must additionally specify the right-hand side as shown below,
```cpp
dim3 grid = {1,1,1}; // initialized as in C++
```
## Can I install both CUDA SDK and HIP-Clang on the same machine?
Yes. You can use HIP_PLATFORM to choose which path hipcc targets. This configuration can be useful when using HIP to develop an application which is portable to both AMD and NVIDIA.
## HIP detected my platform (HIP-Clang vs NVCC) incorrectly - what should I do?
HIP will set the platform to AMD and use HIP-Clang as compiler if it sees that the AMD graphics driver is installed and has detected an AMD GPU.
Sometimes this isn't what you want - you can force HIP to recognize the platform by setting the following,
```shell
export HIP_PLATFORM=amd
```
HIP then sets and uses the correct AMD compiler and runtime,
HIP_COMPILER=clang
HIP_RUNTIME=rocclr
To choose NVIDIA platform, you can set,
```shell
export HIP_PLATFORM=nvidia
```
In this case, HIP will set and use the following,
```shell
HIP_COMPILER=nvcc
HIP_RUNTIME=cuda
```
One symptom of this problem is the message "error: 'unknown error'(11) at square.hipref.cpp:56". This can occur if you have a CUDA installation on an AMD platform, and HIP incorrectly detects the platform as NVCC. HIP may be able to compile the application using the NVCC tool-chain but will generate this error at runtime since the platform does not have a CUDA device.
## On CUDA, can I mix CUDA code with HIP code?
Yes. Most HIP data structures (`hipStream_t`, `hipEvent_t`) are typedefs to CUDA equivalents and can be intermixed. Both CUDA and HIP use integer device ids.
One notable exception is that `hipError_t` is a new type, and cannot be used where a `cudaError_t` is expected. In these cases, refactor the code to remove the expectation. Alternatively, hip_runtime_api.h defines functions which convert between the error code spaces:
`hipErrorToCudaError`
`hipCUDAErrorTohipError`
`hipCUResultTohipError`
If platform portability is important, use `#ifdef __HIP_PLATFORM_NVIDIA__` to guard the CUDA-specific code.
## How do I trace HIP application flow?
See {doc}`/how-to/logging` for more information.
## What are the maximum limits of kernel launch parameters?
The product of block.x, block.y, and block.z must not exceed 1024.
Please note, HIP does not support kernel launch with total work items defined in dimension with size `gridDim x blockDim >= 2^32`, so `gridDim.x * blockDim.x, gridDim.y * blockDim.y and gridDim.z * blockDim.z` are always less than 2^32.
## Are ``__shfl_*_sync`` functions supported on HIP platform?
``__shfl_*_sync`` is not supported on HIP, but on the NVCC path with CUDA 9.0 and above, all shuffle calls get redirected to their sync versions.
## How to create a guard for code that is specific to the host or the GPU?
The compiler defines the `__HIP_DEVICE_COMPILE__` macro only when compiling the code for the GPU. It could be used to guard code that is specific to the host or the GPU.
## Why is `_OPENMP` undefined when compiling with `-fopenmp`?
When compiling an OpenMP source file with `hipcc -fopenmp`, the compiler may generate an error if there is a reference to the `_OPENMP` macro. This is due to a limitation in hipcc that treats any source file type (for example `.cpp`) as a HIP translation unit leading to some conflicts with the OpenMP language switch. If the OpenMP source file doesn't contain any HIP language constructs you could work around this issue by adding the `-x c++` switch to force the compiler to treat the file as regular C++. Another approach would be to guard the OpenMP code with `#ifdef _OPENMP` so that the code block is disabled when compiling for the GPU. The `__HIP_DEVICE_COMPILE__` macro defined by the HIP compiler when compiling GPU code could also be used for guarding code paths specific to the host or the GPU.
## Does the HIP-Clang compiler support extern shared declarations?
Previously, it was essential to declare dynamic shared memory using the HIP_DYNAMIC_SHARED macro for accuracy, as using static shared memory in the same kernel could result in overlapping memory ranges and data-races.
Now, the HIP-Clang compiler provides support for extern shared declarations, and the HIP_DYNAMIC_SHARED option is no longer required. You may use the standard extern definition:
extern __shared__ type var[];
## I have multiple HIP enabled devices and I am getting an error code `hipErrorSharedObjectInitFailed` with the message "Error: shared object initialization failed"?
This error message appears because you do not have a valid code object for all of your devices.
If you have compiled the application yourself, make sure you have given the correct device name(s) and its features via: `--offload-arch`. If you are not mentioning the `--offload-arch`, make sure that `hipcc` is using the correct offload arch by verifying the hipcc output generated by setting the environment variable `HIPCC_VERBOSE=1`.
If you have a precompiled application/library (like rocBLAS, TensorFlow, etc.) which gives you such an error, there are two possibilities.
* The application/library does not ship code object bundles for __all__ of your device(s): in this case you need to recompile the application/library yourself with correct `--offload-arch`.
* The application/library does not ship code object bundles for __some__ of your device(s), for example you have a system with an APU + GPU and the library does not ship code objects for your APU. For this you can set the environment variable `HIP_VISIBLE_DEVICES` or `CUDA_VISIBLE_DEVICES` on NVIDIA platform, to only enable GPUs for which code object is available. This will limit the GPUs visible to your application and allow it to run.
Note: In previous releases, the error code is `hipErrorNoBinaryForGpu` with message "Unable to find code object for all current devices".
The error code handling behavior is changed. HIP runtime shows the error code `hipErrorSharedObjectInitFailed` with message "Error: shared object initialization failed" on unsupported GPU.
## How to use per-thread default stream in HIP?
The per-thread default stream is an implicit stream local to both the thread and the current device. It does not do any implicit synchronization with other streams (like explicitly created streams), or default per-thread stream on other threads.
The per-thread default stream is a blocking stream and will synchronize with the default null stream if both are used in a program.
In ROCm, a compilation option should be added in order to compile the translation unit with per-thread default stream enabled.
`-fgpu-default-stream=per-thread`.
Once source is compiled with per-thread default stream enabled, all APIs will be executed on per thread default stream, hence there will not be any implicit synchronization with other streams.
Besides, the per-thread default stream can be enabled per translation unit: users can compile some files with the feature enabled and some with the feature disabled. A translation unit with the feature enabled will use the per-thread default stream and there will not be any implicit synchronization, but other modules will use the legacy default stream, which does implicit synchronization.
## How to use complex multiplication and division operations?
In HIP, `hipFloatComplex` and `hipDoubleComplex` are defined as complex data types,
```c++
typedef float2 hipFloatComplex;
typedef double2 hipDoubleComplex;
```
Any application that uses complex multiplication and division operations needs to replace the '*' and '/' operators with the following,
* `hipCmulf()` and `hipCdivf()` for `hipFloatComplex`
* `hipCmul()` and `hipCdiv()` for `hipDoubleComplex`
Note: These complex operations are equivalent to corresponding types/functions on the NVIDIA platform.
## Can I develop applications with HIP APIs on Windows the same as on Linux?
Yes, HIP APIs are available to use on both Linux and Windows.
Due to different working mechanisms on operating systems like Windows vs Linux, HIP APIs call corresponding lower level backend runtime libraries and kernel drivers for the OS, in order to control the executions on GPU hardware accordingly. There might be a few differences on the related backend software and driver support, which might affect usage of HIP APIs. See OS support details in HIP API document.
## Does HIP support LUID?
Starting with ROCm 6.0, the HIP runtime supports Locally Unique Identifier (LUID).
This feature enables the local physical device(s) to interoperate with other devices. For example, DirectX 12.
HIP runtime sets device LUID properties so the driver can query LUID to identify each device for interoperability.
Note: HIP supports LUID only on Windows OS.
## How can I know the version of HIP?
HIP version definition has been updated since ROCm 4.2 release as the following:
```cpp
HIP_VERSION=HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH
```
HIP version can be queried from HIP API call,
```cpp
hipRuntimeGetVersion(&runtimeVersion);
```
The version returned will always be greater than the versions in previous ROCm releases.
Note: The version definition of HIP runtime is different from CUDA. On AMD platform, the function returns HIP runtime version, while on NVIDIA platform, it returns CUDA runtime version. And there is no mapping/correlation between HIP version and CUDA version.
@@ -8,14 +8,26 @@
Porting CUDA driver API
*******************************************************************************
NVIDIA provides separate CUDA driver and runtime APIs. The two APIs have significant overlap in functionality:
NVIDIA provides separate CUDA driver and runtime APIs. The two APIs have
significant overlap in functionality:
* Both APIs support events, streams, memory management, memory copy, and error
handling.
* Both APIs support events, streams, memory management, memory copy, and error handling.
* Both APIs deliver similar performance.
* Driver API calls begin with the prefix ``cu``, while runtime API calls begin with the prefix ``cuda``. For example, the driver API contains ``cuEventCreate``, while the runtime API contains ``cudaEventCreate``, which has similar functionality.
* The driver API defines a different, but largely overlapping, error code space than the runtime API and uses a different coding convention. For example, the driver API defines ``CUDA_ERROR_INVALID_VALUE``, while the runtime API defines ``cudaErrorInvalidValue``.
The driver API offers two additional functionalities not provided by the runtime API: ``cuModule`` and ``cuCtx`` APIs.
* Driver API calls begin with the prefix ``cu``, while runtime API calls begin
with the prefix ``cuda``. For example, the driver API contains
``cuEventCreate``, while the runtime API contains ``cudaEventCreate``, which
has similar functionality.
* The driver API defines a different, but largely overlapping, error code space
than the runtime API and uses a different coding convention. For example, the
driver API defines ``CUDA_ERROR_INVALID_VALUE``, while the runtime API defines
``cudaErrorInvalidValue``.
The driver API offers two additional functionalities not provided by the runtime
API: ``cuModule`` and ``cuCtx`` APIs.
cuModule API
================================================================================
@@ -345,7 +357,7 @@ The sample below shows how to use ``hipModuleGetFunction``.
HIP module and texture Driver API
================================================================================
HIP supports texture driver APIs. However, texture references must be declared
HIP supports texture driver APIs. However, texture references must be declared
within the host scope. The following code demonstrates the use of texture
references for the ``__HIP_PLATFORM_AMD__`` platform.
@@ -111,10 +111,10 @@ Most CUDA libraries have a corresponding ROCm library with similar functionality
All HIP projects target either AMD or NVIDIA platform. The platform affects which headers are included and which libraries are used for linking.
* `HIP_PLATFORM_AMD` is defined if the HIP platform targets AMD.
Note, `HIP_PLATFORM_HCC` was previously defined if the HIP platform targeted AMD, it is deprecated.
* `HIP_PLATFORM_NVDIA` is defined if the HIP platform targets NVIDIA.
Note, `HIP_PLATFORM_NVCC` was previously defined if the HIP platform targeted NVIDIA, it is deprecated.
* `__HIP_PLATFORM_AMD__` is defined if the HIP platform targets AMD.
Note, `__HIP_PLATFORM_HCC__` was previously defined if the HIP platform targeted AMD, it is deprecated.
* `__HIP_PLATFORM_NVIDIA__` is defined if the HIP platform targets NVIDIA.
Note, `__HIP_PLATFORM_NVCC__` was previously defined if the HIP platform targeted NVIDIA, it is deprecated.
### Identifying the Compiler: hip-clang or NVCC
@@ -257,7 +257,14 @@ ROCclr is a virtual device interface that HIP runtimes interact with different b
* NVIDIA platform
On NVIDIA platform, HIP is just a thin layer on top of CUDA.
On non-AMD platform, HIP runtime determines if CUDA is available and can be used. If available, HIP_PLATFORM is set to `nvidia` and underneath CUDA path is used.
The environment variable `HIP_PLATFORM` specifies the runtime to use. The
platform is detected automatically by HIP. When an AMD graphics driver and an
AMD GPU are detected, `HIP_PLATFORM` is set to `amd`. If both runtimes are
installed, and a specific one should be used, or HIP can't detect the runtime,
setting the environment variable manually tells `hipcc` what compilation path to
choose. To use the CUDA compilation path, set the environment variable to
`HIP_PLATFORM=nvidia`.
## `hipLaunchKernelGGL`
@@ -0,0 +1,48 @@
.. meta::
:description: HIP runtime API usage
:keywords: AMD, ROCm, HIP, CUDA, HIP runtime API How to
.. _hip_runtime_api_how-to:
********************************************************************************
Using HIP runtime API
********************************************************************************
The HIP runtime API provides C and C++ functionalities to manage event, stream,
and memory on GPUs. On the AMD platform, the HIP runtime uses
:doc:`Compute Language Runtime (CLR) <../understand/amd_clr>`, while on NVIDIA
CUDA platform, it is only a thin layer over the CUDA runtime or Driver API.
- **CLR** contains source code for AMD's compute language runtimes: ``HIP`` and
``OpenCL™``. CLR includes the ``HIP`` implementation on the AMD
platform: `hipamd <https://github.com/ROCm/clr/tree/develop/hipamd>`_ and the
ROCm Compute Language Runtime (``rocclr``). ``rocclr`` is a
virtual device interface that enables the HIP runtime to interact with
different backends such as :doc:`ROCr <rocr-runtime:index>` on Linux or PAL on
Windows. CLR also includes the `OpenCL runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_
implementation.
- The **CUDA runtime** is built on top of the CUDA driver API, which is a C API
with lower-level access to NVIDIA GPUs. For details about the CUDA driver and
runtime API with reference to HIP, see :doc:`CUDA driver API porting guide <../how-to/hip_porting_driver_api>`.
The backends of HIP runtime API under AMD and NVIDIA platform are summarized in
the following figure:
.. figure:: ../data/how-to/hip_runtime_api/runtimes.svg
.. note::
On NVIDIA platform HIP runtime API calls CUDA runtime or CUDA driver via
hipother interface. For more information, see the `hipother repository <https://github.com/ROCm/hipother>`_.
Here are the various HIP Runtime API high level functions:
* :doc:`./hip_runtime_api/initialization`
* :doc:`./hip_runtime_api/memory_management`
* :doc:`./hip_runtime_api/error_handling`
* :doc:`./hip_runtime_api/cooperative_groups`
* :doc:`./hip_runtime_api/hipgraph`
* :doc:`./hip_runtime_api/call_stack`
* :doc:`./hip_runtime_api/multi_device`
* :doc:`./hip_runtime_api/opengl_interop`
* :doc:`./hip_runtime_api/external_interop`
@@ -0,0 +1,129 @@
.. meta::
:description: This page describes call stack concept in HIP
:keywords: AMD, ROCm, HIP, call stack
*******************************************************************************
Call stack
*******************************************************************************
The call stack is a data structure for managing function calls, by saving the
state of the current function. Each time a function is called, a new call frame
is added to the top of the stack, containing information such as local
variables, return addresses and function parameters. When the function
execution completes, the frame is removed from the stack and loaded back into
the corresponding registers. This concept allows the program to return to the
calling function and continue execution from where it left off.
The call stack for each thread must track its function calls, local variables,
and return addresses. However, in GPU programming, the memory required to store
the call stack increases due to the parallelism inherent to the GPUs. NVIDIA
and AMD GPUs use different approaches. NVIDIA GPUs have the independent thread
scheduling feature where each thread has its own call stack and effective
program counter. On AMD GPUs threads are grouped; each warp has its own call
stack and program counter. Warps are described and explained in the
:ref:`inherent_thread_hierarchy`.
If a thread or warp exceeds its stack size, a stack overflow occurs, causing
kernel failure. This can be detected using debuggers.
Call stack management with HIP
===============================================================================
You can adjust the call stack size as shown in the following example, allowing
fine-tuning based on specific kernel requirements. This helps prevent stack
overflow errors by ensuring sufficient stack memory is allocated.
.. code-block:: cpp
#include <hip/hip_runtime.h>
#include <iostream>
#define HIP_CHECK(expression) \
{ \
const hipError_t status = expression; \
if(status != hipSuccess){ \
std::cerr << "HIP error " \
<< status << ": " \
<< hipGetErrorString(status) \
<< " at " << __FILE__ << ":" \
<< __LINE__ << std::endl; \
} \
}
// Prints the current device stack size limit, sets it to 8 KiB and prints the
// limit again after the change.
int main()
{
    // Zero-initialized so that a defined value is printed even if the query
    // fails (HIP_CHECK reports the error and lets execution continue).
    size_t stackSize = 0;
    HIP_CHECK(hipDeviceGetLimit(&stackSize, hipLimitStackSize));
    std::cout << "Default stack size: " << stackSize << " bytes" << std::endl;

    // Set a new stack size
    const size_t newStackSize = 1024 * 8; // 8 KiB
    HIP_CHECK(hipDeviceSetLimit(hipLimitStackSize, newStackSize));

    // Query the limit again instead of assuming that the request was applied.
    HIP_CHECK(hipDeviceGetLimit(&stackSize, hipLimitStackSize));
    std::cout << "Updated stack size: " << stackSize << " bytes" << std::endl;

    return 0;
}
Depending on the GPU model, at full occupancy, the call stack can consume a significant
amount of memory. For instance, an MI300X with 304 compute units (CU) and up to
2048 threads per CU could use 304 · 2048 · 1024 bytes = 608 MiB for the call
stack by default.
Handling recursion and deep function calls
-------------------------------------------------------------------------------
Similar to CPU programming, recursive functions and deeply nested function
calls are supported. However, developers must ensure that these functions do
not exceed the available stack memory, considering the huge amount of memory
needed for the call stack due to the GPU's inherent parallelism. This can be
achieved by increasing stack size or optimizing code to reduce stack usage. To
detect stack overflow, add proper error handling or use debugging tools.
.. code-block:: cpp
#include <hip/hip_runtime.h>
#include <iostream>
// Evaluates a HIP runtime call exactly once and, if it did not return
// hipSuccess, prints the error code, its description and the source location
// to std::cerr. Execution continues after the report.
// The body is wrapped in do { ... } while (0) so that the macro expands to a
// single statement and stays correct inside unbraced if/else branches.
#define HIP_CHECK(expression)                        \
    do {                                             \
        const hipError_t status = (expression);      \
        if (status != hipSuccess) {                  \
            std::cerr << "HIP error "                \
                      << status << ": "              \
                      << hipGetErrorString(status)   \
                      << " at " << __FILE__ << ":"   \
                      << __LINE__ << std::endl;      \
        }                                            \
    } while (0)
// Returns the n-th Fibonacci number, with fibonacci(0) == 0 and
// fibonacci(1) == 1. The implementation is intentionally recursive: the
// sample exists to demonstrate nested function calls on the device.
__device__ unsigned long long fibonacci(unsigned long long n)
{
    // Base cases: n is unsigned, so n < 2 covers exactly n == 0 and n == 1.
    if (n < 2)
    {
        return n;
    }

    const unsigned long long previous = fibonacci(n - 1);
    const unsigned long long beforePrevious = fibonacci(n - 2);
    return previous + beforePrevious;
}
// Computes fibonacci(n) in every thread of the launch. Only the thread with
// global index 0 prints, so the result appears once in the output.
__global__ void kernel(unsigned long long n)
{
    unsigned long long result = fibonacci(n);
    const size_t x = threadIdx.x + blockDim.x * blockIdx.x;

    if (x == 0)
    {
        // The value is a Fibonacci number, so label it as one rather than
        // with the factorial notation "n!".
        printf("fibonacci(%llu) = %llu\n", n, result);
    }
}
int main()
{
kernel<<<1, 1>>>(10);
HIP_CHECK(hipDeviceSynchronize());
// With -O0 optimization option hit the stack limit
// kernel<<<1, 256>>>(2048);
// HIP_CHECK(hipDeviceSynchronize());
return 0;
}
@@ -8,9 +8,16 @@
Cooperative groups
*******************************************************************************
Cooperative groups API is an extension to the HIP programming model, which provides developers with a flexible, dynamic grouping mechanism for the communicating threads. Cooperative groups let you define your own set of thread groups which may fit your user-cases better than those defined by the hardware. This lets you specify the level of granularity for thread communication which can lead to more efficient parallel decompositions.
The cooperative groups API is an extension to the HIP programming model, which
provides developers with a flexible, dynamic grouping mechanism for the
communicating threads. Cooperative groups let you define your own set of thread
groups which may fit your use-cases better than those defined by the hardware.
This lets you specify the level of granularity for thread communication which
can lead to more efficient parallel decompositions.
The API is accessible in the ``cooperative_groups`` namespace after the ``hip_cooperative_groups.h`` is included. The header contains the following elements:
The API is accessible in the ``cooperative_groups`` namespace after the
``hip_cooperative_groups.h`` header is included. The header contains the following
elements:
* Static functions to create groups and subgroups.
* Hardware-accelerated operations over the whole group, like shuffles.
@@ -19,13 +26,13 @@ The API is accessible in the ``cooperative_groups`` namespace after the ``hip_c
* Get group properties member functions.
Cooperative groups thread model
===============================
================================================================================
The thread hierarchy abstraction of cooperative groups are in :ref:`grid hierarchy <coop_thread_top_hierarchy>` and :ref:`block hierarchy <coop_thread_bottom_hierarchy>`.
The thread hierarchy abstractions of cooperative groups are depicted in the following figures: :ref:`grid hierarchy <coop_thread_top_hierarchy>` and :ref:`block hierarchy <coop_thread_bottom_hierarchy>`.
.. _coop_thread_top_hierarchy:
.. figure:: ../data/how-to/cooperative_groups/thread_hierarchy_coop_top.svg
.. figure:: ../../data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_top.svg
:alt: Diagram depicting nested rectangles of varying color. The outermost one
titled "Grid", inside sets of different sized rectangles layered on
one another titled "Block". Each "Block" containing sets of uniform
@@ -34,11 +41,16 @@ The thread hierarchy abstraction of cooperative groups are in :ref:`grid hierarc
Cooperative group thread hierarchy in grids.
The **multi grid** is an abstraction of potentially multiple simultaneous launches of the same kernel over multiple devices (Deprecated since 5.0). The **grid** in cooperative groups is a single dispatch of kernels for execution like the original grid.
The **multi grid** is an abstraction of potentially multiple simultaneous
launches of the same kernel over multiple devices. The **grid** in cooperative
groups is a single dispatch of kernels for execution like the original grid.
.. note::
The ability to synchronize over a grid or multi grid requires the kernel to be launched using the specific cooperative groups API.
* The ability to synchronize over a grid or multi grid requires the kernel to
be launched using the specific cooperative groups API.
* Multi grid has been deprecated since ROCm 5.0.
The **block** is the same as the :ref:`inherent_thread_model` block entity.
@@ -48,7 +60,7 @@ The **block** is the same as the :ref:`inherent_thread_model` block entity.
.. _coop_thread_bottom_hierarchy:
.. figure:: ../data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.svg
.. figure:: ../../data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_bottom.svg
:alt: The new level between block thread and threads.
Cooperative group thread hierarchy in blocks.
@@ -156,7 +168,7 @@ Threads (64 threads on CDNA and 32 threads on RDNA) in a warp cannot execute dif
.. note::
The NVIDIA GPU's independent thread scheduling presents the appearance that threads on different branches execute concurrently.
The NVIDIA GPU's independent thread scheduling presents the appearance that threads on different branches execute concurrently.
.. warning::
@@ -378,8 +390,8 @@ With each group type, the synchronization requires using the correct cooperative
dim3(threads_per_block),
0,
hipStreamDefault,
&d_vector,
&d_block_reduced,
&d_vector,
&d_block_reduced,
&d_partition_reduced));
.. tab-item:: Grid
@@ -0,0 +1,136 @@
.. meta::
:description: Error Handling
:keywords: AMD, ROCm, HIP, error handling, error
.. _error_handling:
********************************************************************************
Error handling
********************************************************************************
HIP provides functionality to detect, report, and manage errors that occur
during the execution of HIP runtime functions or when launching kernels. Every
HIP runtime function, apart from launching kernels, has :cpp:type:`hipError_t`
as return type. :cpp:func:`hipGetLastError` and :cpp:func:`hipPeekAtLastError`
can be used for catching errors from kernel launches, as kernel launches don't
return an error directly. HIP maintains an internal state, that includes the
last error code. :cpp:func:`hipGetLastError` returns and resets that error to
``hipSuccess``, while :cpp:func:`hipPeekAtLastError` just returns the error
without changing it. To get a human readable version of the errors,
:cpp:func:`hipGetErrorString` and :cpp:func:`hipGetErrorName` can be used.
.. note::
:cpp:func:`hipGetLastError` returns the returned error code of the last HIP
runtime API call even if it's ``hipSuccess``, while ``cudaGetLastError``
returns the error returned by any of the preceding CUDA APIs in the same
host thread. :cpp:func:`hipGetLastError` behavior will be matched with
``cudaGetLastError`` in ROCm release 7.0.
Best practices of HIP error handling:
1. Check errors after each API call - Avoid error propagation.
2. Use macros for error checking - Check :ref:`hip_check_macros`.
3. Handle errors gracefully - Free resources and provide meaningful error
messages to the user.
For more details on the error handling functions, see :ref:`error handling
functions reference page <error_handling_reference>`.
.. _hip_check_macros:
HIP check macros
================================================================================
HIP uses check macros to simplify error checking and reduce code duplication.
The ``HIP_CHECK`` macro is mainly used to detect and report errors. It can
also exit the application with an ``exit(1);`` function call after printing
the error. An example of the ``HIP_CHECK`` macro:
.. code-block:: cpp
// Evaluates a HIP runtime call exactly once and, if it did not return
// hipSuccess, prints the error code, its description and the source location
// to std::cerr. Execution continues after the report.
// The body is wrapped in do { ... } while (0) so that the macro expands to a
// single statement and stays correct inside unbraced if/else branches.
#define HIP_CHECK(expression)                        \
    do {                                             \
        const hipError_t status = (expression);      \
        if (status != hipSuccess) {                  \
            std::cerr << "HIP error "                \
                      << status << ": "              \
                      << hipGetErrorString(status)   \
                      << " at " << __FILE__ << ":"   \
                      << __LINE__ << std::endl;      \
        }                                            \
    } while (0)
Complete example
================================================================================
A complete example that demonstrates error handling with a simple kernel that
adds two values:
.. code-block:: cpp
#include <hip/hip_runtime.h>
#include <algorithm>
#include <iostream>
#include <vector>
// Evaluates a HIP runtime call exactly once and, if it did not return
// hipSuccess, prints the error code, its description and the source location
// to std::cerr. Execution continues after the report.
// The body is wrapped in do { ... } while (0) so that the macro expands to a
// single statement and stays correct inside unbraced if/else branches.
#define HIP_CHECK(expression)                        \
    do {                                             \
        const hipError_t status = (expression);      \
        if (status != hipSuccess) {                  \
            std::cerr << "HIP error "                \
                      << status << ": "              \
                      << hipGetErrorString(status)   \
                      << " at " << __FILE__ << ":"   \
                      << __LINE__ << std::endl;      \
        }                                            \
    } while (0)
// Element-wise addition of two arrays: c[i] = a[i] + b[i] for every i < size.
// Each thread processes the element at its global index; threads whose index
// is not below size do nothing.
__global__ void add(int *a, int *b, int *c, size_t size) {
    const size_t index = threadIdx.x + blockDim.x * blockIdx.x;
    if(index < size) {
        // Plain assignment, not +=: c is an output buffer that is never
        // initialized by the host, so it must not be read before it is written.
        c[index] = a[index] + b[index];
    }
}
// Adds two host arrays on the GPU and prints the first element of the result.
// Every HIP runtime call, the kernel launch and the kernel execution are
// checked with HIP_CHECK.
int main() {
    // numOfBlocks * threadsPerBlock equals arraySize: one thread per element.
    constexpr int numOfBlocks = 256;
    constexpr int threadsPerBlock = 256;
    constexpr size_t arraySize = 1U << 16;

    std::vector<int> a(arraySize), b(arraySize), c(arraySize);
    // Initialized to nullptr so the pointers hold a defined value even when an
    // allocation fails.
    int *d_a = nullptr;
    int *d_b = nullptr;
    int *d_c = nullptr;

    // Setup input values.
    std::fill(a.begin(), a.end(), 1);
    std::fill(b.begin(), b.end(), 2);

    // Allocate device copies of a, b and c.
    HIP_CHECK(hipMalloc(&d_a, arraySize * sizeof(*d_a)));
    HIP_CHECK(hipMalloc(&d_b, arraySize * sizeof(*d_b)));
    HIP_CHECK(hipMalloc(&d_c, arraySize * sizeof(*d_c)));

    // Copy input values to device. data() yields the address of the vector's
    // element storage; &a would be the address of the std::vector object.
    HIP_CHECK(hipMemcpy(d_a, a.data(), arraySize * sizeof(*d_a), hipMemcpyHostToDevice));
    HIP_CHECK(hipMemcpy(d_b, b.data(), arraySize * sizeof(*d_b), hipMemcpyHostToDevice));

    // Launch add() kernel on GPU.
    hipLaunchKernelGGL(add, dim3(numOfBlocks), dim3(threadsPerBlock), 0, 0, d_a, d_b, d_c, arraySize);
    // Check the kernel launch
    HIP_CHECK(hipGetLastError());
    // Check for kernel execution error
    HIP_CHECK(hipDeviceSynchronize());

    // Copy the result back to the host, into the vector's element storage.
    HIP_CHECK(hipMemcpy(c.data(), d_c, arraySize * sizeof(*d_c), hipMemcpyDeviceToHost));

    // Cleanup allocated memory.
    HIP_CHECK(hipFree(d_a));
    HIP_CHECK(hipFree(d_b));
    HIP_CHECK(hipFree(d_c));

    // Print the result.
    std::cout << a[0] << " + " << b[0] << " = " << c[0] << std::endl;

    return 0;
}
@@ -0,0 +1,140 @@
.. meta::
:description: HIP provides an external resource interoperability API that
allows efficient data sharing between HIP's computing power and
OpenGL's graphics rendering.
:keywords: AMD, ROCm, HIP, external, interop, interoperability
*******************************************************************************
External resource interoperability
*******************************************************************************
This feature allows HIP to work with resources -- like memory and semaphores --
created by other APIs. This means resources can be used from APIs like CUDA,
OpenCL and Vulkan within HIP, making it easier to integrate HIP into existing
projects.
To use external resources in HIP, you typically follow these steps:
- Import resources from other APIs using HIP provided functions
- Use external resources as if they were created in HIP
- Destroy the HIP resource object to clean up
Semaphore Functions
===============================================================================
Semaphore functions are essential for synchronization in parallel computing.
These functions facilitate communication and coordination between different
parts of a program or between different programs. By managing semaphores, tasks
are executed in the correct order, and resources are utilized effectively.
Semaphore functions ensure smooth operation by preventing conflicts and
upholding the integrity and performance of concurrent processes.
External semaphore functions can be used in HIP as described in :ref:`external_resource_interoperability_reference`.
Memory Functions
===============================================================================
HIP external memory functions focus on the efficient sharing and management of
memory resources. These functions enable importing memory created by external
systems, enabling the HIP program to use this memory seamlessly. Memory
functions include mapping memory for effective use and ensuring proper cleanup
to prevent resource leaks. This is critical for performance, particularly in
applications handling large datasets or complex structures such as textures in
graphics. Proper memory management ensures stability and efficient resource
utilization.
Example
===============================================================================
ROCm examples include a
`HIP--Vulkan interoperation example <https://github.com/ROCm/rocm-examples/tree/develop/HIP-Basic/vulkan_interop>`_
that demonstrates how to perform interoperation between HIP and Vulkan.
In this example, a simple HIP kernel is used to compute a sine wave, which is
then rendered to a window as a graphical output using Vulkan. The process
requires several initialization steps, such as setting up a HIP context,
creating a Vulkan instance, and configuring the GPU device and queue. After
these initial steps, the kernel executes the sine wave computation, and Vulkan
continuously updates the window framebuffer to display the computed data until
the window is closed.
The following code converts a Vulkan memory handle to its equivalent HIP
handle. The input ``VkDeviceMemory`` and the created HIP memory represent the
same physical area of GPU memory, through the handles of each respective API.
Writing to the buffer in one API will allow us to read the results through the
other. Note that access to the buffer should be synchronized between the APIs,
for example using queue syncs or semaphores.
.. <!-- spellcheck-disable -->
.. literalinclude:: ../../tools/example_codes/external_interop.hip
:start-after: // [Sphinx vulkan memory to hip start]
:end-before: // [Sphinx vulkan memory to hip end]
:language: cpp
.. <!-- spellcheck-enable -->
The Vulkan semaphore is converted to a HIP semaphore, as shown in the following
example. Signaling on the semaphore in one API will allow the other API to wait
on it, which is how we can guarantee synchronized access to resources in a
cross-API manner.
.. <!-- spellcheck-disable -->
.. literalinclude:: ../../tools/example_codes/external_interop.hip
:start-after: // [Sphinx semaphore import start]
:end-before: // [Sphinx semaphore import end]
:language: cpp
.. <!-- spellcheck-enable -->
When the HIP external memory is exported from Vulkan and imported to HIP, it is
not yet ready for use. The Vulkan handle is shared, allowing for memory sharing
rather than copying during the export process. To actually use the memory, we
need to map it to a pointer so that we may pass it to the kernel so that it can
be read from and written to. The external memory is mapped to HIP in the
following example:
.. <!-- spellcheck-disable -->
.. literalinclude:: ../../tools/example_codes/external_interop.hip
:start-after: // [Sphinx map external memory start]
:end-before: // [Sphinx map external memory end]
:language: cpp
.. <!-- spellcheck-enable -->
Wait until the buffer is ready and not under modification on the Vulkan side:
.. <!-- spellcheck-disable -->
.. literalinclude:: ../../tools/example_codes/external_interop.hip
:start-after: // [Sphinx wait semaphore start]
:end-before: // [Sphinx wait semaphore end]
:language: cpp
.. <!-- spellcheck-enable -->
The sinewave kernel implementation:
.. <!-- spellcheck-disable -->
.. literalinclude:: ../../tools/example_codes/external_interop.hip
:start-after: [Sphinx sinewave kernel start]
:end-before: // [Sphinx sinewave kernel end]
:language: cpp
.. <!-- spellcheck-enable -->
Signal to Vulkan that we are done with the buffer and that it can proceed with
rendering:
.. <!-- spellcheck-disable -->
.. literalinclude:: ../../tools/example_codes/external_interop.hip
:start-after: // [Sphinx signal semaphore start]
:end-before: // [Sphinx signal semaphore end]
:language: cpp
.. <!-- spellcheck-enable -->
@@ -12,7 +12,7 @@ HIP graphs
The HIP graph API is currently in Beta. Some features can change and might
have outstanding issues. Not all features supported by CUDA graphs are yet
supported. For a list of all currently supported functions see the
:doc:`HIP graph API documentation<../doxygen/html/group___graph>`.
:ref:`HIP graph API documentation<graph_management_reference>`.
HIP graphs are an alternative way of executing tasks on a GPU that can provide
performance benefits over launching kernels using the standard
@@ -35,7 +35,7 @@ The nodes can be one of the following:
The following figure visualizes the concept of graphs, compared to using streams.
.. figure:: ../data/how-to/hipgraph/hip_graph.svg
.. figure:: ../../data/how-to/hip_runtime_api/hipgraph/hip_graph.svg
:alt: Diagram depicting the difference between using streams to execute
kernels with dependencies, resolved by explicitly synchronizing,
or using graphs, where the edges denote the dependencies.
@@ -56,7 +56,7 @@ HIP runtime takes care of executing the operations within the graph.
Graphs can provide additional performance benefits, by enabling optimizations
that are only possible when knowing the dependencies between the operations.
.. figure:: ../data/how-to/hipgraph/hip_graph_speedup.svg
.. figure:: ../../data/how-to/hip_runtime_api/hipgraph/hip_graph_speedup.svg
:alt: Diagram depicting the speed up achievable with HIP graphs compared to
HIP streams when launching many short-running kernels.
@@ -316,11 +316,11 @@ edges of the graph, thereby forming the graph structure.
The nodes are represented by the generic :cpp:type:`hipGraphNode_t` type. The actual
node type is implicitly defined by the specific function used to add the node to
the graph, for example :cpp:func:`hipGraphAddKernelNode` See the
:doc:`HIP graph API documentation<../doxygen/html/group___graph>` for the
:ref:`HIP graph API documentation<graph_management_reference>` for the
available functions, they are of type ``hipGraphAdd{Type}Node``. Each type of
node also has a predefined set of parameters depending on the operation, for
example :cpp:class:`hipKernelNodeParams` for a kernel launch. See the
:doc:`documentation for the general hipGraphNodeParams type<../doxygen/html/structhip_graph_node_params>`
:doc:`documentation for the general hipGraphNodeParams type<../../doxygen/html/structhip_graph_node_params>`
for a list of available parameter types and their members.
The general flow for explicitly creating a graph is usually:
@@ -0,0 +1,107 @@
.. meta::
:description: Initialization.
:keywords: AMD, ROCm, HIP, initialization
.. _initialization:
********************************************************************************
Initialization
********************************************************************************
The initialization involves setting up the environment and resources needed for
using GPUs. The following steps are covered with the initialization:
- Setting up the HIP runtime
This includes reading the environment variables set during init, setting up
the active or visible devices, loading necessary libraries, setting up
internal buffers for memory copies or cooperative launches, initializing the
compiler as well as the HSA runtime, and checking for errors due to lack of
resources or no active devices.
- Querying and setting GPUs
Identifying and querying the available GPU devices on the system.
- Setting up contexts
Creating contexts for each GPU device, which are essential for managing
resources and executing kernels. For further details, check the :ref:`context
section <context_driver_api>`.
Initialize the HIP runtime
================================================================================
The HIP runtime is initialized automatically when the first HIP API call is
made. However, you can explicitly initialize it using :cpp:func:`hipInit`,
to be able to control the timing of the initialization. The manual
initialization can be useful to ensure that the GPU is initialized and
ready, or to isolate GPU initialization time from other parts of
your program.
.. note::
You can use :cpp:func:`hipDeviceReset` to delete all streams created, memory
allocated, kernels running and events created by the current process. Any new
HIP API call initializes the HIP runtime again.
Querying and setting GPUs
================================================================================
If multiple GPUs are available in the system, you can query and select the
desired GPU(s) to use based on device properties, such as size of global memory,
size of shared memory per block, support of cooperative launch and support of
managed memory.
Querying GPUs
--------------------------------------------------------------------------------
The properties of a GPU can be queried using :cpp:func:`hipGetDeviceProperties`,
which returns a struct of :cpp:struct:`hipDeviceProp_t`. The properties in the
struct can be used to identify a device or give an overview of hardware
characteristics, that might make one GPU better suited for the task than others.
The :cpp:func:`hipGetDeviceCount` function returns the number of available GPUs,
which can be used to loop over the available GPUs.
Example code of querying GPUs:
.. code-block:: cpp
#include <hip/hip_runtime.h>
#include <iostream>
// Prints the index and name of every GPU reported by hipGetDeviceCount.
// Devices whose properties cannot be queried are skipped silently.
int main() {
    // Initialized so the variable holds a defined value even if the query fails.
    int deviceCount = 0;
    if (hipGetDeviceCount(&deviceCount) == hipSuccess){
        for (int i = 0; i < deviceCount; ++i){
            hipDeviceProp_t prop;
            if (hipGetDeviceProperties(&prop, i) == hipSuccess){
                // Separate label, index and name so the output reads
                // "Device 0: <name>" instead of running together.
                std::cout << "Device " << i << ": " << prop.name << std::endl;
            }
        }
    }

    return 0;
}
Setting the GPU
--------------------------------------------------------------------------------
The :cpp:func:`hipSetDevice` function selects the GPU to be used for subsequent HIP
operations. This function performs several key tasks:
- Context Binding
Binds the current thread to the context of the specified GPU device. This
ensures that all subsequent operations are executed on the selected device.
- Resource Allocation
Prepares the device for resource allocation, such as memory allocation and
stream creation.
- Check device availability
Checks for errors in device selection and returns an error if the specified
device is not available or not capable of executing HIP operations.
@@ -0,0 +1,52 @@
.. meta::
:description: Memory management and its usage
:keywords: AMD, ROCm, HIP, CUDA, memory management
.. _memory_management:
********************************************************************************
Memory management
********************************************************************************
Memory management is an important part of the HIP runtime API, when creating
high-performance applications. Both allocating and copying memory can result in
bottlenecks, which can significantly impact performance.
The programming model is based on a system with a host and a device, each having
its own distinct memory. Kernels operate on :ref:`device_memory`, while host functions
operate on :ref:`host_memory`.
The runtime offers functions for allocating, freeing, and copying device memory,
along with transferring data between host and device memory.
Here are the various memory management techniques:
* :ref:`coherence_control`
* :ref:`unified_memory`
* :ref:`virtual_memory`
* :ref:`stream_ordered_memory_allocator_how-to`
Memory allocation
================================================================================
The API calls and the resulting allocations are listed here:
.. list-table:: Memory allocation
:header-rows: 1
:align: center
* - API
- Data location
- Allocation
* - System allocated
- Host
- :ref:`Pageable <pageable_host_memory>`
* - :cpp:func:`hipMallocManaged`
- Host
- :ref:`Managed <unified_memory>`
* - :cpp:func:`hipHostMalloc`
- Host
- :ref:`Pinned <pinned_host_memory>`
* - :cpp:func:`hipMalloc`
- Device
- Pinned
@@ -0,0 +1,178 @@
.. meta::
:description: This chapter describes the coherence control of the HIP ecosystem
ROCm software.
:keywords: AMD, ROCm, HIP, host memory
.. _coherence_control:
*******************************************************************************
Coherence control
*******************************************************************************
Memory coherence describes how memory of a specific part of the system is
visible to the other parts of the system. For example, how GPU memory is visible
to the CPU and vice versa. In HIP, host and device memory can be allocated with
two different types of coherence:
* **Coarse-grained coherence:** The memory is considered up-to-date only after
synchronization performed using :cpp:func:`hipDeviceSynchronize`,
:cpp:func:`hipStreamSynchronize`, or any blocking operation that acts on the
null stream such as :cpp:func:`hipMemcpy`. To prevent the cache from being
accessed by one part of the system while it is simultaneously being written by
another, the memory is made visible only after the caches have been flushed.
* **Fine-grained coherence:** The memory is coherent even while being modified
by a part of the system. Fine-grained coherence ensures that up-to-date data
is visible to others regardless of kernel boundaries. This can be useful if
both host and device operate on the same data.
.. note::
To achieve fine-grained coherence, many AMD GPUs use a limited cache policy,
such as leaving these allocations uncached by the GPU or making them read-only.
MI200 accelerators' hardware-based floating-point instructions work on
coarse-grained memory regions. Coarse-grained coherence is typically useful in
reducing host-device interconnect communication.
To check the availability of fine- and coarse-grained memory pools, use
``rocminfo``:
.. code-block:: sh
$ rocminfo
...
*******
Agent 1
*******
Name: AMD EPYC 7742 64-Core Processor
...
Pool Info:
Pool 1
Segment: GLOBAL; FLAGS: FINE GRAINED
...
Pool 3
Segment: GLOBAL; FLAGS: COARSE GRAINED
...
*******
Agent 9
*******
Name: gfx90a
...
Pool Info:
Pool 1
Segment: GLOBAL; FLAGS: COARSE GRAINED
...
The APIs, flags and respective memory coherence control are listed in the
following table:
.. list-table:: Memory coherence control
:widths: 25, 35, 20, 20
:header-rows: 1
:align: center
* - API
- Flag
- :cpp:func:`hipMemAdvise` call with argument
- Coherence
* - ``hipHostMalloc`` :sup:`1`
- ``hipHostMallocDefault``
-
- Fine-grained
* - ``hipHostMalloc`` :sup:`1`
- ``hipHostMallocNonCoherent``
-
- Coarse-grained
* - ``hipExtMallocWithFlags``
- ``hipDeviceMallocDefault``
-
- Coarse-grained
* - ``hipExtMallocWithFlags``
- ``hipDeviceMallocFinegrained``
-
- Fine-grained
* - ``hipMallocManaged``
-
-
- Fine-grained
* - ``hipMallocManaged``
-
- ``hipMemAdviseSetCoarseGrain``
- Coarse-grained
* - ``malloc``
-
-
- Fine-grained
* - ``malloc``
-
- ``hipMemAdviseSetCoarseGrain``
- Coarse-grained
:sup:`1` The :cpp:func:`hipHostMalloc` memory allocation coherence mode can be
affected by the ``HIP_HOST_COHERENT`` environment variable, if the
``hipHostMallocCoherent``, ``hipHostMallocNonCoherent``, and
``hipHostMallocMapped`` are unset. If neither these flags nor the
``HIP_HOST_COHERENT`` environment variable is set, or set as 0, the host memory
allocation is coarse-grained.
.. note::
* When ``hipHostMallocMapped`` flag is set, the allocated host memory is
fine-grained and the ``hipHostMallocNonCoherent`` flag is ignored.
* Setting both the ``hipHostMallocCoherent`` and
``hipHostMallocNonCoherent`` flags leads to an illegal state.
Visibility of synchronization functions
================================================================================
The fine-grained coherence memory is visible at the synchronization points,
however the visibility of coarse-grained memory depends on the synchronization
function used. The effect and visibility of various synchronization functions on
fine- and coarse-grained memory types are listed here:
.. list-table:: HIP synchronize functions effect and visibility
* - HIP API
- :cpp:func:`hipStreamSynchronize`
- :cpp:func:`hipDeviceSynchronize`
- :cpp:func:`hipEventSynchronize`
- :cpp:func:`hipStreamWaitEvent`
* - Synchronization effect
- Host waits for all commands in the specified stream to complete
- Host waits for all commands in all streams on the specified device to complete
- Host waits for the specified event to complete
- Stream waits for the specified event to complete
* - Fence
- System-scope release
- System-scope release
- System-scope release
- None
* - Fine-grained host memory visibility
- Yes
- Yes
- Yes
- Yes
* - Coarse-grained host memory visibility
- Yes
- Yes
- Depends on the used event.
- No
You can control the release scope for ``hipEvents``. By default, the GPU
performs a device-scope acquire and release operation with each recorded event.
This makes the host and device memory visible to other commands executing on the
same device.
:cpp:func:`hipEventCreateWithFlags`: You can specify a stronger system-level
fence by creating the event with ``hipEventCreateWithFlags``:
* ``hipEventReleaseToSystem``: Performs a system-scope release operation when
the event is recorded. This makes both fine-grained and coarse-grained host
memory visible to other agents in the system, which might also involve
heavyweight operations such as cache flushing. Fine-grained memory typically
uses lighter-weight in-kernel synchronization mechanisms such as an atomic
operation and thus doesn't need to use ``hipEventReleaseToSystem``.
* ``hipEventDisableTiming``: Events created with this flag don't record
profiling data, which significantly improves synchronization performance.
@@ -0,0 +1,52 @@
.. meta::
:description: This chapter describes the device memory of the HIP ecosystem
ROCm software.
:keywords: AMD, ROCm, HIP, device memory
.. _device_memory:
*******************************************************************************
Device memory
*******************************************************************************
Device memory exists on the device, e.g. on GPUs in the video random access
memory (VRAM), and is accessible by the kernels operating on the device. Recent
architectures use graphics double data rate (GDDR) synchronous dynamic
random-access memory (SDRAM) such as GDDR6, or high-bandwidth memory (HBM) such
as HBM2e. Device memory can be allocated as global memory, constant, texture or
surface memory.
Global memory
================================================================================
Read-write storage visible to all threads on a given device. There are
specialized versions of global memory with different usage semantics which are
typically backed by the same hardware, but can use different caching paths.
Constant memory
================================================================================
Read-only storage visible to all threads on a given device. It is a limited
segment backed by device memory with queryable size. It needs to be set by the
host before kernel execution. Constant memory provides the best performance
benefit when all threads within a warp access the same address.
Texture memory
================================================================================
Read-only storage visible to all threads on a given device and accessible
through additional APIs. It originates from graphics APIs and provides
performance benefits when accessing memory in a pattern where the
addresses are close to each other in a 2D representation of the memory.
The :ref:`texture management module <texture_management_reference>` of the HIP
runtime API reference contains the functions of texture memory.
Surface memory
================================================================================
A read-write version of texture memory, which can be useful for applications
that require direct manipulation of 1D, 2D, or 3D hipArray_t.
The :ref:`surface objects module <surface_object_reference>` of HIP runtime API
contains the functions for creating, destroying and reading surface memory.
@@ -3,11 +3,13 @@
ROCm software.
:keywords: AMD, ROCm, HIP, Texture, Texture Fetching
.. _texture_fetching:
*******************************************************************************
Texture fetching
*******************************************************************************
`Textures <../doxygen/html/group___texture.html>`_ are more than just a buffer
`Textures <../../../../doxygen/html/group___texture.html>`_ are more than just a buffer
interpreted as a 1D, 2D, or 3D array.
As textures are associated with graphics, they are indexed using floating-point
@@ -32,7 +34,7 @@ sections.
Here is the sample texture used in this document for demonstration purposes. It
is 2x2 texels and indexed in the [0 to 1] range.
.. figure:: ../data/understand/textures/original.png
.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/original.png
:width: 150
:alt: Sample texture
:align: center
@@ -66,7 +68,7 @@ The following image shows a texture stretched to a 4x4 pixel quad but still
indexed in the [0 to 1] range. The in-between values are the same as the values
of the nearest texel.
.. figure:: ../data/understand/textures/nearest.png
.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/nearest.png
:width: 300
:alt: Texture upscaled with nearest point sampling
:align: center
@@ -97,7 +99,7 @@ This following image shows a texture stretched out to a 4x4 pixel quad, but
still indexed in the [0 to 1] range. The in-between values are interpolated
between the neighboring texels.
.. figure:: ../data/understand/textures/linear.png
.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/linear.png
:width: 300
:alt: Texture upscaled with linear filtering
:align: center
@@ -124,7 +126,7 @@ bounds. The border value must be set before texture fetching.
The following image shows the texture on a 4x4 pixel quad, indexed in the
[0 to 3] range. The out-of-bounds values are the border color, which is yellow.
.. figure:: ../data/understand/textures/border.png
.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/border.png
:width: 300
:alt: Texture with yellow border color
:align: center
@@ -147,7 +149,7 @@ The following image shows the texture on a 4x4 pixel quad, indexed in the
[0 to 3] range. The out-of-bounds values are repeating the values at the edge of
the texture.
.. figure:: ../data/understand/textures/clamp.png
.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/clamp.png
:width: 300
:alt: Texture with clamp addressing
:align: center
@@ -172,7 +174,7 @@ This creates a repeating image effect.
The following image shows the texture on a 4x4 pixel quad, indexed in the
[0 to 3] range. The out-of-bounds values are repeating the original texture.
.. figure:: ../data/understand/textures/wrap.png
.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/wrap.png
:width: 300
:alt: Texture with wrap addressing
:align: center
@@ -201,7 +203,7 @@ The following image shows the texture on a 4x4 pixel quad, indexed in The
[0 to 3] range. The out-of-bounds values are repeating the original texture, but
mirrored.
.. figure:: ../data/understand/textures/mirror.png
.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/mirror.png
:width: 300
:alt: Texture with mirror addressing
:align: center
@@ -0,0 +1,239 @@
.. meta::
:description: Host memory of the HIP ecosystem
:keywords: AMD, ROCm, HIP, host memory
.. _host_memory:
********************************************************************************
Host memory
********************************************************************************
Host memory is the "normal" memory residing in the host RAM and allocated by C
or C++. Host memory can be allocated in two different ways:
* Pageable memory
* Pinned memory
The following figure explains how data is transferred in pageable and pinned
memory.
.. figure:: ../../../data/how-to/hip_runtime_api/memory_management/pageable_pinned.svg
The pageable and pinned memory allow you to exercise direct control over
memory operations, which is known as explicit memory management. When using the
unified memory, you get a simplified memory model with less control over
low level memory operations.
The difference in memory transfers between explicit and unified memory
management is highlighted in the following figure:
.. figure:: ../../../data/how-to/hip_runtime_api/memory_management/unified_memory/um.svg
For more details on unified memory management, see :doc:`/how-to/hip_runtime_api/memory_management/unified_memory`.
.. _pageable_host_memory:
Pageable memory
================================================================================
Pageable memory exists on memory blocks known as "pages" that can be migrated to
other memory storage. For example, pages can be migrated between CPU sockets on
a motherboard, or dumped into the swap partition of the hard drive when the
system's RAM runs out of space.
Pageable memory is usually allocated with a call to ``malloc`` or ``new`` in a
C++ application.
**Example:** Using pageable host memory in HIP
.. code-block:: cpp
// Example: using pageable host memory (allocated with new[]) together with
// device memory. The input is copied to the device, a kernel would process it,
// and the result is copied back into the pageable host buffer.
#include <hip/hip_runtime.h>
#include <cstring>
#include <iostream>

// Evaluates a HIP call and reports a failure together with its error code,
// error string and source location. The macro only reports the error;
// execution continues afterwards.
#define HIP_CHECK(expression)                  \
{                                              \
    const hipError_t status = expression;      \
    if(status != hipSuccess){                  \
        std::cerr << "HIP error "              \
                  << status << ": "            \
                  << hipGetErrorString(status) \
                  << " at " << __FILE__ << ":" \
                  << __LINE__ << std::endl;    \
    }                                          \
}

int main()
{
    const int element_number = 100;

    int *host_input, *host_output;
    // Host allocation (pageable memory)
    host_input = new int[element_number];
    host_output = new int[element_number];

    // Host data preparation
    for (int i = 0; i < element_number; i++) {
        host_input[i] = i;
    }
    std::memset(host_output, 0, element_number * sizeof(int));

    int *device_input, *device_output;

    // Device allocation
    HIP_CHECK(hipMalloc((int **)&device_input, element_number * sizeof(int)));
    HIP_CHECK(hipMalloc((int **)&device_output, element_number * sizeof(int)));

    // Device data preparation
    HIP_CHECK(hipMemcpy(device_input, host_input, element_number * sizeof(int), hipMemcpyHostToDevice));
    HIP_CHECK(hipMemset(device_output, 0, element_number * sizeof(int)));

    // Run the kernel
    // ...

    // Copy the result from the device back to the host
    HIP_CHECK(hipMemcpy(host_output, device_output, element_number * sizeof(int), hipMemcpyDeviceToHost));

    // Free host memory
    delete[] host_input;
    delete[] host_output;

    // Free device memory
    HIP_CHECK(hipFree(device_input));
    HIP_CHECK(hipFree(device_output));
}
.. note::
:cpp:func:`hipMalloc` and :cpp:func:`hipFree` are blocking calls. However, HIP
also provides non-blocking versions :cpp:func:`hipMallocAsync` and
:cpp:func:`hipFreeAsync`, which require a stream as an additional argument.
.. _pinned_host_memory:
Pinned memory
================================================================================
Pinned memory or page-locked memory is stored in pages that are locked in
specific sectors in RAM and can't be migrated. The pointer can be used on both
host and device. Accessing host-resident pinned memory in device kernels is
generally not recommended for performance, as it can force the data to traverse
the host-device interconnect such as PCIe, which is much slower than the
on-device bandwidth.
The advantage of pinned memory is the improved transfer time between host and
device. For transfer operations, such as :cpp:func:`hipMemcpy` or :cpp:func:`hipMemcpyAsync`,
using pinned memory instead of pageable memory on the host can lead to a three times
improvement in bandwidth.
The disadvantage of pinned memory is the reduced availability of RAM for other
processes, which can negatively impact the overall performance of the host.
**Example:** Using pinned memory in HIP
.. code-block:: cpp
// Example: using pinned (page-locked) host memory allocated with
// hipHostMalloc() together with device memory. The input is copied to the
// device, a kernel would process it, and the result is copied back into the
// pinned host buffer.
#include <hip/hip_runtime.h>
#include <cstring>
#include <iostream>

// Evaluates a HIP call and reports a failure together with its error code,
// error string and source location. The macro only reports the error;
// execution continues afterwards.
#define HIP_CHECK(expression)                  \
{                                              \
    const hipError_t status = expression;      \
    if(status != hipSuccess){                  \
        std::cerr << "HIP error "              \
                  << status << ": "            \
                  << hipGetErrorString(status) \
                  << " at " << __FILE__ << ":" \
                  << __LINE__ << std::endl;    \
    }                                          \
}

int main()
{
    const int element_number = 100;

    int *host_input, *host_output;
    // Host allocation (pinned memory)
    HIP_CHECK(hipHostMalloc((int **)&host_input, element_number * sizeof(int)));
    HIP_CHECK(hipHostMalloc((int **)&host_output, element_number * sizeof(int)));

    // Host data preparation
    for (int i = 0; i < element_number; i++) {
        host_input[i] = i;
    }
    std::memset(host_output, 0, element_number * sizeof(int));

    int *device_input, *device_output;

    // Device allocation
    HIP_CHECK(hipMalloc((int **)&device_input, element_number * sizeof(int)));
    HIP_CHECK(hipMalloc((int **)&device_output, element_number * sizeof(int)));

    // Device data preparation
    HIP_CHECK(hipMemcpy(device_input, host_input, element_number * sizeof(int), hipMemcpyHostToDevice));
    HIP_CHECK(hipMemset(device_output, 0, element_number * sizeof(int)));

    // Run the kernel
    // ...

    // Copy the result from the device back to the host
    HIP_CHECK(hipMemcpy(host_output, device_output, element_number * sizeof(int), hipMemcpyDeviceToHost));

    // Free host memory: buffers obtained from hipHostMalloc() must be released
    // with hipHostFree(), never with delete[] or free()
    HIP_CHECK(hipHostFree(host_input));
    HIP_CHECK(hipHostFree(host_output));

    // Free device memory
    HIP_CHECK(hipFree(device_input));
    HIP_CHECK(hipFree(device_output));
}
.. _memory_allocation_flags:
Memory allocation flags for pinned memory
--------------------------------------------------------------------------------
The memory allocation for pinned memory can be controlled using ``hipHostMalloc`` flags:
* ``hipHostMallocPortable``: The memory allocation is not restricted to the
context making the allocation.
* ``hipHostMallocMapped``: The memory is allocated into the address space for
the current device and the device pointer can be obtained with
:cpp:func:`hipHostGetDevicePointer`.
* ``hipHostMallocNumaUser``: The host memory allocation follows the Numa policy
specified by the user. The target of the Numa policy is to select a CPU that is
closest to each GPU. Numa distance is the distance between GPU and CPU
devices.
* ``hipHostMallocWriteCombined``: The memory is allocated as write-combined.
Although lacking read efficiency by most CPUs, write-combined allocation might
be transferred faster across the PCIe bus on some system configurations. It's
a good option for data transfer from host to device via mapped pinned memory.
* ``hipHostMallocCoherent``: Fine-grained memory is allocated. Overrides
``HIP_HOST_COHERENT`` environment variable for specific allocation. For
details, see :ref:`coherence_control`.
* ``hipHostMallocNonCoherent``: Coarse-grained memory is allocated. Overrides
``HIP_HOST_COHERENT`` environment variable for specific allocation. For
details, see :ref:`coherence_control`.
All allocation flags are independent and can be set in any combination. The only
exception is setting ``hipHostMallocCoherent`` and ``hipHostMallocNonCoherent``
together, which leads to an illegal state. An example of a valid flag
combination is calling :cpp:func:`hipHostMalloc` with both
``hipHostMallocPortable`` and ``hipHostMallocMapped`` flags set. Both the flags
use the same model and differentiate only between how the surrounding code uses
the host memory.
.. note::
By default, each GPU selects a Numa CPU node with the least Numa distance
between them. This implies that the host memory is automatically allocated on
the closest memory pool of the current GPU device's Numa node. Using
:cpp:func:`hipSetDevice` API to set a different GPU increases the Numa
distance but still allows you to access the host allocation.
Numa policy is implemented on Linux and is under development on Microsoft
Windows.
@@ -2,6 +2,8 @@
:description:
:keywords: stream, memory allocation, SOMA, stream ordered memory allocator
.. _stream_ordered_memory_allocator_how-to:
*******************************************************************************
Stream Ordered Memory Allocator
*******************************************************************************
@@ -25,7 +27,7 @@ Using SOMA
=====================================
You can allocate memory using ``hipMallocAsync()`` with stream-ordered
semantics. This restricts the asynchronous access to the memory between the stream executions of the allocation and deallocation. Accessing
semantics. This restricts the asynchronous access to the memory between the stream executions of the allocation and deallocation. Accessing
memory outside of this stream order results in undefined behavior. The allocator can reuse the memory if the compliant memory accesses won't overlap
temporally. ``hipFreeAsync()`` frees memory from the pool with stream-ordered
semantics.
@@ -0,0 +1,740 @@
.. meta::
:description: This chapter describes Unified Memory and shows
how to use it in AMD HIP.
:keywords: AMD, ROCm, HIP, CUDA, unified memory, unified, memory
.. _unified_memory:
*******************************************************************************
Unified memory management
*******************************************************************************
In conventional architectures CPUs and attached devices have their own memory
space and dedicated physical memory backing it up, e.g. normal RAM for CPUs and
VRAM on GPUs. This way each device can have physical memory optimized for its
use case. GPUs usually have specialized memory whose bandwidth is an order of
magnitude higher than that of the RAM attached to CPUs.
While providing exceptional performance, this setup typically requires explicit
memory management, as memory needs to be allocated, copied and freed on the used
devices and on the host. Additionally, this makes using more than the physically
available memory on the devices complicated.
Modern GPUs circumvent the problem of having to explicitly manage the memory,
while still keeping the benefits of the dedicated physical memories, by
supporting the concept of unified memory. This enables the CPU and the GPUs in
the system to access host and other GPUs' memory without explicit memory
management.
Unified memory
================================================================================
Unified Memory is a single memory address space accessible from any processor
within a system. This setup simplifies memory management and enables
applications to allocate data that can be read or written on both CPUs and GPUs
without explicitly copying it to the specific CPU or GPU. The Unified memory
model is shown in the following figure.
.. figure:: ../../../data/how-to/hip_runtime_api/memory_management/unified_memory/um.svg
Unified memory enables the access to memory located on other devices via
several methods, depending on whether hardware support is available or has to be
managed by the driver.
Hardware supported on-demand page migration
--------------------------------------------------------------------------------
When a kernel on the device tries to access a memory address that is not in its
memory, a page-fault is triggered. The GPU then in turn requests the page from
the host or another device, on which the memory is located. The page is then
unmapped from the source, sent to the device and mapped to the device's memory.
The requested memory is then available to the processes running on the device.
In case the device's memory is at capacity, a page is unmapped from the device's
memory first and sent and mapped to host memory. This enables more memory to be
allocated and used for a GPU, than the GPU itself has physically available.
This level of unified memory support can be very beneficial for sparse accesses
to an array, that is not often used on the device.
Driver managed page migration
--------------------------------------------------------------------------------
If the hardware does not support on-demand page migration, then all the pages
accessed by a kernel have to be resident on the device, so they have to be
migrated before the kernel is running. Since the driver cannot know beforehand
which parts of an array are going to be accessed, all pages of all accessed
arrays have to be migrated. This can lead to significant delays on the first run
of a kernel, on top of possibly copying more memory than is actually accessed by
the kernel.
.. _unified memory system requirements:
System requirements
================================================================================
Unified memory is supported on Linux by all modern AMD GPUs from the Vega
series onward, as shown in the following table. Unified memory management can
be achieved by explicitly allocating managed memory using
:cpp:func:`hipMallocManaged` or marking variables with the ``__managed__``
attribute. For the latest GPUs, with a Linux kernel that supports
`Heterogeneous Memory Management (HMM)
<https://www.kernel.org/doc/html/latest/mm/hmm.html>`_, the normal system
allocator can be used.
.. list-table:: Supported Unified Memory Allocators by GPU architecture
:widths: 40, 25, 25
:header-rows: 1
:align: center
* - Architecture
- :cpp:func:`hipMallocManaged()`, ``__managed__``
- ``new``, ``malloc()``
* - CDNA3
- ✅
- ✅ :sup:`1`
* - CDNA2
- ✅
- ✅ :sup:`1`
* - CDNA1
- ✅
- ✅ :sup:`1`
* - RDNA1
- ✅
- ❌
* - GCN5
- ✅
- ❌
✅: **Supported**
❌: **Unsupported**
:sup:`1` Works only with ``XNACK=1`` and kernels with HMM support. First GPU
access causes recoverable page-fault. For more details, visit `GPU memory
<https://rocm.docs.amd.com/en/latest/conceptual/gpu-memory.html#xnack>`_.
.. _unified memory allocators:
Unified memory allocators
================================================================================
Support for the different unified memory allocators depends on the GPU
architecture and on the system. For more information, see :ref:`unified memory
system requirements` and :ref:`checking unified memory support`.
- **HIP allocated managed memory and variables**
:cpp:func:`hipMallocManaged()` is a dynamic memory allocator available on
all GPUs with unified memory support. For more details, visit
:ref:`unified_memory_reference`.
The ``__managed__`` declaration specifier, which serves as its counterpart,
can be utilized for static allocation.
- **System allocated unified memory**
Starting with CDNA2, the ``new`` and ``malloc()`` system allocators allow
you to reserve unified memory. The system allocator is more versatile and
offers an easy transition for code written for CPUs to HIP code as the
same system allocation API is used.
To ensure the proper functioning of system allocated unified memory on supported
GPUs, it is essential to set the environment variable ``HSA_XNACK=1`` and use
a kernel that supports `HMM
<https://www.kernel.org/doc/html/latest/mm/hmm.html>`_. Without this
configuration, the behavior will be similar to that of systems without HMM
support. For more details, visit
`GPU memory <https://rocm.docs.amd.com/en/latest/conceptual/gpu-memory.html#xnack>`_.
The table below illustrates the expected behavior of managed and unified memory
functions on ROCm and CUDA, both with and without HMM support.
.. tab-set::
.. tab-item:: ROCm allocation behaviour
:sync: original-block
.. list-table:: Comparison of expected behavior of managed and unified memory functions in ROCm
:widths: 26, 17, 20, 17, 20
:header-rows: 1
* - call
- Allocation origin without HMM or ``XNACK=0``
- Access outside the origin without HMM or ``XNACK=0``
- Allocation origin with HMM and ``XNACK=1``
- Access outside the origin with HMM and ``XNACK=1``
* - ``new``, ``malloc()``
- host
- not accessible on device
- host
- page-fault migration
* - :cpp:func:`hipMalloc()`
- device
- zero copy [zc]_
- device
- zero copy [zc]_
* - :cpp:func:`hipMallocManaged()`, ``__managed__``
- pinned host
- zero copy [zc]_
- host
- page-fault migration
* - :cpp:func:`hipHostRegister()`
- undefined behavior
- undefined behavior
- host
- page-fault migration
* - :cpp:func:`hipHostMalloc()`
- pinned host
- zero copy [zc]_
- pinned host
- zero copy [zc]_
.. tab-item:: CUDA allocation behaviour
:sync: cooperative-groups
.. list-table:: Comparison of expected behavior of managed and unified memory functions in CUDA
:widths: 26, 17, 20, 17, 20
:header-rows: 1
* - call
- Allocation origin without HMM
- Access outside the origin without HMM
- Allocation origin with HMM
- Access outside the origin with HMM
* - ``new``, ``malloc()``
- host
- not accessible on device
- first touch
- page-fault migration
* - ``cudaMalloc()``
- device
- not accessible on host
- device
- page-fault migration
* - ``cudaMallocManaged()``, ``__managed__``
- host
- page-fault migration
- first touch
- page-fault migration
* - ``cudaHostRegister()``
- host
- page-fault migration
- host
- page-fault migration
* - ``cudaMallocHost()``
- pinned host
- zero copy [zc]_
- pinned host
- zero copy [zc]_
.. [zc] Zero copy is a feature, where the memory is pinned to either the device
or the host, and won't be transferred when accessed by another device or
the host. Instead only the requested memory is transferred, without
making an explicit copy, like a normal memory access, hence the term
"zero copy".
.. _checking unified memory support:
Checking unified memory support
--------------------------------------------------------------------------------
The following device attributes can offer information about which :ref:`unified
memory allocators` are supported. The attribute value is 1 if the functionality
is supported, and 0 if it is not supported.
.. list-table:: Device attributes for unified memory management
:widths: 40, 60
:header-rows: 1
:align: center
* - Attribute
- Description
* - :cpp:enumerator:`hipDeviceAttributeManagedMemory`
- Device supports allocating managed memory on this system
* - :cpp:enumerator:`hipDeviceAttributePageableMemoryAccess`
- Device supports coherently accessing pageable memory without calling :cpp:func:`hipHostRegister()` on it.
* - :cpp:enumerator:`hipDeviceAttributeConcurrentManagedAccess`
- Full unified memory support. Device can coherently access managed memory concurrently with the CPU
For details on how to get the attributes of a specific device see :cpp:func:`hipDeviceGetAttribute()`.
Example for unified memory management
--------------------------------------------------------------------------------
The following example shows how to use unified memory with
:cpp:func:`hipMallocManaged()` for dynamic allocation, the ``__managed__`` attribute
for static allocation and the standard ``new`` allocation. For comparison, the
explicit memory management example is presented in the last tab.
.. tab-set::
.. tab-item:: hipMallocManaged()
.. code-block:: cpp
:emphasize-lines: 22-25
#include <hip/hip_runtime.h>
#include <iostream>
#define HIP_CHECK(expression) \
{ \
const hipError_t err = expression; \
if(err != hipSuccess){ \
std::cerr << "HIP error: " \
<< hipGetErrorString(err) \
<< " at " << __LINE__ << "\n"; \
} \
} // HIP_CHECK only reports a failing call (error string and line); it does not abort.
// Addition of two values.
__global__ void add(int *a, int *b, int *c) {
*c = *a + *b;
}
int main() { // Adds two integers on the GPU using dynamically allocated managed memory.
int *a, *b, *c;
// Allocate memory for a, b and c that is accessible to both device and host codes.
HIP_CHECK(hipMallocManaged(&a, sizeof(*a)));
HIP_CHECK(hipMallocManaged(&b, sizeof(*b)));
HIP_CHECK(hipMallocManaged(&c, sizeof(*c)));
// Setup input values.
*a = 1; // Written directly through the managed pointer on the host; no hipMemcpy is used.
*b = 2;
// Launch add() kernel on GPU.
hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c); // One block of one thread; the same pointers are passed to the device.
// Wait for GPU to finish before accessing on host.
HIP_CHECK(hipDeviceSynchronize());
// Print the result.
std::cout << *a << " + " << *b << " = " << *c << std::endl; // *c is read on the host without an explicit copy back.
// Cleanup allocated memory.
HIP_CHECK(hipFree(a)); // Memory from hipMallocManaged() is released with hipFree().
HIP_CHECK(hipFree(b));
HIP_CHECK(hipFree(c));
return 0;
}
.. tab-item:: __managed__
.. code-block:: cpp
:emphasize-lines: 19-20
#include <hip/hip_runtime.h>
#include <iostream>
#define HIP_CHECK(expression) \
{ \
const hipError_t err = expression; \
if(err != hipSuccess){ \
std::cerr << "HIP error: " \
<< hipGetErrorString(err) \
<< " at " << __LINE__ << "\n"; \
} \
} // HIP_CHECK only reports a failing call (error string and line); it does not abort.
// Addition of two values.
__global__ void add(int *a, int *b, int *c) {
*c = *a + *b;
}
// Declare a, b and c as static variables.
__managed__ int a, b, c; // The __managed__ specifier is the static-allocation counterpart of hipMallocManaged().
int main() { // Adds two integers on the GPU using statically allocated managed variables.
// Setup input values.
a = 1; // Assigned directly on the host; no allocation or copy call is needed.
b = 2;
// Launch add() kernel on GPU.
hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, &a, &b, &c); // One block of one thread; the kernel receives the addresses of the managed variables.
// Wait for GPU to finish before accessing on host.
HIP_CHECK(hipDeviceSynchronize());
// Prints the result.
std::cout << a << " + " << b << " = " << c << std::endl;
return 0; // No cleanup calls: the example frees nothing for the static managed variables.
}
.. tab-item:: new
.. code-block:: cpp
:emphasize-lines: 20-23
#include <hip/hip_runtime.h>
#include <iostream>
#define HIP_CHECK(expression) \
{ \
const hipError_t err = expression; \
if(err != hipSuccess){ \
std::cerr << "HIP error: " \
<< hipGetErrorString(err) \
<< " at " << __LINE__ << "\n"; \
} \
} // HIP_CHECK only reports a failing call (error string and line); it does not abort.
// Addition of two values.
__global__ void add(int* a, int* b, int* c) {
*c = *a + *b;
}
// This example requires HMM support and the environment variable HSA_XNACK needs to be set to 1
int main() { // Adds two integers on the GPU using memory from the standard system allocator.
// Allocate memory for a, b, and c.
int *a = new int[1]; // Ordinary host allocation; no HIP allocation call is involved.
int *b = new int[1];
int *c = new int[1];
// Setup input values.
*a = 1;
*b = 2;
// Launch add() kernel on GPU.
hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c); // One block of one thread; the system-allocated pointers are passed to the device as-is.
// Wait for GPU to finish before accessing on host.
HIP_CHECK(hipDeviceSynchronize());
// Prints the result.
std::cout << *a << " + " << *b << " = " << *c << std::endl;
// Cleanup allocated memory.
delete[] a; // Matches the new[] allocations above.
delete[] b;
delete[] c;
return 0;
}
.. tab-item:: Explicit Memory Management
.. code-block:: cpp
:emphasize-lines: 27-34, 39-40
#include <hip/hip_runtime.h>
#include <iostream>
#define HIP_CHECK(expression) \
{ \
const hipError_t err = expression; \
if(err != hipSuccess){ \
std::cerr << "HIP error: " \
<< hipGetErrorString(err) \
<< " at " << __LINE__ << "\n"; \
} \
} // HIP_CHECK only reports a failing call (error string and line); it does not abort.
// Addition of two values.
__global__ void add(int *a, int *b, int *c) {
*c = *a + *b;
}
int main() { // Adds two integers on the GPU with explicit device allocations and copies.
int a, b, c; // Host-side values.
int *d_a, *d_b, *d_c; // Device-side copies, allocated with hipMalloc() below.
// Setup input values.
a = 1;
b = 2;
// Allocate device copies of a, b and c.
HIP_CHECK(hipMalloc(&d_a, sizeof(*d_a)));
HIP_CHECK(hipMalloc(&d_b, sizeof(*d_b)));
HIP_CHECK(hipMalloc(&d_c, sizeof(*d_c)));
// Copy input values to device.
HIP_CHECK(hipMemcpy(d_a, &a, sizeof(*d_a), hipMemcpyHostToDevice));
HIP_CHECK(hipMemcpy(d_b, &b, sizeof(*d_b), hipMemcpyHostToDevice));
// Launch add() kernel on GPU.
hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, d_a, d_b, d_c); // One block of one thread; only device pointers are passed.
// Copy the result back to the host.
HIP_CHECK(hipMemcpy(&c, d_c, sizeof(*d_c), hipMemcpyDeviceToHost)); // The host reads the result only through this explicit copy.
// Cleanup allocated memory.
HIP_CHECK(hipFree(d_a));
HIP_CHECK(hipFree(d_b));
HIP_CHECK(hipFree(d_c));
// Prints the result.
std::cout << a << " + " << b << " = " << c << std::endl; // Uses the host variables, so freeing the device copies first is fine.
return 0;
}
.. _using unified memory:
Using unified memory
================================================================================
Unified memory can simplify the complexities of memory management in GPU
computing, by not requiring explicit copies between the host and the devices. It
can be particularly useful in use cases with sparse memory accesses from both
the CPU and the GPU, as only the parts of the memory region that are actually
accessed need to be transferred to the corresponding processor, not the whole
memory region. This reduces the amount of memory sent over the PCIe bus or other
interfaces.
In HIP, pinned memory allocations are coherent by default. Pinned memory is
host memory mapped into the address space of all GPUs, meaning that the pointer
can be used on both host and device. Additionally, using pinned memory instead of
pageable memory on the host can improve bandwidth for transfers between the host
and the GPUs.
While unified memory can provide numerous benefits, it's important to be aware
of the potential performance overhead associated with unified memory. You must
thoroughly test and profile your code to ensure it's the most suitable choice
for your use case.
.. _unified memory runtime hints:
Performance optimizations for unified memory
================================================================================
There are several ways in which the developer can guide the runtime to reduce
copies between devices and thereby improve performance.
Data prefetching
--------------------------------------------------------------------------------
Data prefetching is a technique used to improve the performance of your
application by moving data to the desired device before it's actually
needed. ``hipCpuDeviceId`` is a special constant to specify the CPU as target.
.. code-block:: cpp
:emphasize-lines: 33-36,41-42
#include <hip/hip_runtime.h>
#include <iostream>
#define HIP_CHECK(expression) \
{ \
const hipError_t err = expression; \
if(err != hipSuccess){ \
std::cerr << "HIP error: " \
<< hipGetErrorString(err) \
<< " at " << __LINE__ << "\n"; \
} \
} // HIP_CHECK only reports a failing call (error string and line); it does not abort.
// Addition of two values.
__global__ void add(int *a, int *b, int *c) {
*c = *a + *b;
}
int main() { // Adds two integers on the GPU, prefetching the managed data before each side uses it.
int *a, *b, *c;
int deviceId;
HIP_CHECK(hipGetDevice(&deviceId)); // Get the current device ID
// Allocate memory for a, b and c that is accessible to both device and host codes.
HIP_CHECK(hipMallocManaged(&a, sizeof(*a)));
HIP_CHECK(hipMallocManaged(&b, sizeof(*b)));
HIP_CHECK(hipMallocManaged(&c, sizeof(*c)));
// Setup input values.
*a = 1;
*b = 2;
// Prefetch the data to the GPU device.
HIP_CHECK(hipMemPrefetchAsync(a, sizeof(*a), deviceId, 0)); // Arguments: pointer, size in bytes, target device, stream (0 here).
HIP_CHECK(hipMemPrefetchAsync(b, sizeof(*b), deviceId, 0));
HIP_CHECK(hipMemPrefetchAsync(c, sizeof(*c), deviceId, 0));
// Launch add() kernel on GPU.
hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c); // One block of one thread, launched with the same stream argument (0) as the prefetches.
// Prefetch the result back to the CPU.
HIP_CHECK(hipMemPrefetchAsync(c, sizeof(*c), hipCpuDeviceId, 0)); // hipCpuDeviceId selects the CPU as the prefetch target.
// Wait for the prefetch operations to complete.
HIP_CHECK(hipDeviceSynchronize());
// Prints the result.
std::cout << *a << " + " << *b << " = " << *c << std::endl;
// Cleanup allocated memory.
HIP_CHECK(hipFree(a));
HIP_CHECK(hipFree(b));
HIP_CHECK(hipFree(c));
return 0;
}
Memory advice
--------------------------------------------------------------------------------
Unified memory runtime hints can be set with :cpp:func:`hipMemAdvise()` to help
improve the performance of your code if you know the memory usage pattern. There
are several different types of hints as specified in the enum
:cpp:enum:`hipMemoryAdvise`, for example, whether a certain device mostly reads
the memory region, where it should ideally be located, and even whether that
specific memory region is accessed by a specific device.
For the best performance, profile your application to optimize the
utilization of HIP runtime hints.
The effectiveness of :cpp:func:`hipMemAdvise()` comes from its ability to inform
the runtime of the developer's intentions regarding memory usage. When the
runtime has knowledge of the expected memory access patterns, it can make better
decisions about data placement, leading to fewer transfers via the interconnect
and thereby reduced latency and bandwidth requirements. However, the actual
impact on performance can vary based on the specific use case and the system.
The following is the updated version of the example above with memory advice
instead of prefetching.
.. code-block:: cpp
:emphasize-lines: 29-41
#include <hip/hip_runtime.h>
#include <iostream>
#define HIP_CHECK(expression) \
{ \
const hipError_t err = expression; \
if(err != hipSuccess){ \
std::cerr << "HIP error: " \
<< hipGetErrorString(err) \
<< " at " << __LINE__ << "\n"; \
} \
} // HIP_CHECK only reports a failing call (error string and line); it does not abort.
// Addition of two values.
__global__ void add(int *a, int *b, int *c) {
*c = *a + *b;
}
int main() { // Adds two integers on the GPU after giving the runtime usage hints for each managed buffer.
int deviceId;
HIP_CHECK(hipGetDevice(&deviceId)); // ID of the current device, used as the target of the GPU hints below.
int *a, *b, *c;
// Allocate memory for a, b, and c accessible to both device and host codes.
HIP_CHECK(hipMallocManaged(&a, sizeof(*a)));
HIP_CHECK(hipMallocManaged(&b, sizeof(*b)));
HIP_CHECK(hipMallocManaged(&c, sizeof(*c)));
// Set memory advice for a and b to be read, located on and accessed by the GPU.
HIP_CHECK(hipMemAdvise(a, sizeof(*a), hipMemAdviseSetPreferredLocation, deviceId));
HIP_CHECK(hipMemAdvise(a, sizeof(*a), hipMemAdviseSetAccessedBy, deviceId));
HIP_CHECK(hipMemAdvise(a, sizeof(*a), hipMemAdviseSetReadMostly, deviceId));
HIP_CHECK(hipMemAdvise(b, sizeof(*b), hipMemAdviseSetPreferredLocation, deviceId));
HIP_CHECK(hipMemAdvise(b, sizeof(*b), hipMemAdviseSetAccessedBy, deviceId));
HIP_CHECK(hipMemAdvise(b, sizeof(*b), hipMemAdviseSetReadMostly, deviceId));
// Set memory advice for c to be read, located on and accessed by the CPU.
HIP_CHECK(hipMemAdvise(c, sizeof(*c), hipMemAdviseSetPreferredLocation, hipCpuDeviceId));
HIP_CHECK(hipMemAdvise(c, sizeof(*c), hipMemAdviseSetAccessedBy, hipCpuDeviceId));
HIP_CHECK(hipMemAdvise(c, sizeof(*c), hipMemAdviseSetReadMostly, hipCpuDeviceId)); // NOTE(review): c is written by the kernel; confirm that a read-mostly hint is intended for it.
// Setup input values.
*a = 1; // The inputs are written on the host after the hints have been set.
*b = 2;
// Launch add() kernel on GPU.
hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c); // One block of one thread.
// Wait for GPU to finish before accessing on host.
HIP_CHECK(hipDeviceSynchronize());
// Prints the result.
std::cout << *a << " + " << *b << " = " << *c << std::endl;
// Cleanup allocated memory.
HIP_CHECK(hipFree(a));
HIP_CHECK(hipFree(b));
HIP_CHECK(hipFree(c));
return 0;
}
Memory range attributes
--------------------------------------------------------------------------------
:cpp:func:`hipMemRangeGetAttribute()` allows you to query attributes of a given
memory range. The attributes are given in :cpp:enum:`hipMemRangeAttribute`.
.. code-block:: cpp
:emphasize-lines: 44-49
#include <hip/hip_runtime.h>
#include <iostream>
#define HIP_CHECK(expression) /* Report a failed HIP call on std::cerr; execution continues afterwards. */ \
{ \
const hipError_t err = expression; /* evaluate the call exactly once */ \
if(err != hipSuccess){ \
std::cerr << "HIP error: " \
<< hipGetErrorString(err) \
<< " at " << __LINE__ << "\n"; /* __LINE__ is the line where the macro is used */ \
} \
}
// Stores the sum of the values pointed to by a and b in the location pointed to by c.
__global__ void add(int *a, int *b, int *c) {
    c[0] = a[0] + b[0];
}
int main() { // Adds two managed integers on the GPU and queries the read-mostly attribute of a.
    int *a, *b, *c;
    unsigned int attributeValue = 0; // Defined even if the query fails, because HIP_CHECK only reports errors.
    constexpr size_t attributeSize = sizeof(attributeValue);
    int deviceId = 0; // Defined even if hipGetDevice() fails.
    HIP_CHECK(hipGetDevice(&deviceId));
    // Allocate memory for a, b and c that is accessible to both device and host codes.
    HIP_CHECK(hipMallocManaged(&a, sizeof(*a)));
    HIP_CHECK(hipMallocManaged(&b, sizeof(*b)));
    HIP_CHECK(hipMallocManaged(&c, sizeof(*c)));
    // Setup input values.
    *a = 1;
    *b = 2;
    HIP_CHECK(hipMemAdvise(a, sizeof(*a), hipMemAdviseSetReadMostly, deviceId)); // Mark a as read-mostly.
    // Launch add() kernel on GPU.
    hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
    // Wait for GPU to finish before accessing on host.
    HIP_CHECK(hipDeviceSynchronize());
    // Query an attribute of the memory range.
    HIP_CHECK(hipMemRangeGetAttribute(&attributeValue,
                                      attributeSize,
                                      hipMemRangeAttributeReadMostly,
                                      a,
                                      sizeof(*a)));
    // Prints the result.
    std::cout << *a << " + " << *b << " = " << *c << std::endl;
    std::cout << "The array a is" << (attributeValue == 1 ? "" : " NOT") << " set to hipMemRangeAttributeReadMostly" << std::endl;
    // Cleanup allocated memory.
    HIP_CHECK(hipFree(a));
    HIP_CHECK(hipFree(b));
    HIP_CHECK(hipFree(c));
    return 0;
}
Asynchronously attach memory to a stream
--------------------------------------------------------------------------------
The :cpp:func:`hipStreamAttachMemAsync()` function attaches memory to a stream,
which can reduce the amount of memory transferred when managed memory is used.
When the memory is attached to a stream using this function, it only gets
transferred between devices when a kernel that is launched on this stream needs
access to the memory.
@@ -0,0 +1,154 @@
.. meta::
:description: This chapter introduces virtual memory (VM) and shows
how to use it in AMD HIP.
:keywords: AMD, ROCm, HIP, CUDA, virtual memory, virtual, memory, UM, APU
.. _virtual_memory:
********************************************************************************
Virtual memory management
********************************************************************************
Memory management is important when creating high-performance applications in
the HIP ecosystem. Both allocating and copying memory can result in bottlenecks,
which can significantly impact performance.
Global memory allocation in HIP uses the C language style allocation function.
This works fine for simple cases but can cause problems if your memory needs
change. If you need to increase the size of your memory, you must allocate a
second larger buffer and copy the data to it before you can free the original
buffer. This increases overall memory usage and causes unnecessary ``memcpy``
calls. Another solution is to allocate a larger buffer than you initially need.
However, this isn't an efficient way to handle resources and doesn't solve the
issue of reallocation when the extra buffer runs out.
Virtual memory management solves these memory management problems. It helps to
reduce memory usage and unnecessary ``memcpy`` calls.
.. _memory_allocation_virtual_memory:
Memory allocation
================================================================================
Standard memory allocation uses the :cpp:func:`hipMalloc` function to allocate a
block of memory on the device. However, when using virtual memory, this process
is separated into multiple steps using the :cpp:func:`hipMemCreate`,
:cpp:func:`hipMemAddressReserve`, :cpp:func:`hipMemMap`, and
:cpp:func:`hipMemSetAccess` functions. This guide explains what these functions
do and how you can use them for virtual memory management.
Allocate physical memory
--------------------------------------------------------------------------------
The first step is to allocate the physical memory itself with the
:cpp:func:`hipMemCreate` function. This function accepts the size of the buffer,
an ``unsigned long long`` variable for the flags, and a
:cpp:struct:`hipMemAllocationProp` variable. :cpp:struct:`hipMemAllocationProp`
contains the properties of the memory to be allocated, such as where the memory
is physically located and what kind of shareable handles are available. If the
allocation is successful, the function returns a value of
:cpp:enumerator:`hipSuccess`, with :cpp:type:`hipMemGenericAllocationHandle_t`
representing a valid physical memory allocation. The allocated memory size must
be aligned with the granularity appropriate for the properties of the
allocation. You can use the :cpp:func:`hipMemGetAllocationGranularity` function
to determine the correct granularity.
.. code-block:: cpp
size_t granularity = 0;
hipMemGenericAllocationHandle_t allocHandle;
hipMemAllocationProp prop = {};
prop.type = HIP_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = HIP_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = currentDev;
hipMemGetAllocationGranularity(&granularity, &prop, HIP_MEM_ALLOC_GRANULARITY_MINIMUM);
padded_size = ROUND_UP(size, granularity);
hipMemCreate(&allocHandle, padded_size, &prop, 0);
Reserve virtual address range
--------------------------------------------------------------------------------
After you have acquired an allocation of physical memory, you must map it before
you can use it. To do so, you need a virtual address to map it to. Mapping
means the physical memory allocation is available from the virtual address range
it is mapped to. To reserve a virtual memory range, use the
:cpp:func:`hipMemAddressReserve` function. The size of the virtual memory must
match the amount of physical memory previously allocated. You can then map the
physical memory allocation to the newly-acquired virtual memory address range
using the :cpp:func:`hipMemMap` function.
.. code-block:: cpp
// Reserve a virtual address range of padded_size bytes; its start address is returned in ptr.
hipMemAddressReserve(&ptr, padded_size, 0, 0, 0);
// Map the physical allocation behind allocHandle into the reserved range.
hipMemMap(ptr, padded_size, 0, allocHandle, 0);
Set memory access
--------------------------------------------------------------------------------
Finally, use the :cpp:func:`hipMemSetAccess` function to enable memory access.
It accepts the pointer to the virtual memory, the size, and a
:cpp:struct:`hipMemAccessDesc` descriptor as parameters. In a multi-GPU
environment, you can map the device memory of one GPU to another. This feature
also works with the traditional memory management system, but isn't as scalable
as with virtual memory. When memory is allocated with :cpp:func:`hipMalloc`,
:cpp:func:`hipDeviceEnablePeerAccess` is used to enable peer access. This
function enables access between two devices, but it means that every call to
:cpp:func:`hipMalloc` takes more time to perform the checks and the mapping
between the devices. When using virtual memory management, peer access is
enabled by :cpp:func:`hipMemSetAccess`, which provides a finer level of
control over what is shared. This has no performance impact on memory allocation
and gives you more control over what memory buffers are shared with which
devices.
.. code-block:: cpp
// Describe the access to grant: read/write access for device currentDev.
hipMemAccessDesc accessDesc = {};
accessDesc.location.type = HIP_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = currentDev;
accessDesc.flags = HIP_MEM_ACCESS_FLAGS_PROT_READWRITE;
// Apply the descriptor to the mapped range that starts at ptr.
hipMemSetAccess(ptr, padded_size, &accessDesc, 1);
At this point the memory is allocated, mapped, and ready for use. You can read
from and write to it, just as you would with a C-style memory allocation.
Free virtual memory
--------------------------------------------------------------------------------
To free the memory allocated in this manner, use the corresponding free
functions. To unmap the memory, use :cpp:func:`hipMemUnmap`. To release the
virtual address range, use :cpp:func:`hipMemAddressFree`. Finally, to release
the physical memory, use :cpp:func:`hipMemRelease`. Unlike :cpp:func:`hipFree`,
these functions do not synchronize the device when memory is released. Calling
:cpp:func:`hipFree` while multiple streams are running in parallel synchronizes
the device, which results in worse resource usage and performance.
.. code-block:: cpp
// Unmap the range, release the physical allocation, then free the reserved
// address range. The range was reserved and mapped with padded_size, so the
// same size is used here rather than the originally requested size.
hipMemUnmap(ptr, padded_size);
hipMemRelease(allocHandle);
hipMemAddressFree(ptr, padded_size);
.. _usage_virtual_memory:
Memory usage
================================================================================
Dynamically increase allocation size
--------------------------------------------------------------------------------
The :cpp:func:`hipMemAddressReserve` function allows you to increase the amount
of pre-allocated memory. This function accepts a parameter representing the
requested starting address of the virtual memory. This allows you to have a
continuous virtual address space without worrying about the underlying physical
allocation.
.. code-block:: cpp
hipMemAddressReserve(&new_ptr, (new_size - padded_size), 0, ptr + padded_size, 0);
hipMemMap(new_ptr, (new_size - padded_size), 0, newAllocHandle, 0);
hipMemSetAccess(new_ptr, (new_size - padded_size), &accessDesc, 1);
The code sample above assumes that :cpp:func:`hipMemAddressReserve` was able to
reserve the memory address at the specified location. However, this isn't
guaranteed to be true, so you should validate that ``new_ptr`` points to the
requested virtual address (``ptr + padded_size``) before using it.
@@ -0,0 +1,420 @@
.. meta::
:description: This chapter describes how to use multiple devices on one host.
:keywords: ROCm, HIP, multi-device, multiple, GPUs, devices
.. _multi-device:
*******************************************************************************
Multi-device management
*******************************************************************************
Device enumeration
===============================================================================
Device enumeration involves identifying all the available GPUs connected to the
host system. A single host machine can have multiple GPUs, each with its own
unique identifier. By listing these devices, you can decide which GPU to use
for computation. The host queries the system to count and list all connected
GPUs that support the chosen ``HIP_PLATFORM``, ensuring that the application
can leverage the full computational power available. Typically, applications
list devices and their properties for deployment planning, and also make
dynamic selections during runtime to ensure optimal performance.
If the application does not define a specific GPU, device 0 is selected.
.. code-block:: cpp
#include <hip/hip_runtime.h>
#include <iostream>
int main()
{
int deviceCount;
hipGetDeviceCount(&deviceCount);
std::cout << "Number of devices: " << deviceCount << std::endl;
for (int deviceId = 0; deviceId < deviceCount; ++deviceId)
{
hipDeviceProp_t deviceProp;
hipGetDeviceProperties(&deviceProp, deviceId);
std::cout << "Device " << deviceId << std::endl << " Properties:" << std::endl;
std::cout << " Name: " << deviceProp.name << std::endl;
std::cout << " Total Global Memory: " << deviceProp.totalGlobalMem / (1024 * 1024) << " MiB" << std::endl;
std::cout << " Shared Memory per Block: " << deviceProp.sharedMemPerBlock / 1024 << " KiB" << std::endl;
std::cout << " Registers per Block: " << deviceProp.regsPerBlock << std::endl;
std::cout << " Warp Size: " << deviceProp.warpSize << std::endl;
std::cout << " Max Threads per Block: " << deviceProp.maxThreadsPerBlock << std::endl;
std::cout << " Max Threads per Multiprocessor: " << deviceProp.maxThreadsPerMultiProcessor << std::endl;
std::cout << " Number of Multiprocessors: " << deviceProp.multiProcessorCount << std::endl;
std::cout << " Max Threads Dimensions: ["
<< deviceProp.maxThreadsDim[0] << ", "
<< deviceProp.maxThreadsDim[1] << ", "
<< deviceProp.maxThreadsDim[2] << "]" << std::endl;
std::cout << " Max Grid Size: ["
<< deviceProp.maxGridSize[0] << ", "
<< deviceProp.maxGridSize[1] << ", "
<< deviceProp.maxGridSize[2] << "]" << std::endl;
std::cout << std::endl;
}
return 0;
}
.. _multi_device_selection:
Device selection
===============================================================================
Once you have enumerated the available GPUs, the next step is to select a
specific device for computation. This involves setting the active GPU that will
execute subsequent operations. This step is crucial in multi-GPU systems where
different GPUs might have different capabilities or workloads. By selecting the
appropriate device, you ensure that the computational tasks are directed to the
correct GPU, optimizing performance and resource utilization.
.. code-block:: cpp
#include <hip/hip_runtime.h>
#include <iostream>
#define HIP_CHECK(expression) /* Print a diagnostic and exit if a HIP call fails. */ \
{ \
const hipError_t status = expression; /* evaluate the call exactly once */ \
if (status != hipSuccess) { \
std::cerr << "HIP error " << status \
<< ": " << hipGetErrorString(status) \
<< " at " << __FILE__ << ":" \
<< __LINE__ << std::endl; \
exit(status); /* the HIP error code is passed to exit() */ \
} \
}
// Each thread writes twice its global index to data[index]; there is no bounds check.
__global__ void simpleKernel(double *data) {
    const int globalIdx = threadIdx.x + blockDim.x * blockIdx.x;
    data[globalIdx] = 2.0 * globalIdx;
}
int main()
{
double* deviceData0;
double* deviceData1;
size_t size = 1024 * sizeof(*deviceData0);
int deviceId0 = 0;
int deviceId1 = 1;
// Set device 0 and perform operations
HIP_CHECK(hipSetDevice(deviceId0)); // Set device 0 as current
HIP_CHECK(hipMalloc(&deviceData0, size)); // Allocate memory on device 0
simpleKernel<<<1000, 128>>>(deviceData0); // Launch kernel on device 0
HIP_CHECK(hipDeviceSynchronize());
// Set device 1 and perform operations
HIP_CHECK(hipSetDevice(deviceId1)); // Set device 1 as current
HIP_CHECK(hipMalloc(&deviceData1, size)); // Allocate memory on device 1
simpleKernel<<<1000, 128>>>(deviceData1); // Launch kernel on device 1
HIP_CHECK(hipDeviceSynchronize());
// Copy result from device 0
double hostData0[1024];
HIP_CHECK(hipSetDevice(deviceId0));
HIP_CHECK(hipMemcpy(hostData0, deviceData0, size, hipMemcpyDeviceToHost));
// Copy result from device 1
double hostData1[1024];
HIP_CHECK(hipSetDevice(deviceId1));
HIP_CHECK(hipMemcpy(hostData1, deviceData1, size, hipMemcpyDeviceToHost));
// Display results from both devices
std::cout << "Device 0 data: " << hostData0[0] << std::endl;
std::cout << "Device 1 data: " << hostData1[0] << std::endl;
// Free device memory
HIP_CHECK(hipFree(deviceData0));
HIP_CHECK(hipFree(deviceData1));
return 0;
}
Stream and event behavior
===============================================================================
In a multi-device system, streams and events are essential for efficient
parallel computation and synchronization. Streams enable asynchronous task
execution, allowing multiple devices to process data concurrently without
blocking one another. Events provide a mechanism for synchronizing operations
across streams and devices, ensuring that tasks on one device are completed
before dependent tasks on another device begin. This coordination prevents race
conditions and optimizes data flow in multi-GPU systems. Together, streams and
events maximize performance by enabling parallel execution, load balancing, and
effective resource utilization across heterogeneous hardware.
.. code-block:: cpp
#include <hip/hip_runtime.h>
#include <iostream>
// Each thread writes twice its global index to data[index]; there is no bounds check.
__global__ void simpleKernel(double *data) {
    const int globalIdx = threadIdx.x + blockDim.x * blockIdx.x;
    data[globalIdx] = 2.0 * globalIdx;
}
// Times simpleKernel on two GPUs, using one stream and one pair of events per
// device. Returns -1 if fewer than two devices are available.
int main()
{
    // Start from zero so that a failed query is treated as "no devices".
    int numDevices = 0;
    hipGetDeviceCount(&numDevices);
    if (numDevices < 2) {
        std::cerr << "This example requires at least two GPUs." << std::endl;
        return -1;
    }
    // simpleKernel performs no bounds check, so every launch must create
    // exactly one thread per buffer element.
    constexpr int numElements = 1024;
    constexpr int threadsPerBlock = 128;
    constexpr int numBlocks = numElements / threadsPerBlock;
    double *deviceData0, *deviceData1;
    size_t size = numElements * sizeof(*deviceData0);
    // Create streams and events for each device
    hipStream_t stream0, stream1;
    hipEvent_t startEvent0, stopEvent0, startEvent1, stopEvent1;
    // Initialize device 0
    hipSetDevice(0);
    hipStreamCreate(&stream0);
    hipEventCreate(&startEvent0);
    hipEventCreate(&stopEvent0);
    hipMalloc(&deviceData0, size);
    // Initialize device 1
    hipSetDevice(1);
    hipStreamCreate(&stream1);
    hipEventCreate(&startEvent1);
    hipEventCreate(&stopEvent1);
    hipMalloc(&deviceData1, size);
    // Record the start event on device 0
    hipSetDevice(0);
    hipEventRecord(startEvent0, stream0);
    // Launch the kernel asynchronously on device 0
    simpleKernel<<<numBlocks, threadsPerBlock, 0, stream0>>>(deviceData0);
    // Record the stop event on device 0
    hipEventRecord(stopEvent0, stream0);
    // Wait for the stop event on device 0 to complete
    hipEventSynchronize(stopEvent0);
    // Record the start event on device 1
    hipSetDevice(1);
    hipEventRecord(startEvent1, stream1);
    // Launch the kernel asynchronously on device 1
    simpleKernel<<<numBlocks, threadsPerBlock, 0, stream1>>>(deviceData1);
    // Record the stop event on device 1
    hipEventRecord(stopEvent1, stream1);
    // Wait for the stop event on device 1 to complete
    hipEventSynchronize(stopEvent1);
    // Calculate elapsed time between the events for both devices
    float milliseconds0 = 0, milliseconds1 = 0;
    hipEventElapsedTime(&milliseconds0, startEvent0, stopEvent0);
    hipEventElapsedTime(&milliseconds1, startEvent1, stopEvent1);
    std::cout << "Elapsed time on GPU 0: " << milliseconds0 << " ms" << std::endl;
    std::cout << "Elapsed time on GPU 1: " << milliseconds1 << " ms" << std::endl;
    // Cleanup for device 0
    hipSetDevice(0);
    hipEventDestroy(startEvent0);
    hipEventDestroy(stopEvent0);
    hipStreamSynchronize(stream0);
    hipStreamDestroy(stream0);
    hipFree(deviceData0);
    // Cleanup for device 1
    hipSetDevice(1);
    hipEventDestroy(startEvent1);
    hipEventDestroy(stopEvent1);
    hipStreamSynchronize(stream1);
    hipStreamDestroy(stream1);
    hipFree(deviceData1);
    return 0;
}
Peer-to-peer memory access
===============================================================================
In multi-GPU systems, peer-to-peer memory access enables one GPU to directly
read or write to the memory of another GPU. This capability reduces data
transfer times by allowing GPUs to communicate directly without involving the
host. Enabling peer-to-peer access can significantly improve the performance of
applications that require frequent data exchange between GPUs, as it eliminates
the need to transfer data through the host memory.
By adding peer-to-peer access to the example referenced in
:ref:`multi_device_selection`, data can be copied between devices:
.. tab-set::
.. tab-item:: with peer-to-peer
.. code-block:: cpp
:emphasize-lines: 31-37, 51-55
#include <hip/hip_runtime.h>
#include <iostream>
#define HIP_CHECK(expression) /* Print a diagnostic and exit if a HIP call fails. */ \
{ \
const hipError_t status = expression; /* evaluate the call exactly once */ \
if (status != hipSuccess) { \
std::cerr << "HIP error " << status \
<< ": " << hipGetErrorString(status) \
<< " at " << __FILE__ << ":" \
<< __LINE__ << std::endl; \
exit(status); /* the HIP error code is passed to exit() */ \
} \
}
// Each thread writes twice its global index to data[index]; there is no bounds check.
__global__ void simpleKernel(double *data) {
    const int globalIdx = threadIdx.x + blockDim.x * blockIdx.x;
    data[globalIdx] = 2.0 * globalIdx;
}
int main() // Fills a buffer on each of two devices, then copies device 1's buffer to device 0 via peer access.
{
    double* deviceData0;
    double* deviceData1;
    size_t size = 1024 * sizeof(*deviceData0);
    int deviceId0 = 0;
    int deviceId1 = 1;
    // Enable peer access to the memory (allocated and future) on the peer device.
    // Ensure the device is active before enabling peer access.
    HIP_CHECK(hipSetDevice(deviceId0));
    HIP_CHECK(hipDeviceEnablePeerAccess(deviceId1, 0));
    HIP_CHECK(hipSetDevice(deviceId1));
    HIP_CHECK(hipDeviceEnablePeerAccess(deviceId0, 0));
    // Set device 0 and perform operations
    HIP_CHECK(hipSetDevice(deviceId0)); // Set device 0 as current
    HIP_CHECK(hipMalloc(&deviceData0, size)); // Allocate memory on device 0
    simpleKernel<<<8, 128>>>(deviceData0); // Launch 8 * 128 = 1024 threads on device 0, one per element
    HIP_CHECK(hipDeviceSynchronize());
    // Set device 1 and perform operations
    HIP_CHECK(hipSetDevice(deviceId1)); // Set device 1 as current
    HIP_CHECK(hipMalloc(&deviceData1, size)); // Allocate memory on device 1
    simpleKernel<<<8, 128>>>(deviceData1); // Launch 8 * 128 = 1024 threads on device 1, one per element
    HIP_CHECK(hipDeviceSynchronize());
    // Use peer-to-peer access
    HIP_CHECK(hipSetDevice(deviceId0));
    // Now device 0 can access memory allocated on device 1
    HIP_CHECK(hipMemcpy(deviceData0, deviceData1, size, hipMemcpyDeviceToDevice));
    // Copy result from device 0
    double hostData0[1024];
    HIP_CHECK(hipSetDevice(deviceId0));
    HIP_CHECK(hipMemcpy(hostData0, deviceData0, size, hipMemcpyDeviceToHost));
    // Copy result from device 1
    double hostData1[1024];
    HIP_CHECK(hipSetDevice(deviceId1));
    HIP_CHECK(hipMemcpy(hostData1, deviceData1, size, hipMemcpyDeviceToHost));
    // Display results from both devices
    std::cout << "Device 0 data: " << hostData0[0] << std::endl;
    std::cout << "Device 1 data: " << hostData1[0] << std::endl;
    // Free device memory
    HIP_CHECK(hipFree(deviceData0));
    HIP_CHECK(hipFree(deviceData1));
    return 0;
}
.. tab-item:: without peer-to-peer
.. code-block:: cpp
:emphasize-lines: 43-49, 53, 58
#include <hip/hip_runtime.h>
#include <iostream>
#define HIP_CHECK(expression) /* Print a diagnostic and exit if a HIP call fails. */ \
{ \
const hipError_t status = expression; /* evaluate the call exactly once */ \
if (status != hipSuccess) { \
std::cerr << "HIP error " << status \
<< ": " << hipGetErrorString(status) \
<< " at " << __FILE__ << ":" \
<< __LINE__ << std::endl; \
exit(status); /* the HIP error code is passed to exit() */ \
} \
}
// Each thread writes twice its global index to data[index]; there is no bounds check.
__global__ void simpleKernel(double *data) {
    const int globalIdx = threadIdx.x + blockDim.x * blockIdx.x;
    data[globalIdx] = 2.0 * globalIdx;
}
int main() // Fills a buffer on each of two devices and attempts a device-to-device copy without enabling peer access.
{
    double* deviceData0;
    double* deviceData1;
    size_t size = 1024 * sizeof(*deviceData0);
    int deviceId0 = 0;
    int deviceId1 = 1;
    // Set device 0 and perform operations
    HIP_CHECK(hipSetDevice(deviceId0)); // Set device 0 as current
    HIP_CHECK(hipMalloc(&deviceData0, size)); // Allocate memory on device 0
    simpleKernel<<<8, 128>>>(deviceData0); // Launch 8 * 128 = 1024 threads on device 0, one per element
    HIP_CHECK(hipDeviceSynchronize());
    // Set device 1 and perform operations
    HIP_CHECK(hipSetDevice(deviceId1)); // Set device 1 as current
    HIP_CHECK(hipMalloc(&deviceData1, size)); // Allocate memory on device 1
    simpleKernel<<<8, 128>>>(deviceData1); // Launch 8 * 128 = 1024 threads on device 1, one per element
    HIP_CHECK(hipDeviceSynchronize());
    // Attempt to use deviceData0 on device 1 (This will not work as deviceData0 is allocated on device 0)
    HIP_CHECK(hipSetDevice(deviceId1));
    hipError_t err = hipMemcpy(deviceData1, deviceData0, size, hipMemcpyDeviceToDevice); // This should fail
    if (err != hipSuccess)
    {
        std::cout << "Error: Cannot access deviceData0 from device 1, deviceData0 is on device 0" << std::endl;
    }
    // Copy result from device 0
    double hostData0[1024];
    HIP_CHECK(hipSetDevice(deviceId0));
    HIP_CHECK(hipMemcpy(hostData0, deviceData0, size, hipMemcpyDeviceToHost));
    // Copy result from device 1
    double hostData1[1024];
    HIP_CHECK(hipSetDevice(deviceId1));
    HIP_CHECK(hipMemcpy(hostData1, deviceData1, size, hipMemcpyDeviceToHost));
    // Display results from both devices
    std::cout << "Device 0 data: " << hostData0[0] << std::endl;
    std::cout << "Device 1 data: " << hostData1[0] << std::endl;
    // Free device memory
    HIP_CHECK(hipFree(deviceData0));
    HIP_CHECK(hipFree(deviceData1));
    return 0;
}
@@ -0,0 +1,94 @@
.. meta::
:description: HIP provides an OpenGL interoperability API that allows
efficient data sharing between HIP's computing power and
OpenGL's graphics rendering.
:keywords: AMD, ROCm, HIP, OpenGL, interop, interoperability
*******************************************************************************
OpenGL interoperability
*******************************************************************************
The HIP--OpenGL interoperation involves mapping OpenGL resources, such as
buffers and textures, for HIP to interact with OpenGL. This mapping process
enables HIP to utilize these resources directly, bypassing the need for costly
data transfers between the CPU and GPU. This capability is useful in
applications that require both intensive GPU computation and real-time
visualization.
The graphics resources must first be registered using functions like
:cpp:func:`hipGraphicsGLRegisterBuffer` or :cpp:func:`hipGraphicsGLRegisterImage`.
They can then be mapped to HIP with the :cpp:func:`hipGraphicsMapResources`
function.
After mapping, the :cpp:func:`hipGraphicsResourceGetMappedPointer` or
:cpp:func:`hipGraphicsSubResourceGetMappedArray` functions are used to retrieve a
device pointer to the mapped resource, which can then be used in HIP kernels.
Unmapping resources with :cpp:func:`hipGraphicsUnmapResources` after
computations ensures proper resource management.
Example
===============================================================================
ROCm examples have a `HIP--OpenGL interoperation example <https://github.com/ROCm/rocm-examples/tree/develop/HIP-Basic/opengl_interop>`_,
where a simple HIP kernel is used to simulate a sine wave and rendered to a
window as a grid of triangles using OpenGL. For a working example, there are
multiple initialization steps needed like creating and opening a window,
initializing OpenGL or selecting the OpenGL-capable device. After the
initialization in the example, the kernel simulates the sinewave and updates
the window's framebuffer in a cycle until the window is closed.
.. note::
The more recent OpenGL functions are loaded with `OpenGL loader <https://github.com/ROCm/rocm-examples/tree/develop/External/glad>`_,
as these are not loaded by default on all platforms. The use of a custom
loader is shown in the following example:
.. <!-- spellcheck-disable -->
.. literalinclude:: ../../tools/example_codes/opengl_interop.hip
:start-after: // [Sphinx opengl functions load start]
:end-before: // [Sphinx opengl functions load end]
:language: cpp
.. <!-- spellcheck-enable -->
The OpenGL buffer is imported to HIP in the following way:
.. <!-- spellcheck-disable -->
.. literalinclude:: ../../tools/example_codes/opengl_interop.hip
:start-after: // [Sphinx buffer register and get start]
:end-before: // [Sphinx buffer register and get end]
:language: cpp
.. <!-- spellcheck-enable -->
The imported pointer is manipulated in the sinewave kernel as shown in the
following example:
.. <!-- spellcheck-disable -->
.. literalinclude:: ../../tools/example_codes/opengl_interop.hip
:start-after: /// [Sphinx sinewave kernel start]
:end-before: /// [Sphinx sinewave kernel end]
:language: cpp
.. literalinclude:: ../../tools/example_codes/opengl_interop.hip
:start-after: // [Sphinx buffer use in kernel start]
:end-before: // [Sphinx buffer use in kernel end]
:language: cpp
.. <!-- spellcheck-enable -->
The HIP graphics resource that was imported from the OpenGL buffer should be
unmapped and unregistered once it is no longer needed, as shown in the following example:
.. <!-- spellcheck-disable -->
.. literalinclude:: ../../tools/example_codes/opengl_interop.hip
:start-after: // [Sphinx unregister start]
:end-before: // [Sphinx unregister end]
:language: cpp
.. <!-- spellcheck-enable -->
@@ -33,11 +33,12 @@ The value of this variable controls your logging level. Levels are defined as fo
.. code-block:: cpp
enum LogLevel {
LOG_NONE = 0,
LOG_ERROR = 1,
LOG_WARNING = 2,
LOG_INFO = 3,
LOG_DEBUG = 4
LOG_NONE = 0,
LOG_ERROR = 1,
LOG_WARNING = 2,
LOG_INFO = 3,
LOG_DEBUG = 4,
LOG_EXTRA_DEBUG = 5
};
.. tip::
@@ -55,26 +56,27 @@ change this to any of the valid values:
.. code-block:: cpp
enum LogMask {
LOG_API = 0x00000001, //!< API call
LOG_CMD = 0x00000002, //!< Kernel and Copy Commands and Barriers
LOG_WAIT = 0x00000004, //!< Synchronization and waiting for commands to finish
LOG_AQL = 0x00000008, //!< Decode and display AQL packets
LOG_QUEUE = 0x00000010, //!< Queue commands and queue contents
LOG_SIG = 0x00000020, //!< Signal creation, allocation, pool
LOG_LOCK = 0x00000040, //!< Locks and thread-safety code.
LOG_KERN = 0x00000080, //!< kernel creations and arguments, etc.
LOG_COPY = 0x00000100, //!< Copy debug
LOG_COPY2 = 0x00000200, //!< Detailed copy debug
LOG_RESOURCE = 0x00000400, //!< Resource allocation, performance-impacting events.
LOG_INIT = 0x00000800, //!< Initialization and shutdown
LOG_MISC = 0x00001000, //!< misc debug, not yet classified
LOG_AQL2 = 0x00002000, //!< Show raw bytes of AQL packet
LOG_CODE = 0x00004000, //!< Show code creation debug
LOG_CMD2 = 0x00008000, //!< More detailed command info, including barrier commands
LOG_LOCATION = 0x00010000, //!< Log message location
LOG_MEM = 0x00020000, //!< Memory allocation
LOG_MEM_POOL = 0x00040000, //!< Memory pool allocation, including memory in graphs
LOG_ALWAYS = 0xFFFFFFFF, //!< Log always even mask flag is zero
LOG_API = 1, //!< (0x1) API call
LOG_CMD = 2, //!< (0x2) Kernel and Copy Commands and Barriers
LOG_WAIT = 4, //!< (0x4) Synchronization and waiting for commands to finish
LOG_AQL = 8, //!< (0x8) Decode and display AQL packets
LOG_QUEUE = 16, //!< (0x10) Queue commands and queue contents
LOG_SIG = 32, //!< (0x20) Signal creation, allocation, pool
LOG_LOCK = 64, //!< (0x40) Locks and thread-safety code.
LOG_KERN = 128, //!< (0x80) Kernel creations and arguments, etc.
LOG_COPY = 256, //!< (0x100) Copy debug
LOG_COPY2 = 512, //!< (0x200) Detailed copy debug
LOG_RESOURCE = 1024, //!< (0x400) Resource allocation, performance-impacting events.
LOG_INIT = 2048, //!< (0x800) Initialization and shutdown
LOG_MISC = 4096, //!< (0x1000) Misc debug, not yet classified
LOG_AQL2 = 8192, //!< (0x2000) Show raw bytes of AQL packet
LOG_CODE = 16384, //!< (0x4000) Show code creation debug
LOG_CMD2 = 32768, //!< (0x8000) More detailed command info, including barrier commands
LOG_LOCATION = 65536, //!< (0x10000) Log message location
LOG_MEM = 131072, //!< (0x20000) Memory allocation
LOG_MEM_POOL = 262144, //!< (0x40000) Memory pool allocation, including memory in graphs
LOG_TS = 524288, //!< (0x80000) Timestamp details
LOG_ALWAYS = -1 //!< (0xFFFFFFFF) Log always even mask flag is zero
};
You can also define the logging mask via the ``AMD_LOG_MASK`` environment variable.
@@ -41,7 +41,7 @@ the host or parallel to the devices.
For parallel workloads, when threads belonging to the same block need to
synchronize to share data, use :cpp:func:`__syncthreads()` (see:
:ref:`synchronization functions`) within the same kernel invocation. For threads
:ref:`synchronization_functions`) within the same kernel invocation. For threads
belonging to different blocks, use global memory with two separate
kernel invocations. It is recommended to avoid the latter approach as it adds
overhead.
@@ -151,7 +151,7 @@ and is generally reduced when addresses are more scattered, especially in
global memory.
Device memory is accessed via 32-, 64-, or 128-byte transactions that must be
naturally aligned.
naturally aligned.
Maximizing memory throughput involves:
- Coalescing memory accesses of threads within a warp into minimal transactions.
@@ -294,7 +294,7 @@ Applications frequently allocating and freeing memory might experience slower
allocation calls over time as memory is released back to the operating system.
To optimize performance in such scenarios, follow these guidelines:
- Avoid allocating all available memory with :cpp:func:`hipMalloc` or
- Avoid allocating all available memory with :cpp:func:`hipMalloc` or
:cpp:func:`hipHostMalloc`, as this immediately reserves memory and might
prevent other applications from using it. This behavior could strain the
operating system schedulers or prevent other applications from running on the
@@ -309,7 +309,7 @@ To optimize performance in such scenarios, follow these guidelines:
performance, they allow the application to continue running.
- For supported platforms, use :cpp:func:`hipMallocManaged`, as it allows
oversubscription. With the right policies, :cpp:func:`hipMallocManaged` can
maintain most, if not all, :cpp:func:`hipMalloc` performance.
maintain most, if not all, :cpp:func:`hipMalloc` performance.
:cpp:func:`hipMallocManaged` doesn't require an allocation to be resident
until it is needed or prefetched, which eases the load on the operating
system's schedulers and facilitates multitenant scenarios.
@@ -1,212 +0,0 @@
# HIP programming manual
## Host Memory
### Introduction
`hipHostMalloc` allocates pinned host memory that is mapped into the address space of all GPUs in the system. The memory can be accessed directly by the GPU device, and can be read or written with much higher bandwidth than pageable memory obtained with functions such as `malloc()`.
There are two use cases for this host memory:
* Faster `HostToDevice` and `DeviceToHost` Data Transfers:
The runtime tracks the `hipHostMalloc` allocations and can avoid some of the setup required for regular unpinned memory. For exact measurements on a specific system, experiment with `--unpinned` and `--pinned` switches for the `hipBusBandwidth` tool.
* Zero-Copy GPU Access:
GPU can directly access the host memory over the CPU/GPU interconnect, without need to copy the data. This avoids the need for the copy, but during the kernel access each memory access must traverse the interconnect, which can be tens of times slower than accessing the GPU's local device memory. Zero-copy memory can be a good choice when the memory accesses are infrequent (perhaps only once). Zero-copy memory is typically "Coherent" and thus not cached by the GPU but this can be overridden if desired.
### Memory allocation flags
The flags parameter specifies options for how the memory is allocated, for example:
`hipHostMallocPortable`, the memory is considered allocated by all contexts, not just the one on which the allocation is made.
`hipHostMallocMapped`, will map the allocation into the address space for the current device, and the device pointer can be obtained with the API `hipHostGetDevicePointer()`.
`hipHostMallocNumaUser` is the flag that allows host memory allocation to follow the Numa policy set by the user. Please note that this flag is currently only applicable on Linux and is under development on Windows.
All allocation flags are independent, and can be used in any combination without restriction, for instance, `hipHostMalloc` can be called with both `hipHostMallocPortable` and `hipHostMallocMapped` flags set. Both usage models described above use the same allocation flags, and the difference is in how the surrounding code uses the host memory.
### Numa-aware host memory allocation
Numa policy determines how memory is allocated.
The goal of the Numa policy is to select the CPU that is closest to each GPU.
Numa distance is a measure of how far apart the GPU and CPU devices are.
By default, each GPU selects the Numa CPU node that has the least Numa distance between them; that is, host memory is automatically allocated from the memory pool of the Numa node closest to the current GPU device. After using the `hipSetDevice` API to switch to a different GPU, the host allocation can still be accessed, but the Numa distance can be longer.
Note, Numa policy is so far implemented on Linux, and under development on Windows.
### Coherency Controls
ROCm defines two coherency options for host memory:
* Coherent memory : Supports fine-grain synchronization while the kernel is running. For example, a kernel can perform atomic operations that are visible to the host CPU or to other (peer) GPUs. Synchronization instructions include `threadfence_system` and C++11-style atomic operations.
In order to achieve this fine-grained coherence, many AMD GPUs use a limited cache policy, such as leaving these allocations uncached by the GPU, or making them read-only.
* Non-coherent memory : Can be cached by GPU, but cannot support synchronization while the kernel is running. Non-coherent memory can be optionally synchronized only at command (end-of-kernel or copy command) boundaries. This memory is appropriate for high-performance access when fine-grain synchronization is not required.
HIP provides the developer with controls to select which type of memory is used via allocation flags passed to `hipHostMalloc` and the `HIP_HOST_COHERENT` environment variable. By default, the environment variable HIP_HOST_COHERENT is set to 0 in HIP.
The control logic in the current version of HIP is as follows:
* No flags are passed in: the host memory allocation is coherent, the HIP_HOST_COHERENT environment variable is ignored.
* `hipHostMallocCoherent=1`: The host memory allocation will be coherent, the HIP_HOST_COHERENT environment variable is ignored.
* `hipHostMallocMapped=1`: The host memory allocation will be coherent, the HIP_HOST_COHERENT environment variable is ignored.
* `hipHostMallocNonCoherent=1`, `hipHostMallocCoherent=0`, and `hipHostMallocMapped=0`: The host memory will be non-coherent, the HIP_HOST_COHERENT environment variable is ignored.
* `hipHostMallocCoherent=0`, `hipHostMallocNonCoherent=0`, `hipHostMallocMapped=0`, but one of the other `HostMalloc` flags is set:
* If `HIP_HOST_COHERENT` is defined as 1, the host memory allocation is coherent.
* If `HIP_HOST_COHERENT` is not defined, or defined as 0, the host memory allocation is non-coherent.
* `hipHostMallocCoherent=1`, `hipHostMallocNonCoherent=1`: Illegal.
### Visibility of Zero-Copy Host Memory
Coherent host memory is automatically visible at synchronization points.
Non-coherent host memory visibility depends on the synchronization API used, as shown in the following table:
| HIP API | Synchronization Effect | Fence | Coherent Host Memory Visibility | Non-Coherent Host Memory Visibility|
| --- | --- | --- | --- | --- |
| `hipStreamSynchronize` | host waits for all commands in the specified stream to complete | system-scope release | yes | yes |
| `hipDeviceSynchronize` | host waits for all commands in all streams on the specified device to complete | system-scope release | yes | yes |
| `hipEventSynchronize` | host waits for the specified event to complete | device-scope release | yes | depends - see below|
| `hipStreamWaitEvent` | stream waits for the specified event to complete | none | yes | no |
### `hipEventSynchronize`
Developers can control the release scope for `hipEvents`:
* By default, the GPU performs a device-scope acquire and release operation with each recorded event. This will make host and device memory visible to other commands executing on the same device.
A stronger system-level fence can be specified when the event is created with `hipEventCreateWithFlags`:
* `hipEventReleaseToSystem`: Perform a system-scope release operation when the event is recorded. This will make both Coherent and Non-Coherent host memory visible to other agents in the system, but may involve heavyweight operations such as cache flushing. Coherent memory will typically use lighter-weight in-kernel synchronization mechanisms such as an atomic operation and thus does not need to use `hipEventReleaseToSystem`.
* `hipEventDisableTiming`: Events created with this flag will not record profiling data and provide the best performance if used for synchronization.
### Summary and Recommendations
* Coherent host memory is the default and is the easiest to use since the memory is visible to the CPU at typical synchronization points. This memory allows in-kernel synchronization commands such as `threadfence_system` to work transparently.
* HIP/ROCm also supports the ability to cache host memory in the GPU using the "Non-Coherent" host memory allocations. This can provide performance benefit, but care must be taken to use the correct synchronization.
### Managed memory allocation
Managed memory, including the `__managed__` keyword, is supported in HIP combined host/device compilation, on Linux, not on Windows (under development).
Managed memory, via unified memory allocation, allows data to be shared and accessible to both the CPU and GPU using a single pointer.
The allocation is managed by the AMD GPU driver using the Linux HMM (Heterogeneous Memory Management) mechanism. The user can call the managed memory API `hipMallocManaged` to allocate a large chunk of HMM memory, execute kernels on the device, and fetch data between the host and device as needed.
In a HIP application, it is recommended to perform a capability check before calling the managed memory APIs. For example:
```cpp
int managed_memory = 0;
HIPCHECK(hipDeviceGetAttribute(&managed_memory,
hipDeviceAttributeManagedMemory,p_gpuDevice));
if (!managed_memory ) {
printf ("info: managed memory access not supported on the device %d\n Skipped\n", p_gpuDevice);
}
else {
HIPCHECK(hipSetDevice(p_gpuDevice));
HIPCHECK(hipMallocManaged(&Hmm, N * sizeof(T)));
. . .
}
```
Please note, the managed memory capability check may not be necessary, but if HMM is not supported, then managed malloc will fall back to using system memory and other managed memory API calls will have undefined behavior.
Note, managed memory management is implemented on Linux, not supported on Windows yet.
### HIP Stream Memory Operations
HIP supports Stream Memory Operations to enable direct synchronization between network nodes and the GPU. The following APIs are available:
`hipStreamWaitValue32`
`hipStreamWaitValue64`
`hipStreamWriteValue32`
`hipStreamWriteValue64`
Note, CPU access to the semaphore's memory requires the `volatile` keyword to disable the CPU compiler's optimizations on memory access.
For more details, please check the documentation `HIP-API.pdf`.
Please note, HIP stream does not guarantee concurrency on AMD hardware for the case of multiple (at least 6) long-running streams executing concurrently, using `hipStreamSynchronize(nullptr)` for synchronization.
## Direct Dispatch
The HIP runtime has Direct Dispatch enabled by default in ROCm 4.4 on Linux.
With this feature we move away from our conventional producer-consumer model where the runtime creates a worker thread (consumer) for each HIP Stream, and the host thread (producer) enqueues commands to a command queue (per stream).
With Direct Dispatch, the HIP runtime directly enqueues a packet to the AQL queue (user mode queue on GPU) on the Dispatch API call from the application. This has been shown to reduce the latency to launch the first wave on the idle GPU and the total time of tiny dispatches synchronized with the host.
In addition, eliminating the threads in runtime has reduced the variance in the dispatch numbers as the thread scheduling delays and atomics/locks synchronization latencies are reduced.
This feature can be disabled via setting the following environment variable,
AMD_DIRECT_DISPATCH=0
Note, Direct Dispatch is implemented on Linux. It is currently not supported on Windows.
## HIP Runtime Compilation
HIP now supports runtime compilation (HIP RTC), which makes additional optimizations and performance improvements possible compared with other APIs that rely on regular offline static compilation.
HIP RTC APIs accept HIP source files in character string format as input parameters and create handles of programs by compiling the HIP source files without spawning separate processes.
For more details on HIP RTC APIs, refer to [HIP Runtime API Reference](../doxygen/html/index).
For Linux developers, [this example](https://github.com/ROCm/hip-tests/blob/develop/samples/2_Cookbook/23_cmake_hiprtc/saxpy.cpp) shows how to program a HIP application using the runtime compilation mechanism. A detailed [HIP RTC programming guide](./hip_rtc) is also available.
## HIP Graph
HIP graphs are supported. For more details, refer to the [HIP API Guide](../doxygen/html/group___graph) or the [how-to section for HIP graphs](../how-to/hipgraph).
## Device-Side Malloc
HIP-Clang now supports device-side malloc and free.
This implementation does not require the use of `hipDeviceSetLimit(hipLimitMallocHeapSize,value)`, nor does it respect any such setting. The heap is fully dynamic and can grow until the available free memory on the device is consumed.
## Use of Per-thread default stream
The per-thread default stream is supported in HIP. It is an implicit stream local to both the thread and the current device. This means that the command issued to the per-thread default stream by the thread does not implicitly synchronize with other streams (like explicitly created streams), or default per-thread stream on other threads.
The per-thread default stream is a blocking stream and will synchronize with the default null stream if both are used in a program.
The per-thread default stream can be enabled via adding a compilation option,
`-fgpu-default-stream=per-thread`.
And users can explicitly use `hipStreamPerThread` as per-thread default stream handle as input in API commands. There are test codes as examples in the [link](https://github.com/ROCm/hip-tests/tree/develop/catch/unit/streamperthread).
## Use of Long Double Type
In HIP-Clang, the long double type is an 80-bit extended precision format for x86_64, which is not supported by AMDGPU. HIP-Clang treats the long double type as the IEEE double type for AMDGPU. Using the long double type in HIP source code will not cause issues as long as data of long double type is not transferred between host and device. However, the long double type should not be used as a kernel argument type.
## Use of `_Float16` Type
If a host function is to be used between clang (or hipcc) and gcc for x86_64, i.e. its definition is compiled by one compiler but the caller is compiled by a different compiler, `_Float16` or aggregates containing `_Float16` should not be used as function argument or return type. This is due to lack of stable ABI for `_Float16` on x86_64. Passing `_Float16` or aggregates containing `_Float16` between clang and gcc could cause undefined behavior.
## FMA and contractions
By default HIP-Clang assumes `-ffp-contract=fast-honor-pragmas`.
Users can use `#pragma clang fp contract(on|off|fast)` to control `fp` contraction of a block of code.
For x86_64, FMA is off by default since the generic x86_64 target does not
support FMA by default. To turn on FMA on x86_64, either use `-mfma` or `-march=native`
on CPUs supporting FMA.
When contractions are enabled and the CPU has not enabled FMA instructions, the
GPU can produce different numerical results than the CPU for expressions that
can be contracted. Tolerance should be used for floating point comparisons.
## Math functions with special rounding modes
Note: Currently, HIP only supports basic math functions with rounding mode `rn` (round to nearest). HIP does not support basic math functions with rounding modes `ru` (round up), `rd` (round down), and `rz` (round towards zero).
## Creating Static Libraries
HIP-Clang supports generating two types of static libraries. The first type of static library does not export device functions, and only exports and launches host functions within the same library. The advantage of this type is the ability to link with a non-hipcc compiler such as gcc. The second type exports device functions to be linked by other code objects. However, this requires using hipcc as the linker.
In addition, the first type of library contains host objects with device code embedded as fat binaries. It is generated using the flag `--emit-static-lib`. The second type of library contains relocatable device objects and is generated using `ar`.
Here is an example to create and use static libraries:
* Type 1 using `--emit-static-lib`:
```cpp
hipcc hipOptLibrary.cpp --emit-static-lib -fPIC -o libHipOptLibrary.a
gcc test.cpp -L. -lhipOptLibrary -L/path/to/hip/lib -lamdhip64 -o test.out
```
* Type 2 using system `ar`:
```cpp
hipcc hipDevice.cpp -c -fgpu-rdc -o hipDevice.o
ar rcsD libHipDevice.a hipDevice.o
hipcc libHipDevice.a test.cpp -fgpu-rdc -o test.out
```
For more information, please see [HIP samples host functions](https://github.com/ROCm/hip-tests/tree/develop/samples/2_Cookbook/15_static_library/host_functions) and [device_functions](https://github.com/ROCm/hip-tests/tree/rocm-5.5.x/samples/2_Cookbook/15_static_library/device_functions).
@@ -1,577 +0,0 @@
.. meta::
:description: This chapter introduces Unified Memory (UM) and shows
how to use it in AMD HIP.
:keywords: AMD, ROCm, HIP, CUDA, unified memory, unified, memory, UM, APU
*******************************************************************************
Unified memory
*******************************************************************************
In conventional architectures, CPUs and GPUs have dedicated memory like Random
Access Memory (RAM) and Video Random Access Memory (VRAM). This architectural
design, while effective, can be limiting in terms of memory capacity and
bandwidth, as continuous memory copying is required to allow the processors to
access the appropriate data. New architectural features like Heterogeneous
System Architectures (HSA) and Unified Memory (UM) help avoid these limitations
and promise increased efficiency and innovation.
Unified memory
==============
Unified Memory is a single memory address space accessible from any processor
within a system. This setup simplifies memory management processes and enables
applications to allocate data that can be read or written by code running on
either CPUs or GPUs. The Unified memory model is shown in the following figure.
.. figure:: ../data/unified_memory/um.svg
AMD Accelerated Processing Unit (APU) is a typical example of a Unified Memory
Architecture. On a single die, a central processing unit (CPU) is combined
with an integrated graphics processing unit (iGPU), and both have access to a
high-bandwidth memory (HBM) module named Unified Memory. The CPU enables
high-performance, low-latency operations, while the GPU is optimized for high
throughput (data processed per unit time).
.. _unified memory system requirements:
System requirements
===================
Unified memory is supported on Linux by all modern AMD GPUs from the Vega
series onward. Unified memory management can be achieved with managed memory
allocation and, for the latest GPUs, with a system allocator.
The table below lists the supported allocators. The allocators are described in
the next section.
.. list-table:: Supported Unified Memory Allocators
:widths: 40, 25, 25, 25
:header-rows: 1
:align: center
* - Architecture
- ``hipMallocManaged()``
- ``__managed__``
- ``malloc()``
* - MI200, MI300 Series
- ✅
- ✅
- ✅ :sup:`1`
* - MI100
- ✅
- ✅
- ❌
* - RDNA (Navi) Series
- ✅
- ✅
- ❌
* - GCN5 (Vega) Series
- ✅
- ✅
- ❌
✅: **Supported**
❌: **Unsupported**
:sup:`1` Works only with ``XNACK=1``. First GPU access causes recoverable
page-fault. For more details, visit
`GPU memory <https://rocm.docs.amd.com/en/latest/conceptual/gpu-memory.html#xnack>`_.
.. _unified memory programming models:
Unified memory programming models
=================================
This section showcases various unified memory programming models. Model
availability depends on your architecture. For more information, see :ref:`unified memory
system requirements` and :ref:`checking unified memory management support`.
- **HIP managed memory allocation API**:
The ``hipMallocManaged()`` is a dynamic memory allocator available on
all GPUs with unified memory support. For more details, visit
:ref:`unified_memory_reference`.
- **HIP managed variables**:
The ``__managed__`` declaration specifier, which serves as its counterpart,
is supported on all modern AMD cards and can be utilized for static
allocation.
- **System allocation API**:
Starting with the AMD MI300 series, the ``malloc()`` system allocator allows
you to reserve unified memory. The system allocator is more versatile and
offers an easy transition from CPU-written C++ code to HIP code, as the
same system allocation API is used.
.. _checking unified memory management support:
Checking unified memory management support
------------------------------------------
Some device attributes can offer information about which :ref:`unified memory
programming models` are supported. The attribute value is 1 if the
functionality is supported, and 0 if it is not supported.
.. list-table:: Device attributes for unified memory management
:widths: 40, 60
:header-rows: 1
:align: center
* - attribute
- description
* - ``hipDeviceAttributeManagedMemory``
- unified addressing is supported
* - ``hipDeviceAttributeConcurrentManagedAccess``
- full managed memory support, concurrent access is supported
* - ``hipDeviceAttributePageableMemoryAccess``
- both managed and system memory allocation API is supported
The following examples show how to use device attributes:
.. code-block:: cpp
#include <hip/hip_runtime.h>
#include <iostream>
int main() {
int d;
hipGetDevice(&d);
int is_cma = 0;
hipDeviceGetAttribute(&is_cma, hipDeviceAttributeConcurrentManagedAccess, d);
std::cout << "HIP Managed Memory: "
<< (is_cma == 1 ? "is" : "NOT")
<< " supported" << std::endl;
return 0;
}
Example for unified memory management
-------------------------------------
The following example shows how to use unified memory management with the
``hipMallocManaged()`` function, with the ``__managed__`` attribute for static
allocation, and with standard ``malloc()`` allocation. For comparison, the Explicit
Memory Management example is presented in the last tab.
.. tab-set::
.. tab-item:: hipMallocManaged()
.. code-block:: cpp
:emphasize-lines: 12-15
#include <hip/hip_runtime.h>
#include <iostream>
// Addition of two values.
__global__ void add(int *a, int *b, int *c) {
*c = *a + *b;
}
int main() {
int *a, *b, *c;
// Allocate memory for a, b and c that is accessible to both device and host codes.
hipMallocManaged(&a, sizeof(*a));
hipMallocManaged(&b, sizeof(*b));
hipMallocManaged(&c, sizeof(*c));
// Setup input values.
*a = 1;
*b = 2;
// Launch add() kernel on GPU.
hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
// Wait for GPU to finish before accessing on host.
hipDeviceSynchronize();
// Prints the result.
std::cout << *a << " + " << *b << " = " << *c << std::endl;
// Cleanup allocated memory.
hipFree(a);
hipFree(b);
hipFree(c);
return 0;
}
.. tab-item:: __managed__
.. code-block:: cpp
:emphasize-lines: 9-10
#include <hip/hip_runtime.h>
#include <iostream>
// Addition of two values.
__global__ void add(int *a, int *b, int *c) {
*c = *a + *b;
}
// Declare a, b and c as static variables.
__managed__ int a, b, c;
int main() {
// Setup input values.
a = 1;
b = 2;
// Launch add() kernel on GPU.
hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, &a, &b, &c);
// Wait for GPU to finish before accessing on host.
hipDeviceSynchronize();
// Prints the result.
std::cout << a << " + " << b << " = " << c << std::endl;
return 0;
}
.. tab-item:: malloc()
.. code-block:: cpp
:emphasize-lines: 12-15
#include <hip/hip_runtime.h>
#include <iostream>
// Addition of two values.
__global__ void add(int* a, int* b, int* c) {
*c = *a + *b;
}
int main() {
int* a, * b, * c;
// Allocate memory for a, b, and c.
a = (int*)malloc(sizeof(*a));
b = (int*)malloc(sizeof(*b));
c = (int*)malloc(sizeof(*c));
// Setup input values.
*a = 1;
*b = 2;
// Launch add() kernel on GPU.
hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
// Wait for GPU to finish before accessing on host.
hipDeviceSynchronize();
// Prints the result.
std::cout << *a << " + " << *b << " = " << *c << std::endl;
// Cleanup allocated memory.
free(a);
free(b);
free(c);
return 0;
}
.. tab-item:: Explicit Memory Management
.. code-block:: cpp
:emphasize-lines: 17-24, 29-30
#include <hip/hip_runtime.h>
#include <iostream>
// Addition of two values.
__global__ void add(int *a, int *b, int *c) {
*c = *a + *b;
}
int main() {
int a, b, c;
int *d_a, *d_b, *d_c;
// Setup input values.
a = 1;
b = 2;
// Allocate device copies of a, b and c.
hipMalloc(&d_a, sizeof(*d_a));
hipMalloc(&d_b, sizeof(*d_b));
hipMalloc(&d_c, sizeof(*d_c));
// Copy input values to device.
hipMemcpy(d_a, &a, sizeof(*d_a), hipMemcpyHostToDevice);
hipMemcpy(d_b, &b, sizeof(*d_b), hipMemcpyHostToDevice);
// Launch add() kernel on GPU.
hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, d_a, d_b, d_c);
// Copy the result back to the host.
hipMemcpy(&c, d_c, sizeof(*d_c), hipMemcpyDeviceToHost);
// Cleanup allocated memory.
hipFree(d_a);
hipFree(d_b);
hipFree(d_c);
// Prints the result.
std::cout << a << " + " << b << " = " << c << std::endl;
return 0;
}
.. _using unified memory management:
Using unified memory management (UMM)
=====================================
Unified memory management (UMM) is a feature that can simplify the complexities
of memory management in GPU computing. It is particularly useful in
heterogeneous computing environments with heavy memory usage with both a CPU
and a GPU, which would require large memory transfers. Here are some areas
where UMM can be beneficial:
- **Simplification of Memory Management**:
UMM can help to simplify the complexities of memory management. This can make
it easier for developers to write code without worrying about memory
allocation and deallocation details.
- **Data Migration**:
UMM allows for efficient data migration between the host (CPU) and the device
(GPU). This can be particularly useful for applications that need to move
data back and forth between the device and host.
- **Improved Programming Productivity**:
As a positive side effect, UMM can reduce the lines of code, thereby
improving programming productivity.
In HIP, pinned memory allocations are coherent by default. Pinned memory is
host memory mapped into the address space of all GPUs, meaning that the pointer
can be used on both host and device. Using pinned memory instead of pageable
memory on the host can improve bandwidth.
While UMM can provide numerous benefits, it's important to be aware of the
potential performance overhead associated with UMM. You must thoroughly test
and profile your code to ensure it's the most suitable choice for your use
case.
.. _unified memory runtime hints:
Unified memory HIP runtime hints for better performance
===========================================================
Unified memory HIP runtime hints can help improve the performance of your code if
you know your code's ability and infrastructure. Some hint techniques are
presented in this section.
The hint functions can set actions on a selected device, which can be
identified by ``hipGetDeviceProperties(&prop, device_id)``. There are two
special ``device_id`` values:
- ``hipCpuDeviceId`` = -1 means that the advised device is the CPU.
- ``hipInvalidDeviceId`` = -2 means that the device is invalid.
For the best performance, profile your application to optimize the
utilization of HIP runtime hints.
Data prefetching
----------------
Data prefetching is a technique used to improve the performance of your
application by moving data closer to the processing unit before it's actually
needed.
.. code-block:: cpp
:emphasize-lines: 20-23,31-32
// Addition of two values.
__global__ void add(int *a, int *b, int *c) {
*c = *a + *b;
}
int main() {
int *a, *b, *c;
int deviceId;
hipGetDevice(&deviceId); // Get the current device ID
// Allocate memory for a, b and c that is accessible to both device and host codes.
hipMallocManaged(&a, sizeof(*a));
hipMallocManaged(&b, sizeof(*b));
hipMallocManaged(&c, sizeof(*c));
// Setup input values.
*a = 1;
*b = 2;
// Prefetch the data to the GPU device.
hipMemPrefetchAsync(a, sizeof(*a), deviceId, 0);
hipMemPrefetchAsync(b, sizeof(*b), deviceId, 0);
hipMemPrefetchAsync(c, sizeof(*c), deviceId, 0);
// Launch add() kernel on GPU.
hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
// Wait for GPU to finish before accessing on host.
hipDeviceSynchronize();
// Prefetch the result back to the CPU.
hipMemPrefetchAsync(c, sizeof(*c), hipCpuDeviceId, 0);
// Wait for the prefetch operations to complete.
hipDeviceSynchronize();
// Prints the result.
std::cout << *a << " + " << *b << " = " << *c << std::endl;
// Cleanup allocated memory.
hipFree(a);
hipFree(b);
hipFree(c);
return 0;
}
Remember to check the return status of ``hipMemPrefetchAsync()`` to ensure that
the prefetch operations are completed successfully.
Memory advice
-------------
The effectiveness of ``hipMemAdvise()`` comes from its ability to inform the
runtime system of the developer's intentions regarding memory usage. When the
runtime system has knowledge of the expected memory access patterns, it can
make better decisions about data placement and caching, leading to more
efficient execution of the application. However, the actual impact on
performance can vary based on the specific use case and the hardware
architecture.
For the description of ``hipMemAdvise()`` and the detailed list of advice,
visit the :ref:`unified_memory_reference`.
Here is the updated version of the example above with memory advice.
.. code-block:: cpp
:emphasize-lines: 17-26
#include <hip/hip_runtime.h>
#include <iostream>
// Addition of two values.
__global__ void add(int *a, int *b, int *c) {
*c = *a + *b;
}
int main() {
int *a, *b, *c;
// Allocate memory for a, b, and c accessible to both device and host codes.
hipMallocManaged(&a, sizeof(*a));
hipMallocManaged(&b, sizeof(*b));
hipMallocManaged(&c, sizeof(*c));
// Set memory advice for a, b, and c to be accessed by the CPU.
hipMemAdvise(a, sizeof(*a), hipMemAdviseSetPreferredLocation, hipCpuDeviceId);
hipMemAdvise(b, sizeof(*b), hipMemAdviseSetPreferredLocation, hipCpuDeviceId);
hipMemAdvise(c, sizeof(*c), hipMemAdviseSetPreferredLocation, hipCpuDeviceId);
// Additionally, set memory advice for a, b, and c to be read mostly from the device 0.
constexpr int device = 0;
hipMemAdvise(a, sizeof(*a), hipMemAdviseSetReadMostly, device);
hipMemAdvise(b, sizeof(*b), hipMemAdviseSetReadMostly, device);
hipMemAdvise(c, sizeof(*c), hipMemAdviseSetReadMostly, device);
// Setup input values.
*a = 1;
*b = 2;
// Launch add() kernel on GPU.
hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
// Wait for GPU to finish before accessing on host.
hipDeviceSynchronize();
// Prints the result.
std::cout << *a << " + " << *b << " = " << *c << std::endl;
// Cleanup allocated memory.
hipFree(a);
hipFree(b);
hipFree(c);
return 0;
}
Memory range attributes
-----------------------
Memory Range attributes allow you to query attributes of a given memory range.
The ``hipMemRangeGetAttribute()`` is added to the example to query the
``hipMemRangeAttributeReadMostly`` attribute of the memory range pointed to by
``a``. The result is stored in ``attributeValue`` and then printed out.
For more details, visit the
:ref:`unified_memory_reference`.
.. code-block:: cpp
:emphasize-lines: 29-34
#include <hip/hip_runtime.h>
#include <iostream>
// Addition of two values.
__global__ void add(int *a, int *b, int *c) {
*c = *a + *b;
}
int main() {
int *a, *b, *c;
unsigned int attributeValue;
constexpr size_t attributeSize = sizeof(attributeValue);
// Allocate memory for a, b and c that is accessible to both device and host codes.
hipMallocManaged(&a, sizeof(*a));
hipMallocManaged(&b, sizeof(*b));
hipMallocManaged(&c, sizeof(*c));
// Setup input values.
*a = 1;
*b = 2;
// Launch add() kernel on GPU.
hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
// Wait for GPU to finish before accessing on host.
hipDeviceSynchronize();
// Query an attribute of the memory range.
hipMemRangeGetAttribute(&attributeValue,
attributeSize,
hipMemRangeAttributeReadMostly,
a,
sizeof(*a));
// Prints the result.
std::cout << *a << " + " << *b << " = " << *c << std::endl;
std::cout << "The queried attribute value is: " << attributeValue << std::endl;
// Cleanup allocated memory.
hipFree(a);
hipFree(b);
hipFree(c);
return 0;
}
Asynchronously attach memory to a stream
----------------------------------------
The ``hipStreamAttachMemAsync`` function would be able to asynchronously attach memory to a stream, which can help concurrent execution when using streams.
Currently, this function is a no-operation (NOP) function on AMD GPUs. It simply returns success after the runtime memory validation passes. This function is necessary on Microsoft Windows, and UMM is not supported on this operating system with AMD GPUs at the moment.
@@ -1,94 +0,0 @@
.. meta::
:description: This chapter introduces Virtual Memory (VM) and shows
how to use it in AMD HIP.
:keywords: AMD, ROCm, HIP, CUDA, virtual memory, virtual, memory, UM, APU
.. _virtual_memory:
*****************************
Virtual memory management
*****************************
Memory management is important when creating high-performance applications in the HIP ecosystem. Both allocating and copying memory can result in bottlenecks, which can significantly impact performance.
Global memory allocation in HIP uses the C language style allocation function. This works fine for simple cases but can cause problems if your memory needs change. If you need to increase the size of your memory, you must allocate a second larger buffer and copy the data to it before you can free the original buffer. This increases overall memory usage and causes unnecessary ``memcpy`` calls. Another solution is to allocate a larger buffer than you initially need. However, this isn't an efficient way to handle resources and doesn't solve the issue of reallocation when the extra buffer runs out.
Virtual memory management solves these memory management problems. It helps to reduce memory usage and unnecessary ``memcpy`` calls.
.. _memory_allocation_virtual_memory:
Memory allocation
=================
Standard memory allocation uses the ``hipMalloc`` function to allocate a block of memory on the device. However, when using virtual memory, this process is separated into multiple steps using the ``hipMemCreate``, ``hipMemAddressReserve``, ``hipMemMap``, and ``hipMemSetAccess`` functions. This guide explains what these functions do and how you can use them for virtual memory management.
Allocate physical memory
------------------------
The first step is to allocate the physical memory itself with the ``hipMemCreate`` function. This function accepts the size of the buffer, an ``unsigned long long`` variable for the flags, and a ``hipMemAllocationProp`` variable. ``hipMemAllocationProp`` contains the properties of the memory to be allocated, such as where the memory is physically located and what kind of shareable handles are available. If the allocation is successful, the function returns a value of ``hipSuccess``, with ``hipMemGenericAllocationHandle_t`` representing a valid physical memory allocation. The allocated memory size must be aligned with the granularity appropriate for the properties of the allocation. You can use the ``hipMemGetAllocationGranularity`` function to determine the correct granularity.
.. code-block:: cpp
size_t granularity = 0;
hipMemGenericAllocationHandle_t allocHandle;
hipMemAllocationProp prop = {};
prop.type = HIP_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = HIP_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = currentDev;
hipMemGetAllocationGranularity(&granularity, &prop, HIP_MEM_ALLOC_GRANULARITY_MINIMUM);
padded_size = ROUND_UP(size, granularity);
hipMemCreate(&allocHandle, padded_size, &prop, 0);
Reserve virtual address range
-----------------------------
After you have acquired an allocation of physical memory, you must map it before you can use it. To do so, you need a virtual address to map it to. Mapping means the physical memory allocation is available from the virtual address range it is mapped to. To reserve a virtual memory range, use the ``hipMemAddressReserve`` function. The size of the virtual memory must match the amount of physical memory previously allocated. You can then map the physical memory allocation to the newly-acquired virtual memory address range using the ``hipMemMap`` function.
.. code-block:: cpp
hipMemAddressReserve(&ptr, padded_size, 0, 0, 0);
hipMemMap(ptr, padded_size, 0, allocHandle, 0);
Set memory access
-----------------
Finally, use the ``hipMemSetAccess`` function to enable memory access. It accepts the pointer to the virtual memory, the size, and a ``hipMemAccessDesc`` descriptor as parameters. In a multi-GPU environment, you can map the device memory of one GPU to another. This feature also works with the traditional memory management system, but isn't as scalable as with virtual memory. When memory is allocated with ``hipMalloc``, ``hipDeviceEnablePeerAccess`` is used to enable peer access. This function enables access between two devices, but it means that every call to ``hipMalloc`` takes more time to perform the checks and the mapping between the devices. When using virtual memory management, peer access is enabled by ``hipMemSetAccess``, which provides a finer level of control over what is shared. This has no performance impact on memory allocation and gives you more control over what memory buffers are shared with which devices.
.. code-block:: cpp
hipMemAccessDesc accessDesc = {};
accessDesc.location.type = HIP_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = currentDev;
accessDesc.flags = HIP_MEM_ACCESS_FLAGS_PROT_READWRITE;
hipMemSetAccess(ptr, padded_size, &accessDesc, 1);
At this point the memory is allocated, mapped, and ready for use. You can read and write to it, just as you would with a C-style memory allocation.
Free virtual memory
-------------------
To free the memory allocated in this manner, use the corresponding free functions. To unmap the memory, use ``hipMemUnmap``. To release the virtual address range, use ``hipMemAddressFree``. Finally, to release the physical memory, use ``hipMemRelease``. A side effect of these functions is the lack of synchronization when memory is released. If you call ``hipFree`` when you have multiple streams running in parallel, it synchronizes the device. This causes worse resource usage and performance.
.. code-block:: cpp
hipMemUnmap(ptr, size);
hipMemRelease(allocHandle);
hipMemAddressFree(ptr, size);
.. _usage_virtual_memory:
Memory usage
============
Dynamically increase allocation size
------------------------------------
The ``hipMemAddressReserve`` function allows you to increase the amount of pre-allocated memory. This function accepts a parameter representing the requested starting address of the virtual memory. This allows you to have a continuous virtual address space without worrying about the underlying physical allocation.
.. code-block:: cpp
hipMemAddressReserve(&new_ptr, (new_size - padded_size), 0, ptr + padded_size, 0);
hipMemMap(new_ptr, (new_size - padded_size), 0, newAllocHandle, 0);
hipMemSetAccess(new_ptr, (new_size - padded_size), &accessDesc, 1);
The code sample above assumes that ``hipMemAddressReserve`` was able to reserve the memory address at the specified location. However, this isn't guaranteed to be true, so you should validate that ``new_ptr`` points to the requested virtual address (``ptr + padded_size``) before using it.
+24 -41
مشاهده پرونده
@@ -1,71 +1,54 @@
<head>
<meta charset="UTF-8">
<meta name="description" content="HIP documentation and programming guide.">
<meta name="keywords" content="HIP, Heterogeneous-computing Interface for Portability, HIP programming guide">
</head>
# HIP documentation
The Heterogeneous-computing Interface for Portability (HIP) API is a C++ runtime
API and kernel language that lets developers create portable applications for AMD
and NVIDIA GPUs from single source code.
The Heterogeneous-computing Interface for Portability (HIP) is a C++ runtime API
and kernel language that lets you create portable applications for AMD and
NVIDIA GPUs from a single source code. For more information, see [What is HIP?](./what_is_hip)
For HIP supported AMD GPUs on multiple operating systems, see:
* [Linux system requirements](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html#supported-gpus)
* [Microsoft Windows system requirements](https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html#windows-supported-gpus)
The CUDA enabled NVIDIA GPUs are supported by HIP. For more information, see [GPU Compute Capability](https://developer.nvidia.com/cuda-gpus).
On the AMD ROCm platform, HIP provides header files and a runtime library built on top of the HIP-Clang compiler in the [Compute Language Runtimes (CLR)](./understand/amd_clr) repository, which contains the source code for AMD's compute language runtimes.
On non-AMD platforms, like NVIDIA, HIP provides header files required to support non-AMD specific back-end implementation in the repository ['hipother'](https://github.com/ROCm/hipother), which translates from the HIP runtime APIs to CUDA runtime APIs.
## Overview
::::{grid} 1 1 2 2
:gutter: 3
:::{grid-item-card} Install
Installation instructions are available from:
* [Installing HIP](./install/install)
* [Building HIP from source](./install/build)
:::
The HIP documentation is organized into the following categories:
:::{grid-item-card} Conceptual
::::{grid} 1 2 2 2
:gutter: 3
:::{grid-item-card} Programming guide
* [Introduction](./programming_guide)
* {doc}`./understand/programming_model`
* {doc}`./understand/hardware_implementation`
* {doc}`./understand/amd_clr`
* {doc}`./understand/texture_fetching`
:::
:::{grid-item-card} How to
* [Programming manual](./how-to/programming_manual)
* [HIP porting guide](./how-to/hip_porting_guide)
* [HIP porting: driver API guide](./how-to/hip_porting_driver_api)
* {doc}`./how-to/hip_rtc`
* {doc}`./understand/compilers`
* {doc}`./how-to/performance_guidelines`
* [Debugging with HIP](./how-to/debugging)
* {doc}`./how-to/logging`
* [Unified memory](./how-to/unified_memory)
* [Virtual memory](./how-to/virtual_memory)
* {doc}`./how-to/stream_ordered_allocator`
* [Cooperative groups](./how-to/cooperative_groups)
* [HIP graphs](./how-to/hipgraph)
* {doc}`./how-to/faq`
* {doc}`./how-to/hip_runtime_api`
* [HIP porting guide](./how-to/hip_porting_guide)
* [HIP porting: driver API guide](./how-to/hip_porting_driver_api)
* {doc}`./how-to/hip_rtc`
* {doc}`./understand/amd_clr`
:::
:::{grid-item-card} Reference
* [HIP runtime API](./reference/hip_runtime_api_reference)
* [Modules](./reference/hip_runtime_api/modules)
* [Global defines, enums, structs and files](./reference/hip_runtime_api/global_defines_enums_structs_files)
* [HSA runtime API for ROCm](./reference/virtual_rocr)
* [C++ language extensions](./reference/cpp_language_extensions)
* [C++ language support](./reference/cpp_language_support)
* [HIP math API](./reference/math_api)
* [HIP environment variables](./reference/env_variables)
* [Comparing syntax for different APIs](./reference/terms)
* [List of deprecated APIs](./reference/deprecated_api_list)
* [FP8 numbers in HIP](./reference/fp8_numbers)
* {doc}`./reference/hardware_features`
:::
@@ -1,3 +1,7 @@
.. meta::
:description: This page gives instructions on how to build HIP from source.
:keywords: AMD, ROCm, HIP, build, build instructions, source
*******************************************
Build HIP from source
*******************************************
@@ -1,12 +1,21 @@
.. meta::
:description: This page explains how to install HIP
:keywords: AMD, ROCm, HIP, install, installation
*******************************************
Install HIP
*******************************************
HIP can be installed on AMD (ROCm with HIP-Clang) and NVIDIA (CUDA with NVCC) platforms.
Note: The version definition for the HIP runtime is different from CUDA. On an AMD platform, the
``hipRuntimeGetVersion`` function returns the HIP runtime version; on an NVIDIA platform, this function
returns the CUDA runtime version.
.. note::
The version definition for the HIP runtime is different from CUDA. On AMD
platforms, the :cpp:func:`hipRuntimeGetVersion` function returns the HIP
runtime version. On NVIDIA platforms, this function returns the CUDA runtime
version.
.. _install_prerequisites:
Prerequisites
=======================================
@@ -24,8 +33,9 @@ Prerequisites
.. tab-item:: NVIDIA
:sync: nvidia
Check the system requirements in the
`NVIDIA CUDA Installation Guide <https://docs.nvidia.com/cuda/cuda-installation-guide-linux/>`_.
With NVIDIA GPUs, HIP requires unified memory. All CUDA-enabled NVIDIA
GPUs with compute capability 5.0 or later should be supported. For more
information, see `NVIDIA's list of CUDA enabled GPUs <https://developer.nvidia.com/cuda-gpus>`_.
Installation
=======================================
@@ -41,7 +51,7 @@ Installation
* :doc:`rocm-install-on-linux:index`
* :doc:`rocm-install-on-windows:index`
By default, HIP is installed into ``/opt/rocm/hip``.
By default, HIP is installed into ``/opt/rocm``.
.. note::
There is no autodetection for the HIP installation. If you choose to install it somewhere other than the default location, you must set the ``HIP_PATH`` environment variable as explained in `Build HIP from source <./build.html>`_.
@@ -83,7 +93,7 @@ Installation
The default paths are:
* CUDA SDK: ``/usr/local/cuda``
* HIP: ``/opt/rocm/hip``
* HIP: ``/opt/rocm``
#. Set the HIP_PLATFORM to nvidia.
@@ -0,0 +1,83 @@
.. meta::
:description: HIP programming guide introduction
:keywords: HIP programming guide introduction, HIP programming guide
.. _hip-programming-guide:
********************************************************************************
HIP programming guide introduction
********************************************************************************
This topic provides key HIP programming concepts and links to more detailed
information.
Write GPU Kernels for Parallel Execution
================================================================================
To make the most of the parallelism inherent to GPUs, a thorough understanding
of the :ref:`programming model <programming_model>` is helpful. The HIP
programming model is designed to make it easy to map data-parallel algorithms to
the architecture of the GPUs. HIP employs the SIMT model (Single Instruction
Multiple Threads) with a multi-layered thread hierarchy for efficient execution.
Understand the Target Architecture (CPU and GPU)
================================================================================
The :ref:`hardware implementation <hardware_implementation>` topic outlines the
GPUs supported by HIP. In general, GPUs are made up of Compute Units that excel
at executing parallelizable, computationally intensive workloads without complex
control-flow.
Increase parallelism on multiple levels
================================================================================
To maximize performance and keep all system components fully utilized, the
application should expose and efficiently manage as much parallelism as possible.
:ref:`Parallel execution <parallel execution>` can be achieved at the
application, device, and multiprocessor levels.
The application's host and device operations can achieve parallel execution
through asynchronous calls, streams, or HIP graphs. On the device level,
multiple kernels can execute concurrently when resources are available, and at
the multiprocessor level, developers can overlap data transfers with
computations to further optimize performance.
Memory management
================================================================================
GPUs generally have their own distinct memory, also called :ref:`device
memory <device_memory>`, separate from the :ref:`host memory <host_memory>`.
Device memory needs to be managed separately from the host memory. This includes
allocating the memory and transferring data between the host and the device. These
operations can be performance critical, so it's important to know how to use
them effectively. For more information, see :ref:`Memory management <memory_management>`.
Synchronize CPU and GPU Workloads
================================================================================
Tasks on the host and devices run asynchronously, so proper synchronization is
needed when dependencies between those tasks exist. The asynchronous execution
of tasks is useful for fully utilizing the available resources. Even when only a
single device is available, memory transfers and the execution of tasks can be
overlapped with asynchronous execution.
Error Handling
================================================================================
All functions in the HIP runtime API return an error value of type
:cpp:enum:`hipError_t` that can be used to verify whether the function was
successfully executed. It's important to confirm these returned values, in order
to catch and handle those errors, if possible. An exception is kernel launches,
which don't return any value. These errors can be caught with specific functions
like :cpp:func:`hipGetLastError()`.
For more information, see :ref:`error_handling`.
Multi-GPU and Load Balancing
================================================================================
Large-scale applications that need more compute power can use multiple GPUs in
the system. This requires distributing workloads across multiple GPUs to balance
the load and prevent some GPUs from being overutilized while others are idle.
For more information, see :ref:`multi-device`.
@@ -97,7 +97,7 @@ When using ``hipLaunchKernelGGL``, your first five parameters must be:
* ``size_t dynamicShared``: The amount of additional shared memory that you want to allocate
when launching the kernel (see :ref:`shared-variable-type`).
* ``hipStream_t``: The stream where you want to run the kernel. A value of ``0`` corresponds to the
NULL stream (see :ref:`synchronization functions`).
NULL stream (see :ref:`synchronization_functions`).
You can include your kernel arguments after these parameters.
@@ -293,6 +293,7 @@ dimensions to 1.
dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {};
};
.. _memory_fence_instructions:
Memory fence instructions
====================================================
@@ -306,7 +307,7 @@ HIP supports ``__threadfence()`` and ``__threadfence_block()``. If you're using
``hipHostMalloc()``.
* Remove ``memcpy`` for all allocated fine-grained system memory regions.
.. _synchronization functions:
.. _synchronization_functions:
Synchronization functions
====================================================
@@ -321,7 +322,7 @@ The Cooperative Groups API offer options to do synchronization on a developer de
Math functions
====================================================
HIP-Clang supports a set of math operations that are callable from the device.
HIP-Clang supports a set of math operations that are callable from the device.
HIP supports most of the device functions supported by CUDA. These are described
on :ref:`Math API page <math_api_reference>`.
@@ -376,6 +377,8 @@ To read a high-resolution timer from the device, HIP provides the following buil
Note that ``clock()`` and ``clock64()`` do not work properly on AMD RDNA3 (GFX11) graphic processors.
.. _atomic functions:
Atomic functions
===============================================
@@ -734,6 +737,8 @@ will be enabled unconditionally in the next ROCm release. Wherever possible, the
implementation includes a static assert to check that the program source uses
the correct type for the mask.
.. _warp_vote_functions:
Warp vote and ballot functions
-------------------------------------------------------------------------------------------------------------
@@ -6,87 +6,171 @@
HIP deprecated runtime API functions
**********************************************************************************************
Several of our API functions have been flagged for deprecation. Using the following functions results in
errors and unexpected results, so we encourage you to update your code accordingly.
Several of our API functions have been flagged for deprecation. Using the
following functions results in errors and unexpected results, so we encourage
you to update your code accordingly.
Context management
Deprecated since ROCm 6.1.0
============================================================
CUDA supports cuCtx API, which is the driver API that defines "Context" and "Devices" as separate
entities. Context contains a single device, and a device can theoretically have multiple contexts. HIP
initially added limited support for these APIs in order to facilitate porting from existing driver codes.
These APIs are now marked as deprecated because there are better alternate interfaces (such as
``hipSetDevice`` or the stream API) to achieve these functions.
Deprecated texture management functions.
* ``hipCtxCreate``
* ``hipCtxDestroy``
* ``hipCtxPopCurrent``
* ``hipCtxPushCurrent``
* ``hipCtxSetCurrent``
* ``hipCtxGetCurrent``
* ``hipCtxGetDevice``
* ``hipCtxGetApiVersion``
* ``hipCtxGetCacheConfig``
* ``hipCtxSetCacheConfig``
* ``hipCtxSetSharedMemConfig``
* ``hipCtxGetSharedMemConfig``
* ``hipCtxSynchronize``
* ``hipCtxGetFlags``
* ``hipCtxEnablePeerAccess``
* ``hipCtxDisablePeerAccess``
* ``hipDevicePrimaryCtxGetState``
* ``hipDevicePrimaryCtxRelease``
* ``hipDevicePrimaryCtxRetain``
* ``hipDevicePrimaryCtxReset``
* ``hipDevicePrimaryCtxSetFlags``
.. list-table::
:widths: 40
:header-rows: 1
:align: left
Memory management
* - function
* - :cpp:func:`hipTexRefGetBorderColor`
* - :cpp:func:`hipTexRefGetArray`
Deprecated since ROCm 5.7.0
============================================================
* ``hipMallocHost`` (replaced with ``hipHostMalloc``)
* ``hipMemAllocHost`` (replaced with ``hipHostMalloc``)
* ``hipMemcpyToArray``
* ``hipMemcpyFromArray``
Deprecated texture management functions.
Profiler control
.. list-table::
:widths: 40
:header-rows: 1
:align: left
* - function
* - :cpp:func:`hipBindTextureToMipmappedArray`
Deprecated since ROCm 5.3.0
============================================================
* ``hipProfilerStart`` (use roctracer/rocTX)
* ``hipProfilerStop`` (use roctracer/rocTX)
Deprecated texture management functions.
.. list-table::
:widths: 40
:header-rows: 1
:align: left
Texture management
* - function
* - :cpp:func:`hipGetTextureReference`
* - :cpp:func:`hipTexRefSetAddressMode`
* - :cpp:func:`hipTexRefSetArray`
* - :cpp:func:`hipTexRefSetFlags`
* - :cpp:func:`hipTexRefSetFilterMode`
* - :cpp:func:`hipTexRefSetFormat`
* - :cpp:func:`hipTexRefSetMipmapFilterMode`
* - :cpp:func:`hipTexRefSetMipmapLevelBias`
* - :cpp:func:`hipTexRefSetMipmapLevelClamp`
* - :cpp:func:`hipTexRefSetMipmappedArray`
Deprecated since ROCm 4.3.0
============================================================
* ``hipGetTextureReference``
* ``hipTexRefSetAddressMode``
* ``hipTexRefSetArray``
* ``hipTexRefSetFilterMode``
* ``hipTexRefSetFlags``
* ``hipTexRefSetFormat``
* ``hipTexRefGetAddress``
* ``hipTexRefGetAddressMode``
* ``hipTexRefGetFilterMode``
* ``hipTexRefGetFlags``
* ``hipTexRefGetFormat``
* ``hipTexRefGetMaxAnisotropy``
* ``hipTexRefGetMipmapFilterMode``
* ``hipTexRefGetMipmapLevelBias``
* ``hipTexRefGetMipmapLevelClamp``
* ``hipTexRefGetMipMappedArray``
* ``hipTexRefSetAddress``
* ``hipTexRefSetAddress2D``
* ``hipTexRefSetMaxAnisotropy``
* ``hipTexRefSetBorderColor``
* ``hipTexRefSetMipmapFilterMode``
* ``hipTexRefSetMipmapLevelBias``
* ``hipTexRefSetMipmapLevelClamp``
* ``hipTexRefSetMipmappedArray``
* ``hipTexRefGetBorderColor``
* ``hipTexRefGetArray``
* ``hipBindTexture``
* ``hipBindTexture2D``
* ``hipBindTextureToArray``
* ``hipGetTextureAlignmentOffset``
* ``hipUnbindTexture``
* ``hipBindTextureToMipmappedArray``
Deprecated texture management functions.
.. list-table::
:widths: 40
:header-rows: 1
:align: left
* - function
* - :cpp:func:`hipTexRefGetAddress`
* - :cpp:func:`hipTexRefGetAddressMode`
* - :cpp:func:`hipTexRefGetFilterMode`
* - :cpp:func:`hipTexRefGetFlags`
* - :cpp:func:`hipTexRefGetFormat`
* - :cpp:func:`hipTexRefGetMaxAnisotropy`
* - :cpp:func:`hipTexRefGetMipmapFilterMode`
* - :cpp:func:`hipTexRefGetMipmapLevelBias`
* - :cpp:func:`hipTexRefGetMipmapLevelClamp`
* - :cpp:func:`hipTexRefGetMipMappedArray`
* - :cpp:func:`hipTexRefSetAddress`
* - :cpp:func:`hipTexRefSetAddress2D`
* - :cpp:func:`hipTexRefSetBorderColor`
* - :cpp:func:`hipTexRefSetMaxAnisotropy`
Deprecated since ROCm 3.8.0
============================================================
Deprecated memory management and texture management functions.
.. list-table::
:widths: 40
:header-rows: 1
:align: left
* - function
* - :cpp:func:`hipBindTexture`
* - :cpp:func:`hipBindTexture2D`
* - :cpp:func:`hipBindTextureToArray`
* - :cpp:func:`hipGetTextureAlignmentOffset`
* - :cpp:func:`hipUnbindTexture`
* - :cpp:func:`hipMemcpyToArray`
* - :cpp:func:`hipMemcpyFromArray`
Deprecated since ROCm 3.1.0
============================================================
Deprecated memory management functions.
.. list-table::
:widths: 40, 60
:header-rows: 1
:align: left
* - function
-
* - :cpp:func:`hipMallocHost`
- replaced with :cpp:func:`hipHostAlloc`
* - :cpp:func:`hipMemAllocHost`
- replaced with :cpp:func:`hipHostAlloc`
Deprecated since ROCm 3.0.0
============================================================
The ``hipProfilerStart`` and ``hipProfilerStop`` functions are deprecated.
Instead, you can use ``roctracer`` or ``rocTX`` for profiling, which provide more
flexibility and detailed profiling capabilities.
.. list-table::
:widths: 40
:header-rows: 1
:align: left
* - function
* - :cpp:func:`hipProfilerStart`
* - :cpp:func:`hipProfilerStop`
Deprecated since ROCm 1.9.0
============================================================
CUDA supports cuCtx API, which is the driver API that defines "Context" and
"Devices" as separate entities. Context contains a single device, and a device
can theoretically have multiple contexts. HIP initially added limited support
for context APIs in order to facilitate porting from existing driver codes. These
APIs are now marked as deprecated because there are better alternate interfaces
(such as ``hipSetDevice`` or the stream API) to achieve these functions.
.. list-table::
:widths: 40
:header-rows: 1
:align: left
* - function
* - :cpp:func:`hipCtxCreate`
* - :cpp:func:`hipCtxDestroy`
* - :cpp:func:`hipCtxPopCurrent`
* - :cpp:func:`hipCtxPushCurrent`
* - :cpp:func:`hipCtxSetCurrent`
* - :cpp:func:`hipCtxGetCurrent`
* - :cpp:func:`hipCtxGetDevice`
* - :cpp:func:`hipCtxGetApiVersion`
* - :cpp:func:`hipCtxGetCacheConfig`
* - :cpp:func:`hipCtxSetCacheConfig`
* - :cpp:func:`hipCtxSetSharedMemConfig`
* - :cpp:func:`hipCtxGetSharedMemConfig`
* - :cpp:func:`hipCtxSynchronize`
* - :cpp:func:`hipCtxGetFlags`
* - :cpp:func:`hipCtxEnablePeerAccess`
* - :cpp:func:`hipCtxDisablePeerAccess`
* - :cpp:func:`hipDevicePrimaryCtxGetState`
* - :cpp:func:`hipDevicePrimaryCtxRelease`
* - :cpp:func:`hipDevicePrimaryCtxRetain`
* - :cpp:func:`hipDevicePrimaryCtxReset`
* - :cpp:func:`hipDevicePrimaryCtxSetFlags`
@@ -0,0 +1,189 @@
.. meta::
:description: HIP environment variables reference
:keywords: AMD, HIP, environment variables, environment, reference
********************************************************************************
HIP environment variables
********************************************************************************
This section lists the important HIP environment variables on the AMD
platform, grouped by functionality.
GPU isolation variables
================================================================================
The GPU isolation environment variables in HIP are collected in the next table.
For more information, check :doc:`GPU isolation page <rocm:conceptual/gpu-isolation>`.
.. list-table::
:header-rows: 1
:widths: 70,30
* - **Environment variable**
- **Value**
* - | ``ROCR_VISIBLE_DEVICES``
| A list of device indices or UUIDs that will be exposed to applications.
- Example: ``0,GPU-DEADBEEFDEADBEEF``
* - | ``GPU_DEVICE_ORDINAL``
| Device indices exposed to OpenCL and HIP applications.
- Example: ``0,2``
* - | ``HIP_VISIBLE_DEVICES`` or ``CUDA_VISIBLE_DEVICES``
| Device indices exposed to HIP applications.
- Example: ``0,2``
Profiling variables
================================================================================
The profiling environment variables in HIP are collected in the next table. For
more information, check :doc:`setting the number of CUs page <rocm:how-to/setting-cus>`.
.. list-table::
:header-rows: 1
:widths: 70,30
* - **Environment variable**
- **Value**
* - | ``HSA_CU_MASK``
| Sets the mask on a lower level of queue creation in the driver,
| this mask will also be set for queues being profiled.
- Example: ``1:0-8``
* - | ``ROC_GLOBAL_CU_MASK``
| Sets the mask on queues created by the HIP or the OpenCL runtimes,
| this mask will also be set for queues being profiled.
- Example: ``0xf``, enables only 4 CUs
* - | ``HIP_FORCE_QUEUE_PROFILING``
| Used to run the app as if it were run in rocprof. Forces command queue
| profiling on by default.
- | 0: Disable
| 1: Enable
Debug variables
================================================================================
The debugging environment variables in HIP are collected in the next table. For
more information, check :ref:`debugging_with_hip`.
.. include:: ../how-to/debugging_env.rst
Memory management related variables
================================================================================
The memory management related environment variables in HIP are collected in the
next table.
.. list-table::
:header-rows: 1
:widths: 35,14,51
* - **Environment variable**
- **Default value**
- **Value**
* - | ``HIP_HIDDEN_FREE_MEM``
| Amount of memory to hide from the free memory reported by hipMemGetInfo.
- ``0``
- | 0: Disable
| Unit: megabyte (MB)
* - | ``HIP_HOST_COHERENT``
| Specifies if the memory is coherent between the host and GPU in ``hipHostMalloc``.
- ``0``
- | 0: Memory is not coherent.
| 1: Memory is coherent.
| The environment variable has an effect if the following conditions are satisfied:
| - One of the ``hipHostMallocDefault``, ``hipHostMallocPortable``, ``hipHostMallocWriteCombined`` or ``hipHostMallocNumaUser`` flag set to 1.
| - ``hipHostMallocCoherent``, ``hipHostMallocNonCoherent`` and ``hipHostMallocMapped`` flags set to 0.
* - | ``HIP_INITIAL_DM_SIZE``
| Set initial heap size for device malloc.
- ``8388608``
- | Unit: Byte
| The default value corresponds to 8 MB.
* - | ``HIP_MEM_POOL_SUPPORT``
| Enables memory pool support in HIP.
- ``0``
- | 0: Disable
| 1: Enable
* - | ``HIP_MEM_POOL_USE_VM``
| Enables memory pool support in HIP.
- | ``0``: other OS
| ``1``: Windows
- | 0: Disable
| 1: Enable
* - | ``HIP_VMEM_MANAGE_SUPPORT``
| Virtual Memory Management Support.
- ``1``
- | 0: Disable
| 1: Enable
* - | ``GPU_MAX_HEAP_SIZE``
| Set maximum size of the GPU heap to % of board memory.
- ``100``
- | Unit: Percentage
* - | ``GPU_MAX_REMOTE_MEM_SIZE``
| Maximum size that allows device memory substitution with system memory.
- ``2``
- | Unit: kilobyte (KB)
* - | ``GPU_NUM_MEM_DEPENDENCY``
| Number of memory objects for dependency tracking.
- ``256``
-
* - | ``GPU_STREAMOPS_CP_WAIT``
| Force the stream memory operation to wait on CP.
- ``0``
- | 0: Disable
| 1: Enable
* - | ``HSA_LOCAL_MEMORY_ENABLE``
| Enable HSA device local memory usage.
- ``1``
- | 0: Disable
| 1: Enable
* - | ``PAL_ALWAYS_RESIDENT``
| Force memory resources to become resident at allocation time.
- ``0``
- | 0: Disable
| 1: Enable
* - | ``PAL_PREPINNED_MEMORY_SIZE``
| Size of prepinned memory.
- ``64``
- | Unit: kilobyte (KB)
* - | ``REMOTE_ALLOC``
| Use remote memory for the global heap allocation.
- ``0``
- | 0: Disable
| 1: Enable
Other useful variables
================================================================================
The following table lists environment variables that are useful but relate to
different features.
.. list-table::
:header-rows: 1
:widths: 35,14,51
* - **Environment variable**
- **Default value**
- **Value**
* - | ``HIPRTC_COMPILE_OPTIONS_APPEND``
| Sets compile options needed for ``hiprtc`` compilation.
- None
- ``--gpu-architecture=gfx906:sramecc+:xnack``, ``-fgpu-rdc``
@@ -0,0 +1,249 @@
.. meta::
:description: This chapter describes the hardware features of the different hardware architectures.
:keywords: AMD, ROCm, HIP, hardware, hardware features, hardware architectures
*******************************************************************************
Hardware features
*******************************************************************************
This page gives an overview of the different hardware architectures and the
features they implement. Hardware features do not imply performance; that
depends on the specifications found in the :doc:`rocm:reference/gpu-arch-specs`
page.
.. list-table::
:header-rows: 1
:name: hardware-features-table
*
- Hardware feature support
- RDNA1
- CDNA1
- RDNA2
- CDNA2
- RDNA3
- CDNA3
*
- :ref:`atomic functions` on 32-bit integer values in global and shared memory
-
-
-
-
-
-
*
- Atomic functions on 64-bit integer values in global and shared memory
-
-
-
-
-
-
*
- Atomic addition on 32-bit floating point values in global and shared memory
-
-
-
-
-
-
*
- Atomic addition on 64-bit floating point values in global memory and shared memory
-
-
-
-
-
-
*
- :ref:`Warp vote functions <warp_vote_functions>`
-
-
-
-
-
-
*
- :ref:`Memory fence instructions <memory_fence_instructions>`
-
-
-
-
-
-
*
- :ref:`Synchronization functions <synchronization_functions>`
-
-
-
-
-
-
*
- :ref:`Surface functions <surface_object_reference>`
-
-
-
-
-
-
*
- :ref:`float16 half precision IEEE-conformant floating-point operations<rocm:precision_support_floating_point_types>`
-
-
-
-
-
-
*
- :ref:`bfloat16 16-bit floating-point operations<rocm:precision_support_floating_point_types>`
-
-
-
-
-
-
*
- Support for :ref:`8-bit floating-point types <rocm:precision_support_floating_point_types>`
-
-
-
-
-
-
*
- Support for :ref:`tensor float32 <rocm:precision_support_floating_point_types>`
-
-
-
-
-
-
*
- Packed math with 16-bit floating point values
-
-
-
-
-
-
*
- Packed math with 32-bit floating point values
-
-
-
-
-
-
*
- Matrix Cores
-
-
-
-
-
-
*
- On-Chip Error Correcting Code (ECC)
-
-
-
-
-
-
*
- Maximum dimensionality of grid
- 3
- 3
- 3
- 3
- 3
- 3
*
- Maximum x-, y- or z-dimension of a grid
- :math:`2^{32} - 1`
- :math:`2^{32} - 1`
- :math:`2^{32} - 1`
- :math:`2^{32} - 1`
- :math:`2^{32} - 1`
- :math:`2^{32} - 1`
*
- Maximum number of threads per grid
- :math:`2^{32} - 1`
- :math:`2^{32} - 1`
- :math:`2^{32} - 1`
- :math:`2^{32} - 1`
- :math:`2^{32} - 1`
- :math:`2^{32} - 1`
*
- Maximum x-, y- or z-dimension of a block
- :math:`1024`
- :math:`1024`
- :math:`1024`
- :math:`1024`
- :math:`1024`
- :math:`1024`
*
- Maximum number of threads per block
- :math:`1024`
- :math:`1024`
- :math:`1024`
- :math:`1024`
- :math:`1024`
- :math:`1024`
*
- Wavefront size
- 32 [1]_
- 64
- 32 [1]_
- 64
- 32 [1]_
- 64
*
- Maximum number of resident blocks per compute unit
- 40 [1]_
- 32
- 32 [1]_
- 32
- 32 [1]_
- 32
*
- Maximum number of resident wavefronts per compute unit
- 40 [1]_
- 32
- 32 [1]_
- 32
- 32 [1]_
- 32
*
- Maximum number of resident threads per compute unit
- 1280 [2]_
- 2048
- 1024 [2]_
- 2048
- 1024 [2]_
- 2048
*
- Maximum number of 32-bit vector registers per thread
- 256
- 256 (vector) + 256 (matrix)
- 256
- 256 (vector) + 256 (matrix)
- 256
- 256 (vector) + 256 (matrix)
*
- Maximum number of 32-bit scalar accumulation registers per thread
- 106
- 104
- 106
- 104
- 106
- 104
.. [1] RDNA architectures have a configurable wavefront size. The native
wavefront size is 32, but they can run in "CU mode", which has an effective
wavefront size of 64. This affects the number of resident wavefronts and
blocks per compute unit.
.. [2] RDNA architectures expand the concept of the traditional compute unit
with the so-called work group processor, which effectively includes two
compute units, within which all threads can cooperate.
@@ -11,5 +11,5 @@ The structs, define macros, enums and files in the HIP runtime API.
* :ref:`global_enum_defines_reference`
* :ref:`driver_types_reference`
* :doc:`hip:doxygen/html/annotated`
* :doc:`hip:doxygen/html/files`
* :doc:`../../doxygen/html/annotated`
* :doc:`../../doxygen/html/files`
@@ -9,4 +9,4 @@ OpenGL interoperability
*******************************************************************************
.. doxygengroup:: GL
:content-only:
:content-only:
@@ -1,5 +1,5 @@
.. meta::
:description: This chapter describes the built-in math functions that are accessible in HIP.
:description: This chapter describes the built-in math functions that are accessible in HIP.
:keywords: AMD, ROCm, HIP, CUDA, math functions, HIP math functions
.. _math_api_reference:
@@ -1,6 +1,6 @@
.. meta::
:description: This chapter lists user-mode API interfaces and libraries
necessary for host applications to launch compute kernels to
:description: This chapter lists user-mode API interfaces and libraries
necessary for host applications to launch compute kernels to
available HSA ROCm kernel agents.
:keywords: AMD, ROCm, HIP, HSA, ROCR runtime, virtual memory management
@@ -5,6 +5,9 @@ defaults:
maxdepth: 6
root: index
subtrees:
- entries:
- file: what_is_hip
- file: faq
- caption: Install
entries:
@@ -12,33 +15,50 @@ subtrees:
title: Installing HIP
- file: install/build
title: Building HIP from source
- url: https://rocm.docs.amd.com/projects/install-on-linux/en/${branch}/reference/system-requirements.html
title: Linux supported AMD GPUs
- url: https://rocm.docs.amd.com/projects/install-on-windows/en/${branch}/reference/system-requirements.html
title: Windows supported AMD GPUs
- url: https://developer.nvidia.com/cuda-gpus
title: NVIDIA supported GPUs
- caption: Conceptual
- caption: Programming guide
entries:
- file: programming_guide
title: Introduction
- file: understand/programming_model
- file: understand/hardware_implementation
- file: understand/amd_clr
- file: understand/texture_fetching
title: Texture fetching
- caption: How to
entries:
- file: how-to/programming_manual
- file: how-to/hip_porting_guide
- file: how-to/hip_porting_driver_api
- file: how-to/hip_rtc
- file: understand/compilers
- file: how-to/performance_guidelines
- file: how-to/debugging
- file: how-to/logging
- file: how-to/cooperative_groups
- file: how-to/unified_memory
title: Unified memory
- file: how-to/virtual_memory
title: Virtual memory
- file: how-to/stream_ordered_allocator
- file: how-to/hipgraph
title: HIP graphs
- file: how-to/faq
- file: how-to/hip_runtime_api
subtrees:
- entries:
- file: how-to/hip_runtime_api/initialization
- file: how-to/hip_runtime_api/memory_management
subtrees:
- entries:
- file: how-to/hip_runtime_api/memory_management/host_memory
- file: how-to/hip_runtime_api/memory_management/device_memory
subtrees:
- entries:
- file: how-to/hip_runtime_api/memory_management/device_memory/texture_fetching
- file: how-to/hip_runtime_api/memory_management/coherence_control
- file: how-to/hip_runtime_api/memory_management/unified_memory
- file: how-to/hip_runtime_api/memory_management/virtual_memory
- file: how-to/hip_runtime_api/memory_management/stream_ordered_allocator
- file: how-to/hip_runtime_api/error_handling
- file: how-to/hip_runtime_api/cooperative_groups
- file: how-to/hip_runtime_api/hipgraph
- file: how-to/hip_runtime_api/call_stack
- file: how-to/hip_runtime_api/multi_device
- file: how-to/hip_runtime_api/opengl_interop
- file: how-to/hip_runtime_api/external_interop
- file: how-to/hip_porting_guide
- file: how-to/hip_porting_driver_api
- file: how-to/hip_rtc
- file: understand/amd_clr
- caption: Reference
entries:
@@ -75,6 +95,7 @@ subtrees:
- file: reference/hip_runtime_api/modules/runtime_compilation
- file: reference/hip_runtime_api/modules/callback_activity_apis
- file: reference/hip_runtime_api/modules/graph_management
- file: reference/hip_runtime_api/modules/graphics_interoperability
- file: reference/hip_runtime_api/modules/opengl_interoperability
- file: reference/hip_runtime_api/modules/cooperative_groups_reference
- file: reference/hip_runtime_api/global_defines_enums_structs_files
@@ -90,12 +111,14 @@ subtrees:
- file: reference/cpp_language_support
title: C++ language support
- file: reference/math_api
- file: reference/env_variables
- file: reference/terms
title: Comparing syntax for different APIs
- file: reference/deprecated_api_list
title: List of deprecated APIs
- file: reference/fp8_numbers
title: FP8 numbers in HIP
- file: reference/hardware_features
- caption: Tutorials
entries:
@@ -1,2 +1,2 @@
rocm-docs-core[api_reference]==1.7.2
rocm-docs-core[api_reference]==1.10.0
sphinxcontrib.doxylink
@@ -116,7 +116,7 @@ requests==2.32.3
# via
# pygithub
# sphinx
rocm-docs-core[api-reference]==1.7.2
rocm-docs-core[api-reference]==1.10.0
# via -r requirements.in
six==1.16.0
# via python-dateutil
تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است Diff را بارگزاری کن
@@ -0,0 +1,628 @@
// MIT License
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "nvidia_hip_fix.hpp"
#include "example_utils.hpp"
#include "glad/glad.h"
#include <GLFW/glfw3.h>
#include <hip/hip_gl_interop.h>
#include <hip/hip_runtime.h>
#include <chrono>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <vector>
/// \brief The number of grid points (vertices) along the x-axis of the example's grid.
/// Note that this counts vertices, not triangles: the renderer allocates
/// <tt>grid_width * grid_height</tt> vertices and builds its triangles from the
/// <tt>(grid_width - 1) * (grid_height - 1)</tt> squares between them.
constexpr uint32_t grid_width = 256;
/// \brief The number of grid points (vertices) along the y-axis of the example's grid.
constexpr uint32_t grid_height = 256;
/// \brief The GLSL source of the OpenGL vertex shader that is used to render the triangles
/// in this example.
/// The grid x- and y-positions (\p in_xy) are used directly as the vertex position in clip space.
/// The height value (\p in_height) is passed on unchanged to the fragment shader as \p frag_height.
/// The attribute names \p in_height and \p in_xy are looked up by name when the renderer sets up
/// its vertex array object, so they must be kept in sync with that code.
constexpr const char* vertex_shader = R"(
#version 330 core
in float in_height;
in vec2 in_xy;
out float frag_height;
void main()
{
gl_Position = vec4(in_xy, 0, 1);
frag_height = in_height;
}
)";
/// \brief The GLSL source of the OpenGL fragment shader that is used to render the triangles
/// in this example.
/// The \p frag_height value is used to shade the fragment: it is remapped with
/// <tt>frag_height * 0.5 + 0.5</tt> and written to all three color channels, producing a
/// gray-scale color. \p frag_height is the output of the vertex shader with the same name.
constexpr const char* fragment_shader = R"(
#version 330 core
in float frag_height;
void main()
{
gl_FragColor = vec4(vec3(frag_height * 0.5 + 0.5), 1.0);
}
)";
/// \brief Create the GLFW window (and its OpenGL 3.3 core-profile debug context) that the
/// example renders into.
/// \param initial_width - The width of the window when it is first shown.
/// \param initial_height - The height of the window when it is first shown.
/// \returns The newly created window. If the window cannot be created, an error is printed
/// and the program is exited.
GLFWwindow* create_window(const int initial_width, const int initial_height)
{
    /// [Sphinx-create-window]
    // Describe the OpenGL context that we want GLFW to create together with the window.
    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
    glfwWindowHint(GLFW_OPENGL_DEBUG_CONTEXT, GLFW_TRUE);

    // No monitor (windowed mode) and no context to share resources with.
    GLFWwindow* window = glfwCreateWindow(initial_width,
                                          initial_height,
                                          "OpenGL-HIP interop example",
                                          nullptr,
                                          nullptr);
    if(!window)
    {
        std::cerr << "Failed to create GLFW window\n";
        std::exit(error_exit_code);
    }
    /// [Sphinx-create-window]
    return window;
}
/// \brief Query HIP for a device that can interoperate with the current OpenGL context.
/// \returns The id of the first HIP device reported by <tt>hipGLGetDevices</tt>. If no
/// such device is reported, an error is printed and the program is exited.
int pick_hip_device()
{
    /// [Sphinx-pick device]
    int          hip_device;
    unsigned int gl_device_count;

    // Ask for at most one device out of all devices associated with the OpenGL context.
    HIP_CHECK(
        hipGLGetDevices(&gl_device_count, &hip_device, 1, hipGLDeviceList::hipGLDeviceListAll));

    if(gl_device_count < 1)
    {
        std::cerr << "System has no OpenGL-capable HIP devices" << std::endl;
        std::exit(error_exit_code);
    }
    /// [Sphinx-pick device]
    return hip_device;
}
/// \brief Utility function to compile shader source into an OpenGL shader.
/// If the shader could not be compiled, this function prints the compile log
/// and exits the program.
/// \param type - The OpenGL shader type for this shader, for example
/// \p GL_VERTEX_SHADER or \p GL_FRAGMENT_SHADER.
/// \param source - The NUL-terminated GLSL source code for the shader.
/// \returns The handle of the successfully compiled shader.
GLuint compile_shader(const GLenum type, const char* const source)
{
    const GLuint shader = glCreateShader(type);
    const GLint  length = static_cast<GLint>(std::strlen(source));
    glShaderSource(shader, 1, &source, &length);
    glCompileShader(shader);

    GLint compile_status;
    glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_status);
    if(compile_status != GL_TRUE)
    {
        // Compiling failed, get the shader log and print it to the user.
        GLint log_length;
        glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &log_length);
        std::vector<GLchar> log(log_length);
        // The buffer size handed to OpenGL must be the size of the log buffer, not the
        // length of the shader source: passing the latter lets the driver write past the
        // end of `log` (long source, short log) or truncates the message (short source).
        glGetShaderInfoLog(shader, log_length, nullptr, log.data());
        std::cerr << "Failed to compile shader:\n";
        std::cerr.write(log.data(), log.size()) << std::endl;
        std::exit(error_exit_code);
    }
    return shader;
}
/// \brief Build an OpenGL shader program out of a vertex and a fragment shader.
/// Both shaders are compiled with <tt>compile_shader</tt> and then linked together. If
/// linking fails, the link log is printed and the program is exited.
/// \param vert_src - The GLSL source code for the shader program's vertex shader.
/// \param frag_src - The GLSL source code for the shader program's fragment shader.
/// \returns The handle of the linked shader program.
GLuint compile_shader_program(const char* const vert_src, const char* const frag_src)
{
    const GLuint program         = glCreateProgram();
    const GLuint vertex_stage    = compile_shader(GL_VERTEX_SHADER, vert_src);
    const GLuint fragment_stage  = compile_shader(GL_FRAGMENT_SHADER, frag_src);

    glAttachShader(program, fragment_stage);
    glAttachShader(program, vertex_stage);
    glLinkProgram(program);

    GLint link_status;
    glGetProgramiv(program, GL_LINK_STATUS, &link_status);
    if(link_status != GL_TRUE)
    {
        // The link step failed: fetch the info log from the driver and show it to the user.
        GLint info_log_size;
        glGetProgramiv(program, GL_INFO_LOG_LENGTH, &info_log_size);
        std::vector<GLchar> info_log(info_log_size);
        glGetProgramInfoLog(program, info_log_size, nullptr, info_log.data());
        std::cerr << "Failed to link program:\n";
        std::cerr.write(info_log.data(), info_log.size()) << std::endl;
        std::exit(error_exit_code);
    }

    // The individual shader objects are no longer needed once the program is linked.
    glDetachShader(program, fragment_stage);
    glDetachShader(program, vertex_stage);
    glDeleteShader(fragment_stage);
    glDeleteShader(vertex_stage);
    return program;
}
/// \brief This structure contains the OpenGL handles that this example uses to render the
/// triangle grid to the screen.
///
/// Three buffers are used to render the triangle grid, the color of which is determined by
/// a HIP computation in \p simulator:
/// - The height buffer contains one height value per grid point (rendered as color).
/// - The grid buffer holds the x- and y-coordinates of each grid point. Every point is stored
///   only once; the triangles that are made up from these points are defined by the index buffer.
/// - The index buffer holds indices into the former two buffers to make up a list of triangles.
///
/// The structure owns its OpenGL objects: they are created in the constructor and deleted in the
/// destructor, which is why copying and moving are disabled.
struct renderer
{
/// The total number of vertices for the triangles.
constexpr static size_t num_verts = grid_width * grid_height;
/// The number of bytes in the x- and y-coordinates buffer. Each x/y coordinate is encoded as
/// a pair of floats, which are stored in a packed array-of-structures format: | x | y | x | y | ... |.
constexpr static size_t grid_buffer_size = num_verts * sizeof(float) * 2;
/// The number of bytes in the height buffer. Each height is encoded as a floating point value.
/// This buffer will be shared with HIP, which is why the heights are
/// stored in a separate buffer.
constexpr static size_t height_buffer_size = num_verts * sizeof(float);
/// The number of indices in the index buffer. Each triangle has 3 points, each square in the grid
/// is made up of 2 triangles. There are (width - 1) by (height - 1) squares in the grid.
constexpr static size_t num_indices = (grid_width - 1) * (grid_height - 1) * 3 * 2;
/// The number of bytes in the index buffer. Each index is encoded as a 32-bit int.
constexpr static size_t index_buffer_size = num_indices * sizeof(uint32_t);
/// An OpenGL handle to a Vertex Array Object, which has the grid and height buffers
/// bound to the corresponding attribute in the shader program (<tt>program</tt>) used for rendering.
GLuint vao;
/// Handle to the buffer that holds the indices for the triangles to render.
GLuint index_buffer;
/// Handle to the buffer that holds the x- and y-coordinates for each grid point.
GLuint grid_buffer;
/// Handle to the buffer that holds the height of each grid point. This buffer is shared with HIP.
GLuint height_buffer;
/// Handle to the OpenGL shader program that this example uses to render the triangles to the screen.
GLuint program;
/// Counters used to keep track of the rendering performance: the number of frames drawn since
/// <tt>fps_start_time</tt>, and the time at which the current measurement interval started.
uint32_t fps_frame = 0;
std::chrono::high_resolution_clock::time_point fps_start_time;
/// \brief Initialize OpenGL rendering resources.
/// Creates the vertex array object, the three buffers and the shader program, uploads the
/// initial buffer contents and starts the FPS measurement interval.
renderer()
{
// Create a vertex array used to bind the attribute buffers.
glGenVertexArrays(1, &this->vao);
// Also generate the buffers in question.
GLuint buffers[3];
glGenBuffers(std::size(buffers), buffers);
this->index_buffer = buffers[0];
this->grid_buffer = buffers[1];
this->height_buffer = buffers[2];
// Compile the shader program used to render the triangles.
this->program = compile_shader_program(vertex_shader, fragment_shader);
// Upload the initial data to the buffers.
this->initialize_buffer_data();
// Set up the VAO by binding the height and grid buffers to the attribute locations
// in the shader program.
glBindVertexArray(this->vao);
// Note - keep variable "in_height" in sync with shader.
glBindBuffer(GL_ARRAY_BUFFER, this->height_buffer);
const GLuint height_attrib = glGetAttribLocation(this->program, "in_height");
// One tightly packed float per vertex (stride 0, offset 0).
glVertexAttribPointer(height_attrib, 1, GL_FLOAT, GL_FALSE, 0, 0);
glEnableVertexAttribArray(height_attrib);
// Note - keep variable "in_xy" in sync with shader.
const GLuint grid_attrib = glGetAttribLocation(this->program, "in_xy");
glBindBuffer(GL_ARRAY_BUFFER, this->grid_buffer);
// Two tightly packed floats (x, y) per vertex (stride 0, offset 0).
glVertexAttribPointer(grid_attrib, 2, GL_FLOAT, GL_FALSE, 0, 0);
glEnableVertexAttribArray(grid_attrib);
this->fps_start_time = std::chrono::high_resolution_clock::now();
}
// The renderer owns raw OpenGL handles that are deleted in the destructor, so it can
// neither be copied nor moved.
renderer(const renderer&) = delete;
renderer& operator=(const renderer&) = delete;
renderer(renderer&&) = delete;
renderer& operator=(renderer&&) = delete;
/// \brief Release the OpenGL objects that were created in the constructor.
~renderer()
{
glDeleteProgram(this->program);
GLuint buffers[] = {this->index_buffer, this->grid_buffer, this->height_buffer};
glDeleteBuffers(std::size(buffers), buffers);
glDeleteVertexArrays(1, &this->vao);
}
/// \brief Allocate the three buffers in OpenGL and upload the initial values of the grid and
/// index buffers. The height buffer is only allocated, its contents are written from HIP.
void initialize_buffer_data() const
{
// Initialize the height buffer.
glBindBuffer(GL_ARRAY_BUFFER, this->height_buffer);
// We do not need to fill it, as that is going to be done from HIP, but we
// do need to allocate it from OpenGL. This is done simply by passing `nullptr` as
// initial data pointer.
// GL_DYNAMIC_DRAW is passed because this buffer is going to be updated every frame,
// and is going to be used to hold vertex data for drawing - this may help the driver
// to render more efficiently.
glBufferData(GL_ARRAY_BUFFER, height_buffer_size, nullptr, GL_DYNAMIC_DRAW);
// Initialize the grid buffer.
{
glBindBuffer(GL_ARRAY_BUFFER, this->grid_buffer);
// Avoid having to allocate on host by allocating the buffer in OpenGL and then mapping it
// into host-memory to initialize it.
// This buffer is going to be initialized once and is going to be used for drawing,
// so pass GL_STATIC_DRAW as usage hint.
glBufferData(GL_ARRAY_BUFFER, grid_buffer_size, nullptr, GL_STATIC_DRAW);
float* grid = reinterpret_cast<float*>(glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY));
// The points are written row by row (x on the inner axis), so the point at grid position
// (x, y) ends up at vertex index y * grid_width + x.
for(uint32_t y = 0; y < grid_height; ++y)
{
for(uint32_t x = 0; x < grid_width; ++x)
{
// Map the grid position from [0, size - 1] to the range [-1, 1].
*grid++ = (2.0f * x) / (grid_width - 1) - 1;
*grid++ = (2.0f * y) / (grid_height - 1) - 1;
}
}
// Let OpenGL know that we are done with this buffer.
glUnmapBuffer(GL_ARRAY_BUFFER);
}
// Initialize the index buffer
{
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, this->index_buffer);
// Similar to the grid buffer, this buffer is going to be initialized once and is then used
// for drawing.
glBufferData(GL_ELEMENT_ARRAY_BUFFER, index_buffer_size, nullptr, GL_STATIC_DRAW);
uint32_t* indices
= reinterpret_cast<uint32_t*>(glMapBuffer(GL_ELEMENT_ARRAY_BUFFER, GL_WRITE_ONLY));
// Every square between four neighbouring grid points is split into two triangles.
for(uint32_t y = 0; y < grid_height - 1; ++y)
{
for(uint32_t x = 0; x < grid_width - 1; ++x)
{
// First triangle of the square: (x, y), (x, y + 1), (x + 1, y).
*indices++ = (y + 0) * grid_width + (x + 0);
*indices++ = (y + 1) * grid_width + (x + 0);
*indices++ = (y + 0) * grid_width + (x + 1);
// Second triangle of the square: (x, y + 1), (x + 1, y + 1), (x + 1, y).
*indices++ = (y + 1) * grid_width + (x + 0);
*indices++ = (y + 1) * grid_width + (x + 1);
*indices++ = (y + 0) * grid_width + (x + 1);
}
}
glUnmapBuffer(GL_ELEMENT_ARRAY_BUFFER);
}
}
/// \brief Bind the OpenGL pipeline state for this renderer.
void bind() const
{
glBindVertexArray(this->vao);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, this->index_buffer);
glUseProgram(this->program);
}
/// \brief Draw the next frame to the window. This requires the render state be bound using
/// <tt>bind</tt>.
/// Roughly every five seconds the average frame rate over that interval is printed to
/// standard output.
void draw()
{
glDrawElements(GL_TRIANGLES, num_indices, GL_UNSIGNED_INT, nullptr);
// Output a naive performance measurement.
++this->fps_frame;
const auto frame_time = std::chrono::high_resolution_clock::now();
const auto time_diff = frame_time - this->fps_start_time;
if(time_diff > std::chrono::seconds{5})
{
const auto time_diff_sec
= std::chrono::duration_cast<std::chrono::duration<float>>(time_diff).count();
// NOTE(review): double_precision is not defined in this file (presumably it comes from
// example_utils.hpp and formats the value with the given number of decimals) - confirm.
std::cout << "Average FPS (over " << double_precision(time_diff_sec, 2, true)
<< " seconds): " << double_precision(this->fps_frame / time_diff_sec, 2, true)
<< " (" << double_precision((time_diff_sec * 1000) / this->fps_frame, 2, true)
<< " ms per frame, " << this->fps_frame << " frames)" << std::endl;
// Start a new measurement interval.
this->fps_frame = 0;
this->fps_start_time = frame_time;
}
}
};
/// [Sphinx sinewave kernel start]
/// \brief The main HIP kernel for this example - computes a simple sine wave over a
/// 2-dimensional grid of points.
/// \param height_map - the grid of points to compute a sine wave for. It is expected to be
/// a \p grid_width by \p grid_height array packed into memory in row-major order (x on the
/// inner axis), which is the same order in which the renderer lays out its grid points.
/// \param time - The current time relative to the start of the program.
__global__ void sinewave_kernel(float* height_map, const float time)
{
    const float        freq = 10.f;
    const unsigned int x    = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y    = blockIdx.y * blockDim.y + threadIdx.y;
    // Map the grid position to roughly [-1, 1] on both axes.
    const float u = (2.f * x) / grid_width - 1.f;
    const float v = (2.f * y) / grid_height - 1.f;
    // The launch grid is rounded up to whole tiles, so threads outside the grid must not write.
    if(x < grid_width && y < grid_height)
    {
        // Index with y as the row so that the value computed for (x, y) lands on the vertex
        // the renderer placed at (x, y). Using x as the row transposes the image and indexes
        // past the end of the buffer as soon as grid_width is larger than grid_height.
        height_map[y * grid_width + x] = sinf(u * freq + time) * cosf(v * freq + time);
    }
}
/// [Sphinx sinewave kernel end]
/// \brief This structure contains the HIP state and functionality used to advance the simulation.
/// Initializing a \p simulator fetches the OpenGL height buffer from the corresponding <tt>renderer</tt>,
/// and imports it as a HIP device pointer. This pointer is then passed to the simulation kernel
/// (<tt>sinewave_kernel</tt>), which updates the values in it. When <tt>renderer::draw</tt> is called,
/// the updated values are read from the buffer in OpenGL and used to render the triangle grid.
///
/// The structure owns the HIP stream and the registered graphics resource; both are released in
/// the destructor, which is why copying and moving are disabled.
struct simulator
{
/// The HIP stream used to advance the simulation. This must be created from an OpenGL-interop
/// capable device, see <tt>pick_hip_device</tt>.
hipStream_t hip_stream;
/// A HIP graphics resource that is imported from the OpenGL height buffer to simulate.
hipGraphicsResource_t hip_height_buffer;
/// A device pointer to the height buffer, imported from the OpenGL height buffer.
float* hip_height_ptr;
/// The start time of the program, used for the simulation.
std::chrono::high_resolution_clock::time_point start_time;
/// \brief Initialize a simulator, that uses a particular HIP device.
/// \param hip_device - The id of the HIP device to run the simulation on.
/// \param renderer - The renderer that will be used to render the example. Its height buffer
/// is imported to HIP for use with this simulator.
explicit simulator(const int hip_device, const renderer& renderer)
{
// Create a HIP stream for the target device.
HIP_CHECK(hipSetDevice(hip_device));
HIP_CHECK(hipStreamCreate(&this->hip_stream));
// [Sphinx buffer register and get start]
// Import the OpenGL height buffer into a HIP graphics resource.
HIP_CHECK(hipGraphicsGLRegisterBuffer(
&this->hip_height_buffer,
renderer.height_buffer,
// We are going to write to this buffer from HIP,
// but we do not need to read from it.
// As an optimization we can pass hipGraphicsRegisterFlagsWriteDiscard,
// so that the driver knows that we do not need the old values of
// the buffer.
hipGraphicsRegisterFlagsWriteDiscard));
// After importing the OpenGL height buffer into HIP, map it into HIP memory so that we can use it.
// Note that the resource stays mapped until the simulator is destroyed.
HIP_CHECK(hipGraphicsMapResources(1, &this->hip_height_buffer, this->hip_stream));
// Fetch the device pointer that points to the OpenGL buffer's memory.
// This function also fetches the size of the buffer. We already know it, but we still need to pass
// a valid pointer to hipGraphicsResourceGetMappedPointer.
size_t size;
HIP_CHECK(
hipGraphicsResourceGetMappedPointer(reinterpret_cast<void**>(&this->hip_height_ptr),
&size,
this->hip_height_buffer));
// [Sphinx buffer register and get end]
this->start_time = std::chrono::high_resolution_clock::now();
}
// The simulator owns the HIP stream and the registered graphics resource, which are released
// in the destructor, so it can neither be copied nor moved.
simulator(const simulator&) = delete;
simulator& operator=(const simulator&) = delete;
simulator(simulator&&) = delete;
simulator& operator=(simulator&&) = delete;
/// \brief Wait for outstanding work on the stream, then unmap and unregister the imported
/// height buffer and destroy the stream.
~simulator()
{
// [Sphinx unregister start]
HIP_CHECK(hipStreamSynchronize(this->hip_stream));
HIP_CHECK(hipGraphicsUnmapResources(1, &this->hip_height_buffer, this->hip_stream));
HIP_CHECK(hipGraphicsUnregisterResource(this->hip_height_buffer));
HIP_CHECK(hipStreamDestroy(this->hip_stream));
// [Sphinx unregister end]
}
/// \brief Advance the simulation one step.
/// Launches <tt>sinewave_kernel</tt> on the simulator's stream with the number of seconds
/// that have passed since the simulator was created.
void step()
{
const auto now = std::chrono::high_resolution_clock::now();
// The elapsed time since start_time, in seconds, as a float.
const float time
= std::chrono::duration<float, std::chrono::seconds::period>(now - this->start_time)
.count();
// [Sphinx buffer use in kernel start]
// The tile size to be used for each block of the computation. A tile is
// tile_size by tile_size threads in this case, since we are invoking the
// computation over a 2D-grid.
constexpr size_t tile_size = 8;
// Launch the HIP kernel to advance the simulation.
// NOTE(review): ceiling_div is not defined in this file (presumably from example_utils.hpp);
// it is assumed to round the division up so that the tiles cover the whole grid - confirm.
sinewave_kernel<<<dim3(ceiling_div(grid_width, tile_size),
ceiling_div(grid_height, tile_size)),
dim3(tile_size, tile_size),
0,
this->hip_stream>>>(this->hip_height_ptr, time);
// Check that no errors occurred while launching the kernel.
HIP_CHECK(hipGetLastError());
// [Sphinx buffer use in kernel end]
}
};
/// \brief GLFW framebuffer-size callback: whenever the window is resized, the OpenGL
/// viewport is updated to cover the new dimensions.
/// \param window - The window that was resized (unused).
/// \param width - The new width.
/// \param height - The new height.
void resize_callback(GLFWwindow* const window, const int width, const int height)
{
    // The window handle is not needed, the viewport always spans the whole area.
    static_cast<void>(window);
    glViewport(0, 0, width, height);
}
/// \brief Program entry point.
/// Sets up GLFW, the window and the OpenGL function pointers, picks an interop-capable HIP
/// device, and then alternates between stepping the HIP simulation and drawing the result
/// with OpenGL until the window is closed.
/// \returns \p error_exit_code if GLFW or the OpenGL function pointers could not be
/// initialized.
int main()
{
// The initial width of the GLFW window when the example is first started.
constexpr int initial_window_width = 1280;
// The initial height of the GLFW window.
constexpr int initial_window_height = 800;
// Initialize GLFW.
// The error callback is installed first so that initialization errors are reported too.
glfwSetErrorCallback(
[](int code, const char* const message)
{ std::cerr << "A glfw error encountered: " << message << "(" << code << ")\n"; });
if(glfwInit() != GLFW_TRUE)
{
std::cerr << "failed to initialize GLFW\n";
return error_exit_code;
}
// Initialize the GLFW window used to render the example.
GLFWwindow* const window = create_window(initial_window_width, initial_window_height);
// Ensure that we are using the OpenGL context associated to the Window.
glfwMakeContextCurrent(window);
// [Sphinx opengl functions load start]
// Make GLFW use a custom loader - we need this for the more recent OpenGL functions,
// as these are not loaded by default on all platforms.
if(!gladLoadGLLoader(reinterpret_cast<GLADloadproc>(glfwGetProcAddress)))
{
// NOTE(review): this early return skips glfwDestroyWindow/glfwTerminate.
std::cerr << "Failed to load OpenGL function pointers" << std::endl;
return error_exit_code;
}
// [Sphinx opengl functions load end]
// Disable vsync.
glfwSwapInterval(0);
// If the OpenGL GL_ARB_debug_output extension is present, set a callback that is called
// whenever an OpenGL error occurs. This saves us calling glGetError after every OpenGL function.
if(GLAD_GL_ARB_debug_output)
{
glDebugMessageCallbackARB(
[](GLenum,
GLenum,
GLuint,
GLenum severity,
GLsizei length,
const GLchar* message,
const void*)
{
std::cerr << "[OpenGL] ";
std::cerr.write(message, length) << std::endl;
// High-severity messages are treated as fatal.
if(severity == GL_DEBUG_SEVERITY_HIGH_ARB)
{
std::exit(error_exit_code);
}
},
nullptr);
// We just want the errors: First disable all messaging, and then enable just the
// most severe ones.
glDebugMessageControlARB(GL_DONT_CARE, GL_DONT_CARE, GL_DONT_CARE, 0, NULL, GL_FALSE);
glDebugMessageControlARB(GL_DONT_CARE,
GL_DONT_CARE,
GL_DEBUG_SEVERITY_HIGH_ARB,
0,
NULL,
GL_TRUE);
// Report errors synchronously instead of asynchronously.
glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS_ARB);
}
// Figure out which HIP device we need to use.
// This device needs to be interop-capable (see pick_hip_device).
const int hip_device = pick_hip_device();
// Let the user know which device we are using, on both the OpenGL and HIP sides.
hipDeviceProp_t hip_props;
HIP_CHECK(hipGetDeviceProperties(&hip_props, hip_device));
const GLubyte* const device_name = glGetString(GL_RENDERER);
std::cout << "Using device " << device_name << " (hip device " << hip_device
<< ", compute capability " << hip_props.major << "." << hip_props.minor << ")\n";
// Sub-scope to call destructors before terminating GLFW.
{
renderer renderer;
simulator simulator(hip_device, renderer);
// There are no other renderers, so we can bind the OpenGL state once.
renderer.bind();
glfwSetFramebufferSizeCallback(window, resize_callback);
glClearColor(0, 0, 0, 1);
// The main rendering loop.
// Repeat for as long as the window is not closed.
while(glfwWindowShouldClose(window) == GLFW_FALSE)
{
glClear(GL_COLOR_BUFFER_BIT);
// First step the simulation so that the height buffer is ready
// for the next frame.
simulator.step();
// Draw the example to the window's framebuffer.
renderer.draw();
// Present the framebuffer on screen.
glfwSwapBuffers(window);
// Process pending window events, such as resize and close requests.
glfwPollEvents();
}
}
// Clean up GLFW.
glfwDestroyWindow(window);
glfwTerminate();
}
@@ -0,0 +1,4 @@
import urllib.request

# Each entry pairs the raw GitHub URL of a rocm-examples source file with the path
# (relative to the current working directory) where the downloaded copy is stored.
downloads = (
    (
        "https://raw.githubusercontent.com/ROCm/rocm-examples/refs/heads/develop/HIP-Basic/opengl_interop/main.hip",
        "docs/tools/example_codes/opengl_interop.hip",
    ),
    (
        "https://raw.githubusercontent.com/ROCm/rocm-examples/refs/heads/develop/HIP-Basic/vulkan_interop/main.hip",
        "docs/tools/example_codes/external_interop.hip",
    ),
)

# Fetch the files one after the other, in the order listed above.
for source_url, destination in downloads:
    urllib.request.urlretrieve(source_url, destination)
@@ -30,7 +30,7 @@ Implementing reductions on GPUs requires a basic understanding of the :doc:`/und
Synchronizing parallel threads of execution across a GPU is crucial for correctness as the partial results can't be synchronized before they manifest. Synchronizing all the threads running on a GPU at any given time is possible, however, it is a costly and intricate operation. If synchronization is not absolutely necessary, map the parallel algorithm so that multiprocessors and blocks can make independent progress and need not sync frequently.
There are ten reduction implementations in the `rocm-examples <https://github.com/ROCm/rocm-examples/tree/develop/Tutorials/reduction/include/Reduction>`_, which are described in the following sections.
There are ten reduction implementations in the `rocm-examples <https://github.com/ROCm/rocm-examples/tree/develop/Tutorials/reduction/include/Reduction>`_, which are described in the following sections.
Naive shared reduction
----------------------
@@ -188,7 +188,7 @@ A notable exception is when the shared read uniformly broadcasts to the same add
.. note::
To avoid bank conflicts, read shared memory in a coalesced manner, which implies that reads/writes of each lane in a warp evaluate to consecutive locations. Analyzing the read/write patterns could help you to understand the cause of bank conflicts. For more details, check `CDNA3 ISA <https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf>`_ or `RDNA3 ISA <https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf>`_ data share operations chapter.
Utilize upper half of the block
-------------------------------
@@ -143,10 +143,12 @@ Retrieval of the result from the device is done much like input data copy. In th
HIP_CHECK(hipMemcpy(y.data(), d_y, size_bytes, hipMemcpyDeviceToHost));
.. _compiling_on_the_command_line:
Compiling on the command line
=============================
.. _setting_up_the_command-line:
.. _setting_up_the_command_line:
Setting up the command line
---------------------------
@@ -19,11 +19,11 @@ Project organization
CLR includes the following source code,
* ``hipamd`` - contains implementation of ``HIP`` language on the AMD platform. It is hosted at `clr/hipamd <https://github.com/ROCm/clr/tree/develop/hipamd>`_.
* ``hipamd`` - contains implementation of ``HIP`` language on the AMD platform. It is hosted at `clr/hipamd <https://github.com/ROCm/clr/tree/amd-staging/hipamd>`_.
* ``opencl`` - contains implementation of `OpenCL™ <https://www.khronos.org/opencl/>`_ on AMD platform. It is hosted at `clr/opencl <https://github.com/ROCm/clr/tree/develop/opencl>`_.
* ``opencl`` - contains implementation of `OpenCL™ <https://www.khronos.org/opencl/>`_ on AMD platform. It is hosted at `clr/opencl <https://github.com/ROCm/clr/tree/amd-staging/opencl>`_.
* ``rocclr`` - contains ROCm compute runtime used in `HIP` and `OpenCL™`. This is hosted at `clr/rocclr <https://github.com/ROCm/clr/tree/develop/rocclr>`_.
* ``rocclr`` - contains ROCm compute runtime used in `HIP` and `OpenCL™`. This is hosted at `clr/rocclr <https://github.com/ROCm/clr/tree/amd-staging/rocclr>`_.
How to build/install
@@ -79,4 +79,4 @@ To run ``hip-tests`` please go to the repository and follow the steps.
Release notes
-------------
HIP provides release notes in CLR `change log <https://github.com/ROCm/clr/blob/develop/CHANGELOG.md>`_, which has records of changes in each release.
HIP provides release notes in CLR `change log <https://github.com/ROCm/clr/blob/amd-staging/CHANGELOG.md>`_, which has records of changes in each release.
@@ -0,0 +1,100 @@
.. meta::
:description: Compilation workflow of the HIP compilers.
:keywords: AMD, ROCm, HIP, CUDA, HIP runtime API
.. _hip_compilers:
********************************************************************************
HIP compilers
********************************************************************************
ROCm provides the compiler driver ``hipcc``, which can be used on AMD ROCm and
NVIDIA CUDA platforms.
On ROCm, ``hipcc`` takes care of the following:
- Setting the default library and include paths for HIP
- Setting some environment variables
- Invoking the appropriate compiler - ``amdclang++``
On the NVIDIA CUDA platform, ``hipcc`` takes care of invoking the ``nvcc`` compiler.
``amdclang++`` is based on the ``clang++`` compiler. For more
details, see the :doc:`llvm project<llvm-project:index>`.
HIP compilation workflow
================================================================================
HIP provides a flexible compilation workflow that supports both offline
compilation and runtime or just-in-time (JIT) compilation. Each approach has
advantages depending on the use case, target architecture, and performance
needs.
The offline compilation is ideal for production environments, where the
performance is critical and the target GPU architecture is known in advance.
The runtime compilation is useful in development environments or when
distributing software that must run on a wide range of hardware without the
knowledge of the GPU in advance. It provides flexibility at the cost of some
performance overhead.
Offline compilation
--------------------------------------------------------------------------------
HIP code compilation is performed in two stages: a host-code compilation stage
and a device-code compilation stage.
- Device-code compilation stage: The compiled device code is embedded into the
host object file. Depending on the platform, the device code can be compiled
into assembly or binary. ``nvcc`` and ``amdclang++`` target different
architectures and use different code object formats. ``nvcc`` uses the binary
``cubin`` or the assembly PTX files, while ``amdclang++`` uses the
binary ``hsaco`` format. On CUDA platforms, the driver compiles the PTX files
to executable code during runtime.
- Host-code compilation stage: On the host side, ``hipcc`` or ``amdclang++`` can
compile the host code in one step without other C++ compilers. On the other
hand, ``nvcc`` only replaces the ``<<<...>>>`` kernel launch syntax with the
appropriate CUDA runtime function call and the modified host code is passed to
the default host compiler.
For an example on how to compile HIP from the command line, see :ref:`SAXPY
tutorial<compiling_on_the_command_line>`.
Runtime compilation
--------------------------------------------------------------------------------
HIP allows you to compile kernels at runtime using the ``hiprtc*`` API. Kernels
are stored as a text string, which is passed to HIPRTC alongside options to
guide the compilation.
For more details, see
:doc:`HIP runtime compiler <../how-to/hip_rtc>`.
Static libraries
================================================================================
``hipcc`` supports generating two types of static libraries.
- The first type of static library only exports and launches host functions
within the same library and not the device functions. This library type offers
the ability to link with a non-hipcc compiler such as ``gcc``. Additionally,
this library type contains host objects with device code embedded as fat
binaries. This library type is generated using the flag ``--emit-static-lib``:
.. code-block:: shell
hipcc hipOptLibrary.cpp --emit-static-lib -fPIC -o libHipOptLibrary.a
gcc test.cpp -L. -lHipOptLibrary -L/path/to/hip/lib -lamdhip64 -o test.out
- The second type of static library exports device functions to be linked by
other code objects by using ``hipcc`` as the linker. This library type
contains relocatable device objects and is generated using ``ar``:
.. code-block:: shell
hipcc hipDevice.cpp -c -fgpu-rdc -o hipDevice.o
ar rcsD libHipDevice.a hipDevice.o
hipcc libHipDevice.a test.cpp -fgpu-rdc -o test.out
For more information, see `HIP samples host functions <https://github.com/ROCm/hip-tests/tree/develop/samples/2_Cookbook/15_static_library/host_functions>`_
and `device functions <https://github.com/ROCm/hip-tests/tree/develop/samples/2_Cookbook/15_static_library/device_functions>`_.
@@ -26,11 +26,10 @@ according to the :ref:`SIMT model<programming_model_simt>`, together with the
necessary registers and caches.
The threads are executed in groupings called warps. The number of threads
making up a warp is architecture dependent.
On AMD GPUs the warp size is commonly 64 threads, except in RDNA
architectures which can utilize a warp size of 32 or 64 respectively.
The warp size of supported AMD GPUs is listed in the :doc:`rocm:reference/gpu-arch-specs`.
NVIDIA GPUs have a warp size of 32.
making up a warp is architecture dependent. On AMD GPUs the warp size is
commonly 64 threads, except in RDNA architectures, which can utilize a warp size
of either 32 or 64. The warp size of supported AMD GPUs is listed in the
:doc:`rocm:reference/gpu-arch-specs`. NVIDIA GPUs have a warp size of 32.
In contrast to CPUs, GPUs generally do not employ complex cache structures or
control logic, like branch prediction or out-of-order execution, but instead
@@ -2,7 +2,9 @@
:description: This chapter explains the HIP programming model, the contract
between the programmer and the compiler/runtime executing the
code, how it maps to the hardware.
:keywords: AMD, ROCm, HIP, CUDA, API design
:keywords: ROCm, HIP, CUDA, API design, programming model
.. _programming_model:
*******************************************************************************
HIP programming model
@@ -10,7 +12,7 @@ HIP programming model
The HIP programming model makes it easy to map data-parallel C/C++ algorithms to
massively parallel, wide single instruction, multiple data (SIMD) architectures,
such as GPUs.
such as GPUs.
While the model may be expressed in most imperative languages, (for example
Python via PyHIP) this document will focus on the original C/C++ API of HIP.
@@ -74,7 +76,7 @@ a few key differences between the two:
accessible from all contexts.
Looking at :ref:`rdna3_cu` and :ref:`cdna3_cu`, you can see that
every CU has an instance of storage backing the namespace ``__shared__``.
every CU has an instance of storage backing the namespace ``__shared__``.
Even if the host were to have access to these regions of
memory, the performance benefits of the segmented memory subsystem are
supported by the inability of asynchronous access from the host.
@@ -90,11 +92,11 @@ a few key differences between the two:
* Asynchrony is at the forefront of the HIP API. Computations launched on the device
execute asynchronously with respect to the host, and it is the user's responsibility to
synchronize their data dispatch/fetch with computations on the device.
synchronize their data dispatch/fetch with computations on the device.
.. note::
HIP does perform implicit synchronization on occasions, more advanced than other
APIs such as OpenCL or SYCL, in which the responsibility of synchronization mostly
HIP does perform implicit synchronization on occasions, more advanced than other
APIs such as OpenCL or SYCL, in which the responsibility of synchronization mostly
depends on the user.
.. _programming_model_simt:
@@ -130,7 +132,7 @@ The incoming four-vector of floating-point values ``b`` is multiplied by a
scalar and then added element-wise to the four-vector floating-point values of
``a``. On modern SIMD-capable architectures, the four-vector ops are expected to
compile to a single SIMD instruction. However, GPU execution of this kernel will
typically break down the vector elements into 4 separate threads for parallel execution,
typically break down the vector elements into 4 separate threads for parallel execution,
as seen in the following figure:
.. _simt:
@@ -145,7 +147,7 @@ as seen in the following figure:
In HIP, lanes of the SIMD architecture are fed by mapping threads of a SIMT
execution, one thread down each lane of an SIMD engine. Execution parallelism
usually isn't exploited from the width of the built-in vector types, but across multiple threads via the thread ID constants ``threadIdx.x``, ``blockIdx.x``, etc.
usually isn't exploited from the width of the built-in vector types, but across multiple threads via the thread ID constants ``threadIdx.x``, ``blockIdx.x``, etc.
.. _inherent_thread_model:
@@ -159,7 +161,7 @@ online/offline to binaries, in bulk.
All threads of a kernel are uniquely identified by a set of integral values, called thread IDs.
The set of integers identifying a thread relate to the hierarchy in which the threads execute.
The thread hierarchy inherent to how AMD GPUs operate is depicted in the
The thread hierarchy inherent to how AMD GPUs operate is depicted in the
following figure.
.. _inherent_thread_hierarchy:
@@ -175,9 +177,9 @@ following figure.
Warp (or Wavefront)
The innermost grouping of threads is called a warp, or a wavefront in ISA terms. A warp
is the most tightly coupled groups of threads, both physically and logically. Threads
inside a warp are also called lanes, and the integral value identifying them is the lane ID.
is the most tightly coupled groups of threads, both physically and logically. Threads
inside a warp are also called lanes, and the integral value identifying them is the lane ID.
.. tip::
Lane IDs aren't queried like other thread IDs, but are user-calculated. As a
@@ -222,10 +224,10 @@ groups let you define your own set of thread groups which may fit your user-cas
better than the defaults defined by the hardware.
.. note::
The implicit groups defined by kernel launch parameters are still available
The implicit groups defined by kernel launch parameters are still available
when working with cooperative groups.
For further information, see :doc:`Cooperative groups </how-to/cooperative_groups>`.
For further information, see :doc:`Cooperative groups </how-to/hip_runtime_api/cooperative_groups>`.
Memory model
============
@@ -287,7 +289,7 @@ HIP programs consist of two distinct scopes:
importantly around kernel launching and argument setting. It is geared
towards implementing abstractions atop, such as the runtime API itself.
Offers two additional pieces of functionality not provided by the Runtime
API: ``hipModule`` and ``hipCtx`` APIs. For further details, check
API: ``hipModule`` and ``hipCtx`` APIs. For further details, check
:doc:`HIP driver API </how-to/hip_porting_driver_api>`.
* The device-side kernels running on GPUs. Both the host and the device-side
@@ -0,0 +1,99 @@
.. meta::
:description: This chapter provides an introduction to the HIP API.
:keywords: AMD, ROCm, HIP, CUDA, C++ language extensions
.. _intro-to-hip:
*******************************************************************************
What is HIP?
*******************************************************************************
The Heterogeneous-computing Interface for Portability (HIP) API is a C++ runtime API
and kernel language that lets developers create portable applications running in heterogeneous systems,
using CPUs and AMD GPUs or NVIDIA GPUs from a single source code. HIP provides a simple
marshalling language to access either the AMD ROCm back-end, or NVIDIA CUDA back-end,
to build and run application kernels.
.. figure:: data/what_is_hip/hip.svg
:alt: HIP in an application.
:align: center
* HIP is a thin API with little or no performance impact over coding directly
in NVIDIA CUDA or AMD :doc:`ROCm <rocm:what-is-rocm>`.
* HIP enables coding in a single-source C++ programming language including
features such as templates, C++11 lambdas, classes, namespaces, and more.
* Developers can specialize for the platform (CUDA or ROCm) to tune for
performance or handle tricky cases.
ROCm offers compilers (``clang``, ``hipcc``), code
profilers (``rocprof``, ``omnitrace``), debugging tools (``rocgdb``), libraries
and HIP with the runtime API and kernel language, to create heterogeneous applications
running on both CPUs and GPUs. ROCm provides marshalling libraries like
:doc:`hipFFT <hipfft:index>` or :doc:`hipBLAS <hipblas:index>` that act as a
thin programming layer over either NVIDIA CUDA or AMD ROCm to enable support for
either back-end. These libraries offer pointer-based memory interfaces and are
easily integrated into your applications.
HIP supports the ability to build and run on either AMD GPUs or NVIDIA GPUs.
GPU programmers familiar with NVIDIA CUDA or OpenCL will find the HIP API
familiar and easy to use. Developers no longer need to choose between AMD or
NVIDIA GPUs. You can quickly port your application to run on the available
hardware while maintaining a single codebase. The :doc:`HIPify <hipify:index>`
tools, based on the clang front-end and Perl language, can convert CUDA API
calls into the corresponding HIP API calls. However, HIP is not intended to be a
drop-in replacement for CUDA, and developers should expect to do some manual
coding and performance tuning work for AMD GPUs to port existing projects as
described in the :doc:`HIP porting guide <how-to/hip_porting_guide>`.
HIP provides two components: those that run on the CPU, also known as host
system, and those that run on GPUs, also referred to as device. The host-based
code is used to create device buffers, move data between the host application
and a device, launch the device code (also known as kernel), manage streams and
events, and perform synchronization. The kernel language provides a way to
develop massively parallel programs that run on GPUs, and provides access to GPU
specific hardware capabilities.
In summary, HIP simplifies cross-platform development, maintains performance,
and provides a familiar C++ experience for GPU programming that runs seamlessly
on both AMD and NVIDIA GPUs.
HIP components
===============================================
HIP consists of the following components. For information on the license
associated with each component, see :doc:`HIP licensing <license>`.
C++ runtime API
-----------------------------------------------
For the AMD ROCm platform, HIP provides headers and a runtime library built on
top of HIP-Clang compiler in the repository
:doc:`Compute Language Runtime (CLR) <understand/amd_clr>`. The HIP runtime
implements HIP streams, events, and memory APIs, and is an object library that
is linked with the application. The source code for all headers and the library
implementation is available on GitHub.
For the NVIDIA CUDA platform, HIP provides headers that translate from the
HIP runtime API to the CUDA runtime API. The host-side contains mostly inlined
wrappers or even just preprocessor defines, with no additional overhead.
The device-side code is compiled with ``nvcc``, just like normal CUDA kernels,
and therefore one can expect the same performance as if directly coding in CUDA.
The CUDA specific headers can be found in the `hipother repository <https://github.com/ROCm/hipother>`_.
For further details, check :ref:`HIP Runtime API Reference <runtime_api_reference>`.
Kernel language
-----------------------------------------------
HIP provides a C++ syntax that is suitable for compiling most code that commonly appears in
compute kernels (classes, namespaces, operator overloading, and templates). HIP also defines other
language features that are designed to target accelerators, such as:
* Short-vector headers that can serve on a host or device
* Math functions that resemble those in ``math.h``, which is included with standard C++ compilers
* Built-in functions for accessing specific GPU hardware capabilities
For further details, check :doc:`C++ language extensions <reference/cpp_language_extensions>`
and :doc:`C++ language support <reference/cpp_language_support>`.
@@ -724,7 +724,7 @@ enum hipLimit_t {
/** Allocates the memory as write-combined. On some system configurations, write-combined allocation
* may be transferred faster across the PCI Express bus, however, could have low read efficiency by
* most CPUs. It's a good option for data tranfer from host to device via mapped pinned memory.*/
* most CPUs. It's a good option for data transfer from host to device via mapped pinned memory.*/
#define hipHostMallocWriteCombined 0x4
#define hipHostAllocWriteCombined 0x4
@@ -735,11 +735,11 @@ enum hipLimit_t {
#define hipHostMallocNumaUser 0x20000000
#define hipExtHostAllocNumaUser 0x20000000
/** Allocate coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation.*/
/** Allocate coherent memory. Overrides HIP_HOST_COHERENT for specific allocation.*/
#define hipHostMallocCoherent 0x40000000
#define hipExtHostAllocCoherent 0x40000000
/** Allocate non-coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation.*/
/** Allocate non-coherent memory. Overrides HIP_HOST_COHERENT for specific allocation.*/
#define hipHostMallocNonCoherent 0x80000000
#define hipExtHostAllocNonCoherent 0x80000000
@@ -3494,7 +3494,6 @@ hipError_t hipMemAllocHost(void** ptr, size_t size);
/**
* @}
*/
/**
* @brief Allocates device accessible page locked (pinned) host memory
*
@@ -3583,6 +3582,8 @@ hipError_t hipExtHostAlloc(void** ptr, size_t size, unsigned int flags);
* The API returns the allocation pointer, managed by HMM, which can be used further to execute
* kernels on device and fetch data between the host and device as needed.
*
* If HMM is not supported, the function behaves the same as @p hipMallocHost .
*
* @note It is recommended to do the capability check before calling this API.
*
* @param [out] dev_ptr - pointer to allocated device memory
@@ -9323,7 +9324,7 @@ return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize,(hipFunction_t)kern
* @ingroup ModuleCooperativeG
*
* \tparam T The type of the kernel function.
*
*
* @param [in] f Kernel function to launch.
* @param [in] gridDim Grid dimensions specified as multiple of blockDim.
* @param [in] blockDim Block dimensions specified in work-items.
@@ -38,14 +38,14 @@ THE SOFTWARE.
<definitions>
<context id="hip">
<include>
<include>
<context ref="def:c-like-comment"/>
<context ref="c:string"/>
<context ref="c:escaped-character"/>
<context ref="c:storage-class"/>
<context ref="c:storage-class"/>
<context ref="def:c-like-comment-multiline"/>
<context ref="def:c-like-close-comment-outside-comment"/>
@@ -56,7 +56,7 @@ THE SOFTWARE.
<context ref="def:float"/>
<context ref="c:hexadecimal"/>
<context ref="c:hexadecimal"/>
<context ref="c:octal"/>