SWDEV-502480 - Update documentation from GitHub 2024-12-05

Change-Id: I179814351b77935aff55e8ae47dd322a3e15a868 [ROCm/hip commit: f39c7a3150]
2024-12-15 19:31:35 +01:00
@@ -6,6 +6,7 @@ APU
 APUs
 AQL
 AXPY
+asm
 Asynchrony
 backtrace
 Bitcode
@@ -15,6 +16,7 @@ builtins
 Builtins
 CAS
 clr
+compilable
 coroutines
 Ctx
 cuBLASLt
@@ -42,12 +44,14 @@ extern
 fatbin
 fatbinary
 foundationally
+framebuffer
 frontends
 fnuz
 FNUZ
 fp
 gedit
 GPGPU
+GROMACS
 GWS
 hardcoded
 HC
@@ -58,6 +62,7 @@ hipcc
 hipCtx
 hipexamine
 hipified
+HIPify
 hipModule
 hipModuleLaunchKernel
 hipother
@@ -65,9 +70,12 @@ HIPRTC
 icc
 IILE
 iGPU
+inlined
 inplace
-Interoperation
+interop
+interoperation
 interoperate
+interoperation
 Interprocess
 interprocess
 Intrinsics
@@ -75,6 +83,7 @@ intrinsics
 IPC
 IPs
 isa
+iteratively
 Lapack
 latencies
 libc
@@ -87,6 +96,8 @@ ltrace
 makefile
 Malloc
 malloc
+MALU
+MiB
 memset
 multicore
 multigrid
@@ -101,9 +112,12 @@ NOP
 Numa
 Nsight
 ocp
+omnitrace
 overindex
 overindexing
 oversubscription
+overutilized
+parallelizable
 pixelated
 pragmas
 preallocated
@@ -111,6 +125,7 @@ preconditioners
 predefining
 prefetched
 preprocessor
+profilers
 PTX
 PyHIP
 queryable
@@ -118,6 +133,7 @@ prefetching
 quad
 representable
 RMW
+rocgdb
 ROCm's
 rocTX
 roundtrip
@@ -129,6 +145,7 @@ scalarizing
 sceneries
 shaders
 SIMT
+sinewave
 SOMA
 SPMV
 structs
@@ -139,11 +156,16 @@ texels
 tradeoffs
 templated
 toolkits
+transfering
 typedefs
 unintuitive
 UMM
 unmap
+unmapped
+unmapping
+unregister
 upscaled
 variadic
+vulkan
 WinGDB
-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
+zc
@@ -36,7 +36,7 @@ HIP releases are typically naming convention for each ROCM release to help diffe
 ## More Info

 * [Installation](docs/install/install.rst)
-* [HIP FAQ](docs/how-to/faq.md)
+* [HIP FAQ](docs/faq.rst)
 * [HIP C++ Language Extensions](docs/reference/cpp_language_extensions.rst)
 * [HIP Porting Guide](docs/how-to/hip_porting_guide.md)
 * [HIP Porting Driver Guide](docs/how-to/hip_porting_driver_api.md)
@@ -47,8 +47,8 @@ suppress_warnings = ["etoc.toctree"]

 numfig = False

-
 exclude_patterns = [
    "doxygen/mainpage.md",
-    "understand/glossary.md"
+    "understand/glossary.md",
+    'how-to/debugging_env.rst'
 ]
@@ -0,0 +1,106 @@
+<mxfile host="65bd71144e">
+    <diagram id="zBbb_w2fufU70cdOGtND" name="1 oldal">
+        <mxGraphModel dx="1547" dy="1302" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="0" pageScale="1" pageWidth="660" pageHeight="610" background="none" math="0" shadow="0">
+            <root>
+                <mxCell id="0"/>
+                <mxCell id="1" parent="0"/>
+                <mxCell id="5927" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#5E5B61;fontColor=#FFFFFF;strokeColor=none;spacing=0;" parent="1" vertex="1">
+                    <mxGeometry y="-10" width="740" height="290" as="geometry"/>
+                </mxCell>
+                <mxCell id="5928" value="Pageable data transfer" style="text;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;rounded=0;fontFamily=Helvetica;fontSize=17;fontColor=#FFFFFF;" parent="1" vertex="1">
+                    <mxGeometry x="20" width="340" height="30" as="geometry"/>
+                </mxCell>
+                <mxCell id="UvHuP5o6jSuoLTm0AUZA-5955" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4F1623;fontColor=#FFFFFF;strokeColor=none;" parent="1" vertex="1">
+                    <mxGeometry x="20" y="160" width="340" height="100" as="geometry"/>
+                </mxCell>
+                <mxCell id="UvHuP5o6jSuoLTm0AUZA-5959" value="" style="group" parent="1" vertex="1" connectable="0">
+                    <mxGeometry x="230" y="170" width="120" height="80" as="geometry"/>
+                </mxCell>
+                <mxCell id="UvHuP5o6jSuoLTm0AUZA-5960" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=none;" parent="UvHuP5o6jSuoLTm0AUZA-5959" vertex="1">
+                    <mxGeometry width="120" height="80" as="geometry"/>
+                </mxCell>
+                <mxCell id="UvHuP5o6jSuoLTm0AUZA-5961" value="&lt;div&gt;Pinned memory&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="UvHuP5o6jSuoLTm0AUZA-5959" vertex="1">
+                    <mxGeometry width="120" height="80" as="geometry"/>
+                </mxCell>
+                <mxCell id="LV0FwBpydXXZrUbya0PG-5946" value="Pinned data transfer" style="text;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;rounded=0;fontFamily=Helvetica;fontSize=17;fontColor=#FFFFFF;" parent="1" vertex="1">
+                    <mxGeometry x="380" width="340" height="30" as="geometry"/>
+                </mxCell>
+                <mxCell id="UvHuP5o6jSuoLTm0AUZA-5952" value="" style="group;fillColor=#9C2A44;" parent="1" vertex="1" connectable="0">
+                    <mxGeometry x="70" y="170" width="120" height="80" as="geometry"/>
+                </mxCell>
+                <mxCell id="UvHuP5o6jSuoLTm0AUZA-5950" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#9C2A44;fontColor=#FFFFFF;strokeColor=none;" parent="UvHuP5o6jSuoLTm0AUZA-5952" vertex="1">
+                    <mxGeometry width="120" height="80" as="geometry"/>
+                </mxCell>
+                <mxCell id="UvHuP5o6jSuoLTm0AUZA-5951" value="Pageable memory" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="UvHuP5o6jSuoLTm0AUZA-5952" vertex="1">
+                    <mxGeometry width="120" height="80" as="geometry"/>
+                </mxCell>
+                <mxCell id="LV0FwBpydXXZrUbya0PG-5974" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;strokeWidth=2;exitX=0;exitY=0.5;exitDx=0;exitDy=0;" parent="1" target="UvHuP5o6jSuoLTm0AUZA-5950" edge="1" source="UvHuP5o6jSuoLTm0AUZA-5961">
+                    <mxGeometry width="50" height="50" relative="1" as="geometry">
+                        <mxPoint x="220" y="250" as="sourcePoint"/>
+                        <mxPoint x="109.5" y="201" as="targetPoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="5929" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4F1623;fontColor=#FFFFFF;strokeColor=none;" vertex="1" parent="1">
+                    <mxGeometry x="20" y="40" width="340" height="100" as="geometry"/>
+                </mxCell>
+                <mxCell id="5930" value="" style="group" vertex="1" connectable="0" parent="1">
+                    <mxGeometry x="230" y="50" width="120" height="80" as="geometry"/>
+                </mxCell>
+                <mxCell id="5931" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=none;" vertex="1" parent="5930">
+                    <mxGeometry width="120" height="80" as="geometry"/>
+                </mxCell>
+                <mxCell id="5932" value="&lt;div&gt;Device memory&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" vertex="1" parent="5930">
+                    <mxGeometry width="120" height="80" as="geometry"/>
+                </mxCell>
+                <mxCell id="LV0FwBpydXXZrUbya0PG-5968" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;strokeWidth=2;strokeColor=#ffffff;exitX=0.5;exitY=1;exitDx=0;exitDy=0;" parent="1" source="5932" target="UvHuP5o6jSuoLTm0AUZA-5960" edge="1">
+                    <mxGeometry width="50" height="50" relative="1" as="geometry">
+                        <mxPoint x="290" y="120" as="sourcePoint"/>
+                        <mxPoint x="289.5" y="160" as="targetPoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="5944" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4F1623;fontColor=#FFFFFF;strokeColor=none;" vertex="1" parent="1">
+                    <mxGeometry x="380" y="160" width="340" height="100" as="geometry"/>
+                </mxCell>
+                <mxCell id="5945" value="" style="group" vertex="1" connectable="0" parent="1">
+                    <mxGeometry x="590" y="170" width="120" height="80" as="geometry"/>
+                </mxCell>
+                <mxCell id="5946" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=none;" vertex="1" parent="5945">
+                    <mxGeometry width="120" height="80" as="geometry"/>
+                </mxCell>
+                <mxCell id="5947" value="&lt;div&gt;Pinned memory&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" vertex="1" parent="5945">
+                    <mxGeometry width="120" height="80" as="geometry"/>
+                </mxCell>
+                <mxCell id="5948" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4F1623;fontColor=#FFFFFF;strokeColor=none;" vertex="1" parent="1">
+                    <mxGeometry x="380" y="40" width="340" height="100" as="geometry"/>
+                </mxCell>
+                <mxCell id="5949" value="" style="group" vertex="1" connectable="0" parent="1">
+                    <mxGeometry x="590" y="50" width="120" height="80" as="geometry"/>
+                </mxCell>
+                <mxCell id="5950" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=none;" vertex="1" parent="5949">
+                    <mxGeometry width="120" height="80" as="geometry"/>
+                </mxCell>
+                <mxCell id="5951" value="&lt;div&gt;Device memory&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" vertex="1" parent="5949">
+                    <mxGeometry width="120" height="80" as="geometry"/>
+                </mxCell>
+                <mxCell id="5952" style="edgeStyle=none;html=1;exitX=0.5;exitY=0;exitDx=0;exitDy=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;startArrow=classic;startFill=1;strokeWidth=2;strokeColor=#FFFFFF;" edge="1" parent="1" source="5947" target="5951">
+                    <mxGeometry relative="1" as="geometry"/>
+                </mxCell>
+                <mxCell id="5958" value="&lt;div&gt;Host&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;direction=west;" vertex="1" parent="1">
+                    <mxGeometry x="20" y="195" width="50" height="30" as="geometry"/>
+                </mxCell>
+                <mxCell id="5960" value="&lt;div&gt;Device&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" vertex="1" parent="1">
+                    <mxGeometry x="20" y="75" width="70" height="30" as="geometry"/>
+                </mxCell>
+                <mxCell id="5961" value="&lt;div&gt;Device&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" vertex="1" parent="1">
+                    <mxGeometry x="380" y="75" width="70" height="30" as="geometry"/>
+                </mxCell>
+                <mxCell id="5962" value="&lt;div&gt;Host&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" vertex="1" parent="1">
+                    <mxGeometry x="380" y="195" width="60" height="30" as="geometry"/>
+                </mxCell>
+                <mxCell id="5964" value="" style="edgeStyle=none;html=1;strokeWidth=2;startArrow=classic;startFill=1;strokeColor=#FFFFFF;" edge="1" parent="1" source="UvHuP5o6jSuoLTm0AUZA-5951" target="UvHuP5o6jSuoLTm0AUZA-5961">
+                    <mxGeometry relative="1" as="geometry"/>
+                </mxCell>
+            </root>
+        </mxGraphModel>
+    </diagram>
+</mxfile>
@@ -0,0 +1,127 @@
+<mxfile host="65bd71144e">
+    <diagram id="zBbb_w2fufU70cdOGtND" name="1 oldal">
+        <mxGraphModel dx="1584" dy="1200" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="0" pageScale="1" pageWidth="660" pageHeight="610" background="none" math="0" shadow="0">
+            <root>
+                <mxCell id="0"/>
+                <mxCell id="1" parent="0"/>
+                <mxCell id="5927" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#5E5B61;fontColor=#FFFFFF;strokeColor=none;spacing=0;" parent="1" vertex="1">
+                    <mxGeometry y="-30" width="680" height="380" as="geometry"/>
+                </mxCell>
+                <mxCell id="5945" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=#A20025;" parent="1" vertex="1">
+                    <mxGeometry x="10" y="-10" width="660" height="30" as="geometry"/>
+                </mxCell>
+                <mxCell id="5946" value="&lt;font face=&quot;Helvetica&quot;&gt;HIP Runtime API&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="1" vertex="1">
+                    <mxGeometry x="75" y="-10" width="530" height="30" as="geometry"/>
+                </mxCell>
+                <mxCell id="UvHuP5o6jSuoLTm0AUZA-5953" value="" style="group" parent="1" vertex="1" connectable="0">
+                    <mxGeometry x="10" y="80" width="330" height="260" as="geometry"/>
+                </mxCell>
+                <mxCell id="5925" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#333333;fontColor=#FFFFFF;strokeColor=none;" parent="UvHuP5o6jSuoLTm0AUZA-5953" vertex="1">
+                    <mxGeometry width="330" height="260" as="geometry"/>
+                </mxCell>
+                <mxCell id="UvHuP5o6jSuoLTm0AUZA-5952" value="" style="group" parent="UvHuP5o6jSuoLTm0AUZA-5953" vertex="1" connectable="0">
+                    <mxGeometry x="16.67" y="190.00279999999998" width="293.33" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="UvHuP5o6jSuoLTm0AUZA-5950" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#60a917;fontColor=#ffffff;strokeColor=#2D7600;" parent="UvHuP5o6jSuoLTm0AUZA-5952" vertex="1">
+                    <mxGeometry width="293.33000000000004" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="UvHuP5o6jSuoLTm0AUZA-5951" value="&lt;div&gt;CUDA Driver API&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="UvHuP5o6jSuoLTm0AUZA-5952" vertex="1">
+                    <mxGeometry x="10.9643478387712" y="7.500000000000001" width="266.79913074343256" height="30.000000000000004" as="geometry"/>
+                </mxCell>
+                <mxCell id="5948" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.358;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;strokeColor=#FFFFFF;" parent="UvHuP5o6jSuoLTm0AUZA-5953" source="5967" target="UvHuP5o6jSuoLTm0AUZA-5950" edge="1">
+                    <mxGeometry width="50" height="50" relative="1" as="geometry">
+                        <mxPoint x="103.33500000000004" y="108.22000000000003" as="sourcePoint"/>
+                        <mxPoint x="85" y="145.6" as="targetPoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="5966" value="" style="group" parent="UvHuP5o6jSuoLTm0AUZA-5953" vertex="1" connectable="0">
+                    <mxGeometry x="16.670000000000016" y="64" width="210" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="5967" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#60a917;fontColor=#ffffff;strokeColor=#2D7600;" parent="5966" vertex="1">
+                    <mxGeometry width="210.00000000000003" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="5968" value="&lt;div&gt;CUDA runtime&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="5966" vertex="1">
+                    <mxGeometry x="9.499565493273565" y="7.499999999999974" width="191.0060936696582" height="29.999999999999996" as="geometry"/>
+                </mxCell>
+                <mxCell id="UvHuP5o6jSuoLTm0AUZA-5982" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;strokeColor=#FFFFFF;" parent="1" edge="1">
+                    <mxGeometry width="50" height="50" relative="1" as="geometry">
+                        <mxPoint x="270" y="60" as="sourcePoint"/>
+                        <mxPoint x="270" y="270" as="targetPoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="UvHuP5o6jSuoLTm0AUZA-5955" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#333333;fontColor=#FFFFFF;strokeColor=none;" parent="1" vertex="1">
+                    <mxGeometry x="350" y="80" width="320" height="260" as="geometry"/>
+                </mxCell>
+                <mxCell id="5955" value="" style="group" parent="1" vertex="1" connectable="0">
+                    <mxGeometry x="360" y="270" width="140" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="5956" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=#A20025;" parent="5955" vertex="1">
+                    <mxGeometry width="140.00000000000003" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="5957" value="&lt;div&gt;ROCr runtime&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="5955" vertex="1">
+                    <mxGeometry x="2.51" y="8.75" width="134.99" height="27.5" as="geometry"/>
+                </mxCell>
+                <mxCell id="5958" value="" style="group" parent="1" vertex="1" connectable="0">
+                    <mxGeometry x="520" y="270" width="140" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="5959" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=#A20025;" parent="5958" vertex="1">
+                    <mxGeometry width="140.00000000000003" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="5960" value="&lt;div&gt;PAL&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="5958" vertex="1">
+                    <mxGeometry x="5.233043662182416" y="7.499999999999999" width="127.33739577977217" height="29.999999999999996" as="geometry"/>
+                </mxCell>
+                <mxCell id="5962" value="" style="group" parent="1" vertex="1" connectable="0">
+                    <mxGeometry x="405" y="144.91" width="210" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="5963" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=#A20025;" parent="5962" vertex="1">
+                    <mxGeometry width="210.00000000000003" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="5964" value="&lt;div&gt;CLR&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="5962" vertex="1">
+                    <mxGeometry x="7.849565493273624" y="7.499999999999999" width="191.0060936696582" height="29.999999999999996" as="geometry"/>
+                </mxCell>
+                <mxCell id="5965" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;strokeColor=#FFFFFF;exitX=0.823;exitY=1.047;exitDx=0;exitDy=0;exitPerimeter=0;" parent="1" target="5963" edge="1" source="5946">
+                    <mxGeometry width="50" height="50" relative="1" as="geometry">
+                        <mxPoint x="510" y="60" as="sourcePoint"/>
+                        <mxPoint x="640" y="290" as="targetPoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="5969" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;strokeColor=#FFFFFF;" parent="1" edge="1">
+                    <mxGeometry width="50" height="50" relative="1" as="geometry">
+                        <mxPoint x="570" y="190" as="sourcePoint"/>
+                        <mxPoint x="570" y="270" as="targetPoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="5971" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;strokeColor=#FFFFFF;entryX=0.661;entryY=0.007;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" target="5956" edge="1">
+                    <mxGeometry width="50" height="50" relative="1" as="geometry">
+                        <mxPoint x="453" y="190" as="sourcePoint"/>
+                        <mxPoint x="450" y="270" as="targetPoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="UvHuP5o6jSuoLTm0AUZA-5981" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;strokeColor=#FFFFFF;" parent="1" target="5967" edge="1">
+                    <mxGeometry width="50" height="50" relative="1" as="geometry">
+                        <mxPoint x="132" y="60" as="sourcePoint"/>
+                        <mxPoint x="95" y="140" as="targetPoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="UvHuP5o6jSuoLTm0AUZA-5957" value="&lt;font face=&quot;Helvetica&quot;&gt;&lt;span style=&quot;background-color: rgb(77, 77, 77);&quot;&gt;AMD Platform&lt;/span&gt;&lt;br&gt;&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="1" vertex="1">
+                    <mxGeometry x="440" y="84" width="140" height="30" as="geometry"/>
+                </mxCell>
+                <mxCell id="5926" value="&lt;font style=&quot;background-color: rgb(77, 77, 77);&quot;&gt;NVIDIA Platform&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="1" vertex="1">
+                    <mxGeometry x="10" y="80" width="330" height="34.0392" as="geometry"/>
+                </mxCell>
+                <mxCell id="5973" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=#A20025;" vertex="1" parent="1">
+                    <mxGeometry x="10" y="40" width="330" height="30" as="geometry"/>
+                </mxCell>
+                <mxCell id="5975" value="&lt;font face=&quot;Helvetica&quot;&gt;hipother&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" vertex="1" parent="1">
+                    <mxGeometry x="10" y="40" width="330" height="30" as="geometry"/>
+                </mxCell>
+                <mxCell id="5976" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;strokeColor=#FFFFFF;exitX=0.823;exitY=1.047;exitDx=0;exitDy=0;exitPerimeter=0;" edge="1" parent="1">
+                    <mxGeometry width="50" height="50" relative="1" as="geometry">
+                        <mxPoint x="175.59000000000003" y="20.00000000000008" as="sourcePoint"/>
+                        <mxPoint x="176" y="40" as="targetPoint"/>
+                    </mxGeometry>
+                </mxCell>
+            </root>
+        </mxGraphModel>
+    </diagram>
+</mxfile>
@@ -0,0 +1,46 @@
+<mxfile host="65bd71144e">
+    <diagram id="zBbb_w2fufU70cdOGtND" name="1 oldal">
+        <mxGraphModel dx="438" dy="902" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1200" pageHeight="1600" background="none" math="0" shadow="0">
+            <root>
+                <mxCell id="0"/>
+                <mxCell id="1" parent="0"/>
+                <mxCell id="5536" value="" style="rounded=0;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=none;spacing=0;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
+                    <mxGeometry x="340" y="10" width="280" height="540" as="geometry"/>
+                </mxCell>
+                <mxCell id="1Txoek2s6jAQB3cqoh21-5821" value="" style="rounded=0;fillColor=#C23555;fontColor=#FFFFFF;strokeColor=none;spacing=0;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
+                    <mxGeometry x="10" y="10" width="280" height="540" as="geometry"/>
+                </mxCell>
+                <mxCell id="5401" value="Stream 1" style="text;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;rounded=0;fontFamily=Segoe UI;fontSize=18;fontColor=#FFFFFF;" parent="1" vertex="1">
+                    <mxGeometry y="10" width="320" height="30" as="geometry"/>
+                </mxCell>
+                <mxCell id="1Txoek2s6jAQB3cqoh21-5820" value="Kernel A" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#A20025;fontColor=#FFFFFF;strokeColor=none;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
+                    <mxGeometry x="30" y="130" width="240" height="100" as="geometry"/>
+                </mxCell>
+                <mxCell id="1Txoek2s6jAQB3cqoh21-5819" value="Stream 2" style="text;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;rounded=0;fontFamily=Segoe UI;fontSize=18;fontColor=#FFFFFF;" parent="1" vertex="1">
+                    <mxGeometry x="320" y="10" width="320" height="30" as="geometry"/>
+                </mxCell>
+                <mxCell id="1Txoek2s6jAQB3cqoh21-5822" value="Memory Copy" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#A20025;fontColor=#FFFFFF;strokeColor=none;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
+                    <mxGeometry x="30" y="50" width="240" height="60" as="geometry"/>
+                </mxCell>
+                <mxCell id="1Txoek2s6jAQB3cqoh21-5825" value="hipDeviceSynchronize" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#A20025;fontColor=#FFFFFF;strokeColor=none;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
+                    <mxGeometry x="30" y="410" width="570" height="40" as="geometry"/>
+                </mxCell>
+                <mxCell id="1Txoek2s6jAQB3cqoh21-5826" value="Kernel B" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#A20025;fontColor=#FFFFFF;strokeColor=none;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
+                    <mxGeometry x="360" y="130" width="240" height="150" as="geometry"/>
+                </mxCell>
+                <mxCell id="1Txoek2s6jAQB3cqoh21-5828" value="Kernel C" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#A20025;fontColor=#FFFFFF;strokeColor=none;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
+                    <mxGeometry x="30" y="250" width="240" height="140" as="geometry"/>
+                </mxCell>
+                <mxCell id="5537" value="Memory Copy" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#A20025;fontColor=#FFFFFF;strokeColor=none;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
+                    <mxGeometry x="360" y="50" width="240" height="60" as="geometry"/>
+                </mxCell>
+                <mxCell id="5538" value="Memory Copy" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#A20025;fontColor=#FFFFFF;strokeColor=none;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
+                    <mxGeometry x="30" y="470" width="240" height="60" as="geometry"/>
+                </mxCell>
+                <mxCell id="5539" value="Memory Copy" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#A20025;fontColor=#FFFFFF;strokeColor=none;fontFamily=Segoe UI;fontSize=18;" parent="1" vertex="1">
+                    <mxGeometry x="360" y="470" width="240" height="60" as="geometry"/>
+                </mxCell>
+            </root>
+        </mxGraphModel>
+    </diagram>
+</mxfile>
@@ -0,0 +1,157 @@
+<mxfile host="65bd71144e">
+    <diagram id="zBbb_w2fufU70cdOGtND" name="1 oldal">
+        <mxGraphModel dx="1547" dy="1302" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="0" pageScale="1" pageWidth="660" pageHeight="610" background="none" math="0" shadow="0">
+            <root>
+                <mxCell id="0"/>
+                <mxCell id="1" parent="0"/>
+                <mxCell id="6033" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#5E5B61;fontColor=#FFFFFF;strokeColor=none;spacing=0;" parent="1" vertex="1">
+                    <mxGeometry x="110" y="-320" width="480" height="490" as="geometry"/>
+                </mxCell>
+                <mxCell id="5981" value="" style="group" parent="1" vertex="1" connectable="0">
+                    <mxGeometry x="130" y="60" width="210" height="90" as="geometry"/>
+                </mxCell>
+                <mxCell id="5982" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#333333;fontColor=#FFFFFF;strokeColor=none;" parent="5981" vertex="1">
+                    <mxGeometry width="210" height="90" as="geometry"/>
+                </mxCell>
+                <mxCell id="5983" value="" style="group" parent="5981" vertex="1" connectable="0">
+                    <mxGeometry x="7.7419872652362365" y="8" width="192.50000000000003" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="5984" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#60a917;fontColor=#ffffff;strokeColor=#2D7600;" parent="5983" vertex="1">
+                    <mxGeometry y="2" width="192.50000000000003" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="5985" value="&lt;div&gt;NVIDIA runtime&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="5983" vertex="1">
+                    <mxGeometry x="11.998194444444442" y="13.01" width="168.50166666666664" height="18.99" as="geometry"/>
+                </mxCell>
+                <mxCell id="5986" value="&lt;font style=&quot;&quot;&gt;NVIDIA Platform&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="5981" vertex="1">
+                    <mxGeometry x="40" y="63" width="130" height="20" as="geometry"/>
+                </mxCell>
+                <mxCell id="5987" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;strokeColor=#FFFFFF;" parent="1" edge="1">
+                    <mxGeometry width="50" height="50" relative="1" as="geometry">
+                        <mxPoint x="315" y="45" as="sourcePoint"/>
+                        <mxPoint x="315" y="70" as="targetPoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="5988" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#9C2A44;fontColor=#FFFFFF;strokeColor=#4c1523;strokeWidth=5;" parent="1" vertex="1">
+                    <mxGeometry x="300" y="-17" width="260" height="60" as="geometry"/>
+                </mxCell>
+                <mxCell id="5989" value="&lt;font style=&quot;font-size: 14px;&quot; face=&quot;Helvetica&quot;&gt;HIP&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;strokeWidth=2;" parent="1" vertex="1">
+                    <mxGeometry x="300" y="-17" width="260" height="20" as="geometry"/>
+                </mxCell>
+                <mxCell id="5990" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#333333;fontColor=#FFFFFF;strokeColor=none;" parent="1" vertex="1">
+                    <mxGeometry x="350" y="60" width="210" height="90" as="geometry"/>
+                </mxCell>
+                <mxCell id="5991" value="" style="group;fillColor=#A50040;fontColor=#ffffff;strokeColor=none;" parent="1" vertex="1" connectable="0">
+                    <mxGeometry x="360" y="70" width="192" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="5992" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#962744;fontColor=#FFFFFF;strokeColor=none;" parent="5991" vertex="1">
+                    <mxGeometry width="192.00000000000003" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="5993" value="&lt;div&gt;AMD runtime&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#ffffff;" parent="5991" vertex="1">
+                    <mxGeometry x="8.638736842105262" y="7.497" width="174.72" height="29.996999999999993" as="geometry"/>
+                </mxCell>
+                <mxCell id="5994" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;strokeColor=#FFFFFF;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" target="5992" edge="1">
+                    <mxGeometry width="50" height="50" relative="1" as="geometry">
+                        <mxPoint x="456" y="44" as="sourcePoint"/>
+                        <mxPoint x="470" y="70" as="targetPoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="5995" value="&lt;font face=&quot;Helvetica&quot;&gt;AMD Platform&lt;br&gt;&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="1" vertex="1">
+                    <mxGeometry x="365" y="123" width="180" height="20" as="geometry"/>
+                </mxCell>
+                <mxCell id="6003" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;strokeColor=#FFFFFF;exitX=0.25;exitY=1;exitDx=0;exitDy=0;entryX=0.855;entryY=-0.018;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" target="6000" edge="1">
+                    <mxGeometry width="50" height="50" relative="1" as="geometry">
+                        <mxPoint x="309.5" y="-104" as="sourcePoint"/>
+                        <mxPoint x="309.55999999999995" y="-60.975106382978765" as="targetPoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="6004" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#9C2A44;fontColor=#FFFFFF;strokeColor=none;" parent="1" vertex="1">
+                    <mxGeometry x="274.5" y="-150" width="140" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="6005" value="&lt;font face=&quot;Helvetica&quot;&gt;hipLibrary&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#ffffff;dashed=1;strokeWidth=2;" parent="1" vertex="1">
+                    <mxGeometry x="274.5" y="-150" width="140" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="6007" value="" style="group;dashed=1;strokeWidth=2;strokeColor=none;" parent="1" vertex="1" connectable="0">
+                    <mxGeometry x="360" y="-80" width="140" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="6008" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#962744;fontColor=#FFFFFF;strokeColor=none;dashed=1;strokeWidth=2;" parent="6007" vertex="1">
+                    <mxGeometry width="140.00000000000003" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="6009" value="&lt;div&gt;rocLibrary&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="6007" vertex="1">
+                    <mxGeometry x="6.299078947368418" y="7.497" width="127.39999999999998" height="29.996999999999993" as="geometry"/>
+                </mxCell>
+                <mxCell id="6010" value="" style="endArrow=classic;startArrow=none;html=1;rounded=0;strokeWidth=2;startFill=0;strokeColor=#FFFFFF;" parent="1" edge="1">
+                    <mxGeometry width="50" height="50" relative="1" as="geometry">
+                        <mxPoint x="386" y="-105" as="sourcePoint"/>
+                        <mxPoint x="386" y="-80" as="targetPoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="5999" value="" style="group" parent="1" vertex="1" connectable="0">
+                    <mxGeometry x="190" y="-80" width="140" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="6000" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#60a917;fontColor=#ffffff;strokeColor=#2D7600;" parent="5999" vertex="1">
+                    <mxGeometry y="-0.005106382978723234" width="140" height="45" as="geometry"/>
+                </mxCell>
+                <mxCell id="6001" value="&lt;div&gt;cuLibrary&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="5999" vertex="1">
+                    <mxGeometry x="10.019288676236041" y="13.404255319148938" width="119.9667368421052" height="18.18191489361702" as="geometry"/>
+                </mxCell>
+                <mxCell id="6013" style="edgeStyle=none;html=1;strokeWidth=2;strokeColor=#FFFFFF;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" target="5984" edge="1">
+                    <mxGeometry relative="1" as="geometry">
+                        <mxPoint x="234" y="-35" as="sourcePoint"/>
+                        <mxPoint x="220" y="60" as="targetPoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="6014" style="edgeStyle=none;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;strokeWidth=2;strokeColor=#FFFFFF;" parent="1" source="6008" edge="1">
+                    <mxGeometry relative="1" as="geometry">
+                        <mxPoint x="430" y="-19" as="targetPoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="6025" value="" style="group;strokeColor=none;dashed=1;strokeWidth=2;" parent="1" vertex="1" connectable="0">
+                    <mxGeometry x="129.5" y="-290" width="430" height="100" as="geometry"/>
+                </mxCell>
+                <mxCell id="6023" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#333333;fontColor=#FFFFFF;strokeColor=none;spacing=0;" parent="6025" vertex="1">
+                    <mxGeometry width="430" height="100" as="geometry"/>
+                </mxCell>
+                <mxCell id="6024" value="&lt;div&gt;Application Implementation&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="6025" vertex="1">
+                    <mxGeometry x="97.50999999999999" y="43.93999999999998" width="234.99" height="12.120000000000001" as="geometry"/>
+                </mxCell>
+                <mxCell id="6026" style="edgeStyle=none;html=1;entryX=0.148;entryY=0.008;entryDx=0;entryDy=0;strokeWidth=2;entryPerimeter=0;strokeColor=#FFFFFF;" parent="1" target="5984" edge="1">
+                    <mxGeometry relative="1" as="geometry">
+                        <mxPoint x="166" y="-190" as="sourcePoint"/>
+                        <mxPoint x="159.99598908448831" y="-94.12" as="targetPoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="6027" style="edgeStyle=none;html=1;strokeWidth=2;entryX=0.5;entryY=0;entryDx=0;entryDy=0;strokeColor=#FFFFFF;" parent="1" edge="1">
+                    <mxGeometry relative="1" as="geometry">
+                        <mxPoint x="260" y="-190" as="sourcePoint"/>
+                        <mxPoint x="260" y="-80.00510638297874" as="targetPoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="6029" style="edgeStyle=none;html=1;entryX=0.5;entryY=0;entryDx=0;entryDy=0;strokeWidth=2;exitX=0.5;exitY=1;exitDx=0;exitDy=0;strokeColor=#FFFFFF;" parent="1" source="6023" target="6005" edge="1">
+                    <mxGeometry relative="1" as="geometry">
+                        <mxPoint x="320" y="-190" as="sourcePoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="6030" style="edgeStyle=none;html=1;strokeWidth=2;strokeColor=#FFFFFF;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" target="6008" edge="1">
+                    <mxGeometry relative="1" as="geometry">
+                        <mxPoint x="452" y="-80" as="targetPoint"/>
+                        <mxPoint x="430" y="-190" as="sourcePoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="6031" style="edgeStyle=none;html=1;exitX=0.912;exitY=1.013;exitDx=0;exitDy=0;strokeWidth=2;exitPerimeter=0;strokeColor=#FFFFFF;" parent="1" source="6023" edge="1">
+                    <mxGeometry relative="1" as="geometry">
+                        <mxPoint x="520" y="-19" as="targetPoint"/>
+                    </mxGeometry>
+                </mxCell>
+                <mxCell id="6034" value="&lt;div&gt;Application&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=17;fontColor=#FFFFFF;" parent="1" vertex="1">
+                    <mxGeometry x="232.5" y="-310" width="234.99" height="12.120000000000001" as="geometry"/>
+                </mxCell>
+                <mxCell id="6035" value="runtime API" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4F1623;fontColor=#FFFFFF;strokeColor=none;" parent="1" vertex="1">
+                    <mxGeometry x="310" y="3" width="115" height="30" as="geometry"/>
+                </mxCell>
+                <mxCell id="6036" value="kernel language" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4F1623;fontColor=#FFFFFF;strokeColor=none;" parent="1" vertex="1">
+                    <mxGeometry x="437" y="3" width="115" height="30" as="geometry"/>
+                </mxCell>
+            </root>
+        </mxGraphModel>
+    </diagram>
+</mxfile>
@@ -0,0 +1,242 @@
+.. meta::
+  :description: This page lists frequently asked questions about HIP
+  :keywords: AMD, ROCm, HIP, FAQ, frequently asked questions
+
+*******************************************************************************
+Frequently asked questions
+*******************************************************************************
+
+This topic provides answers to frequently asked questions from new HIP users and
+users familiar with NVIDIA CUDA.
+
+HIP Support
+===========
+
+What hardware does HIP support?
+-------------------------------
+
+HIP supports AMD and NVIDIA GPUs. See
+:ref:`prerequisites of the install guide<install_prerequisites>` for detailed
+information.
+
+What operating systems does HIP support?
+----------------------------------------
+
+Linux as well as Windows are supported by ROCm. The exact versions are listed in
+the system requirements for :ref:`rocm-install-on-linux:supported_distributions`
+and :ref:`rocm-install-on-windows:supported-skus-win`.
+
+.. note::
+   Not all HIP runtime API functions are yet supported on Windows.
+   A note is added to those functions' documentation in the
+   :ref:`HIP runtime API reference<runtime_api_reference>`.
+
+What libraries does HIP provide?
+--------------------------------
+
+HIP provides key math and AI libraries. See :doc:`rocm:reference/api-libraries`
+for the full list.
+
+What NVIDIA CUDA features does HIP support?
+-------------------------------------------
+
+The :doc:`NVIDIA CUDA runtime API supported by HIP<hipify:tables/CUDA_Runtime_API_functions_supported_by_HIP>`
+and :doc:`NVIDIA CUDA driver API supported by HIP<hipify:tables/CUDA_Driver_API_functions_supported_by_HIP>`
+pages describe which NVIDIA CUDA APIs are supported and what the equivalents are.
+The :doc:`HIP API documentation <doxygen/html/index>` describes each API and
+its limitations, if any, compared with the equivalent CUDA API.
+
+The kernel language features are documented in the
+:doc:`/reference/cpp_language_extensions` page.
+
+Relation to other GPGPU frameworks
+==================================
+
+Is HIP a drop-in replacement for CUDA?
+--------------------------------------
+
+The `HIPIFY <https://github.com/ROCm/HIPIFY>`_ tools can automatically convert
+almost all CUDA runtime code to HIP. Most device code needs no additional
+conversion because HIP and CUDA have the same signatures for math and built-in
+functions except for the name. HIP code provides similar performance to native
+CUDA code on NVIDIA platforms, plus the benefits of being compilable for AMD
+platforms.
+
+Additional porting might be required to deal with architecture feature
+queries or CUDA capabilities that HIP doesn't support.
+
+How does HIP compare with OpenCL?
+---------------------------------
+
+HIP offers several benefits over OpenCL:
+
+* Device code can be written in modern C++, including templates, lambdas,
+  classes and so on.
+* Host and device code can be mixed in the source files.
+* The HIP API is less verbose than OpenCL and is familiar to CUDA developers.
+* Porting from CUDA to HIP is significantly easier than from CUDA to OpenCL.
+* HIP uses development tools specialized for each platform: :doc:`amdclang++ <llvm-project:index>`
+  for AMD GPUs or `nvcc <https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html>`_
+  for NVIDIA GPUs, and profilers like :doc:`ROCm Compute Profiler <rocprofiler-compute:index>` or
+  `Nsight Systems <https://developer.nvidia.com/nsight-systems>`_.
+* HIP provides:
+
+  * pointers and host-side pointer arithmetic.
+  * device-level control over memory allocation and placement.
+  * an offline compilation model.
+
+How does porting CUDA to HIP compare to porting CUDA to OpenCL?
+---------------------------------------------------------------
+
+OpenCL differs from HIP and CUDA when considering the host runtime,
+but even more so when considering the kernel code.
+The HIP device code is a C++ dialect, while OpenCL is C99-based.
+OpenCL does not support single-source compilation.
+
+As a result, the OpenCL syntax differs significantly from HIP, and porting tools
+must perform complex transformations, especially regarding templates or other
+C++ features in kernels.
+
+To better understand the syntax differences, see the
+:doc:`syntax comparison table <reference/terms>` or the
+:doc:`HIP porting guide <how-to/hip_porting_guide>`.
+
+Can I install CUDA and ROCm on the same machine?
+------------------------------------------------
+
+Yes, but you require a compatible GPU to run the compiled code.
+
+On NVIDIA platforms, can I mix HIP code with CUDA code?
+-------------------------------------------------------
+
+Yes. Most HIP types and data structures are ``typedef``\ s to CUDA equivalents and
+can be used interchangeably. This can be useful for iteratively porting CUDA code.
+
+See :doc:`how-to/hip_porting_guide` for more details.
+
+Can a HIP binary run on both AMD and NVIDIA platforms?
+------------------------------------------------------
+
+HIP is a source-portable language that can be compiled to run on AMD or NVIDIA
+platforms. However, the HIP tools don't create a "fat binary" that can run on
+both platforms.
+
+Compiler related questions
+==========================
+
+hipcc detected my platform incorrectly. What should I do?
+---------------------------------------------------------
+
+The environment variable ``HIP_PLATFORM`` can be used to specify the platform
+for which the code is going to be compiled with ``hipcc``. See the
+:doc:`hipcc environment variables<hipcc:env>` for more information.
+
+.. warning::
+
+   If you specify ``HIP_PLATFORM=nvidia`` with hipcc, you also need to pass ``-x cu``
+   to hipcc when compiling files with the ``.hip`` file extension. Otherwise,
+   nvcc will not recognize the ``.hip`` file extension and will fail with
+   ``nvcc fatal   : Don't know what to do with  <file>.hip``.
+
+How to use HIP-Clang to build HIP programs?
+------------------------------------------------------
+
+:doc:`hipcc <hipcc:index>` is a compiler driver. This means it is not a compiler
+but calls the appropriate compilers and sets some options.
+
+The underlying compilers are :doc:`amdclang++ <llvm-project:index>` or
+`nvcc <https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html>`_,
+depending on the platform, and can be called directly.
+
+What is HIP-Clang?
+------------------
+
+HIP-Clang is a Clang/LLVM-based compiler used to compile HIP programs for AMD
+platforms. The executable is named :doc:`amdclang++ <llvm-project:index>` on
+Linux and ``clang++`` on Windows.
+
+Can I link HIP device code with host code compiled with another compiler such as gcc, icc, or clang?
+-----------------------------------------------------------------------------------------------------------
+
+Yes. HIP generates object code that conforms to the GCC ABI, and links with libstdc++.
+This means you can compile host code with the compiler of your choice and link the
+generated host object code with device code.
+
+Can HIP applications be compiled with a C compiler?
+---------------------------------------------------
+
+HIP is a C/C++ API that can be used with C compilers. However, this applies only
+to the API itself. Device code and the syntax for calling kernels must be
+compiled with a supported compiler like :doc:`hipcc <hipcc:index>`. The code
+objects that are generated with ``hipcc`` can, however, be used with a C
+compiler, as shown in the code examples below.
+
+The following is the HIP device code, assumed to be saved in ``device.hip``:
+
+.. code-block:: c++
+
+  #include <hip/hip_runtime.h>
+
+  __global__ void kernel(double* array, size_t size){
+      const int x = threadIdx.x + blockIdx.x * blockDim.x;
+      if(x < size){array[x] = x;}
+  };
+
+  extern "C"{
+      hipError_t callKernel(int blocks, int threadsPerBlock, double* array, size_t size){
+          kernel<<<blocks, threadsPerBlock, 0, hipStreamDefault>>>(array, size);
+          return hipGetLastError();
+      }
+  }
+
+The following is the host code, written in C, saved in ``host.c``:
+
+.. code-block:: c
+
+  #include <hip/hip_runtime_api.h>
+  #include <stdio.h>
+  #include <stdlib.h>
+
+  #define HIP_CHECK(c) {                                \
+     if (c != hipSuccess){                              \
+        printf("HIP Error : %s", hipGetErrorString(c)); \
+        printf(" %s %d\n", __FILE__, __LINE__);         \
+        exit(c);                                        \
+     }                                                  \
+  }
+
+  // Forward declaration - the implementation needs to be compiled with
+  // a device compiler like hipcc or amdclang++
+  hipError_t callKernel(int blocks, int threadsPerBlock, double* array, size_t size);
+
+  int main(int argc, char** argv) {
+      int blocks = 1024;
+      int threadsPerBlock = 256;
+      size_t arraySize = blocks * threadsPerBlock;
+      double* d_array;
+      double* h_array;
+      h_array = (double*)malloc(arraySize * sizeof(double));
+
+      HIP_CHECK(hipMalloc((void**)&d_array, arraySize * sizeof(double)));
+      HIP_CHECK(callKernel(blocks, threadsPerBlock, d_array, arraySize));
+      HIP_CHECK(hipMemcpy(h_array, d_array, arraySize * sizeof(double), hipMemcpyDeviceToHost));
+      HIP_CHECK(hipFree(d_array));
+
+      free(h_array);
+      return 0;
+  }
+
+These files are then compiled and linked using:
+
+.. code-block:: shell
+
+  hipcc -c device.hip
+  gcc host.c device.o $(hipconfig --cpp_config) -L/opt/rocm/lib -lamdhip64
+
+assuming the default installation of ROCm in ``/opt/rocm``.
+
+How to guard code specific to the host or the GPU?
+--------------------------------------------------
+
+The compiler defines the ``__HIP_DEVICE_COMPILE__`` macro only when compiling
+device code.
+
+Refer to the :doc:`how-to/hip_porting_guide` for more information.
@@ -2,12 +2,13 @@
   :description: How to debug using HIP.
   :keywords: AMD, ROCm, HIP, debugging, ltrace, ROCgdb, WinGDB

+.. _debugging_with_hip:
+
 *************************************************************************
 Debugging with HIP
 *************************************************************************

-AMD debugging tools include *ltrace* and *ROCgdb*. External tools are available and can be found
-online. For example, if you're using Windows, you can use *Microsoft Visual Studio* and *WinGDB*.
+HIP debugging tools include `ltrace <https://ltrace.org/>`_ and :doc:`ROCgdb <rocgdb:index>`. External tools are available and can be found online. For example, if you're using Windows, you can use Microsoft Visual Studio and WinGDB.

 You can trace and debug your code using the following tools and techniques.

@@ -272,110 +273,7 @@ HIP environment variable summary

 Here are some of the more commonly used environment variables:

-.. <!-- spellcheck-disable -->
-
-.. # COMMENT: The following lines define a break for use in the table below.
-.. |break| raw:: html
-
-    <br />
-
-.. <!-- spellcheck-enable -->
-
-.. list-table::
-
-    * - **Environment variable**
-      - **Default value**
-      - **Usage**
-
-    * - AMD_LOG_LEVEL
-        |break| Enable HIP log on different Level
-      - 0
-      - 0: Disable log.
-        |break| 1: Enable log on error level
-        |break| 2: Enable log on warning and below levels
-        |break| 0x3: Enable log on information and below levels
-        |break| 0x4: Decode and display AQL packets
-
-    * - AMD_LOG_MASK
-        |break| Enable HIP log on different Level
-      - 0x7FFFFFFF
-      - 0x1: Log API calls
-        |break| 0x02: Kernel and Copy Commands and Barriers
-        |break| 0x4: Synchronization and waiting for commands to finish
-        |break| 0x8: Enable log on information and below levels
-        |break| 0x20: Queue commands and queue contents
-        |break| 0x40: Signal creation, allocation, pool
-        |break| 0x80: Locks and thread-safety code
-        |break| 0x100: Copy debug
-        |break| 0x200: Detailed copy debug
-        |break| 0x400: Resource allocation, performance-impacting events
-        |break| 0x800: Initialization and shutdown
-        |break| 0x1000: Misc debug, not yet classified
-        |break| 0x2000: Show raw bytes of AQL packet
-        |break| 0x4000: Show code creation debug
-        |break| 0x8000: More detailed command info, including barrier commands
-        |break| 0x10000: Log message location
-        |break| 0xFFFFFFFF: Log always even mask flag is zero
-
-    * - HIP_LAUNCH_BLOCKING
-        |break|  Used for serialization on kernel execution.
-      - 0
-      - 0: Disable. Kernel executes normally.
-        |break| 1: Enable. Serializes kernel enqueue, behaves the same as AMD_SERIALIZE_KERNEL.
-
-    * - HIP_VISIBLE_DEVICES (or CUDA_VISIBLE_DEVICES)
-        |break|  Only devices whose index is present in the sequence are visible to HIP
-      -
-      - 0,1,2: Depending on the number of devices on the system
-
-    * - GPU_DUMP_CODE_OBJECT
-        |break| Dump code object
-      - 0
-      - 0: Disable
-        |break| 1: Enable
-
-    * - AMD_SERIALIZE_KERNEL
-        |break|  Serialize kernel enqueue
-      - 0
-      - 1: Wait for completion before enqueue
-        |break| 2: Wait for completion after enqueue
-        |break| 3: Both
-
-    * - AMD_SERIALIZE_COPY
-        |break| Serialize copies
-      - 0
-      - 1: Wait for completion before enqueue
-        |break| 2: Wait for completion after enqueue
-        |break| 3: Both
-
-    * - HIP_HOST_COHERENT
-        |break| Coherent memory in hipHostMalloc
-      - 0
-      - 0: memory is not coherent between host and GPU
-        |break| 1: memory is coherent with host
-
-    * - AMD_DIRECT_DISPATCH
-        |break| Enable direct kernel dispatch (Currently for Linux; under development for Windows)
-      - 1
-      - 0: Disable
-        |break| 1: Enable
-
-    * - GPU_MAX_HW_QUEUES
-        |break| The maximum number of hardware queues allocated per device
-      - 4
-      - The variable controls how many independent hardware queues HIP runtime can create per process,
-        per device. If an application allocates more HIP streams than this number, then HIP runtime reuses
-        the same hardware queues for the new streams in a round-robin manner. Note that this maximum
-        number does not apply to hardware queues that are created for CU-masked HIP streams, or
-        cooperative queues for HIP Cooperative Groups (single queue per device).
-
-    * - DEBUG_HIP_7_PREVIEW
-        |break| Enable preview of upcoming runtime changes that break backward compatibility.
-        These changes might require updating existing application code to support the new behavior.
-        The new behavior will become default in a future major release and this environment
-        variable will no longer be needed.
-      - 0
-      - 0x1: Match the behavior of hipGetLastError with its corresponding CUDA API
+.. include:: ../how-to/debugging_env.rst

 General debugging tips
 ======================================================
@@ -0,0 +1,110 @@
+.. list-table::
+    :header-rows: 1
+    :widths: 35,14,51
+
+    * - **Environment variable**
+      - **Default value**
+      - **Value**
+
+    * - | ``AMD_LOG_LEVEL``
+        | Enables HIP logging at various levels.
+      - ``0``
+      - | 0: Disable log.
+        | 1: Enables error logs.
+        | 2: Enables warning logs in addition to lower-level logs.
+        | 3: Enables information logs in addition to lower-level logs.
+        | 4: Enables debug logs in addition to lower-level logs.
+        | 5: Enables debug extra logs in addition to lower-level logs.
+
+    * - | ``AMD_LOG_LEVEL_FILE``
+        | Sets output file for ``AMD_LOG_LEVEL``.
+      - stderr output
+      -
+
+    * - | ``AMD_LOG_MASK``
+        | Specifies HIP log filters. Here is the `complete list of log masks <https://github.com/ROCm/clr/blob/develop/rocclr/utils/debug.hpp#L40>`_.
+      - ``0x7FFFFFFF``
+      - | 0x1: Log API calls.
+        | 0x2: Kernel and copy commands and barriers.
+        | 0x4: Synchronization and waiting for commands to finish.
+        | 0x8: Decode and display AQL packets.
+        | 0x10: Queue commands and queue contents.
+        | 0x20: Signal creation, allocation, pool.
+        | 0x40: Locks and thread-safety code.
+        | 0x80: Kernel creations and arguments, etc.
+        | 0x100: Copy debug.
+        | 0x200: Detailed copy debug.
+        | 0x400: Resource allocation, performance-impacting events.
+        | 0x800: Initialization and shutdown.
+        | 0x1000: Misc debug, not yet classified.
+        | 0x2000: Show raw bytes of AQL packet.
+        | 0x4000: Show code creation debug.
+        | 0x8000: More detailed command info, including barrier commands.
+        | 0x10000: Log message location.
+        | 0x20000: Memory allocation.
+        | 0x40000: Memory pool allocation, including memory in graphs.
+        | 0x80000: Timestamp details.
+        | 0xFFFFFFFF: Always log, even if the mask flag is zero.
+
+    * - | ``HIP_LAUNCH_BLOCKING``
+        | Used for serialization on kernel execution.
+      - ``0``
+      - | 0: Disable. Kernel executes normally.
+        | 1: Enable. Serializes kernel enqueue, behaves the same as ``AMD_SERIALIZE_KERNEL``.
+
+    * - | ``HIP_VISIBLE_DEVICES`` (or ``CUDA_VISIBLE_DEVICES``)
+        | Only devices whose index is present in the sequence are visible to HIP.
+      - Unset by default.
+      - 0,1,2: Depending on the number of devices on the system.
+
+    * - | ``GPU_DUMP_CODE_OBJECT``
+        | Dump code object.
+      - ``0``
+      - | 0: Disable
+        | 1: Enable
+
+    * - | ``AMD_SERIALIZE_KERNEL``
+        | Serialize kernel enqueue.
+      - ``0``
+      - | 0: Disable
+        | 1: Wait for completion before enqueue.
+        | 2: Wait for completion after enqueue.
+        | 3: Both
+
+    * - | ``AMD_SERIALIZE_COPY``
+        | Serialize copies.
+      - ``0``
+      - | 0: Disable
+        | 1: Wait for completion before enqueue.
+        | 2: Wait for completion after enqueue.
+        | 3: Both
+
+    * - | ``AMD_DIRECT_DISPATCH``
+        | Enable direct kernel dispatch (Currently for Linux; under development for Windows).
+      - ``1``
+      - | 0: Disable
+        | 1: Enable
+
+    * - | ``GPU_MAX_HW_QUEUES``
+        | The maximum number of hardware queues allocated per device.
+      - ``4``
+      - The variable controls how many independent hardware queues HIP runtime can create per process,
+        per device. If an application allocates more HIP streams than this number, then HIP runtime reuses
+        the same hardware queues for the new streams in a round-robin manner. Note that this maximum
+        number does not apply to hardware queues that are created for CU-masked HIP streams, or
+        cooperative queues for HIP Cooperative Groups (single queue per device).
+
+    * - | ``DEBUG_HIP_7_PREVIEW``
+        | Enable preview of upcoming
+        | runtime changes that break
+        | backward compatibility.
+        | These changes might require
+        | updating existing application
+        | code to support the new
+        | behavior. The new behavior
+        | will become default in a
+        | future major release and this
+        | environment variable will
+        | no longer be needed.
+      - ``0``
+      - 0x1: Match the behavior of ``hipGetLastError`` with its corresponding CUDA API.
@@ -1,386 +0,0 @@
-# Frequently asked questions
-
-## What APIs and features does HIP support?
-
-HIP provides the following:
-
-* Devices (`hipSetDevice()`, `hipGetDeviceProperties()`, etc.)
-* Memory management (`hipMalloc()`, `hipMemcpy()`, `hipFree()`, etc.)
-* Streams (`hipStreamCreate()`, `hipStreamSynchronize()`, `hipStreamWaitEvent()`, etc.)
-* Events (`hipEventRecord()`, `hipEventElapsedTime()`, etc.)
-* Kernel launching (`hipLaunchKernel`/`hipLaunchKernelGGL` is the preferred way of launching kernels. `hipLaunchKernelGGL` is a standard C/C++ macro that can serve as an alternative way to launch kernels, replacing the CUDA triple-chevron (`<<< >>>`) syntax).
-* HIP Module API to control when and how code is loaded.
-* CUDA-style kernel coordinate functions (`threadIdx`, `blockIdx`, `blockDim`, `gridDim`)
-* Cross-lane instructions including `shfl`, `ballot`, `any`, `all`
-* Most device-side math built-ins
-* Error reporting (`hipGetLastError()`, `hipGetErrorString()`)
-
-The HIP API documentation describes each API and its limitations, if any, compared with the equivalent CUDA API.
-
-## What is not supported?
-
-### Runtime/Driver API features
-
-At a high-level, the following features are not supported:
-
-* Textures (partial support available)
-* Dynamic parallelism (CUDA 5.0)
-* Graphics interoperability with OpenGL or Direct3D
-* CUDA IPC Functions (Under Development)
-* CUDA array, `mipmappedArray` and pitched memory
-* Queue priority controls
-
-See the [API Support Table](https://github.com/ROCm/HIPIFY/blob/amd-staging/docs/tables/CUDA_Runtime_API_functions_supported_by_HIP.md) for more detailed information.
-
-### Kernel language features
-
-* C++-style device-side dynamic memory allocations (free, new, delete) (CUDA 4.0)
-* Virtual functions, indirect functions and try/catch (CUDA 4.0)
-* `__prof_trigger`
-* PTX assembly (CUDA 4.0). HIP-Clang supports inline GCN assembly.
-* Several kernel features are under development. See the {doc}`/reference/cpp_language_extensions` for more information.
-
-## Is HIP a drop-in replacement for CUDA?
-
-No. HIP provides porting tools which do most of the work to convert CUDA code into portable C++ code that uses the HIP APIs.
-Most developers will port their code from CUDA to HIP and then maintain the HIP version.
-HIP code provides the same performance as native CUDA code, plus the benefits of running on AMD platforms.
-
-## What specific version of CUDA does HIP support?
-
-HIP APIs and features do not map to a specific CUDA version. HIP provides a strong subset of the functionality provided in CUDA, and the hipify tools can scan code to identify any unsupported CUDA functions - this is useful for identifying the specific features required by a given application.
-
-However, we can provide a rough summary of the features included in each CUDA SDK and the support level in HIP. Each bullet below lists the major new language features in each CUDA release and then indicate which are supported/not supported in HIP:
-
-* CUDA 4.0 and earlier :
-  * HIP supports CUDA 4.0 except for the limitations described above.
-* CUDA 5.0 :
-  * Dynamic Parallelism (not supported)
-  * `cuIpc` functions (under development).
-* CUDA 6.0 :
-  * Managed memory (under development)
-* CUDA 6.5 :
-  * `__shfl` intrinsic (supported)
-* CUDA 7.0 :
-  * Per-thread default streams (supported)
-  * C++11 (Hip-Clang supports all of C++11, all of C++14 and some C++17 features)
-* CUDA 7.5 :
-  * float16 (supported)
-* CUDA 8.0 :
-  * Page Migration including `cudaMemAdvise`, `cudaMemPrefetch`, other `cudaMem*` APIs(not supported)
-* CUDA 9.0 :
-  * Cooperative Launch, Surface Object Management, Version Management
-
-## What libraries does HIP support?
-
-HIP includes growing support for the four key math libraries using hipBLAS, hipFFT, hipRAND and hipSPARSE, as well as MIOpen for machine intelligence applications.
-These offer pointer-based memory interfaces (as opposed to opaque buffers) and can be easily interfaced with other HIP applications.
-The hip interfaces support both ROCm and CUDA paths, with familiar library interfaces.
-
-* [hipBLAS](https://github.com/ROCmSoftwarePlatform/hipBLAS), which utilizes [rocBlas](https://github.com/ROCmSoftwarePlatform/rocBLAS).
-* [hipFFT](https://github.com/ROCmSoftwarePlatform/hipfft)
-* [hipsSPARSE](https://github.com/ROCmSoftwarePlatform/hipsparse)
-* [hipRAND](https://github.com/ROCmSoftwarePlatform/hipRAND)
-* [MIOpen](https://github.com/ROCmSoftwarePlatform/MIOpen)
-
-Additionally, some of the cuBLAS routines are automatically converted to hipblas equivalents by the HIPIFY tools. These APIs use cuBLAS or hcBLAS depending on the platform and replace the need to use conditional compilation.
-
-## How does HIP compare with OpenCL?
-
-Both AMD and NVIDIA support OpenCL 1.2 on their devices so that developers can write portable code.
-HIP offers several benefits over OpenCL:
-
-* Developers can code in C++ as well as mix host and device C++ code in their source files. HIP C++ code can use templates, lambdas, classes and so on.
-* The HIP API is less verbose than OpenCL and is familiar to CUDA developers.
-* Because both CUDA and HIP are C++ languages, porting from CUDA to HIP is significantly easier than porting from CUDA to OpenCL.
-* HIP uses the best available development tools on each platform: on NVIDIA GPUs, HIP code compiles using NVCC and can employ the Nsight profiler and debugger (unlike OpenCL on NVIDIA GPUs).
-* HIP provides pointers and host-side pointer arithmetic.
-* HIP provides device-level control over memory allocation and placement.
-* HIP offers an offline compilation model.
-
-## How does porting CUDA to HIP compare to porting CUDA to OpenCL?
-
-Both HIP and CUDA are dialects of C++, and thus porting between them is relatively straightforward.
-Both dialects support templates, classes, lambdas, and other C++ constructs.
-As one example, the hipify-perl tool was originally a Perl script that used simple text conversions from CUDA to HIP.
-HIP and CUDA provide similar math library calls as well. In summary, the HIP philosophy was to make the HIP language close enough to CUDA that the porting effort is relatively simple.
-This reduces the potential for error, and also makes it easy to automate the translation. HIP goal is to quickly get the ported program running on both platforms with little manual intervention, so that the programmer can focus on performance optimizations.
-
-There have been several tools that have attempted to convert CUDA into OpenCL, such as CU2CL. OpenCL is a C99-based kernel language (rather than C++) and also does not support single-source compilation.
-As a result, the OpenCL syntax is different from CUDA, and the porting tools have to perform some heroic transformations to bridge this gap.
-The tools also struggle with more complex CUDA applications, in particular, those that use templates, classes, or other C++ features inside the kernel.
-
-## What hardware does HIP support?
-
-* For AMD platforms, see the [ROCm documentation](https://github.com/RadeonOpenCompute/ROCm#supported-gpus) for the list of supported platforms.
-* For NVIDIA platforms, HIP requires unified memory and should run on any device supporting CUDA SDK 6.0 or newer. We have tested the NVIDIA Titan and Tesla K40.
-
-## Do HIPIFY tools automatically convert all source code?
-
-Typically, HIPIFY tools can automatically convert almost all run-time code.
-Most device code needs no additional conversion since HIP and CUDA have similar names for math and built-in functions.
-The hipify-clang tool will automatically modify the kernel signature as needed (automating a step that used to be done manually).
-Additional porting may be required to deal with architecture feature queries or with CUDA capabilities that HIP doesn't support.
-In general, developers should always expect to perform some platform-specific tuning and optimization.
-
-## What is NVCC?
-
-NVCC is NVIDIA's compiler driver for compiling "CUDA C++" code into PTX or device code for NVIDIA GPUs. It's a closed-source binary compiler that is provided by the CUDA SDK.
-
-## What is HIP-Clang?
-
-HIP-Clang is a Clang/LLVM based compiler to compile HIP programs which can run on AMD platform.
-
-## Why use HIP rather than supporting CUDA directly?
-
-While HIP is a strong subset of the CUDA, it is a subset. The HIP layer allows that subset to be clearly defined and documented.
-Developers who code to the HIP API can be assured their code will remain portable across NVIDIA and AMD platforms.
-In addition, HIP defines portable mechanisms to query architectural features and supports a larger 64-bit `WaveSize` which expands the return type for cross-lane functions like ballot and shuffle from 32-bit integers to 64-bit integers.
-
-## Can I develop HIP code on an NVIDIA CUDA platform?
-
-Yes. HIP's CUDA path only exposes the APIs and functionality that work on both NVCC and AMDGPU back-ends.
-"Extra" APIs, parameters, and features which exist in CUDA but not in HIP-Clang will typically result in compile-time or run-time errors.
-Developers need to use the HIP API for most accelerator code and bracket any CUDA-specific code with preprocessor conditionals.
-Developers concerned about portability should, of course, run on both platforms, and should expect to tune for performance.
-In some cases, CUDA has a richer set of modes for some APIs, and some C++ capabilities such as virtual functions - see the HIP @API documentation for more details.
-
-## Can I develop HIP code on an AMD HIP-Clang platform?
-
-Yes. HIP's HIP-Clang path only exposes the APIs and functions that work on AMD runtime back ends. "Extra" APIs, parameters and features that appear in HIP-Clang but not CUDA will typically cause compile- or run-time errors. Developers must use the HIP API for most accelerator code and bracket any HIP-Clang specific code with preprocessor conditionals. Those concerned about portability should, of course, test their code on both platforms and should tune it for performance. Typically, HIP-Clang supports a more modern set of C++11/C++14/C++17 features, so HIP developers who want portability should be careful when using advanced C++ features on the HIP-Clang path.
-
-## How to use HIP-Clang to build HIP programs?
-
-The environment variable can be used to set compiler path:
-
-* HIP_CLANG_PATH: path to hip-clang. When set, this variable let hipcc to use hip-clang for compilation/linking.
-
-There is an alternative environment variable to set compiler path:
-
-* HIP_ROCCLR_HOME: path to root directory of the HIP-ROCclr runtime. When set, this variable let hipcc use hip-clang from the ROCclr distribution.
-NOTE: If HIP_ROCCLR_HOME is set, there is no need to set HIP_CLANG_PATH since hipcc will deduce them from HIP_ROCCLR_HOME.
-
-## What is AMD clr?
-
-AMD [Compute Language Runtime (CLR)](https://github.com/ROCm/clr) is a repository for the AMD platform, which contains source codes for AMD's compute languages runtimes as follows,
-
-* hipamd - contains implementation of HIP language for AMD GPU.
-* rocclr - contains virtual device interfaces that compute runtimes interact with backends, such as ROCr on Linux and PAL on Windows.
-* opencl - contains implementation of OpenCL™ on the AMD platform.
-
-## What is hipother?
-
-A new repository ['hipother'](https://github.com/ROCm/hipother) is added in the ROCm 6.1 release, which is branched out from HIP.
-hipother supports the HIP back-end implementation on some non-AMD platforms, like NVIDIA.
-
-## Can I get HIP open source repository for Windows?
-
-No, there is no HIP repository open publicly on Windows.
-
-## Can a HIP binary run on both AMD and NVIDIA platforms?
-
-HIP is a source-portable language that can be compiled to run on either AMD or NVIDIA platform. HIP tools don't create a "fat binary" that can run on either platform, however.
-
-## On HIP-Clang, can I link HIP code with host code compiled with another compiler such as gcc, icc, or clang?
-
-Yes. HIP generates the object code which conforms to the GCC ABI, and also links with libstdc++. This means you can compile host code with the compiler of your choice and link the generated object code
-with GPU code compiled with HIP. Larger projects often contain a mixture of accelerator code (initially written in CUDA with NVCC) and host code (compiled with gcc, icc, or clang). These projects
-can convert the accelerator code to HIP, compile that code with hipcc, and link with object code from their preferred compiler.
-
-## Can HIP API support C style application? What is the difference between C and C++?
-
-HIP is C++ runtime API that supports C style applications as well.
-
-Some C style applications (and interfaces to other languages (FORTRAN, Python)) would call certain HIP APIs but not use kernel programming.
-They can be compiled with a C compiler and run correctly, however, small details must be considered in the code. For example, initialization, as shown in the simple application below, uses HIP structs dim3 with the file name "test.hip.cpp"
-
-```cpp
-#include "hip/hip_runtime_api.h"
-#include "stdio.h"
-
-int main(int argc, char** argv) {
-  dim3 grid1;
-  printf("dim3 grid1; x=%d, y=%d, z=%d\n",grid1.x,grid1.y,grid1.z);
-  dim3 grid2 = {1,1,1};
-  printf("dim3 grid2 = {1,1,1}; x=%d, y=%d, z=%d\n",grid2.x,grid2.y,grid2.z);
-  return 0;
-}
-```
-
-When using a C++ compiler,
-
-```shell
-$ gcc -x c++  $(hipconfig --cpp_config) test3.hip.cpp -o test
-$ ./test
-dim3 grid1; x=1, y=1, z=1
-dim3 grid2 = {1,1,1}; x=1, y=1, z=1
-```
-
-In which "dim3 grid1;" will yield a dim3 grid with all dimensional members x,y,z initialized to 1, as the default constructor behaves that way.
-Further, if written:
-
-```cpp
-dim3 grid(2); // yields {2,1,1}
-dim3 grid(2,3); yields {2,3,1}
-```
-
-In comparison, when using the C compiler,
-
-```shell
-$ gcc -x c $(hipconfig --cpp_config) test.hip.cpp -o test
-$ ./test
-dim3 grid1; x=646881376, y=21975, z=1517277280
-dim3 grid2 = {1,1,1}; x=1, y=1, z=1
-```
-
-In which "dim3 grid;" does not imply any initialization, no constructor is called, and dimensional values x,y,z of grid are undefined.
-NOTE: To get the C++ default behavior, C programmers must additionally specify the right-hand side as shown below,
-
-```cpp
-dim3 grid = {1,1,1}; // initialized as in C++
-```
-
-## Can I install both CUDA SDK and HIP-Clang on the same machine?
-
-Yes. You can use HIP_PLATFORM to choose which path hipcc targets. This configuration can be useful when using HIP to develop an application which is portable to both AMD and NVIDIA.
-
-## HIP detected my platform (HIP-Clang vs NVCC) incorrectly * what should I do?
-
-HIP will set the platform to AMD and use HIP-Clang as compiler if it sees that the AMD graphics driver is installed and has detected an AMD GPU.
-Sometimes this isn't what you want * you can force HIP to recognize the platform by setting the following,
-
-```shell
-export HIP_PLATFORM=amd
-```
-
-HIP then set and use correct AMD compiler and runtime,
-HIP_COMPILER=clang
-HIP_RUNTIME=rocclr
-
-To choose NVIDIA platform, you can set,
-
-```shell
-export HIP_PLATFORM=nvidia
-```
-
-In this case, HIP will set and use the following,
-
-```shell
-HIP_COMPILER=cuda
-HIP_RUNTIME=nvcc
-```
-
-One symptom of this problem is the message "error: 'unknown error'(11) at `square.hipref.cpp:56`. This can occur if you have a CUDA installation on an AMD platform, and HIP incorrectly detects the platform as NVCC. HIP may be able to compile the application using the NVCC tool-chain but will generate this error at runtime since the platform does not have a CUDA device.
-
-## On CUDA, can I mix CUDA code with HIP code?
-
-Yes. Most HIP data structures (`hipStream_t`, `hipEvent_t`) are typedefs to CUDA equivalents and can be intermixed. Both CUDA and HIP use integer device ids.
-One notable exception is that `hipError_t` is a new type, and cannot be used where a `cudaError_t` is expected. In these cases, refactor the code to remove the expectation. Alternatively, hip_runtime_api.h defines functions which convert between the error code spaces:
-
-`hipErrorToCudaError`
-`hipCUDAErrorTohipError`
-`hipCUResultTohipError`
-
-If platform portability is important, use `#ifdef __HIP_PLATFORM_NVIDIA__` to guard the CUDA-specific code.
-
-## How do I trace HIP application flow?
-
-See {doc}`/how-to/logging` for more information.
-
-## What are the maximum limits of kernel launch parameters?
-
-Product of block.x, block.y, and block.z should be less than 1024.
-Please note, HIP does not support kernel launch with total work items defined in dimension with size `gridDim x blockDim >= 2^32`, so `gridDim.x * blockDim.x, gridDim.y * blockDim.y and gridDim.z * blockDim.z` are always less than 2^32.
-
-## Are ``__shfl_*_sync`` functions supported on HIP platform?
-
-``__shfl_*_sync`` is not supported on HIP but for NVCC path CUDA 9.0 and above all shuffle calls get redirected to it's sync version.
-
-## How to create a guard for code that is specific to the host or the GPU?
-
-The compiler defines the `__HIP_DEVICE_COMPILE__` macro only when compiling the code for the GPU. It could be used to guard code that is specific to the host or the GPU.
-
-## Why _OpenMP is undefined when compiling with `-fopenmp`?
-
-When compiling an OpenMP source file with `hipcc -fopenmp`, the compiler may generate error if there is a reference to the `_OPENMP` macro. This is due to a limitation in hipcc that treats any source file type (for example `.cpp`) as an HIP translation unit leading to some conflicts with the OpenMP language switch. If the OpenMP source file doesn't contain any HIP language constructs you could work around this issue by adding the `-x c++` switch to force the compiler to treat the file as regular C++. Another approach would be to guard the OpenMP code with `#ifdef _OPENMP` so that the code block is disabled when compiling for the GPU. The `__HIP_DEVICE_COMPILE__` macro defined by the HIP compiler when compiling GPU code could also be used for guarding code paths specific to the host or the GPU.
-
-## Does the HIP-Clang compiler support extern shared declarations?
-
-Previously, it was essential to declare dynamic shared memory using the HIP_DYNAMIC_SHARED macro for accuracy, as using static shared memory in the same kernel could result in overlapping memory ranges and data-races.
-
-Now, the HIP-Clang compiler provides support for extern shared declarations, and the HIP_DYNAMIC_SHARED option is no longer required. You may use the standard extern definition:
-extern __shared__ type var[];
-
-## I have multiple HIP enabled devices and I am getting an error code `hipErrorSharedObjectInitFailed` with the message "Error: shared object initialization failed"?
-
-This error message is seen due to the fact that you do not have valid code object for all of your devices.
-
-If you have compiled the application yourself, make sure you have given the correct device name(s) and its features via: `--offload-arch`. If you are not mentioning the `--offload-arch`, make sure that `hipcc` is using the correct offload arch by verifying the hipcc output generated by setting the environment variable `HIPCC_VERBOSE=1`.
-
-If you have a precompiled application/library (like rocblas, TensorFlow etc) which gives you such error, there are one of two possibilities.
-
-* The application/library does not ship code object bundles for __all__ of your device(s): in this case you need to recompile the application/library yourself with correct `--offload-arch`.
-* The application/library does not ship code object bundles for __some__ of your device(s), for example you have a system with an APU + GPU and the library does not ship code objects for your APU. For this you can set the environment variable `HIP_VISIBLE_DEVICES` or `CUDA_VISIBLE_DEVICES` on NVIDIA platform, to only enable GPUs for which code object is available. This will limit the GPUs visible to your application and allow it to run.
-
-Note: In previous releases, the error code is `hipErrorNoBinaryForGpu` with message "Unable to find code object for all current devices".
-The error code handling behavior is changed. HIP runtime shows the error code `hipErrorSharedObjectInitFailed` with message "Error: shared object initialization failed" on unsupported GPU.
-
-## How to use per-thread default stream in HIP?
-
-The per-thread default stream is an implicit stream local to both the thread and the current device. It does not do any implicit synchronization with other streams (like explicitly created streams), or default per-thread stream on other threads.
-
-The per-thread default stream is a blocking stream and will synchronize with the default null stream if both are used in a program.
-
-In ROCm, a compilation option should be added in order to compile the translation unit with per-thread default stream enabled.
-`-fgpu-default-stream=per-thread`.
-Once source is compiled with per-thread default stream enabled, all APIs will be executed on per thread default stream, hence there will not be any implicit synchronization with other streams.
-
-Besides, per-thread default stream be enabled per translation unit, users can compile some files with feature enabled and some with feature disabled. Feature enabled translation unit will have default stream as per thread and there will not be any implicit synchronization done but other modules will have legacy default stream which will do implicit synchronization.
-
-## How to use complex multiplication and division operations?
-
-In HIP, `hipFloatComplex` and `hipDoubleComplex` are defined as complex data types,
-
-```c++
-typedef float2 hipFloatComplex;
-typedef double2 hipDoubleComplex;
-```
-
-Any application uses complex multiplication and division operations, need to replace '*' and '/' operators with the following,
-
-* `hipCmulf()` and `hipCdivf()` for `hipFloatComplex`
-* `hipCmul()` and `hipCdiv()` for `hipDoubleComplex`
-
-Note: These complex operations are equivalent to corresponding types/functions on the NVIDIA platform.
-
-## Can I develop applications with HIP APIs on Windows the same on Linux?
-
-Yes, HIP APIs are available to use on both Linux and Windows.
-Due to different working mechanisms on operating systems like Windows vs Linux, HIP APIs call corresponding lower level backend runtime libraries and kernel drivers for the OS, in order to control the executions on GPU hardware accordingly. There might be a few differences on the related backend software and driver support, which might affect usage of HIP APIs. See OS support details in HIP API document.
-
-## Does HIP support LUID?
-
-Starting ROCm 6.0, HIP runtime supports Locally Unique Identifier (LUID).
-This feature enables the local physical device(s) to interoperate with other devices. For example, DirectX 12.
-
-HIP runtime sets device LUID properties so the driver can query LUID to identify each device for interoperability.
-
-Note: HIP supports LUID only on Windows OS.
-
-## How can I know the version of HIP?
-
-HIP version definition has been updated since ROCm 4.2 release as the following:
-
-```cpp
-HIP_VERSION=HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH
-```
-
-HIP version can be queried from HIP API call,
-
-```cpp
-hipRuntimeGetVersion(&runtimeVersion);
-```
-
-The version returned will always be greater than the versions in previous ROCm releases.
-
-Note: The version definition of HIP runtime is different from CUDA. On AMD platform, the function returns HIP runtime version, while on NVIDIA platform, it returns CUDA runtime version. And there is no mapping/correlation between HIP version and CUDA version.
@@ -8,14 +8,26 @@
 Porting CUDA driver API
 *******************************************************************************

-NVIDIA provides separate CUDA driver and runtime APIs. The two APIs have significant overlap in functionality:
+NVIDIA provides separate CUDA driver and runtime APIs. The two APIs have
+significant overlap in functionality:
+
+* Both APIs support events, streams, memory management, memory copy, and error
+  handling.

-* Both APIs support events, streams, memory management, memory copy, and error handling.
 * Both APIs deliver similar performance.
-* Driver API calls begin with the prefix ``cu``, while runtime API calls begin with the prefix ``cuda``. For example, the driver API contains ``cuEventCreate``, while the runtime API contains ``cudaEventCreate``, which has similar functionality.
-* The driver API defines a different, but largely overlapping, error code space than the runtime API and uses a different coding convention. For example, the driver API defines ``CUDA_ERROR_INVALID_VALUE``, while the runtime API defines ``cudaErrorInvalidValue``.

-The driver API offers two additional functionalities not provided by the runtime API: ``cuModule`` and ``cuCtx`` APIs.
+* Driver API calls begin with the prefix ``cu``, while runtime API calls begin
+  with the prefix ``cuda``. For example, the driver API contains
+  ``cuEventCreate``, while the runtime API contains ``cudaEventCreate``, which
+  has similar functionality.
+
+* The driver API defines a different, but largely overlapping, error code space
+  than the runtime API and uses a different coding convention. For example, the
+  driver API defines ``CUDA_ERROR_INVALID_VALUE``, while the runtime API defines
+  ``cudaErrorInvalidValue``.
+
+The driver API offers two additional functionalities not provided by the runtime
+API: ``cuModule`` and ``cuCtx`` APIs.

 cuModule API
 ================================================================================
@@ -345,7 +357,7 @@ The sample below shows how to use ``hipModuleGetFunction``.
 HIP module and texture Driver API
 ================================================================================

-HIP supports texture driver APIs. However, texture references must be declared 
+HIP supports texture driver APIs. However, texture references must be declared
 within the host scope. The following code demonstrates the use of texture
 references for the ``__HIP_PLATFORM_AMD__`` platform.

@@ -111,10 +111,10 @@ Most CUDA libraries have a corresponding ROCm library with similar functionality

 All HIP projects target either AMD or NVIDIA platform. The platform affects which headers are included and which libraries are used for linking.

-* `HIP_PLATFORM_AMD` is defined if the HIP platform targets AMD.
-Note, `HIP_PLATFORM_HCC` was previously defined if the HIP platform targeted AMD, it is deprecated.
-* `HIP_PLATFORM_NVDIA` is defined if the HIP platform targets NVIDIA.
-Note, `HIP_PLATFORM_NVCC` was previously defined if the HIP platform targeted NVIDIA, it is deprecated.
+* `__HIP_PLATFORM_AMD__` is defined if the HIP platform targets AMD.
+Note, `__HIP_PLATFORM_HCC__` was previously defined if the HIP platform targeted AMD, it is deprecated.
+* `__HIP_PLATFORM_NVIDIA__` is defined if the HIP platform targets NVIDIA.
+Note, `__HIP_PLATFORM_NVCC__` was previously defined if the HIP platform targeted NVIDIA, it is deprecated.

 ### Identifying the Compiler: hip-clang or NVCC

@@ -257,7 +257,14 @@ ROCclr is a virtual device interface that HIP runtimes interact with different b

 * NVIDIA platform
 On NVIDIA platform, HIP is just a thin layer on top of CUDA.
-On non-AMD platform, HIP runtime determines if CUDA is available and can be used. If available, HIP_PLATFORM is set to `nvidia` and underneath CUDA path is used.
+
+The environment variable `HIP_PLATFORM` specifies the runtime to use. The
+platform is detected automatically by HIP. When an AMD graphics driver and an
+AMD GPU are detected, `HIP_PLATFORM` is set to `amd`. If both runtimes are
+installed, and a specific one should be used, or HIP can't detect the runtime,
+setting the environment variable manually tells `hipcc` what compilation path to
+choose. To use the CUDA compilation path, set the environment variable to
+`HIP_PLATFORM=nvidia`.

 ## `hipLaunchKernelGGL`

@@ -0,0 +1,48 @@
+.. meta::
+  :description: HIP runtime API usage
+  :keywords: AMD, ROCm, HIP, CUDA, HIP runtime API How to
+
+.. _hip_runtime_api_how-to:
+
+********************************************************************************
+Using HIP runtime API
+********************************************************************************
+
+The HIP runtime API provides C and C++ functionalities to manage events, streams,
+and memory on GPUs. On the AMD platform, the HIP runtime uses
+:doc:`Compute Language Runtime (CLR) <../understand/amd_clr>`, while on the NVIDIA
+CUDA platform, it is only a thin layer over the CUDA runtime or Driver API.
+
+- **CLR** contains source code for AMD's compute language runtimes: ``HIP`` and
+  ``OpenCL™``. CLR includes the ``HIP`` implementation on the AMD
+  platform: `hipamd <https://github.com/ROCm/clr/tree/develop/hipamd>`_ and the
+  ROCm Compute Language Runtime (``rocclr``). ``rocclr`` is a
+  virtual device interface that enables the HIP runtime to interact with
+  different backends such as :doc:`ROCr <rocr-runtime:index>` on Linux or PAL on
+  Windows. CLR also includes the `OpenCL runtime <https://github.com/ROCm/clr/tree/develop/opencl>`_
+  implementation.
+- The **CUDA runtime** is built on top of the CUDA driver API, which is a C API
+  with lower-level access to NVIDIA GPUs. For details about the CUDA driver and
+  runtime API with reference to HIP, see :doc:`CUDA driver API porting guide <../how-to/hip_porting_driver_api>`.
+
+The backends of the HIP runtime API on the AMD and NVIDIA platforms are
+summarized in the following figure:
+
+.. figure:: ../data/how-to/hip_runtime_api/runtimes.svg
+
+.. note::
+
+  On the NVIDIA platform, the HIP runtime API calls the CUDA runtime or CUDA
+  driver via the hipother interface. For more information, see the `hipother repository <https://github.com/ROCm/hipother>`_.
+
+Here are the various HIP Runtime API high-level functions:
+
+* :doc:`./hip_runtime_api/initialization`
+* :doc:`./hip_runtime_api/memory_management`
+* :doc:`./hip_runtime_api/error_handling`
+* :doc:`./hip_runtime_api/cooperative_groups`
+* :doc:`./hip_runtime_api/hipgraph`
+* :doc:`./hip_runtime_api/call_stack`
+* :doc:`./hip_runtime_api/multi_device`
+* :doc:`./hip_runtime_api/opengl_interop`
+* :doc:`./hip_runtime_api/external_interop`
@@ -0,0 +1,129 @@
+.. meta::
+    :description: This page describes the call stack concept in HIP
+    :keywords: AMD, ROCm, HIP, call stack
+
+*******************************************************************************
+Call stack
+*******************************************************************************
+
+The call stack is a data structure for managing function calls, by saving the
+state of the current function. Each time a function is called, a new call frame
+is added to the top of the stack, containing information such as local
+variables, return addresses and function parameters. When the function
+execution completes, the frame is removed from the stack and loaded back into
+the corresponding registers. This concept allows the program to return to the
+calling function and continue execution from where it left off.
+
+The call stack for each thread must track its function calls, local variables,
+and return addresses. However, in GPU programming, the memory required to store
+the call stack increases due to the parallelism inherent to the GPUs. NVIDIA
+and AMD GPUs use different approaches. NVIDIA GPUs have the independent thread
+scheduling feature where each thread has its own call stack and effective
+program counter. On AMD GPUs threads are grouped; each warp has its own call
+stack and program counter. Warps are described and explained in the
+:ref:`inherent_thread_hierarchy`.
+
+If a thread or warp exceeds its stack size, a stack overflow occurs, causing
+kernel failure. This can be detected using debuggers.
+
+Call stack management with HIP
+===============================================================================
+
+You can adjust the call stack size as shown in the following example, allowing
+fine-tuning based on specific kernel requirements. This helps prevent stack
+overflow errors by ensuring sufficient stack memory is allocated.
+
+.. code-block:: cpp
+
+    #include <hip/hip_runtime.h>
+    #include <iostream>
+
+    #define HIP_CHECK(expression)                \
+    {                                            \
+        const hipError_t status = expression;    \
+        if(status != hipSuccess){                \
+                std::cerr << "HIP error "        \
+                    << status << ": "            \
+                    << hipGetErrorString(status) \
+                    << " at " << __FILE__ << ":" \
+                    << __LINE__ << std::endl;    \
+        }                                        \
+    }
+
+    int main()
+    {
+        size_t stackSize;
+        HIP_CHECK(hipDeviceGetLimit(&stackSize, hipLimitStackSize));
+        std::cout << "Default stack size: " << stackSize << " bytes" << std::endl;
+
+        // Set a new stack size
+        size_t newStackSize = 1024 * 8; // 8 KiB
+        HIP_CHECK(hipDeviceSetLimit(hipLimitStackSize, newStackSize));
+
+        HIP_CHECK(hipDeviceGetLimit(&stackSize, hipLimitStackSize));
+        std::cout << "Updated stack size: " << stackSize << " bytes" << std::endl;
+
+        return 0;
+    }
+
+Depending on the GPU model, at full occupancy, the call stack can consume a
+significant amount of memory. For instance, an MI300X with 304 compute units (CU)
+and up to 2048 threads per CU could use 304 · 2048 · 1024 bytes = 608 MiB for the
+call stack by default.
+
+Handling recursion and deep function calls
+-------------------------------------------------------------------------------
+
+Similar to CPU programming, recursive functions and deeply nested function
+calls are supported. However, developers must ensure that these functions do
+not exceed the available stack memory, considering the huge amount of memory
+needed for the call stack due to the GPU's inherent parallelism. This can be
+achieved by increasing the stack size or optimizing code to reduce stack usage.
+To detect stack overflow, add proper error handling or use debugging tools.
+
+.. code-block:: cpp
+
+    #include <hip/hip_runtime.h>
+    #include <iostream>
+
+    #define HIP_CHECK(expression)                \
+    {                                            \
+        const hipError_t status = expression;    \
+        if(status != hipSuccess){                \
+                std::cerr << "HIP error "        \
+                    << status << ": "            \
+                    << hipGetErrorString(status) \
+                    << " at " << __FILE__ << ":" \
+                    << __LINE__ << std::endl;    \
+        }                                        \
+    }
+
+    __device__ unsigned long long fibonacci(unsigned long long n)
+    {
+        if (n == 0 || n == 1)
+        {
+            return n;
+        }
+        return fibonacci(n - 1) + fibonacci(n - 2);
+    }
+
+    __global__ void kernel(unsigned long long n)
+    {
+        unsigned long long result = fibonacci(n);
+        const size_t x = threadIdx.x + blockDim.x * blockIdx.x;
+
+        if (x == 0)
+            printf("fibonacci(%llu) = %llu \n", n, result);
+    }
+
+    int main()
+    {
+        kernel<<<1, 1>>>(10);
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // With -O0 optimization option hit the stack limit
+        // kernel<<<1, 256>>>(2048);
+        // HIP_CHECK(hipDeviceSynchronize());
+
+        return 0;
+    }
@@ -8,9 +8,16 @@
 Cooperative groups
 *******************************************************************************

-Cooperative groups API is an extension to the HIP programming model, which provides developers with a flexible, dynamic grouping mechanism for the communicating threads. Cooperative groups let you define your own set of thread groups which may fit your user-cases better than those defined by the hardware. This lets you specify the level of granularity for thread communication which can lead to more efficient parallel decompositions.
+The cooperative groups API is an extension to the HIP programming model, which
+provides developers with a flexible, dynamic grouping mechanism for the
+communicating threads. Cooperative groups let you define your own set of thread
+groups which may fit your use-cases better than those defined by the hardware.
+This lets you specify the level of granularity for thread communication which
+can lead to more efficient parallel decompositions.

-The API is accessible in the ``cooperative_groups`` namespace after the  ``hip_cooperative_groups.h`` is included. The header contains the following  elements:
+The API is accessible in the ``cooperative_groups`` namespace after the
+``hip_cooperative_groups.h`` header is included. The header contains the following
+elements:

 * Static functions to create groups and subgroups.
 * Hardware-accelerated operations over the whole group, like shuffles.
@@ -19,13 +26,13 @@ The API is accessible in the ``cooperative_groups`` namespace after the  ``hip_c
 * Get group properties member functions.

 Cooperative groups thread model
-===============================
+================================================================================

-The thread hierarchy abstraction of cooperative groups are in :ref:`grid hierarchy <coop_thread_top_hierarchy>` and :ref:`block hierarchy <coop_thread_bottom_hierarchy>`.
+The thread hierarchy abstractions of cooperative groups are depicted in the following figures: :ref:`grid hierarchy <coop_thread_top_hierarchy>` and :ref:`block hierarchy <coop_thread_bottom_hierarchy>`.

 .. _coop_thread_top_hierarchy:

-.. figure:: ../data/how-to/cooperative_groups/thread_hierarchy_coop_top.svg
+.. figure:: ../../data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_top.svg
  :alt: Diagram depicting nested rectangles of varying color. The outermost one
        titled "Grid", inside sets of different sized rectangles layered on
        one another titled "Block". Each "Block" containing sets of uniform
@@ -34,11 +41,16 @@ The thread hierarchy abstraction of cooperative groups are in :ref:`grid hierarc

  Cooperative group thread hierarchy in grids.

-The **multi grid** is an abstraction of potentially multiple simultaneous launches of the same kernel over multiple devices (Deprecated since 5.0). The **grid** in cooperative groups is a single dispatch of kernels for execution like the original grid. 
+The **multi grid** is an abstraction of potentially multiple simultaneous
+launches of the same kernel over multiple devices. The **grid** in cooperative
+groups is a single dispatch of kernels for execution like the original grid.

 .. note::

-  The ability to synchronize over a grid or multi grid requires the kernel to be launched using the specific cooperative groups API.
+  * The ability to synchronize over a grid or multi grid requires the kernel to
+    be launched using the specific cooperative groups API.
+
+  * Multi grid has been deprecated since ROCm 5.0.

 The **block** is the same as the :ref:`inherent_thread_model` block entity.

@@ -48,7 +60,7 @@ The **block** is the same as the :ref:`inherent_thread_model` block entity.

 .. _coop_thread_bottom_hierarchy:

-.. figure:: ../data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.svg
+.. figure:: ../../data/how-to/hip_runtime_api/cooperative_groups/thread_hierarchy_coop_bottom.svg
  :alt: The new level between block thread and threads.

  Cooperative group thread hierarchy in blocks.
@@ -156,7 +168,7 @@ Threads (64 threads on CDNA and 32 threads on RDNA) in a warp cannot execute dif

 .. note::

-  The NVIDIA GPU's independent thread scheduling presents the appearance that threads on different branches execute concurrently. 
+  The NVIDIA GPU's independent thread scheduling presents the appearance that threads on different branches execute concurrently.

 .. warning::

@@ -378,8 +390,8 @@ With each group type, the synchronization requires using the correct cooperative
                                     dim3(threads_per_block),
                                     0,
                                     hipStreamDefault,
-                                     &d_vector, 
-                                     &d_block_reduced, 
+                                     &d_vector,
+                                     &d_block_reduced,
                                     &d_partition_reduced));

  .. tab-item:: Grid
@@ -0,0 +1,136 @@
+.. meta::
+   :description: Error Handling
+   :keywords: AMD, ROCm, HIP, error handling, error
+
+.. _error_handling:
+
+********************************************************************************
+Error handling
+********************************************************************************
+
+HIP provides functionality to detect, report, and manage errors that occur
+during the execution of HIP runtime functions or when launching kernels. Every
+HIP runtime function, apart from launching kernels, has :cpp:type:`hipError_t`
+as return type. :cpp:func:`hipGetLastError` and :cpp:func:`hipPeekAtLastError`
+can be used for catching errors from kernel launches, as kernel launches don't
+return an error directly. HIP maintains an internal state, that includes the
+last error code. :cpp:func:`hipGetLastError` returns and resets that error to
+``hipSuccess``, while :cpp:func:`hipPeekAtLastError` just returns the error
+without changing it. To get a human readable version of the errors,
+:cpp:func:`hipGetErrorString` and :cpp:func:`hipGetErrorName` can be used.
+
+.. note::
+
+    :cpp:func:`hipGetLastError` returns the returned error code of the last HIP
+    runtime API call even if it's ``hipSuccess``, while ``cudaGetLastError``
+    returns the error returned by any of the preceding CUDA APIs in the same
+    host thread. :cpp:func:`hipGetLastError` behavior will be matched with
+    ``cudaGetLastError`` in ROCm release 7.0.
+
+Best practices of HIP error handling:
+
+1. Check errors after each API call - Avoid error propagation.
+2. Use macros for error checking - Check :ref:`hip_check_macros`.
+3. Handle errors gracefully - Free resources and provide meaningful error
+   messages to the user.
+
+For more details on the error handling functions, see :ref:`error handling
+functions reference page <error_handling_reference>`.
+
+.. _hip_check_macros:
+
+HIP check macros
+================================================================================
+
+HIP uses check macros to simplify error checking and reduce code duplication.
+The ``HIP_CHECK`` macro is mainly used to detect and report errors. It can
+also exit from the application with an ``exit(1);`` function call after
+printing the error. The following is an example ``HIP_CHECK`` macro:
+
+.. code-block:: cpp
+
+  #define HIP_CHECK(expression)                  \
+  {                                              \
+      const hipError_t status = expression;      \
+      if(status != hipSuccess){                  \
+          std::cerr << "HIP error "              \
+                    << status << ": "            \
+                    << hipGetErrorString(status) \
+                    << " at " << __FILE__ << ":" \
+                    << __LINE__ << std::endl;    \
+      }                                          \
+  }
+
+Complete example
+================================================================================
+
+A complete example to demonstrate the error handling with a simple addition of
+two values kernel:
+
+.. code-block:: cpp
+
+  #include <hip/hip_runtime.h>
+  #include <vector>
+  #include <iostream>
+
+  #define HIP_CHECK(expression)                  \
+  {                                              \
+      const hipError_t status = expression;      \
+      if(status != hipSuccess){                  \
+          std::cerr << "HIP error "              \
+                    << status << ": "            \
+                    << hipGetErrorString(status) \
+                    << " at " << __FILE__ << ":" \
+                    << __LINE__ << std::endl;    \
+      }                                          \
+  }
+
+  // Addition of two values.
+  __global__ void add(int *a, int *b, int *c, size_t size) {
+      const size_t index = threadIdx.x + blockDim.x * blockIdx.x;
+      if(index < size) {
+          c[index] = a[index] + b[index];
+      }
+  }
+
+  int main() {
+      constexpr int numOfBlocks = 256;
+      constexpr int threadsPerBlock = 256;
+      constexpr size_t arraySize = 1U << 16;
+
+      std::vector<int> a(arraySize), b(arraySize), c(arraySize);
+      int *d_a, *d_b, *d_c;
+
+      // Setup input values.
+      std::fill(a.begin(), a.end(), 1);
+      std::fill(b.begin(), b.end(), 2);
+
+      // Allocate device copies of a, b and c.
+      HIP_CHECK(hipMalloc(&d_a, arraySize * sizeof(*d_a)));
+      HIP_CHECK(hipMalloc(&d_b, arraySize * sizeof(*d_b)));
+      HIP_CHECK(hipMalloc(&d_c, arraySize * sizeof(*d_c)));
+
+      // Copy input values to device.
+      HIP_CHECK(hipMemcpy(d_a, a.data(), arraySize * sizeof(*d_a), hipMemcpyHostToDevice));
+      HIP_CHECK(hipMemcpy(d_b, b.data(), arraySize * sizeof(*d_b), hipMemcpyHostToDevice));
+
+      // Launch add() kernel on GPU.
+      hipLaunchKernelGGL(add, dim3(numOfBlocks), dim3(threadsPerBlock), 0, 0, d_a, d_b, d_c, arraySize);
+      // Check the kernel launch
+      HIP_CHECK(hipGetLastError());
+      // Check for kernel execution error
+      HIP_CHECK(hipDeviceSynchronize());
+
+      // Copy the result back to the host.
+      HIP_CHECK(hipMemcpy(c.data(), d_c, arraySize * sizeof(*d_c), hipMemcpyDeviceToHost));
+
+      // Cleanup allocated memory.
+      HIP_CHECK(hipFree(d_a));
+      HIP_CHECK(hipFree(d_b));
+      HIP_CHECK(hipFree(d_c));
+
+      // Print the result.
+      std::cout << a[0] << " + " << b[0] << " = " << c[0] << std::endl;
+
+      return 0;
+  }
@@ -0,0 +1,140 @@
+.. meta::
+   :description: HIP provides an external resource interoperability API that
+                 allows efficient sharing of resources, such as memory and
+                 semaphores, between HIP and other APIs like Vulkan.
+   :keywords: AMD, ROCm, HIP, external, interop, interoperability
+
+*******************************************************************************
+External resource interoperability
+*******************************************************************************
+
+This feature allows HIP to work with resources -- like memory and semaphores --
+created by other APIs. This means resources can be used from APIs like CUDA,
+OpenCL and Vulkan within HIP, making it easier to integrate HIP into existing
+projects.
+
+To use external resources in HIP, you typically follow these steps:
+
+- Import resources from other APIs using HIP provided functions
+- Use external resources as if they were created in HIP
+- Destroy the HIP resource object to clean up
+
+Semaphore Functions
+===============================================================================
+
+Semaphore functions are essential for synchronization in parallel computing.
+These functions facilitate communication and coordination between different
+parts of a program or between different programs. By managing semaphores, tasks
+are executed in the correct order, and resources are utilized effectively.
+Semaphore functions ensure smooth operation by preventing conflicts between
+concurrent processes, which upholds both their integrity and their
+performance.
+
+External semaphore functions can be used in HIP as described in :ref:`external_resource_interoperability_reference`.
+
+Memory Functions
+===============================================================================
+
+HIP external memory functions focus on the efficient sharing and management of
+memory resources. These functions enable importing memory created by external
+systems, enabling the HIP program to use this memory seamlessly. Memory
+functions include mapping memory for effective use and ensuring proper cleanup
+to prevent resource leaks. This is critical for performance, particularly in
+applications handling large datasets or complex structures such as textures in
+graphics. Proper memory management ensures stability and efficient resource
+utilization.
+
+Example
+===============================================================================
+
+ROCm examples include a
+`HIP--Vulkan interoperation example <https://github.com/ROCm/rocm-examples/tree/develop/HIP-Basic/vulkan_interop>`_
+that demonstrates how to perform interoperation between HIP and Vulkan.
+
+In this example, a simple HIP kernel is used to compute a sine wave, which is
+then rendered to a window as a graphical output using Vulkan. The process
+requires several initialization steps, such as setting up a HIP context,
+creating a Vulkan instance, and configuring the GPU device and queue. After
+these initial steps, the kernel executes the sine wave computation, and Vulkan
+continuously updates the window framebuffer to display the computed data until
+the window is closed.
+
+The following code converts a Vulkan memory handle to its equivalent HIP
+handle. The input ``VkDeviceMemory`` and the created HIP memory represent the
+same physical area of GPU memory, through the handles of each respective API.
+Writing to the buffer in one API will allow us to read the results through the
+other. Note that access to the buffer should be synchronized between the APIs,
+for example using queue syncs or semaphores.
+
+.. <!-- spellcheck-disable -->
+
+.. literalinclude:: ../../tools/example_codes/external_interop.hip
+   :start-after: // [Sphinx vulkan memory to hip start]
+   :end-before: // [Sphinx vulkan memory to hip end]
+   :language: cpp
+
+.. <!-- spellcheck-enable -->
+
+The following example shows how the Vulkan semaphore is converted to a HIP
+semaphore. Signaling on the semaphore in one API will allow the other API to wait
+on it, which is how we can guarantee synchronized access to resources in a
+cross-API manner.
+
+.. <!-- spellcheck-disable -->
+
+.. literalinclude:: ../../tools/example_codes/external_interop.hip
+   :start-after: // [Sphinx semaphore import start]
+   :end-before: // [Sphinx semaphore import end]
+   :language: cpp
+
+.. <!-- spellcheck-enable -->
+
+When the HIP external memory is exported from Vulkan and imported to HIP, it is
+not yet ready for use. The Vulkan handle is shared, allowing for memory sharing
+rather than copying during the export process. To actually use the memory, we
+need to map it to a pointer that can be passed to the kernel, so that the memory
+can be read from and written to. The external memory is mapped to HIP in the
+following example:
+
+.. <!-- spellcheck-disable -->
+
+.. literalinclude:: ../../tools/example_codes/external_interop.hip
+   :start-after: // [Sphinx map external memory start]
+   :end-before: // [Sphinx map external memory end]
+   :language: cpp
+
+.. <!-- spellcheck-enable -->
+
+Wait until the buffer is ready and no longer being modified on the Vulkan side:
+
+.. <!-- spellcheck-disable -->
+
+.. literalinclude:: ../../tools/example_codes/external_interop.hip
+   :start-after: // [Sphinx wait semaphore start]
+   :end-before: // [Sphinx wait semaphore end]
+   :language: cpp
+
+.. <!-- spellcheck-enable -->
+
+The sinewave kernel implementation:
+
+.. <!-- spellcheck-disable -->
+
+.. literalinclude:: ../../tools/example_codes/external_interop.hip
+   :start-after: [Sphinx sinewave kernel start]
+   :end-before: // [Sphinx sinewave kernel end]
+   :language: cpp
+
+.. <!-- spellcheck-enable -->
+
+Signal to Vulkan that we are done with the buffer and that it can proceed with
+rendering:
+
+.. <!-- spellcheck-disable -->
+
+.. literalinclude:: ../../tools/example_codes/external_interop.hip
+   :start-after: // [Sphinx signal semaphore start]
+   :end-before: // [Sphinx signal semaphore end]
+   :language: cpp
+
+.. <!-- spellcheck-enable -->
@@ -12,7 +12,7 @@ HIP graphs
    The HIP graph API is currently in Beta. Some features can change and might
    have outstanding issues. Not all features supported by CUDA graphs are yet
    supported. For a list of all currently supported functions see the
-    :doc:`HIP graph API documentation<../doxygen/html/group___graph>`.
+    :ref:`HIP graph API documentation<graph_management_reference>`.

 HIP graphs are an alternative way of executing tasks on a GPU that can provide
 performance benefits over launching kernels using the standard
@@ -35,7 +35,7 @@ The nodes can be one of the following:

 The following figure visualizes the concept of graphs, compared to using streams.

-.. figure:: ../data/how-to/hipgraph/hip_graph.svg
+.. figure:: ../../data/how-to/hip_runtime_api/hipgraph/hip_graph.svg
    :alt: Diagram depicting the difference between using streams to execute
          kernels with dependencies, resolved by explicitly synchronizing,
          or using graphs, where the edges denote the dependencies.
@@ -56,7 +56,7 @@ HIP runtime takes care of executing the operations within the graph.
 Graphs can provide additional performance benefits, by enabling optimizations
 that are only possible when knowing the dependencies between the operations.

-.. figure:: ../data/how-to/hipgraph/hip_graph_speedup.svg
+.. figure:: ../../data/how-to/hip_runtime_api/hipgraph/hip_graph_speedup.svg
    :alt: Diagram depicting the speed up achievable with HIP graphs compared to
          HIP streams when launching many short-running kernels.

@@ -316,11 +316,11 @@ edges of the graph, thereby forming the graph structure.
 The nodes are represented by the generic :cpp:type:`hipGraphNode_t` type. The actual
 node type is implicitly defined by the specific function used to add the node to
 the graph, for example :cpp:func:`hipGraphAddKernelNode` See the
-:doc:`HIP graph API documentation<../doxygen/html/group___graph>` for the
+:ref:`HIP graph API documentation<graph_management_reference>` for the
 available functions, they are of type ``hipGraphAdd{Type}Node``. Each type of
 node also has a predefined set of parameters depending on the operation, for
 example :cpp:class:`hipKernelNodeParams` for a kernel launch. See the
-:doc:`documentation for the general hipGraphNodeParams type<../doxygen/html/structhip_graph_node_params>`
+:doc:`documentation for the general hipGraphNodeParams type<../../doxygen/html/structhip_graph_node_params>`
 for a list of available parameter types and their members.

 The general flow for explicitly creating a graph is usually:
@@ -0,0 +1,107 @@
+.. meta::
+   :description: Initialization.
+   :keywords: AMD, ROCm, HIP, initialization
+
+.. _initialization:
+
+********************************************************************************
+Initialization
+********************************************************************************
+
+The initialization involves setting up the environment and resources needed for
+using GPUs. The following steps are covered with the initialization:
+
+- Setting up the HIP runtime
+
+  This includes reading the environment variables set during init, setting up
+  the active or visible devices, loading necessary libraries, setting up
+  internal buffers for memory copies or cooperative launches, initializing the
+  compiler as well as the HSA runtime, and checking for errors due to lack of
+  resources or no active devices.
+
+- Querying and setting GPUs
+
+  Identifying and querying the available GPU devices on the system.
+
+- Setting up contexts
+
+  Creating contexts for each GPU device, which are essential for managing
+  resources and executing kernels. For further details, check the :ref:`context
+  section <context_driver_api>`.
+
+Initialize the HIP runtime
+================================================================================
+
+The HIP runtime is initialized automatically when the first HIP API call is
+made. However, you can explicitly initialize it using :cpp:func:`hipInit`,
+to be able to control the timing of the initialization. The manual
+initialization can be useful to ensure that the GPU is initialized and
+ready, or to isolate GPU initialization time from other parts of
+your program.
+
+.. note::
+
+  You can use :cpp:func:`hipDeviceReset` to delete all streams created, memory
+  allocated, kernels running and events created by the current process. Any new
+  HIP API call initializes the HIP runtime again.
+
+Querying and setting GPUs
+================================================================================
+
+If multiple GPUs are available in the system, you can query and select the
+desired GPU(s) to use based on device properties, such as size of global memory,
+size of shared memory per block, support of cooperative launch and support of
+managed memory.
+
+Querying GPUs
+--------------------------------------------------------------------------------
+
+The properties of a GPU can be queried using :cpp:func:`hipGetDeviceProperties`,
+which returns a struct of :cpp:struct:`hipDeviceProp_t`. The properties in the
+struct can be used to identify a device or give an overview of hardware
+characteristics, that might make one GPU better suited for the task than others.
+
+The :cpp:func:`hipGetDeviceCount` function returns the number of available GPUs,
+which can be used to loop over the available GPUs.
+
+Example code of querying GPUs:
+
+.. code-block:: cpp
+
+  #include <hip/hip_runtime.h>
+  #include <iostream>
+
+  int main() {
+
+      int deviceCount;
+      if (hipGetDeviceCount(&deviceCount) == hipSuccess){
+          for (int i = 0; i < deviceCount; ++i){
+              hipDeviceProp_t prop;
+              if ( hipGetDeviceProperties(&prop, i) == hipSuccess)
+                  std::cout << "Device " << i << ": " << prop.name << std::endl;
+          }
+      }
+
+      return 0;
+  }
+
+Setting the GPU
+--------------------------------------------------------------------------------
+
+The :cpp:func:`hipSetDevice` function selects the GPU to be used for subsequent HIP
+operations. This function performs several key tasks:
+
+- Context Binding
+
+  Binds the current thread to the context of the specified GPU device. This
+  ensures that all subsequent operations are executed on the selected device.
+
+- Resource Allocation
+
+  Prepares the device for resource allocation, such as memory allocation and
+  stream creation.
+
+- Check device availability
+
+  Checks for errors in device selection and returns error if the specified
+  device is not available or not capable of executing HIP operations.
@@ -0,0 +1,52 @@
+.. meta::
+  :description: Memory management and its usage
+  :keywords: AMD, ROCm, HIP, CUDA, memory management
+
+.. _memory_management:
+
+********************************************************************************
+Memory management
+********************************************************************************
+
+Memory management is an important part of the HIP runtime API, when creating
+high-performance applications. Both allocating and copying memory can result in
+bottlenecks, which can significantly impact performance.
+
+The programming model is based on a system with a host and a device, each having
+its own distinct memory. Kernels operate on :ref:`device_memory`, while host functions
+operate on :ref:`host_memory`.
+
+The runtime offers functions for allocating, freeing, and copying device memory,
+along with transferring data between host and device memory.
+
+Here are the various memory management techniques:
+
+* :ref:`coherence_control`
+* :ref:`unified_memory`
+* :ref:`virtual_memory`
+* :ref:`stream_ordered_memory_allocator_how-to`
+
+Memory allocation
+================================================================================
+
+The API calls and the resulting allocations are listed here:
+
+.. list-table:: Memory allocation
+    :header-rows: 1
+    :align: center
+
+    * - API
+      - Data location
+      - Allocation
+    * - System allocated
+      - Host
+      - :ref:`Pageable <pageable_host_memory>`
+    * - :cpp:func:`hipMallocManaged`
+      - Host
+      - :ref:`Managed <unified_memory>`
+    * - :cpp:func:`hipHostMalloc`
+      - Host
+      - :ref:`Pinned <pinned_host_memory>`
+    * - :cpp:func:`hipMalloc`
+      - Device
+      - Pinned
@@ -0,0 +1,178 @@
+.. meta::
+  :description: This chapter describes the coherence control of the HIP
+                ecosystem ROCm software.
+  :keywords: AMD, ROCm, HIP, host memory
+
+.. _coherence_control:
+
+*******************************************************************************
+Coherence control
+*******************************************************************************
+
+Memory coherence describes how memory of a specific part of the system is
+visible to the other parts of the system. For example, how GPU memory is visible
+to the CPU and vice versa. In HIP, host and device memory can be allocated with
+two different types of coherence:
+
+* **Coarse-grained coherence:** The memory is considered up-to-date only after
+  synchronization performed using :cpp:func:`hipDeviceSynchronize`,
+  :cpp:func:`hipStreamSynchronize`, or any blocking operation that acts on the
+  null stream such as :cpp:func:`hipMemcpy`. To prevent the cache from being
+  accessed by a part of the system while simultaneously being written by
+  another, the memory is made visible only after the caches have been flushed.
+
+* **Fine-grained coherence:** The memory is coherent even while being modified
+  by a part of the system. Fine-grained coherence ensures that up-to-date data
+  is visible to others regardless of kernel boundaries. This can be useful if
+  both host and device operate on the same data.
+
+.. note::
+
+  To achieve fine-grained coherence, many AMD GPUs use a limited cache policy,
+  such as leaving these allocations uncached by the GPU or making them read-only.
+
+The MI200 accelerator's hardware-based floating-point instructions work on
+coarse-grained memory regions. Coarse-grained coherence is typically useful in
+reducing host-device interconnect communication.
+
+To check the availability of fine- and coarse-grained memory pools, use
+``rocminfo``:
+
+.. code-block:: sh
+
+  $ rocminfo
+  ...
+  *******
+  Agent 1
+  *******
+  Name:                    AMD EPYC 7742 64-Core Processor
+  ...
+  Pool Info:
+  Pool 1
+  Segment:                 GLOBAL; FLAGS: FINE GRAINED
+  ...
+  Pool 3
+  Segment:                 GLOBAL; FLAGS: COARSE GRAINED
+  ...
+  *******
+  Agent 9
+  *******
+  Name:                    gfx90a
+  ...
+  Pool Info:
+  Pool 1
+  Segment:                 GLOBAL; FLAGS: COARSE GRAINED
+  ...
+
+The APIs, flags and respective memory coherence control are listed in the
+following table:
+
+.. list-table:: Memory coherence control
+    :widths: 25, 35, 20, 20
+    :header-rows: 1
+    :align: center
+
+    * - API
+      - Flag
+      - :cpp:func:`hipMemAdvise` call with argument
+      - Coherence
+    * - ``hipHostMalloc`` :sup:`1`
+      - ``hipHostMallocDefault``
+      -
+      - Fine-grained
+    * - ``hipHostMalloc`` :sup:`1`
+      - ``hipHostMallocNonCoherent``
+      -
+      - Coarse-grained
+    * - ``hipExtMallocWithFlags``
+      - ``hipDeviceMallocDefault``
+      -
+      - Coarse-grained
+    * - ``hipExtMallocWithFlags``
+      - ``hipDeviceMallocFinegrained``
+      -
+      - Fine-grained
+    * - ``hipMallocManaged``
+      -
+      -
+      - Fine-grained
+    * - ``hipMallocManaged``
+      -
+      - ``hipMemAdviseSetCoarseGrain``
+      - Coarse-grained
+    * - ``malloc``
+      -
+      -
+      - Fine-grained
+    * - ``malloc``
+      -
+      - ``hipMemAdviseSetCoarseGrain``
+      - Coarse-grained
+
+:sup:`1` The :cpp:func:`hipHostMalloc` memory allocation coherence mode can be
+affected by the ``HIP_HOST_COHERENT`` environment variable if the
+``hipHostMallocCoherent``, ``hipHostMallocNonCoherent``, and
+``hipHostMallocMapped`` flags are unset. If none of these flags is set and the
+``HIP_HOST_COHERENT`` environment variable is either unset or set to 0, the host
+memory allocation is coarse-grained.
+
+.. note::
+
+  * When ``hipHostMallocMapped`` flag is set, the allocated host memory is
+    fine-grained and the ``hipHostMallocNonCoherent`` flag is ignored.
+  * Setting both the ``hipHostMallocCoherent`` and
+    ``hipHostMallocNonCoherent`` flags leads to an illegal state.
+
+Visibility of synchronization functions
+================================================================================
+
+Fine-grained memory is visible at the synchronization points. However, the
+visibility of coarse-grained memory depends on the synchronization function
+used. The effect and visibility of various synchronization functions on
+fine- and coarse-grained memory types are listed here:
+
+.. list-table:: HIP synchronize functions effect and visibility
+
+    * - HIP API
+      - :cpp:func:`hipStreamSynchronize`
+      - :cpp:func:`hipDeviceSynchronize`
+      - :cpp:func:`hipEventSynchronize`
+      - :cpp:func:`hipStreamWaitEvent`
+    * - Synchronization effect
+      - Host waits for all commands in the specified stream to complete
+      - Host waits for all commands in all streams on the specified device to complete
+      - Host waits for the specified event to complete
+      - Stream waits for the specified event to complete
+    * - Fence
+      - System-scope release
+      - System-scope release
+      - System-scope release
+      - None
+    * - Fine-grained host memory visibility
+      - Yes
+      - Yes
+      - Yes
+      - Yes
+    * - Coarse-grained host memory visibility
+      - Yes
+      - Yes
+      - Depends on the used event.
+      - No
+
+You can control the release scope for ``hipEvents``. By default, the GPU
+performs a device-scope acquire and release operation with each recorded event.
+This makes the host and device memory visible to other commands executing on the
+same device.
+
+You can specify a stronger system-level fence by creating the event with
+:cpp:func:`hipEventCreateWithFlags` and one of the following flags:
+
+* ``hipEventReleaseToSystem``: Performs a system-scope release operation when
+  the event is recorded. This makes both fine-grained and coarse-grained host
+  memory visible to other agents in the system, which might also involve
+  heavyweight operations such as cache flushing. Fine-grained memory typically
+  uses lighter-weight in-kernel synchronization mechanisms such as an atomic
+  operation and thus doesn't need to use ``hipEventReleaseToSystem``.
+
+* ``hipEventDisableTiming``: Events created with this flag don't record
+  profiling data, which significantly improves synchronization performance.
@@ -0,0 +1,52 @@
+.. meta::
+  :description: This chapter describes the device memory of the HIP ecosystem
+                ROCm software.
+  :keywords: AMD, ROCm, HIP, device memory
+
+.. _device_memory:
+
+*******************************************************************************
+Device memory
+*******************************************************************************
+
+Device memory exists on the device, e.g. on GPUs in the video random access
+memory (VRAM), and is accessible by the kernels operating on the device. Recent
+architectures use graphics double data rate (GDDR) synchronous dynamic
+random-access memory (SDRAM) such as GDDR6, or high-bandwidth memory (HBM) such
+as HBM2e. Device memory can be allocated as global, constant, texture, or
+surface memory.
+
+Global memory
+================================================================================
+
+Read-write storage visible to all threads on a given device. There are
+specialized versions of global memory with different usage semantics which are
+typically backed by the same hardware, but can use different caching paths.
+
+Constant memory
+================================================================================
+
+Read-only storage visible to all threads on a given device. It is a limited
+segment backed by device memory with queryable size. It needs to be set by the
+host before kernel execution. Constant memory provides the best performance
+benefit when all threads within a warp access the same address.
+
+Texture memory
+================================================================================
+
+Read-only storage visible to all threads on a given device and accessible
+through additional APIs. It originates from graphics APIs and provides
+performance benefits when accessing memory in a pattern where the
+addresses are close to each other in a 2D representation of the memory.
+
+The :ref:`texture management module <texture_management_reference>` of the HIP
+runtime API reference contains the functions of texture memory.
+
+Surface memory
+================================================================================
+
+A read-write version of texture memory, which can be useful for applications
+that require direct manipulation of 1D, 2D, or 3D ``hipArray_t`` objects.
+
+The :ref:`surface objects module <surface_object_reference>` of the HIP runtime
+API contains the functions for creating, destroying and reading surface memory.
@@ -3,11 +3,13 @@
                ROCm software.
  :keywords: AMD, ROCm, HIP, Texture, Texture Fetching

+.. _texture_fetching:
+
 *******************************************************************************
 Texture fetching
 *******************************************************************************

-`Textures <../doxygen/html/group___texture.html>`_ are more than just a buffer
+`Textures <../../../../doxygen/html/group___texture.html>`_ are more than just a buffer
 interpreted as a 1D, 2D, or 3D array.

 As textures are associated with graphics, they are indexed using floating-point
@@ -32,7 +34,7 @@ sections.
 Here is the sample texture used in this document for demonstration purposes. It
 is 2x2 texels and indexed in the [0 to 1] range.

-.. figure:: ../data/understand/textures/original.png
+.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/original.png
  :width: 150
  :alt: Sample texture
  :align: center
@@ -66,7 +68,7 @@ The following image shows a texture stretched to a 4x4 pixel quad but still
 indexed in the [0 to 1] range. The in-between values are the same as the values
 of the nearest texel.

-.. figure:: ../data/understand/textures/nearest.png
+.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/nearest.png
  :width: 300
  :alt: Texture upscaled with nearest point sampling
  :align: center
@@ -97,7 +99,7 @@ This following image shows a texture stretched out to a 4x4 pixel quad, but
 still indexed in the [0 to 1] range. The in-between values are interpolated
 between the neighboring texels.

-.. figure:: ../data/understand/textures/linear.png
+.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/linear.png
  :width: 300
  :alt: Texture upscaled with linear filtering
  :align: center
@@ -124,7 +126,7 @@ bounds. The border value must be set before texture fetching.
 The following image shows the texture on a 4x4 pixel quad, indexed in the
 [0 to 3] range. The out-of-bounds values are the border color, which is yellow.

-.. figure:: ../data/understand/textures/border.png
+.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/border.png
  :width: 300
  :alt: Texture with yellow border color
  :align: center
@@ -147,7 +149,7 @@ The following image shows the texture on a 4x4 pixel quad, indexed in the
 [0 to 3] range. The out-of-bounds values are repeating the values at the edge of
 the texture.

-.. figure:: ../data/understand/textures/clamp.png
+.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/clamp.png
  :width: 300
  :alt: Texture with clamp addressing
  :align: center
@@ -172,7 +174,7 @@ This creates a repeating image effect.
 The following image shows the texture on a 4x4 pixel quad, indexed in the
 [0 to 3] range. The out-of-bounds values are repeating the original texture.

-.. figure:: ../data/understand/textures/wrap.png
+.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/wrap.png
  :width: 300
  :alt: Texture with wrap addressing
  :align: center
@@ -201,7 +203,7 @@ The following image shows the texture on a 4x4 pixel quad, indexed in The
 [0 to 3] range. The out-of-bounds values are repeating the original texture, but
 mirrored.

-.. figure:: ../data/understand/textures/mirror.png
+.. figure:: ../../../../data/how-to/hip_runtime_api/memory_management/textures/mirror.png
  :width: 300
  :alt: Texture with mirror addressing
  :align: center
@@ -0,0 +1,239 @@
+.. meta::
+  :description: Host memory of the HIP ecosystem
+  :keywords: AMD, ROCm, HIP, host memory
+
+.. _host_memory:
+
+********************************************************************************
+Host memory
+********************************************************************************
+
+Host memory is the "normal" memory residing in the host RAM and allocated by C
+or C++. Host memory can be allocated in two different ways:
+
+* Pageable memory
+
+* Pinned memory
+
+The following figure explains how data is transferred in pageable and pinned
+memory.
+
+.. figure:: ../../../data/how-to/hip_runtime_api/memory_management/pageable_pinned.svg
+
+The pageable and pinned memory allow you to exercise direct control over
+memory operations, which is known as explicit memory management. When using the
+unified memory, you get a simplified memory model with less control over
+low-level memory operations.
+
+The difference in memory transfers between explicit and unified memory
+management is highlighted in the following figure:
+
+.. figure:: ../../../data/how-to/hip_runtime_api/memory_management/unified_memory/um.svg
+
+For more details on unified memory management, see :doc:`/how-to/hip_runtime_api/memory_management/unified_memory`.
+
+.. _pageable_host_memory:
+
+Pageable memory
+================================================================================
+
+Pageable memory exists on memory blocks known as "pages" that can be migrated to
+other memory storage. For example, memory can be migrated between CPU sockets
+on a motherboard, or, when the system's RAM runs out of space, dumped as pages
+into the swap partition of the hard drive.
+
+Pageable memory is usually allocated with a call to ``malloc`` or ``new`` in a
+C++ application.
+
+**Example:** Using pageable host memory in HIP
+
+.. code-block:: cpp
+
+  #include <hip/hip_runtime.h>
+  #include <iostream>
+
+  #define HIP_CHECK(expression)                  \
+  {                                              \
+      const hipError_t status = expression;      \
+      if(status != hipSuccess){                  \
+          std::cerr << "HIP error "              \
+                    << status << ": "            \
+                    << hipGetErrorString(status) \
+                    << " at " << __FILE__ << ":" \
+                    << __LINE__ << std::endl;    \
+      }                                          \
+  }
+
+  int main()
+  {
+      const int element_number = 100;
+
+      int *host_input, *host_output;
+      // Host allocation
+      host_input  = new int[element_number];
+      host_output = new int[element_number];
+
+      // Host data preparation
+      for (int i = 0; i < element_number; i++) {
+          host_input[i] = i;
+      }
+      memset(host_output, 0, element_number * sizeof(int));
+
+      int *device_input, *device_output;
+
+      // Device allocation
+      HIP_CHECK(hipMalloc((int **)&device_input,  element_number * sizeof(int)));
+      HIP_CHECK(hipMalloc((int **)&device_output, element_number * sizeof(int)));
+
+      // Device data preparation
+      HIP_CHECK(hipMemcpy(device_input, host_input, element_number * sizeof(int), hipMemcpyHostToDevice));
+      HIP_CHECK(hipMemset(device_output, 0, element_number * sizeof(int)));
+
+      // Run the kernel
+      // ...
+
+      HIP_CHECK(hipMemcpy(host_output, device_output, element_number * sizeof(int), hipMemcpyDeviceToHost));
+
+      // Free host memory
+      delete[] host_input;
+      delete[] host_output;
+
+      // Free device memory
+      HIP_CHECK(hipFree(device_input));
+      HIP_CHECK(hipFree(device_output));
+  }
+
+.. note::
+
+  :cpp:func:`hipMalloc` and :cpp:func:`hipFree` are blocking calls. However, HIP
+  also provides non-blocking versions :cpp:func:`hipMallocAsync` and
+  :cpp:func:`hipFreeAsync`, which require a stream as an additional argument.
+
+.. _pinned_host_memory:
+
+Pinned memory
+================================================================================
+
+Pinned memory or page-locked memory is stored in pages that are locked in
+specific sectors in RAM and can't be migrated. The pointer can be used on both
+host and device. Accessing host-resident pinned memory in device kernels is
+generally not recommended for performance, as it can force the data to traverse
+the host-device interconnect such as PCIe, which is much slower than the
+on-device bandwidth.
+
+The advantage of pinned memory is the improved transfer time between host and
+device. For transfer operations, such as :cpp:func:`hipMemcpy` or :cpp:func:`hipMemcpyAsync`,
+using pinned memory instead of pageable memory on the host can lead to a threefold
+improvement in bandwidth.
+
+The disadvantage of pinned memory is the reduced availability of RAM for other
+processes, which can negatively impact the overall performance of the host.
+
+**Example:** Using pinned memory in HIP
+
+.. code-block:: cpp
+
+  #include <hip/hip_runtime.h>
+  #include <iostream>
+
+  #define HIP_CHECK(expression)                  \
+  {                                              \
+      const hipError_t status = expression;      \
+      if(status != hipSuccess){                  \
+          std::cerr << "HIP error "              \
+                    << status << ": "            \
+                    << hipGetErrorString(status) \
+                    << " at " << __FILE__ << ":" \
+                    << __LINE__ << std::endl;    \
+      }                                          \
+  }
+
+  int main()
+  {
+      const int element_number = 100;
+
+      int *host_input, *host_output;
+      // Host allocation
+      HIP_CHECK(hipHostMalloc((int **)&host_input, element_number * sizeof(int)));
+      HIP_CHECK(hipHostMalloc((int **)&host_output, element_number * sizeof(int)));
+
+      // Host data preparation
+      for (int i = 0; i < element_number; i++) {
+          host_input[i] = i;
+      }
+      memset(host_output, 0, element_number * sizeof(int));
+
+      int *device_input, *device_output;
+
+      // Device allocation
+      HIP_CHECK(hipMalloc((int **)&device_input,  element_number * sizeof(int)));
+      HIP_CHECK(hipMalloc((int **)&device_output, element_number * sizeof(int)));
+
+      // Device data preparation
+      HIP_CHECK(hipMemcpy(device_input, host_input, element_number * sizeof(int), hipMemcpyHostToDevice));
+      HIP_CHECK(hipMemset(device_output, 0, element_number * sizeof(int)));
+
+      // Run the kernel
+      // ...
+
+      HIP_CHECK(hipMemcpy(host_output, device_output, element_number * sizeof(int), hipMemcpyDeviceToHost));
+
+      // Free host memory
+      HIP_CHECK(hipHostFree(host_input));
+      HIP_CHECK(hipHostFree(host_output));
+
+      // Free device memory
+      HIP_CHECK(hipFree(device_input));
+      HIP_CHECK(hipFree(device_output));
+  }
+
+.. _memory_allocation_flags:
+
+Memory allocation flags for pinned memory
+--------------------------------------------------------------------------------
+
+The memory allocation for pinned memory can be controlled using ``hipHostMalloc`` flags:
+
+* ``hipHostMallocPortable``: The memory allocation is not restricted to the
+  context making the allocation.
+
+* ``hipHostMallocMapped``: The memory is allocated into the address space for
+  the current device and the device pointer can be obtained with
+  :cpp:func:`hipHostGetDevicePointer`.
+
+* ``hipHostMallocNumaUser``: The host memory allocation follows the Numa policy
+  specified by the user. The target of the Numa policy is to select the CPU that
+  is closest to each GPU. Numa distance is the distance between GPU and CPU
+  devices.
+
+* ``hipHostMallocWriteCombined``: The memory is allocated as write-combined.
+  Although most CPUs read write-combined memory inefficiently, such allocations
+  might be transferred faster across the PCIe bus on some system configurations.
+  It's a good option for data transfer from host to device via mapped pinned memory.
+
+* ``hipHostMallocCoherent``: Fine-grained memory is allocated. Overrides
+  ``HIP_HOST_COHERENT`` environment variable for specific allocation. For
+  details, see :ref:`coherence_control`.
+
+* ``hipHostMallocNonCoherent``: Coarse-grained memory is allocated. Overrides
+  ``HIP_HOST_COHERENT`` environment variable for specific allocation. For
+  details, see :ref:`coherence_control`.
+
+All allocation flags are independent and can be set in any combination. The only
+exception is setting ``hipHostMallocCoherent`` and ``hipHostMallocNonCoherent``
+together, which leads to an illegal state. An example of a valid flag
+combination is calling :cpp:func:`hipHostMalloc` with both
+``hipHostMallocPortable`` and ``hipHostMallocMapped`` flags set. Both the flags
+use the same model and differentiate only between how the surrounding code uses
+the host memory.
+
+.. note::
+
+  By default, each GPU selects a Numa CPU node with the least Numa distance
+  between them. This implies that the host memory is automatically allocated on
+  the closest memory pool of the current GPU device's Numa node. Using
+  :cpp:func:`hipSetDevice` API to set a different GPU increases the Numa
+  distance but still allows you to access the host allocation.
+
+  Numa policy is implemented on Linux and is under development on Microsoft
+  Windows.
@@ -2,6 +2,8 @@
  :description:
  :keywords: stream, memory allocation, SOMA, stream ordered memory allocator

+.. _stream_ordered_memory_allocator_how-to:
+
 *******************************************************************************
 Stream Ordered Memory Allocator
 *******************************************************************************
@@ -25,7 +27,7 @@ Using SOMA
 =====================================

 You can allocate memory using ``hipMallocAsync()`` with stream-ordered
-semantics. This restricts the asynchronous access to the memory between the stream executions of the allocation and deallocation. Accessing 
+semantics. This restricts the asynchronous access to the memory between the stream executions of the allocation and deallocation. Accessing
 memory if the compliant memory accesses won't overlap
 temporally. ``hipFreeAsync()`` frees memory from the pool with stream-ordered
 semantics.
@@ -0,0 +1,740 @@
+.. meta::
+  :description: This chapter describes Unified Memory and shows
+                how to use it in AMD HIP.
+  :keywords: AMD, ROCm, HIP, CUDA, unified memory, unified, memory
+
+.. _unified_memory:
+
+*******************************************************************************
+Unified memory management
+*******************************************************************************
+
+In conventional architectures CPUs and attached devices have their own memory
+space and dedicated physical memory backing it up, e.g. normal RAM for CPUs and
+VRAM on GPUs. This way each device can have physical memory optimized for its
+use case. GPUs usually have specialized memory whose bandwidth is an order of
+magnitude higher than the RAM attached to CPUs.
+
+While providing exceptional performance, this setup typically requires explicit
+memory management, as memory needs to be allocated, copied and freed on the used
+devices and on the host. Additionally, this makes using more than the physically
+available memory on the devices complicated.
+
+Modern GPUs circumvent the problem of having to explicitly manage the memory,
+while still keeping the benefits of the dedicated physical memories, by
+supporting the concept of unified memory. This enables the CPU and the GPUs in
+the system to access host and other GPUs' memory without explicit memory
+management.
+
+Unified memory
+================================================================================
+
+Unified memory is a single memory address space accessible from any processor
+within a system. This setup simplifies memory management and enables
+applications to allocate data that can be read or written on both CPUs and GPUs
+without explicitly copying it to the specific CPU or GPU. The unified memory
+model is shown in the following figure.
+
+.. figure:: ../../../data/how-to/hip_runtime_api/memory_management/unified_memory/um.svg
+
+Unified memory enables the access to memory located on other devices via
+several methods, depending on whether hardware support is available or has to be
+managed by the driver.
+
+Hardware supported on-demand page migration
+--------------------------------------------------------------------------------
+
+When a kernel on the device tries to access a memory address that is not in its
+memory, a page-fault is triggered. The GPU then in turn requests the page from
+the host or another device on which the memory is located. The page is then
+unmapped from the source, sent to the device and mapped to the device's memory.
+The requested memory is then available to the processes running on the device.
+
+In case the device's memory is at capacity, a page is unmapped from the device's
+memory first and sent and mapped to host memory. This enables more memory to be
+allocated and used for a GPU than the GPU itself has physically available.
+
+This level of unified memory support can be very beneficial for sparse accesses
+to an array that is not often used on the device.
+
+Driver managed page migration
+--------------------------------------------------------------------------------
+
+If the hardware does not support on-demand page migration, then all the pages
+accessed by a kernel have to be resident on the device, so they have to be
+migrated before the kernel is running. Since the driver cannot know beforehand
+which parts of an array are going to be accessed, all pages of all accessed
+arrays have to be migrated. This can lead to significant delays on the first run
+of a kernel, on top of possibly copying more memory than is actually accessed by
+the kernel.
+
+.. _unified memory system requirements:
+
+System requirements
+================================================================================
+
+Unified memory is supported on Linux by all modern AMD GPUs from the Vega
+series onward, as shown in the following table. Unified memory management can
+be achieved by explicitly allocating managed memory using
+:cpp:func:`hipMallocManaged` or marking variables with the ``__managed__``
+attribute. For the latest GPUs, with a Linux kernel that supports
+`Heterogeneous Memory Management (HMM)
+<https://www.kernel.org/doc/html/latest/mm/hmm.html>`_, the normal system
+allocator can be used.
+
+.. list-table:: Supported Unified Memory Allocators by GPU architecture
+    :widths: 40, 25, 25
+    :header-rows: 1
+    :align: center
+
+    * - Architecture
+      - :cpp:func:`hipMallocManaged()`, ``__managed__``
+      - ``new``, ``malloc()``
+    * - CDNA3
+      - ✅
+      - ✅ :sup:`1`
+    * - CDNA2
+      - ✅
+      - ✅ :sup:`1`
+    * - CDNA1
+      - ✅
+      - ✅ :sup:`1`
+    * - RDNA1
+      - ✅
+      - ❌
+    * - GCN5
+      - ✅
+      - ❌
+
+✅: **Supported**
+
+❌: **Unsupported**
+
+:sup:`1` Works only with ``XNACK=1`` and kernels with HMM support. First GPU
+access causes recoverable page-fault. For more details, visit `GPU memory
+<https://rocm.docs.amd.com/en/latest/conceptual/gpu-memory.html#xnack>`_.
+
+.. _unified memory allocators:
+
+Unified memory allocators
+================================================================================
+
+Support for the different unified memory allocators depends on the GPU
+architecture and on the system. For more information, see :ref:`unified memory
+system requirements` and :ref:`checking unified memory support`.
+
+- **HIP allocated managed memory and variables**
+
+  :cpp:func:`hipMallocManaged()` is a dynamic memory allocator available on
+  all GPUs with unified memory support. For more details, visit
+  :ref:`unified_memory_reference`.
+
+  The ``__managed__`` declaration specifier, which serves as its counterpart,
+  can be utilized for static allocation.
+
+- **System allocated unified memory**
+
+  Starting with CDNA2, the ``new`` and ``malloc()`` system allocators allow
+  you to reserve unified memory. The system allocator is more versatile and
+  offers an easy transition for code written for CPUs to HIP code as the
+  same system allocation API is used.
+
+To ensure the proper functioning of system allocated unified memory on supported
+GPUs, it is essential to configure the environment variable ``XNACK=1`` and use
+a kernel that supports `HMM
+<https://www.kernel.org/doc/html/latest/mm/hmm.html>`_. Without this
+configuration, the behavior will be similar to that of systems without HMM
+support. For more details, visit
+`GPU memory <https://rocm.docs.amd.com/en/latest/conceptual/gpu-memory.html#xnack>`_.
+
+The table below illustrates the expected behavior of managed and unified memory
+functions on ROCm and CUDA, both with and without HMM support.
+
+.. tab-set::
+  .. tab-item:: ROCm allocation behavior
+    :sync: original-block
+
+    .. list-table:: Comparison of expected behavior of managed and unified memory functions in ROCm
+      :widths: 26, 17, 20, 17, 20
+      :header-rows: 1
+
+      * - call
+        - Allocation origin without HMM or ``XNACK=0``
+        - Access outside the origin without HMM or ``XNACK=0``
+        - Allocation origin with HMM and ``XNACK=1``
+        - Access outside the origin with HMM and ``XNACK=1``
+      * - ``new``, ``malloc()``
+        - host
+        - not accessible on device
+        - host
+        - page-fault migration
+      * - :cpp:func:`hipMalloc()`
+        - device
+        - zero copy [zc]_
+        - device
+        - zero copy [zc]_
+      * - :cpp:func:`hipMallocManaged()`, ``__managed__``
+        - pinned host
+        - zero copy [zc]_
+        - host
+        - page-fault migration
+      * - :cpp:func:`hipHostRegister()`
+        - undefined behavior
+        - undefined behavior
+        - host
+        - page-fault migration
+      * - :cpp:func:`hipHostMalloc()`
+        - pinned host
+        - zero copy [zc]_
+        - pinned host
+        - zero copy [zc]_
+
+  .. tab-item:: CUDA allocation behavior
+    :sync: cooperative-groups
+
+    .. list-table:: Comparison of expected behavior of managed and unified memory functions in CUDA
+      :widths: 26, 17, 20, 17, 20
+      :header-rows: 1
+
+      * - call
+        - Allocation origin without HMM
+        - Access outside the origin without HMM
+        - Allocation origin with HMM
+        - Access outside the origin with HMM
+      * - ``new``, ``malloc()``
+        - host
+        - not accessible on device
+        - first touch
+        - page-fault migration
+      * - ``cudaMalloc()``
+        - device
+        - not accessible on host
+        - device
+        - page-fault migration
+      * - ``cudaMallocManaged()``, ``__managed__``
+        - host
+        - page-fault migration
+        - first touch
+        - page-fault migration
+      * - ``cudaHostRegister()``
+        - host
+        - page-fault migration
+        - host
+        - page-fault migration
+      * - ``cudaMallocHost()``
+        - pinned host
+        - zero copy [zc]_
+        - pinned host
+        - zero copy [zc]_
+
+.. [zc] Zero copy is a feature where the memory is pinned to either the device
+        or the host, and won't be transferred when accessed by another device or
+        the host. Instead, only the requested memory is transferred, without
+        making an explicit copy, like a normal memory access, hence the term
+        "zero copy".
+
+.. _checking unified memory support:
+
+Checking unified memory support
+--------------------------------------------------------------------------------
+
+The following device attributes can offer information about which :ref:`unified
+memory allocators` are supported. The attribute value is 1 if the functionality
+is supported, and 0 if it is not supported.
+
+.. list-table:: Device attributes for unified memory management
+    :widths: 40, 60
+    :header-rows: 1
+    :align: center
+
+    * - Attribute
+      - Description
+    * - :cpp:enumerator:`hipDeviceAttributeManagedMemory`
+      - Device supports allocating managed memory on this system
+    * - :cpp:enumerator:`hipDeviceAttributePageableMemoryAccess`
+      - Device supports coherently accessing pageable memory without calling :cpp:func:`hipHostRegister()` on it.
+    * - :cpp:enumerator:`hipDeviceAttributeConcurrentManagedAccess`
+      - Full unified memory support. Device can coherently access managed memory concurrently with the CPU
+
+For details on how to get the attributes of a specific device see :cpp:func:`hipDeviceGetAttribute()`.
+
+Example for unified memory management
+--------------------------------------------------------------------------------
+
+The following example shows how to use unified memory with
+:cpp:func:`hipMallocManaged()` for dynamic allocation, the ``__managed__`` attribute
+for static allocation, and the standard ``new`` allocation. For comparison, the
+explicit memory management example is presented in the last tab.
+
+.. tab-set::
+
+    .. tab-item:: hipMallocManaged()
+
+        .. code-block:: cpp
+            :emphasize-lines: 22-25
+
+            #include <hip/hip_runtime.h>
+            #include <iostream>
+
+            #define HIP_CHECK(expression)              \
+            {                                          \
+                const hipError_t err = expression;     \
+                if(err != hipSuccess){                 \
+                    std::cerr << "HIP error: "         \
+                        << hipGetErrorString(err)      \
+                        << " at " << __LINE__ << "\n"; \
+                }                                      \
+            }
+
+            // Addition of two values.
+            __global__ void add(int *a, int *b, int *c) {
+                *c = *a + *b;
+            }
+
+            int main() {
+                int *a, *b, *c;
+
+                // Allocate memory for a, b and c that is accessible to both device and host codes.
+                HIP_CHECK(hipMallocManaged(&a, sizeof(*a)));
+                HIP_CHECK(hipMallocManaged(&b, sizeof(*b)));
+                HIP_CHECK(hipMallocManaged(&c, sizeof(*c)));
+
+                // Setup input values.
+                *a = 1;
+                *b = 2;
+
+                // Launch add() kernel on GPU.
+                hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
+
+                // Wait for GPU to finish before accessing on host.
+                HIP_CHECK(hipDeviceSynchronize());
+
+                // Print the result.
+                std::cout << *a << " + " << *b << " = " << *c << std::endl;
+
+                // Cleanup allocated memory.
+                HIP_CHECK(hipFree(a));
+                HIP_CHECK(hipFree(b));
+                HIP_CHECK(hipFree(c));
+
+                return 0;
+            }
+
+    .. tab-item:: __managed__
+
+        .. code-block:: cpp
+            :emphasize-lines: 19-20
+
+            #include <hip/hip_runtime.h>
+            #include <iostream>
+
+            #define HIP_CHECK(expression)              \
+            {                                          \
+                const hipError_t err = expression;     \
+                if(err != hipSuccess){                 \
+                    std::cerr << "HIP error: "         \
+                        << hipGetErrorString(err)      \
+                        << " at " << __LINE__ << "\n"; \
+                }                                      \
+            }
+
+            // Addition of two values.
+            __global__ void add(int *a, int *b, int *c) {
+                *c = *a + *b;
+            }
+
+            // Declare a, b and c as static variables.
+            __managed__ int a, b, c;
+
+            int main() {
+                // Setup input values.
+                a = 1;
+                b = 2;
+
+                // Launch add() kernel on GPU.
+                hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, &a, &b, &c);
+
+                // Wait for GPU to finish before accessing on host.
+                HIP_CHECK(hipDeviceSynchronize());
+
+                // Prints the result.
+                std::cout << a << " + " << b << " = " << c << std::endl;
+
+                return 0;
+            }
+
+    .. tab-item:: new
+
+        .. code-block:: cpp
+            :emphasize-lines: 20-23
+
+            #include <hip/hip_runtime.h>
+            #include <iostream>
+
+            #define HIP_CHECK(expression)              \
+            {                                          \
+                const hipError_t err = expression;     \
+                if(err != hipSuccess){                 \
+                    std::cerr << "HIP error: "         \
+                        << hipGetErrorString(err)      \
+                        << " at " << __LINE__ << "\n"; \
+                }                                      \
+            }
+
+            // Addition of two values.
+            __global__ void add(int* a, int* b, int* c) {
+                *c = *a + *b;
+            }
+
+            // This example requires HMM support and the environment variable HSA_XNACK needs to be set to 1
+            int main() {
+                // Allocate memory for a, b, and c.
+                int *a = new int[1];
+                int *b = new int[1];
+                int *c = new int[1];
+
+                // Setup input values.
+                *a = 1;
+                *b = 2;
+
+                // Launch add() kernel on GPU.
+                hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
+
+                // Wait for GPU to finish before accessing on host.
+                HIP_CHECK(hipDeviceSynchronize());
+
+                // Prints the result.
+                std::cout << *a << " + " << *b << " = " << *c << std::endl;
+
+                // Cleanup allocated memory.
+                delete[] a;
+                delete[] b;
+                delete[] c;
+
+                return 0;
+            }
+
+    .. tab-item:: Explicit Memory Management
+
+        .. code-block:: cpp
+            :emphasize-lines: 27-34, 39-40
+
+            #include <hip/hip_runtime.h>
+            #include <iostream>
+
+            #define HIP_CHECK(expression)              \
+            {                                          \
+                const hipError_t err = expression;     \
+                if(err != hipSuccess){                 \
+                    std::cerr << "HIP error: "         \
+                        << hipGetErrorString(err)      \
+                        << " at " << __LINE__ << "\n"; \
+                }                                      \
+            }
+
+            // Addition of two values.
+            __global__ void add(int *a, int *b, int *c) {
+                *c = *a + *b;
+            }
+
+            int main() {
+                int a, b, c;
+                int *d_a, *d_b, *d_c;
+
+                // Setup input values.
+                a = 1;
+                b = 2;
+
+                // Allocate device copies of a, b and c.
+                HIP_CHECK(hipMalloc(&d_a, sizeof(*d_a)));
+                HIP_CHECK(hipMalloc(&d_b, sizeof(*d_b)));
+                HIP_CHECK(hipMalloc(&d_c, sizeof(*d_c)));
+
+                // Copy input values to device.
+                HIP_CHECK(hipMemcpy(d_a, &a, sizeof(*d_a), hipMemcpyHostToDevice));
+                HIP_CHECK(hipMemcpy(d_b, &b, sizeof(*d_b), hipMemcpyHostToDevice));
+
+                // Launch add() kernel on GPU.
+                hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, d_a, d_b, d_c);
+
+                // Copy the result back to the host.
+                HIP_CHECK(hipMemcpy(&c, d_c, sizeof(*d_c), hipMemcpyDeviceToHost));
+
+                // Cleanup allocated memory.
+                HIP_CHECK(hipFree(d_a));
+                HIP_CHECK(hipFree(d_b));
+                HIP_CHECK(hipFree(d_c));
+
+                // Prints the result.
+                std::cout << a << " + " << b << " = " << c << std::endl;
+
+                return 0;
+            }
+
+.. _using unified memory:
+
+Using unified memory
+================================================================================
+
+Unified memory can simplify the complexities of memory management in GPU
+computing, by not requiring explicit copies between the host and the devices. It
+can be particularly useful in use cases with sparse memory accesses from both
+the CPU and the GPU, as only the parts of the memory region that are actually
+accessed need to be transferred to the corresponding processor, not the whole
+memory region. This reduces the amount of memory sent over the PCIe bus or other
+interfaces.
+
+In HIP, pinned memory allocations are coherent by default. Pinned memory is
+host memory mapped into the address space of all GPUs, meaning that the pointer
+can be used on both host and device. Additionally, using pinned memory instead of
+pageable memory on the host can improve bandwidth for transfers between the host
+and the GPUs.
+
+While unified memory can provide numerous benefits, it's important to be aware
+of the potential performance overhead associated with unified memory. You must
+thoroughly test and profile your code to ensure it's the most suitable choice
+for your use case.
+
+.. _unified memory runtime hints:
+
+Performance optimizations for unified memory
+================================================================================
+
+There are several ways in which the developer can guide the runtime to reduce
+copies between devices in order to improve performance.
+
+Data prefetching
+--------------------------------------------------------------------------------
+
+Data prefetching is a technique used to improve the performance of your
+application by moving data to the desired device before it's actually
+needed. ``hipCpuDeviceId`` is a special constant to specify the CPU as target.
+
+.. code-block:: cpp
+    :emphasize-lines: 33-36,41-42
+
+    #include <hip/hip_runtime.h>
+    #include <iostream>
+
+    #define HIP_CHECK(expression)              \
+    {                                          \
+        const hipError_t err = expression;     \
+        if(err != hipSuccess){                 \
+            std::cerr << "HIP error: "         \
+                << hipGetErrorString(err)      \
+                << " at " << __LINE__ << "\n"; \
+        }                                      \
+    }
+
+    // Addition of two values.
+    __global__ void add(int *a, int *b, int *c) {
+        *c = *a + *b;
+    }
+
+    int main() {
+        int *a, *b, *c;
+        int deviceId;
+        HIP_CHECK(hipGetDevice(&deviceId)); // Get the current device ID
+
+        // Allocate memory for a, b and c that is accessible to both device and host codes.
+        HIP_CHECK(hipMallocManaged(&a, sizeof(*a)));
+        HIP_CHECK(hipMallocManaged(&b, sizeof(*b)));
+        HIP_CHECK(hipMallocManaged(&c, sizeof(*c)));
+
+        // Setup input values.
+        *a = 1;
+        *b = 2;
+
+        // Prefetch the data to the GPU device.
+        HIP_CHECK(hipMemPrefetchAsync(a, sizeof(*a), deviceId, 0));
+        HIP_CHECK(hipMemPrefetchAsync(b, sizeof(*b), deviceId, 0));
+        HIP_CHECK(hipMemPrefetchAsync(c, sizeof(*c), deviceId, 0));
+
+        // Launch add() kernel on GPU.
+        hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
+
+        // Prefetch the result back to the CPU.
+        HIP_CHECK(hipMemPrefetchAsync(c, sizeof(*c), hipCpuDeviceId, 0));
+
+        // Wait for the prefetch operations to complete.
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Prints the result.
+        std::cout << *a << " + " << *b << " = " << *c << std::endl;
+
+        // Cleanup allocated memory.
+        HIP_CHECK(hipFree(a));
+        HIP_CHECK(hipFree(b));
+        HIP_CHECK(hipFree(c));
+
+        return 0;
+    }
+
+Memory advice
+--------------------------------------------------------------------------------
+
+Unified memory runtime hints can be set with :cpp:func:`hipMemAdvise()` to help
+improve the performance of your code if you know the memory usage pattern. There
+are several different types of hints as specified in the enum
+:cpp:enum:`hipMemoryAdvise`, for example, whether a certain device mostly reads
+the memory region, where it should ideally be located, and even whether that
+specific memory region is accessed by a specific device.
+
+For the best performance, profile your application to optimize the
+utilization of HIP runtime hints.
+
+The effectiveness of :cpp:func:`hipMemAdvise()` comes from its ability to inform
+the runtime of the developer's intentions regarding memory usage. When the
+runtime has knowledge of the expected memory access patterns, it can make better
+decisions about data placement, leading to fewer transfers via the interconnect
+and thereby reduced latency and bandwidth requirements. However, the actual
+impact on performance can vary based on the specific use case and the system.
+
+The following is the updated version of the example above with memory advice
+instead of prefetching.
+
+.. code-block:: cpp
+    :emphasize-lines: 29-41
+
+    #include <hip/hip_runtime.h>
+    #include <iostream>
+
+    #define HIP_CHECK(expression)              \
+    {                                          \
+        const hipError_t err = expression;     \
+        if(err != hipSuccess){                 \
+            std::cerr << "HIP error: "         \
+                << hipGetErrorString(err)      \
+                << " at " << __LINE__ << "\n"; \
+        }                                      \
+    }
+
+    // Addition of two values.
+    __global__ void add(int *a, int *b, int *c) {
+        *c = *a + *b;
+    }
+
+    int main() {
+        int deviceId;
+        HIP_CHECK(hipGetDevice(&deviceId));
+        int *a, *b, *c;
+
+        // Allocate memory for a, b, and c accessible to both device and host codes.
+        HIP_CHECK(hipMallocManaged(&a, sizeof(*a)));
+        HIP_CHECK(hipMallocManaged(&b, sizeof(*b)));
+        HIP_CHECK(hipMallocManaged(&c, sizeof(*c)));
+
+        // Set memory advice for a and b to be read, located on and accessed by the GPU.
+        HIP_CHECK(hipMemAdvise(a, sizeof(*a), hipMemAdviseSetPreferredLocation, deviceId));
+        HIP_CHECK(hipMemAdvise(a, sizeof(*a), hipMemAdviseSetAccessedBy, deviceId));
+        HIP_CHECK(hipMemAdvise(a, sizeof(*a), hipMemAdviseSetReadMostly, deviceId));
+
+        HIP_CHECK(hipMemAdvise(b, sizeof(*b), hipMemAdviseSetPreferredLocation, deviceId));
+        HIP_CHECK(hipMemAdvise(b, sizeof(*b), hipMemAdviseSetAccessedBy, deviceId));
+        HIP_CHECK(hipMemAdvise(b, sizeof(*b), hipMemAdviseSetReadMostly, deviceId));
+
+        // Set memory advice for c to be read, located on and accessed by the CPU.
+        HIP_CHECK(hipMemAdvise(c, sizeof(*c), hipMemAdviseSetPreferredLocation, hipCpuDeviceId));
+        HIP_CHECK(hipMemAdvise(c, sizeof(*c), hipMemAdviseSetAccessedBy, hipCpuDeviceId));
+        HIP_CHECK(hipMemAdvise(c, sizeof(*c), hipMemAdviseSetReadMostly, hipCpuDeviceId));
+
+        // Setup input values.
+        *a = 1;
+        *b = 2;
+
+        // Launch add() kernel on GPU.
+        hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
+
+        // Wait for GPU to finish before accessing on host.
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Prints the result.
+        std::cout << *a << " + " << *b << " = " << *c << std::endl;
+
+        // Cleanup allocated memory.
+        HIP_CHECK(hipFree(a));
+        HIP_CHECK(hipFree(b));
+        HIP_CHECK(hipFree(c));
+
+        return 0;
+    }
+
+Memory range attributes
+--------------------------------------------------------------------------------
+
+:cpp:func:`hipMemRangeGetAttribute()` allows you to query attributes of a given
+memory range. The attributes are given in :cpp:enum:`hipMemRangeAttribute`.
+
+.. code-block:: cpp
+    :emphasize-lines: 44-49
+
+    #include <hip/hip_runtime.h>
+    #include <iostream>
+
+    #define HIP_CHECK(expression)              \
+    {                                          \
+        const hipError_t err = expression;     \
+        if(err != hipSuccess){                 \
+            std::cerr << "HIP error: "         \
+                << hipGetErrorString(err)      \
+                << " at " << __LINE__ << "\n"; \
+        }                                      \
+    }
+
+    // Addition of two values.
+    __global__ void add(int *a, int *b, int *c) {
+        *c = *a + *b;
+    }
+
+    int main() {
+        int *a, *b, *c;
+        unsigned int attributeValue;
+        constexpr size_t attributeSize = sizeof(attributeValue);
+
+        int deviceId;
+        HIP_CHECK(hipGetDevice(&deviceId));
+
+        // Allocate memory for a, b and c that is accessible to both device and host codes.
+        HIP_CHECK(hipMallocManaged(&a, sizeof(*a)));
+        HIP_CHECK(hipMallocManaged(&b, sizeof(*b)));
+        HIP_CHECK(hipMallocManaged(&c, sizeof(*c)));
+
+        // Setup input values.
+        *a = 1;
+        *b = 2;
+
+        HIP_CHECK(hipMemAdvise(a, sizeof(*a), hipMemAdviseSetReadMostly, deviceId));
+
+        // Launch add() kernel on GPU.
+        hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
+
+        // Wait for GPU to finish before accessing on host.
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Query an attribute of the memory range.
+        HIP_CHECK(hipMemRangeGetAttribute(&attributeValue,
+                                attributeSize,
+                                hipMemRangeAttributeReadMostly,
+                                a,
+                                sizeof(*a)));
+
+        // Prints the result.
+        std::cout << *a << " + " << *b << " = " << *c << std::endl;
+        std::cout << "The array a is" << (attributeValue == 1 ? "" : " NOT") << " set to hipMemRangeAttributeReadMostly" << std::endl;
+
+        // Cleanup allocated memory.
+        HIP_CHECK(hipFree(a));
+        HIP_CHECK(hipFree(b));
+        HIP_CHECK(hipFree(c));
+
+        return 0;
+    }
+
+Asynchronously attach memory to a stream
+--------------------------------------------------------------------------------
+
+The :cpp:func:`hipStreamAttachMemAsync()` function attaches memory to a stream,
+which can reduce the amount of memory transferred when managed memory is used.
+When the memory is attached to a stream using this function, it only gets
+transferred between devices when a kernel that is launched on this stream needs
+access to the memory.
@@ -0,0 +1,154 @@
+.. meta::
+  :description: This chapter introduces Virtual Memory (VM) and shows
+                how to use it in AMD HIP.
+  :keywords: AMD, ROCm, HIP, CUDA, virtual memory, virtual, memory, UM, APU
+
+.. _virtual_memory:
+
+********************************************************************************
+Virtual memory management
+********************************************************************************
+
+Memory management is important when creating high-performance applications in
+the HIP ecosystem. Both allocating and copying memory can result in bottlenecks,
+which can significantly impact performance.
+
+Global memory allocation in HIP uses a C-style allocation function.
+This works fine for simple cases but can cause problems if your memory needs
+change. If you need to increase the size of your memory, you must allocate a
+second larger buffer and copy the data to it before you can free the original
+buffer. This increases overall memory usage and causes unnecessary ``memcpy``
+calls. Another solution is to allocate a larger buffer than you initially need.
+However, this isn't an efficient way to handle resources and doesn't solve the
+issue of reallocation when the extra buffer runs out.
+
+Virtual memory management solves these memory management problems. It helps to
+reduce memory usage and unnecessary ``memcpy`` calls.
+
+.. _memory_allocation_virtual_memory:
+
+Memory allocation
+================================================================================
+
+Standard memory allocation uses the :cpp:func:`hipMalloc` function to allocate a
+block of memory on the device. However, when using virtual memory, this process
+is separated into multiple steps using the :cpp:func:`hipMemCreate`,
+:cpp:func:`hipMemAddressReserve`, :cpp:func:`hipMemMap`, and
+:cpp:func:`hipMemSetAccess` functions. This guide explains what these functions
+do and how you can use them for virtual memory management.
+
+Allocate physical memory
+--------------------------------------------------------------------------------
+
+The first step is to allocate the physical memory itself with the
+:cpp:func:`hipMemCreate` function. This function accepts the size of the buffer,
+an ``unsigned long long`` variable for the flags, and a
+:cpp:struct:`hipMemAllocationProp` variable. :cpp:struct:`hipMemAllocationProp`
+contains the properties of the memory to be allocated, such as where the memory
+is physically located and what kind of shareable handles are available. If the
+allocation is successful, the function returns a value of
+:cpp:enumerator:`hipSuccess`, with :cpp:type:`hipMemGenericAllocationHandle_t`
+representing a valid physical memory allocation. The allocated memory size must
+be aligned with the granularity appropriate for the properties of the
+allocation. You can use the :cpp:func:`hipMemGetAllocationGranularity` function
+to determine the correct granularity.
+
+.. code-block:: cpp
+
+    size_t granularity = 0;
+    hipMemGenericAllocationHandle_t allocHandle;
+    hipMemAllocationProp prop = {};
+    prop.type = HIP_MEM_ALLOCATION_TYPE_PINNED;
+    prop.location.type = HIP_MEM_LOCATION_TYPE_DEVICE;
+    prop.location.id = currentDev;
+    hipMemGetAllocationGranularity(&granularity, &prop, HIP_MEM_ALLOC_GRANULARITY_MINIMUM);
+    padded_size = ROUND_UP(size, granularity);
+    hipMemCreate(&allocHandle, padded_size, &prop, 0);
+
+Reserve virtual address range
+--------------------------------------------------------------------------------
+
+After you have acquired an allocation of physical memory, you must map it before
+you can use it. To do so, you need a virtual address to map it to.  Mapping
+means the physical memory allocation is available from the virtual address range
+it is mapped to. To reserve a virtual memory range, use the
+:cpp:func:`hipMemAddressReserve` function. The size of the virtual memory must
+match the amount of physical memory previously allocated. You can then map the
+physical memory allocation to the newly-acquired virtual memory address range
+using the :cpp:func:`hipMemMap` function.
+
+.. code-block:: cpp
+
+    hipMemAddressReserve(&ptr, padded_size, 0, 0, 0);
+    hipMemMap(ptr, padded_size, 0, allocHandle, 0);
+
+Set memory access
+--------------------------------------------------------------------------------
+
+Finally, use the :cpp:func:`hipMemSetAccess` function to enable memory access.
+It accepts the pointer to the virtual memory, the size, and a
+:cpp:struct:`hipMemAccessDesc` descriptor as parameters. In a multi-GPU
+environment, you can map the device memory of one GPU to another. This feature
+also works with the traditional memory management system, but isn't as scalable
+as with virtual memory. When memory is allocated with :cpp:func:`hipMalloc`,
+:cpp:func:`hipDeviceEnablePeerAccess` is used to enable peer access. This
+function enables access between two devices, but it means that every call to
+:cpp:func:`hipMalloc` takes more time to perform the checks and the mapping
+between the devices. When using virtual memory management, peer access is
+enabled by :cpp:func:`hipMemSetAccess`, which provides a finer level of
+control over what is shared. This has no performance impact on memory allocation
+and gives you more control over what memory buffers are shared with which
+devices.
+
+.. code-block:: cpp
+
+    hipMemAccessDesc accessDesc = {};
+    accessDesc.location.type = HIP_MEM_LOCATION_TYPE_DEVICE;
+    accessDesc.location.id = currentDev;
+    accessDesc.flags = HIP_MEM_ACCESS_FLAGS_PROT_READWRITE;
+    hipMemSetAccess(ptr, padded_size, &accessDesc, 1);
+
+At this point the memory is allocated, mapped, and ready for use. You can read
+from and write to it, just as you would with a C-style memory allocation.
+
+Free virtual memory
+--------------------------------------------------------------------------------
+
+To free the memory allocated in this manner, use the corresponding free
+functions. To unmap the memory, use :cpp:func:`hipMemUnmap`. To release the
+virtual address range, use :cpp:func:`hipMemAddressFree`.  Finally, to release
+the physical memory, use :cpp:func:`hipMemRelease`. Unlike
+:cpp:func:`hipFree`, these functions don't synchronize the device when memory
+is released. Calling :cpp:func:`hipFree` while multiple streams are running in
+parallel synchronizes the device, which leads to worse resource usage and
+performance.
+
+.. code-block:: cpp
+
+    hipMemUnmap(ptr, size);
+    hipMemRelease(allocHandle);
+    hipMemAddressFree(ptr, size);
+
+.. _usage_virtual_memory:
+
+Memory usage
+================================================================================
+
+Dynamically increase allocation size
+--------------------------------------------------------------------------------
+
+The :cpp:func:`hipMemAddressReserve` function allows you to increase the amount
+of pre-allocated memory. This function accepts a parameter representing the
+requested starting address of the virtual memory. This allows you to have a
+continuous virtual address space without worrying about the underlying physical
+allocation.
+
+.. code-block:: cpp
+
+    hipMemAddressReserve(&new_ptr, (new_size - padded_size), 0, ptr + padded_size, 0);
+    hipMemMap(new_ptr, (new_size - padded_size), 0, newAllocHandle, 0);
+    hipMemSetAccess(new_ptr, (new_size - padded_size), &accessDesc, 1);
+
+The code sample above assumes that :cpp:func:`hipMemAddressReserve` was able to
+reserve the memory address at the specified location. However, this isn't
+guaranteed to be true, so you should validate that ``new_ptr`` points to the
+requested virtual address before using it.
@@ -0,0 +1,420 @@
+.. meta::
+    :description: This chapter describes how to use multiple devices on one host.
+    :keywords: ROCm, HIP, multi-device, multiple, GPUs, devices
+
+.. _multi-device:
+
+*******************************************************************************
+Multi-device management
+*******************************************************************************
+
+Device enumeration
+===============================================================================
+
+Device enumeration involves identifying all the available GPUs connected to the
+host system. A single host machine can have multiple GPUs, each with its own
+unique identifier. By listing these devices, you can decide which GPU to use
+for computation. The host queries the system to count and list all connected
+GPUs that support the chosen ``HIP_PLATFORM``, ensuring that the application
+can leverage the full computational power available. Typically, applications
+list devices and their properties for deployment planning, and also make
+dynamic selections during runtime to ensure optimal performance.
+
+If the application does not define a specific GPU, device 0 is selected.
+
+.. code-block:: cpp
+
+    #include <hip/hip_runtime.h>
+    #include <iostream>
+
+    // Prints the number of HIP devices visible to the process, followed by a
+    // selection of properties for each of them. Returns 1 if the device count
+    // cannot be queried; a device whose properties cannot be read is skipped.
+    int main()
+    {
+        // Start from 0 so the value is well defined even if the query fails.
+        int deviceCount = 0;
+        const hipError_t countStatus = hipGetDeviceCount(&deviceCount);
+        if (countStatus != hipSuccess)
+        {
+            std::cerr << "Failed to query the device count: "
+                    << hipGetErrorString(countStatus) << std::endl;
+            return 1;
+        }
+        std::cout << "Number of devices: " << deviceCount << std::endl;
+
+        for (int deviceId = 0; deviceId < deviceCount; ++deviceId)
+        {
+            hipDeviceProp_t deviceProp;
+            const hipError_t propStatus = hipGetDeviceProperties(&deviceProp, deviceId);
+            if (propStatus != hipSuccess)
+            {
+                // Do not print fields of a structure that was never filled in.
+                std::cerr << "Failed to query properties of device " << deviceId << ": "
+                        << hipGetErrorString(propStatus) << std::endl;
+                continue;
+            }
+
+            std::cout << "Device " << deviceId << std::endl << " Properties:" << std::endl;
+            std::cout << "  Name: " << deviceProp.name << std::endl;
+            std::cout << "  Total Global Memory: " << deviceProp.totalGlobalMem / (1024 * 1024) << " MiB" << std::endl;
+            std::cout << "  Shared Memory per Block: " << deviceProp.sharedMemPerBlock / 1024 << " KiB" << std::endl;
+            std::cout << "  Registers per Block: " << deviceProp.regsPerBlock << std::endl;
+            std::cout << "  Warp Size: " << deviceProp.warpSize << std::endl;
+            std::cout << "  Max Threads per Block: " << deviceProp.maxThreadsPerBlock << std::endl;
+            std::cout << "  Max Threads per Multiprocessor: " << deviceProp.maxThreadsPerMultiProcessor << std::endl;
+            std::cout << "  Number of Multiprocessors: " << deviceProp.multiProcessorCount << std::endl;
+            std::cout << "  Max Threads Dimensions: ["
+                    << deviceProp.maxThreadsDim[0] << ", "
+                    << deviceProp.maxThreadsDim[1] << ", "
+                    << deviceProp.maxThreadsDim[2] << "]" << std::endl;
+            std::cout << "  Max Grid Size: ["
+                    << deviceProp.maxGridSize[0] << ", "
+                    << deviceProp.maxGridSize[1] << ", "
+                    << deviceProp.maxGridSize[2] << "]" << std::endl;
+            std::cout << std::endl;
+        }
+
+        return 0;
+    }
+
+.. _multi_device_selection:
+
+Device selection
+===============================================================================
+
+Once you have enumerated the available GPUs, the next step is to select a
+specific device for computation. This involves setting the active GPU that will
+execute subsequent operations. This step is crucial in multi-GPU systems where
+different GPUs might have different capabilities or workloads. By selecting the
+appropriate device, you ensure that the computational tasks are directed to the
+correct GPU, optimizing performance and resource utilization.
+
+.. code-block:: cpp
+
+    #include <hip/hip_runtime.h>
+    #include <iostream>
+
+    #define HIP_CHECK(expression)                \
+    {                                            \
+        const hipError_t status = expression;    \
+        if (status != hipSuccess) {              \
+            std::cerr << "HIP error " << status  \
+                    << ": " << hipGetErrorString(status) \
+                    << " at " << __FILE__ << ":" \
+                    << __LINE__ << std::endl;  \
+            exit(status);                        \
+        }                                        \
+    }
+
+    // Each thread writes twice its global index into the element of data at that
+    // index. No bounds check is performed.
+    __global__ void simpleKernel(double *data)
+    {
+        const int globalIdx = threadIdx.x + blockDim.x * blockIdx.x;
+        data[globalIdx] = 2.0 * globalIdx;
+    }
+
+    // Runs simpleKernel on device 0 and then on device 1, copies both results back
+    // to the host and prints the first element of each. Every HIP API call is
+    // wrapped in HIP_CHECK, so the program needs two usable devices to complete.
+    int main()
+    {
+        constexpr int elementCount    = 1024;
+        constexpr int threadsPerBlock = 128;
+        // Launch exactly one thread per element: simpleKernel has no bounds check,
+        // so a larger grid would write past the end of the allocations.
+        constexpr int blockCount      = elementCount / threadsPerBlock;
+
+        double* deviceData0;
+        double* deviceData1;
+        size_t  size = elementCount * sizeof(*deviceData0);
+
+        int deviceId0 = 0;
+        int deviceId1 = 1;
+
+        // Set device 0 and perform operations
+        HIP_CHECK(hipSetDevice(deviceId0)); // Set device 0 as current
+        HIP_CHECK(hipMalloc(&deviceData0, size)); // Allocate memory on device 0
+        simpleKernel<<<blockCount, threadsPerBlock>>>(deviceData0); // Launch kernel on device 0
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Set device 1 and perform operations
+        HIP_CHECK(hipSetDevice(deviceId1)); // Set device 1 as current
+        HIP_CHECK(hipMalloc(&deviceData1, size)); // Allocate memory on device 1
+        simpleKernel<<<blockCount, threadsPerBlock>>>(deviceData1); // Launch kernel on device 1
+        HIP_CHECK(hipDeviceSynchronize());
+
+        // Copy result from device 0
+        double hostData0[elementCount];
+        HIP_CHECK(hipSetDevice(deviceId0));
+        HIP_CHECK(hipMemcpy(hostData0, deviceData0, size, hipMemcpyDeviceToHost));
+
+        // Copy result from device 1
+        double hostData1[elementCount];
+        HIP_CHECK(hipSetDevice(deviceId1));
+        HIP_CHECK(hipMemcpy(hostData1, deviceData1, size, hipMemcpyDeviceToHost));
+
+        // Display results from both devices
+        std::cout << "Device 0 data: " << hostData0[0] << std::endl;
+        std::cout << "Device 1 data: " << hostData1[0] << std::endl;
+
+        // Free device memory
+        HIP_CHECK(hipFree(deviceData0));
+        HIP_CHECK(hipFree(deviceData1));
+
+        return 0;
+    }
+
+
+Stream and event behavior
+===============================================================================
+
+In a multi-device system, streams and events are essential for efficient
+parallel computation and synchronization. Streams enable asynchronous task
+execution, allowing multiple devices to process data concurrently without
+blocking one another. Events provide a mechanism for synchronizing operations
+across streams and devices, ensuring that tasks on one device are completed
+before dependent tasks on another device begin. This coordination prevents race
+conditions and optimizes data flow in multi-GPU systems. Together, streams and
+events maximize performance by enabling parallel execution, load balancing, and
+effective resource utilization across heterogeneous hardware.
+
+.. code-block:: cpp
+
+    #include <hip/hip_runtime.h>
+    #include <iostream>
+
+    // Each thread writes twice its global index into the element of data at that
+    // index. No bounds check is performed.
+    __global__ void simpleKernel(double *data)
+    {
+        const int globalIdx = threadIdx.x + blockDim.x * blockIdx.x;
+        data[globalIdx] = 2.0 * globalIdx;
+    }
+
+    // Times simpleKernel on device 0 and then on device 1. Each device gets its
+    // own stream and a start/stop event pair; the elapsed time between the two
+    // events is printed per device. Returns -1 if fewer than two devices are
+    // reported or the device count cannot be queried.
+    int main()
+    {
+        // Start from 0 so a failed query is treated like "not enough devices".
+        int numDevices = 0;
+        if (hipGetDeviceCount(&numDevices) != hipSuccess || numDevices < 2) {
+            std::cerr << "This example requires at least two GPUs." << std::endl;
+            return -1;
+        }
+
+        constexpr int elementCount    = 1024;
+        constexpr int threadsPerBlock = 128;
+        // Launch exactly one thread per element: simpleKernel has no bounds check,
+        // so a larger grid would write past the end of the allocations.
+        constexpr int blockCount      = elementCount / threadsPerBlock;
+
+        double *deviceData0, *deviceData1;
+        size_t size = elementCount * sizeof(*deviceData0);
+
+        // Create streams and events for each device
+        hipStream_t stream0, stream1;
+        hipEvent_t startEvent0, stopEvent0, startEvent1, stopEvent1;
+
+        // Initialize device 0
+        hipSetDevice(0);
+        hipStreamCreate(&stream0);
+        hipEventCreate(&startEvent0);
+        hipEventCreate(&stopEvent0);
+        hipMalloc(&deviceData0, size);
+
+        // Initialize device 1
+        hipSetDevice(1);
+        hipStreamCreate(&stream1);
+        hipEventCreate(&startEvent1);
+        hipEventCreate(&stopEvent1);
+        hipMalloc(&deviceData1, size);
+
+        // Record the start event on device 0
+        hipSetDevice(0);
+        hipEventRecord(startEvent0, stream0);
+
+        // Launch the kernel asynchronously on device 0
+        simpleKernel<<<blockCount, threadsPerBlock, 0, stream0>>>(deviceData0);
+
+        // Record the stop event on device 0
+        hipEventRecord(stopEvent0, stream0);
+
+        // Wait for the stop event on device 0 to complete
+        hipEventSynchronize(stopEvent0);
+
+        // Record the start event on device 1
+        hipSetDevice(1);
+        hipEventRecord(startEvent1, stream1);
+
+        // Launch the kernel asynchronously on device 1
+        simpleKernel<<<blockCount, threadsPerBlock, 0, stream1>>>(deviceData1);
+
+        // Record the stop event on device 1
+        hipEventRecord(stopEvent1, stream1);
+
+        // Wait for the stop event on device 1 to complete
+        hipEventSynchronize(stopEvent1);
+
+        // Calculate elapsed time between the events for both devices
+        float milliseconds0 = 0, milliseconds1 = 0;
+        hipEventElapsedTime(&milliseconds0, startEvent0, stopEvent0);
+        hipEventElapsedTime(&milliseconds1, startEvent1, stopEvent1);
+
+        std::cout << "Elapsed time on GPU 0: " << milliseconds0 << " ms" << std::endl;
+        std::cout << "Elapsed time on GPU 1: " << milliseconds1 << " ms" << std::endl;
+
+        // Cleanup for device 0
+        hipSetDevice(0);
+        hipEventDestroy(startEvent0);
+        hipEventDestroy(stopEvent0);
+        hipStreamSynchronize(stream0);
+        hipStreamDestroy(stream0);
+        hipFree(deviceData0);
+
+        // Cleanup for device 1
+        hipSetDevice(1);
+        hipEventDestroy(startEvent1);
+        hipEventDestroy(stopEvent1);
+        hipStreamSynchronize(stream1);
+        hipStreamDestroy(stream1);
+        hipFree(deviceData1);
+
+        return 0;
+    }
+
+Peer-to-peer memory access
+===============================================================================
+
+In multi-GPU systems, peer-to-peer memory access enables one GPU to directly
+read or write to the memory of another GPU. This capability reduces data
+transfer times by allowing GPUs to communicate directly without involving the
+host. Enabling peer-to-peer access can significantly improve the performance of
+applications that require frequent data exchange between GPUs, as it eliminates
+the need to transfer data through the host memory.
+
+By adding peer-to-peer access to the example referenced in
+:ref:`multi_device_selection`, data can be copied between devices:
+
+.. tab-set::
+
+    .. tab-item:: with peer-to-peer
+
+        .. code-block:: cpp
+            :emphasize-lines: 31-37, 51-55
+
+            #include <hip/hip_runtime.h>
+            #include <iostream>
+
+            #define HIP_CHECK(expression)                        \
+            {                                                    \
+                const hipError_t status = expression;            \
+                if (status != hipSuccess) {                      \
+                    std::cerr << "HIP error " << status          \
+                            << ": " << hipGetErrorString(status) \
+                            << " at " << __FILE__ << ":"         \
+                            << __LINE__ << std::endl;            \
+                    exit(status);                                \
+                }                                                \
+            }
+
+            __global__ void simpleKernel(double *data)
+            {
+                int idx   = blockIdx.x * blockDim.x + threadIdx.x;
+                data[idx] = idx * 2.0;
+            }
+
+            int main()
+            {
+                double* deviceData0;
+                double* deviceData1;
+                size_t  size = 1024 * sizeof(*deviceData0);
+
+                int deviceId0 = 0;
+                int deviceId1 = 1;
+
+                // Enable peer access to the memory (allocated and future) on the peer device.
+                // Ensure the device is active before enabling peer access.
+                hipSetDevice(deviceId0);
+                hipDeviceEnablePeerAccess(deviceId1, 0);
+
+                hipSetDevice(deviceId1);
+                hipDeviceEnablePeerAccess(deviceId0, 0);
+
+                // Set device 0 and perform operations
+                HIP_CHECK(hipSetDevice(deviceId0)); // Set device 0 as current
+                HIP_CHECK(hipMalloc(&deviceData0, size)); // Allocate memory on device 0
+                simpleKernel<<<1000, 128>>>(deviceData0); // Launch kernel on device 0
+                HIP_CHECK(hipDeviceSynchronize());
+
+                // Set device 1 and perform operations
+                HIP_CHECK(hipSetDevice(deviceId1)); // Set device 1 as current
+                HIP_CHECK(hipMalloc(&deviceData1, size)); // Allocate memory on device 1
+                simpleKernel<<<1000, 128>>>(deviceData1); // Launch kernel on device 1
+                HIP_CHECK(hipDeviceSynchronize());
+
+                // Use peer-to-peer access
+                hipSetDevice(deviceId0);
+
+                // Now device 0 can access memory allocated on device 1
+                hipMemcpy(deviceData0, deviceData1, size, hipMemcpyDeviceToDevice);
+
+                // Copy result from device 0
+                double hostData0[1024];
+                HIP_CHECK(hipSetDevice(deviceId0));
+                HIP_CHECK(hipMemcpy(hostData0, deviceData0, size, hipMemcpyDeviceToHost));
+
+                // Copy result from device 1
+                double hostData1[1024];
+                HIP_CHECK(hipSetDevice(deviceId1));
+                HIP_CHECK(hipMemcpy(hostData1, deviceData1, size, hipMemcpyDeviceToHost));
+
+                // Display results from both devices
+                std::cout << "Device 0 data: " << hostData0[0] << std::endl;
+                std::cout << "Device 1 data: " << hostData1[0] << std::endl;
+
+                // Free device memory
+                HIP_CHECK(hipFree(deviceData0));
+                HIP_CHECK(hipFree(deviceData1));
+
+                return 0;
+            }
+
+    .. tab-item:: without peer-to-peer
+
+        .. code-block:: cpp
+            :emphasize-lines: 43-49, 53, 58
+
+            #include <hip/hip_runtime.h>
+            #include <iostream>
+
+            #define HIP_CHECK(expression)                        \
+            {                                                    \
+                const hipError_t status = expression;            \
+                if (status != hipSuccess) {                      \
+                    std::cerr << "HIP error " << status          \
+                            << ": " << hipGetErrorString(status) \
+                            << " at " << __FILE__ << ":"         \
+                            << __LINE__ << std::endl;            \
+                    exit(status);                                \
+                }                                                \
+            }
+
+            __global__ void simpleKernel(double *data)
+            {
+                int idx   = blockIdx.x * blockDim.x + threadIdx.x;
+                data[idx] = idx * 2.0;
+            }
+
+            int main()
+            {
+                double* deviceData0;
+                double* deviceData1;
+                size_t  size = 1024 * sizeof(*deviceData0);
+
+                int deviceId0 = 0;
+                int deviceId1 = 1;
+
+                // Set device 0 and perform operations
+                HIP_CHECK(hipSetDevice(deviceId0)); // Set device 0 as current
+                HIP_CHECK(hipMalloc(&deviceData0, size)); // Allocate memory on device 0
+                simpleKernel<<<1000, 128>>>(deviceData0); // Launch kernel on device 0
+                HIP_CHECK(hipDeviceSynchronize());
+
+                // Set device 1 and perform operations
+                HIP_CHECK(hipSetDevice(deviceId1)); // Set device 1 as current
+                HIP_CHECK(hipMalloc(&deviceData1, size)); // Allocate memory on device 1
+                simpleKernel<<<1000, 128>>>(deviceData1); // Launch kernel on device 1
+                HIP_CHECK(hipDeviceSynchronize());
+
+                // Attempt to use deviceData0 on device 1 (This will not work as deviceData0 is allocated on device 0)
+                HIP_CHECK(hipSetDevice(deviceId1));
+                hipError_t err = hipMemcpy(deviceData1, deviceData0, size, hipMemcpyDeviceToDevice); // This should fail
+                if (err != hipSuccess)
+                {
+                    std::cout << "Error: Cannot access deviceData0 from device 1, deviceData0 is on device 0" << std::endl;
+                }
+
+                // Copy result from device 0
+                double hostData0[1024];
+                HIP_CHECK(hipSetDevice(deviceId0));
+                HIP_CHECK(hipMemcpy(hostData0, deviceData0, size, hipMemcpyDeviceToHost));
+
+                // Copy result from device 1
+                double hostData1[1024];
+                HIP_CHECK(hipSetDevice(deviceId1));
+                HIP_CHECK(hipMemcpy(hostData1, deviceData1, size, hipMemcpyDeviceToHost));
+
+                // Display results from both devices
+                std::cout << "Device 0 data: " << hostData0[0] << std::endl;
+                std::cout << "Device 1 data: " << hostData1[0] << std::endl;
+
+                // Free device memory
+                HIP_CHECK(hipFree(deviceData0));
+                HIP_CHECK(hipFree(deviceData1));
+
+                return 0;
+            }
@@ -0,0 +1,94 @@
+.. meta::
+   :description: HIP provides an OpenGL interoperability API that allows
+                 efficient data sharing between HIP's computing power and
+                 OpenGL's graphics rendering.
+   :keywords: AMD, ROCm, HIP, OpenGL, interop, interoperability
+
+*******************************************************************************
+OpenGL interoperability
+*******************************************************************************
+
+The HIP--OpenGL interoperation involves mapping OpenGL resources, such as
+buffers and textures, for HIP to interact with OpenGL. This mapping process
+enables HIP to utilize these resources directly, bypassing the need for costly
+data transfers between the CPU and GPU. This capability is useful in
+applications that require both intensive GPU computation and real-time
+visualization.
+
+The graphics resources must first be registered using functions like
+:cpp:func:`hipGraphicsGLRegisterBuffer` or
+:cpp:func:`hipGraphicsGLRegisterImage`. They can then be mapped to HIP with the
+:cpp:func:`hipGraphicsMapResources` function.
+
+After mapping, the :cpp:func:`hipGraphicsResourceGetMappedPointer` or
+:cpp:func:`hipGraphicsSubResourceGetMappedArray` functions are used to retrieve
+a device pointer to the mapped resource, which can then be used in HIP kernels.
+
+Unmapping resources with :cpp:func:`hipGraphicsUnmapResources` after
+computations ensures proper resource management.
+
+Example
+===============================================================================
+
+ROCm examples have a `HIP--OpenGL interoperation example <https://github.com/ROCm/rocm-examples/tree/develop/HIP-Basic/opengl_interop>`_,
+where a simple HIP kernel simulates a sine wave that is rendered to a window
+as a grid of triangles using OpenGL. A working example requires multiple
+initialization steps, such as creating and opening a window, initializing
+OpenGL, and selecting the OpenGL-capable device. After the initialization in
+the example, the kernel simulates the sine wave and updates the window's
+framebuffer in a cycle until the window is closed.
+
+.. note::
+
+   The more recent OpenGL functions are loaded with `OpenGL loader <https://github.com/ROCm/rocm-examples/tree/develop/External/glad>`_,
+   as these are not loaded by default on all platforms. The use of a custom
+   loader is shown in the following example:
+
+   .. <!-- spellcheck-disable -->
+
+   .. literalinclude:: ../../tools/example_codes/opengl_interop.hip
+      :start-after: // [Sphinx opengl functions load start]
+      :end-before: // [Sphinx opengl functions load end]
+      :language: cpp
+
+   .. <!-- spellcheck-enable -->
+
+The OpenGL buffer is imported to HIP in the following way:
+
+.. <!-- spellcheck-disable -->
+
+.. literalinclude:: ../../tools/example_codes/opengl_interop.hip
+   :start-after: // [Sphinx buffer register and get start]
+   :end-before: // [Sphinx buffer register and get end]
+   :language: cpp
+
+.. <!-- spellcheck-enable -->
+
+The imported pointer is manipulated in the sinewave kernel as shown in the
+following example:
+
+.. <!-- spellcheck-disable -->
+
+.. literalinclude:: ../../tools/example_codes/opengl_interop.hip
+   :start-after: /// [Sphinx sinewave kernel start]
+   :end-before: /// [Sphinx sinewave kernel end]
+   :language: cpp
+
+.. literalinclude:: ../../tools/example_codes/opengl_interop.hip
+   :start-after: // [Sphinx buffer use in kernel start]
+   :end-before: // [Sphinx buffer use in kernel end]
+   :language: cpp
+
+.. <!-- spellcheck-enable -->
+
+A HIP graphics resource that was imported from an OpenGL buffer and is no
+longer needed should be unmapped and unregistered, as shown in the following
+example:
+
+.. <!-- spellcheck-disable -->
+
+.. literalinclude:: ../../tools/example_codes/opengl_interop.hip
+   :start-after: // [Sphinx unregister start]
+   :end-before: // [Sphinx unregister end]
+   :language: cpp
+
+.. <!-- spellcheck-enable -->
@@ -33,11 +33,12 @@ The value of this variable controls your logging level. Levels are defined as fo
 .. code-block:: cpp

  enum LogLevel {
-    LOG_NONE    = 0,
-    LOG_ERROR   = 1,
-    LOG_WARNING = 2,
-    LOG_INFO    = 3,
-    LOG_DEBUG   = 4
+    LOG_NONE           = 0,
+    LOG_ERROR          = 1,
+    LOG_WARNING        = 2,
+    LOG_INFO           = 3,
+    LOG_DEBUG          = 4,
+    LOG_EXTRA_DEBUG    = 5
  };

 .. tip::
@@ -55,26 +56,27 @@ change this to any of the valid values:
 .. code-block:: cpp

  enum LogMask {
-    LOG_API       = 0x00000001, //!< API call
-    LOG_CMD       = 0x00000002, //!< Kernel and Copy Commands and Barriers
-    LOG_WAIT      = 0x00000004, //!< Synchronization and waiting for commands to finish
-    LOG_AQL       = 0x00000008, //!< Decode and display AQL packets
-    LOG_QUEUE     = 0x00000010, //!< Queue commands and queue contents
-    LOG_SIG       = 0x00000020, //!< Signal creation, allocation, pool
-    LOG_LOCK      = 0x00000040, //!< Locks and thread-safety code.
-    LOG_KERN      = 0x00000080, //!< kernel creations and arguments, etc.
-    LOG_COPY      = 0x00000100, //!< Copy debug
-    LOG_COPY2     = 0x00000200, //!< Detailed copy debug
-    LOG_RESOURCE  = 0x00000400, //!< Resource allocation, performance-impacting events.
-    LOG_INIT      = 0x00000800, //!< Initialization and shutdown
-    LOG_MISC      = 0x00001000, //!< misc debug, not yet classified
-    LOG_AQL2      = 0x00002000, //!< Show raw bytes of AQL packet
-    LOG_CODE      = 0x00004000, //!< Show code creation debug
-    LOG_CMD2      = 0x00008000, //!< More detailed command info, including barrier commands
-    LOG_LOCATION  = 0x00010000, //!< Log message location
-    LOG_MEM       = 0x00020000, //!< Memory allocation
-    LOG_MEM_POOL  = 0x00040000, //!< Memory pool allocation, including memory in graphs
-    LOG_ALWAYS    = 0xFFFFFFFF, //!< Log always even mask flag is zero
+    LOG_API       = 1,      //!< (0x1)     API call
+    LOG_CMD       = 2,      //!< (0x2)     Kernel and Copy Commands and Barriers
+    LOG_WAIT      = 4,      //!< (0x4)     Synchronization and waiting for commands to finish
+    LOG_AQL       = 8,      //!< (0x8)     Decode and display AQL packets
+    LOG_QUEUE     = 16,     //!< (0x10)    Queue commands and queue contents
+    LOG_SIG       = 32,     //!< (0x20)    Signal creation, allocation, pool
+    LOG_LOCK      = 64,     //!< (0x40)    Locks and thread-safety code.
+    LOG_KERN      = 128,    //!< (0x80)    Kernel creations and arguments, etc.
+    LOG_COPY      = 256,    //!< (0x100)   Copy debug
+    LOG_COPY2     = 512,    //!< (0x200)   Detailed copy debug
+    LOG_RESOURCE  = 1024,   //!< (0x400)   Resource allocation, performance-impacting events.
+    LOG_INIT      = 2048,   //!< (0x800)   Initialization and shutdown
+    LOG_MISC      = 4096,   //!< (0x1000)  Misc debug, not yet classified
+    LOG_AQL2      = 8192,   //!< (0x2000)  Show raw bytes of AQL packet
+    LOG_CODE      = 16384,  //!< (0x4000)  Show code creation debug
+    LOG_CMD2      = 32768,  //!< (0x8000)  More detailed command info, including barrier commands
+    LOG_LOCATION  = 65536,  //!< (0x10000) Log message location
+    LOG_MEM       = 131072, //!< (0x20000) Memory allocation
+    LOG_MEM_POOL  = 262144, //!< (0x40000) Memory pool allocation, including memory in graphs
+    LOG_TS        = 524288, //!< (0x80000) Timestamp details
+    LOG_ALWAYS    = -1      //!< (0xFFFFFFFF) Log always even mask flag is zero
  };

 You can also define the logging mask via the ``AMD_LOG_MASK`` environment variable.
@@ -41,7 +41,7 @@ the host or parallel to the devices.

 For parallel workloads, when threads belonging to the same block need to
 synchronize to share data, use :cpp:func:`__syncthreads()` (see:
-:ref:`synchronization functions`) within the same kernel invocation. For threads
+:ref:`synchronization_functions`) within the same kernel invocation. For threads
 belonging to different blocks, use global memory with two separate
 kernel invocations. It is recommended to avoid the latter approach as it adds
 overhead.
@@ -151,7 +151,7 @@ and is generally reduced when addresses are more scattered, especially in
 global memory.

 Device memory is accessed via 32-, 64-, or 128-byte transactions that must be
-naturally aligned. 
+naturally aligned.
 Maximizing memory throughput involves:

 - Coalescing memory accesses of threads within a warp into minimal transactions.
@@ -294,7 +294,7 @@ Applications frequently allocating and freeing memory might experience slower
 allocation calls over time as memory is released back to the operating system.
 To optimize performance in such scenarios, follow these guidelines:

- Avoid allocating all available memory with :cpp:func:`hipMalloc` or 
+- Avoid allocating all available memory with :cpp:func:`hipMalloc` or
  :cpp:func:`hipHostMalloc`, as this immediately reserves memory and might
  prevent other applications from using it. This behavior could strain the
  operating system schedulers or prevent other applications from running on the
@@ -309,7 +309,7 @@ To optimize performance in such scenarios, follow these guidelines:
  performance, they allow the application to continue running.
 - For supported platforms, use :cpp:func:`hipMallocManaged`, as it allows
  oversubscription. With the right policies, :cpp:func:`hipMallocManaged` can
-  maintain most, if not all, :cpp:func:`hipMalloc` performance. 
+  maintain most, if not all, :cpp:func:`hipMalloc` performance.
  :cpp:func:`hipMallocManaged` doesn't require an allocation to be resident
  until it is needed or prefetched, which eases the load on the operating
  system's schedulers and facilitates multitenant scenarios.
@@ -1,212 +0,0 @@
-# HIP programming manual
-
-## Host Memory
-
-### Introduction
-
-`hipHostMalloc` allocates pinned host memory, which is mapped into the address space of all GPUs in the system. This memory can be accessed directly by the GPU device, and can be read or written with much higher bandwidth than pageable memory obtained with functions such as `malloc()`.
-There are two use cases for this host memory:
-
-* Faster `HostToDevice` and `DeviceToHost` Data Transfers:
-The runtime tracks the `hipHostMalloc` allocations and can avoid some of the setup required for regular unpinned memory.  For exact measurements on a specific system, experiment with `--unpinned` and `--pinned` switches for the `hipBusBandwidth` tool.
-* Zero-Copy GPU Access:
-GPU can directly access the host memory over the CPU/GPU interconnect, without need to copy the data.  This avoids the need for the copy, but during the kernel access each memory access must traverse the interconnect, which can be tens of times slower than accessing the GPU's local device memory.  Zero-copy memory can be a good choice when the memory accesses are infrequent (perhaps only once).  Zero-copy memory is typically "Coherent" and thus not cached by the GPU but this can be overridden if desired.
-
-### Memory allocation flags
-
-The flags parameter specifies options for how the memory is allocated, for example:
-`hipHostMallocPortable`, the memory is considered allocated by all contexts, not just the one on which the allocation is made.
-`hipHostMallocMapped`, will map the allocation into the address space for the current device, and the device pointer can be obtained with the API `hipHostGetDevicePointer()`.
-`hipHostMallocNumaUser` is the flag to allow host memory allocation to follow Numa policy by user. Please note this flag is currently only applicable on Linux, under development on Windows.
-
-All allocation flags are independent, and can be used in any combination without restriction, for instance, `hipHostMalloc` can be called with both `hipHostMallocPortable` and `hipHostMallocMapped` flags set. Both usage models described above use the same allocation flags, and the difference is in how the surrounding code uses the host memory.
-
-### Numa-aware host memory allocation
-
-Numa policy determines how memory is allocated.
-Target of Numa policy is to select a CPU that is closest to each GPU.
-Numa distance is a measure of how far apart the GPU and CPU devices are.
-
-By default, each GPU selects a Numa CPU node that has the least Numa distance between them, that is, host memory will be automatically allocated closest on the memory pool of Numa node of the current GPU device. Using `hipSetDevice` API to a different GPU will still be able to access the host allocation, but can have longer Numa distance.
-Note, Numa policy is so far implemented on Linux, and under development on Windows.
-
-### Coherency Controls
-
-ROCm defines two coherency options for host memory:
-
-* Coherent memory : Supports fine-grain synchronization while the kernel is running.  For example, a kernel can perform atomic operations that are visible to the host CPU or to other (peer) GPUs.  Synchronization instructions include `threadfence_system` and C++11-style atomic operations.
-In order to achieve this fine-grained coherence, many AMD GPUs use a limited cache policy, such as leaving these allocations uncached by the GPU, or making them read-only.
-
-* Non-coherent memory : Can be cached by GPU, but cannot support synchronization while the kernel is running.  Non-coherent memory can be optionally synchronized only at command (end-of-kernel or copy command) boundaries.  This memory is appropriate for high-performance access when fine-grain synchronization is not required.
-
-HIP provides the developer with controls to select which type of memory is used via allocation flags passed to `hipHostMalloc` and the `HIP_HOST_COHERENT` environment variable. By default, the environment variable HIP_HOST_COHERENT is set to 0 in HIP.
-The control logic in the current version of HIP is as follows:
-
-* No flags are passed in: the host memory allocation is coherent, the HIP_HOST_COHERENT environment variable is ignored.
-* `hipHostMallocCoherent=1`: The host memory allocation will be coherent, the HIP_HOST_COHERENT environment variable is ignored.
-* `hipHostMallocMapped=1`: The host memory allocation will be coherent, the HIP_HOST_COHERENT environment variable is ignored.
-* `hipHostMallocNonCoherent=1`, `hipHostMallocCoherent=0`, and `hipHostMallocMapped=0`: The host memory will be non-coherent, the HIP_HOST_COHERENT environment variable is ignored.
-* `hipHostMallocCoherent=0`, `hipHostMallocNonCoherent=0`, `hipHostMallocMapped=0`, but one of the other `HostMalloc` flags is set:
-  * If `HIP_HOST_COHERENT` is defined as 1, the host memory allocation is coherent.
-  * If `HIP_HOST_COHERENT` is not defined, or defined as 0, the host memory allocation is non-coherent.
-* `hipHostMallocCoherent=1`, `hipHostMallocNonCoherent=1`: Illegal.
-
-### Visibility of Zero-Copy Host Memory
-
-Coherent host memory is automatically visible at synchronization points.
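-Non-coherent host memory visibility depends on the synchronization API that is used, as shown in the following table: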
-
-| HIP API              | Synchronization Effect                                                         | Fence                | Coherent Host Memory Visibility | Non-Coherent Host Memory Visibility|
-| ---                  | ---                                                                            | ---                  | ---                            | --- |
-| `hipStreamSynchronize` | host waits for all commands in the specified stream to complete                | system-scope release | yes                        | yes   |
-| `hipDeviceSynchronize` | host waits for all commands in all streams on the specified device to complete | system-scope release | yes                        | yes   |
-| `hipEventSynchronize`  | host waits for the specified event to complete                                 | device-scope release | yes                        | depends - see below|
-| `hipStreamWaitEvent`   | stream waits for the specified event to complete                               | none                 | yes                        | no   |
-
-### `hipEventSynchronize`
-
-Developers can control the release scope for `hipEvents`:
-
-* By default, the GPU performs a device-scope acquire and release operation with each recorded event.  This will make host and device memory visible to other commands executing on the same device.
-
-A stronger system-level fence can be specified when the event is created with `hipEventCreateWithFlags`:
-
-* `hipEventReleaseToSystem`: Perform a system-scope release operation when the event is recorded.  This will make both Coherent and Non-Coherent host memory visible to other agents in the system, but may involve heavyweight operations such as cache flushing.  Coherent memory will typically use lighter-weight in-kernel synchronization mechanisms such as an atomic operation and thus does not need to use `hipEventReleaseToSystem`.
-* `hipEventDisableTiming`: Events created with this flag will not record profiling data and provide the best performance if used for synchronization.
-
-### Summary and Recommendations
-
-* Coherent host memory is the default and is the easiest to use since the memory is visible to the CPU at typical synchronization points.  This memory allows in-kernel synchronization commands such as `threadfence_system` to work transparently.
-* HIP/ROCm also supports the ability to cache host memory in the GPU using the "Non-Coherent" host memory allocations. This can provide performance benefit, but care must be taken to use the correct synchronization.
-
-### Managed memory allocation
-
-Managed memory, including the `__managed__` keyword, is supported in HIP combined host/device compilation, on Linux, not on Windows (under development).
-
-Managed memory, via unified memory allocation, allows data to be shared and accessible to both the CPU and GPU using a single pointer.
-The allocation will be managed by AMD GPU driver using the Linux HMM (Heterogeneous Memory Management) mechanism, the user can call managed memory API `hipMallocManaged` to allocate a large chunk of HMM memory, execute kernels on device and fetch data between the host and device as needed.
-
-In a HIP application, it is recommended to do the capability check before calling the managed memory APIs. For example:
-
-```cpp
-int managed_memory = 0;
-HIPCHECK(hipDeviceGetAttribute(&managed_memory,
-  hipDeviceAttributeManagedMemory,p_gpuDevice));
-
-if (!managed_memory ) {
-  printf ("info: managed memory access not supported on the device %d\n Skipped\n", p_gpuDevice);
-}
-else {
-  HIPCHECK(hipSetDevice(p_gpuDevice));
-  HIPCHECK(hipMallocManaged(&Hmm, N * sizeof(T)));
-. . .
-}
-```
-
-Please note, the managed memory capability check may not be necessary, but if HMM is not supported, then managed malloc will fall back to using system memory and other managed memory API calls will have undefined behavior.
-
-Note, managed memory management is implemented on Linux, not supported on Windows yet.
-
-### HIP Stream Memory Operations
-
-HIP supports Stream Memory Operations to enable direct synchronization between Network Nodes and GPU. The following new APIs are added:
-  `hipStreamWaitValue32`
-  `hipStreamWaitValue64`
-  `hipStreamWriteValue32`
-  `hipStreamWriteValue64`
-
-Note, CPU access to the semaphore's memory requires volatile keyword to disable CPU compiler's optimizations on memory access.
-For more details, please check the documentation `HIP-API.pdf`.
-
-Please note, HIP stream does not guarantee concurrency on AMD hardware for the case of multiple (at least 6) long-running streams executing concurrently, using `hipStreamSynchronize(nullptr)` for synchronization.
-
-## Direct Dispatch
-
-HIP runtime has Direct Dispatch enabled by default in ROCM 4.4 on Linux.
-With this feature we move away from our conventional producer-consumer model where the runtime creates a worker thread(consumer) for each HIP Stream, and the host thread(producer) enqueues commands to a command queue(per stream).
-
-For Direct Dispatch, HIP runtime would directly enqueue a packet to the AQL queue (user mode queue on GPU) on the Dispatch API call from the application. That has shown to reduce the latency to launch the first wave on the idle GPU and total time of tiny dispatches synchronized with the host.
-
-In addition, eliminating the threads in runtime has reduced the variance in the dispatch numbers as the thread scheduling delays and atomics/locks synchronization latencies are reduced.
-
-This feature can be disabled via setting the following environment variable,
-AMD_DIRECT_DISPATCH=0
-
-Note, Direct Dispatch is implemented on Linux. It is currently not supported on Windows.
-
-## HIP Runtime Compilation
-
-HIP now supports runtime compilation (HIP RTC), the usage of which will provide the possibility of optimizations and performance improvement compared with other APIs via regular offline static compilation.
-
-HIP RTC APIs accept HIP source files in character string format as input parameters and create handles of programs by compiling the HIP source files without spawning separate processes.
-
-For more details on HIP RTC APIs, refer to [HIP Runtime API Reference](../doxygen/html/index).
-
-For Linux developers, the link [here](https://github.com/ROCm/hip-tests/blob/develop/samples/2_Cookbook/23_cmake_hiprtc/saxpy.cpp) shows an example how to program HIP application using runtime compilation mechanism, and a detailed [HIP RTC programming guide](./hip_rtc) is also available.
-
-## HIP Graph
-
-HIP graphs are supported. For more details, refer to the [HIP API Guide](../doxygen/html/group___graph) or the [how-to section for HIP graphs](../how-to/hipgraph).
-
-## Device-Side Malloc
-
-HIP-Clang now supports device-side malloc and free.
-This implementation does not require the use of `hipDeviceSetLimit(hipLimitMallocHeapSize,value)`, nor does it respect any such setting. The heap is fully dynamic and can grow until the available free memory on the device is consumed.
-
-## Use of Per-thread default stream
-
-The per-thread default stream is supported in HIP. It is an implicit stream local to both the thread and the current device. This means that the command issued to the per-thread default stream by the thread does not implicitly synchronize with other streams (like explicitly created streams), or default per-thread stream on other threads.
-The per-thread default stream is a blocking stream and will synchronize with the default null stream if both are used in a program.
-The per-thread default stream can be enabled via adding a compilation option,
-`-fgpu-default-stream=per-thread`.
-
-And users can explicitly use `hipStreamPerThread` as per-thread default stream handle as input in API commands. There are test codes as examples in the [link](https://github.com/ROCm/hip-tests/tree/develop/catch/unit/streamperthread).
-
-## Use of Long Double Type
-
-In HIP-Clang, long double type is 80-bit extended precision format for x86_64, which is not supported by AMDGPU. HIP-Clang treats long double type as IEEE double type for AMDGPU. Using long double type in HIP source code will not cause issue as long as data of long double type is not transferred between host and device. However, long double type should not be used as kernel argument type.
-
-## Use of `_Float16` Type
-
-If a host function is to be used between clang (or hipcc) and gcc for x86_64, i.e. its definition is compiled by one compiler but the caller is compiled by a different compiler, `_Float16` or aggregates containing `_Float16` should not be used as function argument or return type. This is due to lack of stable ABI for `_Float16` on x86_64. Passing `_Float16` or aggregates containing `_Float16` between clang and gcc could cause undefined behavior.
-
-## FMA and contractions
-
-By default HIP-Clang assumes `-ffp-contract=fast-honor-pragmas`.
-Users can use `#pragma clang fp contract(on|off|fast)` to control `fp` contraction of a block of code.
-For x86_64, FMA is off by default since the generic x86_64 target does not
-support FMA by default. To turn on FMA on x86_64, either use `-mfma` or `-march=native`
-on CPU's supporting FMA.
-
-When contractions are enabled and the CPU has not enabled FMA instructions, the
-GPU can produce different numerical results than the CPU for expressions that
-can be contracted. Tolerance should be used for floating point comparisons.
-
-## Math functions with special rounding modes
-
-Note: Currently, HIP only supports basic math functions with rounding mode `rn` (round to nearest). HIP does not support basic math functions with rounding modes `ru` (round up), `rd` (round down), and `rz` (round towards zero).
-
-## Creating Static Libraries
-
-HIP-Clang supports generating two types of static libraries. The first type of static library does not export device functions, and only exports and launches host functions within the same library. The advantage of this type is the ability to link with a non-hipcc compiler such as gcc. The second type exports device functions to be linked by other code objects. However, this requires using hipcc as the linker.
-
-In addition, the first type of library contains host objects with device code embedded as fat binaries. It is generated using the flag --emit-static-lib. The second type of library contains relocatable device objects and is generated using `ar`.
-
-Here is an example to create and use static libraries:
-
-* Type 1 using `--emit-static-lib`:
-
-    ```cpp
-    hipcc hipOptLibrary.cpp --emit-static-lib -fPIC -o libHipOptLibrary.a
-    gcc test.cpp -L. -lhipOptLibrary -L/path/to/hip/lib -lamdhip64 -o test.out
-    ```
-
-* Type 2 using system `ar`:
-
-    ```cpp
-    hipcc hipDevice.cpp -c -fgpu-rdc -o hipDevice.o
-    ar rcsD libHipDevice.a hipDevice.o
-    hipcc libHipDevice.a test.cpp -fgpu-rdc -o test.out
-    ```
-
-For more information, please see [HIP samples host functions](https://github.com/ROCm/hip-tests/tree/develop/samples/2_Cookbook/15_static_library/host_functions) and [device_functions](https://github.com/ROCm/hip-tests/tree/rocm-5.5.x/samples/2_Cookbook/15_static_library/device_functions).
@@ -1,577 +0,0 @@
-.. meta::
-  :description: This chapter introduces Unified Memory (UM) and shows
-                how to use it in AMD HIP.
-  :keywords: AMD, ROCm, HIP, CUDA, unified memory, unified, memory, UM, APU
-
-*******************************************************************************
-Unified memory
-*******************************************************************************
-
-In conventional architectures, CPUs and GPUs have dedicated memory like Random
-Access Memory (RAM) and Video Random Access Memory (VRAM). This architectural
-design, while effective, can be limiting in terms of memory capacity and
-bandwidth, as continuous memory copying is required to allow the processors to
-access the appropriate data. New architectural features like Heterogeneous
-System Architectures (HSA) and Unified Memory (UM) help avoid these limitations
-and promise increased efficiency and innovation.
-
-Unified memory
-==============
-Unified Memory is a single memory address space accessible from any processor
-within a system. This setup simplifies memory management processes and enables
-applications to allocate data that can be read or written by code running on
-either CPUs or GPUs. The Unified memory model is shown in the following figure.
-
-.. figure:: ../data/unified_memory/um.svg
-
-AMD Accelerated Processing Unit (APU) is a typical example of a Unified Memory
-Architecture. On a single die, a central processing unit (CPU) is combined
-with an integrated graphics processing unit (iGPU), and both have access to a
-high-bandwidth memory (HBM) module named Unified Memory. The CPU enables
-high-performance, low-latency operations, while the GPU is optimized for high
-throughput (data processed per unit time).
-
-.. _unified memory system requirements:
-
-System requirements
-===================
-Unified memory is supported on Linux by all modern AMD GPUs from the Vega
-series onward. Unified memory management can be achieved with managed memory
-allocation and, for the latest GPUs, with a system allocator.
-
-The table below lists the supported allocators. The allocators are described in
-the next section.
-
-.. list-table:: Supported Unified Memory Allocators
-    :widths: 40, 25, 25, 25
-    :header-rows: 1
-    :align: center
-
-    * - Architecture
-      - ``hipMallocManaged()``
-      - ``__managed__``
-      - ``malloc()``
-    * - MI200, MI300 Series
-      - ✅
-      - ✅
-      - ✅ :sup:`1`
-    * - MI100
-      - ✅
-      - ✅
-      - ❌
-    * - RDNA (Navi) Series
-      - ✅
-      - ✅
-      - ❌
-    * - GCN5 (Vega) Series
-      - ✅
-      - ✅
-      - ❌
-
-✅: **Supported**
-
-❌: **Unsupported**
-
-:sup:`1` Works only with ``XNACK=1``. First GPU access causes recoverable
-page-fault. For more details, visit
-`GPU memory <https://rocm.docs.amd.com/en/latest/conceptual/gpu-memory.html#xnack>`_.
-
-.. _unified memory programming models:
-
-Unified memory programming models
-=================================
-
-Showcasing various unified memory programming models, the model availability
-depends on your architecture. For more information, see :ref:`unified memory
-system requirements` and :ref:`checking unified memory management support`.
-
- **HIP managed memory allocation API**:
-
-  The ``hipMallocManaged()`` is a dynamic memory allocator available on
-  all GPUs with unified memory support. For more details, visit
-  :ref:`unified_memory_reference`.
-
- **HIP managed variables**:
-
-  The ``__managed__`` declaration specifier, which serves as its counterpart,
-  is supported on all modern AMD cards and can be utilized for static
-  allocation.
-
- **System allocation API**:
-
-  Starting with the AMD MI300 series, the ``malloc()`` system allocator allows
-  you to reserve unified memory. The system allocator is more versatile and
-  offers an easy transition from a CPU written C++ code to a HIP code as the
-  same system allocation API is used.
-
-.. _checking unified memory management support:
-
-Checking unified memory management support
------------------------------------------
-Some device attributes can offer information about which :ref:`unified memory
-programming models` are supported. The attribute value is 1 if the
-functionality is supported, and 0 if it is not supported.
-
-.. list-table:: Device attributes for unified memory management
-    :widths: 40, 60
-    :header-rows: 1
-    :align: center
-
-    * - attribute
-      - description
-    * - ``hipDeviceAttributeManagedMemory``
-      - unified addressing is supported
-    * - ``hipDeviceAttributeConcurrentManagedAccess``
-      - full managed memory support, concurrent access is supported
-    * - ``hipDeviceAttributePageableMemoryAccess``
-      - both managed and system memory allocation API is supported
-
-The following examples show how to use device attributes:
-
-.. code-block:: cpp
-
-    #include <hip/hip_runtime.h>
-    #include <iostream>
-
-    int main() {
-        int d;
-        hipGetDevice(&d);
-
-        int is_cma = 0;
-        hipDeviceGetAttribute(&is_cma, hipDeviceAttributeConcurrentManagedAccess, d);
-        std::cout << "HIP Managed Memory: "
-                  << (is_cma == 1 ? "is" : "NOT")
-                  << " supported" << std::endl;
-        return 0;
-    }
-
-Example for unified memory management
-------------------------------------
-
-The following example shows how to use unified memory management with the
-``hipMallocManaged()`` function, with the ``__managed__`` attribute for static
-allocation, and with standard ``malloc()`` allocation. For comparison, the
-Explicit Memory Management example is presented in the last tab.
-
-.. tab-set::
-
-    .. tab-item:: hipMallocManaged()
-
-        .. code-block:: cpp
-            :emphasize-lines: 12-15
-
-            #include <hip/hip_runtime.h>
-            #include <iostream>
-
-            // Addition of two values.
-            __global__ void add(int *a, int *b, int *c) {
-                *c = *a + *b;
-            }
-
-            int main() {
-                int *a, *b, *c;
-
-                // Allocate memory for a, b and c that is accessible to both device and host codes.
-                hipMallocManaged(&a, sizeof(*a));
-                hipMallocManaged(&b, sizeof(*b));
-                hipMallocManaged(&c, sizeof(*c));
-
-                // Setup input values.
-                *a = 1;
-                *b = 2;
-
-                // Launch add() kernel on GPU.
-                hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
-
-                // Wait for GPU to finish before accessing on host.
-                hipDeviceSynchronize();
-
-                // Prints the result.
-                std::cout << *a << " + " << *b << " = " << *c << std::endl;
-
-                // Cleanup allocated memory.
-                hipFree(a);
-                hipFree(b);
-                hipFree(c);
-
-                return 0;
-            }
-
-
-    .. tab-item:: __managed__
-
-        .. code-block:: cpp
-            :emphasize-lines: 9-10
-
-            #include <hip/hip_runtime.h>
-            #include <iostream>
-
-            // Addition of two values.
-            __global__ void add(int *a, int *b, int *c) {
-                *c = *a + *b;
-            }
-
-            // Declare a, b and c as static variables.
-            __managed__ int a, b, c;
-
-            int main() {
-                // Setup input values.
-                a = 1;
-                b = 2;
-
-                // Launch add() kernel on GPU.
-                hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, &a, &b, &c);
-
-                // Wait for GPU to finish before accessing on host.
-                hipDeviceSynchronize();
-
-                // Prints the result.
-                std::cout << a << " + " << b << " = " << c << std::endl;
-
-                return 0;
-            }
-
-
-    .. tab-item:: malloc()
-
-        .. code-block:: cpp
-            :emphasize-lines: 12-15
-
-            #include <hip/hip_runtime.h>
-            #include <iostream>
-
-            // Addition of two values.
-            __global__ void add(int* a, int* b, int* c) {
-                *c = *a + *b;
-            }
-
-            int main() {
-                int* a, * b, * c;
-
-                // Allocate memory for a, b, and c.
-                a = (int*)malloc(sizeof(*a));
-                b = (int*)malloc(sizeof(*b));
-                c = (int*)malloc(sizeof(*c));
-
-                // Setup input values.
-                *a = 1;
-                *b = 2;
-
-                // Launch add() kernel on GPU.
-                hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
-
-                // Wait for GPU to finish before accessing on host.
-                hipDeviceSynchronize();
-
-                // Prints the result.
-                std::cout << *a << " + " << *b << " = " << *c << std::endl;
-
-                // Cleanup allocated memory.
-                free(a);
-                free(b);
-                free(c);
-
-                return 0;
-            }
-
-
-    .. tab-item:: Explicit Memory Management
-
-        .. code-block:: cpp
-            :emphasize-lines: 17-24, 29-30
-
-            #include <hip/hip_runtime.h>
-            #include <iostream>
-
-            // Addition of two values.
-            __global__ void add(int *a, int *b, int *c) {
-                *c = *a + *b;
-            }
-
-            int main() {
-                int a, b, c;
-                int *d_a, *d_b, *d_c;
-
-                // Setup input values.
-                a = 1;
-                b = 2;
-
-                // Allocate device copies of a, b and c.
-                hipMalloc(&d_a, sizeof(*d_a));
-                hipMalloc(&d_b, sizeof(*d_b));
-                hipMalloc(&d_c, sizeof(*d_c));
-
-                // Copy input values to device.
-                hipMemcpy(d_a, &a, sizeof(*d_a), hipMemcpyHostToDevice);
-                hipMemcpy(d_b, &b, sizeof(*d_b), hipMemcpyHostToDevice);
-
-                // Launch add() kernel on GPU.
-                hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, d_a, d_b, d_c);
-
-                // Copy the result back to the host.
-                hipMemcpy(&c, d_c, sizeof(*d_c), hipMemcpyDeviceToHost);
-
-                // Cleanup allocated memory.
-                hipFree(d_a);
-                hipFree(d_b);
-                hipFree(d_c);
-
-                // Prints the result.
-                std::cout << a << " + " << b << " = " << c << std::endl;
-
-                return 0;
-            }
-
-.. _using unified memory management:
-
-Using unified memory management (UMM)
-=====================================
-
-Unified memory management (UMM) is a feature that can simplify the complexities
-of memory management in GPU computing. It is particularly useful in
-heterogeneous computing environments with heavy memory usage with both a CPU
-and a GPU, which would require large memory transfers. Here are some areas
-where UMM can be beneficial:
-
- **Simplification of Memory Management**:
-
-  UMM can help to simplify the complexities of memory management. This can make
-  it easier for developers to write code without worrying about memory
-  allocation and deallocation details.
-
- **Data Migration**:
-
-  UMM allows for efficient data migration between the host (CPU) and the device
-  (GPU). This can be particularly useful for applications that need to move
-  data back and forth between the device and host.
-
- **Improved Programming Productivity**:
-
-  As a positive side effect, UMM can reduce the lines of code, thereby
-  improving programming productivity.
-
-In HIP, pinned memory allocations are coherent by default. Pinned memory is
-host memory mapped into the address space of all GPUs, meaning that the pointer
-can be used on both host and device. Using pinned memory instead of pageable
-memory on the host can improve bandwidth.
-
-While UMM can provide numerous benefits, it's important to be aware of the
-potential performance overhead associated with UMM. You must thoroughly test
-and profile your code to ensure it's the most suitable choice for your use
-case.
-
-.. _unified memory runtime hints:
-
-Unified memory HIP runtime hints for the better performance
-===========================================================
-
-Unified memory HIP runtime hints can help improve the performance of your code if
-you know your code's ability and infrastructure. Some hint techniques are
-presented in this section.
-
-The hint functions can set actions on a selected device, which can be
-identified by ``hipGetDeviceProperties(&prop, device_id)``. There are two
-special ``device_id`` values:
-
- ``hipCpuDeviceId`` = -1 means that the advised device is the CPU.
- ``hipInvalidDeviceId`` = -2 means that the device is invalid.
-
-For the best performance, profile your application to optimize the
-utilization of HIP runtime hints.
-
-Data prefetching
----------------
-
-Data prefetching is a technique used to improve the performance of your
-application by moving data closer to the processing unit before it's actually
-needed.
-
-.. code-block:: cpp
-    :emphasize-lines: 20-23,31-32
-
-    // Addition of two values.
-    __global__ void add(int *a, int *b, int *c) {
-        *c = *a + *b;
-    }
-
-    int main() {
-        int *a, *b, *c;
-        int deviceId;
-        hipGetDevice(&deviceId); // Get the current device ID
-
-        // Allocate memory for a, b and c that is accessible to both device and host codes.
-        hipMallocManaged(&a, sizeof(*a));
-        hipMallocManaged(&b, sizeof(*b));
-        hipMallocManaged(&c, sizeof(*c));
-
-        // Setup input values.
-        *a = 1;
-        *b = 2;
-
-        // Prefetch the data to the GPU device.
-        hipMemPrefetchAsync(a, sizeof(*a), deviceId, 0);
-        hipMemPrefetchAsync(b, sizeof(*b), deviceId, 0);
-        hipMemPrefetchAsync(c, sizeof(*c), deviceId, 0);
-
-        // Launch add() kernel on GPU.
-        hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
-
-        // Wait for GPU to finish before accessing on host.
-        hipDeviceSynchronize();
-
-        // Prefetch the result back to the CPU.
-        hipMemPrefetchAsync(c, sizeof(*c), hipCpuDeviceId, 0);
-
-        // Wait for the prefetch operations to complete.
-        hipDeviceSynchronize();
-
-        // Prints the result.
-        std::cout << *a << " + " << *b << " = " << *c << std::endl;
-
-        // Cleanup allocated memory.
-        hipFree(a);
-        hipFree(b);
-        hipFree(c);
-
-        return 0;
-    }
-
-Remember to check the return status of ``hipMemPrefetchAsync()`` to ensure that
-the prefetch operations are completed successfully.
-
-Memory advice
-------------
-
-The effectiveness of ``hipMemAdvise()`` comes from its ability to inform the
-runtime system of the developer's intentions regarding memory usage. When the
-runtime system has knowledge of the expected memory access patterns, it can
-make better decisions about data placement and caching, leading to more
-efficient execution of the application. However, the actual impact on
-performance can vary based on the specific use case and the hardware
-architecture.
-
-For the description of ``hipMemAdvise()`` and the detailed list of advice,
-visit the :ref:`unified_memory_reference`.
-
-Here is the updated version of the example above with memory advice.
-
-.. code-block:: cpp
-    :emphasize-lines: 17-26
-
-    #include <hip/hip_runtime.h>
-    #include <iostream>
-
-    // Addition of two values.
-    __global__ void add(int *a, int *b, int *c) {
-        *c = *a + *b;
-    }
-
-    int main() {
-        int *a, *b, *c;
-
-        // Allocate memory for a, b, and c accessible to both device and host codes.
-        hipMallocManaged(&a, sizeof(*a));
-        hipMallocManaged(&b, sizeof(*b));
-        hipMallocManaged(&c, sizeof(*c));
-
-        // Set memory advice for a, b, and c to be accessed by the CPU.
-        hipMemAdvise(a, sizeof(*a), hipMemAdviseSetPreferredLocation, hipCpuDeviceId);
-        hipMemAdvise(b, sizeof(*b), hipMemAdviseSetPreferredLocation, hipCpuDeviceId);
-        hipMemAdvise(c, sizeof(*c), hipMemAdviseSetPreferredLocation, hipCpuDeviceId);
-
-        // Additionally, set memory advice for a, b, and c to be read mostly from the device 0.
-        constexpr int device = 0;
-        hipMemAdvise(a, sizeof(*a), hipMemAdviseSetReadMostly, device);
-        hipMemAdvise(b, sizeof(*b), hipMemAdviseSetReadMostly, device);
-        hipMemAdvise(c, sizeof(*c), hipMemAdviseSetReadMostly, device);
-
-        // Setup input values.
-        *a = 1;
-        *b = 2;
-
-        // Launch add() kernel on GPU.
-        hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
-
-        // Wait for GPU to finish before accessing on host.
-        hipDeviceSynchronize();
-
-        // Prints the result.
-        std::cout << *a << " + " << *b << " = " << *c << std::endl;
-
-        // Cleanup allocated memory.
-        hipFree(a);
-        hipFree(b);
-        hipFree(c);
-
-        return 0;
-    }
-
-
-Memory range attributes
-----------------------
-
-Memory Range attributes allow you to query attributes of a given memory range.
-
-The ``hipMemRangeGetAttribute()`` is added to the example to query the
-``hipMemRangeAttributeReadMostly`` attribute of the memory range pointed to by
-``a``. The result is stored in ``attributeValue`` and then printed out.
-
-For more details, visit the
-:ref:`unified_memory_reference`.
-
-.. code-block:: cpp
-    :emphasize-lines: 29-34
-
-    #include <hip/hip_runtime.h>
-    #include <iostream>
-
-    // Addition of two values.
-    __global__ void add(int *a, int *b, int *c) {
-        *c = *a + *b;
-    }
-
-    int main() {
-        int *a, *b, *c;
-        unsigned int attributeValue;
-        constexpr size_t attributeSize = sizeof(attributeValue);
-
-        // Allocate memory for a, b and c that is accessible to both device and host codes.
-        hipMallocManaged(&a, sizeof(*a));
-        hipMallocManaged(&b, sizeof(*b));
-        hipMallocManaged(&c, sizeof(*c));
-
-        // Setup input values.
-        *a = 1;
-        *b = 2;
-
-        // Launch add() kernel on GPU.
-        hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c);
-
-        // Wait for GPU to finish before accessing on host.
-        hipDeviceSynchronize();
-
-        // Query an attribute of the memory range.
-        hipMemRangeGetAttribute(&attributeValue,
-                                attributeSize,
-                                hipMemRangeAttributeReadMostly,
-                                a,
-                                sizeof(*a));
-
-        // Prints the result.
-        std::cout << *a << " + " << *b << " = " << *c << std::endl;
-        std::cout << "The queried attribute value is: " << attributeValue << std::endl;
-
-        // Cleanup allocated memory.
-        hipFree(a);
-        hipFree(b);
-        hipFree(c);
-
-        return 0;
-    }
-
-Asynchronously attach memory to a stream
----------------------------------------
-
-The ``hipStreamAttachMemAsync`` function would be able to asynchronously attach memory to a stream, which can help concurrent execution when using streams.
-
-Currently, this function is a no-operation (NOP) function on AMD GPUs. It simply returns success after the runtime memory validation passed. This function is necessary on Microsoft Windows, and UMM is not supported on this operating system with AMD GPUs at the moment.
@@ -1,94 +0,0 @@
-.. meta::
-  :description: This chapter describes introduces Virtual Memory (VM) and shows
-                how to use it in AMD HIP.
-  :keywords: AMD, ROCm, HIP, CUDA, virtual memory, virtual, memory, UM, APU
-
-.. _virtual_memory:
-
-*****************************
-Virtual memory management
-*****************************
-
-Memory management is important when creating high-performance applications in the HIP ecosystem. Both allocating and copying memory can result in bottlenecks, which can significantly impact performance.
-
-Global memory allocation in HIP uses the C language style allocation function. This works fine for simple cases but can cause problems if your memory needs change. If you need to increase the size of your memory, you must allocate a second larger buffer and copy the data to it before you can free the original buffer. This increases overall memory usage and causes unnecessary ``memcpy`` calls. Another solution is to allocate a larger buffer than you initially need. However, this isn't an efficient way to handle resources and doesn't solve the issue of reallocation when the extra buffer runs out.
-
-Virtual memory management solves these memory management problems. It helps to reduce memory usage and unnecessary ``memcpy`` calls.
-
-.. _memory_allocation_virtual_memory:
-
-Memory allocation
-=================
-
-Standard memory allocation uses the ``hipMalloc`` function to allocate a block of memory on the device. However, when using virtual memory, this process is separated into multiple steps using the ``hipMemCreate``, ``hipMemAddressReserve``, ``hipMemMap``, and ``hipMemSetAccess`` functions. This guide explains what these functions do and how you can use them for virtual memory management.
-
-Allocate physical memory
------------------------
-
-The first step is to allocate the physical memory itself with the ``hipMemCreate`` function. This function accepts the size of the buffer, an ``unsigned long long`` variable for the flags, and a ``hipMemAllocationProp`` variable. ``hipMemAllocationProp`` contains the properties of the memory to be allocated, such as where the memory is physically located and what kind of shareable handles are available. If the allocation is successful, the function returns a value of ``hipSuccess``, with ``hipMemGenericAllocationHandle_t`` representing a valid physical memory allocation. The allocated memory size must be aligned with the granularity appropriate for the properties of the allocation. You can use the ``hipMemGetAllocationGranularity`` function to determine the correct granularity.
-
-.. code-block:: cpp
-
-    size_t granularity = 0;
-    hipMemGenericAllocationHandle_t allocHandle;
-    hipMemAllocationProp prop = {};
-    prop.type = HIP_MEM_ALLOCATION_TYPE_PINNED;
-    prop.location.type = HIP_MEM_LOCATION_TYPE_DEVICE;
-    prop.location.id = currentDev;
-    hipMemGetAllocationGranularity(&granularity, &prop, HIP_MEM_ALLOC_GRANULARITY_MINIMUM);
-    padded_size = ROUND_UP(size, granularity);
-    hipMemCreate(&allocHandle, padded_size, &prop, 0);
-
-Reserve virtual address range
-----------------------------
-
-After you have acquired an allocation of physical memory, you must map it before you can use it. To do so, you need a virtual address to map it to.  Mapping means the physical memory allocation is available from the virtual address range it is mapped to. To reserve a virtual memory range, use the ``hipMemAddressReserve`` function. The size of the virtual memory must match the amount of physical memory previously allocated. You can then map the physical memory allocation to the newly-acquired virtual memory address range using the ``hipMemMap`` function.
-
-.. code-block:: cpp
-
-    hipMemAddressReserve(&ptr, padded_size, 0, 0, 0);
-    hipMemMap(ptr, padded_size, 0, allocHandle, 0);
-
-Set memory access
-----------------
-
-Finally, use the ``hipMemSetAccess`` function to enable memory access. It accepts the pointer to the virtual memory, the size, and a ``hipMemAccessDesc`` descriptor as parameters. In a multi-GPU environment, you can map the device memory of one GPU to another. This feature also works with the traditional memory management system, but isn't as scalable as with virtual memory. When memory is allocated with ``hipMalloc``, ``hipDeviceEnablePeerAccess`` is used to enable peer access. This function enables access between two devices, but it means that every call to ``hipMalloc`` takes more time to perform the checks and the mapping between the devices. When using virtual memory management, peer access is enabled by ``hipMemSetAccess``, which provides a finer level of control over what is shared. This has no performance impact on memory allocation and gives you more control over what memory buffers are shared with which devices.
-
-.. code-block:: cpp
-
-    hipMemAccessDesc accessDesc = {};
-    accessDesc.location.type = HIP_MEM_LOCATION_TYPE_DEVICE;
-    accessDesc.location.id = currentDev;
-    accessDesc.flags = HIP_MEM_ACCESS_FLAGS_PROT_READWRITE;
-    hipMemSetAccess(ptr, padded_size, &accessDesc, 1);
-
-At this point the memory is allocated, mapped, and ready for use. You can read and write to it, just like you would a C style memory allocation.
-
-Free virtual memory
-------------------
-
-To free the memory allocated in this manner, use the corresponding free functions. To unmap the memory, use ``hipMemUnmap``. To release the virtual address range, use ``hipMemAddressFree``.  Finally, to release the physical memory, use ``hipMemRelease``. A side effect of these functions is the lack of synchronization when memory is released. If you call ``hipFree`` when you have multiple streams running in parallel, it synchronizes the device. This causes worse resource usage and performance.
-
-.. code-block:: cpp
-
-    hipMemUnmap(ptr, size);
-    hipMemRelease(allocHandle);
-    hipMemAddressFree(ptr, size);
-
-.. _usage_virtual_memory:
-
-Memory usage
-============
-
-Dynamically increase allocation size
------------------------------------
-
-The ``hipMemAddressReserve`` function allows you to increase the amount of pre-allocated memory. This function accepts a parameter representing the requested starting address of the virtual memory. This allows you to have a continuous virtual address space without worrying about the underlying physical allocation.
-
-.. code-block:: cpp
-
-    hipMemAddressReserve(&new_ptr, (new_size - padded_size), 0, ptr + padded_size, 0);
-    hipMemMap(new_ptr, (new_size - padded_size), 0, newAllocHandle, 0);
-    hipMemSetAccess(new_ptr, (new_size - padded_size), &accessDesc, 1);
-
-The code sample above assumes that ``hipMemAddressReserve`` was able to reserve the memory address at the specified location. However, this isn't guaranteed to be true, so you should validate that ``new_ptr`` points to a specific virtual address before using it.
@@ -1,71 +1,54 @@
+<head>
+  <meta charset="UTF-8">
+  <meta name="description" content="HIP documentation and programming guide.">
+  <meta name="keywords" content="HIP, Heterogeneous-computing Interface for Portability, HIP programming guide">
+</head>
+
 # HIP documentation

-The Heterogeneous-computing Interface for Portability (HIP) API is a C++ runtime
-API and kernel language that lets developers create portable applications for AMD
-and NVIDIA GPUs from single source code.
+The Heterogeneous-computing Interface for Portability (HIP) is a C++ runtime API
+and kernel language that lets you create portable applications for AMD and
+NVIDIA GPUs from a single source code. For more information, see [What is HIP?](./what_is_hip)

-For HIP supported AMD GPUs on multiple operating systems, see:
-
-* [Linux system requirements](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html#supported-gpus)
-* [Microsoft Windows system requirements](https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html#windows-supported-gpus)
-
-The CUDA enabled NVIDIA GPUs are supported by HIP. For more information, see [GPU Compute Capability](https://developer.nvidia.com/cuda-gpus).
-
-On the AMD ROCm platform, HIP provides header files and runtime library built on top of HIP-Clang compiler in the repository [Compute Language Runtimes (CLR)](./understand/amd_clr), which contains source codes for AMD's compute languages runtimes as follows,
-
-On non-AMD platforms, like NVIDIA, HIP provides header files required to support non-AMD specific back-end implementation in the repository ['hipother'](https://github.com/ROCm/hipother), which translates from the HIP runtime APIs to CUDA runtime APIs.
-
-## Overview
-
-::::{grid} 1 1 2 2
-:gutter: 3
-
-:::{grid-item-card} Install
+Installation instructions are available from:

 * [Installing HIP](./install/install)
 * [Building HIP from source](./install/build)

-:::
+The HIP documentation is organized into the following categories:

-:::{grid-item-card} Conceptual
+::::{grid} 1 2 2 2
+:gutter: 3

+:::{grid-item-card} Programming guide
+
+* [Introduction](./programming_guide)
 * {doc}`./understand/programming_model`
 * {doc}`./understand/hardware_implementation`
-* {doc}`./understand/amd_clr`
-* {doc}`./understand/texture_fetching`
-
-:::
-
-:::{grid-item-card} How to
-
-* [Programming manual](./how-to/programming_manual)
-* [HIP porting guide](./how-to/hip_porting_guide)
-* [HIP porting: driver API guide](./how-to/hip_porting_driver_api)
-* {doc}`./how-to/hip_rtc`
+* {doc}`./understand/compilers`
 * {doc}`./how-to/performance_guidelines`
 * [Debugging with HIP](./how-to/debugging)
 * {doc}`./how-to/logging`
-* [Unified memory](./how-to/unified_memory)
-* [Virtual memory](./how-to/virtual_memory)
-* {doc}`./how-to/stream_ordered_allocator`
-* [Cooperative groups](./how-to/cooperative_groups)
-* [HIP graphs](./how-to/hipgraph)
-* {doc}`./how-to/faq`
+* {doc}`./how-to/hip_runtime_api`
+* [HIP porting guide](./how-to/hip_porting_guide)
+* [HIP porting: driver API guide](./how-to/hip_porting_driver_api)
+* {doc}`./how-to/hip_rtc`
+* {doc}`./understand/amd_clr`

 :::

 :::{grid-item-card} Reference

 * [HIP runtime API](./reference/hip_runtime_api_reference)
-  * [Modules](./reference/hip_runtime_api/modules)
-  * [Global defines, enums, structs and files](./reference/hip_runtime_api/global_defines_enums_structs_files)
 * [HSA runtime API for ROCm](./reference/virtual_rocr)
 * [C++ language extensions](./reference/cpp_language_extensions)
 * [C++ language support](./reference/cpp_language_support)
 * [HIP math API](./reference/math_api)
+* [HIP environment variables](./reference/env_variables)
 * [Comparing syntax for different APIs](./reference/terms)
 * [List of deprecated APIs](./reference/deprecated_api_list)
 * [FP8 numbers in HIP](./reference/fp8_numbers)
+* {doc}`./reference/hardware_features`

 :::

@@ -1,3 +1,7 @@
+.. meta::
+   :description: This page gives instructions on how to build HIP from source.
+   :keywords: AMD, ROCm, HIP, build, build instructions, source
+
 *******************************************
 Build HIP from source
 *******************************************
@@ -1,12 +1,21 @@
+.. meta::
+   :description: This page explains how to install HIP
+   :keywords: AMD, ROCm, HIP, install, installation
+
 *******************************************
 Install HIP
 *******************************************

 HIP can be installed on AMD (ROCm with HIP-Clang) and NVIDIA (CUDA with NVCC) platforms.

-Note: The version definition for the HIP runtime is different from CUDA. On an AMD platform, the
-``hipRuntimeGerVersion`` function returns the HIP runtime version; on an NVIDIA platform, this function
-returns the CUDA runtime version.
+.. note::
+
+   The version definition for the HIP runtime is different from CUDA. On AMD
+   platforms, the :cpp:func:`hipRuntimeGetVersion` function returns the HIP
+   runtime version. On NVIDIA platforms, this function returns the CUDA runtime
+   version.
+
+.. _install_prerequisites:

 Prerequisites
 =======================================
@@ -24,8 +33,9 @@ Prerequisites
   .. tab-item:: NVIDIA
      :sync: nvidia

-      Check the system requirements in the
-      `NVIDIA CUDA Installation Guide <https://docs.nvidia.com/cuda/cuda-installation-guide-linux/>`_.
+      With NVIDIA GPUs, HIP requires unified memory. All CUDA-enabled NVIDIA
+      GPUs with compute capability 5.0 or later should be supported. For more
+      information, see `NVIDIA's list of CUDA enabled GPUs <https://developer.nvidia.com/cuda-gpus>`_.

 Installation
 =======================================
@@ -41,7 +51,7 @@ Installation
         * :doc:`rocm-install-on-linux:index`
         * :doc:`rocm-install-on-windows:index`

-      By default, HIP is installed into ``/opt/rocm/hip``.
+      By default, HIP is installed into ``/opt/rocm``.

      .. note::
         There is no autodetection for the HIP installation. If you choose to install it somewhere other than the default location, you must set the ``HIP_PATH`` environment variable as explained in `Build HIP from source <./build.html>`_.
@@ -83,7 +93,7 @@ Installation

         The default paths are:
            * CUDA SDK: ``/usr/local/cuda``
-            * HIP: ``/opt/rocm/hip``
+            * HIP: ``/opt/rocm``

      #. Set the HIP_PLATFORM to nvidia.

@@ -0,0 +1,83 @@
+.. meta::
+    :description: HIP programming guide introduction
+    :keywords: HIP programming guide introduction, HIP programming guide
+
+.. _hip-programming-guide:
+
+********************************************************************************
+HIP programming guide introduction
+********************************************************************************
+
+This topic provides key HIP programming concepts and links to more detailed
+information.
+
+Write GPU Kernels for Parallel Execution
+================================================================================
+
+To make the most of the parallelism inherent to GPUs, a thorough understanding
+of the :ref:`programming model <programming_model>` is helpful. The HIP
+programming model is designed to make it easy to map data-parallel algorithms to
+the architecture of GPUs. HIP employs the SIMT-model (Single Instruction
+Multiple Threads) with a multi-layered thread hierarchy for efficient execution.
+
+Understand the Target Architecture (CPU and GPU)
+================================================================================
+
+The :ref:`hardware implementation <hardware_implementation>` topic outlines the
+GPUs supported by HIP. In general, GPUs are made up of Compute Units that excel
+at executing parallelizable, computationally intensive workloads without complex
+control-flow.
+
+Increase parallelism on multiple levels
+================================================================================
+
+To maximize performance and keep all system components fully utilized, the
+application should expose and efficiently manage as much parallelism as possible.
+:ref:`Parallel execution <parallel execution>` can be achieved at the
+application, device, and multiprocessor levels.
+
+The application’s host and device operations can achieve parallel execution
+through asynchronous calls, streams, or HIP graphs. On the device level,
+multiple kernels can execute concurrently when resources are available, and at
+the multiprocessor level, developers can overlap data transfers with
+computations to further optimize performance.
+
+Memory management
+================================================================================
+
+GPUs generally have their own distinct memory, also called :ref:`device
+memory <device_memory>`, separate from the :ref:`host memory <host_memory>`.
+Device memory needs to be managed separately from the host memory. This includes
+allocating the memory and transferring it between the host and the device. These
+operations can be performance critical, so it's important to know how to use
+them effectively. For more information, see :ref:`Memory management <memory_management>`.
+
+Synchronize CPU and GPU Workloads
+================================================================================
+
+Tasks on the host and devices run asynchronously, so proper synchronization is
+needed when dependencies between those tasks exist. The asynchronous execution
+of tasks is useful for fully utilizing the available resources. Even when only a
+single device is available, memory transfers and the execution of tasks can be
+overlapped with asynchronous execution.
+
+Error Handling
+================================================================================
+
+All functions in the HIP runtime API return an error value of type
+:cpp:enum:`hipError_t` that can be used to verify whether the function was
+successfully executed. It's important to confirm these returned values, in order
+to catch and handle those errors, if possible. An exception is kernel launches,
+which don't return any value. These errors can be caught with specific functions
+like :cpp:func:`hipGetLastError()`.
+
+For more information, see :ref:`error_handling`.
+
+Multi-GPU and Load Balancing
+================================================================================
+
+Large-scale applications that need more compute power can use multiple GPUs in
+the system. This requires distributing workloads across multiple GPUs to balance
+the load and prevent some GPUs from being overutilized while others are idle.
+
+For more information, see :ref:`multi-device`.
@@ -97,7 +97,7 @@ When using ``hipLaunchKernelGGL``, your first five parameters must be:
  * ``size_t dynamicShared``: The amount of additional shared memory that you want to allocate
    when launching the kernel (see :ref:`shared-variable-type`).
  * ``hipStream_t``: The stream where you want to run the kernel. A value of ``0`` corresponds to the
-    NULL stream (see :ref:`synchronization functions`).
+    NULL stream (see :ref:`synchronization_functions`).

 You can include your kernel arguments after these parameters.

@@ -293,6 +293,7 @@ dimensions to 1.
    dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {};
  };

+.. _memory_fence_instructions:

 Memory fence instructions
 ====================================================
@@ -306,7 +307,7 @@ HIP supports ``__threadfence()`` and ``__threadfence_block()``. If you're using
    ``hipHostMalloc()``.
  * Remove ``memcpy`` for all allocated fine-grained system memory regions.

-.. _synchronization functions:
+.. _synchronization_functions:

 Synchronization functions
 ====================================================
@@ -321,7 +322,7 @@ The Cooperative Groups API offer options to do synchronization on a developer de
 Math functions
 ====================================================

-HIP-Clang supports a set of math operations that are callable from the device. 
+HIP-Clang supports a set of math operations that are callable from the device.
 HIP supports most of the device functions supported by CUDA. These are described
 on :ref:`Math API page <math_api_reference>`.

@@ -376,6 +377,8 @@ To read a high-resolution timer from the device, HIP provides the following buil

  Note that ``clock()`` and ``clock64()`` do not work properly on AMD RDNA3 (GFX11) graphic processors.

+.. _atomic functions:
+
 Atomic functions
 ===============================================

@@ -734,6 +737,8 @@ will be enabled unconditionally in the next ROCm release. Wherever possible, the
 implementation includes a static assert to check that the program source uses
 the correct type for the mask.

+.. _warp_vote_functions:
+
 Warp vote and ballot functions
 -------------------------------------------------------------------------------------------------------------

@@ -6,87 +6,171 @@
 HIP deprecated runtime API functions
 **********************************************************************************************

-Several of our API functions have been flagged for deprecation. Using the following functions results in
-errors and unexpected results, so we encourage you to update your code accordingly.
+Several of our API functions have been flagged for deprecation. Using the
+following functions results in errors and unexpected results, so we encourage
+you to update your code accordingly.

-Context management
+Deprecated since ROCm 6.1.0
 ============================================================

-CUDA supports cuCtx API, which is the driver API that defines "Context" and "Devices" as separate
-entities. Context contains a single device, and a device can theoretically have multiple contexts. HIP
-initially added limited support for these APIs in order to facilitate porting from existing driver codes.
-These APIs are now marked as deprecated because there are better alternate interfaces (such as
-``hipSetDevice`` or the stream API) to achieve these functions.
+Deprecated texture management functions.

-* ``hipCtxCreate``
-* ``hipCtxDestroy``
-* ``hipCtxPopCurrent``
-* ``hipCtxPushCurrent``
-* ``hipCtxSetCurrent``
-* ``hipCtxGetCurrent``
-* ``hipCtxGetDevice``
-* ``hipCtxGetApiVersion``
-* ``hipCtxGetCacheConfig``
-* ``hipCtxSetCacheConfig``
-* ``hipCtxSetSharedMemConfig``
-* ``hipCtxGetSharedMemConfig``
-* ``hipCtxSynchronize``
-* ``hipCtxGetFlags``
-* ``hipCtxEnablePeerAccess``
-* ``hipCtxDisablePeerAccess``
-* ``hipDevicePrimaryCtxGetState``
-* ``hipDevicePrimaryCtxRelease``
-* ``hipDevicePrimaryCtxRetain``
-* ``hipDevicePrimaryCtxReset``
-* ``hipDevicePrimaryCtxSetFlags``
+.. list-table::
+   :widths: 40
+   :header-rows: 1
+   :align: left

-Memory management
+   * - function
+   * - :cpp:func:`hipTexRefGetBorderColor`
+   * - :cpp:func:`hipTexRefGetArray`
+
+Deprecated since ROCm 5.7.0
 ============================================================

-* ``hipMallocHost`` (replaced with ``hipHostMalloc``)
-* ``hipMemAllocHost`` (replaced with ``hipHostMalloc``)
-* ``hipMemcpyToArray``
-* ``hipMemcpyFromArray``
+Deprecated texture management functions.

-Profiler control
+.. list-table::
+   :widths: 40
+   :header-rows: 1
+   :align: left
+
+   * - function
+   * - :cpp:func:`hipBindTextureToMipmappedArray`
+
+Deprecated since ROCm 5.3.0
 ============================================================

-* ``hipProfilerStart`` (use roctracer/rocTX)
-* ``hipProfilerStop`` (use roctracer/rocTX)
+Deprecated texture management functions.

+.. list-table::
+   :widths: 40
+   :header-rows: 1
+   :align: left

-Texture management
+   * - function
+   * - :cpp:func:`hipGetTextureReference`
+   * - :cpp:func:`hipTexRefSetAddressMode`
+   * - :cpp:func:`hipTexRefSetArray`
+   * - :cpp:func:`hipTexRefSetFlags`
+   * - :cpp:func:`hipTexRefSetFilterMode`
+   * - :cpp:func:`hipTexRefSetFormat`
+   * - :cpp:func:`hipTexRefSetMipmapFilterMode`
+   * - :cpp:func:`hipTexRefSetMipmapLevelBias`
+   * - :cpp:func:`hipTexRefSetMipmapLevelClamp`
+   * - :cpp:func:`hipTexRefSetMipmappedArray`
+
+Deprecated since ROCm 4.3.0
 ============================================================

-* ``hipGetTextureReference``
-* ``hipTexRefSetAddressMode``
-* ``hipTexRefSetArray``
-* ``hipTexRefSetFilterMode``
-* ``hipTexRefSetFlags``
-* ``hipTexRefSetFormat``
-* ``hipTexRefGetAddress``
-* ``hipTexRefGetAddressMode``
-* ``hipTexRefGetFilterMode``
-* ``hipTexRefGetFlags``
-* ``hipTexRefGetFormat``
-* ``hipTexRefGetMaxAnisotropy``
-* ``hipTexRefGetMipmapFilterMode``
-* ``hipTexRefGetMipmapLevelBias``
-* ``hipTexRefGetMipmapLevelClamp``
-* ``hipTexRefGetMipMappedArray``
-* ``hipTexRefSetAddress``
-* ``hipTexRefSetAddress2D``
-* ``hipTexRefSetMaxAnisotropy``
-* ``hipTexRefSetBorderColor``
-* ``hipTexRefSetMipmapFilterMode``
-* ``hipTexRefSetMipmapLevelBias``
-* ``hipTexRefSetMipmapLevelClamp``
-* ``hipTexRefSetMipmappedArray``
-* ``hipTexRefGetBorderColor``
-* ``hipTexRefGetArray``
-* ``hipBindTexture``
-* ``hipBindTexture2D``
-* ``hipBindTextureToArray``
-* ``hipGetTextureAlignmentOffset``
-* ``hipUnbindTexture``
-* ``hipBindTextureToMipmappedArray``
+Deprecated texture management functions.
+
+.. list-table::
+   :widths: 40
+   :header-rows: 1
+   :align: left
+
+   * - function
+   * - :cpp:func:`hipTexRefGetAddress`
+   * - :cpp:func:`hipTexRefGetAddressMode`
+   * - :cpp:func:`hipTexRefGetFilterMode`
+   * - :cpp:func:`hipTexRefGetFlags`
+   * - :cpp:func:`hipTexRefGetFormat`
+   * - :cpp:func:`hipTexRefGetMaxAnisotropy`
+   * - :cpp:func:`hipTexRefGetMipmapFilterMode`
+   * - :cpp:func:`hipTexRefGetMipmapLevelBias`
+   * - :cpp:func:`hipTexRefGetMipmapLevelClamp`
+   * - :cpp:func:`hipTexRefGetMipMappedArray`
+   * - :cpp:func:`hipTexRefSetAddress`
+   * - :cpp:func:`hipTexRefSetAddress2D`
+   * - :cpp:func:`hipTexRefSetBorderColor`
+   * - :cpp:func:`hipTexRefSetMaxAnisotropy`
+
+Deprecated since ROCm 3.8.0
+============================================================
+
+Deprecated memory management and texture management functions.
+
+.. list-table::
+   :widths: 40
+   :header-rows: 1
+   :align: left
+
+   * - function
+   * - :cpp:func:`hipBindTexture`
+   * - :cpp:func:`hipBindTexture2D`
+   * - :cpp:func:`hipBindTextureToArray`
+   * - :cpp:func:`hipGetTextureAlignmentOffset`
+   * - :cpp:func:`hipUnbindTexture`
+   * - :cpp:func:`hipMemcpyToArray`
+   * - :cpp:func:`hipMemcpyFromArray`
+
+Deprecated since ROCm 3.1.0
+============================================================
+
+Deprecated memory management functions.
+
+.. list-table::
+   :widths: 40, 60
+   :header-rows: 1
+   :align: left
+
+   * - function
+     -
+   * - :cpp:func:`hipMallocHost`
+     - replaced with :cpp:func:`hipHostAlloc`
+   * - :cpp:func:`hipMemAllocHost`
+     - replaced with :cpp:func:`hipHostAlloc`
+
+Deprecated since ROCm 3.0.0
+============================================================
+
+The ``hipProfilerStart`` and ``hipProfilerStop`` functions are deprecated.
+Instead, you can use ``roctracer`` or ``rocTX`` for profiling, which provide more
+flexibility and detailed profiling capabilities.
+
+.. list-table::
+   :widths: 40
+   :header-rows: 1
+   :align: left
+
+   * - function
+   * - :cpp:func:`hipProfilerStart`
+   * - :cpp:func:`hipProfilerStop`
+
+Deprecated since ROCm 1.9.0
+============================================================
+
+CUDA supports cuCtx API, which is the driver API that defines "Context" and
+"Devices" as separate entities. Context contains a single device, and a device
+can theoretically have multiple contexts. HIP initially added limited support
+for context APIs in order to facilitate porting from existing driver codes. These
+APIs are now marked as deprecated because there are better alternate interfaces
+(such as ``hipSetDevice`` or the stream API) to achieve these functions.
+
+.. list-table::
+   :widths: 40
+   :header-rows: 1
+   :align: left
+
+   * - function
+   * -  :cpp:func:`hipCtxCreate`
+   * -  :cpp:func:`hipCtxDestroy`
+   * -  :cpp:func:`hipCtxPopCurrent`
+   * -  :cpp:func:`hipCtxPushCurrent`
+   * -  :cpp:func:`hipCtxSetCurrent`
+   * -  :cpp:func:`hipCtxGetCurrent`
+   * -  :cpp:func:`hipCtxGetDevice`
+   * -  :cpp:func:`hipCtxGetApiVersion`
+   * -  :cpp:func:`hipCtxGetCacheConfig`
+   * -  :cpp:func:`hipCtxSetCacheConfig`
+   * -  :cpp:func:`hipCtxSetSharedMemConfig`
+   * -  :cpp:func:`hipCtxGetSharedMemConfig`
+   * -  :cpp:func:`hipCtxSynchronize`
+   * -  :cpp:func:`hipCtxGetFlags`
+   * -  :cpp:func:`hipCtxEnablePeerAccess`
+   * -  :cpp:func:`hipCtxDisablePeerAccess`
+   * -  :cpp:func:`hipDevicePrimaryCtxGetState`
+   * -  :cpp:func:`hipDevicePrimaryCtxRelease`
+   * -  :cpp:func:`hipDevicePrimaryCtxRetain`
+   * -  :cpp:func:`hipDevicePrimaryCtxReset`
+   * -  :cpp:func:`hipDevicePrimaryCtxSetFlags`
@@ -0,0 +1,189 @@
+.. meta::
+    :description: HIP environment variables reference
+    :keywords: AMD, HIP, environment variables, environment, reference
+
+********************************************************************************
+HIP environment variables
+********************************************************************************
+
+In this section, you can find all the important HIP environment variables
+on the AMD platform, grouped by functionality.
+
+GPU isolation variables
+================================================================================
+
+The GPU isolation environment variables in HIP are collected in the next table.
+For more information, check :doc:`GPU isolation page <rocm:conceptual/gpu-isolation>`.
+
+.. list-table::
+    :header-rows: 1
+    :widths: 70,30
+
+    * - **Environment variable**
+      - **Value**
+
+    * - | ``ROCR_VISIBLE_DEVICES``
+        | A list of device indices or UUIDs that will be exposed to applications.
+      - Example: ``0,GPU-DEADBEEFDEADBEEF``
+
+    * - | ``GPU_DEVICE_ORDINAL``
+        | Device indices exposed to OpenCL and HIP applications.
+      - Example: ``0,2``
+
+    * - | ``HIP_VISIBLE_DEVICES`` or ``CUDA_VISIBLE_DEVICES``
+        | Device indices exposed to HIP applications.
+      - Example: ``0,2``
+
+Profiling variables
+================================================================================
+
+The profiling environment variables in HIP are collected in the next table. For
+more information, check :doc:`setting the number of CUs page <rocm:how-to/setting-cus>`.
+
+.. list-table::
+    :header-rows: 1
+    :widths: 70,30
+
+    * - **Environment variable**
+      - **Value**
+
+    * - | ``HSA_CU_MASK``
+        | Sets the mask on a lower level of queue creation in the driver,
+        | this mask will also be set for queues being profiled.
+      - Example: ``1:0-8``
+
+    * - | ``ROC_GLOBAL_CU_MASK``
+        | Sets the mask on queues created by the HIP or the OpenCL runtimes,
+        | this mask will also be set for queues being profiled.
+      - Example: ``0xf``, enables only 4 CUs
+
+    * - | ``HIP_FORCE_QUEUE_PROFILING``
+        | Used to run the app as if it were run in rocprof. Forces command queue
+        | profiling on by default.
+      - | 0: Disable
+        | 1: Enable
+
+Debug variables
+================================================================================
+
+The debugging environment variables in HIP are collected in the next table. For
+more information, check :ref:`debugging_with_hip`.
+
+.. include:: ../how-to/debugging_env.rst
+
+Memory management related variables
+================================================================================
+
+The memory management related environment variables in HIP are collected in the
+next table.
+
+.. list-table::
+    :header-rows: 1
+    :widths: 35,14,51
+
+    * - **Environment variable**
+      - **Default value**
+      - **Value**
+
+    * - | ``HIP_HIDDEN_FREE_MEM``
+        | Amount of memory to hide from the free memory reported by hipMemGetInfo.
+      - ``0``
+      - | 0: Disable
+        | Unit: megabyte (MB)
+
+    * - | ``HIP_HOST_COHERENT``
+        | Specifies if the memory is coherent between the host and GPU in ``hipHostMalloc``.
+      - ``0``
+      - | 0: Memory is not coherent.
+        | 1: Memory is coherent.
+        | The environment variable has an effect if the following conditions are satisfied:
+        | - One of the ``hipHostMallocDefault``, ``hipHostMallocPortable``, ``hipHostMallocWriteCombined`` or ``hipHostMallocNumaUser`` flags is set to 1.
+        | - ``hipHostMallocCoherent``, ``hipHostMallocNonCoherent`` and ``hipHostMallocMapped`` flags set to 0.
+
+    * - | ``HIP_INITIAL_DM_SIZE``
+        | Set initial heap size for device malloc.
+      - ``8388608``
+      - | Unit: Byte
+        | The default value corresponds to 8 MB.
+
+    * - | ``HIP_MEM_POOL_SUPPORT``
+        | Enables memory pool support in HIP.
+      - ``0``
+      - | 0: Disable
+        | 1: Enable
+
+    * - | ``HIP_MEM_POOL_USE_VM``
+        | Enables memory pool support in HIP.
+      - | ``0``: other OS
+        | ``1``: Windows
+      - | 0: Disable
+        | 1: Enable
+
+    * - | ``HIP_VMEM_MANAGE_SUPPORT``
+        | Virtual Memory Management Support.
+      - ``1``
+      - | 0: Disable
+        | 1: Enable
+
+    * - | ``GPU_MAX_HEAP_SIZE``
+        | Set maximum size of the GPU heap to % of board memory.
+      - ``100``
+      - | Unit: Percentage
+
+    * - | ``GPU_MAX_REMOTE_MEM_SIZE``
+        | Maximum size that allows device memory substitution with system.
+      - ``2``
+      - | Unit: kilobyte (KB)
+
+    * - | ``GPU_NUM_MEM_DEPENDENCY``
+        | Number of memory objects for dependency tracking.
+      - ``256``
+      -
+
+    * - | ``GPU_STREAMOPS_CP_WAIT``
+        | Force the stream memory operation to wait on CP.
+      - ``0``
+      - | 0: Disable
+        | 1: Enable
+
+    * - | ``HSA_LOCAL_MEMORY_ENABLE``
+        | Enable HSA device local memory usage.
+      - ``1``
+      - | 0: Disable
+        | 1: Enable
+
+    * - | ``PAL_ALWAYS_RESIDENT``
+        | Force memory resources to become resident at allocation time.
+      - ``0``
+      - | 0: Disable
+        | 1: Enable
+
+    * - | ``PAL_PREPINNED_MEMORY_SIZE``
+        | Size of prepinned memory.
+      - ``64``
+      - | Unit: kilobyte (KB)
+
+    * - | ``REMOTE_ALLOC``
+        | Use remote memory for the global heap allocation.
+      - ``0``
+      - | 0: Disable
+        | 1: Enable
+
+Other useful variables
+================================================================================
+
+The following table lists environment variables that are useful but relate to
+different features.
+
+.. list-table::
+    :header-rows: 1
+    :widths: 35,14,51
+
+    * - **Environment variable**
+      - **Default value**
+      - **Value**
+
+    * - | ``HIPRTC_COMPILE_OPTIONS_APPEND``
+        | Sets compile options needed for ``hiprtc`` compilation.
+      - None
+      - ``--gpu-architecture=gfx906:sramecc+:xnack``, ``-fgpu-rdc``
@@ -0,0 +1,249 @@
+.. meta::
+  :description: This chapter describes the hardware features of the different hardware architectures.
+  :keywords: AMD, ROCm, HIP, hardware, hardware features, hardware architectures
+
+*******************************************************************************
+Hardware features
+*******************************************************************************
+
+This page gives an overview of the different hardware architectures and the
+features they implement. Hardware features do not imply performance; performance
+depends on the specifications found on the :doc:`rocm:reference/gpu-arch-specs`
+page.
+
+  .. list-table::
+      :header-rows: 1
+      :name: hardware-features-table
+
+      *
+        - Hardware feature support
+        - RDNA1
+        - CDNA1
+        - RDNA2
+        - CDNA2
+        - RDNA3
+        - CDNA3
+      *
+        - :ref:`atomic functions` on 32-bit integer values in global and shared memory
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - Atomic functions on 64-bit integer values in global and shared memory
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - Atomic addition on 32-bit floating point values in global and shared memory
+        - ❌
+        - ❌
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - Atomic addition on 64-bit floating point values in global memory and shared memory
+        - ❌
+        - ❌
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - :ref:`Warp vote functions <warp_vote_functions>`
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - :ref:`Memory fence instructions <memory_fence_instructions>`
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - :ref:`Synchronization functions <synchronization_functions>`
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - :ref:`Surface functions <surface_object_reference>`
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - :ref:`float16 half precision IEEE-conformant floating-point operations<rocm:precision_support_floating_point_types>`
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - :ref:`bfloat16 16-bit floating-point operations<rocm:precision_support_floating_point_types>`
+        - ❌
+        - ✅
+        - ❌
+        - ✅
+        - ✅
+        - ✅
+      *
+        - Support for :ref:`8-bit floating-point types <rocm:precision_support_floating_point_types>`
+        - ❌
+        - ❌
+        - ❌
+        - ❌
+        - ❌
+        - ✅
+      *
+        - Support for :ref:`tensor float32 <rocm:precision_support_floating_point_types>`
+        - ❌
+        - ❌
+        - ❌
+        - ❌
+        - ❌
+        - ✅
+      *
+        - Packed math with 16-bit floating point values
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - Packed math with 32-bit floating point values
+        - ❌
+        - ❌
+        - ❌
+        - ✅
+        - ❌
+        - ✅
+      *
+        - Matrix Cores
+        - ❌
+        - ✅
+        - ❌
+        - ✅
+        - ❌
+        - ✅
+      *
+        - On-Chip Error Correcting Code (ECC)
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - Maximum dimensionality of grid
+        - 3
+        - 3
+        - 3
+        - 3
+        - 3
+        - 3
+      *
+        - Maximum x-, y- or z-dimension of a grid
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+      *
+        - Maximum number of threads per grid
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+      *
+        - Maximum x-, y- or z-dimension of a block
+        - :math:`1024`
+        - :math:`1024`
+        - :math:`1024`
+        - :math:`1024`
+        - :math:`1024`
+        - :math:`1024`
+      *
+        - Maximum number of threads per block
+        - :math:`1024`
+        - :math:`1024`
+        - :math:`1024`
+        - :math:`1024`
+        - :math:`1024`
+        - :math:`1024`
+      *
+        - Wavefront size
+        - 32 [1]_
+        - 64
+        - 32 [1]_
+        - 64
+        - 32 [1]_
+        - 64
+      *
+        - Maximum number of resident blocks per compute unit
+        - 40 [1]_
+        - 32
+        - 32 [1]_
+        - 32
+        - 32 [1]_
+        - 32
+      *
+        - Maximum number of resident wavefronts per compute unit
+        - 40 [1]_
+        - 32
+        - 32 [1]_
+        - 32
+        - 32 [1]_
+        - 32
+      *
+        - Maximum number of resident threads per compute unit
+        - 1280 [2]_
+        - 2048
+        - 1024 [2]_
+        - 2048
+        - 1024 [2]_
+        - 2048
+      *
+        - Maximum number of 32-bit vector registers per thread
+        - 256
+        - 256 (vector) + 256 (matrix)
+        - 256
+        - 256 (vector) + 256 (matrix)
+        - 256
+        - 256 (vector) + 256 (matrix)
+      *
+        - Maximum number of 32-bit scalar accumulation registers per thread
+        - 106
+        - 104
+        - 106
+        - 104
+        - 106
+        - 104
+
+.. [1] RDNA architectures have a configurable wavefront size. The native
+   wavefront size is 32, but they can run in "CU mode", which has an effective
+   wavefront size of 64. This affects the number of resident wavefronts and
+   blocks per compute unit.
+.. [2] RDNA architectures expand the concept of the traditional compute unit
+   with the so-called work group processor, which effectively includes two
+   compute units, within which all threads can cooperate.
@@ -11,5 +11,5 @@ The structs, define macros, enums and files in the HIP runtime API.

 * :ref:`global_enum_defines_reference`
 * :ref:`driver_types_reference`
-* :doc:`hip:doxygen/html/annotated`
-* :doc:`hip:doxygen/html/files`
+* :doc:`../../doxygen/html/annotated`
+* :doc:`../../doxygen/html/files`
@@ -9,4 +9,4 @@ OpenGL interoperability
 *******************************************************************************

 .. doxygengroup:: GL
-   :content-only:
+   :content-only:
@@ -1,5 +1,5 @@
 .. meta::
-  :description: This chapter describes the built-in math functions that are accessible in HIP. 
+  :description: This chapter describes the built-in math functions that are accessible in HIP.
  :keywords: AMD, ROCm, HIP, CUDA, math functions, HIP math functions

 .. _math_api_reference:
@@ -1,6 +1,6 @@
 .. meta::
-  :description: This chapter lists user-mode API interfaces and libraries 
-                necessary for host applications to launch compute kernels to 
+  :description: This chapter lists user-mode API interfaces and libraries
+                necessary for host applications to launch compute kernels to
                available HSA ROCm kernel agents.
  :keywords: AMD, ROCm, HIP, HSA, ROCR runtime, virtual memory management

@@ -5,6 +5,9 @@ defaults:
  maxdepth: 6
 root: index
 subtrees:
+- entries:
+  - file: what_is_hip
+  - file: faq

 - caption: Install
  entries:
@@ -12,33 +15,50 @@ subtrees:
    title: Installing HIP
  - file: install/build
    title: Building HIP from source
+  - url: https://rocm.docs.amd.com/projects/install-on-linux/en/${branch}/reference/system-requirements.html
+    title: Linux supported AMD GPUs
+  - url: https://rocm.docs.amd.com/projects/install-on-windows/en/${branch}/reference/system-requirements.html
+    title: Windows supported AMD GPUs
+  - url: https://developer.nvidia.com/cuda-gpus
+    title: NVIDIA supported GPUs

- caption: Conceptual
+- caption: Programming guide
  entries:
+  - file: programming_guide
+    title: Introduction
  - file: understand/programming_model
  - file: understand/hardware_implementation
-  - file: understand/amd_clr
-  - file: understand/texture_fetching
-    title: Texture fetching
-
- caption: How to
-  entries:
-  - file: how-to/programming_manual
-  - file: how-to/hip_porting_guide
-  - file: how-to/hip_porting_driver_api
-  - file: how-to/hip_rtc
+  - file: understand/compilers
  - file: how-to/performance_guidelines
  - file: how-to/debugging
  - file: how-to/logging
-  - file: how-to/cooperative_groups
-  - file: how-to/unified_memory
-    title: Unified memory
-  - file: how-to/virtual_memory
-    title: Virtual memory
-  - file: how-to/stream_ordered_allocator
-  - file: how-to/hipgraph
-    title: HIP graphs
-  - file: how-to/faq
+  - file: how-to/hip_runtime_api
+    subtrees:
+    - entries:
+      - file: how-to/hip_runtime_api/initialization
+      - file: how-to/hip_runtime_api/memory_management
+        subtrees:
+        - entries:
+          - file: how-to/hip_runtime_api/memory_management/host_memory
+          - file: how-to/hip_runtime_api/memory_management/device_memory
+            subtrees:
+            - entries:
+              - file: how-to/hip_runtime_api/memory_management/device_memory/texture_fetching
+          - file: how-to/hip_runtime_api/memory_management/coherence_control
+          - file: how-to/hip_runtime_api/memory_management/unified_memory
+          - file: how-to/hip_runtime_api/memory_management/virtual_memory
+          - file: how-to/hip_runtime_api/memory_management/stream_ordered_allocator
+      - file: how-to/hip_runtime_api/error_handling
+      - file: how-to/hip_runtime_api/cooperative_groups
+      - file: how-to/hip_runtime_api/hipgraph
+      - file: how-to/hip_runtime_api/call_stack
+      - file: how-to/hip_runtime_api/multi_device
+      - file: how-to/hip_runtime_api/opengl_interop
+      - file: how-to/hip_runtime_api/external_interop
+  - file: how-to/hip_porting_guide
+  - file: how-to/hip_porting_driver_api
+  - file: how-to/hip_rtc
+  - file: understand/amd_clr

 - caption: Reference
  entries:
@@ -75,6 +95,7 @@ subtrees:
          - file: reference/hip_runtime_api/modules/runtime_compilation
          - file: reference/hip_runtime_api/modules/callback_activity_apis
          - file: reference/hip_runtime_api/modules/graph_management
+          - file: reference/hip_runtime_api/modules/graphics_interoperability
          - file: reference/hip_runtime_api/modules/opengl_interoperability
          - file: reference/hip_runtime_api/modules/cooperative_groups_reference
      - file: reference/hip_runtime_api/global_defines_enums_structs_files
@@ -90,12 +111,14 @@ subtrees:
  - file: reference/cpp_language_support
    title: C++ language support
  - file: reference/math_api
+  - file: reference/env_variables
  - file: reference/terms
    title: Comparing syntax for different APIs
  - file: reference/deprecated_api_list
    title: List of deprecated APIs
  - file: reference/fp8_numbers
    title: FP8 numbers in HIP
+  - file: reference/hardware_features

 - caption: Tutorials
  entries:
@@ -1,2 +1,2 @@
-rocm-docs-core[api_reference]==1.7.2
+rocm-docs-core[api_reference]==1.10.0
 sphinxcontrib.doxylink
@@ -116,7 +116,7 @@ requests==2.32.3
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core[api-reference]==1.7.2
+rocm-docs-core[api-reference]==1.10.0
    # via -r requirements.in
 six==1.16.0
    # via python-dateutil
@@ -0,0 +1,628 @@
+// MIT License
+//
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "nvidia_hip_fix.hpp"
+
+#include "example_utils.hpp"
+
+#include "glad/glad.h"
+
+#include <GLFW/glfw3.h>
+#include <hip/hip_gl_interop.h>
+#include <hip/hip_runtime.h>
+
+#include <chrono>
+#include <cstring>
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+/// \brief The number of grid points (vertices) along the width of the example's grid.
+constexpr uint32_t grid_width = 256;
+/// \brief The number of grid points (vertices) along the height of the example's grid.
+constexpr uint32_t grid_height = 256;
+
+/// \brief The OpenGL vertex shader that is used to render the triangles in this example.
+/// The grid x- and y-positions (\p in_xy) are used directly as the vertex coordinates in
+/// clip space. The height value (\p in_height) is passed on unchanged to the fragment shader.
+/// The attribute names must stay in sync with the lookups done by the \p renderer constructor.
+constexpr const char* vertex_shader = R"(
+#version 330 core
+
+in float in_height;
+in vec2 in_xy;
+
+out float frag_height;
+
+void main()
+{
+    gl_Position = vec4(in_xy, 0, 1);
+    frag_height = in_height;
+}
+)";
+
+/// \brief The OpenGL fragment shader that is used to render the triangles in this example.
+/// The "height" value received from the vertex shader is used to shade each fragment:
+/// <tt>frag_height * 0.5 + 0.5</tt> is written to the red, green and blue channels, with
+/// an alpha of 1.
+/// NOTE(review): \p gl_FragColor is a deprecated built-in in GLSL 3.30; consider a
+/// user-declared <tt>out vec4</tt> instead - confirm against the targeted drivers.
+constexpr const char* fragment_shader = R"(
+#version 330 core
+
+in float frag_height;
+
+void main()
+{
+    gl_FragColor = vec4(vec3(frag_height * 0.5 + 0.5), 1.0);
+}
+)";
+
+/// \brief Create the GLFW window that the example renders into.
+/// An OpenGL 3.3 core profile context with debugging enabled is requested for it. If
+/// the window cannot be created, an error is printed and the program is exited.
+/// \param initial_width - The width that the window is created with.
+/// \param initial_height - The height that the window is created with.
+/// \returns The handle of the newly created window.
+GLFWwindow* create_window(const int initial_width, const int initial_height)
+{
+    /// [Sphinx-create-window]
+    // Ask GLFW for a debug-enabled OpenGL 3.3 core profile context.
+    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
+    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
+    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
+    glfwWindowHint(GLFW_OPENGL_DEBUG_CONTEXT, GLFW_TRUE);
+
+    const char* const title = "OpenGL-HIP interop example";
+    GLFWwindow* const handle
+        = glfwCreateWindow(initial_width, initial_height, title, nullptr, nullptr);
+    if(!handle)
+    {
+        std::cerr << "Failed to create GLFW window\n";
+        std::exit(error_exit_code);
+    }
+    /// [Sphinx-create-window]
+    return handle;
+}
+
+/// \brief Find a HIP device that can interoperate with the current OpenGL context.
+/// \returns The id of the first HIP device reported by \p hipGLGetDevices. If no
+///   device is reported, an error is printed and the program is exited.
+int pick_hip_device()
+{
+    /// [Sphinx-pick device]
+    int          device_id;
+    unsigned int num_devices;
+    // Only a single device is requested, so a single int is enough to hold the result.
+    HIP_CHECK(
+        hipGLGetDevices(&num_devices, &device_id, 1, hipGLDeviceList::hipGLDeviceListAll));
+
+    if(num_devices < 1)
+    {
+        std::cerr << "System has no OpenGL-capable HIP devices" << std::endl;
+        std::exit(error_exit_code);
+    }
+    /// [Sphinx-pick device]
+
+    return device_id;
+}
+
+/// \brief Utility function to compile shader source into an OpenGL shader.
+/// If the shader could not be compiled, this function prints the compile log
+/// and exits the program.
+/// \param type - The OpenGL shader type for this shader, for example
+///   \p GL_VERTEX_SHADER or \p GL_FRAGMENT_SHADER.
+/// \param source - The GLSL source code for the shader, as a null-terminated string.
+/// \returns The handle of the compiled shader. The caller is responsible for deleting it.
+GLuint compile_shader(const GLenum type, const char* const source)
+{
+    const GLuint shader = glCreateShader(type);
+
+    const GLint length = static_cast<GLint>(std::strlen(source));
+    glShaderSource(shader, 1, &source, &length);
+    glCompileShader(shader);
+
+    GLint compile_status;
+    glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_status);
+
+    if(compile_status != GL_TRUE)
+    {
+        // Compiling failed, get the shader log and print it to the user.
+        GLint log_length = 0;
+        glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &log_length);
+        std::vector<GLchar> log(log_length);
+        // The size passed here must be the capacity of the log buffer. Passing the length
+        // of the shader source instead would allow writes past the end of the buffer.
+        glGetShaderInfoLog(shader, log_length, nullptr, log.data());
+        std::cerr << "Failed to compile shader:\n";
+        std::cerr.write(log.data(), log.size()) << std::endl;
+        std::exit(error_exit_code);
+    }
+
+    return shader;
+}
+
+/// \brief Build an OpenGL shader program out of a vertex and a fragment shader.
+/// Both shaders are compiled with \p compile_shader and then linked together. If linking
+/// fails, the link log is printed and the program is exited. The intermediate shader
+/// objects are detached and deleted once the program has been linked.
+/// \param vert_src - The GLSL source code for the shader program's vertex shader.
+/// \param frag_src - The GLSL source code for the shader program's fragment shader.
+/// \returns The handle of the linked shader program.
+GLuint compile_shader_program(const char* const vert_src, const char* const frag_src)
+{
+    const GLuint linked_program = glCreateProgram();
+
+    const GLuint vertex_stage   = compile_shader(GL_VERTEX_SHADER, vert_src);
+    const GLuint fragment_stage = compile_shader(GL_FRAGMENT_SHADER, frag_src);
+
+    // The stages are always processed fragment shader first, then vertex shader.
+    const GLuint stages[] = {fragment_stage, vertex_stage};
+
+    for(const GLuint stage : stages)
+    {
+        glAttachShader(linked_program, stage);
+    }
+
+    glLinkProgram(linked_program);
+
+    GLint status;
+    glGetProgramiv(linked_program, GL_LINK_STATUS, &status);
+    if(status != GL_TRUE)
+    {
+        // Linking failed: fetch the link log and show it to the user before bailing out.
+        GLint info_length;
+        glGetProgramiv(linked_program, GL_INFO_LOG_LENGTH, &info_length);
+        std::vector<GLchar> info(info_length);
+        glGetProgramInfoLog(linked_program, info_length, nullptr, info.data());
+        std::cerr << "Failed to link program:\n";
+        std::cerr.write(info.data(), info.size()) << std::endl;
+        std::exit(error_exit_code);
+    }
+
+    // The shader objects are no longer needed once the program is linked.
+    for(const GLuint stage : stages)
+    {
+        glDetachShader(linked_program, stage);
+    }
+    for(const GLuint stage : stages)
+    {
+        glDeleteShader(stage);
+    }
+
+    return linked_program;
+}
+
+/// \brief This structure contains the OpenGL handles that this example uses to render the
+/// triangle grid to the screen.
+///
+/// Three buffers are used to render the triangle grid, the color of which is determined by
+/// a HIP computation in \p simulator:
+/// - One buffer contains the height of each grid point (rendered as color).
+/// - One buffer holds the x- and y-coordinates of each grid point. Every grid point is stored
+///   only once, even though it can be a corner of several triangles.
+/// - One index buffer holds indices into the former two buffers; every three consecutive
+///   indices make up one triangle.
+struct renderer
+{
+    /// The total number of vertices for the triangles.
+    constexpr static size_t num_verts = grid_width * grid_height;
+    /// The number of bytes in the x- and y-coordinates buffer. Each x/y coordinate is encoded as
+    /// a pair of floats, which are stored in a packed array-of-structures format: | x | y | x | y | ... |.
+    constexpr static size_t grid_buffer_size = num_verts * sizeof(float) * 2;
+    /// The number of bytes in the height buffer. Each height is encoded as a floating point value.
+    /// This buffer will be shared with HIP, which is why the heights are
+    /// stored in a separate buffer.
+    constexpr static size_t height_buffer_size = num_verts * sizeof(float);
+
+    /// The number of indices in the index buffer. Each triangle has 3 points, each square in the grid
+    /// is made up of 2 triangles. There are (width - 1) by (height - 1) squares in the grid.
+    constexpr static size_t num_indices = (grid_width - 1) * (grid_height - 1) * 3 * 2;
+    /// The number of bytes in the index buffer. Each index is encoded as a 32-bit int.
+    constexpr static size_t index_buffer_size = num_indices * sizeof(uint32_t);
+
+    /// An OpenGL handle to a Vertex Array Object, which has the grid and height buffers
+    /// bound to the corresponding attribute in the shader program (<tt>program</tt>) used for rendering.
+    GLuint vao;
+
+    /// Handle to the buffer that holds the indices for the triangles to render.
+    GLuint index_buffer;
+
+    /// Handle to the buffer that holds the x- and y-coordinates for each grid point.
+    GLuint grid_buffer;
+
+    /// Handle to the buffer that holds the height of each grid point. This buffer is shared with HIP.
+    GLuint height_buffer;
+
+    /// Handle to the OpenGL shader program that this example uses to render the triangles to the screen.
+    GLuint program;
+
+    /// Counters used to keep track of the rendering performance: the number of frames drawn
+    /// since \p fps_start_time, and the moment at which that count was last reset.
+    uint32_t                                       fps_frame = 0;
+    std::chrono::high_resolution_clock::time_point fps_start_time;
+
+    /// \brief Initialize OpenGL rendering resources: the vertex array, the three buffers
+    /// and their initial contents, and the shader program.
+    renderer()
+    {
+        // Create a vertex array used to bind the attribute buffers.
+        glGenVertexArrays(1, &this->vao);
+
+        // Also generate the buffers in question.
+        GLuint buffers[3];
+        glGenBuffers(std::size(buffers), buffers);
+        this->index_buffer  = buffers[0];
+        this->grid_buffer   = buffers[1];
+        this->height_buffer = buffers[2];
+
+        // Compile the shader program used to render the triangles.
+        this->program = compile_shader_program(vertex_shader, fragment_shader);
+
+        // Upload the initial data to the buffers.
+        this->initialize_buffer_data();
+
+        // Set up the VAO by binding the height and grid buffers to the attribute locations
+        // in the shader program.
+        glBindVertexArray(this->vao);
+
+        // Note - keep variable "in_height" in sync with shader.
+        // One float per vertex, read from the start of the height buffer.
+        glBindBuffer(GL_ARRAY_BUFFER, this->height_buffer);
+        const GLuint height_attrib = glGetAttribLocation(this->program, "in_height");
+        glVertexAttribPointer(height_attrib, 1, GL_FLOAT, GL_FALSE, 0, 0);
+        glEnableVertexAttribArray(height_attrib);
+
+        // Note - keep variable "in_xy" in sync with shader.
+        // Two floats per vertex, read from the start of the grid buffer.
+        const GLuint grid_attrib = glGetAttribLocation(this->program, "in_xy");
+        glBindBuffer(GL_ARRAY_BUFFER, this->grid_buffer);
+        glVertexAttribPointer(grid_attrib, 2, GL_FLOAT, GL_FALSE, 0, 0);
+        glEnableVertexAttribArray(grid_attrib);
+
+        this->fps_start_time = std::chrono::high_resolution_clock::now();
+    }
+
+    // The renderer owns its OpenGL handles, so it can neither be copied nor moved.
+    renderer(const renderer&)            = delete;
+    renderer& operator=(const renderer&) = delete;
+
+    renderer(renderer&&)            = delete;
+    renderer& operator=(renderer&&) = delete;
+
+    /// \brief Release the shader program, the buffers and the vertex array.
+    ~renderer()
+    {
+        glDeleteProgram(this->program);
+        GLuint buffers[] = {this->index_buffer, this->grid_buffer, this->height_buffer};
+        glDeleteBuffers(std::size(buffers), buffers);
+        glDeleteVertexArrays(1, &this->vao);
+    }
+
+    /// \brief Allocate each buffer in OpenGL and upload the initial values of the grid and
+    /// index buffers. The height buffer is only allocated, not filled.
+    void initialize_buffer_data() const
+    {
+        // Initialize the height buffer.
+        glBindBuffer(GL_ARRAY_BUFFER, this->height_buffer);
+        // We do not need to fill it, as that is going to be done from HIP, but we
+        // do need to allocate it from OpenGL. This is done simply by passing `nullptr` as
+        // initial data pointer.
+        // GL_DYNAMIC_DRAW is passed because this buffer is going to be updated every frame,
+        // and is going to be used to hold vertex data for drawing - this may help the driver
+        // to render more efficiently.
+        glBufferData(GL_ARRAY_BUFFER, height_buffer_size, nullptr, GL_DYNAMIC_DRAW);
+
+        // Initialize the grid buffer.
+        {
+            glBindBuffer(GL_ARRAY_BUFFER, this->grid_buffer);
+            // Avoid having to allocate on host by allocating the buffer in OpenGL and then mapping it
+            // into host-memory to initialize it.
+            // This buffer is going to be initialized once and is going to be used for drawing,
+            // so pass GL_STATIC_DRAW as usage hint.
+            glBufferData(GL_ARRAY_BUFFER, grid_buffer_size, nullptr, GL_STATIC_DRAW);
+
+            // NOTE(review): the pointer returned by glMapBuffer is not checked before use.
+            float* grid = reinterpret_cast<float*>(glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY));
+            // Grid points are written row by row (x on the inner axis), with both
+            // coordinates spread evenly over the range [-1, 1].
+            for(uint32_t y = 0; y < grid_height; ++y)
+            {
+                for(uint32_t x = 0; x < grid_width; ++x)
+                {
+                    *grid++ = (2.0f * x) / (grid_width - 1) - 1;
+                    *grid++ = (2.0f * y) / (grid_height - 1) - 1;
+                }
+            }
+
+            // Let OpenGL know that we are done with this buffer.
+            glUnmapBuffer(GL_ARRAY_BUFFER);
+        }
+
+        // Initialize the index buffer
+        {
+            glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, this->index_buffer);
+            // Similar to the grid buffer, this buffer is going to be initialized once and is then used
+            // for drawing.
+            glBufferData(GL_ELEMENT_ARRAY_BUFFER, index_buffer_size, nullptr, GL_STATIC_DRAW);
+
+            // NOTE(review): the pointer returned by glMapBuffer is not checked before use.
+            uint32_t* indices
+                = reinterpret_cast<uint32_t*>(glMapBuffer(GL_ELEMENT_ARRAY_BUFFER, GL_WRITE_ONLY));
+            // Every square of the grid, with (x, y) as its first corner, is split into two
+            // triangles of three indices each.
+            for(uint32_t y = 0; y < grid_height - 1; ++y)
+            {
+                for(uint32_t x = 0; x < grid_width - 1; ++x)
+                {
+                    *indices++ = (y + 0) * grid_width + (x + 0);
+                    *indices++ = (y + 1) * grid_width + (x + 0);
+                    *indices++ = (y + 0) * grid_width + (x + 1);
+                    *indices++ = (y + 1) * grid_width + (x + 0);
+                    *indices++ = (y + 1) * grid_width + (x + 1);
+                    *indices++ = (y + 0) * grid_width + (x + 1);
+                }
+            }
+
+            glUnmapBuffer(GL_ELEMENT_ARRAY_BUFFER);
+        }
+    }
+
+    /// \brief Bind the OpenGL pipeline state for this renderer: the vertex array, the index
+    /// buffer and the shader program.
+    void bind() const
+    {
+        glBindVertexArray(this->vao);
+        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, this->index_buffer);
+        glUseProgram(this->program);
+    }
+
+    /// \brief Draw the next frame to the window. This requires the render state be bound using
+    /// <tt>bind</tt>.
+    void draw()
+    {
+        glDrawElements(GL_TRIANGLES, num_indices, GL_UNSIGNED_INT, nullptr);
+
+        // Output a simple performance measurement: once more than 5 seconds have passed since
+        // the last report, print the average frame rate and frame time, then reset the counters.
+        ++this->fps_frame;
+        const auto frame_time = std::chrono::high_resolution_clock::now();
+        const auto time_diff  = frame_time - this->fps_start_time;
+        if(time_diff > std::chrono::seconds{5})
+        {
+            const auto time_diff_sec
+                = std::chrono::duration_cast<std::chrono::duration<float>>(time_diff).count();
+            std::cout << "Average FPS (over " << double_precision(time_diff_sec, 2, true)
+                      << " seconds): " << double_precision(this->fps_frame / time_diff_sec, 2, true)
+                      << " (" << double_precision((time_diff_sec * 1000) / this->fps_frame, 2, true)
+                      << " ms per frame, " << this->fps_frame << " frames)" << std::endl;
+            this->fps_frame      = 0;
+            this->fps_start_time = frame_time;
+        }
+    }
+};
+
+/// [Sphinx sinewave kernel start]
+/// \brief The main HIP kernel for this example - computes a simple sine wave over a
+/// 2-dimensional grid of points.
+/// \param height_map - the grid of points to compute a sine wave for. It is expected to be
+///   a \p grid_width by \p grid_height array packed into memory in row-major order (x on the
+///   inner axis), which is the order in which the renderer lays out its vertices.
+/// \param time - The current time relative to the start of the program.
+__global__ void sinewave_kernel(float* height_map, const float time)
+{
+    const float        freq = 10.f;
+    const unsigned int x    = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int y    = blockIdx.y * blockDim.y + threadIdx.y;
+    const float        u    = (2.f * x) / grid_width - 1.f;
+    const float        v    = (2.f * y) / grid_height - 1.f;
+
+    // Threads that fall outside of the grid must not write to the height map.
+    if(x < grid_width && y < grid_height)
+    {
+        // Each row holds grid_width points, so y selects the row and x the column. This stays
+        // within the buffer for any grid size, also when grid_width and grid_height differ.
+        height_map[y * grid_width + x] = sinf(u * freq + time) * cosf(v * freq + time);
+    }
+}
+/// [Sphinx sinewave kernel end]
+
+/// \brief This structure contains the HIP state and functionality used to advance the simulation.
+/// Initializing a \p simulator fetches the OpenGL height buffer from the corresponding <tt>renderer</tt>,
+/// and imports it as a HIP device pointer. This pointer is then passed to the simulation kernel
+/// (<tt>sinewave_kernel</tt>), which updates the values in it. When <tt>renderer::draw</tt> is called,
+/// the updated values are read from the buffer in OpenGL and used to render the triangle grid.
+struct simulator
+{
+    /// The HIP stream used to advance the simulation. This must be created from an OpenGL-interop
+    /// capable device, see <tt>pick_hip_device</tt>.
+    hipStream_t hip_stream;
+    /// A HIP graphics resource that is imported from the OpenGL height buffer to simulate.
+    hipGraphicsResource_t hip_height_buffer;
+    /// A device pointer to the height buffer, imported from the OPenGL height buffer.
+    float* hip_height_ptr;
+
+    /// The start time of the program, used for the simulation.
+    std::chrono::high_resolution_clock::time_point start_time;
+
+    /// \brief Initialize a simulator, that uses a particular HIP device.
+    /// \param renderer - The renderer that will be used to render the example. Its height buffer
+    ///   is imported to HIP for use with this simulator.
+    explicit simulator(const int hip_device, const renderer& renderer)
+    {
+        // Create a HIP stream for the target device.
+        HIP_CHECK(hipSetDevice(hip_device));
+        HIP_CHECK(hipStreamCreate(&this->hip_stream));
+
+        // [Sphinx buffer register and get start]
+        // Import the OpenGL height buffer into a HIP graphics resource.
+        HIP_CHECK(hipGraphicsGLRegisterBuffer(
+            &this->hip_height_buffer,
+            renderer.height_buffer,
+            // We are going to write to this buffer from HIP,
+            // but we do not need to read from it.
+            // As an optimization we can pass hipGraphicsRegisterFlagsWriteDiscard,
+            // so that the driver knows that we do not need the old values of
+            // the buffer.
+            hipGraphicsRegisterFlagsWriteDiscard));
+
+        // After importing the OpenGL height buffer into HIP, map it into HIP memory so that we can use it.
+        HIP_CHECK(hipGraphicsMapResources(1, &this->hip_height_buffer, this->hip_stream));
+
+        // Fetch the device pointer that points to the OpenGL buffer's memory.
+        // This function also fetches the size of the buffer. We already know it, but we still need to pass
+        // a valid pointer to hipGraphicsResourceGetMappedPointer.
+        size_t size;
+        HIP_CHECK(
+            hipGraphicsResourceGetMappedPointer(reinterpret_cast<void**>(&this->hip_height_ptr),
+                                                &size,
+                                                this->hip_height_buffer));
+        // [Sphinx buffer register and get end]
+
+        this->start_time = std::chrono::high_resolution_clock::now();
+    }
+
+    simulator(const simulator&)            = delete;
+    simulator& operator=(const simulator&) = delete;
+
+    simulator(simulator&&)            = delete;
+    simulator& operator=(simulator&&) = delete;
+
+    ~simulator()
+    {
+        // [Sphinx unregister start]
+        HIP_CHECK(hipStreamSynchronize(this->hip_stream)); // Wait for work queued on the stream.
+        HIP_CHECK(hipGraphicsUnmapResources(1, &this->hip_height_buffer, this->hip_stream)); // Undo the constructor's mapping.
+        HIP_CHECK(hipGraphicsUnregisterResource(this->hip_height_buffer)); // Undo the constructor's registration.
+        HIP_CHECK(hipStreamDestroy(this->hip_stream));
+        // [Sphinx unregister end]
+    }
+
+    /// \brief Advance the simulation one step, using the time elapsed since construction.
+    void step()
+    {
+        const auto  now = std::chrono::high_resolution_clock::now();
+        const float time
+            = std::chrono::duration<float, std::chrono::seconds::period>(now - this->start_time)
+                  .count();
+
+        // [Sphinx buffer use in kernel start]
+        // The tile size to be used for each block of the computation. A tile is
+        // tile_size by tile_size threads in this case, since we are invoking the
+        // computation over a 2D-grid.
+        constexpr size_t tile_size = 8;
+
+        // Launch the HIP kernel to advance the simulation.
+        sinewave_kernel<<<dim3(ceiling_div(grid_width, tile_size),
+                               ceiling_div(grid_height, tile_size)),
+                          dim3(tile_size, tile_size),
+                          0,
+                          this->hip_stream>>>(this->hip_height_ptr, time);
+
+        // Check that no errors occurred while launching the kernel.
+        HIP_CHECK(hipGetLastError());
+        // [Sphinx buffer use in kernel end]
+    }
+};
+
+/// \brief GLFW framebuffer size callback: If the window is resized then we need to resize
+/// the OpenGL viewport to the new framebuffer dimensions.
+void resize_callback(GLFWwindow* const window, const int width, const int height)
+{
+    (void)window; // Unused.
+    glViewport(0, 0, width, height);
+}
+
+/// \brief Program entry point.
+int main()
+{
+    // The initial width of the GLFW window when the example is first started.
+    constexpr int initial_window_width = 1280;
+    // The initial height of the GLFW window.
+    constexpr int initial_window_height = 800;
+
+    // Initialize GLFW.
+    glfwSetErrorCallback(
+        [](int code, const char* const message)
+        { std::cerr << "A glfw error encountered: " << message << "(" << code << ")\n"; });
+
+    if(glfwInit() != GLFW_TRUE)
+    {
+        std::cerr << "failed to initialize GLFW\n";
+        return error_exit_code;
+    }
+
+    // Initialize the GLFW window used to render the example.
+    GLFWwindow* const window = create_window(initial_window_width, initial_window_height);
+
+    // Ensure that we are using the OpenGL context associated with the window.
+    glfwMakeContextCurrent(window);
+
+    // [Sphinx opengl functions load start]
+    // Load the OpenGL function pointers with glad, using GLFW's loader - we need this for the
+    // more recent OpenGL functions, as these are not loaded by default on all platforms.
+    if(!gladLoadGLLoader(reinterpret_cast<GLADloadproc>(glfwGetProcAddress)))
+    {
+        std::cerr << "Failed to load OpenGL function pointers" << std::endl;
+        return error_exit_code;
+    }
+    // [Sphinx opengl functions load end]
+
+    // Disable vsync.
+    glfwSwapInterval(0);
+
+    // If the OpenGL GL_ARB_debug_output extension is present, set a callback that is called
+    // whenever an OpenGL error occurs. This saves us calling glGetError after every OpenGL function.
+    if(GLAD_GL_ARB_debug_output)
+    {
+        glDebugMessageCallbackARB(
+            [](GLenum,
+               GLenum,
+               GLuint,
+               GLenum        severity,
+               GLsizei       length,
+               const GLchar* message,
+               const void*)
+            {
+                std::cerr << "[OpenGL] ";
+                std::cerr.write(message, length) << std::endl;
+                if(severity == GL_DEBUG_SEVERITY_HIGH_ARB)
+                {
+                    std::exit(error_exit_code);
+                }
+            },
+            nullptr);
+        // We just want the errors: First disable all messaging, and then enable just the
+        // most severe ones.
+        glDebugMessageControlARB(GL_DONT_CARE, GL_DONT_CARE, GL_DONT_CARE, 0, NULL, GL_FALSE);
+        glDebugMessageControlARB(GL_DONT_CARE,
+                                 GL_DONT_CARE,
+                                 GL_DEBUG_SEVERITY_HIGH_ARB,
+                                 0,
+                                 NULL,
+                                 GL_TRUE);
+        // Report errors synchronously instead of asynchronously.
+        glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS_ARB);
+    }
+
+    // Figure out which HIP device we need to use.
+    // This device needs to be interop-capable (see pick_hip_device).
+    const int hip_device = pick_hip_device();
+
+    // Let the user know which device we are using, on both the OpenGL and HIP sides.
+    hipDeviceProp_t hip_props;
+    HIP_CHECK(hipGetDeviceProperties(&hip_props, hip_device));
+    const GLubyte* const device_name = glGetString(GL_RENDERER);
+    std::cout << "Using device " << device_name << " (hip device " << hip_device
+              << ", compute capability " << hip_props.major << "." << hip_props.minor << ")\n";
+
+    // Sub-scope to call destructors before terminating GLFW.
+    {
+        renderer  renderer;
+        simulator simulator(hip_device, renderer);
+
+        // There are no other renderers, so we can bind the OpenGL state once.
+        renderer.bind();
+
+        glfwSetFramebufferSizeCallback(window, resize_callback);
+        glClearColor(0, 0, 0, 1);
+
+        // The main rendering loop.
+        // Repeat for as long as the window is not closed.
+        while(glfwWindowShouldClose(window) == GLFW_FALSE)
+        {
+            glClear(GL_COLOR_BUFFER_BIT);
+
+            // First step the simulation so that the height buffer is ready
+            // for the next frame.
+            simulator.step();
+
+            // Draw the example to the window's framebuffer.
+            renderer.draw();
+
+            // Present the framebuffer on screen.
+            glfwSwapBuffers(window);
+            glfwPollEvents();
+        }
+    }
+
+    // Clean up GLFW.
+    glfwDestroyWindow(window);
+    glfwTerminate();
+}
@@ -0,0 +1,4 @@
+import urllib.request
+
+urllib.request.urlretrieve("https://raw.githubusercontent.com/ROCm/rocm-examples/refs/heads/develop/HIP-Basic/opengl_interop/main.hip", "docs/tools/example_codes/opengl_interop.hip")
+urllib.request.urlretrieve("https://raw.githubusercontent.com/ROCm/rocm-examples/refs/heads/develop/HIP-Basic/vulkan_interop/main.hip", "docs/tools/example_codes/external_interop.hip")
@@ -30,7 +30,7 @@ Implementing reductions on GPUs requires a basic understanding of the :doc:`/und

 Synchronizing parallel threads of execution across a GPU is crucial for correctness as the partial results can't be synchronized before they manifest. Synchronizing all the threads running on a GPU at any given time is possible, however, it is a costly and intricate operation. If synchronization is not absolutely necessary, map the parallel algorithm so that multiprocessors and blocks can make independent progress and need not sync frequently.

-There are ten reduction implementations in the `rocm-examples <https://github.com/ROCm/rocm-examples/tree/develop/Tutorials/reduction/include/Reduction>`_, which are described in the following sections. 
+There are ten reduction implementations in the `rocm-examples <https://github.com/ROCm/rocm-examples/tree/develop/Tutorials/reduction/include/Reduction>`_, which are described in the following sections.

 Naive shared reduction
 ----------------------
@@ -188,7 +188,7 @@ A notable exception is when the shared read uniformly broadcasts to the same add
 .. note::

    To avoid bank conflicts, read shared memory in a coalesced manner, which implies that reads/writes of each lane in a warp evaluate to consecutive locations. Analyzing the read/write patterns could help you to understand the cause of bank conflicts. For more details, check `CDNA3 ISA <https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf>`_ or `RDNA3 ISA <https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf>`_ data share operations chapter.
-    
+
 Utilize upper half of the block
 -------------------------------

@@ -143,10 +143,12 @@ Retrieval of the result from the device is done much like input data copy. In th

  HIP_CHECK(hipMemcpy(y.data(), d_y, size_bytes, hipMemcpyDeviceToHost));

+.. _compiling_on_the_command_line:
+
 Compiling on the command line
 =============================

-.. _setting_up_the_command-line:
+.. _setting_up_the_command_line:

 Setting up the command line
 ---------------------------
@@ -19,11 +19,11 @@ Project organization

 CLR includes the following source code,

-* ``hipamd`` - contains implementation of ``HIP`` language on the AMD platform. It is hosted at `clr/hipamd <https://github.com/ROCm/clr/tree/develop/hipamd>`_.
+* ``hipamd`` - contains implementation of ``HIP`` language on the AMD platform. It is hosted at `clr/hipamd <https://github.com/ROCm/clr/tree/amd-staging/hipamd>`_.

-* ``opencl`` - contains implementation of `OpenCL™ <https://www.khronos.org/opencl/>`_ on AMD platform. It is hosted at `clr/opencl <https://github.com/ROCm/clr/tree/develop/opencl>`_.
+* ``opencl`` - contains implementation of `OpenCL™ <https://www.khronos.org/opencl/>`_ on AMD platform. It is hosted at `clr/opencl <https://github.com/ROCm/clr/tree/amd-staging/opencl>`_.

-* ``rocclr`` - contains ROCm compute runtime used in `HIP` and `OpenCL™`. This is hosted at `clr/rocclr <https://github.com/ROCm/clr/tree/develop/rocclr>`_.
+* ``rocclr`` - contains ROCm compute runtime used in `HIP` and `OpenCL™`. This is hosted at `clr/rocclr <https://github.com/ROCm/clr/tree/amd-staging/rocclr>`_.


 How to build/install
@@ -79,4 +79,4 @@ To run ``hip-tests`` please go to the repository and follow the steps.
 Release notes
 -------------

-HIP provides release notes in CLR `change log <https://github.com/ROCm/clr/blob/develop/CHANGELOG.md>`_, which has records of changes in each release.
+HIP provides release notes in CLR `change log <https://github.com/ROCm/clr/blob/amd-staging/CHANGELOG.md>`_, which has records of changes in each release.
@@ -0,0 +1,100 @@
+.. meta::
+  :description: Compilation workflow of the HIP compilers.
+  :keywords: AMD, ROCm, HIP, CUDA, HIP runtime API
+
+.. _hip_compilers:
+
+********************************************************************************
+HIP compilers
+********************************************************************************
+
+ROCm provides the compiler driver ``hipcc``, that can be used on AMD ROCm and
+NVIDIA CUDA platforms.
+
+On ROCm, ``hipcc`` takes care of the following:
+
+- Setting the default library and include paths for HIP
+- Setting some environment variables
+- Invoking the appropriate compiler - ``amdclang++``
+
+On the NVIDIA CUDA platform, ``hipcc`` takes care of invoking the ``nvcc``
+compiler. ``amdclang++`` is based on the ``clang++`` compiler. For more
+details, see the :doc:`llvm project<llvm-project:index>`.
+
+HIP compilation workflow
+================================================================================
+
+HIP provides a flexible compilation workflow that supports both offline
+compilation and runtime or just-in-time (JIT) compilation. Each approach has
+advantages depending on the use case, target architecture, and performance
+needs.
+
+The offline compilation is ideal for production environments, where the
+performance is critical and the target GPU architecture is known in advance.
+
+The runtime compilation is useful in development environments or when
+distributing software that must run on a wide range of hardware without the
+knowledge of the GPU in advance. It provides flexibility at the cost of some
+performance overhead.
+
+Offline compilation
+--------------------------------------------------------------------------------
+
+The HIP code compilation is performed in two stages: the device-code and the
+host-code compilation stages.
+
+- Device-code compilation stage: The compiled device code is embedded into the
+  host object file. Depending on the platform, the device code can be compiled
+  into assembly or binary. ``nvcc`` and ``amdclang++`` target different
+  architectures and use different code object formats. ``nvcc`` uses the binary
+  ``cubin`` or the assembly PTX files, while ``amdclang++`` uses the
+  binary ``hsaco`` format. On CUDA platforms, the driver compiles the PTX files
+  to executable code during runtime.
+
+- Host-code compilation stage: On the host side, ``hipcc`` or ``amdclang++`` can
+  compile the host code in one step without other C++ compilers. On the other
+  hand, ``nvcc`` only replaces the ``<<<...>>>`` kernel launch syntax with the
+  appropriate CUDA runtime function call and the modified host code is passed to
+  the default host compiler.
+
+For an example of how to compile HIP from the command line, see the :ref:`SAXPY
+tutorial<compiling_on_the_command_line>`.
+
+Runtime compilation
+--------------------------------------------------------------------------------
+
+HIP allows you to compile kernels at runtime using the ``hiprtc*`` API. Kernels
+are stored as a text string, which is passed to HIPRTC alongside options to
+guide the compilation.
+
+For more details, see
+:doc:`HIP runtime compiler <../how-to/hip_rtc>`.
+
+Static libraries
+================================================================================
+
+``hipcc`` supports generating two types of static libraries.
+
+- The first type of static library only exports and launches host functions
+  within the same library and not the device functions. This library type offers
+  the ability to link with a non-hipcc compiler such as ``gcc``. Additionally,
+  this library type contains host objects with device code embedded as fat
+  binaries. This library type is generated using the flag ``--emit-static-lib``:
+
+  .. code-block:: shell
+
+    hipcc hipOptLibrary.cpp --emit-static-lib -fPIC -o libHipOptLibrary.a
+    gcc test.cpp -L. -lhipOptLibrary -L/path/to/hip/lib -lamdhip64 -o test.out
+
+- The second type of static library exports device functions to be linked by
+  other code objects by using ``hipcc`` as the linker. This library type
+  contains relocatable device objects and is generated using ``ar``:
+
+  .. code-block:: shell
+
+    hipcc hipDevice.cpp -c -fgpu-rdc -o hipDevice.o
+    ar rcsD libHipDevice.a hipDevice.o
+    hipcc libHipDevice.a test.cpp -fgpu-rdc -o test.out
+
+For more information, see `HIP samples host functions <https://github.com/ROCm/hip-tests/tree/develop/samples/2_Cookbook/15_static_library/host_functions>`_
+and `device functions <https://github.com/ROCm/hip-tests/tree/develop/samples/2_Cookbook/15_static_library/device_functions>`_.
@@ -26,11 +26,10 @@ according to the :ref:`SIMT model<programming_model_simt>`, together with the
 necessary registers and caches.

 The threads are executed in groupings called warps. The amount of threads
-making up a warp is architecture dependent.
-On AMD GPUs the warp size is commonly 64 threads, except in RDNA
-architectures which can utilize a warp size of 32 or 64 respectively. 
-The warp size of supported AMD GPUs is listed in the :doc:`rocm:reference/gpu-arch-specs`. 
-NVIDIA GPUs have a warp size of 32.
+making up a warp is architecture dependent. On AMD GPUs the warp size is
+commonly 64 threads, except in RDNA architectures which can utilize a warp size
+of 32 or 64. The warp size of supported AMD GPUs is listed in the
+:doc:`rocm:reference/gpu-arch-specs`. NVIDIA GPUs have a warp size of 32.

 In contrast to CPUs, GPUs generally do not employ complex cache structures or
 control logic, like branch prediction or out-of-order execution, but instead
@@ -2,7 +2,9 @@
  :description: This chapter explains the HIP programming model, the contract
                between the programmer and the compiler/runtime executing the
                code, how it maps to the hardware.
-  :keywords: AMD, ROCm, HIP, CUDA, API design
+  :keywords: ROCm, HIP, CUDA, API design, programming model
+
+.. _programming_model:

 *******************************************************************************
 HIP programming model
@@ -10,7 +12,7 @@ HIP programming model

 The HIP programming model makes it easy to map data-parallel C/C++ algorithms to
 massively parallel, wide single instruction, multiple data (SIMD) architectures,
-such as GPUs. 
+such as GPUs.

 While the model may be expressed in most imperative languages, (for example
 Python via PyHIP) this document will focus on the original C/C++ API of HIP.
@@ -74,7 +76,7 @@ a few key differences between the two:
  accessible from all contexts.

  Looking at :ref:`rdna3_cu` and :ref:`cdna3_cu`, you can see that
-  every CU has an instance of storage backing the namespace ``__shared__``. 
+  every CU has an instance of storage backing the namespace ``__shared__``.
  Even if the host were to have access to these regions of
  memory, the performance benefits of the segmented memory subsystem are
  supported by the inability of asynchronous access from the host.
@@ -90,11 +92,11 @@ a few key differences between the two:

 * Asynchrony is at the forefront of the HIP API. Computations launched on the device
  execute asynchronously with respect to the host, and it is the user's responsibility to
-  synchronize their data dispatch/fetch with computations on the device. 
-  
+  synchronize their data dispatch/fetch with computations on the device.
+
  .. note::
-    HIP does perform implicit synchronization on occasions, more advanced than other 
-    APIs such as OpenCL or SYCL, in which the responsibility of synchronization mostly 
+    HIP does perform implicit synchronization on occasions, more advanced than other
+    APIs such as OpenCL or SYCL, in which the responsibility of synchronization mostly
    depends on the user.

 .. _programming_model_simt:
@@ -130,7 +132,7 @@ The incoming four-vector of floating-point values ``b`` is multiplied by a
 scalar and then added element-wise to the four-vector floating-point values of
 ``a``. On modern SIMD-capable architectures, the four-vector ops are expected to
 compile to a single SIMD instruction. However, GPU execution of this kernel will
-typically break down the vector elements into 4 separate threads for parallel execution, 
+typically break down the vector elements into 4 separate threads for parallel execution,
 as seen in the following figure:

 .. _simt:
@@ -145,7 +147,7 @@ as seen in the following figure:

 In HIP, lanes of the SIMD architecture are fed by mapping threads of a SIMT
 execution, one thread down each lane of an SIMD engine. Execution parallelism
-usually isn't exploited from the width of the built-in vector types, but across multiple threads via the thread ID constants ``threadIdx.x``, ``blockIdx.x``, etc. 
+usually isn't exploited from the width of the built-in vector types, but across multiple threads via the thread ID constants ``threadIdx.x``, ``blockIdx.x``, etc.

 .. _inherent_thread_model:

@@ -159,7 +161,7 @@ online/offline to binaries, in bulk.
 All threads of a kernel are uniquely identified by a set of integral values, called thread IDs.
 The set of integers identifying a thread relate to the hierarchy in which the threads execute.

-The thread hierarchy inherent to how AMD GPUs operate is depicted in the 
+The thread hierarchy inherent to how AMD GPUs operate is depicted in the
 following figure.

 .. _inherent_thread_hierarchy:
@@ -175,9 +177,9 @@ following figure.

 Warp (or Wavefront)
  The innermost grouping of threads is called a warp, or a wavefront in ISA terms. A warp
-  is the most tightly coupled groups of threads, both physically and logically. Threads 
-  inside a warp are also called lanes, and the integral value identifying them is the lane ID. 
-  
+  is the most tightly coupled groups of threads, both physically and logically. Threads
+  inside a warp are also called lanes, and the integral value identifying them is the lane ID.
+
  .. tip::

    Lane IDs aren't queried like other thread IDs, but are user-calculated. As a
@@ -222,10 +224,10 @@ groups let you define your own set of thread groups which may fit  your user-cas
 better than the defaults defined by the hardware.

 .. note::
-  The implicit groups defined by kernel launch parameters are still available 
+  The implicit groups defined by kernel launch parameters are still available
  when working with cooperative groups.

-For further information, see :doc:`Cooperative groups </how-to/cooperative_groups>`. 
+For further information, see :doc:`Cooperative groups </how-to/hip_runtime_api/cooperative_groups>`.

 Memory model
 ============
@@ -287,7 +289,7 @@ HIP programs consist of two distinct scopes:
    importantly around kernel launching and argument setting. It is geared
    towards implementing abstractions atop, such as the runtime API itself.
    Offers two additional pieces of functionality not provided by the Runtime
-    API: ``hipModule`` and ``hipCtx`` APIs. For further details, check 
+    API: ``hipModule`` and ``hipCtx`` APIs. For further details, check
    :doc:`HIP driver API </how-to/hip_porting_driver_api>`.

 * The device-side kernels running on GPUs. Both the host and the device-side
@@ -0,0 +1,99 @@
+.. meta::
+  :description: This chapter provides an introduction to the HIP API.
+  :keywords: AMD, ROCm, HIP, CUDA, C++ language extensions
+
+.. _intro-to-hip:
+
+*******************************************************************************
+What is HIP?
+*******************************************************************************
+
+The Heterogeneous-computing Interface for Portability (HIP) API is a C++ runtime API
+and kernel language that lets developers create portable applications running in heterogeneous systems,
+using CPUs and AMD GPUs or NVIDIA GPUs from a single source code. HIP provides a simple
+marshalling language to access either the AMD ROCm back-end, or NVIDIA CUDA back-end,
+to build and run application kernels.
+
+.. figure:: data/what_is_hip/hip.svg
+    :alt: HIP in an application.
+    :align: center
+
+* HIP is a thin API with little or no performance impact over coding directly
+  in NVIDIA CUDA or AMD :doc:`ROCm <rocm:what-is-rocm>`.
+
+* HIP enables coding in a single-source C++ programming language including
+  features such as templates, C++11 lambdas, classes, namespaces, and more.
+
+* Developers can specialize for the platform (CUDA or ROCm) to tune for
+  performance or handle tricky cases.
+
+ROCm offers compilers (``clang``, ``hipcc``), code
+profilers (``rocprof``, ``omnitrace``), debugging tools (``rocgdb``), libraries
+and HIP with the runtime API and kernel language, to create heterogeneous applications
+running on both CPUs and GPUs. ROCm provides marshalling libraries like
+:doc:`hipFFT <hipfft:index>` or :doc:`hipBLAS <hipblas:index>` that act as a
+thin programming layer over either NVIDIA CUDA or AMD ROCm to enable support for
+either back-end. These libraries offer pointer-based memory interfaces and are
+easily integrated into your applications.
+
+HIP supports the ability to build and run on either AMD GPUs or NVIDIA GPUs.
+GPU programmers familiar with NVIDIA CUDA or OpenCL will find the HIP API
+familiar and easy to use. Developers no longer need to choose between AMD or
+NVIDIA GPUs. You can quickly port your application to run on the available
+hardware while maintaining a single codebase. The :doc:`HIPify <hipify:index>`
+tools, based on the clang front-end and Perl language, can convert CUDA API
+calls into the corresponding HIP API calls. However, HIP is not intended to be a
+drop-in replacement for CUDA, and developers should expect to do some manual
+coding and performance tuning work for AMD GPUs to port existing projects as
+described in the :doc:`HIP porting guide <how-to/hip_porting_guide>`.
+
+HIP provides two components: those that run on the CPU, also known as host
+system, and those that run on GPUs, also referred to as device. The host-based
+code is used to create device buffers, move data between the host application
+and a device, launch the device code (also known as kernel), manage streams and
+events, and perform synchronization. The kernel language provides a way to
+develop massively parallel programs that run on GPUs, and provides access to GPU
+specific hardware capabilities.
+
+In summary, HIP simplifies cross-platform development, maintains performance,
+and provides a familiar C++ experience for GPU programming that runs seamlessly
+on both AMD and NVIDIA GPUs.
+
+HIP components
+===============================================
+
+HIP consists of the following components. For information on the license
+associated with each component, see :doc:`HIP licensing <license>`.
+
+C++ runtime API
+-----------------------------------------------
+
+For the AMD ROCm platform, HIP provides headers and a runtime library built on
+top of the HIP-Clang compiler in the repository
+:doc:`Compute Language Runtime (CLR) <understand/amd_clr>`. The HIP runtime
+implements HIP streams, events, and memory APIs, and is an object library that
+is linked with the application. The source code for all headers and the library
+implementation is available on GitHub.
+
+For the NVIDIA CUDA platform, HIP provides headers that translate from the
+HIP runtime API to the CUDA runtime API. The host-side contains mostly inlined
+wrappers or even just preprocessor defines, with no additional overhead.
+The device-side code is compiled with ``nvcc``, just like normal CUDA kernels,
+and therefore one can expect the same performance as if directly coding in CUDA.
+The CUDA specific headers can be found in the `hipother repository <https://github.com/ROCm/hipother>`_.
+
+For further details, check :ref:`HIP Runtime API Reference <runtime_api_reference>`.
+
+Kernel language
+-----------------------------------------------
+
+HIP provides a C++ syntax that is suitable for compiling most code that commonly appears in
+compute kernels (classes, namespaces, operator overloading, and templates). HIP also defines other
+language features that are designed to target accelerators, such as:
+
+* Short-vector headers that can serve on a host or device
+* Math functions that resemble those in ``math.h``, which is included with standard C++ compilers
+* Built-in functions for accessing specific GPU hardware capabilities
+
+For further details, check :doc:`C++ language extensions <reference/cpp_language_extensions>`
+and :doc:`C++ language support <reference/cpp_language_support>`.
@@ -724,7 +724,7 @@ enum hipLimit_t {

 /** Allocates the memory as write-combined. On some system configurations, write-combined allocation
 * may be transferred faster across the PCI Express bus, however, could have low read efficiency by
- * most CPUs. It's a good option for data tranfer from host to device via mapped pinned memory.*/
+ * most CPUs. It's a good option for data transfer from host to device via mapped pinned memory.*/
 #define hipHostMallocWriteCombined 0x4
 #define hipHostAllocWriteCombined 0x4

@@ -735,11 +735,11 @@ enum hipLimit_t {
 #define hipHostMallocNumaUser  0x20000000
 #define hipExtHostAllocNumaUser  0x20000000

-/** Allocate coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation.*/
+/** Allocate coherent memory. Overrides HIP_HOST_COHERENT for specific allocation.*/
 #define hipHostMallocCoherent  0x40000000
 #define hipExtHostAllocCoherent  0x40000000

-/** Allocate non-coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation.*/
+/** Allocate non-coherent memory. Overrides HIP_HOST_COHERENT for specific allocation.*/
 #define hipHostMallocNonCoherent  0x80000000
 #define hipExtHostAllocNonCoherent  0x80000000

@@ -3494,7 +3494,6 @@ hipError_t hipMemAllocHost(void** ptr, size_t size);
 /**
 * @}
 */
-
 /**
 *  @brief Allocates device accessible page locked (pinned) host memory
 *
@@ -3583,6 +3582,8 @@ hipError_t hipExtHostAlloc(void** ptr, size_t size, unsigned int flags);
 * The API returns the allocation pointer, managed by HMM, can be used further to execute kernels
 * on device and fetch data between the host and device as needed.
 *
+ * If HMM is not supported, the function behaves the same as @p hipMallocHost .
+ *
 * @note   It is recommend to do the capability check before call this API.
 *
 * @param [out] dev_ptr - pointer to allocated device memory
@@ -9323,7 +9324,7 @@ return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize,(hipFunction_t)kern
 * @ingroup ModuleCooperativeG
 *
 * \tparam T                  The type of the kernel function.
- * 
+ *
 * @param [in] f              Kernel function to launch.
 * @param [in] gridDim        Grid dimensions specified as multiple of blockDim.
 * @param [in] blockDim       Block dimensions specified in work-items.
@@ -38,14 +38,14 @@ THE SOFTWARE.

    <definitions>
      <context id="hip">
-      <include>  
+      <include>
        <context ref="def:c-like-comment"/>

        <context ref="c:string"/>

        <context ref="c:escaped-character"/>

-        <context ref="c:storage-class"/> 
+        <context ref="c:storage-class"/>

        <context ref="def:c-like-comment-multiline"/>
        <context ref="def:c-like-close-comment-outside-comment"/>
@@ -56,7 +56,7 @@ THE SOFTWARE.

        <context ref="def:float"/>

-        <context ref="c:hexadecimal"/> 
+        <context ref="c:hexadecimal"/>

        <context ref="c:octal"/>