diff --git a/projects/hip/.github/dependabot.yml b/projects/hip/.github/dependabot.yml index 0e0a252eb6..470629cc14 100644 --- a/projects/hip/.github/dependabot.yml +++ b/projects/hip/.github/dependabot.yml @@ -14,5 +14,6 @@ updates: - "documentation" - "dependencies" - "ci:docs-only" + target-branch: "docs/develop" reviewers: - "samjwu" diff --git a/projects/hip/.readthedocs.yaml b/projects/hip/.readthedocs.yaml index 02a17b0df0..76f837ede8 100644 --- a/projects/hip/.readthedocs.yaml +++ b/projects/hip/.readthedocs.yaml @@ -6,28 +6,28 @@ version: 2 sphinx: configuration: docs/conf.py -formats: [htmlzip, pdf, epub] +formats: [] python: install: - requirements: docs/sphinx/requirements.txt +conda: + environment: docs/environment.yml # needed until ubuntu ships doxygen >= 1.9.8 + build: os: ubuntu-22.04 tools: - python: "3.10" + python: "mambaforge-22.9" # needed until ubuntu ships doxygen >= 1.9.8 apt_packages: - - "doxygen" + - "gfortran" # For pre-processing fortran sources - "graphviz" # For dot graphs in doxygen jobs: post_checkout: - - if [ -d ../llvm-project ]; then rmdir ../llvm-project; fi - - if [ -d ../clr ]; then rmdir ../clr; fi - - if [ -d ../ROCR-Runtime ]; then rmdir ../ROCR-Runtime; fi - - git clone --depth=1 --single-branch --branch rocdoc-195 https://github.com/StreamHPC/llvm-project.git ../llvm-project - - git clone --depth=1 --single-branch --branch develop https://github.com/ROCm/clr.git ../clr - - git clone --depth=1 --single-branch --branch master https://github.com/ROCm/ROCR-Runtime.git ../ROCR-Runtime + - if [ -d ../clr ]; then rm -rf ../clr; fi + - if [ -d ../ROCR-Runtime ]; then rm -rf ../ROCR-Runtime; fi + - git clone --depth=1 --single-branch --branch docs/develop https://github.com/ROCm/clr.git ../clr + - git clone --depth=1 --single-branch --branch master https://github.com/ROCm/ROCR-Runtime.git ../ROCR-Runtime post_build: - rm -rf ../clr - - rm -rf ../llvm-project - rm -rf ../ROCR-Runtime diff --git a/projects/hip/.wordlist.txt 
b/projects/hip/.wordlist.txt index 45af247c0d..2033214571 100644 --- a/projects/hip/.wordlist.txt +++ b/projects/hip/.wordlist.txt @@ -1,8 +1,11 @@ +.hip_fatbin ALU ALUs AmgX APU +APUs AQL +AXPY Asynchrony backtrace Bitcode @@ -12,40 +15,65 @@ builtins Builtins CAS clr +coroutines +Ctx cuBLASLt cuCtx +CUDA's cuDNN +cuModule +dataflow deallocate +decompositions denormal +Dereferencing dll DirectX EIGEN EIGEN's enqueue enqueues +entrypoint +entrypoints enum +enums embeded extern +fatbin fatbinary +foundationally frontends +fnuz +FNUZ +fp gedit GPGPU +GWS hardcoded HC +hcBLAS +HIP-Clang HIP's hipcc +hipCtx hipexamine hipified +hipModule +hipModuleLaunchKernel hipother HIPRTC -hcBLAS icc +IILE +iGPU inplace Interoperation interoperate +Interprocess +interprocess Intrinsics intrinsics IPC +IPs isa Lapack latencies @@ -59,34 +87,63 @@ ltrace makefile Malloc malloc +memset multicore multigrid multithreading +multitenant +MALU +NaN NCCL NDRange nonnegative +NOP Numa Nsight +ocp +overindex +overindexing oversubscription +pixelated +pragmas +preallocated preconditioners +predefining prefetched preprocessor PTX PyHIP queryable +prefetching +quad representable RMW ROCm's rocTX +roundtrip RTC RTTI +rvalue +SAXPY scalarizing sceneries +shaders SIMT +SOMA SPMV structs SYCL syntaxes +texel +texels +tradeoffs +templated +toolkits typedefs +unintuitive +UMM +unmap +upscaled +variadic WinGDB -zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz \ No newline at end of file +zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz diff --git a/projects/hip/README.md b/projects/hip/README.md index f651518d35..e7037c7fdb 100644 --- a/projects/hip/README.md +++ b/projects/hip/README.md @@ -37,7 +37,7 @@ HIP releases are typically naming convention for each ROCM release to help diffe * [Installation](docs/install/install.rst) * [HIP FAQ](docs/how-to/faq.md) -* [HIP Kernel Language](docs/reference/kernel_language.rst) +* [HIP C++ Language Extensions](docs/reference/cpp_language_extensions.rst) * [HIP Porting 
Guide](docs/how-to/hip_porting_guide.md) * [HIP Porting Driver Guide](docs/how-to/hip_porting_driver_api.md) * [HIP Programming Guide](docs/how-to/programming_manual.md) @@ -88,7 +88,7 @@ hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost); The HIP kernel language defines builtins for determining grid and block coordinates, math functions, short vectors, atomics, and timer functions. -It also specifies additional defines and keywords for function types, address spaces, and optimization controls (See the [HIP Kernel Language](docs/reference/kernel_language.rst) for a full description). +It also specifies additional defines and keywords for function types, address spaces, and optimization controls (See the [HIP C++ Language Extensions](docs/reference/cpp_language_extensions.rst) for a full description). Here's an example of defining a simple 'vector_square' kernel. ```cpp diff --git a/projects/hip/docs/conf.py b/projects/hip/docs/conf.py index 3dec52d636..82bcefee89 100644 --- a/projects/hip/docs/conf.py +++ b/projects/hip/docs/conf.py @@ -43,4 +43,12 @@ extensions += ["sphinxcontrib.doxylink"] cpp_id_attributes = ["__global__", "__device__", "__host__", "__forceinline__", "static"] cpp_paren_attributes = ["__declspec"] -suppress_warnings = ["etoc.toctree"] \ No newline at end of file +suppress_warnings = ["etoc.toctree"] + +numfig = False + + +exclude_patterns = [ + "doxygen/mainpage.md", + "understand/glossary.md" +] \ No newline at end of file diff --git a/projects/hip/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.drawio b/projects/hip/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.drawio new file mode 100644 index 0000000000..4f1ff494f2 --- /dev/null +++ b/projects/hip/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.drawio @@ -0,0 +1,904 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/projects/hip/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.svg b/projects/hip/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.svg new file mode 100644 index 0000000000..298cd48218 --- /dev/null +++ b/projects/hip/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.svg @@ 
-0,0 +1 @@ +Block
Thread-block tile
Thread-block tile
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Thread-block tile
Thread-block tile
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Block
Coalesced group
Coalesced group
Warp
Warp
Warp
Warp
Warp
Warp
Coalesced group
Coalesced group
Warp
Warp
Warp
Warp
Warp
Warp
Grid
Text is not SVG - cannot display
\ No newline at end of file diff --git a/projects/hip/docs/data/understand/programming_model_reference/thread_hierarchy_coop.drawio b/projects/hip/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.drawio similarity index 98% rename from projects/hip/docs/data/understand/programming_model_reference/thread_hierarchy_coop.drawio rename to projects/hip/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.drawio index fb4c19fef9..e4c0c90d2d 100644 --- a/projects/hip/docs/data/understand/programming_model_reference/thread_hierarchy_coop.drawio +++ b/projects/hip/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.drawio @@ -1,6 +1,6 @@ - + @@ -1411,7 +1411,7 @@ - + @@ -1591,7 +1591,7 @@ - + @@ -1762,7 +1762,7 @@ - + @@ -1876,7 +1876,7 @@ - + @@ -2047,7 +2047,7 @@ - + @@ -3490,7 +3490,7 @@ - + @@ -3670,7 +3670,7 @@ - + @@ -3841,7 +3841,7 @@ - + @@ -3955,7 +3955,7 @@ - + @@ -4126,7 +4126,7 @@ - + @@ -4534,7 +4534,7 @@ - + @@ -4600,7 +4600,7 @@ - + @@ -4771,7 +4771,7 @@ - + @@ -4933,7 +4933,7 @@ - + @@ -4984,163 +4984,163 @@ - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + diff --git a/projects/hip/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.svg b/projects/hip/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.svg new file mode 100644 index 0000000000..ebe4794576 --- /dev/null +++ b/projects/hip/docs/data/how-to/cooperative_groups/thread_hierarchy_coop_top.svg @@ -0,0 +1 @@ +Grid
Block
Block
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Block
Block
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Block
Block
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Block
Block
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Block
Block
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Block
Block
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
GridMulti Grid
Block
Block
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Text is not SVG - cannot display
\ No newline at end of file diff --git a/projects/hip/docs/data/how-to/hipgraph/hip_graph.drawio b/projects/hip/docs/data/how-to/hipgraph/hip_graph.drawio new file mode 100644 index 0000000000..03569ac734 --- /dev/null +++ b/projects/hip/docs/data/how-to/hipgraph/hip_graph.drawio @@ -0,0 +1,76 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/projects/hip/docs/data/how-to/hipgraph/hip_graph.svg b/projects/hip/docs/data/how-to/hipgraph/hip_graph.svg new file mode 100644 index 0000000000..6eed6b92e5 --- /dev/null +++ b/projects/hip/docs/data/how-to/hipgraph/hip_graph.svg @@ -0,0 +1,4 @@ + + + +Stream 1
Kernel B
Kernel B
Stream 2
Kernel A
Kernel A
hipDeviceSynchronize
hipDeviceSynchronize
Kernel C
Kernel C
hipDeviceSynchronize
hipDeviceSynchronize
Kernel D
Kernel D
Kernel A
Kernel A
Kernel B
Kernel B
Kernel C
Kernel C
Kernel D
Kernel D
Text is not SVG - cannot display
\ No newline at end of file diff --git a/projects/hip/docs/data/how-to/hipgraph/hip_graph_speedup.drawio b/projects/hip/docs/data/how-to/hipgraph/hip_graph_speedup.drawio new file mode 100644 index 0000000000..7802785f6b --- /dev/null +++ b/projects/hip/docs/data/how-to/hipgraph/hip_graph_speedup.drawio @@ -0,0 +1,162 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/projects/hip/docs/data/how-to/hipgraph/hip_graph_speedup.svg b/projects/hip/docs/data/how-to/hipgraph/hip_graph_speedup.svg new file mode 100644 index 0000000000..f16123b9e2 --- /dev/null +++ b/projects/hip/docs/data/how-to/hipgraph/hip_graph_speedup.svg @@ -0,0 +1,4 @@ + + + +Streams
kernel A
kernel launch A
kernel B
kernel C
kernel launch B
kernel launch C
host activity
device activity
time
kernel launch D
kernel D
device idling due to kernel launch congestion
kernel A
kernel B
kernel C
graph launch
host activity
device activity
kernel D
Graph
speedup
\ No newline at end of file diff --git a/projects/hip/docs/data/tutorial/reduction/conflict_free_reduction.drawio b/projects/hip/docs/data/tutorial/reduction/conflict_free_reduction.drawio new file mode 100644 index 0000000000..b1f0b51074 --- /dev/null +++ b/projects/hip/docs/data/tutorial/reduction/conflict_free_reduction.drawio @@ -0,0 +1,448 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/projects/hip/docs/data/tutorial/reduction/conflict_free_reduction.svg b/projects/hip/docs/data/tutorial/reduction/conflict_free_reduction.svg new file mode 100644 index 0000000000..71eb0660ed --- /dev/null +++ b/projects/hip/docs/data/tutorial/reduction/conflict_free_reduction.svg @@ -0,0 +1 @@ +Bank conflict free reduction
Shared
Shared
5
5
13
13
0
0
8
8
2
2
1
1
7
7
42
42
2
2
23
23
10
10
3
3
Shared
Shared
7
7
42
42
23
23
10
10
7
7
42
42
23
23
10
10
0
0
1
1
Shared
Shared
23
23
42
42
23
23
10
10
7
7
42
42
23
23
10
10
0
0
Shared
Shared
42
42
42
42
23
23
10
10
7
7
42
42
23
23
10
10
Thread IDs
Thread...
Data snapshot
Data sn...
Thread IDs
Thread...
Data snapshot
Data sn...
Thread IDs
Thread...
Data snapshot
Data sn...
Data snapshot
Data sn...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/projects/hip/docs/data/tutorial/reduction/foldl.drawio b/projects/hip/docs/data/tutorial/reduction/foldl.drawio new file mode 100644 index 0000000000..1d5228da9e --- /dev/null +++ b/projects/hip/docs/data/tutorial/reduction/foldl.drawio @@ -0,0 +1,142 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/projects/hip/docs/data/tutorial/reduction/foldl.svg b/projects/hip/docs/data/tutorial/reduction/foldl.svg new file mode 100644 index 0000000000..7603080193 --- /dev/null +++ b/projects/hip/docs/data/tutorial/reduction/foldl.svg @@ -0,0 +1 @@ +Fold-left
Input
Input
8
8
13
13
5
5
z
z
5
5
f(z,5)
f(z,5)
5
5
13
13
f(f(z,5),13)
f(f(z,5...
13
13
8
8
f(f(f(z,5),13),8)
f(f(f(z...
Result
Result
13
13
Text is not SVG - cannot display
\ No newline at end of file diff --git a/projects/hip/docs/data/tutorial/reduction/naive_reduction.drawio b/projects/hip/docs/data/tutorial/reduction/naive_reduction.drawio new file mode 100644 index 0000000000..b186c58aad --- /dev/null +++ b/projects/hip/docs/data/tutorial/reduction/naive_reduction.drawio @@ -0,0 +1,442 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/projects/hip/docs/data/tutorial/reduction/naive_reduction.svg b/projects/hip/docs/data/tutorial/reduction/naive_reduction.svg new file mode 100644 index 0000000000..922bfff1e9 --- /dev/null +++ b/projects/hip/docs/data/tutorial/reduction/naive_reduction.svg @@ -0,0 +1 @@ +Naive Shared Reduction
Shared
Shared
5
5
13
13
0
0
8
8
2
2
2
2
7
7
42
42
4
4
23
23
10
10
6
6
Shared
Shared
13
13
13
13
0
0
8
8
2
2
42
42
42
42
4
4
23
23
10
10
Shared
Shared
13
13
13
13
0
0
8
8
2
2
42
42
42
42
23
23
10
10
Shared
Shared
42
42
13
13
8
8
2
2
42
42
42
42
23
23
10
10
Thread IDs
Thread...
Data snapshot
Data sn...
Thread IDs
Thread...
Data snapshot
Data sn...
Thread IDs
Thread...
Data snapshot
Data sn...
Data snapshot
Data sn...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/projects/hip/docs/data/tutorial/reduction/parallel_foldl.drawio b/projects/hip/docs/data/tutorial/reduction/parallel_foldl.drawio new file mode 100644 index 0000000000..6b04c73cc2 --- /dev/null +++ b/projects/hip/docs/data/tutorial/reduction/parallel_foldl.drawio @@ -0,0 +1,142 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/projects/hip/docs/data/tutorial/reduction/parallel_foldl.svg b/projects/hip/docs/data/tutorial/reduction/parallel_foldl.svg new file mode 100644 index 0000000000..d5edb0accb --- /dev/null +++ b/projects/hip/docs/data/tutorial/reduction/parallel_foldl.svg @@ -0,0 +1 @@ +Parallel Reduce
Input
Input
8
8
13
13
5
5
z
z
5
5
f(z,5)
f(z,5)
13
13
8
8
f(13,8)
f(13,8)
5
5
13
13
f(f(z,5),f(13,8))
f(f(z,5...
Result
Result
13
13
Text is not SVG - cannot display
\ No newline at end of file diff --git a/projects/hip/docs/data/tutorial/reduction/reduced_divergence_reduction.drawio b/projects/hip/docs/data/tutorial/reduction/reduced_divergence_reduction.drawio new file mode 100644 index 0000000000..0f1bd277ad --- /dev/null +++ b/projects/hip/docs/data/tutorial/reduction/reduced_divergence_reduction.drawio @@ -0,0 +1,442 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/projects/hip/docs/data/tutorial/reduction/reduced_divergence_reduction.svg b/projects/hip/docs/data/tutorial/reduction/reduced_divergence_reduction.svg new file mode 100644 index 0000000000..9661e05115 --- /dev/null +++ b/projects/hip/docs/data/tutorial/reduction/reduced_divergence_reduction.svg @@ -0,0 +1 @@ +Reduced Divergence Reduction
Shared
Shared
5
5
13
13
0
0
8
8
2
2
1
1
7
7
42
42
2
2
23
23
10
10
3
3
Shared
Shared
13
13
13
13
0
0
8
8
2
2
42
42
42
42
1
1
23
23
10
10
Shared
Shared
13
13
13
13
0
0
8
8
2
2
42
42
42
42
23
23
10
10
Shared
Shared
42
42
13
13
8
8
2
2
42
42
42
42
23
23
10
10
Thread IDs
Thread...
Data snapshot
Data sn...
Thread IDs
Thread...
Data snapshot
Data sn...
Thread IDs
Thread...
Data snapshot
Data sn...
Data snapshot
Data sn...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/projects/hip/docs/data/tutorial/reduction/warp_reduction.drawio b/projects/hip/docs/data/tutorial/reduction/warp_reduction.drawio new file mode 100644 index 0000000000..583f90cdd2 --- /dev/null +++ b/projects/hip/docs/data/tutorial/reduction/warp_reduction.drawio @@ -0,0 +1,421 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/projects/hip/docs/data/tutorial/reduction/warp_reduction.svg b/projects/hip/docs/data/tutorial/reduction/warp_reduction.svg new file mode 100644 index 0000000000..ec8d0a829b --- /dev/null +++ b/projects/hip/docs/data/tutorial/reduction/warp_reduction.svg @@ -0,0 +1,2 @@ +Warp reduction
Local
Local
5
5
13
13
0
0
8
8
2
2
1
1
7
7
42
42
2
2
23
23
10
10
3
3
Local
Local
7
7
42
42
23
23
10
10
7
7
42
42
23
23
10
10
0
0
1
1
Local
Local
23
23
42
42
23
23
10
10
7
7
42
42
23
23
10
10
0
0
Local
Local +
42
42
42
42
23
23
10
10
7
7
42
42
23
23
10
10
Thread IDs
Thread...
Data snapshot
Data sn...
Thread IDs
Thread...
Data snapshot
Data sn...
Thread IDs
Thread...
Data snapshot
Data sn...
Data snapshot
Data sn...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/projects/hip/docs/data/tutorial/reduction/warp_reduction_with_shared.drawio b/projects/hip/docs/data/tutorial/reduction/warp_reduction_with_shared.drawio new file mode 100644 index 0000000000..338407f45e --- /dev/null +++ b/projects/hip/docs/data/tutorial/reduction/warp_reduction_with_shared.drawio @@ -0,0 +1,707 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/projects/hip/docs/data/tutorial/reduction/warp_reduction_with_shared.svg b/projects/hip/docs/data/tutorial/reduction/warp_reduction_with_shared.svg new file mode 100644 index 
0000000000..65b6d642b8 --- /dev/null +++ b/projects/hip/docs/data/tutorial/reduction/warp_reduction_with_shared.svg @@ -0,0 +1,3 @@ +Warp reduction
Local
Local
5
5
13
13
8
8
2
2
7
7
42
42
23
23
10
10
7
7
42
42
23
23
10
10
7
7
42
42
23
23
10
10
Local
Local
23
23
42
42
23
23
10
10
7
7
42
42
23
23
10
10
Local
Local +
42
42
42
42
23
23
10
10
7
7
42
42
23
23
10
10
Warp reduction
Local
Local
3
3
2
2
4
4
1
1
7
7
11
11
8
8
14
14
10
10
13
13
12
12
15
15
7
7
11
11
8
8
14
14
Local
Local
22
22
28
28
12
12
15
15
7
7
11
11
8
8
14
14
Local
Local +
50
50
28
28
12
12
15
15
7
7
11
11
8
8
14
14
Local
Local
92
92
50
50
Local
Local
42
42
50
50
Shared
Shared
42
42
50
50
Warp reduction with shared memory
Local
Local
Local
Local
Text is not SVG - cannot display
\ No newline at end of file diff --git a/projects/hip/docs/data/understand/programming_model_reference/memory_hierarchy.drawio b/projects/hip/docs/data/understand/programming_model/memory_hierarchy.drawio similarity index 100% rename from projects/hip/docs/data/understand/programming_model_reference/memory_hierarchy.drawio rename to projects/hip/docs/data/understand/programming_model/memory_hierarchy.drawio diff --git a/projects/hip/docs/data/understand/programming_model_reference/memory_hierarchy.svg b/projects/hip/docs/data/understand/programming_model/memory_hierarchy.svg similarity index 100% rename from projects/hip/docs/data/understand/programming_model_reference/memory_hierarchy.svg rename to projects/hip/docs/data/understand/programming_model/memory_hierarchy.svg diff --git a/projects/hip/docs/data/understand/programming_model_reference/thread_hierarchy.drawio b/projects/hip/docs/data/understand/programming_model/thread_hierarchy.drawio similarity index 100% rename from projects/hip/docs/data/understand/programming_model_reference/thread_hierarchy.drawio rename to projects/hip/docs/data/understand/programming_model/thread_hierarchy.drawio diff --git a/projects/hip/docs/data/understand/programming_model_reference/thread_hierarchy.svg b/projects/hip/docs/data/understand/programming_model/thread_hierarchy.svg similarity index 100% rename from projects/hip/docs/data/understand/programming_model_reference/thread_hierarchy.svg rename to projects/hip/docs/data/understand/programming_model/thread_hierarchy.svg diff --git a/projects/hip/docs/data/understand/programming_model_reference/thread_hierarchy_coop.svg b/projects/hip/docs/data/understand/programming_model_reference/thread_hierarchy_coop.svg deleted file mode 100644 index a3f57994fb..0000000000 --- a/projects/hip/docs/data/understand/programming_model_reference/thread_hierarchy_coop.svg +++ /dev/null @@ -1 +0,0 @@ -Grid
Block
Block
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Block
Block
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Cluster
Cluster
Warp
Warp
Warp
Warp
Warp
Warp
Block
Block
Warp
Warp
Warp
Warp
Block
Block
Warp
Warp
Block
Block
Warp
Warp
Warp
Warp
Block
Block
Block
Block
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Block
Block
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Cluster
Cluster
Warp
Warp
Warp
Warp
Warp
Warp
Block
Block
Warp
Warp
Warp
Warp
Block
Block
Warp
Warp
Block
Block
Warp
Warp
Warp
Warp
Block
Block
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
Warp
GridMulti Grid
Cluster
Cluster
Warp
Warp
Block
Block
Warp
Warp
Warp
Warp
Block
Block
Block
Block
Block
Block
Text is not SVG - cannot display
\ No newline at end of file diff --git a/projects/hip/docs/data/understand/textures/border.png b/projects/hip/docs/data/understand/textures/border.png new file mode 100644 index 0000000000..e616610c15 Binary files /dev/null and b/projects/hip/docs/data/understand/textures/border.png differ diff --git a/projects/hip/docs/data/understand/textures/clamp.png b/projects/hip/docs/data/understand/textures/clamp.png new file mode 100644 index 0000000000..63ed5f116f Binary files /dev/null and b/projects/hip/docs/data/understand/textures/clamp.png differ diff --git a/projects/hip/docs/data/understand/textures/linear.png b/projects/hip/docs/data/understand/textures/linear.png new file mode 100644 index 0000000000..2c85c0b11a Binary files /dev/null and b/projects/hip/docs/data/understand/textures/linear.png differ diff --git a/projects/hip/docs/data/understand/textures/mirror.png b/projects/hip/docs/data/understand/textures/mirror.png new file mode 100644 index 0000000000..d26a241183 Binary files /dev/null and b/projects/hip/docs/data/understand/textures/mirror.png differ diff --git a/projects/hip/docs/data/understand/textures/nearest.png b/projects/hip/docs/data/understand/textures/nearest.png new file mode 100644 index 0000000000..edfbf8cfbe Binary files /dev/null and b/projects/hip/docs/data/understand/textures/nearest.png differ diff --git a/projects/hip/docs/data/understand/textures/original.png b/projects/hip/docs/data/understand/textures/original.png new file mode 100644 index 0000000000..eaf6e7f7be Binary files /dev/null and b/projects/hip/docs/data/understand/textures/original.png differ diff --git a/projects/hip/docs/data/understand/textures/wrap.png b/projects/hip/docs/data/understand/textures/wrap.png new file mode 100644 index 0000000000..68e80befe8 Binary files /dev/null and b/projects/hip/docs/data/understand/textures/wrap.png differ diff --git a/projects/hip/docs/data/unified_memory/um.drawio b/projects/hip/docs/data/unified_memory/um.drawio new file mode 
100644 index 0000000000..fac74f4b60 --- /dev/null +++ b/projects/hip/docs/data/unified_memory/um.drawio @@ -0,0 +1,1878 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/projects/hip/docs/data/unified_memory/um.svg b/projects/hip/docs/data/unified_memory/um.svg new file mode 
100644 index 0000000000..748949b271 --- /dev/null +++ b/projects/hip/docs/data/unified_memory/um.svg @@ -0,0 +1,4 @@ + + + +Explicit Memory Management
CPU cores
CPU cores
CPU
CPU
GPU Memory
(HBM)
GPU Memory...
Unified Memory Management
Unified Memory
(HBM)
Unified Memory...
GPU cores
GPU cores
CPU Memory (DRAM)
CPU Memory (DR...
GPU cores
GPU cores
CPU cores
CPU cores
GPU
GPU
APU
APU
Text is not SVG - cannot display
\ No newline at end of file diff --git a/projects/hip/docs/doxygen/Doxyfile b/projects/hip/docs/doxygen/Doxyfile index d4bb54bd5d..6570128d00 100644 --- a/projects/hip/docs/doxygen/Doxyfile +++ b/projects/hip/docs/doxygen/Doxyfile @@ -829,10 +829,10 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = mainpage.md \ - ../../include/hip \ +INPUT = ../../include/hip \ ../../../clr/hipamd/include/hip/amd_detail/amd_hip_gl_interop.h \ - ../../../llvm-project/clang/lib/Headers/__clang_hip_math.h \ + ../../../clr/hipamd/include/hip/amd_detail/amd_surface_functions.h \ + ../../../clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h \ ../../../ROCR-Runtime/src/inc/hsa_ext_amd.h # This tag can be used to specify the character encoding of the source files @@ -2195,8 +2195,21 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = __HIP_PLATFORM_AMD__ \ - __dparm(x)= +PREDEFINED = "__HIP_PLATFORM_AMD__" \ + "DOXYGEN_SHOULD_INCLUDE_THIS=1" \ + "DOXYGEN_SHOULD_SKIP_THIS=1" \ + "__dparm(x)=" \ + "__cplusplus=201103L" \ + "__host__=" \ + "__device__=" \ + "__hip_img_chk__=" \ + "__CG_QUALIFIER__=" \ + "__CG_STATIC_QUALIFIER__=static" \ + "_CG_STATIC_CONST_DECL_=static constexpr" \ + "HIP_PUBLIC_API" \ + "HIP_ENABLE_WARP_SYNC_BUILTINS" \ + "__HOST_DEVICE__" \ + "__forceinline__" # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/projects/hip/docs/doxygen/mainpage.md b/projects/hip/docs/doxygen/mainpage.md deleted file mode 100644 index 66fb00250a..0000000000 --- a/projects/hip/docs/doxygen/mainpage.md +++ /dev/null @@ -1,34 +0,0 @@ -# HIP Runtime API Reference {#mainpage} - -This is the full HIP Runtime API reference. 
The API is organized into -[modules](modules.html) based on functionality. - -## List of Modules - -- @ref GlobalDefs -- @ref Driver -- @ref Device -- @ref Execution -- @ref Error -- @ref Stream -- @ref StreamM -- @ref Event -- @ref Memory - - @ref External - - @ref MemoryM - - @ref StreamO - - @ref MemoryD -- @ref PeerToPeer -- @ref Context -- @ref Module -- @ref Occupancy -- @ref Profiler -- @ref Clang -- @ref Texture - - @ref TextureD -- @ref Runtime -- @ref Callback -- @ref Graph -- @ref Virtual -- @ref GL -- @ref Surface diff --git a/projects/hip/docs/environment.yml b/projects/hip/docs/environment.yml new file mode 100644 index 0000000000..a3dbd898d7 --- /dev/null +++ b/projects/hip/docs/environment.yml @@ -0,0 +1,10 @@ +name: RTD +channels: + - conda-forge + - defaults +dependencies: + - python=3.10 + - pip + - doxygen=1.9.8 + - pip: + - -r ./sphinx/requirements.txt diff --git a/projects/hip/docs/how-to/cooperative_groups.rst b/projects/hip/docs/how-to/cooperative_groups.rst new file mode 100644 index 0000000000..370d6dc729 --- /dev/null +++ b/projects/hip/docs/how-to/cooperative_groups.rst @@ -0,0 +1,490 @@ +.. meta:: + :description: This topic describes how to use cooperative groups in HIP + :keywords: AMD, ROCm, HIP, cooperative groups + +.. _cooperative_groups_how-to: + +******************************************************************************* +Cooperative groups +******************************************************************************* + +The cooperative groups API is an extension to the HIP programming model, which provides developers with a flexible, dynamic grouping mechanism for communicating threads. Cooperative groups let you define your own set of thread groups, which may fit your use cases better than those defined by the hardware. This lets you specify the level of granularity for thread communication, which can lead to more efficient parallel decompositions. 
+ +The API is accessible in the ``cooperative_groups`` namespace after the ``hip_cooperative_groups.h`` header is included. The header contains the following elements: + +* Static functions to create groups and subgroups. +* Hardware-accelerated operations over the whole group, like shuffles. +* Data types of cooperative groups. +* Member functions to synchronize the groups. +* Member functions to query group properties. + +Cooperative groups thread model +=============================== + +The thread hierarchy abstraction of cooperative groups is shown in the :ref:`grid hierarchy <coop_thread_top_hierarchy>` and :ref:`block hierarchy <coop_thread_bottom_hierarchy>` figures. + +.. _coop_thread_top_hierarchy: + +.. figure:: ../data/how-to/cooperative_groups/thread_hierarchy_coop_top.svg + :alt: Diagram depicting nested rectangles of varying color. The outermost one + titled "Grid", inside sets of different sized rectangles layered on + one another titled "Block". Each "Block" containing sets of uniform + rectangles layered on one another titled "Warp". Each of the "Warp" + titled rectangles filled with downward pointing arrows inside. + + Cooperative group thread hierarchy in grids. + +The **multi grid** is an abstraction of potentially multiple simultaneous launches of the same kernel over multiple devices (Deprecated since 5.0). The **grid** in cooperative groups is a single dispatch of kernels for execution like the original grid. + +.. note:: + + The ability to synchronize over a grid or multi grid requires the kernel to be launched using the specific cooperative groups API. + +The **block** is the same as the :ref:`inherent_thread_model` block entity. + +.. note:: + + Explicit warp-level thread handling is absent from the Cooperative groups API. In order to exploit the known hardware SIMD width on which built-in functionality translates to simpler logic, you can use the group partitioning part of the API, such as ``tiled_partition``. + +.. _coop_thread_bottom_hierarchy: + +.. 
figure:: ../data/how-to/cooperative_groups/thread_hierarchy_coop_bottom.svg + :alt: The new level between block thread and threads. + + Cooperative group thread hierarchy in blocks. + +The cooperative groups API introduces a new level between the thread block and individual threads. The :ref:`thread-block tile <coop_thread_block_tile>` gives the opportunity to have tiles in the thread block, while the :ref:`coalesced group <coop_coalesced_groups>` holds the active threads of the parent group. These groups are further discussed in the :ref:`group types <coop_group_types>` section. + +For details on memory model, check the :ref:`memory model description `. + +.. _coop_group_types: + +Group types +=========== + +Group types are based on the levels of synchronization and data sharing among threads. + +Thread-block group +------------------ + +Represents an intra-block cooperative groups type where the participating threads within the group are the same threads that participated in the currently executing ``block``. + +.. code-block:: cpp + + class thread_block; + +Constructed via: + +.. code-block:: cpp + + thread_block g = this_thread_block(); + +The ``group_index()`` , ``thread_index()`` , ``thread_rank()`` , ``size()``, ``cg_type()``, ``is_valid()`` , ``sync()`` and ``group_dim()`` member functions are public members of the ``thread_block`` class. For further details, check the :ref:`thread_block references ` . + +Grid group +------------ + +Represents an inter-block cooperative groups type where the group's participating threads span multiple blocks running the same kernel on the same device. Use the cooperative launch API to enable synchronization across the grid group. + +.. code-block:: cpp + + class grid_group; + +Constructed via: + +.. code-block:: cpp + + grid_group g = this_grid(); + +The ``thread_rank()`` , ``size()``, ``cg_type()``, ``is_valid()`` and ``sync()`` member functions +are public members of the ``grid_group`` class. For further details, check the :ref:`grid_group references `. 
+ +Multi-grid group +------------------ + +Represents an inter-device cooperative groups type where the participating threads within the group span multiple devices that run the same kernel. Use the cooperative launch API to enable synchronization across the multi-grid group. + +.. code-block:: cpp + + class multi_grid_group; + +Constructed via: + +.. code-block:: cpp + + // Kernel must be launched with the cooperative multi-device API + multi_grid_group g = this_multi_grid(); + +The ``num_grids()`` , ``grid_rank()`` , ``thread_rank()``, ``size()``, ``cg_type()``, ``is_valid()`` , +and ``sync()`` member functions are public members of the ``multi_grid_group`` class. For +further details, check the :ref:`multi_grid_group references ` . + +.. _coop_thread_block_tile: + +Thread-block tile +------------------ + +This constructs a templated class derived from ``thread_group``. The template defines the tile +size of the new thread group at compile time. This group type also supports sub-wave level intrinsics. + +.. code-block:: cpp + + template + class thread_block_tile; + +Constructed via: + +.. code-block:: cpp + + template + _CG_QUALIFIER thread_block_tile tiled_partition(const ParentT& g) + + +.. note:: + + * Size must be a power of 2 and not larger than warp (wavefront) size. + * ``shfl()`` functions support integer or float type. + +The ``thread_rank()`` , ``size()``, ``cg_type()``, ``is_valid()``, ``sync()``, ``meta_group_rank()``, ``meta_group_size()``, ``shfl()``, ``shfl_down()``, ``shfl_up()``, ``shfl_xor()``, ``ballot()``, ``any()``, ``all()``, ``match_any()`` and ``match_all()`` member functions are public members of the ``thread_block_tile`` class. For further details, check the :ref:`thread_block_tile references ` . + +.. 
_coop_coalesced_groups: + +Coalesced groups +------------------ + +Threads (64 threads on CDNA and 32 threads on RDNA) in a warp cannot execute different instructions simultaneously, so conditional branches are executed serially within the warp. When threads encounter a conditional branch, they can diverge, resulting in some threads being disabled if they do not meet the condition to execute that branch. The active threads are referred to as coalesced, and a coalesced group represents an active thread group within a warp. + +.. note:: + + The NVIDIA GPU's independent thread scheduling presents the appearance that threads on different branches execute concurrently. + +.. warning:: + + AMD GPUs do not support independent thread scheduling. Some CUDA applications rely on this feature, and their ported HIP versions can deadlock on AMD GPUs when they try to make use of independent thread scheduling. + +This group type also supports sub-wave level intrinsics. + +.. code-block:: cpp + + class coalesced_group; + +Constructed via: + +.. code-block:: cpp + + coalesced_group active = coalesced_threads(); + +.. note:: + + ``shfl()`` functions support integer or float type. + +The ``thread_rank()`` , ``size()``, ``cg_type()``, ``is_valid()``, ``sync()``, ``meta_group_rank()``, ``meta_group_size()``, ``shfl()``, ``shfl_down()``, ``shfl_up()``, ``ballot()``, ``any()``, ``all()``, ``match_any()`` and ``match_all()`` member functions are public members of the ``coalesced_group`` class. For more information, see :ref:`coalesced_group references ` . + +Cooperative groups simple example +================================= + +The ``reduce_sum`` device function differs from the original block model as follows. + +.. tab-set:: + .. tab-item:: Original Block + :sync: original-block + + .. 
code-block:: cuda + + __device__ int reduce_sum(int *shared, int val) { + + // Thread ID + const unsigned int thread_id = threadIdx.x; + + // Every iteration the number of active threads + // halves, until we processed all values + for(unsigned int i = blockDim.x / 2; i > 0; i /= 2) { + // Store value in shared memory with thread ID + shared[thread_id] = val; + + // Synchronize all threads + __syncthreads(); + + // Active thread sum up + if(thread_id < i) + val += shared[thread_id + i]; + + // Synchronize all threads in the group + __syncthreads(); + } + + // ... + } + + .. tab-item:: Cooperative groups + :sync: cooperative-groups + + .. code-block:: cuda + + __device__ int reduce_sum(thread_group g, + int *shared, + int val) { + + // Thread ID + const unsigned int group_thread_id = g.thread_rank(); + + // Every iteration the number of active threads + // halves, until we processed all values + for(unsigned int i = g.size() / 2; i > 0; i /= 2) { + // Store value in shared memory with thread ID + shared[group_thread_id] = val; + + // Synchronize all threads in the group + g.sync(); + + // Active thread sum up + if(group_thread_id < i) + val += shared[group_thread_id + i]; + + // Synchronize all threads in the group + g.sync(); + } + + // ... + } + +The ``reduce_sum()`` function call and input data initialization differ from the original block model as follows. + +.. tab-set:: + .. tab-item:: Original Block + :sync: original-block + + .. code-block:: cuda + + __global__ void sum_kernel(...) { + + // ... + + // Workspace array in shared memory + __shared__ unsigned int workspace[2048]; + + // ... + + // Perform reduction + output = reduce_sum(workspace, input); + + // ... + } + + .. tab-item:: Cooperative groups + :sync: cooperative-groups + + .. code-block:: cuda + + __global__ void sum_kernel(...) { + + // ... + + // Workspace array in shared memory + __shared__ unsigned int workspace[2048]; + + // ... 
+ + // Initialize the thread_block + thread_block thread_block_group = this_thread_block(); + // Perform reduction + output = reduce_sum(thread_block_group, workspace, input); + + // ... + } + +In the device function, the input group type is ``thread_group``, which is the parent class of all cooperative group types. With this, you can write generic functions that work with any type of cooperative group. + +.. _coop_synchronization: + +Synchronization +=============== + +With each group type, the synchronization requires using the correct cooperative groups launch API. + +**Check the kernel launch capability** + +.. tab-set:: + .. tab-item:: Thread-block + :sync: thread-block + + Thread-block groups do not need kernel launch validation. + + .. tab-item:: Grid + :sync: grid + + Confirm the cooperative launch capability on a single AMD GPU: + + .. code-block:: cpp + + int device = 0; + int supports_coop_launch = 0; + // Check support + // Use hipDeviceAttributeCooperativeMultiDeviceLaunch when launching across multiple devices + HIP_CHECK(hipGetDevice(&device)); + HIP_CHECK( + hipDeviceGetAttribute(&supports_coop_launch, hipDeviceAttributeCooperativeLaunch, device)); + if(!supports_coop_launch) + { + std::cout << "Skipping, device " << device << " does not support cooperative groups" + << std::endl; + return 0; + } + + .. tab-item:: Multi-grid + :sync: multi-grid + + Confirm the cooperative launch capability over multiple GPUs: + + .. 
code-block:: cpp + + // Check support of cooperative groups + std::vector deviceIDs; + for(int deviceID = 0; deviceID < device_count; deviceID++) { + #ifdef __HIP_PLATFORM_AMD__ + int supports_coop_launch = 0; + HIP_CHECK( + hipDeviceGetAttribute( + &supports_coop_launch, + hipDeviceAttributeCooperativeMultiDeviceLaunch, + deviceID)); + if(!supports_coop_launch) { + std::cout << "Skipping, device " << deviceID << " does not support cooperative groups" + << std::endl; + } + else + #endif + { + std::cout << deviceID << std::endl; + // Collect valid deviceIDs. + deviceIDs.push_back(deviceID); + } + } + +**Kernel launch** + +.. tab-set:: + .. tab-item:: Thread-block + :sync: thread-block + + You can access the new block representation using the original kernel launch methods. + + .. code-block:: cpp + + void* params[] = {&d_vector, &d_block_reduced, &d_partition_reduced}; + // Launching kernel from host. + HIP_CHECK(hipLaunchKernelGGL(vector_reduce_kernel, + dim3(num_blocks), + dim3(threads_per_block), + 0, + hipStreamDefault, + &d_vector, + &d_block_reduced, + &d_partition_reduced)); + + .. tab-item:: Grid + :sync: grid + + Launch the cooperative kernel on a single GPU: + + .. code-block:: cpp + + void* params[] = {}; + // Launching kernel from host. + HIP_CHECK(hipLaunchCooperativeKernel(vector_reduce_kernel, + dim3(num_blocks), + dim3(threads_per_block), + 0, + 0, + hipStreamDefault)); + + .. tab-item:: Multi-grid + :sync: multi-grid + + Launch the cooperative kernel over multiple GPUs: + + .. 
code-block:: cpp + + hipLaunchParams *launchParamsList = (hipLaunchParams*)malloc(sizeof(hipLaunchParams) * deviceIDs.size()); + for(int deviceID : deviceIDs) { + + // Set device + HIP_CHECK(hipSetDevice(deviceID)); + + // Create stream + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + + // Parameters + void* params[] = {&(d_vector[deviceID]), &(d_block_reduced[deviceID]), &(d_partition_reduced[deviceID])}; + + // Set launchParams + launchParamsList[deviceID].func = (void*)vector_reduce_kernel; + launchParamsList[deviceID].gridDim = dim3(1); + launchParamsList[deviceID].blockDim = dim3(threads_per_block); + launchParamsList[deviceID].sharedMem = 0; + launchParamsList[deviceID].stream = stream; + launchParamsList[deviceID].args = params; + } + + HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, + (int)deviceIDs.size(), + hipCooperativeLaunchMultiDeviceNoPreSync)); + +**Device side synchronization** + +.. tab-set:: + .. tab-item:: Thread-block + :sync: thread-block + + The device side code of the thread_block synchronization over single GPUs: + + .. code-block:: cpp + + thread_block g = this_thread_block(); + g.sync(); + + .. tab-item:: Grid + :sync: grid + + The device side code of the grid synchronization over single GPUs: + + .. code-block:: cpp + + grid_group grid = this_grid(); + grid.sync(); + + .. tab-item:: Multi-grid + :sync: multi-grid + + The device side code of the multi-grid synchronization over multiple GPUs: + + .. 
code-block:: cpp + + multi_grid_group multi_grid = this_multi_grid(); + multi_grid.sync(); + +Unsupported NVIDIA CUDA features +================================ + +HIP doesn't support the following NVIDIA CUDA optional headers: + +* ``cooperative_groups/memcpy_async.h`` +* ``cooperative_groups/reduce.h`` +* ``cooperative_groups/scan.h`` + +HIP doesn't support the following CUDA class in ``cooperative_groups`` namespace: + +* ``cluster_group`` + +HIP doesn't support the following CUDA functions/operators in ``cooperative_groups`` namespace: + +* ``synchronize`` +* ``memcpy_async`` +* ``wait`` and ``wait_prior`` +* ``barrier_arrive`` and ``barrier_wait`` +* ``invoke_one`` and ``invoke_one_broadcast`` +* ``reduce`` +* ``reduce_update_async`` and ``reduce_store_async`` +* Reduce operators ``plus`` , ``less`` , ``greater`` , ``bit_and`` , ``bit_xor`` and ``bit_or`` +* ``inclusive_scan`` and ``exclusive_scan`` diff --git a/projects/hip/docs/how-to/faq.md b/projects/hip/docs/how-to/faq.md index d87357b617..805e5a5ac0 100644 --- a/projects/hip/docs/how-to/faq.md +++ b/projects/hip/docs/how-to/faq.md @@ -38,7 +38,7 @@ See the [API Support Table](https://github.com/ROCm/HIPIFY/blob/amd-staging/docs * Virtual functions, indirect functions and try/catch (CUDA 4.0) * `__prof_trigger` * PTX assembly (CUDA 4.0). HIP-Clang supports inline GCN assembly. -* Several kernel features are under development. See the {doc}`/reference/kernel_language` for more information. +* Several kernel features are under development. See the {doc}`/reference/cpp_language_extensions` for more information. ## Is HIP a drop-in replacement for CUDA? 
diff --git a/projects/hip/docs/how-to/hip_porting_driver_api.md b/projects/hip/docs/how-to/hip_porting_driver_api.md deleted file mode 100644 index d42353b631..0000000000 --- a/projects/hip/docs/how-to/hip_porting_driver_api.md +++ /dev/null @@ -1,304 +0,0 @@ -# Porting CUDA Driver API - -## Introduction to the CUDA Driver and Runtime APIs - -CUDA provides a separate CUDA Driver and Runtime APIs. The two APIs have significant overlap in functionality: - -* Both APIs support events, streams, memory management, memory copy, and error handling. -* Both APIs deliver similar performance. -* Driver APIs calls begin with the prefix `cu` while Runtime APIs begin with the prefix `cuda`. For example, the Driver API API contains `cuEventCreate` while the Runtime API contains `cudaEventCreate`, with similar functionality. -* The Driver API defines a different but largely overlapping error code space than the Runtime API, and uses a different coding convention. For example, Driver API defines `CUDA_ERROR_INVALID_VALUE` while the Runtime API defines `cudaErrorInvalidValue` - -The Driver API offers two additional pieces of functionality not provided by the Runtime API: `cuModule` and `cuCtx` APIs. - -### `cuModule` API - -The Module section of the Driver API provides additional control over how and when accelerator code objects are loaded. -For example, the driver API allows code objects to be loaded from files or memory pointers. -Symbols for kernels or global data can be extracted from the loaded code objects. -In contrast, the Runtime API automatically loads and (if necessary) compiles all of the kernels from an executable binary when run. -In this mode, NVCC must be used to compile kernel code so the automatic loading can function correctly. - -Both Driver and Runtime APIs define a function for launching kernels (called `cuLaunchKernel` or `cudaLaunchKernel`. 
-The kernel arguments and the execution configuration (grid dimensions, group dimensions, dynamic shared memory, and stream) are passed as arguments to the launch function. -The Runtime additionally provides the `<<< >>>` syntax for launching kernels, which resembles a special function call and is easier to use than explicit launch API (in particular with respect to handling of kernel arguments). -However, this syntax is not standard C++ and is available only when NVCC is used to compile the host code. - -The Module features are useful in an environment which generates the code objects directly, such as a new accelerator language front-end. -Here, NVCC is not used. Instead, the environment may have a different kernel language or different compilation flow. -Other environments have many kernels and do not want them to be all loaded automatically. -The Module functions can be used to load the generated code objects and launch kernels. -As we will see below, HIP defines a Module API which provides similar explicit control over code object management. - -### `cuCtx` API - -The Driver API defines "Context" and "Devices" as separate entities. -Contexts contain a single device, and a device can theoretically have multiple contexts. -Each context contains a set of streams and events specific to the context. -Historically contexts also defined a unique address space for the GPU, though this may no longer be the case in Unified Memory platforms (since the CPU and all the devices in the same process share a single unified address space). -The Context APIs also provide a mechanism to switch between devices, which allowed a single CPU thread to send commands to different GPUs. -HIP as well as a recent versions of CUDA Runtime provide other mechanisms to accomplish this feat - for example using streams or `cudaSetDevice`. - -The CUDA Runtime API unifies the Context API with the Device API. 
This simplifies the APIs and has little loss of functionality since each Context can contain a single device, and the benefits of multiple contexts has been replaced with other interfaces. -HIP provides a context API to facilitate easy porting from existing Driver codes. -In HIP, the `Ctx` functions largely provide an alternate syntax for changing the active device. - -Most new applications will prefer to use `hipSetDevice` or the stream APIs , therefore HIP has marked `hipCtx` APIs as **deprecated**. Support for these APIs may not be available in future releases. For more details on deprecated APIs please refer [HIP deprecated APIs](https://github.com/ROCm/HIP/blob/develop/docs/reference/deprecated_api_list.md). - -## HIP Module and `Ctx` APIs - -Rather than present two separate APIs, HIP extends the HIP API with new APIs for Modules and `Ctx` control. - -### `hipModule` API - -Like the CUDA Driver API, the Module API provides additional control over how code is loaded, including options to load code from files or from in-memory pointers. -NVCC and HIP-Clang target different architectures and use different code object formats: NVCC is `cubin` or `ptx` files, while the HIP-Clang path is the `hsaco` format. -The external compilers which generate these code objects are responsible for generating and loading the correct code object for each platform. -Notably, there is not a fat binary format that can contain code for both NVCC and HIP-Clang platforms. The following table summarizes the formats used on each platform: - -| Format | APIs | NVCC | HIP-CLANG | -| --- | --- | --- | --- | -| Code Object | `hipModuleLoad`, `hipModuleLoadData` | `.cubin` or PTX text | `.hsaco` | -| Fat Binary | `hipModuleLoadFatBin` | `.fatbin` | `.hip_fatbin` | - -`hipcc` uses HIP-Clang or NVCC to compile host codes. Both of these may embed code objects into the final executable, and these code objects will be automatically loaded when the application starts. 
-The `hipModule` API can be used to load additional code objects, and in this way provides an extended capability to the automatically loaded code objects. -HIP-Clang allows both of these capabilities to be used together, if desired. Of course it is possible to create a program with no kernels and thus no automatic loading. - -### `hipCtx` API - -HIP provides a `Ctx` API as a thin layer over the existing Device functions. This `Ctx` API can be used to set the current context, or to query properties of the device associated with the context. -The current context is implicitly used by other APIs such as `hipStreamCreate`. - -### hipify translation of CUDA Driver API - -The HIPIFY tools convert CUDA Driver APIs for streams, events, modules, devices, memory management, context, profiler to the equivalent HIP driver calls. For example, `cuEventCreate` will be translated to `hipEventCreate`. -HIPIFY tools also convert error codes from the Driver namespace and coding convention to the equivalent HIP error code. Thus, HIP unifies the APIs for these common functions. - -The memory copy API requires additional explanation. The CUDA driver includes the memory direction in the name of the API (`cuMemcpyH2D`) while the CUDA driver API provides a single memory copy API with a parameter that specifies the direction and additionally supports a "default" direction where the runtime determines the direction automatically. -HIP provides APIs with both styles: for example, `hipMemcpyH2D` as well as `hipMemcpy`. -The first flavor may be faster in some cases since they avoid host overhead to detect the different memory directions. - -HIP defines a single error space, and uses camel-case for all errors (i.e. `hipErrorInvalidValue`). - -#### Address Spaces - -HIP-Clang defines a process-wide address space where the CPU and all devices allocate addresses from a single unified pool. 
-Thus addresses may be shared between contexts, and unlike the original CUDA definition a new context does not create a new address space for the device. - -#### Using `hipModuleLaunchKernel` - -`hipModuleLaunchKernel` is `cuLaunchKernel` in HIP world. It takes the same arguments as `cuLaunchKernel`. - -#### Additional Information - -* HIP-Clang creates a primary context when the HIP API is called. So in a pure driver API code, HIP-Clang will create a primary context while HIP/NVCC will have empty context stack. -HIP-Clang will push primary context to context stack when it is empty. This can have subtle differences on applications which mix the runtime and driver APIs. - -### `hip-clang` Implementation Notes - -#### `.hip_fatbin` - -hip-clang links device code from different translation units together. For each device target, a code object is generated. Code objects for different device targets are bundled by `clang-offload-bundler` as one fatbinary, which is embeded as a global symbol `__hip_fatbin` in the `.hip_fatbin` section of the ELF file of the executable or shared object. - -#### Initialization and Termination Functions - -hip-clang generates initialization and termination functions for each translation unit for host code compilation. The initialization functions call `__hipRegisterFatBinary` to register the fatbinary embeded in the ELF file. They also call `__hipRegisterFunction` and `__hipRegisterVar` to register kernel functions and device side global variables. The termination functions call `__hipUnregisterFatBinary`. -hip-clang emits a global variable `__hip_gpubin_handle` of void** type with linkonce linkage and inital value 0 for each host translation unit. Each initialization function checks `__hip_gpubin_handle` and register the fatbinary only if `__hip_gpubin_handle` is 0 and saves the return value of `__hip_gpubin_handle` to `__hip_gpubin_handle`. This is to guarantee that the fatbinary is only registered once. 
Similar check is done in the termination functions. - -#### Kernel Launching - -hip-clang supports kernel launching by CUDA `<<<>>>` syntax, hipLaunchKernelGGL. The latter one is macro which expand to CUDA `<<<>>>` syntax. - -When the executable or shared library is loaded by the dynamic linker, the initialization functions are called. In the initialization functions, when `__hipRegisterFatBinary` is called, the code objects containing all kernels are loaded; when `__hipRegisterFunction` is called, the stub functions are associated with the corresponding kernels in code objects. - -hip-clang implements two sets of kernel launching APIs. - -By default, in the host code, for the `<<<>>>` statement, hip-clang first emits call of `hipConfigureCall` to set up the threads and grids, then emits call of the stub function with the given arguments. In the stub function, `hipSetupArgument` is called for each kernel argument, then `hipLaunchByPtr` is called with a function pointer to the stub function. In `hipLaunchByPtr`, the real kernel associated with the stub function is launched. - -### NVCC Implementation Notes - -#### Interoperation between HIP and CUDA Driver - -CUDA applications may want to mix CUDA driver code with HIP code (see example below). This table shows the type equivalence to enable this interaction. - -|**HIP Type** |**CU Driver Type**|**CUDA Runtime Type**| -| ---- | ---- | ---- | -| `hipModule_t` | `CUmodule` | | -| `hipFunction_t` | `CUfunction` | | -| `hipCtx_t` | `CUcontext` | | -| `hipDevice_t` | `CUdevice` | | -| `hipStream_t` | `CUstream` | `cudaStream_t` | -| `hipEvent_t` | `CUevent` | `cudaEvent_t` | -| `hipArray` | `CUarray` | `cudaArray` | - -#### Compilation Options - -The `hipModule_t` interface does not support `cuModuleLoadDataEx` function, which is used to control PTX compilation options. -HIP-Clang does not use PTX and does not support these compilation options. 
-In fact, HIP-Clang code objects always contain fully compiled ISA and do not require additional compilation as a part of the load step. -The corresponding HIP function `hipModuleLoadDataEx` behaves as `hipModuleLoadData` on HIP-Clang path (compilation options are not used) and as `cuModuleLoadDataEx` on NVCC path. -For example (CUDA): - -```cpp -CUmodule module; -void *imagePtr = ...; // Somehow populate data pointer with code object - -const int numOptions = 1; -CUJit_option options[numOptions]; -void * optionValues[numOptions]; - -options[0] = CU_JIT_MAX_REGISTERS; -unsigned maxRegs = 15; -optionValues[0] = (void*)(&maxRegs); - -cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues); - -CUfunction k; -cuModuleGetFunction(&k, module, "myKernel"); -``` - -HIP: - -```cpp -hipModule_t module; -void *imagePtr = ...; // Somehow populate data pointer with code object - -const int numOptions = 1; -hipJitOption options[numOptions]; -void * optionValues[numOptions]; - -options[0] = hipJitOptionMaxRegisters; -unsigned maxRegs = 15; -optionValues[0] = (void*)(&maxRegs); - -// hipModuleLoadData(module, imagePtr) will be called on HIP-Clang path, JIT options will not be used, and -// cupModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues) will be called on NVCC path -hipModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues); - -hipFunction_t k; -hipModuleGetFunction(&k, module, "myKernel"); -``` - -The below sample shows how to use `hipModuleGetFunction`. 
- -```cpp -#include -#include -#include -#include -#include - -#define LEN 64 -#define SIZE LEN<<2 - -#ifdef __HIP_PLATFORM_AMD__ -#define fileName "vcpy_isa.co" -#endif - -#ifdef __HIP_PLATFORM_NVIDIA__ -#define fileName "vcpy_isa.ptx" -#endif - -#define kernel_name "hello_world" - -int main(){ - float *A, *B; - hipDeviceptr_t Ad, Bd; - A = new float[LEN]; - B = new float[LEN]; - - for(uint32_t i=0;iargBuffer(2); - memcpy(&argBuffer[0], &Ad, sizeof(void*)); - memcpy(&argBuffer[1], &Bd, sizeof(void*)); - - size_t size = argBuffer.size()*sizeof(void*); - - void *config[] = { - HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0], - HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, - HIP_LAUNCH_PARAM_END - }; - - hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config); - - hipMemcpyDtoH(B, Bd, SIZE); - for(uint32_t i=0;i tex; - -__global__ void tex2dKernel(hipLaunchParm lp, float* outputData, - int width, - int height) -{ - int x = blockIdx.x*blockDim.x + threadIdx.x; - int y = blockIdx.y*blockDim.y + threadIdx.y; - outputData[y*width + x] = tex2D(tex, x, y); -} - -``` - -```cpp -// Host code: - -texture tex; - -void myFunc () -{ - // ... - - textureReference* texref; - hipModuleGetTexRef(&texref, Module1, "tex"); - hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap); - hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap); - hipTexRefSetFilterMode(texref, hipFilterModePoint); - hipTexRefSetFlags(texref, 0); - hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1); - hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT); - - // ... -} -``` diff --git a/projects/hip/docs/how-to/hip_porting_driver_api.rst b/projects/hip/docs/how-to/hip_porting_driver_api.rst new file mode 100644 index 0000000000..ae5bb7226f --- /dev/null +++ b/projects/hip/docs/how-to/hip_porting_driver_api.rst @@ -0,0 +1,537 @@ +.. meta:: + :description: This chapter presents how to port the CUDA driver API and showcases equivalent operations in HIP. 
+ :keywords: AMD, ROCm, HIP, CUDA, driver API + +.. _porting_driver_api: + +******************************************************************************* +Porting CUDA driver API +******************************************************************************* + +NVIDIA provides separate CUDA driver and runtime APIs. The two APIs have significant overlap in functionality: + +* Both APIs support events, streams, memory management, memory copy, and error handling. +* Both APIs deliver similar performance. +* Driver API calls begin with the prefix ``cu``, while runtime API calls begin with the prefix ``cuda``. For example, the driver API contains ``cuEventCreate``, while the runtime API contains ``cudaEventCreate``, which has similar functionality. +* The driver API defines a different, but largely overlapping, error code space than the runtime API and uses a different coding convention. For example, the driver API defines ``CUDA_ERROR_INVALID_VALUE``, while the runtime API defines ``cudaErrorInvalidValue``. + +The driver API offers two additional functionalities not provided by the runtime API: ``cuModule`` and ``cuCtx`` APIs. + +cuModule API +================================================================================ + +The Module section of the driver API provides additional control over how and +when accelerator code objects are loaded. For example, the driver API enables +code objects to load from files or memory pointers. Symbols for kernels or +global data are extracted from the loaded code objects. In contrast, the runtime +API loads automatically and, if necessary, compiles all the kernels from an +executable binary when it runs. In this mode, kernel code must be compiled using +NVCC so that automatic loading can function correctly. + +The Module features are useful in an environment that generates the code objects +directly, such as a new accelerator language front end. NVCC is not used here. 
+Instead, the environment might have a different kernel language or compilation +flow. Other environments have many kernels and don't want all of them to be +loaded automatically. The Module functions load the generated code objects and +launch kernels. Similar to the cuModule API, HIP defines a hipModule API that +provides similar explicit control over code object management. + +.. _context_driver_api: + +cuCtx API +================================================================================ + +The driver API defines "Context" and "Devices" as separate entities. +Contexts contain a single device, and a device can theoretically have multiple contexts. +Each context contains a set of streams and events specific to the context. +Historically, contexts also defined a unique address space for the GPU. This might no longer be the case in unified memory platforms, because the CPU and all the devices in the same process share a single unified address space. +The Context APIs also provide a mechanism to switch between devices, which enables a single CPU thread to send commands to different GPUs. +HIP and recent versions of the CUDA Runtime provide other mechanisms to accomplish this feat, for example, using streams or ``cudaSetDevice``. + +The CUDA runtime API unifies the Context API with the Device API. This simplifies the APIs and has little loss of functionality. This is because each context can contain a single device, and the benefits of multiple contexts have been replaced with other interfaces. +HIP provides a Context API to facilitate easy porting from existing Driver code. +In HIP, the ``Ctx`` functions largely provide an alternate syntax for changing the active device. + +Most new applications preferentially use ``hipSetDevice`` or the stream APIs. Therefore, HIP has marked the ``hipCtx`` APIs as **deprecated**. Support for these APIs might not be available in future releases. For more details on deprecated APIs, see :doc:`../reference/deprecated_api_list`. 
+ +HIP module and Ctx APIs +================================================================================ + +Rather than present two separate APIs, HIP extends the HIP API with new APIs for +modules and ``Ctx`` control. + +hipModule API +-------------------------------------------------------------------------------- + +Like the CUDA driver API, the Module API provides additional control over how +code is loaded, including options to load code from files or from in-memory +pointers. +NVCC and HIP-Clang target different architectures and use different code object +formats. NVCC supports ``cubin`` or ``ptx`` files, while the HIP-Clang path uses +the ``hsaco`` format. +The external compilers which generate these code objects are responsible for +generating and loading the correct code object for each platform. +Notably, there is no fat binary format that can contain code for both NVCC and +HIP-Clang platforms. The following table summarizes the formats used on each +platform: + +.. list-table:: Module formats + :header-rows: 1 + + * - Format + - APIs + - NVCC + - HIP-CLANG + * - Code object + - ``hipModuleLoad``, ``hipModuleLoadData`` + - ``.cubin`` or PTX text + - ``.hsaco`` + * - Fat binary + - ``hipModuleLoadFatBin`` + - ``.fatbin`` + - ``.hip_fatbin`` + +``hipcc`` uses HIP-Clang or NVCC to compile host code. Both of these compilers can embed code objects into the final executable. These code objects are automatically loaded when the application starts. +The ``hipModule`` API can be used to load additional code objects. When used this way, it extends the capability of the automatically loaded code objects. +HIP-Clang enables both of these capabilities to be used together. Of course, it is possible to create a program with no kernels and no automatic loading. + +For module API reference, visit :ref:`module_management_reference`. 
+ +hipCtx API +-------------------------------------------------------------------------------- + +HIP provides a ``Ctx`` API as a thin layer over the existing device functions. The ``Ctx`` API can be used to set the current context or to query properties of the device associated with the context. +The current context is implicitly used by other APIs, such as ``hipStreamCreate``. + +For context reference, visit :ref:`context_management_reference`. + +HIPIFY translation of CUDA driver API +================================================================================ + +The HIPIFY tools convert CUDA driver APIs for streams, events, modules, devices, memory management, context, and the profiler to the equivalent HIP calls. For example, ``cuEventCreate`` is translated to ``hipEventCreate``. +HIPIFY tools also convert error codes from the driver namespace and coding conventions to the equivalent HIP error code. HIP unifies the APIs for these common functions. + +The memory copy API requires additional explanation. The CUDA driver API includes the memory direction in the name of the API (``cuMemcpyHtoD``), while the CUDA runtime API provides a single memory copy API with a parameter that specifies the direction. It also supports a "default" direction where the runtime determines the direction automatically. +HIP provides APIs with both styles, for example, ``hipMemcpyHtoD`` as well as ``hipMemcpy``. +The first version might be faster in some cases because it avoids any host overhead to detect the different memory directions. + +HIP defines a single error space and uses camel case for all errors (for example, ``hipErrorInvalidValue``). + +For further information, visit the :doc:`hipify:index`. + +Address spaces +-------------------------------------------------------------------------------- + +HIP-Clang defines a process-wide address space where the CPU and all devices allocate addresses from a single unified pool. +This means addresses can be shared between contexts.
Unlike the original CUDA implementation, a new context does not create a new address space for the device. + +Using hipModuleLaunchKernel +-------------------------------------------------------------------------------- + +Both CUDA driver and runtime APIs define a function for launching kernels, called ``cuLaunchKernel`` or ``cudaLaunchKernel``. The equivalent API in HIP is ``hipModuleLaunchKernel``. +The kernel arguments and the execution configuration (grid dimensions, group dimensions, dynamic shared memory, and stream) are passed as arguments to the launch function. +The runtime API additionally provides the ``<<< >>>`` syntax for launching kernels, which resembles a special function call and is easier to use than the explicit launch API, especially when handling kernel arguments. +However, this syntax is not standard C++ and is available only when NVCC is used to compile the host code. + +Additional information +-------------------------------------------------------------------------------- + +HIP-Clang creates a primary context when the HIP API is called. So, in pure +driver API code, HIP-Clang creates a primary context while HIP/NVCC has an empty +context stack. HIP-Clang pushes the primary context to the context stack when it +is empty. This can lead to subtle differences in applications which mix the +runtime and driver APIs. + +HIP-Clang implementation notes +================================================================================ + +.hip_fatbin +-------------------------------------------------------------------------------- + +HIP-Clang links device code from different translation units together. For each +device target, it generates a code object. ``clang-offload-bundler`` bundles +code objects for different device targets into one fat binary, which is embedded +as the global symbol ``__hip_fatbin`` in the ``.hip_fatbin`` section of the ELF +file of the executable or shared object. 
+ +Initialization and termination functions +-------------------------------------------------------------------------------- + +HIP-Clang generates initialization and termination functions for each +translation unit for host code compilation. The initialization functions call +``__hipRegisterFatBinary`` to register the fat binary embedded in the ELF file. +They also call ``__hipRegisterFunction`` and ``__hipRegisterVar`` to register +kernel functions and device-side global variables. The termination functions +call ``__hipUnregisterFatBinary``. +HIP-Clang emits a global variable ``__hip_gpubin_handle`` of type ``void**`` +with ``linkonce`` linkage and an initial value of 0 for each host translation +unit. Each initialization function checks ``__hip_gpubin_handle`` and registers +the fat binary only if ``__hip_gpubin_handle`` is 0. It saves the return value +of ``__hipRegisterFatBinary`` to ``__hip_gpubin_handle``. This ensures that the fat +binary is registered only once. A similar check is performed in the termination +functions. + +Kernel launching +-------------------------------------------------------------------------------- + +HIP-Clang supports kernel launching using the CUDA ``<<<>>>`` syntax, ``hipLaunchKernel``, or ``hipLaunchKernelGGL``. The last option is a macro which expands to the CUDA ``<<<>>>`` syntax by default. It can also be turned into a template by defining ``HIP_TEMPLATE_KERNEL_LAUNCH``. + +When the executable or shared library is loaded by the dynamic linker, the initialization functions are called. In the initialization functions, the code objects containing all kernels are loaded when ``__hipRegisterFatBinary`` is called. When ``__hipRegisterFunction`` is called, the stub functions are associated with the corresponding kernels in the code objects. + +HIP-Clang implements two sets of APIs for launching kernels.
+By default, when HIP-Clang encounters the ``<<<>>>`` statement in the host code, it first calls ``hipConfigureCall`` to set up the threads and grids. It then calls the stub function with the given arguments. The stub function calls ``hipSetupArgument`` for each kernel argument, then calls ``hipLaunchByPtr`` with a function pointer to the stub function. In ``hipLaunchByPtr``, the actual kernel associated with the stub function is launched. + +NVCC implementation notes +================================================================================ + +Interoperation between HIP and CUDA driver +-------------------------------------------------------------------------------- + +CUDA applications might want to mix CUDA driver code with HIP code (see the example below). This table shows the equivalence between CUDA and HIP types required to implement this interaction. + +.. list-table:: Equivalence table between HIP and CUDA types + :header-rows: 1 + + * - HIP type + - CU Driver type + - CUDA Runtime type + * - ``hipModule_t`` + - ``CUmodule`` + - + * - ``hipFunction_t`` + - ``CUfunction`` + - + * - ``hipCtx_t`` + - ``CUcontext`` + - + * - ``hipDevice_t`` + - ``CUdevice`` + - + * - ``hipStream_t`` + - ``CUstream`` + - ``cudaStream_t`` + * - ``hipEvent_t`` + - ``CUevent`` + - ``cudaEvent_t`` + * - ``hipArray`` + - ``CUarray`` + - ``cudaArray`` + +Compilation options +-------------------------------------------------------------------------------- + +The ``hipModule_t`` interface does not support the ``cuModuleLoadDataEx`` function, which is used to control PTX compilation options. +HIP-Clang does not use PTX, so it does not support these compilation options. +In fact, HIP-Clang code objects contain fully compiled code for a device-specific instruction set and don't require additional compilation as a part of the load step. 
+The corresponding HIP function ``hipModuleLoadDataEx`` behaves like ``hipModuleLoadData`` on the HIP-Clang path (where compilation options are not used) and like ``cuModuleLoadDataEx`` on the NVCC path. + +For example: + +.. tab-set:: + + .. tab-item:: HIP + + .. code-block:: cpp + + hipModule_t module; + void *imagePtr = ...; // Somehow populate data pointer with code object + + const int numOptions = 1; + hipJitOption options[numOptions]; + void *optionValues[numOptions]; + + options[0] = hipJitOptionMaxRegisters; + unsigned maxRegs = 15; + optionValues[0] = (void *)(&maxRegs); + + // hipModuleLoadData(&module, imagePtr) will be called on HIP-Clang path, JIT + // options will not be used, and cuModuleLoadDataEx(&module, imagePtr, + // numOptions, options, optionValues) will be called on NVCC path + hipModuleLoadDataEx(&module, imagePtr, numOptions, options, optionValues); + + hipFunction_t k; + hipModuleGetFunction(&k, module, "myKernel"); + + .. tab-item:: CUDA + + .. code-block:: cpp + + CUmodule module; + void *imagePtr = ...; // Somehow populate data pointer with code object + + const int numOptions = 1; + CUjit_option options[numOptions]; + void *optionValues[numOptions]; + + options[0] = CU_JIT_MAX_REGISTERS; + unsigned maxRegs = 15; + optionValues[0] = (void *)(&maxRegs); + + cuModuleLoadDataEx(&module, imagePtr, numOptions, options, optionValues); + + CUfunction k; + cuModuleGetFunction(&k, module, "myKernel"); + +The sample below shows how to use ``hipModuleGetFunction``. + +..
code-block:: cpp + + #include <hip/hip_runtime.h> + #include <hip/hip_runtime_api.h> + + #include <vector> + + int main() { + + size_t elements = 64*1024; + size_t size_bytes = elements * sizeof(float); + + std::vector<float> A(elements), B(elements); + + // On NVIDIA platforms the driver runtime needs to be initialized + #ifdef __HIP_PLATFORM_NVIDIA__ + hipInit(0); + hipDevice_t device; + hipCtx_t context; + HIPCHECK(hipDeviceGet(&device, 0)); + HIPCHECK(hipCtxCreate(&context, 0, device)); + #endif + + // Allocate device memory + hipDeviceptr_t d_A, d_B; + HIPCHECK(hipMalloc(&d_A, size_bytes)); + HIPCHECK(hipMalloc(&d_B, size_bytes)); + + // Copy data to device + HIPCHECK(hipMemcpyHtoD(d_A, A.data(), size_bytes)); + HIPCHECK(hipMemcpyHtoD(d_B, B.data(), size_bytes)); + + // Load module + hipModule_t Module; + // For AMD the module file has to contain architecture specific object code + // For NVIDIA the module file has to contain PTX, found in e.g. "vcpy_isa.ptx" + HIPCHECK(hipModuleLoad(&Module, "vcpy_isa.co")); + // Get kernel function from the module via its name + hipFunction_t Function; + HIPCHECK(hipModuleGetFunction(&Function, Module, "hello_world")); + + // Create buffer for kernel arguments + std::vector<void*> argBuffer{&d_A, &d_B}; + size_t arg_size_bytes = argBuffer.size() * sizeof(void*); + + // Create configuration passed to the kernel as arguments + void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, argBuffer.data(), + HIP_LAUNCH_PARAM_BUFFER_SIZE, &arg_size_bytes, HIP_LAUNCH_PARAM_END}; + + int threads_per_block = 128; + int blocks = (elements + threads_per_block - 1) / threads_per_block; + + // Actually launch kernel + HIPCHECK(hipModuleLaunchKernel(Function, blocks, 1, 1, threads_per_block, 1, 1, 0, 0, NULL, config)); + + // Copy the full buffers back; the size argument is in bytes + HIPCHECK(hipMemcpyDtoH(A.data(), d_A, size_bytes)); + HIPCHECK(hipMemcpyDtoH(B.data(), d_B, size_bytes)); + + #ifdef __HIP_PLATFORM_NVIDIA__ + HIPCHECK(hipCtxDetach(context)); + #endif + + HIPCHECK(hipFree(d_A)); + HIPCHECK(hipFree(d_B)); + + return 0; + } + +HIP module and texture Driver API
+================================================================================ + +HIP supports texture driver APIs. However, texture references must be declared +within the host scope. The following code demonstrates the use of texture +references for the ``__HIP_PLATFORM_AMD__`` platform. + +.. code-block:: cpp + + // Code to generate code object + + #include "hip/hip_runtime.h" + extern texture tex; + + __global__ void tex2dKernel(hipLaunchParm lp, float *outputData, int width, + int height) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + outputData[y * width + x] = tex2D(tex, x, y); + } + +.. code-block:: cpp + + // Host code: + + texture tex; + + void myFunc () + { + // ... + + textureReference* texref; + hipModuleGetTexRef(&texref, Module1, "tex"); + hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap); + hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap); + hipTexRefSetFilterMode(texref, hipFilterModePoint); + hipTexRefSetFlags(texref, 0); + hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1); + hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT); + + // ... + } + +Driver entry point access +================================================================================ + +Starting from HIP version 6.2.0, support for Driver Entry Point Access is +available when using CUDA 12.0 or newer. This feature allows developers to +directly interact with the CUDA driver API, providing more control over GPU +operations. + +Driver Entry Point Access provides several features: + +* Retrieving the address of a runtime function +* Requesting the default stream version on a per-thread basis +* Accessing new HIP features on older toolkits with a newer driver + +For driver entry point access reference, visit :cpp:func:`hipGetProcAddress`. 
+ +Address retrieval +-------------------------------------------------------------------------------- + +The :cpp:func:`hipGetProcAddress` function can be used to obtain the address of +a runtime function. This is demonstrated in the following example: + +.. code-block:: cpp + + #include + #include + + #include + + typedef hipError_t (*hipInit_t)(unsigned int); + + int main() { + // Initialize the HIP runtime + hipError_t res = hipInit(0); + if (res != hipSuccess) { + std::cerr << "Failed to initialize HIP runtime." << std::endl; + return 1; + } + + // Get the address of the hipInit function + hipInit_t hipInitFunc; + int hipVersion = HIP_VERSION; // Use the HIP version defined in hip_runtime_api.h + uint64_t flags = 0; // No special flags + hipDriverProcAddressQueryResult symbolStatus; + + res = hipGetProcAddress("hipInit", (void**)&hipInitFunc, hipVersion, flags, &symbolStatus); + if (res != hipSuccess) { + std::cerr << "Failed to get address of hipInit()." << std::endl; + return 1; + } + + // Call the hipInit function using the obtained address + res = hipInitFunc(0); + if (res == hipSuccess) { + std::cout << "HIP runtime initialized successfully using hipGetProcAddress()." << std::endl; + } else { + std::cerr << "Failed to initialize HIP runtime using hipGetProcAddress()." << std::endl; + } + + return 0; + } + +Per-thread default stream version request +================================================================================ + +HIP offers functionality similar to CUDA for managing streams on a per-thread +basis. By using ``hipStreamPerThread``, each thread can independently manage its +default stream, simplifying operations. The following example demonstrates how +this feature enhances performance by reducing contention and improving +efficiency. + +.. code-block:: cpp + + #include + + #include + + int main() { + // Initialize the HIP runtime + hipError_t res = hipInit(0); + if (res != hipSuccess) { + std::cerr << "Failed to initialize HIP runtime." 
<< std::endl; + return 1; + } + + // Get the per-thread default stream + hipStream_t stream = hipStreamPerThread; + + // Use the stream for some operation + // For example, allocate memory on the device + void* d_ptr; + size_t size = 1024; + res = hipMalloc(&d_ptr, size); + if (res != hipSuccess) { + std::cerr << "Failed to allocate memory." << std::endl; + return 1; + } + + // Perform some operation using the stream + // For example, set memory on the device + res = hipMemsetAsync(d_ptr, 0, size, stream); + if (res != hipSuccess) { + std::cerr << "Failed to set memory." << std::endl; + return 1; + } + + // Synchronize the stream + res = hipStreamSynchronize(stream); + if (res != hipSuccess) { + std::cerr << "Failed to synchronize stream." << std::endl; + return 1; + } + + std::cout << "Operation completed successfully using per-thread default stream." << std::endl; + + // Free the allocated memory + hipFree(d_ptr); + + return 0; + } + +Accessing new HIP features with a newer driver +================================================================================ + +HIP is designed to be forward compatible, allowing newer features to be utilized +with older toolkits, provided a compatible driver is present. Feature support +can be verified through runtime API functions and version checks. This approach +ensures that applications can benefit from new features and improvements in the +HIP runtime without needing to be recompiled with a newer toolkit. The function +:cpp:func:`hipGetProcAddress` enables dynamic querying and the use of newer +functions offered by the HIP runtime, even if the application was built with an +older toolkit. + +An example is provided for a hypothetical ``foo()`` function. + +.. code-block:: cpp + + // Get the address of the foo function + foo_t fooFunc; + int hipVersion = 60300000; // Use an own HIP version number (e.g. 
6.3.0) + uint64_t flags = 0; // No special flags + hipDriverProcAddressQueryResult symbolStatus; + + res = hipGetProcAddress("foo", (void**)&fooFunc, hipVersion, flags, &symbolStatus); + +The HIP version number is defined as an integer: + +.. code-block:: cpp + + HIP_VERSION=HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH diff --git a/projects/hip/docs/how-to/hip_porting_guide.md b/projects/hip/docs/how-to/hip_porting_guide.md index 1a51339b66..8e123c105a 100644 --- a/projects/hip/docs/how-to/hip_porting_guide.md +++ b/projects/hip/docs/how-to/hip_porting_guide.md @@ -1,4 +1,4 @@ -# HIP Porting Guide +# HIP porting guide In addition to providing a portable C++ programming environment for GPUs, HIP is designed to ease the porting of existing CUDA code into the HIP environment. This section describes the available tools @@ -366,7 +366,7 @@ run hipcc when appropriate. ### ``warpSize`` -Code should not assume a warp size of 32 or 64. See [Warp Cross-Lane Functions](https://rocm.docs.amd.com/projects/HIP/en/latest/reference/kernel_language.html#warp-cross-lane-functions) for information on how to write portable wave-aware code. +Code should not assume a warp size of 32 or 64. See [Warp Cross-Lane Functions](https://rocm.docs.amd.com/projects/HIP/en/latest/reference/cpp_language_extensions.html#warp-cross-lane-functions) for information on how to write portable wave-aware code. 
### Kernel launch with group size > 256 @@ -403,7 +403,7 @@ Device Code: __constant__ int Value[LEN]; -__global__ void Get(hipLaunchParm lp, int *Ad) +__global__ void Get(int *Ad) { int tid = threadIdx.x + blockIdx.x * blockDim.x; Ad[tid] = Value[tid]; diff --git a/projects/hip/docs/how-to/hip_rtc.md b/projects/hip/docs/how-to/hip_rtc.md index e509914cd9..b96c069cb2 100644 --- a/projects/hip/docs/how-to/hip_rtc.md +++ b/projects/hip/docs/how-to/hip_rtc.md @@ -1,4 +1,4 @@ -# Programming for HIP Runtime Compiler (RTC) +# Programming for HIP runtime compiler (RTC) HIP lets you compile kernels at runtime with the `hiprtc*` APIs. Kernels can be stored as a text string and can be passed to HIPRTC APIs alongside options to guide the compilation. @@ -6,7 +6,7 @@ Kernels can be stored as a text string and can be passed to HIPRTC APIs alongsid NOTE: * This library can be used on systems without HIP installed nor AMD GPU driver installed at all (offline compilation). Therefore, it does not depend on any HIP runtime library. -* But it does depend on COMGr. You may try to statically link COMGr into HIPRTC to avoid any ambiguity. +* But it does depend on Code Object Manager (comgr). You may try to statically link comgr into HIPRTC to avoid any ambiguity. * Developers can decide to bundle this library with their application. ## Compilation APIs diff --git a/projects/hip/docs/how-to/hipgraph.rst b/projects/hip/docs/how-to/hipgraph.rst new file mode 100644 index 0000000000..958784a71f --- /dev/null +++ b/projects/hip/docs/how-to/hipgraph.rst @@ -0,0 +1,516 @@ +.. meta:: + :description: This chapter describes how to use HIP graphs and highlights their use cases. + :keywords: ROCm, HIP, graph, stream + +.. _how_to_HIP_graph: + +******************************************************************************** +HIP graphs +******************************************************************************** + +.. note:: + The HIP graph API is currently in Beta. 
Some features can change and might + have outstanding issues. Not all features supported by CUDA graphs are yet + supported. For a list of all currently supported functions see the + :doc:`HIP graph API documentation<../doxygen/html/group___graph>`. + +HIP graphs are an alternative way of executing tasks on a GPU that can provide +performance benefits over launching kernels using the standard +method via streams. A HIP graph is made up of nodes and edges. The nodes of a +HIP graph represent the operations performed, while the edges mark dependencies +between those operations. + +The nodes can be one of the following: + +- empty nodes +- nested graphs +- kernel launches +- host-side function calls +- HIP memory functions (copy, memset, ...) +- HIP events +- signalling or waiting on external semaphores + +.. note:: + The available node types are specified by :cpp:enum:`hipGraphNodeType`. + +The following figure visualizes the concept of graphs, compared to using streams. + +.. figure:: ../data/how-to/hipgraph/hip_graph.svg + :alt: Diagram depicting the difference between using streams to execute + kernels with dependencies, resolved by explicitly synchronizing, + or using graphs, where the edges denote the dependencies. + +The standard method of launching kernels incurs a small overhead for each +iteration of the operation involved. That overhead is negligible, when the +kernel is launched directly with the HIP C/C++ API, but depending on the +framework used, there can be several levels of redirection, until the actual +kernel is launched by the HIP runtime, leading to significant overhead. +Especially for some AI frameworks, a GPU kernel might run faster than the time +it takes for the framework to set up and launch the kernel, and so the overhead +of repeatedly launching kernels can have a significant impact on performance. 
+ +HIP graphs are designed to address this issue by predefining the HIP API calls +and their dependencies with a graph, and performing most of the initialization +beforehand. Launching a graph only requires a single call, after which the +HIP runtime takes care of executing the operations within the graph. +Graphs can provide additional performance benefits by enabling optimizations +that are only possible when the dependencies between the operations are known. + +.. figure:: ../data/how-to/hipgraph/hip_graph_speedup.svg + :alt: Diagram depicting the speedup achievable with HIP graphs compared to + HIP streams when launching many short-running kernels. + + Qualitative presentation of the execution time of many short-running kernels + when launched using HIP stream versus HIP graph. This does not include the + time needed to set up the graph. + +Using HIP graphs +================================================================================ + +There are two different ways of creating graphs: capturing kernel launches from +a stream, or explicitly creating graphs. The difference between the two +approaches is explained later in this chapter. + +The general flow for using HIP graphs includes the following steps. + +#. Create a :cpp:type:`hipGraph_t` graph template using one of the two approaches described in this chapter +#. Create a :cpp:type:`hipGraphExec_t` executable instance of the graph template using :cpp:func:`hipGraphInstantiate` +#. Use :cpp:func:`hipGraphLaunch` to launch the executable graph to a stream +#. After execution completes, free and destroy graph resources + +The first two steps are the initial setup and only need to be executed once. The +first step is the definition of the operations (nodes) and the dependencies (edges) +between them. The second step is the instantiation of the graph. This takes care +of validating and initializing the graph, to reduce the overhead when executing +the graph.
The third step is the execution of the graph, which takes care of +launching all the kernels and executing the operations while respecting their +dependencies and necessary synchronizations as specified. + +Because HIP graphs require some setup and initialization overhead before their +first execution, graphs only provide a benefit for workloads that require +many iterations to complete. + +In both methods the :cpp:type:`hipGraph_t` template for a graph is used to define the graph. +In order to actually launch a graph, the template needs to be instantiated using +:cpp:func:`hipGraphInstantiate`, which results in an executable graph of type :cpp:type:`hipGraphExec_t`. +This executable graph can then be launched with :cpp:func:`hipGraphLaunch`, replaying the +operations within the graph. Note, that launching graphs is fundamentally no +different to executing other HIP functions on a stream, except for the fact, +that scheduling the operations within the graph encompasses less overhead and +can enable some optimizations, but they still need to be associated with a stream for execution. + +Memory management +-------------------------------------------------------------------------------- + +Memory that is used by operations in graphs can either be pre-allocated or +managed within the graph. Graphs can contain nodes that take care of allocating +memory on the device or copying memory between the host and the device. +Whether you want to pre-allocate the memory or manage it within the graph +depends on the use-case. If the graph is executed in a tight loop the +performance is usually better when the memory is preallocated, so that it +does not need to be reallocated in every iteration. + +The same rules as for normal memory allocations apply for memory allocated and +freed by nodes, meaning that the nodes that access memory allocated in a graph +must be ordered after allocation and before freeing. 
+ +Memory management within the graph enables the runtime to take care of memory reuse and optimizations. +The lifetime of memory managed in a graph begins when the execution reaches the +node allocating the memory, and ends when either reaching the corresponding +free node within the graph, or after graph execution when a corresponding +:cpp:func:`hipFreeAsync` or :cpp:func:`hipFree` call is reached. +The memory can also be freed with a free node in a different graph that is +associated with the same memory address. + +Unlike device memory that is not associated with a graph, this does not necessarily +mean that the freed memory is returned back to the operating system immediately. +Graphs can retain a memory pool for quickly reusing memory within the graph. +This can be especially useful when memory is freed and reallocated later on +within a graph, as that memory doesn't have to be requested from the operating system. +It also potentially reduces the total memory footprint of the graph, by reusing the same memory. + +The amount of memory allocated for graph memory pools on a specific device can +be queried using :cpp:func:`hipDeviceGetGraphMemAttribute`. +In order to return the freed memory :cpp:func:`hipDeviceGraphMemTrim` can be used. +This will return any memory that is not in active use by graphs. + +These memory allocations can also be set up to allow access from multiple GPUs, +just like normal allocations. HIP then takes care of allocating and mapping the +memory to the GPUs. When capturing a graph from a stream, the node sets the +accessibility according to :cpp:func:`hipMemPoolSetAccess` at the time of capturing. 
+ + +Capture graphs from a stream +================================================================================ + +The easy way to integrate HIP graphs into already existing code is to use +:cpp:func:`hipStreamBeginCapture` and :cpp:func:`hipStreamEndCapture` to obtain a :cpp:type:`hipGraph_t` +graph template that includes the captured operations. + +When starting to capture operations for a graph using :cpp:func:`hipStreamBeginCapture`, +the operations assigned to the stream are captured into a graph instead of being +executed. The associated graph is returned when calling :cpp:func:`hipStreamEndCapture`, which +also stops capturing operations. +In order to capture to an already existing graph use :cpp:func:`hipStreamBeginCaptureToGraph`. + +The functions assigned to the capturing stream are not executed, but instead are +captured and defined as nodes in the graph, to be run when the instantiated +graph is launched. + +Functions must be associated with a stream in order to be captured. +This means that non-HIP API functions are not captured by default, but are +executed as standard functions when encountered and not added to the graph. +In order to assign host functions to a stream use +:cpp:func:`hipLaunchHostFunc`, as shown in the following code example. +They will then be captured and defined as a host node in the resulting graph, +and won't be executed when encountered. + +Synchronous HIP API calls that are implicitly assigned to the default stream are +not permitted while capturing a stream and will return an error. This is +because they implicitly synchronize and cause a dependency that can not be +captured within the stream. This includes functions like :cpp:func:`hipMalloc`, +:cpp:func:`hipMemcpy` and :cpp:func:`hipFree`. In order to capture these to the stream, replace +them with the corresponding asynchronous calls like :cpp:func:`hipMallocAsync`, :cpp:func:`hipMemcpyAsync` or :cpp:func:`hipFreeAsync`. 
+ +The general flow for using stream capture to create a graph template is: + +#. Create a stream from which to capture the operations + +#. Call :cpp:func:`hipStreamBeginCapture` before the first operation to be captured + +#. Call :cpp:func:`hipStreamEndCapture` after the last operation to be captured + + #. Define a :cpp:type:`hipGraph_t` graph template to which :cpp:func:`hipStreamEndCapture` + passes the captured graph + +The following code is an example of how to use the HIP graph API to capture a +graph from a stream. + +.. code-block:: cpp + + #include <hip/hip_runtime.h> + #include <iostream> + #include <vector> + + #define HIP_CHECK(expression) \ + { \ + const hipError_t status = expression; \ + if(status != hipSuccess){ \ + std::cerr << "HIP error " \ + << status << ": " \ + << hipGetErrorString(status) \ + << " at " << __FILE__ << ":" \ + << __LINE__ << std::endl; \ + } \ + } + + + __global__ void kernelA(double* arrayA, size_t size){ + const size_t x = threadIdx.x + blockDim.x * blockIdx.x; + if(x < size){arrayA[x] *= 2.0;} + }; + __global__ void kernelB(int* arrayB, size_t size){ + const size_t x = threadIdx.x + blockDim.x * blockIdx.x; + if(x < size){arrayB[x] = 3;} + }; + __global__ void kernelC(double* arrayA, const int* arrayB, size_t size){ + const size_t x = threadIdx.x + blockDim.x * blockIdx.x; + if(x < size){arrayA[x] += arrayB[x];} + }; + + struct set_vector_args{ + std::vector<double>& h_array; + double value; + }; + + void set_vector(void* args){ + set_vector_args h_args{*(reinterpret_cast<set_vector_args*>(args))}; + + std::vector<double>& vec{h_args.h_array}; + vec.assign(vec.size(), h_args.value); + } + + int main(){ + constexpr int numOfBlocks = 1024; + constexpr int threadsPerBlock = 1024; + constexpr size_t arraySize = 1U << 20; + + // This example assumes that kernelA operates on data that needs to be initialized on + // and copied from the host, while kernelB initializes the array that is passed to it.
+ // Both arrays are then used as input to kernelC, where arrayA is also used as + // output, that is copied back to the host, while arrayB is only read from and not modified. + + double* d_arrayA; + int* d_arrayB; + std::vector<double> h_array(arraySize); + constexpr double initValue = 2.0; + + hipStream_t captureStream; + HIP_CHECK(hipStreamCreate(&captureStream)); + + // Start capturing the operations assigned to the stream + HIP_CHECK(hipStreamBeginCapture(captureStream, hipStreamCaptureModeGlobal)); + + // hipMallocAsync and hipMemcpyAsync are needed, to be able to assign them to a stream + HIP_CHECK(hipMallocAsync(&d_arrayA, arraySize*sizeof(double), captureStream)); + HIP_CHECK(hipMallocAsync(&d_arrayB, arraySize*sizeof(int), captureStream)); + + // Assign host function to the stream + // Needs a custom struct to pass the arguments + set_vector_args args{h_array, initValue}; + HIP_CHECK(hipLaunchHostFunc(captureStream, set_vector, &args)); + + HIP_CHECK(hipMemcpyAsync(d_arrayA, h_array.data(), arraySize*sizeof(double), hipMemcpyHostToDevice, captureStream)); + + kernelA<<<numOfBlocks, threadsPerBlock, 0, captureStream>>>(d_arrayA, arraySize); + kernelB<<<numOfBlocks, threadsPerBlock, 0, captureStream>>>(d_arrayB, arraySize); + kernelC<<<numOfBlocks, threadsPerBlock, 0, captureStream>>>(d_arrayA, d_arrayB, arraySize); + + HIP_CHECK(hipMemcpyAsync(h_array.data(), d_arrayA, arraySize*sizeof(*d_arrayA), hipMemcpyDeviceToHost, captureStream)); + + HIP_CHECK(hipFreeAsync(d_arrayA, captureStream)); + HIP_CHECK(hipFreeAsync(d_arrayB, captureStream)); + + // Stop capturing + hipGraph_t graph; + HIP_CHECK(hipStreamEndCapture(captureStream, &graph)); + + // Create an executable graph from the captured graph + hipGraphExec_t graphExec; + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + + // The graph template can be deleted after the instantiation if it's not needed for later use + HIP_CHECK(hipGraphDestroy(graph)); + + // Actually launch the graph. The stream does not have + // to be the same as the one used for capturing.
+ HIP_CHECK(hipGraphLaunch(graphExec, captureStream)); + + // The graph launch is asynchronous. Wait for it to finish + // before reading the results on the host. + HIP_CHECK(hipStreamSynchronize(captureStream)); + + // Verify results + constexpr double expected = initValue * 2.0 + 3; + bool passed = true; + for(size_t i = 0; i < arraySize; ++i){ + if(h_array[i] != expected){ + passed = false; + std::cerr << "Validation failed! Expected " << expected << " got " << h_array[i] << std::endl; + break; + } + } + if(passed){ + std::cerr << "Validation passed." << std::endl; + } + + // Free graph and stream resources after usage + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipStreamDestroy(captureStream)); + } + +Explicit graph creation +================================================================================ + +Graphs can also be created directly using the HIP graph API, giving more +fine-grained control over the graph. In this case, the graph nodes are created +explicitly, together with their parameters and dependencies, which specify the +edges of the graph, thereby forming the graph structure. + +The nodes are represented by the generic :cpp:type:`hipGraphNode_t` type. The actual +node type is implicitly defined by the specific function used to add the node to +the graph, for example :cpp:func:`hipGraphAddKernelNode`. See the +:doc:`HIP graph API documentation<../doxygen/html/group___graph>` for the +available functions, they are of type ``hipGraphAdd{Type}Node``. Each type of +node also has a predefined set of parameters depending on the operation, for +example :cpp:class:`hipKernelNodeParams` for a kernel launch. See the +:doc:`documentation for the general hipGraphNodeParams type<../doxygen/html/structhip_graph_node_params>` +for a list of available parameter types and their members. + +The general flow for explicitly creating a graph is usually: + +#. Create a graph :cpp:type:`hipGraph_t` + +#. Create the nodes and their parameters and add them to the graph + + #. Define a :cpp:type:`hipGraphNode_t` + + #.
Define the parameter struct for the desired operation, by explicitly setting the appropriate struct's members. + + #. Use the appropriate ``hipGraphAdd{Type}Node`` function to add the node to the graph. + + #. The dependencies can be defined when adding the node to the graph, or afterwards by using :cpp:func:`hipGraphAddDependencies` + +The following code example demonstrates how to explicitly create nodes in order to create a graph. + +.. code-block:: cpp + + #include <hip/hip_runtime.h> + #include <iostream> + #include <vector> + + #define HIP_CHECK(expression) \ + { \ + const hipError_t status = expression; \ + if(status != hipSuccess){ \ + std::cerr << "HIP error " \ + << status << ": " \ + << hipGetErrorString(status) \ + << " at " << __FILE__ << ":" \ + << __LINE__ << std::endl; \ + } \ + } + + __global__ void kernelA(double* arrayA, size_t size){ + const size_t x = threadIdx.x + blockDim.x * blockIdx.x; + if(x < size){arrayA[x] *= 2.0;} + }; + __global__ void kernelB(int* arrayB, size_t size){ + const size_t x = threadIdx.x + blockDim.x * blockIdx.x; + if(x < size){arrayB[x] = 3;} + }; + __global__ void kernelC(double* arrayA, const int* arrayB, size_t size){ + const size_t x = threadIdx.x + blockDim.x * blockIdx.x; + if(x < size){arrayA[x] += arrayB[x];} + }; + + struct set_vector_args{ + std::vector<double>& h_array; + double value; + }; + + void set_vector(void* args){ + set_vector_args h_args{*(reinterpret_cast<set_vector_args*>(args))}; + + std::vector<double>& vec{h_args.h_array}; + vec.assign(vec.size(), h_args.value); + } + + int main(){ + constexpr int numOfBlocks = 1024; + constexpr int threadsPerBlock = 1024; + size_t arraySize = 1U << 20; + + // The pointers to the device memory don't need to be declared here, + // they are contained within the hipMemAllocNodeParams as the dptr member + std::vector<double> h_array(arraySize); + constexpr double initValue = 2.0; + + // Create an empty graph + hipGraph_t graph; + HIP_CHECK(hipGraphCreate(&graph, 0)); + + // Parameters to allocate arrays + hipMemAllocNodeParams
allocArrayAParams{}; + allocArrayAParams.poolProps.allocType = hipMemAllocationTypePinned; + allocArrayAParams.poolProps.location.type = hipMemLocationTypeDevice; + allocArrayAParams.poolProps.location.id = 0; // GPU on which memory resides + allocArrayAParams.bytesize = arraySize * sizeof(double); + + hipMemAllocNodeParams allocArrayBParams{}; + allocArrayBParams.poolProps.allocType = hipMemAllocationTypePinned; + allocArrayBParams.poolProps.location.type = hipMemLocationTypeDevice; + allocArrayBParams.poolProps.location.id = 0; // GPU on which memory resides + allocArrayBParams.bytesize = arraySize * sizeof(int); + + // Add the allocation nodes to the graph. They don't have any dependencies + hipGraphNode_t allocNodeA, allocNodeB; + HIP_CHECK(hipGraphAddMemAllocNode(&allocNodeA, graph, nullptr, 0, &allocArrayAParams)); + HIP_CHECK(hipGraphAddMemAllocNode(&allocNodeB, graph, nullptr, 0, &allocArrayBParams)); + + // Parameters for the host function + // Needs custom struct to pass the arguments + set_vector_args args{h_array, initValue}; + hipHostNodeParams hostParams{}; + hostParams.fn = set_vector; + hostParams.userData = static_cast<void*>(&args); + + // Add the host node that initializes the host array. It also doesn't have any dependencies + hipGraphNode_t hostNode; + HIP_CHECK(hipGraphAddHostNode(&hostNode, graph, nullptr, 0, &hostParams)); + + // Add memory copy node, that copies the initialized host array to the device.
+ // It has to wait for the host array to be initialized and the device memory to be allocated + hipGraphNode_t cpyNodeDependencies[] = {allocNodeA, hostNode}; + hipGraphNode_t cpyToDevNode; + HIP_CHECK(hipGraphAddMemcpyNode1D(&cpyToDevNode, graph, cpyNodeDependencies, 2, allocArrayAParams.dptr, h_array.data(), arraySize * sizeof(double), hipMemcpyHostToDevice)); + + // Parameters for kernelA + hipKernelNodeParams kernelAParams; + void* kernelAArgs[] = {&allocArrayAParams.dptr, static_cast<void*>(&arraySize)}; + kernelAParams.func = reinterpret_cast<void*>(kernelA); + kernelAParams.gridDim = numOfBlocks; + kernelAParams.blockDim = threadsPerBlock; + kernelAParams.sharedMemBytes = 0; + kernelAParams.kernelParams = kernelAArgs; + kernelAParams.extra = nullptr; + + // Add the node for kernelA. It has to wait for the memory copy to finish, as it depends on the values from the host array. + hipGraphNode_t kernelANode; + HIP_CHECK(hipGraphAddKernelNode(&kernelANode, graph, &cpyToDevNode, 1, &kernelAParams)); + + // Parameters for kernelB + hipKernelNodeParams kernelBParams; + void* kernelBArgs[] = {&allocArrayBParams.dptr, static_cast<void*>(&arraySize)}; + kernelBParams.func = reinterpret_cast<void*>(kernelB); + kernelBParams.gridDim = numOfBlocks; + kernelBParams.blockDim = threadsPerBlock; + kernelBParams.sharedMemBytes = 0; + kernelBParams.kernelParams = kernelBArgs; + kernelBParams.extra = nullptr; + + // Add the node for kernelB. It only has to wait for the memory to be allocated, as it initializes the array.
+ hipGraphNode_t kernelBNode; + HIP_CHECK(hipGraphAddKernelNode(&kernelBNode, graph, &allocNodeB, 1, &kernelBParams)); + + // Parameters for kernelC + hipKernelNodeParams kernelCParams; + void* kernelCArgs[] = {&allocArrayAParams.dptr, &allocArrayBParams.dptr, static_cast<void*>(&arraySize)}; + kernelCParams.func = reinterpret_cast<void*>(kernelC); + kernelCParams.gridDim = numOfBlocks; + kernelCParams.blockDim = threadsPerBlock; + kernelCParams.sharedMemBytes = 0; + kernelCParams.kernelParams = kernelCArgs; + kernelCParams.extra = nullptr; + + // Add the node for kernelC. It has to wait on both kernelA and kernelB to finish, as it depends on their results. + hipGraphNode_t kernelCNode; + hipGraphNode_t kernelCDependencies[] = {kernelANode, kernelBNode}; + HIP_CHECK(hipGraphAddKernelNode(&kernelCNode, graph, kernelCDependencies, 2, &kernelCParams)); + + // Copy the results back to the host. Has to wait for kernelC to finish. + hipGraphNode_t cpyToHostNode; + HIP_CHECK(hipGraphAddMemcpyNode1D(&cpyToHostNode, graph, &kernelCNode, 1, h_array.data(), allocArrayAParams.dptr, arraySize * sizeof(double), hipMemcpyDeviceToHost)); + + // Free array of allocNodeA. It needs to wait for the copy to finish, as kernelC stores its results in it. + hipGraphNode_t freeNodeA; + HIP_CHECK(hipGraphAddMemFreeNode(&freeNodeA, graph, &cpyToHostNode, 1, allocArrayAParams.dptr)); + // Free array of allocNodeB. It only needs to wait for kernelC to finish, as it is not written back to the host.
+ hipGraphNode_t freeNodeB; + HIP_CHECK(hipGraphAddMemFreeNode(&freeNodeB, graph, &kernelCNode, 1, allocArrayBParams.dptr)); + + // Instantiate the graph in order to execute it + hipGraphExec_t graphExec; + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + + // The graph can be freed after the instantiation if it's not needed for other purposes + HIP_CHECK(hipGraphDestroy(graph)); + + // Actually launch the graph + hipStream_t graphStream; + HIP_CHECK(hipStreamCreate(&graphStream)); + HIP_CHECK(hipGraphLaunch(graphExec, graphStream)); + + // The graph launch is asynchronous. Wait for it to finish + // before reading the results on the host. + HIP_CHECK(hipStreamSynchronize(graphStream)); + + // Verify results + constexpr double expected = initValue * 2.0 + 3; + bool passed = true; + for(size_t i = 0; i < arraySize; ++i){ + if(h_array[i] != expected){ + passed = false; + std::cerr << "Validation failed! Expected " << expected << " got " << h_array[i] << std::endl; + break; + } + } + if(passed){ + std::cerr << "Validation passed." << std::endl; + } + + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipStreamDestroy(graphStream)); + } diff --git a/projects/hip/docs/how-to/performance_guidelines.rst b/projects/hip/docs/how-to/performance_guidelines.rst index aa8bcb1fce..9ebd210106 100644 --- a/projects/hip/docs/how-to/performance_guidelines.rst +++ b/projects/hip/docs/how-to/performance_guidelines.rst @@ -1,125 +1,127 @@ .. meta:: - :description: This chapter describes a set of best practices designed to help developers optimize the performance of HIP-capable GPU architectures. + :description: This chapter describes a set of best practices designed to help + developers optimize the performance of HIP-capable GPU architectures.
:keywords: AMD, ROCm, HIP, CUDA, performance, guidelines ******************************************************************************* -Performance Guidelines +Performance guidelines ******************************************************************************* -The AMD HIP Performance Guidelines are a set of best practices designed to help -developers optimize the performance of AMD GPUs. They cover established -parallelization and optimization techniques, coding metaphors, and idioms that -can greatly simplify programming for HIP-capable GPU architectures. +The AMD HIP performance guidelines are a set of best practices designed to help +you optimize the application performance on AMD GPUs. The guidelines discuss +established parallelization and optimization techniques to improve the +application performance on HIP-capable GPU architectures. -By following four main cornerstones, we can exploit the performance -optimization potential of HIP. +Here are the four main cornerstones to help you exploit HIP's performance +optimization potential: -- parallel execution -- memory usage optimization -- optimization for maximum throughput -- minimizing memory thrashing +- Parallel execution +- Memory bandwidth usage optimization +- Maximum throughput optimization +- Memory thrashing minimization -In the following chapters, we will show you their benefits and how to use them -effectively. +This document discusses the usage and benefits of these cornerstones in detail. .. _parallel execution: Parallel execution -================== +================================================================================ -For optimal use, the application should reveal and efficiently imply as much -parallelism as possible to keep all system components active. +For optimal use and to keep all system components busy, the application must +reveal and efficiently provide as much parallelism as possible.
The parallelism +can be performed at the application level, device level, and multiprocessor +level. Application level ------------------ +-------------------------------------------------------------------------------- -The application should optimize parallel execution across the host and devices -using asynchronous calls and streams. Workloads should be assigned based on -efficiency: serial to the host, parallel to the devices. +To enable parallel execution of the application across the host and devices, use +asynchronous calls and streams. Assign workloads based on efficiency: serial to +the host or parallel to the devices. -For parallel workloads, when threads need to synchronize to share data, if they -belong to the same block, they should use ``__syncthreads()`` (see: -:ref:`synchronization functions`) within the same kernel invocation. If they -belong to different blocks, they must use global memory with two separate -kernel invocations. The latter should be minimized as it adds overhead. +For parallel workloads, when threads belonging to the same block need to +synchronize to share data, use :cpp:func:`__syncthreads()` (see: +:ref:`synchronization functions`) within the same kernel invocation. For threads +belonging to different blocks, use global memory with two separate +kernel invocations. It is recommended to avoid the latter approach as it adds +overhead. Device level ------------- +-------------------------------------------------------------------------------- -Device-level optimization primarily involves maximizing parallel execution -across the multiprocessors of the device. This can be achieved by executing -multiple kernels concurrently on a device. The management of these kernels is -facilitated by streams, which allow for the overlapping of computation and data -transfers, enhancing performance. The aim is to keep all multiprocessors busy -by executing enough kernels concurrently. 
However, launching too many kernels -can lead to resource contention, so a balance must be found for optimal -performance. This approach helps in achieving maximum utilization of the -resources of the device. +Device level optimization primarily involves maximizing parallel execution +across the multiprocessors on the device. You can achieve device level +optimization by executing multiple kernels concurrently on a device. To enhance +performance, the management of these kernels is facilitated by streams, which +allow overlapping of computation and data transfers. This approach aims at +keeping all multiprocessors busy by executing enough kernels concurrently. +However, launching too many kernels can lead to resource contention, hence a +balance must be found for optimal performance. The device level optimization +helps in achieving maximum utilization of the device resources. Multiprocessor level --------------------- +-------------------------------------------------------------------------------- -Multiprocessor-level optimization involves maximizing parallel execution within -each multiprocessor on a device. Each multiprocessor can execute a number of -threads concurrently, and the total number of threads that can run in parallel -is determined by the number of concurrent threads each multiprocessor can -handle. +Multiprocessor level optimization involves maximizing parallel execution within +each multiprocessor on a device. The key to multiprocessor level optimization +is to efficiently utilize the various functional units within a multiprocessor. +For example, ensure a sufficient number of resident warps, so that at every clock +cycle an instruction from a warp is ready for execution. This instruction +could either be another independent instruction of the same warp, which exploits +:ref:`instruction level optimization <instruction optimization>`, or more +commonly an instruction of another warp, which exploits thread-level parallelism.
-The key to multiprocessor-level optimization is to efficiently utilize the -various functional units within a multiprocessor. This can be achieved by -ensuring a sufficient number of resident warps, as at every instruction issue -time, a warp scheduler selects an instruction that is ready to execute. This -instruction can be another independent instruction of the same warp, exploiting -:ref:`instruction optimization`, or more commonly an instruction of another warp, -exploiting thread-level parallelism. - -In comparison, device-level optimization focuses on the device as a whole, -aiming to keep all multiprocessors busy by executing enough kernels -concurrently. Both levels of optimization are crucial for achieving maximum -performance. They work together to ensure efficient utilization of the -resources of the GPU, from the individual multiprocessors to the device as a -whole. +On the other hand, device level optimization focuses on the device as a whole, +aiming at keeping all multiprocessors busy by executing enough kernels +concurrently. Both multiprocessor and device levels of optimization are crucial +for achieving maximum performance. They work together to ensure efficient +utilization of the GPU resources, ranging from individual multiprocessors to the +device as a whole. .. _memory optimization: -Memory optimization -=================== +Memory throughput optimization +================================================================================ The first step in maximizing memory throughput is to minimize low-bandwidth -data transfers. This involves reducing data transfers between the host and the -device, as these have lower bandwidth than transfers between global memory and -the device. +data transfers between the host and the device. -Additionally, data transfers between global memory and the device should be -minimized by maximizing the use of on-chip memory: shared memory and caches. 
-Shared memory acts as a user-managed cache, where the application explicitly -allocates and accesses it. A common programming pattern is to stage data from -device memory into shared memory. This involves each thread of a block loading -data from device memory to shared memory, synchronizing with all other threads -of the block, processing the data in shared memory, synchronizing again if -necessary, and writing the results back to device global memory. +Additionally, maximize the use of on-chip memory, that is, shared memory and +caches, and minimize transfers with global memory. Shared memory acts as a +user-managed cache explicitly allocated and accessed by the application. A +common programming pattern is to stage data from device memory into shared +memory. The staging of data from the device to shared memory involves the +following steps: + +1. Each thread of a block loading data from device memory to shared memory. +2. Synchronizing with all other threads of the block. +3. Processing the data stored in shared memory. +4. Synchronizing again if necessary. +5. Writing the results back to the device global memory. For some applications, a traditional hardware-managed cache is more appropriate -to exploit data locality. On devices of certain compute capabilities, the same -on-chip memory is used for both L1 and shared memory, and the amount dedicated -to each is configurable for each kernel call. +for exploiting data locality. -Finally, the throughput of memory accesses by a kernel can vary significantly -depending on the access pattern for each type of memory. Therefore, the next -step in maximizing memory throughput is to organize memory accesses as -optimally as possible. This is especially important for global memory accesses, -as global memory bandwidth is low compared to available on-chip bandwidths and -arithmetic instruction throughput. Thus, non-optimal global memory accesses -generally have a high impact on performance. 
+In conclusion, the throughput of memory accesses by a kernel can vary +significantly depending on the access pattern. Therefore, the next step in +maximizing memory throughput is to organize memory accesses as optimally as +possible. This is especially important for global memory accesses, as global +memory bandwidth is low compared to available on-chip bandwidths and arithmetic +instruction throughput. Thus, non-optimal global memory accesses generally have +a high impact on performance. +The memory throughput optimization techniques are further discussed in detail in +the following sections. -Data Transfer -------------- +.. _data transfer: -Applications should aim to minimize data transfers between the host and the -device. This can be achieved by moving more computations from the host to the -device, even if it means running kernels that do not fully utilize the -parallelism for device. Intermediate data structures can be created, used, -and discarded in device memory without being mapped or copied to host memory. +Data transfer +-------------------------------------------------------------------------------- + +To minimize data transfers between the host and the device, applications should +move more computations from the host to the device, even at the cost of running +kernels that don't fully utilize parallelism for the device. Intermediate data +structures should be created, used, and discarded in device memory without being +mapped or copied to host memory. Batching small transfers into a single large transfer can improve performance due to the overhead associated with each transfer. On systems with a front-side @@ -129,173 +131,185 @@ When using mapped page-locked memory, there is no need to allocate device memory or explicitly copy data between device and host memory. Data transfers occur implicitly each time the kernel accesses the mapped memory. For optimal performance, these memory accesses should be coalesced, similar to global -memory accesses. 
+memory accesses. The process where threads in a warp access sequential memory +locations is known as coalesced memory access, which can enhance memory data +transfer efficiency. -On integrated systems where device and host memory are physically the same, -any copy operation between host and device memory is unnecessary, and mapped -page-locked memory should be used instead. Applications can check if a device -is integrated by querying the integrated device property. +On integrated systems where device and host memory are physically the same, no +copy operation between host and device memory is required and hence mapped +page-locked memory should be used instead. To check if the device is integrated, +applications can query the integrated device property. +.. _device memory access: -Device Memory Access --------------------- +Device memory access +--------------------- -Memory access instructions may be repeated due to the spread of memory +Memory access instructions might be repeated due to the spread of memory addresses across warp threads. The impact on throughput varies with memory type and is generally reduced when addresses are more scattered, especially in global memory. Device memory is accessed via 32-, 64-, or 128-byte transactions that must be -naturally aligned. Maximizing memory throughput involves coalescing memory -accesses of threads within a warp into minimal transactions, following optimal -access patterns, using properly sized and aligned data types, and padding data -when necessary. +naturally aligned. +Maximizing memory throughput involves: -Global memory instructions support reading or writing data of specific sizes -(1, 2, 4, 8, or 16 bytes) that are naturally aligned. If the size and alignment -requirements are not met, it leads to multiple instructions, reducing -performance. 
Therefore, using data types that meet these requirements, ensuring -alignment for structures, and maintaining alignment for all values or arrays is -crucial for correct results and optimal performance. +- Coalescing memory accesses of threads within a warp into minimal transactions. +- Following optimal access patterns. +- Using properly sized and aligned data types. +- Padding data when necessary. + +Global memory instructions support reading or writing data of specific sizes (1, +2, 4, 8, or 16 bytes) that are naturally aligned. Not meeting the size and +alignment requirements leads to multiple instructions, which reduces +performance. Therefore, for correct results and optimal performance: + +- Use data types that meet these requirements. +- Ensure alignment for structures. +- Maintain alignment for all values or arrays. Threads often access 2D arrays at an address calculated as ``BaseAddress + xIndex + width * yIndex``. For efficient memory access, the array and thread block widths should be multiples of the warp size. If the array width is not a multiple of the warp size, it is usually more efficient to -allocate it with a width rounded up to the nearest multiple and pad the rows -accordingly. +allocate the array with a width rounded up to the nearest multiple and pad the +rows accordingly. Local memory is used for certain automatic variables, such as arrays with -non-constant indices, large structures or arrays, and any variable when the +non-constant indices, large structures or arrays, and any variable where the kernel uses more registers than available. Local memory resides in device -memory, leading to high latency and low bandwidth similar to global memory -accesses. However, it is organized for consecutive 32-bit words to be accessed -by consecutive thread IDs, allowing full coalescing when all threads in a warp -access the same relative address. +memory, which leads to high latency and low bandwidth, similar to global memory +accesses.
However, the local memory is organized for consecutive 32-bit words to +be accessed by consecutive thread IDs, which allows full coalescing when all +threads in a warp access the same relative address. -Shared memory, located on-chip, provides higher bandwidth and lower latency -than local or global memory. It is divided into banks that can be -simultaneously accessed, boosting bandwidth. However, bank conflicts, where two -addresses fall in the same bank, lead to serialized access and decreased -throughput. Therefore, understanding how memory addresses map to banks and -scheduling requests to minimize conflicts is crucial for optimal performance. +Shared memory is located on-chip and provides higher bandwidth and lower latency +than local or global memory. It is divided into banks that can be simultaneously +accessed, which boosts bandwidth. However, bank conflicts, where two addresses +fall in the same bank, lead to serialized access and decreased throughput. +Therefore, understanding how memory addresses map to banks and scheduling +requests to minimize conflicts is crucial for optimal performance. -Constant memory is in device memory and cached in the constant cache. Requests -are split based on different memory addresses, affecting throughput, and are -serviced at the throughput of the constant cache for cache hits, or the -throughput of the device memory otherwise. +Constant memory is in the device memory and cached in the constant cache. +Requests are split based on different memory addresses and are serviced based +either on the throughput of the constant cache for cache hits or on the +throughput of the device memory otherwise. This splitting of requests affects +throughput. -Texture and surface memory are stored in device memory and cached in texture -cache. This setup optimizes 2D spatial locality, leading to better performance -for threads reading close 2D addresses. 
Reading device memory through texture -or surface fetching can be advantageous, offering higher bandwidth for local -texture fetches or surface reads, offloading addressing calculations, -allowing data broadcasting, and optional conversion of 8-bit and 16-bit integer -input data to 32-bit floating-point values on-the-fly. +Texture and surface memory are stored in the device memory and cached in the +texture cache. This setup optimizes 2D spatial locality, which leads to better +performance for threads reading close 2D addresses. +Reading device memory through texture or surface fetching provides the following +advantages: + +- Higher bandwidth for local texture fetches or surface reads. +- Offloading addressing calculation. +- Data broadcasting. +- Optional conversion of 8-bit and 16-bit integer input data to 32-bit + floating-point values on the fly. .. _instruction optimization: Optimization for maximum instruction throughput -=============================================== +================================================================================ To maximize instruction throughput: -- minimize low throughput arithmetic instructions -- minimize divergent warps inflicted by control flow instructions -- minimize the number of instruction as possible -- maximize instruction parallelism +- Minimize low throughput arithmetic instructions. +- Minimize divergent warps inflicted by flow control instructions. +- Maximize instruction parallelism. + +These techniques are discussed in detail in the following sections. Arithmetic instructions ------------------------ +-------------------------------------------------------------------------------- The type and complexity of arithmetic operations can significantly impact the performance of your application. We are highlighting some hints how to maximize it. -Using efficient operations: Some arithmetic operations are more costly than -others. 
For example, multiplication is typically faster than division, and -integer operations are usually faster than floating-point operations, -especially with double-precision. +Use efficient operations: Some arithmetic operations are costlier than others. +For example, multiplication is typically faster than division, and integer +operations are usually faster than floating-point operations, especially with +double precision. -Minimizing low-throughput instructions: This might involve trading precision -for speed when it does not affect the final result. For instance, consider -using single-precision arithmetic instead of double-precision. +Minimize low-throughput instructions: This might involve trading precision for +speed when it does not affect the final result. For instance, consider using +single-precision arithmetic instead of double-precision. -Leverage intrinsic functions: Intrinsic functions are pre-defined functions +Leverage intrinsic functions: Intrinsic functions are predefined functions available in HIP that can often be executed faster than equivalent arithmetic operations (subject to some input or accuracy restrictions). They can help optimize performance by replacing more complex arithmetic operations. -Avoiding divergent warps: Divergent warps occur when threads within the same -warp follow different execution paths. This can happen due to conditional -statements that lead to different arithmetic operations being performed by -different threads. Divergent warps can significantly reduce instruction -throughput, so try to structure your code to minimize divergence. +Optimize memory access: The memory access efficiency can impact the speed of +arithmetic operations. See: :ref:`device memory access`. -Optimizing memory access: The efficiency of memory access can impact the speed -of arithmetic operations. 
Coalesced memory access, where threads in a warp -access consecutive memory locations, can improve memory throughput and thus -the speed of arithmetic operations. - -Maximizing instruction parallelism: Some GPU architectures could issue parallel -independent instructions simultaneously, for example integer and floating -point, or two operations with independent inputs and outputs. Mostly this is a -work for compiler, but expressing parallelism in the code explicitly can -improve instructions throughput. +.. _control flow instructions: Control flow instructions -------------------------- +-------------------------------------------------------------------------------- -Flow control instructions (``if``, ``else``, ``for``, ``do``, ``while``, +Control flow instructions (``if``, ``else``, ``for``, ``do``, ``while``, ``break``, ``continue``, ``switch``) can impact instruction throughput by causing threads within a warp to diverge and follow different execution paths. -To optimize performance, control conditions should be written to minimize -divergent warps. For example, when the control condition depends on -(``threadIdx`` / ``warpSize``), no warp diverges. The compiler may optimize -loops or short if or switch blocks using branch predication, preventing warp -divergence. With branch predication, instructions associated with a false -predicate are scheduled but not executed, avoiding unnecessary operations. +To optimize performance, write control conditions to minimize divergent warps. +For example, when the control condition depends on (``threadIdx`` / ``warpSize``), +no warp diverges. The compiler might optimize loops, short ifs, or switch +blocks using branch predication, which prevents warp divergence. With branch +predication, instructions associated with a false predicate are scheduled but +not executed, which avoids unnecessary operations.
+ +Avoiding divergent warps +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Warps diverge when threads within the same warp follow different execution paths. +This is caused by conditional statements that lead to different arithmetic +operations being performed by different threads. Divergent warps can +significantly reduce instruction throughput, so it is advisable to structure +your code to minimize divergence. Synchronization ---------------- +-------------------------------------------------------------------------------- -Synchronization ensures that all threads within a block have completed their +Synchronization ensures that all threads within a block complete their computations and memory accesses before moving forward, which is critical when -threads are dependent on the results of other threads. However, -synchronization can also lead to performance overhead, as it requires threads -to wait, potentially leading to idle GPU resources. +threads depend on the results of other threads. However, synchronization can also cause +performance overhead, as it needs the threads to wait, which might lead to idle +GPU resources. -``__syncthreads()`` is used to synchronize all threads in a block, ensuring -that all threads have reached the same point in the code and that shared memory -is visible to all threads after the point of synchronization. +To synchronize all threads in a block, use :cpp:func:`__syncthreads()`. +:cpp:func:`__syncthreads()` ensures that all threads reach the same point in +the code and can access shared memory after reaching that point. -An alternative way to synchronize is using streams. Different streams can -execute commands out of order with respect to one another or concurrently. This -allows for more fine-grained control over the execution order of commands, -which can be beneficial in certain scenarios. +An alternative way to synchronize is to use streams.
Different streams can +execute commands either without following a specific order or concurrently. This +is why streams allow more fine-grained control over the execution order of +commands, which can be beneficial in certain scenarios. Minimizing memory thrashing -=========================== +================================================================================ -Applications frequently allocating and freeing memory may experience slower -allocation calls over time. This is expected as memory is released back to the -operating system. To optimize performance in such scenarios, consider some -recommendations: +Applications frequently allocating and freeing memory might experience slower +allocation calls over time as memory is released back to the operating system. +To optimize performance in such scenarios, follow these guidelines: -- avoid allocating all available memory with ``hipMalloc`` / ``hipHostMalloc``, - as this immediately reserves memory and can block other applications from - using it. This could strain the operating system schedulers or even prevent - other applications from running on the same GPU. -- aim to allocate memory in suitably sized blocks early in the lifecycle of the - application and deallocate only when the application no longer needs it. - Minimize the number of ``hipMalloc`` and ``hipFree`` calls in your - application, particularly in areas critical to performance. -- if an application is unable to allocate sufficient device memory, consider - resorting to other memory types such as ``hipHostMalloc`` or - ``hipMallocManaged``. While these may not offer the same performance, they - can allow the application to continue running. -- For supported platforms, ``hipMallocManaged`` allows for oversubscription. - With the right memory advise policies, it can maintain most, if not all, of - the performance of ``hipMalloc``. 
``hipMallocManaged`` does not require an - allocation to be resident until it is needed or prefetched, easing the load - on the operating system schedulers and facilitating multi-tenant scenarios. +- Avoid allocating all available memory with :cpp:func:`hipMalloc` or + :cpp:func:`hipHostMalloc`, as this immediately reserves memory and might + prevent other applications from using it. This behavior could strain the + operating system schedulers or prevent other applications from running on the + same GPU. +- Try to allocate memory in suitably sized blocks early in the application's + lifecycle and deallocate only when the application no longer needs it. + Minimize the number of :cpp:func:`hipMalloc` and :cpp:func:`hipFree` calls in + your application, particularly in performance-critical areas. +- Consider resorting to other memory types such as :cpp:func:`hipHostMalloc` or + :cpp:func:`hipMallocManaged`, if an application can't allocate sufficient + device memory. While the other memory types might not offer similar + performance, they allow the application to continue running. +- For supported platforms, use :cpp:func:`hipMallocManaged`, as it allows + oversubscription. With the right policies, :cpp:func:`hipMallocManaged` can + maintain most, if not all, :cpp:func:`hipMalloc` performance. + :cpp:func:`hipMallocManaged` doesn't require an allocation to be resident + until it is needed or prefetched, which eases the load on the operating + system's schedulers and facilitates multitenant scenarios. 
diff --git a/projects/hip/docs/how-to/programming_manual.md b/projects/hip/docs/how-to/programming_manual.md index df6a80261c..bac20c9996 100644 --- a/projects/hip/docs/how-to/programming_manual.md +++ b/projects/hip/docs/how-to/programming_manual.md @@ -1,4 +1,4 @@ -# HIP Programming Manual +# HIP programming manual ## Host Memory @@ -140,13 +140,13 @@ HIP now supports runtime compilation (HIP RTC), the usage of which will provide HIP RTC APIs accept HIP source files in character string format as input parameters and create handles of programs by compiling the HIP source files without spawning separate processes. -For more details on HIP RTC APIs, refer to [HIP Runtime API Reference](https://rocm.docs.amd.com/projects/HIP/en/latest/doxygen/html/index.html). +For more details on HIP RTC APIs, refer to [HIP Runtime API Reference](../doxygen/html/index). For Linux developers, the link [here](https://github.com/ROCm/hip-tests/blob/develop/samples/2_Cookbook/23_cmake_hiprtc/saxpy.cpp) shows an example how to program HIP application using runtime compilation mechanism, and a detailed [HIP RTC programming guide](./hip_rtc) is also available. ## HIP Graph -HIP graph is supported. For more details, refer to the HIP API Guide. +HIP graphs are supported. For more details, refer to the [HIP API Guide](../doxygen/html/group___graph) or the [how-to section for HIP graphs](../how-to/hipgraph). ## Device-Side Malloc diff --git a/projects/hip/docs/how-to/stream_ordered_allocator.rst b/projects/hip/docs/how-to/stream_ordered_allocator.rst new file mode 100644 index 0000000000..0d130a540d --- /dev/null +++ b/projects/hip/docs/how-to/stream_ordered_allocator.rst @@ -0,0 +1,578 @@ +.. 
meta:: + :description: This chapter describes the Stream Ordered Memory Allocator (SOMA) and shows how to use it in AMD HIP. + :keywords: stream, memory allocation, SOMA, stream ordered memory allocator + +******************************************************************************* +Stream Ordered Memory Allocator +******************************************************************************* + +The Stream Ordered Memory Allocator (SOMA) is part of the HIP runtime API. SOMA provides an asynchronous memory allocation mechanism with stream-ordering semantics. You can use SOMA to allocate and free memory in stream order, which ensures that all asynchronous accesses occur between the stream executions of allocation and deallocation. Compliance with stream order prevents use-before-allocation or use-after-free errors, which helps to avoid undefined behavior. + +Advantages of SOMA: + +- Efficient reuse: Enables efficient memory reuse across streams, which reduces unnecessary allocation overhead. +- Fine-grained control: Allows you to set attributes and control caching behavior for memory pools. +- Inter-process sharing: Enables secure sharing of allocations between processes. +- Optimizations: Allows the driver to optimize based on its awareness of SOMA and other stream management APIs. + +Disadvantages of SOMA: + +- Temporal constraints: Requires you to adhere strictly to stream order to avoid errors. +- Complexity: Involves memory management in stream order, which can be intricate. +- Learning curve: Requires you to put in additional effort to understand and utilize SOMA effectively. + +Using SOMA +===================================== + +You can allocate memory using ``hipMallocAsync()`` with stream-ordered +semantics. This restricts the asynchronous access to the memory between the stream executions of the allocation and deallocation. Accessing +memory outside the promised stream order can cause use-before-allocation or use-after-free errors, which lead to undefined behavior. The allocator might reallocate memory as long as the compliant memory accesses won't overlap +temporally. ``hipFreeAsync()`` frees memory from the pool with stream-ordered +semantics. + +Here is how to use stream ordered memory allocation: + +.. tab-set:: + .. 
tab-item:: Stream Ordered Memory Allocation + + .. code-block:: cpp + + #include + #include + + // Kernel to perform some computation on allocated memory. + __global__ void myKernel(int* data, size_t numElements) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < numElements) { + data[tid] = tid * 2; + } + } + + int main() { + // Initialize HIP. + hipInit(0); + + // Stream 0. + constexpr hipStream_t streamId = 0; + + // Allocate memory with stream ordered semantics. + constexpr size_t numElements = 1024; + int* devData; + hipMallocAsync(&devData, numElements * sizeof(*devData), streamId); + + // Launch the kernel to perform computation. + dim3 blockSize(256); + dim3 gridSize((numElements + blockSize.x - 1) / blockSize.x); + myKernel<<>>(devData, numElements); + + // Copy data back to host. + int* hostData = new int[numElements]; + hipMemcpy(hostData, devData, numElements * sizeof(*devData), hipMemcpyDeviceToHost); + + // Print the array. + for (size_t i = 0; i < numElements; ++i) { + std::cout << "Element " << i << ": " << hostData[i] << std::endl; + } + + // Free memory with stream ordered semantics. + hipFreeAsync(devData, streamId); + delete[] hostData; + + // Synchronize to ensure completion. + hipDeviceSynchronize(); + + return 0; + } + + .. tab-item:: Ordinary Allocation + + .. code-block:: cpp + + #include + #include + + // Kernel to perform some computation on allocated memory. + __global__ void myKernel(int* data, size_t numElements) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < numElements) { + data[tid] = tid * 2; + } + } + + int main() { + // Initialize HIP. + hipInit(0); + + // Allocate memory. + constexpr size_t numElements = 1024; + int* devData; + hipMalloc(&devData, numElements * sizeof(*devData)); + + // Launch the kernel to perform computation. + dim3 blockSize(256); + dim3 gridSize((numElements + blockSize.x - 1) / blockSize.x); + myKernel<<>>(devData, numElements); + + // Copy data back to host. 
+ int* hostData = new int[numElements]; + hipMemcpy(hostData, devData, numElements * sizeof(*devData), hipMemcpyDeviceToHost); + + // Print the array. + for (size_t i = 0; i < numElements; ++i) { + std::cout << "Element " << i << ": " << hostData[i] << std::endl; + } + + // Free memory. + hipFree(devData); + delete[] hostData; + + // Synchronize to ensure completion. + hipDeviceSynchronize(); + + return 0; + } + +For more details, see :ref:`stream_ordered_memory_allocator_reference`. + +Memory pools +============ + +Memory pools provide a way to manage memory with stream-ordered behavior while ensuring proper synchronization and avoiding memory access errors. Division of a single memory system into separate pools facilitates querying the access path properties for each partition. Memory pools are used for host memory, device memory, and unified memory. + +Set pools +--------- + +The ``hipMallocAsync()`` function uses the current memory pool and also provides the opportunity to create and access different pools using ``hipMemPoolCreate()`` and ``hipMallocFromPoolAsync()`` functions respectively. + +Unlike NVIDIA CUDA, where stream-ordered memory allocation can be implicit, ROCm HIP is explicit. This requires managing memory allocation for each stream in HIP while ensuring precise control over memory usage and synchronization. + +.. code-block:: cpp + + #include + #include + + // Kernel to perform some computation on allocated memory. + __global__ void myKernel(int* data, size_t numElements) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < numElements) { + data[tid] = tid * 2; + } + } + + int main() { + // Create a stream. + hipStream_t stream; + hipStreamCreate(&stream); + + // Create a memory pool with default properties. 
+ hipMemPoolProps poolProps = {}; + poolProps.allocType = hipMemAllocationTypePinned; + poolProps.handleTypes = hipMemHandleTypePosixFileDescriptor; + poolProps.location.type = hipMemLocationTypeDevice; + poolProps.location.id = 0; // Assuming device 0. + + hipMemPool_t memPool; + hipMemPoolCreate(&memPool, &poolProps); + + // Allocate memory from the pool asynchronously. + constexpr size_t numElements = 1024; + int* devData = nullptr; + hipMallocFromPoolAsync(&devData, numElements * sizeof(*devData), memPool, stream); + + // Define grid and block sizes. + dim3 blockSize(256); + dim3 gridSize((numElements + blockSize.x - 1) / blockSize.x); + + // Launch the kernel to perform computation. + myKernel<<>>(devData, numElements); + + // Synchronize the stream. + hipStreamSynchronize(stream); + + // Copy data back to host. + int* hostData = new int[numElements]; + hipMemcpy(hostData, devData, numElements * sizeof(*devData), hipMemcpyDeviceToHost); + + // Print the array. + for (size_t i = 0; i < numElements; ++i) { + std::cout << "Element " << i << ": " << hostData[i] << std::endl; + } + + // Free the allocated memory. + hipFreeAsync(devData, stream); + + // Synchronize the stream again to ensure all operations are complete. + hipStreamSynchronize(stream); + + // Destroy the memory pool and stream. + hipMemPoolDestroy(memPool); + hipStreamDestroy(stream); + + // Free host memory. + delete[] hostData; + + return 0; + } + +Trim pools +---------- + +The memory allocator allows you to allocate and free memory in stream order. To control memory usage, set the release threshold attribute using ``hipMemPoolAttrReleaseThreshold``. This threshold specifies the amount of reserved memory in bytes to hold onto. + +.. 
code-block:: cpp + + uint64_t threshold = UINT64_MAX; + hipMemPoolSetAttribute(memPool, hipMemPoolAttrReleaseThreshold, &threshold); + +When the amount of memory held in the memory pool exceeds the threshold, the allocator tries to release memory back to the operating system during the next call to stream, event, or context synchronization. + +To improve performance, it is a good practice to adjust the memory pool size using ``hipMemPoolTrimTo()``. It helps to reclaim memory from an excessive memory pool, which optimizes memory usage for your application. + +.. code-block:: cpp + + #include + #include + + int main() { + hipMemPool_t memPool; + hipDevice_t device = 0; // Specify the device index. + + // Initialize the device. + hipSetDevice(device); + + // Get the default memory pool for the device. + hipDeviceGetDefaultMemPool(&memPool, device); + + // Allocate memory from the pool (e.g., 1 MB). + size_t allocSize = 1 * 1024 * 1024; + void* ptr; + hipMalloc(&ptr, allocSize); + + // Free the allocated memory. + hipFree(ptr); + + // Trim the memory pool to a specific size (e.g., 512 KB). + size_t newSize = 512 * 1024; + hipMemPoolTrimTo(memPool, newSize); + + // Clean up. + hipMemPoolDestroy(memPool); + + std::cout << "Memory pool trimmed to " << newSize << " bytes." << std::endl; + return 0; + } + +Resource usage statistics +------------------------- + +Resource usage statistics help in optimization. Here is the list of pool attributes used to query memory usage: + +- ``hipMemPoolAttrReservedMemCurrent``: Returns the total physical GPU memory currently held in the pool. +- ``hipMemPoolAttrUsedMemCurrent``: Returns the total size of all the memory allocated from the pool. +- ``hipMemPoolAttrReservedMemHigh``: Returns the total physical GPU memory held in the pool since the last reset. +- ``hipMemPoolAttrUsedMemHigh``: Returns the total size of all the memory allocated from the pool since the last reset. 
+ +To reset these attributes to the current value, use ``hipMemPoolSetAttribute()``. + +.. code-block:: cpp + + #include + #include + + // Sample helper functions for getting the usage statistics in bulk. + struct usageStatistics { + uint64_t reservedMemCurrent; + uint64_t reservedMemHigh; + uint64_t usedMemCurrent; + uint64_t usedMemHigh; + }; + + void getUsageStatistics(hipMemPool_t memPool, struct usageStatistics *statistics) { + hipMemPoolGetAttribute(memPool, hipMemPoolAttrReservedMemCurrent, &statistics->reservedMemCurrent); + hipMemPoolGetAttribute(memPool, hipMemPoolAttrReservedMemHigh, &statistics->reservedMemHigh); + hipMemPoolGetAttribute(memPool, hipMemPoolAttrUsedMemCurrent, &statistics->usedMemCurrent); + hipMemPoolGetAttribute(memPool, hipMemPoolAttrUsedMemHigh, &statistics->usedMemHigh); + } + + // Resetting the watermarks resets them to the current value. + void resetStatistics(hipMemPool_t memPool) { + uint64_t value = 0; + hipMemPoolSetAttribute(memPool, hipMemPoolAttrReservedMemHigh, &value); + hipMemPoolSetAttribute(memPool, hipMemPoolAttrUsedMemHigh, &value); + } + + int main() { + hipMemPool_t memPool; + hipDevice_t device = 0; // Specify the device index. + + // Initialize the device. + hipSetDevice(device); + + // Get the default memory pool for the device. + hipDeviceGetDefaultMemPool(&memPool, device); + + // Allocate memory from the pool (e.g., 1 MB). + size_t allocSize = 1 * 1024 * 1024; + void* ptr; + hipMalloc(&ptr, allocSize); + + // Free the allocated memory. + hipFree(ptr); + + // Trim the memory pool to a specific size (e.g., 512 KB). + size_t newSize = 512 * 1024; + hipMemPoolTrimTo(memPool, newSize); + + // Get and print usage statistics before resetting. 
+ usageStatistics statsBefore; + getUsageStatistics(memPool, &statsBefore); + std::cout << "Before resetting statistics:" << std::endl; + std::cout << "Reserved Memory Current: " << statsBefore.reservedMemCurrent << " bytes" << std::endl; + std::cout << "Reserved Memory High: " << statsBefore.reservedMemHigh << " bytes" << std::endl; + std::cout << "Used Memory Current: " << statsBefore.usedMemCurrent << " bytes" << std::endl; + std::cout << "Used Memory High: " << statsBefore.usedMemHigh << " bytes" << std::endl; + + // Reset the statistics. + resetStatistics(memPool); + + // Get and print usage statistics after resetting. + usageStatistics statsAfter; + getUsageStatistics(memPool, &statsAfter); + std::cout << "After resetting statistics:" << std::endl; + std::cout << "Reserved Memory Current: " << statsAfter.reservedMemCurrent << " bytes" << std::endl; + std::cout << "Reserved Memory High: " << statsAfter.reservedMemHigh << " bytes" << std::endl; + std::cout << "Used Memory Current: " << statsAfter.usedMemCurrent << " bytes" << std::endl; + std::cout << "Used Memory High: " << statsAfter.usedMemHigh << " bytes" << std::endl; + + // Clean up. + hipMemPoolDestroy(memPool); + + return 0; + } + +Memory reuse policies +--------------------- + +The allocator might reallocate memory as long as the compliant memory accesses will not overlap temporally. To optimize the memory usage, disable or enable the following memory pool reuse policy attribute flags: + +- ``hipMemPoolReuseFollowEventDependencies``: Checks event dependencies before allocating additional GPU memory. +- ``hipMemPoolReuseAllowOpportunistic``: Checks freed allocations to determine if the stream order semantic indicated by the free operation has been met. +- ``hipMemPoolReuseAllowInternalDependencies``: Manages reuse based on internal dependencies in runtime. If the driver fails to allocate and map additional physical memory, it searches for memory waiting for another stream's progress and reuses it.
+ +Device accessibility for multi-GPU support +------------------------------------------ + +Allocations are initially accessible from the device where they reside. + +Interprocess memory handling +============================= + +Interprocess capable (IPC) memory pools facilitate efficient and secure sharing of GPU memory between processes. + +To achieve interprocess memory sharing, you can use either :ref:`device pointer <device-pointer>` or :ref:`shareable handle <shareable-handle>`. Both provide allocator (export) and consumer (import) interfaces. + +.. _device-pointer: + +Device pointer +-------------- + +To export data to share a memory pool pointer directly between processes, use ``hipMemPoolExportPointer()``. It allows you to share a memory allocation with another process. + +.. code-block:: cpp + + #include + #include + #include + #include + + int main() { + // Allocate memory. + void* devPtr; + hipMalloc(&devPtr, sizeof(int)); + + // Export the memory pool pointer. + hipMemPoolPtrExportData exportData; + hipError_t result = hipMemPoolExportPointer(&exportData, devPtr); + if (result != hipSuccess) { + std::cerr << "Error exporting memory pool pointer: " << hipGetErrorString(result) << std::endl; + return 1; + } + + // Create a named pipe (FIFO). + const char* fifoPath = "/tmp/myfifo"; // Change this to a unique path. + mkfifo(fifoPath, 0666); + + // Write the exported data to the named pipe. + std::ofstream fifoStream(fifoPath, std::ios::out | std::ios::binary); + fifoStream.write(reinterpret_cast(&exportData), sizeof(hipMemPoolPtrExportData)); + fifoStream.close(); + + // Clean up. + hipFree(devPtr); + + return 0; + } + +To import a memory pool pointer directly from another process, use ``hipMemPoolImportPointer()``. + +Here is how to read the pool exported in the preceding example: + +.. code-block:: cpp + + #include + #include + #include + + int main() { + // Considering that you have exported the memory pool pointer already. 
+ // Now, let's simulate reading the exported data from a named pipe (FIFO). + const char* fifoPath = "/tmp/myfifo"; // Change this to a unique path. + std::ifstream fifoStream(fifoPath, std::ios::in | std::ios::binary); + + if (!fifoStream.is_open()) { + std::cerr << "Error opening FIFO file: " << fifoPath << std::endl; + return 1; + } + + // Read the exported data. + hipMemPoolPtrExportData importData; + fifoStream.read(reinterpret_cast(&importData), sizeof(hipMemPoolPtrExportData)); + fifoStream.close(); + + if (fifoStream.fail()) { + std::cerr << "Error reading from FIFO file." << std::endl; + return 1; + } + + // Create a memory pool with default properties. + hipMemPoolProps poolProps = {}; + poolProps.allocType = hipMemAllocationTypePinned; + poolProps.handleTypes = hipMemHandleTypePosixFileDescriptor; + poolProps.location.type = hipMemLocationTypeDevice; + poolProps.location.id = 0; // Assuming device 0. + + hipMemPool_t memPool; + hipMemPoolCreate(&memPool, &poolProps); + + // Import the memory pool pointer. + void* importedDevPtr; + hipError_t result = hipMemPoolImportPointer(&importedDevPtr, memPool, &importData); + if (result != hipSuccess) { + std::cerr << "Error imported memory pool pointer: " << hipGetErrorString(result) << std::endl; + return 1; + } + + // Now you can use the importedDevPtr for your computations. + + // Clean up (free the memory). + hipFree(importedDevPtr); + + return 0; + } + +.. _shareable-handle: + +Shareable handle +---------------- + +To export a memory pool pointer to a shareable handle, use ``hipMemPoolExportToShareableHandle()``. This handle could be a file descriptor or a handle obtained from another process. The exported handle contains information about the memory pool, such as size, location, and other relevant details. + +.. code-block:: cpp + + #include + #include + #include + #include + + int main() { + // Create a memory pool with default properties. 
+ hipMemPoolProps poolProps = {}; + poolProps.allocType = hipMemAllocationTypePinned; + poolProps.handleTypes = hipMemHandleTypePosixFileDescriptor; + poolProps.location.type = hipMemLocationTypeDevice; + poolProps.location.id = 0; // Assuming device 0. + + hipMemPool_t memPool; + hipError_t poolResult = hipMemPoolCreate(&memPool, &poolProps); + if (poolResult != hipSuccess) { + std::cerr << "Error creating memory pool: " << hipGetErrorString(poolResult) << std::endl; + return 1; + } + + // Allocate memory from the memory pool. + void* devPtr; + hipMallocFromPoolAsync(&devPtr, sizeof(int), memPool, 0); + + // Export the memory pool pointer. + int descriptor; + hipError_t result = hipMemPoolExportToShareableHandle(&descriptor, memPool, hipMemHandleTypePosixFileDescriptor, 0); + if (result != hipSuccess) { + std::cerr << "Error exporting memory pool pointer: " << hipGetErrorString(result) << std::endl; + return 1; + } + + // Create a named pipe (FIFO). + const char* fifoPath = "/tmp/myfifo"; // Change this to a unique path. + mkfifo(fifoPath, 0666); + + // Write the exported data to the named pipe. + std::ofstream fifoStream(fifoPath, std::ios::out | std::ios::binary); + fifoStream.write(reinterpret_cast(&descriptor), sizeof(int)); + fifoStream.close(); + + // Clean up. + hipFree(devPtr); + hipMemPoolDestroy(memPool); + + return 0; + } + +To import and restore a memory pool pointer from a shareable handle, which could be a file descriptor or a handle obtained from another process, use ``hipMemPoolImportFromShareableHandle()``. The exported shareable handle data contains information about the memory pool, including its size, location, and other relevant details. Importing the handle provides a valid memory pointer to the same memory, which allows you to share memory across different contexts. + +.. code-block:: cpp + + #include + #include + #include + + int main() { + // Considering that you have exported the memory pool pointer already. 
+ // Now, let's simulate reading the exported data from a named pipe (FIFO). + const char* fifoPath = "/tmp/myfifo"; // Change this to a unique path + std::ifstream fifoStream(fifoPath, std::ios::in | std::ios::binary); + + if (!fifoStream.is_open()) { + std::cerr << "Error opening FIFO file: " << fifoPath << std::endl; + return 1; + } + + // Read the exported data. + int descriptor; + fifoStream.read(reinterpret_cast(&descriptor), sizeof(int)); + fifoStream.close(); + + if (fifoStream.fail()) { + std::cerr << "Error reading from FIFO file." << std::endl; + return 1; + } + + // Import the memory pool. + hipMemPool_t memPool; + hipError_t result = hipMemPoolImportFromShareableHandle(&memPool, &descriptor, hipMemHandleTypePosixFileDescriptor, 0); + if (result != hipSuccess) { + std::cerr << "Error importing memory pool: " << hipGetErrorString(result) << std::endl; + return 1; + } + + // Allocate memory from the imported memory pool. + void* importedDevPtr; + hipMallocFromPoolAsync(&importedDevPtr, sizeof(int), memPool, 0); + + // Now you can use the importedDevPtr for your computations. + + // Clean up (free the memory). + hipFree(importedDevPtr); + hipMemPoolDestroy(memPool); + + return 0; + } diff --git a/projects/hip/docs/how-to/unified_memory.rst b/projects/hip/docs/how-to/unified_memory.rst new file mode 100644 index 0000000000..f64189454c --- /dev/null +++ b/projects/hip/docs/how-to/unified_memory.rst @@ -0,0 +1,577 @@ +.. meta:: + :description: This chapter introduces Unified Memory (UM) and shows + how to use it in AMD HIP. + :keywords: AMD, ROCm, HIP, CUDA, unified memory, unified, memory, UM, APU + +******************************************************************************* +Unified memory +******************************************************************************* + +In conventional architectures, CPUs and GPUs have dedicated memory like Random +Access Memory (RAM) and Video Random Access Memory (VRAM), respectively.
This architectural +design, while effective, can be limiting in terms of memory capacity and +bandwidth, as continuous memory copying is required to allow the processors to +access the appropriate data. New architectural features like Heterogeneous +System Architectures (HSA) and Unified Memory (UM) help avoid these limitations +and promise increased efficiency and innovation. + +Unified memory +============== +Unified Memory is a single memory address space accessible from any processor +within a system. This setup simplifies memory management processes and enables +applications to allocate data that can be read or written by code running on +either CPUs or GPUs. The unified memory model is shown in the following figure. + +.. figure:: ../data/unified_memory/um.svg + +An AMD Accelerated Processing Unit (APU) is a typical example of a Unified +Memory Architecture. On a single die, a central processing unit (CPU) is combined +with an integrated graphics processing unit (iGPU), and both have access to a +high-bandwidth memory (HBM) module named Unified Memory. The CPU enables +high-performance, low-latency operations, while the GPU is optimized for high +throughput (data processed per unit time). + +.. _unified memory system requirements: + +System requirements +=================== +Unified memory is supported on Linux by all modern AMD GPUs from the Vega +series onward. Unified memory management can be achieved with managed memory +allocation and, for the latest GPUs, with a system allocator. + +The table below lists the supported allocators. The allocators are described in +the next section. + +.. 
list-table:: Supported Unified Memory Allocators + :widths: 40, 25, 25, 25 + :header-rows: 1 + :align: center + + * - Architecture + - ``hipMallocManaged()`` + - ``__managed__`` + - ``malloc()`` + * - MI200, MI300 Series + - ✅ + - ✅ + - ✅ :sup:`1` + * - MI100 + - ✅ + - ✅ + - ❌ + * - RDNA (Navi) Series + - ✅ + - ✅ + - ❌ + * - GCN5 (Vega) Series + - ✅ + - ✅ + - ❌ + +✅: **Supported** + +❌: **Unsupported** + +:sup:`1` Works only with ``XNACK=1``. The first GPU access causes a +recoverable page fault. For more details, visit +`GPU memory `_. + +.. _unified memory programming models: + +Unified memory programming models +================================= + +This section showcases various unified memory programming models. Model +availability depends on your architecture. For more information, see :ref:`unified memory +system requirements` and :ref:`checking unified memory management support`. + +- **HIP managed memory allocation API**: + + ``hipMallocManaged()`` is a dynamic memory allocator available on + all GPUs with unified memory support. For more details, visit + :ref:`unified_memory_reference`. + +- **HIP managed variables**: + + The ``__managed__`` declaration specifier, which serves as the static + counterpart of ``hipMallocManaged()``, is supported on all modern AMD cards + and can be utilized for static allocation. + +- **System allocation API**: + + Starting with the AMD MI300 series, the ``malloc()`` system allocator allows + you to reserve unified memory. The system allocator is more versatile and + offers an easy transition from CPU-written C++ code to HIP code as the + same system allocation API is used. + +.. _checking unified memory management support: + +Checking unified memory management support +------------------------------------------ +Some device attributes can offer information about which :ref:`unified memory +programming models` are supported. The attribute value is 1 if the +functionality is supported, and 0 if it is not supported. + +.. 
list-table:: Device attributes for unified memory management + :widths: 40, 60 + :header-rows: 1 + :align: center + + * - attribute + - description + * - ``hipDeviceAttributeManagedMemory`` + - unified addressing is supported + * - ``hipDeviceAttributeConcurrentManagedAccess`` + - full managed memory support, concurrent access is supported + * - ``hipDeviceAttributePageableMemoryAccess`` + - both managed and system memory allocation APIs are supported + +The following example shows how to use device attributes: + +.. code-block:: cpp + + #include + #include + + int main() { + int d; + hipGetDevice(&d); + + int is_cma = 0; + hipDeviceGetAttribute(&is_cma, hipDeviceAttributeConcurrentManagedAccess, d); + std::cout << "HIP Managed Memory: " + << (is_cma == 1 ? "is" : "NOT") + << " supported" << std::endl; + return 0; + } + +Example for unified memory management +------------------------------------- + +The following example shows how to use unified memory management with the +``hipMallocManaged()`` function, with the ``__managed__`` attribute for static +allocation, and with standard ``malloc()`` allocation. For comparison, the +Explicit Memory Management example is presented in the last tab. + +.. tab-set:: + + .. tab-item:: hipMallocManaged() + + .. code-block:: cpp + :emphasize-lines: 12-15 + + #include + #include + + // Addition of two values. + __global__ void add(int *a, int *b, int *c) { + *c = *a + *b; + } + + int main() { + int *a, *b, *c; + + // Allocate memory for a, b and c that is accessible to both device and host codes. + hipMallocManaged(&a, sizeof(*a)); + hipMallocManaged(&b, sizeof(*b)); + hipMallocManaged(&c, sizeof(*c)); + + // Setup input values. + *a = 1; + *b = 2; + + // Launch add() kernel on GPU. + hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c); + + // Wait for GPU to finish before accessing on host. + hipDeviceSynchronize(); + + // Prints the result. + std::cout << *a << " + " << *b << " = " << *c << std::endl; + + // Cleanup allocated memory.
+ hipFree(a); + hipFree(b); + hipFree(c); + + return 0; + } + + + .. tab-item:: __managed__ + + .. code-block:: cpp + :emphasize-lines: 9-10 + + #include + #include + + // Addition of two values. + __global__ void add(int *a, int *b, int *c) { + *c = *a + *b; + } + + // Declare a, b and c as static variables. + __managed__ int a, b, c; + + int main() { + // Setup input values. + a = 1; + b = 2; + + // Launch add() kernel on GPU. + hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, &a, &b, &c); + + // Wait for GPU to finish before accessing on host. + hipDeviceSynchronize(); + + // Prints the result. + std::cout << a << " + " << b << " = " << c << std::endl; + + return 0; + } + + + .. tab-item:: malloc() + + .. code-block:: cpp + :emphasize-lines: 12-15 + + #include + #include + + // Addition of two values. + __global__ void add(int* a, int* b, int* c) { + *c = *a + *b; + } + + int main() { + int* a, * b, * c; + + // Allocate memory for a, b, and c. + a = (int*)malloc(sizeof(*a)); + b = (int*)malloc(sizeof(*b)); + c = (int*)malloc(sizeof(*c)); + + // Setup input values. + *a = 1; + *b = 2; + + // Launch add() kernel on GPU. + hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c); + + // Wait for GPU to finish before accessing on host. + hipDeviceSynchronize(); + + // Prints the result. + std::cout << *a << " + " << *b << " = " << *c << std::endl; + + // Cleanup allocated memory. + free(a); + free(b); + free(c); + + return 0; + } + + + .. tab-item:: Explicit Memory Management + + .. code-block:: cpp + :emphasize-lines: 17-24, 29-30 + + #include + #include + + // Addition of two values. + __global__ void add(int *a, int *b, int *c) { + *c = *a + *b; + } + + int main() { + int a, b, c; + int *d_a, *d_b, *d_c; + + // Setup input values. + a = 1; + b = 2; + + // Allocate device copies of a, b and c. + hipMalloc(&d_a, sizeof(*d_a)); + hipMalloc(&d_b, sizeof(*d_b)); + hipMalloc(&d_c, sizeof(*d_c)); + + // Copy input values to device. 
+ hipMemcpy(d_a, &a, sizeof(*d_a), hipMemcpyHostToDevice); + hipMemcpy(d_b, &b, sizeof(*d_b), hipMemcpyHostToDevice); + + // Launch add() kernel on GPU. + hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, d_a, d_b, d_c); + + // Copy the result back to the host. + hipMemcpy(&c, d_c, sizeof(*d_c), hipMemcpyDeviceToHost); + + // Cleanup allocated memory. + hipFree(d_a); + hipFree(d_b); + hipFree(d_c); + + // Prints the result. + std::cout << a << " + " << b << " = " << c << std::endl; + + return 0; + } + +.. _using unified memory management: + +Using unified memory management (UMM) +===================================== + +Unified memory management (UMM) is a feature that can simplify the complexities +of memory management in GPU computing. It is particularly useful in +heterogeneous computing environments with heavy memory usage with both a CPU +and a GPU, which would require large memory transfers. Here are some areas +where UMM can be beneficial: + +- **Simplification of Memory Management**: + + UMM can help to simplify the complexities of memory management. This can make + it easier for developers to write code without worrying about memory + allocation and deallocation details. + +- **Data Migration**: + + UMM allows for efficient data migration between the host (CPU) and the device + (GPU). This can be particularly useful for applications that need to move + data back and forth between the device and host. + +- **Improved Programming Productivity**: + + As a positive side effect, UMM can reduce the lines of code, thereby + improving programming productivity. + +In HIP, pinned memory allocations are coherent by default. Pinned memory is +host memory mapped into the address space of all GPUs, meaning that the pointer +can be used on both host and device. Using pinned memory instead of pageable +memory on the host can improve bandwidth. + +While UMM can provide numerous benefits, it's important to be aware of the +potential performance overhead associated with UMM. 
You must thoroughly test +and profile your code to ensure it's the most suitable choice for your use +case. + +.. _unified memory runtime hints: + +Unified memory HIP runtime hints for better performance +======================================================= + +Unified memory HIP runtime hints can help improve the performance of your code if +you know how your code behaves and what infrastructure it runs on. Some hint techniques are +presented in this section. + +The hint functions can set actions on a selected device, which can be +identified by ``hipGetDeviceProperties(&prop, device_id)``. There are two +special ``device_id`` values: + +- ``hipCpuDeviceId`` = -1 means that the advised device is the CPU. +- ``hipInvalidDeviceId`` = -2 means that the device is invalid. + +For the best performance, profile your application to optimize the +utilization of HIP runtime hints. + +Data prefetching +---------------- + +Data prefetching is a technique used to improve the performance of your +application by moving data closer to the processing unit before it's actually +needed. + +.. code-block:: cpp + :emphasize-lines: 20-23,31-32 + + // Addition of two values. + __global__ void add(int *a, int *b, int *c) { + *c = *a + *b; + } + + int main() { + int *a, *b, *c; + int deviceId; + hipGetDevice(&deviceId); // Get the current device ID + + // Allocate memory for a, b and c that is accessible to both device and host codes. + hipMallocManaged(&a, sizeof(*a)); + hipMallocManaged(&b, sizeof(*b)); + hipMallocManaged(&c, sizeof(*c)); + + // Setup input values. + *a = 1; + *b = 2; + + // Prefetch the data to the GPU device. + hipMemPrefetchAsync(a, sizeof(*a), deviceId, 0); + hipMemPrefetchAsync(b, sizeof(*b), deviceId, 0); + hipMemPrefetchAsync(c, sizeof(*c), deviceId, 0); + + // Launch add() kernel on GPU. + hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c); + + // Wait for GPU to finish before accessing on host. + hipDeviceSynchronize(); + + // Prefetch the result back to the CPU.
+ hipMemPrefetchAsync(c, sizeof(*c), hipCpuDeviceId, 0); + + // Wait for the prefetch operations to complete. + hipDeviceSynchronize(); + + // Prints the result. + std::cout << *a << " + " << *b << " = " << *c << std::endl; + + // Cleanup allocated memory. + hipFree(a); + hipFree(b); + hipFree(c); + + return 0; + } + +Remember to check the return status of ``hipMemPrefetchAsync()`` to ensure that +the prefetch operations are completed successfully. + +Memory advice +------------- + +The effectiveness of ``hipMemAdvise()`` comes from its ability to inform the +runtime system of the developer's intentions regarding memory usage. When the +runtime system has knowledge of the expected memory access patterns, it can +make better decisions about data placement and caching, leading to more +efficient execution of the application. However, the actual impact on +performance can vary based on the specific use case and the hardware +architecture. + +For the description of ``hipMemAdvise()`` and the detailed list of advice, +visit the :ref:`unified_memory_reference`. + +Here is the updated version of the example above with memory advice. + +.. code-block:: cpp + :emphasize-lines: 17-26 + + #include + #include + + // Addition of two values. + __global__ void add(int *a, int *b, int *c) { + *c = *a + *b; + } + + int main() { + int *a, *b, *c; + + // Allocate memory for a, b, and c accessible to both device and host codes. + hipMallocManaged(&a, sizeof(*a)); + hipMallocManaged(&b, sizeof(*b)); + hipMallocManaged(&c, sizeof(*c)); + + // Set memory advice for a, b, and c to be accessed by the CPU. + hipMemAdvise(a, sizeof(*a), hipMemAdviseSetPreferredLocation, hipCpuDeviceId); + hipMemAdvise(b, sizeof(*b), hipMemAdviseSetPreferredLocation, hipCpuDeviceId); + hipMemAdvise(c, sizeof(*c), hipMemAdviseSetPreferredLocation, hipCpuDeviceId); + + // Additionally, set memory advice for a, b, and c to be read mostly from the device 0. 
+ constexpr int device = 0; + hipMemAdvise(a, sizeof(*a), hipMemAdviseSetReadMostly, device); + hipMemAdvise(b, sizeof(*b), hipMemAdviseSetReadMostly, device); + hipMemAdvise(c, sizeof(*c), hipMemAdviseSetReadMostly, device); + + // Setup input values. + *a = 1; + *b = 2; + + // Launch add() kernel on GPU. + hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c); + + // Wait for GPU to finish before accessing on host. + hipDeviceSynchronize(); + + // Prints the result. + std::cout << *a << " + " << *b << " = " << *c << std::endl; + + // Cleanup allocated memory. + hipFree(a); + hipFree(b); + hipFree(c); + + return 0; + } + + +Memory range attributes +----------------------- + +Memory Range attributes allow you to query attributes of a given memory range. + +The ``hipMemRangeGetAttribute()`` is added to the example to query the +``hipMemRangeAttributeReadMostly`` attribute of the memory range pointed to by +``a``. The result is stored in ``attributeValue`` and then printed out. + +For more details, visit the +:ref:`unified_memory_reference`. + +.. code-block:: cpp + :emphasize-lines: 29-34 + + #include + #include + + // Addition of two values. + __global__ void add(int *a, int *b, int *c) { + *c = *a + *b; + } + + int main() { + int *a, *b, *c; + unsigned int attributeValue; + constexpr size_t attributeSize = sizeof(attributeValue); + + // Allocate memory for a, b and c that is accessible to both device and host codes. + hipMallocManaged(&a, sizeof(*a)); + hipMallocManaged(&b, sizeof(*b)); + hipMallocManaged(&c, sizeof(*c)); + + // Setup input values. + *a = 1; + *b = 2; + + // Launch add() kernel on GPU. + hipLaunchKernelGGL(add, dim3(1), dim3(1), 0, 0, a, b, c); + + // Wait for GPU to finish before accessing on host. + hipDeviceSynchronize(); + + // Query an attribute of the memory range. + hipMemRangeGetAttribute(&attributeValue, + attributeSize, + hipMemRangeAttributeReadMostly, + a, + sizeof(*a)); + + // Prints the result. 
+ std::cout << *a << " + " << *b << " = " << *c << std::endl; + std::cout << "The queried attribute value is: " << attributeValue << std::endl; + + // Cleanup allocated memory. + hipFree(a); + hipFree(b); + hipFree(c); + + return 0; + } + +Asynchronously attach memory to a stream +---------------------------------------- + +The ``hipStreamAttachMemAsync`` function is intended to asynchronously attach memory to a stream, which can help concurrent execution when using streams. + +Currently, this function is a no-operation (NOP) function on AMD GPUs. It simply returns success after the runtime memory validation passes. The function would be necessary on Microsoft Windows, but UMM is currently not supported on that operating system with AMD GPUs. diff --git a/projects/hip/docs/how-to/virtual_memory.rst b/projects/hip/docs/how-to/virtual_memory.rst new file mode 100644 index 0000000000..3e56bfb4fe --- /dev/null +++ b/projects/hip/docs/how-to/virtual_memory.rst @@ -0,0 +1,94 @@ +.. meta:: + :description: This chapter introduces Virtual Memory (VM) and shows + how to use it in AMD HIP. + :keywords: AMD, ROCm, HIP, CUDA, virtual memory, virtual, memory, UM, APU + +.. _virtual_memory: + +***************************** +Virtual memory management +***************************** + +Memory management is important when creating high-performance applications in the HIP ecosystem. Both allocating and copying memory can result in bottlenecks, which can significantly impact performance. + +Global memory allocation in HIP uses the C language style allocation function. This works fine for simple cases but can cause problems if your memory needs change. If you need to increase the size of your memory, you must allocate a second larger buffer and copy the data to it before you can free the original buffer. This increases overall memory usage and causes unnecessary ``memcpy`` calls. Another solution is to allocate a larger buffer than you initially need.
However, this isn't an efficient way to handle resources and doesn't solve the issue of reallocation when the extra buffer runs out. + +Virtual memory management solves these memory management problems. It helps to reduce memory usage and unnecessary ``memcpy`` calls. + +.. _memory_allocation_virtual_memory: + +Memory allocation +================= + +Standard memory allocation uses the ``hipMalloc`` function to allocate a block of memory on the device. However, when using virtual memory, this process is separated into multiple steps using the ``hipMemCreate``, ``hipMemAddressReserve``, ``hipMemMap``, and ``hipMemSetAccess`` functions. This guide explains what these functions do and how you can use them for virtual memory management. + +Allocate physical memory +------------------------ + +The first step is to allocate the physical memory itself with the ``hipMemCreate`` function. This function accepts the size of the buffer, an ``unsigned long long`` variable for the flags, and a ``hipMemAllocationProp`` variable. ``hipMemAllocationProp`` contains the properties of the memory to be allocated, such as where the memory is physically located and what kind of shareable handles are available. If the allocation is successful, the function returns a value of ``hipSuccess``, with ``hipMemGenericAllocationHandle_t`` representing a valid physical memory allocation. The allocated memory size must be aligned with the granularity appropriate for the properties of the allocation. You can use the ``hipMemGetAllocationGranularity`` function to determine the correct granularity. + +.. 
code-block:: cpp + + size_t granularity = 0; + hipMemGenericAllocationHandle_t allocHandle; + hipMemAllocationProp prop = {}; + prop.type = HIP_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = HIP_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = currentDev; + hipMemGetAllocationGranularity(&granularity, &prop, HIP_MEM_ALLOC_GRANULARITY_MINIMUM); + padded_size = ROUND_UP(size, granularity); + hipMemCreate(&allocHandle, padded_size, &prop, 0); + +Reserve virtual address range +----------------------------- + +After you have acquired an allocation of physical memory, you must map it before you can use it. To do so, you need a virtual address to map it to. Mapping means the physical memory allocation is available from the virtual address range it is mapped to. To reserve a virtual memory range, use the ``hipMemAddressReserve`` function. The size of the virtual memory must match the amount of physical memory previously allocated. You can then map the physical memory allocation to the newly-acquired virtual memory address range using the ``hipMemMap`` function. + +.. code-block:: cpp + + hipMemAddressReserve(&ptr, padded_size, 0, 0, 0); + hipMemMap(ptr, padded_size, 0, allocHandle, 0); + +Set memory access +----------------- + +Finally, use the ``hipMemSetAccess`` function to enable memory access. It accepts the pointer to the virtual memory, the size, and a ``hipMemAccessDesc`` descriptor as parameters. In a multi-GPU environment, you can map the device memory of one GPU to another. This feature also works with the traditional memory management system, but isn't as scalable as with virtual memory. When memory is allocated with ``hipMalloc``, ``hipDeviceEnablePeerAccess`` is used to enable peer access. This function enables access between two devices, but it means that every call to ``hipMalloc`` takes more time to perform the checks and the mapping between the devices. 
When using virtual memory management, peer access is enabled by ``hipMemSetAccess``, which provides a finer level of control over what is shared. This has no performance impact on memory allocation and gives you more control over what memory buffers are shared with which devices. + +.. code-block:: cpp + + hipMemAccessDesc accessDesc = {}; + accessDesc.location.type = HIP_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = currentDev; + accessDesc.flags = HIP_MEM_ACCESS_FLAGS_PROT_READWRITE; + hipMemSetAccess(ptr, padded_size, &accessDesc, 1); + +At this point the memory is allocated, mapped, and ready for use. You can read and write to it, just like you would a C style memory allocation. + +Free virtual memory +------------------- + +To free the memory allocated in this manner, use the corresponding free functions. To unmap the memory, use ``hipMemUnmap``. To release the virtual address range, use ``hipMemAddressFree``. Finally, to release the physical memory, use ``hipMemRelease``. A side effect of these functions is the lack of synchronization when memory is released. If you call ``hipFree`` when you have multiple streams running in parallel, it synchronizes the device. This causes worse resource usage and performance. + +.. code-block:: cpp + + hipMemUnmap(ptr, size); + hipMemRelease(allocHandle); + hipMemAddressFree(ptr, size); + +.. _usage_virtual_memory: + +Memory usage +============ + +Dynamically increase allocation size +------------------------------------ + +The ``hipMemAddressReserve`` function allows you to increase the amount of pre-allocated memory. This function accepts a parameter representing the requested starting address of the virtual memory. This allows you to have a continuous virtual address space without worrying about the underlying physical allocation. + +.. 
code-block:: cpp + + hipMemAddressReserve(&new_ptr, (new_size - padded_size), 0, ptr + padded_size, 0); + hipMemMap(new_ptr, (new_size - padded_size), 0, newAllocHandle, 0); + hipMemSetAccess(new_ptr, (new_size - padded_size), &accessDesc, 1); + +The code sample above assumes that ``hipMemAddressReserve`` was able to reserve the memory address at the specified location. However, this isn't guaranteed to be true, so you should validate that ``new_ptr`` points to the requested virtual address before using it. diff --git a/projects/hip/docs/index.md b/projects/hip/docs/index.md index 7f418b2885..54fa9cc411 100644 --- a/projects/hip/docs/index.md +++ b/projects/hip/docs/index.md @@ -11,7 +11,7 @@ For HIP supported AMD GPUs on multiple operating systems, see: The CUDA enabled NVIDIA GPUs are supported by HIP. For more information, see [GPU Compute Capability](https://developer.nvidia.com/cuda-gpus). -On the AMD ROCm platform, HIP provides header files and runtime library built on top of HIP-Clang compiler in the repository [Common Language Runtime (CLR)](./understand/amd_clr), which contains source codes for AMD's compute languages runtimes as follows, +On the AMD ROCm platform, HIP provides header files and runtime library built on top of HIP-Clang compiler in the repository [Common Language Runtimes (CLR)](./understand/amd_clr), which contains source codes for AMD's compute languages runtimes as follows, On non-AMD platforms, like NVIDIA, HIP provides header files required to support non-AMD specific back-end implementation in the repository ['hipother'](https://github.com/ROCm/hipother), which translates from the HIP runtime APIs to CUDA runtime APIs.
@@ -30,31 +30,40 @@ On non-AMD platforms, like NVIDIA, HIP provides header files required to support :::{grid-item-card} Conceptual * {doc}`./understand/programming_model` -* {doc}`./understand/programming_model_reference` * {doc}`./understand/hardware_implementation` * {doc}`./understand/amd_clr` +* {doc}`./understand/texture_fetching` ::: :::{grid-item-card} How to -* [Programming Manual](./how-to/programming_manual) -* [HIP Porting Guide](./how-to/hip_porting_guide) -* [HIP Porting: Driver API Guide](./how-to/hip_porting_driver_api) +* [Programming manual](./how-to/programming_manual) +* [HIP porting guide](./how-to/hip_porting_guide) +* [HIP porting: driver API guide](./how-to/hip_porting_driver_api) * {doc}`./how-to/hip_rtc` * {doc}`./how-to/performance_guidelines` * [Debugging with HIP](./how-to/debugging) * {doc}`./how-to/logging` +* [Unified memory](./how-to/unified_memory) +* [Virtual memory](./how-to/virtual_memory) +* {doc}`./how-to/stream_ordered_allocator` +* [Cooperative groups](./how-to/cooperative_groups) +* [HIP graphs](./how-to/hipgraph) * {doc}`./how-to/faq` ::: :::{grid-item-card} Reference -* {doc}`/doxygen/html/index` -* [C++ language extensions](./reference/kernel_language) -* [Comparing Syntax for different APIs](./reference/terms) -* [HSA Runtime API for ROCm](./reference/virtual_rocr) +* [HIP runtime API](./reference/hip_runtime_api_reference) + * [Modules](./reference/hip_runtime_api/modules) + * [Global defines, enums, structs and files](./reference/hip_runtime_api/global_defines_enums_structs_files) +* [HSA runtime API for ROCm](./reference/virtual_rocr) +* [C++ language extensions](./reference/cpp_language_extensions) +* [C++ language support](./reference/cpp_language_support) +* [HIP math API](./reference/math_api) +* [Comparing syntax for different APIs](./reference/terms) * [List of deprecated APIs](./reference/deprecated_api_list) * [FP8 numbers in HIP](./reference/fp8_numbers) @@ -62,8 +71,12 @@ On non-AMD platforms, like NVIDIA, 
HIP provides header files required to support :::{grid-item-card} Tutorial +* [HIP basic examples](https://github.com/ROCm/rocm-examples/tree/develop/HIP-Basic) * [HIP examples](https://github.com/ROCm/HIP-Examples) * [HIP test samples](https://github.com/ROCm/hip-tests/tree/develop/samples) +* [SAXPY tutorial](./tutorial/saxpy) +* [Reduction tutorial](./tutorial/reduction) +* [Cooperative groups tutorial](./tutorial/cooperative_groups_tutorial) ::: diff --git a/projects/hip/docs/reference/kernel_language.rst b/projects/hip/docs/reference/cpp_language_extensions.rst similarity index 57% rename from projects/hip/docs/reference/kernel_language.rst rename to projects/hip/docs/reference/cpp_language_extensions.rst index 9c69b9f019..c0b804c552 100644 --- a/projects/hip/docs/reference/kernel_language.rst +++ b/projects/hip/docs/reference/cpp_language_extensions.rst @@ -5,7 +5,7 @@ :keywords: AMD, ROCm, HIP, CUDA, c++ language extensions, HIP functions ******************************************************************************** -C++ Language Extensions +C++ language extensions ******************************************************************************** HIP provides a C++ syntax that is suitable for compiling most code that commonly appears in @@ -51,8 +51,7 @@ Supported ``__global__`` functions are: * Run on the device * Called (launched) from the host -HIP ``__global__`` functions must have a ``void`` return type. The first parameter in a HIP ``__global__`` -function must have the type ``hipLaunchParm``. Refer to :ref:`kernel-launch-example` to see usage. +HIP ``__global__`` functions must have a ``void`` return type. HIP doesn't support dynamic-parallelism, which means that you can't call ``__global__`` functions from the device. @@ -105,7 +104,7 @@ You can include your kernel arguments after these parameters. .. 
code-block:: cpp // Example hipLaunchKernelGGL pseudocode: - __global__ MyKernel(hipLaunchParm lp, float *A, float *B, float *C, size_t N) + __global__ void MyKernel(float *A, float *B, float *C, size_t N) { ... } @@ -132,30 +131,31 @@ Kernel launch example // Example showing device function, __device__ __host__ // <- compile for both device and host - float PlusOne(float x) + #include <hip/hip_runtime.h> + // Example showing device function, __device__ __host__ + __host__ __device__ float PlusOne(float x) // <- compile for both device and host { return x + 1.0; } - __global__ - void - MyKernel (hipLaunchParm lp, /*lp parm for execution configuration */ - const float *a, const float *b, float *c, unsigned N) + __global__ void MyKernel (const float *a, const float *b, float *c, unsigned N) { - unsigned gid = threadIdx.x; // <- coordinate index function + const int gid = threadIdx.x + blockIdx.x * blockDim.x; // <- coordinate index function if (gid < N) { c[gid] = a[gid] + PlusOne(b[gid]); } } + void callMyKernel() { float *a, *b, *c; // initialization not shown... unsigned N = 1000000; const unsigned blockSize = 256; + const int gridSize = (N + blockSize - 1)/blockSize; - MyKernel<<>> (a,b,c,n); + MyKernel<<<dim3(gridSize), dim3(blockSize), 0, 0>>> (a,b,c,N); // Alternatively, kernel can be launched by - // hipLaunchKernelGGL(MyKernel, dim3(N/blockSize), dim3(blockSize), 0, 0, a,b,c,N); + // hipLaunchKernelGGL(MyKernel, dim3(gridSize), dim3(blockSize), 0, 0, a,b,c,N); } Variable type qualifiers @@ -297,8 +297,7 @@ dimensions to 1. Memory fence instructions ==================================================== -HIP supports ``__threadfence()`` and ``__threadfence_block()``. If you're using ``threadfence_system()`` in -the HIP-Clang path, you can use the following workaround: +HIP supports ``__threadfence()`` and ``__threadfence_block()``. If you're using ``__threadfence_system()`` in the HIP-Clang path, you can use the following workaround: #. 
Build HIP with the ``HIP_COHERENT_HOST_ALLOC`` environment variable enabled. #. 
Modify kernels that use ``__threadfence_system()`` as follows: @@ -311,1124 +310,20 @@ the HIP-Clang path, you can use the following workaround: Synchronization functions ==================================================== + +Synchronization functions cause all threads in the group to wait at this synchronization point, and for all shared and global memory accesses by the threads to complete, before any thread continues execution. This guarantees the visibility of accessed data for all threads in the group. + The ``__syncthreads()`` built-in function is supported in HIP. The ``__syncthreads_count(int)``, ``__syncthreads_and(int)``, and ``__syncthreads_or(int)`` functions are under development. +The Cooperative Groups API offers options to synchronize a developer-defined set of thread groups. For further information, check :ref:`Cooperative Groups API ` or :ref:`Cooperative Groups how to `. + Math functions ==================================================== -HIP-Clang supports a set of math operations that are callable from the device. HIP supports most of the device functions supported by CUDA. -These are described in the following sections. - -Single precision mathematical functions --------------------------------------------------------------------------------------------- - -Following is the list of supported single precision mathematical functions. - -.. list-table:: Single precision mathematical functions - - * - **Function** - - **Supported on Host** - - **Supported on Device** - - * - | ``float abs(float x)`` - | Returns the absolute value of :math:`x` - - ✓ - - ✓ - - * - | ``float acosf(float x)`` - | Returns the arc cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``float acoshf(float x)`` - | Returns the nonnegative arc hyperbolic cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``float asinf(float x)`` - | Returns the arc sine of :math:`x`. - - ✓ - - ✓ - - * - | ``float asinhf(float x)`` - | Returns the arc hyperbolic sine of :math:`x`. 
- - ✓ - - ✓ - - * - | ``float atanf(float x)`` - | Returns the arc tangent of :math:`x`. - - ✓ - - ✓ - - * - | ``float atan2f(float x, float y)`` - | Returns the arc tangent of the ratio of :math:`x` and :math:`y`. - - ✓ - - ✓ - - * - | ``float atanhf(float x)`` - | Returns the arc hyperbolic tangent of :math:`x`. - - ✓ - - ✓ - - * - | ``float cbrtf(float x)`` - | Returns the cube root of :math:`x`. - - ✓ - - ✓ - - * - | ``float ceilf(float x)`` - | Returns ceiling of :math:`x`. - - ✓ - - ✓ - - * - | ``float copysignf(float x, float y)`` - | Create value with given magnitude, copying sign of second value. - - ✓ - - ✓ - - * - | ``float cosf(float x)`` - | Returns the cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``float coshf(float x)`` - | Returns the hyperbolic cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``float cospif(float x)`` - | Returns the cosine of :math:`\pi \cdot x`. - - ✓ - - ✓ - - * - | ``float cyl_bessel_i0f(float x)`` - | Returns the value of the regular modified cylindrical Bessel function of order 0 for :math:`x`. - - ✗ - - ✗ - - * - | ``float cyl_bessel_i1f(float x)`` - | Returns the value of the regular modified cylindrical Bessel function of order 1 for :math:`x`. - - ✗ - - ✗ - - * - | ``float erff(float x)`` - | Returns the error function of :math:`x`. - - ✓ - - ✓ - - * - | ``float erfcf(float x)`` - | Returns the complementary error function of :math:`x`. - - ✓ - - ✓ - - * - | ``float erfcinvf(float x)`` - | Returns the inverse complementary function of :math:`x`. - - ✓ - - ✓ - - * - | ``float erfcxf(float x)`` - | Returns the scaled complementary error function of :math:`x`. - - ✓ - - ✓ - - * - | ``float erfinvf(float x)`` - | Returns the inverse error function of :math:`x`. - - ✓ - - ✓ - - * - | ``float expf(float x)`` - | Returns :math:`e^x`. - - ✓ - - ✓ - - * - | ``float exp10f(float x)`` - | Returns :math:`10^x`. - - ✓ - - ✓ - - * - | ``float exp2f( float x)`` - | Returns :math:`2^x`. 
- - ✓ - - ✓ - - * - | ``float expm1f(float x)`` - | Returns :math:`ln(x - 1)` - - ✓ - - ✓ - - * - | ``float fabsf(float x)`` - | Returns the absolute value of `x` - - ✓ - - ✓ - - * - | ``float fdimf(float x, float y)`` - | Returns the positive difference between :math:`x` and :math:`y`. - - ✓ - - ✓ - - * - | ``float fdividef(float x, float y)`` - | Divide two floating point values. - - ✓ - - ✓ - - * - | ``float floorf(float x)`` - | Returns the largest integer less than or equal to :math:`x`. - - ✓ - - ✓ - - * - | ``float fmaf(float x, float y, float z)`` - | Returns :math:`x \cdot y + z` as a single operation. - - ✓ - - ✓ - - * - | ``float fmaxf(float x, float y)`` - | Determine the maximum numeric value of :math:`x` and :math:`y`. - - ✓ - - ✓ - - * - | ``float fminf(float x, float y)`` - | Determine the minimum numeric value of :math:`x` and :math:`y`. - - ✓ - - ✓ - - * - | ``float fmodf(float x, float y)`` - | Returns the floating-point remainder of :math:`x / y`. - - ✓ - - ✓ - - * - | ``float modff(float x, float* iptr)`` - | Break down :math:`x` into fractional and integral parts. - - ✓ - - ✗ - - * - | ``float frexpf(float x, int* nptr)`` - | Extract mantissa and exponent of :math:`x`. - - ✓ - - ✗ - - * - | ``float hypotf(float x, float y)`` - | Returns the square root of the sum of squares of :math:`x` and :math:`y`. - - ✓ - - ✓ - - * - | ``int ilogbf(float x)`` - | Returns the unbiased integer exponent of :math:`x`. - - ✓ - - ✓ - - * - | ``bool isfinite(float x)`` - | Determine whether :math:`x` is finite. - - ✓ - - ✓ - - * - | ``bool isinf(float x)`` - | Determine whether :math:`x` is infinite. - - ✓ - - ✓ - - * - | ``bool isnan(float x)`` - | Determine whether :math:`x` is a ``NAN``. - - ✓ - - ✓ - - * - | ``float j0f(float x)`` - | Returns the value of the Bessel function of the first kind of order 0 for :math:`x`. - - ✓ - - ✓ - - * - | ``float j1f(float x)`` - | Returns the value of the Bessel function of the first kind of order 1 for :math:`x`. 
- - ✓ - - ✓ - - * - | ``float jnf(int n, float x)`` - | Returns the value of the Bessel function of the first kind of order n for :math:`x`. - - ✓ - - ✓ - - * - | ``float ldexpf(float x, int exp)`` - | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`. - - ✓ - - ✓ - - * - | ``float lgammaf(float x)`` - | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`. - - ✓ - - ✗ - - * - | ``long int lrintf(float x)`` - | Round :math:`x` to nearest integer value. - - ✓ - - ✓ - - * - | ``long long int llrintf(float x)`` - | Round :math:`x` to nearest integer value. - - ✓ - - ✓ - - * - | ``long int lroundf(float x)`` - | Round to nearest integer value. - - ✓ - - ✓ - - * - | ``long long int llroundf(float x)`` - | Round to nearest integer value. - - ✓ - - ✓ - - * - | ``float log10f(float x)`` - | Returns the base 10 logarithm of :math:`x`. - - ✓ - - ✓ - - * - | ``float log1pf(float x)`` - | Returns the natural logarithm of :math:`x + 1`. - - ✓ - - ✓ - - * - | ``float log2f(float x)`` - | Returns the base 2 logarithm of :math:`x`. - - ✓ - - ✓ - - * - | ``float logf(float x)`` - | Returns the natural logarithm of :math:`x`. - - ✓ - - ✓ - - * - | ``float logbf(float x)`` - | Returns the floating point representation of the exponent of :math:`x`. - - ✓ - - ✓ - - * - | ``float nanf(const char* tagp)`` - | Returns "Not a Number" value. - - ✗ - - ✓ - - * - | ``float nearbyintf(float x)`` - | Round :math:`x` to the nearest integer. - - ✓ - - ✓ - - * - | ``float nextafterf(float x, float y)`` - | Returns next representable single-precision floating-point value after argument. - - ✓ - - ✗ - - * - | ``float norm3df(float x, float y, float z)`` - | Returns the square root of the sum of squares of :math:`x`, :math:`y` and :math:`z`. - - ✓ - - ✓ - - * - | ``float norm4df(float x, float y, float z, float w)`` - | Returns the square root of the sum of squares of :math:`x`, :math:`y`, :math:`z` and :math:`w`. 
- - ✓ - - ✓ - - * - | ``float normcdff(float y)`` - | Returns the standard normal cumulative distribution function. - - ✓ - - ✓ - - * - | ``float normcdfinvf(float y)`` - | Returns the inverse of the standard normal cumulative distribution function. - - ✓ - - ✓ - - * - | ``float normf(int dim, const float *a)`` - | Returns the square root of the sum of squares of any number of coordinates. - - ✓ - - ✓ - - * - | ``float powf(float x, float y)`` - | Returns :math:`x^y`. - - ✓ - - ✓ - - * - | ``float powif(float base, int iexp)`` - | Returns the value of first argument to the power of second argument. - - ✓ - - ✓ - - * - | ``float remainderf(float x, float y)`` - | Returns single-precision floating-point remainder. - - ✓ - - ✓ - - * - | ``float remquof(float x, float y, int* quo)`` - | Returns single-precision floating-point remainder and part of quotient. - - ✓ - - ✓ - - * - | ``float roundf(float x)`` - | Round to nearest integer value in floating-point. - - ✓ - - ✓ - - * - | ``float rcbrtf(float x)`` - | Returns the reciprocal cube root function. - - ✓ - - ✓ - - * - | ``float rhypotf(float x, float y)`` - | Returns one over the square root of the sum of squares of two arguments. - - ✓ - - ✓ - - * - | ``float rintf(float x)`` - | Round input to nearest integer value in floating-point. - - ✓ - - ✓ - - * - | ``float rnorm3df(float x, float y, float z)`` - | Returns one over the square root of the sum of squares of three coordinates of the argument. - - ✓ - - ✓ - - * - | ``float rnorm4df(float x, float y, float z, float w)`` - | Returns one over the square root of the sum of squares of four coordinates of the argument. - - ✓ - - ✓ - - * - | ``float rnormf(int dim, const float *a)`` - | Returns the reciprocal of square root of the sum of squares of any number of coordinates. - - ✓ - - ✓ - - * - | ``float scalblnf(float x, long int n)`` - | Scale :math:`x` by :math:`2^n`. - - ✓ - - ✓ - - * - | ``float scalbnf(float x, int n)`` - | Scale :math:`x` by :math:`2^n`. 
- - ✓ - - ✓ - - * - | ``bool signbit(float x)`` - | Return the sign bit of :math:`x`. - - ✓ - - ✓ - - * - | ``float sinf(float x)`` - | Returns the sine of :math:`x`. - - ✓ - - ✓ - - * - | ``float sinhf(float x)`` - | Returns the hyperbolic sine of :math:`x`. - - ✓ - - ✓ - - * - | ``float sinpif(float x)`` - | Returns the hyperbolic sine of :math:`\pi \cdot x`. - - ✓ - - ✓ - - * - | ``void sincosf(float x, float *sptr, float *cptr)`` - | Returns the sine and cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``void sincospif(float x, float *sptr, float *cptr)`` - | Returns the sine and cosine of :math:`\pi \cdot x`. - - ✓ - - ✓ - - * - | ``float sqrtf(float x)`` - | Returns the square root of :math:`x`. - - ✓ - - ✓ - - * - | ``float rsqrtf(float x)`` - | Returns the reciprocal of the square root of :math:`x`. - - ✗ - - ✓ - - * - | ``float tanf(float x)`` - | Returns the tangent of :math:`x`. - - ✓ - - ✓ - - * - | ``float tanhf(float x)`` - | Returns the hyperbolic tangent of :math:`x`. - - ✓ - - ✓ - - * - | ``float tgammaf(float x)`` - | Returns the gamma function of :math:`x`. - - ✓ - - ✓ - - * - | ``float truncf(float x)`` - | Truncate :math:`x` to the integral part. - - ✓ - - ✓ - - * - | ``float y0f(float x)`` - | Returns the value of the Bessel function of the second kind of order 0 for :math:`x`. - - ✓ - - ✓ - - * - | ``float y1f(float x)`` - | Returns the value of the Bessel function of the second kind of order 1 for :math:`x`. - - ✓ - - ✓ - - * - | ``float ynf(int n, float x)`` - | Returns the value of the Bessel function of the second kind of order n for :math:`x`. - - ✓ - - ✓ - -Double precision mathematical functions --------------------------------------------------------------------------------------------- - -Following is the list of supported double precision mathematical functions. - -.. 
list-table:: Double precision mathematical functions - - * - **Function** - - **Supported on Host** - - **Supported on Device** - - * - | ``double abs(double x)`` - | Returns the absolute value of :math:`x` - - ✓ - - ✓ - - * - | ``double acos(double x)`` - | Returns the arc cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``double acosh(double x)`` - | Returns the nonnegative arc hyperbolic cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``double asin(double x)`` - | Returns the arc sine of :math:`x`. - - ✓ - - ✓ - - * - | ``double asinh(double x)`` - | Returns the arc hyperbolic sine of :math:`x`. - - ✓ - - ✓ - - * - | ``double atan(double x)`` - | Returns the arc tangent of :math:`x`. - - ✓ - - ✓ - - * - | ``double atan2(double x, double y)`` - | Returns the arc tangent of the ratio of :math:`x` and :math:`y`. - - ✓ - - ✓ - - * - | ``double atanh(double x)`` - | Returns the arc hyperbolic tangent of :math:`x`. - - ✓ - - ✓ - - * - | ``double cbrt(double x)`` - | Returns the cube root of :math:`x`. - - ✓ - - ✓ - - * - | ``double ceil(double x)`` - | Returns ceiling of :math:`x`. - - ✓ - - ✓ - - * - | ``double copysign(double x, double y)`` - | Create value with given magnitude, copying sign of second value. - - ✓ - - ✓ - - * - | ``double cos(double x)`` - | Returns the cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``double cosh(double x)`` - | Returns the hyperbolic cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``double cospi(double x)`` - | Returns the cosine of :math:`\pi \cdot x`. - - ✓ - - ✓ - - * - | ``double cyl_bessel_i0(double x)`` - | Returns the value of the regular modified cylindrical Bessel function of order 0 for :math:`x`. - - ✗ - - ✗ - - * - | ``double cyl_bessel_i1(double x)`` - | Returns the value of the regular modified cylindrical Bessel function of order 1 for :math:`x`. - - ✗ - - ✗ - - * - | ``double erf(double x)`` - | Returns the error function of :math:`x`. 
- - ✓ - - ✓ - - * - | ``double erfc(double x)`` - | Returns the complementary error function of :math:`x`. - - ✓ - - ✓ - - * - | ``double erfcinv(double x)`` - | Returns the inverse complementary function of :math:`x`. - - ✓ - - ✓ - - * - | ``double erfcx(double x)`` - | Returns the scaled complementary error function of :math:`x`. - - ✓ - - ✓ - - * - | ``double erfinv(double x)`` - | Returns the inverse error function of :math:`x`. - - ✓ - - ✓ - - * - | ``double exp(double x)`` - | Returns :math:`e^x`. - - ✓ - - ✓ - - * - | ``double exp10(double x)`` - | Returns :math:`10^x`. - - ✓ - - ✓ - - * - | ``double exp2( double x)`` - | Returns :math:`2^x`. - - ✓ - - ✓ - - * - | ``double expm1(double x)`` - | Returns :math:`ln(x - 1)` - - ✓ - - ✓ - - * - | ``double fabs(double x)`` - | Returns the absolute value of `x` - - ✓ - - ✓ - - * - | ``double fdim(double x, double y)`` - | Returns the positive difference between :math:`x` and :math:`y`. - - ✓ - - ✓ - - * - | ``double floor(double x)`` - | Returns the largest integer less than or equal to :math:`x`. - - ✓ - - ✓ - - * - | ``double fma(double x, double y, double z)`` - | Returns :math:`x \cdot y + z` as a single operation. - - ✓ - - ✓ - - * - | ``double fmax(double x, double y)`` - | Determine the maximum numeric value of :math:`x` and :math:`y`. - - ✓ - - ✓ - - * - | ``double fmin(double x, double y)`` - | Determine the minimum numeric value of :math:`x` and :math:`y`. - - ✓ - - ✓ - - * - | ``double fmod(double x, double y)`` - | Returns the floating-point remainder of :math:`x / y`. - - ✓ - - ✓ - - * - | ``double modf(double x, double* iptr)`` - | Break down :math:`x` into fractional and integral parts. - - ✓ - - ✗ - - * - | ``double frexp(double x, int* nptr)`` - | Extract mantissa and exponent of :math:`x`. - - ✓ - - ✗ - - * - | ``double hypot(double x, double y)`` - | Returns the square root of the sum of squares of :math:`x` and :math:`y`. 
- - ✓ - - ✓ - - * - | ``int ilogb(double x)`` - | Returns the unbiased integer exponent of :math:`x`. - - ✓ - - ✓ - - * - | ``bool isfinite(double x)`` - | Determine whether :math:`x` is finite. - - ✓ - - ✓ - - * - | ``bool isin(double x)`` - | Determine whether :math:`x` is infinite. - - ✓ - - ✓ - - * - | ``bool isnan(double x)`` - | Determine whether :math:`x` is a ``NAN``. - - ✓ - - ✓ - - * - | ``double j0(double x)`` - | Returns the value of the Bessel function of the first kind of order 0 for :math:`x`. - - ✓ - - ✓ - - * - | ``double j1(double x)`` - | Returns the value of the Bessel function of the first kind of order 1 for :math:`x`. - - ✓ - - ✓ - - * - | ``double jn(int n, double x)`` - | Returns the value of the Bessel function of the first kind of order n for :math:`x`. - - ✓ - - ✓ - - * - | ``double ldexp(double x, int exp)`` - | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`. - - ✓ - - ✓ - - * - | ``double lgamma(double x)`` - | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`. - - ✓ - - ✗ - - * - | ``long int lrint(double x)`` - | Round :math:`x` to nearest integer value. - - ✓ - - ✓ - - * - | ``long long int llrint(double x)`` - | Round :math:`x` to nearest integer value. - - ✓ - - ✓ - - * - | ``long int lround(double x)`` - | Round to nearest integer value. - - ✓ - - ✓ - - * - | ``long long int llround(double x)`` - | Round to nearest integer value. - - ✓ - - ✓ - - * - | ``double log10(double x)`` - | Returns the base 10 logarithm of :math:`x`. - - ✓ - - ✓ - - * - | ``double log1p(double x)`` - | Returns the natural logarithm of :math:`x + 1`. - - ✓ - - ✓ - - * - | ``double log2(double x)`` - | Returns the base 2 logarithm of :math:`x`. - - ✓ - - ✓ - - * - | ``double log(double x)`` - | Returns the natural logarithm of :math:`x`. - - ✓ - - ✓ - - * - | ``double logb(double x)`` - | Returns the floating point representation of the exponent of :math:`x`. 
- - ✓ - - ✓ - - * - | ``double nan(const char* tagp)`` - | Returns "Not a Number" value. - - ✗ - - ✓ - - * - | ``double nearbyint(double x)`` - | Round :math:`x` to the nearest integer. - - ✓ - - ✓ - - * - | ``double nextafter(double x, double y)`` - | Returns next representable double-precision floating-point value after argument. - - ✓ - - ✓ - - * - | ``double norm3d(double x, double y, double z)`` - | Returns the square root of the sum of squares of :math:`x`, :math:`y` and :math:`z`. - - ✓ - - ✓ - - * - | ``double norm4d(double x, double y, double z, double w)`` - | Returns the square root of the sum of squares of :math:`x`, :math:`y`, :math:`z` and :math:`w`. - - ✓ - - ✓ - - * - | ``double normcdf(double y)`` - | Returns the standard normal cumulative distribution function. - - ✓ - - ✓ - - * - | ``double normcdfinv(double y)`` - | Returns the inverse of the standard normal cumulative distribution function. - - ✓ - - ✓ - - * - | ``double norm(int dim, const double *a)`` - | Returns the square root of the sum of squares of any number of coordinates. - - ✓ - - ✓ - - * - | ``double pow(double x, double y)`` - | Returns :math:`x^y`. - - ✓ - - ✓ - - * - | ``double powi(double base, int iexp)`` - | Returns the value of first argument to the power of second argument. - - ✓ - - ✓ - - * - | ``double remainder(double x, double y)`` - | Returns double-precision floating-point remainder. - - ✓ - - ✓ - - * - | ``double remquo(double x, double y, int* quo)`` - | Returns double-precision floating-point remainder and part of quotient. - - ✓ - - ✗ - - * - | ``double round(double x)`` - | Round to nearest integer value in floating-point. - - ✓ - - ✓ - - * - | ``double rcbrt(double x)`` - | Returns the reciprocal cube root function. - - ✓ - - ✓ - - * - | ``double rhypot(double x, double y)`` - | Returns one over the square root of the sum of squares of two arguments. - - ✓ - - ✓ - - * - | ``double rint(double x)`` - | Round input to nearest integer value in floating-point. 
- - ✓ - - ✓ - - * - | ``double rnorm3d(double x, double y, double z)`` - | Returns one over the square root of the sum of squares of three coordinates of the argument. - - ✓ - - ✓ - - * - | ``double rnorm4d(double x, double y, double z, double w)`` - | Returns one over the square root of the sum of squares of four coordinates of the argument. - - ✓ - - ✓ - - * - | ``double rnorm(int dim, const double *a)`` - | Returns the reciprocal of square root of the sum of squares of any number of coordinates. - - ✓ - - ✓ - - * - | ``double scalbln(double x, long int n)`` - | Scale :math:`x` by :math:`2^n`. - - ✓ - - ✓ - - * - | ``double scalbn(double x, int n)`` - | Scale :math:`x` by :math:`2^n`. - - ✓ - - ✓ - - * - | ``bool signbit(double x)`` - | Return the sign bit of :math:`x`. - - ✓ - - ✓ - - * - | ``double sin(double x)`` - | Returns the sine of :math:`x`. - - ✓ - - ✓ - - * - | ``double sinh(double x)`` - | Returns the hyperbolic sine of :math:`x`. - - ✓ - - ✓ - - * - | ``double sinpi(double x)`` - | Returns the hyperbolic sine of :math:`\pi \cdot x`. - - ✓ - - ✓ - - * - | ``void sincos(double x, double *sptr, double *cptr)`` - | Returns the sine and cosine of :math:`x`. - - ✓ - - ✓ - - * - | ``void sincospi(double x, double *sptr, double *cptr)`` - | Returns the sine and cosine of :math:`\pi \cdot x`. - - ✓ - - ✓ - - * - | ``double sqrt(double x)`` - | Returns the square root of :math:`x`. - - ✓ - - ✓ - - * - | ``double rsqrt(double x)`` - | Returns the reciprocal of the square root of :math:`x`. - - ✗ - - ✓ - - * - | ``double tan(double x)`` - | Returns the tangent of :math:`x`. - - ✓ - - ✓ - - * - | ``double tanh(double x)`` - | Returns the hyperbolic tangent of :math:`x`. - - ✓ - - ✓ - - * - | ``double tgamma(double x)`` - | Returns the gamma function of :math:`x`. - - ✓ - - ✓ - - * - | ``double trunc(double x)`` - | Truncate :math:`x` to the integral part. 
- - ✓ - - ✓ - - * - | ``double y0(double x)`` - | Returns the value of the Bessel function of the second kind of order 0 for :math:`x`. - - ✓ - - ✓ - - * - | ``double y1(double x)`` - | Returns the value of the Bessel function of the second kind of order 1 for :math:`x`. - - ✓ - - ✓ - - * - | ``double yn(int n, double x)`` - | Returns the value of the Bessel function of the second kind of order n for :math:`x`. - - ✓ - - ✓ - -Integer intrinsics --------------------------------------------------------------------------------------------- - -Following is the list of supported integer intrinsics. Note that intrinsics are supported on device only. - -.. list-table:: Integer intrinsics mathematical functions - - * - **Function** - - * - | ``unsigned int __brev(unsigned int x)`` - | Reverse the bit order of a 32 bit unsigned integer. - - * - | ``unsigned long long int __brevll(unsigned long long int x)`` - | Reverse the bit order of a 64 bit unsigned integer. - - * - | ``unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int z)`` - | Return selected bytes from two 32-bit unsigned integers. - - * - | ``unsigned int __clz(int x)`` - | Return the number of consecutive high-order zero bits in 32 bit integer. - - * - | ``unsigned int __clzll(long long int x)`` - | Return the number of consecutive high-order zero bits in 64 bit integer. - - * - | ``unsigned int __ffs(int x)`` - | Find the position of least significant bit set to 1 in a 32 bit integer. - - * - | ``unsigned int __ffsll(long long int x)`` - | Find the position of least significant bit set to 1 in a 64 bit signed integer. - - * - | ``unsigned int __fns32(unsigned long long mask, unsigned int base, int offset)`` - | Find the position of the n-th set to 1 bit in a 32-bit integer. - - * - | ``unsigned int __fns64(unsigned long long int mask, unsigned int base, int offset)`` - | Find the position of the n-th set to 1 bit in a 64-bit integer. 
- - * - | ``unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)`` - | Concatenate :math:`hi` and :math:`lo`, shift left by shift & 31 bits, return the most significant 32 bits. - - * - | ``unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)`` - | Concatenate :math:`hi` and :math:`lo`, shift left by min(shift, 32) bits, return the most significant 32 bits. - - * - | ``unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)`` - | Concatenate :math:`hi` and :math:`lo`, shift right by shift & 31 bits, return the least significant 32 bits. - - * - | ``unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)`` - | Concatenate :math:`hi` and :math:`lo`, shift right by min(shift, 32) bits, return the least significant 32 bits. - - * - | ``unsigned int __hadd(int x, int y)`` - | Compute average of signed input arguments, avoiding overflow in the intermediate sum. - - * - | ``unsigned int __rhadd(int x, int y)`` - | Compute rounded average of signed input arguments, avoiding overflow in the intermediate sum. - - * - | ``unsigned int __uhadd(int x, int y)`` - | Compute average of unsigned input arguments, avoiding overflow in the intermediate sum. - - * - | ``unsigned int __urhadd (unsigned int x, unsigned int y)`` - | Compute rounded average of unsigned input arguments, avoiding overflow in the intermediate sum. - - * - | ``int __sad(int x, int y, int z)`` - | Returns :math:`|x - y| + z`, the sum of absolute difference. - - * - | ``unsigned int __usad(unsigned int x, unsigned int y, unsigned int z)`` - | Returns :math:`|x - y| + z`, the sum of absolute difference. - - * - | ``unsigned int __popc(unsigned int x)`` - | Count the number of bits that are set to 1 in a 32 bit integer. - - * - | ``unsigned int __popcll(unsigned long long int x)`` - | Count the number of bits that are set to 1 in a 64 bit integer. 
- - * - | ``int __mul24(int x, int y)`` - | Multiply two 24bit integers. - - * - | ``unsigned int __umul24(unsigned int x, unsigned int y)`` - | Multiply two 24bit unsigned integers. - - * - | ``int __mulhi(int x, int y)`` - | Returns the most significant 32 bits of the product of the two 32-bit integers. - - * - | ``unsigned int __umulhi(unsigned int x, unsigned int y)`` - | Returns the most significant 32 bits of the product of the two 32-bit unsigned integers. - - * - | ``long long int __mul64hi(long long int x, long long int y)`` - | Returns the most significant 64 bits of the product of the two 64-bit integers. - - * - | ``unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y)`` - | Returns the most significant 64 bits of the product of the two 64 unsigned bit integers. - -The HIP-Clang implementation of ``__ffs()`` and ``__ffsll()`` contains code to add a constant +1 to produce the ``ffs`` result format. -For the cases where this overhead is not acceptable and programmer is willing to specialize for the platform, -HIP-Clang provides ``__lastbit_u32_u32(unsigned int input)`` and ``__lastbit_u32_u64(unsigned long long int input)``. -The index returned by ``__lastbit_`` instructions starts at -1, while for ``ffs`` the index starts at 0. - -Floating-point Intrinsics --------------------------------------------------------------------------------------------- - -Following is the list of supported floating-point intrinsics. Note that intrinsics are supported on device only. - -.. note:: - - Only the nearest even rounding mode supported on AMD GPUs by defaults. The ``_rz``, ``_ru`` and - ``_rd`` suffixed intrinsic functions are existing in HIP AMD backend, if the - ``OCML_BASIC_ROUNDED_OPERATIONS`` macro is defined. - -.. list-table:: Single precision intrinsics mathematical functions - - * - **Function** - - * - | ``float __cosf(float x)`` - | Returns the fast approximate cosine of :math:`x`. 
- - * - | ``float __exp10f(float x)`` - | Returns the fast approximate for 10 :sup:`x`. - - * - | ``float __expf(float x)`` - | Returns the fast approximate for e :sup:`x`. - - * - | ``float __fadd_rn(float x, float y)`` - | Add two floating-point values in round-to-nearest-even mode. - - * - | ``float __fdiv_rn(float x, float y)`` - | Divide two floating point values in round-to-nearest-even mode. - - * - | ``float __fmaf_rn(float x, float y, float z)`` - | Returns ``x × y + z`` as a single operation in round-to-nearest-even mode. - - * - | ``float __fmul_rn(float x, float y)`` - | Multiply two floating-point values in round-to-nearest-even mode. - - * - | ``float __frcp_rn(float x, float y)`` - | Returns ``1 / x`` in round-to-nearest-even mode. - - * - | ``float __frsqrt_rn(float x)`` - | Returns ``1 / √x`` in round-to-nearest-even mode. - - * - | ``float __fsqrt_rn(float x)`` - | Returns ``√x`` in round-to-nearest-even mode. - - * - | ``float __fsub_rn(float x, float y)`` - | Subtract two floating-point values in round-to-nearest-even mode. - - * - | ``float __log10f(float x)`` - | Returns the fast approximate for base 10 logarithm of :math:`x`. - - * - | ``float __log2f(float x)`` - | Returns the fast approximate for base 2 logarithm of :math:`x`. - - * - | ``float __logf(float x)`` - | Returns the fast approximate for natural logarithm of :math:`x`. - - * - | ``float __powf(float x, float y)`` - | Returns the fast approximate of x :sup:`y`. - - * - | ``float __saturatef(float x)`` - | Clamp :math:`x` to [+0.0, 1.0]. - - * - | ``float __sincosf(float x, float* sinptr, float* cosptr)`` - | Returns the fast approximate of sine and cosine of :math:`x`. - - * - | ``float __sinf(float x)`` - | Returns the fast approximate sine of :math:`x`. - - * - | ``float __tanf(float x)`` - | Returns the fast approximate tangent of :math:`x`. - -.. 
list-table:: Double precision intrinsics mathematical functions - - * - **Function** - - * - | ``double __dadd_rn(double x, double y)`` - | Add two floating-point values in round-to-nearest-even mode. - - * - | ``double __ddiv_rn(double x, double y)`` - | Divide two floating-point values in round-to-nearest-even mode. - - * - | ``double __dmul_rn(double x, double y)`` - | Multiply two floating-point values in round-to-nearest-even mode. - - * - | ``double __drcp_rn(double x, double y)`` - | Returns ``1 / x`` in round-to-nearest-even mode. - - * - | ``double __dsqrt_rn(double x)`` - | Returns ``√x`` in round-to-nearest-even mode. - - * - | ``double __dsub_rn(double x, double y)`` - | Subtract two floating-point values in round-to-nearest-even mode. - - * - | ``double __fma_rn(double x, double y, double z)`` - | Returns ``x × y + z`` as a single operation in round-to-nearest-even mode. - +HIP-Clang supports a set of math operations that are callable from the device. +HIP supports most of the device functions supported by CUDA. These are described +on :ref:`Math API page `. Texture functions =============================================== @@ -1445,7 +340,8 @@ code. Surface functions =============================================== -Surface functions are not supported. +The supported surface functions are located on :ref:`Surface object reference +page `. Timer functions =============================================== @@ -1478,6 +374,8 @@ To read a high-resolution timer from the device, HIP provides the following buil Where ``hipDeviceAttributeWallClockRate`` is a device attribute. Note that wall clock frequency is a per-device attribute. + Note that ``clock()`` and ``clock64()`` do not work properly on AMD RDNA3 (GFX11) graphic processors. 
+ +Atomic functions =============================================== @@ -2055,6 +953,8 @@ HIP supports the following kernel language cooperative groups types and function - ✓ - ✓ +For further information, check :ref:`Cooperative Groups API ` or :ref:`Cooperative Groups how to `. + Warp matrix functions ============================================================ @@ -2077,7 +977,7 @@ HIP does not support kernel language warp matrix types or functions. - ✗ - ✓ - * - ``void store_matrix_sync(T* mptr, fragment<...> &a, unsigned lda, layout_t layout)`` + * - ``void store_matrix_sync(T* mptr, fragment<...> &a, unsigned lda, layout_t layout)`` - ✗ - ✓ @@ -2220,32 +1120,13 @@ micro-architecture details like registers, and also the directive allows per-ker Asynchronous Functions ============================================================ -Memory stream --------------------------------------------------------------------------------------------- +The references for the supported asynchronous functions are located on the following pages: -.. doxygengroup:: Stream - :content-only: - -.. doxygengroup:: StreamO - :content-only: - -Peer to peer --------------------------------------------------------------------------------------------- - -.. doxygengroup:: PeerToPeer - :content-only: - -Memory management --------------------------------------------------------------------------------------------- - -.. doxygengroup:: Memory - :content-only: - -External Resource Interoperability --------------------------------------------------------------------------------------------- - -..
doxygengroup:: External - :content-only: +* :ref:`stream_management_reference` +* :ref:`stream_ordered_memory_allocator_reference` +* :ref:`peer_to_peer_device_memory_access_reference` +* :ref:`memory_management_reference` +* :ref:`external_resource_interoperability_reference` Register Keyword ============================================================ @@ -2264,7 +1145,7 @@ Unroll with a bounds that is known at compile-time is supported. For example: .. code-block:: cpp - #pragma unroll 1 /* tell compiler to never unroll the loop */ + #pragma unroll 1 /* tell compiler to never unroll the loop */ for (int i=0; i<16; i++) ... .. code-block:: cpp diff --git a/projects/hip/docs/reference/cpp_language_support.rst b/projects/hip/docs/reference/cpp_language_support.rst new file mode 100644 index 0000000000..1635258ccf --- /dev/null +++ b/projects/hip/docs/reference/cpp_language_support.rst @@ -0,0 +1,171 @@ +.. meta:: + :description: This chapter describes the C++ support of the HIP ecosystem + ROCm software. + :keywords: AMD, ROCm, HIP, C++ + +******************************************************************************* +C++ language support +******************************************************************************* + +The ROCm platform enables the power of combined C++ and HIP (Heterogeneous-computing +Interface for Portability) code. This code is compiled with a ``clang`` or ``clang++`` +compiler. The official compilers support the HIP platform, or you can use the +``amdclang`` or ``amdclang++`` included in the ROCm installation, which are wrappers for +the official versions. + +The source code is compiled according to the ``C++03``, ``C++11``, ``C++14``, ``C++17``, +and ``C++20`` standards, along with HIP-specific extensions, but is subject to +restrictions. The key restriction is the reduced support of the standard library in device +code.
This is due to the fact that by default a function is considered to run on host, +except for ``constexpr`` functions, which can run on host and device as well. + +.. _language_modern_cpp_support: + +Modern C++ support +=============================================================================== + +C++ is considered a modern programming language as of C++11. This section describes how +HIP supports these new C++ features. + +C++11 support +------------------------------------------------------------------------------- + +The C++11 standard introduced many new features. These features are supported in HIP host +code, with some notable omissions on the device side. The rule of thumb here is that +``constexpr`` functions work on device, and the rest don't. This means that some important +functionality like ``std::function`` is missing on the device. Unfortunately, the +standard library wasn't designed with HIP in mind, which means that the support is in a +state of "works as-is". + +Certain features have restrictions and clarifications. For example, any functions using +the ``constexpr`` qualifier or the new ``initializer lists``, ``std::move`` or +``std::forward`` features are implicitly considered to have the ``__host__`` and +``__device__`` execution space specifiers. Also, ``constexpr`` variables that are static +members or namespace scoped can be used from both host and device, but only for read +access. Dereferencing a static ``constexpr`` outside its specified execution space causes +an error. + +Lambdas are supported, but there are some extensions and restrictions on their usage. For +more information, see the `Extended lambdas`_ section below. + +C++14 support +------------------------------------------------------------------------------- + +All C++14 language features are supported. + +C++17 support +------------------------------------------------------------------------------- + +All C++17 language features are supported.
+ +C++20 support +------------------------------------------------------------------------------- + +Most C++20 language features are supported, but extensions and restrictions apply. C++20 +introduced coroutines and modules, which fundamentally changed how programs are written. +HIP doesn't support these features. However, ``consteval`` functions can be called from +host and device, even if specified for host use only. + +The three-way comparison operator (spaceship operator ``<=>``) works with host and device +code. + +.. _language_restrictions: + +Extensions and restrictions +=============================================================================== + +In addition to the deviations from the standard, there are some general extensions and +restrictions to consider. + +Global functions +------------------------------------------------------------------------------- + +Functions that serve as an entry point for device execution are called kernels and are +specified with the ``__global__`` qualifier. To call a kernel function, use the triple +chevron operator: ``<<< >>>``. Kernel functions must have a ``void`` return type. These +functions can't: + +* have a ``constexpr`` specifier +* have a parameter of type ``std::initializer_list`` or ``va_list`` +* use an rvalue reference as a parameter +* use parameters having different sizes in host and device code, for example ``long double`` arguments, or structs containing ``long double`` members +* use struct-type arguments that have a different layout in host and device code + +Kernels can have variadic template parameters, but only one parameter pack, which must be +the last item in the template parameter list. + +Device space memory specifiers +------------------------------------------------------------------------------- + +HIP includes device space memory specifiers to indicate whether a variable is allocated +in host or device memory and how its memory should be allocated.
HIP supports the +``__device__``, ``__shared__``, ``__managed__``, and ``__constant__`` specifiers. + +The ``__device__`` and ``__constant__`` specifiers define global variables, which are +allocated within global memory on the HIP devices. The only difference is that +``__constant__`` variables can't be changed after allocation. The ``__shared__`` +specifier allocates the variable within shared memory, which is available to all threads +in a block. + +The ``__managed__`` variable specifier creates global variables that are initially +undefined and unaddressed within the global symbol table. The HIP runtime allocates +managed memory and defines the symbol when it loads the device binary. A managed variable +can be accessed in both device and host code. + +It's important to know where a variable is stored because it is only available from +certain locations. Generally, variables allocated in the host memory are not accessible +from the device code, while variables allocated in the device memory are not directly +accessible from the host code. Dereferencing a pointer to device memory on the host +results in a segmentation fault. Accessing device variables in host code should be done +through kernel execution or HIP functions like ``hipMemcpyToSymbol``. + +Exception handling +------------------------------------------------------------------------------- + +An important difference between the host and device code is exception handling. In device +code, this control flow isn't available due to the hardware architecture. The device +code must use return codes to handle errors. + +Kernel parameters +------------------------------------------------------------------------------- + +There are some restrictions on kernel function parameters. They cannot be passed by +reference, because these functions are called from the host but run on the device. Also, +a variable number of arguments is not allowed.
+ +Classes +------------------------------------------------------------------------------- + +Classes work on both the host and device side, but there are some constraints. The +``static`` member functions can't be ``__global__``. ``virtual`` member functions work, +but a ``virtual`` function must not be called from the host if the parent object was +created on the device, or the other way around, because this behavior is undefined. +Another minor restriction is that ``__device__`` variables that are globally scoped must +have trivial constructors. + +Polymorphic function wrappers +------------------------------------------------------------------------------- + +HIP doesn't support the polymorphic function wrapper ``std::function``, which was +introduced in C++11. + +Extended lambdas +------------------------------------------------------------------------------- + +HIP supports lambdas, which by default work as expected. + +Lambdas have implicit host device attributes. This means that they can be executed by +both host and device code, and work the way you would expect. To make a lambda callable +only by host or device code, users can add the ``__host__`` or ``__device__`` attribute. The +only restriction is that host variables can only be accessed through copy on the device. +Accessing through reference will cause undefined behavior. + +Inline namespaces +------------------------------------------------------------------------------- + +Inline namespaces are supported, but with a few exceptions.
The following entities can't +be declared in namespace scope within an inline unnamed namespace: + +* ``__managed__``, ``__device__``, ``__shared__`` and ``__constant__`` variables +* ``__global__`` functions and function templates +* variables with surface or texture type diff --git a/projects/hip/docs/reference/fp8_numbers.rst b/projects/hip/docs/reference/fp8_numbers.rst index 5067604986..00d85a9f15 100644 --- a/projects/hip/docs/reference/fp8_numbers.rst +++ b/projects/hip/docs/reference/fp8_numbers.rst @@ -28,7 +28,7 @@ There are two formats of FP8 numbers, E4M3 and E5M2. HIP Header ========== -HIP header defined the FP8 ocp/fnuz numbers `here `_. +The `HIP header `_ defines the FP8 ocp/fnuz numbers. Supported Devices ================= diff --git a/projects/hip/docs/reference/hip_runtime_api/global_defines_enums_structs_files.rst b/projects/hip/docs/reference/hip_runtime_api/global_defines_enums_structs_files.rst new file mode 100644 index 0000000000..60236e5169 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/global_defines_enums_structs_files.rst @@ -0,0 +1,15 @@ +.. meta:: + :description: The global defines, enums, structs and files reference page. + +.. _global_defines_enums_structs_files_reference: + +******************************************************************************* +Global defines, enums, structs and files +******************************************************************************* + +The following pages list the structs, define macros, enums and files in the HIP runtime API.
+ +* :ref:`global_enum_defines_reference` +* :ref:`driver_types_reference` +* :doc:`hip:doxygen/html/annotated` +* :doc:`hip:doxygen/html/files` diff --git a/projects/hip/docs/reference/hip_runtime_api/global_defines_enums_structs_files/driver_types.rst b/projects/hip/docs/reference/hip_runtime_api/global_defines_enums_structs_files/driver_types.rst new file mode 100644 index 0000000000..552f344e69 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/global_defines_enums_structs_files/driver_types.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The driver types reference page. + :keywords: AMD, ROCm, HIP, CUDA, driver types + +.. _driver_types_reference: + +******************************************************************************* +Driver types +******************************************************************************* + +.. doxygengroup:: DriverTypes + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/global_defines_enums_structs_files/global_enum_and_defines.rst b/projects/hip/docs/reference/hip_runtime_api/global_defines_enums_structs_files/global_enum_and_defines.rst new file mode 100644 index 0000000000..0660b13007 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/global_defines_enums_structs_files/global_enum_and_defines.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The global enum and defines reference page. + :keywords: AMD, ROCm, HIP, CUDA, global enum, defines + +.. _global_enum_defines_reference: + +******************************************************************************* +Global enum and defines +******************************************************************************* + +.. 
doxygengroup:: GlobalDefs + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules.rst b/projects/hip/docs/reference/hip_runtime_api/modules.rst new file mode 100644 index 0000000000..83c10ccf2e --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules.rst @@ -0,0 +1,41 @@ +.. meta:: + :description: The HIP runtime API modules reference page. + :keywords: AMD, ROCm, HIP, CUDA, HIP runtime API modules, modules + +.. _modules_reference: + +******************************************************************************* +Modules +******************************************************************************* + +The API is organized into modules based on functionality. + +* :ref:`initialization_version_reference` +* :ref:`device_management_reference` +* :ref:`execution_control_reference` +* :ref:`error_handling_reference` +* :ref:`stream_management_reference` +* :ref:`stream_memory_operations_reference` +* :ref:`event_management_reference` +* :ref:`memory_management_reference` + + * :ref:`memory_management_deprecated_reference` + * :ref:`external_resource_interoperability_reference` + * :ref:`stream_ordered_memory_allocator_reference` + * :ref:`unified_memory_reference` + * :ref:`virtual_memory_reference` + * :ref:`texture_management_reference` + * :ref:`texture_management_deprecated_reference` + * :ref:`surface_object_reference` + +* :ref:`peer_to_peer_device_memory_access_reference` +* :ref:`context_management_reference` +* :ref:`module_management_reference` +* :ref:`occupancy_reference` +* :ref:`profiler_control_reference` +* :ref:`launch_api_reference` +* :ref:`runtime_compilation_reference` +* :ref:`callback_activity_apis_reference` +* :ref:`graph_management_reference` +* :ref:`opengl_interoperability_reference` +* :ref:`cooperative_groups_reference` diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/callback_activity_apis.rst b/projects/hip/docs/reference/hip_runtime_api/modules/callback_activity_apis.rst new 
file mode 100644 index 0000000000..d27c59ebbf --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/callback_activity_apis.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The callback activity APIs reference page. + :keywords: AMD, ROCm, HIP, CUDA, callback activity APIs, callback activity + +.. _callback_activity_apis_reference: + +******************************************************************************* +Callback activity APIs +******************************************************************************* + +.. doxygengroup:: Callback + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/context_management.rst b/projects/hip/docs/reference/hip_runtime_api/modules/context_management.rst new file mode 100644 index 0000000000..83fce7e593 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/context_management.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The context management reference page. + :keywords: AMD, ROCm, HIP, CUDA, context management, context + +.. _context_management_reference: + +******************************************************************************* +Context management [deprecated] +******************************************************************************* + +.. doxygengroup:: Context + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/cooperative_groups_reference.rst b/projects/hip/docs/reference/hip_runtime_api/modules/cooperative_groups_reference.rst new file mode 100644 index 0000000000..e28eac1252 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/cooperative_groups_reference.rst @@ -0,0 +1,70 @@ +.. meta:: + :description: This chapter lists types and device API wrappers related to the + Cooperative Group feature. Programmers can directly use these + API features in their kernels. + :keywords: AMD, ROCm, HIP, cooperative groups + +.. 
_cooperative_groups_reference: + +******************************************************************************* +Cooperative groups +******************************************************************************* + +Cooperative kernel launches +=========================== + +The following host-side functions are used for cooperative kernel launches. + +.. doxygengroup:: ModuleCooperativeG + :content-only: + +Cooperative groups classes +========================== + +The following cooperative groups classes can be used on the device side. + +.. _thread_group_ref: + +.. doxygenclass:: cooperative_groups::thread_group + :members: + +.. _thread_block_ref: + +.. doxygenclass:: cooperative_groups::thread_block + :members: + +.. _grid_group_ref: + +.. doxygenclass:: cooperative_groups::grid_group + :members: + +.. _multi_grid_group_ref: + +.. doxygenclass:: cooperative_groups::multi_grid_group + :members: + +.. _thread_block_tile_ref: + +.. doxygenclass:: cooperative_groups::thread_block_tile + :members: + +.. _coalesced_group_ref: + +.. doxygenclass:: cooperative_groups::coalesced_group + :members: + +Cooperative groups construct functions +====================================== + +The following functions are used to construct different group-type instances on the device side. + +.. doxygengroup:: CooperativeGConstruct + :content-only: + +Cooperative groups exposed API functions +======================================== + +The following functions are the exposed API for different group-type instances on the device side. + +.. doxygengroup:: CooperativeGAPI + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/device_management.rst b/projects/hip/docs/reference/hip_runtime_api/modules/device_management.rst new file mode 100644 index 0000000000..17b600f5c5 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/device_management.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The device management reference page. 
+ :keywords: AMD, ROCm, HIP, CUDA, device management, device + +.. _device_management_reference: + +******************************************************************************* +Device management +******************************************************************************* + +.. doxygengroup:: Device + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/error_handling.rst b/projects/hip/docs/reference/hip_runtime_api/modules/error_handling.rst new file mode 100644 index 0000000000..1e2ec4a1f7 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/error_handling.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The error handling reference page. + :keywords: AMD, ROCm, HIP, CUDA, error handling, error + +.. _error_handling_reference: + +******************************************************************************* +Error handling +******************************************************************************* + +.. doxygengroup:: Error + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/event_management.rst b/projects/hip/docs/reference/hip_runtime_api/modules/event_management.rst new file mode 100644 index 0000000000..b4df0d0164 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/event_management.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The event management reference page. + :keywords: AMD, ROCm, HIP, CUDA, event management, event + +.. _event_management_reference: + +******************************************************************************* +Event management +******************************************************************************* + +.. 
doxygengroup:: Event + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/execution_control.rst b/projects/hip/docs/reference/hip_runtime_api/modules/execution_control.rst new file mode 100644 index 0000000000..dfc5b8d5d6 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/execution_control.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The execution control reference page. + :keywords: AMD, ROCm, HIP, CUDA, execution control, execution + +.. _execution_control_reference: + +******************************************************************************* +Execution control +******************************************************************************* + +.. doxygengroup:: Execution + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/graph_management.rst b/projects/hip/docs/reference/hip_runtime_api/modules/graph_management.rst new file mode 100644 index 0000000000..6bfca89c2f --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/graph_management.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The graph management reference page. + :keywords: AMD, ROCm, HIP, CUDA, graph management, graph + +.. _graph_management_reference: + +******************************************************************************* +Graph management +******************************************************************************* + +.. doxygengroup:: Graph + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/initialization_and_version.rst b/projects/hip/docs/reference/hip_runtime_api/modules/initialization_and_version.rst new file mode 100644 index 0000000000..25809662e5 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/initialization_and_version.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The initialization and version reference page. + :keywords: AMD, ROCm, HIP, CUDA, initialization, version + +.. 
_initialization_version_reference: + +******************************************************************************* +Initialization and version +******************************************************************************* + +.. doxygengroup:: Driver + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/launch_api.rst b/projects/hip/docs/reference/hip_runtime_api/modules/launch_api.rst new file mode 100644 index 0000000000..ddd3c32c94 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/launch_api.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The launch API reference page. + :keywords: AMD, ROCm, HIP, CUDA, launch API, triple-chevron + +.. _launch_api_reference: + +******************************************************************************* +Launch API +******************************************************************************* + +.. doxygengroup:: Clang + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/memory_management.rst b/projects/hip/docs/reference/hip_runtime_api/modules/memory_management.rst new file mode 100644 index 0000000000..f78805b8f2 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/memory_management.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The memory management reference page. + :keywords: AMD, ROCm, HIP, CUDA, memory management, memory + +.. _memory_management_reference: + +******************************************************************************* +Memory management +******************************************************************************* + +.. 
doxygengroup:: Memory + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/external_resource_interoperability.rst b/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/external_resource_interoperability.rst new file mode 100644 index 0000000000..c002d5136c --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/external_resource_interoperability.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The external resource interoperability reference page. + :keywords: AMD, ROCm, HIP, CUDA, external resource interoperability + +.. _external_resource_interoperability_reference: + +******************************************************************************* +External resource interoperability +******************************************************************************* + +.. doxygengroup:: External + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/memory_management_deprecated.rst b/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/memory_management_deprecated.rst new file mode 100644 index 0000000000..96b8de64f3 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/memory_management_deprecated.rst @@ -0,0 +1,11 @@ +.. meta:: + :description: The deprecated memory management reference page. + +.. _memory_management_deprecated_reference: + +******************************************************************************* +Memory management (deprecated) +******************************************************************************* + +.. 
doxygengroup:: MemoryD + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/stream_ordered_memory_allocator.rst b/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/stream_ordered_memory_allocator.rst new file mode 100644 index 0000000000..39caaba5e3 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/stream_ordered_memory_allocator.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The stream ordered memory allocator reference page. + :keywords: AMD, ROCm, HIP, CUDA, stream ordered memory allocator + +.. _stream_ordered_memory_allocator_reference: + +******************************************************************************* +Stream ordered memory allocator +******************************************************************************* + +.. doxygengroup:: StreamO + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/surface_object.rst b/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/surface_object.rst new file mode 100644 index 0000000000..560a2eae22 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/surface_object.rst @@ -0,0 +1,15 @@ +.. meta:: + :description: The surface object reference page. + :keywords: AMD, ROCm, HIP, CUDA, surface object, surface + +.. _surface_object_reference: + +******************************************************************************* +Surface object +******************************************************************************* + +.. doxygengroup:: Surface + :content-only: + +.. 
doxygengroup:: SurfaceAPI + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/texture_management.rst b/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/texture_management.rst new file mode 100644 index 0000000000..4688d5df72 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/texture_management.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The texture management reference page. + :keywords: AMD, ROCm, HIP, CUDA, texture management, texture + +.. _texture_management_reference: + +******************************************************************************* +Texture management +******************************************************************************* + +.. doxygengroup:: Texture + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/texture_management_deprecated.rst b/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/texture_management_deprecated.rst new file mode 100644 index 0000000000..886a446dbc --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/texture_management_deprecated.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The deprecated texture management reference page. + :keywords: AMD, ROCm, HIP, CUDA, deprecated texture management + +.. _texture_management_deprecated_reference: + +******************************************************************************* +Texture management (deprecated) +******************************************************************************* + +.. 
doxygengroup:: TextureD + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/unified_memory_reference.rst b/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/unified_memory_reference.rst new file mode 100644 index 0000000000..828d4fdcd4 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/unified_memory_reference.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The managed memory reference page. + :keywords: AMD, ROCm, HIP, CUDA, unified memory, unified, memory, UM, APU + +.. _unified_memory_reference: + +******************************************************************************* +Managed memory +******************************************************************************* + +.. doxygengroup:: MemoryM + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/virtual_memory_reference.rst b/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/virtual_memory_reference.rst new file mode 100644 index 0000000000..e46b6d91e4 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/memory_management/virtual_memory_reference.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The virtual memory (VM) management reference page. + :keywords: AMD, ROCm, HIP, CUDA, virtual memory, virtual, memory, VM + +.. _virtual_memory_reference: + +******************************************************************************* +Virtual memory management +******************************************************************************* + +.. doxygengroup:: Virtual + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/module_management.rst b/projects/hip/docs/reference/hip_runtime_api/modules/module_management.rst new file mode 100644 index 0000000000..6a5b645b9b --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/module_management.rst @@ -0,0 +1,12 @@ +.. 
meta:: + :description: The module management reference page. + :keywords: AMD, ROCm, HIP, CUDA, module management, module + +.. _module_management_reference: + +******************************************************************************* +Module management +******************************************************************************* + +.. doxygengroup:: Module + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/occupancy.rst b/projects/hip/docs/reference/hip_runtime_api/modules/occupancy.rst new file mode 100644 index 0000000000..492c3f2310 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/occupancy.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The occupancy reference page. + :keywords: AMD, ROCm, HIP, CUDA, occupancy + +.. _occupancy_reference: + +******************************************************************************* +Occupancy +******************************************************************************* + +.. doxygengroup:: Occupancy + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/opengl_interoperability.rst b/projects/hip/docs/reference/hip_runtime_api/modules/opengl_interoperability.rst new file mode 100644 index 0000000000..d90a790fbf --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/opengl_interoperability.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The OpenGL interoperability reference page. + :keywords: AMD, ROCm, HIP, CUDA, OpenGL interoperability, OpenGL interop + +.. _opengl_interoperability_reference: + +******************************************************************************* +OpenGL interoperability +******************************************************************************* + +.. 
doxygengroup:: GL + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/peer_to_peer_device_memory_access.rst b/projects/hip/docs/reference/hip_runtime_api/modules/peer_to_peer_device_memory_access.rst new file mode 100644 index 0000000000..2797776ae4 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/peer_to_peer_device_memory_access.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The peer to peer device memory access reference page. + :keywords: AMD, ROCm, HIP, CUDA, peer to peer device memory access, peer to peer + +.. _peer_to_peer_device_memory_access_reference: + +******************************************************************************* +Peer to peer device memory access +******************************************************************************* + +.. doxygengroup:: PeerToPeer + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/profiler_control.rst b/projects/hip/docs/reference/hip_runtime_api/modules/profiler_control.rst new file mode 100644 index 0000000000..dd56e485ba --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/profiler_control.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The profiler control reference page. + :keywords: AMD, ROCm, HIP, CUDA, profiler control, profiler + +.. _profiler_control_reference: + +******************************************************************************* +Profiler control +******************************************************************************* + +.. doxygengroup:: Profiler + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/runtime_compilation.rst b/projects/hip/docs/reference/hip_runtime_api/modules/runtime_compilation.rst new file mode 100644 index 0000000000..7c76c2047e --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/runtime_compilation.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The runtime compilation reference page. 
+ :keywords: AMD, ROCm, HIP, CUDA, runtime compilation + +.. _runtime_compilation_reference: + +******************************************************************************* +Runtime compilation +******************************************************************************* + +.. doxygengroup:: Runtime + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/stream_management.rst b/projects/hip/docs/reference/hip_runtime_api/modules/stream_management.rst new file mode 100644 index 0000000000..6e5de67d7c --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/stream_management.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The stream management reference page. + :keywords: AMD, ROCm, HIP, CUDA, stream management, stream + +.. _stream_management_reference: + +******************************************************************************* +Stream management +******************************************************************************* + +.. doxygengroup:: Stream + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api/modules/stream_memory_operations.rst b/projects/hip/docs/reference/hip_runtime_api/modules/stream_memory_operations.rst new file mode 100644 index 0000000000..beed1cd314 --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api/modules/stream_memory_operations.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: The stream memory operations reference page. + :keywords: AMD, ROCm, HIP, CUDA, stream memory operations + +.. _stream_memory_operations_reference: + +******************************************************************************* +Stream memory operations +******************************************************************************* + +.. 
doxygengroup:: StreamM + :content-only: diff --git a/projects/hip/docs/reference/hip_runtime_api_reference.rst b/projects/hip/docs/reference/hip_runtime_api_reference.rst new file mode 100644 index 0000000000..e77490f79e --- /dev/null +++ b/projects/hip/docs/reference/hip_runtime_api_reference.rst @@ -0,0 +1,14 @@ +.. meta:: + :description: HIP runtime API reference page + :keywords: AMD, ROCm, HIP, CUDA, HIP runtime API, HIP runtime + +.. _runtime_api_reference: + +******************************************************************************** +HIP runtime API +******************************************************************************** + +The HIP Runtime API reference: + +* :ref:`modules_reference` +* :ref:`global_defines_enums_structs_files_reference` diff --git a/projects/hip/docs/reference/math_api.rst b/projects/hip/docs/reference/math_api.rst new file mode 100644 index 0000000000..fd3a215dd2 --- /dev/null +++ b/projects/hip/docs/reference/math_api.rst @@ -0,0 +1,1121 @@ +.. meta:: + :description: This chapter describes the built-in math functions that are accessible in HIP. + :keywords: AMD, ROCm, HIP, CUDA, math functions, HIP math functions + +.. _math_api_reference: + +******************************************************************************** +HIP math API +******************************************************************************** + +HIP-Clang supports a set of math operations that are callable from the device. HIP supports most of the device functions supported by NVIDIA CUDA. These are described in the following sections. + +Single precision mathematical functions +======================================= + + +Following is the list of supported single precision mathematical functions. + +.. 
list-table:: Single precision mathematical functions + + * - **Function** + - **Supported on Host** + - **Supported on Device** + + * - | ``float abs(float x)`` + | Returns the absolute value of :math:`x` + - ✓ + - ✓ + + * - | ``float acosf(float x)`` + | Returns the arc cosine of :math:`x`. + - ✓ + - ✓ + + * - | ``float acoshf(float x)`` + | Returns the nonnegative arc hyperbolic cosine of :math:`x`. + - ✓ + - ✓ + + * - | ``float asinf(float x)`` + | Returns the arc sine of :math:`x`. + - ✓ + - ✓ + + * - | ``float asinhf(float x)`` + | Returns the arc hyperbolic sine of :math:`x`. + - ✓ + - ✓ + + * - | ``float atanf(float x)`` + | Returns the arc tangent of :math:`x`. + - ✓ + - ✓ + + * - | ``float atan2f(float x, float y)`` + | Returns the arc tangent of the ratio of :math:`x` and :math:`y`. + - ✓ + - ✓ + + * - | ``float atanhf(float x)`` + | Returns the arc hyperbolic tangent of :math:`x`. + - ✓ + - ✓ + + * - | ``float cbrtf(float x)`` + | Returns the cube root of :math:`x`. + - ✓ + - ✓ + + * - | ``float ceilf(float x)`` + | Returns ceiling of :math:`x`. + - ✓ + - ✓ + + * - | ``float copysignf(float x, float y)`` + | Create value with given magnitude, copying sign of second value. + - ✓ + - ✓ + + * - | ``float cosf(float x)`` + | Returns the cosine of :math:`x`. + - ✓ + - ✓ + + * - | ``float coshf(float x)`` + | Returns the hyperbolic cosine of :math:`x`. + - ✓ + - ✓ + + * - | ``float cospif(float x)`` + | Returns the cosine of :math:`\pi \cdot x`. + - ✓ + - ✓ + + * - | ``float cyl_bessel_i0f(float x)`` + | Returns the value of the regular modified cylindrical Bessel function of order 0 for :math:`x`. + - ✗ + - ✗ + + * - | ``float cyl_bessel_i1f(float x)`` + | Returns the value of the regular modified cylindrical Bessel function of order 1 for :math:`x`. + - ✗ + - ✗ + + * - | ``float erff(float x)`` + | Returns the error function of :math:`x`. + - ✓ + - ✓ + + * - | ``float erfcf(float x)`` + | Returns the complementary error function of :math:`x`. 
+ - ✓ + - ✓ + + * - | ``float erfcinvf(float x)`` + | Returns the inverse complementary error function of :math:`x`. + - ✓ + - ✓ + + * - | ``float erfcxf(float x)`` + | Returns the scaled complementary error function of :math:`x`. + - ✓ + - ✓ + + * - | ``float erfinvf(float x)`` + | Returns the inverse error function of :math:`x`. + - ✓ + - ✓ + + * - | ``float expf(float x)`` + | Returns :math:`e^x`. + - ✓ + - ✓ + + * - | ``float exp10f(float x)`` + | Returns :math:`10^x`. + - ✓ + - ✓ + + * - | ``float exp2f(float x)`` + | Returns :math:`2^x`. + - ✓ + - ✓ + + * - | ``float expm1f(float x)`` + | Returns :math:`e^x - 1`. + - ✓ + - ✓ + + * - | ``float fabsf(float x)`` + | Returns the absolute value of :math:`x`. + - ✓ + - ✓ + + * - | ``float fdimf(float x, float y)`` + | Returns the positive difference between :math:`x` and :math:`y`. + - ✓ + - ✓ + + * - | ``float fdividef(float x, float y)`` + | Divide two floating point values. + - ✓ + - ✓ + + * - | ``float floorf(float x)`` + | Returns the largest integer less than or equal to :math:`x`. + - ✓ + - ✓ + + * - | ``float fmaf(float x, float y, float z)`` + | Returns :math:`x \cdot y + z` as a single operation. + - ✓ + - ✓ + + * - | ``float fmaxf(float x, float y)`` + | Determine the maximum numeric value of :math:`x` and :math:`y`. + - ✓ + - ✓ + + * - | ``float fminf(float x, float y)`` + | Determine the minimum numeric value of :math:`x` and :math:`y`. + - ✓ + - ✓ + + * - | ``float fmodf(float x, float y)`` + | Returns the floating-point remainder of :math:`x / y`. + - ✓ + - ✓ + + * - | ``float modff(float x, float* iptr)`` + | Break down :math:`x` into fractional and integral parts. + - ✓ + - ✗ + + * - | ``float frexpf(float x, int* nptr)`` + | Extract mantissa and exponent of :math:`x`. + - ✓ + - ✗ + + * - | ``float hypotf(float x, float y)`` + | Returns the square root of the sum of squares of :math:`x` and :math:`y`. + - ✓ + - ✓ + + * - | ``int ilogbf(float x)`` + | Returns the unbiased integer exponent of :math:`x`.
+ - ✓ + - ✓ + + * - | ``bool isfinite(float x)`` + | Determine whether :math:`x` is finite. + - ✓ + - ✓ + + * - | ``bool isinf(float x)`` + | Determine whether :math:`x` is infinite. + - ✓ + - ✓ + + * - | ``bool isnan(float x)`` + | Determine whether :math:`x` is a ``NAN``. + - ✓ + - ✓ + + * - | ``float j0f(float x)`` + | Returns the value of the Bessel function of the first kind of order 0 for :math:`x`. + - ✓ + - ✓ + + * - | ``float j1f(float x)`` + | Returns the value of the Bessel function of the first kind of order 1 for :math:`x`. + - ✓ + - ✓ + + * - | ``float jnf(int n, float x)`` + | Returns the value of the Bessel function of the first kind of order n for :math:`x`. + - ✓ + - ✓ + + * - | ``float ldexpf(float x, int exp)`` + | Returns :math:`x \cdot 2^{exp}`. + - ✓ + - ✓ + + * - | ``float lgammaf(float x)`` + | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`. + - ✓ + - ✗ + + * - | ``long int lrintf(float x)`` + | Round :math:`x` to nearest integer value. + - ✓ + - ✓ + + * - | ``long long int llrintf(float x)`` + | Round :math:`x` to nearest integer value. + - ✓ + - ✓ + + * - | ``long int lroundf(float x)`` + | Round to nearest integer value. + - ✓ + - ✓ + + * - | ``long long int llroundf(float x)`` + | Round to nearest integer value. + - ✓ + - ✓ + + * - | ``float log10f(float x)`` + | Returns the base 10 logarithm of :math:`x`. + - ✓ + - ✓ + + * - | ``float log1pf(float x)`` + | Returns the natural logarithm of :math:`x + 1`. + - ✓ + - ✓ + + * - | ``float log2f(float x)`` + | Returns the base 2 logarithm of :math:`x`. + - ✓ + - ✓ + + * - | ``float logf(float x)`` + | Returns the natural logarithm of :math:`x`. + - ✓ + - ✓ + + * - | ``float logbf(float x)`` + | Returns the floating point representation of the exponent of :math:`x`. + - ✓ + - ✓ + + * - | ``float nanf(const char* tagp)`` + | Returns "Not a Number" value.
+ - ✗ + - ✓ + + * - | ``float nearbyintf(float x)`` + | Round :math:`x` to the nearest integer. + - ✓ + - ✓ + + * - | ``float nextafterf(float x, float y)`` + | Returns next representable single-precision floating-point value after argument. + - ✓ + - ✗ + + * - | ``float norm3df(float x, float y, float z)`` + | Returns the square root of the sum of squares of :math:`x`, :math:`y` and :math:`z`. + - ✓ + - ✓ + + * - | ``float norm4df(float x, float y, float z, float w)`` + | Returns the square root of the sum of squares of :math:`x`, :math:`y`, :math:`z` and :math:`w`. + - ✓ + - ✓ + + * - | ``float normcdff(float y)`` + | Returns the standard normal cumulative distribution function. + - ✓ + - ✓ + + * - | ``float normcdfinvf(float y)`` + | Returns the inverse of the standard normal cumulative distribution function. + - ✓ + - ✓ + + * - | ``float normf(int dim, const float *a)`` + | Returns the square root of the sum of squares of any number of coordinates. + - ✓ + - ✓ + + * - | ``float powf(float x, float y)`` + | Returns :math:`x^y`. + - ✓ + - ✓ + + * - | ``float powif(float base, int iexp)`` + | Returns the value of first argument to the power of second argument. + - ✓ + - ✓ + + * - | ``float remainderf(float x, float y)`` + | Returns single-precision floating-point remainder. + - ✓ + - ✓ + + * - | ``float remquof(float x, float y, int* quo)`` + | Returns single-precision floating-point remainder and part of quotient. + - ✓ + - ✓ + + * - | ``float roundf(float x)`` + | Round to nearest integer value in floating-point. + - ✓ + - ✓ + + * - | ``float rcbrtf(float x)`` + | Returns the reciprocal cube root function. + - ✓ + - ✓ + + * - | ``float rhypotf(float x, float y)`` + | Returns one over the square root of the sum of squares of two arguments. + - ✓ + - ✓ + + * - | ``float rintf(float x)`` + | Round input to nearest integer value in floating-point. 
+ - ✓ + - ✓ + + * - | ``float rnorm3df(float x, float y, float z)`` + | Returns one over the square root of the sum of squares of three coordinates of the argument. + - ✓ + - ✓ + + * - | ``float rnorm4df(float x, float y, float z, float w)`` + | Returns one over the square root of the sum of squares of four coordinates of the argument. + - ✓ + - ✓ + + * - | ``float rnormf(int dim, const float *a)`` + | Returns the reciprocal of square root of the sum of squares of any number of coordinates. + - ✓ + - ✓ + + * - | ``float scalblnf(float x, long int n)`` + | Scale :math:`x` by :math:`2^n`. + - ✓ + - ✓ + + * - | ``float scalbnf(float x, int n)`` + | Scale :math:`x` by :math:`2^n`. + - ✓ + - ✓ + + * - | ``bool signbit(float x)`` + | Return the sign bit of :math:`x`. + - ✓ + - ✓ + + * - | ``float sinf(float x)`` + | Returns the sine of :math:`x`. + - ✓ + - ✓ + + * - | ``float sinhf(float x)`` + | Returns the hyperbolic sine of :math:`x`. + - ✓ + - ✓ + + * - | ``float sinpif(float x)`` + | Returns the sine of :math:`\pi \cdot x`. + - ✓ + - ✓ + + * - | ``void sincosf(float x, float *sptr, float *cptr)`` + | Returns the sine and cosine of :math:`x`. + - ✓ + - ✓ + + * - | ``void sincospif(float x, float *sptr, float *cptr)`` + | Returns the sine and cosine of :math:`\pi \cdot x`. + - ✓ + - ✓ + + * - | ``float sqrtf(float x)`` + | Returns the square root of :math:`x`. + - ✓ + - ✓ + + * - | ``float rsqrtf(float x)`` + | Returns the reciprocal of the square root of :math:`x`. + - ✗ + - ✓ + + * - | ``float tanf(float x)`` + | Returns the tangent of :math:`x`. + - ✓ + - ✓ + + * - | ``float tanhf(float x)`` + | Returns the hyperbolic tangent of :math:`x`. + - ✓ + - ✓ + + * - | ``float tgammaf(float x)`` + | Returns the gamma function of :math:`x`. + - ✓ + - ✓ + + * - | ``float truncf(float x)`` + | Truncate :math:`x` to the integral part.
+ - ✓ + - ✓ + + * - | ``float y0f(float x)`` + | Returns the value of the Bessel function of the second kind of order 0 for :math:`x`. + - ✓ + - ✓ + + * - | ``float y1f(float x)`` + | Returns the value of the Bessel function of the second kind of order 1 for :math:`x`. + - ✓ + - ✓ + + * - | ``float ynf(int n, float x)`` + | Returns the value of the Bessel function of the second kind of order n for :math:`x`. + - ✓ + - ✓ + +Double precision mathematical functions +======================================= + +Following is the list of supported double precision mathematical functions. + +.. list-table:: Double precision mathematical functions + + * - **Function** + - **Supported on Host** + - **Supported on Device** + + * - | ``double abs(double x)`` + | Returns the absolute value of :math:`x` + - ✓ + - ✓ + + * - | ``double acos(double x)`` + | Returns the arc cosine of :math:`x`. + - ✓ + - ✓ + + * - | ``double acosh(double x)`` + | Returns the nonnegative arc hyperbolic cosine of :math:`x`. + - ✓ + - ✓ + + * - | ``double asin(double x)`` + | Returns the arc sine of :math:`x`. + - ✓ + - ✓ + + * - | ``double asinh(double x)`` + | Returns the arc hyperbolic sine of :math:`x`. + - ✓ + - ✓ + + * - | ``double atan(double x)`` + | Returns the arc tangent of :math:`x`. + - ✓ + - ✓ + + * - | ``double atan2(double x, double y)`` + | Returns the arc tangent of the ratio of :math:`x` and :math:`y`. + - ✓ + - ✓ + + * - | ``double atanh(double x)`` + | Returns the arc hyperbolic tangent of :math:`x`. + - ✓ + - ✓ + + * - | ``double cbrt(double x)`` + | Returns the cube root of :math:`x`. + - ✓ + - ✓ + + * - | ``double ceil(double x)`` + | Returns ceiling of :math:`x`. + - ✓ + - ✓ + + * - | ``double copysign(double x, double y)`` + | Create value with given magnitude, copying sign of second value. + - ✓ + - ✓ + + * - | ``double cos(double x)`` + | Returns the cosine of :math:`x`. + - ✓ + - ✓ + + * - | ``double cosh(double x)`` + | Returns the hyperbolic cosine of :math:`x`. 
+ - ✓ + - ✓ + + * - | ``double cospi(double x)`` + | Returns the cosine of :math:`\pi \cdot x`. + - ✓ + - ✓ + + * - | ``double cyl_bessel_i0(double x)`` + | Returns the value of the regular modified cylindrical Bessel function of order 0 for :math:`x`. + - ✗ + - ✗ + + * - | ``double cyl_bessel_i1(double x)`` + | Returns the value of the regular modified cylindrical Bessel function of order 1 for :math:`x`. + - ✗ + - ✗ + + * - | ``double erf(double x)`` + | Returns the error function of :math:`x`. + - ✓ + - ✓ + + * - | ``double erfc(double x)`` + | Returns the complementary error function of :math:`x`. + - ✓ + - ✓ + + * - | ``double erfcinv(double x)`` + | Returns the inverse complementary error function of :math:`x`. + - ✓ + - ✓ + + * - | ``double erfcx(double x)`` + | Returns the scaled complementary error function of :math:`x`. + - ✓ + - ✓ + + * - | ``double erfinv(double x)`` + | Returns the inverse error function of :math:`x`. + - ✓ + - ✓ + + * - | ``double exp(double x)`` + | Returns :math:`e^x`. + - ✓ + - ✓ + + * - | ``double exp10(double x)`` + | Returns :math:`10^x`. + - ✓ + - ✓ + + * - | ``double exp2(double x)`` + | Returns :math:`2^x`. + - ✓ + - ✓ + + * - | ``double expm1(double x)`` + | Returns :math:`e^x - 1`. + - ✓ + - ✓ + + * - | ``double fabs(double x)`` + | Returns the absolute value of :math:`x`. + - ✓ + - ✓ + + * - | ``double fdim(double x, double y)`` + | Returns the positive difference between :math:`x` and :math:`y`. + - ✓ + - ✓ + + * - | ``double floor(double x)`` + | Returns the largest integer less than or equal to :math:`x`. + - ✓ + - ✓ + + * - | ``double fma(double x, double y, double z)`` + | Returns :math:`x \cdot y + z` as a single operation. + - ✓ + - ✓ + + * - | ``double fmax(double x, double y)`` + | Determine the maximum numeric value of :math:`x` and :math:`y`. + - ✓ + - ✓ + + * - | ``double fmin(double x, double y)`` + | Determine the minimum numeric value of :math:`x` and :math:`y`.
+ - ✓ + - ✓ + + * - | ``double fmod(double x, double y)`` + | Returns the floating-point remainder of :math:`x / y`. + - ✓ + - ✓ + + * - | ``double modf(double x, double* iptr)`` + | Break down :math:`x` into fractional and integral parts. + - ✓ + - ✗ + + * - | ``double frexp(double x, int* nptr)`` + | Extract mantissa and exponent of :math:`x`. + - ✓ + - ✗ + + * - | ``double hypot(double x, double y)`` + | Returns the square root of the sum of squares of :math:`x` and :math:`y`. + - ✓ + - ✓ + + * - | ``int ilogb(double x)`` + | Returns the unbiased integer exponent of :math:`x`. + - ✓ + - ✓ + + * - | ``bool isfinite(double x)`` + | Determine whether :math:`x` is finite. + - ✓ + - ✓ + + * - | ``bool isinf(double x)`` + | Determine whether :math:`x` is infinite. + - ✓ + - ✓ + + * - | ``bool isnan(double x)`` + | Determine whether :math:`x` is a ``NAN``. + - ✓ + - ✓ + + * - | ``double j0(double x)`` + | Returns the value of the Bessel function of the first kind of order 0 for :math:`x`. + - ✓ + - ✓ + + * - | ``double j1(double x)`` + | Returns the value of the Bessel function of the first kind of order 1 for :math:`x`. + - ✓ + - ✓ + + * - | ``double jn(int n, double x)`` + | Returns the value of the Bessel function of the first kind of order n for :math:`x`. + - ✓ + - ✓ + + * - | ``double ldexp(double x, int exp)`` + | Returns :math:`x \cdot 2^{exp}`. + - ✓ + - ✓ + + * - | ``double lgamma(double x)`` + | Returns the natural logarithm of the absolute value of the gamma function of :math:`x`. + - ✓ + - ✗ + + * - | ``long int lrint(double x)`` + | Round :math:`x` to nearest integer value. + - ✓ + - ✓ + + * - | ``long long int llrint(double x)`` + | Round :math:`x` to nearest integer value. + - ✓ + - ✓ + + * - | ``long int lround(double x)`` + | Round to nearest integer value. + - ✓ + - ✓ + + * - | ``long long int llround(double x)`` + | Round to nearest integer value.
+ - ✓ + - ✓ + + * - | ``double log10(double x)`` + | Returns the base 10 logarithm of :math:`x`. + - ✓ + - ✓ + + * - | ``double log1p(double x)`` + | Returns the natural logarithm of :math:`x + 1`. + - ✓ + - ✓ + + * - | ``double log2(double x)`` + | Returns the base 2 logarithm of :math:`x`. + - ✓ + - ✓ + + * - | ``double log(double x)`` + | Returns the natural logarithm of :math:`x`. + - ✓ + - ✓ + + * - | ``double logb(double x)`` + | Returns the floating point representation of the exponent of :math:`x`. + - ✓ + - ✓ + + * - | ``double nan(const char* tagp)`` + | Returns "Not a Number" value. + - ✗ + - ✓ + + * - | ``double nearbyint(double x)`` + | Round :math:`x` to the nearest integer. + - ✓ + - ✓ + + * - | ``double nextafter(double x, double y)`` + | Returns next representable double-precision floating-point value after argument. + - ✓ + - ✓ + + * - | ``double norm3d(double x, double y, double z)`` + | Returns the square root of the sum of squares of :math:`x`, :math:`y` and :math:`z`. + - ✓ + - ✓ + + * - | ``double norm4d(double x, double y, double z, double w)`` + | Returns the square root of the sum of squares of :math:`x`, :math:`y`, :math:`z` and :math:`w`. + - ✓ + - ✓ + + * - | ``double normcdf(double y)`` + | Returns the standard normal cumulative distribution function. + - ✓ + - ✓ + + * - | ``double normcdfinv(double y)`` + | Returns the inverse of the standard normal cumulative distribution function. + - ✓ + - ✓ + + * - | ``double norm(int dim, const double *a)`` + | Returns the square root of the sum of squares of any number of coordinates. + - ✓ + - ✓ + + * - | ``double pow(double x, double y)`` + | Returns :math:`x^y`. + - ✓ + - ✓ + + * - | ``double powi(double base, int iexp)`` + | Returns the value of first argument to the power of second argument. + - ✓ + - ✓ + + * - | ``double remainder(double x, double y)`` + | Returns double-precision floating-point remainder. 
+ - ✓ + - ✓ + + * - | ``double remquo(double x, double y, int* quo)`` + | Returns double-precision floating-point remainder and part of quotient. + - ✓ + - ✗ + + * - | ``double round(double x)`` + | Round to nearest integer value in floating-point. + - ✓ + - ✓ + + * - | ``double rcbrt(double x)`` + | Returns the reciprocal cube root function. + - ✓ + - ✓ + + * - | ``double rhypot(double x, double y)`` + | Returns one over the square root of the sum of squares of two arguments. + - ✓ + - ✓ + + * - | ``double rint(double x)`` + | Round input to nearest integer value in floating-point. + - ✓ + - ✓ + + * - | ``double rnorm3d(double x, double y, double z)`` + | Returns one over the square root of the sum of squares of three coordinates of the argument. + - ✓ + - ✓ + + * - | ``double rnorm4d(double x, double y, double z, double w)`` + | Returns one over the square root of the sum of squares of four coordinates of the argument. + - ✓ + - ✓ + + * - | ``double rnorm(int dim, const double *a)`` + | Returns the reciprocal of square root of the sum of squares of any number of coordinates. + - ✓ + - ✓ + + * - | ``double scalbln(double x, long int n)`` + | Scale :math:`x` by :math:`2^n`. + - ✓ + - ✓ + + * - | ``double scalbn(double x, int n)`` + | Scale :math:`x` by :math:`2^n`. + - ✓ + - ✓ + + * - | ``bool signbit(double x)`` + | Return the sign bit of :math:`x`. + - ✓ + - ✓ + + * - | ``double sin(double x)`` + | Returns the sine of :math:`x`. + - ✓ + - ✓ + + * - | ``double sinh(double x)`` + | Returns the hyperbolic sine of :math:`x`. + - ✓ + - ✓ + + * - | ``double sinpi(double x)`` + | Returns the sine of :math:`\pi \cdot x`. + - ✓ + - ✓ + + * - | ``void sincos(double x, double *sptr, double *cptr)`` + | Returns the sine and cosine of :math:`x`. + - ✓ + - ✓ + + * - | ``void sincospi(double x, double *sptr, double *cptr)`` + | Returns the sine and cosine of :math:`\pi \cdot x`. + - ✓ + - ✓ + + * - | ``double sqrt(double x)`` + | Returns the square root of :math:`x`.
+ - ✓ + - ✓ + + * - | ``double rsqrt(double x)`` + | Returns the reciprocal of the square root of :math:`x`. + - ✗ + - ✓ + + * - | ``double tan(double x)`` + | Returns the tangent of :math:`x`. + - ✓ + - ✓ + + * - | ``double tanh(double x)`` + | Returns the hyperbolic tangent of :math:`x`. + - ✓ + - ✓ + + * - | ``double tgamma(double x)`` + | Returns the gamma function of :math:`x`. + - ✓ + - ✓ + + * - | ``double trunc(double x)`` + | Truncate :math:`x` to the integral part. + - ✓ + - ✓ + + * - | ``double y0(double x)`` + | Returns the value of the Bessel function of the second kind of order 0 for :math:`x`. + - ✓ + - ✓ + + * - | ``double y1(double x)`` + | Returns the value of the Bessel function of the second kind of order 1 for :math:`x`. + - ✓ + - ✓ + + * - | ``double yn(int n, double x)`` + | Returns the value of the Bessel function of the second kind of order n for :math:`x`. + - ✓ + - ✓ + +Integer intrinsics +================== + +Following is the list of supported integer intrinsics. Note that intrinsics are supported on device only. + +.. list-table:: Integer intrinsics mathematical functions + + * - **Function** + + * - | ``unsigned int __brev(unsigned int x)`` + | Reverse the bit order of a 32 bit unsigned integer. + + * - | ``unsigned long long int __brevll(unsigned long long int x)`` + | Reverse the bit order of a 64 bit unsigned integer. + + * - | ``unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int z)`` + | Return selected bytes from two 32-bit unsigned integers. + + * - | ``unsigned int __clz(int x)`` + | Return the number of consecutive high-order zero bits in 32 bit integer. + + * - | ``unsigned int __clzll(long long int x)`` + | Return the number of consecutive high-order zero bits in 64 bit integer. + + * - | ``unsigned int __ffs(int x)`` + | Find the position of least significant bit set to 1 in a 32 bit integer. 
+ + * - | ``unsigned int __ffsll(long long int x)`` + | Find the position of least significant bit set to 1 in a 64 bit signed integer. + + * - | ``unsigned int __fns32(unsigned long long mask, unsigned int base, int offset)`` + | Find the position of the n-th set to 1 bit in a 32-bit integer. + + * - | ``unsigned int __fns64(unsigned long long int mask, unsigned int base, int offset)`` + | Find the position of the n-th set to 1 bit in a 64-bit integer. + + * - | ``unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)`` + | Concatenate :math:`hi` and :math:`lo`, shift left by shift & 31 bits, return the most significant 32 bits. + + * - | ``unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)`` + | Concatenate :math:`hi` and :math:`lo`, shift left by min(shift, 32) bits, return the most significant 32 bits. + + * - | ``unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)`` + | Concatenate :math:`hi` and :math:`lo`, shift right by shift & 31 bits, return the least significant 32 bits. + + * - | ``unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)`` + | Concatenate :math:`hi` and :math:`lo`, shift right by min(shift, 32) bits, return the least significant 32 bits. + + * - | ``unsigned int __hadd(int x, int y)`` + | Compute average of signed input arguments, avoiding overflow in the intermediate sum. + + * - | ``unsigned int __rhadd(int x, int y)`` + | Compute rounded average of signed input arguments, avoiding overflow in the intermediate sum. + + * - | ``unsigned int __uhadd(int x, int y)`` + | Compute average of unsigned input arguments, avoiding overflow in the intermediate sum. + + * - | ``unsigned int __urhadd (unsigned int x, unsigned int y)`` + | Compute rounded average of unsigned input arguments, avoiding overflow in the intermediate sum. 
+ + * - | ``int __sad(int x, int y, int z)`` + | Returns :math:`|x - y| + z`, the sum of absolute difference. + + * - | ``unsigned int __usad(unsigned int x, unsigned int y, unsigned int z)`` + | Returns :math:`|x - y| + z`, the sum of absolute difference. + + * - | ``unsigned int __popc(unsigned int x)`` + | Count the number of bits that are set to 1 in a 32 bit integer. + + * - | ``unsigned int __popcll(unsigned long long int x)`` + | Count the number of bits that are set to 1 in a 64 bit integer. + + * - | ``int __mul24(int x, int y)`` + | Multiply two 24bit integers. + + * - | ``unsigned int __umul24(unsigned int x, unsigned int y)`` + | Multiply two 24bit unsigned integers. + + * - | ``int __mulhi(int x, int y)`` + | Returns the most significant 32 bits of the product of the two 32-bit integers. + + * - | ``unsigned int __umulhi(unsigned int x, unsigned int y)`` + | Returns the most significant 32 bits of the product of the two 32-bit unsigned integers. + + * - | ``long long int __mul64hi(long long int x, long long int y)`` + | Returns the most significant 64 bits of the product of the two 64-bit integers. + + * - | ``unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y)`` + | Returns the most significant 64 bits of the product of the two 64 unsigned bit integers. + +The HIP-Clang implementation of ``__ffs()`` and ``__ffsll()`` contains code to add a constant +1 to produce the ``ffs`` result format. +For the cases where this overhead is not acceptable and programmer is willing to specialize for the platform, +HIP-Clang provides ``__lastbit_u32_u32(unsigned int input)`` and ``__lastbit_u32_u64(unsigned long long int input)``. +The index returned by ``__lastbit_`` instructions starts at -1, while for ``ffs`` the index starts at 0. + +Floating-point Intrinsics +========================= + +Following is the list of supported floating-point intrinsics. Note that intrinsics are supported on device only. + +.. 
note:: + + Only the nearest even rounding mode supported on AMD GPUs by defaults. The ``_rz``, ``_ru`` and + ``_rd`` suffixed intrinsic functions are existing in HIP AMD backend, if the + ``OCML_BASIC_ROUNDED_OPERATIONS`` macro is defined. + +.. list-table:: Single precision intrinsics mathematical functions + + * - **Function** + + * - | ``float __cosf(float x)`` + | Returns the fast approximate cosine of :math:`x`. + + * - | ``float __exp10f(float x)`` + | Returns the fast approximate for 10 :sup:`x`. + + * - | ``float __expf(float x)`` + | Returns the fast approximate for e :sup:`x`. + + * - | ``float __fadd_rn(float x, float y)`` + | Add two floating-point values in round-to-nearest-even mode. + + * - | ``float __fdiv_rn(float x, float y)`` + | Divide two floating point values in round-to-nearest-even mode. + + * - | ``float __fmaf_rn(float x, float y, float z)`` + | Returns ``x × y + z`` as a single operation in round-to-nearest-even mode. + + * - | ``float __fmul_rn(float x, float y)`` + | Multiply two floating-point values in round-to-nearest-even mode. + + * - | ``float __frcp_rn(float x, float y)`` + | Returns ``1 / x`` in round-to-nearest-even mode. + + * - | ``float __frsqrt_rn(float x)`` + | Returns ``1 / √x`` in round-to-nearest-even mode. + + * - | ``float __fsqrt_rn(float x)`` + | Returns ``√x`` in round-to-nearest-even mode. + + * - | ``float __fsub_rn(float x, float y)`` + | Subtract two floating-point values in round-to-nearest-even mode. + + * - | ``float __log10f(float x)`` + | Returns the fast approximate for base 10 logarithm of :math:`x`. + + * - | ``float __log2f(float x)`` + | Returns the fast approximate for base 2 logarithm of :math:`x`. + + * - | ``float __logf(float x)`` + | Returns the fast approximate for natural logarithm of :math:`x`. + + * - | ``float __powf(float x, float y)`` + | Returns the fast approximate of x :sup:`y`. + + * - | ``float __saturatef(float x)`` + | Clamp :math:`x` to [+0.0, 1.0]. 
+ + * - | ``float __sincosf(float x, float* sinptr, float* cosptr)`` + | Returns the fast approximate of sine and cosine of :math:`x`. + + * - | ``float __sinf(float x)`` + | Returns the fast approximate sine of :math:`x`. + + * - | ``float __tanf(float x)`` + | Returns the fast approximate tangent of :math:`x`. + +.. list-table:: Double precision intrinsics mathematical functions + + * - **Function** + + * - | ``double __dadd_rn(double x, double y)`` + | Add two floating-point values in round-to-nearest-even mode. + + * - | ``double __ddiv_rn(double x, double y)`` + | Divide two floating-point values in round-to-nearest-even mode. + + * - | ``double __dmul_rn(double x, double y)`` + | Multiply two floating-point values in round-to-nearest-even mode. + + * - | ``double __drcp_rn(double x, double y)`` + | Returns ``1 / x`` in round-to-nearest-even mode. + + * - | ``double __dsqrt_rn(double x)`` + | Returns ``√x`` in round-to-nearest-even mode. + + * - | ``double __dsub_rn(double x, double y)`` + | Subtract two floating-point values in round-to-nearest-even mode. + + * - | ``double __fma_rn(double x, double y, double z)`` + | Returns ``x × y + z`` as a single operation in round-to-nearest-even mode. 
diff --git a/projects/hip/docs/reference/terms.md b/projects/hip/docs/reference/terms.md index 4d4be12296..ea2b9d96ab 100644 --- a/projects/hip/docs/reference/terms.md +++ b/projects/hip/docs/reference/terms.md @@ -1,4 +1,4 @@ -# Table Comparing Syntax for Different Compute APIs +# Table comparing syntax for different compute APIs |Term|CUDA|HIP|OpenCL| |---|---|---|---| diff --git a/projects/hip/docs/reference/virtual_rocr.rst b/projects/hip/docs/reference/virtual_rocr.rst index 8241fa07ef..444882fc7e 100644 --- a/projects/hip/docs/reference/virtual_rocr.rst +++ b/projects/hip/docs/reference/virtual_rocr.rst @@ -5,7 +5,7 @@ :keywords: AMD, ROCm, HIP, HSA, ROCR runtime, virtual memory management ******************************************************************************* -HSA Runtime API for ROCm +HSA runtime API for ROCm ******************************************************************************* The following functions are located in the https://github.com/ROCm/ROCR-Runtime repository. 
diff --git a/projects/hip/docs/sphinx/_toc.yml.in b/projects/hip/docs/sphinx/_toc.yml.in index 5a1d46757f..56abaee03f 100644 --- a/projects/hip/docs/sphinx/_toc.yml.in +++ b/projects/hip/docs/sphinx/_toc.yml.in @@ -16,9 +16,10 @@ subtrees: - caption: Conceptual entries: - file: understand/programming_model - - file: understand/programming_model_reference - file: understand/hardware_implementation - file: understand/amd_clr + - file: understand/texture_fetching + title: Texture fetching - caption: How to entries: @@ -29,16 +30,68 @@ subtrees: - file: how-to/performance_guidelines - file: how-to/debugging - file: how-to/logging + - file: how-to/cooperative_groups + - file: how-to/unified_memory + title: Unified memory + - file: how-to/virtual_memory + title: Virtual memory + - file: how-to/stream_ordered_allocator + - file: how-to/hipgraph + title: HIP graphs - file: how-to/faq - caption: Reference entries: - - file: doxygen/html/index - - file: reference/kernel_language - title: C++ language extensions - - file: reference/terms - title: Comparing Syntax for different APIs + - file: reference/hip_runtime_api_reference + subtrees: + - entries: + - file: reference/hip_runtime_api/modules + subtrees: + - entries: + - file: reference/hip_runtime_api/modules/initialization_and_version + - file: reference/hip_runtime_api/modules/device_management + - file: reference/hip_runtime_api/modules/execution_control + - file: reference/hip_runtime_api/modules/error_handling + - file: reference/hip_runtime_api/modules/stream_management + - file: reference/hip_runtime_api/modules/stream_memory_operations + - file: reference/hip_runtime_api/modules/event_management + - file: reference/hip_runtime_api/modules/memory_management + subtrees: + - entries: + - file: reference/hip_runtime_api/modules/memory_management/memory_management_deprecated + - file: reference/hip_runtime_api/modules/memory_management/external_resource_interoperability + - file: 
reference/hip_runtime_api/modules/memory_management/stream_ordered_memory_allocator + - file: reference/hip_runtime_api/modules/memory_management/unified_memory_reference + - file: reference/hip_runtime_api/modules/memory_management/virtual_memory_reference + - file: reference/hip_runtime_api/modules/memory_management/texture_management + - file: reference/hip_runtime_api/modules/memory_management/texture_management_deprecated + - file: reference/hip_runtime_api/modules/memory_management/surface_object + - file: reference/hip_runtime_api/modules/peer_to_peer_device_memory_access + - file: reference/hip_runtime_api/modules/context_management + - file: reference/hip_runtime_api/modules/module_management + - file: reference/hip_runtime_api/modules/occupancy + - file: reference/hip_runtime_api/modules/profiler_control + - file: reference/hip_runtime_api/modules/launch_api + - file: reference/hip_runtime_api/modules/runtime_compilation + - file: reference/hip_runtime_api/modules/callback_activity_apis + - file: reference/hip_runtime_api/modules/graph_management + - file: reference/hip_runtime_api/modules/opengl_interoperability + - file: reference/hip_runtime_api/modules/cooperative_groups_reference + - file: reference/hip_runtime_api/global_defines_enums_structs_files + subtrees: + - entries: + - file: reference/hip_runtime_api/global_defines_enums_structs_files/global_enum_and_defines + - file: reference/hip_runtime_api/global_defines_enums_structs_files/driver_types + - file: doxygen/html/annotated + - file: doxygen/html/files - file: reference/virtual_rocr + - file: reference/cpp_language_extensions + title: C++ language extensions + - file: reference/cpp_language_support + title: C++ language support + - file: reference/math_api + - file: reference/terms + title: Comparing syntax for different APIs - file: reference/deprecated_api_list title: List of deprecated APIs - file: reference/fp8_numbers @@ -46,10 +99,15 @@ subtrees: - caption: Tutorials entries: + - url: 
https://github.com/ROCm/rocm-examples/tree/develop/HIP-Basic + title: HIP basic examples - url: https://github.com/ROCm/HIP-Examples title: HIP examples - url: https://github.com/ROCm/hip-tests/tree/develop/samples title: HIP test samples + - file: tutorial/saxpy + - file: tutorial/reduction + - file: tutorial/cooperative_groups_tutorial - caption: About entries: diff --git a/projects/hip/docs/sphinx/requirements.in b/projects/hip/docs/sphinx/requirements.in index 8d22b2d9da..0dd2727603 100644 --- a/projects/hip/docs/sphinx/requirements.in +++ b/projects/hip/docs/sphinx/requirements.in @@ -1,2 +1,2 @@ -rocm-docs-core[api_reference]==1.1.1 +rocm-docs-core[api_reference]==1.7.2 sphinxcontrib.doxylink diff --git a/projects/hip/docs/sphinx/requirements.txt b/projects/hip/docs/sphinx/requirements.txt index dbe8cdca79..d9978e5045 100644 --- a/projects/hip/docs/sphinx/requirements.txt +++ b/projects/hip/docs/sphinx/requirements.txt @@ -4,11 +4,11 @@ # # pip-compile requirements.in # -accessible-pygments==0.0.4 +accessible-pygments==0.0.5 # via pydata-sphinx-theme -alabaster==0.7.16 +alabaster==1.0.0 # via sphinx -babel==2.14.0 +babel==2.16.0 # via # pydata-sphinx-theme # sphinx @@ -16,9 +16,9 @@ beautifulsoup4==4.12.3 # via pydata-sphinx-theme breathe==4.35.0 # via rocm-docs-core -certifi==2024.2.2 +certifi==2024.8.30 # via requests -cffi==1.16.0 +cffi==1.17.1 # via # cryptography # pynacl @@ -31,7 +31,7 @@ click==8.1.7 # sphinx-external-toc click-log==0.4.0 # via doxysphinx -cryptography==42.0.5 +cryptography==43.0.1 # via pyjwt deprecated==1.2.14 # via pygithub @@ -41,19 +41,19 @@ docutils==0.21.2 # myst-parser # pydata-sphinx-theme # sphinx -doxysphinx==3.3.7 +doxysphinx==3.3.10 # via rocm-docs-core -fastjsonschema==2.19.1 +fastjsonschema==2.20.0 # via rocm-docs-core gitdb==4.0.11 # via gitpython gitpython==3.1.43 # via rocm-docs-core -idna==3.7 +idna==3.8 # via requests imagesize==1.4.1 # via sphinx -jinja2==3.1.3 +jinja2==3.1.4 # via # myst-parser # sphinx @@ -67,27 
+67,29 @@ markdown-it-py==3.0.0 # myst-parser markupsafe==2.1.5 # via jinja2 -mdit-py-plugins==0.4.0 +mdit-py-plugins==0.4.1 # via myst-parser mdurl==0.1.2 # via markdown-it-py -mpire==2.10.1 +mpire==2.10.2 # via doxysphinx -myst-parser==3.0.0 +myst-parser==4.0.0 # via rocm-docs-core -packaging==24.0 +numpy==1.26.4 + # via doxysphinx +packaging==24.1 # via # pydata-sphinx-theme # sphinx pycparser==2.22 # via cffi -pydata-sphinx-theme==0.15.2 +pydata-sphinx-theme==0.15.4 # via # rocm-docs-core # sphinx-book-theme -pygithub==2.3.0 +pygithub==2.4.0 # via rocm-docs-core -pygments==2.17.2 +pygments==2.18.0 # via # accessible-pygments # mpire @@ -95,26 +97,26 @@ pygments==2.17.2 # sphinx pyjson5==1.6.6 # via doxysphinx -pyjwt[crypto]==2.8.0 +pyjwt[crypto]==2.9.0 # via pygithub pynacl==1.5.0 # via pygithub -pyparsing==3.1.2 +pyparsing==3.1.4 # via # doxysphinx # sphinxcontrib-doxylink python-dateutil==2.9.0.post0 # via sphinxcontrib-doxylink -pyyaml==6.0.1 +pyyaml==6.0.2 # via # myst-parser # rocm-docs-core # sphinx-external-toc -requests==2.31.0 +requests==2.32.3 # via # pygithub # sphinx -rocm-docs-core[api-reference]==1.1.1 +rocm-docs-core[api-reference]==1.7.2 # via -r requirements.in six==1.16.0 # via python-dateutil @@ -122,9 +124,9 @@ smmap==5.0.1 # via gitdb snowballstemmer==2.2.0 # via sphinx -soupsieve==2.5 +soupsieve==2.6 # via beautifulsoup4 -sphinx==7.3.7 +sphinx==8.0.2 # via # breathe # myst-parser @@ -136,39 +138,39 @@ sphinx==7.3.7 # sphinx-external-toc # sphinx-notfound-page # sphinxcontrib-doxylink -sphinx-book-theme==1.1.2 +sphinx-book-theme==1.1.3 # via rocm-docs-core sphinx-copybutton==0.5.2 # via rocm-docs-core -sphinx-design==0.5.0 +sphinx-design==0.6.1 # via rocm-docs-core sphinx-external-toc==1.0.1 # via rocm-docs-core -sphinx-notfound-page==1.0.0 +sphinx-notfound-page==1.0.4 # via rocm-docs-core -sphinxcontrib-applehelp==1.0.8 +sphinxcontrib-applehelp==2.0.0 # via sphinx -sphinxcontrib-devhelp==1.0.6 +sphinxcontrib-devhelp==2.0.0 # via sphinx 
sphinxcontrib-doxylink==1.12.3 # via -r requirements.in -sphinxcontrib-htmlhelp==2.0.5 +sphinxcontrib-htmlhelp==2.1.0 # via sphinx sphinxcontrib-jsmath==1.0.1 # via sphinx -sphinxcontrib-qthelp==1.0.7 +sphinxcontrib-qthelp==2.0.0 # via sphinx -sphinxcontrib-serializinghtml==1.1.10 +sphinxcontrib-serializinghtml==2.0.0 # via sphinx tomli==2.0.1 # via sphinx -tqdm==4.66.2 +tqdm==4.66.5 # via mpire -typing-extensions==4.11.0 +typing-extensions==4.12.2 # via # pydata-sphinx-theme # pygithub -urllib3==2.2.1 +urllib3==2.2.2 # via # pygithub # requests diff --git a/projects/hip/docs/tutorial/cooperative_groups_tutorial.rst b/projects/hip/docs/tutorial/cooperative_groups_tutorial.rst new file mode 100644 index 0000000000..2782391b1b --- /dev/null +++ b/projects/hip/docs/tutorial/cooperative_groups_tutorial.rst @@ -0,0 +1,240 @@ +.. meta:: + :description: HIP cooperative groups tutorial + :keywords: AMD, ROCm, HIP, cooperative groups, tutorial + +******************************************************************************* +Cooperative groups +******************************************************************************* + +This tutorial demonstrates the basic concepts of cooperative groups in the HIP (Heterogeneous-computing Interface for Portability) programming model and the most essential tooling supporting it. This topic also reviews the commonalities of heterogeneous APIs. Familiarity with the C/C++ compilation model and the language is assumed. + +Prerequisites +============= + +To follow this tutorial, you'll need properly installed drivers and a HIP compiler toolchain to compile your code. Because ROCm HIP supports compiling and running on Linux and Microsoft Windows with AMD and NVIDIA GPUs, review the HIP development package installation before starting this tutorial. For more information, see :doc:`/install/install`. 
+ +Simple HIP Code +=============== + +To become familiar with heterogeneous programming, review the :doc:`SAXPY tutorial ` and the first HIP code subsection. Compiling is also described in that tutorial. + +Tiled partition +=============== + +You can use tiled partition to calculate the sum of ``partition_size`` length sequences and the sum of ``result_size``/ ``BlockSize`` length sequences. The host-side reference implementation is the following: + +.. code-block:: cpp + + // Host-side function to perform the same reductions as executed on the GPU + std::vector ref_reduced(const unsigned int partition_size, + std::vector input) + { + const unsigned int input_size = input.size(); + const unsigned int result_size = input_size / partition_size; + std::vector result(result_size); + + for(unsigned int i = 0; i < result_size; i++) + { + unsigned int partition_result = 0; + for(unsigned int j = 0; j < partition_size; j++) + { + partition_result += input[partition_size * i + j]; + } + result[i] = partition_result; + } + + return result; + } + +Device-side code +---------------- + +To calculate the sum of the sets of numbers, the tutorial uses the shared memory-based reduction on the device side. The warp level intrinsics usage is not covered in this tutorial, unlike in the :doc:`reduction tutorial. ` ``x`` input variable is a shared pointer, which needs to be synchronized after every value change. The ``thread_group`` input parameter can be ``thread_block_tile`` or ``thread_block`` because the ``thread_group`` is the parent class of these types. The ``val`` are the numbers to calculate the sum of. The returned results of this function return the final results of the reduction on thread ID 0 of the ``thread_group``, and for every other thread, the function results are 0. + +.. 
code-block:: cuda + + /// \brief Summation of `unsigned int val`'s in `thread_group g` using shared memory `x` + __device__ unsigned int reduce_sum(thread_group g, unsigned int* x, unsigned int val) + { + // Rank of this thread in the group + const unsigned int group_thread_id = g.thread_rank(); + + // We start with half the group size as active threads + // Every iteration the number of active threads halves, until we processed all values + for(unsigned int i = g.size() / 2; i > 0; i /= 2) + { + // Store value for this thread in a shared, temporary array + x[group_thread_id] = val; + + // Synchronize all threads in the group + g.sync(); + + // If our thread is still active, sum with its counterpart in the other half + if(group_thread_id < i) + { + val += x[group_thread_id + i]; + } + + // Synchronize all threads in the group + g.sync(); + } + + // Only the first thread returns a valid value + if(g.thread_rank() == 0) + return val; + else + return 0; + } + +The ``reduce_sum`` device function is reused to calculate the block and custom +partition sum of the input numbers. The kernel has three sections: + +1. Initialization of the reduction function variables. +2. The reduction of thread block and store the results in global memory. +3. The reduction of custom partition and store the results in global memory. + +1. Initialization of the reduction function variables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this code section, the shared memory is declared, the thread_block_group and +custom_partition are defined, and the input variables are loaded from global +memory. + +.. code-block:: cuda + + // threadBlockGroup consists of all threads in the block + thread_block thread_block_group = this_thread_block(); + + // Workspace array in shared memory required for reduction + __shared__ unsigned int workspace[2048]; + + unsigned int output; + + // Input to reduce + const unsigned int input = d_vector[thread_block_group.thread_rank()]; + + // ... 
+ + // Every custom_partition group consists of 16 threads + thread_block_tile custom_partition + = tiled_partition(thread_block_group); + + +2. The reduction of thread block +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this code section, the sum is calculated on ``thread_block_group`` level, then the results are stored in global memory. + +.. code-block:: cuda + + // Perform reduction + output = reduce_sum(thread_block_group, workspace, input); + + // Only the first thread returns a valid value + if(thread_block_group.thread_rank() == 0) + { + d_block_reduced_vector[0] = output; + } + +3. The reduction of custom partition +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this code section, the sum is calculated on the custom partition level, then the results are stored in global memory. The custom partition is a partial block of the thread block, it means the reduction calculates on a shorter sequence of input numbers than at the ``thread_block_group`` case. + +.. code-block:: cuda + + // Perform reduction + output = reduce_sum(custom_partition, &workspace[group_offset], input); + + // Only the first thread in each partition returns a valid value + if(custom_partition.thread_rank() == 0) + { + const unsigned int partition_id = thread_block_group.thread_rank() / PartitionSize; + d_partition_reduced_vector[partition_id] = output; + } + +Host-side code +-------------- + +On the host-side, the following steps are done in the example: + +1. Confirm the cooperative group support on AMD GPUs. +2. Initialize the cooperative group configuration. +3. Allocate and copy input to global memory. +4. Launch the cooperative kernel. +5. Save the results from global memory. +6. Free the global memory. + +Only the first, second and fourth steps are important from the cooperative groups aspect, that's why those steps are detailed further. + +1. Confirm the cooperative group support on AMD GPUs +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Not all AMD GPUs support cooperative groups. 
You can confirm support with the following code: + +.. code-block:: cpp + + #ifdef __HIP_PLATFORM_AMD__ + int device = 0; + int supports_coop_launch = 0; + // Check support + // Use hipDeviceAttributeCooperativeMultiDeviceLaunch when launching across multiple devices + HIP_CHECK(hipGetDevice(&device)); + HIP_CHECK( + hipDeviceGetAttribute(&supports_coop_launch, hipDeviceAttributeCooperativeLaunch, device)); + if(!supports_coop_launch) + { + std::cout << "Skipping, device " << device << " does not support cooperative groups" + << std::endl; + return 0; + } + #endif + +2. Initialize the cooperative group configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In the example, there is only one block in the grid, and the ``threads_per_block`` must be dividable with ``partition_size``. + +.. code-block:: cpp + + // Number of blocks to launch. + constexpr unsigned int num_blocks = 1; + + // Number of threads in each kernel block. + constexpr unsigned int threads_per_block = 64; + + // Total element count of the input vector. + constexpr unsigned int size = num_blocks * threads_per_block; + + // Total elements count of a tiled_partition. + constexpr unsigned int partition_size = 16; + + // Total size (in bytes) of the input vector. + constexpr size_t size_bytes = sizeof(unsigned int) * size; + + static_assert(threads_per_block % partition_size == 0, + "threads_per_block must be a multiple of partition_size"); + +4. Launch the kernel +~~~~~~~~~~~~~~~~~~~~ + +The kernel launch is done with the ``hipLaunchCooperativeKernel`` of the cooperative groups API. + +.. code-block:: cpp + + void* params[] = {&d_vector, &d_block_reduced, &d_partition_reduced}; + // Launching kernel from host. + HIP_CHECK(hipLaunchCooperativeKernel(vector_reduce_kernel, + dim3(num_blocks), + dim3(threads_per_block), + params, + 0, + hipStreamDefault));\ + + // Check if the kernel launch was successful. 
+ HIP_CHECK(hipGetLastError()); + +Conclusion +========== + +With cooperative groups, you can easily use custom partitions to create custom tiles for custom solutions. You can find the complete code at `cooperative groups ROCm example. `_ diff --git a/projects/hip/docs/tutorial/reduction.rst b/projects/hip/docs/tutorial/reduction.rst new file mode 100644 index 0000000000..83ada10962 --- /dev/null +++ b/projects/hip/docs/tutorial/reduction.rst @@ -0,0 +1,722 @@ +.. meta:: + :description: HIP reduction tutorial + :keywords: AMD, ROCm, HIP, reduction, tutorial + +************************************************************* +Reduction +************************************************************* + +Reduction is a common algorithmic operation used in parallel programming to reduce an array of elements into a shorter array of elements or a single value. This document exploits reduction to introduce some key considerations while designing and optimizing GPU algorithms. + +This document is a rejuvenation and extension of the invaluable `work of Mark Harris `_. While the author approaches the topic with a less naive approach, reviewing some original material is valuable to see how much the underlying hardware has changed. This document provides a greater insight to demonstrate progress. + +The algorithm +============= + +Reduction has many names depending on the domain; in functional programming it's referred to as `fold `_, in C++, it's called ``std::accumulate`` and in C++17, as ``std::reduce``. A reduction takes a range of inputs and "reduces" the given range with a binary operation to a singular or scalar output. Canonically, a reduction requires a "zero" element that bootstraps the algorithm and serves as one of the initial operands to the binary operation. The "zero" element is generally called `identity or neutral `_ element in the group theory, which implies that it is an operand that doesn't change the result. 
Some typical use cases are calculating a sum, normalizing a dataset, and finding the maximum value in a dataset.
A tree-like reduction is then performed in shared memory, after which a single thread writes the block's partial result to global memory, in a location unique to the block, which allows the block to make independent progress.
code-block:: C++ + + // Initialize host-side storage + std::vector input(input_count); + std::iota(input.begin(), input.end(), 0); + + // Initialize device-side storage + unsigned *front, + *back; + hipMalloc((void**)&front, sizeof(unsigned) * input_count); + hipMalloc((void**)&back, sizeof(unsigned) * new_size(input_count)); + + hipMemcpy(front, input.data(), input.size() * sizeof(unsigned), hipMemcpyHostToDevice); + +Data is initialized on the host and dispatched to the device followed by the commencement of device-side reduction. The swapping of the double-buffer on the last iteration is omitted, therefore the result is in the back-buffer irrespective of the input size. + +.. code-block:: C++ + + for (uint32_t curr = input_count; curr > 1;) + { + hipLaunchKernelGGL( + kernel, + dim3(new_size(curr)), + dim3(block_size), + factor * sizeof(unsigned), + hipStreamDefault, + front, + back, + kernel_op, + zero_elem, + curr); + + curr = new_size(curr); + if (curr > 1) + std::swap(front, back); + } + + +This structure persists in the kernel throughout all the variations of reduction with slight modifications to ``factor`` and shared memory allocation: + +.. code-block:: C++ + + template + __global__ void kernel( + T* front, + T* back, + F op, + T zero_elem, + uint32_t front_size) + { + extern __shared__ T shared[]; + + // Overindex-safe read of input + auto read_global_safe = [&](const uint32_t i) + { + return i < front_size ? 
front[i] : zero_elem; + }; + + const uint32_t tid = threadIdx.x, + bid = blockIdx.x, + gid = bid * blockDim.x + tid; + + // Read input from front buffer to shared + shared[tid] = read_global_safe(gid); + __syncthreads(); + + // Shared reduction + for (uint32_t i = 1; i < blockDim.x; i *= 2) + { + if (tid % (2 * i) == 0) + shared[tid] = op(shared[tid], shared[tid + i]); + __syncthreads(); + } + + // Write result from shared to back buffer + if (tid == 0) + back[bid] = shared[0]; + } + +While the ``tid % (2 * i) == 0`` indexing scheme yields correct results, it also leads to high thread divergence. Thread divergence indicates the event when the threads in a warp diverge, which implies that the threads have to execute different instructions in a given clock cycle. This is easily manifested using ``if-else`` statements as shown here, but can also be manifested as ``for`` loop dependent on thread ID lengths. Even though the number of active threads participating in the reduction reduces, warps remain active longer than necessary, as at least one lane in a warp hits the ``if`` statement. + +Reducing thread divergence +-------------------------- + +You can reduce divergence by keeping dataflow between memory addresses identical but reassigning the thread ids. + +.. figure:: ../data/tutorial/reduction/reduced_divergence_reduction.svg + :alt: Diagram demonstrating reduced divergence reduction + +.. code-block:: diff + :emphasize-lines: 4-7 + + // Shared reduction + for (uint32_t i = 1; i < blockDim.x; i *= 2) + { + - if (tid % (2 * i) == 0) + - shared[tid] = op(shared[tid], shared[tid + i]); + + if (uint32_t j = 2 * i * tid; j < blockDim.x) + + shared[j] = op(shared[j], shared[j + i]); + __syncthreads(); + } + +This way inactive threads start accumulating uniformly towards the higher thread ID index range and might uniformly skip to ``__syncthreads()``. However, this introduces a bank conflicts issue. 
+ +Resolving bank conflicts +------------------------ + +Both AMD and NVIDIA implement shared memory in the hardware by organizing storage into banks of various sizes. This hardware element is known as Local Data Share (LDS) on AMD hardware. On NVIDIA hardware, it's implemented using the same silicon as the L1 data cache. You can think of shared memory as a striped 2-dimensional range of memory. Shared memory bank's count, width, and depth depend on the architecture. A bank conflict occurs when different threads in a warp access the same bank during the same operation. In this case, the hardware prevents the attempted concurrent accesses to the same bank by converting them into serial accesses. + +- `"AMD Instinct MI200" Instruction Set Architecture, Chapter 11.1 `_ +- `"RDNA 2" Instruction Set Architecture, Chapter 10.1 `_ + +A notable exception is when the shared read uniformly broadcasts to the same address across the entire warp. A better implementation of the naive algorithm is to form continuous ranges of the threads activities and their memory accesses. + +.. code-block:: diff + :emphasize-lines: 2-7 + + // Shared reduction + -for (uint32_t i = 1; i < blockDim.x; i *= 2) + -{ + - if (tid % (2 * i) == 0) + +for (uint32_t i = blockDim.x / 2; i != 0; i /= 2) + +{ + + if (tid < i) + shared[tid] = op(shared[tid], shared[tid + i]); + __syncthreads(); + } + +.. figure:: ../data/tutorial/reduction/conflict_free_reduction.svg + :alt: Diagram demonstrating bank conflict free reduction + +.. note:: + + To avoid bank conflicts, read shared memory in a coalesced manner, which implies that reads/writes of each lane in a warp evaluate to consecutive locations. Analyzing the read/write patterns could help you to understand the cause of bank conflicts. For more details, check `CDNA3 ISA `_ or `RDNA3 ISA `_ data share operations chapter. 
+ +Utilize upper half of the block +------------------------------- + +The preceding implementation is free of low-level GPU-specific anti-patterns. However, it still exhibits some common shortcomings. The loop performing the reduction in the shared memory starts from ``i = blockDim.x / 2`` and the first predicate ``if (tid < i)`` immediately disables half of the block, which only helps load the data into the shared memory. You can change the kernel along with the calculation of ``factor`` on the host, as shown here: + +.. code-block:: diff + :emphasize-lines: 3,4 + + const uint32_t tid = threadIdx.x, + bid = blockIdx.x, + - gid = bid * blockDim.x + tid; + + gid = bid * (blockDim.x * 2) + tid; + + // Read input from front buffer to shared + -shared[tid] = read_global_safe(gid); + +shared[tid] = op(read_global_safe(gid), read_global_safe(gid + blockDim.x)); + __syncthreads(); + +By eliminating half of the threads and giving meaningful work to all the threads by unconditionally performing a binary ``op``, you can prevent the wastage of half of the threads. + +Even though global memory is read in a coalesced fashion, as preferred by the memory controller, optimal performance is still limited by the instruction throughput. + +Omit superfluous synchronization +-------------------------------- + +Warps are known to execute in a strict lockstep fashion. Therefore, once shared reduction reaches a point where only a single warp participates meaningfully, you can cut short the loop and let the rest of the warps terminate. Moreover, you can also unroll the loop without syncing the entire block. + +The ``tmp`` namespace used beyond this point in this document holds a handful of template meta-programmed utilities to facilitate writing flexible and optimal code.
+ +:code:`tmp::static_for` is not just a constant folding within the optimizer but a variation of the language :code:`for` loop, where the running index is a compile-time constant and is eligible for use in compile-time evaluated contexts. + +Consider the following code: + +.. code-block:: C++ + + constexpr int size = 4; + for (int i = 0 ; i < size ; ++i) + { + printf("%d", i); + } + +This compiles to the following binaries: + +**LLVM Block** + +.. code-block:: + + main: + push rbx + lea rbx, [rip + .L.str] + mov rdi, rbx + xor esi, esi + xor eax, eax + call printf@PLT + mov rdi, rbx + mov esi, 1 + xor eax, eax + call printf@PLT + mov rdi, rbx + mov esi, 2 + xor eax, eax + call printf@PLT + mov rdi, rbx + mov esi, 3 + xor eax, eax + call printf@PLT + xor eax, eax + pop rbx + ret + .L.str: + .asciz "%d" + + +**GCC** + +.. code-block:: asm + + .LC0: + .string "%d" + main: + push rbx + xor ebx, ebx + .L2: + mov esi, ebx + mov edi, OFFSET FLAT:.LC0 + xor eax, eax + add ebx, 1 + call printf + cmp ebx, 4 + jne .L2 + xor eax, eax + pop rbx + ret + + +**MSVC** + +.. code-block:: + + main PROC + $LN12: + push rbx + sub rsp, 32 + xor ebx, ebx + npad 8 + $LL4@main: + mov edx, ebx + lea rcx, OFFSET FLAT:'string' + call printf + inc ebx + cmp ebx, 4 + jl SHORT $LL4@main + xor eax, eax + add rsp, 32 + pop rbx + ret 0 + main ENDP + + +LLVM unrolls the loop and compiles to a flat series of ``printf`` invocations, while both GCC and MSVC keep the loop intact, as visible from the compare (``cmp``) and the jump (``jne``, ``jl``) instructions. LLVM code generation is identical to manually writing the unrolled loop: + +.. code-block:: C++ + + printf("%d", 0); + printf("%d", 1); + printf("%d", 2); + printf("%d", 3); + +While various non-standard pragmas are available to hint or force the compiler to unroll the loop, we instead use template meta-programming to force feed the compiler the unrolled loop. + +.. 
code-block:: C++ + + constexpr int size = 4; + + // Maybe unrolled loop + for (int i = 0 ; i < size ; ++i) + { + printf("%d", i); + } + + // Force unrolled loop + using namespace tmp; + static_for<0, less_than, increment<1>>([]() + { + printf("%d", i); + }); + +The most notable structural difference is that in the language ``for`` loop, the loop variable is given a name in the beginning, while in the ``static_for`` utility, the loop variable is given a name in the end. An important bonus is that in the loop's body, you can use the running index ``i`` in contexts requiring constant expressions such as template arguments or inside ``if constexpr``. + +:code:`tmp::static_switch` takes a runtime value and dispatches at run time to one of a set of tabulated functions, inside which that value is a compile-time constant and is eligible for use in compile-time evaluated contexts. + +Consider the following code: + +.. code-block:: C++ + + int warp_size = device_props.warpSize; + switch (warp_size) + { + case 32: + hipLaunchKernelGGL(kernel<32>, ...); + break; + case 64: + hipLaunchKernelGGL(kernel<64>, ...); + break; + } + +In the preceding code, note the code repetition for every value of ``warp_size`` that the code is prepared to handle. To avoid this, use ``tmp::static_switch``, as shown: + +.. code-block:: C++ + + tmp::static_switch(warp_size, [&]() + { + hipLaunchKernelGGL(kernel, ...); + }); + +.. code-block:: diff + :emphasize-lines: 1,2,9,10,16-24 + + -template + +template + __global__ void kernel( + ... + ) + { + ...
+ // Shared reduction + -for (uint32_t i = blockDim.x / 2; i != 0; i /= 2) + +for (uint32_t i = blockDim.x / 2; i > WarpSize; i /= 2) + { + if (tid < i) + shared[tid] = op(shared[tid], shared[tid + i]); + __syncthreads(); + } + +// Warp reduction + +tmp::static_for, tmp::divide<2>>([&]() + +{ + + if (tid < I) + + shared[tid] = op(shared[tid], shared[tid + I]); + +#ifdef __HIP_PLATFORM_NVIDIA__ + + __syncwarp(0xffffffff >> (WarpSize - I)); + +#endif + +}); + +Because HIP typically targets hardware with warp sizes of 32 (NVIDIA GPUs and RDNA AMD GPUs) and 64 (CDNA AMD GPUs), portable HIP code must handle both. That is why instead of assuming a warp size of 32, make the warp size a template argument of the kernel. This allows you to unroll the final loop using ``tmp::static_for`` in a parametric way but still having the code read much like an ordinary loop. + +Promoting the warp size to being a compile-time constant also requires you to handle it similarly on the host-side. You can sandwich the kernel launch with ``tmp::static_switch``, promoting the snake-case run-time ``warp_size`` variable to a camel-case compile-time constant ``WarpSize``. + +.. code-block:: diff + :emphasize-lines: 4,5,7,8,18 + + // Device-side reduction + for (uint32_t curr = input_count; curr > 1;) + { + + tmp::static_range_switch(warp_size, [&]() noexcept + + { + hipLaunchKernelGGL( + - kernel, + + kernel, + dim3(new_size(curr)), + dim3(block_size), + factor * sizeof(unsigned), + hipStreamDefault, + front, + back, + kernel_op, + zero_elem, + curr); + + }); + ... + } + +.. note:: + + Neither RDNA- nor CDNA-based AMD hardware provides guaranteed independent progress to lanes of the same warp. When targeting NVIDIA hardware, lanes of a warp might execute somewhat independently as long as the programmer assists the compiler using dedicated built-in functions. This feature is called Independent Thread Scheduling. The HIP headers don't expose the necessary warp primitives and their overloads. 
+ + Portable applications can still tap into this feature with carefully ``#ifdef`` -ed code, but at this particular optimization level, it's a requirement. The code implicitly relies on the lockstep behavior of an ROCm wavefront, but CUDA warps don't share this property. You must synchronize all the active lanes of a warp to avoid a data race with some lanes progressing faster than others in the same warp. + +Unroll all loops +---------------- + +While the previous step primarily aims to remove unnecessary syncing, it also unrolls the end of the loop. However, you could also force unrolling the first part of the loop. This saves a few scalar registers (values the compiler can prove to be uniform across warps). + +.. code-block:: diff + :emphasize-lines: 1-4,11,12,17,18,20-23,26 + + -template + -__global__ void kernel( + +template + +__global__ __launch_bounds__(BlockSize) void kernel( + T* front, + T* back, + F op, + T zero_elem, + uint32_t front_size) + { + - extern __shared__ T shared[]; + + __shared__ T shared[BlockSize]; + + ... + + // Shared reduction + - for (uint32_t i = blockDim.x / 2; i > WarpSize; i /= 2) + + tmp::static_for, tmp::divide<2>>([&]() + { + - if (tid < i) + - shared[tid] = op(shared[tid], shared[tid + i]); + + if (tid < I) + + shared[tid] = op(shared[tid], shared[tid + I]); + __syncthreads(); + } + + ); + +Introducing yet another template argument for the kernel and moving from ``for`` to ``tmp::static_for`` leads to the following two notable improvements: + +- Introducing new attribute ``__launch_bounds__(BlockSize)`` to the kernel instructs the compiler that the kernel will only be launched using the designated block size. This implies that the launches of differing block sizes will fail. This allows the optimizer to enroll the ``blockDim.x`` variable in constant folding as well as get information about register usage. +- Turning the block size into a compile-time constant allows you to statically allocate the shared memory. 
+ +Communicate using warp-collective functions +------------------------------------------- + +Shared memory provides a fast communication path within a block; however, when performing reduction within the last warp, you can use a faster means of communication: warp-collective or cross-lane functions. Instead of using the hardware-backed shared memory, you can directly copy between the local memory (registers) of each lane in a warp. This can be achieved using the shuffle functions. + +See how to use ``__shfl_down()``, which is one of the most restrictive but also the most structured communication schemes. + +.. code-block:: C++ + + // Warp reduction + if (tid < WarpSize) + { + T res = op(shared[tid], shared[tid + WarpSize]); + tmp::static_for, tmp::divide<2>>([&]() + { + res = op(res, __shfl_down(res, Delta)); + }); + + // Write result from shared to back buffer + if (tid == 0) + back[bid] = res; + } + +Using warp-collective functions for communication requires the control flow to be uniform across warps, as the name warp-collective implies. Therefore, you can see that the thread ID is being checked outside the loop, but the result is written inside the ``if`` block because ``res`` is scoped to that block. + +Prefer warp communication over shared +------------------------------------- + +As mentioned in the previous step, communication between local memory is faster than shared memory. Instead of relying on the local memory only at the end of the tree-like reduction, a better approach is to turn the tree reduction inside out and perform multiple warp reductions in parallel on all active threads, thus communicating only their partial results through the shared memory. + +.. figure:: ../data/tutorial/reduction/warp_reduction.svg + :alt: Diagram demonstrating warp reduction + +..
figure:: ../data/tutorial/reduction/warp_reduction_with_shared.svg + :alt: Diagram demonstrating warp reduction with results stored in shared memory + +The kernel versions differ too significantly to be described using a diff, so the kernel is presented afresh instead. + +.. code-block:: C++ + + template + __global__ __launch_bounds__(BlockSize) void kernel( + T* front, + T* back, + F op, + T zero_elem, + uint32_t front_size) + { + // ... + } + +The kernel signature and the reduction factor are the same as in previous cases; only the implementation differs. + +.. code-block:: C++ + + static constexpr uint32_t WarpCount = BlockSize / WarpSize; + + __shared__ T shared[WarpCount]; + + auto read_global_safe = + [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; }; + auto read_shared_safe = + [&](const uint32_t i) { return i < WarpCount ? shared[i] : zero_elem; }; + + const uint32_t tid = threadIdx.x, + bid = blockIdx.x, + gid = bid * (blockDim.x * 2) + tid, + wid = tid / WarpSize, + lid = tid % WarpSize; + + // Read input from front buffer to local + T res = op(read_global_safe(gid), read_global_safe(gid + blockDim.x)); + +Because the warps communicate their results through shared memory, the shared memory needs as many elements as there are warps within the block. Similar to how you can only launch kernels at block granularity, you can only warp reduce with ``WarpSize`` granularity due to the collective nature of the cross-lane builtins. To address this, you can use ``read_shared_safe`` to pad overindexing by reading ``zero_elem``. Reading from global remains unaffected. + +.. code-block:: C++ + + // Perform warp reductions and communicate results via shared + // for (uint32_t ActiveWarps = WarpCount; + // ActiveWarps != 0; + // ActiveWarps = ActiveWarps != 1 ?
+ // divide_ceil(ActiveWarps, WarpSize) : + // ActiveWarps = 0) + tmp::static_for< + WarpCount, + tmp::not_equal<0>, + tmp::select< + tmp::not_equal<1>, + tmp::divide_ceil, + tmp::constant<0>>>([&]() + { + if(wid < ActiveWarps) + { + // Warp reduction + tmp::static_for, tmp::divide<2>>([&]() + { + res = op(res, __shfl_down(res, Delta)); + }); + + // Write warp result from local to shared + if(lid == 0) + shared[wid] = res; + } + __syncthreads(); + + // Read warp result from shared to local + res = read_shared_safe(tid); + }); + + // Write result from local to back buffer + if(tid == 0) + back[bid] = res; + +``ActiveWarps`` iterates from ``WarpCount`` until it reaches ``0``. Every iteration divides ``ActiveWarps`` by ``WarpSize``. In cases where the partial result count isn't divisible by ``WarpSize`` and you need to launch an extra warp, use ``tmp::divide_ceil``, which always rounds toward positive infinity. The ternary ``tmp::select`` is required because such division never reaches ``0``, so you must terminate the loop after the last warp concludes. + +In each iteration, if the warp is active, which means it has at least a single valid input, it carries out a pass of warp reduction and writes output based on warp ID. Reading is carried out based on thread ID. Global output continues to be based on block ID. + +Amortize bookkeeping variable overhead +-------------------------------------- + +The previous sections explained how to reduce register usage to improve occupancy. This allows more blocks to execute in parallel on all multiprocessors, leading to more global store/load latency to be hidden. Reducing the number of kernels in flight while still carrying out the same workload reduces the wastage of registers while loading and maintaining bookkeeping variables such as kernel indices. + +An example of this optimization is performing one binary ``op`` while loading input from global.
Even though the operation is said to be carried out "in flight", the two values are loaded into local memory (registers) before ``op`` is called. + +A more general form of this optimization is wrapping most kernel logic in loops that carry out the workload of multiple kernel instances but require storing only a single instance of most of the bookkeeping logic. In code, this multiplicity factor is referred to via the ``ItemsPerThread`` compile-time constant, which is supplied by a template argument to allow for loop unrolling. + +This kernel variant utilizes another generally applicable utility known as ``hip::static_array``, which is a more restrictive wrapper over the builtin array than ``std::array``, as it allows indexing only with compile-time constants using the usual tuple-like ``template auto get(...)`` interface. + +.. note:: + + On a GPU, there is no stack, and the local memory is provisioned from the register file. This provisioning takes place statically. To paraphrase, the address range of a thread's local memory is determined at compile-time. When an array is defined and used in the local storage, the compiler can only maintain its storage in the register file as long as all accesses to the array are computable by the compiler at compile-time. An index doesn't need to be a compile-time constant as long as the compiler can resolve the addresses of the accesses through constant folding or some other means. If the compiler fails to do so, the array will be backed by global memory, which is indicated by allocating a non-zero number of spill registers observable using static analysis tools. However, this is slower by multiple orders of magnitude. ``hip::static_array`` via its ``hip::get<>`` interface ensures that no such spills occur. + +.. code-block:: C++ + + template + __global__ static __launch_bounds__(BlockSize) void kernel(...) + +The kernel now has three compile-time configurable parameters.
The only part of the kernel that changes is how you load data from global and perform the binary operation on those loaded values. So, the following step to read input from front buffer to local is now split into two steps: :ref:`reading-items` and :ref:`processing-items`. + +.. code-block:: C++ + + // Read input from front buffer to local + T res = op(read_global_safe(gid), read_global_safe(gid + blockDim.x)); + +.. _reading-items: + +Reading ``ItemsPerThread`` +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The change to reading happens inside ``read_global_safe``: + +.. code-block:: C++ + + auto read_global_safe = [&](const int32_t i) -> hip::static_array + { + return [&](std::integer_sequence) + { + if(i + ItemsPerThread < front_size) + return hip::static_array{ + front[i + I]... + }; + else + return hip::static_array{ + (i + I < front_size ? front[i + I] : zero_elem)... + }; + }(std::make_integer_sequence()); + }; + +Note that the array elements are loaded from consecutive locations. Setting aside the flexibility of a configurable ``ItemsPerThread`` (here fixed to 4), this is morally equivalent to: + +.. code-block:: C++ + + T arr[4] = { + front[gid + 0], + front[gid + 1], + front[gid + 2], + front[gid + 3] + } + +This is exactly what's happening in the ``front[i + I]...`` pack expansion. However, this can only be issued if the entire read operates on real input without padding using ``zero_elem``. If some reads over-index the input, the read turns into: + +.. code-block:: C++ + + T arr[4] = { + i + 0 < front_size ? front[i + 0] : zero_elem, + i + 1 < front_size ? front[i + 1] : zero_elem, + i + 2 < front_size ? front[i + 2] : zero_elem, + i + 3 < front_size ? front[i + 3] : zero_elem + } + +This makes it easier for the compiler to recognize vector loads from global. As the performance at large is dominated by how you move the data, it's only natural to utilize dedicated instructions to move more data with less binary.
This is evident from the huge performance improvement when loading two values per thread. For more information, see `the compiler explorer `_ to learn how loading for AMD (both RDNA and CDNA) compiles to ``global_load_dwordx4``, where ``x4`` denotes the 4-vector variant of the instruction. + +.. note:: + + Note that ``read_global_safe``, which used to take an ``uint32_t`` as the index type, now takes a signed integer. When indexing an array with unsigned integers, the compiler has to handle integer overflows, as the C/C++ standards define them. It might happen that some part of the vector load indices overflow, thus resulting in a non-contiguous read. If you change the previously linked code to use an unsigned integer as the thread ID, the compiler won't emit a vector load. Signed integer overflow is undefined behavior, so the optimizer is free to assume that it never happens. To convey the absence of overflow to the compiler with unsigned indices, add ``__builtin_assume(gid + 4 > gid)``, or the more portable ``[[assume(gid + 4 > gid)]]``, once ``amdclang++`` supports it. + +The ``read_global_safe`` implementation uses an Immediately Invoked Lambda Expression (IILE), because ``ItemsPerThread`` is an integer value, while you need a compile-time ``iota``-like sequence of integers as a pack for the pack expansions to expand on. This can only occur as part of template argument deduction on the IILE. + +.. _processing-items: + +Processing ``ItemsPerThread`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Once the kernel reads ``ItemsPerThread`` number of inputs to local, it immediately reduces them to a scalar. There is no reason to propagate the input element multiplicity to the warp reduction phase. + +..
code-block:: C++ + + T res = [&]() + { + // Read input from front buffer to local + hip::static_array arr = read_global_safe(gid); + + // Reduce ItemsPerThread to scalar + tmp::static_for<1, tmp::less_than, tmp::increment<1>>([&]() + { + get<0>(arr) = op(get<0>(arr), get(arr)); + }); + + return get<0>(arr); + }(); + +Two-pass reduction +------------------ + +Alter kernel launch and input fetching such that no more blocks are launched than what a subsequent kernel launch's single block can conveniently reduce, while performing multiple passes of input reading from global and combining their results before engaging in the end game tree-like reduction. + +With this method, you can save at least one to two kernel launches for large inputs. + +Global data share +----------------- + +.. warning:: + + This modification can only be executed on AMD hardware. + +Perform the first step of the two-pass reduction, but in the end, instead of writing to global and reading it back in a subsequent kernel, write the partial results to the Global Data Share (GDS). This is an ``N+1`` th shared memory that is accessed by all multiprocessors and is also on-chip memory. + +.. note:: + + The API doesn't guarantee the order in which blocks are scheduled even though all GPUs schedule them in the same monotonically increasing order of block ids. Relying on this implicitly, the last block of a grid is in the optimal position to observe the side effects of all other blocks (using spinlocks or other methods) without occupying a multiprocessor for longer than necessary. + +Without launching a second kernel, you can make the last block collect the results of all other blocks from GDS by implicitly exploiting the scheduling behavior or relying on another AMD-specific feature called Global Wave Sync (GWS) to merge them for a final tree-like reduction. + +.. note:: + + GDS and GWS are reserved runtime features that the HIP API doesn’t cover. 
Invoking these functionalities requires inline AMDGCN assembly. Moreover, the fact that the runtime doesn’t virtualize the GDS, imposes further restrictions on concurrent scheduling of other kernels. + +Conclusion +========== + +Optimizing code on GPUs, like on any other architecture, requires careful consideration and balancing of resources and costs of various operations to obtain optimal performance. This document explored optimizing reductions much beyond the territory of diminishing returns. This approach introduced multiple optimization techniques and discussed opportunities. + +The document focused on reductions when an entire device participates in it. Still, the choice of optimal compile-time constants or even the algorithm itself might not be optimal when its multiple blocks participate in multiple parallel reductions or when each thread performs its reduction. However, when multiple devices participate in the same reduction, other aspects must be considered. + +Most solutions, including the ones covered in this document, are given to the end users in a turnkey fashion via algorithm primitive libraries. These solutions might not be the fastest in all cases, but they are close to being the gold standard for carrying out certain operations as reasonably as possible. diff --git a/projects/hip/docs/tutorial/saxpy.rst b/projects/hip/docs/tutorial/saxpy.rst new file mode 100644 index 0000000000..91ecc10be7 --- /dev/null +++ b/projects/hip/docs/tutorial/saxpy.rst @@ -0,0 +1,751 @@ +.. meta:: + :description: The SAXPY tutorial on HIP + :keywords: AMD, ROCm, HIP, SAXPY, tutorial + +******************************************************************************* +SAXPY - Hello, HIP +******************************************************************************* + +This tutorial explains the basic concepts of the single-source +Heterogeneous-computing Interface for Portability (HIP) programming model and +the essential tooling around it. 
It also reviews some commonalities of +heterogeneous APIs in general. This topic assumes basic familiarity with the +C/C++ compilation model and language. + +Prerequisites +============= + +To follow this tutorial, you'll need installed drivers and a HIP compiler +toolchain to compile your code. Because HIP for ROCm supports compiling and +running on Linux and Windows with AMD and NVIDIA GPUs, the combination of +install instructions is more than is worth covering as part of this tutorial. For +more information about installing HIP development packages, see +:doc:`/install/install`. + +.. _hip-tutorial-saxpy-heterogeneous-programming: + +Heterogeneous programming +========================= + +*Heterogeneous programming* and *offloading APIs* are often mentioned together. Heterogeneous programming deals with devices of varying capabilities simultaneously. Offloading focuses on the "remote" and asynchronous aspects of computation. HIP encompasses both. It exposes GPGPU (general-purpose GPU) programming much like ordinary host-side CPU programming and lets you move data across various devices. + +When programming in HIP (and other heterogeneous APIs for that matter), remember that target devices are built for a specific purpose. They are designed with different tradeoffs than traditional CPUs and therefore have very different performance characteristics. Even subtle changes in code might adversely affect execution time. + +Your first lines of HIP code +============================ + +First, let's do the "Hello, World!" of GPGPU: SAXPY. Single-precision A times X Plus Y (*SAXPY*) is a mathematical acronym; a vector equation :math:`a\cdot x+y=z` where :math:`a\in\mathbb{R}` is a scalar and :math:`x,y,z\in\mathbb{V}` are vector quantities of some large dimensionality. This vector space is defined over the set of reals. Practically speaking, you can compute this using a single ``for`` loop over three arrays. + +..
code-block:: C++ + + for (int i = 0 ; i < N ; ++i) + z[i] = a * x[i] + y[i]; + +In linear algebra libraries, such as BLAS (Basic Linear Algebra Subprograms), this operation is defined as AXPY, "A times X Plus Y". The "S" comes from *single-precision*, meaning that every array element is a ``float`` (IEEE 754 binary32 representation). + +To quickly get started, use the set of `HIP samples from GitHub `_. With Git configured on your machine, open a command line and navigate to your desired working directory, then run: + +.. code-block:: shell + + git clone https://github.com/amd/rocm-examples.git + +A simple implementation of SAXPY resides in the ``HIP-Basic/saxpy/main.hip`` file in this repository. The HIP code here mostly deals with where data has to be and when, and how devices transform this data. The first HIP calls deal with allocating device-side memory and copying data from host-side memory to device side in a C runtime-like fashion. + +.. code-block:: C++ + + // Allocate and copy vectors to device memory. + float* d_x{}; + float* d_y{}; + HIP_CHECK(hipMalloc(&d_x, size_bytes)); + HIP_CHECK(hipMalloc(&d_y, size_bytes)); + HIP_CHECK(hipMemcpy(d_x, x.data(), size_bytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_y, y.data(), size_bytes, hipMemcpyHostToDevice)); + +``HIP_CHECK`` is a custom macro borrowed from the examples utilities which checks the error code returned by API functions for errors and reports them to the console. It is not essential to the API, but it is a good practice to check the error codes of the HIP APIs in case you pass incorrect values to the API or the API runs out of resources. + +The code selects the device to allocate to and to copy to. Commands are issued to the HIP runtime per thread, and every thread has a device set as the target of commands. The default device is ``0``, which is equivalent to calling ``hipSetDevice(0)``. + +Launch the calculation on the device after the input data has been prepared. + +..
code-block:: C++ + + __global__ void saxpy_kernel(const float a, const float* d_x, float* d_y, const unsigned int size) + { + // ... + } + + int main() + { + // ... + + // Launch the kernel on the default stream. + saxpy_kernel<<>>(a, d_x, d_y, size); + } + +Look at the signature of the offloaded function: + +- ``__global__`` instructs the compiler to generate code for this function as an + entrypoint to a device program, such that it can be launched from the host. +- The function does not return anything, because there is no trivial way to + construct a return channel of a parallel invocation. Device-side entrypoints + may not return a value; their results should be communicated using output + parameters. +- Device-side functions are typically called compute kernels, or just kernels + for short. This is to distinguish them from graphics + shaders, or just shaders for short. +- Arguments are taken by value and all arguments shall be + `TriviallyCopyable `_, + meaning they should be ``memcpy``-friendly. (Imagine if they had custom copy + constructors. Where would that logic execute? On the host? On the device?) + Pointer arguments are pointers to device memory, one typically backed by + VRAM. +- We said that we'll be computing :math:`a\cdot x+y=z`; however, we only pass + two pointers to the function. We'll be canonically reusing one of the inputs + as the output. + +This function is launched from the host using a language extension often called +the triple chevron syntax. Inside the angle brackets, provide the following. + +- The number of :ref:`blocks ` to launch (our :ref:`grid ` size) +- The number of threads in a :ref:`block ` (our :ref:`block ` size) +- The amount of shared memory to allocate by the host +- The device stream to enqueue the operation on + +The :ref:`block ` size and shared memory become important later in :doc:`reduction`. For +now, a hardcoded ``256`` is a safe default for simple kernels such as this.
+Following the triple chevron is ordinary function argument passing. + +Look at how the kernel is implemented. + +.. code-block:: C++ + + __global__ void saxpy_kernel(const float a, const float* d_x, float* d_y, const unsigned int size) + { + // Compute the current thread's index in the grid. + const unsigned int global_idx = blockIdx.x * blockDim.x + threadIdx.x; + + // The grid can be larger than the number of items in the vectors. Avoid out-of-bounds addressing. + if(global_idx < size) + { + d_y[global_idx] = a * d_x[global_idx] + d_y[global_idx]; + } + } + +- The unique linear index identifying the thread is computed from the :ref:`block <inherent_thread_hierarchy_block>` ID + the thread is a member of, the :ref:`block <inherent_thread_hierarchy_block>`'s size and the ID of the thread within + the :ref:`block <inherent_thread_hierarchy_block>`. +- A check is made to avoid overindexing the input. +- The useful part of the computation is carried out. + +Retrieval of the result from the device is done much like the input data copy. In this step, the results are copied from the device back to the host, the opposite direction of the input data copy: + +.. code-block:: C++ + + HIP_CHECK(hipMemcpy(y.data(), d_y, size_bytes, hipMemcpyDeviceToHost)); + +Compiling on the command line +============================= + +.. _setting_up_the_command-line: + +Setting up the command line +--------------------------- + +Strictly speaking there's no such thing as "setting up the command-line +for compilation" on Linux. To make invocations more terse, Linux and Windows +examples follow. + +.. tab-set:: + .. tab-item:: Linux and AMD + :sync: linux-amd + + While distro maintainers might package ROCm so that it installs to + system-default locations, AMD's packages aren't installed that way. They need + to be added to the PATH by the user. + + .. code-block:: bash + + export PATH=/opt/rocm/bin:${PATH} + + You should be able to call the compiler on the command line now: + + .. code-block:: bash + + amdclang++ --version + + .. 
note:: + + Docker images distributed by AMD, such as + `rocm-terminal <https://hub.docker.com/r/rocm/rocm-terminal>`_ already + have ``/opt/rocm/bin`` on the PATH for convenience. This subtly affects + CMake package detection logic of ROCm libraries. + + .. tab-item:: Linux and NVIDIA + :sync: linux-nvidia + + Both distro maintainers and NVIDIA package CUDA so that ``nvcc`` and related + tools are available on the command line by default. You can call the + compiler on the command line with: + + .. code-block:: bash + + nvcc --version + + .. tab-item:: Windows and AMD + :sync: windows-amd + + Windows compilers and command line tooling have traditionally relied on + extra environment variables and PATH entries to function correctly. + Visual Studio refers to command lines with this setup as "Developer + Command Prompt" or "Developer PowerShell" for ``cmd.exe`` and PowerShell + respectively. + + The HIP SDK on Windows doesn't include a complete toolchain. You will also + need: + + - The Microsoft Windows SDK. It provides the import libs to crucial system + libraries that all executables must link to and some auxiliary compiler + tooling. + - A Standard Template Library (STL). Installed as part of the Microsoft + Visual C++ compiler (MSVC) or with Visual Studio. + + If you don't have a version of Visual Studio 2022 installed, for a + minimal command line experience, install the + `Build Tools for Visual Studio 2022 <https://aka.ms/vs/17/release/vs_BuildTools.exe>`_ + with the Desktop Development Workload. Under Individual Components select: + + - A version of the Windows SDK + - "MSVC v143 - VS 2022 C++ x64/x86 build tools (Latest)" + - "C++ CMake tools for Windows" (optional) + + .. note:: + + The "C++ CMake tools for Windows" individual component is a convenience which + puts both ``cmake.exe`` and ``ninja.exe`` onto the PATH inside developer + command prompts. You can install these manually, but then you must manage + them manually. + + Visual Studio 2017 and later are detectable as COM object instances via WMI. 
+ To set up a command line from any shell for the latest Visual Studio's + default Visual C++ toolset issue: + + .. code-block:: powershell + + $InstallationPath = Get-CimInstance MSFT_VSInstance | Sort-Object -Property Version -Descending | Select-Object -First 1 -ExpandProperty InstallLocation + Import-Module $InstallationPath\Common7\Tools\Microsoft.VisualStudio.DevShell.dll + Enter-VsDevShell -InstallPath $InstallationPath -SkipAutomaticLocation -Arch amd64 -HostArch amd64 -DevCmdArguments '-no_logo' + $env:PATH = "${env:HIP_PATH}bin;${env:PATH}" + + You should be able to call the compiler on the command line now: + + .. code-block:: powershell + + clang++ --version + + .. tab-item:: Windows and NVIDIA + :sync: windows-nvidia + + Windows compilers and command line tooling have traditionally relied on + extra environment variables and PATH entries to function correctly. + Visual Studio refers to command lines with this setup as "Developer + Command Prompt" or "Developer PowerShell" for ``cmd.exe`` and PowerShell + respectively. + + The HIP and CUDA SDKs on Windows don't include complete toolchains. You will + also need: + + - The Microsoft Windows SDK. It provides the import libs to crucial system + libraries that all executables must link to and some auxiliary compiler + tooling. + - A Standard Template Library (STL). Installed as part of the Microsoft + Visual C++ compiler (MSVC) or with Visual Studio. + + If you don't have a version of Visual Studio 2022 installed, for a + minimal command line experience, install the + `Build Tools for Visual Studio 2022 <https://aka.ms/vs/17/release/vs_BuildTools.exe>`_ + with the Desktop Development Workload. Under Individual Components select: + + - A version of the Windows SDK + - "MSVC v143 - VS 2022 C++ x64/x86 build tools (Latest)" + - "C++ CMake tools for Windows" (optional) + + .. note:: + + The "C++ CMake tools for Windows" individual component is a convenience which + puts both ``cmake.exe`` and ``ninja.exe`` onto the PATH inside developer + command prompts. 
You can install these manually, but then you must manage + them manually. + + Visual Studio 2017 and later are detectable as COM object instances via WMI. + To setup a command line from any shell for the latest Visual Studio's + default Visual C++ toolset issue: + + .. code-block:: powershell + + $InstallationPath = Get-CimInstance MSFT_VSInstance | Sort-Object -Property Version -Descending | Select-Object -First 1 -ExpandProperty InstallLocation + Import-Module $InstallationPath\Common7\Tools\Microsoft.VisualStudio.DevShell.dll + Enter-VsDevShell -InstallPath $InstallationPath -SkipAutomaticLocation -Arch amd64 -HostArch amd64 -DevCmdArguments '-no_logo' + + You should be able to call the compiler on the command line now: + + .. code-block:: powershell + + nvcc --version + +Invoking the compiler manually +------------------------------ + +To compile and link a single-file application, use the following commands: + +.. tab-set:: + .. tab-item:: Linux and AMD + :sync: linux-amd + + .. code-block:: bash + + amdclang++ ./HIP-Basic/saxpy/main.hip -o saxpy -I ./Common -lamdhip64 -L /opt/rocm/lib -O2 + + .. tab-item:: Linux and NVIDIA + :sync: linux-nvidia + + .. code-block:: bash + + nvcc ./HIP-Basic/saxpy/main.hip -o saxpy -I ./Common -I /opt/rocm/include -O2 -x cu + + .. tab-item:: Windows and AMD + :sync: windows-amd + + .. code-block:: powershell + + clang++ .\HIP-Basic\saxpy\main.hip -o saxpy.exe -I .\Common -lamdhip64 -L ${env:HIP_PATH}lib -O2 + + .. tab-item:: Windows and NVIDIA + :sync: windows-nvidia + + .. code-block:: powershell + + nvcc .\HIP-Basic\saxpy\main.hip -o saxpy.exe -I ${env:HIP_PATH}include -I .\Common -O2 -x cu + +Depending on your computer, the resulting binary might or might not run. If not, +it typically complains about "Invalid device function". 
That error +(corresponding to the ``hipErrorInvalidDeviceFunction`` entry of ``hipError_t``) +means that the runtime could not find a device program binary of the +appropriate flavor embedded into the executable. + +So far, the discussion has covered how data makes it from the host to the +device and back. It has also discussed the device code as source, with the HIP +runtime arranging for the correct binary to be dispatched for execution. How can you +find out what device binary flavors are embedded into the executable? + +.. tab-set:: + + .. tab-item:: Linux and AMD + :sync: linux-amd + + The utilities included with ROCm help significantly to inspect binary + artifacts on disk. Add the ROCmCC installation folder to your PATH if you + want to use these utilities (the utilities expect them to be on the PATH). + + You can list embedded program binaries using ``roc-obj-ls``. + + .. code-block:: bash + + roc-obj-ls ./saxpy + + It should return something like: + + .. code-block:: shell + + 1 host-x86_64-unknown-linux file://./saxpy#offset=12288&size=0 + 1 hipv4-amdgcn-amd-amdhsa--gfx803 file://./saxpy#offset=12288&size=9760 + + The compiler embeds a version 4 code object (more on `code + object versions <https://www.llvm.org/docs/AMDGPUUsage.html#code-object-metadata>`_) + and uses the LLVM target triple ``amdgcn-amd-amdhsa--gfx803`` (more on `target triples + <https://www.llvm.org/docs/AMDGPUUsage.html#target-triples>`_). You can + extract that program object in a disassembled fashion for human consumption + via ``roc-obj``. + + .. code-block:: bash + + roc-obj -t gfx803 -d ./saxpy + + This creates two files on disk; the one with the ``.s`` extension is of most interest. + Opening this file or dumping it to the console using ``cat`` + lets you find the disassembled binary of the SAXPY compute kernel, something + similar to: + + .. 
code-block:: + + Disassembly of section .text: + + <_Z12saxpy_kernelfPKfPfj>: + s_load_dword s0, s[4:5], 0x2c // 000000001000: C0020002 0000002C + s_load_dword s1, s[4:5], 0x18 // 000000001008: C0020042 00000018 + s_waitcnt lgkmcnt(0) // 000000001010: BF8C007F + s_and_b32 s0, s0, 0xffff // 000000001014: 8600FF00 0000FFFF + s_mul_i32 s6, s6, s0 // 00000000101C: 92060006 + v_add_u32_e32 v0, vcc, s6, v0 // 000000001020: 32000006 + v_cmp_gt_u32_e32 vcc, s1, v0 // 000000001024: 7D980001 + s_and_saveexec_b64 s[0:1], vcc // 000000001028: BE80206A + s_cbranch_execz 22 // 00000000102C: BF880016 <_Z12saxpy_kernelfPKfPfj+0x88> + s_load_dwordx4 s[0:3], s[4:5], 0x8 // 000000001030: C00A0002 00000008 + v_mov_b32_e32 v1, 0 // 000000001038: 7E020280 + v_lshlrev_b64 v[0:1], 2, v[0:1] // 00000000103C: D28F0000 00020082 + s_waitcnt lgkmcnt(0) // 000000001044: BF8C007F + v_mov_b32_e32 v3, s1 // 000000001048: 7E060201 + v_add_u32_e32 v2, vcc, s0, v0 // 00000000104C: 32040000 + v_addc_u32_e32 v3, vcc, v3, v1, vcc // 000000001050: 38060303 + flat_load_dword v2, v[2:3] // 000000001054: DC500000 02000002 + v_mov_b32_e32 v3, s3 // 00000000105C: 7E060203 + v_add_u32_e32 v0, vcc, s2, v0 // 000000001060: 32000002 + v_addc_u32_e32 v1, vcc, v3, v1, vcc // 000000001064: 38020303 + flat_load_dword v3, v[0:1] // 000000001068: DC500000 03000000 + s_load_dword s0, s[4:5], 0x0 // 000000001070: C0020002 00000000 + s_waitcnt vmcnt(0) lgkmcnt(0) // 000000001078: BF8C0070 + v_mac_f32_e32 v3, s0, v2 // 00000000107C: 2C060400 + flat_store_dword v[0:1], v3 // 000000001080: DC700000 00000300 + s_endpgm // 000000001088: BF810000 + + Alternatively, call the compiler with ``--save-temps`` to dump all device + binary to disk in separate files. + + .. code-block:: bash + + amdclang++ ./HIP-Basic/saxpy/main.hip -o saxpy -I ./Common -lamdhip64 -L /opt/rocm/lib -O2 --save-temps + + List all the temporaries created while compiling ``main.hip`` with: + + .. 
code-block:: bash + + ls main-hip-amdgcn-amd-amdhsa-* + main-hip-amdgcn-amd-amdhsa-gfx803.bc + main-hip-amdgcn-amd-amdhsa-gfx803.cui + main-hip-amdgcn-amd-amdhsa-gfx803.o + main-hip-amdgcn-amd-amdhsa-gfx803.out + main-hip-amdgcn-amd-amdhsa-gfx803.out.resolution.txt + main-hip-amdgcn-amd-amdhsa-gfx803.s + + Files with the ``.s`` extension hold the disassembled contents of the binary. + The filename notes the graphics IPs used by the compiler. The contents of + this file are similar to what ``roc-obj`` printed to the console. + + .. tab-item:: Linux and NVIDIA + :sync: linux-nvidia + + Unlike HIP on AMD, when compiling using the NVIDIA support of HIP the resulting + binary will be a valid CUDA executable as far as the binary goes. Therefore, + it'll incorporate PTX ISA (Parallel Thread eXecution Instruction Set + Architecture) instead of AMDGPU binary. As a result, tooling shipping with the + CUDA SDK can be used to inspect which device ISA got compiled into a specific + executable. The tool most useful to us currently is ``cuobjdump``. + + .. code-block:: bash + + cuobjdump --list-ptx ./saxpy + + Which will print something like: + + .. code-block:: + + PTX file 1: saxpy.1.sm_52.ptx + + From this we can see that the saxpy kernel is stored as ``sm_52``, which shows + that a compute capability 5.2 ISA got embedded into the executable, so devices + which sport compute capability 5.2 or newer will be able to run this code. + + .. tab-item:: Windows and AMD + :sync: windows-amd + + The HIP SDK for Windows doesn't yet sport the ``roc-*`` set of utilities to work + with binary artifacts. To find out what binary formats are embedded into an + executable, one may use the ``dumpbin`` tool from the Windows SDK to obtain the + raw data of the ``.hip_fat`` section of an executable. (This binary payload is + what gets parsed by the ``roc-*`` set of utilities on Linux.) Skipping over the + reported header, the rendered raw data as ASCII has ~3 lines per entry. 
+ Depending on how many binaries are embedded, you may need to alter the number + of rendered lines. Use an invocation such as: + + .. code-block:: powershell + + dumpbin.exe /nologo /section:.hip_fat /rawdata:8 .\saxpy.exe | select -Skip 20 -First 12 + + The output may look like: + + .. code-block:: + + 000000014004C000: 5F474E414C435F5F 5F44414F4C46464F __CLANG_OFFLOAD_ + 000000014004C010: 5F5F454C444E5542 0000000000000002 BUNDLE__........ + 000000014004C020: 0000000000001000 0000000000000000 ................ + 000000014004C030: 0000000000000019 3638782D74736F68 ........host-x86 + 000000014004C040: 6E6B6E752D34365F 756E696C2D6E776F _64-unknown-linu + 000000014004C050: 0000000000100078 00000000000D9800 x............... + 000000014004C060: 0000000000001F00 612D347670696800 .........hipv4-a + 000000014004C070: 6D612D6E6367646D 617368646D612D64 mdgcn-amd-amdhsa + 000000014004C080: 3630397866672D2D 0000000000000000 --gfx906........ + 000000014004C090: 0000000000000000 0000000000000000 ................ + 000000014004C0A0: 0000000000000000 0000000000000000 ................ + 000000014004C0B0: 0000000000000000 0000000000000000 ................ + + We can see that the compiler embedded a version 4 code object (more on code + `object versions <https://www.llvm.org/docs/AMDGPUUsage.html#code-object-metadata>`_) and + used the LLVM target triple ``amdgcn-amd-amdhsa--gfx906`` (more on `target triples + <https://www.llvm.org/docs/AMDGPUUsage.html#target-triples>`_). Don't be + alarmed about Linux showing up as a binary format; AMDGPU binaries uploaded to + the GPU for execution are proper Linux ELF binaries in their format. + + Alternatively, we can call the compiler with ``--save-temps`` to dump all device + binaries to disk in separate files. + + .. code-block:: powershell + + clang++ .\HIP-Basic\saxpy\main.hip -o saxpy.exe -I .\Common -lamdhip64 -L ${env:HIP_PATH}lib -O2 --save-temps + + Now we can list all the temporaries created while compiling ``main.hip`` via + + .. 
code-block:: powershell + + Get-ChildItem -Filter main-hip-* | select -Property Name + + Name + ---- + main-hip-amdgcn-amd-amdhsa-gfx906.bc + main-hip-amdgcn-amd-amdhsa-gfx906.hipi + main-hip-amdgcn-amd-amdhsa-gfx906.o + main-hip-amdgcn-amd-amdhsa-gfx906.out + main-hip-amdgcn-amd-amdhsa-gfx906.out.resolution.txt + main-hip-amdgcn-amd-amdhsa-gfx906.s + + Files with the ``.s`` extension hold the disassembled contents of the binary and + the filename directly informs us of the graphics IPs used by the compiler. + + .. code-block:: powershell + + Get-ChildItem main-hip-*.s | Get-Content + .text + .amdgcn_target "amdgcn-amd-amdhsa--gfx906" + .protected _Z12saxpy_kernelfPKfPfj ; -- Begin function _Z12saxpy_kernelfPKfPfj + .globl _Z12saxpy_kernelfPKfPfj + .p2align 8 + .type _Z12saxpy_kernelfPKfPfj,@function + _Z12saxpy_kernelfPKfPfj: ; @_Z12saxpy_kernelfPKfPfj + ; %bb.0: + s_load_dword s0, s[4:5], 0x4 + s_load_dword s1, s[6:7], 0x18 + s_waitcnt lgkmcnt(0) + s_and_b32 s0, s0, 0xffff + s_mul_i32 s8, s8, s0 + v_add_u32_e32 v0, s8, v0 + v_cmp_gt_u32_e32 vcc, s1, v0 + s_and_saveexec_b64 s[0:1], vcc + s_cbranch_execz .LBB0_2 + ; %bb.1: + s_load_dwordx4 s[0:3], s[6:7], 0x8 + v_mov_b32_e32 v1, 0 + v_lshlrev_b64 v[0:1], 2, v[0:1] + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v3, s1 + v_add_co_u32_e32 v2, vcc, s0, v0 + v_addc_co_u32_e32 v3, vcc, v3, v1, vcc + global_load_dword v2, v[2:3], off + v_mov_b32_e32 v3, s3 + v_add_co_u32_e32 v0, vcc, s2, v0 + v_addc_co_u32_e32 v1, vcc, v3, v1, vcc + global_load_dword v3, v[0:1], off + s_load_dword s0, s[6:7], 0x0 + s_waitcnt vmcnt(0) lgkmcnt(0) + v_fmac_f32_e32 v3, s0, v2 + global_store_dword v[0:1], v3, off + .LBB0_2: + s_endpgm + ... + + .. tab-item:: Windows and NVIDIA + :sync: windows-nvidia + + Unlike HIP on AMD, when compiling using the NVIDIA support for HIP, the resulting + binary will be a valid CUDA executable. Therefore, it'll incorporate PTX ISA + (Parallel Thread eXecution Instruction Set Architecture) instead of AMDGPU + binary. 
As a result, tooling included with the CUDA SDK can be used to + inspect which device ISA was compiled into a specific executable. The tool most + helpful to us currently is ``cuobjdump``. + + .. code-block:: bash + + cuobjdump.exe --list-ptx .\saxpy.exe + + Which prints something like: + + .. code-block:: + + PTX file 1: saxpy.1.sm_52.ptx + + This example shows that the SAXPY kernel is stored as ``sm_52``. It also shows + that a compute capability 5.2 ISA was embedded into the executable, so devices + that support compute capability 5.2 or newer will be able to run this code. + +Now that you've found what binary got embedded into the executable, find which +format your available devices use. + +.. tab-set:: + .. tab-item:: Linux and AMD + :sync: linux-amd + + On Linux a utility called ``rocminfo`` helps us list all the properties of the + devices available on the system, including which version of graphics IP + (``gfxXYZ``) they employ. You can filter the output to have only these lines: + + .. code-block:: bash + + /opt/rocm/bin/rocminfo | grep gfx + Name: gfx906 + Name: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack- + + Now that you know which graphics IPs your devices use, recompile your program with + the appropriate parameters. + + .. code-block:: bash + + amdclang++ ./HIP-Basic/saxpy/main.hip -o saxpy -I ./Common -lamdhip64 -L /opt/rocm/lib -O2 --offload-arch=gfx906:sramecc+:xnack- + + Now the sample will run. + + .. code-block:: + + ./saxpy + Calculating y[i] = a * x[i] + y[i] over 1000000 elements. + First 10 elements of the results: [ 3, 5, 7, 9, 11, 13, 15, 17, 19, 21 ] + + .. tab-item:: Linux and NVIDIA + :sync: linux-nvidia + + On Linux HIP with the NVIDIA back-end, the ``deviceQuery`` CUDA SDK sample + can help us list all the properties of the devices available on the system, + including which version of compute capability a device sports. + ``<major>.<minor>`` compute capability is passed to ``nvcc`` on the + command-line as ``sm_<major><minor>``, for example ``8.6`` is ``sm_86``. 
+ + Because it's not included as a binary, compile the matching + example from ROCm. + + .. code-block:: bash + + nvcc ./HIP-Basic/device_query/main.cpp -o device_query -I ./Common -I /opt/rocm/include -O2 + + Filter the output to have only the lines of interest, for example: + + .. code-block:: bash + + ./device_query | grep "major.minor" + major.minor: 8.6 + major.minor: 7.0 + + .. note:: + + In addition to the ``nvcc`` executable is another tool called ``__nvcc_device_query`` + which prints the SM Architecture numbers to standard out as a comma + separated list of numbers. The utility's name suggests it's not a user-facing + executable but is used by ``nvcc`` to determine what devices are in the + system at hand. + + Now that you know which graphics IPs our devices use, recompile your program with + the appropriate parameters. + + .. code-block:: bash + + nvcc ./HIP-Basic/saxpy/main.hip -o saxpy -I ./Common -I /opt/rocm/include -O2 -x cu -arch=sm_70,sm_86 + + .. note:: + + If you want to portably target the development machine which is compiling, you + may specify ``-arch=native`` instead. + + Now the sample will run. + + .. code-block:: + + ./saxpy + Calculating y[i] = a * x[i] + y[i] over 1000000 elements. + First 10 elements of the results: [ 3, 5, 7, 9, 11, 13, 15, 17, 19, 21 ] + + .. tab-item:: Windows and AMD + :sync: windows-amd + + On Windows, a utility called ``hipInfo.exe`` helps us list all the properties + of the devices available on the system, including which version of graphics IP + (``gfxXYZ``) they employ. Filter the output to have only these lines: + + .. code-block:: powershell + + & ${env:HIP_PATH}bin\hipInfo.exe | Select-String gfx + + gcnArchName: gfx1032 + gcnArchName: gfx1035 + + Now that you know which graphics IPs our devices use, recompile your program with + the appropriate parameters. + + .. 
code-block:: powershell + + clang++ .\HIP-Basic\saxpy\main.hip -o saxpy.exe -I .\Common -lamdhip64 -L ${env:HIP_PATH}lib -O2 --offload-arch=gfx1032 --offload-arch=gfx1035 + + Now the sample will run. + + .. code-block:: + + .\saxpy.exe + Calculating y[i] = a * x[i] + y[i] over 1000000 elements. + First 10 elements of the results: [ 3, 5, 7, 9, 11, 13, 15, 17, 19, 21 ] + + .. tab-item:: Windows and NVIDIA + :sync: windows-nvidia + + On Windows HIP with the NVIDIA back-end, the ``deviceQuery`` CUDA SDK sample + can help us list all the properties of the devices available on the system, + including which version of compute capability a device sports. + ``<major>.<minor>`` compute capability is passed to ``nvcc`` on the + command-line as ``sm_<major><minor>``, for example ``8.6`` is ``sm_86``. + + Because it's not included as a binary, compile the matching + example from ROCm. + + .. code-block:: powershell + + nvcc .\HIP-Basic\device_query\main.cpp -o device_query.exe -I .\Common -I ${env:HIP_PATH}include -O2 + + Filter the output to have only the lines of interest, for example: + + .. code-block:: powershell + + .\device_query.exe | Select-String "major.minor" + + major.minor: 8.6 + major.minor: 7.0 + + .. note:: + + Next to the ``nvcc`` executable is another tool called ``__nvcc_device_query.exe`` + which simply prints the SM Architecture numbers to standard out as a comma-separated + list of numbers. The naming of this utility suggests it's not a user-facing + executable but is used by ``nvcc`` to determine what devices are in the + system at hand. + + Now that you know which compute capabilities your devices support, recompile your program with + the appropriate parameters. + + .. code-block:: powershell + + nvcc .\HIP-Basic\saxpy\main.hip -o saxpy.exe -I ${env:HIP_PATH}include -I .\Common -O2 -x cu -arch=sm_70,sm_86 + + .. note:: + + If you want to portably target the development machine which is compiling, you + may specify ``-arch=native`` instead. + + Now the sample will run. + + .. 
code-block:: + + .\saxpy.exe + Calculating y[i] = a * x[i] + y[i] over 1000000 elements. + First 10 elements of the results: [ 3, 5, 7, 9, 11, 13, 15, 17, 19, 21 ] diff --git a/projects/hip/docs/understand/amd_clr.rst b/projects/hip/docs/understand/amd_clr.rst index 24980468e7..3a643cb051 100644 --- a/projects/hip/docs/understand/amd_clr.rst +++ b/projects/hip/docs/understand/amd_clr.rst @@ -5,16 +5,16 @@ .. _AMD_Compute_Language_Runtimes: ******************************************************************************* -AMD Common Language Runtimes (CLR) +AMD common language runtimes (CLR) ******************************************************************************* CLR contains source codes for AMD's compute languages runtimes: ``HIP`` and ``OpenCL™``. CLR is the part of HIP runtime which is supported on the AMD ROCm platform, it provides a header and runtime library built on top of HIP-Clang compiler. For developers and users, CLR implements HIP runtime APIs including streams, events, and memory APIs, which is a object library that is linked with the application. -The source codes for all headers and the library implementation are available on GitHub in the `clr repository `_. +The source codes for all headers and the library implementation are available on GitHub in the `CLR repository `_. -Project Organisation +Project organization ==================== CLR includes the following source code, diff --git a/projects/hip/docs/understand/hardware_implementation.rst b/projects/hip/docs/understand/hardware_implementation.rst index 8ee3e0e08c..7352841926 100644 --- a/projects/hip/docs/understand/hardware_implementation.rst +++ b/projects/hip/docs/understand/hardware_implementation.rst @@ -5,13 +5,13 @@ .. 
_hardware_implementation: ******************************************************************************* -Hardware Implementation +Hardware implementation ******************************************************************************* This chapter describes the typical hardware implementation of GPUs supported by HIP, and how the :ref:`inherent_thread_model` maps to the hardware. -Compute Units +Compute units ============= The basic building block of a GPU is a compute unit (CU), also known @@ -79,7 +79,7 @@ instructions of the other branch have to be executed in the same way. The best performance can therefore be achieved when thread divergence is kept to a warp level, i.e. when all threads in a warp take the same execution path. -Vector Cache +Vector cache ------------ The usage of cache on a GPU differs from that on a CPU, as there is less cache @@ -88,7 +88,7 @@ warps in order to reduce the amount of accesses to device memory, and make that memory available for other warps that currently reside on the compute unit, that also need to load those values. -Local Data Share +Local data share ---------------- The local data share is memory that is accessible to all threads within a block. @@ -103,14 +103,14 @@ The scalar unit performs instructions that are uniform within a warp. It thereby improves efficiency and reduces the pressure on the vector ALUs and the vector register file. -CDNA Architecture +CDNA architecture ================= The general structure of CUs stays mostly as it is in GCN architectures. The most prominent change is the addition of matrix ALUs, which can greatly improve the performance of algorithms involving matrix multiply-accumulate operations for -:doc:`int8, float16, bfloat16 or float32`. +:doc:`int8, float16, bfloat16 or float32`. .. figure:: ../data/understand/hardware_implementation/cdna3_cu.png :alt: Block diagram showing the structure of a CDNA3 compute unit. 
It includes @@ -122,7 +122,7 @@ multiply-accumulate operations for Block Diagram of a CDNA3 Compute Unit. -RDNA Architecture +RDNA architecture ================= RDNA makes a fundamental change to CU design, by changing the @@ -145,7 +145,7 @@ an L0 cache. Block Diagram of an RDNA3 work group processor. -Shader Engines +Shader engines ============== For hardware implementation's sake, multiple CUs are grouped diff --git a/projects/hip/docs/understand/programming_model.rst b/projects/hip/docs/understand/programming_model.rst index 88ba476a89..a4aa41fff7 100644 --- a/projects/hip/docs/understand/programming_model.rst +++ b/projects/hip/docs/understand/programming_model.rst @@ -5,28 +5,34 @@ :keywords: AMD, ROCm, HIP, CUDA, API design ******************************************************************************* -Understanding the HIP programming model +HIP programming model ******************************************************************************* The HIP programming model makes it easy to map data-parallel C/C++ algorithms to massively parallel, wide single instruction, multiple data (SIMD) architectures, -such as GPUs. A basic understanding of the underlying device architecture helps you +such as GPUs. + +While the model may be expressed in most imperative languages, (for example +Python via PyHIP) this document will focus on the original C/C++ API of HIP. + +A basic understanding of the underlying device architecture helps you make efficient use of HIP and general purpose graphics processing unit (GPGPU) programming in general. -RDNA & CDNA Architecture Summary +RDNA & CDNA architecture summary ================================ -Most GPU architectures, like RDNA and CDNA, have a hierarchical structure. -The innermost piece is a SIMD-enabled vector Arithmetic Logical Unit (ALU). -In addition to the vector ALUs, most recent GPUs also house matrix ALUs for -accelerating algorithms involving matrix multiply-accumulate operations. 
-AMD GPUs also contain scalar ALUs, that can be used to reduce the load on the -vector ALU by performing operations which are uniform for all threads of a warp. +GPUs in general are made up of basic building blocks called compute units (CUs) +that execute the threads of a kernel. These CUs provide the necessary resources +for the threads: the Arithmetic Logical Units (ALUs), register files, caches and +shared memory for efficient communication between the threads. -A set of ALUs, together with register files, caches and shared memory, comprise -a larger block, often referred to as a compute unit (CU), e.g. in OpenCL and -AMD block diagrams, or as streaming multiprocessor (SM). +This design allows for efficient execution of kernels while also being able to +scale from small GPUs embedded in APUs with few CUs up to GPUs designed for data +centers with hundreds of CUs. Figures :ref:`rdna3_cu` and :ref:`cdna3_cu` show +examples of such compute units. + +For architecture details, check :ref:`hardware_implementation`. .. _rdna3_cu: @@ -50,35 +56,60 @@ AMD block diagrams, or as streaming multiprocessor (SM). Block Diagram of a CDNA3 Compute Unit. -For implementation in hardware, multiple CUs are grouped together into -a Shader Engine or Compute Engine, typically sharing some fixed function units or -memory subsystem resources. +Heterogeneous programming +========================= -.. _cdna2_gcd: +The HIP programming model assumes two execution contexts. One is referred to as +*host* while compute kernels execute on a *device*. These contexts have +different capabilities, therefore slightly different rules apply. The *host* +execution is defined by the C++ abstract machine, while *device* execution +follows the :ref:`SIMT model <programming_model_simt>` of HIP. These execution contexts in +code are signified by the ``__host__`` and ``__device__`` decorators. There are +a few key differences between the two: -.. 
figure:: ../data/understand/programming_model/cdna2_gcd.png - :alt: Block diagram showing four Compute Engines each with 28 Compute Units - inside. These four Compute Engines share one block of L2 Cache. Around - them are four Memory Controllers. To the top and bottom of all these are - eight blocks of Infinity Fabric Links. Two Video Core Next blocks sit in - the top corners. At the very bottom spans a colored section reading - Infinity Fabric. +* The C++ abstract machine assumes a unified memory address space, meaning that + one can always access any given address in memory (assuming the absence of + data races). HIP, however, introduces several memory namespaces; an address + from one means nothing in another. Moreover, not all address spaces are + accessible from all contexts. - Block Diagram of a CDNA2 Graphics Compute Die. + Looking at :ref:`rdna3_cu` and :ref:`cdna3_cu`, you can see that + every CU has an instance of storage backing the namespace ``__shared__``. + Even if the host were to have access to these regions of + memory, the performance benefits of the segmented memory subsystem are + supported by the inability of asynchronous access from the host. + +* Not all C++ language features map cleanly to typical device architectures; + some are very expensive (meaning slow) to implement on GPU devices, therefore + they are forbidden in device contexts to avoid users tapping into features + that unexpectedly decimate their program's performance. Offload devices targeted + by HIP aren't general purpose devices, at least not in the sense that a CPU is. + HIP focuses on data parallel computations and as such caters to throughput + optimized architectures, such as GPUs or accelerators derived from GPU + architectures. + +* Asynchrony is at the forefront of the HIP API. Computations launched on the device + execute asynchronously with respect to the host, and it is the user's responsibility to + synchronize their data dispatch/fetch with computations on the device. 
+ + .. note:: + HIP does perform implicit synchronization on occasions, more advanced than other + APIs such as OpenCL or SYCL, in which the responsibility of synchronization mostly + depends on the user. .. _programming_model_simt: -Single Instruction Multiple Threads -=================================== +Single instruction multiple threads (SIMT) +========================================== -The single instruction, multiple threads (SIMT) programming model behind the -HIP device-side execution is a middle-ground between SMT (Simultaneous Multi-Threading) -programming known from multicore CPUs, and SIMD (Single Instruction, Multiple Data) programming -mostly known from exploiting relevant instruction sets on CPUs (for example SSE/AVX/Neon). +The SIMT programming model behind the HIP device-side execution is a middle-ground +between SMT (Simultaneous Multi-Threading) programming known from multicore CPUs, +and SIMD (Single Instruction, Multiple Data) programming mostly known from exploiting +relevant instruction sets on CPUs (for example SSE/AVX/Neon). A HIP device compiler maps SIMT code written in HIP C++ to an inherently SIMD architecture (like GPUs). This is done by scalarizing the entire kernel and issuing the scalar -instructions of multiple kernel instances to each of the SIMD engine lanes, rather +instructions of multiple kernel instances (called threads) to each of the SIMD engine lanes, rather than exploiting data parallelism within a single instance of a kernel and spreading identical instructions over the available SIMD engines. @@ -97,9 +128,10 @@ Consider the following kernel: The incoming four-vector of floating-point values ``b`` is multiplied by a scalar and then added element-wise to the four-vector floating-point values of -``a``. On modern SIMD-capable architectures the four-vector ops are expected to -compile to a single SIMD instruction. GPU execution of this kernel however will -typically look the following: +``a``. 
On modern SIMD-capable architectures, the four-vector ops are expected to +compile to a single SIMD instruction. However, GPU execution of this kernel will +typically break down the vector elements into 4 separate threads for parallel execution, +as seen in the following figure: .. _simt: @@ -111,46 +143,221 @@ typically look the following: Instruction flow of the sample SIMT program. -In HIP, lanes of a SIMD architecture are fed by mapping threads of a SIMT -execution, one thread down each lane of a SIMD engine. Execution parallelism -usually isn't exploited from the width of the built-in vector types, but via the -thread id constants ``threadIdx.x``, ``blockIdx.x``, etc. For more details, -refer to :ref:`inherent_thread_model`. +In HIP, lanes of the SIMD architecture are fed by mapping threads of a SIMT +execution, one thread down each lane of an SIMD engine. Execution parallelism +usually isn't exploited from the width of the built-in vector types, but across multiple threads via the thread ID constants ``threadIdx.x``, ``blockIdx.x``, etc. -Heterogeneous Programming -========================= +.. _inherent_thread_model: -The HIP programming model assumes two execution contexts. One is referred to as -*host* while compute kernels execute on a *device*. These contexts have -different capabilities, therefor slightly different rules apply. The *host* -execution is defined by the C++ abstract machine, while *device* execution -follows the HIP model, primarily defined by SIMT. These execution contexts in -code are signified by the ``__host__`` and ``__device__`` decorators. There are -a few key differences between the two: +Inherent thread model +===================== -* The C++ abstract machine assumes a unified memory address space, meaning that - one can always access any given address in memory (assuming the absence of - data races). HIP however introduces several memory namespaces, an address - from one means nothing in another. 
Moreover, not all address spaces are - accessible from all contexts. +The SIMT nature of HIP is captured by the ability to execute user-provided +device programs, expressed as single-source C/C++ functions or sources compiled +online/offline to binaries, in bulk. - If one were to look at :ref:`cdna2_gcd` and inside the :ref:`cdna3_cu`, - every Compute Unit has an instance of storage backing the namespace - ``__shared__``. Even if the host were to have access to these regions of - memory, the performance benefits of the segmented memory subsystem are - supported by the inability of asynchronous access from the host. +All threads of a kernel are uniquely identified by a set of integral values, called thread IDs. +The set of integers identifying a thread relate to the hierarchy in which the threads execute. -* Not all C++ language features map cleanly to typical device architectures, - some are very expensive (meaning slow) to implement on GPU devices, therefor - they are forbidden in device contexts to avoid users tapping into features - that unexpectedly decimate their program's performance. Offload devices targeted - by HIP aren't general purpose devices, at least not in the sense a CPU is. - HIP focuses on data parallel computations and as such caters to throughput - optimized architectures, such as GPUs or accelerators derived from GPU - architectures. +The thread hierarchy inherent to how AMD GPUs operate is depicted in the +following figure. + +.. _inherent_thread_hierarchy: + +.. figure:: ../data/understand/programming_model/thread_hierarchy.svg + :alt: Diagram depicting nested rectangles of varying color. The outermost one + titled "Grid", inside sets of uniform rectangles layered on one another + titled "Block". Each "Block" containing sets of uniform rectangles + layered on one another titled "Warp". Each of the "Warp" titled + rectangles filled with downward pointing arrows inside. + + Hierarchy of thread groups. 
+ +Warp (or Wavefront) + The innermost grouping of threads is called a warp, or a wavefront in ISA terms. A warp + is the most tightly coupled group of threads, both physically and logically. Threads + inside a warp are also called lanes, and the integral value identifying them is the lane ID. + + .. tip:: + + Lane IDs aren't queried like other thread IDs, but are user-calculated. As a + consequence, they are only as multidimensional as the user interprets the + calculated values to be. + + The size of a warp is architecture dependent and always fixed. For AMD GPUs + the wavefront is typically 64 threads, though sometimes 32 threads. Warps are + signified by the set of communication primitives at their disposal, as + discussed in :ref:`warp-cross-lane`. + +.. _inherent_thread_hierarchy_block: + +Block + The middle grouping is called a block or thread block. The defining feature + of a block is that all threads in a block will share an instance of memory + which they may use to share data or synchronize with one another. + + The size of a block is user-configurable but is limited by the queryable + capabilities of the executing hardware. The unique ID of the thread within a + block is 3-dimensional as provided by the API. When linearizing thread IDs + within a block, assume the "fast index" being dimension ``x``, followed by + the ``y`` and ``z`` dimensions. + +.. _inherent_thread_hierarchy_grid: + +Grid + The outermost grouping is called a grid. A grid manifests as a single + dispatch of kernels for execution. The unique ID of each block within a grid + is 3-dimensional, as provided by the API and is queryable by every thread + within the block. + +Cooperative groups thread model +------------------------------- + +The Cooperative groups API introduces new APIs to launch, group, subdivide, +synchronize and identify threads, as well as some predefined group-collective +algorithms, but most importantly a matching threading model to think in terms of.
+It relaxes some restrictions of the :ref:`inherent_thread_model` imposed by the +strict 1:1 mapping of architectural details to the programming model. Cooperative +groups let you define your own set of thread groups which may fit your use cases +better than the defaults defined by the hardware. + +.. note:: + The implicit groups defined by kernel launch parameters are still available + when working with cooperative groups. + +For further information, see :doc:`Cooperative groups `. + +Memory model +============ + +The hierarchy of threads introduced by the :ref:`inherent_thread_model` is induced +by the memory subsystem of GPUs. The following figure summarizes the memory +namespaces and how they relate to the various levels of the threading model. + +.. _memory_hierarchy: + +.. figure:: ../data/understand/programming_model/memory_hierarchy.svg + :alt: Diagram depicting nested rectangles of varying color. The outermost one + titled "Grid", inside it are two identical rectangles titled "Block", + inside them are ones titled "Local" with multiple "Warp" titled rectangles. + Blocks have not just Local inside, but also rectangles titled "Shared". + Inside the Grid is a rectangle titled "Global" with three others inside: + "Surface", "Texture" (same color) and "Constant" (different color). + + Memory hierarchy. + +Local or per-thread memory + Read-write storage only visible to the threads defining the given variables, + also called per-thread memory. The size of a block for a given kernel, and thereby + the number of concurrent warps, are limited by local memory usage. + This relates to an important aspect: occupancy. This is the default memory + namespace. + +Shared memory + Read-write storage visible to all the threads in a given block. + +Global + Read-write storage visible to all threads in a given grid. There are + specialized versions of global memory with different usage semantics which + are typically backed by the same hardware storing global.
+ + Constant + Read-only storage visible to all threads in a given grid. It is a limited + segment of global with queryable size. + + Texture + Read-only storage visible to all threads in a given grid and accessible + through additional APIs. + + Surface + A read-write version of texture memory. + +Execution model +=============== + +HIP programs consist of two distinct scopes: + +* The host-side API running on the host processor. There are two APIs available: + + * The HIP runtime API which enables use of the single-source programming + model. + + * The HIP driver API which sits at a lower level and most importantly differs + by removing some facilities provided by the runtime API, most + importantly around kernel launching and argument setting. It is geared + towards implementing abstractions atop, such as the runtime API itself. + It offers two additional pieces of functionality not provided by the Runtime + API: ``hipModule`` and ``hipCtx`` APIs. For further details, check + :doc:`HIP driver API `. + +* The device-side kernels running on GPUs. Both the host and the device-side + APIs have synchronous and asynchronous functions in them. + +.. note:: + + HIP does not present two *separate* APIs like NVIDIA CUDA. HIP only extends + the HIP runtime API with new APIs for ``hipModule`` and ``hipCtx``. + +Host-side execution +------------------- + +The part of the host-side API which deals with device management and their +queries are synchronous. All asynchronous APIs, such as kernel execution, data +movement and potentially data allocation/freeing all happen in the context of +device streams. + +Streams are FIFO buffers of commands to execute relating to a given device. +Commands which enqueue tasks on a stream all return promptly and the command is +executed asynchronously. All side effects of a command on a stream are visible +to all subsequent commands on the same stream.
Multiple streams may point to +the same device and those streams may be fed from multiple concurrent host-side +threads. Execution on multiple streams may be concurrent but isn't required to +be. + +Asynchronous APIs involving a stream all return a stream event which may be +used to synchronize the execution of multiple streams. A user may enqueue a +barrier onto a stream referencing an event. The barrier will block until +the command related to the event completes, at which point all +side effects of the command shall be visible to commands following the barrier, +even if those side effects manifest on different devices. + +Streams also support executing user-defined functions as callbacks on the host. +The stream will not launch subsequent commands until the callback completes. + +Device-side execution +--------------------- + +The SIMT programming model behind the HIP device-side execution is a +middle-ground between SMT (Simultaneous Multi-Threading) programming known from +multicore CPUs, and SIMD (Single Instruction, Multiple Data) programming +mostly known from exploiting relevant instruction sets on CPUs (for example +SSE/AVX/Neon). + +Kernel launch +------------- + +Kernels may be launched in multiple ways all with different syntaxes and +intended use-cases. + +* Using the triple-chevron ``<<<...>>>`` operator on a ``__global__`` annotated + function. + +* Using ``hipLaunchKernelGGL()`` on a ``__global__`` annotated function. + + .. tip:: + + This name by default is a macro expanding to triple-chevron. In cases where + language syntax extensions are undesirable, or where launching templated + and/or overloaded kernel functions, define the + ``HIP_TEMPLATE_KERNEL_LAUNCH`` preprocessor macro before including the HIP + headers to turn it into a templated function. + +* Using the launch APIs supporting the triple-chevron syntax directly. + + ..
caution:: + + These APIs are intended to be used/generated by tools such as the HIP + compiler itself and not intended towards end-user code. Should you be + writing a tool having to launch device code using HIP, consider using these + over the alternatives. -* Asynchrony is at the forefront of the HIP API. Computations launched on the device - execute asynchronously with respect to the host, and it is the user's responsibility to - synchronize their data dispatch/fetch with computations on the device. HIP - does perform implicit synchronization on occasions, more advanced than other APIs such as - OpenCL or SYCL, in which the responsibility of synchronization mostly depends on the user. diff --git a/projects/hip/docs/understand/programming_model_reference.rst b/projects/hip/docs/understand/programming_model_reference.rst deleted file mode 100644 index 7c42569543..0000000000 --- a/projects/hip/docs/understand/programming_model_reference.rst +++ /dev/null @@ -1,258 +0,0 @@ -.. meta:: - :description: This chapter describes the HIP programming model, the contract - between the programmer and the compiler/runtime executing the - code. - :keywords: AMD, ROCm, HIP, CUDA, C++ language extensions - -******************************************************************************* -Programming model reference -******************************************************************************* - -HIP defines a model for mapping single instruction, multiple threads (SIMT) programs -onto various architectures, primarily GPUs. While the model may be expressed -in most imperative languages, (for example Python via PyHIP) this document will focus on -the original C/C++ API of HIP. - -Threading Model -=============== - -The SIMT nature of HIP is captured by the ability to execute user-provided -device programs, expressed as single-source C/C++ functions or sources compiled -online/offline to binaries in bulk. 
- -Multiple instances of the device program (or kernel) are called threads and may -execute in parallel. All uniquely identified by a set of integral values, or thread IDs. -The set of integers identifying a thread relate to the hierarchy in which threads execute. - -.. _inherent_thread_model: - -Inherent Thread Model ---------------------- - -The thread hierarchy inherent to how AMD GPUs operate is depicted in -:numref:`inherent_thread_hierarchy`. - -.. _inherent_thread_hierarchy: - -.. figure:: ../data/understand/programming_model_reference/thread_hierarchy.svg - :alt: Diagram depicting nested rectangles of varying color. The outermost one - titled "Grid", inside sets of uniform rectangles layered on one another - titled "Block". Each "Block" containing sets of uniform rectangles - layered on one another titled "Warp". Each of the "Warp" titled - rectangles filled with downward pointing arrows inside. - - Hierarchy of thread groups. - -Warp - The innermost grouping is called a warp, or a wavefront in ISA terms. A warp - is the most tightly coupled groups of threads, both physically and logically. - - When referring to threads inside a warp, they may be called lanes, and the - integral value identifying them the lane ID. Lane IDs aren't queried like - other thread IDs, but are user-calculated. As a consequence they are only as - multidimensional as the user interprets the calculated values to be. - - The size of a warp is architecture dependent and always fixed. Warps are - signified by the set of communication primitives at their disposal, as - discussed in :ref:`warp-cross-lane`. - -Block - The middle grouping is called a block or thread block. The defining feature - of a block is that all threads in a block will share an instance of memory - which they may use to share data or synchronize with one another. - - The size of a block is user-configurable but is maximized by the queryable - capabilities of the executing hardware. 
The unique ID of the thread within a - block is 3-dimensional as provided by the API. When linearizing thread IDs - within a block, assume the "fast index" being dimension ``x``, followed by - the ``y`` and ``z`` dimensions. - -Grid - The outermost grouping is called a grid. A grid manifests as a single - dispatch of kernels for execution. The unique ID of each block within a grid - is 3-dimensional, as provided by the API and is queryable by every thread - within the block. - -Cooperative Groups Thread Model -------------------------------- - -The Cooperative Groups API introduces new APIs to launch, group, subdivide, -synchronize and identify threads, as well as some predefined group-collective -algorithms, but most importantly a matching threading model to think in terms -of. It relaxes some restrictions of the :ref:`inherent_thread_model` -imposed by the strict 1:1 mapping of architectural details to the programming -model. - -The rich set of APIs introduced by Cooperative Groups allow the programmer to -define their own set of thread groups which may fit their user-cases better than -those defined by the hardware. The set of implicit groups by kernel launch -parameters are still available. - -The thread hierarchy abstraction of Cooperative Groups manifest as depicted in -:numref:`coop_thread_hierarchy`. - -.. _coop_thread_hierarchy: - -.. figure:: ../data/understand/programming_model_reference/thread_hierarchy_coop.svg - :alt: Diagram depicting nested rectangles of varying color. The outermost one - titled "Grid", inside sets of different sized rectangles layered on - one another titled "Block". Each "Block" containing sets of uniform - rectangles layered on one another titled "Warp". Each of the "Warp" - titled rectangles filled with downward pointing arrows inside. - - Cooperative group thread hierarchy. - -Multi Grid - An abstraction of potentially multiple simultaneous launches of - the same kernel over multiple devices. 
Grids inside a multi device kernel - launch need not be of uniform size, thus allowing taking into account - different device capabilities and preferences. - - .. deprecated:: 5.0 - -Grid - Same as the :ref:`inherent_thread_model` Grid entity. The ability to - synchronize over a grid requires the kernel to be launched using the - Cooperative Groups API. - -Block - Same as the :ref:`inherent_thread_model` Block entity. - -.. note:: - - Explicit warp-level thread handling is absent from the Cooperative Groups API. - In order to exploit the known hardware SIMD width on which built-in - functionality translates to simpler logic, one may use the group partitioning - part of the API, such as ``tiled_partition``. - -Memory Model -============ - -The hierarchy of threads introduced by :ref:`inherent_thread_model` is induced -by the memory subsystem of GPUs. :numref:`memory_hierarchy` summarizes that memory namespaces and -how they relate to the various levels of the threading model. - -.. _memory_hierarchy: - -.. figure:: ../data/understand/programming_model_reference/memory_hierarchy.svg - :alt: Diagram depicting nested rectangles of varying color. The outermost one - titled "Grid", inside it are two identical rectangles titled "Block", - inside them are ones titled "Local" with multiple "Warp" titled rectangles. - Blocks have not just Local inside, but also rectangles titled "Shared". - Inside the Grid is a rectangle titled "Global" with three others inside: - "Surface", "Texture" (same color) and "Constant" (different color). - - Memory hierarchy. - -Local or per-thread memory - Read-write storage only visible to the threads defining the given variables, - also called per-thread memory. The size of a block for a given kernel, - the number of concurrent warps are limited by local memory usage. - This relates to an important aspect: occupancy. This is the default memory - namespace. - -Shared memory - Read-write storage visible to all the threads in a given block. 
- -Global - Read-write storage visible to all threads in a given grid. There are - specialized versions of global memory with different usage semantics which - are typically backed by the same hardware storing global. - - Constant - Read-only storage visible to all threads in a given grid. It is a limited - segment of global with queryable size. - - Texture - Read-only storage visible to all threads in a given grid and accessible - through additional APIs. - - Surface - A read-write version of texture memory. - -Execution Model -=============== - -HIP programs consist of two distinct scopes: - -* The host-side API running on the host processor. There are to APIs available: - - * The HIP runtime API which enables use of the single-source programming - model. - - * The HIP driver API which sits at a lower level and most importantly differs - by removing some facilities provided by the runtime API, most - importantly around kernel launching and argument setting. It is geared - towards implementing abstractions atop, such as the runtime API itself. - -* The device-side kernels running on GPUs. Both the host and the device-side - APIs have synchronous and asynchronous functions in them. - -Host-side execution -------------------- - -The part of the host-side API which deals with device management and their -queries are synchronous. All asynchronous APIs, such as kernel execution, data -movement and potentially data allocation/freeing all happen in the context of -device streams. - -Streams are FIFO buffers of commands to execute relating to a given device. -Commands which enqueue tasks on a stream all return promptly and the command is -executed asynchronously. All side effects of a command on a stream are visible -to all subsequent commands on the same stream. Multiple streams may point to -the same device and those streams may be fed from multiple concurrent host-side -threads. Execution on multiple streams may be concurrent but isn't required to -be. 
- -Asynchronous APIs involving a stream all return a stream event which may be -used to synchronize the execution of multiple streams. A user may enqueue a -barrier onto a stream referencing an event. The barrier will block until -the command related to the event does not complete, at which point all -side effects of the command shall be visible to commands following the barrier, -even if those side effects manifest on different devices. - -Streams also support executing user-defined functions as callbacks on the host. -The stream will not launch subsequent commands until the callback completes. - -Device-side execution ---------------------- - -The SIMT programming model behind the HIP device-side execution is a -middle-ground between SMT (Simultaneous Multi-Threading) programming known from -multicore CPUs, and SIMD (Single Instruction, Multiple Data) programming -mostly known from exploiting relevant instruction sets on CPUs (for example -SSE/AVX/Neon). - -A HIP device compiler maps our SIMT code written in HIP C++ to an inherently -SIMD architecture (like GPUs) not by exploiting data parallelism within a -single instance of a kernel and spreading identical instructions over the SIMD -engines at hand, but by scalarizing the entire kernel and issuing the scalar -instructions of multiple kernel instances to each of the SIMD engine lanes. - -Kernel launch -------------- - -Kernels may be launched in multiple ways all with different syntaxes and -intended use-cases. - -* Using the triple-chevron ``<<<...>>>`` operator on a ``__global__`` annotated - function. - -* Using ``hipLaunchKernelGGL()`` on a ``__global__`` annotated function. - - .. tip:: - - This name by default is a macro expanding to triple-chevron. 
In cases where - language syntax extensions are undesirable, or where launching templated - and/or overloaded kernel functions define the - ``HIP_TEMPLATE_KERNEL_LAUNCH`` preprocessor macro before including the HIP - headers to turn it into a templated function. - -* Using the launch APIs supporting the triple-chevron syntax directly. - - .. caution:: - - These APIs are intended to be used/generated by tools such as the HIP - compiler itself and not intended towards end-user code. Should you be - writing a tool having to launch device code using HIP, consider using these - over the alternatives. diff --git a/projects/hip/docs/understand/texture_fetching.rst b/projects/hip/docs/understand/texture_fetching.rst new file mode 100644 index 0000000000..498e5723f3 --- /dev/null +++ b/projects/hip/docs/understand/texture_fetching.rst @@ -0,0 +1,212 @@ +.. meta:: + :description: This chapter describes the texture fetching modes of the HIP ecosystem + ROCm software. + :keywords: AMD, ROCm, HIP, Texture, Texture Fetching + +******************************************************************************* +Texture fetching +******************************************************************************* + +`Textures <../doxygen/html/group___texture.html>`_ are more than just a buffer +interpreted as a 1D, 2D, or 3D array. + +As textures are associated with graphics, they are indexed using floating-point +values. The index can be in the range of [0 to size-1] or [0 to 1]. + +Depending on the index, texture sampling or texture addressing is performed, +which decides the return value. + +**Texture sampling**: When a texture is indexed with a fraction, the queried +value is often between two or more texels (texture elements). The sampling +method defines what value to return in such cases. + +**Texture addressing**: Sometimes, the index is outside the bounds of the +texture. 
This condition might look like a problem but helps to put a texture on +a surface multiple times or to create a visible sign of out-of-bounds indexing, +in computer graphics. The addressing mode defines what value to return when +indexing a texture out of bounds. + +The different sampling and addressing modes are described in the following +sections. + +Here is the sample texture used in this document for demonstration purposes. It +is 2x2 texels and indexed in the [0 to 1] range. + +.. figure:: ../data/understand/textures/original.png + :width: 150 + :alt: Sample texture + :align: center + + Texture used as example + +Texture sampling +=============================================================================== + +Texture sampling handles the usage of fractional indices. It is the method that +describes which nearby values will be used, and how they are combined into the +resulting value. + +The various texture sampling methods are discussed in the following sections. + +.. _texture_fetching_nearest: + +Nearest point sampling +------------------------------------------------------------------------------- + +In this method, the index is rounded down and the texel at that position is returned: + +``tex(x) = T[floor(x)]`` + +This is also applicable for 2D and 3D variants. + +This doesn't interpolate between neighboring values, which results in a +pixelated look. + +The following image shows a texture stretched to a 4x4 pixel quad but still +indexed in the [0 to 1] range. The in-between values are the same as the values +of the nearest texel. + +.. figure:: ../data/understand/textures/nearest.png + :width: 300 + :alt: Texture upscaled with nearest point sampling + :align: center + + Texture upscaled with nearest point sampling + +.. _texture_fetching_linear: + +Linear filtering +------------------------------------------------------------------------------- + +The linear filtering method does a linear interpolation between values.
Linear +interpolation is used to create a linear transition between two values. The +formula used is ``(1-t)P1 + tP2`` where ``P1`` and ``P2`` are the values and +``t`` is within the [0 to 1] range. + +In the case of texture sampling the following formulas are used: + +* For one dimensional textures: ``tex(x) = (1-α)T[i] + αT[i+1]`` +* For two dimensional textures: ``tex(x,y) = (1-α)(1-β)T[i,j] + α(1-β)T[i+1,j] + (1-α)βT[i,j+1] + αβT[i+1,j+1]`` +* For three dimensional textures: ``tex(x,y,z) = (1-α)(1-β)(1-γ)T[i,j,k] + α(1-β)(1-γ)T[i+1,j,k] + (1-α)β(1-γ)T[i,j+1,k] + αβ(1-γ)T[i+1,j+1,k] + (1-α)(1-β)γT[i,j,k+1] + α(1-β)γT[i+1,j,k+1] + (1-α)βγT[i,j+1,k+1] + αβγT[i+1,j+1,k+1]`` + +Where x, y, and z are the floating-point indices, i, j, and k are the integer +indices, and the α, β, and γ values represent how far along the sampled point is on +the three axes. These values are calculated by these formulas: ``i = floor(x')``, ``α = frac(x')``, ``x' = x - 0.5``, ``j = floor(y')``, ``β = frac(y')``, ``y' = y - 0.5``, ``k = floor(z')``, ``γ = frac(z')`` and ``z' = z - 0.5``. + +The following image shows a texture stretched out to a 4x4 pixel quad, but +still indexed in the [0 to 1] range. The in-between values are interpolated +between the neighboring texels. + +.. figure:: ../data/understand/textures/linear.png + :width: 300 + :alt: Texture upscaled with linear filtering + :align: center + + Texture upscaled with linear filtering + +Texture addressing +=============================================================================== + +Texture addressing mode handles the index that is out of bounds of the texture. +This mode describes which values of the texture or a preset value to use when +the index is out of bounds. + +The following sections describe the various texture addressing methods. + +..
_texture_fetching_border: + +Address mode border +------------------------------------------------------------------------------- + +In this method, the texture fetching returns a border value when indexing out of +bounds. The border value must be set before texture fetching. + +The following image shows the texture on a 4x4 pixel quad, indexed in the +[0 to 3] range. The out-of-bounds values are the border color, which is yellow. + +.. figure:: ../data/understand/textures/border.png + :width: 300 + :alt: Texture with yellow border color + :align: center + + Texture with yellow border color. + +The purple lines are not part of the texture. They only denote the edge, where +the addressing begins. + +.. _texture_fetching_clamp: + +Address mode clamp +------------------------------------------------------------------------------- + +This mode clamps the index between [0 to size-1]. Due to this, when indexing +out-of-bounds, the values on the edge of the texture repeat. The clamp mode is +the default addressing mode. + +The following image shows the texture on a 4x4 pixel quad, indexed in the +[0 to 3] range. The out-of-bounds values are repeating the values at the edge of +the texture. + +.. figure:: ../data/understand/textures/clamp.png + :width: 300 + :alt: Texture with clamp addressing + :align: center + + Texture with clamp addressing + +The purple lines are not part of the texture. They only denote the edge, where +the addressing begins. + +.. _texture_fetching_wrap: + +Address mode wrap +------------------------------------------------------------------------------- + +Wrap mode addressing is only available for normalized texture coordinates. In +this addressing mode, the fractional part of the index is used: + +``tex(frac(x))`` + +This creates a repeating image effect. + +The following image shows the texture on a 4x4 pixel quad, indexed in the +[0 to 3] range. The out-of-bounds values are repeating the original texture. + +.. 
figure:: ../data/understand/textures/wrap.png + :width: 300 + :alt: Texture with wrap addressing + :align: center + + Texture with wrap addressing. + +The purple lines are not part of the texture. They only denote the edge, where +the addressing begins. + +.. _texture_fetching_mirror: + +Address mode mirror +------------------------------------------------------------------------------- + +Similar to the wrap mode, the mirror mode is only available for normalized +texture coordinates and also creates a repeating image, but mirrors the +neighboring instances. + +The formula is the following: + +``tex(frac(x))``, if ``floor(x)`` is even, + +``tex(1 - frac(x))``, if ``floor(x)`` is odd. + +The following image shows the texture on a 4x4 pixel quad, indexed in the +[0 to 3] range. The out-of-bounds values are repeating the original texture, but +mirrored. + +.. figure:: ../data/understand/textures/mirror.png + :width: 300 + :alt: Texture with mirror addressing + :align: center + + Texture with mirror addressing + +The purple lines are not part of the texture. They only denote the edge, where +the addressing begins. diff --git a/projects/hip/include/hip/driver_types.h b/projects/hip/include/hip/driver_types.h index 3551f9d596..4c7bec15b1 100644 --- a/projects/hip/include/hip/driver_types.h +++ b/projects/hip/include/hip/driver_types.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2015 - 2024 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -36,298 +36,341 @@ THE SOFTWARE. #include #endif #endif // !defined(__HIPCC_RTC__) + +/** + * @defgroup DriverTypes Driver Types + * @{ + * This section describes the driver data types. 
+ * + */ + typedef void* hipDeviceptr_t; +/** + * HIP channel format kinds + */ typedef enum hipChannelFormatKind { - hipChannelFormatKindSigned = 0, - hipChannelFormatKindUnsigned = 1, - hipChannelFormatKindFloat = 2, - hipChannelFormatKindNone = 3 + hipChannelFormatKindSigned = 0, ///< Signed channel format + hipChannelFormatKindUnsigned = 1, ///< Unsigned channel format + hipChannelFormatKindFloat = 2, ///< Float channel format + hipChannelFormatKindNone = 3 ///< No channel format }hipChannelFormatKind; +/** + * HIP channel format descriptor + */ typedef struct hipChannelFormatDesc { int x; int y; int z; int w; - enum hipChannelFormatKind f; + enum hipChannelFormatKind f; ///< Channel format kind }hipChannelFormatDesc; +/** @brief The hipTexRefSetArray function flags parameter override format value*/ #define HIP_TRSA_OVERRIDE_FORMAT 0x01 +/** @brief The hipTexRefSetFlags function flags parameter read as integer value*/ #define HIP_TRSF_READ_AS_INTEGER 0x01 +/** @brief The hipTexRefSetFlags function flags parameter normalized coordinate value*/ #define HIP_TRSF_NORMALIZED_COORDINATES 0x02 +/** @brief The hipTexRefSetFlags function flags parameter srgb value*/ #define HIP_TRSF_SRGB 0x10 typedef struct hipArray* hipArray_t; typedef const struct hipArray* hipArray_const_t; +/** + * HIP array format + */ typedef enum hipArray_Format { - HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01, - HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02, - HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03, - HIP_AD_FORMAT_SIGNED_INT8 = 0x08, - HIP_AD_FORMAT_SIGNED_INT16 = 0x09, - HIP_AD_FORMAT_SIGNED_INT32 = 0x0a, - HIP_AD_FORMAT_HALF = 0x10, - HIP_AD_FORMAT_FLOAT = 0x20 + HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01, ///< Unsigned 8-bit array format + HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02, ///< Unsigned 16-bit array format + HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03, ///< Unsigned 32-bit array format + HIP_AD_FORMAT_SIGNED_INT8 = 0x08, ///< Signed 8-bit array format + HIP_AD_FORMAT_SIGNED_INT16 = 0x09, ///< Signed 16-bit array format + 
HIP_AD_FORMAT_SIGNED_INT32 = 0x0a, ///< Signed 32-bit array format + HIP_AD_FORMAT_HALF = 0x10, ///< Half array format + HIP_AD_FORMAT_FLOAT = 0x20 ///< Float array format }hipArray_Format; +/** + * HIP array descriptor + */ typedef struct HIP_ARRAY_DESCRIPTOR { - size_t Width; - size_t Height; - enum hipArray_Format Format; - unsigned int NumChannels; + size_t Width; ///< Width of the array + size_t Height; ///< Height of the array + enum hipArray_Format Format; ///< Format of the array + unsigned int NumChannels; ///< Number of channels of the array }HIP_ARRAY_DESCRIPTOR; + +/** + * HIP 3D array descriptor + */ typedef struct HIP_ARRAY3D_DESCRIPTOR { - size_t Width; - size_t Height; - size_t Depth; - enum hipArray_Format Format; - unsigned int NumChannels; - unsigned int Flags; + size_t Width; ///< Width of the array + size_t Height; ///< Height of the array + size_t Depth; ///< Depth of the array + enum hipArray_Format Format; ///< Format of the array + unsigned int NumChannels; ///< Number of channels of the array + unsigned int Flags; ///< Flags of the array }HIP_ARRAY3D_DESCRIPTOR; #if !defined(__HIPCC_RTC__) +/** + * HIP 2D memory copy parameters + */ typedef struct hip_Memcpy2D { - size_t srcXInBytes; - size_t srcY; - hipMemoryType srcMemoryType; - const void* srcHost; - hipDeviceptr_t srcDevice; - hipArray_t srcArray; - size_t srcPitch; - size_t dstXInBytes; - size_t dstY; - hipMemoryType dstMemoryType; - void* dstHost; - hipDeviceptr_t dstDevice; - hipArray_t dstArray; - size_t dstPitch; - size_t WidthInBytes; - size_t Height; + size_t srcXInBytes; ///< Source X offset in bytes + size_t srcY; ///< Source Y offset (row) + hipMemoryType srcMemoryType; ///< Source memory type + const void* srcHost; ///< Source pointer + hipDeviceptr_t srcDevice; ///< Source device + hipArray_t srcArray; ///< Source array + size_t srcPitch; ///< Source pitch + size_t dstXInBytes; ///< Destination X offset in bytes + size_t dstY; ///< Destination Y offset (row) + hipMemoryType dstMemoryType; ///< 
Destination memory type + void* dstHost; ///< Destination pointer + hipDeviceptr_t dstDevice; ///< Destination device + hipArray_t dstArray; ///< Destination array + size_t dstPitch; ///< Destination pitch + size_t WidthInBytes; ///< Width in bytes of the 2D memory copy + size_t Height; ///< Height of the 2D memory copy } hip_Memcpy2D; #endif // !defined(__HIPCC_RTC__) +/** + * HIP mipmapped array + */ typedef struct hipMipmappedArray { - void* data; - struct hipChannelFormatDesc desc; - unsigned int type; - unsigned int width; - unsigned int height; - unsigned int depth; - unsigned int min_mipmap_level; - unsigned int max_mipmap_level; - unsigned int flags; - enum hipArray_Format format; - unsigned int num_channels; + void* data; ///< Data pointer of the mipmapped array + struct hipChannelFormatDesc desc; ///< Description of the mipmapped array + unsigned int type; ///< Type of the mipmapped array + unsigned int width; ///< Width of the mipmapped array + unsigned int height; ///< Height of the mipmapped array + unsigned int depth; ///< Depth of the mipmapped array + unsigned int min_mipmap_level; ///< Minimum level of the mipmapped array + unsigned int max_mipmap_level; ///< Maximum level of the mipmapped array + unsigned int flags; ///< Flags of the mipmapped array + enum hipArray_Format format; ///< Format of the mipmapped array + unsigned int num_channels; ///< Number of channels of the mipmapped array } hipMipmappedArray; +/** + * HIP mipmapped array pointer + */ typedef struct hipMipmappedArray* hipMipmappedArray_t; typedef hipMipmappedArray_t hipmipmappedArray; typedef const struct hipMipmappedArray* hipMipmappedArray_const_t; /** - * hip resource types + * HIP resource types */ typedef enum hipResourceType { - hipResourceTypeArray = 0x00, - hipResourceTypeMipmappedArray = 0x01, - hipResourceTypeLinear = 0x02, - hipResourceTypePitch2D = 0x03 + hipResourceTypeArray = 0x00, ///< Array resource + hipResourceTypeMipmappedArray = 0x01, ///< Mipmapped array 
resource + hipResourceTypeLinear = 0x02, ///< Linear resource + hipResourceTypePitch2D = 0x03 ///< Pitch 2D resource }hipResourceType; typedef enum HIPresourcetype_enum { - HIP_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resoure */ - HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */ - HIP_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */ - HIP_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */ + HIP_RESOURCE_TYPE_ARRAY = 0x00, ///< Array resource + HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, ///< Mipmapped array resource + HIP_RESOURCE_TYPE_LINEAR = 0x02, ///< Linear resource + HIP_RESOURCE_TYPE_PITCH2D = 0x03 ///< Pitch 2D resource } HIPresourcetype, hipResourcetype; /** - * hip address modes + * HIP texture address modes */ typedef enum HIPaddress_mode_enum { - HIP_TR_ADDRESS_MODE_WRAP = 0, - HIP_TR_ADDRESS_MODE_CLAMP = 1, - HIP_TR_ADDRESS_MODE_MIRROR = 2, - HIP_TR_ADDRESS_MODE_BORDER = 3 + HIP_TR_ADDRESS_MODE_WRAP = 0, ///< Wrap address mode + HIP_TR_ADDRESS_MODE_CLAMP = 1, ///< Clamp address mode + HIP_TR_ADDRESS_MODE_MIRROR = 2, ///< Mirror address mode + HIP_TR_ADDRESS_MODE_BORDER = 3 ///< Border address mode } HIPaddress_mode; /** - * hip filter modes + * HIP filter modes */ typedef enum HIPfilter_mode_enum { - HIP_TR_FILTER_MODE_POINT = 0, - HIP_TR_FILTER_MODE_LINEAR = 1 + HIP_TR_FILTER_MODE_POINT = 0, ///< Filter mode point + HIP_TR_FILTER_MODE_LINEAR = 1 ///< Filter mode linear } HIPfilter_mode; /** - * Texture descriptor + * HIP texture descriptor */ typedef struct HIP_TEXTURE_DESC_st { - HIPaddress_mode addressMode[3]; /**< Address modes */ - HIPfilter_mode filterMode; /**< Filter mode */ - unsigned int flags; /**< Flags */ - unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */ - HIPfilter_mode mipmapFilterMode; /**< Mipmap filter mode */ - float mipmapLevelBias; /**< Mipmap level bias */ - float minMipmapLevelClamp; /**< Mipmap minimum level clamp */ - float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ - 
float borderColor[4]; /**< Border Color */ + HIPaddress_mode addressMode[3]; ///< Address modes + HIPfilter_mode filterMode; ///< Filter mode + unsigned int flags; ///< Flags + unsigned int maxAnisotropy; ///< Maximum anisotropy ratio + HIPfilter_mode mipmapFilterMode; ///< Mipmap filter mode + float mipmapLevelBias; ///< Mipmap level bias + float minMipmapLevelClamp; ///< Mipmap minimum level clamp + float maxMipmapLevelClamp; ///< Mipmap maximum level clamp + float borderColor[4]; ///< Border Color int reserved[12]; } HIP_TEXTURE_DESC; /** - * hip texture resource view formats + * HIP texture resource view formats */ typedef enum hipResourceViewFormat { - hipResViewFormatNone = 0x00, - hipResViewFormatUnsignedChar1 = 0x01, - hipResViewFormatUnsignedChar2 = 0x02, - hipResViewFormatUnsignedChar4 = 0x03, - hipResViewFormatSignedChar1 = 0x04, - hipResViewFormatSignedChar2 = 0x05, - hipResViewFormatSignedChar4 = 0x06, - hipResViewFormatUnsignedShort1 = 0x07, - hipResViewFormatUnsignedShort2 = 0x08, - hipResViewFormatUnsignedShort4 = 0x09, - hipResViewFormatSignedShort1 = 0x0a, - hipResViewFormatSignedShort2 = 0x0b, - hipResViewFormatSignedShort4 = 0x0c, - hipResViewFormatUnsignedInt1 = 0x0d, - hipResViewFormatUnsignedInt2 = 0x0e, - hipResViewFormatUnsignedInt4 = 0x0f, - hipResViewFormatSignedInt1 = 0x10, - hipResViewFormatSignedInt2 = 0x11, - hipResViewFormatSignedInt4 = 0x12, - hipResViewFormatHalf1 = 0x13, - hipResViewFormatHalf2 = 0x14, - hipResViewFormatHalf4 = 0x15, - hipResViewFormatFloat1 = 0x16, - hipResViewFormatFloat2 = 0x17, - hipResViewFormatFloat4 = 0x18, - hipResViewFormatUnsignedBlockCompressed1 = 0x19, - hipResViewFormatUnsignedBlockCompressed2 = 0x1a, - hipResViewFormatUnsignedBlockCompressed3 = 0x1b, - hipResViewFormatUnsignedBlockCompressed4 = 0x1c, - hipResViewFormatSignedBlockCompressed4 = 0x1d, - hipResViewFormatUnsignedBlockCompressed5 = 0x1e, - hipResViewFormatSignedBlockCompressed5 = 0x1f, - hipResViewFormatUnsignedBlockCompressed6H = 0x20, - 
hipResViewFormatSignedBlockCompressed6H = 0x21, - hipResViewFormatUnsignedBlockCompressed7 = 0x22 + hipResViewFormatNone = 0x00, ///< No resource view format (use underlying resource format) + hipResViewFormatUnsignedChar1 = 0x01, ///< 1 channel, unsigned 8-bit integers + hipResViewFormatUnsignedChar2 = 0x02, ///< 2 channels, unsigned 8-bit integers + hipResViewFormatUnsignedChar4 = 0x03, ///< 4 channels, unsigned 8-bit integers + hipResViewFormatSignedChar1 = 0x04, ///< 1 channel, signed 8-bit integers + hipResViewFormatSignedChar2 = 0x05, ///< 2 channels, signed 8-bit integers + hipResViewFormatSignedChar4 = 0x06, ///< 4 channels, signed 8-bit integers + hipResViewFormatUnsignedShort1 = 0x07, ///< 1 channel, unsigned 16-bit integers + hipResViewFormatUnsignedShort2 = 0x08, ///< 2 channels, unsigned 16-bit integers + hipResViewFormatUnsignedShort4 = 0x09, ///< 4 channels, unsigned 16-bit integers + hipResViewFormatSignedShort1 = 0x0a, ///< 1 channel, signed 16-bit integers + hipResViewFormatSignedShort2 = 0x0b, ///< 2 channels, signed 16-bit integers + hipResViewFormatSignedShort4 = 0x0c, ///< 4 channels, signed 16-bit integers + hipResViewFormatUnsignedInt1 = 0x0d, ///< 1 channel, unsigned 32-bit integers + hipResViewFormatUnsignedInt2 = 0x0e, ///< 2 channels, unsigned 32-bit integers + hipResViewFormatUnsignedInt4 = 0x0f, ///< 4 channels, unsigned 32-bit integers + hipResViewFormatSignedInt1 = 0x10, ///< 1 channel, signed 32-bit integers + hipResViewFormatSignedInt2 = 0x11, ///< 2 channels, signed 32-bit integers + hipResViewFormatSignedInt4 = 0x12, ///< 4 channels, signed 32-bit integers + hipResViewFormatHalf1 = 0x13, ///< 1 channel, 16-bit floating point + hipResViewFormatHalf2 = 0x14, ///< 2 channels, 16-bit floating point + hipResViewFormatHalf4 = 0x15, ///< 4 channels, 16-bit floating point + hipResViewFormatFloat1 = 0x16, ///< 1 channel, 32-bit floating point + hipResViewFormatFloat2 = 0x17, ///< 2 channels, 32-bit floating point + hipResViewFormatFloat4 
= 0x18, ///< 4 channels, 32-bit floating point + hipResViewFormatUnsignedBlockCompressed1 = 0x19, ///< Block-compressed 1 + hipResViewFormatUnsignedBlockCompressed2 = 0x1a, ///< Block-compressed 2 + hipResViewFormatUnsignedBlockCompressed3 = 0x1b, ///< Block-compressed 3 + hipResViewFormatUnsignedBlockCompressed4 = 0x1c, ///< Block-compressed 4 unsigned + hipResViewFormatSignedBlockCompressed4 = 0x1d, ///< Block-compressed 4 signed + hipResViewFormatUnsignedBlockCompressed5 = 0x1e, ///< Block-compressed 5 unsigned + hipResViewFormatSignedBlockCompressed5 = 0x1f, ///< Block-compressed 5 signed + hipResViewFormatUnsignedBlockCompressed6H = 0x20, ///< Block-compressed 6 unsigned half-float + hipResViewFormatSignedBlockCompressed6H = 0x21, ///< Block-compressed 6 signed half-float + hipResViewFormatUnsignedBlockCompressed7 = 0x22 ///< Block-compressed 7 }hipResourceViewFormat; +/** + * HIP texture resource view formats + */ typedef enum HIPresourceViewFormat_enum { - HIP_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */ - HIP_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ - HIP_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ - HIP_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ - HIP_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ - HIP_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ - HIP_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ - HIP_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ - HIP_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ - HIP_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ - HIP_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ - HIP_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */ - HIP_RES_VIEW_FORMAT_SINT_4X16 
= 0x0c, /**< 4 channel signed 16-bit integers */ - HIP_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ - HIP_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ - HIP_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */ - HIP_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ - HIP_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ - HIP_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ - HIP_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ - HIP_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ - HIP_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ - HIP_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ - HIP_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ - HIP_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ - HIP_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ - HIP_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ - HIP_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ - HIP_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ - HIP_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ - HIP_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ - HIP_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ - HIP_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ - HIP_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ - HIP_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ + HIP_RES_VIEW_FORMAT_NONE = 0x00, ///< No resource view format (use underlying resource format) + HIP_RES_VIEW_FORMAT_UINT_1X8 = 0x01, ///< 1 channel, unsigned 8-bit integers + HIP_RES_VIEW_FORMAT_UINT_2X8 
= 0x02, ///< 2 channels, unsigned 8-bit integers + HIP_RES_VIEW_FORMAT_UINT_4X8 = 0x03, ///< 4 channels, unsigned 8-bit integers + HIP_RES_VIEW_FORMAT_SINT_1X8 = 0x04, ///< 1 channel, signed 8-bit integers + HIP_RES_VIEW_FORMAT_SINT_2X8 = 0x05, ///< 2 channels, signed 8-bit integers + HIP_RES_VIEW_FORMAT_SINT_4X8 = 0x06, ///< 4 channels, signed 8-bit integers + HIP_RES_VIEW_FORMAT_UINT_1X16 = 0x07, ///< 1 channel, unsigned 16-bit integers + HIP_RES_VIEW_FORMAT_UINT_2X16 = 0x08, ///< 2 channels, unsigned 16-bit integers + HIP_RES_VIEW_FORMAT_UINT_4X16 = 0x09, ///< 4 channels, unsigned 16-bit integers + HIP_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, ///< 1 channel, signed 16-bit integers + HIP_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, ///< 2 channels, signed 16-bit integers + HIP_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, ///< 4 channels, signed 16-bit integers + HIP_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, ///< 1 channel, unsigned 32-bit integers + HIP_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, ///< 2 channels, unsigned 32-bit integers + HIP_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, ///< 4 channels, unsigned 32-bit integers + HIP_RES_VIEW_FORMAT_SINT_1X32 = 0x10, ///< 1 channel, signed 32-bit integers + HIP_RES_VIEW_FORMAT_SINT_2X32 = 0x11, ///< 2 channels, signed 32-bit integers + HIP_RES_VIEW_FORMAT_SINT_4X32 = 0x12, ///< 4 channels, signed 32-bit integers + HIP_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, ///< 1 channel, 16-bit floating point + HIP_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, ///< 2 channels, 16-bit floating point + HIP_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, ///< 4 channels, 16-bit floating point + HIP_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, ///< 1 channel, 32-bit floating point + HIP_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, ///< 2 channels, 32-bit floating point + HIP_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, ///< 4 channels, 32-bit floating point + HIP_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, ///< Block-compressed 1 + HIP_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, ///< Block-compressed 2 + HIP_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, ///< 
Block-compressed 3 + HIP_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, ///< Block-compressed 4 unsigned + HIP_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, ///< Block-compressed 4 signed + HIP_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, ///< Block-compressed 5 unsigned + HIP_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, ///< Block-compressed 5 signed + HIP_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, ///< Block-compressed 6 unsigned half-float + HIP_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, ///< Block-compressed 6 signed half-float + HIP_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 ///< Block-compressed 7 } HIPresourceViewFormat; /** * HIP resource descriptor */ typedef struct hipResourceDesc { - enum hipResourceType resType; + enum hipResourceType resType; ///< Resource type union { struct { - hipArray_t array; + hipArray_t array; ///< HIP array } array; struct { - hipMipmappedArray_t mipmap; + hipMipmappedArray_t mipmap; ///< HIP mipmapped array } mipmap; struct { - void* devPtr; - struct hipChannelFormatDesc desc; - size_t sizeInBytes; + void* devPtr; ///< Device pointer + struct hipChannelFormatDesc desc; ///< Channel format description + size_t sizeInBytes; ///< Size in bytes } linear; struct { - void* devPtr; - struct hipChannelFormatDesc desc; - size_t width; - size_t height; - size_t pitchInBytes; + void* devPtr; ///< Device pointer + struct hipChannelFormatDesc desc; ///< Channel format description + size_t width; ///< Width of the array in elements + size_t height; ///< Height of the array in elements + size_t pitchInBytes; ///< Pitch between two rows in bytes } pitch2D; } res; }hipResourceDesc; + +/** + * HIP resource descriptor struct + */ typedef struct HIP_RESOURCE_DESC_st { - HIPresourcetype resType; /**< Resource type */ + HIPresourcetype resType; ///< Resource type union { struct { - hipArray_t hArray; /**< HIP array */ + hipArray_t hArray; ///< HIP array } array; struct { - hipMipmappedArray_t hMipmappedArray; /**< HIP mipmapped array */ + hipMipmappedArray_t hMipmappedArray; ///< HIP mipmapped array
} mipmap; struct { - hipDeviceptr_t devPtr; /**< Device pointer */ - hipArray_Format format; /**< Array format */ - unsigned int numChannels; /**< Channels per array element */ - size_t sizeInBytes; /**< Size in bytes */ + hipDeviceptr_t devPtr; ///< Device pointer + hipArray_Format format; ///< Array format + unsigned int numChannels; ///< Channels per array element + size_t sizeInBytes; ///< Size in bytes } linear; struct { - hipDeviceptr_t devPtr; /**< Device pointer */ - hipArray_Format format; /**< Array format */ - unsigned int numChannels; /**< Channels per array element */ - size_t width; /**< Width of the array in elements */ - size_t height; /**< Height of the array in elements */ - size_t pitchInBytes; /**< Pitch between two rows in bytes */ + hipDeviceptr_t devPtr; ///< Device pointer + hipArray_Format format; ///< Array format + unsigned int numChannels; ///< Channels per array element + size_t width; ///< Width of the array in elements + size_t height; ///< Height of the array in elements + size_t pitchInBytes; ///< Pitch between two rows in bytes } pitch2D; struct { int reserved[32]; } reserved; } res; - unsigned int flags; /**< Flags (must be zero) */ + unsigned int flags; ///< Flags (must be zero) } HIP_RESOURCE_DESC; /** - * hip resource view descriptor + * HIP resource view descriptor */ struct hipResourceViewDesc { - enum hipResourceViewFormat format; - size_t width; - size_t height; - size_t depth; - unsigned int firstMipmapLevel; - unsigned int lastMipmapLevel; - unsigned int firstLayer; - unsigned int lastLayer; + enum hipResourceViewFormat format; ///< Resource view format + size_t width; ///< Width of the resource view + size_t height; ///< Height of the resource view + size_t depth; ///< Depth of the resource view + unsigned int firstMipmapLevel; ///< First defined mipmap level + unsigned int lastMipmapLevel; ///< Last defined mipmap level + unsigned int firstLayer; ///< First layer index + unsigned int lastLayer; ///< Last layer index }; 
/** * Resource view descriptor */ typedef struct HIP_RESOURCE_VIEW_DESC_st { - HIPresourceViewFormat format; /**< Resource view format */ - size_t width; /**< Width of the resource view */ - size_t height; /**< Height of the resource view */ - size_t depth; /**< Depth of the resource view */ - unsigned int firstMipmapLevel; /**< First defined mipmap level */ - unsigned int lastMipmapLevel; /**< Last defined mipmap level */ - unsigned int firstLayer; /**< First layer index */ - unsigned int lastLayer; /**< Last layer index */ + HIPresourceViewFormat format; ///< Resource view format + size_t width; ///< Width of the resource view + size_t height; ///< Height of the resource view + size_t depth; ///< Depth of the resource view + unsigned int firstMipmapLevel; ///< First defined mipmap level + unsigned int lastMipmapLevel; ///< Last defined mipmap level + unsigned int firstLayer; ///< First layer index + unsigned int lastLayer; ///< Last layer index unsigned int reserved[16]; } HIP_RESOURCE_VIEW_DESC; /** * Memory copy types - * */ #if !defined(__HIPCC_RTC__) typedef enum hipMemcpyKind { @@ -339,58 +382,83 @@ typedef enum hipMemcpyKind { ///= 2^32. + * size \f$ gridDim \cdot blockDim \geq 2^{32} \f$. * * @returns #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, * #hipErrorInvalidHandle, #hipErrorInvalidImage, #hipErrorInvalidValue, @@ -5943,8 +5948,8 @@ hipError_t hipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* unsigned int numDevices, unsigned int flags); /** - * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed - * to kernelparams or extra, where thread blocks can cooperate and synchronize as they execute + * @brief Launches kernel f with launch parameters and shared memory on stream with arguments passed + * to kernelparams or extra, where thread blocks can cooperate and synchronize as they execute. * * @param [in] f - Kernel to launch. 
* @param [in] gridDim - Grid dimensions specified as multiple of blockDim. @@ -5957,7 +5962,7 @@ hipError_t hipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* * default stream is used with associated synchronization rules. * * Please note, HIP does not support kernel launch with total work items defined in dimension with - * size gridDim x blockDim >= 2^32. + * size \f$ gridDim \cdot blockDim \geq 2^{32} \f$. * * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue, * #hipErrorCooperativeLaunchTooLarge @@ -5978,6 +5983,10 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim */ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices, unsigned int flags); + +// Doxygen end group ModuleCooperativeG +/** @} */ + /** * @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched * on respective streams before enqueuing any other work on the specified streams from any other threads @@ -6123,7 +6132,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, * @brief Start recording of profiling information [Deprecated] * When using this API, start the profiler with profiling disabled. (--startdisabled) * @returns #hipErrorNotSupported - * @warning : hipProfilerStart API is deprecated, use roctracer/rocTX instead. + * @warning hipProfilerStart API is deprecated, use roctracer/rocTX instead. */ HIP_DEPRECATED("use roctracer/rocTX instead") hipError_t hipProfilerStart(); @@ -6481,7 +6490,7 @@ hipError_t hipTexObjectGetTextureDesc( * * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation * - * @note This API is implemented on Windows, under development on Linux. + * @note This API is implemented on Linux and is under development on Microsoft Windows. 
* */ hipError_t hipMallocMipmappedArray( @@ -6498,7 +6507,7 @@ hipError_t hipMallocMipmappedArray( * * @return #hipSuccess, #hipErrorInvalidValue * - * @note This API is implemented on Windows, under development on Linux. + * @note This API is implemented on Linux and is under development on Microsoft Windows. * */ hipError_t hipFreeMipmappedArray(hipMipmappedArray_t mipmappedArray); @@ -6512,7 +6521,7 @@ hipError_t hipFreeMipmappedArray(hipMipmappedArray_t mipmappedArray); * * @return #hipSuccess, #hipErrorInvalidValue * - * @note This API is implemented on Windows, under development on Linux. + * @note This API is implemented on Linux and is under development on Microsoft Windows. * */ hipError_t hipGetMipmappedArrayLevel( @@ -6529,7 +6538,7 @@ hipError_t hipGetMipmappedArrayLevel( * * @returns #hipSuccess, #hipErrorNotSupported, #hipErrorInvalidValue * - * @note This API is implemented on Windows, under development on Linux. + * @note This API is implemented on Linux and is under development on Microsoft Windows. */ hipError_t hipMipmappedArrayCreate( hipMipmappedArray_t* pHandle, @@ -6543,7 +6552,7 @@ hipError_t hipMipmappedArrayCreate( * * @returns #hipSuccess, #hipErrorInvalidValue * - * @note This API is implemented on Windows, under development on Linux. + * @note This API is implemented on Linux and is under development on Microsoft Windows. * */ hipError_t hipMipmappedArrayDestroy(hipMipmappedArray_t hMipmappedArray); @@ -6557,7 +6566,7 @@ hipError_t hipMipmappedArrayDestroy(hipMipmappedArray_t hMipmappedArray); * * @returns #hipSuccess, #hipErrorInvalidValue * - * @note This API is implemented on Windows, under development on Linux. + * @note This API is implemented on Linux and is under development on Microsoft Windows. 
* */ hipError_t hipMipmappedArrayGetLevel( @@ -7179,8 +7188,8 @@ int hipGetStreamDeviceId(hipStream_t stream); * * @returns #hipSuccess, #hipErrorInvalidValue * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipStreamBeginCapture(hipStream_t stream, hipStreamCaptureMode mode); @@ -7199,7 +7208,7 @@ are not safe. * * @returns #hipSuccess, #hipErrorInvalidValue * -* @warning : param "const hipGraphEdgeData* dependencyData" is currently not supported and has to +* @warning param "const hipGraphEdgeData* dependencyData" is currently not supported and has to be passed as nullptr. This API is marked as beta, meaning, while this is feature complete, it is still open to changes and may have outstanding issues. * @@ -7213,12 +7222,12 @@ hipError_t hipStreamBeginCaptureToGraph(hipStream_t stream, hipGraph_t graph, * @brief Ends capture on a stream, returning the captured graph. * * @param [in] stream - Stream to end capture. - * @param [out] pGraph - returns the graph captured. + * @param [out] pGraph - Captured graph. * * @returns #hipSuccess, #hipErrorInvalidValue * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipStreamEndCapture(hipStream_t stream, hipGraph_t* pGraph); @@ -7226,14 +7235,14 @@ hipError_t hipStreamEndCapture(hipStream_t stream, hipGraph_t* pGraph); /** * @brief Get capture status of a stream. * - * @param [in] stream - Stream under capture. - * @param [out] pCaptureStatus - returns current status of the capture. - * @param [out] pId - unique ID of the capture. 
+ * @param [in] stream - Stream to get the capture status from. + * @param [out] pCaptureStatus - Returns current capture status. + * @param [out] pId - Unique capture ID. * * @returns #hipSuccess, #hipErrorStreamCaptureImplicit * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipStreamGetCaptureInfo(hipStream_t stream, hipStreamCaptureStatus* pCaptureStatus, @@ -7242,17 +7251,17 @@ hipError_t hipStreamGetCaptureInfo(hipStream_t stream, hipStreamCaptureStatus* p /** * @brief Get stream's capture state * - * @param [in] stream - Stream under capture. - * @param [out] captureStatus_out - returns current status of the capture. - * @param [out] id_out - unique ID of the capture. - * @param [in] graph_out - returns the graph being captured into. - * @param [out] dependencies_out - returns pointer to an array of nodes. - * @param [out] numDependencies_out - returns size of the array returned in dependencies_out. + * @param [in] stream - Stream to get the capture status from. + * @param [out] captureStatus_out - Returns current capture status. + * @param [out] id_out - Unique capture ID. + * @param [out] graph_out - Returns the graph being captured into. + * @param [out] dependencies_out - Pointer to an array of nodes representing the graph's dependencies. + * @param [out] numDependencies_out - Returns size of the array returned in dependencies_out. * * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorStreamCaptureImplicit * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues.
* */ hipError_t hipStreamGetCaptureInfo_v2(hipStream_t stream, hipStreamCaptureStatus* captureStatus_out, @@ -7264,13 +7273,13 @@ hipError_t hipStreamGetCaptureInfo_v2(hipStream_t stream, hipStreamCaptureStatus /** * @brief Get stream's capture state * - * @param [in] stream - Stream under capture. - * @param [out] pCaptureStatus - returns current status of the capture. + * @param [in] stream - Stream to get the capture status from. + * @param [out] pCaptureStatus - Returns current capture status. * * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorStreamCaptureImplicit * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipStreamIsCapturing(hipStream_t stream, hipStreamCaptureStatus* pCaptureStatus); @@ -7278,15 +7287,15 @@ hipError_t hipStreamIsCapturing(hipStream_t stream, hipStreamCaptureStatus* pCap /** * @brief Update the set of dependencies in a capturing stream * - * @param [in] stream Stream under capture. - * @param [in] dependencies pointer to an array of nodes to Add/Replace. - * @param [in] numDependencies size of the array in dependencies. - * @param [in] flags Flag how to update dependency set. Should be one of value in enum - * #hipStreamUpdateCaptureDependenciesFlags + * @param [in] stream Stream that is being captured. + * @param [in] dependencies Pointer to an array of nodes to add/replace. + * @param [in] numDependencies Size of the dependencies array. + * @param [in] flags Flag specifying how to update the dependency set. Should be one of the values + * in enum #hipStreamUpdateCaptureDependenciesFlags.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorIllegalState * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipStreamUpdateCaptureDependencies(hipStream_t stream, hipGraphNode_t* dependencies, @@ -7296,11 +7305,11 @@ hipError_t hipStreamUpdateCaptureDependencies(hipStream_t stream, hipGraphNode_t /** * @brief Swaps the stream capture mode of a thread. * - * @param [in] mode - Pointer to mode value to swap with the current mode + * @param [in] mode - Pointer to mode value to swap with the current mode. * @returns #hipSuccess, #hipErrorInvalidValue * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipThreadExchangeStreamCaptureMode(hipStreamCaptureMode* mode); @@ -7313,8 +7322,8 @@ hipError_t hipThreadExchangeStreamCaptureMode(hipStreamCaptureMode* mode); * * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipGraphCreate(hipGraph_t* pGraph, unsigned int flags); @@ -7326,8 +7335,8 @@ hipError_t hipGraphCreate(hipGraph_t* pGraph, unsigned int flags); * * @returns #hipSuccess, #hipErrorInvalidValue * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. 
+ * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipGraphDestroy(hipGraph_t graph); @@ -7335,14 +7344,14 @@ hipError_t hipGraphDestroy(hipGraph_t graph); /** * @brief Adds dependency edges to a graph. * - * @param [in] graph - instance of the graph to add dependencies. - * @param [in] from - pointer to the graph nodes with dependenties to add from. - * @param [in] to - pointer to the graph nodes to add dependenties to. - * @param [in] numDependencies - the number of dependencies to add. + * @param [in] graph - Instance of the graph to add dependencies to. + * @param [in] from - Pointer to the graph nodes with dependencies to add from. + * @param [in] to - Pointer to the graph nodes to add dependencies to. + * @param [in] numDependencies - Number of dependencies to add. * @returns #hipSuccess, #hipErrorInvalidValue * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipGraphAddDependencies(hipGraph_t graph, const hipGraphNode_t* from, @@ -7351,14 +7360,14 @@ hipError_t hipGraphAddDependencies(hipGraph_t graph, const hipGraphNode_t* from, /** * @brief Removes dependency edges from a graph. * - * @param [in] graph - instance of the graph to remove dependencies. + * @param [in] graph - Instance of the graph to remove dependencies from. * @param [in] from - Array of nodes that provide the dependencies. * @param [in] to - Array of dependent nodes. - * @param [in] numDependencies - the number of dependencies to remove. + * @param [in] numDependencies - Number of dependencies to remove. 
* @returns #hipSuccess, #hipErrorInvalidValue * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipGraphRemoveDependencies(hipGraph_t graph, const hipGraphNode_t* from, @@ -7367,55 +7376,55 @@ hipError_t hipGraphRemoveDependencies(hipGraph_t graph, const hipGraphNode_t* fr /** * @brief Returns a graph's dependency edges. * - * @param [in] graph - instance of the graph to get the edges from. - * @param [out] from - pointer to the graph nodes to return edge endpoints. - * @param [out] to - pointer to the graph nodes to return edge endpoints. - * @param [out] numEdges - returns number of edges. + * @param [in] graph - Instance of the graph to get the edges from. + * @param [out] from - Pointer to the graph nodes to return edge endpoints. + * @param [out] to - Pointer to the graph nodes to return edge endpoints. + * @param [out] numEdges - Returns number of edges. * @returns #hipSuccess, #hipErrorInvalidValue * * from and to may both be NULL, in which case this function only returns the number of edges in * numEdges. Otherwise, numEdges entries will be filled in. If numEdges is higher than the actual * number of edges, the remaining entries in from and to will be set to NULL, and the number of - * edges actually returned will be written to numEdges - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * edges actually returned will be written to numEdges. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipGraphGetEdges(hipGraph_t graph, hipGraphNode_t* from, hipGraphNode_t* to, size_t* numEdges); /** - * @brief Returns graph nodes. 
+ * @brief Returns a graph's nodes. * - * @param [in] graph - instance of graph to get the nodes. - * @param [out] nodes - pointer to return the graph nodes. - * @param [out] numNodes - returns number of graph nodes. + * @param [in] graph - Instance of graph to get the nodes from. + * @param [out] nodes - Pointer to return the graph nodes. + * @param [out] numNodes - Returns the number of graph nodes. * @returns #hipSuccess, #hipErrorInvalidValue * * nodes may be NULL, in which case this function will return the number of nodes in numNodes. * Otherwise, numNodes entries will be filled in. If numNodes is higher than the actual number of * nodes, the remaining entries in nodes will be set to NULL, and the number of nodes actually * obtained will be returned in numNodes. - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipGraphGetNodes(hipGraph_t graph, hipGraphNode_t* nodes, size_t* numNodes); /** - * @brief Returns graph's root nodes. + * @brief Returns a graph's root nodes. * - * @param [in] graph - instance of the graph to get the nodes. - * @param [out] pRootNodes - pointer to return the graph's root nodes. - * @param [out] pNumRootNodes - returns the number of graph's root nodes. + * @param [in] graph - Instance of the graph to get the nodes from. + * @param [out] pRootNodes - Pointer to return the graph's root nodes. + * @param [out] pNumRootNodes - Returns the number of graph's root nodes. * @returns #hipSuccess, #hipErrorInvalidValue * * pRootNodes may be NULL, in which case this function will return the number of root nodes in * pNumRootNodes. Otherwise, pNumRootNodes entries will be filled in. 
If pNumRootNodes is higher * than the actual number of root nodes, the remaining entries in pRootNodes will be set to NULL, * and the number of nodes actually obtained will be returned in pNumRootNodes. - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipGraphGetRootNodes(hipGraph_t graph, hipGraphNode_t* pRootNodes, @@ -7424,17 +7433,17 @@ hipError_t hipGraphGetRootNodes(hipGraph_t graph, hipGraphNode_t* pRootNodes, /** * @brief Returns a node's dependencies. * - * @param [in] node - graph node to get the dependencies from. - * @param [out] pDependencies - pointer to to return the dependencies. - * @param [out] pNumDependencies - returns the number of graph node dependencies. + * @param [in] node - Graph node to get the dependencies from. + * @param [out] pDependencies - Pointer to return the dependencies. + * @param [out] pNumDependencies - Returns the number of graph node dependencies. * @returns #hipSuccess, #hipErrorInvalidValue * * pDependencies may be NULL, in which case this function will return the number of dependencies in * pNumDependencies. Otherwise, pNumDependencies entries will be filled in. If pNumDependencies is * higher than the actual number of dependencies, the remaining entries in pDependencies will be set * to NULL, and the number of nodes actually obtained will be returned in pNumDependencies. - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. 
* */ hipError_t hipGraphNodeGetDependencies(hipGraphNode_t node, hipGraphNode_t* pDependencies, @@ -7443,18 +7452,18 @@ hipError_t hipGraphNodeGetDependencies(hipGraphNode_t node, hipGraphNode_t* pDep /** * @brief Returns a node's dependent nodes. * - * @param [in] node - graph node to get the Dependent nodes from. - * @param [out] pDependentNodes - pointer to return the graph dependent nodes. - * @param [out] pNumDependentNodes - returns the number of graph node dependent nodes. + * @param [in] node - Graph node to get the dependent nodes from. + * @param [out] pDependentNodes - Pointer to return the graph dependent nodes. + * @param [out] pNumDependentNodes - Returns the number of graph node dependent nodes. * @returns #hipSuccess, #hipErrorInvalidValue * - * DependentNodes may be NULL, in which case this function will return the number of dependent nodes + * pDependentNodes may be NULL, in which case this function will return the number of dependent nodes * in pNumDependentNodes. Otherwise, pNumDependentNodes entries will be filled in. If * pNumDependentNodes is higher than the actual number of dependent nodes, the remaining entries in * pDependentNodes will be set to NULL, and the number of nodes actually obtained will be returned * in pNumDependentNodes. - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipGraphNodeGetDependentNodes(hipGraphNode_t node, hipGraphNode_t* pDependentNodes, @@ -7463,12 +7472,12 @@ hipError_t hipGraphNodeGetDependentNodes(hipGraphNode_t node, hipGraphNode_t* pD /** * @brief Returns a node's type. * - * @param [in] node - instance of the graph to add dependencies. - * @param [out] pType - pointer to the return the type + * @param [in] node - Node to get type of. 
+ * @param [out] pType - Returns the node's type. * @returns #hipSuccess, #hipErrorInvalidValue * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipGraphNodeGetType(hipGraphNode_t node, hipGraphNodeType* pType); @@ -7479,8 +7488,8 @@ hipError_t hipGraphNodeGetType(hipGraphNode_t node, hipGraphNodeType* pType); * @param [in] node - graph node to remove * @returns #hipSuccess, #hipErrorInvalidValue * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipGraphDestroyNode(hipGraphNode_t node); @@ -7492,8 +7501,8 @@ hipError_t hipGraphDestroyNode(hipGraphNode_t node); * @param [in] originalGraph - original graph to clone from. * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipGraphClone(hipGraph_t* pGraphClone, hipGraph_t originalGraph); @@ -7506,8 +7515,8 @@ hipError_t hipGraphClone(hipGraph_t* pGraphClone, hipGraph_t originalGraph); * @param [in] clonedGraph - Cloned graph to query. * @returns #hipSuccess, #hipErrorInvalidValue * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. 
While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipGraphNodeFindInClone(hipGraphNode_t* pNode, hipGraphNode_t originalNode, @@ -7516,17 +7525,17 @@ hipError_t hipGraphNodeFindInClone(hipGraphNode_t* pNode, hipGraphNode_t origina /** * @brief Creates an executable graph from a graph * - * @param [out] pGraphExec - pointer to instantiated executable graph that is created. - * @param [in] graph - instance of graph to instantiate. - * @param [out] pErrorNode - pointer to error node in case error occured in graph instantiation, - * it could modify the correponding node. - * @param [out] pLogBuffer - pointer to log buffer. - * @param [out] bufferSize - the size of log buffer. + * @param [out] pGraphExec - Pointer to instantiated executable graph. + * @param [in] graph - Instance of graph to instantiate. + * @param [out] pErrorNode - Pointer to error node. In case an error occurred during + * graph instantiation, it could modify the corresponding node. + * @param [out] pLogBuffer - Pointer to log buffer. + * @param [out] bufferSize - Size of the log buffer. * * @returns #hipSuccess, #hipErrorOutOfMemory * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * */ hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph, @@ -7535,14 +7544,14 @@ hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph, /** * @brief Creates an executable graph from a graph. * - * @param [out] pGraphExec - pointer to instantiated executable graph that is created. - * @param [in] graph - instance of graph to instantiate. + * @param [out] pGraphExec - Pointer to instantiated executable graph. + * @param [in] graph - Instance of graph to instantiate.
* @param [in] flags - Flags to control instantiation. * @returns #hipSuccess, #hipErrorInvalidValue * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues.It does not support - * any of flag and is behaving as hipGraphInstantiate. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. It does not support any + * flags and behaves like hipGraphInstantiate. */ hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t graph, unsigned long long flags); @@ -7550,99 +7559,99 @@ hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t g /** * @brief Creates an executable graph from a graph. * - * @param [out] pGraphExec - pointer to instantiated executable graph that is created. - * @param [in] graph - instance of graph to instantiate. - * @param [in] instantiateParams - Graph Instantiate Params + * @param [out] pGraphExec - Pointer to instantiated executable graph. + * @param [in] graph - Instance of graph to instantiate. + * @param [in] instantiateParams - Graph instantiation parameters. * @returns #hipSuccess, #hipErrorInvalidValue * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphInstantiateWithParams(hipGraphExec_t* pGraphExec, hipGraph_t graph, hipGraphInstantiateParams *instantiateParams); /** - * @brief launches an executable graph in a stream + * @brief Launches an executable graph in the specified stream. * - * @param [in] graphExec - instance of executable graph to launch. - * @param [in] stream - instance of stream in which to launch executable graph.
+ * @param [in] graphExec - Instance of executable graph to launch. + * @param [in] stream - Instance of stream in which to launch executable graph. * @returns #hipSuccess, #hipErrorInvalidValue * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream); /** - * @brief uploads an executable graph in a stream + * @brief Uploads an executable graph to a stream. * - * @param [in] graphExec - instance of executable graph to launch. - * @param [in] stream - instance of stream in which to launch executable graph. + * @param [in] graphExec - Instance of executable graph to be uploaded. + * @param [in] stream - Instance of stream to which the executable graph is uploaded. * @returns #hipSuccess, #hipErrorInvalidValue * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphUpload(hipGraphExec_t graphExec, hipStream_t stream); /** * @brief Creates a kernel execution node and adds it to a graph. * - * @param [out] pGraphNode - pointer to graph node to create. - * @param [in] graph - instance of graph to add the created node. - * @param [in] pDependencies - pointer to the dependencies on the kernel execution node. - * @param [in] numDependencies - the number of the dependencies. - * @param [in] nodeParams - pointer to the parameters for the node. + * @param [out] pGraphNode - Pointer to kernel graph node that is created. + * @param [in] graph - Instance of graph to add the created node to.
+ * @param [in] pDependencies - Pointer to the dependencies on the kernel execution node. + * @param [in] numDependencies - Number of dependencies. + * @param [in] nodeParams - Pointer to the node parameters. * @returns #hipSuccess, #hipErrorInvalidValue. - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphAddNode(hipGraphNode_t *pGraphNode, hipGraph_t graph, const hipGraphNode_t *pDependencies, size_t numDependencies, hipGraphNodeParams *nodeParams); /** - * @brief Return the flags on executable graph. + * @brief Return the flags of an executable graph. * - * @param [in] graphExec - Executable graph to get the flags. + * @param [in] graphExec - Executable graph to get the flags from. * @param [out] flags - Flags used to instantiate this executable graph. * @returns #hipSuccess, #hipErrorInvalidValue. - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphExecGetFlags(hipGraphExec_t graphExec, unsigned long long* flags); /** - * @brief Updates parameters of a created node. + * @brief Updates parameters of a graph's node. * - * @param [in] node - instance of the node to set parameters to. - * @param [in] nodeParams - pointer to the parameters. + * @param [in] node - Instance of the node to set parameters for. + * @param [in] nodeParams - Pointer to the parameters to be set. * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDeviceFunction, #hipErrorNotSupported. 
- * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphNodeSetParams(hipGraphNode_t node, hipGraphNodeParams *nodeParams); /** - * @brief Updates parameters of a created node on executable graph. + * @brief Updates parameters of an executable graph's node. * - * @param [in] graphExec - instance of executable graph. - * @param [in] node - instance of the node to set parameters to. - * @param [in] nodeParams - pointer to the parameters. + * @param [in] graphExec - Instance of the executable graph. + * @param [in] node - Instance of the node to set parameters to. + * @param [in] nodeParams - Pointer to the parameters to be set. * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDeviceFunction, #hipErrorNotSupported. - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphExecNodeSetParams(hipGraphExec_t graphExec, hipGraphNode_t node, hipGraphNodeParams* nodeParams); /** * @brief Destroys an executable graph * - * @param [in] graphExec - instance of executable graph to destry. + * @param [in] graphExec - Instance of executable graph to destroy. * * @returns #hipSuccess. * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. 
*/ hipError_t hipGraphExecDestroy(hipGraphExec_t graphExec); @@ -7654,11 +7663,11 @@ hipError_t hipGraphExecDestroy(hipGraphExec_t graphExec); * @param [in] hGraphExec - instance of executable graph to update. * @param [in] hGraph - graph that contains the updated parameters. * @param [in] hErrorNode_out - node which caused the permissibility check to forbid the update. - * @param [in] updateResult_out - Whether the graph update was permitted. + * @param [in] updateResult_out - Return code indicating whether the graph update was performed. * @returns #hipSuccess, #hipErrorGraphExecUpdateFailure * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphExecUpdate(hipGraphExec_t hGraphExec, hipGraph_t hGraph, hipGraphNode_t* hErrorNode_out, @@ -7667,14 +7676,14 @@ hipError_t hipGraphExecUpdate(hipGraphExec_t hGraphExec, hipGraph_t hGraph, /** * @brief Creates a kernel execution node and adds it to a graph. * - * @param [out] pGraphNode - pointer to graph node to create. - * @param [in] graph - instance of graph to add the created node. - * @param [in] pDependencies - pointer to the dependencies on the kernel execution node. - * @param [in] numDependencies - the number of the dependencies. - * @param [in] pNodeParams - pointer to the parameters to the kernel execution node on the GPU. + * @param [out] pGraphNode - Pointer to graph node that is created. + * @param [in] graph - Instance of graph to add the created node to. + * @param [in] pDependencies - Pointer to the dependencies of the kernel execution node. + * @param [in] numDependencies - The number of the dependencies. + * @param [in] pNodeParams - Pointer to the parameters of the kernel execution node.
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDeviceFunction - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphAddKernelNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, @@ -7686,31 +7695,31 @@ hipError_t hipGraphAddKernelNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, * @param [in] node - instance of the node to get parameters from. * @param [out] pNodeParams - pointer to the parameters * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphKernelNodeGetParams(hipGraphNode_t node, hipKernelNodeParams* pNodeParams); /** * @brief Sets a kernel node's parameters. * - * @param [in] node - instance of the node to set parameters to. + * @param [in] node - Instance of the node to set parameters of. * @param [in] pNodeParams - const pointer to the parameters. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphKernelNodeSetParams(hipGraphNode_t node, const hipKernelNodeParams* pNodeParams); /** * @brief Sets the parameters for a kernel node in the given graphExec. * - * @param [in] hGraphExec - instance of the executable graph with the node. 
- * @param [in] node - instance of the node to set parameters to. + * @param [in] hGraphExec - Instance of the executable graph with the node. + * @param [in] node - Instance of the node to set parameters of. * @param [in] pNodeParams - const pointer to the kernel node parameters. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphExecKernelNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node, const hipKernelNodeParams* pNodeParams); @@ -7718,15 +7727,15 @@ hipError_t hipGraphExecKernelNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo /** * @brief Creates a memcpy node and adds it to a graph. * - * @param [out] phGraphNode - pointer to graph node to create. - * @param [in] hGraph - instance of graph to add the created node. - * @param [in] dependencies - const pointer to the dependencies on the memcpy execution node. - * @param [in] numDependencies - the number of the dependencies. + * @param [out] phGraphNode - Pointer to graph node that is created. + * @param [in] hGraph - Instance of graph to add the created node to. + * @param [in] dependencies - const pointer to the dependencies of the memcpy execution node. + * @param [in] numDependencies - The number of dependencies. * @param [in] copyParams - const pointer to the parameters for the memory copy. - * @param [in] ctx - cotext related to current device. + * @param [in] ctx - context related to current device. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. 
While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipDrvGraphAddMemcpyNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph, const hipGraphNode_t* dependencies, @@ -7735,14 +7744,14 @@ hipError_t hipDrvGraphAddMemcpyNode(hipGraphNode_t* phGraphNode, hipGraph_t hGra /** * @brief Creates a memcpy node and adds it to a graph. * - * @param [out] pGraphNode - pointer to graph node to create. - * @param [in] graph - instance of graph to add the created node. - * @param [in] pDependencies - const pointer to the dependencies on the memcpy execution node. - * @param [in] numDependencies - the number of the dependencies. + * @param [out] pGraphNode - Pointer to graph node that is created. + * @param [in] graph - Instance of graph to add the created node to. + * @param [in] pDependencies - const pointer to the dependencies of the memcpy execution node. + * @param [in] numDependencies - The number of dependencies. * @param [in] pCopyParams - const pointer to the parameters for the memory copy. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphAddMemcpyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, @@ -7753,8 +7762,8 @@ hipError_t hipGraphAddMemcpyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, * @param [in] node - instance of the node to get parameters from. * @param [out] pNodeParams - pointer to the parameters. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. 
While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphMemcpyNodeGetParams(hipGraphNode_t node, hipMemcpy3DParms* pNodeParams); @@ -7764,44 +7773,44 @@ hipError_t hipGraphMemcpyNodeGetParams(hipGraphNode_t node, hipMemcpy3DParms* pN * @param [in] node - instance of the node to set parameters to. * @param [in] pNodeParams - const pointer to the parameters. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphMemcpyNodeSetParams(hipGraphNode_t node, const hipMemcpy3DParms* pNodeParams); /** - * @brief Sets a node attribute. + * @brief Sets a node's attribute. * - * @param [in] hNode - instance of the node to set parameters to. - * @param [in] attr - the attribute node is set to. + * @param [in] hNode - Instance of the node to set the attribute of. + * @param [in] attr - The attribute type to be set. * @param [in] value - const pointer to the parameters. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphKernelNodeSetAttribute(hipGraphNode_t hNode, hipKernelNodeAttrID attr, const hipKernelNodeAttrValue* value); /** - * @brief Gets a node attribute. + * @brief Gets a node's attribute. * - * @param [in] hNode - instance of the node to set parameters to. - * @param [in] attr - the attribute node is set to. + * @param [in] hNode - Instance of the node to get the attribute from. + * @param [in] attr - The attribute type to be retrieved.
* @param [in] value - const pointer to the parameters. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphKernelNodeGetAttribute(hipGraphNode_t hNode, hipKernelNodeAttrID attr, hipKernelNodeAttrValue* value); /** - * @brief Sets the parameters for a memcpy node in the given graphExec. + * @brief Sets the parameters of a memcpy node in the given graphExec. * - * @param [in] hGraphExec - instance of the executable graph with the node. - * @param [in] node - instance of the node to set parameters to. + * @param [in] hGraphExec - Instance of the executable graph with the node. + * @param [in] node - Instance of the node to set parameters of. * @param [in] pNodeParams - const pointer to the kernel node parameters. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node, hipMemcpy3DParms* pNodeParams); @@ -7809,17 +7818,17 @@ hipError_t hipGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo /** * @brief Creates a 1D memcpy node and adds it to a graph. * - * @param [out] pGraphNode - pointer to graph node to create. - * @param [in] graph - instance of graph to add the created node. - * @param [in] pDependencies - const pointer to the dependencies on the memcpy execution node. - * @param [in] numDependencies - the number of the dependencies. - * @param [in] dst - pointer to memory address to the destination. 
- * @param [in] src - pointer to memory address to the source. - * @param [in] count - the size of the memory to copy. - * @param [in] kind - the type of memory copy. + * @param [out] pGraphNode - Pointer to graph node that is created. + * @param [in] graph - Instance of graph to add the created node to. + * @param [in] pDependencies - const pointer to the dependencies of the memcpy execution node. + * @param [in] numDependencies - The number of dependencies. + * @param [in] dst - Pointer to memory address of the destination. + * @param [in] src - Pointer to memory address of the source. + * @param [in] count - Size of the memory to copy. + * @param [in] kind - Type of memory copy. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphAddMemcpyNode1D(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, @@ -7828,14 +7837,14 @@ hipError_t hipGraphAddMemcpyNode1D(hipGraphNode_t* pGraphNode, hipGraph_t graph, /** * @brief Sets a memcpy node's parameters to perform a 1-dimensional copy. * - * @param [in] node - instance of the node to set parameters to. - * @param [in] dst - pointer to memory address to the destination. - * @param [in] src - pointer to memory address to the source. - * @param [in] count - the size of the memory to copy. - * @param [in] kind - the type of memory copy. + * @param [in] node - Instance of the node to set parameters of. + * @param [in] dst - Pointer to memory address of the destination. + * @param [in] src - Pointer to memory address of the source. + * @param [in] count - Size of the memory to copy. + * @param [in] kind - Type of memory copy. 
* @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphMemcpyNodeSetParams1D(hipGraphNode_t node, void* dst, const void* src, size_t count, hipMemcpyKind kind); @@ -7844,15 +7853,15 @@ hipError_t hipGraphMemcpyNodeSetParams1D(hipGraphNode_t node, void* dst, const v * @brief Sets the parameters for a memcpy node in the given graphExec to perform a 1-dimensional * copy. * - * @param [in] hGraphExec - instance of the executable graph with the node. - * @param [in] node - instance of the node to set parameters to. - * @param [in] dst - pointer to memory address to the destination. - * @param [in] src - pointer to memory address to the source. - * @param [in] count - the size of the memory to copy. - * @param [in] kind - the type of memory copy. + * @param [in] hGraphExec - Instance of the executable graph with the node. + * @param [in] node - Instance of the node to set parameters of. + * @param [in] dst - Pointer to memory address of the destination. + * @param [in] src - Pointer to memory address of the source. + * @param [in] count - Size of the memory to copy. + * @param [in] kind - Type of memory copy. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. 
*/ hipError_t hipGraphExecMemcpyNodeSetParams1D(hipGraphExec_t hGraphExec, hipGraphNode_t node, void* dst, const void* src, size_t count, @@ -7861,18 +7870,18 @@ hipError_t hipGraphExecMemcpyNodeSetParams1D(hipGraphExec_t hGraphExec, hipGraph /** * @brief Creates a memcpy node to copy from a symbol on the device and adds it to a graph. * - * @param [out] pGraphNode - pointer to graph node to create. - * @param [in] graph - instance of graph to add the created node. - * @param [in] pDependencies - const pointer to the dependencies on the memcpy execution node. - * @param [in] numDependencies - the number of the dependencies. - * @param [in] dst - pointer to memory address to the destination. + * @param [out] pGraphNode - Pointer to graph node that is created. + * @param [in] graph - Instance of graph to add the created node to. + * @param [in] pDependencies - const pointer to the dependencies of the memcpy execution node. + * @param [in] numDependencies - Number of the dependencies. + * @param [in] dst - Pointer to memory address of the destination. * @param [in] symbol - Device symbol address. - * @param [in] count - the size of the memory to copy. + * @param [in] count - Size of the memory to copy. * @param [in] offset - Offset from start of symbol in bytes. - * @param [in] kind - the type of memory copy. + * @param [in] kind - Type of memory copy. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. 
*/ hipError_t hipGraphAddMemcpyNodeFromSymbol(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, @@ -7882,15 +7891,15 @@ hipError_t hipGraphAddMemcpyNodeFromSymbol(hipGraphNode_t* pGraphNode, hipGraph_ /** * @brief Sets a memcpy node's parameters to copy from a symbol on the device. * - * @param [in] node - instance of the node to set parameters to. - * @param [in] dst - pointer to memory address to the destination. + * @param [in] node - Instance of the node to set parameters of. + * @param [in] dst - Pointer to memory address of the destination. * @param [in] symbol - Device symbol address. - * @param [in] count - the size of the memory to copy. + * @param [in] count - Size of the memory to copy. * @param [in] offset - Offset from start of symbol in bytes. - * @param [in] kind - the type of memory copy. + * @param [in] kind - Type of memory copy. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphMemcpyNodeSetParamsFromSymbol(hipGraphNode_t node, void* dst, const void* symbol, size_t count, size_t offset, hipMemcpyKind kind); @@ -7899,16 +7908,16 @@ hipError_t hipGraphMemcpyNodeSetParamsFromSymbol(hipGraphNode_t node, void* dst, * @brief Sets the parameters for a memcpy node in the given graphExec to copy from a symbol on the * * device. * - * @param [in] hGraphExec - instance of the executable graph with the node. - * @param [in] node - instance of the node to set parameters to. - * @param [in] dst - pointer to memory address to the destination. + * @param [in] hGraphExec - Instance of the executable graph with the node. + * @param [in] node - Instance of the node to set parameters of. 
+ * @param [in] dst - Pointer to memory address of the destination. * @param [in] symbol - Device symbol address. - * @param [in] count - the size of the memory to copy. + * @param [in] count - Size of the memory to copy. * @param [in] offset - Offset from start of symbol in bytes. - * @param [in] kind - the type of memory copy. + * @param [in] kind - Type of memory copy. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphExecMemcpyNodeSetParamsFromSymbol(hipGraphExec_t hGraphExec, hipGraphNode_t node, void* dst, const void* symbol, size_t count, @@ -7917,18 +7926,18 @@ hipError_t hipGraphExecMemcpyNodeSetParamsFromSymbol(hipGraphExec_t hGraphExec, /** * @brief Creates a memcpy node to copy to a symbol on the device and adds it to a graph. * - * @param [out] pGraphNode - pointer to graph node to create. - * @param [in] graph - instance of graph to add the created node. + * @param [out] pGraphNode - Pointer to graph node that is created. + * @param [in] graph - Instance of graph to add the created node to. * @param [in] pDependencies - const pointer to the dependencies on the memcpy execution node. - * @param [in] numDependencies - the number of the dependencies. + * @param [in] numDependencies - Number of dependencies. * @param [in] symbol - Device symbol address. - * @param [in] src - pointer to memory address of the src. - * @param [in] count - the size of the memory to copy. + * @param [in] src - Pointer to memory address of the src. + * @param [in] count - Size of the memory to copy. * @param [in] offset - Offset from start of symbol in bytes. - * @param [in] kind - the type of memory copy. + * @param [in] kind - Type of memory copy. 
* @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphAddMemcpyNodeToSymbol(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, @@ -7939,15 +7948,15 @@ hipError_t hipGraphAddMemcpyNodeToSymbol(hipGraphNode_t* pGraphNode, hipGraph_t /** * @brief Sets a memcpy node's parameters to copy to a symbol on the device. * - * @param [in] node - instance of the node to set parameters to. + * @param [in] node - Instance of the node to set parameters of. * @param [in] symbol - Device symbol address. - * @param [in] src - pointer to memory address of the src. - * @param [in] count - the size of the memory to copy. + * @param [in] src - Pointer to memory address of the src. + * @param [in] count - Size of the memory to copy. * @param [in] offset - Offset from start of symbol in bytes. - * @param [in] kind - the type of memory copy. + * @param [in] kind - Type of memory copy. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphMemcpyNodeSetParamsToSymbol(hipGraphNode_t node, const void* symbol, const void* src, size_t count, size_t offset, @@ -7957,16 +7966,16 @@ hipError_t hipGraphMemcpyNodeSetParamsToSymbol(hipGraphNode_t node, const void* /** * @brief Sets the parameters for a memcpy node in the given graphExec to copy to a symbol on the * device. - * @param [in] hGraphExec - instance of the executable graph with the node. 
- * @param [in] node - instance of the node to set parameters to. + * @param [in] hGraphExec - Instance of the executable graph with the node. + * @param [in] node - Instance of the node to set parameters of. * @param [in] symbol - Device symbol address. - * @param [in] src - pointer to memory address of the src. - * @param [in] count - the size of the memory to copy. + * @param [in] src - Pointer to memory address of the src. + * @param [in] count - Size of the memory to copy. * @param [in] offset - Offset from start of symbol in bytes. - * @param [in] kind - the type of memory copy. + * @param [in] kind - Type of memory copy. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphExecMemcpyNodeSetParamsToSymbol(hipGraphExec_t hGraphExec, hipGraphNode_t node, const void* symbol, const void* src, @@ -7975,14 +7984,14 @@ hipError_t hipGraphExecMemcpyNodeSetParamsToSymbol(hipGraphExec_t hGraphExec, hi /** * @brief Creates a memset node and adds it to a graph. * - * @param [out] pGraphNode - pointer to the graph node to create. - * @param [in] graph - instance of the graph to add the created node. + * @param [out] pGraphNode - Pointer to graph node that is created. + * @param [in] graph - Instance of the graph to add the created node to. * @param [in] pDependencies - const pointer to the dependencies on the memset execution node. - * @param [in] numDependencies - the number of the dependencies. + * @param [in] numDependencies - Number of dependencies. * @param [in] pMemsetParams - const pointer to the parameters for the memory set. 
* @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphAddMemsetNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, @@ -7991,34 +8000,34 @@ hipError_t hipGraphAddMemsetNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, /** * @brief Gets a memset node's parameters. * - * @param [in] node - instane of the node to get parameters from. - * @param [out] pNodeParams - pointer to the parameters. + * @param [in] node - Instance of the node to get parameters of. + * @param [out] pNodeParams - Pointer to the parameters. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphMemsetNodeGetParams(hipGraphNode_t node, hipMemsetParams* pNodeParams); /** * @brief Sets a memset node's parameters. * - * @param [in] node - instance of the node to set parameters to. - * @param [in] pNodeParams - pointer to the parameters. + * @param [in] node - Instance of the node to set parameters of. + * @param [in] pNodeParams - Pointer to the parameters. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. 
*/ hipError_t hipGraphMemsetNodeSetParams(hipGraphNode_t node, const hipMemsetParams* pNodeParams); /** * @brief Sets the parameters for a memset node in the given graphExec. * - * @param [in] hGraphExec - instance of the executable graph with the node. - * @param [in] node - instance of the node to set parameters to. - * @param [in] pNodeParams - pointer to the parameters. + * @param [in] hGraphExec - Instance of the executable graph with the node. + * @param [in] node - Instance of the node to set parameters of. + * @param [in] pNodeParams - Pointer to the parameters. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node, const hipMemsetParams* pNodeParams); @@ -8026,14 +8035,14 @@ hipError_t hipGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo /** * @brief Creates a host execution node and adds it to a graph. * - * @param [out] pGraphNode - pointer to the graph node to create. - * @param [in] graph - instance of the graph to add the created node. - * @param [in] pDependencies - const pointer to the dependencies on the memset execution node. - * @param [in] numDependencies - the number of the dependencies. - * @param [in] pNodeParams -pointer to the parameters. + * @param [out] pGraphNode - Pointer to graph node that is created. + * @param [in] graph - Instance of the graph to add the created node to. + * @param [in] pDependencies - const pointer to the dependencies of the host execution node. + * @param [in] numDependencies - Number of dependencies. + * @param [in] pNodeParams - Pointer to the parameters.
* @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphAddHostNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, @@ -8042,34 +8051,34 @@ hipError_t hipGraphAddHostNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, /** * @brief Returns a host node's parameters. * - * @param [in] node - instane of the node to get parameters from. - * @param [out] pNodeParams - pointer to the parameters. + * @param [in] node - Instance of the node to get parameters of. + * @param [out] pNodeParams - Pointer to the parameters. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphHostNodeGetParams(hipGraphNode_t node, hipHostNodeParams* pNodeParams); /** * @brief Sets a host node's parameters. * - * @param [in] node - instance of the node to set parameters to. - * @param [in] pNodeParams - pointer to the parameters. + * @param [in] node - Instance of the node to set parameters of. + * @param [in] pNodeParams - Pointer to the parameters. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. 
*/ hipError_t hipGraphHostNodeSetParams(hipGraphNode_t node, const hipHostNodeParams* pNodeParams); /** * @brief Sets the parameters for a host node in the given graphExec. * - * @param [in] hGraphExec - instance of the executable graph with the node. - * @param [in] node - instance of the node to set parameters to. - * @param [in] pNodeParams - pointer to the parameters. + * @param [in] hGraphExec - Instance of the executable graph with the node. + * @param [in] node - Instance of the node to set parameters of. + * @param [in] pNodeParams - Pointer to the parameters. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphExecHostNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node, const hipHostNodeParams* pNodeParams); @@ -8077,14 +8086,14 @@ hipError_t hipGraphExecHostNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode /** * @brief Creates a child graph node and adds it to a graph. * - * @param [out] pGraphNode - pointer to the graph node to create. - * @param [in] graph - instance of the graph to add the created node. - * @param [in] pDependencies - const pointer to the dependencies on the memset execution node. - * @param [in] numDependencies - the number of the dependencies. - * @param [in] childGraph - the graph to clone into this node + * @param [out] pGraphNode - Pointer to graph node that is created. + * @param [in] graph - Instance of the graph to add the created node to. + * @param [in] pDependencies - const pointer to the dependencies of the child graph node. + * @param [in] numDependencies - Number of dependencies.
+ * @param [in] childGraph - Graph to clone into this node * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphAddChildGraphNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, @@ -8093,11 +8102,11 @@ hipError_t hipGraphAddChildGraphNode(hipGraphNode_t* pGraphNode, hipGraph_t grap /** * @brief Gets a handle to the embedded graph of a child graph node. * - * @param [in] node - instane of the node to get child graph. - * @param [out] pGraph - pointer to get the graph. + * @param [in] node - Instance of the node to get child graph of. + * @param [out] pGraph - Pointer to get the graph. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphChildGraphNodeGetGraph(hipGraphNode_t node, hipGraph_t* pGraph); @@ -8108,8 +8117,8 @@ hipError_t hipGraphChildGraphNodeGetGraph(hipGraphNode_t node, hipGraph_t* pGrap * @param [in] node - node from the graph which was used to instantiate graphExec. * @param [in] childGraph - child graph with updated parameters. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. 
*/ hipError_t hipGraphExecChildGraphNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node, hipGraph_t childGraph); @@ -8117,13 +8126,13 @@ hipError_t hipGraphExecChildGraphNodeSetParams(hipGraphExec_t hGraphExec, hipGra /** * @brief Creates an empty node and adds it to a graph. * - * @param [out] pGraphNode - pointer to the graph node to create and add to the graph. - * @param [in] graph - instane of the graph the node is add to. - * @param [in] pDependencies - const pointer to the node dependenties. - * @param [in] numDependencies - the number of dependencies. + * @param [out] pGraphNode - Pointer to graph node that is created. + * @param [in] graph - Instance of the graph the node is added to. + * @param [in] pDependencies - const pointer to the node dependencies. + * @param [in] numDependencies - Number of dependencies. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphAddEmptyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies); @@ -8132,14 +8141,14 @@ hipError_t hipGraphAddEmptyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, /** * @brief Creates an event record node and adds it to a graph. * - * @param [out] pGraphNode - pointer to the graph node to create and add to the graph. - * @param [in] graph - instane of the graph the node to be added. - * @param [in] pDependencies - const pointer to the node dependenties. - * @param [in] numDependencies - the number of dependencies. - * @param [in] event - Event for the node. + * @param [out] pGraphNode - Pointer to graph node that is created. + * @param [in] graph - Instance of the graph the node is added to. 
+ * @param [in] pDependencies - const pointer to the node dependencies. + * @param [in] numDependencies - Number of dependencies. + * @param [in] event - Event of the node. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphAddEventRecordNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, @@ -8148,22 +8157,22 @@ hipError_t hipGraphAddEventRecordNode(hipGraphNode_t* pGraphNode, hipGraph_t gra /** * @brief Returns the event associated with an event record node. * - * @param [in] node - instane of the node to get event from. + * @param [in] node - Instance of the node to get event of. * @param [out] event_out - Pointer to return the event. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphEventRecordNodeGetEvent(hipGraphNode_t node, hipEvent_t* event_out); /** * @brief Sets an event record node's event. * - * @param [in] node - instane of the node to set event to. - * @param [in] event - pointer to the event. + * @param [in] node - Instance of the node to set event to. + * @param [in] event - Pointer to the event. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. 
*/ hipError_t hipGraphEventRecordNodeSetEvent(hipGraphNode_t node, hipEvent_t event); @@ -8174,8 +8183,8 @@ hipError_t hipGraphEventRecordNodeSetEvent(hipGraphNode_t node, hipEvent_t event * @param [in] hNode - node from the graph which was used to instantiate graphExec. * @param [in] event - pointer to the event. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphExecEventRecordNodeSetEvent(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, hipEvent_t event); @@ -8183,14 +8192,14 @@ hipError_t hipGraphExecEventRecordNodeSetEvent(hipGraphExec_t hGraphExec, hipGra /** * @brief Creates an event wait node and adds it to a graph. * - * @param [out] pGraphNode - pointer to the graph node to create and add to the graph. - * @param [in] graph - instane of the graph the node to be added. - * @param [in] pDependencies - const pointer to the node dependenties. - * @param [in] numDependencies - the number of dependencies. + * @param [out] pGraphNode - Pointer to graph node that is created. + * @param [in] graph - Instance of the graph the node is added to. + * @param [in] pDependencies - const pointer to the node dependencies. + * @param [in] numDependencies - Number of dependencies. * @param [in] event - Event for the node. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues.
*/ hipError_t hipGraphAddEventWaitNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, @@ -8200,22 +8209,22 @@ hipError_t hipGraphAddEventWaitNode(hipGraphNode_t* pGraphNode, hipGraph_t graph /** * @brief Returns the event associated with an event wait node. * - * @param [in] node - instane of the node to get event from. + * @param [in] node - Instance of the node to get event of. * @param [out] event_out - Pointer to return the event. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphEventWaitNodeGetEvent(hipGraphNode_t node, hipEvent_t* event_out); /** * @brief Sets an event wait node's event. * - * @param [in] node - instane of the node to set event to. - * @param [in] event - pointer to the event. + * @param [in] node - Instance of the node to set event of. + * @param [in] event - Pointer to the event. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphEventWaitNodeSetEvent(hipGraphNode_t node, hipEvent_t event); @@ -8226,8 +8235,8 @@ hipError_t hipGraphEventWaitNodeSetEvent(hipGraphNode_t node, hipEvent_t event); * @param [in] hNode - node from the graph which was used to instantiate graphExec. * @param [in] event - pointer to the event. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. 
+ * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphExecEventWaitNodeSetEvent(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, hipEvent_t event); @@ -8236,13 +8245,13 @@ hipError_t hipGraphExecEventWaitNodeSetEvent(hipGraphExec_t hGraphExec, hipGraph * @brief Creates a memory allocation node and adds it to a graph * * @param [out] pGraphNode - Pointer to the graph node to create and add to the graph - * @param [in] graph - Instane of the graph the node to be added - * @param [in] pDependencies - Const pointer to the node dependenties + * @param [in] graph - Instance of the graph the node is added to + * @param [in] pDependencies - Const pointer to the node dependencies * @param [in] numDependencies - The number of dependencies - * @param [in] pNodeParams - Node parameters for memory allocation + * @param [in, out] pNodeParams - Node parameters for memory allocation, returns a pointer to the allocated memory. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues.
*/ hipError_t hipGraphAddMemAllocNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, hipMemAllocNodeParams* pNodeParams); @@ -8250,11 +8259,11 @@ hipError_t hipGraphAddMemAllocNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, /** * @brief Returns parameters for memory allocation node * - * @param [in] node - Memory allocation node for a query + * @param [in] node - Memory allocation node to query * @param [out] pNodeParams - Parameters for the specified memory allocation node * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphMemAllocNodeGetParams(hipGraphNode_t node, hipMemAllocNodeParams* pNodeParams); @@ -8262,13 +8271,13 @@ hipError_t hipGraphMemAllocNodeGetParams(hipGraphNode_t node, hipMemAllocNodePar * @brief Creates a memory free node and adds it to a graph * * @param [out] pGraphNode - Pointer to the graph node to create and add to the graph - * @param [in] graph - Instane of the graph the node to be added - * @param [in] pDependencies - Const pointer to the node dependenties + * @param [in] graph - Instance of the graph the node is added to + * @param [in] pDependencies - Const pointer to the node dependencies * @param [in] numDependencies - The number of dependencies * @param [in] dev_ptr - Pointer to the memory to be freed * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues.
*/ hipError_t hipGraphAddMemFreeNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, void* dev_ptr); @@ -8276,46 +8285,46 @@ hipError_t hipGraphAddMemFreeNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, /** * @brief Returns parameters for memory free node * - * @param [in] node - Memory free node for a query - * @param [out] dev_ptr - Device pointer for the specified memory free node + * @param [in] node - Memory free node to query + * @param [out] dev_ptr - Device pointer of the specified memory free node * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphMemFreeNodeGetParams(hipGraphNode_t node, void* dev_ptr); /** * @brief Get the mem attribute for graphs. * - * @param [in] device - device the attr is get for. - * @param [in] attr - attr to get. - * @param [out] value - value for specific attr. + * @param [in] device - Device to get attributes from + * @param [in] attr - Attribute type to be queried + * @param [out] value - Value of the queried attribute * @returns #hipSuccess, #hipErrorInvalidDevice - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipDeviceGetGraphMemAttribute(int device, hipGraphMemAttributeType attr, void* value); /** * @brief Set the mem attribute for graphs. * - * @param [in] device - device the attr is set for. - * @param [in] attr - attr to set. - * @param [in] value - value for specific attr. + * @param [in] device - Device to set attribute of. 
+ * @param [in] attr - Attribute type to be set. + * @param [in] value - Value of the attribute. * @returns #hipSuccess, #hipErrorInvalidDevice - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipDeviceSetGraphMemAttribute(int device, hipGraphMemAttributeType attr, void* value); /** - * @brief Free unused memory on specific device used for graph back to OS. + * @brief Free unused memory reserved for graphs on a specific device and return it back to the OS. * - * @param [in] device - device the memory is used for graphs + * @param [in] device - Device for which memory should be trimmed * @returns #hipSuccess, #hipErrorInvalidDevice * - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipDeviceGraphMemTrim(int device); @@ -8328,8 +8337,8 @@ hipError_t hipDeviceGraphMemTrim(int device); * @param [in] initialRefcount - reference to resource. * @param [in] flags - flags passed to API. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipUserObjectCreate(hipUserObject_t* object_out, void* ptr, hipHostFn_t destroy, unsigned int initialRefcount, unsigned int flags); @@ -8340,8 +8349,8 @@ hipError_t hipUserObjectCreate(hipUserObject_t* object_out, void* ptr, hipHostFn * @param [in] object - pointer to instace of userobj. 
* @param [in] count - reference to resource to be retained. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipUserObjectRelease(hipUserObject_t object, unsigned int count __dparm(1)); @@ -8351,8 +8360,8 @@ hipError_t hipUserObjectRelease(hipUserObject_t object, unsigned int count __dpa * @param [in] object - pointer to instace of userobj. * @param [in] count - reference to resource to be retained. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipUserObjectRetain(hipUserObject_t object, unsigned int count __dparm(1)); @@ -8364,8 +8373,8 @@ hipError_t hipUserObjectRetain(hipUserObject_t object, unsigned int count __dpar * @param [in] count - reference to resource to be retained. * @param [in] flags - flags passed to API. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphRetainUserObject(hipGraph_t graph, hipUserObject_t object, unsigned int count __dparm(1), unsigned int flags __dparm(0)); @@ -8377,8 +8386,8 @@ hipError_t hipGraphRetainUserObject(hipGraph_t graph, hipUserObject_t object, * @param [in] object - pointer to instace of userobj. * @param [in] count - reference to resource to be retained. 
* @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphReleaseUserObject(hipGraph_t graph, hipUserObject_t object, unsigned int count __dparm(1)); @@ -8390,8 +8399,8 @@ hipError_t hipGraphReleaseUserObject(hipGraph_t graph, hipUserObject_t object, * @param [in] path - path to write the DOT file. * @param [in] flags - Flags from hipGraphDebugDotFlags to get additional node information. * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorOperatingSystem - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphDebugDotPrint(hipGraph_t graph, const char* path, unsigned int flags); @@ -8406,8 +8415,8 @@ hipError_t hipGraphDebugDotPrint(hipGraph_t graph, const char* path, unsigned in * For list of attributes see ::hipKernelNodeAttrID. * * @returns #hipSuccess, #hipErrorInvalidContext - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphKernelNodeCopyAttributes(hipGraphNode_t hSrc, hipGraphNode_t hDst); @@ -8430,8 +8439,8 @@ hipError_t hipGraphKernelNodeCopyAttributes(hipGraphNode_t hSrc, hipGraphNode_t * @param [in] isEnabled - Node is enabled if != 0, otherwise the node is disabled. 
* * @returns #hipSuccess, #hipErrorInvalidValue, - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphNodeSetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, unsigned int isEnabled); @@ -8452,8 +8461,8 @@ hipError_t hipGraphNodeSetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNod * @param [out] isEnabled - Location to return the enabled status of the node. * * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphNodeGetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, unsigned int* isEnabled); @@ -8467,8 +8476,8 @@ hipError_t hipGraphNodeGetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNod * @param [in] numDependencies - the number of the dependencies. * @param [in] nodeParams -pointer to the parameters. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphAddExternalSemaphoresWaitNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, @@ -8483,8 +8492,8 @@ hipError_t hipGraphAddExternalSemaphoresWaitNode(hipGraphNode_t* pGraphNode, hip * @param [in] numDependencies - the number of the dependencies. * @param [in] nodeParams -pointer to the parameters. 
* @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphAddExternalSemaphoresSignalNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, @@ -8495,8 +8504,8 @@ hipError_t hipGraphAddExternalSemaphoresSignalNode(hipGraphNode_t* pGraphNode, h * @param [in] hNode - Node from the graph from which graphExec was instantiated. * @param [in] nodeParams - Pointer to the params to be set. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphExternalSemaphoresSignalNodeSetParams(hipGraphNode_t hNode, const hipExternalSemaphoreSignalNodeParams* nodeParams); @@ -8506,8 +8515,8 @@ hipError_t hipGraphExternalSemaphoresSignalNodeSetParams(hipGraphNode_t hNode, * @param [in] hNode - Node from the graph from which graphExec was instantiated. * @param [in] nodeParams - Pointer to the params to be set. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. 
*/ hipError_t hipGraphExternalSemaphoresWaitNodeSetParams(hipGraphNode_t hNode, const hipExternalSemaphoreWaitNodeParams* nodeParams); @@ -8517,8 +8526,8 @@ hipError_t hipGraphExternalSemaphoresWaitNodeSetParams(hipGraphNode_t hNode, * @param [in] hNode - Node from the graph from which graphExec was instantiated. * @param [out] params_out - Pointer to params. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphExternalSemaphoresSignalNodeGetParams(hipGraphNode_t hNode, hipExternalSemaphoreSignalNodeParams* params_out); @@ -8528,8 +8537,8 @@ hipError_t hipGraphExternalSemaphoresSignalNodeGetParams(hipGraphNode_t hNode, * @param [in] hNode - Node from the graph from which graphExec was instantiated. * @param [out] params_out - Pointer to params. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphExternalSemaphoresWaitNodeGetParams(hipGraphNode_t hNode, hipExternalSemaphoreWaitNodeParams* params_out); @@ -8540,8 +8549,8 @@ hipError_t hipGraphExternalSemaphoresWaitNodeGetParams(hipGraphNode_t hNode, * @param [in] hNode - Node from the graph from which graphExec was instantiated. * @param [in] nodeParams - Pointer to the params to be set. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. 
While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphExecExternalSemaphoresSignalNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, const hipExternalSemaphoreSignalNodeParams* nodeParams); @@ -8552,8 +8561,8 @@ hipError_t hipGraphExecExternalSemaphoresSignalNodeSetParams(hipGraphExec_t hGra * @param [in] hNode - Node from the graph from which graphExec was instantiated. * @param [in] nodeParams - Pointer to the params to be set. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipGraphExecExternalSemaphoresWaitNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, const hipExternalSemaphoreWaitNodeParams* nodeParams); @@ -8564,8 +8573,8 @@ hipError_t hipGraphExecExternalSemaphoresWaitNodeSetParams(hipGraphExec_t hGraph * @param [in] hNode - instance of the node to get parameters from. * @param [out] nodeParams - pointer to the parameters. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipDrvGraphMemcpyNodeGetParams(hipGraphNode_t hNode, HIP_MEMCPY3D* nodeParams); @@ -8575,8 +8584,8 @@ hipError_t hipDrvGraphMemcpyNodeGetParams(hipGraphNode_t hNode, HIP_MEMCPY3D* no * @param [in] hNode - instance of the node to Set parameters for. * @param [out] nodeParams - pointer to the parameters. 
* @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipDrvGraphMemcpyNodeSetParams(hipGraphNode_t hNode, const HIP_MEMCPY3D* nodeParams); @@ -8590,8 +8599,8 @@ hipError_t hipDrvGraphMemcpyNodeSetParams(hipGraphNode_t hNode, const HIP_MEMCPY * @param [in] memsetParams - const pointer to the parameters for the memory set. * @param [in] ctx - cotext related to current device. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipDrvGraphAddMemsetNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph, const hipGraphNode_t* dependencies, size_t numDependencies, @@ -8601,13 +8610,13 @@ hipError_t hipDrvGraphAddMemsetNode(hipGraphNode_t* phGraphNode, hipGraph_t hGra * @brief Creates a memory free node and adds it to a graph * * @param [out] phGraphNode - Pointer to the graph node to create and add to the graph - * @param [in] hGraph - Instane of the graph the node to be added - * @param [in] dependencies - Const pointer to the node dependenties + * @param [in] hGraph - Instance of the graph the node to be added + * @param [in] dependencies - Const pointer to the node dependencies * @param [in] numDependencies - The number of dependencies * @param [in] dptr - Pointer to the memory to be freed * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. 
+ * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipDrvGraphAddMemFreeNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph, const hipGraphNode_t* dependencies, size_t numDependencies, @@ -8621,8 +8630,8 @@ hipError_t hipDrvGraphAddMemFreeNode(hipGraphNode_t* phGraphNode, hipGraph_t hGr * @param [in] copyParams - const pointer to the memcpy node params. * @param [in] ctx - cotext related to current device. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipDrvGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, const HIP_MEMCPY3D* copyParams, hipCtx_t ctx); @@ -8635,8 +8644,8 @@ hipError_t hipDrvGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGrap * @param [in] memsetParams - pointer to the parameters. * @param [in] ctx - cotext related to current device. * @returns #hipSuccess, #hipErrorInvalidValue - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. */ hipError_t hipDrvGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, const HIP_MEMSET_NODE_PARAMS* memsetParams, hipCtx_t ctx); @@ -8654,8 +8663,11 @@ hipError_t hipDrvGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGrap * @{ * This section describes the virtual memory management functions of HIP runtime API. * - * @note Please note, the virtual memory management functions of HIP runtime API are implemented - * on Linux, under development on Windows. 
+ * @note The virtual memory management functions of the HIP runtime + * API are implemented on Linux and are under development on Microsoft Windows. The + * following Virtual Memory Management APIs are not (yet) + * supported in HIP: + * - hipMemMapArrayAsync */ /** @@ -8664,10 +8676,10 @@ hipError_t hipDrvGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGrap * @param [in] devPtr - starting address of the range. * @param [in] size - size of the range. * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * - * @note This API is implemented on Linux, under development on Windows. + * @note This API is implemented on Linux and is under development on Microsoft Windows. */ hipError_t hipMemAddressFree(void* devPtr, size_t size); @@ -8680,10 +8692,10 @@ hipError_t hipMemAddressFree(void* devPtr, size_t size); * @param [in] addr - requested starting address of the range. * @param [in] flags - currently unused, must be zero. * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * - * @note This API is implemented on Linux, under development on Windows. + * @note This API is implemented on Linux and is under development on Microsoft Windows.
*/ hipError_t hipMemAddressReserve(void** ptr, size_t size, size_t alignment, void* addr, unsigned long long flags); @@ -8695,10 +8707,10 @@ hipError_t hipMemAddressReserve(void** ptr, size_t size, size_t alignment, void* * @param [in] prop - properties of the allocation. * @param [in] flags - currently unused, must be zero. * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * - * @note This API is implemented on Linux, under development on Windows. + * @note This API is implemented on Linux and is under development on Microsoft Windows. */ hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size, const hipMemAllocationProp* prop, unsigned long long flags); @@ -8710,10 +8722,10 @@ hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size, co * @param [in] handleType - type of the shareable handle. * @param [in] flags - currently unused, must be zero. * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * - * @note This API is implemented on Linux, under development on Windows. + * @note This API is implemented on Linux and is under development on Microsoft Windows. 
*/ hipError_t hipMemExportToShareableHandle(void* shareableHandle, hipMemGenericAllocationHandle_t handle, hipMemAllocationHandleType handleType, unsigned long long flags); @@ -8724,10 +8736,10 @@ hipError_t hipMemExportToShareableHandle(void* shareableHandle, hipMemGenericAll * @param [in] location - target location. * @param [in] ptr - address to check the access flags. * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * - * @note This API is implemented on Linux, under development on Windows. + * @note This API is implemented on Linux and is under development on Microsoft Windows. */ hipError_t hipMemGetAccess(unsigned long long* flags, const hipMemLocation* location, void* ptr); @@ -8738,10 +8750,10 @@ hipError_t hipMemGetAccess(unsigned long long* flags, const hipMemLocation* loca * @param [in] prop - location properties. * @param [in] option - determines which granularity to return. * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * - * @note This API is implemented on Linux, under development on Windows. + * @note This API is implemented on Linux and is under development on Microsoft Windows. 
* */ hipError_t hipMemGetAllocationGranularity(size_t* granularity, const hipMemAllocationProp* prop, hipMemAllocationGranularity_flags option); @@ -8752,10 +8764,10 @@ hipError_t hipMemGetAllocationGranularity(size_t* granularity, const hipMemAlloc * @param [out] prop - properties of the given handle. * @param [in] handle - handle to perform the query on. * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * - * @note This API is implemented on Linux under development on Windows. + * @note This API is implemented on Linux and is under development on Microsoft Windows. */ hipError_t hipMemGetAllocationPropertiesFromHandle(hipMemAllocationProp* prop, hipMemGenericAllocationHandle_t handle); @@ -8766,10 +8778,10 @@ hipError_t hipMemGetAllocationPropertiesFromHandle(hipMemAllocationProp* prop, h * @param [in] osHandle - shareable handle representing the memory allocation. * @param [in] shHandleType - handle type. * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * - * @note This API is implemented on Linux, under development on Windows. + * @note This API is implemented on Linux and is under development on Microsoft Windows. 
*/ hipError_t hipMemImportFromShareableHandle(hipMemGenericAllocationHandle_t* handle, void* osHandle, hipMemAllocationHandleType shHandleType); @@ -8782,10 +8794,10 @@ hipError_t hipMemImportFromShareableHandle(hipMemGenericAllocationHandle_t* hand * @param [in] handle - memory allocation to be mapped. * @param [in] flags - currently unused, must be zero. * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * - * @note This API is implemented on Linux, under development on Windows. + * @note This API is implemented on Linux and is under development on Microsoft Windows. */ hipError_t hipMemMap(void* ptr, size_t size, size_t offset, hipMemGenericAllocationHandle_t handle, unsigned long long flags); @@ -8796,10 +8808,8 @@ hipError_t hipMemMap(void* ptr, size_t size, size_t offset, hipMemGenericAllocat * @param [in] count - number of hipArrayMapInfo in mapInfoList. * @param [in] stream - stream identifier for the stream to use for map or unmap operations. * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. - * - * @note This API is implemented on Linux, under development on Windows. + * @warning This API is under development. Currently it is not supported on AMD + * GPUs and returns #hipErrorNotSupported. */ hipError_t hipMemMapArrayAsync(hipArrayMapInfo* mapInfoList, unsigned int count, hipStream_t stream); @@ -8808,10 +8818,10 @@ hipError_t hipMemMapArrayAsync(hipArrayMapInfo* mapInfoList, unsigned int count * * @param [in] handle - handle of the memory allocation. 
* @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * - * @note This API is implemented on Linux, under development on Windows. + * @note This API is implemented on Linux and is under development on Microsoft Windows. */ hipError_t hipMemRelease(hipMemGenericAllocationHandle_t handle); @@ -8821,10 +8831,10 @@ hipError_t hipMemRelease(hipMemGenericAllocationHandle_t handle); * @param [out] handle - handle representing addr. * @param [in] addr - address to look up. * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * - * @note This API is implemented on Linux, under development on Windows. + * @note This API is implemented on Linux and is under development on Microsoft Windows. */ hipError_t hipMemRetainAllocationHandle(hipMemGenericAllocationHandle_t* handle, void* addr); @@ -8836,10 +8846,10 @@ hipError_t hipMemRetainAllocationHandle(hipMemGenericAllocationHandle_t* handle, * @param [in] desc - array of hipMemAccessDesc. * @param [in] count - number of hipMemAccessDesc in desc. * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * - * @note This API is implemented on Linux, under development on Windows. 
+ * @note This API is implemented on Linux and is under development on Microsoft Windows. */ hipError_t hipMemSetAccess(void* ptr, size_t size, const hipMemAccessDesc* desc, size_t count); @@ -8849,10 +8859,10 @@ hipError_t hipMemSetAccess(void* ptr, size_t size, const hipMemAccessDesc* desc, * @param [in] ptr - starting address of the range to unmap. * @param [in] size - size of the virtual address range. * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotSupported - * @warning : This API is marked as beta, meaning, while this is feature complete, - * it is still open to changes and may have outstanding issues. + * @warning This API is marked as Beta. While this feature is complete, it can + * change and might have outstanding issues. * - * @note This API is implemented on Linux, under development on Windows. + * @note This API is implemented on Linux and is under development on Microsoft Windows. */ hipError_t hipMemUnmap(void* ptr, size_t size); @@ -9267,13 +9277,18 @@ return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize,(hipFunction_t)kern * @brief Launches a device function * * @ingroup Execution + * @ingroup ModuleCooperativeG * - * @param [in] f device function symbol - * @param [in] gridDim grid dimentions - * @param [in] blockDim block dimentions - * @param [in] kernelParams kernel parameters - * @param [in] sharedMemBytes shared memory in bytes - * @param [in] stream stream on which kernel launched + * \tparam T The type of the kernel function. + * + * @param [in] f Kernel function to launch. + * @param [in] gridDim Grid dimensions specified as multiple of blockDim. + * @param [in] blockDim Block dimensions specified in work-items. + * @param [in] kernelParams A list of kernel arguments. + * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for + * this kernel. The HIP-Clang compiler provides + * support for extern shared declarations. + * @param [in] stream Stream on which the kernel is launched.
* * @return #hipSuccess, #hipErrorLaunchFailure, #hipErrorInvalidValue, * #hipErrorInvalidResourceHandle @@ -9286,14 +9301,15 @@ inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim, blockDim, kernelParams, sharedMemBytes, stream); } /** - * @brief Launches device function on multiple devices where thread blocks can cooperate and - * synchronize on execution. + * @brief Launches kernel function on multiple devices, where thread blocks can + * cooperate and synchronize on execution. * * @ingroup Execution + * @ingroup ModuleCooperativeG * - * @param [in] launchParamsList list of kernel launch parameters, one per device - * @param [in] numDevices size of launchParamsList array - * @param [in] flags flag to handle launch behavior + * @param [in] launchParamsList List of kernel launch parameters, one per device. + * @param [in] numDevices Size of launchParamsList array. + * @param [in] flags Flag to handle launch behavior. * * @return #hipSuccess, #hipErrorLaunchFailure, #hipErrorInvalidValue, * #hipErrorInvalidResourceHandle @@ -9546,7 +9562,7 @@ static inline hipError_t hipUnbindTexture( * * @see hipMallocFromPoolAsync * - * @note This API is implemented on Linux, under development on Windows. + * @note This API is implemented on Linux and is under development on Microsoft Windows. */ static inline hipError_t hipMallocAsync( void** dev_ptr, @@ -9563,7 +9579,7 @@ static inline hipError_t hipMallocAsync( * * @see hipMallocFromPoolAsync * - * @note This API is implemented on Linux, under development on Windows. + * @note This API is implemented on Linux and is under development on Microsoft Windows. */ template static inline hipError_t hipMallocAsync( @@ -9581,7 +9597,7 @@ static inline hipError_t hipMallocAsync( * * @see hipMallocFromPoolAsync * - * @note This API is implemented on Linux, under development on Windows. + * @note This API is implemented on Linux and is under development on Microsoft Windows. 
*/ template static inline hipError_t hipMallocAsync( @@ -9598,7 +9614,7 @@ static inline hipError_t hipMallocAsync( * * @see hipMallocFromPoolAsync * - * @note This API is implemented on Linux, under development on Windows. + * @note This API is implemented on Linux and is under development on Microsoft Windows. */ template static inline hipError_t hipMallocFromPoolAsync(