SWDEV-299127 - Merge 'develop' into 'amd-staging'

Change-Id: If456f5292d9a7775d780c780b7c40564a2ee2648


[ROCm/hip commit: 9ac76dac96]
Cette révision appartient à :
Jenkins
2023-05-03 11:10:54 +00:00
révision 91138d7d97
36 fichiers modifiés avec 3246 ajouts et 4655 suppressions
+12
Voir le fichier
@@ -0,0 +1,12 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/docs/.sphinx" # Location of package manifests
open-pull-requests-limit: 10
schedule:
interval: "daily"
+34 -39
Voir le fichier
@@ -1,6 +1,6 @@
def hipBuildTest(String backendLabel) {
node(backendLabel) {
stage("Source sync ${backendLabel}") {
stage("SYNC - ${backendLabel}") {
// Checkout hip repository with the PR patch
dir("${WORKSPACE}/hip") {
@@ -8,35 +8,25 @@ def hipBuildTest(String backendLabel) {
env.HIP_DIR = "${WORKSPACE}" + "/hip"
}
// Clone hipamd repository
dir("${WORKSPACE}/hipamd") {
git branch: 'develop',
url: 'https://github.com/ROCm-Developer-Tools/hipamd'
env.HIPAMD_DIR = "${WORKSPACE}" + "/hipamd"
}
// Clone hip-tests repository
dir("${WORKSPACE}/hip-tests") {
git branch: 'develop',
url: 'https://github.com/ROCm-Developer-Tools/hip-tests'
env.HIP_TESTS_DIR = "${WORKSPACE}" + "/hip-tests"
}
// Clone vdi and opencl for only amd backend server
if (backendLabel =~ /.*amd.*/) {
dir("${WORKSPACE}/ROCm-OpenCL-Runtime") {
git branch:'develop',
url: 'https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime'
env.OPENCL_DIR = "${WORKSPACE}" + "/ROCm-OpenCL-Runtime"
}
dir("${WORKSPACE}/ROCclr") {
git branch:'develop',
url: 'https://github.com/ROCm-Developer-Tools/ROCclr'
env.ROCclr_DIR = "${WORKSPACE}" + "/ROCclr"
}
// Clone clr repository
dir("${WORKSPACE}/clr") {
git branch: 'develop',
credentialsId: 'branch-credentials',
url: 'https://github.com/ROCm-Developer-Tools/clr'
env.CLR_DIR = "${WORKSPACE}" + "/clr"
}
}
stage("Build - HIT framework ${backendLabel}") {
// Running the build on hipamd workspace
dir("${WORKSPACE}/hipamd") {
stage("BUILD - HIT ${backendLabel}") {
// Running the build on clr workspace
dir("${WORKSPACE}/clr") {
sh """#!/usr/bin/env bash
set -x
rm -rf build
@@ -44,22 +34,24 @@ def hipBuildTest(String backendLabel) {
cd build
# Check if backend label contains string "amd" or backend host is a server with amd gpu
if [[ $backendLabel =~ amd ]]; then
cmake -DHIP_CATCH_TEST=0 -DHIP_COMMON_DIR=$HIP_DIR -DAMD_OPENCL_PATH=\$OPENCL_DIR -DROCCLR_PATH=\$ROCclr_DIR -DCMAKE_PREFIX_PATH="/opt/rocm/" -DCMAKE_INSTALL_PREFIX=\$PWD/install ..
cmake -DCLR_BUILD_HIP=ON -DHIP_CATCH_TEST=0 -DHIP_COMMON_DIR=$HIP_DIR -DCMAKE_PREFIX_PATH="/opt/rocm/" -DCMAKE_INSTALL_PREFIX=\$PWD/install ..
else
cmake -DHIP_CATCH_TEST=0 -DHIP_PLATFORM=nvidia -DHIP_COMMON_DIR=$HIP_DIR -DCMAKE_INSTALL_PREFIX=\$PWD/install ..
cmake -DCLR_BUILD_HIP=ON -DHIP_CATCH_TEST=0 -DHIP_PLATFORM=nvidia -DHIP_COMMON_DIR=$HIP_DIR -DCMAKE_INSTALL_PREFIX=\$PWD/install ..
fi
make install -j\$(nproc)
make -j\$(nproc)
make install -j\$(nproc)
if [[ $backendLabel =~ amd ]]; then
make build_tests -j\$(nproc)
make build_tests -j\$(nproc)
else
HIP_COMPILER=nvcc HIP_PLATFORM=nvidia make build_tests -j\$(nproc)
fi
"""
}
}
timeout(time: 1, unit: 'HOURS') {
stage("HIP Unit Tests - HIT framework ${backendLabel}") {
dir("${WORKSPACE}/hipamd/build") {
stage("TEST - HIT ${backendLabel}") {
dir("${WORKSPACE}/clr/build/hipamd") {
sh """#!/usr/bin/env bash
set -x
# Check if backend label contains string "amd" or backend host is a server with amd gpu
@@ -73,28 +65,30 @@ def hipBuildTest(String backendLabel) {
}
}
}
stage("Build - Catch2 framework") {
// Running the build on hipamd workspace
dir("${WORKSPACE}/hipamd") {
stage("BUILD HIP - ${backendLabel}") {
// Running the build on clr workspace
dir("${WORKSPACE}/clr") {
sh """#!/usr/bin/env bash
set -x
rm -rf build
mkdir -p build
cd build
# Check if backend label contains string "amd" or backend host is a server with amd gpu
if [[ $backendLabel =~ amd ]]; then
cmake -DHIP_PATH=\$PWD/install -DHIP_COMMON_DIR=\$HIP_DIR -DAMD_OPENCL_PATH=\$OPENCL_DIR -DROCCLR_PATH=\$ROCclr_DIR -DCMAKE_PREFIX_PATH="/opt/rocm/" -DCMAKE_INSTALL_PREFIX=\$PWD/install ..
cmake -DCLR_BUILD_HIP=ON -DHIP_PATH=\$PWD/install -DHIP_COMMON_DIR=\$HIP_DIR -DCMAKE_PREFIX_PATH="/opt/rocm/" -DCMAKE_INSTALL_PREFIX=\$PWD/install ..
else
cmake -DHIP_PLATFORM=nvidia -DHIP_COMMON_DIR=\$HIP_DIR -DCMAKE_INSTALL_PREFIX=\$PWD/install ..
cmake -DCLR_BUILD_HIP=ON -DHIP_PLATFORM=nvidia -DHIP_COMMON_DIR=\$HIP_DIR -DCMAKE_INSTALL_PREFIX=\$PWD/install ..
fi
make -j\$(nproc)
make install -j\$(nproc)
"""
}
}
stage("Build - HIP TESTS") {
stage("BUILD HIP TESTS - ${backendLabel}") {
// Running the build on HIP TESTS workspace
dir("${WORKSPACE}/hip-tests") {
env.HIP_PATH = "${HIPAMD_DIR}" + "/build/install"
env.HIP_PATH = "${CLR_DIR}" + "/build/install"
sh """#!/usr/bin/env bash
set -x
rm -rf build
@@ -112,16 +106,17 @@ def hipBuildTest(String backendLabel) {
"""
}
}
timeout(time: 1, unit: 'HOURS') {
stage('HIP Unit Tests - Catch2 framework') {
stage("TEST - CATCH2 ${backendLabel}") {
dir("${WORKSPACE}/hip-tests") {
sh """#!/usr/bin/env bash
set -x
cd build
if [[ $backendLabel =~ amd ]]; then
ctest
ctest --overwrite BuildDirectory=. --output-junit hiptest_output_catch_amd.xml
else
ctest -E 'Unit_hipMemcpyHtoD_Positive_Synchronization_Behavior|Unit_hipMemcpy_Positive_Synchronization_Behavior|Unit_hipFreeNegativeHost'
ctest --overwrite BuildDirectory=. --output-junit hiptest_output_catch_nvidia.xml -E 'Unit_hipMemcpyHtoD_Positive_Synchronization_Behavior|Unit_hipMemcpy_Positive_Synchronization_Behavior|Unit_hipFreeNegativeHost'
fi
"""
}
+21
Voir le fichier
@@ -0,0 +1,21 @@
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version: 2
sphinx:
configuration: docs/conf.py
formats: [htmlzip]
python:
install:
- requirements: docs/.sphinx/requirements.txt
build:
os: ubuntu-20.04
tools:
python: "3.8"
apt_packages:
- "doxygen"
- "graphviz" # For dot graphs in doxygen
Fichier diff supprimé car celui-ci est trop grand Voir la Diff
+33
Voir le fichier
@@ -0,0 +1,33 @@
# HIP Runtime API Reference {#mainpage}
This is the full HIP Runtime API reference. The API is organized into
[modules](modules.html) based on functionality.
## List of Modules
- @ref GlobalDefs
- @ref Driver
- @ref Device
- @ref Execution
- @ref Error
- @ref Stream
- @ref StreamM
- @ref Memory
- @ref External
- @ref MemoryM
- @ref StreamO
- @ref PeerToPeer
- @ref Context
- @ref ContextD
- @ref Module
- @ref Occupancy
- @ref Profiler
- @ref Clang
- @ref Texture
- @ref TextureD
- @ref TextureU
- @ref Runtime
- @ref Callback
- @ref Graph
- @ref Virtual
- @ref GL
- [Surface Object](#Surface)
+5
Voir le fichier
@@ -0,0 +1,5 @@
/_build
/_doxygen
/_images
/_static
/_templates
+24
Voir le fichier
@@ -0,0 +1,24 @@
root: index
subtrees:
- caption: User Guide
entries:
- file: user_guide/programming_manual
- file: user_guide/hip_rtc
- file: user_guide/faq
- caption: How to Guides
entries:
- file: how_to_guides/install.md
- file: how_to_guides/debugging.md
- caption: Reference
entries:
- file: .doxygen/docBin/html/index
- file: reference/kernel_language
- file: reference/math_api
- file: reference/terms
- file: reference/glossary
- file: reference/deprecated_api_list
- caption: Developer Guide
entries:
- file: developer_guide/build
- file: developer_guide/logging
- file: developer_guide/contributing.md
+1
Voir le fichier
@@ -0,0 +1 @@
rocm-docs-core[api_reference]
+290
Voir le fichier
@@ -0,0 +1,290 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements.in
#
accessible-pygments==0.0.3
# via pydata-sphinx-theme
alabaster==0.7.12
# via sphinx
asttokens==2.2.0
# via stack-data
attrs==22.1.0
# via
# jsonschema
# jupyter-cache
babel==2.10.3
# via
# pydata-sphinx-theme
# sphinx
backcall==0.2.0
# via ipython
beautifulsoup4==4.11.1
# via pydata-sphinx-theme
breathe==4.34.0
# via rocm-docs-core
certifi==2022.6.15
# via requests
cffi==1.15.1
# via pynacl
charset-normalizer==2.1.0
# via requests
click==8.1.3
# via
# click-log
# doxysphinx
# jupyter-cache
# sphinx-external-toc
click-log==0.4.0
# via doxysphinx
debugpy==1.6.4
# via ipykernel
decorator==5.1.1
# via ipython
deprecated==1.2.13
# via pygithub
docutils==0.16
# via
# breathe
# myst-parser
# pydata-sphinx-theme
# rocm-docs-core
# sphinx
doxysphinx==3.2.1
# via rocm-docs-core
entrypoints==0.4
# via jupyter-client
executing==1.2.0
# via stack-data
fastjsonschema==2.16.2
# via nbformat
gitdb==4.0.10
# via gitpython
gitpython==3.1.31
# via rocm-docs-core
greenlet==2.0.1
# via sqlalchemy
idna==3.3
# via requests
imagesize==1.4.1
# via sphinx
importlib-metadata==5.1.0
# via
# jupyter-cache
# myst-nb
importlib-resources==5.10.4
# via
# jsonschema
# rocm-docs-core
ipykernel==6.17.1
# via myst-nb
ipython==8.7.0
# via
# ipykernel
# myst-nb
jedi==0.18.2
# via ipython
jinja2==3.1.2
# via
# myst-parser
# sphinx
json5==0.9.11
# via doxysphinx
jsonschema==4.17.3
# via nbformat
jupyter-cache==0.5.0
# via myst-nb
jupyter-client==7.4.7
# via
# ipykernel
# nbclient
jupyter-core==5.1.0
# via
# jupyter-client
# nbformat
linkify-it-py==1.0.3
# via myst-parser
lxml==4.9.2
# via doxysphinx
markdown-it-py==2.1.0
# via
# mdit-py-plugins
# myst-parser
markupsafe==2.1.1
# via jinja2
matplotlib-inline==0.1.6
# via
# ipykernel
# ipython
mdit-py-plugins==0.3.1
# via myst-parser
mdurl==0.1.2
# via markdown-it-py
myst-nb==0.17.1
# via rocm-docs-core
myst-parser[linkify]==0.18.1
# via
# myst-nb
# rocm-docs-core
nbclient==0.5.13
# via
# jupyter-cache
# myst-nb
nbformat==5.7.0
# via
# jupyter-cache
# myst-nb
# nbclient
nest-asyncio==1.5.6
# via
# ipykernel
# jupyter-client
# nbclient
packaging==21.3
# via
# ipykernel
# pydata-sphinx-theme
# sphinx
parso==0.8.3
# via jedi
pexpect==4.8.0
# via ipython
pickleshare==0.7.5
# via ipython
pkgutil-resolve-name==1.3.10
# via jsonschema
platformdirs==2.5.4
# via jupyter-core
prompt-toolkit==3.0.33
# via ipython
psutil==5.9.4
# via ipykernel
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.2
# via stack-data
pycparser==2.21
# via cffi
pydata-sphinx-theme==0.13.1
# via sphinx-book-theme
pygithub==1.57
# via rocm-docs-core
pygments==2.12.0
# via
# accessible-pygments
# ipython
# pydata-sphinx-theme
# sphinx
pyjwt==2.6.0
# via pygithub
pynacl==1.5.0
# via pygithub
pyparsing==3.0.9
# via
# doxysphinx
# packaging
pyrsistent==0.19.2
# via jsonschema
python-dateutil==2.8.2
# via jupyter-client
pytz==2022.1
# via babel
pyyaml==6.0
# via
# jupyter-cache
# myst-nb
# myst-parser
# sphinx-external-toc
pyzmq==24.0.1
# via
# ipykernel
# jupyter-client
requests==2.28.1
# via
# pygithub
# sphinx
rocm-docs-core[api_reference]==0.2.0
# via -r requirements.in
six==1.16.0
# via
# asttokens
# python-dateutil
smmap==5.0.0
# via gitdb
snowballstemmer==2.2.0
# via sphinx
soupsieve==2.3.2.post1
# via beautifulsoup4
sphinx==4.3.1
# via
# breathe
# myst-nb
# myst-parser
# pydata-sphinx-theme
# rocm-docs-core
# sphinx-book-theme
# sphinx-copybutton
# sphinx-design
# sphinx-external-toc
# sphinx-notfound-page
sphinx-book-theme==1.0.0rc2
# via rocm-docs-core
sphinx-copybutton==0.5.1
# via rocm-docs-core
sphinx-design==0.3.0
# via rocm-docs-core
sphinx-external-toc==0.3.1
# via rocm-docs-core
sphinx-notfound-page==0.8.3
# via rocm-docs-core
sphinxcontrib-applehelp==1.0.2
# via sphinx
sphinxcontrib-devhelp==1.0.2
# via sphinx
sphinxcontrib-htmlhelp==2.0.0
# via sphinx
sphinxcontrib-jsmath==1.0.1
# via sphinx
sphinxcontrib-qthelp==1.0.3
# via sphinx
sphinxcontrib-serializinghtml==1.1.5
# via sphinx
sqlalchemy==1.4.44
# via jupyter-cache
stack-data==0.6.2
# via ipython
tabulate==0.9.0
# via jupyter-cache
tornado==6.2
# via
# ipykernel
# jupyter-client
traitlets==5.6.0
# via
# ipykernel
# ipython
# jupyter-client
# jupyter-core
# matplotlib-inline
# nbclient
# nbformat
typing-extensions==4.4.0
# via
# myst-nb
# myst-parser
uc-micro-py==1.0.1
# via linkify-it-py
urllib3==1.26.11
# via requests
wcwidth==0.2.5
# via prompt-toolkit
wrapt==1.15.0
# via deprecated
zipp==3.11.0
# via
# importlib-metadata
# importlib-resources
# The following packages are considered to be unsafe in a requirements file:
# setuptools
+26
Voir le fichier
@@ -0,0 +1,26 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
from rocm_docs import ROCmDocs
from typing import Any, Dict, List
# Build the rocm-docs-core helper for this project, then run its Doxygen,
# API-reference and setup steps in that order.
docs_core = ROCmDocs("HIP Documentation")
docs_core.run_doxygen()
docs_core.enable_api_reference()
docs_core.setup()

# Copy every attribute named in ROCmDocs.SPHINX_VARS from the helper into this
# module's globals.
for sphinx_var in ROCmDocs.SPHINX_VARS:
    globals()[sphinx_var] = getattr(docs_core, sphinx_var)

# rocm-docs-core might or might not have changed these yet (depending on version),
# and we don't want to wipe their settings if they did
if "html_theme_options" not in globals():
    html_theme_options: Dict[str, Any] = {}
if "exclude_patterns" not in globals():
    exclude_patterns: List[str] = []

# Project-specific adjustments applied on top of whatever was set above.
html_theme_options["show_navbar_depth"] = 2
exclude_patterns.append(".doxygen/mainpage.md")
@@ -1,73 +1,56 @@
## Table of Contents
# Building HIP from Source
<!-- toc -->
- [Prerequisites](#Prerequisites)
- [Build HIP on AMD platform](#build-hip-on-amd-platform)
* [Get HIP source code](#get-hip-source-code)
* [Set the environment variables](#set-the-environment-variables)
* [Build HIP](#build-hip)
* [Default paths and environment variables](#default-paths-and-environment-variables)
* [Build HIP Tests](#build-hip-tests)
- [Build HIP on NVIDIA platform](#build-hip-on-NVIDIA-platform)
* [Get HIP source code](#get-hip-source-code)
* [Set the environment variables](#set-the-environment-variables)
* [Build HIP](#build-hip)
* [Build HIP tests](#build-hip-tests)
- [Run HIP](#run-hip)
<!-- tocstop -->
# Prerequisites
## Prerequisites
HIP code can be developed either on AMD ROCm platform using HIP-Clang compiler, or a CUDA platform with nvcc installed.
Before building and running HIP, make sure the drivers and pre-built packages are installed properly on the platform.
## AMD platform
### AMD platform
Install ROCm packages (see ROCm Installation Guide on AMD public documentation site (https://docs.amd.com/)) or install pre-built binary packages using the package manager,
```
```shell
sudo apt install mesa-common-dev
sudo apt install clang
sudo apt install comgr
sudo apt-get -y install rocm-dkms
```
## NVIDIA platform
### NVIDIA platform
Install Nvidia driver and pre-build packages (see HIP Installation Guide at https://docs.amd.com/ for the release)
## Branch of repository
### Branch of repository
Before get HIP source code, set the expected branch of repository at the variable ROCM_BRANCH.
Before getting the HIP source code, set the `ROCM_BRANCH` variable to the expected repository branch.
For example, for ROCm5.0 release branch, set
```
```shell
export ROCM_BRANCH=rocm-5.0.x
```
ROCm5.4 release branch, set
```
```shell
export ROCM_BRANCH=rocm-5.4.x
```
Use a similar format for future branches.
ROCM_PATH is path where ROCM is installed. BY default ROCM_PATH is at /opt/rocm.
`ROCM_PATH` is the path where ROCm is installed. By default, `ROCM_PATH` is `/opt/rocm`.
# Build HIP on AMD platform
## Build HIP on AMD platform
## Get HIP source code
### Get HIP source code
```
```shell
git clone -b "$ROCM_BRANCH" https://github.com/ROCm-Developer-Tools/hipamd.git
git clone -b "$ROCM_BRANCH" https://github.com/ROCm-Developer-Tools/hip.git
git clone -b "$ROCM_BRANCH" https://github.com/ROCm-Developer-Tools/ROCclr.git
git clone -b "$ROCM_BRANCH" https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime.git
```
## Set the environment variables
### Set the environment variables
```
```shell
export HIPAMD_DIR="$(readlink -f hipamd)"
export HIP_DIR="$(readlink -f hip)"
```
@@ -78,31 +61,32 @@ See https://github.com/ROCm-Developer-Tools/ROCclr
HIPAMD repository provides implementation specifically for AMD platform.
See https://github.com/ROCm-Developer-Tools/hipamd
## Build HIP
### Build HIP
```
```shell
cd "$HIPAMD_DIR"
mkdir -p build; cd build
cmake -DHIP_COMMON_DIR=$HIP_DIR -DCMAKE_PREFIX_PATH="<ROCM_PATH>/" -DCMAKE_INSTALL_PREFIX=$PWD/install ..
make -j$(nproc)
sudo make install
```
Note: If you don't specify CMAKE_INSTALL_PREFIX, hip runtime will be installed to "<ROCM_PATH>/hip".
::::{note}
If you don't specify `CMAKE_INSTALL_PREFIX`, hip runtime will be installed to `<ROCM_PATH>/hip`.
By default, release version of AMDHIP is built.
::::
## Default paths and environment variables
### Default paths and environment variables
* By default HIP looks for HSA in <ROCM_PATH>/hsa (can be overridden by setting HSA_PATH environment variable).
* By default HIP is installed into <ROCM_PATH>/hip (can be overridden by setting HIP_PATH environment variable).
* By default HIP looks for clang in <ROCM_PATH>/llvm/bin (can be overridden by setting HIP_CLANG_PATH environment variable)
* By default HIP looks for device library in <ROCM_PATH>/lib (can be overridden by setting DEVICE_LIB_PATH environment variable).
* Optionally, consider adding <ROCM_PATH>/bin to your PATH to make it easier to use the tools.
* Optionally, set HIPCC_VERBOSE=7 to output the command line for compilation.
* By default HIP looks for HSA in `<ROCM_PATH>/hsa` (can be overridden by setting `HSA_PATH` environment variable).
* By default HIP is installed into `<ROCM_PATH>/hip` (can be overridden by setting `HIP_PATH` environment variable).
* By default HIP looks for clang in `<ROCM_PATH>/llvm/bin` (can be overridden by setting `HIP_CLANG_PATH` environment variable)
* By default HIP looks for device library in `<ROCM_PATH>/lib` (can be overridden by setting `DEVICE_LIB_PATH` environment variable).
* Optionally, consider adding `<ROCM_PATH>/bin` to your `PATH` to make it easier to use the tools.
* Optionally, set `HIPCC_VERBOSE=7` to output the command line for compilation.
After make install command, make sure HIP_PATH is pointed to $PWD/install/hip.
After make install command, make sure `HIP_PATH` is pointed to `$PWD/install/hip`.
## Generating profiling header after adding/changing a HIP API
### Generating profiling header after adding/changing a HIP API
When you add or change a HIP API, you might need to generate a new `hip_prof_str.h` header. This header is used by rocm tools to track HIP APIs like rocprofiler/roctracer etc.
To generate the header after your change, use the tool `hip_prof_gen.py` present in `hipamd/src`.
@@ -120,34 +104,39 @@ Flags:
* -e - on error exit mode
* -p - HIP_INIT_API macro patching mode
Example Usage: `hip_prof_gen.py -v -p -t --priv <hip>/include/hip/hip_runtime_api.h <hipamd>/src <hipamd>/include/hip/amd_detail/hip_prof_str.h <hipamd>/include/hip/amd_detail/hip_prof_str.h.new`
Example Usage:
```shell
hip_prof_gen.py -v -p -t --priv <hip>/include/hip/hip_runtime_api.h \
<hipamd>/src <hipamd>/include/hip/amd_detail/hip_prof_str.h \
<hipamd>/include/hip/amd_detail/hip_prof_str.h.new
```
## Build HIP tests
### Build HIP tests
### Build HIP directed tests
#### Build HIP directed tests
Developers can build HIP directed tests right after build HIP commands,
```
```shell
sudo make install
make -j$(nproc) build_tests
```
By default, all HIP directed tests will be built and generated under the folder $HIPAMD_DIR/build/directed_tests.
By default, all HIP directed tests will be built and generated under the folder `$HIPAMD_DIR/build/directed_tests`.
Take HIP directed device APIs tests, as an example, all available test applications will have executable files generated under,
$HIPAMD_DIR/build/directed_tests/runtimeApi/device.
`$HIPAMD_DIR/build/directed_tests/runtimeApi/device`.
Run all HIP directed_tests, use the command,
```
```shell
ctest
```
Or
```
```shell
make test
```
Build and run a single directed test, use the follow command as an example,
```
```shell
make directed_tests.texture.hipTexObjPitch
cd $HIPAMD_DIR/build/directed_tests/texture
./hipTexObjPitch
@@ -155,18 +144,18 @@ cd $HIPAMD_DIR/build/directed_tests/texcture
Please note, the integrated HIP directed tests, will be deprecated in future release.
### Build HIP catch tests
#### Build HIP catch tests
HIP catch tests, built on the Catch2 framework, are officially separated from the HIP project. They live in the HIP tests repository and can be built via the following instructions.
#### Get HIP tests source code
##### Get HIP tests source code
```
```shell
git clone -b "$ROCM_BRANCH" https://github.com/ROCm-Developer-Tools/hip-tests.git
```
#### Build HIP tests from source
##### Build HIP tests from source
```
```shell
export HIP_TESTS_DIR="$(readlink -f hip-tests)"
cd "$HIP_TESTS_DIR"
mkdir -p build; cd build
@@ -179,16 +168,16 @@ HIP catch tests are built under the folder $HIP_TESTS_DIR/build.
To run any single catch test, the following is an example,
```
```shell
cd $HIP_TESTS_DIR/build/catch_tests/unit/texture
./TextureTest
```
#### Build HIP Catch2 standalone test
##### Build HIP Catch2 standalone test
HIP Catch2 supports building a standalone test, for example,
```
```shell
cd "$HIP_TESTS_DIR"
hipcc $HIP_TESTS_DIR/catch/unit/memory/hipPointerGetAttributes.cc -I ./catch/include ./catch/hipTestMain/standalone_main.cc -I ./catch/external/Catch2 -o hipPointerGetAttributes
./hipPointerGetAttributes
@@ -197,26 +186,26 @@ hipcc $HIP_TESTS_DIR/catch/unit/memory/hipPointerGetAttributes.cc -I ./catch/inc
All tests passed
```
# Build HIP on NVIDIA platform
## Build HIP on NVIDIA platform
## Get HIP source code
### Get HIP source code
```
```shell
git clone -b "$ROCM_BRANCH" https://github.com/ROCm-Developer-Tools/hip.git
git clone -b "$ROCM_BRANCH" https://github.com/ROCm-Developer-Tools/hipamd.git
```
## Set the environment variables
### Set the environment variables
```
```shell
export HIP_DIR="$(readlink -f hip)"
export HIPAMD_DIR="$(readlink -f hipamd)"
```
## Build HIP
### Build HIP
```
```shell
cd "$HIPAMD_DIR"
mkdir -p build; cd build
cmake -DHIP_COMMON_DIR=$HIP_DIR -DHIP_PLATFORM=nvidia -DCMAKE_INSTALL_PREFIX=$PWD/install ..
@@ -224,10 +213,10 @@ make -j$(nproc)
sudo make install
```
## Build HIP tests
Build HIP tests commands on NVIDIA platform are basically the same as AMD, except set -DHIP_PLATFORM=nvidia.
### Build HIP tests
Build HIP tests commands on NVIDIA platform are basically the same as AMD, except set `-DHIP_PLATFORM=nvidia`.
# Run HIP
## Run HIP
Compile and run the [square sample](https://github.com/ROCm-Developer-Tools/HIP/tree/rocm-5.0.x/samples/0_Intro/square).
+16 -17
Voir le fichier
@@ -1,12 +1,12 @@
# Contributor Guidelines
## Make Tips
ROCM_PATH is path where ROCM is installed. BY default ROCM_PATH is /opt/rocm.
When building HIP, you will likely want to build and install to a local user-accessible directory (rather than <ROCM_PATH>).
This can be easily be done by setting the -DCMAKE_INSTALL_PREFIX variable when running cmake. Typical use case is to
set CMAKE_INSTALL_PREFIX to your HIP git root, and then ensure HIP_PATH points to this directory. For example
`ROCM_PATH` is the path where ROCm is installed. By default, `ROCM_PATH` is `/opt/rocm`.
When building HIP, you will likely want to build and install to a local user-accessible directory (rather than `<ROCM_PATH>`).
This can easily be done by setting the `-DCMAKE_INSTALL_PREFIX` variable when running cmake. Typical use case is to
set `CMAKE_INSTALL_PREFIX` to your HIP git root, and then ensure `HIP_PATH` points to this directory. For example
```
```shell
cmake .. -DCMAKE_INSTALL_PREFIX=..
make install
@@ -18,20 +18,19 @@ After making HIP, don't forget the "make install" step !
## Adding a new HIP API
- Add a translation to the hipify-clang tool ; many examples abound.
- For stat tracking purposes, place the API into an appropriate stat category ("dev", "mem", "stream", etc).
- Add a inlined NVIDIA implementation for the function in include/hip/nvidia_detail/hip_runtime_api.h.
- These are typically headers
- Add an HIP_ROCclr definition and Doxygen comments for the function in include/amd_detail/hip_runtime_api.h
- Source implementation typically go in hip/rocclr/hip_*.cpp. The implementation involve calls to HIP runtime (ie for hipStream_t).
- Add a translation to the hipify-clang tool ; many examples abound.
- For stat tracking purposes, place the API into an appropriate stat category ("dev", "mem", "stream", etc).
- Add a inlined NVIDIA implementation for the function in include/hip/nvidia_detail/hip_runtime_api.h.
- These are typically headers
- Add an HIP_ROCclr definition and Doxygen comments for the function in include/amd_detail/hip_runtime_api.h
- Source implementation typically go in hip/rocclr/hip_*.cpp. The implementation involve calls to HIP runtime (ie for hipStream_t).
## Check HIP-Clang version
In some cases new HIP-Clang features are tied to specified releases, and it can be useful to check the current version is sufficiently new enough to support the desired feature.
HIP runtime version
```
```console
> cat <ROCM_PATH>/hip/bin/.hipVersion
# Auto-generated by cmake
HIP_VERSION_MAJOR=3
@@ -41,7 +40,7 @@ HIP_VERSION_PATCH=20345-519ef3f2
HIP-Clang compiler version
```
```console
$ <ROCM_PATH>/llvm/bin/clang -v
clang version 11.0.0 (/src/external/llvm-project/clang 075fedd3fd2f4d9d8cca79d0cd51f64c5ef21432)
Target: x86_64-unknown-linux-gnu
@@ -128,9 +127,9 @@ Differences or limitations of HIP APIs as compared to CUDA APIs should be clearl
Before checking in or submitting a pull request, run all directed tests (see tests/README.md) and all Rodinia tests.
Ensure pass results match starting point:
```shell
> cd examples/
> ./run_all.sh
```console
> cd examples/
> ./run_all.sh
```
@@ -1,32 +1,37 @@
## What is HIP logging for? ###
# Logging Mechanisms
HIP provides a logging mechanism, which is a convinient way of printing important information so as to trace HIP API and runtime codes during the execution of HIP application.
It assists HIP development team in the development of HIP runtime, and is useful for HIP application developers as well.
Depending on the setting of logging level and logging mask, HIP logging will print different kinds of information, for different types of functionalities such as HIP APIs, executed kernels, queue commands and queue contents, etc.
HIP provides a logging mechanism, which is a convenient way of printing
important information so as to trace HIP API and runtime codes during the
execution of HIP application.
It assists HIP development team in the development of HIP runtime, and is useful
for HIP application developers as well.
Depending on the setting of logging level and logging mask, HIP logging will
print different kinds of information, for different types of functionalities
such as HIP APIs, executed kernels, queue commands and queue contents, etc.
## HIP Logging Level:
By Default, HIP logging is disabled, it can be enabled via environment setting,
- AMD_LOG_LEVEL
By default, HIP logging is disabled, it can be enabled via the `AMD_LOG_LEVEL`
environment variable.
The value controls the logging level. The levels are defined as:
The value of the setting controls different logging level,
```
```cpp
enum LogLevel {
LOG_NONE = 0,
LOG_ERROR = 1,
LOG_WARNING = 2,
LOG_INFO = 3,
LOG_DEBUG = 4
LOG_NONE = 0,
LOG_ERROR = 1,
LOG_WARNING = 2,
LOG_INFO = 3,
LOG_DEBUG = 4
};
```
## HIP Logging Mask:
Logging mask is designed to print types of functionalities during the execution of HIP application.
Logging mask is designed to print types of functionalities during the execution
of HIP application.
It can be set as one of the following values,
```
```cpp
enum LogMask {
LOG_API = 0x00000001, //!< API call
LOG_CMD = 0x00000002, //!< Kernel and Copy Commands and Barriers
@@ -49,39 +54,41 @@ enum LogMask {
};
```
Once AMD_LOG_LEVEL is set, logging mask is set as default with the value 0x7FFFFFFF.
However, for different pupose of logging functionalities, logging mask can be defined as well via environment variable,
- AMD_LOG_MASK
Once `AMD_LOG_LEVEL` is set, logging mask is set as default with the value
`0x7FFFFFFF`.
However, for different logging purposes, the logging mask can also be
defined via the environment variable `AMD_LOG_MASK`.
## HIP Logging command:
To print HIP logging information, the function is defined as
```
#define ClPrint(level, mask, format, ...)
do {
if (AMD_LOG_LEVEL >= level) {
if (AMD_LOG_MASK & mask || mask == amd::LOG_ALWAYS) {
if (AMD_LOG_MASK & amd::LOG_LOCATION) {
amd::log_printf(level, __FILENAME__, __LINE__, format, ##__VA_ARGS__);
} else {
amd::log_printf(level, "", 0, format, ##__VA_ARGS__);
}
}
}
```cpp
#define ClPrint(level, mask, format, ...) \
do { \
if (AMD_LOG_LEVEL >= level) { \
if (AMD_LOG_MASK & mask || mask == amd::LOG_ALWAYS) { \
if (AMD_LOG_MASK & amd::LOG_LOCATION) { \
amd::log_printf(level, __FILENAME__, __LINE__, format, ##__VA_ARGS__);\
} else { \
amd::log_printf(level, "", 0, format, ##__VA_ARGS__); \
} \
} \
} \
} while (false)
```
So in HIP code, call ClPrint() function with proper input varibles as needed, for example,
```
So in HIP code, call the `ClPrint()` function with proper input variables as needed,
for example,
```cpp
ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Initializing HSA stack.");
```
## HIP Logging Example:
Below is an example to enable HIP logging and get logging information during execution of hipinfo,
Below is an example to enable HIP logging and get logging information during
execution of hipinfo,
```
```console
user@user-test:~/hip/bin$ export AMD_LOG_LEVEL=4
user@user-test:~/hip/bin$ ./hipinfo
@@ -174,14 +181,13 @@ memInfo.free: 7.98 GB (100%)
## HIP Logging Tips:
- HIP logging works for both release and debug version of HIP application.
- Logging function with different logging level can be called in the code as needed.
- Logging function with different logging level can be called in the code as
needed.
- Information with logging level less than AMD_LOG_LEVEL will be printed.
- To save the HIP logging output in a file, redirect the output to that file
on the command line when running the application at the terminal, for example,
- If need to save the HIP logging output information in a file, just define the file at the command when run the application at the terminal, for example,
```
```console
user@user-test:~/hip/bin$ ./hipinfo > ~/hip_log.txt
```
@@ -1,21 +1,6 @@
# HIP Debugging
There are some techniques provided in HIP for developers to trace and debug codes during execution, this section describes some details and practical suggestions on debugging.
Table of Contents
=================
* [ Debugging Tools](#debugging-tools)
* [Using ltrace](#using-ltrace)
* [Using ROCgdb](#using-rocgdb)
* [Other Debugging Tools](#Other-debugging-tools)
* [ Debugging HIP Application](#debugging-hip-application)
* [ Useful Environment Variables](#useful-environment-variables)
* [Kernel Enqueue Serialization](#kernel-enqueue-serialization)
* [Making Device visible](#making-device-visible)
* [Dump code object](#dump-code-object)
* [HSA related environment variables](#HSA-related-environment-variables)
* [ General Debugging Tips](#general-debugging-tips)
## Debugging tools
### Using ltrace
@@ -27,7 +12,7 @@ The trace can also show performance issues related to accidental calls to expens
Here's a simple sample with command-line to trace hip APIs and output:
```
```console
$ ltrace -C -e "hip*" ./hipGetChanDesc
hipGetChanDesc->hipCreateChannelDesc(0x7ffdc4b66860, 32, 0, 0) = 0x7ffdc4b66860
hipGetChanDesc->hipMallocArray(0x7ffdc4b66840, 0x7ffdc4b66860, 8, 8) = 0
@@ -39,7 +24,7 @@ PASSED!
Another sample below with command-line only trace hsa APIs and output:
```
```console
$ ltrace -C -e "hsa*" ./hipGetChanDesc
libamdhip64.so.4->hsa_init(0, 0x7fff325a69d0, 0x9c80e0, 0 <unfinished ...>
libhsa-runtime64.so.1->hsaKmtOpenKFD(0x7fff325a6590, 0x9c38c0, 0, 1) = 0
@@ -98,7 +83,7 @@ For details, see (https://github.com/ROCm-Developer-Tools/ROCgdb).
Below is a sample showing how to use ROCgdb to run and debug a HIP application. rocgdb is installed with the ROCm package in the folder /opt/rocm/bin.
```
```console
$ export PATH=$PATH:/opt/rocm/bin
$ rocgdb ./hipTexObjPitch
GNU gdb (rocm-dkms-no-npi-hipclang-6549) 10.1
@@ -133,7 +118,7 @@ There are also other debugging tools available online developers can google and
Below is an example to show how to get useful information from the debugger while running a simple memory copy test, which caused an issue of segmentation fault.
```
```console
test: simpleTest2<?> numElements=4194304 sizeElements=4194304 bytes
Segmentation fault (core dumped)
@@ -219,12 +204,12 @@ For system with multiple devices, it's possible to make only certain device(s) v
HIP_VISIBLE_DEVICES, only devices whose index is present in the sequence are visible to HIP.
For example,
```
```console
$ HIP_VISIBLE_DEVICES=0,1
```
or in the application,
```
```cpp
if (totalDeviceNum > 2) {
setenv("HIP_VISIBLE_DEVICES", "0,1,2", 1);
assert(getDeviceNumber(false) == 3);
@@ -272,7 +257,7 @@ The following is the summary of the most useful environment variables in HIP.
(gdb) set env AMD_SERIALIZE_KERNEL 3
```
- The fault will be caught by the runtime but was actually generated by an asynchronous command running on the GPU. So, the GDB backtrace will show a path in the runtime.
- To determine the true location of the fault, force the kernels to execute synchronously by setting the environment variables AMD_SERIALIZE_KERNEL=3 AMD_SERIALIZE_COPY=3. This will force HIP runtime to wait for the kernel to finish executing before returning. If the fault occurs during the execution of a kernel, you can see the code which launched the kernel inside the backtrace. A bit of guesswork is required to determine which thread is actually causing the issue - typically it will be the thread which is waiting inside the libhsa-runtime64.so.
- To determine the true location of the fault, force the kernels to execute synchronously by setting the environment variables AMD_SERIALIZE_KERNEL=3 AMD_SERIALIZE_COPY=3. This will force HIP runtime to wait for the kernel to finish executing before returning. If the fault occurs during the execution of a kernel, you can see the code which launched the kernel inside the backtrace. A bit of guesswork is required to determine which thread is actually causing the issue - typically it will be the thread which is waiting inside the `libhsa-runtime64.so`.
- VM faults inside kernels can be caused by:
- incorrect code (e.g. a for loop which extends past array boundaries),
- memory issues - kernel arguments which are invalid (null pointers, unregistered host pointers, bad pointers),
+3 -14
Voir le fichier
@@ -1,16 +1,4 @@
## Table of Contents
<!-- toc -->
- [Install HIP](#installing-hip)
* [Prerequisites](#prerequisites)
* [AMD Platform](#amd-platform)
* [NVIDIA Platform](#nvidia-platform)
- [Verify your installation](#verify-your-installation)
- [How to build HIP from source](#how-to-build-hip-from-source)
<!-- tocstop -->
# Install HIP
# Installing HIP
HIP can be installed either on AMD ROCm platform with HIP-Clang compiler, or a CUDA platform with nvcc installed.
@@ -59,6 +47,7 @@ Run hipconfig (instructions below assume default installation path) :
# How to build HIP from source
Developers can build HIP from source on either AMD or NVIDIA platforms; see detailed instructions at [building HIP](docs/markdown/hip_build.md).
Developers can build HIP from source on either AMD or NVIDIA platforms, see
detailed instructions at {doc}`/developer_guide/build`
+37
Voir le fichier
@@ -0,0 +1,37 @@
# HIP Documentation
HIP is a C++ runtime API and kernel language that allows developers to create
portable applications for AMD and NVIDIA GPUs from single source code.
## Overview
::::{grid} 1 1 2 2
:gutter: 1
:::{grid-item-card} User Guide
- {doc}`/user_guide/programming_manual`
- {doc}`/user_guide/hip_rtc`
- {doc}`/user_guide/faq`
:::
:::{grid-item-card} How to Guides
- {doc}`/how_to_guides/install`
- {doc}`/how_to_guides/debugging`
:::
:::{grid-item-card} Reference
- {doc}`/.doxygen/docBin/html/index`
- {doc}`/.doxygen/docBin/html/modules`
- {doc}`/reference/kernel_language`
- {doc}`/reference/math_api`
- {doc}`/reference/terms`
- {doc}`/reference/deprecated_api_list`
:::
:::{grid-item-card} Developer Guide
- {doc}`/developer_guide/build`
- {doc}`/developer_guide/logging`
- {doc}`/developer_guide/contributing`
:::
::::
-1
Voir le fichier
@@ -1 +0,0 @@
Document has been moved to https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/doc/markdown/CUDA_Driver_API_functions_supported_by_HIP.md
Fichier diff supprimé car celui-ci est trop grand Voir la Diff
-763
Voir le fichier
@@ -1,763 +0,0 @@
# Support of Clang options
Clang version: clang version 12.0.0 927e2776dc0e4bb0119efbc5ea405b7425d7f4ac
|Option|Support|Description|
|-------|------|-------|
|`-###`|Supported|`Print (but do not run) the commands to run for this compilation`|
|`--analyzer-output <value>`|Supported|`Static analyzer report output format (html\|plist\|plist-multi-file\|plist-html\|sarif\|text).`|
|`--analyze`|Supported|`Run the static analyzer`|
|`-arcmt-migrate-emit-errors`|Unsupported|`Emit ARC errors even if the migrator can fix them`|
|`-arcmt-migrate-report-output <value>`|Unsupported|`Output path for the plist report`|
|`-byteswapio`|Supported|`Swap byte-order for unformatted input/output`|
|`-B <dir>`|Supported|`Add <dir> to search path for binaries and object files used implicitly`|
|`-CC`|Supported|`Include comments from within macros in preprocessed output`|
|`-cl-denorms-are-zero`|Supported|`OpenCL only. Allow denormals to be flushed to zero.`|
|`-cl-fast-relaxed-math`|Supported|`OpenCL only. Sets -cl-finite-math-only and -cl-unsafe-math-optimizations, and defines __FAST_RELAXED_MATH__.`|
|`-cl-finite-math-only`|Supported|`OpenCL only. Allow floating-point optimizations that assume arguments and results are not NaNs or +-Inf.`|
|`-cl-fp32-correctly-rounded-divide-sqrt`|Supported|`OpenCL only. Specify that single precision floating-point divide and sqrt used in the program source are correctly rounded.`|
|`-cl-kernel-arg-info`|Supported|`OpenCL only. Generate kernel argument metadata.`|
|`-cl-mad-enable`|Supported|`OpenCL only. Allow use of less precise MAD computations in the generated binary.`|
|`-cl-no-signed-zeros`|Supported|`OpenCL only. Allow use of less precise no signed zeros computations in the generated binary.`|
|`-cl-opt-disable`|Supported|`OpenCL only. This option disables all optimizations. By default optimizations are enabled.`|
|`-cl-single-precision-constant`|Supported|`OpenCL only. Treat double precision floating-point constant as single precision constant.`|
|`-cl-std=<value>`|Supported|`OpenCL language standard to compile for.`|
|`-cl-strict-aliasing`|Supported|`OpenCL only. This option is added for compatibility with OpenCL 1.0.`|
|`-cl-uniform-work-group-size`|Supported|`OpenCL only. Defines that the global work-size be a multiple of the work-group size specified to clEnqueueNDRangeKernel`|
|`-cl-unsafe-math-optimizations`|Supported|`OpenCL only. Allow unsafe floating-point optimizations. Also implies -cl-no-signed-zeros and -cl-mad-enable.`|
|`--config <value>`|Supported|`Specifies configuration file`|
|`--cuda-compile-host-device`|Supported|`Compile CUDA code for both host and device (default). Has no effect on non-CUDA compilations.`|
|`--cuda-device-only`|Supported|`Compile CUDA code for device only`|
|`--cuda-host-only`|Supported|`Compile CUDA code for host only. Has no effect on non-CUDA compilations.`|
|`--cuda-include-ptx=<value>`|Unsupported|`Include PTX for the following GPU architecture (e.g. sm_35) or 'all'. May be specified more than once.`|
|`--cuda-noopt-device-debug`|Unsupported|`Enable device-side debug info generation. Disables ptxas optimizations.`|
|`--cuda-path-ignore-env`|Unsupported|`Ignore environment variables to detect CUDA installation`|
|`--cuda-path=<value>`|Unsupported|`CUDA installation path`|
|`-cxx-isystem <directory>`|Supported|`Add directory to the C++ SYSTEM include search path`|
|`-C`|Supported|`Include comments in preprocessed output`|
|`-c`|Supported|`Only run preprocess, compile, and assemble steps`|
|`-dD`|Supported|`Print macro definitions in -E mode in addition to normal output`|
|`-dependency-dot <value>`|Supported|`Filename to write DOT-formatted header dependencies to`|
|`-dependency-file <value>`|Supported|`Filename (or -) to write dependency output to`|
|`-dI`|Supported|`Print include directives in -E mode in addition to normal output`|
|`-dM`|Supported|`Print macro definitions in -E mode instead of normal output`|
|`-dsym-dir <dir>`|Unsupported|`Directory to output dSYM's (if any) to`|
|`-D <macro>`|Supported|`=<value> Define <macro> to <value> (or 1 if <value> omitted)`|
|`-emit-ast`|Supported|`Emit Clang AST files for source inputs`|
|`-emit-interface-stubs`|Supported|`Generate Interface Stub Files.`|
|`-emit-llvm`|Supported|`Use the LLVM representation for assembler and object files`|
|`-emit-merged-ifs`|Supported|`Generate Interface Stub Files, emit merged text not binary.`|
|`--emit-static-lib`|Supported|`Enable linker job to emit a static library.`|
|`-enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang`|Supported|`Trivial automatic variable initialization to zero is only here for benchmarks, it'll eventually be removed, and I'm OK with that because I'm only using it to benchmark`|
|`-E`|Supported|`Only run the preprocessor`|
|`-fAAPCSBitfieldLoad`|Unsupported|`Follows the AAPCS standard that all volatile bit-field write generates at least one load. (ARM only).`|
|`-faddrsig`|Supported|`Emit an address-significance table`|
|`-faligned-allocation`|Supported|`Enable C++17 aligned allocation functions`|
|`-fallow-editor-placeholders`|Supported|`Treat editor placeholders as valid source code`|
|`-fallow-fortran-gnu-ext`|Supported|`Allow Fortran GNU extensions`|
|`-fansi-escape-codes`|Supported|`Use ANSI escape codes for diagnostics`|
|`-fapple-kext`|Unsupported|`Use Apple's kernel extensions ABI`|
|`-fapple-link-rtlib`|Unsupported|`Force linking the clang builtins runtime library`|
|`-fapple-pragma-pack`|Unsupported|`Enable Apple gcc-compatible #pragma pack handling`|
|`-fapplication-extension`|Unsupported|`Restrict code to those available for App Extensions`|
|`-fbackslash`|Supported|`Treat backslash as C-style escape character`|
|`-fbasic-block-sections=<value>`|Supported|`Place each function's basic blocks in unique sections (ELF Only) : all \| labels \| none \| list=<file>`|
|`-fblocks`|Supported|`Enable the 'blocks' language feature`|
|`-fborland-extensions`|Unsupported|`Accept non-standard constructs supported by the Borland compiler`|
|`-fbuild-session-file=<file>`|Supported|`Use the last modification time of <file> as the build session timestamp`|
|`-fbuild-session-timestamp=<time since Epoch in seconds>`|Supported|`Time when the current build session started`|
|`-fbuiltin-module-map`|Unsupported|`Load the clang builtins module map file.`|
|`-fcall-saved-x10`|Unsupported|`Make the x10 register call-saved (AArch64 only)`|
|`-fcall-saved-x11`|Unsupported|`Make the x11 register call-saved (AArch64 only)`|
|`-fcall-saved-x12`|Unsupported|`Make the x12 register call-saved (AArch64 only)`|
|`-fcall-saved-x13`|Unsupported|`Make the x13 register call-saved (AArch64 only)`|
|`-fcall-saved-x14`|Unsupported|`Make the x14 register call-saved (AArch64 only)`|
|`-fcall-saved-x15`|Unsupported|`Make the x15 register call-saved (AArch64 only)`|
|`-fcall-saved-x18`|Unsupported|`Make the x18 register call-saved (AArch64 only)`|
|`-fcall-saved-x8`|Unsupported|`Make the x8 register call-saved (AArch64 only)`|
|`-fcall-saved-x9`|Unsupported|`Make the x9 register call-saved (AArch64 only)`|
|`-fcf-protection=<value>`|Unsupported|`Instrument control-flow architecture protection. Options: return, branch, full, none.`|
|`-fcf-protection`|Unsupported|`Enable cf-protection in 'full' mode`|
|`-fchar8_t`|Supported|`Enable C++ builtin type char8_t`|
|`-fclang-abi-compat=<version>`|Supported|`Attempt to match the ABI of Clang <version>`|
|`-fcolor-diagnostics`|Supported|`Enable colors in diagnostics`|
|`-fcomment-block-commands=<arg>`|Supported|`Treat each comma separated argument in <arg> as a documentation comment block command`|
|`-fcommon`|Supported|`Place uninitialized global variables in a common block`|
|`-fcomplete-member-pointers`|Supported|`Require member pointer base types to be complete if they would be significant under the Microsoft ABI`|
|`-fconvergent-functions`|Supported|`Assume functions may be convergent`|
|`-fcoroutines-ts`|Supported|`Enable support for the C++ Coroutines TS`|
|`-fcoverage-mapping`|Unsupported|`Generate coverage mapping to enable code coverage analysis`|
|`-fcs-profile-generate=<directory>`|Unsupported|`Generate instrumented code to collect context sensitive execution counts into <directory>/default.profraw (overridden by LLVM_PROFILE_FILE env var)`|
|`-fcs-profile-generate`|Unsupported|`Generate instrumented code to collect context sensitive execution counts into default.profraw (overridden by LLVM_PROFILE_FILE env var)`|
|`-fcuda-approx-transcendentals`|Unsupported|`Use approximate transcendental functions`|
|`-fcuda-flush-denormals-to-zero`|Supported|`Flush denormal floating point values to zero in CUDA device mode.`|
|`-fcuda-short-ptr`|Unsupported|`Use 32-bit pointers for accessing const/local/shared address spaces`|
|`-fcxx-exceptions`|Supported|`Enable C++ exceptions`|
|`-fdata-sections`|Supported|`Place each data in its own section`|
|`-fdebug-compilation-dir <value>`|Supported|`The compilation directory to embed in the debug info.`|
|`-fdebug-default-version=<value>`|Supported|`Default DWARF version to use, if a -g option caused DWARF debug info to be produced`|
|`-fdebug-info-for-profiling`|Supported|`Emit extra debug info to make sample profile more accurate`|
|`-fdebug-macro`|Supported|`Emit macro debug information`|
|`-fdebug-prefix-map=<value>`|Supported|`remap file source paths in debug info`|
|`-fdebug-ranges-base-address`|Supported|`Use DWARF base address selection entries in .debug_ranges`|
|`-fdebug-types-section`|Supported|`Place debug types in their own section (ELF Only)`|
|`-fdeclspec`|Supported|`Allow __declspec as a keyword`|
|`-fdelayed-template-parsing`|Supported|`Parse templated function definitions at the end of the translation unit`|
|`-fdelete-null-pointer-checks`|Supported|`Treat usage of null pointers as undefined behavior (default)`|
|`-fdiagnostics-absolute-paths`|Supported|`Print absolute paths in diagnostics`|
|`-fdiagnostics-hotness-threshold=<number>`|Unsupported|`Prevent optimization remarks from being output if they do not have at least this profile count`|
|`-fdiagnostics-parseable-fixits`|Supported|`Print fix-its in machine parseable form`|
|`-fdiagnostics-print-source-range-info`|Supported|`Print source range spans in numeric form`|
|`-fdiagnostics-show-hotness`|Unsupported|`Enable profile hotness information in diagnostic line`|
|`-fdiagnostics-show-note-include-stack`|Supported|`Display include stacks for diagnostic notes`|
|`-fdiagnostics-show-option`|Supported|`Print option name with mappable diagnostics`|
|`-fdiagnostics-show-template-tree`|Supported|`Print a template comparison tree for differing templates`|
|`-fdigraphs`|Supported|`Enable alternative token representations '<:', ':>', '<%', '%>', '%:', '%:%:' (default)`|
|`-fdiscard-value-names`|Supported|`Discard value names in LLVM IR`|
|`-fdollars-in-identifiers`|Supported|`Allow '$' in identifiers`|
|`-fdouble-square-bracket-attributes`|Supported|`Enable '[[]]' attributes in all C and C++ language modes`|
|`-fdwarf-exceptions`|Unsupported|`Use DWARF style exceptions`|
|`-feliminate-unused-debug-types`|Supported|`Do not emit debug info for defined but unused types`|
|`-fembed-bitcode-marker`|Supported|`Embed placeholder LLVM IR data as a marker`|
|`-fembed-bitcode=<option>`|Supported|`Embed LLVM bitcode (option: off, all, bitcode, marker)`|
|`-fembed-bitcode`|Supported|`Embed LLVM IR bitcode as data`|
|`-femit-all-decls`|Supported|`Emit all declarations, even if unused`|
|`-femulated-tls`|Supported|`Use emutls functions to access thread_local variables`|
|`-fenable-matrix`|Supported|`Enable matrix data type and related builtin functions`|
|`-fexceptions`|Supported|`Enable support for exception handling`|
|`-fexperimental-new-constant-interpreter`|Supported|`Enable the experimental new constant interpreter`|
|`-fexperimental-new-pass-manager`|Supported|`Enables an experimental new pass manager in LLVM.`|
|`-fexperimental-relative-c++-abi-vtables`|Supported|`Use the experimental C++ class ABI for classes with virtual tables`|
|`-fexperimental-strict-floating-point`|Supported|`Enables experimental strict floating point in LLVM.`|
|`-ffast-math`|Supported|`Allow aggressive, lossy floating-point optimizations`|
|`-ffile-prefix-map=<value>`|Supported|`remap file source paths in debug info and predefined preprocessor macros`|
|`-ffine-grained-bitfield-accesses`|Supported|`Use separate accesses for consecutive bitfield runs with legal widths and alignments.`|
|`-ffixed-form`|Supported|`Enable fixed-form format for Fortran`|
|`-ffixed-point`|Supported|`Enable fixed point types`|
|`-ffixed-r19`|Unsupported|`Reserve register r19 (Hexagon only)`|
|`-ffixed-r9`|Unsupported|`Reserve the r9 register (ARM only)`|
|`-ffixed-x10`|Unsupported|`Reserve the x10 register (AArch64/RISC-V only)`|
|`-ffixed-x11`|Unsupported|`Reserve the x11 register (AArch64/RISC-V only)`|
|`-ffixed-x12`|Unsupported|`Reserve the x12 register (AArch64/RISC-V only)`|
|`-ffixed-x13`|Unsupported|`Reserve the x13 register (AArch64/RISC-V only)`|
|`-ffixed-x14`|Unsupported|`Reserve the x14 register (AArch64/RISC-V only)`|
|`-ffixed-x15`|Unsupported|`Reserve the x15 register (AArch64/RISC-V only)`|
|`-ffixed-x16`|Unsupported|`Reserve the x16 register (AArch64/RISC-V only)`|
|`-ffixed-x17`|Unsupported|`Reserve the x17 register (AArch64/RISC-V only)`|
|`-ffixed-x18`|Unsupported|`Reserve the x18 register (AArch64/RISC-V only)`|
|`-ffixed-x19`|Unsupported|`Reserve the x19 register (AArch64/RISC-V only)`|
|`-ffixed-x1`|Unsupported|`Reserve the x1 register (AArch64/RISC-V only)`|
|`-ffixed-x20`|Unsupported|`Reserve the x20 register (AArch64/RISC-V only)`|
|`-ffixed-x21`|Unsupported|`Reserve the x21 register (AArch64/RISC-V only)`|
|`-ffixed-x22`|Unsupported|`Reserve the x22 register (AArch64/RISC-V only)`|
|`-ffixed-x23`|Unsupported|`Reserve the x23 register (AArch64/RISC-V only)`|
|`-ffixed-x24`|Unsupported|`Reserve the x24 register (AArch64/RISC-V only)`|
|`-ffixed-x25`|Unsupported|`Reserve the x25 register (AArch64/RISC-V only)`|
|`-ffixed-x26`|Unsupported|`Reserve the x26 register (AArch64/RISC-V only)`|
|`-ffixed-x27`|Unsupported|`Reserve the x27 register (AArch64/RISC-V only)`|
|`-ffixed-x28`|Unsupported|`Reserve the x28 register (AArch64/RISC-V only)`|
|`-ffixed-x29`|Unsupported|`Reserve the x29 register (AArch64/RISC-V only)`|
|`-ffixed-x2`|Unsupported|`Reserve the x2 register (AArch64/RISC-V only)`|
|`-ffixed-x30`|Unsupported|`Reserve the x30 register (AArch64/RISC-V only)`|
|`-ffixed-x31`|Unsupported|`Reserve the x31 register (AArch64/RISC-V only)`|
|`-ffixed-x3`|Unsupported|`Reserve the x3 register (AArch64/RISC-V only)`|
|`-ffixed-x4`|Unsupported|`Reserve the x4 register (AArch64/RISC-V only)`|
|`-ffixed-x5`|Unsupported|`Reserve the x5 register (AArch64/RISC-V only)`|
|`-ffixed-x6`|Unsupported|`Reserve the x6 register (AArch64/RISC-V only)`|
|`-ffixed-x7`|Unsupported|`Reserve the x7 register (AArch64/RISC-V only)`|
|`-ffixed-x8`|Unsupported|`Reserve the x8 register (AArch64/RISC-V only)`|
|`-ffixed-x9`|Unsupported|`Reserve the x9 register (AArch64/RISC-V only)`|
|`-fforce-dwarf-frame`|Supported|`Always emit a debug frame section`|
|`-fforce-emit-vtables`|Supported|`Emits more virtual tables to improve devirtualization`|
|`-fforce-enable-int128`|Supported|`Enable support for int128_t type`|
|`-ffp-contract=<value>`|Supported|`Form fused FP ops (e.g. FMAs): fast (everywhere) \| on (according to FP_CONTRACT pragma) \| off (never fuse). Default is 'fast' for CUDA/HIP and 'on' otherwise.`|
|`-ffp-exception-behavior=<value>`|Supported|`Specifies the exception behavior of floating-point operations.`|
|`-ffp-model=<value>`|Supported|`Controls the semantics of floating-point calculations.`|
|`-ffree-form`|Supported|`Enable free-form format for Fortran`|
|`-ffreestanding`|Supported|`Assert that the compilation takes place in a freestanding environment`|
|`-ffunc-args-alias`|Supported|`Function argument may alias (equivalent to ansi alias)`|
|`-ffunction-sections`|Supported|`Place each function in its own section`|
|`-fglobal-isel`|Supported|`Enables the global instruction selector`|
|`-fgnu-keywords`|Supported|`Allow GNU-extension keywords regardless of language standard`|
|`-fgnu-runtime`|Unsupported|`Generate output compatible with the standard GNU Objective-C runtime`|
|`-fgnu89-inline`|Unsupported|`Use the gnu89 inline semantics`|
|`-fgnuc-version=<value>`|Supported|`Sets various macros to claim compatibility with the given GCC version (default is 4.2.1)`|
|`-fgpu-allow-device-init`|Supported|`Allow device side init function in HIP`|
|`-fgpu-rdc`|Supported|`Generate relocatable device code, also known as separate compilation mode`|
|`-fhip-new-launch-api`|Supported|`Use new kernel launching API for HIP`|
|`-fignore-exceptions`|Supported|`Enable support for ignoring exception handling constructs`|
|`-fimplicit-module-maps`|Unsupported|`Implicitly search the file system for module map files.`|
|`-finline-functions`|Supported|`Inline suitable functions`|
|`-finline-hint-functions`|Supported|`Inline functions which are (explicitly or implicitly) marked inline`|
|`-finstrument-function-entry-bare`|Unsupported|`Instrument function entry only, after inlining, without arguments to the instrumentation call`|
|`-finstrument-functions-after-inlining`|Unsupported|`Like -finstrument-functions, but insert the calls after inlining`|
|`-finstrument-functions`|Unsupported|`Generate calls to instrument function entry and exit`|
|`-fintegrated-as`|Supported|`Enable the integrated assembler`|
|`-fintegrated-cc1`|Supported|`Run cc1 in-process`|
|`-fjump-tables`|Supported|`Use jump tables for lowering switches`|
|`-fkeep-static-consts`|Supported|`Keep static const variables if unused`|
|`-flax-vector-conversions=<value>`|Supported|`Enable implicit vector bit-casts`|
|`-flto-jobs=<value>`|Unsupported|`Controls the backend parallelism of -flto=thin (default of 0 means the number of threads will be derived from the number of CPUs detected)`|
|`-flto=<value>`|Unsupported|`Set LTO mode to either 'full' or 'thin'`|
|`-flto`|Unsupported|`Enable LTO in 'full' mode`|
|`-fmacro-prefix-map=<value>`|Supported|`remap file source paths in predefined preprocessor macros`|
|`-fmath-errno`|Supported|`Require math functions to indicate errors by setting errno`|
|`-fmax-tokens=<value>`|Supported|`Max total number of preprocessed tokens for -Wmax-tokens.`|
|`-fmax-type-align=<value>`|Supported|`Specify the maximum alignment to enforce on pointers lacking an explicit alignment`|
|`-fmemory-profile`|Supported|`Enable heap memory profiling`|
|`-fmerge-all-constants`|Supported|`Allow merging of constants`|
|`-fmessage-length=<value>`|Supported|`Format message diagnostics so that they fit within N columns`|
|`-fmodule-file=[<name>=]<file>`|Unsupported|`Specify the mapping of module name to precompiled module file, or load a module file if name is omitted.`|
|`-fmodule-map-file=<file>`|Unsupported|`Load this module map file`|
|`-fmodule-name=<name>`|Unsupported|`Specify the name of the module to build`|
|`-fmodules-cache-path=<directory>`|Unsupported|`Specify the module cache path`|
|`-fmodules-decluse`|Unsupported|`Require declaration of modules used within a module`|
|`-fmodules-disable-diagnostic-validation`|Unsupported|`Disable validation of the diagnostic options when loading the module`|
|`-fmodules-ignore-macro=<value>`|Unsupported|`Ignore the definition of the given macro when building and loading modules`|
|`-fmodules-prune-after=<seconds>`|Unsupported|`Specify the interval (in seconds) after which a module file will be considered unused`|
|`-fmodules-prune-interval=<seconds>`|Unsupported|`Specify the interval (in seconds) between attempts to prune the module cache`|
|`-fmodules-search-all`|Unsupported|`Search even non-imported modules to resolve references`|
|`-fmodules-strict-decluse`|Unsupported|`Like -fmodules-decluse but requires all headers to be in modules`|
|`-fmodules-ts`|Unsupported|`Enable support for the C++ Modules TS`|
|`-fmodules-user-build-path <directory>`|Unsupported|`Specify the module user build path`|
|`-fmodules-validate-input-files-content`|Supported|`Validate PCM input files based on content if mtime differs`|
|`-fmodules-validate-once-per-build-session`|Unsupported|`Don't verify input files for the modules if the module has been successfully validated or loaded during this build session`|
|`-fmodules-validate-system-headers`|Supported|`Validate the system headers that a module depends on when loading the module`|
|`-fmodules`|Unsupported|`Enable the 'modules' language feature`|
|`-fms-compatibility-version=<value>`|Supported|`Dot-separated value representing the Microsoft compiler version number to report in _MSC_VER (0 = don't define it (default))`|
|`-fms-compatibility`|Supported|`Enable full Microsoft Visual C++ compatibility`|
|`-fms-extensions`|Supported|`Accept some non-standard constructs supported by the Microsoft compiler`|
|`-fmsc-version=<value>`|Supported|`Microsoft compiler version number to report in _MSC_VER (0 = don't define it (default))`|
|`-fnew-alignment=<align>`|Supported|`Specifies the largest alignment guaranteed by '::operator new(size_t)'`|
|`-fno-addrsig`|Supported|`Don't emit an address-significance table`|
|`-fno-allow-fortran-gnu-ext`|Supported|`Allow Fortran GNU extensions`|
|`-fno-assume-sane-operator-new`|Supported|`Don't assume that C++'s global operator new can't alias any pointer`|
|`-fno-autolink`|Supported|`Disable generation of linker directives for automatic library linking`|
|`-fno-backslash`|Supported|`Treat backslash like any other character in character strings`|
|`-fno-builtin-<value>`|Supported|`Disable implicit builtin knowledge of a specific function`|
|`-fno-builtin`|Supported|`Disable implicit builtin knowledge of functions`|
|`-fno-c++-static-destructors`|Supported|`Disable C++ static destructor registration`|
|`-fno-char8_t`|Supported|`Disable C++ builtin type char8_t`|
|`-fno-color-diagnostics`|Supported|`Disable colors in diagnostics`|
|`-fno-common`|Supported|`Compile common globals like normal definitions`|
|`-fno-complete-member-pointers`|Supported|`Do not require member pointer base types to be complete if they would be significant under the Microsoft ABI`|
|`-fno-constant-cfstrings`|Supported|`Disable creation of CodeFoundation-type constant strings`|
|`-fno-coverage-mapping`|Supported|`Disable code coverage analysis`|
|`-fno-crash-diagnostics`|Supported|`Disable auto-generation of preprocessed source files and a script for reproduction during a clang crash`|
|`-fno-cuda-approx-transcendentals`|Unsupported|`Don't use approximate transcendental functions`|
|`-fno-debug-macro`|Supported|`Do not emit macro debug information`|
|`-fno-declspec`|Unsupported|`Disallow __declspec as a keyword`|
|`-fno-delayed-template-parsing`|Supported|`Disable delayed template parsing`|
|`-fno-delete-null-pointer-checks`|Supported|`Do not treat usage of null pointers as undefined behavior`|
|`-fno-diagnostics-fixit-info`|Supported|`Do not include fixit information in diagnostics`|
|`-fno-digraphs`|Supported|`Disallow alternative token representations '<:', ':>', '<%', '%>', '%:', '%:%:'`|
|`-fno-discard-value-names`|Supported|`Do not discard value names in LLVM IR`|
|`-fno-dollars-in-identifiers`|Supported|`Disallow '$' in identifiers`|
|`-fno-double-square-bracket-attributes`|Supported|`Disable '[[]]' attributes in all C and C++ language modes`|
|`-fno-elide-constructors`|Supported|`Disable C++ copy constructor elision`|
|`-fno-elide-type`|Supported|`Do not elide types when printing diagnostics`|
|`-fno-eliminate-unused-debug-types`|Supported|`Emit debug info for defined but unused types`|
|`-fno-exceptions`|Supported|`Disable support for exception handling`|
|`-fno-experimental-new-pass-manager`|Supported|`Disables an experimental new pass manager in LLVM.`|
|`-fno-experimental-relative-c++-abi-vtables`|Supported|`Do not use the experimental C++ class ABI for classes with virtual tables`|
|`-fno-fine-grained-bitfield-accesses`|Supported|`Use large-integer access for consecutive bitfield runs.`|
|`-fno-fixed-form`|Supported|`Disable fixed-form format for Fortran`|
|`-fno-fixed-point`|Supported|`Disable fixed point types`|
|`-fno-force-enable-int128`|Supported|`Disable support for int128_t type`|
|`-fno-fortran-main`|Supported|`Don't link in Fortran main`|
|`-fno-free-form`|Supported|`Disable free-form format for Fortran`|
|`-fno-func-args-alias`|Supported|`Function argument may alias (equivalent to ansi alias)`|
|`-fno-global-isel`|Supported|`Disables the global instruction selector`|
|`-fno-gnu-inline-asm`|Supported|`Disable GNU style inline asm`|
|`-fno-gpu-allow-device-init`|Supported|`Don't allow device side init function in HIP`|
|`-fno-hip-new-launch-api`|Supported|`Don't use new kernel launching API for HIP`|
|`-fno-integrated-as`|Supported|`Disable the integrated assembler`|
|`-fno-integrated-cc1`|Supported|`Spawn a separate process for each cc1`|
|`-fno-jump-tables`|Supported|`Do not use jump tables for lowering switches`|
|`-fno-keep-static-consts`|Supported|`Don't keep static const variables if unused`|
|`-fno-lto`|Supported|`Disable LTO mode (default)`|
|`-fno-memory-profile`|Supported|`Disable heap memory profiling`|
|`-fno-merge-all-constants`|Supported|`Disallow merging of constants`|
|`-fno-no-access-control`|Supported|`Disable C++ access control`|
|`-fno-objc-infer-related-result-type`|Supported|`do not infer Objective-C related result type based on method family`|
|`-fno-operator-names`|Supported|`Do not treat C++ operator name keywords as synonyms for operators`|
|`-fno-pch-codegen`|Supported|`Do not generate code for uses of this PCH that assumes an explicit object file will be built for the PCH`|
|`-fno-pch-debuginfo`|Supported|`Do not generate debug info for types in an object file built from this PCH and do not generate them elsewhere`|
|`-fno-plt`|Supported|`Use GOT indirection instead of PLT to make external function calls (x86 only)`|
|`-fno-preserve-as-comments`|Supported|`Do not preserve comments in inline assembly`|
|`-fno-profile-generate`|Supported|`Disable generation of profile instrumentation.`|
|`-fno-profile-instr-generate`|Supported|`Disable generation of profile instrumentation.`|
|`-fno-profile-instr-use`|Supported|`Disable using instrumentation data for profile-guided optimization`|
|`-fno-register-global-dtors-with-atexit`|Supported|`Don't use atexit or __cxa_atexit to register global destructors`|
|`-fno-rtlib-add-rpath`|Supported|`Do not add -rpath with architecture-specific resource directory to the linker flags`|
|`-fno-rtti-data`|Supported|`Disable generation of RTTI data`|
|`-fno-rtti`|Supported|`Disable generation of rtti information`|
|`-fno-sanitize-address-poison-custom-array-cookie`|Supported on Host only|`Disable poisoning array cookies when using custom operator new[] in AddressSanitizer`|
|`-fno-sanitize-address-use-after-scope`|Supported on Host only|`Disable use-after-scope detection in AddressSanitizer`|
|`-fno-sanitize-address-use-odr-indicator`|Supported on Host only|`Disable ODR indicator globals`|
|`-fno-sanitize-blacklist`|Supported on Host only|`Don't use blacklist file for sanitizers`|
|`-fno-sanitize-cfi-canonical-jump-tables`|Supported on Host only|`Do not make the jump table addresses canonical in the symbol table`|
|`-fno-sanitize-cfi-cross-dso`|Supported on Host only|`Disable control flow integrity (CFI) checks for cross-DSO calls.`|
|`-fno-sanitize-coverage=<value>`|Supported on Host only|`Disable specified features of coverage instrumentation for Sanitizers`|
|`-fno-sanitize-memory-track-origins`|Supported on Host only|`Disable origins tracking in MemorySanitizer`|
|`-fno-sanitize-memory-use-after-dtor`|Supported on Host only|`Disable use-after-destroy detection in MemorySanitizer`|
|`-fno-sanitize-recover=<value>`|Supported on Host only|`Disable recovery for specified sanitizers`|
|`-fno-sanitize-stats`|Supported on Host only|`Disable sanitizer statistics gathering.`|
|`-fno-sanitize-thread-atomics`|Supported on Host only|`Disable atomic operations instrumentation in ThreadSanitizer`|
|`-fno-sanitize-thread-func-entry-exit`|Supported on Host only|`Disable function entry/exit instrumentation in ThreadSanitizer`|
|`-fno-sanitize-thread-memory-access`|Supported on Host only|`Disable memory access instrumentation in ThreadSanitizer`|
|`-fno-sanitize-trap=<value>`|Supported on Host only|`Disable trapping for specified sanitizers`|
|`-fno-sanitize-trap`|Supported on Host only|`Disable trapping for all sanitizers`|
|`-fno-short-wchar`|Supported|`Force wchar_t to be an unsigned int`|
|`-fno-show-column`|Supported|`Do not include column number on diagnostics`|
|`-fno-show-source-location`|Supported|`Do not include source location information with diagnostics`|
|`-fno-signed-char`|Supported|`char is unsigned`|
|`-fno-signed-zeros`|Supported|`Allow optimizations that ignore the sign of floating point zeros`|
|`-fno-spell-checking`|Supported|`Disable spell-checking`|
|`-fno-split-machine-functions`|Supported|`Disable late function splitting using profile information (x86 ELF)`|
|`-fno-stack-clash-protection`|Supported|`Disable stack clash protection`|
|`-fno-stack-protector`|Supported|`Disable the use of stack protectors`|
|`-fno-standalone-debug`|Supported|`Limit debug information produced to reduce size of debug binary`|
|`-fno-strict-float-cast-overflow`|Supported|`Relax language rules and try to match the behavior of the target's native float-to-int conversion instructions`|
|`-fno-strict-return`|Supported|`Don't treat control flow paths that fall off the end of a non-void function as unreachable`|
|`-fno-sycl`|Unsupported|`Disable SYCL kernels compilation for device`|
|`-fno-temp-file`|Supported|`Directly create compilation output files. This may lead to incorrect incremental builds if the compiler crashes`|
|`-fno-threadsafe-statics`|Supported|`Do not emit code to make initialization of local statics thread safe`|
|`-fno-trigraphs`|Supported|`Do not process trigraph sequences`|
|`-fno-unique-section-names`|Supported|`Don't use unique names for text and data sections`|
|`-fno-unroll-loops`|Supported|`Turn off loop unroller`|
|`-fno-use-cxa-atexit`|Supported|`Don't use __cxa_atexit for calling destructors`|
|`-fno-use-flang-math-libs`|Supported|`Do not use Flang internal runtime math library; use LLVM math intrinsics instead.`|
|`-fno-use-init-array`|Supported|`Use .ctors/.dtors instead of .init_array/.fini_array`|
|`-fno-visibility-inlines-hidden-static-local-var`|Supported|`Disables -fvisibility-inlines-hidden-static-local-var (this is the default on non-darwin targets)`|
|`-fno-xray-function-index`|Unsupported|`Omit function index section at the expense of single-function patching performance`|
|`-fno-zero-initialized-in-bss`|Supported|`Don't place zero initialized data in BSS`|
|`-fobjc-arc-exceptions`|Unsupported|`Use EH-safe code when synthesizing retains and releases in -fobjc-arc`|
|`-fobjc-arc`|Unsupported|`Synthesize retain and release calls for Objective-C pointers`|
|`-fobjc-exceptions`|Unsupported|`Enable Objective-C exceptions`|
|`-fobjc-runtime=<value>`|Unsupported|`Specify the target Objective-C runtime kind and version`|
|`-fobjc-weak`|Unsupported|`Enable ARC-style weak references in Objective-C`|
|`-fopenmp-simd`|Unsupported|`Emit OpenMP code only for SIMD-based constructs.`|
|`-fopenmp-targets=<value>`|Unsupported|`Specify comma-separated list of triples OpenMP offloading targets to be supported`|
|`-fopenmp`|Unsupported|`Parse OpenMP pragmas and generate parallel code.`|
|`-foptimization-record-file=<file>`|Supported|`Specify the output name of the file containing the optimization remarks. Implies -fsave-optimization-record. On Darwin platforms, this cannot be used with multiple -arch <arch> options.`|
|`-foptimization-record-passes=<regex>`|Supported|`Only include passes which match a specified regular expression in the generated optimization record (by default, include all passes)`|
|`-forder-file-instrumentation`|Supported|`Generate instrumented code to collect order file into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var)`|
|`-fpack-struct=<value>`|Unsupported|`Specify the default maximum struct packing alignment`|
|`-fpascal-strings`|Supported|`Recognize and construct Pascal-style string literals`|
|`-fpass-plugin=<dsopath>`|Supported|`Load pass plugin from a dynamic shared object file (only with new pass manager).`|
|`-fpatchable-function-entry=<N,M>`|Supported|`Generate M NOPs before function entry and N-M NOPs after function entry`|
|`-fpcc-struct-return`|Unsupported|`Override the default ABI to return all structs on the stack`|
|`-fpch-codegen`|Supported|`Generate code for uses of this PCH that assumes an explicit object file will be built for the PCH`|
|`-fpch-debuginfo`|Supported|`Generate debug info for types in an object file built from this PCH and do not generate them elsewhere`|
|`-fpch-instantiate-templates`|Supported|`Instantiate templates already while building a PCH`|
|`-fpch-validate-input-files-content`|Supported|`Validate PCH input files based on content if mtime differs`|
|`-fplugin=<dsopath>`|Supported|`Load the named plugin (dynamic shared object)`|
|`-fprebuilt-module-path=<directory>`|Unsupported|`Specify the prebuilt module path`|
|`-fprofile-exclude-files=<value>`|Unsupported|`Instrument only functions from files where names don't match all the regexes separated by a semi-colon`|
|`-fprofile-filter-files=<value>`|Unsupported|`Instrument only functions from files where names match any regex separated by a semi-colon`|
|`-fprofile-generate=<directory>`|Unsupported|`Generate instrumented code to collect execution counts into <directory>/default.profraw (overridden by LLVM_PROFILE_FILE env var)`|
|`-fprofile-generate`|Unsupported|`Generate instrumented code to collect execution counts into default.profraw (overridden by LLVM_PROFILE_FILE env var)`|
|`-fprofile-instr-generate=<file>`|Unsupported|`Generate instrumented code to collect execution counts into <file> (overridden by LLVM_PROFILE_FILE env var)`|
|`-fprofile-instr-generate`|Unsupported|`Generate instrumented code to collect execution counts into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var)`|
|`-fprofile-instr-use=<value>`|Unsupported|`Use instrumentation data for profile-guided optimization`|
|`-fprofile-remapping-file=<file>`|Unsupported|`Use the remappings described in <file> to match the profile data against names in the program`|
|`-fprofile-sample-accurate`|Unsupported|`Specifies that the sample profile is accurate`|
|`-fprofile-sample-use=<value>`|Unsupported|`Enable sample-based profile guided optimizations`|
|`-fprofile-use=<pathname>`|Unsupported|`Use instrumentation data for profile-guided optimization. If pathname is a directory, it reads from <pathname>/default.profdata. Otherwise, it reads from file <pathname>.`|
|`-freciprocal-math`|Supported|`Allow division operations to be reassociated`|
|`-freg-struct-return`|Unsupported|`Override the default ABI to return small structs in registers`|
|`-fregister-global-dtors-with-atexit`|Supported|`Use atexit or __cxa_atexit to register global destructors`|
|`-frelaxed-template-template-args`|Supported|`Enable C++17 relaxed template template argument matching`|
|`-freroll-loops`|Supported|`Turn on loop reroller`|
|`-fropi`|Unsupported|`Generate read-only position independent code (ARM only)`|
|`-frtlib-add-rpath`|Supported|`Add -rpath with architecture-specific resource directory to the linker flags`|
|`-frwpi`|Unsupported|`Generate read-write position independent code (ARM only)`|
|`-fsanitize-address-field-padding=<value>`|Supported on Host only|`Level of field padding for AddressSanitizer`|
|`-fsanitize-address-globals-dead-stripping`|Supported on Host only|`Enable linker dead stripping of globals in AddressSanitizer`|
|`-fsanitize-address-poison-custom-array-cookie`|Supported on Host only|`Enable poisoning array cookies when using custom operator new[] in AddressSanitizer`|
|`-fsanitize-address-use-after-scope`|Supported on Host only|`Enable use-after-scope detection in AddressSanitizer`|
|`-fsanitize-address-use-odr-indicator`|Supported on Host only|`Enable ODR indicator globals to avoid false ODR violation reports in partially sanitized programs at the cost of an increase in binary size`|
|`-fsanitize-blacklist=<value>`|Supported on Host only|`Path to blacklist file for sanitizers`|
|`-fsanitize-cfi-canonical-jump-tables`|Supported on Host only|`Make the jump table addresses canonical in the symbol table`|
|`-fsanitize-cfi-cross-dso`|Supported on Host only|`Enable control flow integrity (CFI) checks for cross-DSO calls.`|
|`-fsanitize-cfi-icall-generalize-pointers`|Supported on Host only|`Generalize pointers in CFI indirect call type signature checks`|
|`-fsanitize-coverage-allowlist=<value>`|Supported on Host only|`Restrict sanitizer coverage instrumentation exclusively to modules and functions that match the provided special case list, except the blocked ones`|
|`-fsanitize-coverage-blacklist=<value>`|Supported on Host only|`Deprecated, use -fsanitize-coverage-blocklist= instead`|
|`-fsanitize-coverage-blocklist=<value>`|Supported on Host only|`Disable sanitizer coverage instrumentation for modules and functions that match the provided special case list, even the allowed ones`|
|`-fsanitize-coverage-whitelist=<value>`|Supported on Host only|`Deprecated, use -fsanitize-coverage-allowlist= instead`|
|`-fsanitize-coverage=<value>`|Supported on Host only|`Specify the type of coverage instrumentation for Sanitizers`|
|`-fsanitize-hwaddress-abi=<value>`|Supported on Host only|`Select the HWAddressSanitizer ABI to target (interceptor or platform, default interceptor). This option is currently unused.`|
|`-fsanitize-memory-track-origins=<value>`|Supported on Host only|`Enable origins tracking in MemorySanitizer`|
|`-fsanitize-memory-track-origins`|Supported on Host only|`Enable origins tracking in MemorySanitizer`|
|`-fsanitize-memory-use-after-dtor`|Supported on Host only|`Enable use-after-destroy detection in MemorySanitizer`|
|`-fsanitize-recover=<value>`|Supported on Host only|`Enable recovery for specified sanitizers`|
|`-fsanitize-stats`|Supported on Host only|`Enable sanitizer statistics gathering.`|
|`-fsanitize-system-blacklist=<value>`|Supported on Host only|`Path to system blacklist file for sanitizers`|
|`-fsanitize-thread-atomics`|Supported on Host only|`Enable atomic operations instrumentation in ThreadSanitizer (default)`|
|`-fsanitize-thread-func-entry-exit`|Supported on Host only|`Enable function entry/exit instrumentation in ThreadSanitizer (default)`|
|`-fsanitize-thread-memory-access`|Supported on Host only|`Enable memory access instrumentation in ThreadSanitizer (default)`|
|`-fsanitize-trap=<value>`|Supported on Host only|`Enable trapping for specified sanitizers`|
|`-fsanitize-trap`|Supported on Host only|`Enable trapping for all sanitizers`|
|`-fsanitize-undefined-strip-path-components=<number>`|Supported on Host only|`Strip (or keep only, if negative) a given number of path components when emitting check metadata.`|
|`-fsanitize=<check>`|Supported on Host only|`Turn on runtime checks for various forms of undefined or suspicious behavior. See user manual for available checks`|
|`-fsave-optimization-record=<format>`|Supported|`Generate an optimization record file in a specific format`|
|`-fsave-optimization-record`|Supported|`Generate a YAML optimization record file`|
|`-fseh-exceptions`|Supported|`Use SEH style exceptions`|
|`-fshort-enums`|Supported|`Allocate to an enum type only as many bytes as it needs for the declared range of possible values`|
|`-fshort-wchar`|Unsupported|`Force wchar_t to be a short unsigned int`|
|`-fshow-overloads=<value>`|Supported|`Which overload candidates to show when overload resolution fails: best\|all; defaults to all`|
|`-fsigned-char`|Supported|`char is signed`|
|`-fsized-deallocation`|Supported|`Enable C++14 sized global deallocation functions`|
|`-fsjlj-exceptions`|Supported|`Use SjLj style exceptions`|
|`-fslp-vectorize`|Supported|`Enable the superword-level parallelism vectorization passes`|
|`-fsplit-dwarf-inlining`|Unsupported|`Provide minimal debug info in the object/executable to facilitate online symbolication/stack traces in the absence of .dwo/.dwp files when using Split DWARF`|
|`-fsplit-lto-unit`|Unsupported|`Enables splitting of the LTO unit`|
|`-fsplit-machine-functions`|Supported|`Enable late function splitting using profile information (x86 ELF)`|
|`-fstack-clash-protection`|Supported|`Enable stack clash protection`|
|`-fstack-protector-all`|Unsupported|`Enable stack protectors for all functions`|
|`-fstack-protector-strong`|Unsupported|`Enable stack protectors for some functions vulnerable to stack smashing. Compared to -fstack-protector, this uses a stronger heuristic that includes functions containing arrays of any size (and any type), as well as any calls to alloca or the taking of an address from a local variable`|
|`-fstack-protector`|Unsupported|`Enable stack protectors for some functions vulnerable to stack smashing. This uses a loose heuristic which considers functions vulnerable if they contain a char (or 8-bit integer) array or constant-sized calls to alloca, which are of greater size than ssp-buffer-size (default: 8 bytes). All variable-sized calls to alloca are considered vulnerable. A function with a stack protector has a guard value added to the stack frame that is checked on function exit. The guard value must be positioned in the stack frame such that a buffer overflow from a vulnerable variable will overwrite the guard value before overwriting the function's return address. The reference stack guard value is stored in a global variable.`|
|`-fstack-size-section`|Supported|`Emit section containing metadata on function stack sizes`|
|`-fstandalone-debug`|Supported|`Emit full debug info for all types used by the program`|
|`-fstrict-enums`|Supported|`Enable optimizations based on the strict definition of an enum's value range`|
|`-fstrict-float-cast-overflow`|Supported|`Assume that overflowing float-to-int casts are undefined (default)`|
|`-fstrict-vtable-pointers`|Supported|`Enable optimizations based on the strict rules for overwriting polymorphic C++ objects`|
|`-fsycl`|Unsupported|`Enable SYCL kernels compilation for device`|
|`-fsystem-module`|Unsupported|`Build this module as a system module. Only used with -emit-module`|
|`-fthin-link-bitcode=<value>`|Supported|`Write minimized bitcode to <file> for the ThinLTO thin link only`|
|`-fthinlto-index=<value>`|Unsupported|`Perform ThinLTO importing using provided function summary index`|
|`-ftime-trace-granularity=<value>`|Supported|`Minimum time granularity (in microseconds) traced by time profiler`|
|`-ftime-trace`|Supported|`Turn on time profiler. Generates JSON file based on output filename.`|
|`-ftrap-function=<value>`|Unsupported|`Issue call to specified function rather than a trap instruction`|
|`-ftrapv-handler=<function name>`|Unsupported|`Specify the function to be called on overflow`|
|`-ftrapv`|Unsupported|`Trap on integer overflow`|
|`-ftrigraphs`|Supported|`Process trigraph sequences`|
|`-ftrivial-auto-var-init-stop-after=<value>`|Supported|`Stop initializing trivial automatic stack variables after the specified number of instances`|
|`-ftrivial-auto-var-init=<value>`|Supported|`Initialize trivial automatic stack variables: uninitialized (default) \| pattern`|
|`-funique-basic-block-section-names`|Supported|`Use unique names for basic block sections (ELF Only)`|
|`-funique-internal-linkage-names`|Supported|`Uniqueify Internal Linkage Symbol Names by appending the MD5 hash of the module path`|
|`-funroll-loops`|Supported|`Turn on loop unroller`|
|`-fuse-flang-math-libs`|Supported|`Use Flang internal runtime math library instead of LLVM math intrinsics.`|
|`-fuse-line-directives`|Supported|`Use #line in preprocessed output`|
|`-fvalidate-ast-input-files-content`|Supported|`Compute and store the hash of input files used to build an AST. Files with mismatching mtimes are considered valid if their contents are identical`|
|`-fveclib=<value>`|Unsupported|`Use the given vector functions library`|
|`-fvectorize`|Unsupported|`Enable the loop vectorization passes`|
|`-fverbose-asm`|Supported|`Generate verbose assembly output`|
|`-fvirtual-function-elimination`|Supported|`Enables dead virtual function elimination optimization. Requires -flto=full`|
|`-fvisibility-global-new-delete-hidden`|Supported|`Give global C++ operator new and delete declarations hidden visibility`|
|`-fvisibility-inlines-hidden-static-local-var`|Supported|`When -fvisibility-inlines-hidden is enabled, static variables in inline C++ member functions will also be given hidden visibility by default`|
|`-fvisibility-inlines-hidden`|Supported|`Give inline C++ member functions hidden visibility by default`|
|`-fvisibility-ms-compat`|Supported|`Give global types 'default' visibility and global functions and variables 'hidden' visibility by default`|
|`-fvisibility=<value>`|Supported|`Set the default symbol visibility for all global declarations`|
|`-fwasm-exceptions`|Unsupported|`Use WebAssembly style exceptions`|
|`-fwhole-program-vtables`|Unsupported|`Enables whole-program vtable optimization. Requires -flto`|
|`-fwrapv`|Supported|`Treat signed integer overflow as two's complement`|
|`-fwritable-strings`|Supported|`Store string literals as writable data`|
|`-fxray-always-emit-customevents`|Unsupported|`Always emit __xray_customevent(...) calls even if the containing function is not always instrumented`|
|`-fxray-always-emit-typedevents`|Unsupported|`Always emit __xray_typedevent(...) calls even if the containing function is not always instrumented`|
|`-fxray-always-instrument= <value>`|Unsupported|`DEPRECATED: Filename defining the whitelist for imbuing the 'always instrument' XRay attribute.`|
|`-fxray-attr-list= <value>`|Unsupported|`Filename defining the list of functions/types for imbuing XRay attributes.`|
|`-fxray-ignore-loops`|Unsupported|`Don't instrument functions with loops unless they also meet the minimum function size`|
|`-fxray-instruction-threshold= <value>`|Unsupported|`Sets the minimum function size to instrument with XRay`|
|`-fxray-instrumentation-bundle= <value>`|Unsupported|`Select which XRay instrumentation points to emit. Options: all, none, function-entry, function-exit, function, custom. Default is 'all'. 'function' includes both 'function-entry' and 'function-exit'.`|
|`-fxray-instrument`|Unsupported|`Generate XRay instrumentation sleds on function entry and exit`|
|`-fxray-link-deps`|Unsupported|`Tells clang to add the link dependencies for XRay.`|
|`-fxray-modes= <value>`|Unsupported|`List of modes to link in by default into XRay instrumented binaries.`|
|`-fxray-never-instrument= <value>`|Unsupported|`DEPRECATED: Filename defining the whitelist for imbuing the 'never instrument' XRay attribute.`|
|`-fzvector`|Supported|`Enable System z vector language extension`|
|`-F <value>`|Unsupported|`Add directory to framework include search path`|
|`--gcc-toolchain=<value>`|Supported|`Use the gcc toolchain at the given directory`|
|`-gcodeview-ghash`|Supported|`Emit type record hashes in a .debug$H section`|
|`-gcodeview`|Supported|`Generate CodeView debug information`|
|`-gdwarf-2`|Supported|`Generate source-level debug information with dwarf version 2`|
|`-gdwarf-3`|Supported|`Generate source-level debug information with dwarf version 3`|
|`-gdwarf-4`|Supported|`Generate source-level debug information with dwarf version 4`|
|`-gdwarf-5`|Supported|`Generate source-level debug information with dwarf version 5`|
|`-gdwarf`|Supported|`Generate source-level debug information with the default dwarf version`|
|`-gembed-source`|Supported|`Embed source text in DWARF debug sections`|
|`-gline-directives-only`|Supported|`Emit debug line info directives only`|
|`-gline-tables-only`|Supported|`Emit debug line number tables only`|
|`-gmodules`|Supported|`Generate debug info with external references to clang modules or precompiled headers`|
|`-gno-embed-source`|Supported|`Restore the default behavior of not embedding source text in DWARF debug sections`|
|`-gno-inline-line-tables`|Supported|`Don't emit inline line tables`|
|`--gpu-max-threads-per-block=<value>`|Supported|`Default max threads per block for kernel launch bounds for HIP`|
|`-gsplit-dwarf=<value>`|Supported|`Set DWARF fission mode to either 'split' or 'single'`|
|`-gz=<value>`|Supported|`DWARF debug sections compression type`|
|`-gz`|Supported|`DWARF debug sections compression type`|
|`-G <size>`|Unsupported|`Put objects of at most <size> bytes into small data section (MIPS / Hexagon)`|
|`-g`|Supported|`Generate source-level debug information`|
|`--help-hidden`|Supported|`Display help for hidden options`|
|`-help`|Supported|`Display available options`|
|`--hip-device-lib=<value>`|Supported|`HIP device library`|
|`--hip-link`|Supported|`Link clang-offload-bundler bundles for HIP`|
|`--hip-version=<value>`|Supported|`HIP version in the format of major.minor.patch`|
|`-H`|Supported|`Show header includes and nesting depth`|
|`-I-`|Supported|`Restrict all prior -I flags to double-quoted inclusion and remove current directory from include path`|
|`-ibuiltininc`|Supported|`Enable builtin #include directories even when -nostdinc is used before or after -ibuiltininc. Using -nobuiltininc after the option disables it`|
|`-idirafter <value>`|Supported|`Add directory to AFTER include search path`|
|`-iframeworkwithsysroot <directory>`|Unsupported|`Add directory to SYSTEM framework search path, absolute paths are relative to -isysroot`|
|`-iframework <value>`|Unsupported|`Add directory to SYSTEM framework search path`|
|`-imacros <file>`|Supported|`Include macros from file before parsing`|
|`-include-pch <file>`|Supported|`Include precompiled header file`|
|`-include <file>`|Supported|`Include file before parsing`|
|`-index-header-map`|Supported|`Make the next included directory (-I or -F) an indexer header map`|
|`-iprefix <dir>`|Supported|`Set the -iwithprefix/-iwithprefixbefore prefix`|
|`-iquote <directory>`|Supported|`Add directory to QUOTE include search path`|
|`-isysroot <dir>`|Supported|`Set the system root directory (usually /)`|
|`-isystem-after <directory>`|Supported|`Add directory to end of the SYSTEM include search path`|
|`-isystem <directory>`|Supported|`Add directory to SYSTEM include search path`|
|`-ivfsoverlay <value>`|Supported|`Overlay the virtual filesystem described by file over the real file system`|
|`-iwithprefixbefore <dir>`|Supported|`Set directory to include search path with prefix`|
|`-iwithprefix <dir>`|Supported|`Set directory to SYSTEM include search path with prefix`|
|`-iwithsysroot <directory>`|Supported|`Add directory to SYSTEM include search path, absolute paths are relative to -isysroot`|
|`-I <dir>`|Supported|`Add directory to include search path. If there are multiple -I options, these directories are searched in the order they are given before the standard system directories are searched. If the same directory is in the SYSTEM include search paths, for example if also specified with -isystem, the -I option will be ignored`|
|`--libomptarget-nvptx-path=<value>`|Unsupported|`Path to libomptarget-nvptx libraries`|
|`-L <dir>`|Supported|`Add directory to library search path`|
|`-mabicalls`|Unsupported|`Enable SVR4-style position-independent code (Mips only)`|
|`-maix-struct-return`|Unsupported|`Return all structs in memory (PPC32 only)`|
|`-malign-branch-boundary=<value>`|Supported|`Specify the boundary's size to align branches`|
|`-malign-branch=<value>`|Supported|`Specify types of branches to align`|
|`-malign-double`|Supported|`Align doubles to two words in structs (x86 only)`|
|`-Mallocatable=<value>`|Unsupported|`Select semantics for assignments to allocatables (F03 or F95)`|
|`-mbackchain`|Unsupported|`Link stack frames through backchain on System Z`|
|`-mbranch-protection=<value>`|Unsupported|`Enforce targets of indirect branches and function returns`|
|`-mbranches-within-32B-boundaries`|Supported|`Align selected branches (fused, jcc, jmp) within 32-byte boundary`|
|`-mcmodel=medany`|Unsupported|`Equivalent to -mcmodel=medium, compatible with RISC-V gcc.`|
|`-mcmodel=medlow`|Unsupported|`Equivalent to -mcmodel=small, compatible with RISC-V gcc.`|
|`-mcmse`|Unsupported|`Allow use of CMSE (Armv8-M Security Extensions)`|
|`-mcode-object-v3`|Supported|`Legacy option to specify code object ABI V2 (-mno-code-object-v3) or V3 (-mcode-object-v3) (AMDGPU only)`|
|`-mcode-object-version=<version>`|Supported|`Specify code object ABI version. Defaults to 4. (AMDGPU only)`|
|`-mcrc`|Unsupported|`Allow use of CRC instructions (ARM/Mips only)`|
|`-mcumode`|Supported|`Specify CU (-mcumode) or WGP (-mno-cumode) wavefront execution mode (AMDGPU only)`|
|`-mdouble=<value>`|Supported|`Force double to be 32 bits or 64 bits`|
|`-MD`|Supported|`Write a depfile containing user and system headers`|
|`-meabi <value>`|Supported|`Set EABI type, e.g. 4, 5 or gnu (default depends on triple)`|
|`-membedded-data`|Unsupported|`Place constants in the .rodata section instead of the .sdata section even if they meet the -G <size> threshold (MIPS)`|
|`-menable-experimental-extensions`|Unsupported|`Enable use of experimental RISC-V extensions.`|
|`-mexec-model=<value>`|Unsupported|`Execution model (WebAssembly only)`|
|`-mexecute-only`|Unsupported|`Disallow generation of data access to code sections (ARM only)`|
|`-mextern-sdata`|Unsupported|`Assume that externally defined data is in the small data if it meets the -G <size> threshold (MIPS)`|
|`-mfentry`|Unsupported|`Insert calls to fentry at function entry (x86/SystemZ only)`|
|`-mfix-cortex-a53-835769`|Unsupported|`Workaround Cortex-A53 erratum 835769 (AArch64 only)`|
|`-mfp32`|Unsupported|`Use 32-bit floating point registers (MIPS only)`|
|`-mfp64`|Unsupported|`Use 64-bit floating point registers (MIPS only)`|
|`-MF <file>`|Supported|`Write depfile output from -MMD, -MD, -MM, or -M to <file>`|
|`-mgeneral-regs-only`|Unsupported|`Generate code which only uses the general purpose registers (AArch64 only)`|
|`-mglobal-merge`|Supported|`Enable merging of globals`|
|`-mgpopt`|Unsupported|`Use GP relative accesses for symbols known to be in a small data section (MIPS)`|
|`-MG`|Supported|`Add missing headers to depfile`|
|`-mharden-sls=<value>`|Unsupported|`Select straight-line speculation hardening scope`|
|`-mhvx-length=<value>`|Unsupported|`Set Hexagon Vector Length`|
|`-mhvx=<value>`|Unsupported|`Enable Hexagon Vector eXtensions`|
|`-mhvx`|Unsupported|`Enable Hexagon Vector eXtensions`|
|`-miamcu`|Unsupported|`Use Intel MCU ABI`|
|`--migrate`|Unsupported|`Run the migrator`|
|`-mincremental-linker-compatible`|Supported|`(integrated-as) Emit an object file which can be used with an incremental linker`|
|`-mindirect-jump=<value>`|Unsupported|`Change indirect jump instructions to inhibit speculation`|
|`-Minform=<value>`|Supported|`Set error level of messages to display`|
|`-mios-version-min=<value>`|Unsupported|`Set iOS deployment target`|
|`-MJ <value>`|Unsupported|`Write a compilation database entry per input`|
|`-mllvm <value>`|Supported|`Additional arguments to forward to LLVM's option processing`|
|`-mlocal-sdata`|Unsupported|`Extend the -G behaviour to object local data (MIPS)`|
|`-mlong-calls`|Supported|`Generate branches with extended addressability, usually via indirect jumps.`|
|`-mlong-double-128`|Supported on Host only|`Force long double to be 128 bits`|
|`-mlong-double-64`|Supported|`Force long double to be 64 bits`|
|`-mlong-double-80`|Supported on Host only|`Force long double to be 80 bits, padded to 128 bits for storage`|
|`-mlvi-cfi`|Supported on Host only|`Enable only control-flow mitigations for Load Value Injection (LVI)`|
|`-mlvi-hardening`|Supported on Host only|`Enable all mitigations for Load Value Injection (LVI)`|
|`-mmacosx-version-min=<value>`|Unsupported|`Set Mac OS X deployment target`|
|`-mmadd4`|Supported|`Enable the generation of 4-operand madd.s, madd.d and related instructions.`|
|`-mmark-bti-property`|Unsupported|`Add .note.gnu.property with BTI to assembly files (AArch64 only)`|
|`-MMD`|Supported|`Write a depfile containing user headers`|
|`-mmemops`|Supported|`Enable generation of memop instructions`|
|`-mms-bitfields`|Unsupported|`Set the default structure layout to be compatible with the Microsoft compiler standard`|
|`-mmsa`|Unsupported|`Enable MSA ASE (MIPS only)`|
|`-mmt`|Unsupported|`Enable MT ASE (MIPS only)`|
|`-MM`|Supported|`Like -MMD, but also implies -E and writes to stdout by default`|
|`-mno-abicalls`|Unsupported|`Disable SVR4-style position-independent code (Mips only)`|
|`-mno-crc`|Unsupported|`Disallow use of CRC instructions (Mips only)`|
|`-mno-embedded-data`|Unsupported|`Do not place constants in the .rodata section instead of the .sdata if they meet the -G <size> threshold (MIPS)`|
|`-mno-execute-only`|Unsupported|`Allow generation of data access to code sections (ARM only)`|
|`-mno-extern-sdata`|Unsupported|`Do not assume that externally defined data is in the small data if it meets the -G <size> threshold (MIPS)`|
|`-mno-fix-cortex-a53-835769`|Unsupported|`Don't workaround Cortex-A53 erratum 835769 (AArch64 only)`|
|`-mno-global-merge`|Supported|`Disable merging of globals`|
|`-mno-gpopt`|Unsupported|`Do not use GP relative accesses for symbols known to be in a small data section (MIPS)`|
|`-mno-hvx`|Unsupported|`Disable Hexagon Vector eXtensions`|
|`-mno-implicit-float`|Supported|`Don't generate implicit floating point instructions`|
|`-mno-incremental-linker-compatible`|Supported|`(integrated-as) Emit an object file which cannot be used with an incremental linker`|
|`-mno-local-sdata`|Unsupported|`Do not extend the -G behaviour to object local data (MIPS)`|
|`-mno-long-calls`|Supported|`Restore the default behaviour of not generating long calls`|
|`-mno-lvi-cfi`|Supported on Host only|`Disable control-flow mitigations for Load Value Injection (LVI)`|
|`-mno-lvi-hardening`|Supported on Host only|`Disable mitigations for Load Value Injection (LVI)`|
|`-mno-madd4`|Supported|`Disable the generation of 4-operand madd.s, madd.d and related instructions.`|
|`-mno-memops`|Supported|`Disable generation of memop instructions`|
|`-mno-movt`|Supported|`Disallow use of movt/movw pairs (ARM only)`|
|`-mno-ms-bitfields`|Supported|`Do not set the default structure layout to be compatible with the Microsoft compiler standard`|
|`-mno-msa`|Unsupported|`Disable MSA ASE (MIPS only)`|
|`-mno-mt`|Unsupported|`Disable MT ASE (MIPS only)`|
|`-mno-neg-immediates`|Supported|`Disallow converting instructions with negative immediates to their negation or inversion.`|
|`-mno-nvj`|Supported|`Disable generation of new-value jumps`|
|`-mno-nvs`|Supported|`Disable generation of new-value stores`|
|`-mno-outline`|Unsupported|`Disable function outlining (AArch64 only)`|
|`-mno-packets`|Supported|`Disable generation of instruction packets`|
|`-mno-relax`|Supported|`Disable linker relaxation`|
|`-mno-restrict-it`|Unsupported|`Allow generation of deprecated IT blocks for ARMv8. It is off by default for ARMv8 Thumb mode`|
|`-mno-save-restore`|Unsupported|`Disable using library calls for save and restore`|
|`-mno-seses`|Unsupported|`Disable speculative execution side effect suppression (SESES)`|
|`-mno-stack-arg-probe`|Supported|`Disable stack probes which are enabled by default`|
|`-mno-tls-direct-seg-refs`|Supported|`Disable direct TLS access through segment registers`|
|`-mno-unaligned-access`|Unsupported|`Force all memory accesses to be aligned (AArch32/AArch64 only)`|
|`-mno-wavefrontsize64`|Supported|`Specify wavefront size 32 mode (AMDGPU only)`|
|`-mnocrc`|Unsupported|`Disallow use of CRC instructions (ARM only)`|
|`-mnop-mcount`|Supported|`Generate mcount/__fentry__ calls as nops. To activate they need to be patched in.`|
|`-mnvj`|Supported|`Enable generation of new-value jumps`|
|`-mnvs`|Supported|`Enable generation of new-value stores`|
|`-module-dependency-dir <value>`|Unsupported|`Directory to dump module dependencies to`|
|`-module-file-info`|Unsupported|`Provide information about a particular module file`|
|`-momit-leaf-frame-pointer`|Supported|`Omit frame pointer setup for leaf functions`|
|`-moutline`|Unsupported|`Enable function outlining (AArch64 only)`|
|`-mpacked-stack`|Unsupported|`Use packed stack layout (SystemZ only).`|
|`-mpackets`|Supported|`Enable generation of instruction packets`|
|`-mpad-max-prefix-size=<value>`|Supported|`Specify maximum number of prefixes to use for padding`|
|`-mpie-copy-relocations`|Supported|`Use copy relocations support for PIE builds`|
|`-mprefer-vector-width=<value>`|Unsupported|`Specifies preferred vector width for auto-vectorization. Defaults to 'none' which allows target specific decisions.`|
|`-MP`|Supported|`Create phony target for each dependency (other than main file)`|
|`-mqdsp6-compat`|Unsupported|`Enable hexagon-qdsp6 backward compatibility`|
|`-MQ <value>`|Supported|`Specify name of main file output to quote in depfile`|
|`-mrecord-mcount`|Supported|`Generate a __mcount_loc section entry for each __fentry__ call.`|
|`-mrelax-all`|Supported|`(integrated-as) Relax all machine instructions`|
|`-mrelax`|Supported|`Enable linker relaxation`|
|`-mrestrict-it`|Unsupported|`Disallow generation of deprecated IT blocks for ARMv8. It is on by default for ARMv8 Thumb mode.`|
|`-mrtd`|Unsupported|`Make StdCall calling convention the default`|
|`-msave-restore`|Unsupported|`Enable using library calls for save and restore`|
|`-mseses`|Unsupported|`Enable speculative execution side effect suppression (SESES). Includes LVI control flow integrity mitigations`|
|`-msign-return-address=<value>`|Unsupported|`Select return address signing scope`|
|`-msmall-data-limit=<value>`|Supported|`Put global and static data smaller than the limit into a special section`|
|`-msoft-float`|Supported|`Use software floating point`|
|`-msram-ecc`|Supported|`Legacy option to specify SRAM ECC mode (AMDGPU only). Should use --offload-arch with :sramecc+ instead`|
|`-mstack-alignment=<value>`|Unsupported|`Set the stack alignment`|
|`-mstack-arg-probe`|Unsupported|`Enable stack probes`|
|`-mstack-probe-size=<value>`|Unsupported|`Set the stack probe size`|
|`-mstackrealign`|Unsupported|`Force realign the stack at entry to every function`|
|`-msve-vector-bits=<value>`|Unsupported|`Specify the size in bits of an SVE vector register. Defaults to the vector length agnostic value of "scalable". (AArch64 only)`|
|`-msvr4-struct-return`|Unsupported|`Return small structs in registers (PPC32 only)`|
|`-mthread-model <value>`|Supported|`The thread model to use, e.g. posix, single (posix by default)`|
|`-mtls-direct-seg-refs`|Supported|`Enable direct TLS access through segment registers (default)`|
|`-mtls-size=<value>`|Unsupported|`Specify bit size of immediate TLS offsets (AArch64 ELF only): 12 (for 4KB) \| 24 (for 16MB, default) \| 32 (for 4GB) \| 48 (for 256TB, needs -mcmodel=large)`|
|`-mtp=<value>`|Unsupported|`Thread pointer access method (AArch32/AArch64 only)`|
|`-mtune=<value>`|Supported on Host only|`Only supported on X86. Otherwise accepted for compatibility with GCC.`|
|`-MT <value>`|Unsupported|`Specify name of main file output in depfile`|
|`-munaligned-access`|Unsupported|`Allow memory accesses to be unaligned (AArch32/AArch64 only)`|
|`-MV`|Supported|`Use NMake/Jom format for the depfile`|
|`-mwavefrontsize64`|Supported|`Specify wavefront size 64 mode (AMDGPU only)`|
|`-mxnack`|Supported|`Legacy option to specify XNACK mode (AMDGPU only). Should use --offload-arch with :xnack+ instead`|
|`-M`|Supported|`Like -MD, but also implies -E and writes to stdout by default`|
|`--no-cuda-include-ptx=<value>`|Supported|`Do not include PTX for the following GPU architecture (e.g. sm_35) or 'all'. May be specified more than once.`|
|`--no-cuda-version-check`|Supported|`Don't error out if the detected version of the CUDA install is too low for the requested CUDA gpu architecture.`|
|`-no-flang-libs`|Supported|`Do not link against Flang libraries`|
|`--no-offload-arch=<value>`|Supported|`Remove CUDA/HIP offloading device architecture (e.g. sm_35, gfx906) from the list of devices to compile for. 'all' resets the list to its default value.`|
|`--no-system-header-prefix=<prefix>`|Supported|`Treat all #include paths starting with <prefix> as not including a system header.`|
|`-nobuiltininc`|Supported|`Disable builtin #include directories`|
|`-nogpuinc`|Supported|`Do not add CUDA/HIP include paths and include default CUDA/HIP wrapper header files`|
|`-nogpulib`|Supported|`Do not link device library for CUDA/HIP device compilation`|
|`-nostdinc++`|Unsupported|`Disable standard #include directories for the C++ standard library`|
|`-ObjC++`|Unsupported|`Treat source input files as Objective-C++ inputs`|
|`-objcmt-atomic-property`|Unsupported|`Make migration to 'atomic' properties`|
|`-objcmt-migrate-all`|Unsupported|`Enable migration to modern ObjC`|
|`-objcmt-migrate-annotation`|Unsupported|`Enable migration to property and method annotations`|
|`-objcmt-migrate-designated-init`|Unsupported|`Enable migration to infer NS_DESIGNATED_INITIALIZER for initializer methods`|
|`-objcmt-migrate-instancetype`|Unsupported|`Enable migration to infer instancetype for method result type`|
|`-objcmt-migrate-literals`|Unsupported|`Enable migration to modern ObjC literals`|
|`-objcmt-migrate-ns-macros`|Unsupported|`Enable migration to NS_ENUM/NS_OPTIONS macros`|
|`-objcmt-migrate-property-dot-syntax`|Unsupported|`Enable migration of setter/getter messages to property-dot syntax`|
|`-objcmt-migrate-property`|Unsupported|`Enable migration to modern ObjC property`|
|`-objcmt-migrate-protocol-conformance`|Unsupported|`Enable migration to add protocol conformance on classes`|
|`-objcmt-migrate-readonly-property`|Unsupported|`Enable migration to modern ObjC readonly property`|
|`-objcmt-migrate-readwrite-property`|Unsupported|`Enable migration to modern ObjC readwrite property`|
|`-objcmt-migrate-subscripting`|Unsupported|`Enable migration to modern ObjC subscripting`|
|`-objcmt-ns-nonatomic-iosonly`|Unsupported|`Enable migration to use NS_NONATOMIC_IOSONLY macro for setting property's 'atomic' attribute`|
|`-objcmt-returns-innerpointer-property`|Unsupported|`Enable migration to annotate property with NS_RETURNS_INNER_POINTER`|
|`-objcmt-whitelist-dir-path=<value>`|Unsupported|`Only modify files with a filename contained in the provided directory path`|
|`-ObjC`|Unsupported|`Treat source input files as Objective-C inputs`|
|`--offload-arch=<value>`|Supported|`CUDA offloading device architecture (e.g. sm_35), or HIP offloading target ID in the form of a device architecture followed by target ID features delimited by a colon. Each target ID feature is a pre-defined string followed by a plus or minus sign (e.g. gfx908:xnack+:sramecc-). May be specified more than once.`|
|`-o <file>`|Supported|`Write output to <file>`|
|`-parallel-jobs=<value>`|Supported|`Number of parallel jobs`|
|`-pg`|Supported|`Enable mcount instrumentation`|
|`-pipe`|Supported|`Use pipes between commands, when possible`|
|`--precompile`|Supported|`Only precompile the input`|
|`-print-effective-triple`|Supported|`Print the effective target triple`|
|`-print-file-name=<file>`|Supported|`Print the full library path of <file>`|
|`-print-ivar-layout`|Unsupported|`Enable Objective-C Ivar layout bitmap print trace`|
|`-print-libgcc-file-name`|Supported|`Print the library path for the currently used compiler runtime library ("libgcc.a" or "libclang_rt.builtins.*.a")`|
|`-print-prog-name=<name>`|Supported|`Print the full program path of <name>`|
|`-print-resource-dir`|Supported|`Print the resource directory pathname`|
|`-print-search-dirs`|Supported|`Print the paths used for finding libraries and programs`|
|`-print-supported-cpus`|Supported|`Print supported cpu models for the given target (if target is not specified, it will print the supported cpus for the default target)`|
|`-print-target-triple`|Supported|`Print the normalized target triple`|
|`-print-targets`|Supported|`Print the registered targets`|
|`-pthread`|Supported|`Support POSIX threads in generated code`|
|`--ptxas-path=<value>`|Unsupported|`Path to ptxas (used for compiling CUDA code)`|
|`-P`|Supported|`Disable linemarker output in -E mode`|
|`-Qn`|Supported|`Do not emit metadata containing compiler name and version`|
|`-Qunused-arguments`|Supported|`Don't emit warning for unused driver arguments`|
|`-Qy`|Supported|`Emit metadata containing compiler name and version`|
|`-relocatable-pch`|Supported|`Whether to build a relocatable precompiled header`|
|`-rewrite-legacy-objc`|Unsupported|`Rewrite Legacy Objective-C source to C++`|
|`-rewrite-objc`|Unsupported|`Rewrite Objective-C source to C++`|
|`--rocm-device-lib-path=<value>`|Supported|`ROCm device library path. Alternative to rocm-path.`|
|`--rocm-path=<value>`|Supported|`ROCm installation path, used for finding and automatically linking required bitcode libraries.`|
|`-Rpass-analysis=<value>`|Supported|`Report transformation analysis from optimization passes whose name matches the given POSIX regular expression`|
|`-Rpass-missed=<value>`|Supported|`Report missed transformations by optimization passes whose name matches the given POSIX regular expression`|
|`-Rpass=<value>`|Supported|`Report transformations performed by optimization passes whose name matches the given POSIX regular expression`|
|`-rtlib=<value>`|Unsupported|`Compiler runtime library to use`|
|`-R<remark>`|Unsupported|`Enable the specified remark`|
|`-save-stats=<value>`|Supported|`Save llvm statistics.`|
|`-save-stats`|Supported|`Save llvm statistics.`|
|`-save-temps=<value>`|Supported|`Save intermediate compilation results.`|
|`-save-temps`|Supported|`Save intermediate compilation results`|
|`-serialize-diagnostics <value>`|Supported|`Serialize compiler diagnostics to a file`|
|`-shared-libsan`|Unsupported|`Dynamically link the sanitizer runtime`|
|`-static-flang-libs`|Supported|`Link using static Flang libraries`|
|`-static-libsan`|Unsupported|`Statically link the sanitizer runtime`|
|`-static-openmp`|Supported|`Use the static host OpenMP runtime while linking.`|
|`-std=<value>`|Supported|`Language standard to compile for`|
|`-stdlib++-isystem <directory>`|Supported|`Use directory as the C++ standard library include path`|
|`-stdlib=<value>`|Supported|`C++ standard library to use`|
|`-sycl-std=<value>`|Unsupported|`SYCL language standard to compile for.`|
|`--system-header-prefix=<prefix>`|Supported|`Treat all #include paths starting with <prefix> as including a system header.`|
|`-S`|Supported|`Only run preprocess and compilation steps`|
|`--target=<value>`|Supported|`Generate code for the given target`|
|`-Tbss <addr>`|Supported|`Set starting address of BSS to <addr>`|
|`-Tdata <addr>`|Supported|`Set starting address of DATA to <addr>`|
|`-time`|Supported|`Time individual commands`|
|`-traditional-cpp`|Unsupported|`Enable some traditional CPP emulation`|
|`-trigraphs`|Supported|`Process trigraph sequences`|
|`-Ttext <addr>`|Supported|`Set starting address of TEXT to <addr>`|
|`-T <script>`|Unsupported|`Specify <script> as linker script`|
|`-undef`|Supported|`undef all system defines`|
|`-unwindlib=<value>`|Supported|`Unwind library to use`|
|`-U <macro>`|Supported|`Undefine macro <macro>`|
|`--verify-debug-info`|Supported|`Verify the binary representation of debug output`|
|`-verify-pch`|Unsupported|`Load and verify that a pre-compiled header file is not stale`|
|`--version`|Supported|`Print version information`|
|`-v`|Supported|`Show commands to run and use verbose output`|
|`-Wa,<arg>`|Supported|`Pass the comma separated arguments in <arg> to the assembler`|
|`-Wdeprecated`|Supported|`Enable warnings for deprecated constructs and define __DEPRECATED`|
|`-Wl,<arg>`|Supported|`Pass the comma separated arguments in <arg> to the linker`|
|`-working-directory <value>`|Supported|`Resolve file paths relative to the specified directory`|
|`-Wp,<arg>`|Supported|`Pass the comma separated arguments in <arg> to the preprocessor`|
|`-W<warning>`|Supported|`Enable the specified warning`|
|`-w`|Supported|`Suppress all warnings`|
|`-Xanalyzer <arg>`|Supported|`Pass <arg> to the static analyzer`|
|`-Xarch_device <arg>`|Supported|`Pass <arg> to the CUDA/HIP device compilation`|
|`-Xarch_host <arg>`|Supported|`Pass <arg> to the CUDA/HIP host compilation`|
|`-Xassembler <arg>`|Supported|`Pass <arg> to the assembler`|
|`-Xclang <arg>`|Supported|`Pass <arg> to the clang compiler`|
|`-Xcuda-fatbinary <arg>`|Supported|`Pass <arg> to fatbinary invocation`|
|`-Xcuda-ptxas <arg>`|Supported|`Pass <arg> to the ptxas assembler`|
|`-Xlinker <arg>`|Supported|`Pass <arg> to the linker`|
|`-Xopenmp-target=<triple> <arg>`|Supported|`Pass <arg> to the target offloading toolchain identified by <triple>.`|
|`-Xopenmp-target <arg>`|Supported|`Pass <arg> to the target offloading toolchain.`|
|`-Xpreprocessor <arg>`|Supported|`Pass <arg> to the preprocessor`|
|`-x <language>`|Supported|`Treat subsequent input files as having type <language>`|
|`-z <arg>`|Supported|`Pass -z <arg> to the linker`|
-758
Voir le fichier
@@ -1,758 +0,0 @@
-### s
--analyzer-output s
--analyze s
-arcmt-migrate-emit-errors n
-arcmt-migrate-report-output n
-byteswapio s
-B s
-CC s
-cl-denorms-are-zero s
-cl-fast-relaxed-math s
-cl-finite-math-only s
-cl-fp32-correctly-rounded-divide-sqrt s
-cl-kernel-arg-info s
-cl-mad-enable s
-cl-no-signed-zeros s
-cl-opt-disable s
-cl-single-precision-constant s
-cl-std s
-cl-strict-aliasing s
-cl-uniform-work-group-size s
-cl-unsafe-math-optimizations s
--config s
--cuda-compile-host-device s
--cuda-device-only s
--cuda-host-only s
--cuda-include-ptx n
--cuda-noopt-device-debug n
--cuda-path-ignore-env n
--cuda-path n
-cxx-isystem s
-C s
-c s
-dD s
-dependency-dot s
-dependency-file s
-dI s
-dM s
-dsym-dir n
-D s
-emit-ast s
-emit-interface-stubs s
-emit-llvm s
-emit-merged-ifs s
--emit-static-lib s
-enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang s
-E s
-fAAPCSBitfieldLoad n
-faddrsig s
-faligned-allocation s
-fallow-editor-placeholders s
-fallow-fortran-gnu-ext s
-fansi-escape-codes s
-fapple-kext n
-fapple-link-rtlib n
-fapple-pragma-pack n
-fapplication-extension n
-fbackslash s
-fbasic-block-sections s
-fblocks s
-fborland-extensions n
-fbuild-session-file s
-fbuild-session-timestamp s
-fbuiltin-module-map n
-fcall-saved-x10 n
-fcall-saved-x11 n
-fcall-saved-x12 n
-fcall-saved-x13 n
-fcall-saved-x14 n
-fcall-saved-x15 n
-fcall-saved-x18 n
-fcall-saved-x8 n
-fcall-saved-x9 n
-fcf-protection n
-fcf-protection n
-fchar8_t s
-fclang-abi-compat s
-fcolor-diagnostics s
-fcomment-block-commands s
-fcommon s
-fcomplete-member-pointers s
-fconvergent-functions s
-fcoroutines-ts s
-fcoverage-mapping n
-fcs-profile-generate n
-fcs-profile-generate n
-fcuda-approx-transcendentals n
-fcuda-flush-denormals-to-zero s
-fcuda-short-ptr n
-fcxx-exceptions s
-fdata-sections s
-fdebug-compilation-dir s
-fdebug-default-version s
-fdebug-info-for-profiling s
-fdebug-macro s
-fdebug-prefix-map s
-fdebug-ranges-base-address s
-fdebug-types-section s
-fdeclspec s
-fdelayed-template-parsing s
-fdelete-null-pointer-checks s
-fdiagnostics-absolute-paths s
-fdiagnostics-hotness-threshold n
-fdiagnostics-parseable-fixits s
-fdiagnostics-print-source-range-info s
-fdiagnostics-show-hotness n
-fdiagnostics-show-note-include-stack s
-fdiagnostics-show-option s
-fdiagnostics-show-template-tree s
-fdigraphs s
-fdiscard-value-names s
-fdollars-in-identifiers s
-fdouble-square-bracket-attributes s
-fdwarf-exceptions n
-feliminate-unused-debug-types s
-fembed-bitcode-marker s
-fembed-bitcode s
-fembed-bitcode s
-femit-all-decls s
-femulated-tls s
-fenable-matrix s
-fexceptions s
-fexperimental-new-constant-interpreter s
-fexperimental-new-pass-manager s
-fexperimental-relative-c++-abi-vtables s
-fexperimental-strict-floating-point s
-ffast-math s
-ffile-prefix-map s
-ffine-grained-bitfield-accesses s
-ffixed-form s
-ffixed-point s
-ffixed-r19 n
-ffixed-r9 n
-ffixed-x10 n
-ffixed-x11 n
-ffixed-x12 n
-ffixed-x13 n
-ffixed-x14 n
-ffixed-x15 n
-ffixed-x16 n
-ffixed-x17 n
-ffixed-x18 n
-ffixed-x19 n
-ffixed-x1 n
-ffixed-x20 n
-ffixed-x21 n
-ffixed-x22 n
-ffixed-x23 n
-ffixed-x24 n
-ffixed-x25 n
-ffixed-x26 n
-ffixed-x27 n
-ffixed-x28 n
-ffixed-x29 n
-ffixed-x2 n
-ffixed-x30 n
-ffixed-x31 n
-ffixed-x3 n
-ffixed-x4 n
-ffixed-x5 n
-ffixed-x6 n
-ffixed-x7 n
-ffixed-x8 n
-ffixed-x9 n
-fforce-dwarf-frame s
-fforce-emit-vtables s
-fforce-enable-int128 s
-ffp-contract s
-ffp-exception-behavior s
-ffp-model s
-ffree-form s
-ffreestanding s
-ffunc-args-alias s
-ffunction-sections s
-fglobal-isel s
-fgnu-keywords s
-fgnu-runtime n
-fgnu89-inline n
-fgnuc-version s
-fgpu-allow-device-init s
-fgpu-rdc s
-fhip-new-launch-api s
-fignore-exceptions s
-fimplicit-module-maps n
-finline-functions s
-finline-hint-functions s
-finstrument-function-entry-bare n
-finstrument-functions-after-inlining n
-finstrument-functions n
-fintegrated-as s
-fintegrated-cc1 s
-fjump-tables s
-fkeep-static-consts s
-flax-vector-conversions s
-flto-jobs n
-flto n
-flto n
-fmacro-prefix-map s
-fmath-errno s
-fmax-tokens s
-fmax-type-align s
-fmemory-profile s
-fmerge-all-constants s
-fmessage-length s
-fmodule-file n
-fmodule-map-file n
-fmodule-name n
-fmodules-cache-path n
-fmodules-decluse n
-fmodules-disable-diagnostic-validation n
-fmodules-ignore-macro n
-fmodules-prune-after n
-fmodules-prune-interval n
-fmodules-search-all n
-fmodules-strict-decluse n
-fmodules-ts n
-fmodules-user-build-path n
-fmodules-validate-input-files-content s
-fmodules-validate-once-per-build-session n
-fmodules-validate-system-headers s
-fmodules n
-fms-compatibility-version s
-fms-compatibility s
-fms-extensions s
-fmsc-version s
-fnew-alignment s
-fno-addrsig s
-fno-allow-fortran-gnu-ext s
-fno-assume-sane-operator-new s
-fno-autolink s
-fno-backslash s
-fno-builtin- s
-fno-builtin s
-fno-c++-static-destructors s
-fno-char8_t s
-fno-color-diagnostics s
-fno-common s
-fno-complete-member-pointers s
-fno-constant-cfstrings s
-fno-coverage-mapping s
-fno-crash-diagnostics s
-fno-cuda-approx-transcendentals n
-fno-debug-macro s
-fno-declspec n
-fno-delayed-template-parsing s
-fno-delete-null-pointer-checks s
-fno-diagnostics-fixit-info s
-fno-digraphs s
-fno-discard-value-names s
-fno-dollars-in-identifiers s
-fno-double-square-bracket-attributes s
-fno-elide-constructors s
-fno-elide-type s
-fno-eliminate-unused-debug-types s
-fno-exceptions s
-fno-experimental-new-pass-manager s
-fno-experimental-relative-c++-abi-vtables s
-fno-fine-grained-bitfield-accesses s
-fno-fixed-form s
-fno-fixed-point s
-fno-force-enable-int128 s
-fno-fortran-main s
-fno-free-form s
-fno-func-args-alias s
-fno-global-isel s
-fno-gnu-inline-asm s
-fno-gpu-allow-device-init s
-fno-hip-new-launch-api s
-fno-integrated-as s
-fno-integrated-cc1 s
-fno-jump-tables s
-fno-keep-static-consts s
-fno-lto s
-fno-memory-profile s
-fno-merge-all-constants s
-fno-no-access-control s
-fno-objc-infer-related-result-type s
-fno-operator-names s
-fno-pch-codegen s
-fno-pch-debuginfo s
-fno-plt s
-fno-preserve-as-comments s
-fno-profile-generate s
-fno-profile-instr-generate s
-fno-profile-instr-use s
-fno-register-global-dtors-with-atexit s
-fno-rtlib-add-rpath s
-fno-rtti-data s
-fno-rtti s
-fno-sanitize-address-poison-custom-array-cookie h
-fno-sanitize-address-use-after-scope h
-fno-sanitize-address-use-odr-indicator h
-fno-sanitize-blacklist h
-fno-sanitize-cfi-canonical-jump-tables h
-fno-sanitize-cfi-cross-dso h
-fno-sanitize-coverage h
-fno-sanitize-memory-track-origins h
-fno-sanitize-memory-use-after-dtor h
-fno-sanitize-recover h
-fno-sanitize-stats h
-fno-sanitize-thread-atomics h
-fno-sanitize-thread-func-entry-exit h
-fno-sanitize-thread-memory-access h
-fno-sanitize-trap h
-fno-sanitize-trap h
-fno-short-wchar s
-fno-show-column s
-fno-show-source-location s
-fno-signed-char s
-fno-signed-zeros s
-fno-spell-checking s
-fno-split-machine-functions s
-fno-stack-clash-protection s
-fno-stack-protector s
-fno-standalone-debug s
-fno-strict-float-cast-overflow s
-fno-strict-return s
-fno-sycl n
-fno-temp-file s
-fno-threadsafe-statics s
-fno-trigraphs s
-fno-unique-section-names s
-fno-unroll-loops s
-fno-use-cxa-atexit s
-fno-use-flang-math-libs s
-fno-use-init-array s
-fno-visibility-inlines-hidden-static-local-var s
-fno-xray-function-index n
-fno-zero-initialized-in-bss s
-fobjc-arc-exceptions n
-fobjc-arc n
-fobjc-exceptions n
-fobjc-runtime n
-fobjc-weak n
-fopenmp-simd n
-fopenmp-targets n
-fopenmp n
-foptimization-record-file s
-foptimization-record-passes s
-forder-file-instrumentation s
-fpack-struct n
-fpascal-strings s
-fpass-plugin s
-fpatchable-function-entry s
-fpcc-struct-return n
-fpch-codegen s
-fpch-debuginfo s
-fpch-instantiate-templates s
-fpch-validate-input-files-content s
-fplugin s
-fprebuilt-module-path n
-fprofile-exclude-files n
-fprofile-filter-files n
-fprofile-generate n
-fprofile-generate n
-fprofile-instr-generate n
-fprofile-instr-generate n
-fprofile-instr-use n
-fprofile-remapping-file n
-fprofile-sample-accurate n
-fprofile-sample-use n
-fprofile-use n
-freciprocal-math s
-freg-struct-return n
-fregister-global-dtors-with-atexit s
-frelaxed-template-template-args s
-freroll-loops s
-fropi n
-frtlib-add-rpath s
-frwpi n
-fsanitize-address-field-padding h
-fsanitize-address-globals-dead-stripping h
-fsanitize-address-poison-custom-array-cookie h
-fsanitize-address-use-after-scope h
-fsanitize-address-use-odr-indicator h
-fsanitize-blacklist h
-fsanitize-cfi-canonical-jump-tables h
-fsanitize-cfi-cross-dso h
-fsanitize-cfi-icall-generalize-pointers h
-fsanitize-coverage-allowlist h
-fsanitize-coverage-blacklist h
-fsanitize-coverage-blocklist h
-fsanitize-coverage-whitelist h
-fsanitize-coverage h
-fsanitize-hwaddress-abi h
-fsanitize-memory-track-origins h
-fsanitize-memory-track-origins h
-fsanitize-memory-use-after-dtor h
-fsanitize-recover h
-fsanitize-stats h
-fsanitize-system-blacklist h
-fsanitize-thread-atomics h
-fsanitize-thread-func-entry-exit h
-fsanitize-thread-memory-access h
-fsanitize-trap h
-fsanitize-trap h
-fsanitize-undefined-strip-path-components h
-fsanitize h
-fsave-optimization-record s
-fsave-optimization-record s
-fseh-exceptions s
-fshort-enums s
-fshort-wchar n
-fshow-overloads s
-fsigned-char s
-fsized-deallocation s
-fsjlj-exceptions s
-fslp-vectorize s
-fsplit-dwarf-inlining n
-fsplit-lto-unit n
-fsplit-machine-functions s
-fstack-clash-protection s
-fstack-protector-all n
-fstack-protector-strong n
-fstack-protector n
-fstack-size-section s
-fstandalone-debug s
-fstrict-enums s
-fstrict-float-cast-overflow s
-fstrict-vtable-pointers s
-fsycl n
-fsystem-module u
-fthin-link-bitcode s
-fthinlto-index n
-ftime-trace-granularity s
-ftime-trace s
-ftrap-function n
-ftrapv-handler n
-ftrapv n
-ftrigraphs s
-ftrivial-auto-var-init-stop-after s
-ftrivial-auto-var-init s
-funique-basic-block-section-names s
-funique-internal-linkage-names s
-funroll-loops s
-fuse-flang-math-libs s
-fuse-line-directives s
-fvalidate-ast-input-files-content s
-fveclib n
-fvectorize n
-fverbose-asm s
-fvirtual-function-elimination s
-fvisibility-global-new-delete-hidden s
-fvisibility-inlines-hidden-static-local-var s
-fvisibility-inlines-hidden s
-fvisibility-ms-compat s
-fvisibility s
-fwasm-exceptions n
-fwhole-program-vtables n
-fwrapv s
-fwritable-strings s
-fxray-always-emit-customevents n
-fxray-always-emit-typedevents n
-fxray-always-instrument n
-fxray-attr-list n
-fxray-ignore-loops n
-fxray-instruction-threshold n
-fxray-instrumentation-bundle n
-fxray-instrument n
-fxray-link-deps n
-fxray-modes n
-fxray-never-instrument n
-fzvector s
-F n
--gcc-toolchain s
-gcodeview-ghash s
-gcodeview s
-gdwarf-2 s
-gdwarf-3 s
-gdwarf-4 s
-gdwarf-5 s
-gdwarf s
-gembed-source s
-gline-directives-only s
-gline-tables-only s
-gmodules s
-gno-embed-source s
-gno-inline-line-tables s
--gpu-max-threads-per-block s
-gsplit-dwarf s
-gz s
-gz s
-G n
-g s
--help-hidden s
-help s
--hip-device-lib s
--hip-link s
--hip-version s
-H s
-I- s
-ibuiltininc s
-idirafter s
-iframeworkwithsysroot n
-iframework n
-imacros s
-include-pch s
-include s
-index-header-map s
-iprefix s
-iquote s
-isysroot s
-isystem-after s
-isystem s
-ivfsoverlay s
-iwithprefixbefore s
-iwithprefix s
-iwithsysroot s
-I s
--libomptarget-nvptx-path n
-L s
-mabicalls n
-maix-struct-return n
-malign-branch-boundary s
-malign-branch s
-malign-double s
-Mallocatable n
-mbackchain n
-mbranch-protection n
-mbranches-within-32B-boundaries s
-mcmodel n
-mcmodel n
-mcmse n
-mcode-object-v3 s
-mcode-object-version s
-mcrc n
-mcumode s
-mdouble s
-MD s
-meabi s
-membedded-data n
-menable-experimental-extensions n
-mexec-model n
-mexecute-only n
-mextern-sdata n
-mfentry n
-mfix-cortex-a53-835769 n
-mfp32 n
-mfp64 n
-MF s
-mgeneral-regs-only n
-mglobal-merge s
-mgpopt n
-MG s
-mharden-sls n
-mhvx-length n
-mhvx n
-mhvx n
-miamcu n
--migrate n
-mincremental-linker-compatible s
-mindirect-jump n
-Minform s
-mios-version-min n
-MJ n
-mllvm s
-mlocal-sdata n
-mlong-calls s
-mlong-double-128 h
-mlong-double-64 s
-mlong-double-80 h
-mlvi-cfi h
-mlvi-hardening h
-mmacosx-version-min n
-mmadd4 s
-mmark-bti-property n
-MMD s
-mmemops s
-mms-bitfields n
-mmsa n
-mmt n
-MM s
-mno-abicalls n
-mno-crc n
-mno-embedded-data n
-mno-execute-only n
-mno-extern-sdata n
-mno-fix-cortex-a53-835769 n
-mno-global-merge s
-mno-gpopt n
-mno-hvx n
-mno-implicit-float s
-mno-incremental-linker-compatible s
-mno-local-sdata n
-mno-long-calls s
-mno-lvi-cfi h
-mno-lvi-hardening h
-mno-madd4 s
-mno-memops s
-mno-movt s
-mno-ms-bitfields s
-mno-msa n
-mno-mt n
-mno-neg-immediates s
-mno-nvj s
-mno-nvs s
-mno-outline n
-mno-packets s
-mno-relax s
-mno-restrict-it n
-mno-save-restore n
-mno-seses n
-mno-stack-arg-probe s
-mno-tls-direct-seg-refs s
-mno-unaligned-access n
-mno-wavefrontsize64 s
-mnocrc n
-mnop-mcount s
-mnvj s
-mnvs s
-module-dependency-dir n
-module-file-info n
-momit-leaf-frame-pointer s
-moutline n
-mpacked-stack n
-mpackets s
-mpad-max-prefix-size s
-mpie-copy-relocations s
-mprefer-vector-width n
-MP s
-mqdsp6-compat n
-MQ s
-mrecord-mcount s
-mrelax-all s
-mrelax s
-mrestrict-it n
-mrtd n
-msave-restore n
-mseses n
-msign-return-address n
-msmall-data-limit s
-msoft-float s
-msram-ecc s
-mstack-alignment n
-mstack-arg-probe n
-mstack-probe-size n
-mstackrealign n
-msve-vector-bits n
-msvr4-struct-return n
-mthread-model s
-mtls-direct-seg-refs s
-mtls-size n
-mtp n
-mtune h
-MT n
-munaligned-access n
-MV s
-mwavefrontsize64 s
-mxnack s
-M s
--no-cuda-include-ptx s
--no-cuda-version-check s
-no-flang-libs s
--no-offload-arch s
--no-system-header-prefix s
-nobuiltininc s
-nogpuinc s
-nogpulib s
-nostdinc++ n
-ObjC++ n
-objcmt-atomic-property n
-objcmt-migrate-all n
-objcmt-migrate-annotation n
-objcmt-migrate-designated-init n
-objcmt-migrate-instancetype n
-objcmt-migrate-literals n
-objcmt-migrate-ns-macros n
-objcmt-migrate-property-dot-syntax n
-objcmt-migrate-property n
-objcmt-migrate-protocol-conformance n
-objcmt-migrate-readonly-property n
-objcmt-migrate-readwrite-property n
-objcmt-migrate-subscripting n
-objcmt-ns-nonatomic-iosonly n
-objcmt-returns-innerpointer-property n
-objcmt-whitelist-dir-path n
-ObjC n
--offload-arch s
-o s
-parallel-jobs s
-pg s
-pipe s
--precompile s
-print-effective-triple s
-print-file-name s
-print-ivar-layout n
-print-libgcc-file-name s
-print-prog-name s
-print-resource-dir s
-print-search-dirs s
-print-supported-cpus s
-print-target-triple s
-print-targets s
-pthread s
--ptxas-path n
-P s
-Qn s
-Qunused-arguments s
-Qy s
-relocatable-pch s
-rewrite-legacy-objc n
-rewrite-objc n
--rocm-device-lib-path s
--rocm-path s
-Rpass-analysis s
-Rpass-missed s
-Rpass s
-rtlib n
-R n
-save-stats s
-save-stats s
-save-temps s
-save-temps s
-serialize-diagnostics s
-shared-libsan n
-static-flang-libs s
-static-libsan n
-static-openmp s
-std s
-stdlib++-isystem s
-stdlib s
-sycl-std n
--system-header-prefix s
-S s
--target s
-Tbss s
-Tdata s
-time s
-traditional-cpp n
-trigraphs s
-Ttext s
-T n
-undef s
-unwindlib s
-U s
--verify-debug-info s
-verify-pch n
--version s
-v s
-Wa, s
-Wdeprecated s
-Wl, s
-working-directory s
-Wp, s
-W s
-w s
-Xanalyzer s
-Xarch_device s
-Xarch_host s
-Xassembler s
-Xclang s
-Xcuda-fatbinary s
-Xcuda-ptxas s
-Xlinker s
-Xopenmp-target s
-Xopenmp-target s
-Xpreprocessor s
-x s
-z s
-37
Voir le fichier
@@ -1,37 +0,0 @@
# cuComplex API supported by HIP
## **1. cuComplex Data types**
| **type** | **CUDA** | **HIP** |**HIP value** (if differs) |
|-------------:|---------------------------------------------------------------|------------------------------------------------------------|---------------------------|
| float2 |***`cuFloatComplex`*** |***`hipFloatComplex`*** | struct |
| double2 |***`cuDoubleComplex`*** |***`hipDoubleComplex`*** | struct |
| float2 |***`cuComplex`*** |***`hipComplex`*** | struct |
## **2. cuComplex API functions**
| **CUDA** | **HIP** |
|-----------------------------------------------------------|-------------------------------------------------|
|`cuCrealf` |`hipCrealf` |
|`cuCimagf` |`hipCimagf` |
|`make_cuFloatComplex` |`make_hipFloatComplex` |
|`cuConjf` |`hipConjf` |
|`cuCaddf` |`hipCaddf` |
|`cuCsubf` |`hipCsubf` |
|`cuCmulf` |`hipCmulf` |
|`cuCdivf` |`hipCdivf` |
|`cuCabsf` |`hipCabsf` |
|`cuCreal` |`hipCreal` |
|`cuCimag` |`hipCimag` |
|`make_cuDoubleComplex` |`make_hipDoubleComplex` |
|`cuConj` |`hipConj` |
|`cuCadd` |`hipCadd` |
|`cuCsub` |`hipCsub` |
|`cuCmul` |`hipCmul` |
|`cuCdiv` |`hipCdiv` |
|`cuCabs` |`hipCabs` |
|`make_cuComplex` |`make_hipComplex` |
|`cuComplexFloatToDouble` |`hipComplexFloatToDouble` |
|`cuComplexDoubleToFloat` |`hipComplexDoubleToFloat` |
|`cuCfmaf` |`hipCfmaf` |
|`cuCfma` |`hipCfma` |
-172
Voir le fichier
@@ -1,172 +0,0 @@
# HIP Bugs
<!-- toc -->
- [HIP is more restrictive in enforcing restrictions](#hip-is-more-restrictive-in-enforcing-restrictions)
<!-- tocstop -->
### HIP is more restrictive in enforcing restrictions
The language specification for HIP and CUDA forbids calling a
`__device__` function in a `__host__` context. In practice, you may observe
differences in the strictness of this restriction, with HIP exhibiting a tighter
adherence to the specification and thus less tolerant of infringing code. The
solution is to ensure that all functions which are called in a
`__device__` context are correctly annotated to reflect it.
The following is an example of code that follows the specification:
```
#include <hip/hip_runtime.h>
#include <type_traits>
#include <random>
#include "test_common.h"
static std::random_device dev;
static std::mt19937 rng(dev());
template <typename T, typename M>
__host__ __device__ inline constexpr int count() {
  // How many M-sized pieces fit into one T (integer division, remainder dropped).
  constexpr int pieces = sizeof(T) / sizeof(M);
  return pieces;
}
// Draws one float from a uniform real distribution over [min, max) using the
// file-level generator `rng`.
inline float getRandomFloat(float min = 10, float max = 100) {
  using Distribution = std::uniform_real_distribution<float>;
  Distribution dist(min, max);
  return dist(rng);
}
// Fills a[0..size) with values whose components are random floats.
// T is the packed type, B is its one-component counterpart; count<T, B>()
// decides which of the fields x, y, z, w are written.
template <typename T, typename B>
void fillMatrix(T* a, int size) {
  constexpr int lanes = count<T, B>();
  int idx = 0;
  while (idx < size) {
    T value;
    // Fields are filled in x, y, z, w order so the sequence of random draws
    // is fixed for a given T.
    value.x = getRandomFloat();
    if constexpr (lanes >= 2) value.y = getRandomFloat();
    if constexpr (lanes >= 3) value.z = getRandomFloat();
    if constexpr (lanes >= 4) value.w = getRandomFloat();
    a[idx] = value;
    ++idx;
  }
}
// Test operations
template <typename T, typename B>
__host__ __device__ void testOperations(T& a, T& b) {
a.x += b.x;
a.x++;
b.x++;
if constexpr (count<T, B>() >= 2) {
a.y = b.x;
a.x = b.y;
}
if constexpr (count<T, B>() >= 3) {
if (a.x > 0) b.x /= a.x;
a.x *= b.z;
a.y--;
}
if constexpr (count<T, B>() >= 4) {
b.w = a.x;
a.w += (-b.y);
}
}
template <typename T, typename B>
__global__ void testOperationsGPU(T* d_a, T* d_b, int size) {
int id = threadIdx.x;
if (id > size) return;
T &a = d_a[id];
T &b = d_b[id];
testOperations<T, B>(a, b);
}
template <typename T>
void dcopy(T* a, T* b, int size) {
for (int i = 0; i < size; i++) {
a[i] = b[i];
}
}
template <typename T>
bool isEqual(T* a, T* b, int size) {
for (int i = 0; i < size; i++) {
if (a[i] != b[i]) {
return false;
}
}
return true;
}
// Main function that tests type
// T = what you want to test
// D = pack of 1 i.e. float1 int1
template <typename T, typename D>
void testType(int msize) {
T *fa, *fb, *fc, *h_fa, *h_fb;
fa = new T[msize];
fb = new T[msize];
fc = new T[msize];
h_fa = new T[msize];
h_fb = new T[msize];
T *d_fa, *d_fb;
constexpr int c = count<T, D>();
if (c <= 0 || c >= 5) {
failed("Invalid Size\n");
}
fillMatrix<T, D>(fa, msize);
dcopy(fb, fa, msize);
dcopy(h_fa, fa, msize);
dcopy(h_fb, fa, msize);
for (int i = 0; i < msize; i++) testOperations<T, D>(h_fa[i], h_fb[i]);
hipMalloc(&d_fa, sizeof(T) * msize);
hipMalloc(&d_fb, sizeof(T) * msize);
hipMemcpy(d_fa, fa, sizeof(T) * msize, hipMemcpyHostToDevice);
hipMemcpy(d_fb, fb, sizeof(T) * msize, hipMemcpyHostToDevice);
auto kernel = testOperationsGPU<T, D>;
hipLaunchKernelGGL(kernel, 1, msize, 0, 0, d_fa, d_fb, msize);
hipMemcpy(fc, d_fa, sizeof(T) * msize, hipMemcpyDeviceToHost);
bool pass = true;
if (!isEqual<T>(h_fa, fc, msize)) {
pass = false;
}
delete[] fa;
delete[] fb;
delete[] fc;
delete[] h_fa;
delete[] h_fb;
hipFree(d_fa);
hipFree(d_fb);
if (!pass) {
failed("Failed");
}
}
int main() {
const int msize = 100;
// double
testType<double1, double1>(msize);
testType<double2, double1>(msize);
testType<double3, double1>(msize);
testType<double4, double1>(msize);
// floats
testType<float1, float1>(msize);
testType<float2, float1>(msize);
testType<float3, float1>(msize);
testType<float4, float1>(msize);
...
passed();
}
```
For the complete program, please refer to the HIP test application at https://github.com/ROCm-Developer-Tools/HIP/blob/main/tests/src/deviceLib/hip_floatnTM.cpp
-287
Voir le fichier
@@ -1,287 +0,0 @@
# Porting CUDA Driver API
## Introduction to the CUDA Driver and Runtime APIs
CUDA provides separate Driver and Runtime APIs. The two APIs have significant overlap in functionality:
- Both APIs support events, streams, memory management, memory copy, and error handling.
- Both APIs deliver similar performance.
- Driver API calls begin with the prefix `cu` while Runtime API calls begin with the prefix `cuda`. For example, the Driver API contains `cuEventCreate` while the Runtime API contains `cudaEventCreate`, with similar functionality.
- The Driver API defines a different but largely overlapping error code space than the Runtime API, and uses a different coding convention. For example, Driver API defines `CUDA_ERROR_INVALID_VALUE` while the Runtime API defines `cudaErrorInvalidValue`
The Driver API offers two additional pieces of functionality not provided by the Runtime API: cuModule and cuCtx APIs.
### cuModule API
The Module section of the Driver API provides additional control over how and when accelerator code objects are loaded.
For example, the driver API allows code objects to be loaded from files or memory pointers.
Symbols for kernels or global data can be extracted from the loaded code objects.
In contrast, the Runtime API automatically loads and (if necessary) compiles all of the kernels from an executable binary when run.
In this mode, NVCC must be used to compile kernel code so the automatic loading can function correctly.
Both Driver and Runtime APIs define a function for launching kernels (called `cuLaunchKernel` or `cudaLaunchKernel`).
The kernel arguments and the execution configuration (grid dimensions, group dimensions, dynamic shared memory, and stream) are passed as arguments to the launch function.
The Runtime additionally provides the `<<< >>>` syntax for launching kernels, which resembles a special function call and is easier to use than explicit launch API (in particular with respect to handling of kernel arguments).
However, this syntax is not standard C++ and is available only when NVCC is used to compile the host code.
The Module features are useful in an environment which generates the code objects directly, such as a new accelerator language front-end.
Here, NVCC is not used. Instead, the environment may have a different kernel language or different compilation flow.
Other environments have many kernels and do not want them to be all loaded automatically.
The Module functions can be used to load the generated code objects and launch kernels.
As we will see below, HIP defines a Module API which provides similar explicit control over code object management.
### cuCtx API
The Driver API defines "Context" and "Devices" as separate entities.
Contexts contain a single device, and a device can theoretically have multiple contexts.
Each context contains a set of streams and events specific to the context.
Historically contexts also defined a unique address space for the GPU, though this may no longer be the case in Unified Memory platforms (since the CPU and all the devices in the same process share a single unified address space).
The Context APIs also provide a mechanism to switch between devices, which allowed a single CPU thread to send commands to different GPUs.
HIP as well as recent versions of the CUDA Runtime provide other mechanisms to accomplish this feat - for example using streams or `cudaSetDevice`.
The CUDA Runtime API unifies the Context API with the Device API. This simplifies the APIs and has little loss of functionality since each Context can contain a single device, and the benefits of multiple contexts have been replaced with other interfaces.
HIP provides a context API to facilitate easy porting from existing Driver codes.
In HIP, the Ctx functions largely provide an alternate syntax for changing the active device.
Most new applications will prefer to use `hipSetDevice` or the stream APIs; therefore, HIP has marked hipCtx APIs as **deprecated**. Support for these APIs may not be available in future releases. For more details on deprecated APIs, please refer to [HIP deprecated APIs](https://github.com/ROCm-Developer-Tools/HIP/tree/master/docs/markdown/hip_deprecated_api_list.md).
## HIP Module and Ctx APIs
Rather than present two separate APIs, HIP extends the HIP API with new APIs for Modules and Ctx control.
### hipModule API
Like the CUDA Driver API, the Module API provides additional control over how code is loaded, including options to load code from files or from in-memory pointers.
NVCC and HIP-Clang target different architectures and use different code object formats: NVCC uses `cubin` or `ptx` files, while the HIP-Clang path uses the `hsaco` format.
The external compilers which generate these code objects are responsible for generating and loading the correct code object for each platform.
Notably, there is not a fat binary format that can contain code for both NVCC and HIP-Clang platforms. The following table summarizes the formats used on each platform:
| Format | APIs | NVCC | HIP-CLANG |
| --- | --- | --- | --- |
| Code Object | hipModuleLoad, hipModuleLoadData | .cubin or PTX text | .hsaco |
| Fat Binary | hipModuleLoadFatBin | .fatbin | .hip_fatbin |
`hipcc` uses HIP-Clang or NVCC to compile host codes. Both of these may embed code objects into the final executable, and these code objects will be automatically loaded when the application starts.
The hipModule API can be used to load additional code objects, and in this way provides an extended capability to the automatically loaded code objects.
HIP-Clang allows both of these capabilities to be used together, if desired. Of course it is possible to create a program with no kernels and thus no automatic loading.
### hipCtx API
HIP provides a `Ctx` API as a thin layer over the existing Device functions. This Ctx API can be used to set the current context, or to query properties of the device associated with the context.
The current context is implicitly used by other APIs such as `hipStreamCreate`.
### hipify translation of CUDA Driver API
The HIPIFY tools convert CUDA Driver APIs for streams, events, modules, devices, memory management, context, profiler to the equivalent HIP driver calls. For example, `cuEventCreate` will be translated to `hipEventCreate`.
HIPIFY tools also convert error codes from the Driver namespace and coding convention to the equivalent HIP error code. Thus, HIP unifies the APIs for these common functions.
The memory copy API requires additional explanation. The CUDA Driver API includes the memory direction in the name of the API (e.g., `cuMemcpyHtoD`), while the CUDA Runtime API provides a single memory copy API with a parameter that specifies the direction and additionally supports a "default" direction where the runtime determines the direction automatically.
HIP provides APIs with both styles: for example, `hipMemcpyHtoD` as well as `hipMemcpy`.
The first flavor may be faster in some cases since it avoids the host overhead of detecting the memory direction.
HIP defines a single error space, and uses camel-case for all errors (i.e. `hipErrorInvalidValue`).
#### Address Spaces
HIP-Clang defines a process-wide address space where the CPU and all devices allocate addresses from a single unified pool.
Thus addresses may be shared between contexts, and unlike the original CUDA definition a new context does not create a new address space for the device.
#### Using hipModuleLaunchKernel
`hipModuleLaunchKernel` is `cuLaunchKernel` in HIP world. It takes the same arguments as `cuLaunchKernel`.
#### Additional Information
- HIP-Clang creates a primary context when the HIP API is called. So in pure driver API code, HIP-Clang will create a primary context while HIP/NVCC will have an empty context stack.
HIP-Clang will push the primary context onto the context stack when it is empty. This can have subtle differences on applications which mix the runtime and driver APIs.
### hip-clang Implementation Notes
#### .hip_fatbin
hip-clang links device code from different translation units together. For each device target, a code object is generated. Code objects for different device targets are bundled by clang-offload-bundler as one fatbinary, which is embedded as a global symbol `__hip_fatbin` in the .hip_fatbin section of the ELF file of the executable or shared object.
#### Initialization and Termination Functions
hip-clang generates initialization and termination functions for each translation unit for host code compilation. The initialization functions call `__hipRegisterFatBinary` to register the fatbinary embedded in the ELF file. They also call `__hipRegisterFunction` and `__hipRegisterVar` to register kernel functions and device side global variables. The termination functions call `__hipUnregisterFatBinary`.
hip-clang emits a global variable `__hip_gpubin_handle` of void** type with linkonce linkage and initial value 0 for each host translation unit. Each initialization function checks `__hip_gpubin_handle` and registers the fatbinary only if `__hip_gpubin_handle` is 0, and saves the return value of `__hipRegisterFatBinary` to `__hip_gpubin_handle`. This is to guarantee that the fatbinary is only registered once. A similar check is done in the termination functions.
#### Kernel Launching
hip-clang supports kernel launching by the CUDA `<<<>>>` syntax and by `hipLaunchKernelGGL`. The latter is a macro which expands to the CUDA `<<<>>>` syntax.
When the executable or shared library is loaded by the dynamic linker, the initialization functions are called. In the initialization functions, when `__hipRegisterFatBinary` is called, the code objects containing all kernels are loaded; when `__hipRegisterFunction` is called, the stub functions are associated with the corresponding kernels in code objects.
hip-clang implements two sets of kernel launching APIs.
By default, in the host code, for the `<<<>>>` statement, hip-clang first emits call of hipConfigureCall to set up the threads and grids, then emits call of the stub function with the given arguments. In the stub function, hipSetupArgument is called for each kernel argument, then hipLaunchByPtr is called with a function pointer to the stub function. In hipLaunchByPtr, the real kernel associated with the stub function is launched.
### NVCC Implementation Notes
#### Interoperation between HIP and CUDA Driver
CUDA applications may want to mix CUDA driver code with HIP code (see example below). This table shows the type equivalence to enable this interaction.
|**HIP Type** |**CU Driver Type**|**CUDA Runtime Type**|
| ---- | ---- | ---- |
| hipModule_t | CUmodule | |
| hipFunction_t | CUfunction | |
| hipCtx_t | CUcontext | |
| hipDevice_t | CUdevice | |
| hipStream_t | CUstream | cudaStream_t |
| hipEvent_t | CUevent | cudaEvent_t |
| hipArray | CUarray | cudaArray |
#### Compilation Options
The `hipModule_t` interface does not support `cuModuleLoadDataEx` function, which is used to control PTX compilation options.
HIP-Clang does not use PTX and does not support these compilation options.
In fact, HIP-Clang code objects always contain fully compiled ISA and do not require additional compilation as a part of the load step.
The corresponding HIP function `hipModuleLoadDataEx` behaves as `hipModuleLoadData` on HIP-Clang path (compilation options are not used) and as `cuModuleLoadDataEx` on NVCC path.
For example (CUDA):
```
CUmodule module;
void *imagePtr = ...; // Somehow populate data pointer with code object
const int numOptions = 1;
CUJit_option options[numOptions];
void * optionValues[numOptions];
options[0] = CU_JIT_MAX_REGISTERS;
unsigned maxRegs = 15;
optionValues[0] = (void*)(&maxRegs);
cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues);
CUfunction k;
cuModuleGetFunction(&k, module, "myKernel");
```
HIP:
```
hipModule_t module;
void *imagePtr = ...; // Somehow populate data pointer with code object
const int numOptions = 1;
hipJitOption options[numOptions];
void * optionValues[numOptions];
options[0] = hipJitOptionMaxRegisters;
unsigned maxRegs = 15;
optionValues[0] = (void*)(&maxRegs);
// hipModuleLoadData(module, imagePtr) will be called on HIP-Clang path, JIT options will not be used, and
// cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues) will be called on NVCC path
hipModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues);
hipFunction_t k;
hipModuleGetFunction(&k, module, "myKernel");
```
The below sample shows how to use `hipModuleGetFunction`.
```
#include<hip_runtime.h>
#include<hip_runtime_api.h>
#include<iostream>
#include<fstream>
#include<vector>
#define LEN 64
#define SIZE LEN<<2
#ifdef __HIP_PLATFORM_AMD__
#define fileName "vcpy_isa.co"
#endif
#ifdef __HIP_PLATFORM_NVIDIA__
#define fileName "vcpy_isa.ptx"
#endif
#define kernel_name "hello_world"
int main(){
float *A, *B;
hipDeviceptr_t Ad, Bd;
A = new float[LEN];
B = new float[LEN];
for(uint32_t i=0;i<LEN;i++){
A[i] = i*1.0f;
B[i] = 0.0f;
std::cout<<A[i] << " "<<B[i]<<std::endl;
}
#ifdef __HIP_PLATFORM_NVIDIA__
hipInit(0);
hipDevice_t device;
hipCtx_t context;
hipDeviceGet(&device, 0);
hipCtxCreate(&context, 0, device);
#endif
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMemcpyHtoD(Ad, A, SIZE);
hipMemcpyHtoD(Bd, B, SIZE);
hipModule_t Module;
hipFunction_t Function;
hipModuleLoad(&Module, fileName);
hipModuleGetFunction(&Function, Module, kernel_name);
std::vector<void*>argBuffer(2);
memcpy(&argBuffer[0], &Ad, sizeof(void*));
memcpy(&argBuffer[1], &Bd, sizeof(void*));
size_t size = argBuffer.size()*sizeof(void*);
void *config[] = {
HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
HIP_LAUNCH_PARAM_END
};
hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config);
hipMemcpyDtoH(B, Bd, SIZE);
for(uint32_t i=0;i<LEN;i++){
std::cout<<A[i]<<" - "<<B[i]<<std::endl;
}
#ifdef __HIP_PLATFORM_NVIDIA__
hipCtxDetach(context);
#endif
return 0;
}
```
## HIP Module and Texture Driver API
HIP supports texture driver APIs; however, the texture reference should be declared in host scope. The following code explains the use of a texture reference on the `__HIP_PLATFORM_AMD__` platform.
```
// Code to generate code object
#include "hip/hip_runtime.h"
extern texture<float, 2, hipReadModeElementType> tex;
__global__ void tex2dKernel(hipLaunchParm lp, float* outputData,
int width,
int height)
{
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
outputData[y*width + x] = tex2D(tex, x, y);
}
```
```
// Host code:
texture<float, 2, hipReadModeElementType> tex;
void myFunc ()
{
// ...
textureReference* texref;
hipModuleGetTexRef(&texref, Module1, "tex");
hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap);
hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap);
hipTexRefSetFilterMode(texref, hipFilterModePoint);
hipTexRefSetFlags(texref, 0);
hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1);
hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT);
// ...
}
```
-597
Voir le fichier
@@ -1,597 +0,0 @@
# HIP Porting Guide
In addition to providing a portable C++ programming environment for GPUs, HIP is designed to ease
the porting of existing CUDA code into the HIP environment. This section describes the available tools
and provides practical suggestions on how to port CUDA code and work through common issues.
## Table of Contents
<!-- toc -->
- [Porting a New CUDA Project](#porting-a-new-cuda-project)
* [General Tips](#general-tips)
* [Scanning existing CUDA code to scope the porting effort](#scanning-existing-cuda-code-to-scope-the-porting-effort)
* [Converting a project "in-place"](#converting-a-project-in-place)
* [CUDA to HIP Math Library Equivalents](#library-equivalents)
- [Distinguishing Compiler Modes](#distinguishing-compiler-modes)
* [Identifying HIP Target Platform](#identifying-hip-target-platform)
* [Identifying the Compiler: hip-clang, or nvcc](#identifying-the-compiler-hip-clang-or-nvcc)
* [Identifying Current Compilation Pass: Host or Device](#identifying-current-compilation-pass-host-or-device)
* [Compiler Defines: Summary](#compiler-defines-summary)
- [Identifying Architecture Features](#identifying-architecture-features)
* [HIP_ARCH Defines](#hip_arch-defines)
* [Device-Architecture Properties](#device-architecture-properties)
* [Table of Architecture Properties](#table-of-architecture-properties)
- [Finding HIP](#finding-hip)
- [Identifying HIP Runtime](#identifying-hip-runtime)
- [hipLaunchKernelGGL](#hiplaunchkernelggl)
- [Compiler Options](#compiler-options)
- [Linking Issues](#linking-issues)
* [Linking With hipcc](#linking-with-hipcc)
* [-lm Option](#-lm-option)
- [Linking Code With Other Compilers](#linking-code-with-other-compilers)
* [libc++ and libstdc++](#libc-and-libstdc)
* [HIP Headers (hip_runtime.h, hip_runtime_api.h)](#hip-headers-hip_runtimeh-hip_runtime_apih)
* [Using a Standard C++ Compiler](#using-a-standard-c-compiler)
+ [cuda.h](#cudah)
* [Choosing HIP File Extensions](#choosing-hip-file-extensions)
- [Workarounds](#workarounds)
* [warpSize](#warpsize)
* [Kernel launch with group size > 256](#kernel-launch-with-group-size--256)
- [memcpyToSymbol](#memcpytosymbol)
- [threadfence_system](#threadfence_system)
* [Textures and Cache Control](#textures-and-cache-control)
- [More Tips](#more-tips)
* [HIP Logging](#hip-logging)
* [Debugging hipcc](#debugging-hipcc)
* [What Does This Error Mean?](#what-does-this-error-mean)
+ [/usr/include/c++/v1/memory:5172:15: error: call to implicitly deleted default constructor of 'std::__1::bad_weak_ptr' throw bad_weak_ptr();](#usrincludecv1memory517215-error-call-to-implicitly-deleted-default-constructor-of-std__1bad_weak_ptr-throw-bad_weak_ptr)
* [Editor Highlighting](#editor-highlighting)
<!-- tocstop -->
## Porting a New CUDA Project
### General Tips
- Starting the port on a CUDA machine is often the easiest approach, since you can incrementally port pieces of the code to HIP while leaving the rest in CUDA. (Recall that on CUDA machines HIP is just a thin layer over CUDA, so the two code types can interoperate on nvcc platforms.) Also, the HIP port can be compared with the original CUDA code for function and performance.
- Once the CUDA code is ported to HIP and is running on the CUDA machine, compile the HIP code using the HIP compiler on an AMD machine.
- HIP ports can replace CUDA versions: HIP can deliver the same performance as a native CUDA implementation, with the benefit of portability to both Nvidia and AMD architectures as well as a path to future C++ standard support. You can handle platform-specific features through conditional compilation or by adding them to the open-source HIP infrastructure.
- Use **[hipconvertinplace-perl.sh](https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/bin/hipconvertinplace-perl.sh)** to hipify all code files in the CUDA source directory.
### Scanning existing CUDA code to scope the porting effort
The **[hipexamine-perl.sh](https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/bin/hipexamine-perl.sh)** tool will scan a source directory to determine which files contain CUDA code and how much of that code can be automatically hipified.
```
> cd examples/rodinia_3.0/cuda/kmeans
> $HIP_DIR/bin/hipexamine-perl.sh.
info: hipify ./kmeans.h =====>
info: hipify ./unistd.h =====>
info: hipify ./kmeans.c =====>
info: hipify ./kmeans_cuda_kernel.cu =====>
info: converted 40 CUDA->HIP refs( dev:0 mem:0 kern:0 builtin:37 math:0 stream:0 event:0 err:0 def:0 tex:3 other:0 ) warn:0 LOC:185
info: hipify ./getopt.h =====>
info: hipify ./kmeans_cuda.cu =====>
info: converted 49 CUDA->HIP refs( dev:3 mem:32 kern:2 builtin:0 math:0 stream:0 event:0 err:0 def:0 tex:12 other:0 ) warn:0 LOC:311
info: hipify ./rmse.c =====>
info: hipify ./cluster.c =====>
info: hipify ./getopt.c =====>
info: hipify ./kmeans_clustering.c =====>
info: TOTAL-converted 89 CUDA->HIP refs( dev:3 mem:32 kern:2 builtin:37 math:0 stream:0 event:0 err:0 def:0 tex:15 other:0 ) warn:0 LOC:3607
kernels (1 total) : kmeansPoint(1)
```
hipexamine-perl scans each code file (cpp, c, h, hpp, etc.) found in the specified directory:
* Files with no CUDA code (e.g., kmeans.h) print a one-line summary just listing the source file name.
* Files with CUDA code print a summary of what was found - for example the kmeans_cuda_kernel.cu file:
```
info: hipify ./kmeans_cuda_kernel.cu =====>
info: converted 40 CUDA->HIP refs( dev:0 mem:0 kern:0 builtin:37 math:0 stream:0 event:0
```
* Interesting information in kmeans_cuda_kernel.cu :
* How many CUDA calls were converted to HIP (40)
* Breakdown of the CUDA functionality used (dev:0 mem:0 etc). This file uses many CUDA builtins (37) and texture functions (3).
* Warning for code that looks like CUDA API but was not converted (0 in this file).
* Count Lines-of-Code (LOC) - 185 for this file.
* hipexamine-perl also presents a summary at the end of the process for the statistics collected across all files. This has similar format to the per-file reporting, and also includes a list of all kernels which have been called. An example from above:
```shell
info: TOTAL-converted 89 CUDA->HIP refs( dev:3 mem:32 kern:2 builtin:37 math:0 stream:0 event:0 err:0 def:0 tex:15 other:0 ) warn:0 LOC:3607
kernels (1 total) : kmeansPoint(1)
```
### Converting a project "in-place"
```shell
> hipify-perl --inplace
```
For each input file FILE, this script will:
- If the "FILE.prehip" file does not exist, copy the original code to a new file with extension ".prehip". Then hipify the code file.
- If "FILE.prehip" file exists, hipify FILE.prehip and save to FILE.
This is useful for testing improvements to the hipify toolset.
The [hipconvertinplace-perl.sh](https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/bin/hipconvertinplace-perl.sh) script will perform inplace conversion for all code files in the specified directory.
This can be quite handy when dealing with an existing CUDA code base since the script preserves the existing directory structure
and filenames - and includes work. After converting in-place, you can review the code to add additional parameters to
directory names.
```shell
> hipconvertinplace-perl.sh MY_SRC_DIR
```
### Library Equivalents
| CUDA Library | ROCm Library | Comment |
|------- | --------- | ----- |
| cuBLAS | rocBLAS | Basic Linear Algebra Subroutines
| cuFFT | rocFFT | Fast Fourier Transform Library
| cuSPARSE | rocSPARSE | Sparse BLAS + SPMV
| cuSolver | rocSOLVER | Lapack library
| AMG-X | rocALUTION | Sparse iterative solvers and preconditioners with Geometric and Algebraic MultiGrid
| Thrust | rocThrust | C++ parallel algorithms library
| CUB | rocPRIM | Low Level Optimized Parallel Primitives
| cuDNN | MIOpen | Deep learning Solver Library
| cuRAND | rocRAND | Random Number Generator Library
| EIGEN | EIGEN – HIP port | C++ template library for linear algebra: matrices, vectors, numerical solvers,
| NCCL | RCCL | Communications Primitives Library based on the MPI equivalents
## Distinguishing Compiler Modes
### Identifying HIP Target Platform
All HIP projects target either AMD or NVIDIA platform. The platform affects which headers are included and which libraries are used for linking.
- `HIP_PLATFORM_AMD` is defined if the HIP platform targets AMD.
Note, `HIP_PLATFORM_HCC` was previously defined if the HIP platform targeted AMD, it is deprecated.
- `HIP_PLATFORM_NVIDIA` is defined if the HIP platform targets NVIDIA.
Note, `HIP_PLATFORM_NVCC` was previously defined if the HIP platform targeted NVIDIA, it is deprecated.
### Identifying the Compiler: hip-clang or nvcc
Often, it's useful to know whether the underlying compiler is HIP-Clang or nvcc. This knowledge can guard platform-specific code or aid in platform-specific performance tuning.
```
#ifdef __HIP_PLATFORM_AMD__
// Compiled with HIP-Clang
#endif
```
```
#ifdef __HIP_PLATFORM_NVIDIA__
// Compiled with nvcc
// Could be compiling with CUDA language extensions enabled (for example, a ".cu file)
// Could be in pass-through mode to an underlying host compile OR (for example, a .cpp file)
```
```
#ifdef __CUDACC__
// Compiled with nvcc (CUDA language extensions enabled)
```
The HIP-Clang compiler directly generates the host code (using the Clang x86 target) and passes the code to another host compiler. Thus, it has no equivalent of the `__CUDACC__` define.
### Identifying Current Compilation Pass: Host or Device
nvcc makes two passes over the code: one for host code and one for device code.
HIP-Clang will have multiple passes over the code: one for the host code, and one for each architecture on the device code.
`__HIP_DEVICE_COMPILE__` is set to a nonzero value when the compiler (HIP-Clang or nvcc) is compiling code for a device inside a `__global__` kernel or for a device function. `__HIP_DEVICE_COMPILE__` can replace #ifdef checks on the `__CUDA_ARCH__` define.
```
// #ifdef __CUDA_ARCH__
#if __HIP_DEVICE_COMPILE__
```
Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 1 or undefined, and it doesn't represent the feature capability of the target device.
### Compiler Defines: Summary
|Define | HIP-Clang | nvcc | Other (GCC, ICC, Clang, etc.)
|--- | --- | --- |---|
|HIP-related defines:|
|`__HIP_PLATFORM_AMD__`| Defined | Undefined | Defined if targeting AMD platform; undefined otherwise |
|`__HIP_PLATFORM_NVIDIA__`| Undefined | Defined | Defined if targeting NVIDIA platform; undefined otherwise |
|`__HIP_DEVICE_COMPILE__` | 1 if compiling for device; undefined if compiling for host |1 if compiling for device; undefined if compiling for host | Undefined
|`__HIPCC__` | Defined | Defined | Undefined
|`__HIP_ARCH_*` |0 or 1 depending on feature support (see below) | 0 or 1 depending on feature support (see below) | 0
|nvcc-related defines:|
|`__CUDACC__` | Defined if source code is compiled by nvcc; undefined otherwise | Undefined
|`__NVCC__` | Undefined | Defined | Undefined
|`__CUDA_ARCH__` | Undefined | Unsigned representing compute capability (e.g., "130") if in device code; 0 if in host code | Undefined
|hip-clang-related defines:|
|`__HIP__` | Defined | Undefined | Undefined
|HIP-Clang common defines:|
|`__clang__` | Defined | Defined | Undefined | Defined if using Clang; otherwise undefined
## Identifying Architecture Features
### HIP_ARCH Defines
Some CUDA code tests `__CUDA_ARCH__` for a specific value to determine whether the machine supports a certain architectural feature. For instance,
```
#if (__CUDA_ARCH__ >= 130)
// doubles are supported
```
This type of code requires special attention, since AMD and CUDA devices have different architectural capabilities. Moreover, you can't determine the presence of a feature using a simple comparison against an architecture's version number. HIP provides a set of defines and device properties to query whether a specific architectural feature is supported.
The `__HIP_ARCH_*` defines can replace comparisons of `__CUDA_ARCH__` values:
```
//#if (__CUDA_ARCH__ >= 130) // non-portable
if __HIP_ARCH_HAS_DOUBLES__ { // portable HIP feature query
// doubles are supported
}
```
For host code, the `__HIP_ARCH_*` defines are set to 0. You should only use the `__HIP_ARCH_*` defines in device code.
### Device-Architecture Properties
Host code should query the architecture feature flags in the device properties that hipGetDeviceProperties returns, rather than testing the "major" and "minor" fields directly:
```
hipGetDeviceProperties(&deviceProp, device);
//if ((deviceProp.major == 1 && deviceProp.minor < 2)) // non-portable
if (deviceProp.arch.hasSharedInt32Atomics) { // portable HIP feature query
// has shared int32 atomic operations ...
}
```
### Table of Architecture Properties
The table below shows the full set of architectural properties that HIP supports.
|Define (use only in device code) | Device Property (run-time query) | Comment |
|------- | --------- | ----- |
|32-bit atomics:||
|`__HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__` | hasGlobalInt32Atomics |32-bit integer atomics for global memory
|`__HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__` | hasGlobalFloatAtomicExch |32-bit float atomic exchange for global memory
|`__HIP_ARCH_HAS_SHARED_INT32_ATOMICS__` | hasSharedInt32Atomics |32-bit integer atomics for shared memory
|`__HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__` | hasSharedFloatAtomicExch |32-bit float atomic exchange for shared memory
|`__HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__` | hasFloatAtomicAdd |32-bit float atomic add in global and shared memory
|64-bit atomics: | |
|`__HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__` | hasGlobalInt64Atomics |64-bit integer atomics for global memory
|`__HIP_ARCH_HAS_SHARED_INT64_ATOMICS__` | hasSharedInt64Atomics |64-bit integer atomics for shared memory
|Doubles: | |
|`__HIP_ARCH_HAS_DOUBLES__` | hasDoubles |Double-precision floating point
|Warp cross-lane operations: | |
|`__HIP_ARCH_HAS_WARP_VOTE__` | hasWarpVote |Warp vote instructions (any, all)
|`__HIP_ARCH_HAS_WARP_BALLOT__` | hasWarpBallot |Warp ballot instructions
|`__HIP_ARCH_HAS_WARP_SHUFFLE__` | hasWarpShuffle |Warp shuffle operations (shfl\_\*)
|`__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__` | hasFunnelShift |Funnel shift two input words into one
|Sync: | |
|`__HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__` | hasThreadFenceSystem |threadfence\_system
|`__HIP_ARCH_HAS_SYNC_THREAD_EXT__` | hasSyncThreadsExt |syncthreads\_count, syncthreads\_and, syncthreads\_or
|Miscellaneous: | |
|`__HIP_ARCH_HAS_SURFACE_FUNCS__` | hasSurfaceFuncs |
|`__HIP_ARCH_HAS_3DGRID__` | has3dGrid | Grids and groups are 3D
|`__HIP_ARCH_HAS_DYNAMIC_PARALLEL__` | hasDynamicParallelism |
## Finding HIP
Makefiles can use the following syntax to conditionally provide a default HIP_PATH if one does not exist:
```
HIP_PATH ?= $(shell hipconfig --path)
```
## Identifying HIP Runtime
HIP can depend on either ROCclr or CUDA as its runtime.
- AMD platform
On AMD platform, HIP uses Radeon Open Compute Common Language Runtime, called ROCclr.
ROCclr is a virtual device interface through which HIP runtimes interact with different backends; this allows the runtimes to work on Linux as well as Windows without much effort.
- NVIDIA platform
On Nvidia platform, HIP is just a thin layer on top of CUDA.
On a non-AMD platform, the HIP runtime determines whether CUDA is available and can be used. If it is, HIP_PLATFORM is set to nvidia and the CUDA path is used underneath.
## hipLaunchKernelGGL
hipLaunchKernelGGL is a macro that serves as an alternative way to launch kernels. It accepts the launch configuration parameters (grid dims, group dims, dynamic shared size, stream) followed by a variable number of kernel arguments.
It can replace <<< >>>, if the user so desires.
## Compiler Options
hipcc is a portable compiler driver that will call nvcc or HIP-Clang (depending on the target system) and attach all required include and library options. It passes options through to the target compiler. Tools that call hipcc must ensure the compiler options are appropriate for the target compiler.
The `hipconfig` script may be helpful in identifying the target platform, compiler and runtime. It can also help set options appropriately.
### Compiler options supported on AMD platforms
Here are the main compiler options supported on AMD platforms by HIP-Clang.
| Option | Description |
| ------ | ----------- |
| --amdgpu-target=<gpu_arch> | [DEPRECATED] This option is being replaced by `--offload-arch=<target>`. Generate code for the given GPU target. Supported targets are gfx701, gfx801, gfx802, gfx803, gfx900, gfx906, gfx908, gfx1010, gfx1011, gfx1012, gfx1030, gfx1031. This option could appear multiple times on the same command line to generate a fat binary for multiple targets. |
| --fgpu-rdc | Generate relocatable device code, which allows kernels or device functions calling device functions in different translation units. |
| -ggdb | Equivalent to `-g` plus tuning for GDB. This is recommended when using ROCm's GDB to debug GPU code. |
| --gpu-max-threads-per-block=<num> | Generate code to support up to the specified number of threads per block. |
| -O<n> | Specify the optimization level. |
| --offload-arch=<target> | Specify the AMD GPU [target ID](https://clang.llvm.org/docs/ClangOffloadBundlerFileFormat.html#target-id). |
| -save-temps | Save the compiler generated intermediate files. |
| -v | Show the compilation steps. |
## Linking Issues
### Linking With hipcc
hipcc adds the necessary libraries for HIP as well as for the accelerator compiler (nvcc or AMD compiler). We recommend linking with hipcc since it automatically links the binary to the necessary HIP runtime libraries. It also has knowledge on how to link and to manage the GPU objects.
### -lm Option
hipcc adds -lm by default to the link command.
## Linking Code With Other Compilers
CUDA code often uses nvcc for accelerator code (defining and launching kernels, typically defined in .cu or .cuh files).
It also uses a standard compiler (g++) for the rest of the application. nvcc is a preprocessor that employs a standard host compiler (gcc) to generate the host code.
Code compiled using this tool can employ only the intersection of language features supported by both nvcc and the host compiler.
In some cases, you must take care to ensure the data types and alignment of the host compiler are identical to those of the device compiler. Only some host compilers are supported---for example, recent nvcc versions lack Clang host-compiler capability.
HIP-Clang generates both device and host code using the same Clang-based compiler. The code uses the same API as gcc, which allows code generated by different gcc-compatible compilers to be linked together. For example, code compiled using HIP-Clang can link with code compiled using "standard" compilers (such as gcc, ICC and Clang). Take care to ensure all compilers use the same standard C++ header and library formats.
### libc++ and libstdc++
hipcc links to libstdc++ by default. This provides better compatibility between g++ and HIP.
If you pass "--stdlib=libc++" to hipcc, hipcc will use the libc++ library. Generally, libc++ provides a broader set of C++ features while libstdc++ is the standard for more compilers (notably including g++).
When cross-linking C++ code, any C++ functions that use types from the C++ standard library (including std::string, std::vector and other containers) must use the same standard-library implementation. They include the following:
- Functions or kernels defined in HIP-Clang that are called from a standard compiler
- Functions defined in a standard compiler that are called from HIP-Clang.
Applications with these interfaces should use the default libstdc++ linking.
Applications which are compiled entirely with hipcc, and which benefit from advanced C++ features not supported in libstdc++, and which do not require portability to nvcc, may choose to use libc++.
### HIP Headers (hip_runtime.h, hip_runtime_api.h)
The hip_runtime.h and hip_runtime_api.h files define the types, functions and enumerations needed to compile a HIP program:
- hip_runtime_api.h: defines all the HIP runtime APIs (e.g., hipMalloc) and the types required to call them. A source file that is only calling HIP APIs but neither defines nor launches any kernels can include hip_runtime_api.h. hip_runtime_api.h uses no custom hc language features and can be compiled using a standard C++ compiler.
- hip_runtime.h: includes hip_runtime_api.h. It additionally provides the types and defines required to create and launch kernels. hip_runtime.h can be compiled using a standard C++ compiler but will expose a subset of the available functions.
CUDA has slightly different contents for these two files. In some cases you may need to convert hipified code to include the richer hip_runtime.h instead of hip_runtime_api.h.
### Using a Standard C++ Compiler
You can compile hip\_runtime\_api.h using a standard C or C++ compiler (e.g., gcc or ICC). The HIP include paths and defines (`__HIP_PLATFORM_AMD__` or `__HIP_PLATFORM_NVIDIA__`) must be passed to the standard compiler; hipconfig returns the necessary options:
```
> hipconfig --cpp_config
-D__HIP_PLATFORM_AMD__ -I/home/user1/hip/include
```
You can capture the hipconfig output and pass it to the standard compiler; below is a sample makefile syntax:
```
CPPFLAGS += $(shell $(HIP_PATH)/bin/hipconfig --cpp_config)
```
nvcc includes some headers by default. However, HIP does not include default headers, and instead all required files must be explicitly included.
Specifically, files that call HIP run-time APIs or define HIP kernels must explicitly include the appropriate HIP headers.
If the compilation process reports that it cannot find necessary APIs (for example, "error: identifier ‘hipSetDevice’ is undefined"),
ensure that the file includes hip_runtime.h (or hip_runtime_api.h, if appropriate).
The hipify-perl script automatically converts "cuda_runtime.h" to "hip_runtime.h," and it converts "cuda_runtime_api.h" to "hip_runtime_api.h", but it may miss nested headers or macros.
#### cuda.h
The HIP-Clang path provides an empty cuda.h file. Some existing CUDA programs include this file but don't require any of the functions.
### Choosing HIP File Extensions
Many existing CUDA projects use the ".cu" and ".cuh" file extensions to indicate code that should be run through the nvcc compiler.
For quick HIP ports, leaving these file extensions unchanged is often easier, as it minimizes the work required to change file names in the directory and #include statements in the files.
For new projects or ports which can be re-factored, we recommend the use of the extension ".hip.cpp" for source files, and
".hip.h" or ".hip.hpp" for header files.
This indicates that the code is standard C++ code, but also provides a unique indication for make tools to
run hipcc when appropriate.
## Workarounds
### warpSize
Code should not assume a warp size of 32 or 64. See [Warp Cross-Lane Functions](hip_kernel_language.md#warp-cross-lane-functions) for information on how to write portable wave-aware code.
### Kernel launch with group size > 256
Kernel code should use ``` __attribute__((amdgpu_flat_work_group_size(<min>,<max>)))```.
For example:
```
__global__ void dot(double *a,double *b,const int n) __attribute__((amdgpu_flat_work_group_size(1, 512)))
```
## memcpyToSymbol
HIP support for hipMemcpyToSymbol is complete. This feature allows a kernel
to define a device-side data symbol which can be accessed on the host side. The symbol
can be in `__constant__` or device space.
Note that the symbol name needs to be encased in the HIP_SYMBOL macro, as shown in the code example below. This also applies to hipMemcpyFromSymbol, hipGetSymbolAddress, and hipGetSymbolSize.
For example:
Device Code:
```
#include<hip/hip_runtime.h>
#include<hip/hip_runtime_api.h>
#include<iostream>
#define HIP_ASSERT(status) \
assert(status == hipSuccess)
#define LEN 512
#define SIZE 2048
__constant__ int Value[LEN];
__global__ void Get(hipLaunchParm lp, int *Ad)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
Ad[tid] = Value[tid];
}
int main()
{
int *A, *B, *Ad;
A = new int[LEN];
B = new int[LEN];
for(unsigned i=0;i<LEN;i++)
{
A[i] = -1*i;
B[i] = 0;
}
HIP_ASSERT(hipMalloc((void**)&Ad, SIZE));
HIP_ASSERT(hipMemcpyToSymbol(HIP_SYMBOL(Value), A, SIZE, 0, hipMemcpyHostToDevice));
hipLaunchKernelGGL(Get, dim3(1,1,1), dim3(LEN,1,1), 0, 0, Ad);
HIP_ASSERT(hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost));
for(unsigned i=0;i<LEN;i++)
{
assert(A[i] == B[i]);
}
std::cout<<"Passed"<<std::endl;
}
```
## CU_POINTER_ATTRIBUTE_MEMORY_TYPE
To get a pointer's memory type in HIP/HIP-Clang, developers should use the hipPointerGetAttributes API. The first parameter of the API is a hipPointerAttribute_t, which has 'type' as a member variable. 'type' indicates whether the input pointer is allocated on the device or the host.
For example:
```
double * ptr;
hipMalloc(reinterpret_cast<void**>(&ptr), sizeof(double));
hipPointerAttribute_t attr;
hipPointerGetAttributes(&attr, ptr); /*attr.type will have value as hipMemoryTypeDevice*/
double* ptrHost;
hipHostMalloc(&ptrHost, sizeof(double));
hipPointerAttribute_t attr;
hipPointerGetAttributes(&attr, ptrHost); /*attr.type will have value as hipMemoryTypeHost*/
```
Please note, hipMemoryType enum values are different from cudaMemoryType enum values.
For example, on AMD platform, hipMemoryType is defined in hip_runtime_api.h,
```
typedef enum hipMemoryType {
hipMemoryTypeHost = 0, ///< Memory is physically located on host
hipMemoryTypeDevice = 1, ///< Memory is physically located on device. (see deviceId for specific device)
hipMemoryTypeArray = 2, ///< Array memory, physically located on device. (see deviceId for specific device)
hipMemoryTypeUnified = 3, ///< Not used currently
hipMemoryTypeManaged = 4 ///< Managed memory, automatically managed by the unified memory system
} hipMemoryType;
```
The CUDA toolkit defines cudaMemoryType as follows:
```
enum cudaMemoryType
{
cudaMemoryTypeUnregistered = 0, // Unregistered memory.
cudaMemoryTypeHost = 1, // Host memory.
cudaMemoryTypeDevice = 2, // Device memory.
cudaMemoryTypeManaged = 3, // Managed memory
}
```
In this case, memory type translation for hipPointerGetAttributes needs to be handled properly on nvidia platform to get the correct memory type in CUDA, which is done in the file nvidia_hip_runtime_api.h.
So in any HIP applications which use HIP APIs involving memory types, developers should use #ifdef in order to assign the correct enum values depending on Nvidia or AMD platform.
As an example, please see the code from the link,
github.com/ROCm-Developer-Tools/HIP/blob/develop/tests/catch/unit/memory/hipMemcpyParam2D.cc#L77-L96.
With the #ifdef condition, HIP APIs work as expected on both AMD and NVIDIA platforms.
## threadfence_system
Threadfence_system makes all device memory writes, all writes to mapped host memory, and all writes to peer memory visible to CPU and other GPU devices.
Some implementations can provide this behavior by flushing the GPU L2 cache.
HIP/HIP-Clang does not provide this functionality. As a workaround, users can set the environment variable `HSA_DISABLE_CACHE=1` to disable the GPU L2 cache. This will affect all accesses and for all kernels and so may have a performance impact.
### Textures and Cache Control
Compute programs sometimes use textures either to access dedicated texture caches or to use the texture-sampling hardware for interpolation and clamping. The former approach uses simple point samplers with linear interpolation, essentially only reading a single point. The latter approach uses the sampler hardware to interpolate and combine multiple samples. AMD hardware, as well as recent competing hardware, has a unified texture/L1 cache, so it no longer has a dedicated texture cache. But the nvcc path often caches global loads in the L2 cache, and some programs may benefit from explicit control of the L1 cache contents. We recommend the __ldg instruction for this purpose.
AMD compilers currently load all data into both the L1 and L2 caches, so __ldg is treated as a no-op.
We recommend the following for functional portability:
- For programs that use textures only to benefit from improved caching, use the __ldg instruction
- Programs that use texture object and reference APIs work well on HIP
## More Tips
### HIP Logging
On an AMD platform, set the AMD_LOG_LEVEL environment variable to log HIP application execution information.
The value of the setting controls the logging level:
```
enum LogLevel {
LOG_NONE = 0,
LOG_ERROR = 1,
LOG_WARNING = 2,
LOG_INFO = 3,
LOG_DEBUG = 4
};
```
The logging mask selects which types of functionality are logged during the execution of a HIP application.
It can be set to one of the following values,
```
enum LogMask {
LOG_API = 0x00000001, //!< API call
LOG_CMD = 0x00000002, //!< Kernel and Copy Commands and Barriers
LOG_WAIT = 0x00000004, //!< Synchronization and waiting for commands to finish
LOG_AQL = 0x00000008, //!< Decode and display AQL packets
LOG_QUEUE = 0x00000010, //!< Queue commands and queue contents
LOG_SIG = 0x00000020, //!< Signal creation, allocation, pool
LOG_LOCK = 0x00000040, //!< Locks and thread-safety code.
LOG_KERN = 0x00000080, //!< kernel creations and arguments, etc.
LOG_COPY = 0x00000100, //!< Copy debug
LOG_COPY2 = 0x00000200, //!< Detailed copy debug
LOG_RESOURCE = 0x00000400, //!< Resource allocation, performance-impacting events.
LOG_INIT = 0x00000800, //!< Initialization and shutdown
LOG_MISC = 0x00001000, //!< misc debug, not yet classified
LOG_AQL2 = 0x00002000, //!< Show raw bytes of AQL packet
LOG_CODE = 0x00004000, //!< Show code creation debug
LOG_CMD2 = 0x00008000, //!< More detailed command info, including barrier commands
LOG_LOCATION = 0x00010000, //!< Log message location
LOG_ALWAYS = 0xFFFFFFFF, //!< Log always, even if the mask flag is zero
};
```
### Debugging hipcc
To see the detailed commands that hipcc issues, set the environment variable HIPCC_VERBOSE to 1. Doing so will print to stderr the HIP-clang (or nvcc) commands that hipcc generates.
```
export HIPCC_VERBOSE=1
make
...
hipcc-cmd: /opt/hcc/bin/hcc -hc -I/opt/hcc/include -stdlib=libc++ -I../../../../hc/include -I../../../../include/amd_detail/cuda -I../../../../include -x c++ -I../../common -O3 -c backprop_cuda.cu
```
### What Does This Error Mean?
#### /usr/include/c++/v1/memory:5172:15: error: call to implicitly deleted default constructor of 'std::__1::bad_weak_ptr' throw bad_weak_ptr();
If you pass a ".cu" file, hcc will attempt to compile it as a CUDA language file. You must tell hcc that it's in fact a C++ file: use the "-x c++" option.
### Editor Highlighting
See the utils/vim or utils/gedit directories to add handy highlighting to hip files.
-468
Voir le fichier
@@ -1,468 +0,0 @@
# HIP C++ Feature
## C++ 11
### Rvalue References
```cpp
struct Y {
int x;
};
__device__ void do_something(Y &&val) { val.x += 1; }
__global__ void kernel() {
Y y{10};
// do_something(y); // does not compile since the argument is an lvalue
do_something(std::move(y));
}
int main() { kernel<<<1, 1>>>(); }
```
### Rvalue References for `*this`
```cpp
struct Sample {
__host__ __device__ void callMe() & { printf("Lval Func\n"); }
__host__ __device__ void callMe() && { printf("Rval Func\n"); }
};
__global__ void kernel() {
Sample s;
s.callMe(); // prints Lval Func
Sample().callMe(); // prints Rval Func
}
int main() { kernel<<<1, 1>>>(); }
```
### Variadic templates, Static Assertions, `auto` Variables
```cpp
template <typename T> __host__ __device__ T add(T val) { return val; }
template <typename T, typename... Targs>
__host__ __device__ T add(T val, Targs... pVal) {
static_assert(std::is_arithmetic<T>::value, "Not a valid type");
return val + add(pVal...);
}
template <typename T, typename... Targs>
__global__ void kernel(T *ptr, Targs... args) {
auto &&sum = add(args...);
*ptr = sum;
}
// Or something like
__device__ int &getX(int &x) { return ++x; }
__device__ int getY(int &x) { return x + 10; }
__global__ void kernel() {
int X = 0;
auto &&x = getX(X);
auto &&y = getY(X);
// Init with value or initializer list
auto val{10};
auto list = {10};
}
int main() { kernel<<<1, 1>>>(); }
```
### Non-static Data Member Initialization
```cpp
struct S {
int a = 1;
int b = 2;
};
__global__ void kernel() {
S s; // s.a == 1 and s.b == 2
}
int main() { kernel<<<1, 1>>>(); }
```
### Lambda Device Functions
```cpp
template <typename T> __global__ void kernel(T f) { f(); }
int main() {
auto func = [=] __device__() { printf("In Kernel\n"); };
kernel<<<1, 1>>>(func);
hipDeviceSynchronize();
}
```
### `decltype` Usage
```cpp
template <typename T> __device__ T ret() {
T x{0};
return x;
}
template <typename T> __global__ void kernel() {
decltype(ret<T>()) a;
int i = 0;
decltype(i) j = i + 1;
}
int main() { kernel<float><<<1, 1>>>(); }
```
### Default Template Arguments
```cpp
template <int N = 5> __global__ void kernel(int x) { x += N; }
int main() {
kernel<<<1, 1>>>(1);
kernel<-2><<<1, 1>>>(1);
}
```
### Template Alias
```cpp
template <typename T> struct Alloc {};
template <typename T, typename U> struct Vector {};
template <typename T> using V = Vector<T, Alloc<T>>;
template <typename T> __global__ void kernel(T x) { V<T> v; }
int main() { kernel<<<1, 1>>>(5); }
```
### Extern Template
```cpp
template <typename T> __global__ void kernel(T x) {}
extern template __global__ void kernel(long x);
int main() {
kernel<<<1, 1, 0, 0>>>(10); // will create a template specialization
// kernel<<<1,1,0,0>>>(10l); // looks for existing kernel<long>, causing
// linking to fail
}
```
### `nullptr` as a Keyword in Device Compiler
```cpp
__global__ void kernel() {
int *ptr = nullptr;
//...
}
int main() { kernel<<<1, 1>>>(); }
```
### Strongly Typed Enums
```cpp
enum class EnumVals { Red, Blue, Green };
__global__ void kernel() {
auto val = EnumVals::Red;
//...
}
int main() { kernel<<<1, 1>>>(); }
```
### Standardized Attribute Syntax
```cpp
[[deprecated]] __global__ void kernel() {
//...
}
int main() { kernel<<<1, 1>>>(); }
```
### `constexpr`
```cpp
struct S {
constexpr __device__ S(double v) : val(v) {}
constexpr __device__ double value() const { return val; }
private:
double val;
};
constexpr __device__ int factorial(int n) {
return n <= 1 ? 1 : (n * factorial(n - 1));
}
__global__ void kernel() {
constexpr S s(factorial(5));
constexpr double d = s.value();
// ...
}
int main() { kernel<<<1, 1>>>(); }
```
### `alignas` with Struct
```cpp
struct alignas(alignof(int)) S {
//...
};
__global__ void kernel() {
S s;
static_assert(alignof(S) == alignof(int), "they have the same alignment");
// check the alignment
}
int main() { kernel<<<1, 1>>>(); }
```
### Delegating Constructors
```cpp
struct S {
private:
int val;
public:
__device__ S(int v) : val(v) {}
__device__ S() : S(42) {}
};
__global__ void kernel() { S s{}; }
int main() { kernel<<<1, 1>>>(); }
```
### Explicit Conversion Functions
```cpp
struct S {
private:
int val;
public:
__device__ S(int val) : val(val) {}
__device__ explicit operator int *() { return &val; }
};
__global__ void kernel() {
S s{0};
// if (s) { // compile error
// without the explicit function specifier then s would be converted to the
// pointer to s.val, which would be non-zero so always true.
//}
if ((int *)(s)) {
// this compiles but is likely not what the user intended
}
}
int main() { kernel<<<1, 1>>>(); }
```
### Unicode Character Types, Unicode String, Universal Character Literal
```cpp
__global__ void kernel() {
// can't print it since printf (GPU) does not support Unicode char arguments
char16_t a = u'y';
char32_t l = U'\U0001F34C';
auto *string = U"इस अनुवाद को करने से आपको क्या मिला?";
}
int main() { kernel<<<1, 1>>>(); }
```
### User Defined Literals
```cpp
__device__ long double operator"" _w(long double a) { return a; }
__device__ unsigned operator"" _w(char const *c) { return *c - '0'; }
__global__ void kernel() {
auto ld = 1.2_w; // calls operator "" _w(1.2L)
auto val = 2_w; // calls operator "" _w("2")
}
int main() { kernel<<<1, 1>>>(); }
```
### `default`/`delete` Functions
```cpp
struct S {
__device__ S() = default;
__device__ S &operator=(const S &) = delete;
};
__global__ void kernel() {
S s, other; // fine
// other = s; // compile error, function deleted
}
int main() { kernel<<<1, 1>>>(); }
```
### Friend Declaration
```cpp
struct Y {};
struct A {
__device__ A() = default;
friend Y;
// friend Z; // compile error since class or struct Z doesn't exist
friend class Z; // this is fine
friend void asdf(int); // functions can be declared without a definition
};
__global__ void kernel() { A a; }
int main() { kernel<<<1, 1>>>(); }
```
### Extended `sizeof`
```cpp
template <typename... Ts> __global__ void kernel(Ts... ts) {
auto size = sizeof...(ts);
// ...
}
int main() { kernel<<<1, 1>>>(); }
```
### Unrestricted Unions
```cpp
struct Point {
__device__ Point() {}
__device__ Point(int x, int y) : x_(x), y_(y) {}
int x_, y_;
};
union U {
int z;
double w;
Point p;
__device__ U() {}
__device__ U(const Point &pt) : p(pt) {}
__device__ U &operator=(const Point &pt) {
new (&p) Point(pt);
return *this;
}
};
__global__ void kernel() {
U u;
//...
}
int main() { kernel<<<1, 1>>>(); }
```
### Inline Namespaces
```cpp
namespace XX {
inline namespace YY {
struct Y {
int x;
};
} // namespace YY
struct X {
int a;
};
} // namespace XX
__global__ void kernel() {
XX::X x{};
XX::Y y{};
}
int main() { kernel<<<1, 1>>>(); }
```
### Range Based For-loop
```cpp
__global__ void kernel() {
for (auto &x : {1, 2, 3, 4, 5}) {
// ...
}
}
int main() { kernel<<<1, 1>>>(); }
```
### `override` Specifier
```cpp
struct Base {
int n;
__device__ Base(int v) : n(v + 1) {}
__device__ Base() : Base(10) {}
__device__ virtual ~Base() {}
__device__ virtual int get() { return n; }
};
struct Derived : public Base {
int n;
__device__ Derived(int v) : n(v) {}
__device__ int get() override { return n; }
__device__ ~Derived() {}
};
__global__ void kernel() {
Derived d(10);
//...
}
int main() { kernel<<<1, 1>>>(); }
```
### `noexcept` Keyword
```cpp
__global__ void kernel() noexcept {
int n;
//...
}
int main() { kernel<<<1, 1>>>(); }
```
### Consecutive Right Angle Brackets in Templates
```cpp
template <typename T> struct A { T a; };
template <typename T> struct B { T b; };
__global__ void kernel() {
A<B<int>> ab;
//...
}
int main() { kernel<<<1, 1>>>(); }
```
### Not Yet Documented
* Right Angled Brackets : http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2005/n1757.html
* Initializer List : http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2672.htm
* Solving SFINAE problem for expression : http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2634.html
* Forward Declaration of Enum : http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_defects.html#1206 http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2764.pdf
* Conditionally supported behavior : http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2004/n1627.pdf
* Inheriting Constructors : http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2540.htm
* Standard layout types : https://en.cppreference.com/w/cpp/named_req/StandardLayoutType
* Local and unnamed types as template arguments : http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2657.htm
* Minimal support for Garbage Collection : http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2670.htm
* Move special functions : https://en.cppreference.com/w/cpp/language/rule_of_three
* long long int
## C++14
### Not Yet Documented
* Tweak C++ contextual conversions : http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2012/n3323.pdf
* Binary literals : http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2012/n3472.pdf
* Functions to deduce return type : https://isocpp.org/files/papers/N3638.html
* Lambda capture changes : https://isocpp.org/files/papers/N3648.html
* Polymorphic lambda : https://isocpp.org/files/papers/N3649.html
* Variable template : https://en.cppreference.com/w/cpp/language/variable_template
* constexpr changes : https://isocpp.org/files/papers/N3652.html
* struct member initializer : http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2013/n3653.html
* clarifying mem allocation : http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2013/n3664.html
* sized dealloc : https://isocpp.org/files/papers/n3778.html
* deprecated attribute : http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2013/n3760.html
* digit separator : http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2013/n3781.pdf
-92
Voir le fichier
@@ -1,92 +0,0 @@
# ROCm Code Object tooling
ROCm compiler generated code objects (executables, object files, and shared
object libraries) can be examined and code objects extracted with the following
tools.
## roc-obj
High-level wrapper around low-level tooling described below. For a more
detailed overview, see the help text available with `roc-obj --help`.
### Examples:
#### Extract all ROCm code objects from a list of executables
roc-obj <executable>...
#### Extract all ROCm code objects from a list of executables, and disassemble them
roc-obj --disassemble <executable>...
# or
roc-obj -d <executable>...
#### Extract all ROCm code objects from a list of executables into dir/
roc-obj --outdir dir/ <executable>...
# or
roc-obj -o dir/ <executable>...
#### Extract only ROCm code objects matching regex over Target ID
roc-obj --target-id gfx9 <executable>...
# or
roc-obj -t gfx9 <executable>...
## Low-Level Tooling
### URI syntax:
ROCm Code Objects can be listed/accessed using the following URI syntax:
```
code_object_uri ::== file_uri | memory_uri
file_uri ::== file:// extract_file [ range_specifier ]
memory_uri ::== memory:// process_id range_specifier
range_specifier ::== [ # | ? ] offset= number & size= number
extract_file ::== URI_ENCODED_OS_FILE_PATH
process_id ::== DECIMAL_NUMBER
number ::== HEX_NUMBER | DECIMAL_NUMBER | OCTAL_NUMBER
```
Example: file://dir1/dir2/hello_world#offset=133&size=14472
memory://1234#offset=0x20000&size=3000
### List available ROCm Code Objects: roc-obj-ls
Use this tool to list available ROCm code objects. Code objects are listed by bundle number, entry ID, and URI syntax.
Usage: roc-obj-ls [-v|h] executable...
List the URIs of the code objects embedded in the specified host executables.
-v Verbose output. Adds column headers for more human readable format
-h Show this help message
### Extract ROCm Code Objects: roc-obj-extract
Extracts available ROCm code objects from specified URI.
Usage: roc-obj-extract [-o|v|h] URI...
- URIs can be read from STDIN, one per line.
- From the URIs specified, extracts code objects into files named: <executable_name>-[pid<number>]-offset<number>-size<number>.co
Options:
-o <path> Path for output. If "-" specified, code object is printed to STDOUT.
-v Verbose output (includes Entry ID).
-h Show this help message
Note: when specifying a URI argument to roc-obj-extract, if you copy and paste the output from roc-obj-ls, you need to escape the '&' character; otherwise your shell will interpret it as a request to run the command as a background process.
As an example, if roc-obj-ls generates a URI like this ```file://my_exe#offset=24576&size=46816```, you need to use the following argument to roc-obj-extract: ```file://my_exe#offset=24576\&size=46816```
### Examples:
#### Dump the ISA for gfx906:
roc-obj-ls -v <exe> | awk '/gfx906/{print $3}' | roc-obj-extract -o - | llvm-objdump -d - > <exe>.gfx906.isa
#### Check the e_flags of the gfx908 code object:
roc-obj-ls -v <exe> | awk '/gfx908/{print $3}' | roc-obj-extract -o - | llvm-readelf -h - | grep Flags
#### Disassemble the fourth code object:
roc-obj-ls <exe> | sed -n 4p | roc-obj-extract -o - | llvm-objdump -d -
#### Sort embedded code objects by size:
for uri in $(roc-obj-ls <exe>); do printf "%d: %s\n" "$(roc-obj-extract -o - "$uri" | wc -c)" "$uri"; done | sort -n
#### Compare disassembly of gfx803 and gfx900 code objects:
dis() { roc-obj-ls -v <exe> | grep "$1" | awk '{print $3}' | roc-obj-extract -o - | llvm-objdump -d -; }
diff <(dis gfx803) <(dis gfx900)
@@ -1,4 +1,4 @@
# HIP Deprecated APIs
# HIP Deprecated Runtime Functions
## HIP Context Management APIs
@@ -1,60 +1,4 @@
## Table of Contents
<!-- toc -->
- [Introduction](#introduction)
- [Function-Type Qualifiers](#function-type-qualifiers)
* [`__device__`](#__device__)
* [`__global__`](#__global__)
* [`__host__`](#__host__)
- [Calling `__global__` Functions](#calling-__global__-functions)
- [Kernel-Launch Example](#kernel-launch-example)
- [Variable-Type Qualifiers](#variable-type-qualifiers)
* [`__constant__`](#__constant__)
* [`__shared__`](#__shared__)
* [`__managed__`](#__managed__)
* [`__restrict__`](#__restrict__)
- [Built-In Variables](#built-in-variables)
* [Coordinate Built-Ins](#coordinate-built-ins)
* [warpSize](#warpsize)
- [Vector Types](#vector-types)
* [Short Vector Types](#short-vector-types)
* [dim3](#dim3)
- [Memory-Fence Instructions](#memory-fence-instructions)
- [Synchronization Functions](#synchronization-functions)
- [Math Functions](#math-functions)
* [Single Precision Mathematical Functions](#single-precision-mathematical-functions)
* [Double Precision Mathematical Functions](#double-precision-mathematical-functions)
* [Integer Intrinsics](#integer-intrinsics)
* [Floating-point Intrinsics](#floating-point-intrinsics)
- [Texture Functions](#texture-functions)
- [Surface Functions](#surface-functions)
- [Timer Functions](#timer-functions)
- [Atomic Functions](#atomic-functions)
- [Warp Cross-Lane Functions](#warp-cross-lane-functions)
* [Warp Vote and Ballot Functions](#warp-vote-and-ballot-functions)
* [Warp Shuffle Functions](#warp-shuffle-functions)
- [Cooperative Groups Functions](#cooperative-groups-functions)
- [Warp Matrix Functions](#warp-matrix-functions)
- [Independent Thread Scheduling](#independent-thread-scheduling)
- [Profiler Counter Function](#profiler-counter-function)
- [Assert](#assert)
- [Printf](#printf)
- [Device-Side Dynamic Global Memory Allocation](#device-side-dynamic-global-memory-allocation)
- [`__launch_bounds__`](#__launch_bounds__)
* [Compiler Impact](#compiler-impact)
* [CU and EU Definitions](#cu-and-eu-definitions)
* [Porting from CUDA __launch_bounds](#porting-from-cuda-__launch_bounds)
* [maxregcount](#maxregcount)
- [Register Keyword](#register-keyword)
- [Pragma Unroll](#pragma-unroll)
- [In-Line Assembly](#in-line-assembly)
- [C++ Support](#c-support)
- [Kernel Compilation](#kernel-compilation)
- [GFX Arch specific kernel](#gfx-arch-specific-kernel)
<!-- tocstop -->
## Introduction
# Kernel Language Syntax
HIP provides a C++ syntax that is suitable for compiling most code that commonly appears in compute kernels, including classes, namespaces, operator overloading, templates and more. Additionally, it defines other language features designed specifically to target accelerators, such as the following:
- A kernel-launch syntax that uses standard C++, resembles a function call and is portable to all HIP targets
@@ -75,7 +19,7 @@ Supported `__device__` functions are
- Executed on the device
- Called from the device only
The `__device__` keyword can combine with the host keyword (see [__host__](#host)).
The `__device__` keyword can combine with the host keyword (see {ref}`host_attr`).
### `__global__`
Supported `__global__` functions are
@@ -86,6 +30,7 @@ HIP `__global__` functions must have a `void` return type, and the first paramet
HIP lacks dynamic-parallelism support, so `__global__` functions cannot be called from the device.
(host_attr)=
### `__host__`
Supported `__host__` functions are
- Executed on the host
@@ -123,7 +68,7 @@ MyKernel<<<dim3(gridDim), dim3(groupDim), 0, 0>>> (a,b,c,n);
```
The hipLaunchKernelGGL macro always starts with the five parameters specified above, followed by the kernel arguments. HIPIFY tools optionally convert Cuda launch syntax to hipLaunchKernelGGL, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernelGGL parameters. The dim3 constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See [dim3](#dim3). The kernel uses the coordinate built-ins (thread*, block*, grid*) to determine coordinate index and coordinate bounds of the work item that’s currently executing. See [Coordinate Built-Ins](#coordinate-builtins).
The hipLaunchKernelGGL macro always starts with the five parameters specified above, followed by the kernel arguments. HIPIFY tools optionally convert CUDA launch syntax to hipLaunchKernelGGL, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernelGGL parameters. The dim3 constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See [dim3](#dim3). The kernel uses the coordinate built-ins (thread*, block*, grid*) to determine coordinate index and coordinate bounds of the work item that’s currently executing. See {ref}`coordinate_builtins`.
Please note, HIP does not support kernel launch with total work items defined in dimension with size gridDim x blockDim >= 2^32.
@@ -181,6 +126,7 @@ The `__restrict__` keyword tells the compiler that the associated memory pointer
## Built-In Variables
(coordinate_builtins)=
### Coordinate Built-Ins
Built-ins determine the coordinate of the active work item in the execution grid. They are defined in amd_hip_runtime.h (rather than being implicitly defined by the compiler).
In HIP, the coordinate built-in variable definitions are the same as in CUDA, for instance:
@@ -277,9 +223,9 @@ Following is the list of supported single precision mathematical functions.
| float frexpf ( float x, int* nptr ) <br><sub>Extract mantissa and exponent of a floating-point value.</sub> | ✓ | ✗ |
| float hypotf ( float x, float y ) <br><sub>Calculate the square root of the sum of squares of two arguments.</sub> | ✓ | ✓ |
| int ilogbf ( float x ) <br><sub>Compute the unbiased integer exponent of the argument.</sub> | ✓ | ✓ |
| __RETURN_TYPE<sup id="a1">[1](#f1)</sup> isfinite ( float a ) <br><sub>Determine whether argument is finite.</sub> | ✓ | ✓ |
| __RETURN_TYPE<sup>[1](#f1)</sup> isinf ( float a ) <br><sub>Determine whether argument is infinite.</sub> | ✓ | ✓ |
| __RETURN_TYPE<sup>[1](#f1)</sup> isnan ( float a ) <br><sub>Determine whether argument is a NaN.</sub> | ✓ | ✓ |
| __RETURN_TYPE[^f1] isfinite ( float a ) <br><sub>Determine whether argument is finite.</sub> | ✓ | ✓ |
| __RETURN_TYPE[^f1] isinf ( float a ) <br><sub>Determine whether argument is infinite.</sub> | ✓ | ✓ |
| __RETURN_TYPE[^f1] isnan ( float a ) <br><sub>Determine whether argument is a NaN.</sub> | ✓ | ✓ |
| float ldexpf ( float x, int exp ) <br><sub>Calculate the value of x ⋅ 2<sup>exp</sup>.</sub> | ✓ | ✓ |
| float log10f ( float x ) <br><sub>Calculate the base 10 logarithm of the input argument.</sub> | ✓ | ✓ |
| float log1pf ( float x ) <br><sub>Calculate the value of log<sub>e</sub>( 1 + x ).</sub> | ✓ | ✓ |
@@ -294,7 +240,7 @@ Following is the list of supported single precision mathematical functions.
| float remquof ( float x, float y, int* quo ) <br><sub>Compute single-precision floating-point remainder and part of quotient.</sub> | ✓ | ✗ |
| float roundf ( float x ) <br><sub>Round to nearest integer value in floating-point.</sub> | ✓ | ✓ |
| float scalbnf ( float x, int n ) <br><sub>Scale floating-point input by integer power of two.</sub> | ✓ | ✓ |
| __RETURN_TYPE<sup>[1](#f1)</sup> signbit ( float a ) <br><sub>Return the sign bit of the input.</sub> | ✓ | ✓ |
| __RETURN_TYPE[^f1] signbit ( float a ) <br><sub>Return the sign bit of the input.</sub> | ✓ | ✓ |
| void sincosf ( float x, float* sptr, float* cptr ) <br><sub>Calculate the sine and cosine of the first input argument.</sub> | ✓ | ✗ |
| float sinf ( float x ) <br><sub>Calculate the sine of the input argument.</sub> | ✓ | ✓ |
| float sinhf ( float x ) <br><sub>Calculate the hyperbolic sine of the input argument.</sub> | ✓ | ✓ |
@@ -338,8 +284,7 @@ Following is the list of supported single precision mathematical functions.
| float ynf ( int n, float x ) <br><sub>Calculate the value of the Bessel function of the second kind of order n for the input argument.</sub> | ✓ | ✓ |
<sub><b id="f1"><sup>[1]</sup></b> __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers.</sub> [](#a1)
[^f1]: __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers.
### Double Precision Mathematical Functions
Following is the list of supported double precision mathematical functions.
@@ -374,9 +319,9 @@ Following is the list of supported double precision mathematical functions.
| double frexp ( double x, int* nptr ) <br><sub>Extract mantissa and exponent of a floating-point value.</sub> | ✓ | ✗ |
| double hypot ( double x, double y ) <br><sub>Calculate the square root of the sum of squares of two arguments.</sub> | ✓ | ✓ |
| int ilogb ( double x ) <br><sub>Compute the unbiased integer exponent of the argument.</sub> | ✓ | ✓ |
| __RETURN_TYPE<sup id="a2">[1](#f2)</sup> isfinite ( double a ) <br><sub>Determine whether argument is finite.</sub> | ✓ | ✓ |
| __RETURN_TYPE<sup>[1](#f2)</sup> isinf ( double a ) <br><sub>Determine whether argument is infinite.</sub> | ✓ | ✓ |
| __RETURN_TYPE<sup>[1](#f2)</sup> isnan ( double a ) <br><sub>Determine whether argument is a NaN.</sub> | ✓ | ✓ |
| __RETURN_TYPE[^f1] isfinite ( double a ) <br><sub>Determine whether argument is finite.</sub> | ✓ | ✓ |
| __RETURN_TYPE[^f1] isinf ( double a ) <br><sub>Determine whether argument is infinite.</sub> | ✓ | ✓ |
| __RETURN_TYPE[^f1] isnan ( double a ) <br><sub>Determine whether argument is a NaN.</sub> | ✓ | ✓ |
| double ldexp ( double x, int exp ) <br><sub>Calculate the value of x ⋅ 2<sup>exp</sup>.</sub> | ✓ | ✓ |
| double log ( double x ) <br><sub>Calculate the base e logarithm of the input argument.</sub> | ✓ | ✓ |
| double log10 ( double x ) <br><sub>Calculate the base 10 logarithm of the input argument.</sub> | ✓ | ✓ |
@@ -391,7 +336,7 @@ Following is the list of supported double precision mathematical functions.
| double remquo ( double x, double y, int* quo ) <br><sub>Compute double-precision floating-point remainder and part of quotient.</sub> | ✓ | ✗ |
| double round ( double x ) <br><sub>Round to nearest integer value in floating-point.</sub> | ✓ | ✓ |
| double scalbn ( double x, int n ) <br><sub>Scale floating-point input by integer power of two.</sub> | ✓ | ✓ |
| __RETURN_TYPE<sup>[1](#f2)</sup> signbit ( double a ) <br><sub>Return the sign bit of the input.</sub> | ✓ | ✓ |
| __RETURN_TYPE[^f1] signbit ( double a ) <br><sub>Return the sign bit of the input.</sub> | ✓ | ✓ |
| double sin ( double x ) <br><sub>Calculate the sine of the input argument.</sub> | ✓ | ✓ |
| void sincos ( double x, double* sptr, double* cptr ) <br><sub>Calculate the sine and cosine of the first input argument.</sub> | ✓ | ✗ |
| double sinh ( double x ) <br><sub>Calculate the hyperbolic sine of the input argument.</sub> | ✓ | ✓ |
@@ -432,10 +377,6 @@ Following is the list of supported double precision mathematical functions.
| double y1 ( double x ) <br><sub>Calculate the value of the Bessel function of the second kind of order 1 for the input argument.</sub> | ✓ | ✓ |
| double yn ( int n, double x ) <br><sub>Calculate the value of the Bessel function of the second kind of order n for the input argument.</sub> | ✓ | ✓ |
<sub><b id="f2"><sup>[1]</sup></b> __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers.</sub> [](#a2)
### Integer Intrinsics
Following is the list of supported integer intrinsics. Note that intrinsics are supported on device only.
@@ -447,15 +388,15 @@ Following is the list of supported integer intrinsics. Note that intrinsics are
| unsigned int __clz(unsigned int x) <br><sub>Return the number of consecutive high-order zero bits in 32 bit unsigned integer.</sub> |
| int __clzll ( long long int x ) <br><sub>Count the number of consecutive high-order zero bits in a 64 bit integer.</sub> |
| unsigned int __clzll(long long int x) <br><sub>Return the number of consecutive high-order zero bits in 64 bit signed integer.</sub> |
| unsigned int __ffs(unsigned int x) <br><sub>Find the position of least significant bit set to 1 in a 32 bit unsigned integer.<sup id="a3">[1](#f3)</sup></sub> |
| unsigned int __ffs(unsigned int x) <br><sub>Find the position of least significant bit set to 1 in a 32 bit unsigned integer.[^f3]</sub> |
| unsigned int __ffs(int x) <br><sub>Find the position of least significant bit set to 1 in a 32 bit signed integer.</sub> |
| unsigned int __ffsll(unsigned long long int x) <br><sub>Find the position of least significant bit set to 1 in a 64 bit unsigned integer.<sup>[1](#f3)</sup></sub> |
| unsigned int __ffsll(unsigned long long int x) <br><sub>Find the position of least significant bit set to 1 in a 64 bit unsigned integer.[^f3]</sub> |
| unsigned int __ffsll(long long int x) <br><sub>Find the position of least significant bit set to 1 in a 64 bit signed integer.</sub> |
| unsigned int __popc ( unsigned int x ) <br><sub>Count the number of bits that are set to 1 in a 32 bit integer.</sub> |
| unsigned int __popcll ( unsigned long long int x )<br><sub>Count the number of bits that are set to 1 in a 64 bit integer.</sub> |
| int __mul24 ( int x, int y )<br><sub>Multiply two 24bit integers.</sub> |
| unsigned int __umul24 ( unsigned int x, unsigned int y )<br><sub>Multiply two 24bit unsigned integers.</sub> |
<sub><b id="f3"><sup>[1]</sup></b>
<sub>[^f3]
The HIP-Clang implementation of __ffs() and __ffsll() contains code to add a constant +1 to produce the ffs result format.
For the cases where this overhead is not acceptable and programmer is willing to specialize for the platform,
HIP-Clang provides __lastbit_u32_u32(unsigned int input) and __lastbit_u32_u64(unsigned long long int input).
@@ -612,6 +553,7 @@ Towards this end, HIP has four extra functions to help developers more precisely
- `double safeAtomicAdd(double* address, double val)`
- These functions will always produce safe atomic RMW operations, even when `-munsafe-fp-atomics` is set
(warp_cross_lane_functions)=
## Warp Cross-Lane Functions
Warp cross-lane functions operate across all lanes in a warp. The hardware guarantees that all warp lanes will execute in lockstep, so additional synchronization is unnecessary, and the instructions use no shared memory.
+36 -73
Voir le fichier
@@ -1,43 +1,6 @@
# FAQ
# Frequently asked questions
<!-- toc -->
- [What APIs and features does HIP support?](#what-apis-and-features-does-hip-support)
- [What is not supported?](#what-is-not-supported)
* [Runtime/Driver API features](#runtimedriver-api-features)
* [Kernel language features](#kernel-language-features)
- [Is HIP a drop-in replacement for CUDA?](#is-hip-a-drop-in-replacement-for-cuda)
- [What specific version of CUDA does HIP support?](#what-specific-version-of-cuda-does-hip-support)
- [What libraries does HIP support?](#what-libraries-does-hip-support)
- [How does HIP compare with OpenCL?](#how-does-hip-compare-with-opencl)
- [How does porting CUDA to HIP compare to porting CUDA to OpenCL?](#how-does-porting-cuda-to-hip-compare-to-porting-cuda-to-opencl)
- [What hardware does HIP support?](#what-hardware-does-hip-support)
- [Do HIPIFY tools automatically convert all source code?](#do-hipify-tools-automatically-convert-all-source-code)
- [What is NVCC?](#what-is-nvcc)
- [What is HIP-Clang?](#what-is-hip-clang)
- [Why use HIP rather than supporting CUDA directly?](#why-use-hip-rather-than-supporting-cuda-directly)
- [Can I develop HIP code on an Nvidia CUDA platform?](#can-i-develop-hip-code-on-an-nvidia-cuda-platform)
- [Can I develop HIP code on an AMD HIP-Clang platform?](#can-i-develop-hip-code-on-an-amd-hip-clang-platform)
- [What is ROCclr?](#what-is-rocclr)
- [What is hipamd?](#what-is-hipamd)
- [Can a HIP binary run on both AMD and Nvidia platforms?](#can-a-hip-binary-run-on-both-amd-and-nvidia-platforms)
- [On HIP-Clang, can I link HIP code with host code compiled with another compiler such as gcc, icc, or clang?](#on-HIP-Clang-can-i-link-hip-code-with-host-code-compiled-with-another-compiler-such-as-gcc-icc-or-clang-)
- [HIP detected my platform (hip-clang vs nvcc) incorrectly - what should I do?](#hip-detected-my-platform-hip-clang-vs-nvcc-incorrectly---what-should-i-do)
- [Can I install both CUDA SDK and HIP-clang on same machine?](#can-i-install-both-cuda-sdk-and-hip-clang-on-same-machine)
- [On CUDA, can I mix CUDA code with HIP code?](#on-cuda-can-i-mix-cuda-code-with-hip-code)
- [How do I trace HIP application flow?](#how-do-i-trace-hip-application-flow)
- [What if HIP generates an error of "symbol multiply defined!" only on AMD machine?](#what-if-hip-generates-error-of-symbol-multiply-defined-only-on-amd-machine)
- [What is maximum limit of Generic kernel launching parameter?](#what-is-maximum-limit-of-generic-kernel-launching-parameter)
- [Are _shfl_*_sync functions supported on HIP platform?](#are-_shfl_*_sync-functions-supported-on-hip-platform)
- [How to create a guard for code that is specific to the host or the GPU?](#how-to-create-a-guard-for-code-that-is-specific-to-the-host-or-the-gpu)
- [Why _OpenMP is undefined when compiling with -fopenmp?](#why-_openmp-is-undefined-when-compiling-with--fopenmp)
- [Does the HIP-Clang compiler support extern shared declarations?](#does-the-hip-clang-compiler-support-extern-shared-declarations)
- [I have multiple HIP enabled devices and I am getting an error message hipErrorNoBinaryForGpu: Unable to find code object for all current devices?](#i-have-multiple-hip-enabled-devices-and-i-am-getting-an-error-message-hipErrorNoBinaryForGpu-unable-to-find-code-object-for-all-current-devices)
- [How to use per-thread default stream in HIP?](#how-to-use-per-thread-default-stream-in-hip)
- [How can I know the version of HIP?](#how-can-I-know-the-version-of-hip)
<!-- tocstop -->
### What APIs and features does HIP support?
## What APIs and features does HIP support?
HIP provides the following:
- Devices (hipSetDevice(), hipGetDeviceProperties(), etc.)
- Memory management (hipMalloc(), hipMemcpy(), hipFree(), etc.)
@@ -52,9 +15,9 @@ HIP provides the following:
The HIP API documentation describes each API and its limitations, if any, compared with the equivalent CUDA API.
### What is not supported?
## What is not supported?
#### Runtime/Driver API features
### Runtime/Driver API features
At a high-level, the following features are not supported:
- Textures (partial support available)
- Dynamic parallelism (CUDA 5.0)
@@ -65,20 +28,20 @@ At a high-level, the following features are not supported:
See the [API Support Table](CUDA_Runtime_API_functions_supported_by_HIP.md) for more detailed information.
#### Kernel language features
### Kernel language features
- C++-style device-side dynamic memory allocations (free, new, delete) (CUDA 4.0)
- Virtual functions, indirect functions and try/catch (CUDA 4.0)
- `__prof_trigger`
- PTX assembly (CUDA 4.0). HIP-Clang supports inline GCN assembly.
- Several kernel features are under development. See the [HIP Kernel Language](hip_kernel_language.md) for more information.
- Several kernel features are under development. See the {doc}`/reference/kernel_language` for more information.
### Is HIP a drop-in replacement for CUDA?
## Is HIP a drop-in replacement for CUDA?
No. HIP provides porting tools which do most of the work to convert CUDA code into portable C++ code that uses the HIP APIs.
Most developers will port their code from CUDA to HIP and then maintain the HIP version.
HIP code provides the same performance as native CUDA code, plus the benefits of running on AMD platforms.
### What specific version of CUDA does HIP support?
## What specific version of CUDA does HIP support?
HIP APIs and features do not map to a specific CUDA version. HIP provides a strong subset of the functionality provided in CUDA, and the hipify tools can scan code to identify any unsupported CUDA functions - this is useful for identifying the specific features required by a given application.
However, we can provide a rough summary of the features included in each CUDA SDK and the support level in HIP. Each bullet below lists the major new language features in each CUDA release and then indicates which are supported/not supported in HIP:
@@ -104,7 +67,7 @@ However, we can provide a rough summary of the features included in each CUDA SD
- CUDA 9.0 :
- Cooperative Launch, Surface Object Management, Version Management
### What libraries does HIP support?
## What libraries does HIP support?
HIP includes growing support for the four key math libraries using hipBLAS, hipFFT, hipRAND and hipSPARSE, as well as MIOpen for machine intelligence applications.
These offer pointer-based memory interfaces (as opposed to opaque buffers) and can be easily interfaced with other HIP applications.
The hip interfaces support both ROCm and CUDA paths, with familiar library interfaces.
@@ -117,7 +80,7 @@ The hip interfaces support both ROCm and CUDA paths, with familiar library inter
Additionally, some of the cublas routines are automatically converted to hipblas equivalents by the HIPIFY tools. These APIs use cublas or hcblas depending on the platform and replace the need to use conditional compilation.
### How does HIP compare with OpenCL?
## How does HIP compare with OpenCL?
Both AMD and Nvidia support OpenCL 1.2 on their devices so that developers can write portable code.
HIP offers several benefits over OpenCL:
- Developers can code in C++ as well as mix host and device C++ code in their source files. HIP C++ code can use templates, lambdas, classes and so on.
@@ -128,7 +91,7 @@ HIP offers several benefits over OpenCL:
- HIP provides device-level control over memory allocation and placement.
- HIP offers an offline compilation model.
### How does porting CUDA to HIP compare to porting CUDA to OpenCL?
## How does porting CUDA to HIP compare to porting CUDA to OpenCL?
Both HIP and CUDA are dialects of C++, and thus porting between them is relatively straightforward.
Both dialects support templates, classes, lambdas, and other C++ constructs.
As one example, the hipify-perl tool was originally a Perl script that used simple text conversions from CUDA to HIP.
@@ -139,39 +102,39 @@ There have been several tools that have attempted to convert CUDA into OpenCL, s
As a result, the OpenCL syntax is different from CUDA, and the porting tools have to perform some heroic transformations to bridge this gap.
The tools also struggle with more complex CUDA applications, in particular, those that use templates, classes, or other C++ features inside the kernel.
### What hardware does HIP support?
## What hardware does HIP support?
- For AMD platforms, see the [ROCm documentation](https://github.com/RadeonOpenCompute/ROCm#supported-gpus) for the list of supported platforms.
- For Nvidia platforms, HIP requires Unified Memory and should run on any device supporting CUDA SDK 6.0 or newer. We have tested the Nvidia Titan and Tesla K40.
### Do HIPIFY tools automatically convert all source code?
## Do HIPIFY tools automatically convert all source code?
Typically, HIPIFY tools can automatically convert almost all run-time code.
Most device code needs no additional conversion since HIP and CUDA have similar names for math and built-in functions.
The hipify-clang tool will automatically modify the kernel signature as needed (automating a step that used to be done manually).
Additional porting may be required to deal with architecture feature queries or with CUDA capabilities that HIP doesn't support.
In general, developers should always expect to perform some platform-specific tuning and optimization.
### What is NVCC?
## What is NVCC?
NVCC is Nvidia's compiler driver for compiling "CUDA C++" code into PTX or device code for Nvidia GPUs. It's a closed-source binary compiler that is provided by the CUDA SDK.
### What is HIP-Clang?
## What is HIP-Clang?
HIP-Clang is a Clang/LLVM based compiler to compile HIP programs which can run on AMD platform.
### Why use HIP rather than supporting CUDA directly?
## Why use HIP rather than supporting CUDA directly?
While HIP is a strong subset of CUDA, it is a subset. The HIP layer allows that subset to be clearly defined and documented.
Developers who code to the HIP API can be assured their code will remain portable across Nvidia and AMD platforms.
In addition, HIP defines portable mechanisms to query architectural features and supports a larger 64-bit wavesize which expands the return type for cross-lane functions like ballot and shuffle from 32-bit ints to 64-bit ints.
### Can I develop HIP code on an Nvidia CUDA platform?
## Can I develop HIP code on an Nvidia CUDA platform?
Yes. HIP's CUDA path only exposes the APIs and functionality that work on both NVCC and AMDGPU back-ends.
"Extra" APIs, parameters, and features which exist in CUDA but not in HIP-Clang will typically result in compile-time or run-time errors.
Developers need to use the HIP API for most accelerator code and bracket any CUDA-specific code with preprocessor conditionals.
Developers concerned about portability should, of course, run on both platforms, and should expect to tune for performance.
In some cases, CUDA has a richer set of modes for some APIs, and some C++ capabilities such as virtual functions - see the HIP API documentation for more details.
### Can I develop HIP code on an AMD HIP-Clang platform?
## Can I develop HIP code on an AMD HIP-Clang platform?
Yes. HIP's HIP-Clang path only exposes the APIs and functions that work on AMD runtime back ends. "Extra" APIs, parameters and features that appear in HIP-Clang but not CUDA will typically cause compile- or run-time errors. Developers must use the HIP API for most accelerator code and bracket any HIP-Clang specific code with preprocessor conditionals. Those concerned about portability should, of course, test their code on both platforms and should tune it for performance. Typically, HIP-Clang supports a more modern set of C++11/C++14/C++17 features, so HIP developers who want portability should be careful when using advanced C++ features on the HIP-Clang path.
### How to use HIP-Clang to build HIP programs?
## How to use HIP-Clang to build HIP programs?
The environment variable can be used to set compiler path:
- HIP_CLANG_PATH: path to hip-clang. When set, this variable lets hipcc use hip-clang for compilation/linking.
@@ -179,26 +142,26 @@ There is an alternative environment variable to set compiler path:
- HIP_ROCCLR_HOME: path to root directory of the HIP-ROCclr runtime. When set, this variable lets hipcc use hip-clang from the ROCclr distribution.
NOTE: If HIP_ROCCLR_HOME is set, there is no need to set HIP_CLANG_PATH since hipcc will deduce it from HIP_ROCCLR_HOME.
### What is ROCclr?
## What is ROCclr?
ROCclr (Radeon Open Compute Common Language Runtime) is a virtual device interface through which compute runtimes interact with backends such as ROCr on Linux, as well as PAL on Windows.
### What is HIPAMD?
## What is HIPAMD?
HIPAMD is a repository branched out from HIP, mainly the implementation for AMD GPU.
### Can a HIP binary run on both AMD and Nvidia platforms?
## Can a HIP binary run on both AMD and Nvidia platforms?
HIP is a source-portable language that can be compiled to run on either AMD or NVIDIA platform. HIP tools don't create a "fat binary" that can run on either platform, however.
### On HIP-Clang, can I link HIP code with host code compiled with another compiler such as gcc, icc, or clang ?
## On HIP-Clang, can I link HIP code with host code compiled with another compiler such as gcc, icc, or clang ?
Yes. HIP generates the object code which conforms to the GCC ABI, and also links with libstdc++. This means you can compile host code with the compiler of your choice and link the generated object code
with GPU code compiled with HIP. Larger projects often contain a mixture of accelerator code (initially written in CUDA with nvcc) and host code (compiled with gcc, icc, or clang). These projects
can convert the accelerator code to HIP, compile that code with hipcc, and link with object code from their preferred compiler.
### Can I install both CUDA SDK and HIP-Clang on the same machine?
## Can I install both CUDA SDK and HIP-Clang on the same machine?
Yes. You can use HIP_PLATFORM to choose which path hipcc targets. This configuration can be useful when using HIP to develop an application which is portable to both AMD and NVIDIA.
### HIP detected my platform (HIP-Clang vs nvcc) incorrectly - what should I do?
## HIP detected my platform (HIP-Clang vs nvcc) incorrectly - what should I do?
HIP will set the platform to AMD and use HIP-Clang as compiler if it sees that the AMD graphics driver is installed and has detected an AMD GPU.
Sometimes this isn't what you want - you can force HIP to recognize the platform by setting the following,
```
@@ -218,7 +181,7 @@ HIP_RUNTIME=nvcc
One symptom of this problem is the message "error: 'unknown error'(11) at square.hipref.cpp:56". This can occur if you have a CUDA installation on an AMD platform, and HIP incorrectly detects the platform as nvcc. HIP may be able to compile the application using the nvcc tool-chain but will generate this error at runtime since the platform does not have a CUDA device.
### On CUDA, can I mix CUDA code with HIP code?
## On CUDA, can I mix CUDA code with HIP code?
Yes. Most HIP data structures (hipStream_t, hipEvent_t) are typedefs to CUDA equivalents and can be intermixed. Both CUDA and HIP use integer device ids.
One notable exception is that hipError_t is a new type, and cannot be used where a cudaError_t is expected. In these cases, refactor the code to remove the expectation. Alternatively, hip_runtime_api.h defines functions which convert between the error code spaces:
@@ -228,30 +191,30 @@ hipCUResultTohipError
If platform portability is important, use #ifdef __HIP_PLATFORM_NVIDIA__ to guard the CUDA-specific code.
### How do I trace HIP application flow?
See the [HIP Logging](hip_logging.md) for more information.
## How do I trace HIP application flow?
See {doc}`/developer_guide/logging` for more information.
### What is maximum limit of kernel launching parameter?
## What is maximum limit of kernel launching parameter?
Product of block.x, block.y, and block.z should be less than 1024.
Please note, HIP does not support kernel launch with total work items defined in dimension with size gridDim x blockDim >= 2^32, so gridDim.x * blockDim.x, gridDim.y * blockDim.y and gridDim.z * blockDim.z are always less than 2^32.
### Are __shfl_*_sync functions supported on HIP platform?
## Are __shfl_*_sync functions supported on HIP platform?
__shfl_*_sync is not supported on HIP, but on the nvcc path with CUDA 9.0 and above, all shuffle calls get redirected to their sync versions.
### How to create a guard for code that is specific to the host or the GPU?
## How to create a guard for code that is specific to the host or the GPU?
The compiler defines the `__HIP_DEVICE_COMPILE__` macro only when compiling the code for the GPU. It could be used to guard code that is specific to the host or the GPU.
### Why _OpenMP is undefined when compiling with -fopenmp?
## Why _OpenMP is undefined when compiling with -fopenmp?
When compiling an OpenMP source file with `hipcc -fopenmp`, the compiler may generate an error if there is a reference to the `_OPENMP` macro. This is due to a limitation in hipcc that treats any source file type (e.g., `.cpp`) as a HIP translation unit, leading to some conflicts with the OpenMP language switch. If the OpenMP source file doesn't contain any HIP language construct, you could work around this issue by adding the `-x c++` switch to force the compiler to treat the file as regular C++. Another approach would be to guard the OpenMP code with `#ifdef _OPENMP` so that the code block is disabled when compiling for the GPU. The `__HIP_DEVICE_COMPILE__` macro defined by the HIP compiler when compiling GPU code could also be used for guarding code paths specific to the host or the GPU.
### Does the HIP-Clang compiler support extern shared declarations?
## Does the HIP-Clang compiler support extern shared declarations?
Previously, it was essential to declare dynamic shared memory using the HIP_DYNAMIC_SHARED macro for accuracy, as using static shared memory in the same kernel could result in overlapping memory ranges and data-races.
Now, the HIP-Clang compiler provides support for extern shared declarations, and the HIP_DYNAMIC_SHARED option is no longer required. You may use the standard extern definition:
extern __shared__ type var[];
### I have multiple HIP enabled devices and I am getting an error message hipErrorNoBinaryForGpu Unable to find code object for all current devices?
## I have multiple HIP enabled devices and I am getting an error message hipErrorNoBinaryForGpu Unable to find code object for all current devices?
This error message is seen due to the fact that you do not have a valid code object for all of your devices.
@@ -262,7 +225,7 @@ If you have a precompiled application/library (like rocblas, tensorflow etc) whi
- The application/library does not ship code object bundles for *all* of your device(s): in this case you need to recompile the application/library yourself with correct `--offload-arch`.
- The application/library does not ship code object bundles for *some* of your device(s), for example you have a system with an APU + GPU and the library does not ship code objects for your APU. For this you can set the environment variable `HIP_VISIBLE_DEVICES` to only enable GPUs for which code object is available. This will limit the GPUs visible to your application and allow it to run.
### How to use per-thread default stream in HIP?
## How to use per-thread default stream in HIP?
The per-thread default stream is an implicit stream local to both the thread and the current device. It does not do any implicit synchronization with other streams (like explicitly created streams), or default per-thread stream on other threads.
@@ -274,7 +237,7 @@ Once source is compiled with per-thread default stream enabled, all APIs will be
In addition, the per-thread default stream can be enabled per translation unit: users can compile some files with the feature enabled and others with it disabled. A translation unit with the feature enabled uses the per-thread default stream, and no implicit synchronization is done, while other modules use the legacy default stream, which does implicit synchronization.
### How can I know the version of HIP?
## How can I know the version of HIP?
The HIP version definition has been updated since the ROCm 4.2 release, as follows:
@@ -1,4 +1,4 @@
# HIP Programming Guide
# HIP Programming Manual
## Host Memory
@@ -189,5 +189,3 @@ Here is an example to create and use static libraries:
```
For more information, please see samples/2_Cookbook/15_static_library/host_functions and samples/2_Cookbook/15_static_library/device_functions.
## [Supported Clang Options](clang_options.md)