ROCMOPS-1956 - Push restructured code to hipamd
hipamd will have AMD's ROCCLR based HIP backend implementation Change-Id: Id7de9634519b4ce46fca71a1b61f3d5b1e3fc459
Этот коммит содержится в:
+223
-451
@@ -1,4 +1,5 @@
|
||||
# Copyright (C) 2016-2021 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
# Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
@@ -17,498 +18,269 @@
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
project(hip)
|
||||
cmake_minimum_required(VERSION 3.5.1)
|
||||
|
||||
# sample command for hip-rocclr runtime, you'll need to have rocclr built
|
||||
# For shared lib of hip-rocclr runtime
|
||||
# For release version
|
||||
# cmake -DCMAKE_PREFIX_PATH="$ROCclr_DIR/build;/opt/rocm/" -DCMAKE_INSTALL_PREFIX=</where/to/install/hip> ..
|
||||
# For debug version
|
||||
# cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_PREFIX_PATH="$ROCclr_DIR/build;/opt/rocm/" -DCMAKE_INSTALL_PREFIX=</where/to/install/hip> ..
|
||||
# For static lib of hip-rocclr runtime
|
||||
# For release version
|
||||
# cmake -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="$ROCclr_DIR/build;/opt/rocm/" -DCMAKE_INSTALL_PREFIX=</where/to/install/hip> ..
|
||||
# For debug version
|
||||
# cmake -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Debug -DCMAKE_PREFIX_PATH="$ROCclr_DIR/build;/opt/rocm/" -DCMAKE_INSTALL_PREFIX=</where/to/install/hip> ..
|
||||
# If you don't specify CMAKE_INSTALL_PREFIX, hip-rocclr runtime will be installed to "/opt/rocm/hip".
|
||||
include(GNUInstallDirs)
|
||||
|
||||
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
|
||||
if(ADDRESS_SANITIZER)
|
||||
set(ASAN_LINKER_FLAGS "-fsanitize=address")
|
||||
set(ASAN_COMPILER_FLAGS "-fno-omit-frame-pointer -fsanitize=address")
|
||||
|
||||
#############################
|
||||
# Options
|
||||
#############################
|
||||
option(BUILD_HIPIFY_CLANG "Enable building the CUDA->HIP converter" OFF)
|
||||
option(__HIP_ENABLE_PCH "Enable/Disable pre-compiled hip headers" ON)
|
||||
option(__HIP_ENABLE_RTC "Enable/Disable pre-processed hiprtc shared lib" ON)
|
||||
|
||||
# Map the __HIP_ENABLE_PCH option onto a 0/1 literal in _pchStatus.
# Start from the "disabled" value and flip it only when the option is on.
set(_pchStatus 0)
if(__HIP_ENABLE_PCH)
  set(_pchStatus 1)
endif()
|
||||
|
||||
#############################
|
||||
# Setup config generation
|
||||
#############################
|
||||
string(TIMESTAMP _timestamp UTC)
|
||||
set(_versionInfo "# Auto-generated by cmake\n")
|
||||
set(_buildInfo "# Auto-generated by cmake on ${_timestamp} UTC\n")
|
||||
# add_to_config(<configfile-var> <variable>)
#   <configfile-var>  name of the variable that accumulates the config text
#   <variable>        name of the variable to record
# Appends one "<variable>=<current value of variable>\n" line to the text held
# in <configfile-var>. Written as a macro, so the accumulator is modified
# directly in the caller's scope.
macro(add_to_config _configfile _variable)
  string(APPEND ${_configfile} "${_variable}=${${_variable}}\n")
endmacro()
|
||||
|
||||
#############################
|
||||
# Setup version information
|
||||
#############################
|
||||
# hipconfig is a perl script and is not trivially invokable on Windows.
|
||||
if(NOT WIN32)
  # Determine HIP_BASE_VERSION by asking the bundled hipconfig script.
  # HIP_PATH is cleared in the environment of this configure run first
  # (presumably so hipconfig does not consult another HIP installation -- TODO confirm).
  set(ENV{HIP_PATH} "")
  execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bin/hipconfig --version
                  OUTPUT_VARIABLE HIP_BASE_VERSION
                  OUTPUT_STRIP_TRAILING_WHITESPACE)

  # Split "<major>.<minor>[...]" into a list. The expansion is quoted so that an
  # empty hipconfig output still yields a well-formed string(REPLACE) call.
  string(REPLACE "." ";" VERSION_LIST "${HIP_BASE_VERSION}")

  # Both a major and a minor component are read below; fail with a readable
  # message instead of an obscure list(GET) / string(REPLACE) error.
  list(LENGTH VERSION_LIST _hip_version_component_count)
  if(_hip_version_component_count LESS 2)
    message(FATAL_ERROR
      "Could not determine the HIP version: '${CMAKE_CURRENT_SOURCE_DIR}/bin/hipconfig --version' "
      "returned '${HIP_BASE_VERSION}', expected '<major>.<minor>[...]'")
  endif()

  list(GET VERSION_LIST 0 HIP_VERSION_MAJOR)
  list(GET VERSION_LIST 1 HIP_VERSION_MINOR)
  # Default git date; the git-based logic later in the file overrides it when available.
  set(HIP_VERSION_GITDATE 0)
endif()
|
||||
|
||||
find_package(Git)
|
||||
|
||||
# FIXME: Two different version strings used.
|
||||
# Below we use UNIX commands, not compatible with Windows.
|
||||
if(GIT_FOUND AND (NOT WIN32))
|
||||
# get date information based on UTC
|
||||
# use the last two digits of year + week number + day in the week as HIP_VERSION_GITDATE
|
||||
# use the commit date, instead of build date
|
||||
# add xargs to remove strange trailing newline character
|
||||
execute_process(COMMAND ${GIT_EXECUTABLE} show -s --format=@%ct
|
||||
COMMAND xargs
|
||||
COMMAND date -f - --utc +%y%U%w
|
||||
RESULT_VARIABLE git_result
|
||||
OUTPUT_VARIABLE git_output
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
if(git_result EQUAL 0)
|
||||
set(HIP_VERSION_GITDATE ${git_output})
|
||||
endif()
|
||||
|
||||
# get commit short hash
|
||||
execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
RESULT_VARIABLE git_result
|
||||
OUTPUT_VARIABLE git_output
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
if(git_result EQUAL 0)
|
||||
set(HIP_VERSION_GITHASH ${git_output})
|
||||
endif()
|
||||
|
||||
# get commit count
|
||||
execute_process(COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
RESULT_VARIABLE git_result
|
||||
OUTPUT_VARIABLE git_output
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
if(git_result EQUAL 0)
|
||||
set(HIP_VERSION_GITCOUNT ${git_output})
|
||||
endif()
|
||||
|
||||
set(HIP_VERSION_PATCH ${HIP_VERSION_GITDATE}-${HIP_VERSION_GITHASH})
|
||||
|
||||
if(DEFINED ENV{ROCM_LIBPATCH_VERSION})
|
||||
set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}.$ENV{ROCM_LIBPATCH_VERSION})
|
||||
else()
|
||||
set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}-${HIP_VERSION_GITHASH})
|
||||
endif()
|
||||
else()
|
||||
# FIXME: Some parts depend on this being set.
|
||||
set(HIP_PACKAGING_VERSION_PATCH "0")
|
||||
endif()
|
||||
|
||||
## Debian package specific variables
|
||||
# Debian release tag: defaults to "local" and is replaced by the value of the
# CPACK_DEBIAN_PACKAGE_RELEASE environment variable whenever that is defined.
set(CPACK_DEBIAN_PACKAGE_RELEASE "local")
if(DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE})
  set(CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE})
endif()
|
||||
message (STATUS "Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}" )
|
||||
|
||||
## RPM package specific variables
|
||||
# RPM release tag: defaults to "local" and is replaced by the value of the
# CPACK_RPM_PACKAGE_RELEASE environment variable whenever that is defined.
set(CPACK_RPM_PACKAGE_RELEASE "local")
if(DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE})
  set(CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE})
endif()
|
||||
|
||||
## 'dist' breaks manual builds on debian systems due to empty Provides
|
||||
execute_process( COMMAND rpm --eval %{?dist}
|
||||
RESULT_VARIABLE PROC_RESULT
|
||||
OUTPUT_VARIABLE EVAL_RESULT
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE )
|
||||
|
||||
if ( PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "" )
|
||||
string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" )
|
||||
endif()
|
||||
message(STATUS "CPACK_RPM_PACKAGE_RELEASE: ${CPACK_RPM_PACKAGE_RELEASE}")
|
||||
|
||||
add_to_config(_versionInfo HIP_PACKAGING_VERSION_PATCH)
|
||||
add_to_config(_versionInfo CPACK_DEBIAN_PACKAGE_RELEASE)
|
||||
add_to_config(_versionInfo CPACK_RPM_PACKAGE_RELEASE)
|
||||
|
||||
add_to_config(_versionInfo HIP_VERSION_MAJOR)
|
||||
add_to_config(_versionInfo HIP_VERSION_MINOR)
|
||||
add_to_config(_versionInfo HIP_VERSION_PATCH)
|
||||
|
||||
set (HIP_LIB_VERSION_MAJOR ${HIP_VERSION_MAJOR})
|
||||
set (HIP_LIB_VERSION_MINOR ${HIP_VERSION_MINOR})
|
||||
# Library patch level: use ROCM_PATCH_VERSION when it is set to a true value,
# otherwise fall back to HIP_VERSION_PATCH.
# The variable is tested by name. Expanding it inside if() would make CMake
# re-interpret a non-numeric value as the name of another variable, and a
# value containing spaces or semicolons would split into several arguments.
if(ROCM_PATCH_VERSION)
  set(HIP_LIB_VERSION_PATCH ${ROCM_PATCH_VERSION})
else()
  set(HIP_LIB_VERSION_PATCH ${HIP_VERSION_PATCH})
endif()
|
||||
set (HIP_LIB_VERSION_STRING "${HIP_LIB_VERSION_MAJOR}.${HIP_LIB_VERSION_MINOR}.${HIP_LIB_VERSION_PATCH}")
|
||||
if (DEFINED ENV{ROCM_RPATH})
|
||||
set (CMAKE_INSTALL_RPATH "$ENV{ROCM_RPATH}")
|
||||
set (CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
|
||||
set (CMAKE_SKIP_BUILD_RPATH TRUE)
|
||||
endif ()
|
||||
|
||||
# overwrite HIP_VERSION_PATCH for packaging
|
||||
set(HIP_VERSION ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_PACKAGING_VERSION_PATCH})
|
||||
|
||||
# Remove when CI is updated
|
||||
if(HIP_PLATFORM STREQUAL "rocclr")
|
||||
set(HIP_PLATFORM "amd")
|
||||
endif()
|
||||
#############################
|
||||
# Configure variables
|
||||
#############################
|
||||
# Determine HIP_PLATFORM
|
||||
if(NOT DEFINED HIP_PLATFORM)
|
||||
if(NOT DEFINED ENV{HIP_PLATFORM})
|
||||
execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bin/hipconfig --platform
|
||||
OUTPUT_VARIABLE HIP_PLATFORM
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
if(NOT CMAKE_COMPILER_IS_GNUCC)
|
||||
if(BUILD_SHARED_LIBS)
|
||||
set(ASAN_LINKER_FLAGS "${ASAN_LINKER_FLAGS} -shared-libsan")
|
||||
else()
|
||||
set(HIP_PLATFORM $ENV{HIP_PLATFORM} CACHE STRING "HIP Platform")
|
||||
set(ASAN_LINKER_FLAGS "${ASAN_LINKER_FLAGS} -static-libsan")
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "HIP Platform: " ${HIP_PLATFORM})
|
||||
endif()
|
||||
|
||||
if(HIP_PLATFORM STREQUAL "nvidia")
|
||||
set(HIP_RUNTIME "cuda" CACHE STRING "HIP Runtime")
|
||||
set(HIP_COMPILER "nvcc" CACHE STRING "HIP Compiler")
|
||||
elseif(HIP_PLATFORM STREQUAL "amd")
|
||||
set(HIP_RUNTIME "rocclr" CACHE STRING "HIP Runtime")
|
||||
set(HIP_COMPILER "clang" CACHE STRING "HIP Compiler")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ASAN_COMPILER_FLAGS}")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ASAN_COMPILER_FLAGS}")
|
||||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ASAN_LINKER_FLAGS} -s")
|
||||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${ASAN_LINKER_FLAGS}")
|
||||
endif()
|
||||
|
||||
option(BUILD_SHARED_LIBS "Build the shared library" ON)
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
|
||||
find_package(ROCclr)
|
||||
|
||||
if(BUILD_SHARED_LIBS)
|
||||
add_library(amdhip64 SHARED)
|
||||
# Windows doesn't have a strip utility, so CMAKE_STRIP won't be set.
|
||||
if((CMAKE_BUILD_TYPE STREQUAL "Release") AND NOT ("${CMAKE_STRIP}" STREQUAL ""))
|
||||
add_custom_command(TARGET amdhip64 POST_BUILD COMMAND ${CMAKE_STRIP} $<TARGET_FILE:amdhip64>)
|
||||
endif()
|
||||
else()
|
||||
message(FATAL_ERROR "Unexpected HIP_PLATFORM: " ${HIP_PLATFORM})
|
||||
add_library(amdhip64 STATIC $<TARGET_OBJECTS:rocclr>)
|
||||
endif()
|
||||
|
||||
message(STATUS "HIP Runtime: " ${HIP_RUNTIME})
|
||||
message(STATUS "HIP Compiler: " ${HIP_COMPILER})
|
||||
set_target_properties(amdhip64 PROPERTIES
|
||||
CXX_STANDARD 14
|
||||
CXX_STANDARD_REQUIRED ON
|
||||
CXX_EXTENSIONS OFF
|
||||
POSITION_INDEPENDENT_CODE ON
|
||||
# Workaround for many places in the HIP project
|
||||
# having hardcoded references to build/lib/libamdhip64.so
|
||||
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
|
||||
|
||||
add_to_config(_buildInfo HIP_RUNTIME)
|
||||
add_to_config(_buildInfo HIP_COMPILER)
|
||||
|
||||
# Set default build type
|
||||
if(NOT CMAKE_BUILD_TYPE)
|
||||
set(CMAKE_BUILD_TYPE "Release")
|
||||
endif()
|
||||
|
||||
# Determine HIP install path
|
||||
if (UNIX)
|
||||
set(HIP_DEFAULT_INSTALL_PREFIX "/opt/rocm/hip")
|
||||
endif()
|
||||
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
|
||||
set(CMAKE_INSTALL_PREFIX ${HIP_DEFAULT_INSTALL_PREFIX} CACHE PATH "Installation path for HIP" FORCE)
|
||||
endif()
|
||||
|
||||
if(DEV_LOG_ENABLE MATCHES "yes")
|
||||
add_definitions(-DDEV_LOG_ENABLE)
|
||||
endif()
|
||||
|
||||
# Set default install path as "/opt/rocm/hip", can override the path from cmake build.
|
||||
set(CPACK_INSTALL_PREFIX ${HIP_DEFAULT_INSTALL_PREFIX} CACHE PATH "Package Installation path for HIP")
|
||||
|
||||
if(IS_ABSOLUTE ${CMAKE_INSTALL_PREFIX})
|
||||
message(STATUS "HIP will be installed in: " ${CMAKE_INSTALL_PREFIX})
|
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||
set_target_properties(amdhip64 PROPERTIES OUTPUT_NAME "amdhip64")
|
||||
else()
|
||||
message(FATAL_ERROR "Don't know where to install HIP. Please specify absolute path using -DCMAKE_INSTALL_PREFIX")
|
||||
set_target_properties(amdhip64 PROPERTIES OUTPUT_NAME "amdhip32")
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED ROCM_PATH )
|
||||
set ( ROCM_PATH "/opt/rocm" CACHE STRING "Default ROCM installation directory." )
|
||||
endif ()
|
||||
message (STATUS "ROCM Installation path(ROCM_PATH): ${ROCM_PATH}")
|
||||
|
||||
# set the installation path for the installer package
|
||||
set(CPACK_SET_DESTDIR ON CACHE BOOL "Installer package will install hip to CMAKE_INSTALL_PREFIX instead of CPACK_PACKAGING_INSTALL_PREFIX")
|
||||
if (NOT CPACK_SET_DESTDIR)
|
||||
set(CPACK_PACKAGING_INSTALL_PREFIX "/opt/rocm/hip" CACHE PATH "Default installation path of hcc installer package")
|
||||
endif (NOT CPACK_SET_DESTDIR)
|
||||
|
||||
#############################
|
||||
# Build steps
|
||||
#############################
|
||||
set(BIN_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/bin)
|
||||
set(LIB_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib)
|
||||
set(INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/include)
|
||||
set(CONFIG_PACKAGE_INSTALL_DIR ${LIB_INSTALL_DIR}/cmake/hip)
|
||||
set(CONFIG_LANG_PACKAGE_INSTALL_DIR ${LIB_INSTALL_DIR}/cmake/hip-lang)
|
||||
|
||||
# Build clang hipify if enabled
|
||||
if (BUILD_HIPIFY_CLANG)
|
||||
add_subdirectory(hipify-clang)
|
||||
# Disable versioning for Windows
|
||||
# as currently HIP_LIB_VERSION_STRING and HIP_LIB_VERSION_MAJOR
|
||||
# are not being populated
|
||||
if(NOT WIN32)
|
||||
if(BUILD_SHARED_LIBS)
|
||||
set_target_properties(amdhip64 PROPERTIES
|
||||
VERSION ${HIP_LIB_VERSION_STRING}
|
||||
SOVERSION ${HIP_LIB_VERSION_MAJOR})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Workaround for current versioning logic not being compatible with Windows
|
||||
target_sources(amdhip64 PRIVATE
|
||||
src/cl_gl.cpp
|
||||
src/cl_lqdflash_amd.cpp
|
||||
src/fixme.cpp
|
||||
src/hip_activity.cpp
|
||||
src/hip_code_object.cpp
|
||||
src/hip_context.cpp
|
||||
src/hip_device_runtime.cpp
|
||||
src/hip_device.cpp
|
||||
src/hip_error.cpp
|
||||
src/hip_event.cpp
|
||||
src/hip_fatbin.cpp
|
||||
src/hip_global.cpp
|
||||
src/hip_graph_internal.cpp
|
||||
src/hip_graph.cpp
|
||||
src/hip_hmm.cpp
|
||||
src/hip_intercept.cpp
|
||||
src/hip_memory.cpp
|
||||
src/hip_module.cpp
|
||||
src/hip_peer.cpp
|
||||
src/hip_platform.cpp
|
||||
src/hip_profile.cpp
|
||||
src/hip_rtc.cpp
|
||||
src/hip_stream_ops.cpp
|
||||
src/hip_stream.cpp
|
||||
src/hip_surface.cpp
|
||||
src/hip_texture.cpp)
|
||||
|
||||
if(WIN32)
|
||||
set(HIP_VERSION_MAJOR 0)
|
||||
set(HIP_VERSION_MINOR 0)
|
||||
set(HIP_VERSION_GITDATE 0)
|
||||
target_sources(amdhip64 PRIVATE
|
||||
src/cl_d3d9.cpp
|
||||
src/cl_d3d10.cpp
|
||||
src/cl_d3d11.cpp)
|
||||
endif()
|
||||
|
||||
# Generate hip_version.h
|
||||
set(_versionInfoHeader
|
||||
"// Auto-generated by cmake\n
|
||||
#ifndef HIP_VERSION_H
|
||||
#define HIP_VERSION_H\n
|
||||
#define HIP_VERSION_MAJOR ${HIP_VERSION_MAJOR}
|
||||
#define HIP_VERSION_MINOR ${HIP_VERSION_MINOR}
|
||||
#define HIP_VERSION_PATCH ${HIP_VERSION_GITDATE}
|
||||
#define HIP_VERSION (HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR)\n
|
||||
#define __HIP_HAS_GET_PCH ${_pchStatus}\n
|
||||
#endif\n
|
||||
")
|
||||
file(WRITE "${PROJECT_BINARY_DIR}/include/hip/hip_version.h" ${_versionInfoHeader})
|
||||
|
||||
if(HIP_RUNTIME STREQUAL "rocclr")
|
||||
add_subdirectory(src/hipamd)
|
||||
if(BUILD_SHARED_LIBS)
|
||||
if(WIN32)
|
||||
target_sources(amdhip64 PRIVATE src/amdhip.def)
|
||||
else()
|
||||
target_link_libraries(amdhip64 PRIVATE "-Wl,--version-script=${CMAKE_CURRENT_LIST_DIR}/src/hip_hcc.map.in")
|
||||
set_target_properties(amdhip64 PROPERTIES LINK_DEPENDS "${CMAKE_CURRENT_LIST_DIR}/src/hip_hcc.map.in")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Generate .hipInfo
|
||||
file(WRITE "${PROJECT_BINARY_DIR}/.hipInfo" ${_buildInfo})
|
||||
target_include_directories(amdhip64
|
||||
PRIVATE
|
||||
${PROJECT_SOURCE_DIR}/src/hipamd/include
|
||||
${PROJECT_SOURCE_DIR}/include
|
||||
${PROJECT_BINARY_DIR}/include)
|
||||
|
||||
# Generate .hipVersion
|
||||
file(WRITE "${PROJECT_BINARY_DIR}/.hipVersion" ${_versionInfo})
|
||||
target_compile_definitions(amdhip64 PRIVATE __HIP_PLATFORM_AMD__)
|
||||
|
||||
# Build doxygen documentation
|
||||
find_program(DOXYGEN_EXE doxygen)
|
||||
if(DOXYGEN_EXE)
|
||||
add_custom_target(doc COMMAND HIP_PATH=${CMAKE_CURRENT_SOURCE_DIR} ${DOXYGEN_EXE} ${CMAKE_CURRENT_SOURCE_DIR}/docs/doxygen-input/doxy.cfg
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/docs)
|
||||
target_link_libraries(amdhip64 PRIVATE ${CMAKE_DL_LIBS})
|
||||
# Additional dependencies for hipRTC
|
||||
if(WIN32)
|
||||
target_link_libraries(amdhip64 PRIVATE Dbghelp.lib)
|
||||
endif()
|
||||
|
||||
# Note in static case we cannot link against rocclr.
|
||||
# If we would, we'd also have to export rocclr and have hipcc pass it to the linker.
|
||||
if(BUILD_SHARED_LIBS)
|
||||
target_link_libraries(amdhip64 PRIVATE rocclr)
|
||||
else()
|
||||
target_compile_definitions(amdhip64 PRIVATE $<TARGET_PROPERTY:rocclr,COMPILE_DEFINITIONS>)
|
||||
target_include_directories(amdhip64 PRIVATE $<TARGET_PROPERTY:rocclr,INCLUDE_DIRECTORIES>)
|
||||
endif()
|
||||
|
||||
# Short-Term solution for pre-compiled headers for online compilation
|
||||
# Enable pre compiled header
|
||||
if(__HIP_ENABLE_PCH)
|
||||
find_package(LLVM REQUIRED CONFIG
|
||||
PATHS
|
||||
/opt/rocm/llvm)
|
||||
# find_package(LLVM) returns the lib/cmake/llvm location. We require the root.
|
||||
set(HIP_LLVM_ROOT "${LLVM_DIR}/../../..")
|
||||
|
||||
# execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_embed_pch.sh ${PROJECT_BINARY_DIR}/include ${PROJECT_SOURCE_DIR}/include ${HIP_LLVM_ROOT}" COMMAND_ECHO STDERR RESULT_VARIABLE EMBED_PCH_RC)
|
||||
execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../../bin/hip_embed_pch.sh ${PROJECT_BINARY_DIR}/include ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/src/hipamd/include ${HIP_LLVM_ROOT}" COMMAND_ECHO STDERR RESULT_VARIABLE EMBED_PCH_RC)
|
||||
if (EMBED_PCH_RC AND NOT EMBED_PCH_RC EQUAL 0)
|
||||
message(FATAL_ERROR "Failed to embed PCH")
|
||||
endif()
|
||||
|
||||
target_compile_definitions(amdhip64 PRIVATE __HIP_ENABLE_PCH)
|
||||
target_sources(amdhip64 PRIVATE ${CMAKE_BINARY_DIR}/hip_pch.o)
|
||||
endif()
|
||||
|
||||
# Enable preprocessed hiprtc-builtins library
|
||||
if(__HIP_ENABLE_RTC)
|
||||
find_package(LLVM REQUIRED CONFIG
|
||||
PATHS
|
||||
/opt/rocm/llvm)
|
||||
# find_package(LLVM) returns the lib/cmake/llvm location. We require the root.
|
||||
set(HIP_LLVM_ROOT "${LLVM_DIR}/../../..")
|
||||
|
||||
if(WIN32)
|
||||
set(HIPRTC_LIB_NAME "hiprtc-builtins64_${HIP_LIB_VERSION_MAJOR}${HIP_LIB_VERSION_MINOR}.dll")
|
||||
else()
|
||||
set(HIPRTC_LIB_NAME "libhiprtc-builtins.so.${HIP_LIB_VERSION_MAJOR}.${HIP_LIB_VERSION_MINOR}")
|
||||
endif()
|
||||
execute_process(
|
||||
COMMAND sh -c "mkdir -p ${PROJECT_BINARY_DIR}/lib; ${CMAKE_CURRENT_SOURCE_DIR}/../../bin/hip_embed_pch.sh ${PROJECT_BINARY_DIR}/include ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/src/hipamd/include ${HIP_LLVM_ROOT} -r ${PROJECT_BINARY_DIR}/lib/${HIPRTC_LIB_NAME}"
|
||||
COMMAND_ECHO STDERR
|
||||
RESULT_VARIABLE EMBED_RTC_RC
|
||||
)
|
||||
if (EMBED_RTC_RC AND NOT EMBED_RTC_RC EQUAL 0)
|
||||
message(FATAL_ERROR "Failed to create hiprtc shared lib")
|
||||
endif()
|
||||
install(FILES ${PROJECT_BINARY_DIR}/lib/${HIPRTC_LIB_NAME} DESTINATION lib)
|
||||
endif()
|
||||
|
||||
#############################
|
||||
# Install steps
|
||||
# Profiling API support
|
||||
#############################
|
||||
# Generate profiling API macros/structures header
|
||||
set(PROF_API_STR "${PROJECT_BINARY_DIR}/include/hip/amd_detail/hip_prof_str.h")
|
||||
set(PROF_API_HDR "${PROJECT_SOURCE_DIR}/include/hip/hip_runtime_api.h")
|
||||
set(PROF_API_SRC "${CMAKE_CURRENT_SOURCE_DIR}/src")
|
||||
set(PROF_API_GEN "${CMAKE_CURRENT_SOURCE_DIR}/src/hip_prof_gen.py")
|
||||
set(PROF_API_LOG "${PROJECT_BINARY_DIR}/hip_prof_gen.log.txt")
|
||||
|
||||
# Install .hipInfo
|
||||
install(FILES ${PROJECT_BINARY_DIR}/.hipInfo DESTINATION lib)
|
||||
find_package(PythonInterp REQUIRED)
|
||||
add_custom_command(OUTPUT ${PROF_API_STR}
|
||||
COMMAND ${PYTHON_EXECUTABLE} ${PROF_API_GEN} -v -t --priv ${OPT_PROF_API} ${PROF_API_HDR} ${PROF_API_SRC} ${PROF_API_STR}
|
||||
OUTPUT_FILE ${PROF_API_LOG}
|
||||
DEPENDS ${PROF_API_HDR} ${PROF_API_GEN}
|
||||
COMMENT "Generating profiling primitives: ${PROF_API_STR}")
|
||||
|
||||
# Install .hipVersion
|
||||
install(FILES ${PROJECT_BINARY_DIR}/.hipVersion DESTINATION bin)
|
||||
add_custom_target(gen-prof-api-str-header ALL
|
||||
DEPENDS ${PROF_API_STR}
|
||||
SOURCES ${PROF_API_HDR})
|
||||
|
||||
# Install src, bin, include & cmake if necessary
|
||||
execute_process(COMMAND test ${CMAKE_INSTALL_PREFIX} -ef ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
RESULT_VARIABLE INSTALL_SOURCE)
|
||||
if(NOT ${INSTALL_SOURCE} EQUAL 0)
|
||||
install(DIRECTORY bin DESTINATION . USE_SOURCE_PERMISSIONS)
|
||||
set_target_properties(amdhip64 PROPERTIES PUBLIC_HEADER ${PROF_API_STR})
|
||||
|
||||
# The following two lines will be removed once the upstream repository has been updated
|
||||
install(CODE "MESSAGE(\"Removing ${CMAKE_INSTALL_PREFIX}/include\")")
|
||||
install(CODE "file(REMOVE_RECURSE ${CMAKE_INSTALL_PREFIX}/include)")
|
||||
# Toggle for the roctracer (profiling API) integration; on by default.
# option() takes <variable> "<help text>" [initial value] -- with the help text
# and value swapped the option silently defaulted to OFF.
option(USE_PROF_API "Enable roctracer integration" ON)
|
||||
# Enable profiling API
|
||||
if(USE_PROF_API)
|
||||
find_path(PROF_API_HEADER_DIR prof_protocol.h
|
||||
HINTS
|
||||
${PROF_API_HEADER_PATH}
|
||||
PATHS
|
||||
${ROCM_PATH}/roctracer
|
||||
PATH_SUFFIXES
|
||||
include/ext)
|
||||
|
||||
install(DIRECTORY include DESTINATION .)
|
||||
install(DIRECTORY src/hipamd/include/hip/ DESTINATION include/hip/)
|
||||
install(DIRECTORY cmake DESTINATION .)
|
||||
if(NOT PROF_API_HEADER_DIR)
|
||||
message(WARNING "Profiling API header not found. Disabling roctracer integration. Use -DPROF_API_HEADER_PATH=<path to prof_protocol.h header>")
|
||||
else()
|
||||
target_compile_definitions(amdhip64 PUBLIC USE_PROF_API=1)
|
||||
target_include_directories(amdhip64 PUBLIC ${PROF_API_HEADER_DIR})
|
||||
message(STATUS "Profiling API: ${PROF_API_HEADER_DIR}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Install generated headers
|
||||
# FIXME: Associate with individual targets.
|
||||
if(HIP_PLATFORM STREQUAL "amd")
|
||||
install(FILES ${PROJECT_BINARY_DIR}/include/hip/amd_detail/hip_prof_str.h
|
||||
DESTINATION include/hip/amd_detail)
|
||||
endif()
|
||||
install(FILES ${PROJECT_BINARY_DIR}/include/hip/hip_version.h
|
||||
DESTINATION include/hip)
|
||||
add_dependencies(amdhip64 gen-prof-api-str-header)
|
||||
|
||||
add_custom_command(TARGET amdhip64 POST_BUILD COMMAND
|
||||
${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/.hipInfo ${PROJECT_BINARY_DIR}/lib/.hipInfo)
|
||||
add_custom_command(TARGET amdhip64 POST_BUILD COMMAND
|
||||
${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/include ${PROJECT_BINARY_DIR}/include)
|
||||
|
||||
add_library(host INTERFACE)
|
||||
target_link_libraries(host INTERFACE amdhip64)
|
||||
|
||||
add_library(device INTERFACE)
|
||||
target_link_libraries(device INTERFACE host)
|
||||
|
||||
INSTALL(TARGETS amdhip64 host device
|
||||
EXPORT hip-targets
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
INSTALL(EXPORT hip-targets DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} NAMESPACE hip::)
|
||||
|
||||
INSTALL(TARGETS amdhip64 host device
|
||||
EXPORT hip-lang-targets
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
INSTALL(EXPORT hip-lang-targets DESTINATION ${CONFIG_LANG_PACKAGE_INSTALL_DIR} NAMESPACE hip-lang::)
|
||||
|
||||
#############################
|
||||
# hip-config
|
||||
#############################
|
||||
# Packaging invokes UNIX commands, which are not available on Windows.
|
||||
if(NOT WIN32)
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
configure_package_config_file(
|
||||
hip-config.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-config.cmake
|
||||
INSTALL_DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR}
|
||||
PATH_VARS LIB_INSTALL_DIR INCLUDE_INSTALL_DIR BIN_INSTALL_DIR
|
||||
)
|
||||
${PROJECT_SOURCE_DIR}/hip-lang-config.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-lang-config.cmake
|
||||
INSTALL_DESTINATION ${CONFIG_LANG_PACKAGE_INSTALL_DIR}
|
||||
PATH_VARS LIB_INSTALL_DIR INCLUDE_INSTALL_DIR BIN_INSTALL_DIR)
|
||||
|
||||
write_basic_package_version_file(
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-config-version.cmake
|
||||
VERSION "${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_VERSION_GITDATE}"
|
||||
COMPATIBILITY SameMajorVersion
|
||||
)
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-lang-config-version.cmake
|
||||
VERSION "${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_VERSION_GITDATE}"
|
||||
COMPATIBILITY SameMajorVersion)
|
||||
install(
|
||||
FILES
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-config.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-config-version.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-lang-config.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hip-lang-config-version.cmake
|
||||
DESTINATION
|
||||
${CONFIG_PACKAGE_INSTALL_DIR}
|
||||
${CONFIG_LANG_PACKAGE_INSTALL_DIR}/
|
||||
)
|
||||
|
||||
#############################
|
||||
# Packaging steps
|
||||
#############################
|
||||
# Package: hip_base
|
||||
set(BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/packages/hip-base)
|
||||
configure_file(packaging/hip-base.txt ${BUILD_DIR}/CMakeLists.txt @ONLY)
|
||||
configure_file(packaging/hip-base.postinst ${BUILD_DIR}/postinst @ONLY)
|
||||
configure_file(packaging/hip-base.prerm ${BUILD_DIR}/prerm @ONLY)
|
||||
|
||||
add_custom_target(pkg_hip_base COMMAND ${CMAKE_COMMAND} .
|
||||
COMMAND rm -rf *.deb *.rpm *.tar.gz
|
||||
COMMAND make package
|
||||
COMMAND cp *.deb ${PROJECT_BINARY_DIR}
|
||||
COMMAND cp *.rpm ${PROJECT_BINARY_DIR}
|
||||
COMMAND cp *.tar.gz ${PROJECT_BINARY_DIR}
|
||||
WORKING_DIRECTORY ${BUILD_DIR} )
|
||||
|
||||
# Packaging needs to wait for hipify-clang to build if it's enabled...
|
||||
if (BUILD_HIPIFY_CLANG)
|
||||
add_dependencies(pkg_hip_base hipify-clang)
|
||||
endif()
|
||||
|
||||
if(HIP_RUNTIME STREQUAL "rocclr")
|
||||
set(BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/rocclr)
|
||||
configure_file(packaging/hip-rocclr.txt ${BUILD_DIR}/CMakeLists.txt @ONLY)
|
||||
configure_file(packaging/hip-rocclr.postinst ${BUILD_DIR}/postinst @ONLY)
|
||||
configure_file(packaging/hip-rocclr.prerm ${BUILD_DIR}/prerm @ONLY)
|
||||
add_custom_target(hip_on_rocclr COMMAND ${CMAKE_COMMAND} .
|
||||
COMMAND rm -rf *.deb *.rpm *.tar.gz
|
||||
COMMAND make package
|
||||
COMMAND cp *.deb ${PROJECT_BINARY_DIR}
|
||||
COMMAND cp *.rpm ${PROJECT_BINARY_DIR}
|
||||
COMMAND cp *.tar.gz ${PROJECT_BINARY_DIR}
|
||||
WORKING_DIRECTORY ${BUILD_DIR} )
|
||||
endif()
|
||||
|
||||
# Package: hip_nvcc
|
||||
set(BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/packages/hip-nvcc)
|
||||
configure_file(packaging/hip-nvcc.txt ${BUILD_DIR}/CMakeLists.txt @ONLY)
|
||||
add_custom_target(pkg_hip_nvcc COMMAND ${CMAKE_COMMAND} .
|
||||
COMMAND rm -rf *.deb *.rpm *.tar.gz
|
||||
COMMAND make package
|
||||
COMMAND cp *.deb ${PROJECT_BINARY_DIR}
|
||||
COMMAND cp *.rpm ${PROJECT_BINARY_DIR}
|
||||
COMMAND cp *.tar.gz ${PROJECT_BINARY_DIR}
|
||||
WORKING_DIRECTORY ${BUILD_DIR})
|
||||
|
||||
# Package: hip_doc
|
||||
set(BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/packages/hip-doc)
|
||||
configure_file(packaging/hip-doc.txt ${BUILD_DIR}/CMakeLists.txt @ONLY)
|
||||
add_custom_target(pkg_hip_doc COMMAND ${CMAKE_COMMAND} .
|
||||
COMMAND rm -rf *.deb *.rpm *.tar.gz
|
||||
COMMAND make package
|
||||
COMMAND cp *.deb ${PROJECT_BINARY_DIR}
|
||||
COMMAND cp *.rpm ${PROJECT_BINARY_DIR}
|
||||
COMMAND cp *.tar.gz ${PROJECT_BINARY_DIR}
|
||||
WORKING_DIRECTORY ${BUILD_DIR})
|
||||
|
||||
# Package: hip_samples
|
||||
set(BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/packages/hip_samples)
|
||||
configure_file(packaging/hip-samples.txt ${BUILD_DIR}/CMakeLists.txt @ONLY)
|
||||
add_custom_target(pkg_hip_samples COMMAND ${CMAKE_COMMAND} .
|
||||
COMMAND rm -rf *.deb *.rpm *.tar.gz
|
||||
COMMAND make package
|
||||
COMMAND cp *.deb ${PROJECT_BINARY_DIR}
|
||||
COMMAND cp *.rpm ${PROJECT_BINARY_DIR}
|
||||
COMMAND cp *.tar.gz ${PROJECT_BINARY_DIR}
|
||||
WORKING_DIRECTORY ${BUILD_DIR})
|
||||
|
||||
# Package: all
|
||||
if(POLICY CMP0037)
|
||||
cmake_policy(PUSH)
|
||||
cmake_policy(SET CMP0037 OLD)
|
||||
endif()
|
||||
|
||||
if(HIP_RUNTIME STREQUAL "rocclr")
|
||||
add_custom_target(package
|
||||
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
|
||||
DEPENDS pkg_hip_base hip_on_rocclr pkg_hip_nvcc pkg_hip_doc pkg_hip_samples)
|
||||
endif()
|
||||
|
||||
if(POLICY CMP0037)
|
||||
cmake_policy(POP)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
#############################
|
||||
# Code analysis
|
||||
#############################
|
||||
# Target: cppcheck
|
||||
find_program(CPPCHECK_EXE cppcheck)
|
||||
if(CPPCHECK_EXE)
|
||||
add_custom_target(cppcheck COMMAND ${CPPCHECK_EXE} --force --quiet --enable=warning,performance,portability,information,missingInclude src include -I /opt/rocm/include/hcc -I /opt/rocm/include --suppress=*:/opt/rocm/include/hcc/hc.hpp
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
endif()
|
||||
|
||||
#############################
|
||||
# Code formatting
|
||||
#############################
|
||||
# Target: clangformat
|
||||
find_program(CLANGFORMAT_EXE clang-format PATHS ${HCC_HOME}/bin)
|
||||
if(CLANGFORMAT_EXE)
|
||||
file(GLOB_RECURSE FORMAT_SOURCE_FILE_LIST *.cpp *.hpp *.h)
|
||||
add_custom_target(clangformat COMMAND ${CLANGFORMAT_EXE} -style=file -i ${FORMAT_SOURCE_FILE_LIST}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
endif()
|
||||
|
||||
#############################
|
||||
# Testing steps
|
||||
#############################
|
||||
# HIT is not compatible with Windows
|
||||
if(NOT WIN32)
|
||||
set(HIP_ROOT_DIR ${CMAKE_CURRENT_BINARY_DIR})
|
||||
set(HIP_SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
if(HIP_PLATFORM STREQUAL "nvidia")
|
||||
execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/include" "${HIP_ROOT_DIR}/include" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
|
||||
endif()
|
||||
execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/src/hipamd/include/hip/" "${HIP_ROOT_DIR}/include/hip/" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
|
||||
execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/cmake" "${HIP_ROOT_DIR}/cmake" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
|
||||
if(${RUN_HIT} EQUAL 0)
|
||||
execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/bin" "${HIP_ROOT_DIR}/bin" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
|
||||
endif()
|
||||
if(HIP_CATCH_TEST EQUAL "1")
|
||||
enable_testing()
|
||||
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tests/catch)
|
||||
else()
|
||||
if(${RUN_HIT} EQUAL 0)
|
||||
set(CMAKE_MODULE_PATH "${HIP_ROOT_DIR}/cmake" ${CMAKE_MODULE_PATH})
|
||||
include(${HIP_SRC_PATH}/tests/hit/HIT.cmake)
|
||||
include(${HIP_SRC_PATH}/tests/Tests.cmake)
|
||||
else()
|
||||
message(STATUS "Testing targets will not be available. To enable them please ensure that the HIP installation directory is writeable. Use -DCMAKE_INSTALL_PREFIX to specify a suitable location")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
#############################
|
||||
# Code analysis
|
||||
#############################
|
||||
# Target: clang
|
||||
if(HIP_HIPCC_EXECUTABLE)
|
||||
add_custom_target(analyze
|
||||
COMMAND ${HIP_HIPCC_EXECUTABLE} -fvisibility=hidden -fvisibility-inlines-hidden --analyze --analyzer-outputtext -isystem /opt/rocm/include -Wno-unused-command-line-argument -I/opt/rocm/include -c src/*.cpp -Iinclude/ -I./
|
||||
WORKING_DIRECTORY ${HIP_SRC_PATH})
|
||||
if(CPPCHECK_EXE)
|
||||
add_dependencies(analyze cppcheck)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# vim: ts=4:sw=4:expandtab:smartindent
|
||||
|
||||
@@ -1,156 +0,0 @@
|
||||
# Contributor Guidelines
|
||||
|
||||
## Make Tips
|
||||
When building HIP, you will likely want to build and install to a local user-accessible directory (rather than /opt/rocm).
|
||||
This can easily be done by setting the -DCMAKE_INSTALL_PREFIX variable when running cmake. A typical use case is to
|
||||
set CMAKE_INSTALL_PREFIX to your HIP git root, and then ensure HIP_PATH points to this directory. For example
|
||||
|
||||
```
|
||||
cmake .. -DCMAKE_INSTALL_PREFIX=..
|
||||
make install
|
||||
|
||||
export HIP_PATH=
|
||||
```
|
||||
|
||||
After making HIP, don't forget the "make install" step !
|
||||
|
||||
|
||||
|
||||
## Adding a new HIP API
|
||||
|
||||
- Add a translation to the hipify-clang tool ; many examples abound.
|
||||
- For stat tracking purposes, place the API into an appropriate stat category ("dev", "mem", "stream", etc).
|
||||
- Add an inlined NVIDIA implementation for the function in include/hip/nvidia_detail/hip_runtime_api.h.
|
||||
- These are typically headers
|
||||
- Add an HIP_ROCclr definition and Doxygen comments for the function in include/amd_detail/hip_runtime_api.h
|
||||
- Source implementations typically go in hip/rocclr/hip_*.cpp. The implementation involves calls to the HIP runtime (i.e. for hipStream_t).
|
||||
|
||||
## Check HIP-Clang version
|
||||
In some cases new HIP-Clang features are tied to specific releases, and it can be useful to check that the current version is new enough to support the desired feature.
|
||||
|
||||
HIP runtime version
|
||||
|
||||
```
|
||||
> cat /opt/rocm/hip/bin/.hipVersion
|
||||
# Auto-generated by cmake
|
||||
HIP_VERSION_MAJOR=3
|
||||
HIP_VERSION_MINOR=9
|
||||
HIP_VERSION_PATCH=20345-519ef3f2
|
||||
```
|
||||
|
||||
HIP-Clang compiler version
|
||||
|
||||
```
|
||||
$ /opt/rocm/llvm/bin/clang -v
|
||||
clang version 11.0.0 (/src/external/llvm-project/clang 075fedd3fd2f4d9d8cca79d0cd51f64c5ef21432)
|
||||
Target: x86_64-unknown-linux-gnu
|
||||
Thread model: posix
|
||||
InstalledDir: /opt/rocm/llvm/bin
|
||||
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7
|
||||
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7.5.0
|
||||
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/8
|
||||
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/9
|
||||
Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/9
|
||||
Candidate multilib: .;@m64
|
||||
Candidate multilib: 32;@m32
|
||||
Candidate multilib: x32;@mx32
|
||||
Selected multilib: .;@m64
|
||||
```
|
||||
|
||||
## Unit Testing Environment
|
||||
|
||||
HIP includes unit tests in the tests/src directory.
|
||||
When adding a new HIP feature, add a new unit test as well.
|
||||
See [tests/README.md](README.md) for more information.
|
||||
|
||||
## Development Flow
|
||||
|
||||
Directed tests provide a great place to develop new features alongside the associated test.
|
||||
|
||||
For applications and benchmarks outside the directed test environment, development should use a two-step flow:
|
||||
- #1. Compile, link, and install HIP/ROCclr. See [Installation](README.md#Installation) notes.
|
||||
- #2. Relink the target application to include changes in HIP runtime file.
|
||||
|
||||
## Environment Variables
|
||||
- **HIP_PATH** : Location of HIP include, src, bin, lib directories.
|
||||
- **HCC_ROCCLR_HOME** : Path to HIP/ROCclr directory, used on AMD platforms. Default /opt/rocm/rocclr.
|
||||
- **HSA_PATH** : Path to HSA include, lib. Default /opt/rocm/hsa.
|
||||
- **CUDA_PATH** : On an nvcc system, this points to the root of the CUDA installation.
|
||||
|
||||
## Contribution guidelines ##
|
||||
|
||||
Features (ie functions, classes, types) defined in hip*.h should resemble CUDA APIs.
|
||||
The HIP interface is designed to be very familiar for CUDA programmers.
|
||||
|
||||
Differences or limitations of HIP APIs as compared to CUDA APIs should be clearly documented and described.
|
||||
|
||||
### Coding Guidelines (in brief)
|
||||
- Code Indentation:
|
||||
- Tabs should be expanded to spaces.
|
||||
- Use 4 spaces indentation.
|
||||
- Capitalization and Naming
|
||||
- Prefer camelCase for HIP interfaces and internal symbols. Note HCC uses _ for separator.
|
||||
This guideline is not yet consistently followed in HIP code - eventual compliance is aspirational.
|
||||
- Member variables should begin with a leading "_". This allows them to be easily distinguished from other variables or functions.
|
||||
|
||||
- {} placement
|
||||
- For functions, the opening { should be placed on a new line.
|
||||
- For if/else blocks, the opening { is placed on the same line as the if/else. Use a space to separate {/} from if/else. Example:
|
||||
'''
|
||||
if (foo) {
|
||||
doFoo()
|
||||
} else {
|
||||
doFooElse();
|
||||
}
|
||||
'''
|
||||
- namespace should be on same line as { and separated by a space.
|
||||
- Single-line if statements should still use a {/} pair (even though C++ does not require it).
|
||||
- Miscellaneous
|
||||
- All references in function parameter lists should be const.
|
||||
- "ihip" = internal hip structures. These should not be exposed through the HIP API.
|
||||
- Keyword TODO refers to a note that should be addressed in long-term. Could be style issue, software architecture, or known bugs.
|
||||
- FIXME refers to a short-term bug that needs to be addressed.
|
||||
|
||||
- HIP_INIT_API() should be placed at the start of each top-level HIP API. This function will make sure the HIP runtime is initialized,
|
||||
and also constructs an appropriate API string for tracing and CodeXL marker tracing. The arguments to HIP_INIT_API should match
|
||||
those of the parent function.
|
||||
- ihipLogStatus should only be called from top-level HIP APIs, and should be called to log and return the error code. The error code
|
||||
is used by the GetLastError and PeekLastError functions - if a HIP API simply returns, then the error will not be logged correctly.
|
||||
|
||||
- All HIP environment variables should begin with the keyword HIP_
|
||||
Environment variables should be long enough to describe their purpose but short enough so they can be remembered - perhaps 10-20 characters, with 3-4 parts separated by underscores.
|
||||
To see the list of current environment variables, along with their values, set HIP_PRINT_ENV and run any HIP application on the ROCm platform.
|
||||
HIPCC or other tools may support additional environment variables which should follow the above convention.
|
||||
|
||||
|
||||
### Presubmit Testing:
|
||||
Before checking in or submitting a pull request, run all directed tests (see tests/README.md) and all Rodinia tests.
|
||||
Ensure pass results match starting point:
|
||||
|
||||
```shell
|
||||
> cd examples/
|
||||
> ./run_all.sh
|
||||
```
|
||||
|
||||
|
||||
### Checkin messages
|
||||
Follow existing best practice for writing a good Git commit message. Some tips:
|
||||
http://chris.beams.io/posts/git-commit/
|
||||
https://robots.thoughtbot.com/5-useful-tips-for-a-better-commit-message
|
||||
|
||||
In particular :
|
||||
- Use imperative voice, ie "Fix this bug", "Refactor the XYZ routine", "Update the doc".
|
||||
Not : "Fixing the bug", "Fixed the bug", "Bug fix", etc.
|
||||
- Subject should summarize the commit. Do not end subject with a period. Use a blank line
|
||||
after the subject.
|
||||
|
||||
|
||||
|
||||
## Doxygen Editing Guidelines
|
||||
|
||||
- Bugs should be marked with @bug near the code where the bug might be fixed. The @bug message will appear in the API description and also in the
|
||||
doxygen bug list.
|
||||
|
||||
## Other Tips:
|
||||
### Markdown Editing
|
||||
Recommended to use an offline Markdown viewer to review documentation, such as Markdown Preview Plus extension in Chrome browser, or Remarkable.
|
||||
@@ -1,124 +0,0 @@
|
||||
## Table of Contents
|
||||
|
||||
<!-- toc -->
|
||||
|
||||
- [Installing pre-built packages](#installing-pre-built-packages)
|
||||
* [Prerequisites](#prerequisites)
|
||||
* [AMD Platform](#amd-platform)
|
||||
* [NVIDIA Platform](#nvidia-platform)
|
||||
- [Building HIP from source](#building-hip-from-source)
|
||||
* [Build ROCclr](#build-rocclr)
|
||||
* [Build HIP](#build-hip)
|
||||
* [Default paths and environment variables](#default-paths-and-environment-variables)
|
||||
- [Verify your installation](#verify-your-installation)
|
||||
<!-- tocstop -->
|
||||
|
||||
# Installing pre-built packages
|
||||
|
||||
HIP can be easily installed using pre-built binary packages using the package manager for your platform.
|
||||
|
||||
## Prerequisites
|
||||
HIP code can be developed either on AMD ROCm platform using HIP-Clang compiler, or a CUDA platform with nvcc installed.
|
||||
|
||||
## AMD Platform
|
||||
|
||||
```
|
||||
sudo apt install mesa-common-dev
|
||||
sudo apt install clang
|
||||
sudo apt install comgr
|
||||
sudo apt-get -y install rocm-dkms
|
||||
```
|
||||
Public link for ROCm installation:
|
||||
https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html
|
||||
|
||||
HIP-Clang is the compiler for compiling HIP programs on AMD platform.
|
||||
|
||||
HIP-Clang can be built manually:
|
||||
```
|
||||
git clone -b rocm-4.3.x https://github.com/RadeonOpenCompute/llvm-project.git
|
||||
cd llvm-project
|
||||
mkdir -p build && cd build
|
||||
cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm
|
||||
make -j
|
||||
sudo make install
|
||||
```
|
||||
|
||||
The ROCm device library can be built manually as follows:
|
||||
```
|
||||
export PATH=/opt/rocm/llvm/bin:$PATH
|
||||
git clone -b rocm-4.3.x https://github.com/RadeonOpenCompute/ROCm-Device-Libs.git
|
||||
cd ROCm-Device-Libs
|
||||
mkdir -p build && cd build
|
||||
CC=clang CXX=clang++ cmake -DLLVM_DIR=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_WERROR=1 -DLLVM_ENABLE_ASSERTIONS=1 -DCMAKE_INSTALL_PREFIX=/opt/rocm ..
|
||||
make -j
|
||||
sudo make install
|
||||
```
|
||||
|
||||
## NVIDIA Platform
|
||||
|
||||
HIP-nvcc is the compiler for HIP program compilation on NVIDIA platform.
|
||||
|
||||
* Add the ROCm package server to your system as per the OS-specific guide available [here](https://rocm.github.io/ROCmInstall.html#installing-from-amd-rocm-repositories).
|
||||
* Install the "hip-nvcc" package. This will install CUDA SDK and the HIP porting layer.
|
||||
```
|
||||
apt-get install hip-nvcc
|
||||
```
|
||||
|
||||
* Default paths and environment variables:
|
||||
* By default HIP looks for CUDA SDK in /usr/local/cuda (can be overridden by setting CUDA_PATH env variable).
|
||||
* By default HIP is installed into /opt/rocm/hip (can be overridden by setting HIP_PATH environment variable).
|
||||
* Optionally, consider adding /opt/rocm/bin to your path to make it easier to use the tools.
|
||||
|
||||
# Building HIP from source
|
||||
|
||||
## Build ROCclr
|
||||
|
||||
On AMD platforms, HIP uses the Radeon Open Compute Common Language Runtime (ROCclr), a virtual device interface through which the HIP runtime interacts with different backends.
|
||||
See https://github.com/ROCm-Developer-Tools/ROCclr
|
||||
|
||||
```
|
||||
git clone -b rocm-4.3.x https://github.com/ROCm-Developer-Tools/ROCclr.git
|
||||
export ROCclr_DIR="$(readlink -f ROCclr)"
|
||||
git clone -b rocm-4.3.x https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime.git
|
||||
export OPENCL_DIR="$(readlink -f ROCm-OpenCL-Runtime)"
|
||||
cd "$ROCclr_DIR"
|
||||
mkdir -p build;cd build
|
||||
cmake -DOPENCL_DIR="$OPENCL_DIR" -DCMAKE_INSTALL_PREFIX=/opt/rocm/rocclr ..
|
||||
make -j
|
||||
sudo make install
|
||||
```
|
||||
|
||||
## Build HIP
|
||||
|
||||
```
|
||||
git clone -b rocm-4.3.x https://github.com/ROCm-Developer-Tools/HIP.git
|
||||
export HIP_DIR="$(readlink -f HIP)"
|
||||
cd "$HIP_DIR"
|
||||
mkdir -p build; cd build
|
||||
cmake -DCMAKE_PREFIX_PATH="$ROCclr_DIR/build;/opt/rocm/" -DCMAKE_INSTALL_PREFIX=</where/to/install/hip> ..
|
||||
make -j
|
||||
sudo make install
|
||||
Note: If you don't specify CMAKE_INSTALL_PREFIX, hip-rocclr runtime will be installed to "/opt/rocm/hip".
|
||||
```
|
||||
|
||||
## Default paths and environment variables
|
||||
|
||||
* By default HIP looks for HSA in /opt/rocm/hsa (can be overridden by setting HSA_PATH environment variable).
|
||||
* By default HIP is installed into /opt/rocm/hip (can be overridden by setting HIP_PATH environment variable).
|
||||
* By default HIP looks for clang in /opt/rocm/llvm/bin (can be overridden by setting HIP_CLANG_PATH environment variable).
|
||||
* By default HIP looks for device library in /opt/rocm/lib (can be overridden by setting DEVICE_LIB_PATH environment variable).
|
||||
* Optionally, consider adding /opt/rocm/bin to your PATH to make it easier to use the tools.
|
||||
* Optionally, set HIPCC_VERBOSE=7 to output the command line for compilation.
|
||||
|
||||
After installation, make sure HIP_PATH is pointed to /where/to/install/hip
|
||||
|
||||
# Verify your installation
|
||||
|
||||
Run hipconfig (instructions below assume default installation path) :
|
||||
```shell
|
||||
/opt/rocm/bin/hipconfig --full
|
||||
```
|
||||
|
||||
|
||||
Compile and run the [square sample](https://github.com/ROCm-Developer-Tools/HIP/tree/main/samples/0_Intro/square).
|
||||
|
||||
поставляемый
-442
@@ -1,442 +0,0 @@
|
||||
#!/usr/bin/env groovy
|
||||
// Copyright (C) 2017-2021 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
// Generated from snippet generator 'properties; set job properties'
|
||||
properties([buildDiscarder(logRotator(
|
||||
artifactDaysToKeepStr: '',
|
||||
artifactNumToKeepStr: '',
|
||||
daysToKeepStr: '',
|
||||
numToKeepStr: '10')),
|
||||
disableConcurrentBuilds(),
|
||||
parameters([booleanParam( name: 'push_image_to_docker_hub', defaultValue: false, description: 'Push hip & hcc image to rocm docker-hub' )]),
|
||||
[$class: 'CopyArtifactPermissionProperty', projectNames: '*']
|
||||
])
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// -- AUXILIARY HELPER FUNCTIONS
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Return build number of upstream job
|
||||
@NonCPS
int get_upstream_build_num( )
{
    // Look for an UpstreamCause on the raw build object; there is none when
    // this build was not started by another job.
    // NOTE(review): presumably @NonCPS because rawBuild is not serializable by the pipeline - confirm
    def trigger = currentBuild.rawBuild.getCause( hudson.model.Cause$UpstreamCause )

    // 0 signals "no upstream build"
    return ( trigger == null ) ? 0 : trigger.getUpstreamBuild()
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Return project name of upstream job
|
||||
@NonCPS
String get_upstream_build_project( )
{
    // Look for an UpstreamCause on the raw build object; there is none when
    // this build was not started by another job.
    def trigger = currentBuild.rawBuild.getCause( hudson.model.Cause$UpstreamCause )

    // null signals "no upstream project"
    return ( trigger == null ) ? null : trigger.getUpstreamProject()
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Construct the docker build image name
|
||||
String docker_build_image_name( )
{
    // Constant image name; no input influences it
    final String name = "build-ubuntu-16.04"
    return name
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Construct the relative path of the build directory
|
||||
String build_directory_rel( String build_config )
{
    // Only "release" (compared case-insensitively) selects the release tree;
    // every other configuration name falls back to the debug tree.
    return build_config.equalsIgnoreCase( 'release' ) ? "build/release" : "build/debug"
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Lots of images are created above; no apparent way to delete images:tags with docker global variable
|
||||
def docker_clean_images( String org, String image_name )
{
    // Shell pipeline listing the local images whose line matches "<org>/<image_name>"
    String list_matching = "docker images | grep \"${org}/${image_name}\""

    // grep exits with 0 only when at least one image matched
    int grep_status = sh( script: list_matching, returnStatus: true )
    if( grep_status != 0 )
    {
        return
    }

    // Removal is best effort: it can fail when other images were built on top
    // of the ones being removed. That must not fail the hip build; cleanup is
    // left for a later time.
    try
    {
        // awk prints "<repository>:<tag>" (columns 1 and 2) for each match,
        // and each of those references is handed to "docker rmi"
        sh "${list_matching} | awk '{print \$1 \":\" \$2}' | xargs docker rmi"
    }
    catch( err )
    {
        println 'Failed to cleanup a few images; probably the images are used as a base for other images'
        currentBuild.result = 'SUCCESS'
    }
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// -- BUILD RELATED FUNCTIONS
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Checkout source code, source dependencies and update version numbers
|
||||
// Returns a relative path to the directory where the source exists in the workspace
|
||||
String checkout_and_version( String platform )
{
    // The hip sources are checked out into <workspace>/src/hip
    String workspace_src_rel = "src"
    String hip_checkout_rel = "${workspace_src_rel}/hip"

    stage("${platform} clone")
    {
        dir( "${hip_checkout_rel}" )
        {
            // Reuse the job's own SCM settings, adding CleanCheckout to the
            // configured extensions
            def hip_scm = [
                $class: 'GitSCM',
                branches: scm.branches,
                doGenerateSubmoduleConfigurations: scm.doGenerateSubmoduleConfigurations,
                extensions: scm.extensions + [[$class: 'CleanCheckout']],
                userRemoteConfigs: scm.userRemoteConfigs
            ]
            checkout( hip_scm )
        }
    }

    // Relative path (from the workspace) of the checked-out sources
    return hip_checkout_rel
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// This creates the docker image that we use to build the project in
|
||||
// The docker images contains all dependencies, including OS platform, to build
|
||||
def docker_build_image( String platform, String org, String optional_build_parm, String source_hip_rel, String from_image )
{
    String dockerfile_name = "dockerfile-build-ubuntu-16.04"
    String image_tag = "${org}/${docker_build_image_name( )}:latest"
    def build_image = null

    stage("${platform} build image")
    {
        dir("${source_hip_rel}")
        {
            // uid of the user running the job, forwarded to the dockerfile as a build argument
            def user_uid = sh( script: 'id -u', returnStdout: true ).trim()

            // The image is built with a plain "docker build" shell command instead of
            // docker.build(): the dockerfile receives base_image as a build argument
            // (ARG values in FROM statements, Docker 17.05+), and docker inspect fails
            // on such FROM statements, see https://issues.jenkins-ci.org/browse/JENKINS-44836
            sh "docker build -t ${image_tag} -f docker/${dockerfile_name} ${optional_build_parm} --build-arg user_uid=${user_uid} --build-arg base_image=${from_image} ."
            build_image = docker.image( "${image_tag}" )
        }
    }

    // Handle to the freshly built "<org>/<name>:latest" image
    return build_image
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// This encapsulates the cmake configure, build and package commands
|
||||
// Leverages docker containers to encapsulate the build in a fixed environment
|
||||
def docker_build_inside_image( def build_image, String inside_args, String platform, String optional_configure, String build_config, String source_hip_rel, String build_dir_rel )
{
    // Absolute source path, resolved from the current directory before entering the container
    String source_hip_abs = "${pwd()}/${source_hip_rel}"

    build_image.inside( inside_args )
    {
        String platform_lc = platform.toLowerCase( )

        // Configure and build from a freshly recreated build directory
        stage("${platform} make ${build_config}")
        {
            sh """#!/usr/bin/env bash
set -x
rm -rf ${build_dir_rel}
mkdir -p ${build_dir_rel}
cd ${build_dir_rel}
cmake -DCMAKE_BUILD_TYPE=${build_config} -DCMAKE_INSTALL_PREFIX=staging ${optional_configure} ${source_hip_abs}
make -j\$(nproc)
"""
        }

        // Testing is capped at one hour in case of hangs.
        // hipMultiThreadDevice-pyramid and hipMemoryAllocateCoherentDriver are excluded
        // from the ctest run because they are flaky and still need investigation.
        timeout(time: 1, unit: 'HOURS')
        {
            stage("${platform} unit testing")
            {
                sh """#!/usr/bin/env bash
set -x
cd ${build_dir_rel}
make install -j\$(nproc)
make build_tests -i -j\$(nproc)
ctest --output-on-failure -E "(hipMultiThreadDevice-pyramid|hipMemoryAllocateCoherentDriver)"
"""
                // No junit/xunit report is collected here; if the unit tests produce one
                // in the future, jenkins could parse it to show results on the dashboard.
            }
        }

        // Packages are only created for platforms whose name starts with "rocm-"
        if( platform_lc.startsWith( 'rocm-' ) )
        {
            stage("${platform} packaging")
            {
                sh """#!/usr/bin/env bash
set -x
cd ${build_dir_rel}
make package
"""

                // All platforms produce identically named packages, so only the
                // rocm-head build archives them to avoid racing uploads
                if( platform_lc.startsWith( 'rocm-head' ) )
                {
                    archiveArtifacts artifacts: "${build_dir_rel}/*.deb", fingerprint: true
                    archiveArtifacts artifacts: "${build_dir_rel}/*.rpm", fingerprint: true
                }
            }
        }
    }

    return void
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// This builds a fresh docker image FROM a clean base image, with no build dependencies included
|
||||
// Uploads the new docker image to internal artifactory
|
||||
String docker_upload_artifactory( String hcc_ver, String artifactory_org, String from_image, String source_hip_rel, String build_dir_rel )
{
    String image_name = "hip-${hcc_ver}-ubuntu-16.04"
    def hip_install_image = null

    stage( 'artifactory' )
    {
        println "artifactory_org: ${artifactory_org}"

        // The docker files are copied next to the built packages so that the
        // build directory can serve as the docker build context
        sh "cp -r ${source_hip_rel}/docker/* ${build_dir_rel}"

        // Built with a plain "docker build" shell command instead of docker.build():
        // base_image is passed as a build argument (ARG values in FROM statements,
        // Docker 17.05+), and docker inspect fails on such FROM statements,
        // see https://issues.jenkins-ci.org/browse/JENKINS-44836
        sh "docker build -t ${artifactory_org}/${image_name} --pull -f ${build_dir_rel}/dockerfile-hip-ubuntu-16.04 --build-arg base_image=${from_image} ${build_dir_rel}"
        hip_install_image = docker.image( "${artifactory_org}/${image_name}" )

        // A failed connection to artifactory must not fail the build
        try
        {
            boolean is_pull_request = env.BRANCH_NAME.toLowerCase( ).startsWith( 'pr-' )

            if( !is_pull_request )
            {
                docker.withRegistry('http://compute-artifactory:5001', 'artifactory-cred' )
                {
                    hip_install_image.push( "${env.BUILD_NUMBER}" )
                    hip_install_image.push( 'latest' )
                }
            }
            else
            {
                // Pull-request images are not pushed; they tend to accumulate over time
                println 'Pull Request (PR-xxx) detected; NOT pushing to artifactory'
            }
        }
        catch( err )
        {
            currentBuild.result = 'SUCCESS'
        }
    }

    // Name (without organization or tag) of the image that was built
    return image_name
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Uploads the new docker image to the public docker-hub
|
||||
def docker_upload_dockerhub( String local_org, String image_name, String remote_org )
{
    stage( 'docker-hub' )
    {
        // Do not treat failures to push to docker-hub as a build fail
        try
        {
            // Re-tag the locally built image under the remote organization
            sh """#!/usr/bin/env bash
set -x
echo inside sh
docker tag ${local_org}/${image_name} ${remote_org}/${image_name}
"""

            // Declared with 'def' so the handle is a local variable; an undeclared
            // assignment would create a script-wide binding variable instead.
            def docker_hub_image = docker.image( "${remote_org}/${image_name}" )

            // Push the image twice: tagged with the build number and as 'latest'
            docker.withRegistry('https://registry.hub.docker.com', 'docker-hub-cred' )
            {
                docker_hub_image.push( "${env.BUILD_NUMBER}" )
                docker_hub_image.push( 'latest' )
            }
        }
        catch( err )
        {
            // Still best effort, but leave a trace of what went wrong in the log
            println "Failed to push ${remote_org}/${image_name} to docker-hub: ${err}"
            currentBuild.result = 'SUCCESS'
        }
    }
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// -- MAIN
|
||||
// Following this line is the start of MAIN of this Jenkinsfile
|
||||
String build_config = 'Release'
|
||||
String job_name = env.JOB_NAME.toLowerCase( )
|
||||
|
||||
// The following launches 3 builds in parallel: rocm-head, rocm-3.3.x and cuda-10.x
|
||||
parallel rocm_3_3:
|
||||
{
|
||||
node('hip-rocm')
|
||||
{
|
||||
String hcc_ver = 'rocm-3.3.x'
|
||||
String from_image = 'ci_test_nodes/rocm-3.3.x/ubuntu-16.04:latest'
|
||||
String inside_args = '--device=/dev/kfd --device=/dev/dri --group-add=video'
|
||||
|
||||
// Checkout source code, dependencies and version files
|
||||
String source_hip_rel = checkout_and_version( hcc_ver )
|
||||
|
||||
// Create/reuse a docker image that represents the hip build environment
|
||||
def hip_build_image = docker_build_image( hcc_ver, 'hip', '', source_hip_rel, from_image )
|
||||
|
||||
// Print system information for the log
|
||||
hip_build_image.inside( inside_args )
|
||||
{
|
||||
sh """#!/usr/bin/env bash
|
||||
set -x
|
||||
/opt/rocm/bin/rocm_agent_enumerator -t ALL
|
||||
/opt/rocm/bin/hcc --version
|
||||
"""
|
||||
}
|
||||
|
||||
// Conctruct a binary directory path based on build config
|
||||
String build_hip_rel = build_directory_rel( build_config );
|
||||
|
||||
// Build hip inside of the build environment
|
||||
docker_build_inside_image( hip_build_image, inside_args, hcc_ver, '', build_config, source_hip_rel, build_hip_rel )
|
||||
|
||||
// Clean docker build image
|
||||
docker_clean_images( 'hip', docker_build_image_name( ) )
|
||||
|
||||
// After a successful build, upload a docker image of the results
|
||||
/*
|
||||
String hip_image_name = docker_upload_artifactory( hcc_ver, job_name, from_image, source_hip_rel, build_hip_rel )
|
||||
if( params.push_image_to_docker_hub )
|
||||
{
|
||||
docker_upload_dockerhub( job_name, hip_image_name, 'rocm' )
|
||||
docker_clean_images( 'rocm', hip_image_name )
|
||||
}
|
||||
docker_clean_images( job_name, hip_image_name )
|
||||
*/
|
||||
}
|
||||
},
|
||||
rocm_head:
|
||||
{
|
||||
node('hip-rocm')
|
||||
{
|
||||
String hcc_ver = 'rocm-head'
|
||||
String from_image = 'ci_test_nodes/rocm-head/ubuntu-16.04:latest'
|
||||
String inside_args = '--device=/dev/kfd --device=/dev/dri --group-add=video'
|
||||
|
||||
// Checkout source code, dependencies and version files
|
||||
String source_hip_rel = checkout_and_version( hcc_ver )
|
||||
|
||||
// Create/reuse a docker image that represents the hip build environment
|
||||
def hip_build_image = docker_build_image( hcc_ver, 'hip', '', source_hip_rel, from_image )
|
||||
|
||||
// Print system information for the log
|
||||
hip_build_image.inside( inside_args )
|
||||
{
|
||||
sh """#!/usr/bin/env bash
|
||||
set -x
|
||||
/opt/rocm/bin/rocm_agent_enumerator -t ALL
|
||||
/opt/rocm/bin/hcc --version
|
||||
"""
|
||||
}
|
||||
|
||||
// Conctruct a binary directory path based on build config
|
||||
String build_hip_rel = build_directory_rel( build_config );
|
||||
|
||||
// Build hip inside of the build environment
|
||||
docker_build_inside_image( hip_build_image, inside_args, hcc_ver, '', build_config, source_hip_rel, build_hip_rel )
|
||||
|
||||
// Clean docker image
|
||||
docker_clean_images( 'hip', docker_build_image_name( ) )
|
||||
|
||||
// After a successful build, upload a docker image of the results
|
||||
/*
|
||||
String hip_image_name = docker_upload_artifactory( hcc_ver, job_name, from_image, source_hip_rel, build_hip_rel )
|
||||
if( params.push_image_to_docker_hub )
|
||||
{
|
||||
docker_upload_dockerhub( job_name, hip_image_name, 'rocm' )
|
||||
docker_clean_images( 'rocm', hip_image_name )
|
||||
}
|
||||
docker_clean_images( job_name, hip_image_name )
|
||||
*/
|
||||
}
|
||||
},
|
||||
cuda_10_x:
|
||||
{
|
||||
node('hip-cuda')
|
||||
{
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Block of string constants customizing behavior for cuda
|
||||
String nvcc_ver = 'cuda-10.x'
|
||||
String from_image = 'ci_test_nodes/cuda-10.x/ubuntu-16.04:latest'
|
||||
String inside_args = '--gpus all';
|
||||
|
||||
// Checkout source code, dependencies and version files
|
||||
String source_hip_rel = checkout_and_version( nvcc_ver )
|
||||
|
||||
// Create/reuse a docker image that represents the hip build environment
|
||||
def hip_build_image = docker_build_image( nvcc_ver, 'hip', '', source_hip_rel, from_image )
|
||||
|
||||
// Print system information for the log
|
||||
hip_build_image.inside( inside_args )
|
||||
{
|
||||
sh """#!/usr/bin/env bash
|
||||
set -x
|
||||
nvidia-smi
|
||||
nvcc --version
|
||||
"""
|
||||
}
|
||||
|
||||
// Conctruct a binary directory path based on build config
|
||||
String build_hip_rel = build_directory_rel( build_config );
|
||||
|
||||
// Build hip inside of the build environment
|
||||
docker_build_inside_image( hip_build_image, inside_args, nvcc_ver, "-DHIP_NVCC_FLAGS=--Wno-deprecated-gpu-targets", build_config, source_hip_rel, build_hip_rel )
|
||||
|
||||
// Clean docker image
|
||||
docker_clean_images( 'hip', docker_build_image_name( ) )
|
||||
}
|
||||
}
|
||||
@@ -1,216 +0,0 @@
|
||||
# Release notes
|
||||
|
||||
We have attempted to document known bugs and limitations - in particular the [HIP Kernel Language](docs/markdown/hip_kernel_language.md) document uses the phrase "Under Development", and the [HIP Runtime API bug list](http://rocm-developer-tools.github.io/HIP/bug.html) lists known bugs.
|
||||
|
||||
|
||||
===================================================================================================
|
||||
|
||||
|
||||
## Revision History:
|
||||
|
||||
===================================================================================================
|
||||
Release: 1.5
|
||||
Date:
|
||||
- Support threadIdx, blockIdx, blockDim directly (no need for hipify conversions in kernels.) HIP
|
||||
Kernel syntax is now identical to CUDA kernel syntax - no need for extra parms or conversions.
|
||||
- Refactor launch syntax. HIP now extracts kernels from the executable and launches them using the
|
||||
existing module interface. Kernel dispatch no longer flows through HCC. Result is faster
|
||||
kernel launches with less resource usage (no signals required).
|
||||
- Remove requirement for manual "serializers" previously required when passing complex structures
|
||||
into kernels.
|
||||
- Remove need for manual destructors
|
||||
- Provide printf in device code
|
||||
- Support for globals when using module API
|
||||
- hipify-clang now supports using newer versions of clang
|
||||
- HIP texture support equivalent to CUDA texture driver APIs
|
||||
- Updates to hipify-perl, hipify-clang and documentation
|
||||
|
||||
|
||||
===================================================================================================
|
||||
Release: 1.4
|
||||
Date: 2017.10.06
|
||||
- Improvements to HIP event management
|
||||
- Added new HIP_TRACE_API options
|
||||
- Enabled device side assert support
|
||||
- Several bug fixes including hipMallocArray, hipTexture fetch
|
||||
- Support for RHEL/CentOS 7.4
|
||||
- Updates to hipify-perl, hipify-clang and documentation
|
||||
|
||||
|
||||
===================================================================================================
|
||||
Release: 1.3
|
||||
Date: 2017.08.16
|
||||
- hipcc now auto-detects amdgcn arch. No need to specify the arch when building for same system.
|
||||
- HIP texture support (run-time APIs)
|
||||
- Implemented __threadfence_support
|
||||
- Improvements in HIP context management logic
|
||||
- Bug fixes in several APIs including hipDeviceGetPCIBusId, hipEventDestroy, hipMemcpy2DAsync
|
||||
- Updates to hipify-clang and documentation
|
||||
- HIP development now fully open and on GitHub. Developers should submit pull requests.
|
||||
|
||||
|
||||
===================================================================================================
|
||||
Release: 1.2
|
||||
Date: 2017.06.29
|
||||
- new APIs: hipMemcpy2DAsync, hipMallocPitch, hipHostMallocCoherent, hipHostMallocNonCoherent
|
||||
- added support for building hipify-clang using clang 3.9
|
||||
- hipify-clang updates for CUDA 8.0 runtime+driver support
|
||||
- renamed hipify to hipify-perl
|
||||
- initial implementation of hipify-cmakefile
|
||||
- several documentation updates & bug fixes
|
||||
- support for abort() function in device code
|
||||
|
||||
|
||||
===================================================================================================
|
||||
Release: 1.0.17102
|
||||
Date: 2017.03.07
|
||||
- Lots of improvements to hipify-clang.
|
||||
- Added HIP package config for cmake.
|
||||
- Several bug fixes and documentation updates.
|
||||
|
||||
|
||||
===================================================================================================
|
||||
Release: 1.0.17066
|
||||
Date: 2017.02.11
|
||||
- Improved support for math device functions.
|
||||
- Added several half math device functions.
|
||||
- Enabled support for CUDA 8.0 in hipify-clang.
|
||||
- Lots of bug fixes and documentation updates.
|
||||
|
||||
|
||||
===================================================================================================
|
||||
Release: 1.0.17015
|
||||
Date: 2017.01.06
|
||||
- Several improvements to the hipify-clang infrastructure.
|
||||
- Refactored module and function APIs.
|
||||
- HIP now defaults to linking against the shared runtime library.
|
||||
- Documentation updates.
|
||||
|
||||
|
||||
===================================================================================================
|
||||
Release: 1.0.16502
|
||||
Date: 2016.12.13
|
||||
- Added several fast math and packaged math intrinsics
|
||||
- Improved debug and profiler documentation
|
||||
- Support for building and linking to HIP shared library
|
||||
- Several improvements to hipify-clang
|
||||
- Several bug fixes
|
||||
|
||||
|
||||
===================================================================================================
|
||||
Release: 1.0.16461
|
||||
Date: 2016.11.14
|
||||
- Significant changes to the HIP Profiling APIs. Refer to the documentation for details
|
||||
- Improvements to P2P support
|
||||
- New API: hipDeviceGetByPCIBusId
|
||||
- Several bug fixes in NV path
|
||||
- hipModuleLaunch now works for multi-dim kernels
|
||||
|
||||
|
||||
===================================================================================================
|
||||
Release:1.0
|
||||
Date: 2016.11.8
|
||||
- Initial implementation for FindHIP.cmake
|
||||
- HIP library now installs as a static library by default
|
||||
- Added support for HIP context and HIP module APIs
|
||||
- Major changes to HIP signal & memory management implementation
|
||||
- Support for complex data type and math functions
|
||||
- clang-hipify is now known as hipify-clang
|
||||
- Added several new HIP samples
|
||||
- Preliminary support for new APIs: hipMemcpyToSymbol, hipDeviceGetLimit, hipRuntimeGetVersion
|
||||
- Added support for async memcpy driver API (for example hipMemcpyHtoDAsync)
|
||||
- Support for memory management device functions: malloc, free, memcpy & memset
|
||||
- Removed deprecated HIP runtime header locations. Please include "hip/hip_runtime.h" instead of "hip_runtime.h". You can use `find . -type f -exec sed -i 's:#include "hip_runtime.h":#include "hip/hip_runtime.h":g' {} +` to replace all such references
|
||||
|
||||
|
||||
===================================================================================================
|
||||
Release:0.92.00
|
||||
Date: 2016.8.14
|
||||
- hipLaunchKernel supports one-dimensional grid and/or block dims, without explicit cast to dim3 type (actually in 0.90.00)
|
||||
- fp16 software support
|
||||
- Support for Hawaii dGPUs using environment variable ROCM_TARGET=hawaii
|
||||
- Support hipArray
|
||||
- Improved profiler support
|
||||
- Documentation updates
|
||||
- Improvements to clang-hipify
|
||||
|
||||
|
||||
===================================================================================================
|
||||
Release:0.90.00
|
||||
Date: 2016.06.29
|
||||
- Support dynamic shared memory allocations
|
||||
- Min HCC compiler version is > 16186.
|
||||
- Expanded math functions (device and host). Document unsupported functions.
|
||||
- hipFree with null pointer initializes runtime and returns success.
|
||||
- Improve error code reporting on nvcc.
|
||||
- Add hipPeekAtError for nvcc.
|
||||
|
||||
|
||||
===================================================================================================
|
||||
Release:0.86.00
|
||||
Date: 2016.06.06
|
||||
- Add clang-hipify : clang-based hipify tool. Improved parsing of source code, and automates
|
||||
creation of hipLaunchParm variable.
|
||||
- Implement memory register / unregister commands (hipHostRegister, hipHostUnregister)
|
||||
- Add cross-linking support between G++ and HCC, in particular for interfaces that use
|
||||
standard C++ libraries (ie std::vectors, std::strings). HIPCC now uses libstdc++ by default on the HCC
|
||||
compilation path.
|
||||
- More samples including gpu-burn, SHOC, nbody, rtm. See [HIP-Examples](https://github.com/ROCm-Developer-Tools/HIP-Examples)
|
||||
|
||||
|
||||
===================================================================================================
|
||||
Release:0.84.01
|
||||
Date: 2016.04.25
|
||||
- Refactor HIP make and install system:
|
||||
- Move to CMake. Refer to the installation section in README.md for details.
|
||||
- Split source into multiple modular .cpp and .h files.
|
||||
- Create static library and link.
|
||||
- Set HIP_PATH to install.
|
||||
- Make hipDevice and hipStream thread-safe.
|
||||
- Preferred hipStream usage is still to create new streams for each new thread, but it works even if you don't.
|
||||
- Improve automated platform detection: If AMD GPU is installed and detected by driver, default HIP_PLATFORM to hcc.
|
||||
- HIP_TRACE_API now prints arguments to the HIP function (in addition to name of function).
|
||||
- Deprecate hipDeviceGetProp (Replace with hipGetDeviceProp)
|
||||
- Deprecate hipMallocHost (Replace with hipHostMalloc)
|
||||
- Deprecate hipFreeHost (Replace with hipHostFree)
|
||||
- The mixbench benchmark tool for measuring operational intensity now has a HIP target, in addition to CUDA and OpenCL. Let the comparisons begin. :)
|
||||
See here for more : https://github.com/ekondis/mixbench.
|
||||
|
||||
|
||||
===================================================================================================
|
||||
Release:0.82.00
|
||||
Date: 2016.03.07
|
||||
- Bump minimum required HCC workweek to 16074.
|
||||
- Bump minimum required ROCK-Kernel-Driver and ROCR-Runtime to Developer Preview 2.
|
||||
- Enable multi-GPU support.
|
||||
* Use hipSetDevice to select a device for subsequent kernel calls and memory allocations.
|
||||
* CUDA_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES environment variable selects devices visible to the runtime.
|
||||
- Support hipStreams – send sequences of copy and kernel commands to a device.
|
||||
* Asynchronous copies supported.
|
||||
- Optimize memory copy operations.
|
||||
- Support hipPointerGetAttribute – can determine if a pointer is host or device.
|
||||
- Enable atomics to local memory.
|
||||
- Support for LC Direct-To-ISA path.
|
||||
- Improved free memory reporting.
|
||||
* hipMemGetInfo (report full memory used in current process).
|
||||
* hipDeviceReset (deletes all memory allocated by current process).
|
||||
|
||||
|
||||
===================================================================================================
|
||||
Release:0.80.01
|
||||
Date: 2016.02.18
|
||||
- Improve reporting and support for device-side math functions.
|
||||
- Update Runtime Documentation.
|
||||
- Improve implementations of cross-lane operations (_ballot, _any, _all).
|
||||
- Provide shuffle intrinsics (performance optimization in-progress).
|
||||
- Support hipDeviceAttribute for querying "one-shot" device attributes, as an alternative to hipGetDeviceProperties.
|
||||
|
||||
|
||||
===================================================================================================
|
||||
Release:0.80.00
|
||||
Date: 2016.01.25
|
||||
|
||||
Initial release with GPUOpen Launch.
|
||||
|
||||
|
||||
|
||||
@@ -1,27 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2016-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
# List every CUDA/C/C++ source and header file found under the directories
# given on the command line. Four passes are made, in this order:
# .cu sources, C/C++ sources, .cuh headers, then C/C++ headers.
#
# The arguments are kept in an array and expanded quoted so that a directory
# name containing whitespace or glob characters is passed to find unchanged.
# With no arguments the array expands to nothing, exactly as before.
SEARCH_DIRS=("$@")

find "${SEARCH_DIRS[@]}" -name '*.cu'
find "${SEARCH_DIRS[@]}" -name '*.cpp' -o -name '*.cxx' -o -name '*.c' -o -name '*.cc'
find "${SEARCH_DIRS[@]}" -name '*.cuh'
find "${SEARCH_DIRS[@]}" -name '*.h' -o -name '*.hpp' -o -name '*.inc' -o -name '*.inl' -o -name '*.hxx' -o -name '*.hdl'
|
||||
@@ -1,24 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2016-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
# List everything under the directory given as $1 whose name does NOT carry
# one of the CUDA/C/C++ source or header extensions.
SEARCH_DIR=$1

# ${SEARCH_DIR:+"$SEARCH_DIR"} passes the directory as a single quoted word
# (safe for whitespace and glob characters) and expands to nothing when no
# directory was given, which keeps the original no-argument behavior.
find ${SEARCH_DIR:+"$SEARCH_DIR"} -not -name '*.cu' -and -not -name '*.cpp' -and -not -name '*.cxx' -and -not -name '*.c' -and -not -name '*.cc' -and -not -name '*.cuh' -and -not -name '*.h' -and -not -name '*.hpp' -and -not -name '*.inc' -and -not -name '*.inl' -and -not -name '*.hxx' -and -not -name '*.hdl'
|
||||
@@ -1,191 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
# Print the command-line synopsis and option list to stdout.
# Always returns 0.
printUsage() {
  # The here-document is unquoted so that $(basename "$0") is expanded to the
  # name this script was invoked as.
  cat <<USAGE

Usage: $(basename "$0") HIP_BUILD_INC_DIR HIP_INC_DIR HIP_AMD_INC_DIR LLVM_DIR [option] [RTC_LIB_OUTPUT]

Options:
 -p, --generate_pch Generate pre-compiled header (default)
 -r, --generate_rtc Generate preprocessor expansion (hiprtc_header.o)
 -h, --help Prints this help


USAGE
  return 0
}
|
||||
|
||||
# With no arguments at all, show the usage text and stop.
if [ "$1" == "" ]; then
  printUsage
  exit 0
fi

# Positional arguments 1-4 are the include directories and the LLVM root.
HIP_BUILD_INC_DIR="$1"
HIP_INC_DIR="$2"
HIP_AMD_INC_DIR="$3"
LLVM_DIR="$4"
# By default, generate pch
TARGET="generatepch"

# Argument 5 is the optional mode switch. Every arm below either breaks out
# of the loop or exits, so the trailing "shift 1" is never reached and the
# positional parameters are left untouched for the code that follows.
while [ "$5" != "" ];
do
  case "$5" in
    -h | --help )
        printUsage ; exit 0 ;;
    -p | --generate_pch )
        TARGET="generatepch" ; break ;;
    -r | --generate_rtc )
        TARGET="generatertc" ; break ;;
    *)
        # Report the argument that was actually rejected ($5, not $4).
        echo " UNEXPECTED ERROR Parm : [$5] ">&2 ; exit 20 ;;
  esac
  shift 1
done
|
||||
|
||||
# Detect a Windows-hosted shell (Cygwin or MSYS). On Windows, temporary
# files are created under the current directory instead of /tmp.
if [[ "$OSTYPE" == cygwin || "$OSTYPE" == msys ]]; then
  isWindows=1
  tmpdir=.
else
  isWindows=0
  tmpdir=/tmp
fi

# Allow hiprtc lib name to be set by argument 6. When it is not given, pick
# the platform default; the choice uses the same Windows test as above so
# that MSYS and Cygwin agree on the .dll name.
if [[ "$6" != "" ]]; then
  rtc_shared_lib_out="$6"
elif [[ $isWindows -eq 1 ]]; then
  rtc_shared_lib_out=hiprtc-builtins64.dll
else
  rtc_shared_lib_out=libhiprtc-builtins.so
fi
|
||||
|
||||
# Write the HIP attribute and __launch_bounds__ helper macros to a file.
#
# Arguments:
#   $1  path of the output file (overwritten if it exists)
#
# The here-document is intentionally unquoted: each backslash-newline pair is
# removed by the shell, so the multi-line #define bodies below are emitted as
# single physical lines.
create_hip_macro_file() {
  # Quote the target so a path containing whitespace is a single word;
  # unquoted, bash rejects it with "ambiguous redirect".
  local macro_out="$1"

  cat >"$macro_out" <<EOF
#define __device__ __attribute__((device))
#define __host__ __attribute__((host))
#define __global__ __attribute__((global))
#define __constant__ __attribute__((constant))
#define __shared__ __attribute__((shared))

#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
#define select_impl_(_1, _2, impl_, ...) impl_
#define __launch_bounds__(...) \
select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)

EOF
}
|
||||
|
||||
# Build hip_pch.o, an object file that embeds a HIP pre-compiled header.
#
# Reads the script-level variables tmpdir, LLVM_DIR, HIP_INC_DIR,
# HIP_BUILD_INC_DIR and HIP_AMD_INC_DIR. Steps:
#   1. preprocess hip_runtime.h + hip_fp16.h for the device side,
#   2. append the helper macros from create_hip_macro_file,
#   3. compile the result to a PCH for amdgcn-amd-amdhsa,
#   4. wrap the PCH in an object via llvm-mc (.incbin), exporting the
#      symbols __hip_pch and __hip_pch_size.
# hip_pch.o is written to the current working directory. The scratch
# directory is removed only when every step succeeded; the function's exit
# status is that of the last command run in the && chain.
#
# All path expansions are quoted so directories containing whitespace work.
generate_pch() {
  # Scratch directory, made unique per process by the PID suffix.
  tmp=$tmpdir/hip_pch.$$
  mkdir -p "$tmp"

  create_hip_macro_file "$tmp/hip_macros.h"

  cat >"$tmp/hip_pch.h" <<EOF
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
EOF

  # Assembler input; $tmp is expanded here so .incbin points at the PCH.
  cat >"$tmp/hip_pch.mcin" <<EOF
.type __hip_pch,@object
.section .hip_pch,"aMS",@progbits,1
.data
.globl __hip_pch
.globl __hip_pch_size
.p2align 3
__hip_pch:
.incbin "$tmp/hip.pch"
__hip_pch_size:
.long __hip_pch_size - __hip_pch
EOF

  # Trace the tool invocations; note this stays enabled after the function.
  set -x

  "$LLVM_DIR/bin/clang" -O3 --rocm-path="$HIP_INC_DIR/.." -std=c++17 -nogpulib -isystem "$HIP_INC_DIR" -isystem "$HIP_BUILD_INC_DIR" -isystem "$HIP_AMD_INC_DIR" --cuda-device-only -x hip "$tmp/hip_pch.h" -E >"$tmp/pch.cui" &&

  cat "$tmp/hip_macros.h" >> "$tmp/pch.cui" &&

  "$LLVM_DIR/bin/clang" -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o "$tmp/hip.pch" -x hip-cpp-output - <"$tmp/pch.cui" &&

  "$LLVM_DIR/bin/llvm-mc" -o hip_pch.o "$tmp/hip_pch.mcin" --filetype=obj &&

  rm -rf "$tmp"
}
|
||||
|
||||
# Build the hiprtc builtins shared library, which embeds the preprocessed
# HIP runtime headers.
#
# Reads the script-level variables tmpdir, isWindows, rtc_shared_lib_out,
# LLVM_DIR, HIP_INC_DIR, HIP_BUILD_INC_DIR and HIP_AMD_INC_DIR. Steps:
#   1. preprocess hip_runtime.h + hip_fp16.h with __HIPCC_RTC__ defined,
#   2. append the helper macros from create_hip_macro_file,
#   3. wrap the text in an object via llvm-mc (.incbin), exporting
#      __hipRTC_header and __hipRTC_header_size,
#   4. link that object into $rtc_shared_lib_out,
#   5. compile the expanded header to bitcode inside the scratch directory
#      (the output is deleted with it; presumably a sanity check that the
#      header compiles on its own).
# The scratch directory is removed only when every step succeeded.
#
# All path expansions are quoted so directories containing whitespace work.
generate_rtc_header() {
  # Scratch directory, made unique per process by the PID suffix.
  tmp=$tmpdir/hip_rtc.$$
  mkdir -p "$tmp"
  local macroFile="$tmp/hip_macros.h"
  local headerFile="$tmp/hipRTC_header.h"
  local mcinFile="$tmp/hipRTC_header.mcin"

  create_hip_macro_file "$macroFile"

  # NOTE(review): INT_MAX is defined as __INTMAX_MAX__ (the intmax_t limit)
  # rather than __INT_MAX__ - confirm this is intended.
  cat >"$headerFile" <<EOF
#pragma push_macro("CHAR_BIT")
#pragma push_macro("INT_MAX")
#define CHAR_BIT __CHAR_BIT__
#define INT_MAX __INTMAX_MAX__

#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"

#pragma pop_macro("CHAR_BIT")
#pragma pop_macro("INT_MAX")
EOF

  echo "// Automatically generated script for HIP RTC." > "$mcinFile"
  # The .type directives are emitted only for non-Windows builds.
  if [[ $isWindows -eq 0 ]]; then
    echo " .type __hipRTC_header,@object" >> "$mcinFile"
    echo " .type __hipRTC_header_size,@object" >> "$mcinFile"
  fi
  # $tmp is expanded here so .incbin points at the preprocessed header.
  cat >>"$mcinFile" <<EOF
.section .hipRTC_header,"a"
.globl __hipRTC_header
.globl __hipRTC_header_size
.p2align 3
__hipRTC_header:
.incbin "$tmp/hiprtc"
__hipRTC_header_size:
.long __hipRTC_header_size - __hipRTC_header
EOF

  # Trace the tool invocations; note this stays enabled after the function.
  set -x
  "$LLVM_DIR/bin/clang" -O3 --rocm-path="$HIP_INC_DIR/.." -std=c++14 -nogpulib --hip-version=4.4 -isystem "$HIP_INC_DIR" -isystem "$HIP_BUILD_INC_DIR" -isystem "$HIP_AMD_INC_DIR" --cuda-device-only -D__HIPCC_RTC__ -x hip "$tmp/hipRTC_header.h" -E -o "$tmp/hiprtc" &&
  cat "$macroFile" >> "$tmp/hiprtc" &&
  "$LLVM_DIR/bin/llvm-mc" -o "$tmp/hiprtc_header.o" "$tmp/hipRTC_header.mcin" --filetype=obj &&
  "$LLVM_DIR/bin/clang" "$tmp/hiprtc_header.o" -o "$rtc_shared_lib_out" -shared &&
  "$LLVM_DIR/bin/clang" -O3 --rocm-path="$HIP_INC_DIR/.." -std=c++14 -nogpulib -nogpuinc -emit-llvm -c -o "$tmp/tmp.bc" --cuda-device-only -D__HIPCC_RTC__ --offload-arch=gfx906 -x hip-cpp-output "$tmp/hiprtc" &&
  rm -rf "$tmp"
}
|
||||
|
||||
# Run the generator selected during argument parsing.
case $TARGET in
  (generatertc) generate_rtc_header ;;
  (generatepch) generate_pch ;;
  # No "die" helper is defined in this script, so report the problem on
  # stderr and exit non-zero explicitly.
  (*) echo "Invalid target $TARGET" >&2 ; exit 1 ;;
esac
|
||||
|
||||
@@ -1,772 +0,0 @@
|
||||
#!/usr/bin/perl -w
|
||||
# Copyright (c) 2015-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
# Need perl > 5.10 to use logic-defined or
|
||||
use 5.006; use v5.10.1;
|
||||
use File::Basename;
|
||||
use File::Temp qw/ :mktemp /;
|
||||
use Cwd;
|
||||
use Cwd 'abs_path';
|
||||
|
||||
# HIP compiler driver
|
||||
# Will call clang or nvcc (depending on target) and pass the appropriate include and library options for
|
||||
# the target compiler and HIP infrastructure.
|
||||
|
||||
# Will pass-through options to the target compiler. The tools calling HIPCC must ensure the compiler
|
||||
# options are appropriate for the target compiler.
|
||||
|
||||
# Environment variable HIP_PLATFORM is to detect amd/nvidia path:
|
||||
# HIP_PLATFORM='nvidia' or HIP_PLATFORM='amd'.
|
||||
# If HIP_PLATFORM is not set hipcc will attempt auto-detect based on if nvcc is found.
|
||||
#
|
||||
# Other environment variable controls:
|
||||
# HIP_PATH : Path to HIP directory, default is one dir level above location of this script.
|
||||
# CUDA_PATH : Path to CUDA SDK (default /usr/local/cuda). Used on NVIDIA platforms only.
|
||||
# HSA_PATH : Path to HSA dir (defaults to ../../hsa relative to abs_path
|
||||
# of this script). Used on AMD platforms only.
|
||||
# HIP_ROCCLR_HOME : Path to HIP/ROCclr directory. Used on AMD platforms only.
|
||||
# HIP_CLANG_PATH : Path to HIP-Clang (default to ../../llvm/bin relative to this
|
||||
# script's abs_path). Used on AMD platforms only.
|
||||
|
||||
if(scalar @ARGV == 0){
|
||||
print "No Arguments passed, exiting ...\n";
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
$verbose = $ENV{'HIPCC_VERBOSE'} // 0;
|
||||
# Verbose: 0x1=commands, 0x2=paths, 0x4=hipcc args
|
||||
|
||||
$HIPCC_COMPILE_FLAGS_APPEND=$ENV{'HIPCC_COMPILE_FLAGS_APPEND'};
|
||||
$HIPCC_LINK_FLAGS_APPEND=$ENV{'HIPCC_LINK_FLAGS_APPEND'};
|
||||
|
||||
# Known HIP target names.
|
||||
@knownTargets = ('gfx700', 'gfx701', 'gfx702', 'gfx703', 'gfx704', 'gfx705',
|
||||
'gfx801', 'gfx802', 'gfx803', 'gfx805', 'gfx810',
|
||||
'gfx900', 'gfx902', 'gfx904', 'gfx906', 'gfx908', 'gfx909', 'gfx90a',
|
||||
'gfx1010', 'gfx1011', 'gfx1012', 'gfx1030', 'gfx1031', 'gfx1032');
|
||||
# Known Features
|
||||
@knownFeatures = ('sramecc-', 'sramecc+', 'xnack-', 'xnack+');
|
||||
|
||||
$HIP_LIB_PATH=$ENV{'HIP_LIB_PATH'};
|
||||
$DEVICE_LIB_PATH=$ENV{'DEVICE_LIB_PATH'};
|
||||
$HIP_CLANG_HCC_COMPAT_MODE=$ENV{'HIP_CLANG_HCC_COMPAT_MODE'}; # HCC compatibility mode
|
||||
$HIP_COMPILE_CXX_AS_HIP=$ENV{'HIP_COMPILE_CXX_AS_HIP'} // "1";
|
||||
|
||||
#---
|
||||
# Temporary directories
|
||||
my @tmpDirs = ();
|
||||
|
||||
#---
# Create a fresh temporary directory under /tmp, remember it in @tmpDirs so
# delete_temp_dirs can remove it later, and return its path.
sub get_temp_dir {
    my $created = mkdtemp("/tmp/hipccXXXXXXXX");
    push @tmpDirs, $created;
    return $created;
}
|
||||
|
||||
#---
# Remove every directory recorded in @tmpDirs with a single "rm -rf" call.
# Does nothing when no directory was created. Always returns 0.
sub delete_temp_dirs {
    return 0 unless @tmpDirs;

    my $rmCommand = 'rm -rf ' . join(' ', @tmpDirs);
    system($rmCommand);
    return 0;
}
|
||||
|
||||
my $base_dir;
|
||||
BEGIN {
|
||||
$base_dir = dirname(Cwd::realpath(__FILE__) );
|
||||
}
|
||||
use lib "$base_dir/";
|
||||
use hipvars;
|
||||
|
||||
$isWindows = $hipvars::isWindows;
|
||||
$HIP_RUNTIME = $hipvars::HIP_RUNTIME;
|
||||
$HIP_PLATFORM = $hipvars::HIP_PLATFORM;
|
||||
$HIP_COMPILER = $hipvars::HIP_COMPILER;
|
||||
$HIP_CLANG_PATH = $hipvars::HIP_CLANG_PATH;
|
||||
$CUDA_PATH = $hipvars::CUDA_PATH;
|
||||
$HIP_PATH = $hipvars::HIP_PATH;
|
||||
$ROCM_PATH = $hipvars::ROCM_PATH;
|
||||
$HIP_VERSION = $hipvars::HIP_VERSION;
|
||||
$HSA_PATH = $hipvars::HSA_PATH;
|
||||
$HIP_ROCCLR_HOME = $hipvars::HIP_ROCCLR_HOME;
|
||||
|
||||
# AMD platform: derive include/library/device-library locations.
# Values already supplied through the environment (HIP_LIB_PATH,
# DEVICE_LIB_PATH) are kept; HIP_INCLUDE_PATH is always set from
# HIP_ROCCLR_HOME.
if ($HIP_PLATFORM eq "amd") {
    # If using ROCclr runtime, need to find HIP_ROCCLR_HOME
    $HIP_INCLUDE_PATH = "$HIP_ROCCLR_HOME/include";
    $HIP_LIB_PATH //= "$HIP_ROCCLR_HOME/lib";

    # Device library search order: ROCclr bitcode dir, ROCm amdgcn bitcode
    # dir, then the legacy ROCm lib dir.
    unless (defined $DEVICE_LIB_PATH) {
        if (-e "$HIP_ROCCLR_HOME/lib/bitcode") {
            $DEVICE_LIB_PATH = "$HIP_ROCCLR_HOME/lib/bitcode";
        } elsif (-e "$ROCM_PATH/amdgcn/bitcode") {
            $DEVICE_LIB_PATH = "$ROCM_PATH/amdgcn/bitcode";
        } else {
            # This path is to support an older build of the device library
            # TODO: To be removed in the future.
            $DEVICE_LIB_PATH = "$ROCM_PATH/lib";
        }
    }
}
|
||||
|
||||
if ($verbose & 0x2) {
|
||||
print ("HIP_PATH=$HIP_PATH\n");
|
||||
print ("HIP_PLATFORM=$HIP_PLATFORM\n");
|
||||
print ("HIP_COMPILER=$HIP_COMPILER\n");
|
||||
print ("HIP_RUNTIME=$HIP_RUNTIME\n");
|
||||
}
|
||||
|
||||
# set if user explicitly requests -stdlib=libc++. (else we default to libstdc++ for better interop with g++):
|
||||
$setStdLib = 0; # TODO - set to 0
|
||||
|
||||
$default_amdgpu_target = 1;
|
||||
|
||||
# Select the compiler driver ($HIPCC) and seed the compile/link flag strings
# ($HIPCXXFLAGS, $HIPCFLAGS, $HIPLDFLAGS) for the active HIP_PLATFORM.
# An unknown platform is reported and the script exits with -1.
if ($HIP_PLATFORM eq "amd") {
    # Probe for clang++ using the bare path. $HIPCC itself carries literal
    # double quotes (so it survives the shell), and a -e test on that quoted
    # string can never match an existing file.
    my $clangxxPath = "$HIP_CLANG_PATH/clang++";
    if (-e $clangxxPath) {
        $HIPCC = "\"$clangxxPath\"";
    } else {
        # If clang++ is not compiled, use clang instead
        $HIPCC = "\"$HIP_CLANG_PATH/clang\"";
        $HIPLDFLAGS = "--driver-mode=g++";
    }

    # Extract the version token following "clang version" from the
    # compiler's --version output.
    $HIP_CLANG_VERSION = `$HIPCC --version`;
    $HIP_CLANG_VERSION=~/.*clang version (\S+).*/;
    $HIP_CLANG_VERSION=$1;

    if (! defined $HIP_CLANG_INCLUDE_PATH) {
        $HIP_CLANG_INCLUDE_PATH = abs_path("$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/include");
    }
    if (! defined $HIP_INCLUDE_PATH) {
        $HIP_INCLUDE_PATH = "$HIP_PATH/include";
    }
    if (! defined $HIP_LIB_PATH) {
        $HIP_LIB_PATH = "$HIP_PATH/lib";
    }
    if ($verbose & 0x2) {
        print ("ROCM_PATH=$ROCM_PATH\n");
        if (defined $HIP_ROCCLR_HOME) {
            print ("HIP_ROCCLR_HOME=$HIP_ROCCLR_HOME\n");
        }
        print ("HIP_CLANG_PATH=$HIP_CLANG_PATH\n");
        print ("HIP_CLANG_INCLUDE_PATH=$HIP_CLANG_INCLUDE_PATH\n");
        print ("HIP_INCLUDE_PATH=$HIP_INCLUDE_PATH\n");
        print ("HIP_LIB_PATH=$HIP_LIB_PATH\n");
        print ("DEVICE_LIB_PATH=$DEVICE_LIB_PATH\n");
    }

    if ($isWindows) {
        $HIPCXXFLAGS .= " -std=c++14 -fms-extensions -fms-compatibility";
    } else {
        $HIPCXXFLAGS .= " -std=c++11";
    }
    $HIPCXXFLAGS .= " -isystem \"$HIP_CLANG_INCLUDE_PATH/..\"";
    $HIPCFLAGS .= " -isystem \"$HIP_CLANG_INCLUDE_PATH/..\"";
    $HIPLDFLAGS .= " -L\"$HIP_LIB_PATH\"";
    if ($isWindows) {
        $HIPLDFLAGS .= " -lamdhip64";
    }
    if ($HIP_CLANG_HCC_COMPAT_MODE) {
        ## Allow __fp16 as function parameter and return type.
        $HIPCXXFLAGS .= " -Xclang -fallow-half-arguments-and-returns -D__HIP_HCC_COMPAT_MODE__=1";
    }

    if (not $isWindows) {
        $HSA_PATH=$ENV{'HSA_PATH'} // "$ROCM_PATH/hsa";
        $HIPCXXFLAGS .= " -isystem $HSA_PATH/include";
        $HIPCFLAGS .= " -isystem $HSA_PATH/include";
    }

} elsif ($HIP_PLATFORM eq "nvidia") {
    $CUDA_PATH=$ENV{'CUDA_PATH'} // '/usr/local/cuda';
    $HIP_INCLUDE_PATH = "$HIP_PATH/include";
    if ($verbose & 0x2) {
        print ("CUDA_PATH=$CUDA_PATH\n");
    }

    $HIPCC="$CUDA_PATH/bin/nvcc";
    $HIPCXXFLAGS .= " -Wno-deprecated-gpu-targets ";
    $HIPCXXFLAGS .= " -isystem $CUDA_PATH/include";
    $HIPCFLAGS .= " -isystem $CUDA_PATH/include";

    $HIPLDFLAGS = " -Wno-deprecated-gpu-targets -lcuda -lcudart -L$CUDA_PATH/lib64";
} else {
    # Use print, not printf: the interpolated values must not be parsed as
    # a format string (a '%' in them would be misinterpreted).
    print ("error: unknown HIP_PLATFORM = '$HIP_PLATFORM'");
    print (" or HIP_COMPILER = '$HIP_COMPILER'");
    exit (-1);
}
|
||||
|
||||
# Add paths to common HIP includes:
|
||||
$HIPCXXFLAGS .= " -isystem \"$HIP_INCLUDE_PATH\"" ;
|
||||
$HIPCFLAGS .= " -isystem \"$HIP_INCLUDE_PATH\"" ;
|
||||
|
||||
my $compileOnly = 0;
|
||||
my $needCXXFLAGS = 0; # need to add CXX flags to compile step
|
||||
my $needCFLAGS = 0; # need to add C flags to compile step
|
||||
my $needLDFLAGS = 1; # need to add LDFLAGS to compile step.
|
||||
my $fileTypeFlag = 0; # to see if -x flag is mentioned
|
||||
my $hasC = 0; # options contain a c-style file
|
||||
my $hasCXX = 0; # options contain a cpp-style file (NVCC must force recognition as GPU file)
|
||||
my $hasCU = 0; # options contain a cu-style file (HCC must force recognition as GPU file)
|
||||
my $hasHIP = 0; # options contain a hip-style file (HIP-Clang must pass offloading options)
|
||||
my $printHipVersion = 0; # print HIP version
|
||||
my $printCXXFlags = 0; # print HIPCXXFLAGS
|
||||
my $printLDFlags = 0; # print HIPLDFLAGS
|
||||
my $runCmd = 1;
|
||||
my $buildDeps = 0;
|
||||
my $linkType = 1;
|
||||
my $setLinkType = 0;
|
||||
my $hsacoVersion = 0;
|
||||
my $funcSupp = 0; # enable function support
|
||||
my $rdc = 0; # whether -fgpu-rdc is on
|
||||
|
||||
my @options = ();
|
||||
my @inputs = ();
|
||||
|
||||
if ($verbose & 0x4) {
|
||||
print "hipcc-args: ", join (" ", @ARGV), "\n";
|
||||
}
|
||||
|
||||
# Handle code object generation
# On the NVIDIA platform, "--genco" as the first argument re-invokes hipcc
# with -ptx followed by the remaining arguments, then exits. The script dies
# if that command reports failure.
my $ISACMD="";
if ($HIP_PLATFORM eq "nvidia") {
    $ISACMD .= "$HIP_PATH/bin/hipcc -ptx ";
    if ($ARGV[0] eq "--genco") {
        # Append every argument after --genco, each preceded by one space.
        $ISACMD .= join("", map { " " . $_ } @ARGV[1..$#ARGV]);
        print "hipcc-cmd: ", $ISACMD, "\n" if ($verbose & 0x1);
        system($ISACMD) and die();
        exit(0);
    }
}
|
||||
|
||||
# TODO: convert toolArgs to an array rather than a string
|
||||
my $toolArgs = ""; # arguments to pass to the clang or nvcc tool
|
||||
my $optArg = ""; # -O args
|
||||
|
||||
# TODO: hipcc uses --amdgpu-target for historical reasons. It should be replaced
|
||||
# by clang option --offload-arch.
|
||||
my @targetOpts = ('--offload-arch=', '--amdgpu-target=');
|
||||
|
||||
my $targetsStr = "";
|
||||
my $skipOutputFile = 0; # file followed by -o should not contribute in picking compiler flags
|
||||
my $prevArg = ""; # previous argument
|
||||
|
||||
foreach $arg (@ARGV)
|
||||
{
|
||||
# Save $arg, it can get changed in the loop.
|
||||
$trimarg = $arg;
|
||||
# TODO: figure out why this space removal is wanted.
|
||||
# TODO: If someone has gone to the effort of quoting the spaces to the shell
|
||||
# TODO: why are we removing it here?
|
||||
$trimarg =~ s/^\s+|\s+$//g; # Remove leading and trailing whitespace
|
||||
my $swallowArg = 0;
|
||||
my $escapeArg = 1;
|
||||
if ($arg eq '-c' or $arg eq '--genco' or $arg eq '-E') {
|
||||
$compileOnly = 1;
|
||||
$needLDFLAGS = 0;
|
||||
}
|
||||
|
||||
if ($skipOutputFile) {
|
||||
# TODO: handle filename with shell metacharacters
|
||||
$toolArgs .= " $arg";
|
||||
$prevArg = $arg;
|
||||
$skipOutputFile = 0;
|
||||
next;
|
||||
}
|
||||
|
||||
if ($arg eq '-o') {
|
||||
$needLDFLAGS = 1;
|
||||
$skipOutputFile = 1;
|
||||
}
|
||||
|
||||
if(($trimarg eq '-stdlib=libc++') and ($setStdLib eq 0))
|
||||
{
|
||||
$HIPCXXFLAGS .= " -stdlib=libc++";
|
||||
$setStdLib = 1;
|
||||
}
|
||||
|
||||
# Check target selection option: --offload-arch= and --amdgpu-target=...
|
||||
foreach my $targetOpt (@targetOpts) {
|
||||
if (substr($arg, 0, length($targetOpt)) eq $targetOpt) {
|
||||
# If targets string is not empty, add a comma before adding new target option value.
|
||||
$targetsStr .= ($targetsStr ? ',' : '');
|
||||
$targetsStr .= substr($arg, length($targetOpt));
|
||||
$default_amdgpu_target = 0;
|
||||
# Collect the GPU arch options and pass them to clang later.
|
||||
if ($HIP_PLATFORM eq "amd") {
|
||||
$swallowArg = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (($arg =~ /--genco/) and $HIP_PLATFORM eq 'amd' ) {
|
||||
$arg = "--cuda-device-only";
|
||||
}
|
||||
|
||||
if($trimarg eq '--version') {
|
||||
$printHipVersion = 1;
|
||||
}
|
||||
if($trimarg eq '--short-version') {
|
||||
$printHipVersion = 1;
|
||||
$runCmd = 0;
|
||||
}
|
||||
if($trimarg eq '--cxxflags') {
|
||||
$printCXXFlags = 1;
|
||||
$runCmd = 0;
|
||||
}
|
||||
if($trimarg eq '--ldflags') {
|
||||
$printLDFlags = 1;
|
||||
$runCmd = 0;
|
||||
}
|
||||
if($trimarg eq '-M') {
|
||||
$compileOnly = 1;
|
||||
$buildDeps = 1;
|
||||
}
|
||||
if($trimarg eq '-use_fast_math') {
|
||||
$HIPCXXFLAGS .= " -DHIP_FAST_MATH ";
|
||||
$HIPCFLAGS .= " -DHIP_FAST_MATH ";
|
||||
}
|
||||
if(($trimarg eq '-use-staticlib') and ($setLinkType eq 0))
|
||||
{
|
||||
$linkType = 0;
|
||||
$setLinkType = 1;
|
||||
$swallowArg = 1;
|
||||
}
|
||||
if(($trimarg eq '-use-sharedlib') and ($setLinkType eq 0))
|
||||
{
|
||||
$linkType = 1;
|
||||
$setLinkType = 1;
|
||||
}
|
||||
if($arg =~ m/^-O/)
|
||||
{
|
||||
$optArg = $arg;
|
||||
}
|
||||
if($arg =~ '--amdhsa-code-object-version=')
|
||||
{
|
||||
$arg =~ s/--amdhsa-code-object-version=//;
|
||||
$hsacoVersion = $arg;
|
||||
$swallowArg = 1;
|
||||
}
|
||||
|
||||
# nvcc does not handle standard compiler options properly
|
||||
# This can prevent hipcc being used as standard CXX/C Compiler
|
||||
# To fix this we need to pass -Xcompiler for options
|
||||
if (($arg eq '-fPIC' or $arg =~ '-Wl,') and $HIP_COMPILER eq 'nvcc')
|
||||
{
|
||||
$HIPCXXFLAGS .= " -Xcompiler ".$arg;
|
||||
$swallowArg = 1;
|
||||
}
|
||||
|
||||
## process linker response file for hip-clang
|
||||
## extract object files from static library and pass them directly to
|
||||
## hip-clang in command line.
|
||||
## ToDo: Remove this after hip-clang switch to lto and lld is able to
|
||||
## handle clang-offload-bundler bundles.
|
||||
if (($arg =~ m/^-Wl,@/ or $arg =~ m/^@/) and
|
||||
$HIP_PLATFORM eq 'amd') {
|
||||
my @split_arg = (split /\@/, $arg, 2); # option prefix (-Wl, or empty) and filename; the limit of 2 keeps any further '@' inside the filename
|
||||
my $file = $split_arg[1];
|
||||
open my $in, "<:encoding(utf8)", $file or die "$file: $!";
|
||||
my $new_arg = "";
|
||||
my $tmpdir = get_temp_dir ();
|
||||
my $new_file = "$tmpdir/response_file";
|
||||
open my $out, ">", $new_file or die "$new_file: $!";
|
||||
while (my $line = <$in>) {
|
||||
chomp $line;
|
||||
if ($line =~ m/\.a$/ || $line =~ m/\.lo$/) {
|
||||
my $libFile = $line;
|
||||
my $path = abs_path($line);
|
||||
my @objs = split ('\n', `cd $tmpdir; ar xv $path`);
|
||||
## Check if all files in .a are object files.
|
||||
my $allIsObj = 1;
|
||||
my $realObjs = "";
|
||||
foreach my $obj (@objs) {
|
||||
chomp $obj;
|
||||
$obj =~ s/^x - //;
|
||||
$obj = "$tmpdir/$obj";
|
||||
my $fileType = `file $obj`;
|
||||
my $isObj = ($fileType =~ m/ELF/ or $fileType =~ m/COFF/);
|
||||
$allIsObj = ($allIsObj and $isObj);
|
||||
if ($isObj) {
|
||||
$realObjs = ($realObjs . " " . $obj);
|
||||
} else {
|
||||
push (@inputs, $obj);
|
||||
$new_arg = "$new_arg $obj";
|
||||
}
|
||||
}
|
||||
chomp $realObjs;
|
||||
if ($allIsObj) {
|
||||
print $out "$line\n";
|
||||
} elsif ($realObjs) {
|
||||
my($libBaseName, $libDir, $libExt) = fileparse($libFile);
|
||||
$libBaseName = mktemp($libBaseName . "XXXX") . $libExt;
|
||||
system("cd $tmpdir; ar c $libBaseName $realObjs");
|
||||
print $out "$tmpdir/$libBaseName\n";
|
||||
}
|
||||
} elsif ($line =~ m/\.o$/) {
|
||||
my $fileType = `file $line`;
|
||||
my $isObj = ($fileType =~ m/ELF/ or $fileType =~ m/COFF/);
|
||||
if ($isObj) {
|
||||
print $out "$line\n";
|
||||
} else {
|
||||
push (@inputs, $line);
|
||||
$new_arg = "$new_arg $line";
|
||||
}
|
||||
} else {
|
||||
print $out "$line\n";
|
||||
}
|
||||
}
|
||||
close $in;
|
||||
close $out;
|
||||
$arg = "$new_arg $split_arg[0]\@$new_file";
|
||||
$escapeArg = 0;
|
||||
} elsif (($arg =~ m/\.a$/ || $arg =~ m/\.lo$/) &&
|
||||
$HIP_PLATFORM eq 'amd') {
|
||||
## process static library for hip-clang
|
||||
## extract object files from static library and pass them directly to
|
||||
## hip-clang.
|
||||
## ToDo: Remove this after hip-clang switch to lto and lld is able to
|
||||
## handle clang-offload-bundler bundles.
|
||||
my $new_arg = "";
|
||||
my $tmpdir = get_temp_dir ();
|
||||
my $libFile = $arg;
|
||||
my $path = abs_path($arg);
|
||||
my @objs = split ('\n', `cd $tmpdir; ar xv $path`);
|
||||
## Check if all files in .a are object files.
|
||||
my $allIsObj = 1;
|
||||
my $realObjs = "";
|
||||
foreach my $obj (@objs) {
|
||||
chomp $obj;
|
||||
$obj =~ s/^x - //;
|
||||
$obj = "$tmpdir/$obj";
|
||||
my $fileType = `file $obj`;
|
||||
my $isObj = ($fileType =~ m/ELF/ or $fileType =~ m/COFF/);
|
||||
if ($fileType =~ m/ELF/) {
|
||||
my $sections = `readelf -e -W $obj`;
|
||||
$isObj = !($sections =~ m/__CLANG_OFFLOAD_BUNDLE__/);
|
||||
}
|
||||
$allIsObj = ($allIsObj and $isObj);
|
||||
if ($isObj) {
|
||||
$realObjs = ($realObjs . " " . $obj);
|
||||
} else {
|
||||
push (@inputs, $obj);
|
||||
if ($new_arg ne "") {
|
||||
$new_arg .= " ";
|
||||
}
|
||||
$new_arg .= "$obj";
|
||||
}
|
||||
}
|
||||
chomp $realObjs;
|
||||
if ($allIsObj) {
|
||||
$new_arg = $arg;
|
||||
} elsif ($realObjs) {
|
||||
my($libBaseName, $libDir, $libExt) = fileparse($libFile);
|
||||
$libBaseName = mktemp($libBaseName . "XXXX") . $libExt;
|
||||
system("cd $tmpdir; ar c $libBaseName $realObjs");
|
||||
$new_arg .= " $tmpdir/$libBaseName";
|
||||
}
|
||||
$arg = "$new_arg";
|
||||
$escapeArg = 0;
|
||||
if ($toolArgs =~ m/-Xlinker$/) {
|
||||
$toolArgs = substr $toolArgs, 0, -8;
|
||||
chomp $toolArgs;
|
||||
}
|
||||
} elsif ($arg eq '-x') {
|
||||
$fileTypeFlag = 1;
|
||||
} elsif (($arg eq 'c' and $prevArg eq '-x') or ($arg eq '-xc')) {
|
||||
$fileTypeFlag = 1;
|
||||
$hasC = 1;
|
||||
$hasCXX = 0;
|
||||
$hasHIP = 0;
|
||||
} elsif (($arg eq 'c++' and $prevArg eq '-x') or ($arg eq '-xc++')) {
|
||||
$fileTypeFlag = 1;
|
||||
$hasC = 0;
|
||||
$hasCXX = 1;
|
||||
$hasHIP = 0;
|
||||
} elsif (($arg eq 'hip' and $prevArg eq '-x') or ($arg eq '-xhip')) {
|
||||
$fileTypeFlag = 1;
|
||||
$hasC = 0;
|
||||
$hasCXX = 0;
|
||||
$hasHIP = 1;
|
||||
} elsif ($arg =~ m/^-/) {
|
||||
# options start with -
|
||||
if ($arg eq '-fgpu-rdc') {
|
||||
$rdc = 1;
|
||||
} elsif ($arg eq '-fno-gpu-rdc') {
|
||||
$rdc = 0;
|
||||
}
|
||||
|
||||
# Process HIPCC options here:
|
||||
if ($arg =~ m/^--hipcc/) {
|
||||
$swallowArg = 1;
|
||||
#if $arg eq "--hipcc_profile") { # Example argument here, hipcc
|
||||
#
|
||||
#}
|
||||
if ($arg eq "--hipcc-func-supp") {
|
||||
$funcSupp = 1;
|
||||
} elsif ($arg eq "--hipcc-no-func-supp") {
|
||||
$funcSupp = 0;
|
||||
}
|
||||
} else {
|
||||
push (@options, $arg);
|
||||
}
|
||||
#print "O: <$arg>\n";
|
||||
} elsif ($prevArg ne '-o') {
|
||||
# input files and libraries
|
||||
# Skip guessing if `-x {c|c++|hip}` is already specified.
|
||||
|
||||
# Add proper file extension before each file type
|
||||
# File Extension -> Flag
|
||||
# .c -> -x c
|
||||
# .cpp/.cxx/.cc/.cu/.cuh/.hip -> -x hip
|
||||
if ($fileTypeFlag eq 0) {
|
||||
if ($arg =~ /\.c$/) {
|
||||
$hasC = 1;
|
||||
$needCFLAGS = 1;
|
||||
$toolArgs .= " -x c";
|
||||
} elsif (($arg =~ /\.cpp$/) or ($arg =~ /\.cxx$/) or ($arg =~ /\.cc$/) or ($arg =~ /\.C$/)) {
|
||||
$needCXXFLAGS = 1;
|
||||
if ($HIP_COMPILE_CXX_AS_HIP eq '0' or $HIP_PLATFORM ne "amd") {
|
||||
$hasCXX = 1;
|
||||
} elsif ($HIP_PLATFORM eq "amd") {
|
||||
$hasHIP = 1;
|
||||
$toolArgs .= " -x hip";
|
||||
}
|
||||
} elsif ((($arg =~ /\.cu$/ or $arg =~ /\.cuh$/) and $HIP_COMPILE_CXX_AS_HIP ne '0') or ($arg =~ /\.hip$/)) {
|
||||
$needCXXFLAGS = 1;
|
||||
if ($HIP_PLATFORM eq "amd") {
|
||||
$hasHIP = 1;
|
||||
$toolArgs .= " -x hip";
|
||||
} else {
|
||||
$hasCU = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ($hasC) {
|
||||
$needCFLAGS = 1;
|
||||
} elsif ($hasCXX or $hasHIP) {
|
||||
$needCXXFLAGS = 1;
|
||||
}
|
||||
push (@inputs, $arg);
|
||||
#print "I: <$arg>\n";
|
||||
}
|
||||
# Produce a version of $arg where characters significant to the shell are
|
||||
# quoted. One could quote everything of course but don't bother for
|
||||
# common characters such as alphanumerics.
|
||||
# Do the quoting here because sometimes the $arg is changed in the loop
|
||||
# Important to have all of '-Xlinker' in the set of unquoted characters.
|
||||
if (not $isWindows and $escapeArg) { # Windows needs different quoting, ignore for now
|
||||
$arg =~ s/[^-a-zA-Z0-9_=+,.\/]/\\$&/g;
|
||||
}
|
||||
$toolArgs .= " $arg" unless $swallowArg;
|
||||
$prevArg = $arg;
|
||||
}
|
||||
|
||||
if($HIP_PLATFORM eq "amd"){
|
||||
# No AMDGPU target specified at commandline. So look for HCC_AMDGPU_TARGET
|
||||
if($default_amdgpu_target eq 1) {
|
||||
if (defined $ENV{HCC_AMDGPU_TARGET}) {
|
||||
$targetsStr = $ENV{HCC_AMDGPU_TARGET};
|
||||
} elsif (not $isWindows) {
|
||||
# Else try using rocm_agent_enumerator
|
||||
$ROCM_AGENT_ENUM = "${ROCM_PATH}/bin/rocm_agent_enumerator";
|
||||
$targetsStr = `${ROCM_AGENT_ENUM} -t GPU`;
|
||||
$targetsStr =~ s/\n/,/g;
|
||||
}
|
||||
$default_amdgpu_target = 0;
|
||||
}
|
||||
|
||||
# Parse the targets collected in targetStr and set corresponding compiler options.
|
||||
my @targets = split(',', $targetsStr);
|
||||
$GPU_ARCH_OPT = " --offload-arch=";
|
||||
|
||||
# Turn every requested GPU target into an --offload-arch option and warn about
# processor names or feature suffixes that are not in the known lists.
# A target has the form "<proc>[:<feature>[:<feature>]]".
foreach my $val (@targets) {
    # Ignore 'gfx000' target reported by rocm_agent_enumerator.
    next if $val eq 'gfx000';

    my @procAndFeatures = split(':', $val);
    $len = scalar @procAndFeatures;
    my $procName;
    # Numeric comparison: $len is an element count, not a string.
    if ($len >= 1 and $len <= 3) { # proc and features
        $procName = $procAndFeatures[0];
        for my $i (1 .. $#procAndFeatures) {
            my $feature = $procAndFeatures[$i];
            # grep needs a block (or an expression using $_) to test membership;
            # passing the bare string only tests that string's truthiness.
            unless (grep { $_ eq $feature } @knownFeatures) {
                print "Warning: The Feature: $procAndFeatures[$i] is unknown. Correct compilation is not guaranteed.\n";
            }
        }
    } else {
        $procName = $val;
    }

    $GPU_ARCH_ARG = $GPU_ARCH_OPT . $val;
    $HIPLDARCHFLAGS .= $GPU_ARCH_ARG;
    if ($HIP_PLATFORM eq 'amd' and $hasHIP) {
        $HIPCXXFLAGS .= $GPU_ARCH_ARG;
    }

    # If the specified target is not in the list of known target names, emit a warning.
    unless (grep { $_ eq $procName } @knownTargets) {
        print "Warning: The specified HIP target: $val is unknown. Correct compilation is not guaranteed.\n";
    }
}
|
||||
if ($hsacoVersion > 0) {
|
||||
if ($compileOnly eq 0) {
|
||||
$HIPLDFLAGS .= " -mcode-object-version=$hsacoVersion";
|
||||
} else {
|
||||
$HIPCXXFLAGS .= " -mcode-object-version=$hsacoVersion";
|
||||
}
|
||||
}
|
||||
|
||||
# rocm_agent_enumerator failed! Throw an error and die if linking is required
|
||||
if ($default_amdgpu_target eq 1 and $compileOnly eq 0) {
|
||||
print "No valid AMD GPU target was either specified or found. Please specify a valid target using --offload-arch=<target>.\n" and die();
|
||||
}
|
||||
|
||||
$ENV{HCC_EXTRA_LIBRARIES}="\n";
|
||||
}
|
||||
|
||||
if ($hasCXX and $HIP_PLATFORM eq 'nvidia') {
|
||||
$HIPCXXFLAGS .= " -x cu";
|
||||
}
|
||||
|
||||
if ($buildDeps and $HIP_PLATFORM eq 'nvidia') {
|
||||
$HIPCXXFLAGS .= " -M -D__CUDACC__";
|
||||
$HIPCFLAGS .= " -M -D__CUDACC__";
|
||||
}
|
||||
|
||||
if ($buildDeps and $HIP_PLATFORM eq 'amd') {
|
||||
$HIPCXXFLAGS .= " --cuda-host-only";
|
||||
}
|
||||
|
||||
# Add --hip-link only when linking (i.e. not compile-only) and -fgpu-rdc is on.
|
||||
if ($rdc and !$compileOnly and $HIP_PLATFORM eq 'amd') {
|
||||
$HIPLDFLAGS .= " --hip-link";
|
||||
$HIPLDFLAGS .= $HIPLDARCHFLAGS;
|
||||
}
|
||||
|
||||
# hipcc currently requires separate compilation of source files, i.e. it is not possible to pass
|
||||
# CPP files combined with .O files
|
||||
# Reason is that NVCC uses the file extension to determine whether to compile in CUDA mode or
|
||||
# pass-through CPP mode.
|
||||
|
||||
if ($HIP_PLATFORM eq "amd") {
|
||||
# Set default optimization level to -O3 for hip-clang.
|
||||
if ($optArg eq "") {
|
||||
$HIPCXXFLAGS .= " -O3";
|
||||
$HIPCFLAGS .= " -O3";
|
||||
$HIPLDFLAGS .= " -O3";
|
||||
}
|
||||
if (!$funcSupp and $optArg ne "-O0" and $hasHIP) {
|
||||
$HIPCXXFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false";
|
||||
if ($needLDFLAGS and not $needCXXFLAGS) {
|
||||
$HIPLDFLAGS .= " -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false";
|
||||
}
|
||||
}
|
||||
|
||||
if ($hasHIP) {
|
||||
if ($DEVICE_LIB_PATH ne "$ROCM_PATH/amdgcn/bitcode") {
|
||||
$HIPCXXFLAGS .= " --hip-device-lib-path=\"$DEVICE_LIB_PATH\"";
|
||||
}
|
||||
$HIPCXXFLAGS .= " -fhip-new-launch-api";
|
||||
}
|
||||
if (not $isWindows) {
|
||||
$HIPLDFLAGS .= " -lgcc_s -lgcc -lpthread -lm -lrt";
|
||||
}
|
||||
|
||||
if (not $isWindows and not $compileOnly) {
|
||||
if ($linkType eq 0) {
|
||||
$toolArgs .= " -L$HIP_LIB_PATH -lamdhip64 -L$ROCM_PATH/lib -lhsa-runtime64 -ldl -lnuma ";
|
||||
} else {
|
||||
$toolArgs .= " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib -lamdhip64 ";
|
||||
}
|
||||
# To support __fp16 and _Float16, explicitly link with compiler-rt
|
||||
$toolArgs .= " -L$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/lib/linux -lclang_rt.builtins-x86_64 "
|
||||
}
|
||||
}
|
||||
|
||||
if ($HIPCC_COMPILE_FLAGS_APPEND) {
|
||||
$HIPCXXFLAGS .= " $HIPCC_COMPILE_FLAGS_APPEND";
|
||||
$HIPCFLAGS .= " $HIPCC_COMPILE_FLAGS_APPEND";
|
||||
}
|
||||
if ($HIPCC_LINK_FLAGS_APPEND) {
|
||||
$HIPLDFLAGS .= " $HIPCC_LINK_FLAGS_APPEND";
|
||||
}
|
||||
|
||||
# TODO: convert CMD to an array rather than a string
|
||||
my $CMD="$HIPCC";
|
||||
|
||||
if ($needCFLAGS) {
|
||||
$CMD .= " $HIPCFLAGS";
|
||||
}
|
||||
|
||||
if ($needCXXFLAGS) {
|
||||
$CMD .= " $HIPCXXFLAGS";
|
||||
}
|
||||
|
||||
if ($needLDFLAGS and not $compileOnly) {
|
||||
$CMD .= " $HIPLDFLAGS";
|
||||
}
|
||||
$CMD .= " $toolArgs";
|
||||
|
||||
if ($verbose & 0x1) {
|
||||
print "hipcc-cmd: ", $CMD, "\n";
|
||||
}
|
||||
|
||||
if ($printHipVersion) {
|
||||
if ($runCmd) {
|
||||
print "HIP version: "
|
||||
}
|
||||
print $HIP_VERSION, "\n";
|
||||
}
|
||||
if ($printCXXFlags) {
|
||||
print $HIPCXXFLAGS;
|
||||
}
|
||||
if ($printLDFlags) {
|
||||
print $HIPLDFLAGS;
|
||||
}
|
||||
if ($runCmd) {
|
||||
system ("$CMD");
|
||||
if ($? == -1) {
|
||||
print "failed to execute: $!\n";
|
||||
exit($?);
|
||||
}
|
||||
elsif ($? & 127) {
|
||||
printf "child died with signal %d, %s coredump\n",
|
||||
($? & 127), ($? & 128) ? 'with' : 'without';
|
||||
exit($?);
|
||||
}
|
||||
else {
|
||||
$CMD_EXIT_CODE = $? >> 8;
|
||||
}
|
||||
$? or delete_temp_dirs ();
|
||||
exit($CMD_EXIT_CODE);
|
||||
}
|
||||
|
||||
# vim: ts=4:sw=4:expandtab:smartindent
|
||||
@@ -1,31 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
# Forward the link command to hipcc. For the hcc and clang compilers the first
# argument is the compiler location, exported to hipcc as HCC_HOME or
# HIP_CLANG_PATH; all remaining arguments are passed through unchanged.
SOURCE="${BASH_SOURCE[0]}"
HIP_PATH="$( command cd -P "$( dirname "$SOURCE" )/.." && pwd )"
# Run hipconfig directly: eval would re-parse the path and break on spaces or
# shell metacharacters in HIP_PATH.
HIP_COMPILER=$("$HIP_PATH/bin/hipconfig" --compiler)
if [ "$HIP_COMPILER" = "hcc" ]; then
    HCC_HOME="$1" "$HIP_PATH/bin/hipcc" "${@:2}"
elif [ "$HIP_COMPILER" = "clang" ]; then
    HIP_CLANG_PATH="$1" "$HIP_PATH/bin/hipcc" "${@:2}"
else
    "$HIP_PATH/bin/hipcc" "${@:1}"
fi
|
||||
@@ -1,256 +0,0 @@
|
||||
#!/usr/bin/perl -w
|
||||
# Copyright (c) 2015-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
# Need perl 5.10 or newer to use the defined-or operator (//)
|
||||
use 5.006; use v5.10.1;
|
||||
use Getopt::Long;
|
||||
use Cwd;
|
||||
|
||||
# Return name of HIP compiler - either 'clang' or 'nvcc'
|
||||
#
|
||||
use Getopt::Long;
|
||||
use File::Basename;
|
||||
|
||||
my $base_dir;
|
||||
BEGIN {
|
||||
$base_dir = dirname( Cwd::realpath(__FILE__) );
|
||||
}
|
||||
use lib "$base_dir/";
|
||||
use hipvars;
|
||||
|
||||
$isWindows = $hipvars::isWindows;
|
||||
$HIP_RUNTIME = $hipvars::HIP_RUNTIME;
|
||||
$HIP_PLATFORM = $hipvars::HIP_PLATFORM;
|
||||
$HIP_COMPILER = $hipvars::HIP_COMPILER;
|
||||
$HIP_CLANG_PATH = $hipvars::HIP_CLANG_PATH;
|
||||
$CUDA_PATH = $hipvars::CUDA_PATH;
|
||||
$HIP_PATH = $hipvars::HIP_PATH;
|
||||
$ROCM_PATH = $hipvars::ROCM_PATH;
|
||||
$HIP_VERSION = $hipvars::HIP_VERSION;
|
||||
$HSA_PATH = $hipvars::HSA_PATH;
|
||||
|
||||
Getopt::Long::Configure ( qw{bundling no_ignore_case});
|
||||
GetOptions(
|
||||
"help|h" => \$p_help
|
||||
,"path|p" => \$p_path
|
||||
,"rocmpath|R" => \$p_rocmpath
|
||||
,"compiler|c" => \$p_compiler
|
||||
,"platform|P" => \$p_platform
|
||||
,"runtime|r" => \$p_runtime
|
||||
,"hipclangpath|l" => \$p_hipclangpath
|
||||
,"cpp_config|cxx_config|C" => \$p_cpp_config
|
||||
,"full|f|info" => \$p_full,
|
||||
,"version|v" => \$p_version,
|
||||
,"check" => \$p_check,
|
||||
,"newline|n" => \$p_newline
|
||||
);
|
||||
|
||||
if ($HIP_COMPILER eq "clang") {
|
||||
$HIP_CLANG_VERSION = "";
|
||||
if($isWindows) {
|
||||
$HIP_CLANG_VERSION = `\"$HIP_CLANG_PATH/clang++\" --version`;
|
||||
} else {
|
||||
$HIP_CLANG_VERSION = `$HIP_CLANG_PATH/clang++ --version`;
|
||||
}
|
||||
# Extract the version token from the `clang++ --version` output. Fall back to
# an empty string when clang++ could not be run or printed something
# unexpected, instead of picking up an undefined or stale $1.
$HIP_CLANG_VERSION = (defined $HIP_CLANG_VERSION and $HIP_CLANG_VERSION =~ /.*clang version (\S+).*/) ? $1 : "";
|
||||
|
||||
$CPP_CONFIG = " -D__HIP_PLATFORM_HCC__= -D__HIP_PLATFORM_AMD__=";
|
||||
|
||||
$HIP_PATH_INCLUDE = $HIP_PATH."/include";
|
||||
$HIP_CLANG_INCLUDE = $HIP_CLANG_PATH."/../lib/clang/".$HIP_CLANG_VERSION;
|
||||
if($isWindows) {
|
||||
$CPP_CONFIG .= " -I\"$HIP_PATH_INCLUDE\" -I\"$HIP_CLANG_INCLUDE\"";
|
||||
} else {
|
||||
$CPP_CONFIG .= " -I$HIP_PATH_INCLUDE -I$HIP_CLANG_INCLUDE -I$HSA_PATH/include";
|
||||
}
|
||||
}
|
||||
if ($HIP_PLATFORM eq "nvidia") {
|
||||
$CPP_CONFIG = " -D__HIP_PLATFORM_NVCC__= -D__HIP_PLATFORM_NVIDIA__= -I$HIP_PATH/include -I$CUDA_PATH/include";
|
||||
};
|
||||
|
||||
if ($p_help) {
|
||||
print "usage: hipconfig [OPTIONS]\n";
|
||||
print " --path, -p : print HIP_PATH (use env var if set, else determine from hipconfig path)\n";
|
||||
print " --rocmpath, -R : print ROCM_PATH (use env var if set, else determine from hip path or /opt/rocm)\n";
|
||||
print " --cpp_config, -C : print C++ compiler options\n";
|
||||
print " --compiler, -c : print compiler (clang or nvcc)\n";
|
||||
print " --platform, -P : print platform (amd or nvidia)\n";
|
||||
print " --runtime, -r : print runtime (rocclr or cuda)\n";
|
||||
print " --hipclangpath, -l : print HIP_CLANG_PATH\n";
|
||||
print " --full, -f : print full config\n";
|
||||
print " --version, -v : print hip version\n";
|
||||
print " --check : check configuration\n";
|
||||
print " --newline, -n : print newline\n";
|
||||
print " --help, -h : print help message\n";
|
||||
exit();
|
||||
}
|
||||
|
||||
if ($p_path) {
|
||||
print "$HIP_PATH";
|
||||
$printed = 1;
|
||||
}
|
||||
|
||||
if ($p_rocmpath) {
|
||||
print "$ROCM_PATH";
|
||||
$printed = 1;
|
||||
}
|
||||
|
||||
if ($p_cpp_config) {
|
||||
print $CPP_CONFIG;
|
||||
$printed = 1;
|
||||
}
|
||||
|
||||
if ($p_compiler) {
|
||||
print $HIP_COMPILER;
|
||||
$printed = 1;
|
||||
}
|
||||
|
||||
if ($p_platform) {
|
||||
print $HIP_PLATFORM;
|
||||
$printed = 1;
|
||||
}
|
||||
|
||||
if ($p_runtime) {
|
||||
print $HIP_RUNTIME;
|
||||
$printed = 1;
|
||||
}
|
||||
|
||||
if ($p_hipclangpath) {
|
||||
if (defined $HIP_CLANG_PATH) {
|
||||
print $HIP_CLANG_PATH;
|
||||
}
|
||||
$printed = 1;
|
||||
}
|
||||
|
||||
if ($p_version) {
|
||||
print $HIP_VERSION;
|
||||
$printed = 1;
|
||||
}
|
||||
|
||||
if (!$printed or $p_full) {
|
||||
print "HIP version : ", $HIP_VERSION, "\n\n";
|
||||
print "== hipconfig\n";
|
||||
print "HIP_PATH : ", $HIP_PATH, "\n";
|
||||
print "ROCM_PATH : ", $ROCM_PATH, "\n";
|
||||
print "HIP_COMPILER : ", $HIP_COMPILER, "\n";
|
||||
print "HIP_PLATFORM : ", $HIP_PLATFORM, "\n";
|
||||
print "HIP_RUNTIME : ", $HIP_RUNTIME, "\n";
|
||||
print "CPP_CONFIG : ", $CPP_CONFIG, "\n";
|
||||
if ($HIP_PLATFORM eq "amd")
|
||||
{
|
||||
print "\n" ;
|
||||
if ($HIP_COMPILER eq "clang")
|
||||
{
|
||||
print "== hip-clang\n";
|
||||
if (not $isWindows) {
|
||||
print ("HSA_PATH : $HSA_PATH\n");
|
||||
}
|
||||
print ("HIP_CLANG_PATH : $HIP_CLANG_PATH\n");
|
||||
if ($isWindows) {
|
||||
system("\"$HIP_CLANG_PATH/clang++\" --version");
|
||||
system("\"$HIP_CLANG_PATH/llc\" --version");
|
||||
printf("hip-clang-cxxflags : ");
|
||||
$win_output = `perl \"$HIP_PATH/bin/hipcc\" --cxxflags`;
|
||||
print("$win_output \n"); # print, not printf: the captured flags are data and must not be interpreted as a format string
|
||||
printf("hip-clang-ldflags : ");
|
||||
$win_output = `perl \"$HIP_PATH/bin/hipcc\" --ldflags`;
|
||||
print("$win_output \n"); # print, not printf: the captured flags are data and must not be interpreted as a format string
|
||||
} else {
|
||||
system("$HIP_CLANG_PATH/clang++ --version");
|
||||
system("$HIP_CLANG_PATH/llc --version");
|
||||
print ("hip-clang-cxxflags : ");
|
||||
system("$HIP_PATH/bin/hipcc --cxxflags");
|
||||
printf("\n");
|
||||
print ("hip-clang-ldflags : ");
|
||||
system("$HIP_PATH/bin/hipcc --ldflags");
|
||||
printf("\n");
|
||||
}
|
||||
} else {
|
||||
print ("Unexpected HIP_COMPILER: $HIP_COMPILER\n");
|
||||
}
|
||||
}
|
||||
if ($HIP_PLATFORM eq "nvidia") {
|
||||
print "\n" ;
|
||||
print "== nvcc\n";
|
||||
#print "CUDA_PATH :", $CUDA_PATH";
|
||||
system("nvcc --version");
|
||||
|
||||
}
|
||||
print "\n" ;
|
||||
|
||||
print "=== Environment Variables\n";
|
||||
if ($isWindows) {
|
||||
print ("PATH=$ENV{PATH}\n");
|
||||
system("set | findstr /B /C:\"HIP\" /C:\"HSA\" /C:\"CUDA\" /C:\"LD_LIBRARY_PATH\"");
|
||||
} else {
|
||||
system("echo PATH=\$PATH");
|
||||
system("env | egrep '^HIP|^HSA|^CUDA|^LD_LIBRARY_PATH'");
|
||||
}
|
||||
|
||||
|
||||
print "\n" ;
|
||||
if ($isWindows) {
|
||||
print "== Windows Display Drivers\n";
|
||||
print "Hostname : "; system ("hostname");
|
||||
system ("wmic path win32_VideoController get AdapterCompatibility,InstalledDisplayDrivers,Name | findstr /B /C:\"Advanced Micro Devices\"");
|
||||
} else {
|
||||
print "== Linux Kernel\n";
|
||||
print "Hostname : "; system ("hostname");
|
||||
system ("uname -a");
|
||||
}
|
||||
|
||||
if (-e "/usr/bin/lsb_release") {
|
||||
system ("/usr/bin/lsb_release -a");
|
||||
}
|
||||
|
||||
print "\n" ;
|
||||
$printed = 1;
|
||||
}
|
||||
|
||||
|
||||
if ($p_check) {
|
||||
print "\nCheck system installation:\n";
|
||||
|
||||
printf ("%-70s", "check hipconfig in PATH...");
|
||||
# Safer to use which hipconfig instead of invoking hipconfig
|
||||
if (system ("which hipconfig > /dev/null 2>&1") != 0) {
|
||||
print "FAIL\n";
|
||||
} else {
|
||||
printf "good\n";
|
||||
}
|
||||
|
||||
if ($HIP_PLATFORM eq "amd") {
|
||||
$LD_LIBRARY_PATH=$ENV{'LD_LIBRARY_PATH'} // ""; # treat an unset LD_LIBRARY_PATH as empty so the check below reports FAIL without uninitialized-value warnings
|
||||
printf("%-70s", "check LD_LIBRARY_PATH ($LD_LIBRARY_PATH) contains HSA_PATH ($HSA_PATH)...");
|
||||
if (index($LD_LIBRARY_PATH, $HSA_PATH) == -1) {
|
||||
print "FAIL\n";
|
||||
} else {
|
||||
printf "good\n";
|
||||
}
|
||||
|
||||
# TODO - check hipcc / nvcc found and executable.
|
||||
}
|
||||
}
|
||||
|
||||
if ($p_newline) {
|
||||
print "\n";
|
||||
}
|
||||
@@ -1,37 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
#usage : hipconvertinplace-perl.sh DIRNAME [hipify-perl options]
|
||||
|
||||
#hipify "inplace" all code files in specified directory.
|
||||
# This can be quite handy when dealing with an existing CUDA code base since the script
|
||||
# preserves the existing directory structure.
|
||||
|
||||
# For each code file, this script will:
|
||||
# - If ".prehip" file does not exist, copy the original code to a new file with extension ".prehip". Then hipify the code file.
|
||||
# - If ".prehip" file exists, this is used as input to hipify.
|
||||
# (this is useful for testing improvements to the hipify-perl toolset).
|
||||
|
||||
|
||||
# Resolve the directory holding this script; the helper tools are invoked from
# it. The first argument is the directory to convert, the rest are forwarded
# to hipify-perl.
SCRIPT_DIR=$(dirname "$0")
SEARCH_DIR=$1
shift
# The findcode.sh output is deliberately left unquoted so that each file it
# lists becomes a separate argument; the paths themselves are quoted.
"$SCRIPT_DIR/hipify-perl" -inplace -print-stats "$@" $("$SCRIPT_DIR/findcode.sh" "$SEARCH_DIR")
|
||||
@@ -1,43 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2016-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
#usage : hipconvertinplace.sh DIRNAME [hipify options] [--] [clang options]
|
||||
|
||||
#hipify "inplace" all code files in specified directory.
|
||||
# This can be quite handy when dealing with an existing CUDA code base since the script
|
||||
# preserves the existing directory structure.
|
||||
|
||||
# Resolve the directory holding this script; the helper tools are invoked from
# it. The first argument is the directory to convert.
SCRIPT_DIR=$(dirname "$0")
SEARCH_DIR=$1
shift

# Everything up to an optional "--" goes to hipify-clang; everything after it
# goes to clang. Arrays keep each argument intact, including ones with spaces.
hipify_args=()
while (( $# )); do
    if [ "$1" = "--" ]; then
        shift
        break
    fi
    hipify_args+=("$1")
    shift
done
clang_args=("$@")

# The findcode.sh output is deliberately left unquoted so that each file it
# lists becomes a separate argument.
"$SCRIPT_DIR/hipify-clang" -inplace -print-stats "${hipify_args[@]}" $("$SCRIPT_DIR/findcode.sh" "$SEARCH_DIR") -- -x cuda "${clang_args[@]}"
|
||||
@@ -1,38 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2016-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
# usage: hipdemangleatp.sh ATP_FILE
|
||||
|
||||
# ATP trace file that is rewritten in place (first argument).
# Quoted everywhere below so that paths with spaces work and an empty argument
# makes grep fail instead of silently reading stdin.
atp_file=$1

# HIP kernels: every distinct first field of the lines mentioning grid_launch_parm.
kernels=$(grep grid_launch_parm "$atp_file" | cut -d" " -f1 | sort -u)
for mangled_sym in $kernels; do
    # Demangle, keep the third ':'-separated field, drop the "_functor" suffix,
    # then demangle once more without parameter types (-p).
    real_sym=$(c++filt -p $(c++filt _$mangled_sym | cut -d: -f3 | sed 's/_functor//g' | sed 's/ /\\\ /g'))
    sed -i "s/$mangled_sym/$real_sym/g" "$atp_file"
done

# HC kernels: every distinct first field of the lines mentioning cxxamp_trampoline.
kernels=$(grep cxxamp_trampoline "$atp_file" | cut -d" " -f1 | sort -u)
for mangled_sym in $kernels; do
    # Prefix "_" and turn "_EC_" into "$" before demangling; keep only the
    # function name (no parameter list, no leading return type).
    real_sym=$(echo $mangled_sym | sed "s/^/_/g; s/_EC_/$/g" | c++filt -p | cut -d\( -f1 | cut -d" " -f1 --complement | sed 's/ /\\\ /g')
    sed -i "s/$mangled_sym/$real_sym/g" "$atp_file"
done
|
||||
@@ -1,31 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
#usage : hipexamine-perl.sh DIRNAME [hipify-perl options]
|
||||
|
||||
# Generate HIP stats (LOC, CUDA->API conversions, missing functionality) for all the code files
|
||||
# in the specified directory.
|
||||
|
||||
|
||||
# Directory this script lives in; hipify-perl and findcode.sh are invoked from it.
SCRIPT_DIR=$(dirname -- "$0")
# First argument: directory to scan. Remaining arguments are passed to hipify-perl.
SEARCH_DIR=$1
shift

# Paths are quoted so an install or search directory containing spaces works.
# The findcode.sh output stays unquoted on purpose: one argument per word.
"$SCRIPT_DIR/hipify-perl" -no-output -print-stats "$@" $("$SCRIPT_DIR/findcode.sh" ${SEARCH_DIR:+"$SEARCH_DIR"})
|
||||
@@ -1,41 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2016-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
#usage : hipexamine.sh DIRNAME [hipify options] [--] [clang options]
|
||||
|
||||
# Generate CUDA->HIP conversion statistics for all the code files in the specified directory.
|
||||
|
||||
# Directory this script lives in; hipify-clang and findcode.sh are invoked from it.
SCRIPT_DIR=$(dirname -- "$0")
# First argument: directory whose code files are examined.
SEARCH_DIR=$1
shift

# Everything up to an optional "--" is passed to hipify-clang; the rest goes to clang.
# Arrays keep arguments that contain spaces or glob characters intact.
hipify_args=()
while [ "$#" -gt 0 ]; do
    if [ "$1" = "--" ]; then
        shift
        break
    fi
    hipify_args+=("$1")
    shift
done
clang_args=("$@")

# The findcode.sh output is deliberately left unquoted so that it expands to one
# argument per word, exactly as before.
"$SCRIPT_DIR/hipify-clang" -examine "${hipify_args[@]}" $("$SCRIPT_DIR/findcode.sh" ${SEARCH_DIR:+"$SEARCH_DIR"}) -- -x cuda "${clang_args[@]}"
|
||||
@@ -1,279 +0,0 @@
|
||||
#!/usr/bin/perl -w
|
||||
##
|
||||
# Copyright (c) 2015-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
##
|
||||
#usage hipify-cmakefile [OPTIONS] INPUT_FILE
|
||||
use Getopt::Long;
|
||||
|
||||
# Command-line switches; each one sets the package variable of the same name.
GetOptions(
    "print-stats"    => \$print_stats,       # print the command-line, like a header.
    "quiet-warnings" => \$quiet_warnings,    # don't print warnings on unknown CUDA functions.
    "no-output"      => \$no_output,         # don't write any translated output to stdout.
    "inplace"        => \$inplace,           # modify input file inplace, save backup in ".prehip" file.
    "n"              => \$n,                 # combination of print_stats + no-output.
);

# -n is shorthand for -print-stats together with -no-output.
if ($n) {
    $print_stats = 1;
    $no_output   = 1;
}

# Words that are blanked out of a line before it is checked for unsupported names.
@warn_whitelist = ();

# Categories under which conversions are counted, in reporting order.
@statNames = ( "macro", "include", "option", "other" );
|
||||
|
||||
#---
|
||||
#Compute total of all individual counts:
|
||||
# Return the sum of all counters in the hash referenced by the first argument.
sub totalStats {
    my ($counts_ref) = @_;

    my $sum = 0;
    $sum += $_ for values %{$counts_ref};

    return $sum;
}
|
||||
|
||||
#---
|
||||
# Print a one-line statistics summary (without trailing newline) to STDERR.
#   $label       text printed first
#   $names_ref   array ref: counter names, in the order they are printed
#   $counts_ref  hash ref: counter name -> count
#   $warnings    number of warnings to report
#   $loc         number of lines to report
sub printStats {
    my ( $label, $names_ref, $counts_ref, $warnings, $loc ) = @_;

    my $total = totalStats($counts_ref);

    printf STDERR "%s %d CUDA->HIP refs( ", $label, $total;

    for my $name ( @{$names_ref} ) {
        printf STDERR "%s:%d ", $name, $counts_ref->{$name};
    }

    printf STDERR ") warn:%d LOC:%d", $warnings, $loc;
}
|
||||
|
||||
#---
|
||||
# Add adder stats to dest. Used to add stats for current file to a running total for all files:
|
||||
# Add every counter of the second hash (by reference) onto the same key of the
# first hash (by reference). Used to fold one file's stats into a running total.
sub addStats {
    my ( $totals_ref, $delta_ref ) = @_;

    for my $name ( keys %{$delta_ref} ) {
        $totals_ref->{$name} += $delta_ref->{$name};
    }
}
|
||||
|
||||
#---
|
||||
# Set the counter for every name in the array (second argument, by reference)
# to 0 in the hash referenced by the first argument. Other keys are left alone.
sub clearStats {
    my ( $target_ref, $names_ref ) = @_;

    # Hash-slice assignment: one zero per listed name.
    @{$target_ref}{ @{$names_ref} } = (0) x @{$names_ref};
}
|
||||
|
||||
# count of transforms in all files:
my %tt;
clearStats( \%tt, \@statNames );

my $fileCount = @ARGV;
my $fileName  = "";

# Convert every file named on the command line, one at a time.
while (@ARGV) {
    $fileName = shift(@ARGV);

    # Warnings raised for THIS file only. This used to be a never-reset global,
    # so each later file (and the TOTAL line) also counted earlier files' warnings.
    my $warnings = 0;

    if ($inplace) {
        # In-place mode always converts from the ".prehip" backup into the
        # original name; the backup is created on the first run only.
        my $file_prehip = "$fileName" . ".prehip";
        unless ( -e $file_prehip ) {
            # Copy without going through a shell, so names with spaces or shell
            # metacharacters are safe. cp() also keeps the permission bits.
            require File::Copy;
            File::Copy::cp( $fileName, $file_prehip )
              or die "error: could not create backup $file_prehip: $!";
        }
        open( INFILE,  "<", $file_prehip ) or die "error: could not open $file_prehip";
        open( OUTFILE, ">", $fileName )    or die "error: could not open $fileName";
        $OUTFILE = \*OUTFILE;
    }
    else {
        open( INFILE, "<", $fileName ) or die "error: could not open $fileName";
        $OUTFILE = \*STDOUT;
    }

    # count of transforms in this file, init to 0 here:
    my %ft;
    clearStats( \%ft, \@statNames );

    my $lineCount = 0;

    local $/;    # Read whole file at once, so we can match newlines.
    while (<INFILE>) {

        # Replace find_package(CUDA) with find_package(HIP)
        $ft{'include'} += s/\bfind_package[ ]*\([ ]*CUDA[ ]*[0-9.]*/find_package(HIP/ig;

        # Replace macros
        $ft{'macro'} += s/\bCUDA_ADD_EXECUTABLE/HIP_ADD_EXECUTABLE/ig;
        $ft{'macro'} += s/\bCUDA_ADD_LIBRARY/HIP_ADD_LIBRARY/ig;
        $ft{'macro'} += s/\bCUDA_INCLUDE_DIRECTORIES/HIP_INCLUDE_DIRECTORIES/ig;

        # Replace options
        $ft{'option'} += s/\bCUDA_NVCC_FLAGS/HIP_NVCC_FLAGS/ig;
        $ft{'option'} += s/\bCUDA_HOST_COMPILATION_CPP/HIP_HOST_COMPILATION_CPP/ig;
        $ft{'option'} += s/\bCUDA_SOURCE_PROPERTY_FORMAT/HIP_SOURCE_PROPERTY_FORMAT/ig;

        # Replace variables
        $ft{'other'} += s/\bCUDA_FOUND/HIP_FOUND/ig;
        $ft{'other'} += s/\bCUDA_VERSION/HIP_VERSION/ig;
        $ft{'other'} += s/\bCUDA_TOOLKIT_ROOT_DIR/HIP_ROOT_DIR/ig;

        unless ($quiet_warnings) {

            # Check a line-by-line copy for unsupported names. foreach localizes
            # $_, so the whitelist edits below touch only the copies in @lines and
            # the converted file text in $_ is intact after the loop.
            my @lines    = split /\n/, $_;
            my $line_num = 0;

            foreach (@lines) {
                $line_num++;

                # remove any whitelisted words:
                foreach my $w (@warn_whitelist) {
                    s/\b$w\b/ZAP/;
                }

                $warnings += warnUnsupportedSpecialFunctions($line_num);
            }
        }

        #--------
        # Print it!
        unless ($no_output) {
            print $OUTFILE "$_";
        }
        $lineCount = $_ =~ tr/\n//;
    }

    # Release the handles now rather than on the next open or at exit.
    close(INFILE);
    close(OUTFILE) if $inplace;

    my $totalConverted = totalStats( \%ft );

    if ( ( $totalConverted + $warnings ) and $print_stats ) {
        printStats( "info: converted", \@statNames, \%ft, $warnings, $lineCount );
        print STDERR " in '$fileName'\n";
        print STDERR "You may need to hand-edit '$fileName' to add steps to build correctly on HCC path\n";
    }

    # Update totals for all files:
    addStats( \%tt, \%ft );
    $Twarnings  += $warnings;
    $TlineCount += $lineCount;
}

#-- Print total stats for all files processed:
if ( $print_stats and ( $fileCount > 1 ) ) {
    print STDERR "\n";
    printStats( "info: TOTAL-converted", \@statNames, \%tt, $Twarnings, $TlineCount );
    print STDERR "\n";
}
|
||||
|
||||
#---
|
||||
# Check the current line (in $_) for CUDA CMake macros, options and variables
# that have no HIP equivalent, and print one warning to STDERR per name found.
#   $line_num  line number used in the warning text
# Returns the number of distinct unsupported names found on the line.
#
# Each name is matched as a whole word and independently of the others. The
# earlier scalar-context m//g kept pos() from the previous name's match, so a
# second unsupported name located earlier on the same line was silently missed.
sub warnUnsupportedSpecialFunctions {
    my $line_num = shift;
    my $m        = 0;

    # Names that this script rewrites to HIP_* elsewhere (CUDA_ADD_EXECUTABLE,
    # CUDA_ADD_LIBRARY, CUDA_INCLUDE_DIRECTORIES, CUDA_NVCC_FLAGS,
    # CUDA_HOST_COMPILATION_CPP, CUDA_SOURCE_PROPERTY_FORMAT, CUDA_VERSION*,
    # CUDA_TOOLKIT_ROOT_DIR) are deliberately not listed.
    my @unsupported = (

        # macros:
        "CUDA_ADD_CUFFT_TO_TARGET",
        "CUDA_ADD_CUBLAS_TO_TARGET",
        "CUDA_BUILD_CLEAN_TARGET",
        "CUDA_COMPILE",
        "CUDA_COMPILE_PTX",
        "CUDA_COMPILE_FATBIN",
        "CUDA_COMPILE_CUBIN",
        "CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME",
        "CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS",
        "CUDA_SELECT_NVCC_ARCH_FLAGS",
        "CUDA_WRAP_SRCS",

        # options:
        "CUDA_64_BIT_DEVICE_CODE",
        "CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE",
        "CUDA_BUILD_CUBIN",
        "CUDA_BUILD_EMULATION",
        "CUDA_LINK_LIBRARIES_KEYWORD",
        "CUDA_GENERATED_OUTPUT_DIR",
        "CUDA_HOST_COMPILER",
        "CUDA_PROPAGATE_HOST_FLAGS",
        "CUDA_SEPARABLE_COMPILATION",
        "CUDA_USE_STATIC_CUDA_RUNTIME",
        "CUDA_VERBOSE_BUILD",

        # others:
        "CUDA_HAS_FP16",
        "CUDA_SDK_ROOT_DIR",
        "CUDA_INCLUDE_DIRS",
        "CUDA_LIBRARIES",
        "CUDA_CUFFT_LIBRARIES",
        "CUDA_CUBLAS_LIBRARIES",
        "CUDA_cudart_static_LIBRARY",
        "CUDA_cudadevrt_LIBRARY",
        "CUDA_cupti_LIBRARY",
        "CUDA_curand_LIBRARY",
        "CUDA_cusolver_LIBRARY",
        "CUDA_cusparse_LIBRARY",
        "CUDA_npp_LIBRARY",
        "CUDA_nppc_LIBRARY",
        "CUDA_nppi_LIBRARY",
        "CUDA_npps_LIBRARY",
        "CUDA_nvcuvenc_LIBRARY",
        "CUDA_nvcuvid_LIBRARY",
    );

    foreach my $func (@unsupported) {

        # The trailing \b keeps a short name such as CUDA_COMPILE from also
        # firing on a longer one such as CUDA_COMPILE_PTX.
        if (m/\b\Q$func\E\b/) {
            $m++;
            print STDERR " warning: $fileName:#$line_num : unsupported macro/option : $_\n";
        }
    }

    return $m;
}
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -1,152 +0,0 @@
|
||||
#!/usr/bin/perl -w
|
||||
# Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
package hipvars;
|
||||
use Getopt::Long;
|
||||
use Cwd;
|
||||
use File::Basename;
|
||||
|
||||
$HIP_BASE_VERSION_MAJOR = "4";
|
||||
$HIP_BASE_VERSION_MINOR = "2";
|
||||
|
||||
#---
|
||||
# Function to parse config file
|
||||
# Read a "NAME=VALUE" configuration file into a hash.
#   $file    path of the file to read
#   $config  hash ref that receives one entry per NAME
# Blank lines and lines whose first non-blank character is '#' are skipped.
# Leading and trailing whitespace of each line is removed. A file that cannot
# be opened is silently ignored and leaves the hash untouched.
sub parse_config_file {
    my ($file, $config) = @_;

    # Three-argument open: the file name is never interpreted as a mode or pipe.
    open(my $fh, '<', $file) or return;

    while (my $config_line = <$fh>) {
        # chomp, not chop: a final line without a newline keeps its last character.
        chomp($config_line);
        $config_line =~ s/^\s*//;
        $config_line =~ s/\s*$//;
        next if $config_line eq "" or $config_line =~ /^#/;

        # Limit of 2 so that a value containing '=' is kept whole.
        my ($name, $value) = split(/=/, $config_line, 2);
        $$config{$name} = $value;
    }
    close($fh);
}
|
||||
|
||||
#---
|
||||
# Function to check if executable can be run
|
||||
# Return 1 when "<exe> --version" can be run and exits with status 0, else 0.
# The command's output (stdout and stderr) is discarded.
sub can_run {
    my $exe = shift;

    my $discarded_output = `$exe --version 2>&1`;

    return ($? == 0) ? 1 : 0;
}
|
||||
|
||||
# True when Perl reports the native Windows platform.
$isWindows = ($^O eq 'MSWin32');

#
# TODO: Fix rpath LDFLAGS settings
#
# Since this hipcc script gets installed at two uneven hierarchical levels,
# linked by symlink, the absolute path of this script should be used to
# derive HIP_PATH, as dirname $0 could be /opt/rocm/bin or /opt/rocm/hip/bin
# depending on how it gets invoked.
# ROCM_PATH which points to <rocm_install_dir> is determined based on whether
# we find bin/rocm_agent_enumerator in the parent of HIP_PATH or not. If it is found,
# ROCM_PATH is defined relative to HIP_PATH else it is hardcoded to /opt/rocm.
#
# In every assignment below an environment variable of the same name wins over
# the computed default.
$HIP_PATH = $ENV{'HIP_PATH'} // dirname(Cwd::abs_path("$0/../"));    # parent directory of hipcc
$ROCM_PATH = $ENV{'ROCM_PATH'}
    // ( (-e "$HIP_PATH/../bin/rocm_agent_enumerator")
         ? dirname("$HIP_PATH")                                      # parent directory of HIP_PATH
         : "/opt/rocm" );
$CUDA_PATH = $ENV{'CUDA_PATH'} // '/usr/local/cuda';
$HSA_PATH  = $ENV{'HSA_PATH'}  // "$ROCM_PATH/hsa";

# Windows has a different structure, all binaries are inside hip/bin
$HIP_CLANG_PATH = $ENV{'HIP_CLANG_PATH'}
    // ( $isWindows ? "$HIP_PATH/bin" : "$ROCM_PATH/llvm/bin" );

# HIP_ROCCLR_HOME is used by Windows builds
$HIP_ROCCLR_HOME = $ENV{'HIP_ROCCLR_HOME'};

# .hipInfo is read from HIP_ROCCLR_HOME when that is set, otherwise from HIP_PATH.
$HIP_INFO_PATH = defined($HIP_ROCCLR_HOME)
    ? "$HIP_ROCCLR_HOME/lib/.hipInfo"
    : "$HIP_PATH/lib/.hipInfo";

#---
# HIP_PLATFORM controls whether to use nvidia or amd platform:
$HIP_PLATFORM = $ENV{'HIP_PLATFORM'};

# Read .hipInfo
my %hipInfo = ();
parse_config_file("$HIP_INFO_PATH", \%hipInfo);

# Prioritize Env first, otherwise use the hipInfo config file
$HIP_COMPILER = $ENV{'HIP_COMPILER'} // $hipInfo{'HIP_COMPILER'} // "clang";
$HIP_RUNTIME  = $ENV{'HIP_RUNTIME'}  // $hipInfo{'HIP_RUNTIME'}  // "rocclr";

# If using ROCclr runtime, need to find HIP_ROCCLR_HOME
if (defined($HIP_RUNTIME) && $HIP_RUNTIME eq "rocclr" && !defined($HIP_ROCCLR_HOME)) {
    my $hipvars_dir = dirname($0);
    $HIP_ROCCLR_HOME = (-e "$hipvars_dir/../lib/bitcode")
        ? Cwd::abs_path($hipvars_dir . "/..")
        : $HIP_PATH;                                                 # fall back to HIP_PATH
}

if (!defined($HIP_PLATFORM)) {
    # No platform requested: probe for a usable compiler, clang++ first.
    if (can_run("$HIP_CLANG_PATH/clang++") || can_run("clang++")) {
        $HIP_PLATFORM = "amd";
    }
    elsif (can_run("$CUDA_PATH/bin/nvcc") || can_run("nvcc")) {
        ($HIP_PLATFORM, $HIP_COMPILER, $HIP_RUNTIME) = ("nvidia", "nvcc", "cuda");
    }
    else {
        # Default to amd for now
        $HIP_PLATFORM = "amd";
    }
}
elsif ($HIP_PLATFORM eq "hcc") {
    # Deprecated spelling of the amd platform.
    $HIP_PLATFORM = "amd";
    warn("Warning: HIP_PLATFORM=hcc is deprecated. Please use HIP_PLATFORM=amd. \n");
}
elsif ($HIP_PLATFORM eq "nvcc") {
    # Deprecated spelling of the nvidia platform.
    ($HIP_PLATFORM, $HIP_COMPILER, $HIP_RUNTIME) = ("nvidia", "nvcc", "cuda");
    warn("Warning: HIP_PLATFORM=nvcc is deprecated. Please use HIP_PLATFORM=nvidia. \n");
}

# Windows does not have clang at linux default path: prefer a clang that sits
# inside HIP_ROCCLR_HOME/bin when one exists.
if ($HIP_COMPILER eq "clang"
    && defined($HIP_ROCCLR_HOME)
    && ((-e "$HIP_ROCCLR_HOME/bin/clang") || (-e "$HIP_ROCCLR_HOME/bin/clang.exe")))
{
    $HIP_CLANG_PATH = "$HIP_ROCCLR_HOME/bin";
}

#---
# Read .hipVersion; components missing from the file fall back to the base version.
my %hipVersion = ();
parse_config_file("$hipvars::HIP_PATH/bin/.hipVersion", \%hipVersion);
$HIP_VERSION_MAJOR = $hipVersion{'HIP_VERSION_MAJOR'} // $HIP_BASE_VERSION_MAJOR;
$HIP_VERSION_MINOR = $hipVersion{'HIP_VERSION_MINOR'} // $HIP_BASE_VERSION_MINOR;
$HIP_VERSION_PATCH = $hipVersion{'HIP_VERSION_PATCH'} // "0";
$HIP_VERSION = join(".", $HIP_VERSION_MAJOR, $HIP_VERSION_MINOR, $HIP_VERSION_PATCH);
|
||||
@@ -1,229 +0,0 @@
|
||||
#!/usr/bin/perl
|
||||
# Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
use strict;
|
||||
use File::Copy;
|
||||
use File::Spec;
|
||||
use File::Basename;
|
||||
use File::Which;
|
||||
use Cwd 'realpath';
|
||||
use Getopt::Std;
|
||||
use List::Util qw(max);
|
||||
use URI::Encode;
|
||||
|
||||
# Per-URI working values, filled in while each URI is processed.
my ($extract_range_specifier, $extract_pid, $extract_file);
my ($output_file, $output_path);
my ($extract_offset, $extract_size);
my $pid_running;

# Run-wide settings and the running error count (used as the exit status).
my $verbose          = 0;
my $error            = 0;
my $output_to_stdout = 0;
|
||||
|
||||
# Print the help text to STDOUT and exit with the current error count as status.
sub usage {
    my @help_text = (
        "Usage: $0 [-o|v|h] URI... \n",
        " URIs can be read from STDIN, one per line.\n",
        " From the URIs specified, extracts code objects into files named: ",
        "<executable_name>-[pid<number>]-offset<number>-size<number>.co\n\n",
        "Options:\n",
        " -o <path> \tPath for output. If \"-\" specified, code object is printed to STDOUT.\n",
        " -v \tVerbose output to STDOUT (includes Entry ID).\n",
        " -h \tShow this help message.\n",
        "\nURI syntax:\n",
        "\tcode_object_uri ::== file_uri | memory_uri\n",
        "\tfile_uri ::== \"file://\" extract_file [ range_specifier ]\n",
        "\tmemory_uri ::== \"memory://\" process_id range_specifier\n",
        "\trange_specifier ::== [ \"#\" | \"?\" ] \"offset=\" number \"&\" \"size=\" number\n",
        "\textract_file ::== URI_ENCODED_OS_FILE_PATH\n",
        "\tprocess_id ::== DECIMAL_NUMBER\n",
        "\tnumber ::== HEX_NUMBER \| DECIMAL_NUMBER \| OCTAL_NUMBER\n\n",
        "\tExample: file://dir1/dir2/hello_world#offset=133&size=14472 \n",
        "\t memory://1234#offset=0x20000&size=3000\n\n",
    );

    print($_) for @help_text;

    exit($error);
}
|
||||
|
||||
# Process options: -v and -h are flags, -o takes a value.
my %options = ();
getopts('vho:', \%options);

usage() if defined $options{h};

$verbose = 1 if defined $options{v};

if (defined $options{o}) {
    $output_path = $options{o};
    if ($output_path eq "-") {
        # "-" means: write the code object to STDOUT instead of a file.
        $output_to_stdout = 1;
    } else {
        die("Error: Path \'$output_path\' cannot be found.\n") unless -d $output_path;
    }
}

# When STDIN is not a terminal, every line read from it becomes one more argument.
push @ARGV, <STDIN> unless -t STDIN;

# error check: enough arguments presented.
unless (@ARGV) {
    print(STDERR "Error: No arguments.\n");
    $error++;
    usage();
}

# error check: command dd is available.
my $dd_cmd = which("dd");
die("Error: Can't find dd command\n") unless -f $dd_cmd;
|
||||
|
||||
# Convert a "number" from the URI grammar (hex with 0x prefix, octal with a
# leading 0, or decimal) into a plain integer. Missing or empty text counts as 0.
# Needed because Perl's int() and numeric comparison read "0x20000" as 0, and
# dd treats the 'x' in such a value as a multiplication sign.
my $to_number = sub {
    my ($text) = @_;
    return 0 unless defined($text) && length($text);
    return hex($text) if $text =~ /^0[xX][0-9a-fA-F]+$/;
    return oct($text) if $text =~ /^0[0-7]+$/;
    return int($text);
};

# Extract one code object per URI given on the command line (or read from STDIN).
foreach my $uri_str (@ARGV) {
    chomp $uri_str;

    # we expect the URI to follow this BNF syntax:
    #
    # code_object_uri ::== file_uri | memory_uri
    # file_uri ::== "file://" extract_file [ range_specifier ]
    # memory_uri ::== "memory://" process_id range_specifier
    # range_specifier ::== [ "#" | "?" ] "offset=" number "&" "size=" number
    # extract_file ::== URI_ENCODED_OS_FILE_PATH
    # process_id ::== DECIMAL_NUMBER
    # number ::== HEX_NUMBER | DECIMAL_NUMBER | OCTAL_NUMBER
    #
    # Example: file://dir1/dir2/hello_world#offset=133&size=14472
    #          memory://1234#offset=0x20000&size=3000

    # Limit of 2: only the first "://" separates the protocol from the rest.
    my ($uri_protocol, $specs) = split(/:\/\//, $uri_str, 2);
    my $obj_uri_encode = URI::Encode->new();
    my $decoded_extract_file;

    my $is_file_uri   = (lc($uri_protocol) eq "file");
    my $is_memory_uri = (lc($uri_protocol) eq "memory");

    # Forget the previous URI's process state. It used to leak into later file
    # URIs and made them fail with "must specify a Range Specifier".
    $pid_running = 0;

    if ($is_file_uri) {
        # expect file path. Only '#' or '?' starts the range specifier; the
        # earlier character class also contained ',' and so cut file names at a comma.
        ($extract_file, $extract_range_specifier) = split(/[#?]/, $specs, 2);

        # decode the file name. URIs may have file/path names with non-alphanumeric
        # characters, which will be encoded with %. We need to decode these.
        $decoded_extract_file = $obj_uri_encode->decode($extract_file);

        # verify file exists:
        if (! -e $decoded_extract_file) {
            print(STDERR "Error: can't find file: $decoded_extract_file\n"); $error++;
            next;
        }

        # use the output_path if specified, otherwise use current working dir.
        if (defined($output_path) && $output_path ne "") {
            $output_file = File::Spec->catfile($output_path, basename($decoded_extract_file));
        } else {
            $output_file = basename($decoded_extract_file);
        }

    } elsif ($is_memory_uri) {
        # expect memory specifier
        ($extract_pid, $extract_range_specifier) = split(/[#?]/, $specs, 2);

        # verify pid is currently running (signal 0 only tests for existence)
        $pid_running = kill 0, $extract_pid;
        if (! $pid_running) {
            print(STDERR "Error: PID: $extract_pid is NOT running\n"); $error++;
            next;
        }

        # get pid filename:
        $extract_file = "/proc/$extract_pid/mem";

        # verify file exists:
        if (! -e $extract_file) {
            print(STDERR "Error: can't find file: $extract_file\n"); $error++;
            next;
        }

        # for extracting from a pid, make the output file in the current dir/path
        # with the pid value as a name.
        $output_file = "pid${extract_pid}";

        # need to set $decoded_extract_file, because later we use this for other checks.
        $decoded_extract_file = $extract_file;

    } else {
        # error, unrecognized Code Object URI
        print(STDERR "Error: \'$uri_protocol\' is not recognized as a supported code object URI.\n"); $error++;
        next;
    }

    # it is valid to not give a range specifier in a URI, in which case the entire
    # code object will be extracted.
    if (defined($extract_range_specifier) && $extract_range_specifier ne "") {
        ($extract_offset, $extract_size) = split(/[&]/, $extract_range_specifier);
        (undef, $extract_offset) = split(/=/, $extract_offset);
        (undef, $extract_size) = split(/=/, $extract_size);
    } else {
        # Error if URI is a memory request, and we have no range_specifier.
        if ($is_memory_uri) {
            print(STDERR "Error: must specify a Range Specifier (offset and size) for a memory URI: $uri_str\n"); $error++;
            next;
        }

        $extract_offset = 0;
        $extract_size = -s $decoded_extract_file;
    }

    # Numeric forms used for the checks and for dd. The original text is kept
    # for messages and for the output file name.
    my $offset_num = $to_number->($extract_offset);
    my $size_num   = $to_number->($extract_size);

    # We should have at least a valid size to extract; ignore cases with size=0.
    if ($size_num != 0) {
        print("Reading input file \"$extract_file\" ...\n") if ($verbose);

        # only if this is a File URI.
        if ($is_file_uri) {
            # verify that offset+size does not exceed file size:
            my $file_size = -s $decoded_extract_file;
            if ( $offset_num + $size_num > $file_size ) {
                print(STDERR "Error: requested offset($extract_offset) + size($extract_size) exceeds file size($file_size) for file \"$decoded_extract_file\".\n"); $error++;
                next;
            }
        }

        # Make sure the input is readable before handing it to dd, then release
        # the handle right away (dd opens the file itself).
        open(my $input_fp, "<", $decoded_extract_file) || die $!;
        close($input_fp);

        # extract the code object. dd is started without a shell (list form of
        # system), so quotes or other metacharacters in a path cannot break or
        # alter the command.
        my @dd_args = ("if=$decoded_extract_file");
        if (!$output_to_stdout) {
            push(@dd_args, "of=${output_file}-offset${extract_offset}-size${extract_size}.co");
        }
        push(@dd_args, "skip=$offset_num", "count=$size_num", "bs=1", "status=none");

        my $dd_cmd_str = join(" ", $dd_cmd, @dd_args);

        print("DD Command: $dd_cmd_str\n") if ($verbose);

        my $dd_ret = system($dd_cmd, @dd_args);
        if ($dd_ret != 0) {
            print(STDERR "Error: DD command ($dd_cmd_str) failed with RC: $dd_ret\n"); $error++;
        }

        print("Extract request: file: $extract_file offset: $extract_offset size: $extract_size\n") if ($verbose);
    } else {
        print("Warning: trying to extract from $extract_file at offset=$extract_offset with size=0. Nothing to extract.\n") if ($verbose);
    }

} # end of for each (URI) argument
|
||||
|
||||
exit($error);
|
||||
@@ -1,156 +0,0 @@
|
||||
#!/usr/bin/perl
|
||||
# Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
use strict;
|
||||
use File::Copy;
|
||||
use File::Spec;
|
||||
use File::Basename;
|
||||
use File::Which;
|
||||
use Cwd 'realpath';
|
||||
use Getopt::Std;
|
||||
use List::Util qw(max);
|
||||
use URI::Encode;
|
||||
|
||||
# Print the command-line help text and terminate the script.
# Invoked for -h; it never returns because it ends with exit.
# Every switch accepted by getopts('vhd') is listed, including -d.
sub usage {
  print("Usage: $0 [-v|h|d] executable...\n");
  print("List the URIs of the code objects embedded in the specified host executables.\n");
  print("-v \tVerbose output (includes Entry ID)\n");
  print("-d \tDebug output (prints section and bundle entry details while parsing)\n");
  print("-h \tShow this help message\n");
  exit;
}
|
||||
|
||||
# readq(FH, \$dest)
# Consumes exactly 8 bytes from the file handle FH and stores them in $dest,
# decoded with unpack("Q<") as an unsigned little-endian 64-bit integer.
# Dies when 8 bytes could not be read.
sub readq {
  my $fh = shift;
  my $dest_ref = shift;
  my $raw;
  my $got = read($fh, $raw, 8);
  $got == 8 or die("Error: Failed to read 8 bytes\n");
  $$dest_ref = unpack("Q<", $raw);
}
|
||||
|
||||
# Process options: -v verbose listing, -d debug trace, -h help.
my %options = ();
getopts('vhd', \%options);

if (defined $options{h}) {
  usage();
}

my $verbose = $options{v};
my $debug = $options{d};

# look for objdump. which() may return undef, so check that before the -f test.
my $objdump = which("objdump");
(defined $objdump && -f $objdump) || die("Error: Can't find objdump command\n");

# A single encoder object serves every URI printed below.
my $obj_uri_encode = URI::Encode->new();

# for each argument (which should be an executable):
foreach my $executable_file (@ARGV) {

  # debug message
  print("Reading input file \"$executable_file\" ...\n") if ($debug);

  # verify/open file specified.
  open(my $input_fp, "<", $executable_file) || die("Error: failed to open file: $executable_file\n");
  binmode $input_fp;

  # kernel section information, taken from the "objdump -h" row of the section:
  # awk column 3 is used as the size and column 6 as the file offset; both are
  # converted with hex().
  my $escaped_name = quotemeta($executable_file);
  my $bundle_section_name = ".hip_fatbin";
  my $bundle_section_size = hex(`$objdump -h $escaped_name | grep $bundle_section_name | awk '{print \$3}'`);
  my $bundle_section_offset = hex(`$objdump -h $escaped_name | grep $bundle_section_name | awk '{print \$6}'`);

  $bundle_section_size or die("Error: No kernel section found\n");

  my $bundle_section_end = $bundle_section_offset + $bundle_section_size;

  if ($debug) {
    print "Code Objects Bundle section size: $bundle_section_size\n";
    print "Code Objects Bundle section offset: $bundle_section_offset\n";
    print "Code Objects Bundle section end: $bundle_section_end\n";
  }

  my $current_bundle_offset = $bundle_section_offset;
  print "Current Bundle offset: $current_bundle_offset\n" if ($debug);

  # move fp to current_bundle_offset.
  seek($input_fp, $current_bundle_offset, 0);

  # skip OFFLOAD_BUNDLER_MAGIC_STR
  my $magic_str;
  my $read_bytes = read($input_fp, $magic_str, 24);
  if (($read_bytes != 24) || ($magic_str ne "__CLANG_OFFLOAD_BUNDLE__")) {
    print(STDERR "Error: Offload bundle magic string not detected\n") if ($debug);
    # Give up on this file only; the remaining command line arguments are
    # still processed.
    close($input_fp);
    next;
  }

  # read number of bundle entries, which are code objects.
  my $num_codeobjects;
  readq($input_fp, \$num_codeobjects);

  # Listing
  print "Bundle of $num_codeobjects HIP Code Objects:\n" if ($verbose);

  print("Entry ID:\t\t\tURI:\n") if ($verbose);

  # The encoded file name is the same for every entry of this executable.
  my $encoded_executable_file = $obj_uri_encode->encode($executable_file);

  # for each Bundle entry (code object) ....
  for (my $iter = 0; $iter < $num_codeobjects; $iter++) {

    # read bundle entry (code object) offset
    my $entry_offset;
    readq($input_fp, \$entry_offset);
    print("entry_offset: $entry_offset\n") if $debug;

    # read bundle entry (code object) size
    my $entry_size;
    readq($input_fp, \$entry_size);
    print("entry_size: $entry_size\n") if $debug;

    # read triple size
    my $triple_size;
    readq($input_fp, \$triple_size);
    print("triple_size: $triple_size\n") if $debug;

    # read triple string
    my $triple;
    my $triple_bytes = read($input_fp, $triple, $triple_size);
    $triple_bytes == $triple_size or die("Error: Fail to parse triple\n");
    print("triple: $triple\n") if $debug;

    # because the bundle entry's offset is relative to the beginning of the bundled code object section.
    my $abs_offset = int($entry_offset) + $bundle_section_offset;

    if ($verbose) {
      print(STDOUT "$triple\tfile:\/\/$encoded_executable_file#offset=$abs_offset\&size=$entry_size\n");
    } else {
      print(STDOUT "file:\/\/$encoded_executable_file#offset=$abs_offset\&size=$entry_size\n");
    }

  } # End of for each Bundle entry (code object) ...

  close($input_fp);

} # End of for each command line argument

exit(0);
|
||||
@@ -1,714 +0,0 @@
|
||||
# Copyright (C) 2016-2021 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
###############################################################################
|
||||
# FindHIP.cmake
|
||||
###############################################################################
|
||||
include(CheckCXXCompilerFlag)
|
||||
###############################################################################
|
||||
# SET: Variable defaults
|
||||
###############################################################################
|
||||
# User defined flags. Each cache entry holds a semicolon separated list and starts out empty.
set(HIP_HIPCC_FLAGS "" CACHE STRING "Semicolon delimited flags for HIPCC")
set(HIP_CLANG_FLAGS "" CACHE STRING "Semicolon delimited flags for CLANG")
set(HIP_NVCC_FLAGS "" CACHE STRING "Semicolon delimited flags for NVCC")
mark_as_advanced(HIP_HIPCC_FLAGS HIP_CLANG_FLAGS HIP_NVCC_FLAGS)

# Configurations that get their own flag variables: whatever the generator or the user
# selected, plus the four standard CMake configurations.
set(_hip_configuration_types ${CMAKE_CONFIGURATION_TYPES} ${CMAKE_BUILD_TYPE} Debug MinSizeRel Release RelWithDebInfo)
list(REMOVE_DUPLICATES _hip_configuration_types)
foreach(config ${_hip_configuration_types})
    string(TOUPPER "${config}" config_upper)
    set(HIP_HIPCC_FLAGS_${config_upper} "" CACHE STRING "Semicolon delimited flags for HIPCC")
    set(HIP_CLANG_FLAGS_${config_upper} "" CACHE STRING "Semicolon delimited flags for CLANG")
    set(HIP_NVCC_FLAGS_${config_upper} "" CACHE STRING "Semicolon delimited flags for NVCC")
    mark_as_advanced(HIP_HIPCC_FLAGS_${config_upper} HIP_CLANG_FLAGS_${config_upper} HIP_NVCC_FLAGS_${config_upper})
endforeach()
option(HIP_HOST_COMPILATION_CPP "Host code compilation mode" ON)
option(HIP_VERBOSE_BUILD "Print out the commands run while compiling the HIP source file. With the Makefile generator this defaults to VERBOSE variable specified on the command line, but can be forced on with this option." OFF)
mark_as_advanced(HIP_HOST_COMPILATION_CPP)
|
||||
|
||||
###############################################################################
|
||||
# FIND: HIP and associated helper binaries
|
||||
###############################################################################
|
||||
|
||||
# Parent directory of the directory holding this module; used below as one of the search hints.
get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../" REALPATH)

# HIP is supported on Linux only
if(UNIX AND NOT APPLE AND NOT CYGWIN)
    # Search for HIP installation
    if(NOT HIP_ROOT_DIR)
        # Search in user specified path first
        find_path(
            HIP_ROOT_DIR
            NAMES bin/hipconfig
            PATHS
            "$ENV{ROCM_PATH}/hip"
            ENV HIP_PATH
            ${_IMPORT_PREFIX}
            /opt/rocm/hip
            DOC "HIP installed location"
            NO_DEFAULT_PATH
            )
        if(NOT EXISTS "${HIP_ROOT_DIR}")
            if(HIP_FIND_REQUIRED)
                message(FATAL_ERROR "Specify HIP_ROOT_DIR")
            elseif(NOT HIP_FIND_QUIETLY)
                message("HIP_ROOT_DIR not found or specified")
            endif()
        endif()
        # And push it back to the cache
        set(HIP_ROOT_DIR ${HIP_ROOT_DIR} CACHE PATH "HIP installed location" FORCE)
    endif()

    # Find HIPCC executable
    find_program(
        HIP_HIPCC_EXECUTABLE
        NAMES hipcc
        PATHS
        "${HIP_ROOT_DIR}"
        ENV ROCM_PATH
        ENV HIP_PATH
        /opt/rocm
        /opt/rocm/hip
        PATH_SUFFIXES bin
        NO_DEFAULT_PATH
        )
    if(NOT HIP_HIPCC_EXECUTABLE)
        # Now search in default paths
        find_program(HIP_HIPCC_EXECUTABLE hipcc)
    endif()
    mark_as_advanced(HIP_HIPCC_EXECUTABLE)

    # Find HIPCONFIG executable
    find_program(
        HIP_HIPCONFIG_EXECUTABLE
        NAMES hipconfig
        PATHS
        "${HIP_ROOT_DIR}"
        ENV ROCM_PATH
        ENV HIP_PATH
        /opt/rocm
        /opt/rocm/hip
        PATH_SUFFIXES bin
        NO_DEFAULT_PATH
        )
    if(NOT HIP_HIPCONFIG_EXECUTABLE)
        # Now search in default paths
        find_program(HIP_HIPCONFIG_EXECUTABLE hipconfig)
    endif()
    mark_as_advanced(HIP_HIPCONFIG_EXECUTABLE)

    # Find HIPCC_CMAKE_LINKER_HELPER executable
    find_program(
        HIP_HIPCC_CMAKE_LINKER_HELPER
        NAMES hipcc_cmake_linker_helper
        PATHS
        "${HIP_ROOT_DIR}"
        ENV ROCM_PATH
        ENV HIP_PATH
        /opt/rocm
        /opt/rocm/hip
        PATH_SUFFIXES bin
        NO_DEFAULT_PATH
        )
    if(NOT HIP_HIPCC_CMAKE_LINKER_HELPER)
        # Now search in default paths
        find_program(HIP_HIPCC_CMAKE_LINKER_HELPER hipcc_cmake_linker_helper)
    endif()
    mark_as_advanced(HIP_HIPCC_CMAKE_LINKER_HELPER)

    if(HIP_HIPCONFIG_EXECUTABLE AND NOT HIP_VERSION)
        # Compute the version
        execute_process(
            COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --version
            OUTPUT_VARIABLE _hip_version
            ERROR_VARIABLE _hip_error
            OUTPUT_STRIP_TRAILING_WHITESPACE
            ERROR_STRIP_TRAILING_WHITESPACE
            )
        # Any text on stderr is treated as failure and replaced by a placeholder version.
        if(NOT _hip_error)
            set(HIP_VERSION ${_hip_version} CACHE STRING "Version of HIP as computed from hipcc")
        else()
            set(HIP_VERSION "0.0.0" CACHE STRING "Version of HIP as computed by FindHIP()")
        endif()
        mark_as_advanced(HIP_VERSION)
    endif()
    if(HIP_VERSION)
        string(REPLACE "." ";" _hip_version_list "${HIP_VERSION}")
        # Pad with zeros so that a short version such as "4.2" (for example one passed in by
        # the user) yields MINOR/PATCH of 0 instead of a fatal list(GET) index error.
        list(APPEND _hip_version_list 0 0)
        list(GET _hip_version_list 0 HIP_VERSION_MAJOR)
        list(GET _hip_version_list 1 HIP_VERSION_MINOR)
        list(GET _hip_version_list 2 HIP_VERSION_PATCH)
        set(HIP_VERSION_STRING "${HIP_VERSION}")
    endif()

    if(HIP_HIPCONFIG_EXECUTABLE AND NOT HIP_PLATFORM)
        # Compute the platform
        execute_process(
            COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --platform
            OUTPUT_VARIABLE _hip_platform
            OUTPUT_STRIP_TRAILING_WHITESPACE
            )
        set(HIP_PLATFORM ${_hip_platform} CACHE STRING "HIP platform as computed by hipconfig")
        mark_as_advanced(HIP_PLATFORM)
    endif()

    if(HIP_HIPCONFIG_EXECUTABLE AND NOT HIP_COMPILER)
        # Compute the compiler
        execute_process(
            COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --compiler
            OUTPUT_VARIABLE _hip_compiler
            OUTPUT_STRIP_TRAILING_WHITESPACE
            )
        set(HIP_COMPILER ${_hip_compiler} CACHE STRING "HIP compiler as computed by hipconfig")
        mark_as_advanced(HIP_COMPILER)
    endif()

    if(HIP_HIPCONFIG_EXECUTABLE AND NOT HIP_RUNTIME)
        # Compute the runtime
        execute_process(
            COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --runtime
            OUTPUT_VARIABLE _hip_runtime
            OUTPUT_STRIP_TRAILING_WHITESPACE
            )
        set(HIP_RUNTIME ${_hip_runtime} CACHE STRING "HIP runtime as computed by hipconfig")
        mark_as_advanced(HIP_RUNTIME)
    endif()
endif()
|
||||
|
||||
# Hand the search results to CMake's standard find-module helper. The variables listed
# after REQUIRED_VARS are the ones it checks; HIP_VERSION is supplied as the package version.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(HIP
    REQUIRED_VARS
        HIP_ROOT_DIR
        HIP_HIPCC_EXECUTABLE
        HIP_HIPCONFIG_EXECUTABLE
        HIP_PLATFORM
        HIP_COMPILER
        HIP_RUNTIME
    VERSION_VAR
        HIP_VERSION
    )
|
||||
|
||||
###############################################################################
|
||||
# Set HIP CMAKE Flags
|
||||
###############################################################################
|
||||
# Copy the invocation styles from CXX to HIP
set(CMAKE_HIP_ARCHIVE_CREATE ${CMAKE_CXX_ARCHIVE_CREATE})
set(CMAKE_HIP_ARCHIVE_APPEND ${CMAKE_CXX_ARCHIVE_APPEND})
set(CMAKE_HIP_ARCHIVE_FINISH ${CMAKE_CXX_ARCHIVE_FINISH})
set(CMAKE_SHARED_LIBRARY_SONAME_HIP_FLAG ${CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG})
set(CMAKE_SHARED_LIBRARY_CREATE_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS})
set(CMAKE_SHARED_LIBRARY_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_CXX_FLAGS})
#set(CMAKE_SHARED_LIBRARY_LINK_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS})
set(CMAKE_SHARED_LIBRARY_RUNTIME_HIP_FLAG ${CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG})
set(CMAKE_SHARED_LIBRARY_RUNTIME_HIP_FLAG_SEP ${CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG_SEP})
set(CMAKE_SHARED_LIBRARY_LINK_STATIC_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_LINK_STATIC_CXX_FLAGS})
set(CMAKE_SHARED_LIBRARY_LINK_DYNAMIC_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_LINK_DYNAMIC_CXX_FLAGS})

# Stay empty unless parallel jobs are both requested and supported (see the clang branch).
set(HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS "")
set(HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS "")

if("${HIP_COMPILER}" STREQUAL "nvcc")
    # Set the CMake Flags to use the nvcc Compiler.
    set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>")
    set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <LINK_LIBRARIES> -shared" )
    set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
elseif("${HIP_COMPILER}" STREQUAL "clang")
    #Set HIP_CLANG_PATH
    # Precedence: existing HIP_CLANG_PATH, then the HIP_CLANG_PATH, ROCM_PATH and HIP_PATH
    # environment variables, then the HIP_PATH CMake variable, then /opt/rocm/llvm/bin.
    if("x${HIP_CLANG_PATH}" STREQUAL "x")
        if(DEFINED ENV{HIP_CLANG_PATH})
            set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH})
        elseif(DEFINED ENV{ROCM_PATH})
            set(HIP_CLANG_PATH "$ENV{ROCM_PATH}/llvm/bin")
        elseif(DEFINED ENV{HIP_PATH})
            set(HIP_CLANG_PATH "$ENV{HIP_PATH}/../llvm/bin")
        elseif(DEFINED HIP_PATH)
            set(HIP_CLANG_PATH "${HIP_PATH}/../llvm/bin")
        else()
            set(HIP_CLANG_PATH "/opt/rocm/llvm/bin")
        endif()
    endif()
    #Number of parallel jobs by default is 1
    if(NOT DEFINED HIP_CLANG_NUM_PARALLEL_JOBS)
        set(HIP_CLANG_NUM_PARALLEL_JOBS 1)
    endif()
    #Add support for parallel build and link
    # Quoted so the comparison stays well-formed when CMAKE_CXX_COMPILER_ID is empty
    # (for example when the CXX language has not been enabled).
    if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
        check_cxx_compiler_flag("-parallel-jobs=1" HIP_CLANG_SUPPORTS_PARALLEL_JOBS)
    endif()
    if(HIP_CLANG_NUM_PARALLEL_JOBS GREATER 1)
        # Tested by name: the variable is not defined at all when the probe above was skipped.
        if(HIP_CLANG_SUPPORTS_PARALLEL_JOBS)
            set(HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS "-Wno-format-nonliteral -parallel-jobs=${HIP_CLANG_NUM_PARALLEL_JOBS}")
            set(HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS "-parallel-jobs=${HIP_CLANG_NUM_PARALLEL_JOBS}")
        else()
            message("clang compiler doesn't support parallel jobs")
        endif()
    endif()

    # Set the CMake Flags to use the HIP-Clang Compiler.
    set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>")
    set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <LINK_LIBRARIES> -shared" )
    set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")

    if("${HIP_RUNTIME}" STREQUAL "rocclr")
        # Remember the "host" target, when one exists, so the hip_add_* macros can link it.
        if(TARGET host)
            message(STATUS "host interface - found")
            set(HIP_HOST_INTERFACE host)
        endif()
    endif()
endif()
|
||||
|
||||
###############################################################################
|
||||
# MACRO: Locate helper files
|
||||
###############################################################################
|
||||
# hip_find_helper_file(<name> <extension>)
# Builds the path FindHIP/<name>.<extension> relative to the directory of the file being
# processed and stores it in the internal cache entry HIP_<name>. When the file is missing,
# the macro fails if HIP_FIND_REQUIRED is set and otherwise prints a status message unless
# HIP_FIND_QUIETLY is set; the cache entry is written in either case.
macro(HIP_FIND_HELPER_FILE _name _extension)
    set(_hip_full_name "${_name}.${_extension}")
    get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
    set(HIP_${_name} "${CMAKE_CURRENT_LIST_DIR}/FindHIP/${_hip_full_name}")
    if(NOT EXISTS "${HIP_${_name}}")
        set(error_message "${_hip_full_name} not found in ${CMAKE_CURRENT_LIST_DIR}/FindHIP")
        if(HIP_FIND_REQUIRED)
            message(FATAL_ERROR "${error_message}")
        else()
            if(NOT HIP_FIND_QUIETLY)
                message(STATUS "${error_message}")
            endif()
        endif()
    endif()
    # Set this variable as internal, so the user isn't bugged with it.
    # The help text names the helper through _hip_full_name, the variable set above.
    set(HIP_${_name} ${HIP_${_name}} CACHE INTERNAL "Location of ${_hip_full_name}" FORCE)
endmacro()

###############################################################################
hip_find_helper_file(run_make2cmake cmake)
hip_find_helper_file(run_hipcc cmake)
|
||||
###############################################################################
|
||||
|
||||
###############################################################################
|
||||
# MACRO: Reset compiler flags
|
||||
###############################################################################
|
||||
# hip_reset_flags()
# Calls unset() on HIP_HIPCC_FLAGS, HIP_CLANG_FLAGS and HIP_NVCC_FLAGS as well as on their
# _<CONFIG> variants for every entry of _hip_configuration_types.
macro(HIP_RESET_FLAGS)
    # Per-configuration variants first ...
    foreach(config ${_hip_configuration_types})
        string(TOUPPER "${config}" config_upper)
        unset(HIP_NVCC_FLAGS_${config_upper})
        unset(HIP_CLANG_FLAGS_${config_upper})
        unset(HIP_HIPCC_FLAGS_${config_upper})
    endforeach()
    # ... then the configuration independent ones.
    unset(HIP_NVCC_FLAGS)
    unset(HIP_CLANG_FLAGS)
    unset(HIP_HIPCC_FLAGS)
endmacro()
|
||||
|
||||
###############################################################################
|
||||
# MACRO: Separate the options from the sources
|
||||
###############################################################################
|
||||
# hip_get_sources_and_options(<sources> <cmake_options> <hipcc_options> <clang_options>
#                             <nvcc_options> <args>...)
# Splits <args> into five lists stored in the variables named by the first five parameters:
#   - EXCLUDE_FROM_ALL, STATIC, SHARED and MODULE always go to <cmake_options>;
#   - the keywords HIPCC_OPTIONS, CLANG_OPTIONS and NVCC_OPTIONS switch the list that receives
#     the following arguments;
#   - arguments that follow the obsolete HCC_OPTIONS keyword are discarded with a warning;
#   - arguments seen before any keyword are treated as source files.
macro(HIP_GET_SOURCES_AND_OPTIONS _sources _cmake_options _hipcc_options _clang_options _nvcc_options)
    set(${_sources})
    set(${_cmake_options})
    set(${_hipcc_options})
    set(${_clang_options})
    set(${_nvcc_options})
    # At most one of these is TRUE at a time; it records the keyword seen last.
    set(_hipcc_found_options FALSE)
    set(_hcc_found_options FALSE)
    set(_clang_found_options FALSE)
    set(_nvcc_found_options FALSE)
    foreach(arg ${ARGN})
        if("x${arg}" STREQUAL "xHIPCC_OPTIONS")
            set(_hipcc_found_options TRUE)
            set(_hcc_found_options FALSE)
            set(_clang_found_options FALSE)
            set(_nvcc_found_options FALSE)
        elseif("x${arg}" STREQUAL "xHCC_OPTIONS")
            # To be removed after HCC_OPTIONS is removed from hip_add_executable()
            # via upstream updation
            # WARNING has to be a separate argument (no trailing comma) to act as the mode.
            message(WARNING "Please remove obsolete HCC_OPTIONS from hip_add_executable()")
            set(_hipcc_found_options FALSE)
            set(_hcc_found_options TRUE)
            set(_clang_found_options FALSE)
            set(_nvcc_found_options FALSE)
        elseif("x${arg}" STREQUAL "xCLANG_OPTIONS")
            set(_hipcc_found_options FALSE)
            set(_hcc_found_options FALSE)
            set(_clang_found_options TRUE)
            set(_nvcc_found_options FALSE)
        elseif("x${arg}" STREQUAL "xNVCC_OPTIONS")
            set(_hipcc_found_options FALSE)
            set(_hcc_found_options FALSE)
            set(_clang_found_options FALSE)
            set(_nvcc_found_options TRUE)
        elseif(
                "x${arg}" STREQUAL "xEXCLUDE_FROM_ALL" OR
                "x${arg}" STREQUAL "xSTATIC" OR
                "x${arg}" STREQUAL "xSHARED" OR
                "x${arg}" STREQUAL "xMODULE"
                )
            list(APPEND ${_cmake_options} ${arg})
        else()
            if(_hipcc_found_options)
                list(APPEND ${_hipcc_options} ${arg})
            elseif(_hcc_found_options)
                message(WARNING "Please remove obsolete HCC_OPTIONS ${arg} from hip_add_executable()")
            elseif(_clang_found_options)
                list(APPEND ${_clang_options} ${arg})
            elseif(_nvcc_found_options)
                list(APPEND ${_nvcc_options} ${arg})
            else()
                # Assume this is a file
                list(APPEND ${_sources} ${arg})
            endif()
        endif()
    endforeach()
endmacro()
|
||||
|
||||
###############################################################################
|
||||
# MACRO: Add include directories to pass to the hipcc command
|
||||
###############################################################################
|
||||
# List of -I arguments collected by hip_include_directories(); starts out empty.
set(HIP_HIPCC_INCLUDE_ARGS_USER "")

# hip_include_directories(<dir>...)
# Appends one entry per directory to HIP_HIPCC_INCLUDE_ARGS_USER. Each entry is a generator
# expression that yields -I<dir> when <dir> is a true value and nothing otherwise.
macro(HIP_INCLUDE_DIRECTORIES)
    foreach(dir ${ARGN})
        list(APPEND HIP_HIPCC_INCLUDE_ARGS_USER "$<$<BOOL:${dir}>:-I${dir}>")
    endforeach()
endmacro()
|
||||
|
||||
###############################################################################
|
||||
# FUNCTION: Helper to avoid clashes of files with the same basename but different paths
|
||||
###############################################################################
|
||||
# hip_compute_build_path(<path> <build_path_var>)
# Turns <path> into a relative directory name and returns it in <build_path_var>:
#   1. an absolute path is made relative to CMAKE_CURRENT_BINARY_DIR when it starts with that
#      directory, otherwise relative to CMAKE_CURRENT_SOURCE_DIR;
#   2. leading '/' are stripped, ':' and ' ' become '_', and "../" becomes "__/";
#   3. the file name component is dropped.
function(HIP_COMPUTE_BUILD_PATH path build_path)
    # Convert to cmake style paths
    file(TO_CMAKE_PATH "${path}" _hip_rel)

    if(IS_ABSOLUTE "${_hip_rel}")
        # Position 0 means the path lies below the current binary directory.
        string(FIND "${_hip_rel}" "${CMAKE_CURRENT_BINARY_DIR}" _hip_prefix_at)
        if(_hip_prefix_at EQUAL 0)
            set(_hip_anchor "${CMAKE_CURRENT_BINARY_DIR}")
        else()
            set(_hip_anchor "${CMAKE_CURRENT_SOURCE_DIR}")
        endif()
        file(RELATIVE_PATH _hip_rel "${_hip_anchor}" "${_hip_rel}")
    endif()

    string(REGEX REPLACE "^[/]+" "" _hip_rel "${_hip_rel}")  # no leading slashes
    string(REPLACE ":" "_" _hip_rel "${_hip_rel}")           # no drive-style colons
    string(REPLACE "../" "__/" _hip_rel "${_hip_rel}")       # no climbing up the tree
    string(REPLACE " " "_" _hip_rel "${_hip_rel}")           # no spaces

    # Only the directory part is returned.
    get_filename_component(_hip_rel "${_hip_rel}" PATH)
    set(${build_path} "${_hip_rel}" PARENT_SCOPE)
endfunction()
|
||||
|
||||
###############################################################################
|
||||
# MACRO: Parse OPTIONS from ARGN & set variables prefixed by _option_prefix
|
||||
###############################################################################
|
||||
# hip_parse_hipcc_options(<option_prefix> <args>...)
# Distributes <args> over the list variables <option_prefix> and <option_prefix>_<CONFIG>.
# An argument equal to the upper-cased name of an entry of _hip_configuration_types is not
# stored; it selects <option_prefix>_<CONFIG> as the destination of the arguments after it.
# Arguments seen before any such marker are appended to <option_prefix> itself.
macro(HIP_PARSE_HIPCC_OPTIONS _option_prefix)
    set(_hip_found_config)
    foreach(arg ${ARGN})
        # Determine if we are dealing with a per-configuration flag
        foreach(config ${_hip_configuration_types})
            string(TOUPPER "${config}" config_upper)
            if(arg STREQUAL "${config_upper}")
                set(_hip_found_config _${arg})
                # Clear arg to prevent it from being processed anymore
                set(arg)
            endif()
        endforeach()
        # Compare against the empty string rather than testing truthiness, so that options
        # spelled like a CMake false constant ("0", "OFF", "N", ...) are kept.
        if(NOT "${arg}" STREQUAL "")
            list(APPEND ${_option_prefix}${_hip_found_config} "${arg}")
        endif()
    endforeach()
endmacro()
|
||||
|
||||
###############################################################################
|
||||
# MACRO: Try and include dependency file if it exists
|
||||
###############################################################################
|
||||
# hip_include_hipcc_dependencies(<dependency_file>)
# Includes <dependency_file> (creating a stub first when it does not exist) and leaves the
# result in HIP_HIPCC_DEPEND. When the included file set no dependencies, or one of the listed
# files is missing, the file is rewritten as a stub and HIP_HIPCC_DEPEND is set to the
# dependency file itself; HIP_HIPCC_DEPEND_REGENERATE records whether that happened.
macro(HIP_INCLUDE_HIPCC_DEPENDENCIES dependency_file)
    set(HIP_HIPCC_DEPEND)
    set(HIP_HIPCC_DEPEND_REGENERATE FALSE)

    # Make sure there is something to include.
    if(NOT EXISTS ${dependency_file})
        file(WRITE ${dependency_file} "# Generated by: FindHIP.cmake. Do not edit.\n")
    endif()
    include(${dependency_file})

    if(NOT HIP_HIPCC_DEPEND)
        # Nothing recorded yet: regenerate.
        set(HIP_HIPCC_DEPEND_REGENERATE TRUE)
    else()
        # A recorded file that no longer exists also forces regeneration.
        foreach(f ${HIP_HIPCC_DEPEND})
            if(NOT EXISTS ${f})
                set(HIP_HIPCC_DEPEND_REGENERATE TRUE)
            endif()
        endforeach()
    endif()

    if(HIP_HIPCC_DEPEND_REGENERATE)
        set(HIP_HIPCC_DEPEND ${dependency_file})
        file(WRITE ${dependency_file} "# Generated by: FindHIP.cmake. Do not edit.\n")
    endif()
endmacro()
|
||||
|
||||
###############################################################################
|
||||
# MACRO: Prepare cmake commands for the target
|
||||
###############################################################################
|
||||
# hip_prepare_target_commands(<target> <format> <generated_files_var> <source_files_var> <args>...)
# Creates one custom command per HIP source found in <args> and returns the generated object
# files in <generated_files_var> and the matching sources in <source_files_var>.
# An argument counts as a HIP source when it ends in ".cu" or has the source property
# HIP_SOURCE_PROPERTY_FORMAT set, and is not marked HEADER_FILE_ONLY.
# Variable names are kept unchanged on purpose: this is a macro, so they live in the caller's
# scope, and configure_file() below reads that scope to fill the HIP_run_hipcc template.
# NOTE(review): which of these names the template references is not visible here - confirm.
macro(HIP_PREPARE_TARGET_COMMANDS _target _format _generated_files _source_files)
    set(_hip_flags "")
    string(TOUPPER "${CMAKE_BUILD_TYPE}" _hip_build_configuration)

    # Host language is CXX unless HIP_HOST_COMPILATION_CPP has been switched off.
    if(NOT HIP_HOST_COMPILATION_CPP)
        set(HIP_C_OR_CXX C)
    else()
        set(HIP_C_OR_CXX CXX)
    endif()
    set(generated_extension ${CMAKE_${HIP_C_OR_CXX}_OUTPUT_EXTENSION})

    # Initialize list of includes with those specified by the user. Append with
    # ones specified to cmake directly.
    set(HIP_HIPCC_INCLUDE_ARGS ${HIP_HIPCC_INCLUDE_ARGS_USER})

    # Add the include directories of the target ...
    set(include_directories_generator "$<TARGET_PROPERTY:${_target},INCLUDE_DIRECTORIES>")
    list(APPEND HIP_HIPCC_INCLUDE_ARGS "$<$<BOOL:${include_directories_generator}>:-I$<JOIN:${include_directories_generator}, -I>>")

    # ... and those of the current directory.
    get_directory_property(_hip_include_directories INCLUDE_DIRECTORIES)
    list(REMOVE_DUPLICATES _hip_include_directories)
    if(_hip_include_directories)
        foreach(dir ${_hip_include_directories})
            list(APPEND HIP_HIPCC_INCLUDE_ARGS $<$<BOOL:${dir}>:-I${dir}>)
        endforeach()
    endif()

    hip_get_sources_and_options(_hip_sources _hip_cmake_options _hipcc_options _clang_options _nvcc_options ${ARGN})
    hip_parse_hipcc_options(HIP_HIPCC_FLAGS ${_hipcc_options})
    hip_parse_hipcc_options(HIP_CLANG_FLAGS ${_clang_options})
    hip_parse_hipcc_options(HIP_NVCC_FLAGS ${_nvcc_options})

    # Add the compile definitions
    set(compile_definition_generator "$<TARGET_PROPERTY:${_target},COMPILE_DEFINITIONS>")
    list(APPEND HIP_HIPCC_FLAGS "$<$<BOOL:${compile_definition_generator}>:-D$<JOIN:${compile_definition_generator}, -D>>")

    # Check if we are building shared library: SHARED or MODULE asks for one, STATIC overrides.
    list(FIND _hip_cmake_options SHARED _hip_found_SHARED)
    list(FIND _hip_cmake_options MODULE _hip_found_MODULE)
    list(FIND _hip_cmake_options STATIC _hip_found_STATIC)
    if(_hip_found_STATIC GREATER -1)
        set(_hip_build_shared_libs FALSE)
    elseif(_hip_found_SHARED GREATER -1 OR _hip_found_MODULE GREATER -1)
        set(_hip_build_shared_libs TRUE)
    else()
        set(_hip_build_shared_libs FALSE)
    endif()

    # If we are building a shared library, add extra flags to HIP_CLANG_FLAGS and HIP_NVCC_FLAGS
    if(_hip_build_shared_libs)
        list(APPEND HIP_CLANG_FLAGS "-fPIC")
        list(APPEND HIP_NVCC_FLAGS "--shared -Xcompiler '-fPIC'")
    endif()

    # Set host compiler
    set(HIP_HOST_COMPILER "${CMAKE_${HIP_C_OR_CXX}_COMPILER}")

    # Set compiler flags: each _HIP_*_FLAGS variable holds the text of set() commands, one for
    # the configuration independent flags followed by one per configuration.
    set(_HIP_HOST_FLAGS "set(CMAKE_HOST_FLAGS ${CMAKE_${HIP_C_OR_CXX}_FLAGS})")
    set(_HIP_HIPCC_FLAGS "set(HIP_HIPCC_FLAGS ${HIP_HIPCC_FLAGS})")
    set(_HIP_CLANG_FLAGS "set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS})")
    set(_HIP_NVCC_FLAGS "set(HIP_NVCC_FLAGS ${HIP_NVCC_FLAGS})")
    foreach(config ${_hip_configuration_types})
        string(TOUPPER ${config} config_upper)
        set(_HIP_HOST_FLAGS "${_HIP_HOST_FLAGS}\nset(CMAKE_HOST_FLAGS_${config_upper} ${CMAKE_${HIP_C_OR_CXX}_FLAGS_${config_upper}})")
        set(_HIP_HIPCC_FLAGS "${_HIP_HIPCC_FLAGS}\nset(HIP_HIPCC_FLAGS_${config_upper} ${HIP_HIPCC_FLAGS_${config_upper}})")
        set(_HIP_CLANG_FLAGS "${_HIP_CLANG_FLAGS}\nset(HIP_CLANG_FLAGS_${config_upper} ${HIP_CLANG_FLAGS_${config_upper}})")
        set(_HIP_NVCC_FLAGS "${_HIP_NVCC_FLAGS}\nset(HIP_NVCC_FLAGS_${config_upper} ${HIP_NVCC_FLAGS_${config_upper}})")
    endforeach()

    # Reset the output variable
    set(_hip_generated_files "")
    set(_hip_source_files "")

    # Iterate over all arguments and create custom commands for all source files
    foreach(file ${ARGN})
        # Ignore any file marked as a HEADER_FILE_ONLY
        get_source_file_property(_is_header ${file} HEADER_FILE_ONLY)
        # Allow per source file overrides of the format. Also allows compiling non .cu files.
        get_source_file_property(_hip_source_format ${file} HIP_SOURCE_PROPERTY_FORMAT)
        if((${file} MATCHES "\\.cu$" OR _hip_source_format) AND NOT _is_header)
            set(host_flag FALSE)
        else()
            set(host_flag TRUE)
        endif()

        if(NOT host_flag)
            # Determine output directory
            hip_compute_build_path("${file}" hip_build_path)
            set(hip_compile_output_dir "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${_target}.dir/${hip_build_path}")

            get_filename_component(basename ${file} NAME)
            set(generated_file_path "${hip_compile_output_dir}/${CMAKE_CFG_INTDIR}")
            set(generated_file_basename "${_target}_generated_${basename}${generated_extension}")

            # Set file names
            set(generated_file "${generated_file_path}/${generated_file_basename}")
            set(cmake_dependency_file "${hip_compile_output_dir}/${generated_file_basename}.depend")
            set(custom_target_script_pregen "${hip_compile_output_dir}/${generated_file_basename}.cmake.pre-gen")
            set(custom_target_script "${hip_compile_output_dir}/${generated_file_basename}.cmake")

            # Set properties for object files
            set_source_files_properties("${generated_file}"
                PROPERTIES
                EXTERNAL_OBJECT true # This is an object file not to be compiled, but only be linked
                )

            # Don't add CMAKE_CURRENT_SOURCE_DIR if the path is already an absolute path
            get_filename_component(file_path "${file}" PATH)
            if(IS_ABSOLUTE "${file_path}")
                set(source_file "${file}")
            else()
                set(source_file "${CMAKE_CURRENT_SOURCE_DIR}/${file}")
            endif()

            # Bring in the dependencies
            hip_include_hipcc_dependencies(${cmake_dependency_file})

            # Configure the build script: @VAR@ substitution first, generator expressions second.
            configure_file("${HIP_run_hipcc}" "${custom_target_script_pregen}" @ONLY)
            file(GENERATE
                OUTPUT "${custom_target_script}"
                INPUT "${custom_target_script_pregen}"
                )
            set(main_dep DEPENDS ${source_file})
            if(CMAKE_GENERATOR MATCHES "Makefiles")
                set(verbose_output "$(VERBOSE)")
            elseif(HIP_VERBOSE_BUILD)
                set(verbose_output ON)
            else()
                set(verbose_output OFF)
            endif()

            # Create up the comment string
            file(RELATIVE_PATH generated_file_relative_path "${CMAKE_BINARY_DIR}" "${generated_file}")
            set(hip_build_comment_string "Building HIPCC object ${generated_file_relative_path}")

            # Build the generated file and dependency file
            add_custom_command(
                OUTPUT ${generated_file}
                # These output files depend on the source_file and the contents of cmake_dependency_file
                ${main_dep}
                DEPENDS ${HIP_HIPCC_DEPEND}
                DEPENDS ${custom_target_script}
                # Make sure the output directory exists before trying to write to it.
                COMMAND ${CMAKE_COMMAND} -E make_directory "${generated_file_path}"
                COMMAND ${CMAKE_COMMAND} ARGS
                -D verbose:BOOL=${verbose_output}
                -D build_configuration:STRING=${_hip_build_configuration}
                -D "generated_file:STRING=${generated_file}"
                -P "${custom_target_script}"
                WORKING_DIRECTORY "${hip_compile_output_dir}"
                COMMENT "${hip_build_comment_string}"
                )

            # Make sure the build system knows the file is generated
            set_source_files_properties(${generated_file} PROPERTIES GENERATED TRUE)
            list(APPEND _hip_generated_files ${generated_file})
            list(APPEND _hip_source_files ${file})
        endif()
    endforeach()

    # Set the return parameter
    set(${_generated_files} ${_hip_generated_files})
    set(${_source_files} ${_hip_source_files})
endmacro()
|
||||
|
||||
###############################################################################
|
||||
# HIP_ADD_EXECUTABLE
|
||||
###############################################################################
|
||||
# hip_add_executable(<target> <sources and options>...)
# Creates the executable <target>. Sources picked up by hip_prepare_target_commands() are
# replaced by their generated object files; the remaining sources are passed to
# add_executable() as they are. The target's LINKER_LANGUAGE is set to HIP, and the target is
# linked against HIP_HOST_INTERFACE when that variable is set.
macro(HIP_ADD_EXECUTABLE hip_target)
    # Separate the sources from the options
    hip_get_sources_and_options(_sources _cmake_options _hipcc_options _clang_options _nvcc_options ${ARGN})
    hip_prepare_target_commands(${hip_target} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} CLANG_OPTIONS ${_clang_options} NVCC_OPTIONS ${_nvcc_options})
    if(_source_files)
        list(REMOVE_ITEM _sources ${_source_files})
    endif()

    # Link rule: only the clang variant passes HIP_CLANG_PATH and the parallel link options.
    if(NOT "${HIP_COMPILER}" STREQUAL "clang")
        set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
    else()
        # Fill in HIP_CLANG_PATH when it is still empty; first match wins.
        if("x${HIP_CLANG_PATH}" STREQUAL "x")
            if(DEFINED ENV{HIP_CLANG_PATH})
                set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH})
            elseif(DEFINED ENV{ROCM_PATH})
                set(HIP_CLANG_PATH "$ENV{ROCM_PATH}/llvm/bin")
            elseif(DEFINED ENV{HIP_PATH})
                set(HIP_CLANG_PATH "$ENV{HIP_PATH}/../llvm/bin")
            elseif(DEFINED HIP_PATH)
                set(HIP_CLANG_PATH "${HIP_PATH}/../llvm/bin")
            else()
                set(HIP_CLANG_PATH "/opt/rocm/llvm/bin")
            endif()
        endif()
        set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
    endif()

    # An empty source argument is passed when only generated objects remain.
    if(NOT "${_sources}" STREQUAL "")
        add_executable(${hip_target} ${_cmake_options} ${_generated_files} ${_sources})
    else()
        add_executable(${hip_target} ${_cmake_options} ${_generated_files} "")
    endif()

    #LINK_OPTIONS
    if("${HIP_COMPILER}" STREQUAL "nvcc")
        # Some arch flags need be sent to linker. _nvcc_options mixes compiling and linker flags.
        string(REPLACE ";" " " _nvcc_flags "${_nvcc_options}") # Replace ';' with space
        if(NOT "x${_nvcc_flags}" STREQUAL "x")
            set_target_properties(${hip_target} PROPERTIES LINK_FLAGS "${_nvcc_flags}")
        endif()
    endif()
    set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE HIP)

    # Link with host
    if(HIP_HOST_INTERFACE)
        # hip rt should be rocclr, compiler should be clang
        target_link_libraries(${hip_target} ${HIP_HOST_INTERFACE})
    endif()
endmacro()
|
||||
|
||||
###############################################################################
|
||||
# HIP_ADD_LIBRARY
|
||||
###############################################################################
|
||||
# HIP_ADD_LIBRARY(<hip_target> <args>...)
#
# Creates the library target ${hip_target} from the arguments in ARGN.
#
# ARGN is split by HIP_GET_SOURCES_AND_OPTIONS into _sources, _cmake_options,
# _hipcc_options, _clang_options and _nvcc_options. Unlike HIP_ADD_EXECUTABLE,
# _cmake_options is also forwarded to HIP_PREPARE_TARGET_COMMANDS.
# NOTE(review): presumably so the helper can see the library type keyword -
# confirm against HIP_PREPARE_TARGET_COMMANDS.
#
# This is a macro, so the variables set below are set in the caller's scope.
macro(HIP_ADD_LIBRARY hip_target)
    # Separate the sources from the options
    HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _clang_options _nvcc_options ${ARGN})
    HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} ${_cmake_options} HIPCC_OPTIONS ${_hipcc_options} CLANG_OPTIONS ${_clang_options} NVCC_OPTIONS ${_nvcc_options})

    # Drop the files reported back in _source_files from _sources; the
    # corresponding _generated_files are passed to add_library() below.
    if(_source_files)
        list(REMOVE_ITEM _sources ${_source_files})
    endif()

    # When no plain sources remain, an explicit empty string is passed in
    # their place.
    if ("${_sources}" STREQUAL "")
        add_library(${hip_target} ${_cmake_options} ${_generated_files} "")
    else()
        add_library(${hip_target} ${_cmake_options} ${_generated_files} ${_sources})
    endif()
    set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE ${HIP_C_OR_CXX})

    # Link with host
    if (HIP_HOST_INTERFACE)
        # hip rt should be rocclr, compiler should be clang
        target_link_libraries(${hip_target} ${HIP_HOST_INTERFACE})
    endif()
endmacro()
|
||||
|
||||
# vim: ts=4:sw=4:expandtab:smartindent
|
||||
@@ -1,194 +0,0 @@
|
||||
# Copyright (C) 2016-2021 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
###############################################################################
|
||||
# Runs commands using HIPCC
|
||||
###############################################################################
|
||||
|
||||
###############################################################################
|
||||
# This file runs the hipcc commands to produce the desired output file
|
||||
# along with the dependency file needed by CMake to compute dependencies.
|
||||
#
|
||||
# Input variables:
|
||||
#
|
||||
# verbose:BOOL=<> OFF: Be as quiet as possible (default)
|
||||
# ON : Describe each step
|
||||
# build_configuration:STRING=<> Build configuration. Defaults to Debug.
|
||||
# generated_file:STRING=<> File to generate. Mandatory argument.
|
||||
|
||||
# Fall back to the Debug flag set when no configuration was passed in.
if(NOT build_configuration)
    set(build_configuration Debug)
endif()
if(NOT generated_file)
    message(FATAL_ERROR "You must specify generated_file on the command line")
endif()

# Set these up as variables to make reading the generated file easier.
# The @NAME@ tokens are template placeholders (presumably substituted by
# configure_file() when this script is instantiated - confirm in FindHIP.cmake).
set(HIP_HIPCC_EXECUTABLE "@HIP_HIPCC_EXECUTABLE@") # path
set(HIP_HIPCONFIG_EXECUTABLE "@HIP_HIPCONFIG_EXECUTABLE@") #path
set(HIP_HOST_COMPILER "@HIP_HOST_COMPILER@") # path
set(CMAKE_COMMAND "@CMAKE_COMMAND@") # path
set(HIP_run_make2cmake "@HIP_run_make2cmake@") # path
set(HIP_CLANG_PATH "@HIP_CLANG_PATH@") #path
set(HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS "@HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS@")

@HIP_HOST_FLAGS@
@_HIP_HIPCC_FLAGS@
@_HIP_CLANG_FLAGS@
@_HIP_NVCC_FLAGS@
#Needed to bring the HIP_HIPCC_INCLUDE_ARGS variable in scope
set(HIP_HIPCC_INCLUDE_ARGS @HIP_HIPCC_INCLUDE_ARGS@) # list

set(cmake_dependency_file "@cmake_dependency_file@") # path
set(source_file "@source_file@") # path
set(host_flag "@host_flag@") # bool

# Determine compiler and compiler flags
execute_process(COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --platform OUTPUT_VARIABLE HIP_PLATFORM OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --compiler OUTPUT_VARIABLE HIP_COMPILER OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --runtime OUTPUT_VARIABLE HIP_RUNTIME OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT host_flag)
    # Device compilation: drive hipcc with the HIP flag sets.
    set(__CC ${HIP_HIPCC_EXECUTABLE})
    if("${HIP_PLATFORM}" STREQUAL "amd")
        if("${HIP_COMPILER}" STREQUAL "clang")
            # Export HIP_CLANG_PATH into the environment of the processes
            # launched below, but only when it is non-empty.
            if(NOT "x${HIP_CLANG_PATH}" STREQUAL "x")
                set(ENV{HIP_CLANG_PATH} ${HIP_CLANG_PATH})
            endif()
            set(__CC_FLAGS ${HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS} ${HIP_HIPCC_FLAGS} ${HIP_CLANG_FLAGS} ${HIP_HIPCC_FLAGS_${build_configuration}} ${HIP_CLANG_FLAGS_${build_configuration}})
        endif()
    else()
        set(__CC_FLAGS ${HIP_HIPCC_FLAGS} ${HIP_NVCC_FLAGS} ${HIP_HIPCC_FLAGS_${build_configuration}} ${HIP_NVCC_FLAGS_${build_configuration}})
    endif()
else()
    # Host compilation: use the host compiler with the host flag sets.
    set(__CC ${HIP_HOST_COMPILER})
    set(__CC_FLAGS ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
endif()
set(__CC_INCLUDES ${HIP_HIPCC_INCLUDE_ARGS})
|
||||
|
||||
# hip_execute_process - Executes a command with optional command echo and status message.
#
#   status     - Status message to print if verbose is true
#   command    - Must be the literal keyword COMMAND, mirroring the usual
#                execute_process argument structure; anything else is a
#                fatal error
#   ARGN       - Remaining arguments are the command with arguments
#   HIP_result - Output: return value from running the command
#
# Implemented as a macro, so HIP_result (and the helper variables used while
# echoing) are set in the caller's scope.
macro(hip_execute_process status command)
    # Macro arguments are not real variables, so copy before comparing.
    set(_command ${command})
    if(NOT "x${_command}" STREQUAL "xCOMMAND")
        message(FATAL_ERROR "Malformed call to hip_execute_process. Missing COMMAND as second argument. (command = ${command})")
    endif()
    if(verbose)
        execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
        # Build command string to print
        set(hip_execute_process_string)
        foreach(arg ${ARGN})
            # Escape quotes if any
            string(REPLACE "\"" "\\\"" arg ${arg})
            # Surround args with spaces with quotes
            if(arg MATCHES " ")
                list(APPEND hip_execute_process_string "\"${arg}\"")
            else()
                list(APPEND hip_execute_process_string ${arg})
            endif()
        endforeach()
        # Echo the command
        execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${hip_execute_process_string})
    endif()
    # Run the command
    execute_process(COMMAND ${ARGN} RESULT_VARIABLE HIP_result)
endmacro()
|
||||
|
||||
# Delete the target file
hip_execute_process(
    "Removing ${generated_file}"
    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
    )

# Generate the dependency file
hip_execute_process(
    "Generating dependency file: ${cmake_dependency_file}.pre"
    COMMAND "${__CC}"
    -M
    "${source_file}"
    -o "${cmake_dependency_file}.pre"
    ${__CC_FLAGS}
    ${__CC_INCLUDES}
    )

# Every step below aborts the script when the preceding command reported a
# non-zero result in HIP_result.
if(HIP_result)
    message(FATAL_ERROR "Error generating ${generated_file}")
endif()

# Generate the cmake readable dependency file to a temp file
hip_execute_process(
    "Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
    COMMAND "${CMAKE_COMMAND}"
    -D "input_file:FILEPATH=${cmake_dependency_file}.pre"
    -D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
    -D "verbose=${verbose}"
    -P "${HIP_run_make2cmake}"
    )

if(HIP_result)
    message(FATAL_ERROR "Error generating ${generated_file}")
endif()

# Copy the file if it is different
hip_execute_process(
    "Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
    COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
    )

if(HIP_result)
    message(FATAL_ERROR "Error generating ${generated_file}")
endif()

# Delete the temporary file
hip_execute_process(
    "Removing ${cmake_dependency_file}.tmp and ${cmake_dependency_file}.pre"
    COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${cmake_dependency_file}.pre"
    )

if(HIP_result)
    message(FATAL_ERROR "Error generating ${generated_file}")
endif()

# Generate the output file
hip_execute_process(
    "Generating ${generated_file}"
    COMMAND "${__CC}"
    -c
    "${source_file}"
    -o "${generated_file}"
    ${__CC_FLAGS}
    ${__CC_INCLUDES}
    )

if(HIP_result)
    # Make sure that we delete the output file
    hip_execute_process(
        "Removing ${generated_file}"
        COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
        )
    message(FATAL_ERROR "Error generating file ${generated_file}")
else()
    if(verbose)
        message("Generated ${generated_file} successfully.")
    endif()
endif()
|
||||
# vim: ts=4:sw=4:expandtab:smartindent
|
||||
@@ -1,70 +0,0 @@
|
||||
# Copyright (C) 2016-2021 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
###############################################################################
|
||||
# Computes dependencies using HIPCC
|
||||
###############################################################################
|
||||
|
||||
###############################################################################
|
||||
# This file converts dependency files generated using hipcc to a format that
|
||||
# cmake can understand.
|
||||
|
||||
# Input variables:
|
||||
#
|
||||
# input_file:STRING=<> Dependency file to parse. Required argument
|
||||
# output_file:STRING=<> Output file to generate. Required argument
|
||||
|
||||
# Both paths are mandatory; they are supplied with -D on the cmake -P command line.
if(NOT input_file OR NOT output_file)
    message(FATAL_ERROR "You must specify input_file and output_file on the command line")
endif()

# Quoted so that a path containing ';' is still passed as a single argument.
file(READ "${input_file}" depend_text)

# Define the accumulators up front: the list() calls further down must see a
# defined variable even when the dependency file turned out to be empty.
set(dependency_list "")
set(hip_hipcc_depend "")

if (NOT "${depend_text}" STREQUAL "")
    # Put every absolute path on its own line, drop the leading "target:" part,
    # then turn the (optionally backslash-continued) lines into a CMake list.
    string(REPLACE " /" "\n/" depend_text ${depend_text})
    string(REGEX REPLACE "^.*:" "" depend_text ${depend_text})
    string(REGEX REPLACE "[ \\\\]*\n" ";" depend_text ${depend_text})

    foreach(file ${depend_text})
        # Strip leading blanks left over from the make-style formatting.
        string(REGEX REPLACE "^ +" "" file ${file})
        if(NOT EXISTS "${file}")
            # Skip the entry entirely instead of resolving an empty path,
            # which would otherwise add a bogus entry to the list.
            message(WARNING " Removing non-existent dependency file: ${file}")
        elseif(NOT IS_DIRECTORY "${file}")
            get_filename_component(file_absolute "${file}" ABSOLUTE)
            list(APPEND dependency_list "${file_absolute}")
        endif()
    endforeach()
endif()

# Remove the duplicate entries and sort them.
list(REMOVE_DUPLICATES dependency_list)
list(SORT dependency_list)

# Emit one quoted path per line inside the generated SET() command.
foreach(file ${dependency_list})
    set(hip_hipcc_depend "${hip_hipcc_depend} \"${file}\"\n")
endforeach()

file(WRITE "${output_file}" "# Generated by: FindHIP.cmake. Do not edit.\nSET(HIP_HIPCC_DEPEND\n ${hip_hipcc_depend})\n\n")
|
||||
# vim: ts=4:sw=4:expandtab:smartindent
|
||||
поставляемый
@@ -1,34 +0,0 @@
|
||||
# Copyright (C) 2017-2021 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
# Parameters related to building hip
|
||||
# Base image to build on; supplied with --build-arg base_image=...
ARG base_image

FROM ${base_image}
# MAINTAINER is deprecated; the equivalent metadata is carried by a label.
LABEL maintainer="Maneesh Gupta <maneesh.gupta@amd>"

# uid for the jenkins account; supplied with --build-arg user_uid=...
ARG user_uid

# The docker pipeline runs containers with a particular uid.
# Create a jenkins user with this specific uid so it can use sudo privileges.
# Grant any member of the sudo group password-less sudo privileges.
RUN useradd --create-home -u ${user_uid} -G sudo,video --shell /bin/bash jenkins && \
    mkdir -p /etc/sudoers.d/ && \
    echo '%sudo ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/sudo-nopasswd
|
||||
@@ -1,39 +0,0 @@
|
||||
# Copyright (C) 2017-2021 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
# Parameters related to building hip
|
||||
# Base image to build on; supplied with --build-arg base_image=...
ARG base_image

FROM ${base_image}
# MAINTAINER is deprecated; the equivalent metadata is carried by a label.
LABEL maintainer="Kent Knox <kent.knox@amd>"

# Copy the debian package of hip into the container from host
COPY *.deb /tmp/

# Install the debian packages, then remove them and the apt caches in the same
# layer so they do not add to the image size.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y curl \
    && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends --allow-unauthenticated -y \
    /tmp/hip-base-*.deb \
    /tmp/hip-hcc-*.deb \
    /tmp/hip-doc-*.deb \
    /tmp/hip-samples-*.deb \
    && rm -f /tmp/*.deb \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -1,19 +0,0 @@
|
||||
/**
|
||||
* @file mainpage.cpp
|
||||
* @brief : DoxyGen Main Page.
|
||||
* @mainpage Heterogeneous-computing Interface for Portability (HIP)
|
||||
* @tableofcontents
|
||||
*
|
||||
* The HIP interface makes it very easy to port existing CUDA apps to run on AMD GPUs,
|
||||
* or to develop new apps that can run on either CUDA or AMD GPUs from a common source base.
|
||||
*
|
||||
* - HIP is very thin and has little or no performance impact over coding directly in CUDA NVCC mode.
|
||||
* - HIP allows developers to use the "best" development environment and tools on each target platform.
|
||||
* - HIP allows coding in a single-source C++ programming language including features such as templates, C++11 lambdas, and more.
|
||||
* - HIPIFY tools automatically convert CUDA sources to HIP.
|
||||
* - Developers can specialize for CUDA or HIP to tune for performance or handle tricky cases with #ifdef.
|
||||
|
||||
* - See the @ref API.
|
||||
|
||||
|
||||
*/
|
||||
@@ -1,49 +0,0 @@
|
||||
/** @page Synchonization
|
||||
* @tableofcontents
|
||||
|
||||
* # Host-synchronous behavior:
|
||||
The following commands are "host-asynchronous" - meaning they do not wait for any preceding commands to complete, and may return control to the host thread before the requested operation completes:
|
||||
|
||||
- Kernel launches (hipLaunchKernel() )
|
||||
- Asynchronous memory copies (any memory copy API which contains "Async", such as hipMemcpyAsync())
|
||||
- Any memory set (for example, hipMemset());
|
||||
- TODO
|
||||
|
||||
"Host-synchronous" commands have the following properties:
|
||||
- wait for all previous commands to complete.
|
||||
- will not return control back to host until the command completes.
|
||||
|
||||
|
||||
The following commands are "host-synchronous".
|
||||
|
||||
- hipMemcpy waits for preceding work in the same stream to complete.
|
||||
|
||||
|
||||
* # Stream synchronization
|
||||
|
||||
|
||||
### Blocking
|
||||
|
||||
The term "blocking" has two meanings in HIP.
|
||||
|
||||
The first refers to synchronization commands (ie hipStreamSynchronize, hipEventSynchronize) that cause the host CPU to wait for GPU activity to complete.
|
||||
These can either use an active scheme where the host CPU spin-waits on the synchronization variable, or can use an interrupt-based scheme where the core is interrupted
|
||||
when the wait completes. The second technique is referred to as "blocking" (ie hipDeviceBlockingSync, hipEventBlockingSync) while the first is referred
|
||||
to as "active". Active can be appropriate for short tasks where latency is critical, but comes at the expense of a CPU core dedicated to monitoring the event.
|
||||
|
||||
### HIP_LAUNCH_BLOCKING (also can use CUDA_LAUNCH_BLOCKING)
|
||||
|
||||
- The following commands become host-synchronous and will not return until the requested command has completed:
|
||||
|
||||
- Kernel launches (hipKernelLaunch).
|
||||
- Memory set commands (hipMemset, hipMemsetAsync).
|
||||
- Memory copy commands (hipMemcpy, hipMemcpyAsync).
|
||||
|
||||
Note CUDA_LAUNCH_BLOCKING does not add any pre-serialization to the commands and does not affect the concurrent stream behavior. For example,
|
||||
even when CUDA_LAUNCH_BLOCKING is set, kernels or data copy commands launched to separate streams can execute concurrently. Use the NULL
|
||||
stream if additional stream synchronization is desired.
|
||||
|
||||
|
||||
|
||||
|
||||
*/
|
||||
@@ -1,538 +0,0 @@
|
||||
# CUBLAS API supported by HIP
|
||||
|
||||
## **1. CUBLAS Data types**
|
||||
|
||||
| **type** | **CUDA** |**CUDA version\***| **HIP** |**HIP value** (if differs) |
|
||||
|-------------:|---------------------------------------------------------------|:----------------:|------------------------------------------------------------|---------------------------|
|
||||
| define |`CUBLAS_VER_MAJOR` | 10.1 Update 2 | |
|
||||
| define |`CUBLAS_VER_MINOR` | 10.1 Update 2 | |
|
||||
| define |`CUBLAS_VER_PATCH` | 10.1 Update 2 | |
|
||||
| define |`CUBLAS_VER_BUILD` | 10.1 Update 2 | |
|
||||
| define |`CUBLAS_VERSION` | 10.1 Update 2 | |
|
||||
| enum |***`cublasStatus`*** | |***`hipblasStatus_t`*** |
|
||||
| enum |***`cublasStatus_t`*** | |***`hipblasStatus_t`*** |
|
||||
| 0 |*`CUBLAS_STATUS_SUCCESS`* | |*`HIPBLAS_STATUS_SUCCESS`* |
|
||||
| 1 |*`CUBLAS_STATUS_NOT_INITIALIZED`* | |*`HIPBLAS_STATUS_NOT_INITIALIZED`* |
|
||||
| 3 |*`CUBLAS_STATUS_ALLOC_FAILED`* | |*`HIPBLAS_STATUS_ALLOC_FAILED`* | 2 |
|
||||
| 7 |*`CUBLAS_STATUS_INVALID_VALUE`* | |*`HIPBLAS_STATUS_INVALID_VALUE`* | 3 |
|
||||
| 8 |*`CUBLAS_STATUS_ARCH_MISMATCH`* | |*`HIPBLAS_STATUS_ARCH_MISMATCH`* | |
|
||||
| 11 |*`CUBLAS_STATUS_MAPPING_ERROR`* | |*`HIPBLAS_STATUS_MAPPING_ERROR`* | 4 |
|
||||
| 13 |*`CUBLAS_STATUS_EXECUTION_FAILED`* | |*`HIPBLAS_STATUS_EXECUTION_FAILED`* | 5 |
|
||||
| 14 |*`CUBLAS_STATUS_INTERNAL_ERROR`* | |*`HIPBLAS_STATUS_INTERNAL_ERROR`* | 6 |
|
||||
| 15 |*`CUBLAS_STATUS_NOT_SUPPORTED`* | |*`HIPBLAS_STATUS_NOT_SUPPORTED`* | 7 |
|
||||
| 16 |*`CUBLAS_STATUS_LICENSE_ERROR`* | | |
|
||||
| enum |***`cublasOperation_t`*** | |***`hipblasOperation_t`*** |
|
||||
| 0 |*`CUBLAS_OP_N`* | |*`HIPBLAS_OP_N`* | 111 |
|
||||
| 1 |*`CUBLAS_OP_T`* | |*`HIPBLAS_OP_T`* | 112 |
|
||||
| 2 |*`CUBLAS_OP_C`* | |*`HIPBLAS_OP_C`* | 113 |
|
||||
| 2 |*`CUBLAS_OP_HERMITAN`* | 10.1 |*`HIPBLAS_OP_C`* | 113 |
|
||||
| 3 |*`CUBLAS_OP_CONJG`* | 10.1 | |
|
||||
| enum |***`cublasFillMode_t`*** | |***`hipblasFillMode_t`*** |
|
||||
| 0 |*`CUBLAS_FILL_MODE_LOWER`* | |*`HIPBLAS_FILL_MODE_LOWER`* | 121 |
|
||||
| 1 |*`CUBLAS_FILL_MODE_UPPER`* | |*`HIPBLAS_FILL_MODE_UPPER`* | 122 |
|
||||
| 2 |*`CUBLAS_FILL_MODE_FULL`* | 10.1 |*`HIPBLAS_FILL_MODE_FULL`* | 123 |
|
||||
| enum |***`cublasDiagType_t`*** | |***`hipblasDiagType_t`*** |
|
||||
| 0 |*`CUBLAS_DIAG_NON_UNIT`* | |*`HIPBLAS_DIAG_NON_UNIT`* | 131 |
|
||||
| 1 |*`CUBLAS_DIAG_UNIT`* | |*`HIPBLAS_DIAG_UNIT`* | 132 |
|
||||
| enum |***`cublasSideMode_t`*** | |***`hipblasSideMode_t`*** |
|
||||
| 0 |*`CUBLAS_SIDE_LEFT`* | |*`HIPBLAS_SIDE_LEFT`* | 141 |
|
||||
| 1 |*`CUBLAS_SIDE_RIGHT`* | |*`HIPBLAS_SIDE_RIGHT`* | 142 |
|
||||
| enum |***`cublasPointerMode_t`*** | |***`hipblasPointerMode_t`*** |
|
||||
| 0 |*`CUBLAS_POINTER_MODE_HOST`* | |*`HIPBLAS_POINTER_MODE_HOST`* |
|
||||
| 1 |*`CUBLAS_POINTER_MODE_DEVICE`* | |*`HIPBLAS_POINTER_MODE_DEVICE`* |
|
||||
| enum |***`cublasAtomicsMode_t`*** | | |
|
||||
| 0 |*`CUBLAS_ATOMICS_NOT_ALLOWED`* | | |
|
||||
| 1 |*`CUBLAS_ATOMICS_ALLOWED`* | | |
|
||||
| enum |***`cublasGemmAlgo_t`*** | 8.0 |***`hipblasGemmAlgo_t`*** |
|
||||
| -1 |*`CUBLAS_GEMM_DFALT`* | 8.0 |*`HIPBLAS_GEMM_DEFAULT`* | 160 |
|
||||
| -1 |*`CUBLAS_GEMM_DEFAULT`* | 8.0 |*`HIPBLAS_GEMM_DEFAULT`* | 160 |
|
||||
| 0 |*`CUBLAS_GEMM_ALGO0`* | 8.0 | |
|
||||
| 1 |*`CUBLAS_GEMM_ALGO1`* | 8.0 | |
|
||||
| 2 |*`CUBLAS_GEMM_ALGO2`* | 8.0 | |
|
||||
| 3 |*`CUBLAS_GEMM_ALGO3`* | 8.0 | |
|
||||
| 4 |*`CUBLAS_GEMM_ALGO4`* | 8.0 | |
|
||||
| 5 |*`CUBLAS_GEMM_ALGO5`* | 8.0 | |
|
||||
| 6 |*`CUBLAS_GEMM_ALGO6`* | 8.0 | |
|
||||
| 7 |*`CUBLAS_GEMM_ALGO7`* | 8.0 | |
|
||||
| 8 |*`CUBLAS_GEMM_ALGO8`* | 9.0 | |
|
||||
| 9 |*`CUBLAS_GEMM_ALGO9`* | 9.0 | |
|
||||
| 10 |*`CUBLAS_GEMM_ALGO10`* | 9.0 | |
|
||||
| 11 |*`CUBLAS_GEMM_ALGO11`* | 9.0 | |
|
||||
| 12 |*`CUBLAS_GEMM_ALGO12`* | 9.0 | |
|
||||
| 13 |*`CUBLAS_GEMM_ALGO13`* | 9.0 | |
|
||||
| 14 |*`CUBLAS_GEMM_ALGO14`* | 9.0 | |
|
||||
| 15 |*`CUBLAS_GEMM_ALGO15`* | 9.0 | |
|
||||
| 16 |*`CUBLAS_GEMM_ALGO16`* | 9.0 | |
|
||||
| 17 |*`CUBLAS_GEMM_ALGO17`* | 9.0 | |
|
||||
| 18 |*`CUBLAS_GEMM_ALGO18`* | 9.2 | |
|
||||
| 19 |*`CUBLAS_GEMM_ALGO19`* | 9.2 | |
|
||||
| 20 |*`CUBLAS_GEMM_ALGO20`* | 9.2 | |
|
||||
| 21 |*`CUBLAS_GEMM_ALGO21`* | 9.2 | |
|
||||
| 22 |*`CUBLAS_GEMM_ALGO22`* | 9.2 | |
|
||||
| 23 |*`CUBLAS_GEMM_ALGO23`* | 9.2 | |
|
||||
| 99 |*`CUBLAS_GEMM_DEFAULT_TENSOR_OP`* | 9.0 | |
|
||||
| 99 |*`CUBLAS_GEMM_DFALT_TENSOR_OP`* | 9.0 | |
|
||||
| 100 |*`CUBLAS_GEMM_ALGO0_TENSOR_OP`* | 9.0 | |
|
||||
| 101 |*`CUBLAS_GEMM_ALGO1_TENSOR_OP`* | 9.0 | |
|
||||
| 102 |*`CUBLAS_GEMM_ALGO2_TENSOR_OP`* | 9.0 | |
|
||||
| 103 |*`CUBLAS_GEMM_ALGO3_TENSOR_OP`* | 9.0 | |
|
||||
| 104 |*`CUBLAS_GEMM_ALGO4_TENSOR_OP`* | 9.0 | |
|
||||
| 105 |*`CUBLAS_GEMM_ALGO5_TENSOR_OP`* | 9.2 | |
|
||||
| 106 |*`CUBLAS_GEMM_ALGO6_TENSOR_OP`* | 9.2 | |
|
||||
| 107 |*`CUBLAS_GEMM_ALGO7_TENSOR_OP`* | 9.2 | |
|
||||
| 108 |*`CUBLAS_GEMM_ALGO8_TENSOR_OP`* | 9.2 | |
|
||||
| 109 |*`CUBLAS_GEMM_ALGO9_TENSOR_OP`* | 9.2 | |
|
||||
| 110 |*`CUBLAS_GEMM_ALGO10_TENSOR_OP`* | 9.2 | |
|
||||
| 111 |*`CUBLAS_GEMM_ALGO11_TENSOR_OP`* | 9.2 | |
|
||||
| 112 |*`CUBLAS_GEMM_ALGO12_TENSOR_OP`* | 9.2 | |
|
||||
| 113 |*`CUBLAS_GEMM_ALGO13_TENSOR_OP`* | 9.2 | |
|
||||
| 114 |*`CUBLAS_GEMM_ALGO14_TENSOR_OP`* | 9.2 | |
|
||||
| 115 |*`CUBLAS_GEMM_ALGO15_TENSOR_OP`* | 9.2 | |
|
||||
| enum |***`cublasMath_t`*** | 9.0 | |
|
||||
| 0 |*`CUBLAS_DEFAULT_MATH`* | 9.0 | |
|
||||
| 1 |*`CUBLAS_TENSOR_OP_MATH`* | 9.0 | |
|
||||
| enum* |`cublasDataType_t` | 7.5 | |
|
||||
| struct |`cublasContext` | | |
|
||||
| struct* |`cublasHandle_t` | |`hipblasHandle_t` |
|
||||
|
||||
## **2. CUBLAS API functions**
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cublasCreate` |`hipblasCreate` |
|
||||
|`cublasCreate_v2` |`hipblasCreate` |
|
||||
|`cublasDestroy` |`hipblasDestroy` |
|
||||
|`cublasDestroy_v2` |`hipblasDestroy` |
|
||||
|`cublasGetVersion` | |
|
||||
|`cublasGetVersion_v2` | |
|
||||
|`cublasGetProperty` | | 8.0 |
|
||||
|`cublasGetCudartVersion` | | 10.1 |
|
||||
|`cublasGetStream` |`hipblasGetStream` |
|
||||
|`cublasGetStream_v2` |`hipblasGetStream` |
|
||||
|`cublasSetStream` |`hipblasSetStream` |
|
||||
|`cublasSetStream_v2` |`hipblasSetStream` |
|
||||
|`cublasGetPointerMode` |`hipblasGetPointerMode` |
|
||||
|`cublasGetPointerMode_v2` |`hipblasGetPointerMode` |
|
||||
|`cublasSetPointerMode` |`hipblasSetPointerMode` |
|
||||
|`cublasSetPointerMode_v2` |`hipblasSetPointerMode` |
|
||||
|`cublasGetAtomicsMode` | |
|
||||
|`cublasSetAtomicsMode` | |
|
||||
|`cublasGetMathMode` | | 9.0 |
|
||||
|`cublasSetMathMode` | | 9.0 |
|
||||
|`cublasLogCallback` | | 9.2 |
|
||||
|`cublasLoggerConfigure` | | 9.2 |
|
||||
|`cublasSetLoggerCallback` | | 9.2 |
|
||||
|`cublasGetLoggerCallback` | | 9.2 |
|
||||
|`cublasSetVector` |`hipblasSetVector` |
|
||||
|`cublasGetVector` |`hipblasGetVector` |
|
||||
|`cublasSetMatrix` |`hipblasSetMatrix` |
|
||||
|`cublasGetMatrix` |`hipblasGetMatrix` |
|
||||
|`cublasSetVectorAsync` | |
|
||||
|`cublasGetVectorAsync` | |
|
||||
|`cublasSetMatrixAsync` | |
|
||||
|`cublasGetMatrixAsync` | |
|
||||
|`cublasXerbla` | |
|
||||
|`cublasNrm2Ex` | | 8.0 |
|
||||
|`cublasSnrm2` |`hipblasSnrm2` |
|
||||
|`cublasSnrm2_v2` |`hipblasSnrm2` |
|
||||
|`cublasDnrm2` |`hipblasDnrm2` |
|
||||
|`cublasDnrm2_v2` |`hipblasDnrm2` |
|
||||
|`cublasScnrm2` |`hipblasScnrm2` |
|
||||
|`cublasScnrm2_v2` |`hipblasScnrm2` |
|
||||
|`cublasDznrm2` |`hipblasDznrm2` |
|
||||
|`cublasDznrm2_v2` |`hipblasDznrm2` |
|
||||
|`cublasDotEx` | | 8.0 |
|
||||
|`cublasDotcEx` | | 8.0 |
|
||||
|`cublasSdot` |`hipblasSdot` |
|
||||
|`cublasSdot_v2` |`hipblasSdot` |
|
||||
|`cublasDdot` |`hipblasDdot` |
|
||||
|`cublasDdot_v2` |`hipblasDdot` |
|
||||
|`cublasCdotu` |`hipblasCdotu` |
|
||||
|`cublasCdotu_v2` |`hipblasCdotu` |
|
||||
|`cublasCdotc` |`hipblasCdotc` |
|
||||
|`cublasCdotc_v2` |`hipblasCdotc` |
|
||||
|`cublasZdotu` |`hipblasZdotu` |
|
||||
|`cublasZdotu_v2` |`hipblasZdotu` |
|
||||
|`cublasZdotc` |`hipblasZdotc` |
|
||||
|`cublasZdotc_v2` |`hipblasZdotc` |
|
||||
|`cublasScalEx` | | 8.0 |
|
||||
|`cublasSscal` |`hipblasSscal` |
|
||||
|`cublasSscal_v2` |`hipblasSscal` |
|
||||
|`cublasDscal` |`hipblasDscal` |
|
||||
|`cublasDscal_v2` |`hipblasDscal` |
|
||||
|`cublasCscal` |`hipblasCscal` |
|
||||
|`cublasCscal_v2` |`hipblasCscal` |
|
||||
|`cublasCsscal` |`hipblasCsscal` |
|
||||
|`cublasCsscal_v2` |`hipblasCsscal` |
|
||||
|`cublasZscal` |`hipblasZscal` |
|
||||
|`cublasZscal_v2` |`hipblasZscal` |
|
||||
|`cublasZdscal` |`hipblasZdscal` |
|
||||
|`cublasZdscal_v2` |`hipblasZdscal` |
|
||||
|`cublasAxpyEx` | | 8.0 |
|
||||
|`cublasSaxpy` |`hipblasSaxpy` |
|
||||
|`cublasSaxpy_v2` |`hipblasSaxpy` |
|
||||
|`cublasDaxpy` |`hipblasDaxpy` |
|
||||
|`cublasDaxpy_v2` |`hipblasDaxpy` |
|
||||
|`cublasCaxpy` |`hipblasCaxpy` |
|
||||
|`cublasCaxpy_v2` |`hipblasCaxpy` |
|
||||
|`cublasZaxpy` |`hipblasZaxpy` |
|
||||
|`cublasZaxpy_v2` |`hipblasZaxpy` |
|
||||
|`cublasScopy` |`hipblasScopy` |
|
||||
|`cublasScopy_v2` |`hipblasScopy` |
|
||||
|`cublasDcopy` |`hipblasDcopy` |
|
||||
|`cublasDcopy_v2` |`hipblasDcopy` |
|
||||
|`cublasCcopy` |`hipblasCcopy` |
|
||||
|`cublasCopyEx` | | 10.1 |
|
||||
|`cublasCcopy_v2` |`hipblasCcopy` |
|
||||
|`cublasZcopy` |`hipblasZcopy` |
|
||||
|`cublasZcopy_v2` |`hipblasZcopy` |
|
||||
|`cublasSswap` |`hipblasSswap` |
|
||||
|`cublasSswap_v2` |`hipblasSswap` |
|
||||
|`cublasDswap` |`hipblasDswap` |
|
||||
|`cublasDswap_v2` |`hipblasDswap` |
|
||||
|`cublasCswap` |`hipblasCswap` |
|
||||
|`cublasCswap_v2` |`hipblasCswap` |
|
||||
|`cublasZswap` |`hipblasZswap` |
|
||||
|`cublasZswap_v2` |`hipblasZswap` |
|
||||
|`cublasIamaxEx` | | 10.1 |
|
||||
|`cublasIsamax` |`hipblasIsamax` |
|
||||
|`cublasIsamax_v2` |`hipblasIsamax` |
|
||||
|`cublasIdamax` |`hipblasIdamax` |
|
||||
|`cublasIdamax_v2` |`hipblasIdamax` |
|
||||
|`cublasIcamax` |`hipblasIcamax` |
|
||||
|`cublasIcamax_v2` |`hipblasIcamax` |
|
||||
|`cublasIzamax` |`hipblasIzamax` |
|
||||
|`cublasIzamax_v2` |`hipblasIzamax` |
|
||||
|`cublasIaminEx` | | 10.1 |
|
||||
|`cublasIsamin` |`hipblasIsamin` |
|
||||
|`cublasIsamin_v2` |`hipblasIsamin` |
|
||||
|`cublasIdamin` |`hipblasIdamin` |
|
||||
|`cublasIdamin_v2` |`hipblasIdamin` |
|
||||
|`cublasIcamin` |`hipblasIcamin` |
|
||||
|`cublasIcamin_v2` |`hipblasIcamin` |
|
||||
|`cublasIzamin` |`hipblasIzamin` |
|
||||
|`cublasIzamin_v2` |`hipblasIzamin` |
|
||||
|`cublasAsumEx` | | 10.1 |
|
||||
|`cublasSasum` |`hipblasSasum` |
|
||||
|`cublasSasum_v2` |`hipblasSasum` |
|
||||
|`cublasDasum` |`hipblasDasum` |
|
||||
|`cublasDasum_v2` |`hipblasDasum` |
|
||||
|`cublasScasum` |`hipblasScasum` |
|
||||
|`cublasScasum_v2` |`hipblasScasum` |
|
||||
|`cublasDzasum` |`hipblasDzasum` |
|
||||
|`cublasDzasum_v2` |`hipblasDzasum` |
|
||||
|`cublasRotEx` | | 10.1 |
|
||||
|`cublasSrot` |`hipblasSrot` |
|
||||
|`cublasSrot_v2` |`hipblasSrot` |
|
||||
|`cublasDrot` |`hipblasDrot` |
|
||||
|`cublasDrot_v2` |`hipblasDrot` |
|
||||
|`cublasCrot` |`hipblasCrot` |
|
||||
|`cublasCrot_v2` |`hipblasCrot` |
|
||||
|`cublasCsrot` |`hipblasCsrot` |
|
||||
|`cublasCsrot_v2` |`hipblasCsrot` |
|
||||
|`cublasZrot` |`hipblasZrot` |
|
||||
|`cublasZrot_v2` |`hipblasZrot` |
|
||||
|`cublasRotgEx` | | 10.1 |
|
||||
|`cublasZdrot` |`hipblasZdrot` |
|
||||
|`cublasZdrot_v2` |`hipblasZdrot` |
|
||||
|`cublasSrotg` |`hipblasSrotg` |
|
||||
|`cublasSrotg_v2` |`hipblasSrotg` |
|
||||
|`cublasDrotg` |`hipblasDrotg` |
|
||||
|`cublasDrotg_v2` |`hipblasDrotg` |
|
||||
|`cublasCrotg` |`hipblasCrotg` |
|
||||
|`cublasCrotg_v2` |`hipblasCrotg` |
|
||||
|`cublasZrotg` |`hipblasZrotg` |
|
||||
|`cublasZrotg_v2` |`hipblasZrotg` |
|
||||
|`cublasRotmEx` | | 10.1 |
|
||||
|`cublasSrotm` |`hipblasSrotm` |
|
||||
|`cublasSrotm_v2` |`hipblasSrotm` |
|
||||
|`cublasDrotm` |`hipblasDrotm` |
|
||||
|`cublasDrotm_v2` |`hipblasDrotm` |
|
||||
|`cublasRotmgEx` | | 10.1 |
|
||||
|`cublasSrotmg` |`hipblasSrotmg` |
|
||||
|`cublasSrotmg_v2` |`hipblasSrotmg` |
|
||||
|`cublasDrotmg` |`hipblasDrotmg` |
|
||||
|`cublasDrotmg_v2` |`hipblasDrotmg` |
|
||||
|`cublasSgemv` |`hipblasSgemv` |
|
||||
|`cublasSgemv_v2` |`hipblasSgemv` |
|
||||
|`cublasSgemvBatched` |`hipblasSgemvBatched` |
|
||||
|`cublasDgemv` |`hipblasDgemv` |
|
||||
|`cublasDgemv_v2` |`hipblasDgemv` |
|
||||
|`cublasCgemv` |`hipblasCgemv` |
|
||||
|`cublasCgemv_v2` |`hipblasCgemv` |
|
||||
|`cublasZgemv` |`hipblasZgemv` |
|
||||
|`cublasZgemv_v2` |`hipblasZgemv` |
|
||||
|`cublasSgbmv` | |
|
||||
|`cublasSgbmv_v2` | |
|
||||
|`cublasDgbmv` | |
|
||||
|`cublasDgbmv_v2` | |
|
||||
|`cublasCgbmv` | |
|
||||
|`cublasCgbmv_v2` | |
|
||||
|`cublasZgbmv` | |
|
||||
|`cublasZgbmv_v2` | |
|
||||
|`cublasStrmv` | |
|
||||
|`cublasStrmv_v2` | |
|
||||
|`cublasDtrmv` | |
|
||||
|`cublasDtrmv_v2` | |
|
||||
|`cublasCtrmv` | |
|
||||
|`cublasCtrmv_v2` | |
|
||||
|`cublasZtrmv` | |
|
||||
|`cublasZtrmv_v2` | |
|
||||
|`cublasStbmv` | |
|
||||
|`cublasStbmv_v2` | |
|
||||
|`cublasDtbmv` | |
|
||||
|`cublasDtbmv_v2` | |
|
||||
|`cublasCtbmv` | |
|
||||
|`cublasCtbmv_v2` | |
|
||||
|`cublasZtbmv` | |
|
||||
|`cublasZtbmv_v2` | |
|
||||
|`cublasStpmv` | |
|
||||
|`cublasStpmv_v2` | |
|
||||
|`cublasDtpmv` | |
|
||||
|`cublasDtpmv_v2` | |
|
||||
|`cublasCtpmv` | |
|
||||
|`cublasCtpmv_v2` | |
|
||||
|`cublasZtpmv` | |
|
||||
|`cublasZtpmv_v2` | |
|
||||
|`cublasStrsv` |`hipblasStrsv` |
|
||||
|`cublasStrsv_v2` |`hipblasStrsv` |
|
||||
|`cublasDtrsv` |`hipblasDtrsv` |
|
||||
|`cublasDtrsv_v2` |`hipblasDtrsv` |
|
||||
|`cublasCtrsv` | |
|
||||
|`cublasCtrsv_v2` | |
|
||||
|`cublasZtrsv` | |
|
||||
|`cublasZtrsv_v2` | |
|
||||
|`cublasStpsv` | |
|
||||
|`cublasStpsv_v2` | |
|
||||
|`cublasDtpsv` | |
|
||||
|`cublasDtpsv_v2` | |
|
||||
|`cublasCtpsv` | |
|
||||
|`cublasCtpsv_v2` | |
|
||||
|`cublasZtpsv` | |
|
||||
|`cublasZtpsv_v2` | |
|
||||
|`cublasStbsv` | |
|
||||
|`cublasStbsv_v2` | |
|
||||
|`cublasDtbsv` | |
|
||||
|`cublasDtbsv_v2` | |
|
||||
|`cublasCtbsv` | |
|
||||
|`cublasCtbsv_v2` | |
|
||||
|`cublasZtbsv` | |
|
||||
|`cublasZtbsv_v2` | |
|
||||
|`cublasSsymv` | |
|
||||
|`cublasSsymv_v2` | |
|
||||
|`cublasDsymv` | |
|
||||
|`cublasDsymv_v2` | |
|
||||
|`cublasCsymv` | |
|
||||
|`cublasCsymv_v2` | |
|
||||
|`cublasZsymv` | |
|
||||
|`cublasZsymv_v2` | |
|
||||
|`cublasChemv` | |
|
||||
|`cublasChemv_v2` | |
|
||||
|`cublasZhemv` | |
|
||||
|`cublasZhemv_v2` | |
|
||||
|`cublasSsbmv` | |
|
||||
|`cublasSsbmv_v2` | |
|
||||
|`cublasDsbmv` | |
|
||||
|`cublasDsbmv_v2` | |
|
||||
|`cublasChbmv` | |
|
||||
|`cublasChbmv_v2` | |
|
||||
|`cublasZhbmv` | |
|
||||
|`cublasZhbmv_v2` | |
|
||||
|`cublasSspmv` | |
|
||||
|`cublasSspmv_v2` | |
|
||||
|`cublasDspmv` | |
|
||||
|`cublasDspmv_v2` | |
|
||||
|`cublasChpmv` | |
|
||||
|`cublasChpmv_v2` | |
|
||||
|`cublasZhpmv` | |
|
||||
|`cublasZhpmv_v2` | |
|
||||
|`cublasSger` |`hipblasSger` |
|
||||
|`cublasSger_v2` |`hipblasSger` |
|
||||
|`cublasDger` |`hipblasDger` |
|
||||
|`cublasDger_v2` |`hipblasDger` |
|
||||
|`cublasCgeru` | |
|
||||
|`cublasCgeru_v2` | |
|
||||
|`cublasCgerc` | |
|
||||
|`cublasCgerc_v2` | |
|
||||
|`cublasZgeru` | |
|
||||
|`cublasZgeru_v2` | |
|
||||
|`cublasZgerc` | |
|
||||
|`cublasZgerc_v2` | |
|
||||
|`cublasSsyr` |`hipblasSsyr` |
|
||||
|`cublasSsyr_v2` |`hipblasSsyr` |
|
||||
|`cublasDsyr` |`hipblasDsyr` |
|
||||
|`cublasDsyr_v2` |`hipblasDsyr` |
|
||||
|`cublasCsyr` | |
|
||||
|`cublasCsyr_v2` | |
|
||||
|`cublasZsyr` | |
|
||||
|`cublasZsyr_v2` | |
|
||||
|`cublasCher` | |
|
||||
|`cublasCher_v2` | |
|
||||
|`cublasZher` | |
|
||||
|`cublasZher_v2` | |
|
||||
|`cublasSspr` | |
|
||||
|`cublasSspr_v2` | |
|
||||
|`cublasDspr` | |
|
||||
|`cublasDspr_v2` | |
|
||||
|`cublasChpr` | |
|
||||
|`cublasChpr_v2` | |
|
||||
|`cublasZhpr` | |
|
||||
|`cublasZhpr_v2` | |
|
||||
|`cublasSsyr2` | |
|
||||
|`cublasSsyr2_v2` | |
|
||||
|`cublasDsyr2` | |
|
||||
|`cublasDsyr2_v2` | |
|
||||
|`cublasCsyr2` | |
|
||||
|`cublasCsyr2_v2` | |
|
||||
|`cublasZsyr2` | |
|
||||
|`cublasZsyr2_v2` | |
|
||||
|`cublasCher2` | |
|
||||
|`cublasCher2_v2` | |
|
||||
|`cublasZher2` | |
|
||||
|`cublasZher2_v2` | |
|
||||
|`cublasSspr2` | |
|
||||
|`cublasSspr2_v2` | |
|
||||
|`cublasDspr2` | |
|
||||
|`cublasDspr2_v2` | |
|
||||
|`cublasChpr2` | |
|
||||
|`cublasChpr2_v2` | |
|
||||
|`cublasZhpr2` | |
|
||||
|`cublasZhpr2_v2` | |
|
||||
|`cublasSgemm` |`hipblasSgemm` |
|
||||
|`cublasSgemm_v2` |`hipblasSgemm` |
|
||||
|`cublasDgemm` |`hipblasDgemm` |
|
||||
|`cublasDgemm_v2` |`hipblasDgemm` |
|
||||
|`cublasCgemm` | |
|
||||
|`cublasCgemm_v2` | |
|
||||
|`cublasCgemm3m` | | 8.0 |
|
||||
|`cublasCgemm3mEx` | | 8.0 |
|
||||
|`cublasZgemm` |`hipblasZgemm` |
|
||||
|`cublasZgemm_v2` |`hipblasZgemm` |
|
||||
|`cublasZgemm3m` | | 8.0 |
|
||||
|`cublasHgemm` |`hipblasHgemm` | 7.5 |
|
||||
|`cublasSgemmEx` | | 7.5 |
|
||||
|`cublasGemmEx` |`hipblasGemmEx` | 8.0 |
|
||||
|`cublasCgemmEx` | | 8.0 |
|
||||
|`cublasUint8gemmBias` | | 8.0 |
|
||||
|`cublasSsyrk` | |
|
||||
|`cublasSsyrk_v2` | |
|
||||
|`cublasDsyrk` | |
|
||||
|`cublasDsyrk_v2` | |
|
||||
|`cublasCsyrk` | |
|
||||
|`cublasCsyrk_v2` | |
|
||||
|`cublasZsyrk` | |
|
||||
|`cublasZsyrk_v2` | |
|
||||
|`cublasCsyrkEx` | | 8.0 |
|
||||
|`cublasCsyrk3mEx` | | 8.0 |
|
||||
|`cublasCherk` | |
|
||||
|`cublasCherk_v2` | |
|
||||
|`cublasZherk` | |
|
||||
|`cublasZherk_v2` | |
|
||||
|`cublasCherkEx` | | 8.0 |
|
||||
|`cublasCherk3mEx` | | 8.0 |
|
||||
|`cublasSsyr2k` | |
|
||||
|`cublasSsyr2k_v2` | |
|
||||
|`cublasDsyr2k` | |
|
||||
|`cublasDsyr2k_v2` | |
|
||||
|`cublasCsyr2k` | |
|
||||
|`cublasCsyr2k_v2` | |
|
||||
|`cublasZsyr2k` | |
|
||||
|`cublasZsyr2k_v2` | |
|
||||
|`cublasCher2k` | |
|
||||
|`cublasCher2k_v2` | |
|
||||
|`cublasZher2k` | |
|
||||
|`cublasZher2k_v2` | |
|
||||
|`cublasSsyrkx` | |
|
||||
|`cublasDsyrkx` | |
|
||||
|`cublasCsyrkx` | |
|
||||
|`cublasZsyrkx` | |
|
||||
|`cublasCherkx` | |
|
||||
|`cublasZherkx` | |
|
||||
|`cublasSsymm` | |
|
||||
|`cublasSsymm_v2` | |
|
||||
|`cublasDsymm` | |
|
||||
|`cublasDsymm_v2` | |
|
||||
|`cublasCsymm` | |
|
||||
|`cublasCsymm_v2` | |
|
||||
|`cublasZsymm` | |
|
||||
|`cublasZsymm_v2` | |
|
||||
|`cublasChemm` | |
|
||||
|`cublasChemm_v2` | |
|
||||
|`cublasZhemm` | |
|
||||
|`cublasZhemm_v2` | |
|
||||
|`cublasStrsm` |`hipblasStrsm` |
|
||||
|`cublasStrsm_v2` |`hipblasStrsm` |
|
||||
|`cublasDtrsm` |`hipblasDtrsm` |
|
||||
|`cublasDtrsm_v2` |`hipblasDtrsm` |
|
||||
|`cublasCtrsm` | |
|
||||
|`cublasCtrsm_v2` | |
|
||||
|`cublasZtrsm` | |
|
||||
|`cublasZtrsm_v2` | |
|
||||
|`cublasStrmm` | |
|
||||
|`cublasStrmm_v2` | |
|
||||
|`cublasDtrmm` | |
|
||||
|`cublasDtrmm_v2` | |
|
||||
|`cublasCtrmm` | |
|
||||
|`cublasCtrmm_v2` | |
|
||||
|`cublasZtrmm` | |
|
||||
|`cublasZtrmm_v2` | |
|
||||
|`cublasHgemmBatched` |`hipblasHgemmBatched` | 9.0 |
|
||||
|`cublasSgemmBatched` |`hipblasSgemmBatched` |
|
||||
|`cublasDgemmBatched` |`hipblasDgemmBatched` |
|
||||
|`cublasCgemmBatched` |`hipblasCgemmBatched` |
|
||||
|`cublasCgemm3mBatched` | | 8.0 |
|
||||
|`cublasZgemmBatched` |`hipblasZgemmBatched` |
|
||||
|`cublasGemmBatchedEx` | | 9.1 |
|
||||
|`cublasGemmStridedBatchedEx` | | 9.1 |
|
||||
|`cublasSgemmStridedBatched` |`hipblasSgemmStridedBatched` | 8.0 |
|
||||
|`cublasDgemmStridedBatched` |`hipblasDgemmStridedBatched` | 8.0 |
|
||||
|`cublasCgemmStridedBatched` |`hipblasCgemmStridedBatched` | 8.0 |
|
||||
|`cublasCgemm3mStridedBatched` | | 8.0 |
|
||||
|`cublasZgemmStridedBatched` |`hipblasZgemmStridedBatched` | 8.0 |
|
||||
|`cublasHgemmStridedBatched` |`hipblasHgemmStridedBatched` | 8.0 |
|
||||
|`cublasSgeam` |`hipblasSgeam` |
|
||||
|`cublasDgeam` |`hipblasDgeam` |
|
||||
|`cublasCgeam` | |
|
||||
|`cublasZgeam` | |
|
||||
|`cublasSgetrfBatched` | |
|
||||
|`cublasDgetrfBatched` | |
|
||||
|`cublasCgetrfBatched` | |
|
||||
|`cublasZgetrfBatched` | |
|
||||
|`cublasSgetriBatched` | |
|
||||
|`cublasDgetriBatched` | |
|
||||
|`cublasCgetriBatched` | |
|
||||
|`cublasZgetriBatched` | |
|
||||
|`cublasSgetrsBatched` | |
|
||||
|`cublasDgetrsBatched` | |
|
||||
|`cublasCgetrsBatched` | |
|
||||
|`cublasZgetrsBatched` | |
|
||||
|`cublasStrsmBatched` | |
|
||||
|`cublasDtrsmBatched` | |
|
||||
|`cublasCtrsmBatched` | |
|
||||
|`cublasZtrsmBatched` | |
|
||||
|`cublasSmatinvBatched` | |
|
||||
|`cublasDmatinvBatched` | |
|
||||
|`cublasCmatinvBatched` | |
|
||||
|`cublasZmatinvBatched` | |
|
||||
|`cublasSgeqrfBatched` | |
|
||||
|`cublasDgeqrfBatched` | |
|
||||
|`cublasCgeqrfBatched` | |
|
||||
|`cublasZgeqrfBatched` | |
|
||||
|`cublasSgelsBatched` | |
|
||||
|`cublasDgelsBatched` | |
|
||||
|`cublasCgelsBatched` | |
|
||||
|`cublasZgelsBatched` | |
|
||||
|`cublasSdgmm` | |
|
||||
|`cublasDdgmm` | |
|
||||
|`cublasCdgmm` | |
|
||||
|`cublasZdgmm` | |
|
||||
|`cublasStpttr` | |
|
||||
|`cublasDtpttr` | |
|
||||
|`cublasCtpttr` | |
|
||||
|`cublasZtpttr` | |
|
||||
|`cublasStrttp` | |
|
||||
|`cublasDtrttp` | |
|
||||
|`cublasCtrttp` | |
|
||||
|`cublasZtrttp` | |
|
||||
|
||||
\* The CUDA version in which the API first appeared and, optionally, the last version in which it was available before being removed; no value means the API was introduced before version 7.5.
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -1,586 +0,0 @@
|
||||
# CUDNN API supported by HIP
|
||||
|
||||
## **1. CUDNN Data types**
|
||||
|
||||
|   **type**   |   **CUDA**                                                    |**CUDA version\***|   **HIP**                                                  |**HIP value** (if different) |
|
||||
|-------------:|---------------------------------------------------------------|:----------------:|------------------------------------------------------------|---------------------------|
|
||||
| define |`CUDNN_VERSION` | |`HIPDNN_VERSION` |
|
||||
| struct |`cudnnContext` | | |
|
||||
| struct* |`cudnnHandle_t` | |`hipdnnHandle_t` |
|
||||
| enum |***`cudnnStatus_t`*** | |***`hipdnnStatus_t`*** |
|
||||
| 0 |*`CUDNN_STATUS_SUCCESS`* | |*`HIPDNN_STATUS_SUCCESS`* |
|
||||
| 1 |*`CUDNN_STATUS_NOT_INITIALIZED`* | |*`HIPDNN_STATUS_NOT_INITIALIZED`* |
|
||||
| 2 |*`CUDNN_STATUS_ALLOC_FAILED`* | |*`HIPDNN_STATUS_ALLOC_FAILED`* |
|
||||
| 3 |*`CUDNN_STATUS_BAD_PARAM`* | |*`HIPDNN_STATUS_BAD_PARAM`* |
|
||||
| 4 |*`CUDNN_STATUS_INTERNAL_ERROR`* | |*`HIPDNN_STATUS_INTERNAL_ERROR`* |
|
||||
| 5 |*`CUDNN_STATUS_INVALID_VALUE`* | |*`HIPDNN_STATUS_INVALID_VALUE`* |
|
||||
| 6 |*`CUDNN_STATUS_ARCH_MISMATCH`* | |*`HIPDNN_STATUS_ARCH_MISMATCH`* |
|
||||
| 7 |*`CUDNN_STATUS_MAPPING_ERROR`* | |*`HIPDNN_STATUS_MAPPING_ERROR`* |
|
||||
| 8 |*`CUDNN_STATUS_EXECUTION_FAILED`* | |*`HIPDNN_STATUS_EXECUTION_FAILED`* |
|
||||
| 9 |*`CUDNN_STATUS_NOT_SUPPORTED`* | |*`HIPDNN_STATUS_NOT_SUPPORTED`* |
|
||||
| 10 |*`CUDNN_STATUS_LICENSE_ERROR`* | |*`HIPDNN_STATUS_LICENSE_ERROR`* |
|
||||
| 11 |*`CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING`* | 7.5 |*`HIPDNN_STATUS_RUNTIME_PREREQUISITE_MISSING`* |
|
||||
| 12 |*`CUDNN_STATUS_RUNTIME_IN_PROGRESS`* | 8.0 | |
|
||||
| 13 |*`CUDNN_STATUS_RUNTIME_FP_OVERFLOW`* | 8.0 | |
|
||||
| struct |`cudnnRuntimeTag_t` | 8.0 | |
|
||||
| enum |***`cudnnErrQueryMode_t`*** | 8.0 | |
|
||||
| 0 |*`CUDNN_ERRQUERY_RAWCODE`* | 8.0 | |
|
||||
| 1 |*`CUDNN_ERRQUERY_NONBLOCKING`* | 8.0 | |
|
||||
| 2 |*`CUDNN_ERRQUERY_BLOCKING`* | 8.0 | |
|
||||
| struct |`cudnnTensorStruct` | | |
|
||||
| struct* |`cudnnTensorDescriptor_t` | |`hipdnnTensorDescriptor_t` |
|
||||
| struct |`cudnnConvolutionStruct` | | |
|
||||
| struct* |`cudnnConvolutionDescriptor_t` | |`hipdnnConvolutionDescriptor_t` |
|
||||
| struct |`cudnnPoolingStruct` | | |
|
||||
| struct* |`cudnnPoolingDescriptor_t` | |`hipdnnPoolingDescriptor_t` |
|
||||
| struct |`cudnnFilterStruct` | | |
|
||||
| struct* |`cudnnFilterDescriptor_t` | |`hipdnnFilterDescriptor_t` |
|
||||
| struct |`cudnnLRNStruct` | | |
|
||||
| struct* |`cudnnLRNDescriptor_t` | |`hipdnnLRNDescriptor_t` |
|
||||
| struct |`cudnnActivationStruct` | | |
|
||||
| struct* |`cudnnActivationDescriptor_t` | |`hipdnnActivationDescriptor_t` |
|
||||
| struct |`cudnnSpatialTransformerStruct` | 7.5 | |
|
||||
| struct* |`cudnnSpatialTransformerDescriptor_t` | 7.5 | |
|
||||
| struct |`cudnnOpTensorStruct` | 7.5 | |
|
||||
| struct* |`cudnnOpTensorDescriptor_t` | 7.5 |`hipdnnOpTensorDescriptor_t` |
|
||||
| struct |`cudnnReduceTensorStruct` | 7.5 | |
|
||||
| struct* |`cudnnReduceTensorDescriptor_t` | 7.5 |`hipdnnReduceTensorDescriptor_t` |
|
||||
| struct |`cudnnCTCLossStruct` | 8.0 | |
|
||||
| struct* |`cudnnCTCLossDescriptor_t` | 8.0 | |
|
||||
| struct |`cudnnTensorTransformStruct` | 9.0 | |
|
||||
| struct* |`cudnnTensorTransformDescriptor_t` | 9.0 | |
|
||||
| enum |***`cudnnDataType_t`*** | |***`hipdnnDataType_t`*** |
|
||||
| 0 |*`CUDNN_DATA_FLOAT`* | |*`HIPDNN_DATA_FLOAT`* |
|
||||
| 1 |*`CUDNN_DATA_DOUBLE`* | |*`HIPDNN_DATA_DOUBLE`* |
|
||||
| 2 |*`CUDNN_DATA_HALF`* | |*`HIPDNN_DATA_HALF`* |
|
||||
| 3 |*`CUDNN_DATA_INT8`* | 7.5 |*`HIPDNN_DATA_INT8`* |
|
||||
| 4 |*`CUDNN_DATA_INT32`* | 7.5 |*`HIPDNN_DATA_INT32`* |
|
||||
| 5 |*`CUDNN_DATA_INT8x4`* | 7.5 |*`HIPDNN_DATA_INT8x4`* |
|
||||
| 6 |*`CUDNN_DATA_UINT8`* | 8.0 | |
|
||||
| 7 |*`CUDNN_DATA_UINT8x4`* | 8.0 | |
|
||||
| 8 |*`CUDNN_DATA_INT8x32`* | 9.0 | |
|
||||
| enum |***`cudnnMathType_t`*** | 8.0 |***`hipdnnMathType_t`*** |
|
||||
| 0 |*`CUDNN_DEFAULT_MATH`* | 8.0 |*`HIPDNN_DEFAULT_MATH`* |
|
||||
| 1 |*`CUDNN_TENSOR_OP_MATH`* | 8.0 |*`HIPDNN_TENSOR_OP_MATH`* |
|
||||
| enum |***`cudnnNanPropagation_t`*** | |***`hipdnnNanPropagation_t`*** |
|
||||
| 0 |*`CUDNN_NOT_PROPAGATE_NAN`* | |*`HIPDNN_NOT_PROPAGATE_NAN`* |
|
||||
| 1 |*`CUDNN_PROPAGATE_NAN`* | |*`HIPDNN_PROPAGATE_NAN`* |
|
||||
| enum |***`cudnnDeterminism_t`*** | 7.5 | |
|
||||
| 0 |*`CUDNN_NON_DETERMINISTIC`* | 7.5 | |
|
||||
| 1 |*`CUDNN_DETERMINISTIC`* | 7.5 | |
|
||||
| define |`CUDNN_DIM_MAX` | | |
|
||||
| enum |***`cudnnTensorFormat_t`*** | |***`hipdnnTensorFormat_t`*** |
|
||||
| 0 |*`CUDNN_TENSOR_NCHW`* | |*`HIPDNN_TENSOR_NCHW`* |
|
||||
| 1 |*`CUDNN_TENSOR_NHWC`* | |*`HIPDNN_TENSOR_NHWC`* |
|
||||
| 2 |*`CUDNN_TENSOR_NCHW_VECT_C`* | 7.5 |*`HIPDNN_TENSOR_NCHW_VECT_C`* |
|
||||
| enum |***`cudnnFoldingDirection_t`*** | 9.0 | |
|
||||
| 0 |*`CUDNN_TRANSFORM_FOLD`* | 9.0 | |
|
||||
| 1 |*`CUDNN_TRANSFORM_UNFOLD`* | 9.0 | |
|
||||
| enum |***`cudnnOpTensorOp_t`*** | 7.5 |***`hipdnnOpTensorOp_t`*** |
|
||||
| 0 |*`CUDNN_OP_TENSOR_ADD`* | 7.5 |*`HIPDNN_OP_TENSOR_ADD`* |
|
||||
| 1 |*`CUDNN_OP_TENSOR_MUL`* | 7.5 |*`HIPDNN_OP_TENSOR_MUL`* |
|
||||
| 2 |*`CUDNN_OP_TENSOR_MIN`* | 7.5 |*`HIPDNN_OP_TENSOR_MIN`* |
|
||||
| 3 |*`CUDNN_OP_TENSOR_MAX`* | 7.5 |*`HIPDNN_OP_TENSOR_MAX`* |
|
||||
| 4 |*`CUDNN_OP_TENSOR_SQRT`* | 7.5 |*`HIPDNN_OP_TENSOR_SQRT`* |
|
||||
| 5 |*`CUDNN_OP_TENSOR_NOT`* | 8.0 |*`HIPDNN_OP_TENSOR_NOT`* |
|
||||
| enum |***`cudnnReduceTensorOp_t`*** | 7.5 |***`hipdnnReduceTensorOp_t`*** |
|
||||
| 0 |*`CUDNN_REDUCE_TENSOR_ADD`* | 7.5 |*`HIPDNN_REDUCE_TENSOR_ADD`* |
|
||||
| 1 |*`CUDNN_REDUCE_TENSOR_MUL`* | 7.5 |*`HIPDNN_REDUCE_TENSOR_MUL`* |
|
||||
| 2 |*`CUDNN_REDUCE_TENSOR_MIN`* | 7.5 |*`HIPDNN_REDUCE_TENSOR_MIN`* |
|
||||
| 3 |*`CUDNN_REDUCE_TENSOR_MAX`* | 7.5 |*`HIPDNN_REDUCE_TENSOR_MAX`* |
|
||||
| 4 |*`CUDNN_REDUCE_TENSOR_AMAX`* | 7.5 |*`HIPDNN_REDUCE_TENSOR_AMAX`* |
|
||||
| 5 |*`CUDNN_REDUCE_TENSOR_AVG`* | 7.5 |*`HIPDNN_REDUCE_TENSOR_AVG`* |
|
||||
| 6 |*`CUDNN_REDUCE_TENSOR_NORM1`* | 7.5 |*`HIPDNN_REDUCE_TENSOR_NORM1`* |
|
||||
| 7 |*`CUDNN_REDUCE_TENSOR_NORM2`* | 7.5 |*`HIPDNN_REDUCE_TENSOR_NORM2`* |
|
||||
| 8 |*`CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS`* | 8.0 |*`HIPDNN_REDUCE_TENSOR_MUL_NO_ZEROS`* |
|
||||
| enum |***`cudnnReduceTensorIndices_t`*** | 7.5 |***`hipdnnReduceTensorIndices_t`*** |
|
||||
| 0 |*`CUDNN_REDUCE_TENSOR_NO_INDICES`* | 7.5 |*`HIPDNN_REDUCE_TENSOR_NO_INDICES`* |
|
||||
| 1 |*`CUDNN_REDUCE_TENSOR_FLATTENED_INDICES`* | 7.5 |*`HIPDNN_REDUCE_TENSOR_FLATTENED_INDICES`* |
|
||||
| enum |***`cudnnIndicesType_t`*** | 7.5 |***`hipdnnIndicesType_t`*** |
|
||||
| 0 |*`CUDNN_32BIT_INDICES`* | 7.5 |*`HIPDNN_32BIT_INDICES`* |
|
||||
| 1 |*`CUDNN_64BIT_INDICES`* | 7.5 |*`HIPDNN_64BIT_INDICES`* |
|
||||
| 2 |*`CUDNN_16BIT_INDICES`* | 7.5 |*`HIPDNN_16BIT_INDICES`* |
|
||||
| 3 |*`CUDNN_8BIT_INDICES`* | 7.5 |*`HIPDNN_8BIT_INDICES`* |
|
||||
| enum |***`cudnnConvolutionMode_t`*** | |***`hipdnnConvolutionMode_t`*** |
|
||||
| 0 |*`CUDNN_CONVOLUTION`* | |*`HIPDNN_CONVOLUTION`* |
|
||||
| 1 |*`CUDNN_CROSS_CORRELATION`* | |*`HIPDNN_CROSS_CORRELATION`* |
|
||||
| enum |***`cudnnConvolutionFwdPreference_t`*** | |***`hipdnnConvolutionFwdPreference_t`*** |
|
||||
| 0 |*`CUDNN_CONVOLUTION_FWD_NO_WORKSPACE`* | |*`HIPDNN_CONVOLUTION_FWD_NO_WORKSPACE`* |
|
||||
| 1 |*`CUDNN_CONVOLUTION_FWD_PREFER_FASTEST`* | |*`HIPDNN_CONVOLUTION_FWD_PREFER_FASTEST`* |
|
||||
| 2 |*`CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT`* | |*`HIPDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT`* |
|
||||
| enum |***`cudnnConvolutionFwdAlgo_t`*** | |***`hipdnnConvolutionFwdAlgo_t`*** |
|
||||
| 0 |*`CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM`* | |*`HIPDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM`* |
|
||||
| 1 |*`CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM`* | |*`HIPDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM`* |
|
||||
| 2 |*`CUDNN_CONVOLUTION_FWD_ALGO_GEMM`* | |*`HIPDNN_CONVOLUTION_FWD_ALGO_GEMM`* |
|
||||
| 3 |*`CUDNN_CONVOLUTION_FWD_ALGO_DIRECT`* | |*`HIPDNN_CONVOLUTION_FWD_ALGO_DIRECT`* |
|
||||
| 4 |*`CUDNN_CONVOLUTION_FWD_ALGO_FFT`* | |*`HIPDNN_CONVOLUTION_FWD_ALGO_FFT`* |
|
||||
| 5 |*`CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING`* | |*`HIPDNN_CONVOLUTION_FWD_ALGO_FFT_TILING`* |
|
||||
| 6 |*`CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD`* | 7.5 |*`HIPDNN_CONVOLUTION_FWD_ALGO_WINOGRAD`* |
|
||||
| 7 |*`CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED`* | 7.5 |*`HIPDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED`* |
|
||||
| 8 |*`CUDNN_CONVOLUTION_FWD_ALGO_COUNT`* | 7.5 |*`HIPDNN_CONVOLUTION_FWD_ALGO_COUNT`* |
|
||||
| struct |`cudnnConvolutionFwdAlgoPerf_t` | |`hipdnnConvolutionFwdAlgoPerf_t` |
|
||||
| enum |***`cudnnConvolutionBwdFilterPreference_t`*** | |***`hipdnnConvolutionBwdFilterPreference_t`*** |
|
||||
| 0 |*`CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE`* | |*`HIPDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE`* |
|
||||
| 1 |*`CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST`* | |*`HIPDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST`* |
|
||||
| 2 |*`CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT`* | |*`HIPDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT`* |
|
||||
| enum |***`cudnnConvolutionBwdFilterAlgo_t`*** | |***`hipdnnConvolutionBwdFilterAlgo_t`*** |
|
||||
| 0 |*`CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0`* | |*`HIPDNN_CONVOLUTION_BWD_FILTER_ALGO_0`* |
|
||||
| 1 |*`CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1`* | |*`HIPDNN_CONVOLUTION_BWD_FILTER_ALGO_1`* |
|
||||
| 2 |*`CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT`* | |*`HIPDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT`* |
|
||||
| 3 |*`CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3`* | |*`HIPDNN_CONVOLUTION_BWD_FILTER_ALGO_3`* |
|
||||
| 4 |*`CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD`* | 7.5 |*`HIPDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD`* |
|
||||
| 5 |*`CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED`* | 7.5 |*`HIPDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED`* |
|
||||
| 6 |*`CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING`* | 7.5 |*`HIPDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING`* |
|
||||
| 7 |*`CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT`* | 7.5 |*`HIPDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT`* |
|
||||
| struct |`cudnnConvolutionBwdDataAlgoPerf_t` | |`hipdnnConvolutionBwdDataAlgoPerf_t` |
|
||||
| enum |***`cudnnSoftmaxAlgorithm_t`*** | |***`hipdnnSoftmaxAlgorithm_t`*** |
|
||||
| 0 |*`CUDNN_SOFTMAX_FAST`* | |*`HIPDNN_SOFTMAX_FAST`* |
|
||||
| 1 |*`CUDNN_SOFTMAX_ACCURATE`* | |*`HIPDNN_SOFTMAX_ACCURATE`* |
|
||||
| 2 |*`CUDNN_SOFTMAX_LOG`* | |*`HIPDNN_SOFTMAX_LOG`* |
|
||||
| enum |***`cudnnSoftmaxMode_t`*** | |***`hipdnnSoftmaxMode_t`*** |
|
||||
| 0 |*`CUDNN_SOFTMAX_MODE_INSTANCE`* | |*`HIPDNN_SOFTMAX_MODE_INSTANCE`* |
|
||||
| 1 |*`CUDNN_SOFTMAX_MODE_CHANNEL`* | |*`HIPDNN_SOFTMAX_MODE_CHANNEL`* |
|
||||
| enum |***`cudnnPoolingMode_t`*** | |***`hipdnnPoolingMode_t`*** |
|
||||
| 0 |*`CUDNN_POOLING_MAX`* | |*`HIPDNN_POOLING_MAX`* |
|
||||
| 1 |*`CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING`* | |*`HIPDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING`* |
|
||||
| 2 |*`CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING`* | |*`HIPDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING`* |
|
||||
| 3 |*`CUDNN_POOLING_MAX_DETERMINISTIC`* | 7.5 |*`HIPDNN_POOLING_MAX_DETERMINISTIC`* |
|
||||
| enum |***`cudnnActivationMode_t`*** | |***`hipdnnActivationMode_t`*** |
|
||||
| 0 |*`CUDNN_ACTIVATION_SIGMOID`* | |*`HIPDNN_ACTIVATION_SIGMOID`* |
|
||||
| 1 |*`CUDNN_ACTIVATION_RELU`* | |*`HIPDNN_ACTIVATION_RELU`* |
|
||||
| 2 |*`CUDNN_ACTIVATION_TANH`* | |*`HIPDNN_ACTIVATION_TANH`* |
|
||||
| 3 |*`CUDNN_ACTIVATION_CLIPPED_RELU`* | |*`HIPDNN_ACTIVATION_CLIPPED_RELU`* |
|
||||
| 4 |*`CUDNN_ACTIVATION_ELU`* | 7.5 |*`HIPDNN_ACTIVATION_ELU`* |
|
||||
| 5 |*`CUDNN_ACTIVATION_IDENTITY`* | 8.0 |*`HIPDNN_ACTIVATION_PATHTRU`* |
|
||||
| define |`CUDNN_LRN_MIN_N` | | |
|
||||
| define |`CUDNN_LRN_MAX_N` | | |
|
||||
| define |`CUDNN_LRN_MIN_K` | | |
|
||||
| define |`CUDNN_LRN_MIN_BETA` | | |
|
||||
| enum |***`cudnnLRNMode_t`*** | |***`hipdnnLRNMode_t`*** |
|
||||
| 0 |*`CUDNN_LRN_CROSS_CHANNEL_DIM1`* | |*`HIPDNN_LRN_CROSS_CHANNEL`* |
|
||||
| enum |***`cudnnDivNormMode_t`*** | | |
|
||||
| 0 |*`CUDNN_DIVNORM_PRECOMPUTED_MEANS`* | | |
|
||||
| enum |***`cudnnBatchNormMode_t`*** | |***`hipdnnBatchNormMode_t`*** |
|
||||
| 0 |*`CUDNN_BATCHNORM_PER_ACTIVATION`* | |*`HIPDNN_BATCHNORM_PER_ACTIVATION`* |
|
||||
| 1 |*`CUDNN_BATCHNORM_SPATIAL`* | |*`HIPDNN_BATCHNORM_SPATIAL`* |
|
||||
| 2 |*`CUDNN_BATCHNORM_SPATIAL_PERSISTENT`* | 8.0 |*`HIPDNN_BATCHNORM_SPATIAL_PERSISTENT`* |
|
||||
| define |`CUDNN_BN_MIN_EPSILON` | |`HIPDNN_BN_MIN_EPSILON` |
|
||||
| enum |***`cudnnSamplerType_t`*** | 7.5 | |
|
||||
| 0 |*`CUDNN_SAMPLER_BILINEAR`* | 7.5 | |
|
||||
| struct |`cudnnDropoutStruct` | 7.5 | |
|
||||
| struct* |`cudnnDropoutDescriptor_t` | 7.5 |`hipdnnDropoutDescriptor_t` |
|
||||
| enum |***`cudnnRNNMode_t`*** | 7.5 |***`hipdnnRNNMode_t`*** |
|
||||
| 0 |*`CUDNN_RNN_RELU`* | 7.5 |*`HIPDNN_RNN_RELU`* |
|
||||
| 1 |*`CUDNN_RNN_TANH`* | 7.5 |*`HIPDNN_RNN_TANH`* |
|
||||
| 2 |*`CUDNN_LSTM`* | 7.5 |*`HIPDNN_LSTM`* |
|
||||
| 3 |*`CUDNN_GRU`* | 7.5 |*`HIPDNN_GRU`* |
|
||||
| enum |***`cudnnRNNBiasMode_t`*** | 9.0 |***`hipdnnRNNBiasMode_t`*** |
|
||||
| 0 |*`CUDNN_RNN_NO_BIAS`* | 9.0 |*`HIPDNN_RNN_NO_BIAS`* |
|
||||
| 1 |*`CUDNN_RNN_SINGLE_INP_BIAS`* | 9.0 |*`HIPDNN_RNN_WITH_BIAS`* |
|
||||
| 2 |*`CUDNN_RNN_DOUBLE_BIAS`* | 9.0 |*`HIPDNN_RNN_WITH_BIAS`* | 1 |
|
||||
| 3 |*`CUDNN_RNN_SINGLE_REC_BIAS`* | 9.0 |*`HIPDNN_RNN_WITH_BIAS`* | 1 |
|
||||
| enum |***`cudnnDirectionMode_t`*** | 7.5 |***`hipdnnDirectionMode_t`*** |
|
||||
| 0 |*`CUDNN_UNIDIRECTIONAL`* | 7.5 |*`HIPDNN_UNIDIRECTIONAL`* |
|
||||
| 1 |*`CUDNN_BIDIRECTIONAL`* | 7.5 |*`HIPDNN_BIDIRECTIONAL`* |
|
||||
| enum |***`cudnnRNNAlgo_t`*** | 7.5 |***`hipdnnRNNAlgo_t`*** |
|
||||
| 0 |*`CUDNN_RNN_ALGO_STANDARD`* | 7.5 |*`HIPDNN_RNN_ALGO_STANDARD`* |
|
||||
| 1 |*`CUDNN_RNN_ALGO_PERSIST_STATIC`* | 7.5 |*`HIPDNN_RNN_ALGO_PERSIST_STATIC`* |
|
||||
| 2 |*`CUDNN_RNN_ALGO_PERSIST_DYNAMIC`* | 7.5 |*`HIPDNN_RNN_ALGO_PERSIST_DYNAMIC`* |
|
||||
| 3 |*`CUDNN_RNN_ALGO_COUNT`* | 8.0 | |
|
||||
| struct |`cudnnAlgorithmStruct` | 8.0 | |
|
||||
| struct* |`cudnnAlgorithmDescriptor_t` | 8.0 | |
|
||||
| struct |`cudnnAlgorithmPerformanceStruct` | 8.0 | |
|
||||
| struct* |`cudnnAlgorithmPerformance_t` | 8.0 | |
|
||||
| struct |`cudnnRNNStruct` | 7.5 | |
|
||||
| struct* |`cudnnRNNDescriptor_t` | 7.5 |`hipdnnRNNDescriptor_t` |
|
||||
| struct |`cudnnPersistentRNNPlan` | 7.5 | |
|
||||
| struct* |`cudnnPersistentRNNPlan_t` | 7.5 |`hipdnnPersistentRNNPlan_t` |
|
||||
| enum |***`cudnnCTCLossAlgo_t`*** | 8.0 | |
|
||||
| 0 |*`CUDNN_CTC_LOSS_ALGO_DETERMINISTIC`* | 8.0 | |
|
||||
| 1 |*`CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC`* | 8.0 | |
|
||||
| struct |`cudnnAlgorithm_t` | 8.0 | |
|
||||
| enum |***`cudnnSeverity_t`*** | 8.0 | |
|
||||
| 0 |*`CUDNN_SEV_FATAL`* | 8.0 | |
|
||||
| 1 |*`CUDNN_SEV_ERROR`* | 8.0 | |
|
||||
| 2 |*`CUDNN_SEV_WARNING`* | 8.0 | |
|
||||
| 3 |*`CUDNN_SEV_INFO`* | 8.0 | |
|
||||
| define |`CUDNN_SEV_ERROR_EN` | 8.0 | |
|
||||
| define |`CUDNN_SEV_WARNING_EN` | 8.0 | |
|
||||
| define |`CUDNN_SEV_INFO_EN` | 8.0 | |
|
||||
| struct |`cudnnDebug_t` | 8.0 | |
|
||||
| struct |`cudnnCallback_t` | 8.0 | |
|
||||
| enum |***`cudnnBatchNormOps_t`*** | 9.0 | |
|
||||
| 0 |*`CUDNN_BATCHNORM_OPS_BN`* | 9.0 | |
|
||||
| 1 |*`CUDNN_BATCHNORM_OPS_BN_ACTIVATION`* | 9.0 | |
|
||||
| 2 |*`CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION`* | 9.0 | |
|
||||
| enum |***`cudnnRNNClipMode_t`*** | 9.0 | |
|
||||
| 0 |*`CUDNN_RNN_CLIP_NONE`* | 9.0 | |
|
||||
| 1 |*`CUDNN_RNN_CLIP_MINMAX`* | 9.0 | |
|
||||
| struct |`cudnnRNNDataStruct` | 9.0 | |
|
||||
| struct* |`cudnnRNNDataDescriptor_t` | 9.0 | |
|
||||
| enum |***`cudnnRNNDataLayout_t`*** | 9.0 | |
|
||||
| 0 |*`CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED`* | 9.0 | |
|
||||
| 1 |*`CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED`* | 9.0 | |
|
||||
| 2 |*`CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED`* | 9.0 | |
|
||||
| enum |***`cudnnRNNPaddingMode_t`*** | 9.0 | |
|
||||
| 0 |*`CUDNN_RNN_PADDED_IO_DISABLED`* | 9.0 | |
|
||||
| 1 |*`CUDNN_RNN_PADDED_IO_ENABLED`* | 9.0 | |
|
||||
| enum |***`cudnnSeqDataAxis_t`*** | 9.0 | |
|
||||
| 0 |*`CUDNN_SEQDATA_TIME_DIM`* | 9.0 | |
|
||||
| 1 |*`CUDNN_SEQDATA_BATCH_DIM`* | 9.0 | |
|
||||
| 2 |*`CUDNN_SEQDATA_BEAM_DIM`* | 9.0 | |
|
||||
| 3 |*`CUDNN_SEQDATA_VECT_DIM`* | 9.0 | |
|
||||
| define |`CUDNN_SEQDATA_DIM_COUNT` | 9.0 | |
|
||||
| struct |`cudnnSeqDataStruct` | 9.0 | |
|
||||
| struct* |`cudnnSeqDataDescriptor_t` | 9.0 | |
|
||||
| unsigned |***`cudnnAttnQueryMap_t`*** | 9.0 | |
|
||||
| 0 |*`CUDNN_ATTN_QUERYMAP_ALL_TO_ONE`* | 9.0 | |
|
||||
| 1U << 0 |*`CUDNN_ATTN_QUERYMAP_ONE_TO_ONE`* | 9.0 | |
|
||||
|         0 |*`CUDNN_ATTN_DISABLE_PROJ_BIASES`*                            | 10.1 Update 2    |                                                            |
|
||||
| 1U << 1 |*`CUDNN_ATTN_ENABLE_PROJ_BIASES`* | 10.1 Update 2 | |
|
||||
| struct |`cudnnAttnStruct` | 9.0 | |
|
||||
| struct* |`cudnnAttnDescriptor_t` | 9.0 | |
|
||||
| enum |***`cudnnMultiHeadAttnWeightKind_t`*** | 9.0 | |
|
||||
| 0 |*`CUDNN_MH_ATTN_Q_WEIGHTS`* | 9.0 | |
|
||||
| 1 |*`CUDNN_MH_ATTN_K_WEIGHTS`* | 9.0 | |
|
||||
| 2 |*`CUDNN_MH_ATTN_V_WEIGHTS`* | 9.0 | |
|
||||
| 3 |*`CUDNN_MH_ATTN_O_WEIGHTS`* | 9.0 | |
|
||||
| 4 |*`CUDNN_MH_ATTN_Q_BIASES`* | 10.1 Update 2 | |
|
||||
| 5 |*`CUDNN_MH_ATTN_K_BIASES`* | 10.1 Update 2 | |
|
||||
| 6 |*`CUDNN_MH_ATTN_V_BIASES`* | 10.1 Update 2 | |
|
||||
| 7 |*`CUDNN_MH_ATTN_O_BIASES`* | 10.1 Update 2 | |
|
||||
| define 8 |`CUDNN_ATTN_WKIND_COUNT` | 10.1 Update 2 | |
|
||||
| enum |***`cudnnWgradMode_t`*** | 9.0 | |
|
||||
| 0 |*`CUDNN_WGRAD_MODE_ADD`* | 9.0 | |
|
||||
| 1 |*`CUDNN_WGRAD_MODE_SET`* | 9.0 | |
|
||||
| enum |***`cudnnReorderType_t`*** | 10.1 | |
|
||||
| 0 |*`CUDNN_DEFAULT_REORDER`* | 10.1 | |
|
||||
| 1 |*`CUDNN_NO_REORDER`* | 10.1 | |
|
||||
| enum |***`cudnnLossNormalizationMode_t`*** | 10.1 | |
|
||||
| 0 |*`CUDNN_LOSS_NORMALIZATION_NONE`* | 10.1 | |
|
||||
| 1 |*`CUDNN_LOSS_NORMALIZATION_SOFTMAX`* | 10.1 | |
|
||||
| struct |`cudnnFusedOpsConstParamStruct` | 10.1 | |
|
||||
| struct* |`cudnnFusedOpsConstParamPack_t` | 10.1 | |
|
||||
| struct |`cudnnFusedOpsVariantParamStruct` | 10.1 | |
|
||||
| struct* |`cudnnFusedOpsVariantParamPack_t` | 10.1 | |
|
||||
| struct |`cudnnFusedOpsPlanStruct` | 10.1 | |
|
||||
| struct* |`cudnnFusedOpsPlan_t` | 10.1 | |
|
||||
| enum |***`cudnnFusedOps_t`*** | 10.1 | |
|
||||
| 0 |*`CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS`* | 10.1 | |
|
||||
| 1 |*`CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD`* | 10.1 | |
|
||||
| 2 |*`CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING`* | 10.1 | |
|
||||
| 3 |*`CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE`* | 10.1 | |
|
||||
| 4 |*`CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION`* | 10.1 | |
|
||||
| 5 |*`CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK`* | 10.1 | |
|
||||
| 6 |*`CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM`* | 10.1 | |
|
||||
| enum |***`cudnnFusedOpsConstParamLabel_t`*** | 10.1 | |
|
||||
| 0 |*`CUDNN_PARAM_XDESC`* | 10.1 | |
|
||||
| 1 |*`CUDNN_PARAM_XDATA_PLACEHOLDER`* | 10.1 | |
|
||||
| 2 |*`CUDNN_PARAM_BN_MODE`* | 10.1 | |
|
||||
| 3 |*`CUDNN_PARAM_BN_EQSCALEBIAS_DESC`* | 10.1 | |
|
||||
| 4 |*`CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER`* | 10.1 | |
|
||||
| 5 |*`CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER`* | 10.1 | |
|
||||
| 6 |*`CUDNN_PARAM_ACTIVATION_DESC`* | 10.1 | |
|
||||
| 7 |*`CUDNN_PARAM_CONV_DESC`* | 10.1 | |
|
||||
| 8 |*`CUDNN_PARAM_WDESC`* | 10.1 | |
|
||||
| 9 |*`CUDNN_PARAM_WDATA_PLACEHOLDER`* | 10.1 | |
|
||||
| 10 |*`CUDNN_PARAM_DWDESC`* | 10.1 | |
|
||||
| 11 |*`CUDNN_PARAM_DWDATA_PLACEHOLDER`* | 10.1 | |
|
||||
| 12 |*`CUDNN_PARAM_YDESC`* | 10.1 | |
|
||||
| 13 |*`CUDNN_PARAM_YDATA_PLACEHOLDER`* | 10.1 | |
|
||||
| 14 |*`CUDNN_PARAM_DYDESC`* | 10.1 | |
|
||||
| 15 |*`CUDNN_PARAM_DYDATA_PLACEHOLDER`* | 10.1 | |
|
||||
| 16 |*`CUDNN_PARAM_YSTATS_DESC`* | 10.1 | |
|
||||
| 17 |*`CUDNN_PARAM_YSUM_PLACEHOLDER`* | 10.1 | |
|
||||
| 18 |*`CUDNN_PARAM_YSQSUM_PLACEHOLDER`* | 10.1 | |
|
||||
| 19 |*`CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC`* | 10.1 | |
|
||||
| 20 |*`CUDNN_PARAM_BN_SCALE_PLACEHOLDER`* | 10.1 | |
|
||||
| 21 |*`CUDNN_PARAM_BN_BIAS_PLACEHOLDER`* | 10.1 | |
|
||||
| 22 |*`CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER`* | 10.1 | |
|
||||
| 23 |*`CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER`* | 10.1 | |
|
||||
| 24 |*`CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER`* | 10.1 | |
|
||||
| 25 |*`CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER`* | 10.1 | |
|
||||
| 26 |*`CUDNN_PARAM_ZDESC`* | 10.1 | |
|
||||
| 27 |*`CUDNN_PARAM_ZDATA_PLACEHOLDER`* | 10.1 | |
|
||||
| 28 |*`CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC`* | 10.1 | |
|
||||
| 29 |*`CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER`* | 10.1 | |
|
||||
| 30 |*`CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER`* | 10.1 | |
|
||||
| 31 |*`CUDNN_PARAM_ACTIVATION_BITMASK_DESC`* | 10.1 | |
|
||||
| 32 |*`CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER`* | 10.1 | |
|
||||
| 33 |*`CUDNN_PARAM_DXDESC`* | 10.1 | |
|
||||
| 34 |*`CUDNN_PARAM_DXDATA_PLACEHOLDER`* | 10.1 | |
|
||||
| 35 |*`CUDNN_PARAM_DZDESC`* | 10.1 | |
|
||||
| 36 |*`CUDNN_PARAM_DZDATA_PLACEHOLDER`* | 10.1 | |
|
||||
| 37 |*`CUDNN_PARAM_BN_DSCALE_PLACEHOLDER`* | 10.1 | |
|
||||
| 38 |*`CUDNN_PARAM_BN_DBIAS_PLACEHOLDER`* | 10.1 | |
|
||||
| enum |***`cudnnFusedOpsPointerPlaceHolder_t`*** | 10.1 | |
|
||||
| 0 |*`CUDNN_PTR_NULL`* | 10.1 | |
|
||||
| 1 |*`CUDNN_PTR_ELEM_ALIGNED`* | 10.1 | |
|
||||
| 2 |*`CUDNN_PTR_16B_ALIGNED`* | 10.1 | |
|
||||
| enum |***`cudnnFusedOpsVariantParamLabel_t`*** | 10.1 | |
|
||||
| 0 |*`CUDNN_PTR_XDATA`* | 10.1 | |
|
||||
| 1 |*`CUDNN_PTR_BN_EQSCALE`* | 10.1 | |
|
||||
| 2 |*`CUDNN_PTR_BN_EQBIAS`* | 10.1 | |
|
||||
| 3 |*`CUDNN_PTR_WDATA`* | 10.1 | |
|
||||
| 4 |*`CUDNN_PTR_DWDATA`* | 10.1 | |
|
||||
| 5 |*`CUDNN_PTR_YDATA`* | 10.1 | |
|
||||
| 6 |*`CUDNN_PTR_DYDATA`* | 10.1 | |
|
||||
| 7 |*`CUDNN_PTR_YSUM`* | 10.1 | |
|
||||
| 8 |*`CUDNN_PTR_YSQSUM`* | 10.1 | |
|
||||
| 9 |*`CUDNN_PTR_WORKSPACE`* | 10.1 | |
|
||||
| 10 |*`CUDNN_PTR_BN_SCALE`* | 10.1 | |
|
||||
| 11 |*`CUDNN_PTR_BN_BIAS`* | 10.1 | |
|
||||
| 12 |*`CUDNN_PTR_BN_SAVED_MEAN`* | 10.1 | |
|
||||
| 13 |*`CUDNN_PTR_BN_SAVED_INVSTD`* | 10.1 | |
|
||||
| 14 |*`CUDNN_PTR_BN_RUNNING_MEAN`* | 10.1 | |
|
||||
| 15 |*`CUDNN_PTR_BN_RUNNING_VAR`* | 10.1 | |
|
||||
| 16 |*`CUDNN_PTR_ZDATA`* | 10.1 | |
|
||||
| 17 |*`CUDNN_PTR_BN_Z_EQSCALE`* | 10.1 | |
|
||||
| 18 |*`CUDNN_PTR_BN_Z_EQBIAS`* | 10.1 | |
|
||||
| 19 |*`CUDNN_PTR_ACTIVATION_BITMASK`* | 10.1 | |
|
||||
| 20 |*`CUDNN_PTR_DXDATA`* | 10.1 | |
|
||||
| 21 |*`CUDNN_PTR_DZDATA`* | 10.1 | |
|
||||
| 22 |*`CUDNN_PTR_BN_DSCALE`* | 10.1 | |
|
||||
| 23 |*`CUDNN_PTR_BN_DBIAS`* | 10.1 | |
|
||||
| 100 |*`CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES`* | 10.1 | |
|
||||
| 101 |*`CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT`* | 10.1 | |
|
||||
| 102 |*`CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR`* | 10.1 | |
|
||||
| 103 |*`CUDNN_SCALAR_DOUBLE_BN_EPSILON`* | 10.1 | |
|
||||
|
||||
## **2. CUDNN API functions**
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:-----------------|
|
||||
|`cudnnGetVersion` |`hipdnnGetVersion` |
|
||||
|`cudnnGetCudartVersion` | | 7.5 |
|
||||
|`cudnnGetErrorString` |`hipdnnGetErrorString` |
|
||||
|`cudnnQueryRuntimeError` | | 8.0 |
|
||||
|`cudnnGetProperty` | | 7.5 |
|
||||
|`cudnnCreate` |`hipdnnCreate` |
|
||||
|`cudnnDestroy` |`hipdnnDestroy` |
|
||||
|`cudnnSetStream` |`hipdnnSetStream` |
|
||||
|`cudnnGetStream` |`hipdnnGetStream` |
|
||||
|`cudnnCreateTensorDescriptor` |`hipdnnCreateTensorDescriptor` |
|
||||
|`cudnnSetTensor4dDescriptor` |`hipdnnSetTensor4dDescriptor` |
|
||||
|`cudnnSetTensor4dDescriptorEx` |`hipdnnSetTensor4dDescriptorEx` |
|
||||
|`cudnnGetTensor4dDescriptor` |`hipdnnGetTensor4dDescriptor` |
|
||||
|`cudnnSetTensorNdDescriptor` |`hipdnnSetTensorNdDescriptor` |
|
||||
|`cudnnSetTensorNdDescriptorEx` | | 7.5 |
|
||||
|`cudnnGetTensorNdDescriptor` |`hipdnnGetTensorNdDescriptor` |
|
||||
|`cudnnGetTensorSizeInBytes` | | 7.5 |
|
||||
|`cudnnDestroyTensorDescriptor` |`hipdnnDestroyTensorDescriptor` |
|
||||
|`cudnnTransformTensor` | |
|
||||
|`cudnnTransformTensorEx` | | 9.0 |
|
||||
|`cudnnInitTransformDest` | | 9.0 |
|
||||
|`cudnnCreateTensorTransformDescriptor` | | 9.0 |
|
||||
|`cudnnSetTensorTransformDescriptor` | | 9.0 |
|
||||
|`cudnnGetTensorTransformDescriptor` | | 9.0 |
|
||||
|`cudnnDestroyTensorTransformDescriptor` | | 9.0 |
|
||||
|`cudnnAddTensor` |`hipdnnAddTensor` |
|
||||
|`cudnnCreateOpTensorDescriptor` |`hipdnnCreateOpTensorDescriptor` | 7.5 |
|
||||
|`cudnnSetOpTensorDescriptor` |`hipdnnSetOpTensorDescriptor` | 7.5 |
|
||||
|`cudnnGetOpTensorDescriptor` |`hipdnnGetOpTensorDescriptor` | 7.5 |
|
||||
|`cudnnDestroyOpTensorDescriptor` |`hipdnnDestroyOpTensorDescriptor` | 7.5 |
|
||||
|`cudnnOpTensor` |`hipdnnOpTensor` | 7.5 |
|
||||
|`cudnnGetFoldedConvBackwardDataDescriptors` | | 10.1 |
|
||||
|`cudnnCreateReduceTensorDescriptor` |`hipdnnCreateReduceTensorDescriptor` | 7.5 |
|
||||
|`cudnnSetReduceTensorDescriptor` |`hipdnnSetReduceTensorDescriptor` | 7.5 |
|
||||
|`cudnnGetReduceTensorDescriptor` |`hipdnnGetReduceTensorDescriptor` | 7.5 |
|
||||
|`cudnnDestroyReduceTensorDescriptor` |`hipdnnDestroyReduceTensorDescriptor` | 7.5 |
|
||||
|`cudnnGetReductionIndicesSize` | | 7.5 |
|
||||
|`cudnnGetReductionWorkspaceSize` |`hipdnnGetReductionWorkspaceSize` | 7.5 |
|
||||
|`cudnnReduceTensor` |`hipdnnReduceTensor` | 7.5 |
|
||||
|`cudnnSetTensor` |`hipdnnSetTensor` |
|
||||
|`cudnnScaleTensor` |`hipdnnScaleTensor` |
|
||||
|`cudnnCreateFilterDescriptor` |`hipdnnCreateFilterDescriptor` |
|
||||
|`cudnnSetFilter4dDescriptor` |`hipdnnSetFilter4dDescriptor` |
|
||||
|`cudnnGetFilter4dDescriptor` |`hipdnnGetFilter4dDescriptor` |
|
||||
|`cudnnSetFilterNdDescriptor` |`hipdnnSetFilterNdDescriptor` |
|
||||
|`cudnnGetFilterNdDescriptor` |`hipdnnGetFilterNdDescriptor` |
|
||||
|`cudnnGetFilterSizeInBytes` | | 10.1 |
|
||||
|`cudnnTransformFilter` | | 10.1 |
|
||||
|`cudnnDestroyFilterDescriptor` |`hipdnnDestroyFilterDescriptor` |
|
||||
|`cudnnReorderFilterAndBias` | | 10.1 |
|
||||
|`cudnnCreateConvolutionDescriptor` |`hipdnnCreateConvolutionDescriptor` |
|
||||
|`cudnnSetConvolutionMathType` |`hipdnnSetConvolutionMathType` | 8.0 |
|
||||
|`cudnnGetConvolutionMathType` | | 8.0 |
|
||||
|`cudnnSetConvolutionGroupCount` |`hipdnnSetConvolutionGroupCount` | 8.0 |
|
||||
|`cudnnGetConvolutionGroupCount` | | 8.0 |
|
||||
|`cudnnSetConvolutionReorderType` | | 10.1 |
|
||||
|`cudnnGetConvolutionReorderType` | | 10.1 |
|
||||
|`cudnnSetConvolution2dDescriptor` |`hipdnnSetConvolution2dDescriptor` |
|
||||
|`cudnnGetConvolution2dDescriptor` |`hipdnnGetConvolution2dDescriptor` |
|
||||
|`cudnnGetConvolution2dForwardOutputDim` |`hipdnnGetConvolution2dForwardOutputDim` |
|
||||
|`cudnnSetConvolutionNdDescriptor` |`hipdnnSetConvolutionNdDescriptor` |
|
||||
|`cudnnGetConvolutionNdDescriptor` | |
|
||||
|`cudnnGetConvolutionNdForwardOutputDim` | |
|
||||
|`cudnnDestroyConvolutionDescriptor` | |
|
||||
|`cudnnGetConvolutionForwardAlgorithmMaxCount` | | 8.0 |
|
||||
|`cudnnFindConvolutionForwardAlgorithm` |`hipdnnFindConvolutionForwardAlgorithm` |
|
||||
|`cudnnFindConvolutionForwardAlgorithmEx` |`hipdnnFindConvolutionForwardAlgorithmEx` | 7.5 |
|
||||
|`cudnnGetConvolutionForwardAlgorithm` |`hipdnnGetConvolutionForwardAlgorithm` |
|
||||
|`cudnnGetConvolutionForwardAlgorithm_v7` | | 8.0 |
|
||||
|`cudnnGetConvolutionForwardWorkspaceSize` |`hipdnnGetConvolutionForwardWorkspaceSize` |
|
||||
|`cudnnConvolutionForward` |`hipdnnConvolutionForward` |
|
||||
|`cudnnConvolutionBiasActivationForward` | | 7.5 |
|
||||
|`cudnnConvolutionBackwardBias` |`hipdnnConvolutionBackwardBias` |
|
||||
|`cudnnGetConvolutionBackwardFilterAlgorithmMaxCount` | | 8.0 |
|
||||
|`cudnnFindConvolutionBackwardFilterAlgorithm` |`hipdnnFindConvolutionBackwardFilterAlgorithm` |
|
||||
|`cudnnFindConvolutionBackwardFilterAlgorithmEx` |`hipdnnFindConvolutionBackwardFilterAlgorithmEx` | 7.5 |
|
||||
|`cudnnGetConvolutionBackwardFilterAlgorithm` |`hipdnnGetConvolutionBackwardFilterAlgorithm` |
|
||||
|`cudnnGetConvolutionBackwardFilterAlgorithm_v7` | | 8.0 |
|
||||
|`cudnnGetConvolutionBackwardFilterWorkspaceSize` |`hipdnnGetConvolutionBackwardFilterWorkspaceSize`|
|
||||
|`cudnnConvolutionBackwardFilter` |`hipdnnConvolutionBackwardFilter` |
|
||||
|`cudnnGetConvolutionBackwardDataAlgorithmMaxCount` | | 8.0 |
|
||||
|`cudnnFindConvolutionBackwardDataAlgorithm` |`hipdnnFindConvolutionBackwardDataAlgorithm` |
|
||||
|`cudnnFindConvolutionBackwardDataAlgorithmEx` |`hipdnnFindConvolutionBackwardDataAlgorithmEx` | 7.5 |
|
||||
|`cudnnGetConvolutionBackwardDataAlgorithm` |`hipdnnGetConvolutionBackwardDataAlgorithm` |
|
||||
|`cudnnGetConvolutionBackwardDataAlgorithm_v7` | | 8.0 |
|
||||
|`cudnnGetConvolutionBackwardDataWorkspaceSize` |`hipdnnGetConvolutionBackwardDataWorkspaceSize` |
|
||||
|`cudnnConvolutionBackwardData` |`hipdnnConvolutionBackwardData` |
|
||||
|`cudnnIm2Col` | |
|
||||
|`cudnnSoftmaxForward` |`hipdnnSoftmaxForward` |
|
||||
|`cudnnSoftmaxBackward` |`hipdnnSoftmaxBackward` |
|
||||
|`cudnnCreatePoolingDescriptor` |`hipdnnCreatePoolingDescriptor` |
|
||||
|`cudnnSetPooling2dDescriptor` |`hipdnnSetPooling2dDescriptor` |
|
||||
|`cudnnGetPooling2dDescriptor` |`hipdnnGetPooling2dDescriptor` |
|
||||
|`cudnnSetPoolingNdDescriptor` |`hipdnnSetPoolingNdDescriptor` |
|
||||
|`cudnnGetPoolingNdDescriptor` | |
|
||||
|`cudnnGetPoolingNdForwardOutputDim` | |
|
||||
|`cudnnGetPooling2dForwardOutputDim` |`hipdnnGetPooling2dForwardOutputDim` |
|
||||
|`cudnnDestroyPoolingDescriptor` |`hipdnnDestroyPoolingDescriptor` |
|
||||
|`cudnnPoolingForward` |`hipdnnPoolingForward` |
|
||||
|`cudnnPoolingBackward` |`hipdnnPoolingBackward` |
|
||||
|`cudnnCreateActivationDescriptor` |`hipdnnCreateActivationDescriptor` |
|
||||
|`cudnnSetActivationDescriptor` |`hipdnnSetActivationDescriptor` |
|
||||
|`cudnnGetActivationDescriptor` |`hipdnnGetActivationDescriptor` |
|
||||
|`cudnnDestroyActivationDescriptor` |`hipdnnDestroyActivationDescriptor` |
|
||||
|`cudnnActivationForward` |`hipdnnActivationForward` |
|
||||
|`cudnnActivationBackward` |`hipdnnActivationBackward` |
|
||||
|`cudnnCreateLRNDescriptor` |`hipdnnCreateLRNDescriptor` |
|
||||
|`cudnnSetLRNDescriptor` |`hipdnnSetLRNDescriptor` |
|
||||
|`cudnnGetLRNDescriptor` |`hipdnnGetLRNDescriptor` |
|
||||
|`cudnnDestroyLRNDescriptor` |`hipdnnDestroyLRNDescriptor` |
|
||||
|`cudnnLRNCrossChannelForward` |`hipdnnLRNCrossChannelForward` |
|
||||
|`cudnnLRNCrossChannelBackward` |`hipdnnLRNCrossChannelBackward` |
|
||||
|`cudnnDivisiveNormalizationForward` | |
|
||||
|`cudnnDivisiveNormalizationBackward` | |
|
||||
|`cudnnDeriveBNTensorDescriptor` |`hipdnnDeriveBNTensorDescriptor` |
|
||||
|`cudnnBatchNormalizationForwardTraining` |`hipdnnBatchNormalizationForwardTraining` |
|
||||
|`cudnnBatchNormalizationForwardTrainingEx` | | 9.0 |
|
||||
|`cudnnBatchNormalizationForwardInference` |`hipdnnBatchNormalizationForwardInference` |
|
||||
|`cudnnBatchNormalizationBackward` |`hipdnnBatchNormalizationBackward` |
|
||||
|`cudnnBatchNormalizationBackwardEx` | | 9.0 |
|
||||
|`cudnnCreateSpatialTransformerDescriptor` | | 7.5 |
|
||||
|`cudnnSetSpatialTransformerNdDescriptor` | | 7.5 |
|
||||
|`cudnnDestroySpatialTransformerDescriptor` | | 7.5 |
|
||||
|`cudnnSpatialTfGridGeneratorForward` | | 7.5 |
|
||||
|`cudnnSpatialTfGridGeneratorBackward` | | 7.5 |
|
||||
|`cudnnSpatialTfSamplerForward` | | 7.5 |
|
||||
|`cudnnSpatialTfSamplerBackward` | | 7.5 |
|
||||
|`cudnnCreateDropoutDescriptor` |`hipdnnCreateDropoutDescriptor` | 7.5 |
|
||||
|`cudnnDestroyDropoutDescriptor` |`hipdnnDestroyDropoutDescriptor` | 7.5 |
|
||||
|`cudnnDropoutGetStatesSize` |`hipdnnDropoutGetStatesSize` | 7.5 |
|
||||
|`cudnnDropoutGetReserveSpaceSize` | | 7.5 |
|
||||
|`cudnnSetDropoutDescriptor` |`hipdnnSetDropoutDescriptor` | 7.5 |
|
||||
|`cudnnGetDropoutDescriptor` | | 8.0 |
|
||||
|`cudnnRestoreDropoutDescriptor` | | 8.0 |
|
||||
|`cudnnDropoutForward` | | 7.5 |
|
||||
|`cudnnDropoutBackward` | | 7.5 |
|
||||
|`cudnnCreateRNNDescriptor` |`hipdnnCreateRNNDescriptor` | 7.5 |
|
||||
|`cudnnDestroyRNNDescriptor` |`hipdnnDestroyRNNDescriptor` | 7.5 |
|
||||
|`cudnnGetRNNForwardInferenceAlgorithmMaxCount` | | 8.0 |
|
||||
|`cudnnFindRNNForwardInferenceAlgorithmEx` | | 8.0 |
|
||||
|`cudnnGetRNNForwardTrainingAlgorithmMaxCount` | | 8.0 |
|
||||
|`cudnnFindRNNForwardTrainingAlgorithmEx` | | 8.0 |
|
||||
|`cudnnGetRNNBackwardDataAlgorithmMaxCount` | | 8.0 |
|
||||
|`cudnnFindRNNBackwardDataAlgorithmEx` | | 8.0 |
|
||||
|`cudnnGetRNNBackwardWeightsAlgorithmMaxCount` | | 8.0 |
|
||||
|`cudnnFindRNNBackwardWeightsAlgorithmEx` | | 8.0 |
|
||||
|`cudnnCreatePersistentRNNPlan` |`hipdnnCreatePersistentRNNPlan` | 7.5 |
|
||||
|`cudnnSetPersistentRNNPlan` |`hipdnnSetPersistentRNNPlan` | 7.5 |
|
||||
|`cudnnDestroyPersistentRNNPlan` |`hipdnnDestroyPersistentRNNPlan` | 7.5 |
|
||||
|`cudnnSetRNNDescriptor` |`hipdnnSetRNNDescriptor` | 7.5 |
|
||||
|`cudnnGetRNNDescriptor` |`hipdnnGetRNNDescriptor` | 8.0 |
|
||||
|`cudnnSetRNNProjectionLayers` | | 8.0 |
|
||||
|`cudnnGetRNNProjectionLayers` | | 8.0 |
|
||||
|`cudnnSetRNNAlgorithmDescriptor` | | 8.0 |
|
||||
|`cudnnSetRNNMatrixMathType` | | 8.0 |
|
||||
|`cudnnGetRNNMatrixMathType` | | 8.0 |
|
||||
|`cudnnGetRNNWorkspaceSize` |`hipdnnGetRNNWorkspaceSize` | 7.5 |
|
||||
|`cudnnGetRNNTrainingReserveSize` |`hipdnnGetRNNTrainingReserveSize` | 7.5 |
|
||||
|`cudnnGetRNNParamsSize` |`hipdnnGetRNNParamsSize` | 7.5 |
|
||||
|`cudnnGetRNNLinLayerMatrixParams` |`hipdnnGetRNNLinLayerMatrixParams` | 7.5 |
|
||||
|`cudnnGetRNNLinLayerBiasParams` |`hipdnnGetRNNLinLayerBiasParams` | 7.5 |
|
||||
|`cudnnRNNForwardInference` |`hipdnnRNNForwardInference` | 7.5 |
|
||||
|`cudnnRNNForwardInferenceEx` | | 9.0 |
|
||||
|`cudnnRNNForwardTraining` |`hipdnnRNNForwardTraining` | 7.5 |
|
||||
|`cudnnRNNForwardTrainingEx` | | 9.0 |
|
||||
|`cudnnRNNBackwardData` |`hipdnnRNNBackwardData` | 7.5 |
|
||||
|`cudnnRNNBackwardDataEx` | | 9.0 |
|
||||
|`cudnnRNNBackwardWeights` |`hipdnnRNNBackwardWeights` | 7.5 |
|
||||
|`cudnnRNNBackwardWeightsEx` | | 9.0 |
|
||||
|`cudnnSetRNNPaddingMode` | | 9.0 |
|
||||
|`cudnnGetRNNPaddingMode` | | 9.0 |
|
||||
|`cudnnCreateRNNDataDescriptor` | | 9.0 |
|
||||
|`cudnnDestroyRNNDataDescriptor` | | 9.0 |
|
||||
|`cudnnSetRNNDataDescriptor` | | 9.0 |
|
||||
|`cudnnGetRNNDataDescriptor` | | 9.0 |
|
||||
|`cudnnSetRNNBiasMode` | | 9.0 |
|
||||
|`cudnnGetRNNBiasMode` | | 9.0 |
|
||||
|`cudnnCreateCTCLossDescriptor` | | 8.0 |
|
||||
|`cudnnSetCTCLossDescriptor` | | 8.0 |
|
||||
|`cudnnSetCTCLossDescriptorEx` | | 10.1 |
|
||||
|`cudnnGetCTCLossDescriptor` | | 8.0 |
|
||||
|`cudnnGetCTCLossDescriptorEx` | | 10.1 |
|
||||
|`cudnnDestroyCTCLossDescriptor` | | 8.0 |
|
||||
|`cudnnCTCLoss` | | 8.0 |
|
||||
|`cudnnGetCTCLossWorkspaceSize` | | 8.0 |
|
||||
|`cudnnCreateAlgorithmDescriptor` | | 8.0 |
|
||||
|`cudnnSetAlgorithmDescriptor` | | 8.0 |
|
||||
|`cudnnGetAlgorithmDescriptor` | | 8.0 |
|
||||
|`cudnnCopyAlgorithmDescriptor` | | 8.0 |
|
||||
|`cudnnDestroyAlgorithmDescriptor` | | 8.0 |
|
||||
|`cudnnCreateAlgorithmPerformance` | | 8.0 |
|
||||
|`cudnnSetAlgorithmPerformance` | | 8.0 |
|
||||
|`cudnnGetAlgorithmPerformance` | | 8.0 |
|
||||
|`cudnnDestroyAlgorithmPerformance` | | 8.0 |
|
||||
|`cudnnGetAlgorithmSpaceSize` | | 8.0 |
|
||||
|`cudnnSaveAlgorithm` | | 8.0 |
|
||||
|`cudnnRestoreAlgorithm` | | 8.0 |
|
||||
|`cudnnSetRNNDescriptor_v5` |`hipdnnSetRNNDescriptor_v5` | 8.0 |
|
||||
|`cudnnSetRNNDescriptor_v6` |`hipdnnSetRNNDescriptor_v6` | 7.5 |
|
||||
|`cudnnSetCallback` | | 8.0 |
|
||||
|`cudnnGetCallback` | | 8.0 |
|
||||
|`cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize` | | 9.0 |
|
||||
|`cudnnGetBatchNormalizationBackwardExWorkspaceSize` | | 9.0 |
|
||||
|`cudnnGetBatchNormalizationTrainingExReserveSpaceSize` | | 9.0 |
|
||||
|`cudnnRNNSetClip` | | 9.0 |
|
||||
|`cudnnRNNGetClip` | | 9.0 |
|
||||
|`cudnnCreateSeqDataDescriptor` | | 9.0 |
|
||||
|`cudnnDestroySeqDataDescriptor` | | 9.0 |
|
||||
|`cudnnSetSeqDataDescriptor` | | 9.0 |
|
||||
|`cudnnGetSeqDataDescriptor` | | 9.0 |
|
||||
|`cudnnCreateAttnDescriptor` | | 9.0 |
|
||||
|`cudnnDestroyAttnDescriptor` | | 9.0 |
|
||||
|`cudnnSetAttnDescriptor` | | 9.0 |
|
||||
|`cudnnGetAttnDescriptor` | | 9.0 |
|
||||
|`cudnnGetMultiHeadAttnBuffers` | | 9.0 |
|
||||
|`cudnnGetMultiHeadAttnWeights` | | 9.0 |
|
||||
|`cudnnMultiHeadAttnForward` | | 9.0 |
|
||||
|`cudnnMultiHeadAttnBackwardData` | | 9.0 |
|
||||
|`cudnnMultiHeadAttnBackwardWeights` | | 9.0 |
|
||||
|`cudnnCreateFusedOpsConstParamPack` | | 10.1 |
|
||||
|`cudnnDestroyFusedOpsConstParamPack` | | 10.1 |
|
||||
|`cudnnSetFusedOpsConstParamPackAttribute` | | 10.1 |
|
||||
|`cudnnGetFusedOpsConstParamPackAttribute` | | 10.1 |
|
||||
|`cudnnCreateFusedOpsVariantParamPack` | | 10.1 |
|
||||
|`cudnnDestroyFusedOpsVariantParamPack` | | 10.1 |
|
||||
|`cudnnSetFusedOpsVariantParamPackAttribute` | | 10.1 |
|
||||
|`cudnnGetFusedOpsVariantParamPackAttribute` | | 10.1 |
|
||||
|`cudnnCreateFusedOpsPlan` | | 10.1 |
|
||||
|`cudnnDestroyFusedOpsPlan` | | 10.1 |
|
||||
|`cudnnMakeFusedOpsPlan` | | 10.1 |
|
||||
|`cudnnFusedOpsExecute` | | 10.1 |
|
||||
|
||||
\* The CUDA version in which the API first appeared and, optionally, the last version in which it was available before being removed; an empty cell means the API appeared in a version earlier than 7.5.
|
||||
@@ -1,81 +0,0 @@
|
||||
# CUFFT API supported by HIP
|
||||
|
||||
## **1. CUFFT Data types**
|
||||
|
||||
| **type** | **CUDA** |**CUDA version\***| **HIP** |**HIP value** (if differs) |
|
||||
|-------------:|---------------------------------------------------------------|:----------------:|------------------------------------------------------------|---------------------------|
|
||||
| enum |***`cufftResult_t`*** | |***`hipfftResult_t`*** |
|
||||
| enum |***`cufftResult`*** | |***`hipfftResult`*** |
|
||||
| 0x0 |*`CUFFT_SUCCESS`* | |*`HIPFFT_SUCCESS`* | 0 |
|
||||
| 0x1 |*`CUFFT_INVALID_PLAN`* | |*`HIPFFT_INVALID_PLAN`* | 1 |
|
||||
| 0x2 |*`CUFFT_ALLOC_FAILED`* | |*`HIPFFT_ALLOC_FAILED`* | 2 |
|
||||
| 0x3 |*`CUFFT_INVALID_TYPE`* | |*`HIPFFT_INVALID_TYPE`* | 3 |
|
||||
| 0x4 |*`CUFFT_INVALID_VALUE`* | |*`HIPFFT_INVALID_VALUE`* | 4 |
|
||||
| 0x5 |*`CUFFT_INTERNAL_ERROR`* | |*`HIPFFT_INTERNAL_ERROR`* | 5 |
|
||||
| 0x6 |*`CUFFT_EXEC_FAILED`* | |*`HIPFFT_EXEC_FAILED`* | 6 |
|
||||
| 0x7 |*`CUFFT_SETUP_FAILED`* | |*`HIPFFT_SETUP_FAILED`* | 7 |
|
||||
| 0x8 |*`CUFFT_INVALID_SIZE`* | |*`HIPFFT_INVALID_SIZE`* | 8 |
|
||||
| 0x9 |*`CUFFT_UNALIGNED_DATA`* | |*`HIPFFT_UNALIGNED_DATA`* | 9 |
|
||||
| 0xA |*`CUFFT_INCOMPLETE_PARAMETER_LIST`* | |*`HIPFFT_INCOMPLETE_PARAMETER_LIST`* | 10 |
|
||||
| 0xB |*`CUFFT_INVALID_DEVICE`* | |*`HIPFFT_INVALID_DEVICE`* | 11 |
|
||||
| 0xC |*`CUFFT_PARSE_ERROR`* | |*`HIPFFT_PARSE_ERROR`* | 12 |
|
||||
| 0xD |*`CUFFT_NO_WORKSPACE`* | |*`HIPFFT_NO_WORKSPACE`* | 13 |
|
||||
| 0xE |*`CUFFT_NOT_IMPLEMENTED`* | |*`HIPFFT_NOT_IMPLEMENTED`* | 14 |
|
||||
| 0xF |*`CUFFT_LICENSE_ERROR`* | | |
|
||||
| 0x10 |*`CUFFT_NOT_SUPPORTED`* | 8.0 |*`HIPFFT_NOT_SUPPORTED`* | 16 |
|
||||
| float |***`cufftReal`*** | |***`hipfftReal`*** |
|
||||
| double |***`cufftDoubleReal`*** | |***`hipfftDoubleReal`*** |
|
||||
| float2 |***`cufftComplex`*** | |***`hipfftComplex`*** |
|
||||
| double2 |***`cufftDoubleComplex`*** | |***`hipfftDoubleComplex`*** |
|
||||
| define |`CUFFT_FORWARD` | |`HIPFFT_FORWARD` |
|
||||
| define |`CUFFT_INVERSE` | |`HIPFFT_BACKWARD` |
|
||||
| enum |***`cufftType_t`*** | |***`hipfftType_t`*** |
|
||||
| enum |***`cufftType`*** | |***`hipfftType`*** |
|
||||
| 0x2a |*`CUFFT_R2C`* | |*`HIPFFT_R2C`* |
|
||||
| 0x2c |*`CUFFT_C2R`* | |*`HIPFFT_C2R`* |
|
||||
| 0x29 |*`CUFFT_C2C`* | |*`HIPFFT_C2C`* |
|
||||
| 0x6a |*`CUFFT_D2Z`* | |*`HIPFFT_D2Z`* |
|
||||
| 0x6c |*`CUFFT_Z2D`* | |*`HIPFFT_Z2D`* |
|
||||
| 0x69 |*`CUFFT_Z2Z`* | |*`HIPFFT_Z2Z`* |
|
||||
| enum |***`cufftCompatibility_t`*** | | |
|
||||
| enum |***`cufftCompatibility`*** | | |
|
||||
| 0x01 |*`CUFFT_COMPATIBILITY_FFTW_PADDING`* | | |
|
||||
| define |`CUFFT_COMPATIBILITY_DEFAULT` | | |
|
||||
| int |***`cufftHandle`*** | |***`hipfftHandle`*** |
|
||||
|
||||
## **2. CUFFT API functions**
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cufftPlan1d` |`hipfftPlan1d` |
|
||||
|`cufftPlan2d` |`hipfftPlan2d` |
|
||||
|`cufftPlan3d` |`hipfftPlan3d` |
|
||||
|`cufftPlanMany` |`hipfftPlanMany` |
|
||||
|`cufftMakePlan1d` |`hipfftMakePlan1d` |
|
||||
|`cufftMakePlan2d` |`hipfftMakePlan2d` |
|
||||
|`cufftMakePlan3d` |`hipfftMakePlan3d` |
|
||||
|`cufftMakePlanMany` |`hipfftMakePlanMany` |
|
||||
|`cufftMakePlanMany64` |`hipfftMakePlanMany64` | 7.5 |
|
||||
|`cufftGetSizeMany64` |`hipfftGetSizeMany64` | 7.5 |
|
||||
|`cufftEstimate1d` |`hipfftEstimate1d` |
|
||||
|`cufftEstimate2d` |`hipfftEstimate2d` |
|
||||
|`cufftEstimate3d` |`hipfftEstimate3d` |
|
||||
|`cufftEstimateMany` |`hipfftEstimateMany` |
|
||||
|`cufftCreate` |`hipfftCreate` |
|
||||
|`cufftGetSize1d` |`hipfftGetSize1d` |
|
||||
|`cufftGetSize2d` |`hipfftGetSize2d` |
|
||||
|`cufftGetSize3d` |`hipfftGetSize3d` |
|
||||
|`cufftGetSizeMany` |`hipfftGetSizeMany` |
|
||||
|`cufftGetSize` |`hipfftGetSize` |
|
||||
|`cufftSetWorkArea` |`hipfftSetWorkArea` |
|
||||
|`cufftSetAutoAllocation` |`hipfftSetAutoAllocation` |
|
||||
|`cufftExecC2C` |`hipfftExecC2C` |
|
||||
|`cufftExecR2C` |`hipfftExecR2C` |
|
||||
|`cufftExecC2R` |`hipfftExecC2R` |
|
||||
|`cufftExecZ2Z` |`hipfftExecZ2Z` |
|
||||
|`cufftExecD2Z` |`hipfftExecD2Z` |
|
||||
|`cufftExecZ2D` |`hipfftExecZ2D` |
|
||||
|`cufftSetStream` |`hipfftSetStream` |
|
||||
|`cufftDestroy` |`hipfftDestroy` |
|
||||
|`cufftGetVersion` |`hipfftGetVersion` |
|
||||
|`cufftGetProperty` | | 8.0 |
|
||||
@@ -1,172 +0,0 @@
|
||||
# CURAND API supported by HIP
|
||||
|
||||
## **1. CURAND Data types**
|
||||
|
||||
| **type** | **CUDA** |**CUDA version\***| **HIP** | **HIP value** (if differs) |
|
||||
|-------------:|---------------------------------------------------------------|:----------------:|------------------------------------------------------------|----------------------------|
|
||||
| define |`CURAND_VER_MAJOR` | 10.1 Update 2 | |
|
||||
| define |`CURAND_VER_MINOR` | 10.1 Update 2 | |
|
||||
| define |`CURAND_VER_PATCH` | 10.1 Update 2 | |
|
||||
| define |`CURAND_VER_BUILD` | 10.1 Update 2 | |
|
||||
| define |`CURAND_VERSION` | 10.1 Update 2 | |
|
||||
| enum |***`curandStatus`*** | |***`hiprandStatus`*** |
|
||||
| enum |***`curandStatus_t`*** | |***`hiprandStatus_t`*** |
|
||||
| 0 |*`CURAND_STATUS_SUCCESS`* | |*`HIPRAND_STATUS_SUCCESS`* |
|
||||
| 100 |*`CURAND_STATUS_VERSION_MISMATCH`* | |*`HIPRAND_STATUS_VERSION_MISMATCH`* |
|
||||
| 101 |*`CURAND_STATUS_NOT_INITIALIZED`* | |*`HIPRAND_STATUS_NOT_INITIALIZED`* |
|
||||
| 102 |*`CURAND_STATUS_ALLOCATION_FAILED`* | |*`HIPRAND_STATUS_ALLOCATION_FAILED`* |
|
||||
| 103 |*`CURAND_STATUS_TYPE_ERROR`* | |*`HIPRAND_STATUS_TYPE_ERROR`* |
|
||||
| 104 |*`CURAND_STATUS_OUT_OF_RANGE`* | |*`HIPRAND_STATUS_OUT_OF_RANGE`* |
|
||||
| 105 |*`CURAND_STATUS_LENGTH_NOT_MULTIPLE`* | |*`HIPRAND_STATUS_LENGTH_NOT_MULTIPLE`* |
|
||||
| 106 |*`CURAND_STATUS_DOUBLE_PRECISION_REQUIRED`* | |*`HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED`* |
|
||||
| 201 |*`CURAND_STATUS_LAUNCH_FAILURE`* | |*`HIPRAND_STATUS_LAUNCH_FAILURE`* |
|
||||
| 202 |*`CURAND_STATUS_PREEXISTING_FAILURE`* | |*`HIPRAND_STATUS_PREEXISTING_FAILURE`* |
|
||||
| 203 |*`CURAND_STATUS_INITIALIZATION_FAILED`* | |*`HIPRAND_STATUS_INITIALIZATION_FAILED`* |
|
||||
| 204 |*`CURAND_STATUS_ARCH_MISMATCH`* | |*`HIPRAND_STATUS_ARCH_MISMATCH`* |
|
||||
| 999 |*`CURAND_STATUS_INTERNAL_ERROR`* | |*`HIPRAND_STATUS_INTERNAL_ERROR`* |
|
||||
| enum |***`curandRngType`*** | |***`hiprandRngType`*** |
|
||||
| enum |***`curandRngType_t`*** | |***`hiprandRngType_t`*** |
|
||||
| 0 |*`CURAND_RNG_TEST`* | |*`HIPRAND_RNG_TEST`* |
|
||||
| 100 |*`CURAND_RNG_PSEUDO_DEFAULT`* | |*`HIPRAND_RNG_PSEUDO_DEFAULT`* | 400 |
|
||||
| 101 |*`CURAND_RNG_PSEUDO_XORWOW`* | |*`HIPRAND_RNG_PSEUDO_XORWOW`* | 401 |
|
||||
| 121 |*`CURAND_RNG_PSEUDO_MRG32K3A`* | |*`HIPRAND_RNG_PSEUDO_MRG32K3A`* | 402 |
|
||||
| 141 |*`CURAND_RNG_PSEUDO_MTGP32`* | |*`HIPRAND_RNG_PSEUDO_MTGP32`* | 403 |
|
||||
| 142 |*`CURAND_RNG_PSEUDO_MT19937`* | |*`HIPRAND_RNG_PSEUDO_MT19937`* | 404 |
|
||||
| 161 |*`CURAND_RNG_PSEUDO_PHILOX4_32_10`* | |*`HIPRAND_RNG_PSEUDO_PHILOX4_32_10`* | 405 |
|
||||
| 200 |*`CURAND_RNG_QUASI_DEFAULT`* | |*`HIPRAND_RNG_QUASI_DEFAULT`* | 500 |
|
||||
| 201 |*`CURAND_RNG_QUASI_SOBOL32`* | |*`HIPRAND_RNG_QUASI_SOBOL32`* | 501 |
|
||||
| 202 |*`CURAND_RNG_QUASI_SCRAMBLED_SOBOL32`* | |*`HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL32`* | 502 |
|
||||
| 203 |*`CURAND_RNG_QUASI_SOBOL64`* | |*`HIPRAND_RNG_QUASI_SOBOL64`* | 503 |
|
||||
| 204 |*`CURAND_RNG_QUASI_SCRAMBLED_SOBOL64`* | |*`HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL64`* | 504 |
|
||||
| enum |***`curandOrdering`*** | | |
|
||||
| enum |***`curandOrdering_t`*** | | |
|
||||
| 100 |*`CURAND_ORDERING_PSEUDO_BEST`* | | |
|
||||
| 101 |*`CURAND_ORDERING_PSEUDO_DEFAULT`* | | |
|
||||
| 102 |*`CURAND_ORDERING_PSEUDO_SEEDED`* | | |
|
||||
| 201 |*`CURAND_ORDERING_QUASI_DEFAULT`* | | |
|
||||
| enum |***`curandDirectionVectorSet`*** | | |
|
||||
| enum |***`curandDirectionVectorSet_t`*** | | |
|
||||
| 101 |*`CURAND_DIRECTION_VECTORS_32_JOEKUO6`* | | |
|
||||
| 102 |*`CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6`* | | |
|
||||
| 103 |*`CURAND_DIRECTION_VECTORS_64_JOEKUO6`* | | |
|
||||
| 104 |*`CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6`* | | |
|
||||
| uint | `curandDirectionVectors32_t` | | `hiprandDirectionVectors32_t` |
|
||||
| uint | `curandDirectionVectors64_t` | | |
|
||||
| struct | `curandGenerator_st` | | `hiprandGenerator_st` |
|
||||
| struct* | `curandGenerator_t` | | `hiprandGenerator_t` |
|
||||
| double | `curandDistribution_st` | | |
|
||||
| double | `curandHistogramM2V_st` | | |
|
||||
| double* | `curandDistribution_t` | | |
|
||||
| double* | `curandHistogramM2V_t` | | |
|
||||
| struct | `curandDistributionShift_st` | | |
|
||||
| struct* | `curandDistributionShift_t` | | |
|
||||
| struct | `curandDistributionM2Shift_st` | | |
|
||||
| struct* | `curandDistributionM2Shift_t` | | |
|
||||
| struct | `curandHistogramM2_st` | | |
|
||||
| struct* | `curandHistogramM2_t` | | |
|
||||
| uint | `curandHistogramM2K_st` | | |
|
||||
| uint* | `curandHistogramM2K_t` | | |
|
||||
| struct | `curandDiscreteDistribution_st` | | `hiprandDiscreteDistribution_st` |
|
||||
| struct* | `curandDiscreteDistribution_t` | | `hiprandDiscreteDistribution_t` |
|
||||
| enum |***`curandMethod`*** | | |
|
||||
| enum |***`curandMethod_t`*** | | |
|
||||
| 0 |*`CURAND_CHOOSE_BEST`* | | |
|
||||
| 1 |*`CURAND_ITR`* | | |
|
||||
| 2 |*`CURAND_KNUTH`* | | |
|
||||
| 3 |*`CURAND_HITR`* | | |
|
||||
| 4 |*`CURAND_M1`* | | |
|
||||
| 5 |*`CURAND_M2`* | | |
|
||||
| 6 |*`CURAND_BINARY_SEARCH`* | | |
|
||||
| 7 |*`CURAND_DISCRETE_GAUSS`* | | |
|
||||
| 8 |*`CURAND_REJECTION`* | | |
|
||||
| 9 |*`CURAND_DEVICE_API`* | | |
|
||||
| 10 |*`CURAND_FAST_REJECTION`* | | |
|
||||
| 11 |*`CURAND_3RD`* | | |
|
||||
| 12 |*`CURAND_DEFINITION`* | | |
|
||||
| 13 |*`CURAND_POISSON`* | | |
|
||||
| struct | `curandStateMtgp32` | | `hiprandStateMtgp32` |
|
||||
| typedef | `curandStateMtgp32_t` | | `hiprandStateMtgp32_t` |
|
||||
| struct | `curandStateScrambledSobol64` | | |
|
||||
| typedef | `curandStateScrambledSobol64_t` | | |
|
||||
| struct | `curandStateSobol64` | | |
|
||||
| typedef | `curandStateSobol64_t` | | |
|
||||
| struct | `curandStateScrambledSobol32` | | |
|
||||
| typedef | `curandStateScrambledSobol32_t` | | |
|
||||
| struct | `curandStateSobol32` | | `hiprandStateSobol32` |
|
||||
| typedef | `curandStateSobol32_t` | | `hiprandStateSobol32_t` |
|
||||
| struct | `curandStateMRG32k3a` | | `hiprandStateMRG32k3a` |
|
||||
| typedef | `curandStateMRG32k3a_t` | | `hiprandStateMRG32k3a_t` |
|
||||
| struct | `curandStatePhilox4_32_10` | | `hiprandStatePhilox4_32_10` |
|
||||
| typedef | `curandStatePhilox4_32_10_t` | | `hiprandStatePhilox4_32_10_t` |
|
||||
| struct | `curandStateXORWOW` | | `hiprandStateXORWOW` |
|
||||
| typedef | `curandStateXORWOW_t` | | `hiprandStateXORWOW_t` |
|
||||
| struct | `curandState` | | `hiprandState` |
|
||||
| typedef | `curandState_t` | | `hiprandState_t` |
|
||||
|
||||
## **2. Host API Functions**
|
||||
|
||||
| **CUDA** | **HIP** |
|
||||
|-----------------------------------------------------------|--------------------------------------------|
|
||||
| `curandCreateGenerator` | `hiprandCreateGenerator` |
|
||||
| `curandCreateGeneratorHost` | `hiprandCreateGeneratorHost` |
|
||||
| `curandCreatePoissonDistribution` | `hiprandCreatePoissonDistribution` |
|
||||
| `curandDestroyDistribution` | `hiprandDestroyDistribution` |
|
||||
| `curandDestroyGenerator` | `hiprandDestroyGenerator` |
|
||||
| `curandGenerate` | `hiprandGenerate` |
|
||||
| `curandGenerateLogNormal` | `hiprandGenerateLogNormal` |
|
||||
| `curandGenerateLogNormalDouble` | `hiprandGenerateLogNormalDouble` |
|
||||
| `curandGenerateLongLong` | |
|
||||
| `curandGenerateNormal` | `hiprandGenerateNormal` |
|
||||
| `curandGenerateNormalDouble` | `hiprandGenerateNormalDouble` |
|
||||
| `curandGeneratePoisson` | `hiprandGeneratePoisson` |
|
||||
| `curandGenerateSeeds` | `hiprandGenerateSeeds` |
|
||||
| `curandGenerateUniform` | `hiprandGenerateUniform` |
|
||||
| `curandGenerateUniformDouble` | `hiprandGenerateUniformDouble` |
|
||||
| `curandGetDirectionVectors32` | |
|
||||
| `curandGetDirectionVectors64` | |
|
||||
| `curandGetProperty` | |
|
||||
| `curandGetScrambleConstants32` | |
|
||||
| `curandGetScrambleConstants64` | |
|
||||
| `curandGetVersion` | `hiprandGetVersion` |
|
||||
| `curandSetGeneratorOffset` | `hiprandSetGeneratorOffset` |
|
||||
| `curandSetGeneratorOrdering` | |
|
||||
| `curandSetPseudoRandomGeneratorSeed` | `hiprandSetPseudoRandomGeneratorSeed` |
|
||||
| `curandSetQuasiRandomGeneratorDimensions` | `hiprandSetQuasiRandomGeneratorDimensions` |
|
||||
| `curandSetStream` | `hiprandSetStream` |
|
||||
| `curandMakeMTGP32Constants` | `hiprandMakeMTGP32Constants` |
|
||||
| `curandMakeMTGP32KernelState` | `hiprandMakeMTGP32KernelState` |
|
||||
|
||||
## **3. Device API Functions**
|
||||
|
||||
| **CUDA** | **HIP** |
|
||||
|-----------------------------------------------------------|--------------------------------------------|
|
||||
| `curand` | `hiprand` |
|
||||
| `curand_init` | `hiprand_init` |
|
||||
| `curand_log_normal` | `hiprand_log_normal` |
|
||||
| `curand_log_normal_double` | `hiprand_log_normal_double` |
|
||||
| `curand_log_normal2` | `hiprand_log_normal2` |
|
||||
| `curand_log_normal2_double` | `hiprand_log_normal2_double` |
|
||||
| `curand_log_normal4` | `hiprand_log_normal4` |
|
||||
| `curand_log_normal4_double` | `hiprand_log_normal4_double` |
|
||||
| `curand_mtgp32_single` | |
|
||||
| `curand_mtgp32_single_specific` | |
|
||||
| `curand_mtgp32_specific` | |
|
||||
| `curand_normal` | `hiprand_normal` |
|
||||
| `curand_normal_double` | `hiprand_normal_double` |
|
||||
| `curand_normal2` | `hiprand_normal2` |
|
||||
| `curand_normal2_double` | `hiprand_normal2_double` |
|
||||
| `curand_normal4` | `hiprand_normal4` |
|
||||
| `curand_normal4_double` | `hiprand_normal4_double` |
|
||||
| `curand_uniform` | `hiprand_uniform` |
|
||||
| `curand_uniform_double` | `hiprand_uniform_double` |
|
||||
| `curand_uniform2_double` | `hiprand_uniform2_double` |
|
||||
| `curand_uniform4` | `hiprand_uniform4` |
|
||||
| `curand_uniform4_double` | `hiprand_uniform4_double` |
|
||||
| `curand_discrete` | `hiprand_discrete` |
|
||||
| `curand_discrete4` | `hiprand_discrete4` |
|
||||
| `curand_poisson` | `hiprand_poisson` |
|
||||
| `curand_poisson4` | `hiprand_poisson4` |
|
||||
| `curand_Philox4x32_10` | |
|
||||
| `skipahead` | `skipahead` |
|
||||
| `skipahead_sequence` | `skipahead_sequence` |
|
||||
| `skipahead_subsequence` | `skipahead_subsequence` |
|
||||
@@ -1,797 +0,0 @@
|
||||
# CUSPARSE API supported by HIP
|
||||
|
||||
## **1. cuSPARSE Data types**
|
||||
|
||||
| **type** | **CUDA** |**CUDA version\***| **HIP** |
|
||||
|-------------:|---------------------------------------------------------------|:-----------------|------------------------------------------------------------|
|
||||
| define |`CUSPARSE_VER_MAJOR` | 10.1 Update 2 | |
|
||||
| define |`CUSPARSE_VER_MINOR` | 10.1 Update 2 | |
|
||||
| define |`CUSPARSE_VER_PATCH` | 10.1 Update 2 | |
|
||||
| define |`CUSPARSE_VER_BUILD` | 10.1 Update 2 | |
|
||||
| define |`CUSPARSE_VERSION` | 10.1 Update 2 | |
|
||||
| enum |***`cusparseAction_t`*** | |***`hipsparseAction_t`*** |
|
||||
| 0 |*`CUSPARSE_ACTION_SYMBOLIC`* | |*`HIPSPARSE_ACTION_SYMBOLIC`* |
|
||||
| 1 |*`CUSPARSE_ACTION_NUMERIC`* | |*`HIPSPARSE_ACTION_NUMERIC`* |
|
||||
| enum |***`cusparseDirection_t`*** | |***`hipsparseDirection_t`*** |
|
||||
| 0 |*`CUSPARSE_DIRECTION_ROW`* | |*`HIPSPARSE_DIRECTION_ROW`* |
|
||||
| 1 |*`CUSPARSE_DIRECTION_COLUMN`* | |*`HIPSPARSE_DIRECTION_COLUMN`* |
|
||||
| enum |***`cusparseHybPartition_t`*** | |***`hipsparseHybPartition_t`*** |
|
||||
| 0 |*`CUSPARSE_HYB_PARTITION_AUTO`* | |*`HIPSPARSE_HYB_PARTITION_AUTO`* |
|
||||
| 1 |*`CUSPARSE_HYB_PARTITION_USER`* | |*`HIPSPARSE_HYB_PARTITION_USER`* |
|
||||
| 2 |*`CUSPARSE_HYB_PARTITION_MAX`* | |*`HIPSPARSE_HYB_PARTITION_MAX`* |
|
||||
| enum |***`cusparseDiagType_t`*** | |***`hipsparseDiagType_t`*** |
|
||||
| 0 |*`CUSPARSE_DIAG_TYPE_NON_UNIT`* | |*`HIPSPARSE_DIAG_TYPE_NON_UNIT`* |
|
||||
| 1 |*`CUSPARSE_DIAG_TYPE_UNIT`* | |*`HIPSPARSE_DIAG_TYPE_UNIT`* |
|
||||
| enum |***`cusparseFillMode_t`*** | |***`hipsparseFillMode_t`*** |
|
||||
| 0 |*`CUSPARSE_FILL_MODE_LOWER`* | |*`HIPSPARSE_FILL_MODE_LOWER`* |
|
||||
| 1 |*`CUSPARSE_FILL_MODE_UPPER`* | |*`HIPSPARSE_FILL_MODE_UPPER`* |
|
||||
| enum |***`cusparseIndexBase_t`*** | |***`hipsparseIndexBase_t`*** |
|
||||
| 0 |*`CUSPARSE_INDEX_BASE_ZERO`* | |*`HIPSPARSE_INDEX_BASE_ZERO`* |
|
||||
| 1 |*`CUSPARSE_INDEX_BASE_ONE`* | |*`HIPSPARSE_INDEX_BASE_ONE`* |
|
||||
| enum |***`cusparseMatrixType_t`*** | |***`hipsparseMatrixType_t`*** |
|
||||
| 0 |*`CUSPARSE_MATRIX_TYPE_GENERAL`* | |*`HIPSPARSE_MATRIX_TYPE_GENERAL`* |
|
||||
| 1 |*`CUSPARSE_MATRIX_TYPE_SYMMETRIC`* | |*`HIPSPARSE_MATRIX_TYPE_SYMMETRIC`* |
|
||||
| 2 |*`CUSPARSE_MATRIX_TYPE_HERMITIAN`* | |*`HIPSPARSE_MATRIX_TYPE_HERMITIAN`* |
|
||||
| 3 |*`CUSPARSE_MATRIX_TYPE_TRIANGULAR`* | |*`HIPSPARSE_MATRIX_TYPE_TRIANGULAR`* |
|
||||
| enum |***`cusparseOperation_t`*** | |***`hipsparseOperation_t`*** |
|
||||
| 0 |*`CUSPARSE_OPERATION_NON_TRANSPOSE`* | |*`HIPSPARSE_OPERATION_NON_TRANSPOSE`* |
|
||||
| 1 |*`CUSPARSE_OPERATION_TRANSPOSE`* | |*`HIPSPARSE_OPERATION_TRANSPOSE`* |
|
||||
| 2 |*`CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE`* | |*`HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE`* |
|
||||
| enum |***`cusparsePointerMode_t`*** | |***`hipsparsePointerMode_t`*** |
|
||||
| 0 |*`CUSPARSE_POINTER_MODE_HOST`* | |*`HIPSPARSE_POINTER_MODE_HOST`* |
|
||||
| 1 |*`CUSPARSE_POINTER_MODE_DEVICE`* | |*`HIPSPARSE_POINTER_MODE_DEVICE`* |
|
||||
| enum |***`cusparseAlgMode_t`*** | 8.0 | |
|
||||
| 0 |*`CUSPARSE_ALG0`* | 8.0 | |
|
||||
| 1 |*`CUSPARSE_ALG1`* | 8.0 | |
|
||||
| 0 |*`CUSPARSE_ALG_NAIVE`* | 9.2 | |
|
||||
| 1 |*`CUSPARSE_ALG_MERGE_PATH`* | 9.2 | |
|
||||
| enum |***`cusparseSolvePolicy_t`*** | |***`hipsparseSolvePolicy_t`*** |
|
||||
| 0 |*`CUSPARSE_SOLVE_POLICY_NO_LEVEL`* | |*`HIPSPARSE_SOLVE_POLICY_NO_LEVEL`* |
|
||||
| 1 |*`CUSPARSE_SOLVE_POLICY_USE_LEVEL`* | |*`HIPSPARSE_SOLVE_POLICY_USE_LEVEL`* |
|
||||
| enum |***`cusparseStatus_t`*** | |***`hipsparseStatus_t`*** |
|
||||
| 0 |*`CUSPARSE_STATUS_SUCCESS`* | |*`HIPSPARSE_STATUS_SUCCESS`* |
|
||||
| 1 |*`CUSPARSE_STATUS_NOT_INITIALIZED`* | |*`HIPSPARSE_STATUS_NOT_INITIALIZED`* |
|
||||
| 2 |*`CUSPARSE_STATUS_ALLOC_FAILED`* | |*`HIPSPARSE_STATUS_ALLOC_FAILED`* |
|
||||
| 3 |*`CUSPARSE_STATUS_INVALID_VALUE`* | |*`HIPSPARSE_STATUS_INVALID_VALUE`* |
|
||||
| 4 |*`CUSPARSE_STATUS_ARCH_MISMATCH`* | |*`HIPSPARSE_STATUS_ARCH_MISMATCH`* |
|
||||
| 5 |*`CUSPARSE_STATUS_MAPPING_ERROR`* | |*`HIPSPARSE_STATUS_MAPPING_ERROR`* |
|
||||
| 6 |*`CUSPARSE_STATUS_EXECUTION_FAILED`* | |*`HIPSPARSE_STATUS_EXECUTION_FAILED`* |
|
||||
| 7 |*`CUSPARSE_STATUS_INTERNAL_ERROR`* | |*`HIPSPARSE_STATUS_INTERNAL_ERROR`* |
|
||||
| 8 |*`CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED`* | |*`HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED`* |
|
||||
| 9 |*`CUSPARSE_STATUS_ZERO_PIVOT`* | |*`HIPSPARSE_STATUS_ZERO_PIVOT`* |
|
||||
| struct |`cusparseContext` | | |
|
||||
| typedef |`cusparseHandle_t` | |`hipsparseHandle_t` |
|
||||
| struct |`cusparseHybMat` | | |
|
||||
| typedef |`cusparseHybMat_t` | |`hipsparseHybMat_t` |
|
||||
| struct |`cusparseMatDescr` | | |
|
||||
| typedef |`cusparseMatDescr_t` | |`hipsparseMatDescr_t` |
|
||||
| struct |`cusparseSolveAnalysisInfo` | | |
|
||||
| typedef |`cusparseSolveAnalysisInfo_t` | | |
|
||||
| struct |`csrsv2Info` | | |
|
||||
| typedef |`csrsv2Info_t` | |`csrsv2Info_t` |
|
||||
| struct |`csrsm2Info` | 9.2 |`csrsm2Info` |
|
||||
| typedef |`csrsm2Info_t` | |`csrsm2Info_t` |
|
||||
| struct |`bsrsv2Info` | | |
|
||||
| typedef |`bsrsv2Info_t` | | |
|
||||
| struct |`bsrsm2Info` | | |
|
||||
| typedef |`bsrsm2Info_t` | | |
|
||||
| struct |`bsric02Info` | | |
|
||||
| typedef |`bsric02Info_t` | | |
|
||||
| struct |`csrilu02Info` | | |
|
||||
| typedef |`csrilu02Info_t` | |`csrilu02Info_t` |
|
||||
| struct |`bsrilu02Info` | | |
|
||||
| typedef |`bsrilu02Info_t` | | |
|
||||
| struct |`csru2csrInfo` | | |
|
||||
| typedef |`csru2csrInfo_t` | | |
|
||||
| struct |`csrgemm2Info` | |`csrgemm2Info` |
|
||||
| typedef |`csrgemm2Info_t` | |`csrgemm2Info_t` |
|
||||
| struct |`cusparseColorInfo` | | |
|
||||
| typedef |`cusparseColorInfo_t` | | |
|
||||
| struct |`pruneInfo` | 9.0 | |
|
||||
| typedef |`pruneInfo_t` | 9.0 | |
|
||||
| enum |***`cusparseCsr2CscAlg_t`*** | 10.1 | |
|
||||
| 1 |*`CUSPARSE_CSR2CSC_ALG1`* | 10.1 | |
|
||||
| 2 |*`CUSPARSE_CSR2CSC_ALG2`* | 10.1 | |
|
||||
| enum |***`cusparseFormat_t`*** | 10.1 | |
|
||||
| 1 |*`CUSPARSE_FORMAT_CSR`* | 10.1 | |
|
||||
| 2 |*`CUSPARSE_FORMAT_CSC`* | 10.1 | |
|
||||
| 3 |*`CUSPARSE_FORMAT_COO`* | 10.1 | |
|
||||
| 4 |*`CUSPARSE_FORMAT_COO_AOS`* | 10.1 | |
|
||||
| enum |***`cusparseOrder_t`*** | 10.1 | |
|
||||
| 1 |*`CUSPARSE_ORDER_COL`* | 10.1 | |
|
||||
| 2 |*`CUSPARSE_ORDER_ROW`* | 10.1 | |
|
||||
| enum |***`cusparseSpMVAlg_t`*** | 10.1 | |
|
||||
| 0 |*`CUSPARSE_MV_ALG_DEFAULT`* | 10.1 | |
|
||||
| 1 |*`CUSPARSE_COOMV_ALG`* | 10.1 | |
|
||||
| 2 |*`CUSPARSE_CSRMV_ALG1`* | 10.1 | |
|
||||
| 3 |*`CUSPARSE_CSRMV_ALG2`* | 10.1 | |
|
||||
| enum |***`cusparseSpMMAlg_t`*** | 10.1 | |
|
||||
| 0 |*`CUSPARSE_MM_ALG_DEFAULT`* | 10.1 | |
|
||||
| 1 |*`CUSPARSE_COOMM_ALG1`* | 10.1 | |
|
||||
| 2 |*`CUSPARSE_COOMM_ALG2`* | 10.1 | |
|
||||
| 3 |*`CUSPARSE_COOMM_ALG3`* | 10.1 | |
|
||||
| 4 |*`CUSPARSE_CSRMM_ALG1`* | 10.1 | |
|
||||
| enum |***`cusparseIndexType_t`*** | 10.1 | |
|
||||
| 1 |*`CUSPARSE_INDEX_16U`* | 10.1 | |
|
||||
| 2 |*`CUSPARSE_INDEX_32I`* | 10.1 | |
|
||||
| 3 |*`CUSPARSE_INDEX_64I`* | 10.1 | |
|
||||
| struct |`cusparseSpMatDescr` | 10.1 | |
|
||||
| typedef |`cusparseSpMatDescr_t` | 10.1 | |
|
||||
| struct |`cusparseDnMatDescr` | 10.1 | |
|
||||
| typedef |`cusparseDnMatDescr_t` | 10.1 | |
|
||||
| struct |`cusparseSpVecDescr` | 10.1 | |
|
||||
| typedef |`cusparseSpVecDescr_t` | 10.1 | |
|
||||
| struct |`cusparseDnVecDescr` | 10.1 | |
|
||||
| typedef |`cusparseDnVecDescr_t` | 10.1 | |
|
||||
|
||||
## **2. cuSPARSE Helper Function Reference**
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cusparseCreate` |`hipsparseCreate` |
|
||||
|`cusparseCreateSolveAnalysisInfo` | |
|
||||
|`cusparseCreateHybMat` |`hipsparseCreateHybMat` |
|
||||
|`cusparseCreateMatDescr` |`hipsparseCreateMatDescr` |
|
||||
|`cusparseDestroy` |`hipsparseDestroy` |
|
||||
|`cusparseDestroySolveAnalysisInfo` | |
|
||||
|`cusparseDestroyHybMat` |`hipsparseDestroyHybMat` |
|
||||
|`cusparseDestroyMatDescr` |`hipsparseDestroyMatDescr` |
|
||||
|`cusparseGetLevelInfo` | |
|
||||
|`cusparseGetMatDiagType` |`hipsparseGetMatDiagType` |
|
||||
|`cusparseGetMatFillMode` |`hipsparseGetMatFillMode` |
|
||||
|`cusparseGetMatIndexBase` |`hipsparseGetMatIndexBase` |
|
||||
|`cusparseGetMatType` |`hipsparseGetMatType` |
|
||||
|`cusparseGetPointerMode` |`hipsparseGetPointerMode` |
|
||||
|`cusparseGetVersion` |`hipsparseGetVersion` |
|
||||
|`cusparseSetMatDiagType` |`hipsparseSetMatDiagType` |
|
||||
|`cusparseSetMatFillMode` |`hipsparseSetMatFillMode` |
|
||||
|`cusparseSetMatType` |`hipsparseSetMatType` |
|
||||
|`cusparseSetPointerMode` |`hipsparseSetPointerMode` |
|
||||
|`cusparseSetStream` |`hipsparseSetStream` |
|
||||
|`cusparseGetStream` |`hipsparseGetStream` | 8.0 |
|
||||
|`cusparseCreateCsrsv2Info` |`hipsparseCreateCsrsv2Info` |
|
||||
|`cusparseDestroyCsrsv2Info` |`hipsparseDestroyCsrsv2Info` |
|
||||
|`cusparseCreateCsrsm2Info` |`hipsparseCreateCsrsm2Info` | 9.2 |
|
||||
|`cusparseDestroyCsrsm2Info` |`hipsparseDestroyCsrsm2Info` | 9.2 |
|
||||
|`cusparseCreateCsric02Info` | |
|
||||
|`cusparseDestroyCsric02Info` | |
|
||||
|`cusparseCreateCsrilu02Info` |`hipsparseCreateCsrilu02Info` |
|
||||
|`cusparseDestroyCsrilu02Info` |`hipsparseDestroyCsrilu02Info` |
|
||||
|`cusparseCreateBsrsv2Info` | |
|
||||
|`cusparseDestroyBsrsv2Info` | |
|
||||
|`cusparseCreateBsrsm2Info` | |
|
||||
|`cusparseDestroyBsrsm2Info` | |
|
||||
|`cusparseCreateBsric02Info` | |
|
||||
|`cusparseDestroyBsric02Info` | |
|
||||
|`cusparseCreateBsrilu02Info` | |
|
||||
|`cusparseDestroyBsrilu02Info` | |
|
||||
|`cusparseCreateCsrgemm2Info` |`hipsparseCreateCsrgemm2Info` |
|
||||
|`cusparseDestroyCsrgemm2Info` |`hipsparseDestroyCsrgemm2Info` |
|
||||
|`cusparseCreatePruneInfo` | | 9.0 |
|
||||
|`cusparseDestroyPruneInfo` | | 9.0 |
|
||||
|
||||
## **3. cuSPARSE Level 1 Function Reference**
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cusparseSaxpyi` |`hipsparseSaxpyi` |
|
||||
|`cusparseDaxpyi` |`hipsparseDaxpyi` |
|
||||
|`cusparseCaxpyi` |`hipsparseCaxpyi` |
|
||||
|`cusparseZaxpyi` |`hipsparseZaxpyi` |
|
||||
|`cusparseSdoti` |`hipsparseSdoti` |
|
||||
|`cusparseDdoti` |`hipsparseDdoti` |
|
||||
|`cusparseCdoti` |`hipsparseCdoti` |
|
||||
|`cusparseZdoti` |`hipsparseZdoti` |
|
||||
|`cusparseCdotci` |`hipsparseCdotci` |
|
||||
|`cusparseZdotci` |`hipsparseZdotci` |
|
||||
|`cusparseSgthr` |`hipsparseSgthr` |
|
||||
|`cusparseDgthr` |`hipsparseDgthr` |
|
||||
|`cusparseCgthr` |`hipsparseCgthr` |
|
||||
|`cusparseZgthr` |`hipsparseZgthr` |
|
||||
|`cusparseSgthrz` |`hipsparseSgthrz` |
|
||||
|`cusparseDgthrz` |`hipsparseDgthrz` |
|
||||
|`cusparseCgthrz` |`hipsparseCgthrz` |
|
||||
|`cusparseZgthrz` |`hipsparseZgthrz` |
|
||||
|`cusparseSroti` |`hipsparseSroti` |
|
||||
|`cusparseDroti` |`hipsparseDroti` |
|
||||
|`cusparseSsctr` |`hipsparseSsctr` |
|
||||
|`cusparseDsctr` |`hipsparseDsctr` |
|
||||
|`cusparseCsctr` |`hipsparseCsctr` |
|
||||
|`cusparseZsctr` |`hipsparseZsctr` |
|
||||
|
||||
## **4. cuSPARSE Level 2 Function Reference**
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cusparseSbsrmv` |`hipsparseSbsrmv` |
|
||||
|`cusparseDbsrmv` |`hipsparseDbsrmv` |
|
||||
|`cusparseCbsrmv` |`hipsparseCbsrmv` |
|
||||
|`cusparseZbsrmv` |`hipsparseZbsrmv` |
|
||||
|`cusparseSbsrxmv` | |
|
||||
|`cusparseDbsrxmv` | |
|
||||
|`cusparseCbsrxmv` | |
|
||||
|`cusparseZbsrxmv` | |
|
||||
|`cusparseScsrmv` |`hipsparseScsrmv` |
|
||||
|`cusparseDcsrmv` |`hipsparseDcsrmv` |
|
||||
|`cusparseCcsrmv` |`hipsparseCcsrmv` |
|
||||
|`cusparseZcsrmv` |`hipsparseZcsrmv` |
|
||||
|`cusparseCsrmvEx` | | 8.0 |
|
||||
|`cusparseCsrmvEx_bufferSize` | | 8.0 |
|
||||
|`cusparseScsrmv_mp` | | 8.0 |
|
||||
|`cusparseDcsrmv_mp` | | 8.0 |
|
||||
|`cusparseCcsrmv_mp` | | 8.0 |
|
||||
|`cusparseZcsrmv_mp` | | 8.0 |
|
||||
|`cusparseSgemvi` | | 7.5 |
|
||||
|`cusparseDgemvi` | | 7.5 |
|
||||
|`cusparseCgemvi` | | 7.5 |
|
||||
|`cusparseZgemvi` | | 7.5 |
|
||||
|`cusparseSgemvi_bufferSize` | | 7.5 |
|
||||
|`cusparseDgemvi_bufferSize` | | 7.5 |
|
||||
|`cusparseCgemvi_bufferSize` | | 7.5 |
|
||||
|`cusparseZgemvi_bufferSize` | | 7.5 |
|
||||
|`cusparseSbsrsv2_bufferSize` | |
|
||||
|`cusparseSbsrsv2_bufferSizeExt` | |
|
||||
|`cusparseDbsrsv2_bufferSize` | |
|
||||
|`cusparseDbsrsv2_bufferSizeExt` | |
|
||||
|`cusparseCbsrsv2_bufferSize` | |
|
||||
|`cusparseCbsrsv2_bufferSizeExt` | |
|
||||
|`cusparseZbsrsv2_bufferSize` | |
|
||||
|`cusparseZbsrsv2_bufferSizeExt` | |
|
||||
|`cusparseSbsrsv2_analysis` | |
|
||||
|`cusparseDbsrsv2_analysis` | |
|
||||
|`cusparseCbsrsv2_analysis` | |
|
||||
|`cusparseZbsrsv2_analysis` | |
|
||||
|`cusparseXbsrsv2_zeroPivot` | |
|
||||
|`cusparseSbsrsv2_solve` | |
|
||||
|`cusparseDbsrsv2_solve` | |
|
||||
|`cusparseCbsrsv2_solve` | |
|
||||
|`cusparseZbsrsv2_solve` | |
|
||||
|`cusparseScsrsv_analysis` | |
|
||||
|`cusparseDcsrsv_analysis` | |
|
||||
|`cusparseCcsrsv_analysis` | |
|
||||
|`cusparseZcsrsv_analysis` | |
|
||||
|`cusparseCsrsv_analysisEx` | | 8.0 |
|
||||
|`cusparseScsrsv_solve` | |
|
||||
|`cusparseDcsrsv_solve` | |
|
||||
|`cusparseCcsrsv_solve` | |
|
||||
|`cusparseZcsrsv_solve` | |
|
||||
|`cusparseCsrsv_solveEx` | | 8.0 |
|
||||
|`cusparseScsrsv2_bufferSize` |`hipsparseScsrsv2_bufferSize` |
|
||||
|`cusparseScsrsv2_bufferSizeExt` |`hipsparseScsrsv2_bufferSizeExt` |
|
||||
|`cusparseDcsrsv2_bufferSize` |`hipsparseDcsrsv2_bufferSize` |
|
||||
|`cusparseDcsrsv2_bufferSizeExt` |`hipsparseDcsrsv2_bufferSizeExt` |
|
||||
|`cusparseCcsrsv2_bufferSize` |`hipsparseCcsrsv2_bufferSize` |
|
||||
|`cusparseCcsrsv2_bufferSizeExt` |`hipsparseCcsrsv2_bufferSizeExt` |
|
||||
|`cusparseZcsrsv2_bufferSize` |`hipsparseZcsrsv2_bufferSize` |
|
||||
|`cusparseZcsrsv2_bufferSizeExt` |`hipsparseZcsrsv2_bufferSizeExt` |
|
||||
|`cusparseScsrsv2_analysis` |`hipsparseScsrsv2_analysis` |
|
||||
|`cusparseDcsrsv2_analysis` |`hipsparseDcsrsv2_analysis` |
|
||||
|`cusparseCcsrsv2_analysis` |`hipsparseCcsrsv2_analysis` |
|
||||
|`cusparseZcsrsv2_analysis` |`hipsparseZcsrsv2_analysis` |
|
||||
|`cusparseScsrsv2_solve` |`hipsparseScsrsv2_solve` |
|
||||
|`cusparseDcsrsv2_solve` |`hipsparseDcsrsv2_solve` |
|
||||
|`cusparseCcsrsv2_solve` |`hipsparseCcsrsv2_solve` |
|
||||
|`cusparseZcsrsv2_solve` |`hipsparseZcsrsv2_solve` |
|
||||
|`cusparseXcsrsv2_zeroPivot` |`hipsparseXcsrsv2_zeroPivot` |
|
||||
|`cusparseShybmv` |`hipsparseShybmv` |
|
||||
|`cusparseDhybmv` |`hipsparseDhybmv` |
|
||||
|`cusparseChybmv` |`hipsparseChybmv` |
|
||||
|`cusparseZhybmv` |`hipsparseZhybmv` |
|
||||
|`cusparseShybsv_analysis` | |
|
||||
|`cusparseDhybsv_analysis` | |
|
||||
|`cusparseChybsv_analysis` | |
|
||||
|`cusparseZhybsv_analysis` | |
|
||||
|`cusparseShybsv_solve` | |
|
||||
|`cusparseDhybsv_solve` | |
|
||||
|`cusparseChybsv_solve` | |
|
||||
|`cusparseZhybsv_solve` | |
|
||||
|
||||
## **5. cuSPARSE Level 3 Function Reference**
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cusparseScsrmm` |`hipsparseScsrmm` |
|
||||
|`cusparseDcsrmm` |`hipsparseDcsrmm` |
|
||||
|`cusparseCcsrmm` |`hipsparseCcsrmm` |
|
||||
|`cusparseZcsrmm` |`hipsparseZcsrmm` |
|
||||
|`cusparseScsrmm2` |`hipsparseScsrmm2` |
|
||||
|`cusparseDcsrmm2` |`hipsparseDcsrmm2` |
|
||||
|`cusparseCcsrmm2` |`hipsparseCcsrmm2` |
|
||||
|`cusparseZcsrmm2` |`hipsparseZcsrmm2` |
|
||||
|`cusparseScsrsm_analysis` | |
|
||||
|`cusparseDcsrsm_analysis` | |
|
||||
|`cusparseCcsrsm_analysis` | |
|
||||
|`cusparseZcsrsm_analysis` | |
|
||||
|`cusparseScsrsm_solve` | |
|
||||
|`cusparseDcsrsm_solve` | |
|
||||
|`cusparseCcsrsm_solve` | |
|
||||
|`cusparseZcsrsm_solve` | |
|
||||
|`cusparseScsrsm2_bufferSizeExt` |`hipsparseScsrsm2_bufferSizeExt` | 9.2 |
|
||||
|`cusparseDcsrsm2_bufferSizeExt` |`hipsparseDcsrsm2_bufferSizeExt` | 9.2 |
|
||||
|`cusparseCcsrsm2_bufferSizeExt` |`hipsparseCcsrsm2_bufferSizeExt` | 9.2 |
|
||||
|`cusparseZcsrsm2_bufferSizeExt` |`hipsparseZcsrsm2_bufferSizeExt` | 9.2 |
|
||||
|`cusparseScsrsm2_analysis` |`hipsparseScsrsm2_analysis` | 9.2 |
|
||||
|`cusparseDcsrsm2_analysis` |`hipsparseDcsrsm2_analysis` | 9.2 |
|
||||
|`cusparseCcsrsm2_analysis` |`hipsparseCcsrsm2_analysis` | 9.2 |
|
||||
|`cusparseZcsrsm2_analysis` |`hipsparseZcsrsm2_analysis` | 9.2 |
|
||||
|`cusparseScsrsm2_solve` |`hipsparseScsrsm2_solve` | 9.2 |
|
||||
|`cusparseDcsrsm2_solve` |`hipsparseDcsrsm2_solve` | 9.2 |
|
||||
|`cusparseCcsrsm2_solve` |`hipsparseCcsrsm2_solve` | 9.2 |
|
||||
|`cusparseZcsrsm2_solve` |`hipsparseZcsrsm2_solve` | 9.2 |
|
||||
|`cusparseXcsrsm2_zeroPivot` |`hipsparseXcsrsm2_zeroPivot` | 9.2 |
|
||||
|`cusparseSbsrmm` | |
|
||||
|`cusparseDbsrmm` | |
|
||||
|`cusparseCbsrmm` | |
|
||||
|`cusparseZbsrmm` | |
|
||||
|`cusparseSbsrsm2_bufferSize` | |
|
||||
|`cusparseSbsrsm2_bufferSizeExt` | |
|
||||
|`cusparseDbsrsm2_bufferSize` | |
|
||||
|`cusparseDbsrsm2_bufferSizeExt` | |
|
||||
|`cusparseCbsrsm2_bufferSize` | |
|
||||
|`cusparseCbsrsm2_bufferSizeExt` | |
|
||||
|`cusparseZbsrsm2_bufferSize` | |
|
||||
|`cusparseZbsrsm2_bufferSizeExt` | |
|
||||
|`cusparseSbsrsm2_analysis` | |
|
||||
|`cusparseDbsrsm2_analysis` | |
|
||||
|`cusparseCbsrsm2_analysis` | |
|
||||
|`cusparseZbsrsm2_analysis` | |
|
||||
|`cusparseSbsrsm2_solve` | |
|
||||
|`cusparseDbsrsm2_solve` | |
|
||||
|`cusparseCbsrsm2_solve` | |
|
||||
|`cusparseZbsrsm2_solve` | |
|
||||
|`cusparseXbsrsm2_zeroPivot` | |
|
||||
|`cusparseSgemmi` | | 8.0 |
|
||||
|`cusparseDgemmi` | | 8.0 |
|
||||
|`cusparseCgemmi` | | 8.0 |
|
||||
|`cusparseZgemmi` | | 8.0 |
|
||||
|
||||
## **6. cuSPARSE Extra Function Reference**
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cusparseXcsrgeamNnz` |`hipsparseXcsrgeamNnz` |
|
||||
|`cusparseScsrgeam` |`hipsparseScsrgeam` |
|
||||
|`cusparseDcsrgeam` |`hipsparseDcsrgeam` |
|
||||
|`cusparseCcsrgeam` |`hipsparseCcsrgeam` |
|
||||
|`cusparseZcsrgeam` |`hipsparseZcsrgeam` |
|
||||
|`cusparseXcsrgeam2Nnz` |`hipsparseXcsrgeam2Nnz` | 9.2 |
|
||||
|`cusparseScsrgeam2` |`hipsparseScsrgeam2` | 9.2 |
|
||||
|`cusparseDcsrgeam2` |`hipsparseDcsrgeam2` | 9.2 |
|
||||
|`cusparseCcsrgeam2` |`hipsparseCcsrgeam2` | 9.2 |
|
||||
|`cusparseZcsrgeam2` |`hipsparseZcsrgeam2` | 9.2 |
|
||||
|`cusparseScsrgeam2_bufferSizeExt` |`hipsparseScsrgeam2_bufferSizeExt` | 9.2 |
|
||||
|`cusparseDcsrgeam2_bufferSizeExt` |`hipsparseDcsrgeam2_bufferSizeExt` | 9.2 |
|
||||
|`cusparseCcsrgeam2_bufferSizeExt` |`hipsparseCcsrgeam2_bufferSizeExt` | 9.2 |
|
||||
|`cusparseZcsrgeam2_bufferSizeExt` |`hipsparseZcsrgeam2_bufferSizeExt` | 9.2 |
|
||||
|`cusparseXcsrgemmNnz` |`hipsparseXcsrgemmNnz` |
|
||||
|`cusparseScsrgemm` |`hipsparseScsrgemm` |
|
||||
|`cusparseDcsrgemm` |`hipsparseDcsrgemm` |
|
||||
|`cusparseCcsrgemm` |`hipsparseCcsrgemm` |
|
||||
|`cusparseZcsrgemm` |`hipsparseZcsrgemm` |
|
||||
|`cusparseXcsrgemm2Nnz` |`hipsparseXcsrgemm2Nnz` |
|
||||
|`cusparseScsrgemm2` |`hipsparseScsrgemm2` |
|
||||
|`cusparseDcsrgemm2` |`hipsparseDcsrgemm2` |
|
||||
|`cusparseCcsrgemm2` |`hipsparseCcsrgemm2` |
|
||||
|`cusparseZcsrgemm2` |`hipsparseZcsrgemm2` |
|
||||
|`cusparseScsrgemm2_bufferSizeExt` |`hipsparseScsrgemm2_bufferSizeExt` |
|
||||
|`cusparseDcsrgemm2_bufferSizeExt` |`hipsparseDcsrgemm2_bufferSizeExt` |
|
||||
|`cusparseCcsrgemm2_bufferSizeExt` |`hipsparseCcsrgemm2_bufferSizeExt` |
|
||||
|`cusparseZcsrgemm2_bufferSizeExt` |`hipsparseZcsrgemm2_bufferSizeExt` |
|
||||
|
||||
## **7. cuSPARSE Preconditioners Reference**
|
||||
|
||||
## ***7.1. Incomplete Cholesky Factorization: level 0***
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cusparseScsric0` | |
|
||||
|`cusparseDcsric0` | |
|
||||
|`cusparseCcsric0` | |
|
||||
|`cusparseZcsric0` | |
|
||||
|`cusparseScsric02_bufferSize` | |
|
||||
|`cusparseScsric02_bufferSizeExt` | |
|
||||
|`cusparseDcsric02_bufferSize` | |
|
||||
|`cusparseDcsric02_bufferSizeExt` | |
|
||||
|`cusparseCcsric02_bufferSize` | |
|
||||
|`cusparseCcsric02_bufferSizeExt` | |
|
||||
|`cusparseZcsric02_bufferSize` | |
|
||||
|`cusparseZcsric02_bufferSizeExt` | |
|
||||
|`cusparseScsric02_analysis` | |
|
||||
|`cusparseDcsric02_analysis` | |
|
||||
|`cusparseCcsric02_analysis` | |
|
||||
|`cusparseZcsric02_analysis` | |
|
||||
|`cusparseScsric02` | |
|
||||
|`cusparseDcsric02` | |
|
||||
|`cusparseCcsric02` | |
|
||||
|`cusparseZcsric02` | |
|
||||
|`cusparseXcsric02_zeroPivot` | |
|
||||
|`cusparseSbsric02_bufferSize` | |
|
||||
|`cusparseSbsric02_bufferSizeExt` | |
|
||||
|`cusparseDbsric02_bufferSize` | |
|
||||
|`cusparseDbsric02_bufferSizeExt` | |
|
||||
|`cusparseCbsric02_bufferSize` | |
|
||||
|`cusparseCbsric02_bufferSizeExt` | |
|
||||
|`cusparseZbsric02_bufferSize` | |
|
||||
|`cusparseZbsric02_bufferSizeExt` | |
|
||||
|`cusparseSbsric02_analysis` | |
|
||||
|`cusparseDbsric02_analysis` | |
|
||||
|`cusparseCbsric02_analysis` | |
|
||||
|`cusparseZbsric02_analysis` | |
|
||||
|`cusparseSbsric02` | |
|
||||
|`cusparseDbsric02` | |
|
||||
|`cusparseCbsric02` | |
|
||||
|`cusparseZbsric02` | |
|
||||
|`cusparseXbsric02_zeroPivot` | |
|
||||
|
||||
## ***7.2. Incomplete LU Factorization: level 0***
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cusparseScsrilu0` | |
|
||||
|`cusparseDcsrilu0` | |
|
||||
|`cusparseCcsrilu0` | |
|
||||
|`cusparseZcsrilu0` | |
|
||||
|`cusparseCsrilu0Ex` | | 8.0 |
|
||||
|`cusparseScsrilu02_numericBoost` | |
|
||||
|`cusparseDcsrilu02_numericBoost` | |
|
||||
|`cusparseCcsrilu02_numericBoost` | |
|
||||
|`cusparseZcsrilu02_numericBoost` | |
|
||||
|`cusparseXcsrilu02_zeroPivot` |`hipsparseXcsrilu02_zeroPivot` |
|
||||
|`cusparseScsrilu02_bufferSize` |`hipsparseScsrilu02_bufferSize` |
|
||||
|`cusparseScsrilu02_bufferSizeExt` |`hipsparseScsrilu02_bufferSizeExt` |
|
||||
|`cusparseDcsrilu02_bufferSize` |`hipsparseDcsrilu02_bufferSize` |
|
||||
|`cusparseDcsrilu02_bufferSizeExt` |`hipsparseDcsrilu02_bufferSizeExt` |
|
||||
|`cusparseCcsrilu02_bufferSize` |`hipsparseCcsrilu02_bufferSize` |
|
||||
|`cusparseCcsrilu02_bufferSizeExt` |`hipsparseCcsrilu02_bufferSizeExt` |
|
||||
|`cusparseZcsrilu02_bufferSize` |`hipsparseZcsrilu02_bufferSize` |
|
||||
|`cusparseZcsrilu02_bufferSizeExt` |`hipsparseZcsrilu02_bufferSizeExt` |
|
||||
|`cusparseScsrilu02_analysis` |`hipsparseScsrilu02_analysis` |
|
||||
|`cusparseDcsrilu02_analysis` |`hipsparseDcsrilu02_analysis` |
|
||||
|`cusparseCcsrilu02_analysis` |`hipsparseCcsrilu02_analysis` |
|
||||
|`cusparseZcsrilu02_analysis` |`hipsparseZcsrilu02_analysis` |
|
||||
|`cusparseScsrilu02` |`hipsparseScsrilu02` |
|
||||
|`cusparseDcsrilu02` |`hipsparseDcsrilu02` |
|
||||
|`cusparseCcsrilu02` |`hipsparseCcsrilu02` |
|
||||
|`cusparseZcsrilu02` |`hipsparseZcsrilu02` |
|
||||
|`cusparseXcsrilu02_zeroPivot` |`hipsparseXcsrilu02_zeroPivot` |
|
||||
|`cusparseSbsrilu02_numericBoost` | |
|
||||
|`cusparseDbsrilu02_numericBoost` | |
|
||||
|`cusparseCbsrilu02_numericBoost` | |
|
||||
|`cusparseZbsrilu02_numericBoost` | |
|
||||
|`cusparseSbsrilu02_bufferSize` | |
|
||||
|`cusparseSbsrilu02_bufferSizeExt` | |
|
||||
|`cusparseDbsrilu02_bufferSize` | |
|
||||
|`cusparseDbsrilu02_bufferSizeExt` | |
|
||||
|`cusparseCbsrilu02_bufferSize` | |
|
||||
|`cusparseCbsrilu02_bufferSizeExt` | |
|
||||
|`cusparseZbsrilu02_bufferSize` | |
|
||||
|`cusparseZbsrilu02_bufferSizeExt` | |
|
||||
|`cusparseSbsrilu02_analysis` | |
|
||||
|`cusparseDbsrilu02_analysis` | |
|
||||
|`cusparseCbsrilu02_analysis` | |
|
||||
|`cusparseZbsrilu02_analysis` | |
|
||||
|`cusparseSbsrilu02` | |
|
||||
|`cusparseDbsrilu02` | |
|
||||
|`cusparseCbsrilu02` | |
|
||||
|`cusparseZbsrilu02` | |
|
||||
|`cusparseXbsrilu02_zeroPivot` | |
|
||||
|
||||
## ***7.3. Tridiagonal Solve***
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cusparseSgtsv` | |
|
||||
|`cusparseDgtsv` | |
|
||||
|`cusparseCgtsv` | |
|
||||
|`cusparseZgtsv` | |
|
||||
|`cusparseSgtsv_nopivot` | |
|
||||
|`cusparseDgtsv_nopivot` | |
|
||||
|`cusparseCgtsv_nopivot` | |
|
||||
|`cusparseZgtsv_nopivot` | |
|
||||
|`cusparseSgtsv2_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseDgtsv2_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseCgtsv2_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseZgtsv2_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseSgtsv2` | | 9.0 |
|
||||
|`cusparseDgtsv2` | | 9.0 |
|
||||
|`cusparseCgtsv2` | | 9.0 |
|
||||
|`cusparseZgtsv2` | | 9.0 |
|
||||
|`cusparseSgtsv2_nopivot_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseDgtsv2_nopivot_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseCgtsv2_nopivot_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseZgtsv2_nopivot_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseSgtsv2_nopivot` | | 9.0 |
|
||||
|`cusparseDgtsv2_nopivot` | | 9.0 |
|
||||
|`cusparseCgtsv2_nopivot` | | 9.0 |
|
||||
|`cusparseZgtsv2_nopivot` | | 9.0 |
|
||||
|
||||
## ***7.4. Batched Tridiagonal Solve***
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cusparseSgtsvStridedBatch` | |
|
||||
|`cusparseDgtsvStridedBatch` | |
|
||||
|`cusparseCgtsvStridedBatch` | |
|
||||
|`cusparseZgtsvStridedBatch` | |
|
||||
|`cusparseSgtsv2StridedBatch_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseDgtsv2StridedBatch_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseCgtsv2StridedBatch_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseZgtsv2StridedBatch_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseSgtsv2StridedBatch` | | 9.0 |
|
||||
|`cusparseDgtsv2StridedBatch` | | 9.0 |
|
||||
|`cusparseCgtsv2StridedBatch` | | 9.0 |
|
||||
|`cusparseZgtsv2StridedBatch` | | 9.0 |
|
||||
|`cusparseSgtsvInterleavedBatch_bufferSizeExt` | | 9.2 |
|
||||
|`cusparseDgtsvInterleavedBatch_bufferSizeExt` | | 9.2 |
|
||||
|`cusparseCgtsvInterleavedBatch_bufferSizeExt` | | 9.2 |
|
||||
|`cusparseZgtsvInterleavedBatch_bufferSizeExt` | | 9.2 |
|
||||
|`cusparseSgtsvInterleavedBatch` | | 9.2 |
|
||||
|`cusparseDgtsvInterleavedBatch` | | 9.2 |
|
||||
|`cusparseCgtsvInterleavedBatch` | | 9.2 |
|
||||
|`cusparseZgtsvInterleavedBatch` | | 9.2 |
|
||||
|
||||
## ***7.5. Batched Pentadiagonal Solve***
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cusparseSgpsvInterleavedBatch_bufferSizeExt` | | 9.2 |
|
||||
|`cusparseDgpsvInterleavedBatch_bufferSizeExt` | | 9.2 |
|
||||
|`cusparseCgpsvInterleavedBatch_bufferSizeExt` | | 9.2 |
|
||||
|`cusparseZgpsvInterleavedBatch_bufferSizeExt` | | 9.2 |
|
||||
|`cusparseSgpsvInterleavedBatch` | | 9.2 |
|
||||
|`cusparseDgpsvInterleavedBatch` | | 9.2 |
|
||||
|`cusparseCgpsvInterleavedBatch` | | 9.2 |
|
||||
|`cusparseZgpsvInterleavedBatch` | | 9.2 |
|
||||
|
||||
## **8. cuSPARSE Matrix Reorderings Reference**
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cusparseScsrcolor` | |
|
||||
|`cusparseDcsrcolor` | |
|
||||
|`cusparseCcsrcolor` | |
|
||||
|`cusparseZcsrcolor` | |
|
||||
|
||||
## **9. cuSPARSE Format Conversion Reference**
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cusparseSbsr2csr` | |
|
||||
|`cusparseDbsr2csr` | |
|
||||
|`cusparseCbsr2csr` | |
|
||||
|`cusparseZbsr2csr` | |
|
||||
|`cusparseSgebsr2gebsc_bufferSize` | |
|
||||
|`cusparseSgebsr2gebsc_bufferSizeExt` | |
|
||||
|`cusparseDgebsr2gebsc_bufferSize` | |
|
||||
|`cusparseDgebsr2gebsc_bufferSizeExt` | |
|
||||
|`cusparseCgebsr2gebsc_bufferSize` | |
|
||||
|`cusparseCgebsr2gebsc_bufferSizeExt` | |
|
||||
|`cusparseZgebsr2gebsc_bufferSize` | |
|
||||
|`cusparseZgebsr2gebsc_bufferSizeExt` | |
|
||||
|`cusparseSgebsr2gebsc` | |
|
||||
|`cusparseDgebsr2gebsc` | |
|
||||
|`cusparseCgebsr2gebsc` | |
|
||||
|`cusparseZgebsr2gebsc` | |
|
||||
|`cusparseSgebsr2gebsr_bufferSize` | |
|
||||
|`cusparseSgebsr2gebsr_bufferSizeExt` | |
|
||||
|`cusparseDgebsr2gebsr_bufferSize` | |
|
||||
|`cusparseDgebsr2gebsr_bufferSizeExt` | |
|
||||
|`cusparseCgebsr2gebsr_bufferSize` | |
|
||||
|`cusparseCgebsr2gebsr_bufferSizeExt` | |
|
||||
|`cusparseZgebsr2gebsr_bufferSize` | |
|
||||
|`cusparseZgebsr2gebsr_bufferSizeExt` | |
|
||||
|`cusparseXgebsr2gebsrNnz` | |
|
||||
|`cusparseSgebsr2gebsr` | |
|
||||
|`cusparseDgebsr2gebsr` | |
|
||||
|`cusparseCgebsr2gebsr` | |
|
||||
|`cusparseZgebsr2gebsr` | |
|
||||
|`cusparseXgebsr2csr` | |
|
||||
|`cusparseSgebsr2csr` | |
|
||||
|`cusparseDgebsr2csr` | |
|
||||
|`cusparseCgebsr2csr` | |
|
||||
|`cusparseZgebsr2csr` | |
|
||||
|`cusparseScsr2gebsr_bufferSize` | |
|
||||
|`cusparseScsr2gebsr_bufferSizeExt` | |
|
||||
|`cusparseDcsr2gebsr_bufferSize` | |
|
||||
|`cusparseDcsr2gebsr_bufferSizeExt` | |
|
||||
|`cusparseCcsr2gebsr_bufferSize` | |
|
||||
|`cusparseCcsr2gebsr_bufferSizeExt` | |
|
||||
|`cusparseZcsr2gebsr_bufferSize` | |
|
||||
|`cusparseZcsr2gebsr_bufferSizeExt` | |
|
||||
|`cusparseXcsr2gebsrNnz` | |
|
||||
|`cusparseScsr2gebsr` | |
|
||||
|`cusparseDcsr2gebsr` | |
|
||||
|`cusparseCcsr2gebsr` | |
|
||||
|`cusparseZcsr2gebsr` | |
|
||||
|`cusparseXcoo2csr` |`hipsparseXcoo2csr` |
|
||||
|`cusparseScsc2dense` | |
|
||||
|`cusparseDcsc2dense` | |
|
||||
|`cusparseCcsc2dense` | |
|
||||
|`cusparseZcsc2dense` | |
|
||||
|`cusparseScsc2hyb` | |
|
||||
|`cusparseDcsc2hyb` | |
|
||||
|`cusparseCcsc2hyb` | |
|
||||
|`cusparseZcsc2hyb` | |
|
||||
|`cusparseXcsr2bsrNnz` | |
|
||||
|`cusparseScsr2bsr` | |
|
||||
|`cusparseDcsr2bsr` | |
|
||||
|`cusparseCcsr2bsr` | |
|
||||
|`cusparseZcsr2bsr` | |
|
||||
|`cusparseXcsr2coo` |`hipsparseXcsr2coo` |
|
||||
|`cusparseScsr2csc` |`hipsparseScsr2csc` |
|
||||
|`cusparseDcsr2csc` |`hipsparseDcsr2csc` |
|
||||
|`cusparseCcsr2csc` |`hipsparseCcsr2csc` |
|
||||
|`cusparseZcsr2csc` |`hipsparseZcsr2csc` |
|
||||
|`cusparseCsr2cscEx` | | 8.0 |
|
||||
|`cusparseCsr2cscEx2` | | 10.1 |
|
||||
|`cusparseCsr2cscEx2_bufferSize` | | 10.1 |
|
||||
|`cusparseScsr2dense` | |
|
||||
|`cusparseDcsr2dense` | |
|
||||
|`cusparseCcsr2dense` | |
|
||||
|`cusparseZcsr2dense` | |
|
||||
|`cusparseScsr2csr_compress` | | 8.0 |
|
||||
|`cusparseDcsr2csr_compress` | | 8.0 |
|
||||
|`cusparseCcsr2csr_compress` | | 8.0 |
|
||||
|`cusparseZcsr2csr_compress` | | 8.0 |
|
||||
|`cusparseScsr2hyb` |`hipsparseScsr2hyb` |
|
||||
|`cusparseDcsr2hyb` |`hipsparseDcsr2hyb` |
|
||||
|`cusparseCcsr2hyb` |`hipsparseCcsr2hyb` |
|
||||
|`cusparseZcsr2hyb` |`hipsparseZcsr2hyb` |
|
||||
|`cusparseSdense2csc` | |
|
||||
|`cusparseDdense2csc` | |
|
||||
|`cusparseCdense2csc` | |
|
||||
|`cusparseZdense2csc` | |
|
||||
|`cusparseSdense2csr` | |
|
||||
|`cusparseDdense2csr` | |
|
||||
|`cusparseCdense2csr` | |
|
||||
|`cusparseZdense2csr` | |
|
||||
|`cusparseSdense2hyb` | |
|
||||
|`cusparseDdense2hyb` | |
|
||||
|`cusparseCdense2hyb` | |
|
||||
|`cusparseZdense2hyb` | |
|
||||
|`cusparseShyb2csc` | |
|
||||
|`cusparseDhyb2csc` | |
|
||||
|`cusparseChyb2csc` | |
|
||||
|`cusparseZhyb2csc` | |
|
||||
|`cusparseShyb2csr` | |
|
||||
|`cusparseDhyb2csr` | |
|
||||
|`cusparseChyb2csr` | |
|
||||
|`cusparseZhyb2csr` | |
|
||||
|`cusparseShyb2dense` | |
|
||||
|`cusparseDhyb2dense` | |
|
||||
|`cusparseChyb2dense` | |
|
||||
|`cusparseZhyb2dense` | |
|
||||
|`cusparseSnnz` |`hipsparseSnnz` |
|
||||
|`cusparseDnnz` |`hipsparseDnnz` |
|
||||
|`cusparseCnnz` |`hipsparseCnnz` |
|
||||
|`cusparseZnnz` |`hipsparseZnnz` |
|
||||
|`cusparseCreateIdentityPermutation` |`hipsparseCreateIdentityPermutation` |
|
||||
|`cusparseXcoosort_bufferSizeExt` |`hipsparseXcoosort_bufferSizeExt` |
|
||||
|`cusparseXcoosortByRow` |`hipsparseXcoosortByRow` |
|
||||
|`cusparseXcoosortByColumn` |`hipsparseXcoosortByColumn` |
|
||||
|`cusparseXcsrsort_bufferSizeExt` |`hipsparseXcsrsort_bufferSizeExt` |
|
||||
|`cusparseXcsrsort` |`hipsparseXcsrsort` |
|
||||
|`cusparseXcscsort_bufferSizeExt` |`hipsparseXcscsort_bufferSizeExt` |
|
||||
|`cusparseXcscsort` |`hipsparseXcscsort` |
|
||||
|`cusparseCreateCsru2csrInfo` | |
|
||||
|`cusparseDestroyCsru2csrInfo` | |
|
||||
|`cusparseScsru2csr_bufferSizeExt` | |
|
||||
|`cusparseDcsru2csr_bufferSizeExt` | |
|
||||
|`cusparseCcsru2csr_bufferSizeExt` | |
|
||||
|`cusparseZcsru2csr_bufferSizeExt` | |
|
||||
|`cusparseScsru2csr` | |
|
||||
|`cusparseDcsru2csr` | |
|
||||
|`cusparseCcsru2csr` | |
|
||||
|`cusparseZcsru2csr` | |
|
||||
|`cusparseScsr2csru` | |
|
||||
|`cusparseDcsr2csru` | |
|
||||
|`cusparseCcsr2csru` | |
|
||||
|`cusparseZcsr2csru` | |
|
||||
|`cusparseHpruneDense2csr` | | 9.0 |
|
||||
|`cusparseSpruneDense2csr` | | 9.0 |
|
||||
|`cusparseDpruneDense2csr` | | 9.0 |
|
||||
|`cusparseHpruneDense2csr_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseSpruneDense2csr_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseDpruneDense2csr_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseHpruneDense2csrNnz` | | 9.0 |
|
||||
|`cusparseSpruneDense2csrNnz` | | 9.0 |
|
||||
|`cusparseDpruneDense2csrNnz` | | 9.0 |
|
||||
|`cusparseHpruneCsr2csr` | | 9.0 |
|
||||
|`cusparseSpruneCsr2csr` | | 9.0 |
|
||||
|`cusparseDpruneCsr2csr` | | 9.0 |
|
||||
|`cusparseHpruneCsr2csr_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseSpruneCsr2csr_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseDpruneCsr2csr_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseHpruneCsr2csrNnz` | | 9.0 |
|
||||
|`cusparseSpruneCsr2csrNnz` | | 9.0 |
|
||||
|`cusparseDpruneCsr2csrNnz` | | 9.0 |
|
||||
|`cusparseHpruneDense2csrByPercentage` | | 9.0 |
|
||||
|`cusparseSpruneDense2csrByPercentage` | | 9.0 |
|
||||
|`cusparseDpruneDense2csrByPercentage` | | 9.0 |
|
||||
|`cusparseHpruneDense2csrByPercentage_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseSpruneDense2csrByPercentage_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseDpruneDense2csrByPercentage_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseHpruneDense2csrNnzByPercentage` | | 9.0 |
|
||||
|`cusparseSpruneDense2csrNnzByPercentage` | | 9.0 |
|
||||
|`cusparseDpruneDense2csrNnzByPercentage` | | 9.0 |
|
||||
|`cusparseHpruneCsr2csrByPercentage` | | 9.0 |
|
||||
|`cusparseSpruneCsr2csrByPercentage` | | 9.0 |
|
||||
|`cusparseDpruneCsr2csrByPercentage` | | 9.0 |
|
||||
|`cusparseHpruneCsr2csrByPercentage_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseSpruneCsr2csrByPercentage_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseDpruneCsr2csrByPercentage_bufferSizeExt` | | 9.0 |
|
||||
|`cusparseHpruneCsr2csrNnzByPercentage` | | 9.0 |
|
||||
|`cusparseSpruneCsr2csrNnzByPercentage` | | 9.0 |
|
||||
|`cusparseDpruneCsr2csrNnzByPercentage` | | 9.0 |
|
||||
|`cusparseSnnz_compress` |`hipsparseSnnz_compress` | 8.0 |
|
||||
|`cusparseDnnz_compress` |`hipsparseDnnz_compress` | 8.0 |
|
||||
|`cusparseCnnz_compress` |`hipsparseCnnz_compress` | 8.0 |
|
||||
|`cusparseZnnz_compress` |`hipsparseZnnz_compress` | 8.0 |
|
||||
|
||||
## **10. cuSPARSE Generic API Reference**
|
||||
|
||||
## ***10.1. Generic Sparse API helper functions***
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cusparseCreateCoo` | | 10.1 |
|
||||
|`cusparseCreateCooAoS` | | 10.1 |
|
||||
|`cusparseCreateCsr` | | 10.1 |
|
||||
|`cusparseDestroySpMat` | | 10.1 |
|
||||
|`cusparseCooGet` | | 10.1 |
|
||||
|`cusparseCooAoSGet` | | 10.1 |
|
||||
|`cusparseCsrGet` | | 10.1 |
|
||||
|`cusparseSpMatGetFormat` | | 10.1 |
|
||||
|`cusparseSpMatGetIndexBase` | | 10.1 |
|
||||
|`cusparseSpMatGetValues` | | 10.1 |
|
||||
|`cusparseSpMatSetValues` | | 10.1 |
|
||||
|`cusparseSpMatGetStridedBatch` | | 10.1 |
|
||||
|`cusparseSpMatSetStridedBatch` | | 10.1 |
|
||||
|`cusparseSpMatGetNumBatches` | | 10.1 |
|
||||
|`cusparseSpMatSetNumBatches` | | 10.1 |
|
||||
|`cusparseCreateSpVec` | | 10.1 |
|
||||
|`cusparseDestroySpVec` | | 10.1 |
|
||||
|`cusparseSpVecGet` | | 10.1 |
|
||||
|`cusparseSpVecGetIndexBase` | | 10.1 |
|
||||
|`cusparseSpVecGetValues` | | 10.1 |
|
||||
|`cusparseSpVecSetValues` | | 10.1 |
|
||||
|
||||
## ***10.2. Generic Dense API helper functions***
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cusparseCreateDnMat` | | 10.1 |
|
||||
|`cusparseDestroyDnMat` | | 10.1 |
|
||||
|`cusparseDnMatGet` | | 10.1 |
|
||||
|`cusparseDnMatGetValues` | | 10.1 |
|
||||
|`cusparseDnMatSetValues` | | 10.1 |
|
||||
|`cusparseDnMatSetStridedBatch` | | 10.1 |
|
||||
|`cusparseDnMatGetStridedBatch` | | 10.1 |
|
||||
|`cusparseCreateDnVec` | | 10.1 |
|
||||
|`cusparseDestroyDnVec` | | 10.1 |
|
||||
|`cusparseDnVecGet` | | 10.1 |
|
||||
|`cusparseDnVecGetValues` | | 10.1 |
|
||||
|`cusparseDnVecSetValues` | | 10.1 |
|
||||
|
||||
## ***10.3. Generic SpMM API functions***
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cusparseSpMM` | | 10.1 |
|
||||
|`cusparseSpMM_bufferSize` | | 10.1 |
|
||||
|
||||
## ***10.4. Generic SpVV API functions [Undocumented]***
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cusparseSpVV` | | 10.1 |
|
||||
|`cusparseSpVV_bufferSize` | | 10.1 |
|
||||
|
||||
## ***10.5. Generic SpMV API functions [Undocumented]***
|
||||
|
||||
| **CUDA** | **HIP** |**CUDA version\***|
|
||||
|-----------------------------------------------------------|-------------------------------------------------|:----------------:|
|
||||
|`cusparseSpMV` | | 10.1 |
|
||||
|`cusparseSpMV_bufferSize` | | 10.1 |
|
||||
|
||||
\* The CUDA version in which the API first appeared and, optionally, the last version in which it was available before being removed; an empty cell means the API was introduced before CUDA 7.5.
|
||||
@@ -1,763 +0,0 @@
|
||||
# Support of Clang options
|
||||
Clang version: clang version 12.0.0 927e2776dc0e4bb0119efbc5ea405b7425d7f4ac
|
||||
|
||||
|Option|Support|Description|
|
||||
|-------|------|-------|
|
||||
|`-###`|Supported|`Print (but do not run) the commands to run for this compilation`|
|
||||
|`--analyzer-output <value>`|Supported|`Static analyzer report output format (html\|plist\|plist-multi-file\|plist-html\|sarif\|text).`|
|
||||
|`--analyze`|Supported|`Run the static analyzer`|
|
||||
|`-arcmt-migrate-emit-errors`|Unsupported|`Emit ARC errors even if the migrator can fix them`|
|
||||
|`-arcmt-migrate-report-output <value>`|Unsupported|`Output path for the plist report`|
|
||||
|`-byteswapio`|Supported|`Swap byte-order for unformatted input/output`|
|
||||
|`-B <dir>`|Supported|`Add <dir> to search path for binaries and object files used implicitly`|
|
||||
|`-CC`|Supported|`Include comments from within macros in preprocessed output`|
|
||||
|`-cl-denorms-are-zero`|Supported|`OpenCL only. Allow denormals to be flushed to zero.`|
|
||||
|`-cl-fast-relaxed-math`|Supported|`OpenCL only. Sets -cl-finite-math-only and -cl-unsafe-math-optimizations, and defines __FAST_RELAXED_MATH__.`|
|
||||
|`-cl-finite-math-only`|Supported|`OpenCL only. Allow floating-point optimizations that assume arguments and results are not NaNs or +-Inf.`|
|
||||
|`-cl-fp32-correctly-rounded-divide-sqrt`|Supported|`OpenCL only. Specify that single precision floating-point divide and sqrt used in the program source are correctly rounded.`|
|
||||
|`-cl-kernel-arg-info`|Supported|`OpenCL only. Generate kernel argument metadata.`|
|
||||
|`-cl-mad-enable`|Supported|`OpenCL only. Allow use of less precise MAD computations in the generated binary.`|
|
||||
|`-cl-no-signed-zeros`|Supported|`OpenCL only. Allow use of less precise no signed zeros computations in the generated binary.`|
|
||||
|`-cl-opt-disable`|Supported|`OpenCL only. This option disables all optimizations. By default optimizations are enabled.`|
|
||||
|`-cl-single-precision-constant`|Supported|`OpenCL only. Treat double precision floating-point constant as single precision constant.`|
|
||||
|`-cl-std=<value>`|Supported|`OpenCL language standard to compile for.`|
|
||||
|`-cl-strict-aliasing`|Supported|`OpenCL only. This option is added for compatibility with OpenCL 1.0.`|
|
||||
|`-cl-uniform-work-group-size`|Supported|`OpenCL only. Defines that the global work-size be a multiple of the work-group size specified to clEnqueueNDRangeKernel`|
|
||||
|`-cl-unsafe-math-optimizations`|Supported|`OpenCL only. Allow unsafe floating-point optimizations. Also implies -cl-no-signed-zeros and -cl-mad-enable.`|
|
||||
|`--config <value>`|Supported|`Specifies configuration file`|
|
||||
|`--cuda-compile-host-device`|Supported|`Compile CUDA code for both host and device (default). Has no effect on non-CUDA compilations.`|
|
||||
|`--cuda-device-only`|Supported|`Compile CUDA code for device only`|
|
||||
|`--cuda-host-only`|Supported|`Compile CUDA code for host only. Has no effect on non-CUDA compilations.`|
|
||||
|`--cuda-include-ptx=<value>`|Unsupported|`Include PTX for the following GPU architecture (e.g. sm_35) or 'all'. May be specified more than once.`|
|
||||
|`--cuda-noopt-device-debug`|Unsupported|`Enable device-side debug info generation. Disables ptxas optimizations.`|
|
||||
|`--cuda-path-ignore-env`|Unsupported|`Ignore environment variables to detect CUDA installation`|
|
||||
|`--cuda-path=<value>`|Unsupported|`CUDA installation path`|
|
||||
|`-cxx-isystem <directory>`|Supported|`Add directory to the C++ SYSTEM include search path`|
|
||||
|`-C`|Supported|`Include comments in preprocessed output`|
|
||||
|`-c`|Supported|`Only run preprocess, compile, and assemble steps`|
|
||||
|`-dD`|Supported|`Print macro definitions in -E mode in addition to normal output`|
|
||||
|`-dependency-dot <value>`|Supported|`Filename to write DOT-formatted header dependencies to`|
|
||||
|`-dependency-file <value>`|Supported|`Filename (or -) to write dependency output to`|
|
||||
|`-dI`|Supported|`Print include directives in -E mode in addition to normal output`|
|
||||
|`-dM`|Supported|`Print macro definitions in -E mode instead of normal output`|
|
||||
|`-dsym-dir <dir>`|Unsupported|`Directory to output dSYM's (if any) to`|
|
||||
|`-D <macro>=<value>`|Supported|`Define <macro> to <value> (or 1 if <value> omitted)`|
|
||||
|`-emit-ast`|Supported|`Emit Clang AST files for source inputs`|
|
||||
|`-emit-interface-stubs`|Supported|`Generate Interface Stub Files.`|
|
||||
|`-emit-llvm`|Supported|`Use the LLVM representation for assembler and object files`|
|
||||
|`-emit-merged-ifs`|Supported|`Generate Interface Stub Files, emit merged text not binary.`|
|
||||
|`--emit-static-lib`|Supported|`Enable linker job to emit a static library.`|
|
||||
|`-enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang`|Supported|`Trivial automatic variable initialization to zero is only here for benchmarks, it'll eventually be removed, and I'm OK with that because I'm only using it to benchmark`|
|
||||
|`-E`|Supported|`Only run the preprocessor`|
|
||||
|`-fAAPCSBitfieldLoad`|Unsupported|`Follows the AAPCS standard requirement that every volatile bit-field write generates at least one load. (ARM only).`|
|
||||
|`-faddrsig`|Supported|`Emit an address-significance table`|
|
||||
|`-faligned-allocation`|Supported|`Enable C++17 aligned allocation functions`|
|
||||
|`-fallow-editor-placeholders`|Supported|`Treat editor placeholders as valid source code`|
|
||||
|`-fallow-fortran-gnu-ext`|Supported|`Allow Fortran GNU extensions`|
|
||||
|`-fansi-escape-codes`|Supported|`Use ANSI escape codes for diagnostics`|
|
||||
|`-fapple-kext`|Unsupported|`Use Apple's kernel extensions ABI`|
|
||||
|`-fapple-link-rtlib`|Unsupported|`Force linking the clang builtins runtime library`|
|
||||
|`-fapple-pragma-pack`|Unsupported|`Enable Apple gcc-compatible #pragma pack handling`|
|
||||
|`-fapplication-extension`|Unsupported|`Restrict code to those available for App Extensions`|
|
||||
|`-fbackslash`|Supported|`Treat backslash as C-style escape character`|
|
||||
|`-fbasic-block-sections=<value>`|Supported|`Place each function's basic blocks in unique sections (ELF Only) : all \| labels \| none \| list=<file>`|
|
||||
|`-fblocks`|Supported|`Enable the 'blocks' language feature`|
|
||||
|`-fborland-extensions`|Unsupported|`Accept non-standard constructs supported by the Borland compiler`|
|
||||
|`-fbuild-session-file=<file>`|Supported|`Use the last modification time of <file> as the build session timestamp`|
|
||||
|`-fbuild-session-timestamp=<time since Epoch in seconds>`|Supported|`Time when the current build session started`|
|
||||
|`-fbuiltin-module-map`|Unsupported|`Load the clang builtins module map file.`|
|
||||
|`-fcall-saved-x10`|Unsupported|`Make the x10 register call-saved (AArch64 only)`|
|
||||
|`-fcall-saved-x11`|Unsupported|`Make the x11 register call-saved (AArch64 only)`|
|
||||
|`-fcall-saved-x12`|Unsupported|`Make the x12 register call-saved (AArch64 only)`|
|
||||
|`-fcall-saved-x13`|Unsupported|`Make the x13 register call-saved (AArch64 only)`|
|
||||
|`-fcall-saved-x14`|Unsupported|`Make the x14 register call-saved (AArch64 only)`|
|
||||
|`-fcall-saved-x15`|Unsupported|`Make the x15 register call-saved (AArch64 only)`|
|
||||
|`-fcall-saved-x18`|Unsupported|`Make the x18 register call-saved (AArch64 only)`|
|
||||
|`-fcall-saved-x8`|Unsupported|`Make the x8 register call-saved (AArch64 only)`|
|
||||
|`-fcall-saved-x9`|Unsupported|`Make the x9 register call-saved (AArch64 only)`|
|
||||
|`-fcf-protection=<value>`|Unsupported|`Instrument control-flow architecture protection. Options: return, branch, full, none.`|
|
||||
|`-fcf-protection`|Unsupported|`Enable cf-protection in 'full' mode`|
|
||||
|`-fchar8_t`|Supported|`Enable C++ builtin type char8_t`|
|
||||
|`-fclang-abi-compat=<version>`|Supported|`Attempt to match the ABI of Clang <version>`|
|
||||
|`-fcolor-diagnostics`|Supported|`Enable colors in diagnostics`|
|
||||
|`-fcomment-block-commands=<arg>`|Supported|`Treat each comma separated argument in <arg> as a documentation comment block command`|
|
||||
|`-fcommon`|Supported|`Place uninitialized global variables in a common block`|
|
||||
|`-fcomplete-member-pointers`|Supported|`Require member pointer base types to be complete if they would be significant under the Microsoft ABI`|
|
||||
|`-fconvergent-functions`|Supported|`Assume functions may be convergent`|
|
||||
|`-fcoroutines-ts`|Supported|`Enable support for the C++ Coroutines TS`|
|
||||
|`-fcoverage-mapping`|Unsupported|`Generate coverage mapping to enable code coverage analysis`|
|
||||
|`-fcs-profile-generate=<directory>`|Unsupported|`Generate instrumented code to collect context sensitive execution counts into <directory>/default.profraw (overridden by LLVM_PROFILE_FILE env var)`|
|
||||
|`-fcs-profile-generate`|Unsupported|`Generate instrumented code to collect context sensitive execution counts into default.profraw (overridden by LLVM_PROFILE_FILE env var)`|
|
||||
|`-fcuda-approx-transcendentals`|Unsupported|`Use approximate transcendental functions`|
|
||||
|`-fcuda-flush-denormals-to-zero`|Supported|`Flush denormal floating point values to zero in CUDA device mode.`|
|
||||
|`-fcuda-short-ptr`|Unsupported|`Use 32-bit pointers for accessing const/local/shared address spaces`|
|
||||
|`-fcxx-exceptions`|Supported|`Enable C++ exceptions`|
|
||||
|`-fdata-sections`|Supported|`Place each data in its own section`|
|
||||
|`-fdebug-compilation-dir <value>`|Supported|`The compilation directory to embed in the debug info.`|
|
||||
|`-fdebug-default-version=<value>`|Supported|`Default DWARF version to use, if a -g option caused DWARF debug info to be produced`|
|
||||
|`-fdebug-info-for-profiling`|Supported|`Emit extra debug info to make sample profile more accurate`|
|
||||
|`-fdebug-macro`|Supported|`Emit macro debug information`|
|
||||
|`-fdebug-prefix-map=<value>`|Supported|`remap file source paths in debug info`|
|
||||
|`-fdebug-ranges-base-address`|Supported|`Use DWARF base address selection entries in .debug_ranges`|
|
||||
|`-fdebug-types-section`|Supported|`Place debug types in their own section (ELF Only)`|
|
||||
|`-fdeclspec`|Supported|`Allow __declspec as a keyword`|
|
||||
|`-fdelayed-template-parsing`|Supported|`Parse templated function definitions at the end of the translation unit`|
|
||||
|`-fdelete-null-pointer-checks`|Supported|`Treat usage of null pointers as undefined behavior (default)`|
|
||||
|`-fdiagnostics-absolute-paths`|Supported|`Print absolute paths in diagnostics`|
|
||||
|`-fdiagnostics-hotness-threshold=<number>`|Unsupported|`Prevent optimization remarks from being output if they do not have at least this profile count`|
|
||||
|`-fdiagnostics-parseable-fixits`|Supported|`Print fix-its in machine parseable form`|
|
||||
|`-fdiagnostics-print-source-range-info`|Supported|`Print source range spans in numeric form`|
|
||||
|`-fdiagnostics-show-hotness`|Unsupported|`Enable profile hotness information in diagnostic line`|
|
||||
|`-fdiagnostics-show-note-include-stack`|Supported|`Display include stacks for diagnostic notes`|
|
||||
|`-fdiagnostics-show-option`|Supported|`Print option name with mappable diagnostics`|
|
||||
|`-fdiagnostics-show-template-tree`|Supported|`Print a template comparison tree for differing templates`|
|
||||
|`-fdigraphs`|Supported|`Enable alternative token representations '<:', ':>', '<%', '%>', '%:', '%:%:' (default)`|
|
||||
|`-fdiscard-value-names`|Supported|`Discard value names in LLVM IR`|
|
||||
|`-fdollars-in-identifiers`|Supported|`Allow '$' in identifiers`|
|
||||
|`-fdouble-square-bracket-attributes`|Supported|`Enable '[[]]' attributes in all C and C++ language modes`|
|
||||
|`-fdwarf-exceptions`|Unsupported|`Use DWARF style exceptions`|
|
||||
|`-feliminate-unused-debug-types`|Supported|`Do not emit debug info for defined but unused types`|
|
||||
|`-fembed-bitcode-marker`|Supported|`Embed placeholder LLVM IR data as a marker`|
|
||||
|`-fembed-bitcode=<option>`|Supported|`Embed LLVM bitcode (option: off, all, bitcode, marker)`|
|
||||
|`-fembed-bitcode`|Supported|`Embed LLVM IR bitcode as data`|
|
||||
|`-femit-all-decls`|Supported|`Emit all declarations, even if unused`|
|
||||
|`-femulated-tls`|Supported|`Use emutls functions to access thread_local variables`|
|
||||
|`-fenable-matrix`|Supported|`Enable matrix data type and related builtin functions`|
|
||||
|`-fexceptions`|Supported|`Enable support for exception handling`|
|
||||
|`-fexperimental-new-constant-interpreter`|Supported|`Enable the experimental new constant interpreter`|
|
||||
|`-fexperimental-new-pass-manager`|Supported|`Enables an experimental new pass manager in LLVM.`|
|
||||
|`-fexperimental-relative-c++-abi-vtables`|Supported|`Use the experimental C++ class ABI for classes with virtual tables`|
|
||||
|`-fexperimental-strict-floating-point`|Supported|`Enables experimental strict floating point in LLVM.`|
|
||||
|`-ffast-math`|Supported|`Allow aggressive, lossy floating-point optimizations`|
|
||||
|`-ffile-prefix-map=<value>`|Supported|`remap file source paths in debug info and predefined preprocessor macros`|
|
||||
|`-ffine-grained-bitfield-accesses`|Supported|`Use separate accesses for consecutive bitfield runs with legal widths and alignments.`|
|
||||
|`-ffixed-form`|Supported|`Enable fixed-form format for Fortran`|
|
||||
|`-ffixed-point`|Supported|`Enable fixed point types`|
|
||||
|`-ffixed-r19`|Unsupported|`Reserve register r19 (Hexagon only)`|
|
||||
|`-ffixed-r9`|Unsupported|`Reserve the r9 register (ARM only)`|
|
||||
|`-ffixed-x10`|Unsupported|`Reserve the x10 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x11`|Unsupported|`Reserve the x11 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x12`|Unsupported|`Reserve the x12 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x13`|Unsupported|`Reserve the x13 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x14`|Unsupported|`Reserve the x14 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x15`|Unsupported|`Reserve the x15 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x16`|Unsupported|`Reserve the x16 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x17`|Unsupported|`Reserve the x17 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x18`|Unsupported|`Reserve the x18 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x19`|Unsupported|`Reserve the x19 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x1`|Unsupported|`Reserve the x1 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x20`|Unsupported|`Reserve the x20 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x21`|Unsupported|`Reserve the x21 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x22`|Unsupported|`Reserve the x22 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x23`|Unsupported|`Reserve the x23 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x24`|Unsupported|`Reserve the x24 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x25`|Unsupported|`Reserve the x25 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x26`|Unsupported|`Reserve the x26 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x27`|Unsupported|`Reserve the x27 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x28`|Unsupported|`Reserve the x28 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x29`|Unsupported|`Reserve the x29 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x2`|Unsupported|`Reserve the x2 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x30`|Unsupported|`Reserve the x30 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x31`|Unsupported|`Reserve the x31 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x3`|Unsupported|`Reserve the x3 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x4`|Unsupported|`Reserve the x4 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x5`|Unsupported|`Reserve the x5 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x6`|Unsupported|`Reserve the x6 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x7`|Unsupported|`Reserve the x7 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x8`|Unsupported|`Reserve the x8 register (AArch64/RISC-V only)`|
|
||||
|`-ffixed-x9`|Unsupported|`Reserve the x9 register (AArch64/RISC-V only)`|
|
||||
|`-fforce-dwarf-frame`|Supported|`Always emit a debug frame section`|
|
||||
|`-fforce-emit-vtables`|Supported|`Emits more virtual tables to improve devirtualization`|
|
||||
|`-fforce-enable-int128`|Supported|`Enable support for int128_t type`|
|
||||
|`-ffp-contract=<value>`|Supported|`Form fused FP ops (e.g. FMAs): fast (everywhere) \| on (according to FP_CONTRACT pragma) \| off (never fuse). Default is 'fast' for CUDA/HIP and 'on' otherwise.`|
|
||||
|`-ffp-exception-behavior=<value>`|Supported|`Specifies the exception behavior of floating-point operations.`|
|
||||
|`-ffp-model=<value>`|Supported|`Controls the semantics of floating-point calculations.`|
|
||||
|`-ffree-form`|Supported|`Enable free-form format for Fortran`|
|
||||
|`-ffreestanding`|Supported|`Assert that the compilation takes place in a freestanding environment`|
|
||||
|`-ffunc-args-alias`|Supported|`Function argument may alias (equivalent to ansi alias)`|
|
||||
|`-ffunction-sections`|Supported|`Place each function in its own section`|
|
||||
|`-fglobal-isel`|Supported|`Enables the global instruction selector`|
|
||||
|`-fgnu-keywords`|Supported|`Allow GNU-extension keywords regardless of language standard`|
|
||||
|`-fgnu-runtime`|Unsupported|`Generate output compatible with the standard GNU Objective-C runtime`|
|
||||
|`-fgnu89-inline`|Unsupported|`Use the gnu89 inline semantics`|
|
||||
|`-fgnuc-version=<value>`|Supported|`Sets various macros to claim compatibility with the given GCC version (default is 4.2.1)`|
|
||||
|`-fgpu-allow-device-init`|Supported|`Allow device side init function in HIP`|
|
||||
|`-fgpu-rdc`|Supported|`Generate relocatable device code, also known as separate compilation mode`|
|
||||
|`-fhip-new-launch-api`|Supported|`Use new kernel launching API for HIP`|
|
||||
|`-fignore-exceptions`|Supported|`Enable support for ignoring exception handling constructs`|
|
||||
|`-fimplicit-module-maps`|Unsupported|`Implicitly search the file system for module map files.`|
|
||||
|`-finline-functions`|Supported|`Inline suitable functions`|
|
||||
|`-finline-hint-functions`|Supported|`Inline functions which are (explicitly or implicitly) marked inline`|
|
||||
|`-finstrument-function-entry-bare`|Unsupported|`Instrument function entry only, after inlining, without arguments to the instrumentation call`|
|
||||
|`-finstrument-functions-after-inlining`|Unsupported|`Like -finstrument-functions, but insert the calls after inlining`|
|
||||
|`-finstrument-functions`|Unsupported|`Generate calls to instrument function entry and exit`|
|
||||
|`-fintegrated-as`|Supported|`Enable the integrated assembler`|
|
||||
|`-fintegrated-cc1`|Supported|`Run cc1 in-process`|
|
||||
|`-fjump-tables`|Supported|`Use jump tables for lowering switches`|
|
||||
|`-fkeep-static-consts`|Supported|`Keep static const variables if unused`|
|
||||
|`-flax-vector-conversions=<value>`|Supported|`Enable implicit vector bit-casts`|
|
||||
|`-flto-jobs=<value>`|Unsupported|`Controls the backend parallelism of -flto=thin (default of 0 means the number of threads will be derived from the number of CPUs detected)`|
|
||||
|`-flto=<value>`|Unsupported|`Set LTO mode to either 'full' or 'thin'`|
|
||||
|`-flto`|Unsupported|`Enable LTO in 'full' mode`|
|
||||
|`-fmacro-prefix-map=<value>`|Supported|`remap file source paths in predefined preprocessor macros`|
|
||||
|`-fmath-errno`|Supported|`Require math functions to indicate errors by setting errno`|
|
||||
|`-fmax-tokens=<value>`|Supported|`Max total number of preprocessed tokens for -Wmax-tokens.`|
|
||||
|`-fmax-type-align=<value>`|Supported|`Specify the maximum alignment to enforce on pointers lacking an explicit alignment`|
|
||||
|`-fmemory-profile`|Supported|`Enable heap memory profiling`|
|
||||
|`-fmerge-all-constants`|Supported|`Allow merging of constants`|
|
||||
|`-fmessage-length=<value>`|Supported|`Format message diagnostics so that they fit within N columns`|
|
||||
|`-fmodule-file=[<name>=]<file>`|Unsupported|`Specify the mapping of module name to precompiled module file, or load a module file if name is omitted.`|
|
||||
|`-fmodule-map-file=<file>`|Unsupported|`Load this module map file`|
|
||||
|`-fmodule-name=<name>`|Unsupported|`Specify the name of the module to build`|
|
||||
|`-fmodules-cache-path=<directory>`|Unsupported|`Specify the module cache path`|
|
||||
|`-fmodules-decluse`|Unsupported|`Require declaration of modules used within a module`|
|
||||
|`-fmodules-disable-diagnostic-validation`|Unsupported|`Disable validation of the diagnostic options when loading the module`|
|
||||
|`-fmodules-ignore-macro=<value>`|Unsupported|`Ignore the definition of the given macro when building and loading modules`|
|
||||
|`-fmodules-prune-after=<seconds>`|Unsupported|`Specify the interval (in seconds) after which a module file will be considered unused`|
|
||||
|`-fmodules-prune-interval=<seconds>`|Unsupported|`Specify the interval (in seconds) between attempts to prune the module cache`|
|
||||
|`-fmodules-search-all`|Unsupported|`Search even non-imported modules to resolve references`|
|
||||
|`-fmodules-strict-decluse`|Unsupported|`Like -fmodules-decluse but requires all headers to be in modules`|
|
||||
|`-fmodules-ts`|Unsupported|`Enable support for the C++ Modules TS`|
|
||||
|`-fmodules-user-build-path <directory>`|Unsupported|`Specify the module user build path`|
|
||||
|`-fmodules-validate-input-files-content`|Supported|`Validate PCM input files based on content if mtime differs`|
|
||||
|`-fmodules-validate-once-per-build-session`|Unsupported|`Don't verify input files for the modules if the module has been successfully validated or loaded during this build session`|
|
||||
|`-fmodules-validate-system-headers`|Supported|`Validate the system headers that a module depends on when loading the module`|
|
||||
|`-fmodules`|Unsupported|`Enable the 'modules' language feature`|
|
||||
|`-fms-compatibility-version=<value>`|Supported|`Dot-separated value representing the Microsoft compiler version number to report in _MSC_VER (0 = don't define it (default))`|
|
||||
|`-fms-compatibility`|Supported|`Enable full Microsoft Visual C++ compatibility`|
|
||||
|`-fms-extensions`|Supported|`Accept some non-standard constructs supported by the Microsoft compiler`|
|
||||
|`-fmsc-version=<value>`|Supported|`Microsoft compiler version number to report in _MSC_VER (0 = don't define it (default))`|
|
||||
|`-fnew-alignment=<align>`|Supported|`Specifies the largest alignment guaranteed by '::operator new(size_t)'`|
|
||||
|`-fno-addrsig`|Supported|`Don't emit an address-significance table`|
|
||||
|`-fno-allow-fortran-gnu-ext`|Supported|`Do not allow Fortran GNU extensions`|
|
||||
|`-fno-assume-sane-operator-new`|Supported|`Don't assume that C++'s global operator new can't alias any pointer`|
|
||||
|`-fno-autolink`|Supported|`Disable generation of linker directives for automatic library linking`|
|
||||
|`-fno-backslash`|Supported|`Treat backslash like any other character in character strings`|
|
||||
|`-fno-builtin-<value>`|Supported|`Disable implicit builtin knowledge of a specific function`|
|
||||
|`-fno-builtin`|Supported|`Disable implicit builtin knowledge of functions`|
|
||||
|`-fno-c++-static-destructors`|Supported|`Disable C++ static destructor registration`|
|
||||
|`-fno-char8_t`|Supported|`Disable C++ builtin type char8_t`|
|
||||
|`-fno-color-diagnostics`|Supported|`Disable colors in diagnostics`|
|
||||
|`-fno-common`|Supported|`Compile common globals like normal definitions`|
|
||||
|`-fno-complete-member-pointers`|Supported|`Do not require member pointer base types to be complete if they would be significant under the Microsoft ABI`|
|
||||
|`-fno-constant-cfstrings`|Supported|`Disable creation of CodeFoundation-type constant strings`|
|
||||
|`-fno-coverage-mapping`|Supported|`Disable code coverage analysis`|
|
||||
|`-fno-crash-diagnostics`|Supported|`Disable auto-generation of preprocessed source files and a script for reproduction during a clang crash`|
|
||||
|`-fno-cuda-approx-transcendentals`|Unsupported|`Don't use approximate transcendental functions`|
|
||||
|`-fno-debug-macro`|Supported|`Do not emit macro debug information`|
|
||||
|`-fno-declspec`|Unsupported|`Disallow __declspec as a keyword`|
|
||||
|`-fno-delayed-template-parsing`|Supported|`Disable delayed template parsing`|
|
||||
|`-fno-delete-null-pointer-checks`|Supported|`Do not treat usage of null pointers as undefined behavior`|
|
||||
|`-fno-diagnostics-fixit-info`|Supported|`Do not include fixit information in diagnostics`|
|
||||
|`-fno-digraphs`|Supported|`Disallow alternative token representations '<:', ':>', '<%', '%>', '%:', '%:%:'`|
|
||||
|`-fno-discard-value-names`|Supported|`Do not discard value names in LLVM IR`|
|
||||
|`-fno-dollars-in-identifiers`|Supported|`Disallow '$' in identifiers`|
|
||||
|`-fno-double-square-bracket-attributes`|Supported|`Disable '[[]]' attributes in all C and C++ language modes`|
|
||||
|`-fno-elide-constructors`|Supported|`Disable C++ copy constructor elision`|
|
||||
|`-fno-elide-type`|Supported|`Do not elide types when printing diagnostics`|
|
||||
|`-fno-eliminate-unused-debug-types`|Supported|`Emit debug info for defined but unused types`|
|
||||
|`-fno-exceptions`|Supported|`Disable support for exception handling`|
|
||||
|`-fno-experimental-new-pass-manager`|Supported|`Disables an experimental new pass manager in LLVM.`|
|
||||
|`-fno-experimental-relative-c++-abi-vtables`|Supported|`Do not use the experimental C++ class ABI for classes with virtual tables`|
|
||||
|`-fno-fine-grained-bitfield-accesses`|Supported|`Use large-integer access for consecutive bitfield runs.`|
|
||||
|`-fno-fixed-form`|Supported|`Disable fixed-form format for Fortran`|
|
||||
|`-fno-fixed-point`|Supported|`Disable fixed point types`|
|
||||
|`-fno-force-enable-int128`|Supported|`Disable support for int128_t type`|
|
||||
|`-fno-fortran-main`|Supported|`Don't link in Fortran main`|
|
||||
|`-fno-free-form`|Supported|`Disable free-form format for Fortran`|
|
||||
|`-fno-func-args-alias`|Supported|`Function argument may alias (equivalent to ansi alias)`|
|
||||
|`-fno-global-isel`|Supported|`Disables the global instruction selector`|
|
||||
|`-fno-gnu-inline-asm`|Supported|`Disable GNU style inline asm`|
|
||||
|`-fno-gpu-allow-device-init`|Supported|`Don't allow device side init function in HIP`|
|
||||
|`-fno-hip-new-launch-api`|Supported|`Don't use new kernel launching API for HIP`|
|
||||
|`-fno-integrated-as`|Supported|`Disable the integrated assembler`|
|
||||
|`-fno-integrated-cc1`|Supported|`Spawn a separate process for each cc1`|
|
||||
|`-fno-jump-tables`|Supported|`Do not use jump tables for lowering switches`|
|
||||
|`-fno-keep-static-consts`|Supported|`Don't keep static const variables if unused`|
|
||||
|`-fno-lto`|Supported|`Disable LTO mode (default)`|
|
||||
|`-fno-memory-profile`|Supported|`Disable heap memory profiling`|
|
||||
|`-fno-merge-all-constants`|Supported|`Disallow merging of constants`|
|
||||
|`-fno-no-access-control`|Supported|`Disable C++ access control`|
|
||||
|`-fno-objc-infer-related-result-type`|Supported|`Do not infer Objective-C related result type based on method family`|
|
||||
|`-fno-operator-names`|Supported|`Do not treat C++ operator name keywords as synonyms for operators`|
|
||||
|`-fno-pch-codegen`|Supported|`Do not generate code for uses of this PCH that assumes an explicit object file will be built for the PCH`|
|
||||
|`-fno-pch-debuginfo`|Supported|`Do not generate debug info for types in an object file built from this PCH and do not generate them elsewhere`|
|
||||
|`-fno-plt`|Supported|`Use GOT indirection instead of PLT to make external function calls (x86 only)`|
|
||||
|`-fno-preserve-as-comments`|Supported|`Do not preserve comments in inline assembly`|
|
||||
|`-fno-profile-generate`|Supported|`Disable generation of profile instrumentation.`|
|
||||
|`-fno-profile-instr-generate`|Supported|`Disable generation of profile instrumentation.`|
|
||||
|`-fno-profile-instr-use`|Supported|`Disable using instrumentation data for profile-guided optimization`|
|
||||
|`-fno-register-global-dtors-with-atexit`|Supported|`Don't use atexit or __cxa_atexit to register global destructors`|
|
||||
|`-fno-rtlib-add-rpath`|Supported|`Do not add -rpath with architecture-specific resource directory to the linker flags`|
|
||||
|`-fno-rtti-data`|Supported|`Disable generation of RTTI data`|
|
||||
|`-fno-rtti`|Supported|`Disable generation of rtti information`|
|
||||
|`-fno-sanitize-address-poison-custom-array-cookie`|Supported on Host only|`Disable poisoning array cookies when using custom operator new[] in AddressSanitizer`|
|
||||
|`-fno-sanitize-address-use-after-scope`|Supported on Host only|`Disable use-after-scope detection in AddressSanitizer`|
|
||||
|`-fno-sanitize-address-use-odr-indicator`|Supported on Host only|`Disable ODR indicator globals`|
|
||||
|`-fno-sanitize-blacklist`|Supported on Host only|`Don't use blacklist file for sanitizers`|
|
||||
|`-fno-sanitize-cfi-canonical-jump-tables`|Supported on Host only|`Do not make the jump table addresses canonical in the symbol table`|
|
||||
|`-fno-sanitize-cfi-cross-dso`|Supported on Host only|`Disable control flow integrity (CFI) checks for cross-DSO calls.`|
|
||||
|`-fno-sanitize-coverage=<value>`|Supported on Host only|`Disable specified features of coverage instrumentation for Sanitizers`|
|
||||
|`-fno-sanitize-memory-track-origins`|Supported on Host only|`Disable origins tracking in MemorySanitizer`|
|
||||
|`-fno-sanitize-memory-use-after-dtor`|Supported on Host only|`Disable use-after-destroy detection in MemorySanitizer`|
|
||||
|`-fno-sanitize-recover=<value>`|Supported on Host only|`Disable recovery for specified sanitizers`|
|
||||
|`-fno-sanitize-stats`|Supported on Host only|`Disable sanitizer statistics gathering.`|
|
||||
|`-fno-sanitize-thread-atomics`|Supported on Host only|`Disable atomic operations instrumentation in ThreadSanitizer`|
|
||||
|`-fno-sanitize-thread-func-entry-exit`|Supported on Host only|`Disable function entry/exit instrumentation in ThreadSanitizer`|
|
||||
|`-fno-sanitize-thread-memory-access`|Supported on Host only|`Disable memory access instrumentation in ThreadSanitizer`|
|
||||
|`-fno-sanitize-trap=<value>`|Supported on Host only|`Disable trapping for specified sanitizers`|
|
||||
|`-fno-sanitize-trap`|Supported on Host only|`Disable trapping for all sanitizers`|
|
||||
|`-fno-short-wchar`|Supported|`Force wchar_t to be an unsigned int`|
|
||||
|`-fno-show-column`|Supported|`Do not include column number on diagnostics`|
|
||||
|`-fno-show-source-location`|Supported|`Do not include source location information with diagnostics`|
|
||||
|`-fno-signed-char`|Supported|`char is unsigned`|
|
||||
|`-fno-signed-zeros`|Supported|`Allow optimizations that ignore the sign of floating point zeros`|
|
||||
|`-fno-spell-checking`|Supported|`Disable spell-checking`|
|
||||
|`-fno-split-machine-functions`|Supported|`Disable late function splitting using profile information (x86 ELF)`|
|
||||
|`-fno-stack-clash-protection`|Supported|`Disable stack clash protection`|
|
||||
|`-fno-stack-protector`|Supported|`Disable the use of stack protectors`|
|
||||
|`-fno-standalone-debug`|Supported|`Limit debug information produced to reduce size of debug binary`|
|
||||
|`-fno-strict-float-cast-overflow`|Supported|`Relax language rules and try to match the behavior of the target's native float-to-int conversion instructions`|
|
||||
|`-fno-strict-return`|Supported|`Don't treat control flow paths that fall off the end of a non-void function as unreachable`|
|
||||
|`-fno-sycl`|Unsupported|`Disable SYCL kernels compilation for device`|
|
||||
|`-fno-temp-file`|Supported|`Directly create compilation output files. This may lead to incorrect incremental builds if the compiler crashes`|
|
||||
|`-fno-threadsafe-statics`|Supported|`Do not emit code to make initialization of local statics thread safe`|
|
||||
|`-fno-trigraphs`|Supported|`Do not process trigraph sequences`|
|
||||
|`-fno-unique-section-names`|Supported|`Don't use unique names for text and data sections`|
|
||||
|`-fno-unroll-loops`|Supported|`Turn off loop unroller`|
|
||||
|`-fno-use-cxa-atexit`|Supported|`Don't use __cxa_atexit for calling destructors`|
|
||||
|`-fno-use-flang-math-libs`|Supported|`Use LLVM math intrinsics instead of Flang internal runtime math library.`|
|
||||
|`-fno-use-init-array`|Supported|`Use .ctors/.dtors instead of .init_array/.fini_array`|
|
||||
|`-fno-visibility-inlines-hidden-static-local-var`|Supported|`Disables -fvisibility-inlines-hidden-static-local-var (this is the default on non-darwin targets)`|
|
||||
|`-fno-xray-function-index`|Unsupported|`Omit function index section at the expense of single-function patching performance`|
|
||||
|`-fno-zero-initialized-in-bss`|Supported|`Don't place zero initialized data in BSS`|
|
||||
|`-fobjc-arc-exceptions`|Unsupported|`Use EH-safe code when synthesizing retains and releases in -fobjc-arc`|
|
||||
|`-fobjc-arc`|Unsupported|`Synthesize retain and release calls for Objective-C pointers`|
|
||||
|`-fobjc-exceptions`|Unsupported|`Enable Objective-C exceptions`|
|
||||
|`-fobjc-runtime=<value>`|Unsupported|`Specify the target Objective-C runtime kind and version`|
|
||||
|`-fobjc-weak`|Unsupported|`Enable ARC-style weak references in Objective-C`|
|
||||
|`-fopenmp-simd`|Unsupported|`Emit OpenMP code only for SIMD-based constructs.`|
|
||||
|`-fopenmp-targets=<value>`|Unsupported|`Specify comma-separated list of triples OpenMP offloading targets to be supported`|
|
||||
|`-fopenmp`|Unsupported|`Parse OpenMP pragmas and generate parallel code.`|
|
||||
|`-foptimization-record-file=<file>`|Supported|`Specify the output name of the file containing the optimization remarks. Implies -fsave-optimization-record. On Darwin platforms, this cannot be used with multiple -arch <arch> options.`|
|
||||
|`-foptimization-record-passes=<regex>`|Supported|`Only include passes which match a specified regular expression in the generated optimization record (by default, include all passes)`|
|
||||
|`-forder-file-instrumentation`|Supported|`Generate instrumented code to collect order file into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var)`|
|
||||
|`-fpack-struct=<value>`|Unsupported|`Specify the default maximum struct packing alignment`|
|
||||
|`-fpascal-strings`|Supported|`Recognize and construct Pascal-style string literals`|
|
||||
|`-fpass-plugin=<dsopath>`|Supported|`Load pass plugin from a dynamic shared object file (only with new pass manager).`|
|
||||
|`-fpatchable-function-entry=<N,M>`|Supported|`Generate M NOPs before function entry and N-M NOPs after function entry`|
|
||||
|`-fpcc-struct-return`|Unsupported|`Override the default ABI to return all structs on the stack`|
|
||||
|`-fpch-codegen`|Supported|`Generate code for uses of this PCH that assumes an explicit object file will be built for the PCH`|
|
||||
|`-fpch-debuginfo`|Supported|`Generate debug info for types in an object file built from this PCH and do not generate them elsewhere`|
|
||||
|`-fpch-instantiate-templates`|Supported|`Instantiate templates already while building a PCH`|
|
||||
|`-fpch-validate-input-files-content`|Supported|`Validate PCH input files based on content if mtime differs`|
|
||||
|`-fplugin=<dsopath>`|Supported|`Load the named plugin (dynamic shared object)`|
|
||||
|`-fprebuilt-module-path=<directory>`|Unsupported|`Specify the prebuilt module path`|
|
||||
|`-fprofile-exclude-files=<value>`|Unsupported|`Instrument only functions from files where names don't match all the regexes separated by a semi-colon`|
|
||||
|`-fprofile-filter-files=<value>`|Unsupported|`Instrument only functions from files where names match any regex separated by a semi-colon`|
|
||||
|`-fprofile-generate=<directory>`|Unsupported|`Generate instrumented code to collect execution counts into <directory>/default.profraw (overridden by LLVM_PROFILE_FILE env var)`|
|
||||
|`-fprofile-generate`|Unsupported|`Generate instrumented code to collect execution counts into default.profraw (overridden by LLVM_PROFILE_FILE env var)`|
|
||||
|`-fprofile-instr-generate=<file>`|Unsupported|`Generate instrumented code to collect execution counts into <file> (overridden by LLVM_PROFILE_FILE env var)`|
|
||||
|`-fprofile-instr-generate`|Unsupported|`Generate instrumented code to collect execution counts into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var)`|
|
||||
|`-fprofile-instr-use=<value>`|Unsupported|`Use instrumentation data for profile-guided optimization`|
|
||||
|`-fprofile-remapping-file=<file>`|Unsupported|`Use the remappings described in <file> to match the profile data against names in the program`|
|
||||
|`-fprofile-sample-accurate`|Unsupported|`Specifies that the sample profile is accurate`|
|
||||
|`-fprofile-sample-use=<value>`|Unsupported|`Enable sample-based profile guided optimizations`|
|
||||
|`-fprofile-use=<pathname>`|Unsupported|`Use instrumentation data for profile-guided optimization. If pathname is a directory, it reads from <pathname>/default.profdata. Otherwise, it reads from file <pathname>.`|
|
||||
|`-freciprocal-math`|Supported|`Allow division operations to be reassociated`|
|
||||
|`-freg-struct-return`|Unsupported|`Override the default ABI to return small structs in registers`|
|
||||
|`-fregister-global-dtors-with-atexit`|Supported|`Use atexit or __cxa_atexit to register global destructors`|
|
||||
|`-frelaxed-template-template-args`|Supported|`Enable C++17 relaxed template template argument matching`|
|
||||
|`-freroll-loops`|Supported|`Turn on loop reroller`|
|
||||
|`-fropi`|Unsupported|`Generate read-only position independent code (ARM only)`|
|
||||
|`-frtlib-add-rpath`|Supported|`Add -rpath with architecture-specific resource directory to the linker flags`|
|
||||
|`-frwpi`|Unsupported|`Generate read-write position independent code (ARM only)`|
|
||||
|`-fsanitize-address-field-padding=<value>`|Supported on Host only|`Level of field padding for AddressSanitizer`|
|
||||
|`-fsanitize-address-globals-dead-stripping`|Supported on Host only|`Enable linker dead stripping of globals in AddressSanitizer`|
|
||||
|`-fsanitize-address-poison-custom-array-cookie`|Supported on Host only|`Enable poisoning array cookies when using custom operator new[] in AddressSanitizer`|
|
||||
|`-fsanitize-address-use-after-scope`|Supported on Host only|`Enable use-after-scope detection in AddressSanitizer`|
|
||||
|`-fsanitize-address-use-odr-indicator`|Supported on Host only|`Enable ODR indicator globals to avoid false ODR violation reports in partially sanitized programs at the cost of an increase in binary size`|
|
||||
|`-fsanitize-blacklist=<value>`|Supported on Host only|`Path to blacklist file for sanitizers`|
|
||||
|`-fsanitize-cfi-canonical-jump-tables`|Supported on Host only|`Make the jump table addresses canonical in the symbol table`|
|
||||
|`-fsanitize-cfi-cross-dso`|Supported on Host only|`Enable control flow integrity (CFI) checks for cross-DSO calls.`|
|
||||
|`-fsanitize-cfi-icall-generalize-pointers`|Supported on Host only|`Generalize pointers in CFI indirect call type signature checks`|
|
||||
|`-fsanitize-coverage-allowlist=<value>`|Supported on Host only|`Restrict sanitizer coverage instrumentation exclusively to modules and functions that match the provided special case list, except the blocked ones`|
|
||||
|`-fsanitize-coverage-blacklist=<value>`|Supported on Host only|`Deprecated, use -fsanitize-coverage-blocklist= instead`|
|
||||
|`-fsanitize-coverage-blocklist=<value>`|Supported on Host only|`Disable sanitizer coverage instrumentation for modules and functions that match the provided special case list, even the allowed ones`|
|
||||
|`-fsanitize-coverage-whitelist=<value>`|Supported on Host only|`Deprecated, use -fsanitize-coverage-allowlist= instead`|
|
||||
|`-fsanitize-coverage=<value>`|Supported on Host only|`Specify the type of coverage instrumentation for Sanitizers`|
|
||||
|`-fsanitize-hwaddress-abi=<value>`|Supported on Host only|`Select the HWAddressSanitizer ABI to target (interceptor or platform, default interceptor). This option is currently unused.`|
|
||||
|`-fsanitize-memory-track-origins=<value>`|Supported on Host only|`Enable origins tracking in MemorySanitizer`|
|
||||
|`-fsanitize-memory-track-origins`|Supported on Host only|`Enable origins tracking in MemorySanitizer`|
|
||||
|`-fsanitize-memory-use-after-dtor`|Supported on Host only|`Enable use-after-destroy detection in MemorySanitizer`|
|
||||
|`-fsanitize-recover=<value>`|Supported on Host only|`Enable recovery for specified sanitizers`|
|
||||
|`-fsanitize-stats`|Supported on Host only|`Enable sanitizer statistics gathering.`|
|
||||
|`-fsanitize-system-blacklist=<value>`|Supported on Host only|`Path to system blacklist file for sanitizers`|
|
||||
|`-fsanitize-thread-atomics`|Supported on Host only|`Enable atomic operations instrumentation in ThreadSanitizer (default)`|
|
||||
|`-fsanitize-thread-func-entry-exit`|Supported on Host only|`Enable function entry/exit instrumentation in ThreadSanitizer (default)`|
|
||||
|`-fsanitize-thread-memory-access`|Supported on Host only|`Enable memory access instrumentation in ThreadSanitizer (default)`|
|
||||
|`-fsanitize-trap=<value>`|Supported on Host only|`Enable trapping for specified sanitizers`|
|
||||
|`-fsanitize-trap`|Supported on Host only|`Enable trapping for all sanitizers`|
|
||||
|`-fsanitize-undefined-strip-path-components=<number>`|Supported on Host only|`Strip (or keep only, if negative) a given number of path components when emitting check metadata.`|
|
||||
|`-fsanitize=<check>`|Supported on Host only|`Turn on runtime checks for various forms of undefined or suspicious behavior. See user manual for available checks`|
|
||||
|`-fsave-optimization-record=<format>`|Supported|`Generate an optimization record file in a specific format`|
|
||||
|`-fsave-optimization-record`|Supported|`Generate a YAML optimization record file`|
|
||||
|`-fseh-exceptions`|Supported|`Use SEH style exceptions`|
|
||||
|`-fshort-enums`|Supported|`Allocate to an enum type only as many bytes as it needs for the declared range of possible values`|
|
||||
|`-fshort-wchar`|Unsupported|`Force wchar_t to be a short unsigned int`|
|
||||
|`-fshow-overloads=<value>`|Supported|`Which overload candidates to show when overload resolution fails: best\|all; defaults to all`|
|
||||
|`-fsigned-char`|Supported|`char is signed`|
|
||||
|`-fsized-deallocation`|Supported|`Enable C++14 sized global deallocation functions`|
|
||||
|`-fsjlj-exceptions`|Supported|`Use SjLj style exceptions`|
|
||||
|`-fslp-vectorize`|Supported|`Enable the superword-level parallelism vectorization passes`|
|
||||
|`-fsplit-dwarf-inlining`|Unsupported|`Provide minimal debug info in the object/executable to facilitate online symbolication/stack traces in the absence of .dwo/.dwp files when using Split DWARF`|
|
||||
|`-fsplit-lto-unit`|Unsupported|`Enables splitting of the LTO unit`|
|
||||
|`-fsplit-machine-functions`|Supported|`Enable late function splitting using profile information (x86 ELF)`|
|
||||
|`-fstack-clash-protection`|Supported|`Enable stack clash protection`|
|
||||
|`-fstack-protector-all`|Unsupported|`Enable stack protectors for all functions`|
|
||||
|`-fstack-protector-strong`|Unsupported|`Enable stack protectors for some functions vulnerable to stack smashing. Compared to -fstack-protector, this uses a stronger heuristic that includes functions containing arrays of any size (and any type), as well as any calls to alloca or the taking of an address from a local variable`|
|
||||
|`-fstack-protector`|Unsupported|`Enable stack protectors for some functions vulnerable to stack smashing. This uses a loose heuristic which considers functions vulnerable if they contain a char (or 8bit integer) array or constant sized calls to alloca, which are of greater size than ssp-buffer-size (default: 8 bytes). All variable sized calls to alloca are considered vulnerable. A function with a stack protector has a guard value added to the stack frame that is checked on function exit. The guard value must be positioned in the stack frame such that a buffer overflow from a vulnerable variable will overwrite the guard value before overwriting the function's return address. The reference stack guard value is stored in a global variable.`|
|
||||
|`-fstack-size-section`|Supported|`Emit section containing metadata on function stack sizes`|
|
||||
|`-fstandalone-debug`|Supported|`Emit full debug info for all types used by the program`|
|
||||
|`-fstrict-enums`|Supported|`Enable optimizations based on the strict definition of an enum's value range`|
|
||||
|`-fstrict-float-cast-overflow`|Supported|`Assume that overflowing float-to-int casts are undefined (default)`|
|
||||
|`-fstrict-vtable-pointers`|Supported|`Enable optimizations based on the strict rules for overwriting polymorphic C++ objects`|
|
||||
|`-fsycl`|Unsupported|`Enable SYCL kernels compilation for device`|
|
||||
|`-fsystem-module`|Unsupported|`Build this module as a system module. Only used with -emit-module`|
|
||||
|`-fthin-link-bitcode=<value>`|Supported|`Write minimized bitcode to <file> for the ThinLTO thin link only`|
|
||||
|`-fthinlto-index=<value>`|Unsupported|`Perform ThinLTO importing using provided function summary index`|
|
||||
|`-ftime-trace-granularity=<value>`|Supported|`Minimum time granularity (in microseconds) traced by time profiler`|
|
||||
|`-ftime-trace`|Supported|`Turn on time profiler. Generates JSON file based on output filename.`|
|
||||
|`-ftrap-function=<value>`|Unsupported|`Issue call to specified function rather than a trap instruction`|
|
||||
|`-ftrapv-handler=<function name>`|Unsupported|`Specify the function to be called on overflow`|
|
||||
|`-ftrapv`|Unsupported|`Trap on integer overflow`|
|
||||
|`-ftrigraphs`|Supported|`Process trigraph sequences`|
|
||||
|`-ftrivial-auto-var-init-stop-after=<value>`|Supported|`Stop initializing trivial automatic stack variables after the specified number of instances`|
|
||||
|`-ftrivial-auto-var-init=<value>`|Supported|`Initialize trivial automatic stack variables: uninitialized (default) \| pattern`|
|
||||
|`-funique-basic-block-section-names`|Supported|`Use unique names for basic block sections (ELF Only)`|
|
||||
|`-funique-internal-linkage-names`|Supported|`Uniqueify Internal Linkage Symbol Names by appending the MD5 hash of the module path`|
|
||||
|`-funroll-loops`|Supported|`Turn on loop unroller`|
|
||||
|`-fuse-flang-math-libs`|Supported|`Use Flang internal runtime math library instead of LLVM math intrinsics.`|
|
||||
|`-fuse-line-directives`|Supported|`Use #line in preprocessed output`|
|
||||
|`-fvalidate-ast-input-files-content`|Supported|`Compute and store the hash of input files used to build an AST. Files with mismatching mtimes are considered valid if their contents are identical`|
|
||||
|`-fveclib=<value>`|Unsupported|`Use the given vector functions library`|
|
||||
|`-fvectorize`|Unsupported|`Enable the loop vectorization passes`|
|
||||
|`-fverbose-asm`|Supported|`Generate verbose assembly output`|
|
||||
|`-fvirtual-function-elimination`|Supported|`Enables dead virtual function elimination optimization. Requires -flto=full`|
|
||||
|`-fvisibility-global-new-delete-hidden`|Supported|`Give global C++ operator new and delete declarations hidden visibility`|
|
||||
|`-fvisibility-inlines-hidden-static-local-var`|Supported|`When -fvisibility-inlines-hidden is enabled, static variables in inline C++ member functions will also be given hidden visibility by default`|
|
||||
|`-fvisibility-inlines-hidden`|Supported|`Give inline C++ member functions hidden visibility by default`|
|
||||
|`-fvisibility-ms-compat`|Supported|`Give global types 'default' visibility and global functions and variables 'hidden' visibility by default`|
|
||||
|`-fvisibility=<value>`|Supported|`Set the default symbol visibility for all global declarations`|
|
||||
|`-fwasm-exceptions`|Unsupported|`Use WebAssembly style exceptions`|
|
||||
|`-fwhole-program-vtables`|Unsupported|`Enables whole-program vtable optimization. Requires -flto`|
|
||||
|`-fwrapv`|Supported|`Treat signed integer overflow as two's complement`|
|
||||
|`-fwritable-strings`|Supported|`Store string literals as writable data`|
|
||||
|`-fxray-always-emit-customevents`|Unsupported|`Always emit __xray_customevent(...) calls even if the containing function is not always instrumented`|
|
||||
|`-fxray-always-emit-typedevents`|Unsupported|`Always emit __xray_typedevent(...) calls even if the containing function is not always instrumented`|
|
||||
|`-fxray-always-instrument= <value>`|Unsupported|`DEPRECATED: Filename defining the whitelist for imbuing the 'always instrument' XRay attribute.`|
|
||||
|`-fxray-attr-list= <value>`|Unsupported|`Filename defining the list of functions/types for imbuing XRay attributes.`|
|
||||
|`-fxray-ignore-loops`|Unsupported|`Don't instrument functions with loops unless they also meet the minimum function size`|
|
||||
|`-fxray-instruction-threshold= <value>`|Unsupported|`Sets the minimum function size to instrument with XRay`|
|
||||
|`-fxray-instrumentation-bundle= <value>`|Unsupported|`Select which XRay instrumentation points to emit. Options: all, none, function-entry, function-exit, function, custom. Default is 'all'. 'function' includes both 'function-entry' and 'function-exit'.`|
|
||||
|`-fxray-instrument`|Unsupported|`Generate XRay instrumentation sleds on function entry and exit`|
|
||||
|`-fxray-link-deps`|Unsupported|`Tells clang to add the link dependencies for XRay.`|
|
||||
|`-fxray-modes= <value>`|Unsupported|`List of modes to link in by default into XRay instrumented binaries.`|
|
||||
|`-fxray-never-instrument= <value>`|Unsupported|`DEPRECATED: Filename defining the whitelist for imbuing the 'never instrument' XRay attribute.`|
|
||||
|`-fzvector`|Supported|`Enable System z vector language extension`|
|
||||
|`-F <value>`|Unsupported|`Add directory to framework include search path`|
|
||||
|`--gcc-toolchain=<value>`|Supported|`Use the gcc toolchain at the given directory`|
|
||||
|`-gcodeview-ghash`|Supported|`Emit type record hashes in a .debug$H section`|
|
||||
|`-gcodeview`|Supported|`Generate CodeView debug information`|
|
||||
|`-gdwarf-2`|Supported|`Generate source-level debug information with dwarf version 2`|
|
||||
|`-gdwarf-3`|Supported|`Generate source-level debug information with dwarf version 3`|
|
||||
|`-gdwarf-4`|Supported|`Generate source-level debug information with dwarf version 4`|
|
||||
|`-gdwarf-5`|Supported|`Generate source-level debug information with dwarf version 5`|
|
||||
|`-gdwarf`|Supported|`Generate source-level debug information with the default dwarf version`|
|
||||
|`-gembed-source`|Supported|`Embed source text in DWARF debug sections`|
|
||||
|`-gline-directives-only`|Supported|`Emit debug line info directives only`|
|
||||
|`-gline-tables-only`|Supported|`Emit debug line number tables only`|
|
||||
|`-gmodules`|Supported|`Generate debug info with external references to clang modules or precompiled headers`|
|
||||
|`-gno-embed-source`|Supported|`Restore the default behavior of not embedding source text in DWARF debug sections`|
|
||||
|`-gno-inline-line-tables`|Supported|`Don't emit inline line tables`|
|
||||
|`--gpu-max-threads-per-block=<value>`|Supported|`Default max threads per block for kernel launch bounds for HIP`|
|
||||
|`-gsplit-dwarf=<value>`|Supported|`Set DWARF fission mode to either 'split' or 'single'`|
|
||||
|`-gz=<value>`|Supported|`DWARF debug sections compression type`|
|
||||
|`-gz`|Supported|`DWARF debug sections compression type`|
|
||||
|`-G <size>`|Unsupported|`Put objects of at most <size> bytes into small data section (MIPS / Hexagon)`|
|
||||
|`-g`|Supported|`Generate source-level debug information`|
|
||||
|`--help-hidden`|Supported|`Display help for hidden options`|
|
||||
|`-help`|Supported|`Display available options`|
|
||||
|`--hip-device-lib=<value>`|Supported|`HIP device library`|
|
||||
|`--hip-link`|Supported|`Link clang-offload-bundler bundles for HIP`|
|
||||
|`--hip-version=<value>`|Supported|`HIP version in the format of major.minor.patch`|
|
||||
|`-H`|Supported|`Show header includes and nesting depth`|
|
||||
|`-I-`|Supported|`Restrict all prior -I flags to double-quoted inclusion and remove current directory from include path`|
|
||||
|`-ibuiltininc`|Supported|`Enable builtin #include directories even when -nostdinc is used before or after -ibuiltininc. Using -nobuiltininc after the option disables it`|
|
||||
|`-idirafter <value>`|Supported|`Add directory to AFTER include search path`|
|
||||
|`-iframeworkwithsysroot <directory>`|Unsupported|`Add directory to SYSTEM framework search path, absolute paths are relative to -isysroot`|
|
||||
|`-iframework <value>`|Unsupported|`Add directory to SYSTEM framework search path`|
|
||||
|`-imacros <file>`|Supported|`Include macros from file before parsing`|
|
||||
|`-include-pch <file>`|Supported|`Include precompiled header file`|
|
||||
|`-include <file>`|Supported|`Include file before parsing`|
|
||||
|`-index-header-map`|Supported|`Make the next included directory (-I or -F) an indexer header map`|
|
||||
|`-iprefix <dir>`|Supported|`Set the -iwithprefix/-iwithprefixbefore prefix`|
|
||||
|`-iquote <directory>`|Supported|`Add directory to QUOTE include search path`|
|
||||
|`-isysroot <dir>`|Supported|`Set the system root directory (usually /)`|
|
||||
|`-isystem-after <directory>`|Supported|`Add directory to end of the SYSTEM include search path`|
|
||||
|`-isystem <directory>`|Supported|`Add directory to SYSTEM include search path`|
|
||||
|`-ivfsoverlay <value>`|Supported|`Overlay the virtual filesystem described by file over the real file system`|
|
||||
|`-iwithprefixbefore <dir>`|Supported|`Set directory to include search path with prefix`|
|
||||
|`-iwithprefix <dir>`|Supported|`Set directory to SYSTEM include search path with prefix`|
|
||||
|`-iwithsysroot <directory>`|Supported|`Add directory to SYSTEM include search path, absolute paths are relative to -isysroot`|
|
||||
|`-I <dir>`|Supported|`Add directory to include search path. If there are multiple -I options, these directories are searched in the order they are given before the standard system directories are searched. If the same directory is in the SYSTEM include search paths, for example if also specified with -isystem, the -I option will be ignored`|
|
||||
|`--libomptarget-nvptx-path=<value>`|Unsupported|`Path to libomptarget-nvptx libraries`|
|
||||
|`-L <dir>`|Supported|`Add directory to library search path`|
|
||||
|`-mabicalls`|Unsupported|`Enable SVR4-style position-independent code (Mips only)`|
|
||||
|`-maix-struct-return`|Unsupported|`Return all structs in memory (PPC32 only)`|
|
||||
|`-malign-branch-boundary=<value>`|Supported|`Specify the boundary's size to align branches`|
|
||||
|`-malign-branch=<value>`|Supported|`Specify types of branches to align`|
|
||||
|`-malign-double`|Supported|`Align doubles to two words in structs (x86 only)`|
|
||||
|`-Mallocatable=<value>`|Unsupported|`Select semantics for assignments to allocatables (F03 or F95)`|
|
||||
|`-mbackchain`|Unsupported|`Link stack frames through backchain on System Z`|
|
||||
|`-mbranch-protection=<value>`|Unsupported|`Enforce targets of indirect branches and function returns`|
|
||||
|`-mbranches-within-32B-boundaries`|Supported|`Align selected branches (fused, jcc, jmp) within 32-byte boundary`|
|
||||
|`-mcmodel=medany`|Unsupported|`Equivalent to -mcmodel=medium, compatible with RISC-V gcc.`|
|
||||
|`-mcmodel=medlow`|Unsupported|`Equivalent to -mcmodel=small, compatible with RISC-V gcc.`|
|
||||
|`-mcmse`|Unsupported|`Allow use of CMSE (Armv8-M Security Extensions)`|
|
||||
|`-mcode-object-v3`|Supported|`Legacy option to specify code object ABI V2 (-mno-code-object-v3) or V3 (-mcode-object-v3) (AMDGPU only)`|
|
||||
|`-mcode-object-version=<version>`|Supported|`Specify code object ABI version. Defaults to 4. (AMDGPU only)`|
|
||||
|`-mcrc`|Unsupported|`Allow use of CRC instructions (ARM/Mips only)`|
|
||||
|`-mcumode`|Supported|`Specify CU (-mcumode) or WGP (-mno-cumode) wavefront execution mode (AMDGPU only)`|
|
||||
|`-mdouble=<value>`|Supported|`Force double to be 32 bits or 64 bits`|
|
||||
|`-MD`|Supported|`Write a depfile containing user and system headers`|
|
||||
|`-meabi <value>`|Supported|`Set EABI type, e.g. 4, 5 or gnu (default depends on triple)`|
|
||||
|`-membedded-data`|Unsupported|`Place constants in the .rodata section instead of the .sdata section even if they meet the -G <size> threshold (MIPS)`|
|
||||
|`-menable-experimental-extensions`|Unsupported|`Enable use of experimental RISC-V extensions.`|
|
||||
|`-mexec-model=<value>`|Unsupported|`Execution model (WebAssembly only)`|
|
||||
|`-mexecute-only`|Unsupported|`Disallow generation of data access to code sections (ARM only)`|
|
||||
|`-mextern-sdata`|Unsupported|`Assume that externally defined data is in the small data if it meets the -G <size> threshold (MIPS)`|
|
||||
|`-mfentry`|Unsupported|`Insert calls to fentry at function entry (x86/SystemZ only)`|
|
||||
|`-mfix-cortex-a53-835769`|Unsupported|`Work around Cortex-A53 erratum 835769 (AArch64 only)`|
|
||||
|`-mfp32`|Unsupported|`Use 32-bit floating point registers (MIPS only)`|
|
||||
|`-mfp64`|Unsupported|`Use 64-bit floating point registers (MIPS only)`|
|
||||
|`-MF <file>`|Supported|`Write depfile output from -MMD, -MD, -MM, or -M to <file>`|
|
||||
|`-mgeneral-regs-only`|Unsupported|`Generate code which only uses the general purpose registers (AArch64 only)`|
|
||||
|`-mglobal-merge`|Supported|`Enable merging of globals`|
|
||||
|`-mgpopt`|Unsupported|`Use GP relative accesses for symbols known to be in a small data section (MIPS)`|
|
||||
|`-MG`|Supported|`Add missing headers to depfile`|
|
||||
|`-mharden-sls=<value>`|Unsupported|`Select straight-line speculation hardening scope`|
|
||||
|`-mhvx-length=<value>`|Unsupported|`Set Hexagon Vector Length`|
|
||||
|`-mhvx=<value>`|Unsupported|`Enable Hexagon Vector eXtensions`|
|
||||
|`-mhvx`|Unsupported|`Enable Hexagon Vector eXtensions`|
|
||||
|`-miamcu`|Unsupported|`Use Intel MCU ABI`|
|
||||
|`--migrate`|Unsupported|`Run the migrator`|
|
||||
|`-mincremental-linker-compatible`|Supported|`(integrated-as) Emit an object file which can be used with an incremental linker`|
|
||||
|`-mindirect-jump=<value>`|Unsupported|`Change indirect jump instructions to inhibit speculation`|
|
||||
|`-Minform=<value>`|Supported|`Set error level of messages to display`|
|
||||
|`-mios-version-min=<value>`|Unsupported|`Set iOS deployment target`|
|
||||
|`-MJ <value>`|Unsupported|`Write a compilation database entry per input`|
|
||||
|`-mllvm <value>`|Supported|`Additional arguments to forward to LLVM's option processing`|
|
||||
|`-mlocal-sdata`|Unsupported|`Extend the -G behaviour to object local data (MIPS)`|
|
||||
|`-mlong-calls`|Supported|`Generate branches with extended addressability, usually via indirect jumps.`|
|
||||
|`-mlong-double-128`|Supported on Host only|`Force long double to be 128 bits`|
|
||||
|`-mlong-double-64`|Supported|`Force long double to be 64 bits`|
|
||||
|`-mlong-double-80`|Supported on Host only|`Force long double to be 80 bits, padded to 128 bits for storage`|
|
||||
|`-mlvi-cfi`|Supported on Host only|`Enable only control-flow mitigations for Load Value Injection (LVI)`|
|
||||
|`-mlvi-hardening`|Supported on Host only|`Enable all mitigations for Load Value Injection (LVI)`|
|
||||
|`-mmacosx-version-min=<value>`|Unsupported|`Set Mac OS X deployment target`|
|
||||
|`-mmadd4`|Supported|`Enable the generation of 4-operand madd.s, madd.d and related instructions.`|
|
||||
|`-mmark-bti-property`|Unsupported|`Add .note.gnu.property with BTI to assembly files (AArch64 only)`|
|
||||
|`-MMD`|Supported|`Write a depfile containing user headers`|
|
||||
|`-mmemops`|Supported|`Enable generation of memop instructions`|
|
||||
|`-mms-bitfields`|Unsupported|`Set the default structure layout to be compatible with the Microsoft compiler standard`|
|
||||
|`-mmsa`|Unsupported|`Enable MSA ASE (MIPS only)`|
|
||||
|`-mmt`|Unsupported|`Enable MT ASE (MIPS only)`|
|
||||
|`-MM`|Supported|`Like -MMD, but also implies -E and writes to stdout by default`|
|
||||
|`-mno-abicalls`|Unsupported|`Disable SVR4-style position-independent code (Mips only)`|
|
||||
|`-mno-crc`|Unsupported|`Disallow use of CRC instructions (Mips only)`|
|
||||
|`-mno-embedded-data`|Unsupported|`Do not place constants in the .rodata section instead of the .sdata if they meet the -G <size> threshold (MIPS)`|
|
||||
|`-mno-execute-only`|Unsupported|`Allow generation of data access to code sections (ARM only)`|
|
||||
|`-mno-extern-sdata`|Unsupported|`Do not assume that externally defined data is in the small data if it meets the -G <size> threshold (MIPS)`|
|
||||
|`-mno-fix-cortex-a53-835769`|Unsupported|`Don't work around Cortex-A53 erratum 835769 (AArch64 only)`|
|
||||
|`-mno-global-merge`|Supported|`Disable merging of globals`|
|
||||
|`-mno-gpopt`|Unsupported|`Do not use GP relative accesses for symbols known to be in a small data section (MIPS)`|
|
||||
|`-mno-hvx`|Unsupported|`Disable Hexagon Vector eXtensions`|
|
||||
|`-mno-implicit-float`|Supported|`Don't generate implicit floating point instructions`|
|
||||
|`-mno-incremental-linker-compatible`|Supported|`(integrated-as) Emit an object file which cannot be used with an incremental linker`|
|
||||
|`-mno-local-sdata`|Unsupported|`Do not extend the -G behaviour to object local data (MIPS)`|
|
||||
|`-mno-long-calls`|Supported|`Restore the default behaviour of not generating long calls`|
|
||||
|`-mno-lvi-cfi`|Supported on Host only|`Disable control-flow mitigations for Load Value Injection (LVI)`|
|
||||
|`-mno-lvi-hardening`|Supported on Host only|`Disable mitigations for Load Value Injection (LVI)`|
|
||||
|`-mno-madd4`|Supported|`Disable the generation of 4-operand madd.s, madd.d and related instructions.`|
|
||||
|`-mno-memops`|Supported|`Disable generation of memop instructions`|
|
||||
|`-mno-movt`|Supported|`Disallow use of movt/movw pairs (ARM only)`|
|
||||
|`-mno-ms-bitfields`|Supported|`Do not set the default structure layout to be compatible with the Microsoft compiler standard`|
|
||||
|`-mno-msa`|Unsupported|`Disable MSA ASE (MIPS only)`|
|
||||
|`-mno-mt`|Unsupported|`Disable MT ASE (MIPS only)`|
|
||||
|`-mno-neg-immediates`|Supported|`Disallow converting instructions with negative immediates to their negation or inversion.`|
|
||||
|`-mno-nvj`|Supported|`Disable generation of new-value jumps`|
|
||||
|`-mno-nvs`|Supported|`Disable generation of new-value stores`|
|
||||
|`-mno-outline`|Unsupported|`Disable function outlining (AArch64 only)`|
|
||||
|`-mno-packets`|Supported|`Disable generation of instruction packets`|
|
||||
|`-mno-relax`|Supported|`Disable linker relaxation`|
|
||||
|`-mno-restrict-it`|Unsupported|`Allow generation of deprecated IT blocks for ARMv8. It is off by default for ARMv8 Thumb mode`|
|
||||
|`-mno-save-restore`|Unsupported|`Disable using library calls for save and restore`|
|
||||
|`-mno-seses`|Unsupported|`Disable speculative execution side effect suppression (SESES)`|
|
||||
|`-mno-stack-arg-probe`|Supported|`Disable stack probes which are enabled by default`|
|
||||
|`-mno-tls-direct-seg-refs`|Supported|`Disable direct TLS access through segment registers`|
|
||||
|`-mno-unaligned-access`|Unsupported|`Force all memory accesses to be aligned (AArch32/AArch64 only)`|
|
||||
|`-mno-wavefrontsize64`|Supported|`Specify wavefront size 32 mode (AMDGPU only)`|
|
||||
|`-mnocrc`|Unsupported|`Disallow use of CRC instructions (ARM only)`|
|
||||
|`-mnop-mcount`|Supported|`Generate mcount/__fentry__ calls as nops. To activate they need to be patched in.`|
|
||||
|`-mnvj`|Supported|`Enable generation of new-value jumps`|
|
||||
|`-mnvs`|Supported|`Enable generation of new-value stores`|
|
||||
|`-module-dependency-dir <value>`|Unsupported|`Directory to dump module dependencies to`|
|
||||
|`-module-file-info`|Unsupported|`Provide information about a particular module file`|
|
||||
|`-momit-leaf-frame-pointer`|Supported|`Omit frame pointer setup for leaf functions`|
|
||||
|`-moutline`|Unsupported|`Enable function outlining (AArch64 only)`|
|
||||
|`-mpacked-stack`|Unsupported|`Use packed stack layout (SystemZ only).`|
|
||||
|`-mpackets`|Supported|`Enable generation of instruction packets`|
|
||||
|`-mpad-max-prefix-size=<value>`|Supported|`Specify maximum number of prefixes to use for padding`|
|
||||
|`-mpie-copy-relocations`|Supported|`Use copy relocations support for PIE builds`|
|
||||
|`-mprefer-vector-width=<value>`|Unsupported|`Specifies preferred vector width for auto-vectorization. Defaults to 'none' which allows target specific decisions.`|
|
||||
|`-MP`|Supported|`Create phony target for each dependency (other than main file)`|
|
||||
|`-mqdsp6-compat`|Unsupported|`Enable hexagon-qdsp6 backward compatibility`|
|
||||
|`-MQ <value>`|Supported|`Specify name of main file output to quote in depfile`|
|
||||
|`-mrecord-mcount`|Supported|`Generate a __mcount_loc section entry for each __fentry__ call.`|
|
||||
|`-mrelax-all`|Supported|`(integrated-as) Relax all machine instructions`|
|
||||
|`-mrelax`|Supported|`Enable linker relaxation`|
|
||||
|`-mrestrict-it`|Unsupported|`Disallow generation of deprecated IT blocks for ARMv8. It is on by default for ARMv8 Thumb mode.`|
|
||||
|`-mrtd`|Unsupported|`Make StdCall calling convention the default`|
|
||||
|`-msave-restore`|Unsupported|`Enable using library calls for save and restore`|
|
||||
|`-mseses`|Unsupported|`Enable speculative execution side effect suppression (SESES). Includes LVI control flow integrity mitigations`|
|
||||
|`-msign-return-address=<value>`|Unsupported|`Select return address signing scope`|
|
||||
|`-msmall-data-limit=<value>`|Supported|`Put global and static data smaller than the limit into a special section`|
|
||||
|`-msoft-float`|Supported|`Use software floating point`|
|
||||
|`-msram-ecc`|Supported|`Legacy option to specify SRAM ECC mode (AMDGPU only). Should use --offload-arch with :sramecc+ instead`|
|
||||
|`-mstack-alignment=<value>`|Unsupported|`Set the stack alignment`|
|
||||
|`-mstack-arg-probe`|Unsupported|`Enable stack probes`|
|
||||
|`-mstack-probe-size=<value>`|Unsupported|`Set the stack probe size`|
|
||||
|`-mstackrealign`|Unsupported|`Force realign the stack at entry to every function`|
|
||||
|`-msve-vector-bits=<value>`|Unsupported|`Specify the size in bits of an SVE vector register. Defaults to the vector length agnostic value of "scalable". (AArch64 only)`|
|
||||
|`-msvr4-struct-return`|Unsupported|`Return small structs in registers (PPC32 only)`|
|
||||
|`-mthread-model <value>`|Supported|`The thread model to use, e.g. posix, single (posix by default)`|
|
||||
|`-mtls-direct-seg-refs`|Supported|`Enable direct TLS access through segment registers (default)`|
|
||||
|`-mtls-size=<value>`|Unsupported|`Specify bit size of immediate TLS offsets (AArch64 ELF only): 12 (for 4KB) \| 24 (for 16MB, default) \| 32 (for 4GB) \| 48 (for 256TB, needs -mcmodel=large)`|
|
||||
|`-mtp=<value>`|Unsupported|`Thread pointer access method (AArch32/AArch64 only)`|
|
||||
|`-mtune=<value>`|Supported on Host only|`Only supported on X86. Otherwise accepted for compatibility with GCC.`|
|
||||
|`-MT <value>`|Unsupported|`Specify name of main file output in depfile`|
|
||||
|`-munaligned-access`|Unsupported|`Allow memory accesses to be unaligned (AArch32/AArch64 only)`|
|
||||
|`-MV`|Supported|`Use NMake/Jom format for the depfile`|
|
||||
|`-mwavefrontsize64`|Supported|`Specify wavefront size 64 mode (AMDGPU only)`|
|
||||
|`-mxnack`|Supported|`Legacy option to specify XNACK mode (AMDGPU only). Should use --offload-arch with :xnack+ instead`|
|
||||
|`-M`|Supported|`Like -MD, but also implies -E and writes to stdout by default`|
|
||||
|`--no-cuda-include-ptx=<value>`|Supported|`Do not include PTX for the following GPU architecture (e.g. sm_35) or 'all'. May be specified more than once.`|
|
||||
|`--no-cuda-version-check`|Supported|`Don't error out if the detected version of the CUDA install is too low for the requested CUDA gpu architecture.`|
|
||||
|`-no-flang-libs`|Supported|`Do not link against Flang libraries`|
|
||||
|`--no-offload-arch=<value>`|Supported|`Remove CUDA/HIP offloading device architecture (e.g. sm_35, gfx906) from the list of devices to compile for. 'all' resets the list to its default value.`|
|
||||
|`--no-system-header-prefix=<prefix>`|Supported|`Treat all #include paths starting with <prefix> as not including a system header.`|
|
||||
|`-nobuiltininc`|Supported|`Disable builtin #include directories`|
|
||||
|`-nogpuinc`|Supported|`Do not add CUDA/HIP include paths and include default CUDA/HIP wrapper header files`|
|
||||
|`-nogpulib`|Supported|`Do not link device library for CUDA/HIP device compilation`|
|
||||
|`-nostdinc++`|Unsupported|`Disable standard #include directories for the C++ standard library`|
|
||||
|`-ObjC++`|Unsupported|`Treat source input files as Objective-C++ inputs`|
|
||||
|`-objcmt-atomic-property`|Unsupported|`Make migration to 'atomic' properties`|
|
||||
|`-objcmt-migrate-all`|Unsupported|`Enable migration to modern ObjC`|
|
||||
|`-objcmt-migrate-annotation`|Unsupported|`Enable migration to property and method annotations`|
|
||||
|`-objcmt-migrate-designated-init`|Unsupported|`Enable migration to infer NS_DESIGNATED_INITIALIZER for initializer methods`|
|
||||
|`-objcmt-migrate-instancetype`|Unsupported|`Enable migration to infer instancetype for method result type`|
|
||||
|`-objcmt-migrate-literals`|Unsupported|`Enable migration to modern ObjC literals`|
|
||||
|`-objcmt-migrate-ns-macros`|Unsupported|`Enable migration to NS_ENUM/NS_OPTIONS macros`|
|
||||
|`-objcmt-migrate-property-dot-syntax`|Unsupported|`Enable migration of setter/getter messages to property-dot syntax`|
|
||||
|`-objcmt-migrate-property`|Unsupported|`Enable migration to modern ObjC property`|
|
||||
|`-objcmt-migrate-protocol-conformance`|Unsupported|`Enable migration to add protocol conformance on classes`|
|
||||
|`-objcmt-migrate-readonly-property`|Unsupported|`Enable migration to modern ObjC readonly property`|
|
||||
|`-objcmt-migrate-readwrite-property`|Unsupported|`Enable migration to modern ObjC readwrite property`|
|
||||
|`-objcmt-migrate-subscripting`|Unsupported|`Enable migration to modern ObjC subscripting`|
|
||||
|`-objcmt-ns-nonatomic-iosonly`|Unsupported|`Enable migration to use NS_NONATOMIC_IOSONLY macro for setting property's 'atomic' attribute`|
|
||||
|`-objcmt-returns-innerpointer-property`|Unsupported|`Enable migration to annotate property with NS_RETURNS_INNER_POINTER`|
|
||||
|`-objcmt-whitelist-dir-path=<value>`|Unsupported|`Only modify files with a filename contained in the provided directory path`|
|
||||
|`-ObjC`|Unsupported|`Treat source input files as Objective-C inputs`|
|
||||
|`--offload-arch=<value>`|Supported|`CUDA offloading device architecture (e.g. sm_35), or HIP offloading target ID in the form of a device architecture followed by target ID features delimited by a colon. Each target ID feature is a pre-defined string followed by a plus or minus sign (e.g. gfx908:xnack+:sramecc-). May be specified more than once.`|
|
||||
|`-o <file>`|Supported|`Write output to <file>`|
|
||||
|`-parallel-jobs=<value>`|Supported|`Number of parallel jobs`|
|
||||
|`-pg`|Supported|`Enable mcount instrumentation`|
|
||||
|`-pipe`|Supported|`Use pipes between commands, when possible`|
|
||||
|`--precompile`|Supported|`Only precompile the input`|
|
||||
|`-print-effective-triple`|Supported|`Print the effective target triple`|
|
||||
|`-print-file-name=<file>`|Supported|`Print the full library path of <file>`|
|
||||
|`-print-ivar-layout`|Unsupported|`Enable Objective-C Ivar layout bitmap print trace`|
|
||||
|`-print-libgcc-file-name`|Supported|`Print the library path for the currently used compiler runtime library ("libgcc.a" or "libclang_rt.builtins.*.a")`|
|
||||
|`-print-prog-name=<name>`|Supported|`Print the full program path of <name>`|
|
||||
|`-print-resource-dir`|Supported|`Print the resource directory pathname`|
|
||||
|`-print-search-dirs`|Supported|`Print the paths used for finding libraries and programs`|
|
||||
|`-print-supported-cpus`|Supported|`Print supported cpu models for the given target (if target is not specified, it will print the supported cpus for the default target)`|
|
||||
|`-print-target-triple`|Supported|`Print the normalized target triple`|
|
||||
|`-print-targets`|Supported|`Print the registered targets`|
|
||||
|`-pthread`|Supported|`Support POSIX threads in generated code`|
|
||||
|`--ptxas-path=<value>`|Unsupported|`Path to ptxas (used for compiling CUDA code)`|
|
||||
|`-P`|Supported|`Disable linemarker output in -E mode`|
|
||||
|`-Qn`|Supported|`Do not emit metadata containing compiler name and version`|
|
||||
|`-Qunused-arguments`|Supported|`Don't emit warning for unused driver arguments`|
|
||||
|`-Qy`|Supported|`Emit metadata containing compiler name and version`|
|
||||
|`-relocatable-pch`|Supported|`Whether to build a relocatable precompiled header`|
|
||||
|`-rewrite-legacy-objc`|Unsupported|`Rewrite Legacy Objective-C source to C++`|
|
||||
|`-rewrite-objc`|Unsupported|`Rewrite Objective-C source to C++`|
|
||||
|`--rocm-device-lib-path=<value>`|Supported|`ROCm device library path. Alternative to rocm-path.`|
|
||||
|`--rocm-path=<value>`|Supported|`ROCm installation path, used for finding and automatically linking required bitcode libraries.`|
|
||||
|`-Rpass-analysis=<value>`|Supported|`Report transformation analysis from optimization passes whose name matches the given POSIX regular expression`|
|
||||
|`-Rpass-missed=<value>`|Supported|`Report missed transformations by optimization passes whose name matches the given POSIX regular expression`|
|
||||
|`-Rpass=<value>`|Supported|`Report transformations performed by optimization passes whose name matches the given POSIX regular expression`|
|
||||
|`-rtlib=<value>`|Unsupported|`Compiler runtime library to use`|
|
||||
|`-R<remark>`|Unsupported|`Enable the specified remark`|
|
||||
|`-save-stats=<value>`|Supported|`Save llvm statistics.`|
|
||||
|`-save-stats`|Supported|`Save llvm statistics.`|
|
||||
|`-save-temps=<value>`|Supported|`Save intermediate compilation results.`|
|
||||
|`-save-temps`|Supported|`Save intermediate compilation results`|
|
||||
|`-serialize-diagnostics <value>`|Supported|`Serialize compiler diagnostics to a file`|
|
||||
|`-shared-libsan`|Unsupported|`Dynamically link the sanitizer runtime`|
|
||||
|`-static-flang-libs`|Supported|`Link using static Flang libraries`|
|
||||
|`-static-libsan`|Unsupported|`Statically link the sanitizer runtime`|
|
||||
|`-static-openmp`|Supported|`Use the static host OpenMP runtime while linking.`|
|
||||
|`-std=<value>`|Supported|`Language standard to compile for`|
|
||||
|`-stdlib++-isystem <directory>`|Supported|`Use directory as the C++ standard library include path`|
|
||||
|`-stdlib=<value>`|Supported|`C++ standard library to use`|
|
||||
|`-sycl-std=<value>`|Unsupported|`SYCL language standard to compile for.`|
|
||||
|`--system-header-prefix=<prefix>`|Supported|`Treat all #include paths starting with <prefix> as including a system header.`|
|
||||
|`-S`|Supported|`Only run preprocess and compilation steps`|
|
||||
|`--target=<value>`|Supported|`Generate code for the given target`|
|
||||
|`-Tbss <addr>`|Supported|`Set starting address of BSS to <addr>`|
|
||||
|`-Tdata <addr>`|Supported|`Set starting address of DATA to <addr>`|
|
||||
|`-time`|Supported|`Time individual commands`|
|
||||
|`-traditional-cpp`|Unsupported|`Enable some traditional CPP emulation`|
|
||||
|`-trigraphs`|Supported|`Process trigraph sequences`|
|
||||
|`-Ttext <addr>`|Supported|`Set starting address of TEXT to <addr>`|
|
||||
|`-T <script>`|Unsupported|`Specify <script> as linker script`|
|
||||
|`-undef`|Supported|`Undefine all system defines`|
|
||||
|`-unwindlib=<value>`|Supported|`Unwind library to use`|
|
||||
|`-U <macro>`|Supported|`Undefine macro <macro>`|
|
||||
|`--verify-debug-info`|Supported|`Verify the binary representation of debug output`|
|
||||
|`-verify-pch`|Unsupported|`Load and verify that a pre-compiled header file is not stale`|
|
||||
|`--version`|Supported|`Print version information`|
|
||||
|`-v`|Supported|`Show commands to run and use verbose output`|
|
||||
|`-Wa,<arg>`|Supported|`Pass the comma separated arguments in <arg> to the assembler`|
|
||||
|`-Wdeprecated`|Supported|`Enable warnings for deprecated constructs and define __DEPRECATED`|
|
||||
|`-Wl,<arg>`|Supported|`Pass the comma separated arguments in <arg> to the linker`|
|
||||
|`-working-directory <value>`|Supported|`Resolve file paths relative to the specified directory`|
|
||||
|`-Wp,<arg>`|Supported|`Pass the comma separated arguments in <arg> to the preprocessor`|
|
||||
|`-W<warning>`|Supported|`Enable the specified warning`|
|
||||
|`-w`|Supported|`Suppress all warnings`|
|
||||
|`-Xanalyzer <arg>`|Supported|`Pass <arg> to the static analyzer`|
|
||||
|`-Xarch_device <arg>`|Supported|`Pass <arg> to the CUDA/HIP device compilation`|
|
||||
|`-Xarch_host <arg>`|Supported|`Pass <arg> to the CUDA/HIP host compilation`|
|
||||
|`-Xassembler <arg>`|Supported|`Pass <arg> to the assembler`|
|
||||
|`-Xclang <arg>`|Supported|`Pass <arg> to the clang compiler`|
|
||||
|`-Xcuda-fatbinary <arg>`|Supported|`Pass <arg> to fatbinary invocation`|
|
||||
|`-Xcuda-ptxas <arg>`|Supported|`Pass <arg> to the ptxas assembler`|
|
||||
|`-Xlinker <arg>`|Supported|`Pass <arg> to the linker`|
|
||||
|`-Xopenmp-target=<triple> <arg>`|Supported|`Pass <arg> to the target offloading toolchain identified by <triple>.`|
|
||||
|`-Xopenmp-target <arg>`|Supported|`Pass <arg> to the target offloading toolchain.`|
|
||||
|`-Xpreprocessor <arg>`|Supported|`Pass <arg> to the preprocessor`|
|
||||
|`-x <language>`|Supported|`Treat subsequent input files as having type <language>`|
|
||||
|`-z <arg>`|Supported|`Pass -z <arg> to the linker`|
|
||||
@@ -1,758 +0,0 @@
|
||||
-### s
|
||||
--analyzer-output s
|
||||
--analyze s
|
||||
-arcmt-migrate-emit-errors n
|
||||
-arcmt-migrate-report-output n
|
||||
-byteswapio s
|
||||
-B s
|
||||
-CC s
|
||||
-cl-denorms-are-zero s
|
||||
-cl-fast-relaxed-math s
|
||||
-cl-finite-math-only s
|
||||
-cl-fp32-correctly-rounded-divide-sqrt s
|
||||
-cl-kernel-arg-info s
|
||||
-cl-mad-enable s
|
||||
-cl-no-signed-zeros s
|
||||
-cl-opt-disable s
|
||||
-cl-single-precision-constant s
|
||||
-cl-std s
|
||||
-cl-strict-aliasing s
|
||||
-cl-uniform-work-group-size s
|
||||
-cl-unsafe-math-optimizations s
|
||||
--config s
|
||||
--cuda-compile-host-device s
|
||||
--cuda-device-only s
|
||||
--cuda-host-only s
|
||||
--cuda-include-ptx n
|
||||
--cuda-noopt-device-debug n
|
||||
--cuda-path-ignore-env n
|
||||
--cuda-path n
|
||||
-cxx-isystem s
|
||||
-C s
|
||||
-c s
|
||||
-dD s
|
||||
-dependency-dot s
|
||||
-dependency-file s
|
||||
-dI s
|
||||
-dM s
|
||||
-dsym-dir n
|
||||
-D s
|
||||
-emit-ast s
|
||||
-emit-interface-stubs s
|
||||
-emit-llvm s
|
||||
-emit-merged-ifs s
|
||||
--emit-static-lib s
|
||||
-enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang s
|
||||
-E s
|
||||
-fAAPCSBitfieldLoad n
|
||||
-faddrsig s
|
||||
-faligned-allocation s
|
||||
-fallow-editor-placeholders s
|
||||
-fallow-fortran-gnu-ext s
|
||||
-fansi-escape-codes s
|
||||
-fapple-kext n
|
||||
-fapple-link-rtlib n
|
||||
-fapple-pragma-pack n
|
||||
-fapplication-extension n
|
||||
-fbackslash s
|
||||
-fbasic-block-sections s
|
||||
-fblocks s
|
||||
-fborland-extensions n
|
||||
-fbuild-session-file s
|
||||
-fbuild-session-timestamp s
|
||||
-fbuiltin-module-map n
|
||||
-fcall-saved-x10 n
|
||||
-fcall-saved-x11 n
|
||||
-fcall-saved-x12 n
|
||||
-fcall-saved-x13 n
|
||||
-fcall-saved-x14 n
|
||||
-fcall-saved-x15 n
|
||||
-fcall-saved-x18 n
|
||||
-fcall-saved-x8 n
|
||||
-fcall-saved-x9 n
|
||||
-fcf-protection n
|
||||
-fcf-protection n
|
||||
-fchar8_t s
|
||||
-fclang-abi-compat s
|
||||
-fcolor-diagnostics s
|
||||
-fcomment-block-commands s
|
||||
-fcommon s
|
||||
-fcomplete-member-pointers s
|
||||
-fconvergent-functions s
|
||||
-fcoroutines-ts s
|
||||
-fcoverage-mapping n
|
||||
-fcs-profile-generate n
|
||||
-fcs-profile-generate n
|
||||
-fcuda-approx-transcendentals n
|
||||
-fcuda-flush-denormals-to-zero s
|
||||
-fcuda-short-ptr n
|
||||
-fcxx-exceptions s
|
||||
-fdata-sections s
|
||||
-fdebug-compilation-dir s
|
||||
-fdebug-default-version s
|
||||
-fdebug-info-for-profiling s
|
||||
-fdebug-macro s
|
||||
-fdebug-prefix-map s
|
||||
-fdebug-ranges-base-address s
|
||||
-fdebug-types-section s
|
||||
-fdeclspec s
|
||||
-fdelayed-template-parsing s
|
||||
-fdelete-null-pointer-checks s
|
||||
-fdiagnostics-absolute-paths s
|
||||
-fdiagnostics-hotness-threshold n
|
||||
-fdiagnostics-parseable-fixits s
|
||||
-fdiagnostics-print-source-range-info s
|
||||
-fdiagnostics-show-hotness n
|
||||
-fdiagnostics-show-note-include-stack s
|
||||
-fdiagnostics-show-option s
|
||||
-fdiagnostics-show-template-tree s
|
||||
-fdigraphs s
|
||||
-fdiscard-value-names s
|
||||
-fdollars-in-identifiers s
|
||||
-fdouble-square-bracket-attributes s
|
||||
-fdwarf-exceptions n
|
||||
-feliminate-unused-debug-types s
|
||||
-fembed-bitcode-marker s
|
||||
-fembed-bitcode s
|
||||
-fembed-bitcode s
|
||||
-femit-all-decls s
|
||||
-femulated-tls s
|
||||
-fenable-matrix s
|
||||
-fexceptions s
|
||||
-fexperimental-new-constant-interpreter s
|
||||
-fexperimental-new-pass-manager s
|
||||
-fexperimental-relative-c++-abi-vtables s
|
||||
-fexperimental-strict-floating-point s
|
||||
-ffast-math s
|
||||
-ffile-prefix-map s
|
||||
-ffine-grained-bitfield-accesses s
|
||||
-ffixed-form s
|
||||
-ffixed-point s
|
||||
-ffixed-r19 n
|
||||
-ffixed-r9 n
|
||||
-ffixed-x10 n
|
||||
-ffixed-x11 n
|
||||
-ffixed-x12 n
|
||||
-ffixed-x13 n
|
||||
-ffixed-x14 n
|
||||
-ffixed-x15 n
|
||||
-ffixed-x16 n
|
||||
-ffixed-x17 n
|
||||
-ffixed-x18 n
|
||||
-ffixed-x19 n
|
||||
-ffixed-x1 n
|
||||
-ffixed-x20 n
|
||||
-ffixed-x21 n
|
||||
-ffixed-x22 n
|
||||
-ffixed-x23 n
|
||||
-ffixed-x24 n
|
||||
-ffixed-x25 n
|
||||
-ffixed-x26 n
|
||||
-ffixed-x27 n
|
||||
-ffixed-x28 n
|
||||
-ffixed-x29 n
|
||||
-ffixed-x2 n
|
||||
-ffixed-x30 n
|
||||
-ffixed-x31 n
|
||||
-ffixed-x3 n
|
||||
-ffixed-x4 n
|
||||
-ffixed-x5 n
|
||||
-ffixed-x6 n
|
||||
-ffixed-x7 n
|
||||
-ffixed-x8 n
|
||||
-ffixed-x9 n
|
||||
-fforce-dwarf-frame s
|
||||
-fforce-emit-vtables s
|
||||
-fforce-enable-int128 s
|
||||
-ffp-contract s
|
||||
-ffp-exception-behavior s
|
||||
-ffp-model s
|
||||
-ffree-form s
|
||||
-ffreestanding s
|
||||
-ffunc-args-alias s
|
||||
-ffunction-sections s
|
||||
-fglobal-isel s
|
||||
-fgnu-keywords s
|
||||
-fgnu-runtime n
|
||||
-fgnu89-inline n
|
||||
-fgnuc-version s
|
||||
-fgpu-allow-device-init s
|
||||
-fgpu-rdc s
|
||||
-fhip-new-launch-api s
|
||||
-fignore-exceptions s
|
||||
-fimplicit-module-maps n
|
||||
-finline-functions s
|
||||
-finline-hint-functions s
|
||||
-finstrument-function-entry-bare n
|
||||
-finstrument-functions-after-inlining n
|
||||
-finstrument-functions n
|
||||
-fintegrated-as s
|
||||
-fintegrated-cc1 s
|
||||
-fjump-tables s
|
||||
-fkeep-static-consts s
|
||||
-flax-vector-conversions s
|
||||
-flto-jobs n
|
||||
-flto n
|
||||
-flto n
|
||||
-fmacro-prefix-map s
|
||||
-fmath-errno s
|
||||
-fmax-tokens s
|
||||
-fmax-type-align s
|
||||
-fmemory-profile s
|
||||
-fmerge-all-constants s
|
||||
-fmessage-length s
|
||||
-fmodule-file n
|
||||
-fmodule-map-file n
|
||||
-fmodule-name n
|
||||
-fmodules-cache-path n
|
||||
-fmodules-decluse n
|
||||
-fmodules-disable-diagnostic-validation n
|
||||
-fmodules-ignore-macro n
|
||||
-fmodules-prune-after n
|
||||
-fmodules-prune-interval n
|
||||
-fmodules-search-all n
|
||||
-fmodules-strict-decluse n
|
||||
-fmodules-ts n
|
||||
-fmodules-user-build-path n
|
||||
-fmodules-validate-input-files-content s
|
||||
-fmodules-validate-once-per-build-session n
|
||||
-fmodules-validate-system-headers s
|
||||
-fmodules n
|
||||
-fms-compatibility-version s
|
||||
-fms-compatibility s
|
||||
-fms-extensions s
|
||||
-fmsc-version s
|
||||
-fnew-alignment s
|
||||
-fno-addrsig s
|
||||
-fno-allow-fortran-gnu-ext s
|
||||
-fno-assume-sane-operator-new s
|
||||
-fno-autolink s
|
||||
-fno-backslash s
|
||||
-fno-builtin- s
|
||||
-fno-builtin s
|
||||
-fno-c++-static-destructors s
|
||||
-fno-char8_t s
|
||||
-fno-color-diagnostics s
|
||||
-fno-common s
|
||||
-fno-complete-member-pointers s
|
||||
-fno-constant-cfstrings s
|
||||
-fno-coverage-mapping s
|
||||
-fno-crash-diagnostics s
|
||||
-fno-cuda-approx-transcendentals n
|
||||
-fno-debug-macro s
|
||||
-fno-declspec n
|
||||
-fno-delayed-template-parsing s
|
||||
-fno-delete-null-pointer-checks s
|
||||
-fno-diagnostics-fixit-info s
|
||||
-fno-digraphs s
|
||||
-fno-discard-value-names s
|
||||
-fno-dollars-in-identifiers s
|
||||
-fno-double-square-bracket-attributes s
|
||||
-fno-elide-constructors s
|
||||
-fno-elide-type s
|
||||
-fno-eliminate-unused-debug-types s
|
||||
-fno-exceptions s
|
||||
-fno-experimental-new-pass-manager s
|
||||
-fno-experimental-relative-c++-abi-vtables s
|
||||
-fno-fine-grained-bitfield-accesses s
|
||||
-fno-fixed-form s
|
||||
-fno-fixed-point s
|
||||
-fno-force-enable-int128 s
|
||||
-fno-fortran-main s
|
||||
-fno-free-form s
|
||||
-fno-func-args-alias s
|
||||
-fno-global-isel s
|
||||
-fno-gnu-inline-asm s
|
||||
-fno-gpu-allow-device-init s
|
||||
-fno-hip-new-launch-api s
|
||||
-fno-integrated-as s
|
||||
-fno-integrated-cc1 s
|
||||
-fno-jump-tables s
|
||||
-fno-keep-static-consts s
|
||||
-fno-lto s
|
||||
-fno-memory-profile s
|
||||
-fno-merge-all-constants s
|
||||
-fno-no-access-control s
|
||||
-fno-objc-infer-related-result-type s
|
||||
-fno-operator-names s
|
||||
-fno-pch-codegen s
|
||||
-fno-pch-debuginfo s
|
||||
-fno-plt s
|
||||
-fno-preserve-as-comments s
|
||||
-fno-profile-generate s
|
||||
-fno-profile-instr-generate s
|
||||
-fno-profile-instr-use s
|
||||
-fno-register-global-dtors-with-atexit s
|
||||
-fno-rtlib-add-rpath s
|
||||
-fno-rtti-data s
|
||||
-fno-rtti s
|
||||
-fno-sanitize-address-poison-custom-array-cookie h
|
||||
-fno-sanitize-address-use-after-scope h
|
||||
-fno-sanitize-address-use-odr-indicator h
|
||||
-fno-sanitize-blacklist h
|
||||
-fno-sanitize-cfi-canonical-jump-tables h
|
||||
-fno-sanitize-cfi-cross-dso h
|
||||
-fno-sanitize-coverage h
|
||||
-fno-sanitize-memory-track-origins h
|
||||
-fno-sanitize-memory-use-after-dtor h
|
||||
-fno-sanitize-recover h
|
||||
-fno-sanitize-stats h
|
||||
-fno-sanitize-thread-atomics h
|
||||
-fno-sanitize-thread-func-entry-exit h
|
||||
-fno-sanitize-thread-memory-access h
|
||||
-fno-sanitize-trap h
|
||||
-fno-sanitize-trap h
|
||||
-fno-short-wchar s
|
||||
-fno-show-column s
|
||||
-fno-show-source-location s
|
||||
-fno-signed-char s
|
||||
-fno-signed-zeros s
|
||||
-fno-spell-checking s
|
||||
-fno-split-machine-functions s
|
||||
-fno-stack-clash-protection s
|
||||
-fno-stack-protector s
|
||||
-fno-standalone-debug s
|
||||
-fno-strict-float-cast-overflow s
|
||||
-fno-strict-return s
|
||||
-fno-sycl n
|
||||
-fno-temp-file s
|
||||
-fno-threadsafe-statics s
|
||||
-fno-trigraphs s
|
||||
-fno-unique-section-names s
|
||||
-fno-unroll-loops s
|
||||
-fno-use-cxa-atexit s
|
||||
-fno-use-flang-math-libs s
|
||||
-fno-use-init-array s
|
||||
-fno-visibility-inlines-hidden-static-local-var s
|
||||
-fno-xray-function-index n
|
||||
-fno-zero-initialized-in-bss s
|
||||
-fobjc-arc-exceptions n
|
||||
-fobjc-arc n
|
||||
-fobjc-exceptions n
|
||||
-fobjc-runtime n
|
||||
-fobjc-weak n
|
||||
-fopenmp-simd n
|
||||
-fopenmp-targets n
|
||||
-fopenmp n
|
||||
-foptimization-record-file s
|
||||
-foptimization-record-passes s
|
||||
-forder-file-instrumentation s
|
||||
-fpack-struct n
|
||||
-fpascal-strings s
|
||||
-fpass-plugin s
|
||||
-fpatchable-function-entry s
|
||||
-fpcc-struct-return n
|
||||
-fpch-codegen s
|
||||
-fpch-debuginfo s
|
||||
-fpch-instantiate-templates s
|
||||
-fpch-validate-input-files-content s
|
||||
-fplugin s
|
||||
-fprebuilt-module-path n
|
||||
-fprofile-exclude-files n
|
||||
-fprofile-filter-files n
|
||||
-fprofile-generate n
|
||||
-fprofile-generate n
|
||||
-fprofile-instr-generate n
|
||||
-fprofile-instr-generate n
|
||||
-fprofile-instr-use n
|
||||
-fprofile-remapping-file n
|
||||
-fprofile-sample-accurate n
|
||||
-fprofile-sample-use n
|
||||
-fprofile-use n
|
||||
-freciprocal-math s
|
||||
-freg-struct-return n
|
||||
-fregister-global-dtors-with-atexit s
|
||||
-frelaxed-template-template-args s
|
||||
-freroll-loops s
|
||||
-fropi n
|
||||
-frtlib-add-rpath s
|
||||
-frwpi n
|
||||
-fsanitize-address-field-padding h
|
||||
-fsanitize-address-globals-dead-stripping h
|
||||
-fsanitize-address-poison-custom-array-cookie h
|
||||
-fsanitize-address-use-after-scope h
|
||||
-fsanitize-address-use-odr-indicator h
|
||||
-fsanitize-blacklist h
|
||||
-fsanitize-cfi-canonical-jump-tables h
|
||||
-fsanitize-cfi-cross-dso h
|
||||
-fsanitize-cfi-icall-generalize-pointers h
|
||||
-fsanitize-coverage-allowlist h
|
||||
-fsanitize-coverage-blacklist h
|
||||
-fsanitize-coverage-blocklist h
|
||||
-fsanitize-coverage-whitelist h
|
||||
-fsanitize-coverage h
|
||||
-fsanitize-hwaddress-abi h
|
||||
-fsanitize-memory-track-origins h
|
||||
-fsanitize-memory-track-origins h
|
||||
-fsanitize-memory-use-after-dtor h
|
||||
-fsanitize-recover h
|
||||
-fsanitize-stats h
|
||||
-fsanitize-system-blacklist h
|
||||
-fsanitize-thread-atomics h
|
||||
-fsanitize-thread-func-entry-exit h
|
||||
-fsanitize-thread-memory-access h
|
||||
-fsanitize-trap h
|
||||
-fsanitize-trap h
|
||||
-fsanitize-undefined-strip-path-components h
|
||||
-fsanitize h
|
||||
-fsave-optimization-record s
|
||||
-fsave-optimization-record s
|
||||
-fseh-exceptions s
|
||||
-fshort-enums s
|
||||
-fshort-wchar n
|
||||
-fshow-overloads s
|
||||
-fsigned-char s
|
||||
-fsized-deallocation s
|
||||
-fsjlj-exceptions s
|
||||
-fslp-vectorize s
|
||||
-fsplit-dwarf-inlining n
|
||||
-fsplit-lto-unit n
|
||||
-fsplit-machine-functions s
|
||||
-fstack-clash-protection s
|
||||
-fstack-protector-all n
|
||||
-fstack-protector-strong n
|
||||
-fstack-protector n
|
||||
-fstack-size-section s
|
||||
-fstandalone-debug s
|
||||
-fstrict-enums s
|
||||
-fstrict-float-cast-overflow s
|
||||
-fstrict-vtable-pointers s
|
||||
-fsycl n
|
||||
-fsystem-module u
|
||||
-fthin-link-bitcode s
|
||||
-fthinlto-index n
|
||||
-ftime-trace-granularity s
|
||||
-ftime-trace s
|
||||
-ftrap-function n
|
||||
-ftrapv-handler n
|
||||
-ftrapv n
|
||||
-ftrigraphs s
|
||||
-ftrivial-auto-var-init-stop-after s
|
||||
-ftrivial-auto-var-init s
|
||||
-funique-basic-block-section-names s
|
||||
-funique-internal-linkage-names s
|
||||
-funroll-loops s
|
||||
-fuse-flang-math-libs s
|
||||
-fuse-line-directives s
|
||||
-fvalidate-ast-input-files-content s
|
||||
-fveclib n
|
||||
-fvectorize n
|
||||
-fverbose-asm s
|
||||
-fvirtual-function-elimination s
|
||||
-fvisibility-global-new-delete-hidden s
|
||||
-fvisibility-inlines-hidden-static-local-var s
|
||||
-fvisibility-inlines-hidden s
|
||||
-fvisibility-ms-compat s
|
||||
-fvisibility s
|
||||
-fwasm-exceptions n
|
||||
-fwhole-program-vtables n
|
||||
-fwrapv s
|
||||
-fwritable-strings s
|
||||
-fxray-always-emit-customevents n
|
||||
-fxray-always-emit-typedevents n
|
||||
-fxray-always-instrument n
|
||||
-fxray-attr-list n
|
||||
-fxray-ignore-loops n
|
||||
-fxray-instruction-threshold n
|
||||
-fxray-instrumentation-bundle n
|
||||
-fxray-instrument n
|
||||
-fxray-link-deps n
|
||||
-fxray-modes n
|
||||
-fxray-never-instrument n
|
||||
-fzvector s
|
||||
-F n
|
||||
--gcc-toolchain s
|
||||
-gcodeview-ghash s
|
||||
-gcodeview s
|
||||
-gdwarf-2 s
|
||||
-gdwarf-3 s
|
||||
-gdwarf-4 s
|
||||
-gdwarf-5 s
|
||||
-gdwarf s
|
||||
-gembed-source s
|
||||
-gline-directives-only s
|
||||
-gline-tables-only s
|
||||
-gmodules s
|
||||
-gno-embed-source s
|
||||
-gno-inline-line-tables s
|
||||
--gpu-max-threads-per-block s
|
||||
-gsplit-dwarf s
|
||||
-gz s
|
||||
-gz s
|
||||
-G n
|
||||
-g s
|
||||
--help-hidden s
|
||||
-help s
|
||||
--hip-device-lib s
|
||||
--hip-link s
|
||||
--hip-version s
|
||||
-H s
|
||||
-I- s
|
||||
-ibuiltininc s
|
||||
-idirafter s
|
||||
-iframeworkwithsysroot n
|
||||
-iframework n
|
||||
-imacros s
|
||||
-include-pch s
|
||||
-include s
|
||||
-index-header-map s
|
||||
-iprefix s
|
||||
-iquote s
|
||||
-isysroot s
|
||||
-isystem-after s
|
||||
-isystem s
|
||||
-ivfsoverlay s
|
||||
-iwithprefixbefore s
|
||||
-iwithprefix s
|
||||
-iwithsysroot s
|
||||
-I s
|
||||
--libomptarget-nvptx-path n
|
||||
-L s
|
||||
-mabicalls n
|
||||
-maix-struct-return n
|
||||
-malign-branch-boundary s
|
||||
-malign-branch s
|
||||
-malign-double s
|
||||
-Mallocatable n
|
||||
-mbackchain n
|
||||
-mbranch-protection n
|
||||
-mbranches-within-32B-boundaries s
|
||||
-mcmodel n
|
||||
-mcmodel n
|
||||
-mcmse n
|
||||
-mcode-object-v3 s
|
||||
-mcode-object-version s
|
||||
-mcrc n
|
||||
-mcumode s
|
||||
-mdouble s
|
||||
-MD s
|
||||
-meabi s
|
||||
-membedded-data n
|
||||
-menable-experimental-extensions n
|
||||
-mexec-model n
|
||||
-mexecute-only n
|
||||
-mextern-sdata n
|
||||
-mfentry n
|
||||
-mfix-cortex-a53-835769 n
|
||||
-mfp32 n
|
||||
-mfp64 n
|
||||
-MF s
|
||||
-mgeneral-regs-only n
|
||||
-mglobal-merge s
|
||||
-mgpopt n
|
||||
-MG s
|
||||
-mharden-sls n
|
||||
-mhvx-length n
|
||||
-mhvx n
|
||||
-mhvx n
|
||||
-miamcu n
|
||||
--migrate n
|
||||
-mincremental-linker-compatible s
|
||||
-mindirect-jump n
|
||||
-Minform s
|
||||
-mios-version-min n
|
||||
-MJ n
|
||||
-mllvm s
|
||||
-mlocal-sdata n
|
||||
-mlong-calls s
|
||||
-mlong-double-128 h
|
||||
-mlong-double-64 s
|
||||
-mlong-double-80 h
|
||||
-mlvi-cfi h
|
||||
-mlvi-hardening h
|
||||
-mmacosx-version-min n
|
||||
-mmadd4 s
|
||||
-mmark-bti-property n
|
||||
-MMD s
|
||||
-mmemops s
|
||||
-mms-bitfields n
|
||||
-mmsa n
|
||||
-mmt n
|
||||
-MM s
|
||||
-mno-abicalls n
|
||||
-mno-crc n
|
||||
-mno-embedded-data n
|
||||
-mno-execute-only n
|
||||
-mno-extern-sdata n
|
||||
-mno-fix-cortex-a53-835769 n
|
||||
-mno-global-merge s
|
||||
-mno-gpopt n
|
||||
-mno-hvx n
|
||||
-mno-implicit-float s
|
||||
-mno-incremental-linker-compatible s
|
||||
-mno-local-sdata n
|
||||
-mno-long-calls s
|
||||
-mno-lvi-cfi h
|
||||
-mno-lvi-hardening h
|
||||
-mno-madd4 s
|
||||
-mno-memops s
|
||||
-mno-movt s
|
||||
-mno-ms-bitfields s
|
||||
-mno-msa n
|
||||
-mno-mt n
|
||||
-mno-neg-immediates s
|
||||
-mno-nvj s
|
||||
-mno-nvs s
|
||||
-mno-outline n
|
||||
-mno-packets s
|
||||
-mno-relax s
|
||||
-mno-restrict-it n
|
||||
-mno-save-restore n
|
||||
-mno-seses n
|
||||
-mno-stack-arg-probe s
|
||||
-mno-tls-direct-seg-refs s
|
||||
-mno-unaligned-access n
|
||||
-mno-wavefrontsize64 s
|
||||
-mnocrc n
|
||||
-mnop-mcount s
|
||||
-mnvj s
|
||||
-mnvs s
|
||||
-module-dependency-dir n
|
||||
-module-file-info n
|
||||
-momit-leaf-frame-pointer s
|
||||
-moutline n
|
||||
-mpacked-stack n
|
||||
-mpackets s
|
||||
-mpad-max-prefix-size s
|
||||
-mpie-copy-relocations s
|
||||
-mprefer-vector-width n
|
||||
-MP s
|
||||
-mqdsp6-compat n
|
||||
-MQ s
|
||||
-mrecord-mcount s
|
||||
-mrelax-all s
|
||||
-mrelax s
|
||||
-mrestrict-it n
|
||||
-mrtd n
|
||||
-msave-restore n
|
||||
-mseses n
|
||||
-msign-return-address n
|
||||
-msmall-data-limit s
|
||||
-msoft-float s
|
||||
-msram-ecc s
|
||||
-mstack-alignment n
|
||||
-mstack-arg-probe n
|
||||
-mstack-probe-size n
|
||||
-mstackrealign n
|
||||
-msve-vector-bits n
|
||||
-msvr4-struct-return n
|
||||
-mthread-model s
|
||||
-mtls-direct-seg-refs s
|
||||
-mtls-size n
|
||||
-mtp n
|
||||
-mtune h
|
||||
-MT n
|
||||
-munaligned-access n
|
||||
-MV s
|
||||
-mwavefrontsize64 s
|
||||
-mxnack s
|
||||
-M s
|
||||
--no-cuda-include-ptx s
|
||||
--no-cuda-version-check s
|
||||
-no-flang-libs s
|
||||
--no-offload-arch s
|
||||
--no-system-header-prefix s
|
||||
-nobuiltininc s
|
||||
-nogpuinc s
|
||||
-nogpulib s
|
||||
-nostdinc++ n
|
||||
-ObjC++ n
|
||||
-objcmt-atomic-property n
|
||||
-objcmt-migrate-all n
|
||||
-objcmt-migrate-annotation n
|
||||
-objcmt-migrate-designated-init n
|
||||
-objcmt-migrate-instancetype n
|
||||
-objcmt-migrate-literals n
|
||||
-objcmt-migrate-ns-macros n
|
||||
-objcmt-migrate-property-dot-syntax n
|
||||
-objcmt-migrate-property n
|
||||
-objcmt-migrate-protocol-conformance n
|
||||
-objcmt-migrate-readonly-property n
|
||||
-objcmt-migrate-readwrite-property n
|
||||
-objcmt-migrate-subscripting n
|
||||
-objcmt-ns-nonatomic-iosonly n
|
||||
-objcmt-returns-innerpointer-property n
|
||||
-objcmt-whitelist-dir-path n
|
||||
-ObjC n
|
||||
--offload-arch s
|
||||
-o s
|
||||
-parallel-jobs s
|
||||
-pg s
|
||||
-pipe s
|
||||
--precompile s
|
||||
-print-effective-triple s
|
||||
-print-file-name s
|
||||
-print-ivar-layout n
|
||||
-print-libgcc-file-name s
|
||||
-print-prog-name s
|
||||
-print-resource-dir s
|
||||
-print-search-dirs s
|
||||
-print-supported-cpus s
|
||||
-print-target-triple s
|
||||
-print-targets s
|
||||
-pthread s
|
||||
--ptxas-path n
|
||||
-P s
|
||||
-Qn s
|
||||
-Qunused-arguments s
|
||||
-Qy s
|
||||
-relocatable-pch s
|
||||
-rewrite-legacy-objc n
|
||||
-rewrite-objc n
|
||||
--rocm-device-lib-path s
|
||||
--rocm-path s
|
||||
-Rpass-analysis s
|
||||
-Rpass-missed s
|
||||
-Rpass s
|
||||
-rtlib n
|
||||
-R n
|
||||
-save-stats s
|
||||
-save-stats s
|
||||
-save-temps s
|
||||
-save-temps s
|
||||
-serialize-diagnostics s
|
||||
-shared-libsan n
|
||||
-static-flang-libs s
|
||||
-static-libsan n
|
||||
-static-openmp s
|
||||
-std s
|
||||
-stdlib++-isystem s
|
||||
-stdlib s
|
||||
-sycl-std n
|
||||
--system-header-prefix s
|
||||
-S s
|
||||
--target s
|
||||
-Tbss s
|
||||
-Tdata s
|
||||
-time s
|
||||
-traditional-cpp n
|
||||
-trigraphs s
|
||||
-Ttext s
|
||||
-T n
|
||||
-undef s
|
||||
-unwindlib s
|
||||
-U s
|
||||
--verify-debug-info s
|
||||
-verify-pch n
|
||||
--version s
|
||||
-v s
|
||||
-Wa, s
|
||||
-Wdeprecated s
|
||||
-Wl, s
|
||||
-working-directory s
|
||||
-Wp, s
|
||||
-W s
|
||||
-w s
|
||||
-Xanalyzer s
|
||||
-Xarch_device s
|
||||
-Xarch_host s
|
||||
-Xassembler s
|
||||
-Xclang s
|
||||
-Xcuda-fatbinary s
|
||||
-Xcuda-ptxas s
|
||||
-Xlinker s
|
||||
-Xopenmp-target s
|
||||
-Xopenmp-target s
|
||||
-Xpreprocessor s
|
||||
-x s
|
||||
-z s
|
||||
@@ -1,37 +0,0 @@
|
||||
# cuComplex API supported by HIP
|
||||
|
||||
## **1. cuComplex Data types**
|
||||
|
||||
| **type** | **CUDA** | **HIP** |**HIP value** (if differs) |
|
||||
|-------------:|---------------------------------------------------------------|------------------------------------------------------------|---------------------------|
|
||||
| float2 |***`cuFloatComplex`*** |***`hipFloatComplex`*** | struct |
|
||||
| double2 |***`cuDoubleComplex`*** |***`hipDoubleComplex`*** | struct |
|
||||
| float2 |***`cuComplex`*** |***`hipComplex`*** | struct |
|
||||
|
||||
## **2. cuComplex API functions**
|
||||
|
||||
| **CUDA** | **HIP** |
|
||||
|-----------------------------------------------------------|-------------------------------------------------|
|
||||
|`cuCrealf` |`hipCrealf` |
|
||||
|`cuCimagf` |`hipCimagf` |
|
||||
|`make_cuFloatComplex` |`make_hipFloatComplex` |
|
||||
|`cuConjf` |`hipConjf` |
|
||||
|`cuCaddf` |`hipCaddf` |
|
||||
|`cuCsubf` |`hipCsubf` |
|
||||
|`cuCmulf` |`hipCmulf` |
|
||||
|`cuCdivf` |`hipCdivf` |
|
||||
|`cuCabsf` |`hipCabsf` |
|
||||
|`cuCreal` |`hipCreal` |
|
||||
|`cuCimag` |`hipCimag` |
|
||||
|`make_cuDoubleComplex` |`make_hipDoubleComplex` |
|
||||
|`cuConj` |`hipConj` |
|
||||
|`cuCadd` |`hipCadd` |
|
||||
|`cuCsub` |`hipCsub` |
|
||||
|`cuCmul` |`hipCmul` |
|
||||
|`cuCdiv` |`hipCdiv` |
|
||||
|`cuCabs` |`hipCabs` |
|
||||
|`make_cuComplex` |`make_hipComplex` |
|
||||
|`cuComplexFloatToDouble` |`hipComplexFloatToDouble` |
|
||||
|`cuComplexDoubleToFloat` |`hipComplexDoubleToFloat` |
|
||||
|`cuCfmaf` |`hipCfmaf` |
|
||||
|`cuCfma` |`hipCfma` |
|
||||
@@ -1,509 +0,0 @@
|
||||
"""
|
||||
Copyright (c) 2015-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
"""
|
||||
|
||||
"""
|
||||
1. This file uses Python3 to run
|
||||
|
||||
List of device functions:
|
||||
acosf
|
||||
acoshf
|
||||
asinf
|
||||
asinhf
|
||||
atan2f
|
||||
atanf
|
||||
atanhf
|
||||
cbrtf
|
||||
ceilf
|
||||
copysignf
|
||||
cosf
|
||||
coshf
|
||||
cospif
|
||||
cyl_bessel_i0f
|
||||
cyl_bessel_i1f
|
||||
erfcf
|
||||
erfcinvf
|
||||
erfcxf
|
||||
erff
|
||||
erfinvf
|
||||
exp10f
|
||||
exp2f
|
||||
expf
|
||||
expm1f
|
||||
fabsf
|
||||
fdimf
|
||||
fdividef
|
||||
floorf
|
||||
fmaf
|
||||
fmaxf
|
||||
fminf
|
||||
fmodf
|
||||
frexpf
|
||||
hypotf
|
||||
ilogbf
|
||||
isfinite
|
||||
isinf
|
||||
isnan
|
||||
j0f
|
||||
j1f
|
||||
jnf
|
||||
ldexpf
|
||||
lgammaf
|
||||
llrintf
|
||||
llroundf
|
||||
log10f
|
||||
log1pf
|
||||
logbf
|
||||
lrintf
|
||||
lroundf
|
||||
modff
|
||||
nanf
|
||||
nearbyintf
|
||||
nextafterf
|
||||
norm3df
|
||||
norm4df
|
||||
normcdff
|
||||
normcdfinvf
|
||||
normf
|
||||
powf
|
||||
rcbrtf
|
||||
remainderf
|
||||
remquof
|
||||
rhypotf
|
||||
rintf
|
||||
rnorm3df
|
||||
rnorm4df
|
||||
rnormf
|
||||
roundf
|
||||
rsqrtf
|
||||
scalblnf
|
||||
scalbnf
|
||||
signbit
|
||||
sincosf
|
||||
sincospif
|
||||
sinf
|
||||
sinhf
|
||||
sinpif
|
||||
sqrtf
|
||||
tanf
|
||||
tanhf
|
||||
tgammaf
|
||||
truncf
|
||||
y0f
|
||||
y1f
|
||||
ynf
|
||||
acos
|
||||
acosh
|
||||
asin
|
||||
asinh
|
||||
atan
|
||||
atan2
|
||||
atanh
|
||||
cbrt
|
||||
ceil
|
||||
copysign
|
||||
cos
|
||||
cosh
|
||||
cospi
|
||||
cyl_bessel_i0
|
||||
cyl_bessel_i1
|
||||
erf
|
||||
erfc
|
||||
erfcinv
|
||||
erfcx
|
||||
erfinv
|
||||
exp
|
||||
exp10
|
||||
exp2
|
||||
expm1
|
||||
fabs
|
||||
fdim
|
||||
floor
|
||||
fma
|
||||
fmax
|
||||
fmin
|
||||
fmod
|
||||
frexp
|
||||
hypot
|
||||
ilogb
|
||||
isfinite
|
||||
isinf
|
||||
isnan
|
||||
j0
|
||||
j1
|
||||
jn
|
||||
ldexp
|
||||
lgamma
|
||||
llrint
|
||||
llround
|
||||
log
|
||||
log10
|
||||
log1p
|
||||
log2
|
||||
logb
|
||||
lrint
|
||||
lround
|
||||
modf
|
||||
nan
|
||||
nearbyint
|
||||
nextafter
|
||||
norm
|
||||
norm3d
|
||||
norm4d
|
||||
normcdf
|
||||
normcdfinv
|
||||
pow
|
||||
rcbrt
|
||||
remainder
|
||||
remquo
|
||||
rhypot
|
||||
rint
|
||||
rnorm
|
||||
rnorm3d
|
||||
rnorm4d
|
||||
round
|
||||
rsqrt
|
||||
scalbln
|
||||
scalbn
|
||||
signbit
|
||||
sin
|
||||
sincos
|
||||
sincospi
|
||||
sinh
|
||||
sinpi
|
||||
sqrt
|
||||
tan
|
||||
tanh
|
||||
tgamma
|
||||
trunc
|
||||
y0
|
||||
y1
|
||||
yn
|
||||
__cosf
|
||||
__exp10f
|
||||
__expf
|
||||
__fadd_rd
|
||||
__fadd_rn
|
||||
__fadd_ru
|
||||
__fadd_rz
|
||||
__fdiv_rd
|
||||
__fdiv_rn
|
||||
__fdiv_ru
|
||||
__fdiv_rz
|
||||
__fdividef
|
||||
__fmaf_rd
|
||||
__fmaf_rn
|
||||
__fmaf_ru
|
||||
__fmaf_rz
|
||||
__fmul_rd
|
||||
__fmul_rn
|
||||
__fmul_ru
|
||||
__fmul_rz
|
||||
__frcp_rd
|
||||
__frcp_rn
|
||||
__frcp_ru
|
||||
__frcp_rz
|
||||
__frsqrt_rn
|
||||
__fsqrt_rd
|
||||
__fsqrt_rn
|
||||
__fsqrt_ru
|
||||
__fsqrt_rz
|
||||
__fsub_rd
|
||||
__fsub_rn
|
||||
__fsub_ru
|
||||
__log10f
|
||||
__log2f
|
||||
__logf
|
||||
__powf
|
||||
__saturatef
|
||||
__sincosf
|
||||
__sinf
|
||||
__tanf
|
||||
__dadd_rd
|
||||
__dadd_rn
|
||||
__dadd_ru
|
||||
__dadd_rz
|
||||
__ddiv_rd
|
||||
__ddiv_rn
|
||||
__ddiv_ru
|
||||
__ddiv_rz
|
||||
__dmul_rd
|
||||
__dmul_rn
|
||||
__dmul_ru
|
||||
__dmul_rz
|
||||
__drcp_rd
|
||||
__drcp_rn
|
||||
__drcp_ru
|
||||
__drcp_rz
|
||||
__dsqrt_rd
|
||||
__dsqrt_rn
|
||||
__dsqrt_ru
|
||||
__dsqrt_rz
|
||||
__dsub_rd
|
||||
__dsub_rn
|
||||
__dsub_ru
|
||||
__dsub_rz
|
||||
__fma_rd
|
||||
__fma_rn
|
||||
__fma_ru
|
||||
__fma_rz
|
||||
__brev
|
||||
__brevll
|
||||
__byte_perm
|
||||
__clz
|
||||
__clzll
|
||||
__ffs
|
||||
__ffsll
|
||||
__hadd
|
||||
__mul24
|
||||
__mul64hi
|
||||
__mulhi
|
||||
__popc
|
||||
__popcll
|
||||
__rhadd
|
||||
__sad
|
||||
__uhadd
|
||||
__umul24
|
||||
__umul64hi
|
||||
__umulhi
|
||||
__urhadd
|
||||
__usad
|
||||
__double2float_rd
|
||||
__double2float_rn
|
||||
__double2float_ru
|
||||
__double2float_rz
|
||||
__double2hiint
|
||||
__double2int_rd
|
||||
__double2int_rn
|
||||
__double2int_ru
|
||||
__double2int_rz
|
||||
__double2ll_rd
|
||||
__double2ll_rn
|
||||
__double2ll_ru
|
||||
__double2ll_rz
|
||||
__double2loint
|
||||
__double2uint_rd
|
||||
__double2uint_rn
|
||||
__double2uint_ru
|
||||
__double2uint_rz
|
||||
__double2ull_rd
|
||||
__double2ull_rn
|
||||
__double2ull_ru
|
||||
__double2ull_rz
|
||||
__double_as_longlong
|
||||
__float2half_rn
|
||||
__half2float
|
||||
__float2half_rn
|
||||
__half2float
|
||||
__float2int_rd
|
||||
__float2int_rn
|
||||
__float2int_ru
|
||||
__float2int_rz
|
||||
__float2ll_rd
|
||||
__float2ll_rn
|
||||
__float2ll_ru
|
||||
__float2ll_rz
|
||||
__float2uint_rd
|
||||
__float2uint_rn
|
||||
__float2uint_ru
|
||||
__float2uint_rz
|
||||
__float2ull_rd
|
||||
__float2ull_rn
|
||||
__float2ull_ru
|
||||
__float2ull_rz
|
||||
__float_as_int
|
||||
__float_as_uint
|
||||
__hiloint2double
|
||||
__int2double_rn
|
||||
__int2float_rd
|
||||
__int2float_rn
|
||||
__int2float_ru
|
||||
__int2float_rz
|
||||
__int_as_float
|
||||
__ll2double_rd
|
||||
__ll2double_rn
|
||||
__ll2double_ru
|
||||
__ll2double_rz
|
||||
__ll2float_rd
|
||||
__ll2float_rn
|
||||
__ll2float_ru
|
||||
__ll2float_rz
|
||||
__longlong_as_double
|
||||
__uint2double_rn
|
||||
__uint2float_rd
|
||||
__uint2float_rn
|
||||
__uint2float_ru
|
||||
__uint2float_rz
|
||||
__uint_as_float
|
||||
__ull2double_rd
|
||||
__ull2double_rn
|
||||
__ull2double_ru
|
||||
__ull2double_rz
|
||||
__ull2float_rd
|
||||
__ull2float_rn
|
||||
__ull2float_ru
|
||||
__ull2float_rz
|
||||
__heq
|
||||
__hge
|
||||
__hgt
|
||||
__hisinf
|
||||
__hisnan
|
||||
__hle
|
||||
__hlt
|
||||
__hne
|
||||
__hbeq2
|
||||
__hbge2
|
||||
__hbgt2
|
||||
__hble2
|
||||
__hblt2
|
||||
__hbne2
|
||||
__heq2
|
||||
__hge2
|
||||
__hgt2
|
||||
__hisnan2
|
||||
__hle2
|
||||
__hlt2
|
||||
__hne2
|
||||
__float22half2_rn
|
||||
__float2half
|
||||
__float2half2_rn
|
||||
__float2half_rd
|
||||
__float2half_rn
|
||||
__float2half_ru
|
||||
__float2half_rz
|
||||
__floats2half2_rn
|
||||
__half22float2
|
||||
__half2float
|
||||
half2half2
|
||||
__half2int_rd
|
||||
__half2int_rn
|
||||
__half2int_ru
|
||||
__half2int_rz
|
||||
__half2ll_rd
|
||||
__half2ll_rn
|
||||
__half2ll_ru
|
||||
__half2ll_rz
|
||||
__half2short_rd
|
||||
__half2short_rn
|
||||
__half2short_ru
|
||||
__half2short_rz
|
||||
__half2uint_rd
|
||||
__half2uint_rn
|
||||
__half2uint_ru
|
||||
__half2uint_rz
|
||||
__half2ull_rd
|
||||
__half2ull_rn
|
||||
__half2ull_ru
|
||||
__half2ull_rz
|
||||
__half2ushort_rd
|
||||
__half2ushort_rn
|
||||
__half2ushort_ru
|
||||
__half2ushort_rz
|
||||
__half_as_short
|
||||
__half_as_ushort
|
||||
__halves2half2
|
||||
__high2float
|
||||
__high2half
|
||||
__high2half2
|
||||
__highs2half2
|
||||
__int2half_rd
|
||||
__int2half_rn
|
||||
__int2half_ru
|
||||
__int2half_rz
|
||||
__ll2half_rd
|
||||
__ll2half_rn
|
||||
__ll2half_ru
|
||||
__ll2half_rz
|
||||
__low2float
|
||||
__low2half
|
||||
__low2half2
|
||||
__low2half2
|
||||
__lowhigh2highlow
|
||||
__lows2half2
|
||||
__short2half_rd
|
||||
__short2half_rn
|
||||
__short2half_ru
|
||||
__short2half_rz
|
||||
__uint2half_rd
|
||||
__uint2half_rn
|
||||
__uint2half_ru
|
||||
__uint2half_rz
|
||||
__ull2half_rd
|
||||
__ull2half_rn
|
||||
__ull2half_ru
|
||||
__ull2half_rz
|
||||
__ushort2half_rd
|
||||
__ushort2half_rn
|
||||
__ushort2half_ru
|
||||
__ushort2half_rz
|
||||
__ushort_as_half
|
||||
"""
|
||||
# Descriptions shown in the generated markdown, keyed by device function name.
# Extend this mapping to cover all the device functions.
deviceFuncDesc = {
    'acosf': "This function returns floating point of arc cosine from a floating point input",
}

# Header files that are scanned for device function declarations.
fnames = [
    "../../include/hip/amd_detail/math_functions.h",
    "../../include/hip/amd_detail/device_functions.h",
    "../../include/hip/amd_detail/hip_fp16.h",
]

# Markdown file that the script (re)writes.
markdownFileName = "./hip-math-api.md"

# Fixed introduction written at the top of the generated markdown file.
# Adjacent string literals are concatenated by the parser, so this is a
# single string.
preamble = (
    "# HIP MATH APIs Documentation \n"
    "HIP supports most of the device functions supported by CUDA. Way to find the unsupported one is to search for the function and check its description\n"
    "Note: This document is not human generated. Any changes to this file will be discarded. Please make changes to Python3 script docs/markdown/device_md_gen.py\n\n"
    "## For Developers \n"
    "If you add or fixed a device function, make sure to add a signature of the function and definition later.\n"
    "For example, if you want to add `__device__ float __dotf(float4, float4)`, which does a dot product on 4 float vector components \n"
    "The way to add to the header is, \n"
    "```cpp \n"
    "__device__ static float __dotf(float4, float4); \n"
    "/*Way down in the file....*/\n"
    "__device__ static inline float __dotf(float4 x, float4 y) { \n"
    " /*implementation*/\n}\n"
    "```\n\n"
    "This helps python script to add the device function newly declared into markdown documentation (as it looks at functions with `;` at the end and `__device__` at the beginning)\n\n"
    "The next step would be to add Description to `deviceFuncDesc` dictionary in python script.\n"
    "From the above example, it can be writtern as,\n`deviceFuncDesc['__dotf'] = 'This functions takes 2 4 component float vector and outputs dot product across them'`\n\n"
)
|
||||
|
||||
def generateSnippet(name, description, signature):
    """Build the markdown section for one device function.

    The section is a level-3 heading holding ``name``, the ``signature``
    wrapped in a ``cpp`` fenced code block, and a bold "Description:" line
    holding ``description``, followed by blank lines.
    """
    heading = "### " + name + "\n"
    fenced_signature = "```cpp \n" + signature + "\n```\n"
    summary = "**Description:** " + description + "\n\n\n"
    return heading + fenced_signature + summary
|
||||
|
||||
def getName(line):
    """Return the function name from a declaration line.

    The name is the last whitespace-separated token before the first ``(``.
    Any run of whitespace (spaces or tabs) separates tokens, so a blank
    before the parenthesis does not produce an empty name. Leading ``*`` and
    ``&`` characters attached to the name (pointer or reference return
    types) are removed. Returns an empty string when nothing precedes the
    parenthesis.
    """
    before_paren = line.split('(', 1)[0]
    tokens = before_paren.split()
    if not tokens:
        return ''
    return tokens[-1].lstrip('*&')
|
||||
|
||||
# Regenerate the markdown file: write the preamble, then one section for each
# device function declaration found in the scanned headers.
# Opening with mode 'w' already truncates the file and the with-statements
# close both files, so no explicit truncate()/close() calls are needed.
with open(markdownFileName, 'w') as mdfd:
    mdfd.write(preamble)
    for fname in fnames:
        with open(fname) as fd:
            for line in fd:
                # Stop scanning this header at the first line that mentions
                # HIP_FAST_MATH.
                if 'HIP_FAST_MATH' in line:
                    break
                # Only lines containing both `__device__` and `;`, and not
                # containing `hip`, are treated as declarations.
                if '__device__' not in line or ';' not in line or 'hip' in line:
                    continue
                name = getName(line)
                if '//' in line:
                    # A declaration line containing `//` is reported as not
                    # supported.
                    status = "**NOT Supported**"
                else:
                    # Use the specific description when one exists.
                    status = deviceFuncDesc.get(name, "Supported")
                mdfd.write(generateSnippet(name, status, line))
|
||||
@@ -1,102 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright (C) 2019-2021 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
## Generates documentation about clang options.
##
## Reads option/support-flag pairs from clang_options.txt, walks the output of
## `clang --help`, and writes a markdown table to clang_options.md. Support
## flags are: s = Supported, n = Unsupported, h = Supported on Host only.
## The key and flag of every emitted row are also appended to
## tmp_clang_option.txt.

# The glob is not expanded by this assignment; it is expanded each time
# $clang is used unquoted below, so $clang must stay unquoted at its uses.
clang=/opt/rocm*/llvm/bin/clang

# From here on, everything written to stdout goes into the markdown file.
exec > clang_options.md

echo "# Support of Clang options"
echo " Clang version: $($clang --version | head -1|sed 's:\(.*\) (.* \(.*\)).*:\1 \2:')"
echo
echo "|Option|Support|Description|"
echo "|-------|------|-------|"

# db maps an option name to its one-letter support flag.
declare -A db
while read -r a b; do
  if [[ "$a" != "" && "$b" != "" ]]; then
    db[$a]="$b"
  fi
done <clang_options.txt

tmpf=tmp_clang_option.txt

# Remove the key/flag list left by a previous run.
[[ -f "$tmpf" ]] && rm "$tmpf"

$clang --help | sed '1,5d'| while read -r a b; do
  if [[ "$a" != "-"* ]]; then
    # Not an option line: the text becomes the description, and $opt keeps
    # the value set by an earlier line.
    desc="$a $b"
  elif [[ "$b" = *'>'* ]]; then
    # The rest of the line contains '>': try to split "option <arg>" from the
    # description that follows it.
    opt=$(echo $a $b| sed -e 's:\(^-[^ ]*[= ]*<[^<>]*>\) *\(.*\):\1:')
    desc=$(echo $a $b| sed -e 's:\(^-[^ ]*[= ]*<[^<>]*>\) *\(.*\):\2:')
    if [[ "$opt" == "$desc" ]]; then
      # Both substitutions returned the same text, so fall back to the plain
      # first-word / remainder split.
      opt="$a"
      desc="$b"
    fi
  else
    opt="$a"
    desc="$b"
  fi

  # Look up the support flag by option name (the text before the first
  # space, '=' or '<').
  supp=
  key=$(printf "%s" "$opt" |sed 's:\([^ =<]*\).*:\1:')
  if [[ "$key" != "" ]]; then
    supp="${db[$key]}"
  fi

  # Options missing from clang_options.txt get a flag guessed from their
  # description and name.
  if [[ "$supp" == "" ]]; then
    if [[ "$desc" = *AArch* ||\
          "$desc" = *MIPS* || \
          "$desc" = *ARM* || \
          "$desc" = *Arm* || \
          "$desc" = *SYCL* || \
          "$desc" = *PPC* || \
          "$desc" = *RISC-V* || \
          "$desc" = *WebAssembly* || \
          "$desc" = *Objective-C* || \
          "$opt" = *xray* \
       ]]; then
      supp="n"
    elif [[ "$opt" = *sanitize* ]]; then
      # Was *sanity*, which matches no clang option; the -fsanitize... options
      # are marked host-only in clang_options.txt.
      supp="h"
    else
      supp="s"
    fi
  fi

  # Keep the one-letter flag for the temp file; expand it for the table.
  s=$supp
  case $supp in
    s) supp="Supported";;
    n) supp="Unsupported";;
    h) supp="Supported on Host only";;
  esac

  # Strip leading blanks and escape '|' so the description cannot break the
  # markdown table.
  desc=$(echo "$desc"| sed -e 's:^ *::' -e 's:|:\\|:g')

  if [[ "$desc" != "" ]]; then
    printf "%s %s\n" "$key" "$s" >>"$tmpf"
    # NOTE(review): the expansions below are unquoted, as in the original, so
    # runs of whitespace are collapsed and glob characters may be expanded.
    echo '|`'$opt'`|'$supp'|`'$desc'`|'
  fi
done
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -1,87 +0,0 @@
|
||||
# HIP Bugs
|
||||
|
||||
<!-- toc -->
|
||||
- [HIP is more restrictive in enforcing restrictions](#hip-is-more-restrictive-in-enforcing-restrictions)
|
||||
|
||||
<!-- tocstop -->
|
||||
|
||||
|
||||
|
||||
### HIP is more restrictive in enforcing restrictions
|
||||
The language specifications for HIP and CUDA forbid calling a
|
||||
`__device__` function in a `__host__` context. In practice, you may observe
|
||||
differences in the strictness of this restriction, with HIP exhibiting a tighter
|
||||
adherence to the specification and thus less tolerant of infringing code. The
|
||||
solution is to ensure that all functions which are called in a
|
||||
`__device__` context are correctly annotated to reflect it. An interesting case
|
||||
where these differences emerge is shown below. This relies on the common
|
||||
[C++ Member Detector idiom][1] (as it would be implemented pre-C++11):
|
||||
|
||||
```c++
|
||||
#include <cassert>
|
||||
#include <type_traits>
|
||||
|
||||
struct aye { bool a[1]; };
|
||||
struct nay { bool a[2]; };
|
||||
|
||||
// Dual restriction is necessary in HIP if the detector is to work for
|
||||
// __device__ contexts as well as __host__ ones. NVCC is less strict.
|
||||
template<typename T>
|
||||
__host__ __device__
|
||||
const T& cref_t();
|
||||
|
||||
template<typename T>
|
||||
struct Has_call_operator {
|
||||
// Dual restriction is necessary in HIP if the detector is to work for
|
||||
// __device__ contexts as well as __host__ ones. NVCC is less strict.
|
||||
template<typename C>
|
||||
__host__ __device__
|
||||
static
|
||||
aye test(
|
||||
C const *,
|
||||
typename std::enable_if<
|
||||
(sizeof(cref_t<C>().operator()()) > 0)>::type* = nullptr);
|
||||
static
|
||||
nay test(...);
|
||||
|
||||
enum { value = sizeof(test(static_cast<T*>(0))) == sizeof(aye) };
|
||||
};
|
||||
|
||||
template<typename T, typename U, bool callable = Has_call_operator<U>::value>
|
||||
struct Wrapper {
|
||||
template<typename V>
|
||||
V f() const { return T{1}; }
|
||||
};
|
||||
|
||||
|
||||
template<typename T, typename U>
|
||||
struct Wrapper<T, U, true> {
|
||||
template<typename V>
|
||||
V f() const { return T{10}; }
|
||||
};
|
||||
|
||||
// This specialisation will yield a compile-time error, if selected.
|
||||
template<typename T, typename U>
|
||||
struct Wrapper<T, U, false> {};
|
||||
|
||||
template<typename T>
|
||||
struct Functor;
|
||||
|
||||
template<> struct Functor<float> {
|
||||
__device__
|
||||
float operator()() const { return 42.0f; }
|
||||
};
|
||||
|
||||
__device__
|
||||
void this_will_not_compile_if_detector_is_not_marked_device()
|
||||
{
|
||||
float f = Wrapper<float, Functor<float>>().f<float>();
|
||||
}
|
||||
|
||||
__host__
|
||||
void this_will_not_compile_if_detector_is_marked_device_only()
|
||||
{
|
||||
float f = Wrapper<float, Functor<float>>().f<float>();
|
||||
}
|
||||
```
|
||||
[1]: https://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Member_Detector
|
||||
@@ -1,243 +0,0 @@
|
||||
# HIP Debugging
|
||||
HIP provides several techniques for developers to trace and debug code during execution. This section describes some details and practical suggestions on debugging.
|
||||
|
||||
Table of Contents
|
||||
=================
|
||||
|
||||
* [ Debugging Tools](#debugging-tools)
|
||||
* [Using ltrace](#using-ltrace)
|
||||
* [Using ROCgdb](#using-rocgdb)
|
||||
* [Other Debugging Tools](#other-debugging-tools)
|
||||
* [ Debugging HIP Applications](#debugging-hip-applications)
|
||||
* [ Useful Environment Variables](#useful-environment-variables)
|
||||
* [Kernel Enqueue Serialization](#kernel-enqueue-serialization)
|
||||
* [Making Device visible](#making-device-visible)
|
||||
* [Dump code object](#dump-code-object)
|
||||
* [HSA related environment variables](#hsa-related-environment-variables)
|
||||
* [ General Debugging Tips](#general-debugging-tips)
|
||||
|
||||
## Debugging tools
|
||||
|
||||
### Using ltrace
|
||||
ltrace is a standard linux tool which provides a message to stderr on every dynamic library call.
|
||||
Since ROCr and the ROCt (the ROC thunk, which is the thin user-space interface to the ROC kernel driver) are both dynamic libraries, this provides an easy way to trace the activity in these libraries.
|
||||
Tracing can be a powerful way to quickly observe the flow of the application before diving into the details with a command-line debugger.
|
||||
ltrace is a helpful tool to visualize the runtime behavior of the entire ROCm software stack.
|
||||
The trace can also show performance issues related to accidental calls to expensive API calls on the critical path.
|
||||
|
||||
Here is a simple example showing the command line used to trace HIP APIs, followed by its output:
|
||||
|
||||
```
|
||||
$ ltrace -C -e "hip*" ./hipGetChanDesc
|
||||
hipGetChanDesc->hipCreateChannelDesc(0x7ffdc4b66860, 32, 0, 0) = 0x7ffdc4b66860
|
||||
hipGetChanDesc->hipMallocArray(0x7ffdc4b66840, 0x7ffdc4b66860, 8, 8) = 0
|
||||
hipGetChanDesc->hipGetChannelDesc(0x7ffdc4b66848, 0xa63990, 5, 1) = 0
|
||||
hipGetChanDesc->hipFreeArray(0xa63990, 0, 0x7f8c7fe13778, 0x7ffdc4b66848) = 0
|
||||
PASSED!
|
||||
+++ exited (status 0) +++
|
||||
```
|
||||
|
||||
Below is another example, in which the command line traces only HSA APIs, followed by its output:
|
||||
|
||||
```
|
||||
$ ltrace -C -e "hsa*" ./hipGetChanDesc
|
||||
libamdhip64.so.4->hsa_init(0, 0x7fff325a69d0, 0x9c80e0, 0 <unfinished ...>
|
||||
libhsa-runtime64.so.1->hsaKmtOpenKFD(0x7fff325a6590, 0x9c38c0, 0, 1) = 0
|
||||
libhsa-runtime64.so.1->hsaKmtGetVersion(0x7fff325a6608, 0, 0, 0) = 0
|
||||
libhsa-runtime64.so.1->hsaKmtReleaseSystemProperties(3, 0x80084b01, 0, 0) = 0
|
||||
libhsa-runtime64.so.1->hsaKmtAcquireSystemProperties(0x7fff325a6610, 0, 0, 1) = 0
|
||||
libhsa-runtime64.so.1->hsaKmtGetNodeProperties(0, 0x7fff325a66a0, 0, 0) = 0
|
||||
libhsa-runtime64.so.1->hsaKmtGetNodeMemoryProperties(0, 1, 0x9c42b0, 0x936012) = 0
|
||||
...
|
||||
<... hsaKmtCreateEvent resumed> ) = 0
|
||||
libhsa-runtime64.so.1->hsaKmtAllocMemory(0, 4096, 64, 0x7fff325a6690) = 0
|
||||
libhsa-runtime64.so.1->hsaKmtMapMemoryToGPUNodes(0x7f1202749000, 4096, 0x7fff325a6690, 0) = 0
|
||||
libhsa-runtime64.so.1->hsaKmtCreateEvent(0x7fff325a6700, 0, 0, 0x7fff325a66f0) = 0
|
||||
libhsa-runtime64.so.1->hsaKmtAllocMemory(1, 0x100000000, 576, 0x7fff325a67d8) = 0
|
||||
libhsa-runtime64.so.1->hsaKmtAllocMemory(0, 8192, 64, 0x7fff325a6790) = 0
|
||||
libhsa-runtime64.so.1->hsaKmtMapMemoryToGPUNodes(0x7f120273c000, 8192, 0x7fff325a6790, 0) = 0
|
||||
libhsa-runtime64.so.1->hsaKmtAllocMemory(0, 4096, 4160, 0x7fff325a6450) = 0
|
||||
libhsa-runtime64.so.1->hsaKmtMapMemoryToGPUNodes(0x7f120273a000, 4096, 0x7fff325a6450, 0) = 0
|
||||
libhsa-runtime64.so.1->hsaKmtSetTrapHandler(1, 0x7f120273a000, 4096, 0x7f120273c000) = 0
|
||||
<... hsa_init resumed> ) = 0
|
||||
libamdhip64.so.4->hsa_system_get_major_extension_table(513, 1, 24, 0x7f1202597930) = 0
|
||||
libamdhip64.so.4->hsa_iterate_agents(0x7f120171f050, 0, 0x7fff325a67f8, 0 <unfinished ...>
|
||||
libamdhip64.so.4->hsa_agent_get_info(0x94f110, 17, 0x7fff325a67e8, 0) = 0
|
||||
libamdhip64.so.4->hsa_amd_agent_iterate_memory_pools(0x94f110, 0x7f1201722816, 0x7fff325a67f0, 0x7f1201722816 <unfinished ...>
|
||||
libamdhip64.so.4->hsa_amd_memory_pool_get_info(0x9c7fb0, 0, 0x7fff325a6744, 0x7fff325a67f0) = 0
|
||||
libamdhip64.so.4->hsa_amd_memory_pool_get_info(0x9c7fb0, 1, 0x7fff325a6748, 0x7f1200d82df4) = 0
|
||||
...
|
||||
<... hsa_amd_agent_iterate_memory_pools resumed> ) = 0
|
||||
libamdhip64.so.4->hsa_agent_get_info(0x9dbf30, 17, 0x7fff325a67e8, 0) = 0
|
||||
<... hsa_iterate_agents resumed> ) = 0
|
||||
libamdhip64.so.4->hsa_agent_get_info(0x9dbf30, 0, 0x7fff325a6850, 3) = 0
|
||||
libamdhip64.so.4->hsa_agent_get_info(0x9dbf30, 0xa000, 0x9e7cd8, 0) = 0
|
||||
libamdhip64.so.4->hsa_agent_iterate_isas(0x9dbf30, 0x7f1201720411, 0x7fff325a6760, 0x7f1201720411) = 0
|
||||
libamdhip64.so.4->hsa_isa_get_info_alt(0x94e7c8, 0, 0x7fff325a6728, 1) = 0
|
||||
libamdhip64.so.4->hsa_isa_get_info_alt(0x94e7c8, 1, 0x9e7f90, 0) = 0
|
||||
libamdhip64.so.4->hsa_agent_get_info(0x9dbf30, 4, 0x9e7ce8, 0) = 0
|
||||
...
|
||||
<... hsa_amd_memory_pool_allocate resumed> ) = 0
|
||||
libamdhip64.so.4->hsa_ext_image_create(0x9dbf30, 0xa1c4c8, 0x7f10f2800000, 3 <unfinished ...>
|
||||
libhsa-runtime64.so.1->hsaKmtAllocMemory(0, 4096, 64, 0x7fff325a6740) = 0
|
||||
libhsa-runtime64.so.1->hsaKmtQueryPointerInfo(0x7f1202736000, 0x7fff325a65e0, 0, 0) = 0
|
||||
libhsa-runtime64.so.1->hsaKmtMapMemoryToGPUNodes(0x7f1202736000, 4096, 0x7fff325a66e8, 0) = 0
|
||||
<... hsa_ext_image_create resumed> ) = 0
|
||||
libamdhip64.so.4->hsa_ext_image_destroy(0x9dbf30, 0x7f1202736000, 0x9dbf30, 0 <unfinished ...>
|
||||
libhsa-runtime64.so.1->hsaKmtUnmapMemoryToGPU(0x7f1202736000, 0x7f1202736000, 4096, 0x9c8050) = 0
|
||||
libhsa-runtime64.so.1->hsaKmtFreeMemory(0x7f1202736000, 4096, 0, 0) = 0
|
||||
<... hsa_ext_image_destroy resumed> ) = 0
|
||||
libamdhip64.so.4->hsa_amd_memory_pool_free(0x7f10f2800000, 0x7f10f2800000, 256, 0x9e76f0) = 0
|
||||
PASSED!
|
||||
```
|
||||
|
||||
### Using ROCgdb
|
||||
HIP developers on ROCm can use AMD's ROCgdb for debugging and profiling.
|
||||
ROCgdb is the ROCm source-level debugger for Linux. It is based on GDB, the GNU source-level debugger, is the equivalent of cuda-gdb, and can be used with debugger frontends such as Eclipse, VS Code, or gdb-dashboard.
|
||||
For details, see (https://github.com/ROCm-Developer-Tools/ROCgdb).
|
||||
|
||||
Below is an example of how to use ROCgdb to run and debug a HIP application. rocgdb is installed with the ROCm package in the folder /opt/rocm/bin.
|
||||
|
||||
```
|
||||
$ export PATH=$PATH:/opt/rocm/bin
|
||||
$ rocgdb ./hipTexObjPitch
|
||||
GNU gdb (rocm-dkms-no-npi-hipclang-6549) 10.1
|
||||
Copyright (C) 2020 Free Software Foundation, Inc.
|
||||
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
|
||||
...
|
||||
For bug reporting instructions, please see:
|
||||
<https://github.com/ROCm-Developer-Tools/ROCgdb/issues>.
|
||||
Find the GDB manual and other documentation resources online at:
|
||||
<http://www.gnu.org/software/gdb/documentation/>.
|
||||
|
||||
...
|
||||
Reading symbols from ./hipTexObjPitch...
|
||||
(gdb) break main
|
||||
Breakpoint 1 at 0x4013d1: file /home/test/hip/tests/src/texture/hipTexObjPitch.cpp, line 98.
|
||||
(gdb) run
|
||||
Starting program: /home/test/hip/build/directed_tests/texture/hipTexObjPitch
|
||||
[Thread debugging using libthread_db enabled]
|
||||
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
|
||||
|
||||
Breakpoint 1, main ()
|
||||
at /home/test/hip/tests/src/texture/hipTexObjPitch.cpp:98
|
||||
98 texture2Dtest<float>();
|
||||
(gdb)c
|
||||
|
||||
```
|
||||
|
||||
### Other Debugging Tools
|
||||
There are also other debugging tools available online; developers can search for them and choose the one that best suits their debugging requirements.
|
||||
|
||||
## Debugging HIP Applications
|
||||
|
||||
Below is an example to show how to get useful information from the debugger while running an application, which caused an issue of GPUVM fault.
|
||||
|
||||
```
|
||||
Memory access fault by GPU node-1 on address 0x5924000. Reason: Page not present or supervisor privilege.
|
||||
|
||||
Program received signal SIGABRT, Aborted.
|
||||
[Switching to Thread 0x7fffdffb5700 (LWP 14893)]
|
||||
0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
|
||||
56 ../nptl/sysdeps/unix/sysv/linux/raise.c: No such file or directory.
|
||||
(gdb) bt
|
||||
#0 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
|
||||
#1 0x00007ffff205b028 in __GI_abort () at abort.c:89
|
||||
#2 0x00007ffff6f960eb in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
|
||||
#3 0x00007ffff6f99ea5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
|
||||
#4 0x00007ffff6f78107 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
|
||||
#5 0x00007ffff744f184 in start_thread (arg=0x7fffdffb5700) at pthread_create.c:312
|
||||
#6 0x00007ffff211b37d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
|
||||
(gdb) info threads
|
||||
Id Target Id Frame
|
||||
4 Thread 0x7fffdd521700 (LWP 14895) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
|
||||
3 Thread 0x7fffddd22700 (LWP 14894) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
|
||||
* 2 Thread 0x7fffdffb5700 (LWP 14893) "caffe" 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
|
||||
1 Thread 0x7ffff7fa6ac0 (LWP 14892) "caffe" 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
|
||||
(gdb) thread 1
|
||||
[Switching to thread 1 (Thread 0x7ffff7fa6ac0 (LWP 14892))]
|
||||
#0 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
|
||||
(gdb) bt
|
||||
#0 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
|
||||
#1 0x00007ffff6f929ba in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
|
||||
#2 0x00007fffe080beca in HSADispatch::waitComplete() () from /opt/rocm/hcc/lib/libmcwamp_hsa.so
|
||||
#3 0x00007fffe080415f in HSADispatch::dispatchKernelAsync(Kalmar::HSAQueue*, void const*, int, bool) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so
|
||||
#4 0x00007fffe080238e in Kalmar::HSAQueue::dispatch_hsa_kernel(hsa_kernel_dispatch_packet_s const*, void const*, unsigned long, hc::completion_future*) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so
|
||||
#5 0x00007ffff7bb7559 in hipModuleLaunchKernel () from /opt/rocm/hip/lib/libhip_hcc.so
|
||||
#6 0x00007ffff2e6cd2c in mlopen::HIPOCKernel::run (this=0x7fffffffb5a8, args=0x7fffffffb2a8, size=80) at /root/MIOpen/src/hipoc/hipoc_kernel.cpp:15
|
||||
...
|
||||
```
|
||||
|
||||
## Useful Environment Variables
|
||||
HIP provides some environment variables which allow HIP, hip-clang, or HSA driver to disable some feature or optimization.
|
||||
These are not intended for production but can be useful to diagnose synchronization problems in the application (or driver).
|
||||
|
||||
Some of the most useful environment variables are described here. They are supported on the ROCm path.
|
||||
|
||||
### Kernel Enqueue Serialization
|
||||
Developers can control kernel command serialization from the host using the environment variable,
|
||||
|
||||
AMD_SERIALIZE_KERNEL, for serializing kernel enqueue.
|
||||
AMD_SERIALIZE_KERNEL = 1, Wait for completion before enqueue,
|
||||
AMD_SERIALIZE_KERNEL = 2, Wait for completion after enqueue,
|
||||
AMD_SERIALIZE_KERNEL = 3, Both.
|
||||
|
||||
Or
|
||||
AMD_SERIALIZE_COPY, for serializing copies.
|
||||
|
||||
AMD_SERIALIZE_COPY = 1, Wait for completion before enqueue,
|
||||
AMD_SERIALIZE_COPY = 2, Wait for completion after enqueue,
|
||||
AMD_SERIALIZE_COPY = 3, Both.
|
||||
|
||||
So HIP runtime can wait for GPU idle before/after any GPU command depending on the environment setting.
|
||||
|
||||
### Making Device visible
|
||||
For systems with multiple devices, it's possible to make only certain device(s) visible to HIP by setting the environment variable,
|
||||
HIP_VISIBLE_DEVICES, only devices whose index is present in the sequence are visible to HIP.
|
||||
|
||||
For example,
|
||||
```
|
||||
$ HIP_VISIBLE_DEVICES=0,1
|
||||
```
|
||||
|
||||
or in the application,
|
||||
```
|
||||
if (totalDeviceNum > 2) {
|
||||
setenv("HIP_VISIBLE_DEVICES", "0,1,2", 1);
|
||||
assert(getDeviceNumber(false) == 3);
|
||||
... ...
|
||||
}
|
||||
```
|
||||
|
||||
### Dump code object
|
||||
Developers can dump code objects to analyze compiler-related issues by setting the environment variable,
|
||||
GPU_DUMP_CODE_OBJECT
|
||||
|
||||
### HSA related environment variables
|
||||
HSA provides some environment variables that help to analyze issues in the driver or hardware, for example,
|
||||
|
||||
HSA_ENABLE_SDMA=0
|
||||
It causes host-to-device and device-to-host copies to use compute shader blit kernels rather than the dedicated DMA copy engines.
|
||||
Compute shader copies have low latency (typically < 5us) and can achieve approximately 80% of the bandwidth of the DMA copy engine.
|
||||
This environment variable is useful to isolate issues with the hardware copy engines.
|
||||
|
||||
HSA_ENABLE_INTERRUPT=0
|
||||
Causes completion signals to be detected with memory-based polling rather than interrupts.
|
||||
This environment variable can be useful to diagnose interrupt storm issues in the driver.
|
||||
|
||||
## General Debugging Tips
|
||||
- 'gdb --args' can be used to conveniently pass the executable and arguments to gdb.
|
||||
- From inside GDB, you can set environment variables using "set env". Note the command does not use an '=' sign:
|
||||
|
||||
```
|
||||
(gdb) set env AMD_SERIALIZE_KERNEL 3
|
||||
```
|
||||
- The fault will be caught by the runtime but was actually generated by an asynchronous command running on the GPU. So, the GDB backtrace will show a path in the runtime.
|
||||
- To determine the true location of the fault, force the kernels to execute synchronously by setting the environment variables AMD_SERIALIZE_KERNEL=3 AMD_SERIALIZE_COPY=3. This will force the HIP runtime to wait for the kernel to finish executing before returning. If the fault occurs during the execution of a kernel, you can see the code which launched the kernel inside the backtrace. A bit of guesswork is required to determine which thread is actually causing the issue - typically it will be the thread which is waiting inside libhsa-runtime64.so.
|
||||
- VM faults inside kernels can be caused by:
|
||||
- incorrect code (ie a for loop which extends past array boundaries),
|
||||
- memory issues - kernel arguments which are invalid (null pointers, unregistered host pointers, bad pointers),
|
||||
- synchronization issues,
|
||||
- compiler issues (incorrect code generation from the compiler),
|
||||
- runtime issues.
|
||||
|
||||
@@ -1,34 +0,0 @@
|
||||
# HIP Deprecated APIs
|
||||
|
||||
## HIP Context Management APIs
|
||||
|
||||
CUDA supports the cuCtx API, the Driver API that defines "Context" and "Devices" as separate entities. Contexts contain a single device, and a device can theoretically have multiple contexts. HIP initially added limited support for these APIs to facilitate easy porting from existing driver code. These APIs are now marked as deprecated since there are better alternative interfaces (such as hipSetDevice or the stream API) to achieve the required functions.
|
||||
|
||||
### hipCtxPopCurrent
|
||||
### hipCtxPushCurrent
|
||||
### hipCtxSetCurrent
|
||||
### hipCtxGetCurrent
|
||||
### hipCtxGetDevice
|
||||
### hipCtxGetApiVersion
|
||||
### hipCtxGetCacheConfig
|
||||
### hipCtxSetCacheConfig
|
||||
### hipCtxSetSharedMemConfig
|
||||
### hipCtxGetSharedMemConfig
|
||||
### hipCtxSynchronize
|
||||
### hipCtxGetFlags
|
||||
### hipCtxEnablePeerAccess
|
||||
### hipCtxDisablePeerAccess
|
||||
|
||||
## HIP Memory Management APIs
|
||||
|
||||
### hipMallocHost
|
||||
Should use "hipHostMalloc" instead.
|
||||
|
||||
### hipMemAllocHost
|
||||
Should use "hipHostMalloc" instead.
|
||||
|
||||
### hipHostAlloc
|
||||
Should use "hipHostMalloc" instead.
|
||||
|
||||
### hipFreeHost
|
||||
Should use "hipHostFree" instead.
|
||||
@@ -1,257 +0,0 @@
|
||||
# FAQ
|
||||
|
||||
<!-- toc -->
|
||||
|
||||
- [What APIs and features does HIP support?](#what-apis-and-features-does-hip-support)
|
||||
- [What is not supported?](#what-is-not-supported)
|
||||
* [Runtime/Driver API features](#runtimedriver-api-features)
|
||||
* [Kernel language features](#kernel-language-features)
|
||||
- [Is HIP a drop-in replacement for CUDA?](#is-hip-a-drop-in-replacement-for-cuda)
|
||||
- [What specific version of CUDA does HIP support?](#what-specific-version-of-cuda-does-hip-support)
|
||||
- [What libraries does HIP support?](#what-libraries-does-hip-support)
|
||||
- [How does HIP compare with OpenCL?](#how-does-hip-compare-with-opencl)
|
||||
- [How does porting CUDA to HIP compare to porting CUDA to OpenCL?](#how-does-porting-cuda-to-hip-compare-to-porting-cuda-to-opencl)
|
||||
- [What hardware does HIP support?](#what-hardware-does-hip-support)
|
||||
- [Do HIPIFY tools automatically convert all source code?](#do-hipify-tools-automatically-convert-all-source-code)
|
||||
- [What is NVCC?](#what-is-nvcc)
|
||||
- [What is HIP-Clang?](#what-is-hip-clang)
|
||||
- [Why use HIP rather than supporting CUDA directly?](#why-use-hip-rather-than-supporting-cuda-directly)
|
||||
- [Can I develop HIP code on an Nvidia CUDA platform?](#can-i-develop-hip-code-on-an-nvidia-cuda-platform)
|
||||
- [Can I develop HIP code on an AMD HIP-Clang platform?](#can-i-develop-hip-code-on-an-amd-hip-clang-platform)
|
||||
- [What is ROCclr?](#what-is-rocclr)
|
||||
- [Can a HIP binary run on both AMD and Nvidia platforms?](#can-a-hip-binary-run-on-both-amd-and-nvidia-platforms)
|
||||
- [On HIP-Clang, can I link HIP code with host code compiled with another compiler such as gcc, icc, or clang?](#on-HIP-Clang-can-i-link-hip-code-with-host-code-compiled-with-another-compiler-such-as-gcc-icc-or-clang-)
|
||||
- [HIP detected my platform (hip-clang vs nvcc) incorrectly - what should I do?](#hip-detected-my-platform-hip-clang-vs-nvcc-incorrectly---what-should-i-do)
|
||||
- [Can I install both CUDA SDK and HIP-Clang on the same machine?](#can-i-install-both-cuda-sdk-and-hip-clang-on-the-same-machine)
|
||||
- [On CUDA, can I mix CUDA code with HIP code?](#on-cuda-can-i-mix-cuda-code-with-hip-code)
|
||||
- [How do I trace HIP application flow?](#how-do-i-trace-hip-application-flow)
|
||||
- [What if HIP generates an error of "symbol multiply defined!" only on AMD machine?](#what-if-hip-generates-error-of-symbol-multiply-defined-only-on-amd-machine)
|
||||
- [What is maximum limit of kernel launching parameter?](#what-is-maximum-limit-of-kernel-launching-parameter)
|
||||
- [Are _shfl_*_sync functions supported on HIP platform?](#are-_shfl_*_sync-functions-supported-on-hip-platform)
|
||||
- [How to create a guard for code that is specific to the host or the GPU?](#how-to-create-a-guard-for-code-that-is-specific-to-the-host-or-the-gpu)
|
||||
- [Why _OpenMP is undefined when compiling with -fopenmp?](#why-_openmp-is-undefined-when-compiling-with--fopenmp)
|
||||
- [Does the HIP-Clang compiler support extern shared declarations?](#does-the-hip-clang-compiler-support-extern-shared-declarations)
|
||||
- [I have multiple HIP enabled devices and I am getting an error message hipErrorNoBinaryForGpu: Unable to find code object for all current devices?](#i-have-multiple-hip-enabled-devices-and-i-am-getting-an-error-message-hipErrorNoBinaryForGpu-unable-to-find-code-object-for-all-current-devices)
|
||||
<!-- tocstop -->
|
||||
|
||||
### What APIs and features does HIP support?
|
||||
HIP provides the following:
|
||||
- Devices (hipSetDevice(), hipGetDeviceProperties(), etc.)
|
||||
- Memory management (hipMalloc(), hipMemcpy(), hipFree(), etc.)
|
||||
- Streams (hipStreamCreate(),hipStreamSynchronize(), hipStreamWaitEvent(), etc.)
|
||||
- Events (hipEventRecord(), hipEventElapsedTime(), etc.)
|
||||
- Kernel launching (hipLaunchKernel is a standard C/C++ function that replaces <<< >>>)
|
||||
- HIP Module API to control when and how code is loaded.
|
||||
- CUDA-style kernel coordinate functions (threadIdx, blockIdx, blockDim, gridDim)
|
||||
- Cross-lane instructions including shfl, ballot, any, all
|
||||
- Most device-side math built-ins
|
||||
- Error reporting (hipGetLastError(), hipGetErrorString())
|
||||
|
||||
The HIP API documentation describes each API and its limitations, if any, compared with the equivalent CUDA API.
|
||||
|
||||
### What is not supported?
|
||||
|
||||
#### Runtime/Driver API features
|
||||
At a high-level, the following features are not supported:
|
||||
- Textures (partial support available)
|
||||
- Dynamic parallelism (CUDA 5.0)
|
||||
- Managed memory (CUDA 6.5)
|
||||
- Graphics interoperability with OpenGL or Direct3D
|
||||
- CUDA IPC Functions (Under Development)
|
||||
- CUDA array, mipmappedArray and pitched memory
|
||||
- Queue priority controls
|
||||
|
||||
See the [API Support Table](CUDA_Runtime_API_functions_supported_by_HIP.md) for more detailed information.
|
||||
|
||||
#### Kernel language features
|
||||
- C++-style device-side dynamic memory allocations (free, new, delete) (CUDA 4.0)
|
||||
- Virtual functions, indirect functions and try/catch (CUDA 4.0)
|
||||
- `__prof_trigger`
|
||||
- PTX assembly (CUDA 4.0). HIP-Clang supports inline GCN assembly.
|
||||
- Several kernel features are under development. See the [HIP Kernel Language](hip_kernel_language.md) for more information. These include:
|
||||
- printf
|
||||
|
||||
|
||||
### Is HIP a drop-in replacement for CUDA?
|
||||
No. HIP provides porting tools which do most of the work to convert CUDA code into portable C++ code that uses the HIP APIs.
|
||||
Most developers will port their code from CUDA to HIP and then maintain the HIP version.
|
||||
HIP code provides the same performance as native CUDA code, plus the benefits of running on AMD platforms.
|
||||
|
||||
### What specific version of CUDA does HIP support?
|
||||
HIP APIs and features do not map to a specific CUDA version. HIP provides a strong subset of the functionality provided in CUDA, and the hipify tools can scan code to identify any unsupported CUDA functions - this is useful for identifying the specific features required by a given application.
|
||||
|
||||
However, we can provide a rough summary of the features included in each CUDA SDK and the support level in HIP. Each bullet below lists the major new language features in each CUDA release and then indicates which are supported/not supported in HIP:
|
||||
|
||||
- CUDA 4.0 and earlier :
|
||||
- HIP supports CUDA 4.0 except for the limitations described above.
|
||||
- CUDA 5.0 :
|
||||
- Dynamic Parallelism (not supported)
|
||||
- cuIpc functions (under development).
|
||||
- CUDA 5.5 :
|
||||
- CUPTI (not directly supported, [AMD GPUPerfAPI](http://developer.amd.com/tools-and-sdks/graphics-development/gpuperfapi/) can be used as an alternative in some cases)
|
||||
- CUDA 6.0 :
|
||||
- Managed memory (under development)
|
||||
- CUDA 6.5 :
|
||||
- __shfl intrinsics (supported)
|
||||
- CUDA 7.0 :
|
||||
- Per-thread-streams (under development)
|
||||
- C++11 (Hip-Clang supports all of C++11, all of C++14 and some C++17 features)
|
||||
- CUDA 7.5 :
|
||||
- float16 (supported)
|
||||
- CUDA 8.0 :
|
||||
- Page Migration including cudaMemAdvise, cudaMemPrefetch, other cudaMem* APIs(not supported)
|
||||
- CUDA 9.0 :
|
||||
- Cooperative Launch, Surface Object Management, Version Management
|
||||
|
||||
### What libraries does HIP support?
|
||||
HIP includes growing support for the four key math libraries using hcBlas, hcFft, hcrng and hcsparse, as well as MIOpen for machine intelligence applications.
|
||||
These offer pointer-based memory interfaces (as opposed to opaque buffers) and can be easily interfaced with other HIP applications.
|
||||
The hip interfaces support both ROCm and CUDA paths, with familiar library interfaces.
|
||||
|
||||
- [hipBlas](https://github.com/ROCmSoftwarePlatform/hipBLAS), which utilizes [rocBlas](https://github.com/ROCmSoftwarePlatform/rocBLAS).
|
||||
- [hipfft](https://github.com/ROCmSoftwarePlatform/hcFFT)
|
||||
- [hipsparse](https://github.com/ROCmSoftwarePlatform/hcSPARSE)
|
||||
- [hiprng](https://github.com/ROCmSoftwarePlatform/hcrng)
|
||||
|
||||
Additionally, some of the cublas routines are automatically converted to hipblas equivalents by the HIPIFY tools. These APIs use cublas or hcblas depending on the platform and replace the need to use conditional compilation.
|
||||
|
||||
### How does HIP compare with OpenCL?
|
||||
Both AMD and Nvidia support OpenCL 1.2 on their devices so that developers can write portable code.
|
||||
HIP offers several benefits over OpenCL:
|
||||
- Developers can code in C++ as well as mix host and device C++ code in their source files. HIP C++ code can use templates, lambdas, classes and so on.
|
||||
- The HIP API is less verbose than OpenCL and is familiar to CUDA developers.
|
||||
- Because both CUDA and HIP are C++ languages, porting from CUDA to HIP is significantly easier than porting from CUDA to OpenCL.
|
||||
- HIP uses the best available development tools on each platform: on Nvidia GPUs, HIP code compiles using NVCC and can employ the nSight profiler and debugger (unlike OpenCL on Nvidia GPUs).
|
||||
- HIP provides pointers and host-side pointer arithmetic.
|
||||
- HIP provides device-level control over memory allocation and placement.
|
||||
- HIP offers an offline compilation model.
|
||||
|
||||
### How does porting CUDA to HIP compare to porting CUDA to OpenCL?
|
||||
Both HIP and CUDA are dialects of C++, and thus porting between them is relatively straightforward.
|
||||
Both dialects support templates, classes, lambdas, and other C++ constructs.
|
||||
As one example, the hipify-perl tool was originally a Perl script that used simple text conversions from CUDA to HIP.
|
||||
HIP and CUDA provide similar math library calls as well. In summary, the HIP philosophy was to make the HIP language close enough to CUDA that the porting effort is relatively simple.
|
||||
This reduces the potential for error, and also makes it easy to automate the translation. HIP's goal is to quickly get the ported program running on both platforms with little manual intervention, so that the programmer can focus on performance optimizations.
|
||||
|
||||
There have been several tools that have attempted to convert CUDA into OpenCL, such as CU2CL. OpenCL is a C99-based kernel language (rather than C++) and also does not support single-source compilation.
|
||||
As a result, the OpenCL syntax is different from CUDA, and the porting tools have to perform some heroic transformations to bridge this gap.
|
||||
The tools also struggle with more complex CUDA applications, in particular, those that use templates, classes, or other C++ features inside the kernel.
|
||||
|
||||
### What hardware does HIP support?
|
||||
- For AMD platforms, see the [ROCm documentation](https://github.com/RadeonOpenCompute/ROCm#supported-gpus) for the list of supported platforms.
|
||||
- For Nvidia platforms, HIP requires Unified Memory and should run on any device supporting CUDA SDK 6.0 or newer. We have tested the Nvidia Titan and Tesla K40.
|
||||
|
||||
### Do HIPIFY tools automatically convert all source code?
|
||||
Typically, HIPIFY tools can automatically convert almost all run-time code, and the coordinate indexing device code ( threadIdx.x -> hipThreadIdx_x ).
|
||||
Most device code needs no additional conversion since HIP and CUDA have similar names for math and built-in functions.
|
||||
The hipify-clang tool will automatically modify the kernel signature as needed (automating a step that used to be done manually).
|
||||
Additional porting may be required to deal with architecture feature queries or with CUDA capabilities that HIP doesn't support.
|
||||
In general, developers should always expect to perform some platform-specific tuning and optimization.
|
||||
|
||||
### What is NVCC?
|
||||
NVCC is Nvidia's compiler driver for compiling "CUDA C++" code into PTX or device code for Nvidia GPUs. It's a closed-source binary compiler that is provided by the CUDA SDK.
|
||||
|
||||
### What is HIP-Clang?
|
||||
HIP-Clang is a Clang/LLVM based compiler to compile HIP programs which can run on AMD platform.
|
||||
|
||||
### Why use HIP rather than supporting CUDA directly?
|
||||
While HIP is a strong subset of the CUDA, it is a subset. The HIP layer allows that subset to be clearly defined and documented.
|
||||
Developers who code to the HIP API can be assured their code will remain portable across Nvidia and AMD platforms.
|
||||
In addition, HIP defines portable mechanisms to query architectural features and supports a larger 64-bit wavesize which expands the return type for cross-lane functions like ballot and shuffle from 32-bit ints to 64-bit ints.
|
||||
|
||||
### Can I develop HIP code on an Nvidia CUDA platform?
|
||||
Yes. HIP's CUDA path only exposes the APIs and functionality that work on both NVCC and AMDGPU back-ends.
|
||||
"Extra" APIs, parameters, and features which exist in CUDA but not in HIP-Clang will typically result in compile-time or run-time errors.
|
||||
Developers need to use the HIP API for most accelerator code and bracket any CUDA-specific code with preprocessor conditionals.
|
||||
Developers concerned about portability should, of course, run on both platforms, and should expect to tune for performance.
|
||||
In some cases, CUDA has a richer set of modes for some APIs, and some C++ capabilities such as virtual functions - see the HIP API documentation for more details.
|
||||
|
||||
### Can I develop HIP code on an AMD HIP-Clang platform?
|
||||
Yes. HIP's HIP-Clang path only exposes the APIs and functions that work on AMD runtime back ends. "Extra" APIs, parameters and features that appear in HIP-Clang but not CUDA will typically cause compile- or run-time errors. Developers must use the HIP API for most accelerator code and bracket any HIP-Clang specific code with preprocessor conditionals. Those concerned about portability should, of course, test their code on both platforms and should tune it for performance. Typically, HIP-Clang supports a more modern set of C++11/C++14/C++17 features, so HIP developers who want portability should be careful when using advanced C++ features on the HIP-Clang path.
|
||||
|
||||
### How to use HIP-Clang to build HIP programs?
|
||||
The environment variable can be used to set compiler path:
|
||||
- HIP_CLANG_PATH: path to hip-clang. When set, this variable lets hipcc use hip-clang for compilation/linking.
|
||||
|
||||
There is an alternative environment variable to set compiler path:
|
||||
- HIP_ROCCLR_HOME: path to root directory of the HIP-ROCclr runtime. When set, this variable lets hipcc use hip-clang from the ROCclr distribution.
|
||||
NOTE: If HIP_ROCCLR_HOME is set, there is no need to set HIP_CLANG_PATH since hipcc will deduce them from HIP_ROCCLR_HOME.
|
||||
|
||||
### What is ROCclr?
|
||||
ROCclr (Radeon Open Compute Common Language Runtime) is a virtual device interface through which compute runtimes interact with backends such as ROCr on Linux, as well as PAL on Windows.
|
||||
|
||||
### Can a HIP binary run on both AMD and Nvidia platforms?
|
||||
HIP is a source-portable language that can be compiled to run on either AMD or NVIDIA platform. HIP tools don't create a "fat binary" that can run on either platform, however.
|
||||
|
||||
### On HIP-Clang, can I link HIP code with host code compiled with another compiler such as gcc, icc, or clang ?
|
||||
Yes. HIP generates the object code which conforms to the GCC ABI, and also links with libstdc++. This means you can compile host code with the compiler of your choice and link the generated object code
|
||||
with GPU code compiled with HIP. Larger projects often contain a mixture of accelerator code (initially written in CUDA with nvcc) and host code (compiled with gcc, icc, or clang). These projects
|
||||
can convert the accelerator code to HIP, compile that code with hipcc, and link with object code from their preferred compiler.
|
||||
|
||||
|
||||
### Can I install both CUDA SDK and HIP-Clang on the same machine?
|
||||
Yes. You can use HIP_PLATFORM to choose which path hipcc targets. This configuration can be useful when using HIP to develop an application which is portable to both AMD and NVIDIA.
|
||||
|
||||
|
||||
### HIP detected my platform (HIP-Clang vs nvcc) incorrectly - what should I do?
|
||||
HIP will set the platform to AMD and use HIP-Clang as compiler if it sees that the AMD graphics driver is installed and has detected an AMD GPU.
|
||||
Sometimes this isn't what you want - you can force HIP to recognize the platform by setting the following,
|
||||
```
|
||||
export HIP_PLATFORM=amd
|
||||
```
|
||||
HIP then sets and uses the correct AMD compiler and runtime:
|
||||
HIP_COMPILER=clang
|
||||
HIP_RUNTIME=rocclr
|
||||
|
||||
To choose NVIDIA platform, you can set,
|
||||
```
|
||||
export HIP_PLATFORM=nvidia
|
||||
```
|
||||
In this case, HIP will set and use the following,
|
||||
HIP_COMPILER=nvcc
|
||||
HIP_RUNTIME=cuda
|
||||
|
||||
One symptom of this problem is the message "error: 'unknown error'(11) at square.hipref.cpp:56". This can occur if you have a CUDA installation on an AMD platform, and HIP incorrectly detects the platform as nvcc. HIP may be able to compile the application using the nvcc tool-chain but will generate this error at runtime since the platform does not have a CUDA device.
|
||||
|
||||
### On CUDA, can I mix CUDA code with HIP code?
|
||||
Yes. Most HIP data structures (hipStream_t, hipEvent_t) are typedefs to CUDA equivalents and can be intermixed. Both CUDA and HIP use integer device ids.
|
||||
One notable exception is that hipError_t is a new type, and cannot be used where a cudaError_t is expected. In these cases, refactor the code to remove the expectation. Alternatively, hip_runtime_api.h defines functions which convert between the error code spaces:
|
||||
|
||||
hipErrorToCudaError
|
||||
hipCUDAErrorTohipError
|
||||
hipCUResultTohipError
|
||||
|
||||
If platform portability is important, use #ifdef __HIP_PLATFORM_NVIDIA__ to guard the CUDA-specific code.
|
||||
|
||||
### How do I trace HIP application flow?
|
||||
See the [HIP Logging](hip_logging.md) for more information.
|
||||
|
||||
### What is maximum limit of kernel launching parameter?
|
||||
Product of block.x, block.y, and block.z should be less than 1024.
|
||||
|
||||
### Are __shfl_*_sync functions supported on HIP platform?
|
||||
__shfl_*_sync is not supported on HIP, but on the nvcc path with CUDA 9.0 and above all shuffle calls get redirected to their sync versions.
|
||||
|
||||
### How to create a guard for code that is specific to the host or the GPU?
|
||||
The compiler defines the `__HIP_DEVICE_COMPILE__` macro only when compiling the code for the GPU. It could be used to guard code that is specific to the host or the GPU.
|
||||
|
||||
### Why is _OPENMP undefined when compiling with -fopenmp?
|
||||
When compiling an OpenMP source file with `hipcc -fopenmp`, the compiler may generate an error if there is a reference to the `_OPENMP` macro. This is due to a limitation in hipcc that treats any source file type (e.g., `.cpp`) as a HIP translation unit, leading to some conflicts with the OpenMP language switch. If the OpenMP source file doesn't contain any HIP language construct, you could work around this issue by adding the `-x c++` switch to force the compiler to treat the file as regular C++. Another approach would be to guard the OpenMP code with `#ifdef _OPENMP` so that the code block is disabled when compiling for the GPU. The `__HIP_DEVICE_COMPILE__` macro defined by the HIP compiler when compiling GPU code could also be used for guarding code paths specific to the host or the GPU.
|
||||
|
||||
### Does the HIP-Clang compiler support extern shared declarations?
|
||||
|
||||
Previously, it was essential to declare dynamic shared memory using the HIP_DYNAMIC_SHARED macro for accuracy, as using static shared memory in the same kernel could result in overlapping memory ranges and data-races.
|
||||
|
||||
Now, the HIP-Clang compiler provides support for extern shared declarations, and the HIP_DYNAMIC_SHARED option is no longer required. You may use the standard extern definition:
|
||||
extern __shared__ type var[];
|
||||
|
||||
### I have multiple HIP enabled devices and I am getting an error message hipErrorNoBinaryForGpu Unable to find code object for all current devices?
|
||||
|
||||
This error message is seen due to the fact that you do not have valid code object for all of your devices.
|
||||
|
||||
If you have compiled the application yourself, make sure you have given the correct device name(s) and features via `--offload-arch`. If you do not specify `--offload-arch`, make sure that `hipcc` is using the correct offload arch by checking the hipcc output generated when the environment variable `HIPCC_VERBOSE=1` is set.
|
||||
|
||||
If you have a precompiled application/library (like rocblas, tensorflow etc.) which gives you such an error, there are two possibilities.
|
||||
|
||||
- The application/library does not ship code object bundles for *all* of your device(s): in this case you need to recompile the application/library yourself with correct `--offload-arch`.
|
||||
- The application/library does not ship code object bundles for *some* of your device(s), for example you have a system with an APU + GPU and the library does not ship code objects for your APU. For this you can set the environment variable `HIP_VISIBLE_DEVICES` to only enable GPUs for which code object is available. This will limit the GPUs visible to your application and allow it to run.
|
||||
@@ -1,801 +0,0 @@
|
||||
## Table of Contents
|
||||
|
||||
<!-- toc -->
|
||||
|
||||
- [Introduction](#introduction)
|
||||
- [Function-Type Qualifiers](#function-type-qualifiers)
|
||||
* [`__device__`](#__device__)
|
||||
* [`__global__`](#__global__)
|
||||
* [`__host__`](#__host__)
|
||||
- [Calling `__global__` Functions](#calling-__global__-functions)
|
||||
- [Kernel-Launch Example](#kernel-launch-example)
|
||||
- [Variable-Type Qualifiers](#variable-type-qualifiers)
|
||||
* [`__constant__`](#__constant__)
|
||||
* [`__shared__`](#__shared__)
|
||||
* [`__managed__`](#__managed__)
|
||||
* [`__restrict__`](#__restrict__)
|
||||
- [Built-In Variables](#built-in-variables)
|
||||
* [Coordinate Built-Ins](#coordinate-built-ins)
|
||||
* [warpSize](#warpsize)
|
||||
- [Vector Types](#vector-types)
|
||||
* [Short Vector Types](#short-vector-types)
|
||||
* [dim3](#dim3)
|
||||
- [Memory-Fence Instructions](#memory-fence-instructions)
|
||||
- [Synchronization Functions](#synchronization-functions)
|
||||
- [Math Functions](#math-functions)
|
||||
* [Single Precision Mathematical Functions](#single-precision-mathematical-functions)
|
||||
* [Double Precision Mathematical Functions](#double-precision-mathematical-functions)
|
||||
* [Integer Intrinsics](#integer-intrinsics)
|
||||
* [Floating-point Intrinsics](#floating-point-intrinsics)
|
||||
- [Texture Functions](#texture-functions)
|
||||
- [Surface Functions](#surface-functions)
|
||||
- [Timer Functions](#timer-functions)
|
||||
- [Atomic Functions](#atomic-functions)
|
||||
* [Caveats and Features Under-Development:](#caveats-and-features-under-development)
|
||||
- [Warp Cross-Lane Functions](#warp-cross-lane-functions)
|
||||
* [Warp Vote and Ballot Functions](#warp-vote-and-ballot-functions)
|
||||
* [Warp Shuffle Functions](#warp-shuffle-functions)
|
||||
- [Cooperative Groups Functions](#cooperative-groups-functions)
|
||||
- [Warp Matrix Functions](#warp-matrix-functions)
|
||||
- [Independent Thread Scheduling](#independent-thread-scheduling)
|
||||
- [Profiler Counter Function](#profiler-counter-function)
|
||||
- [Assert](#assert)
|
||||
- [Printf](#printf)
|
||||
- [Device-Side Dynamic Global Memory Allocation](#device-side-dynamic-global-memory-allocation)
|
||||
- [`__launch_bounds__`](#__launch_bounds__)
|
||||
* [Compiler Impact](#compiler-impact)
|
||||
* [CU and EU Definitions](#cu-and-eu-definitions)
|
||||
* [Porting from CUDA __launch_bounds](#porting-from-cuda-__launch_bounds)
|
||||
* [maxregcount](#maxregcount)
|
||||
- [Register Keyword](#register-keyword)
|
||||
- [Pragma Unroll](#pragma-unroll)
|
||||
- [In-Line Assembly](#in-line-assembly)
|
||||
- [C++ Support](#c-support)
|
||||
- [Kernel Compilation](#kernel-compilation)
|
||||
- [GFX Arch specific kernel](#gfx-arch-specific-kernel)
|
||||
<!-- tocstop -->
|
||||
|
||||
## Introduction
|
||||
|
||||
HIP provides a C++ syntax that is suitable for compiling most code that commonly appears in compute kernels, including classes, namespaces, operator overloading, templates and more. Additionally, it defines other language features designed specifically to target accelerators, such as the following:
|
||||
- A kernel-launch syntax that uses standard C++, resembles a function call and is portable to all HIP targets
|
||||
- Short-vector headers that can serve on a host or a device
|
||||
- Math functions resembling those in the "math.h" header included with standard C++ compilers
|
||||
- Built-in functions for accessing specific GPU hardware capabilities
|
||||
|
||||
This section describes the built-in variables and functions accessible from the HIP kernel. It’s intended for readers who are familiar with Cuda kernel syntax and want to understand how HIP is different.
|
||||
|
||||
Features are marked with one of the following keywords:
|
||||
- **Supported**---HIP supports the feature with a Cuda-equivalent function
|
||||
- **Not supported**---HIP does not support the feature
|
||||
- **Under development**---the feature is under development but not yet available
|
||||
|
||||
## Function-Type Qualifiers
|
||||
### `__device__`
|
||||
Supported `__device__` functions are
|
||||
- Executed on the device
|
||||
- Called from the device only
|
||||
|
||||
The `__device__` keyword can combine with the host keyword (see [__host__](#__host__)).
|
||||
|
||||
### `__global__`
|
||||
Supported `__global__` functions are
|
||||
- Executed on the device
|
||||
- Called ("launched") from the host
|
||||
|
||||
HIP `__global__` functions must have a `void` return type, and the first parameter to a HIP `__global__` function must have the type `hipLaunchParm`. See [Kernel-Launch Example](#kernel-launch-example).
|
||||
|
||||
HIP lacks dynamic-parallelism support, so `__global__` functions cannot be called from the device.
|
||||
|
||||
### `__host__`
|
||||
Supported `__host__` functions are
|
||||
- Executed on the host
|
||||
- Called from the host
|
||||
|
||||
`__host__` can combine with `__device__`, in which case the function compiles for both the host and device. These functions cannot use the HIP grid coordinate functions (for example, "threadIdx.x"). A possible workaround is to pass the necessary coordinate info as an argument to the function.
|
||||
|
||||
`__host__` cannot combine with `__global__`.
|
||||
|
||||
HIP parses the `__noinline__` and `__forceinline__` keywords and converts them to the appropriate Clang attributes.
|
||||
|
||||
## Calling `__global__` Functions
|
||||
|
||||
`__global__` functions are often referred to as *kernels,* and calling one is termed *launching the kernel.* These functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute. HIP introduces a standard C++ calling convention to pass the execution configuration to the kernel in addition to the Cuda <<< >>> syntax. In HIP,
|
||||
- Kernels launch with either <<< >>> syntax or the "hipLaunchKernel" function
|
||||
- The first five parameters to hipLaunchKernel are the following:
|
||||
- **symbol kernelName**: the name of the kernel to launch. To support template kernels which contain ",", use the HIP_KERNEL_NAME macro. The hipify tools insert this automatically.
|
||||
- **dim3 gridDim**: 3D-grid dimensions specifying the number of blocks to launch.
|
||||
- **dim3 blockDim**: 3D-block dimensions specifying the number of threads in each block.
|
||||
- **size_t dynamicShared**: amount of additional shared memory to allocate when launching the kernel (see [__shared__](#__shared__))
|
||||
- **hipStream_t**: stream where the kernel should execute. A value of 0 corresponds to the NULL stream (see [Synchronization Functions](#synchronization-functions)).
|
||||
- Kernel arguments follow these first five parameters
|
||||
|
||||
|
||||
```
|
||||
// Example pseudo code introducing hipLaunchKernel:
|
||||
__global__ void MyKernel(hipLaunchParm lp, float *A, float *B, float *C, size_t N)
|
||||
{
|
||||
...
|
||||
}
|
||||
|
||||
MyKernel<<<dim3(gridDim), dim3(groupDim), 0, 0>>> (a,b,c,n);
|
||||
// Alternatively, kernel can be launched by
|
||||
// hipLaunchKernel(MyKernel, dim3(gridDim), dim3(groupDim), 0/*dynamicShared*/, 0/*stream*/, a, b, c, n);
|
||||
|
||||
```
|
||||
|
||||
The hipLaunchKernel macro always starts with the five parameters specified above, followed by the kernel arguments. HIPIFY tools optionally convert Cuda launch syntax to hipLaunchKernel, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernel parameters. The dim3 constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See [dim3](#dim3). The kernel uses the coordinate built-ins (thread*, block*, grid*) to determine coordinate index and coordinate bounds of the work item that’s currently executing. See [Coordinate Built-Ins](#coordinate-built-ins).
|
||||
|
||||
|
||||
## Kernel-Launch Example
|
||||
```
|
||||
// Example showing device function, __device__ __host__
|
||||
// <- compile for both device and host
|
||||
float PlusOne(float x)
|
||||
{
|
||||
return x + 1.0;
|
||||
}
|
||||
|
||||
__global__
|
||||
void
|
||||
MyKernel (hipLaunchParm lp, /*lp parm for execution configuration */
|
||||
const float *a, const float *b, float *c, unsigned N)
|
||||
{
|
||||
unsigned gid = threadIdx.x; // <- coordinate index function
|
||||
if (gid < N) {
|
||||
c[gid] = a[gid] + PlusOne(b[gid]);
|
||||
}
|
||||
}
|
||||
void callMyKernel()
|
||||
{
|
||||
float *a, *b, *c; // initialization not shown...
|
||||
unsigned N = 1000000;
|
||||
const unsigned blockSize = 256;
|
||||
|
||||
MyKernel<<<dim3(N/blockSize), dim3(blockSize), 0, 0>>> (a,b,c,N);
|
||||
// Alternatively, kernel can be launched by
|
||||
// hipLaunchKernel(MyKernel, dim3(N/blockSize), dim3(blockSize), 0, 0, a,b,c,N);
|
||||
}
|
||||
```
|
||||
|
||||
## Variable-Type Qualifiers
|
||||
|
||||
### `__constant__`
|
||||
The `__constant__` keyword is supported. The host writes constant memory before launching the kernel; from the GPU, this memory is read-only during kernel execution. The functions for accessing constant memory (hipGetSymbolAddress(), hipGetSymbolSize(), hipMemcpyToSymbol(), hipMemcpyToSymbolAsync(), hipMemcpyFromSymbol(), hipMemcpyFromSymbolAsync()) are available.
|
||||
|
||||
### `__shared__`
|
||||
The `__shared__` keyword is supported.
|
||||
|
||||
`extern __shared__` allows the host to dynamically allocate shared memory and is specified as a launch parameter.
|
||||
Previously, it was essential to declare dynamic shared memory using the HIP_DYNAMIC_SHARED macro for accuracy, as using static shared memory in the same kernel could result in overlapping memory ranges and data-races.
|
||||
|
||||
Now, the HIP-Clang compiler provides support for extern shared declarations, and the HIP_DYNAMIC_SHARED option is no longer required.
|
||||
|
||||
### `__managed__`
|
||||
Managed memory, except for the `__managed__` keyword, is supported in HIP combined host/device compilation.
|
||||
Support of `__managed__` keyword is under development.
|
||||
|
||||
### `__restrict__`
|
||||
The `__restrict__` keyword tells the compiler that the associated memory pointer will not alias with any other pointer in the kernel or function. This feature can help the compiler generate better code. In most cases, all pointer arguments must use this keyword to realize the benefit.
|
||||
|
||||
|
||||
## Built-In Variables
|
||||
|
||||
### Coordinate Built-Ins
|
||||
These built-ins determine the coordinate of the active work item in the execution grid. They are defined in hip_runtime.h (rather than being implicitly defined by the compiler).
|
||||
|
||||
| **HIP Syntax** | **Cuda Syntax** |
|
||||
| --- | --- |
|
||||
| threadIdx.x | threadIdx.x |
|
||||
| threadIdx.y | threadIdx.y |
|
||||
| threadIdx.z | threadIdx.z |
|
||||
| | |
|
||||
| blockIdx.x | blockIdx.x |
|
||||
| blockIdx.y | blockIdx.y |
|
||||
| blockIdx.z | blockIdx.z |
|
||||
| | |
|
||||
| blockDim.x | blockDim.x |
|
||||
| blockDim.y | blockDim.y |
|
||||
| blockDim.z | blockDim.z |
|
||||
| | |
|
||||
| gridDim.x | gridDim.x |
|
||||
| gridDim.y | gridDim.y |
|
||||
| gridDim.z | gridDim.z |
|
||||
|
||||
### warpSize
|
||||
The warpSize variable is of type int and contains the warp size (in threads) for the target device. Note that all current Nvidia devices return 32 for this variable, and all current AMD devices return 64. Device code should use the warpSize built-in to develop portable wave-aware code.
|
||||
|
||||
|
||||
## Vector Types
|
||||
|
||||
Note that these types are defined in hip_runtime.h and are not automatically provided by the compiler.
|
||||
|
||||
|
||||
### Short Vector Types
|
||||
Short vector types derive from the basic integer and floating-point types. They are structures defined in hip_vector_types.h. The first, second, third and fourth components of the vector are accessible through the ```x```, ```y```, ```z``` and ```w``` fields, respectively. All the short vector types support a constructor function of the form ```make_<type_name>()```. For example, ```float4 make_float4(float x, float y, float z, float w)``` creates a vector of type ```float4``` and value ```(x,y,z,w)```.
|
||||
|
||||
HIP supports the following short vector formats:
|
||||
- Signed Integers:
|
||||
- char1, char2, char3, char4
|
||||
- short1, short2, short3, short4
|
||||
- int1, int2, int3, int4
|
||||
- long1, long2, long3, long4
|
||||
- longlong1, longlong2, longlong3, longlong4
|
||||
- Unsigned Integers:
|
||||
- uchar1, uchar2, uchar3, uchar4
|
||||
- ushort1, ushort2, ushort3, ushort4
|
||||
- uint1, uint2, uint3, uint4
|
||||
- ulong1, ulong2, ulong3, ulong4
|
||||
- ulonglong1, ulonglong2, ulonglong3, ulonglong4
|
||||
- Floating Points
|
||||
- float1, float2, float3, float4
|
||||
- double1, double2, double3, double4
|
||||
|
||||
### dim3
|
||||
dim3 is a three-dimensional integer vector type commonly used to specify grid and group dimensions. Unspecified dimensions are initialized to 1.
|
||||
```
|
||||
typedef struct dim3 {
|
||||
uint32_t x;
|
||||
uint32_t y;
|
||||
uint32_t z;
|
||||
|
||||
dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {};
|
||||
};
|
||||
|
||||
```
|
||||
|
||||
## Memory-Fence Instructions
|
||||
HIP supports __threadfence() and __threadfence_block().
|
||||
|
||||
HIP provides a workaround for __threadfence_system() under the HIP-Clang path.
|
||||
To enable the workaround, HIP should be built with environment variable HIP_COHERENT_HOST_ALLOC enabled.
|
||||
In addition, the kernels that use __threadfence_system() should be modified as follows:
|
||||
- The kernel should only operate on fine-grained system memory, which should be allocated with hipHostMalloc().
|
||||
- Remove all memcpy calls for those allocated fine-grained system memory regions.
|
||||
|
||||
## Synchronization Functions
|
||||
The __syncthreads() built-in function is supported in HIP. The __syncthreads_count(int), __syncthreads_and(int) and __syncthreads_or(int) functions are under development.
|
||||
|
||||
## Math Functions
|
||||
HIP-Clang supports a set of math operations callable from the device.
|
||||
|
||||
### Single Precision Mathematical Functions
|
||||
Following is the list of supported single precision mathematical functions.
|
||||
|
||||
| **Function** | **Supported on Host** | **Supported on Device** |
|
||||
| --- | --- | --- |
|
||||
| float acosf ( float x ) <br><sub>Calculate the arc cosine of the input argument.</sub> | ✓ | ✓ |
|
||||
| float acoshf ( float x ) <br><sub>Calculate the nonnegative arc hyperbolic cosine of the input argument.</sub> | ✓ | ✓ |
|
||||
| float asinf ( float x ) <br><sub>Calculate the arc sine of the input argument.</sub> | ✓ | ✓ |
|
||||
| float asinhf ( float x ) <br><sub>Calculate the arc hyperbolic sine of the input argument.</sub> | ✓ | ✓ |
|
||||
| float atan2f ( float y, float x ) <br><sub>Calculate the arc tangent of the ratio of first and second input arguments.</sub> | ✓ | ✓ |
|
||||
| float atanf ( float x ) <br><sub>Calculate the arc tangent of the input argument.</sub> | ✓ | ✓ |
|
||||
| float atanhf ( float x ) <br><sub>Calculate the arc hyperbolic tangent of the input argument.</sub> | ✓ | ✓ |
|
||||
| float cbrtf ( float x ) <br><sub>Calculate the cube root of the input argument.</sub> | ✓ | ✓ |
|
||||
| float ceilf ( float x ) <br><sub>Calculate ceiling of the input argument.</sub> | ✓ | ✓ |
|
||||
| float copysignf ( float x, float y ) <br><sub>Create value with given magnitude, copying sign of second value.</sub> | ✓ | ✓ |
|
||||
| float cosf ( float x ) <br><sub>Calculate the cosine of the input argument.</sub> | ✓ | ✓ |
|
||||
| float coshf ( float x ) <br><sub>Calculate the hyperbolic cosine of the input argument.</sub> | ✓ | ✓ |
|
||||
| float erfcf ( float x ) <br><sub>Calculate the complementary error function of the input argument.</sub> | ✓ | ✓ |
|
||||
| float erff ( float x ) <br><sub>Calculate the error function of the input argument.</sub> | ✓ | ✓ |
|
||||
| float exp10f ( float x ) <br><sub>Calculate the base 10 exponential of the input argument.</sub> | ✓ | ✓ |
|
||||
| float exp2f ( float x ) <br><sub>Calculate the base 2 exponential of the input argument.</sub> | ✓ | ✓ |
|
||||
| float expf ( float x ) <br><sub>Calculate the base e exponential of the input argument.</sub> | ✓ | ✓ |
|
||||
| float expm1f ( float x ) <br><sub>Calculate the base e exponential of the input argument, minus 1.</sub> | ✓ | ✓ |
|
||||
| float fabsf ( float x ) <br><sub>Calculate the absolute value of its argument.</sub> | ✓ | ✓ |
|
||||
| float fdimf ( float x, float y ) <br><sub>Compute the positive difference between `x` and `y`.</sub> | ✓ | ✓ |
|
||||
| float floorf ( float x ) <br><sub>Calculate the largest integer less than or equal to `x`.</sub> | ✓ | ✓ |
|
||||
| float fmaf ( float x, float y, float z ) <br><sub>Compute `x × y + z` as a single operation.</sub> | ✓ | ✓ |
|
||||
| float fmaxf ( float x, float y ) <br><sub>Determine the maximum numeric value of the arguments.</sub> | ✓ | ✓ |
|
||||
| float fminf ( float x, float y ) <br><sub>Determine the minimum numeric value of the arguments.</sub> | ✓ | ✓ |
|
||||
| float fmodf ( float x, float y ) <br><sub>Calculate the floating-point remainder of `x / y`.</sub> | ✓ | ✓ |
|
||||
| float frexpf ( float x, int* nptr ) <br><sub>Extract mantissa and exponent of a floating-point value.</sub> | ✓ | ✗ |
|
||||
| float hypotf ( float x, float y ) <br><sub>Calculate the square root of the sum of squares of two arguments.</sub> | ✓ | ✓ |
|
||||
| int ilogbf ( float x ) <br><sub>Compute the unbiased integer exponent of the argument.</sub> | ✓ | ✓ |
|
||||
| __RETURN_TYPE<sup id="a1">[1](#f1)</sup> isfinite ( float a ) <br><sub>Determine whether argument is finite.</sub> | ✓ | ✓ |
|
||||
| __RETURN_TYPE<sup>[1](#f1)</sup> isinf ( float a ) <br><sub>Determine whether argument is infinite.</sub> | ✓ | ✓ |
|
||||
| __RETURN_TYPE<sup>[1](#f1)</sup> isnan ( float a ) <br><sub>Determine whether argument is a NaN.</sub> | ✓ | ✓ |
|
||||
| float ldexpf ( float x, int exp ) <br><sub>Calculate the value of x ⋅ 2<sup>exp</sup>.</sub> | ✓ | ✓ |
|
||||
| float log10f ( float x ) <br><sub>Calculate the base 10 logarithm of the input argument.</sub> | ✓ | ✓ |
|
||||
| float log1pf ( float x ) <br><sub>Calculate the value of log<sub>e</sub>( 1 + x ).</sub> | ✓ | ✓ |
|
||||
| float logbf ( float x ) <br><sub>Calculate the floating point representation of the exponent of the input argument.</sub> | ✓ | ✓ |
|
||||
| float log2f ( float x ) <br><sub>Calculate the base 2 logarithm of the input argument.</sub> | ✓ | ✓ |
|
||||
| float logf ( float x ) <br><sub>Calculate the natural logarithm of the input argument.</sub> | ✓ | ✓ |
|
||||
| float modff ( float x, float* iptr ) <br><sub>Break down the input argument into fractional and integral parts.</sub> | ✓ | ✗ |
|
||||
| float nanf ( const char* tagp ) <br><sub>Returns "Not a Number" value.</sub> | ✗ | ✓ |
|
||||
| float nearbyintf ( float x ) <br><sub>Round the input argument to the nearest integer.</sub> | ✓ | ✓ |
|
||||
| float powf ( float x, float y ) <br><sub>Calculate the value of first argument to the power of second argument.</sub> | ✓ | ✓ |
|
||||
| float remainderf ( float x, float y ) <br><sub>Compute single-precision floating-point remainder.</sub> | ✓ | ✓ |
|
||||
| float remquof ( float x, float y, int* quo ) <br><sub>Compute single-precision floating-point remainder and part of quotient.</sub> | ✓ | ✗ |
|
||||
| float roundf ( float x ) <br><sub>Round to nearest integer value in floating-point.</sub> | ✓ | ✓ |
|
||||
| float scalbnf ( float x, int n ) <br><sub>Scale floating-point input by integer power of two.</sub> | ✓ | ✓ |
|
||||
| __RETURN_TYPE<sup>[1](#f1)</sup> signbit ( float a ) <br><sub>Return the sign bit of the input.</sub> | ✓ | ✓ |
|
||||
| void sincosf ( float x, float* sptr, float* cptr ) <br><sub>Calculate the sine and cosine of the first input argument.</sub> | ✓ | ✗ |
|
||||
| float sinf ( float x ) <br><sub>Calculate the sine of the input argument.</sub> | ✓ | ✓ |
|
||||
| float sinhf ( float x ) <br><sub>Calculate the hyperbolic sine of the input argument.</sub> | ✓ | ✓ |
|
||||
| float sqrtf ( float x ) <br><sub>Calculate the square root of the input argument.</sub> | ✓ | ✓ |
|
||||
| float tanf ( float x ) <br><sub>Calculate the tangent of the input argument.</sub> | ✓ | ✓ |
|
||||
| float tanhf ( float x ) <br><sub>Calculate the hyperbolic tangent of the input argument.</sub> | ✓ | ✓ |
|
||||
| float truncf ( float x ) <br><sub>Truncate input argument to the integral part.</sub> | ✓ | ✓ |
|
||||
| float tgammaf ( float x ) <br><sub>Calculate the gamma function of the input argument.</sub> | ✓ | ✓ |
|
||||
| float erfcinvf ( float y ) <br><sub>Calculate the inverse complementary error function of the input argument.</sub> | ✓ | ✓ |
|
||||
| float erfcxf ( float x ) <br><sub>Calculate the scaled complementary error function of the input argument.</sub> | ✓ | ✓ |
|
||||
| float erfinvf ( float y ) <br><sub>Calculate the inverse error function of the input argument.</sub> | ✓ | ✓ |
|
||||
| float fdividef ( float x, float y ) <br><sub>Divide two floating point values.</sub> | ✓ | ✓ |
|
||||
| float frexpf ( float x, int *nptr ) <br><sub>Extract mantissa and exponent of a floating-point value.</sub> | ✓ | ✓ |
|
||||
| float j0f ( float x ) <br><sub>Calculate the value of the Bessel function of the first kind of order 0 for the input argument.</sub> | ✓ | ✓ |
|
||||
| float j1f ( float x ) <br><sub>Calculate the value of the Bessel function of the first kind of order 1 for the input argument.</sub> | ✓ | ✓ |
|
||||
| float jnf ( int n, float x ) <br><sub>Calculate the value of the Bessel function of the first kind of order n for the input argument.</sub> | ✓ | ✓ |
|
||||
| float lgammaf ( float x ) <br><sub>Calculate the natural logarithm of the absolute value of the gamma function of the input argument.</sub> | ✓ | ✓ |
|
||||
| long long int llrintf ( float x ) <br><sub>Round input to nearest integer value.</sub> | ✓ | ✓ |
|
||||
| long long int llroundf ( float x ) <br><sub>Round to nearest integer value.</sub> | ✓ | ✓ |
|
||||
| long int lrintf ( float x ) <br><sub>Round input to nearest integer value.</sub> | ✓ | ✓ |
|
||||
| long int lroundf ( float x ) <br><sub>Round to nearest integer value.</sub> | ✓ | ✓ |
|
||||
| float modff ( float x, float *iptr ) <br><sub>Break down the input argument into fractional and integral parts.</sub> | ✓ | ✓ |
|
||||
| float nextafterf ( float x, float y ) <br><sub>Returns next representable single-precision floating-point value after argument.</sub> | ✓ | ✓ |
|
||||
| float norm3df ( float a, float b, float c ) <br><sub>Calculate the square root of the sum of squares of three coordinates of the argument.</sub> | ✓ | ✓ |
|
||||
| float norm4df ( float a, float b, float c, float d ) <br><sub>Calculate the square root of the sum of squares of four coordinates of the argument.</sub> | ✓ | ✓ |
|
||||
| float normcdff ( float y ) <br><sub>Calculate the standard normal cumulative distribution function.</sub> | ✓ | ✓ |
|
||||
| float normcdfinvf ( float y ) <br><sub>Calculate the inverse of the standard normal cumulative distribution function.</sub> | ✓ | ✓ |
|
||||
| float normf ( int dim, const float *a ) <br><sub>Calculate the square root of the sum of squares of any number of coordinates.</sub> | ✓ | ✓ |
|
||||
| float rcbrtf ( float x ) <br><sub>Calculate the reciprocal cube root function.</sub> | ✓ | ✓ |
|
||||
| float remquof ( float x, float y, int *quo ) <br><sub>Compute single-precision floating-point remainder and part of quotient.</sub> | ✓ | ✓ |
|
||||
| float rhypotf ( float x, float y ) <br><sub>Calculate one over the square root of the sum of squares of two arguments.</sub> | ✓ | ✓ |
|
||||
| float rintf ( float x ) <br><sub>Round input to nearest integer value in floating-point.</sub> | ✓ | ✓ |
|
||||
| float rnorm3df ( float a, float b, float c ) <br><sub>Calculate one over the square root of the sum of squares of three coordinates of the argument.</sub> | ✓ | ✓ |
|
||||
| float rnorm4df ( float a, float b, float c, float d ) <br><sub>Calculate one over the square root of the sum of squares of four coordinates of the argument.</sub> | ✓ | ✓ |
|
||||
| float rnormf ( int dim, const float *a ) <br><sub>Calculate the reciprocal of square root of the sum of squares of any number of coordinates.</sub> | ✓ | ✓ |
|
||||
| float scalblnf ( float x, long int n ) <br><sub>Scale floating-point input by integer power of two.</sub> | ✓ | ✓ |
|
||||
| void sincosf ( float x, float *sptr, float *cptr ) <br><sub>Calculate the sine and cosine of the first input argument.</sub> | ✓ | ✓ |
|
||||
| void sincospif ( float x, float *sptr, float *cptr ) <br><sub>Calculate the sine and cosine of the first input argument multiplied by PI.</sub> | ✓ | ✓ |
|
||||
| float y0f ( float x ) <br><sub>Calculate the value of the Bessel function of the second kind of order 0 for the input argument.</sub> | ✓ | ✓ |
|
||||
| float y1f ( float x ) <br><sub>Calculate the value of the Bessel function of the second kind of order 1 for the input argument.</sub> | ✓ | ✓ |
|
||||
| float ynf ( int n, float x ) <br><sub>Calculate the value of the Bessel function of the second kind of order n for the input argument.</sub> | ✓ | ✓ |
|
||||
|
||||
|
||||
|
||||
<sub><b id="f1"><sup>[1]</sup></b> __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers.</sub> [↩](#a1)
|
||||
|
||||
### Double Precision Mathematical Functions
|
||||
Following is the list of supported double precision mathematical functions.
|
||||
|
||||
| **Function** | **Supported on Host** | **Supported on Device** |
|
||||
| --- | --- | --- |
|
||||
| double acos ( double x ) <br><sub>Calculate the arc cosine of the input argument.</sub> | ✓ | ✓ |
|
||||
| double acosh ( double x ) <br><sub>Calculate the nonnegative arc hyperbolic cosine of the input argument.</sub> | ✓ | ✓ |
|
||||
| double asin ( double x ) <br><sub>Calculate the arc sine of the input argument.</sub> | ✓ | ✓ |
|
||||
| double asinh ( double x ) <br><sub> Calculate the arc hyperbolic sine of the input argument.</sub> | ✓ | ✓ |
|
||||
| double atan ( double x ) <br><sub>Calculate the arc tangent of the input argument.</sub> | ✓ | ✓ |
|
||||
| double atan2 ( double y, double x ) <br><sub>Calculate the arc tangent of the ratio of first and second input arguments.</sub> | ✓ | ✓ |
|
||||
| double atanh ( double x ) <br><sub>Calculate the arc hyperbolic tangent of the input argument.</sub> | ✓ | ✓ |
|
||||
| double cbrt ( double x ) <br><sub>Calculate the cube root of the input argument.</sub> | ✓ | ✓ |
|
||||
| double ceil ( double x ) <br><sub>Calculate ceiling of the input argument.</sub> | ✓ | ✓ |
|
||||
| double copysign ( double x, double y ) <br><sub>Create value with given magnitude, copying sign of second value.</sub> | ✓ | ✓ |
|
||||
| double cos ( double x ) <br><sub>Calculate the cosine of the input argument.</sub> | ✓ | ✓ |
|
||||
| double cosh ( double x ) <br><sub>Calculate the hyperbolic cosine of the input argument.</sub> | ✓ | ✓ |
|
||||
| double erf ( double x ) <br><sub>Calculate the error function of the input argument.</sub> | ✓ | ✓ |
|
||||
| double erfc ( double x ) <br><sub>Calculate the complementary error function of the input argument.</sub> | ✓ | ✓ |
|
||||
| double exp ( double x ) <br><sub>Calculate the base e exponential of the input argument.</sub> | ✓ | ✓ |
|
||||
| double exp10 ( double x ) <br><sub>Calculate the base 10 exponential of the input argument.</sub> | ✓ | ✓ |
|
||||
| double exp2 ( double x ) <br><sub>Calculate the base 2 exponential of the input argument.</sub> | ✓ | ✓ |
|
||||
| double expm1 ( double x ) <br><sub>Calculate the base e exponential of the input argument, minus 1.</sub> | ✓ | ✓ |
|
||||
| double fabs ( double x ) <br><sub>Calculate the absolute value of the input argument.</sub> | ✓ | ✓ |
|
||||
| double fdim ( double x, double y ) <br><sub>Compute the positive difference between `x` and `y`.</sub> | ✓ | ✓ |
|
||||
| double floor ( double x ) <br><sub>Calculate the largest integer less than or equal to `x`.</sub> | ✓ | ✓ |
|
||||
| double fma ( double x, double y, double z ) <br><sub>Compute `x × y + z` as a single operation.</sub> | ✓ | ✓ |
|
||||
| double fmax ( double x, double y ) <br><sub>Determine the maximum numeric value of the arguments.</sub> | ✓ | ✓ |
|
||||
| double fmin ( double x, double y ) <br><sub>Determine the minimum numeric value of the arguments.</sub> | ✓ | ✓ |
|
||||
| double fmod ( double x, double y ) <br><sub>Calculate the floating-point remainder of `x / y`.</sub> | ✓ | ✓ |
|
||||
| double frexp ( double x, int* nptr ) <br><sub>Extract mantissa and exponent of a floating-point value.</sub> | ✓ | ✗ |
|
||||
| double hypot ( double x, double y ) <br><sub>Calculate the square root of the sum of squares of two arguments.</sub> | ✓ | ✓ |
|
||||
| int ilogb ( double x ) <br><sub>Compute the unbiased integer exponent of the argument.</sub> | ✓ | ✓ |
|
||||
| __RETURN_TYPE<sup id="a2">[1](#f2)</sup> isfinite ( double a ) <br><sub>Determine whether argument is finite.</sub> | ✓ | ✓ |
|
||||
| __RETURN_TYPE<sup>[1](#f2)</sup> isinf ( double a ) <br><sub>Determine whether argument is infinite.</sub> | ✓ | ✓ |
|
||||
| __RETURN_TYPE<sup>[1](#f2)</sup> isnan ( double a ) <br><sub>Determine whether argument is a NaN.</sub> | ✓ | ✓ |
|
||||
| double ldexp ( double x, int exp ) <br><sub>Calculate the value of x ⋅ 2<sup>exp</sup>.</sub> | ✓ | ✓ |
|
||||
| double log ( double x ) <br><sub>Calculate the base e logarithm of the input argument.</sub> | ✓ | ✓ |
|
||||
| double log10 ( double x ) <br><sub>Calculate the base 10 logarithm of the input argument.</sub> | ✓ | ✓ |
|
||||
| double log1p ( double x ) <br><sub>Calculate the value of log<sub>e</sub>( 1 + x ).</sub> | ✓ | ✓ |
|
||||
| double log2 ( double x ) <br><sub>Calculate the base 2 logarithm of the input argument.</sub> | ✓ | ✓ |
|
||||
| double logb ( double x ) <br><sub>Calculate the floating point representation of the exponent of the input argument.</sub> | ✓ | ✓ |
|
||||
| double modf ( double x, double* iptr ) <br><sub>Break down the input argument into fractional and integral parts.</sub> | ✓ | ✗ |
|
||||
| double nan ( const char* tagp ) <br><sub>Returns "Not a Number" value.</sub> | ✗ | ✓ |
|
||||
| double nearbyint ( double x ) <br><sub>Round the input argument to the nearest integer.</sub> | ✓ | ✓ |
|
||||
| double pow ( double x, double y ) <br><sub>Calculate the value of first argument to the power of second argument.</sub> | ✓ | ✓ |
|
||||
| double remainder ( double x, double y ) <br><sub>Compute double-precision floating-point remainder.</sub> | ✓ | ✓ |
|
||||
| double remquo ( double x, double y, int* quo ) <br><sub>Compute double-precision floating-point remainder and part of quotient.</sub> | ✓ | ✗ |
|
||||
| double round ( double x ) <br><sub>Round to nearest integer value in floating-point.</sub> | ✓ | ✓ |
|
||||
| double scalbn ( double x, int n ) <br><sub>Scale floating-point input by integer power of two.</sub> | ✓ | ✓ |
|
||||
| __RETURN_TYPE<sup>[1](#f2)</sup> signbit ( double a ) <br><sub>Return the sign bit of the input.</sub> | ✓ | ✓ |
|
||||
| double sin ( double x ) <br><sub>Calculate the sine of the input argument.</sub> | ✓ | ✓ |
|
||||
| void sincos ( double x, double* sptr, double* cptr ) <br><sub>Calculate the sine and cosine of the first input argument.</sub> | ✓ | ✗ |
|
||||
| double sinh ( double x ) <br><sub>Calculate the hyperbolic sine of the input argument.</sub> | ✓ | ✓ |
|
||||
| double sqrt ( double x ) <br><sub>Calculate the square root of the input argument.</sub> | ✓ | ✓ |
|
||||
| double tan ( double x ) <br><sub>Calculate the tangent of the input argument.</sub> | ✓ | ✓ |
|
||||
| double tanh ( double x ) <br><sub>Calculate the hyperbolic tangent of the input argument.</sub> | ✓ | ✓ |
|
||||
| double tgamma ( double x ) <br><sub>Calculate the gamma function of the input argument.</sub> | ✓ | ✓ |
|
||||
| double trunc ( double x ) <br><sub>Truncate input argument to the integral part.</sub> | ✓ | ✓ |
|
||||
| double erfcinv ( double y ) <br><sub>Calculate the inverse complementary error function of the input argument.</sub> | ✓ | ✓ |
|
||||
| double erfcx ( double x ) <br><sub>Calculate the scaled complementary error function of the input argument.</sub> | ✓ | ✓ |
|
||||
| double erfinv ( double y ) <br><sub>Calculate the inverse error function of the input argument.</sub> | ✓ | ✓ |
|
||||
| double frexp ( double x, int *nptr ) <br><sub>Extract mantissa and exponent of a floating-point value.</sub> | ✓ | ✓ |
|
||||
| double j0 ( double x ) <br><sub>Calculate the value of the Bessel function of the first kind of order 0 for the input argument.</sub> | ✓ | ✓ |
|
||||
| double j1 ( double x ) <br><sub>Calculate the value of the Bessel function of the first kind of order 1 for the input argument.</sub> | ✓ | ✓ |
|
||||
| double jn ( int n, double x ) <br><sub>Calculate the value of the Bessel function of the first kind of order n for the input argument.</sub> | ✓ | ✓ |
|
||||
| double lgamma ( double x ) <br><sub>Calculate the natural logarithm of the absolute value of the gamma function of the input argument.</sub> | ✓ | ✓ |
|
||||
| long long int llrint ( double x ) <br><sub>Round input to nearest integer value.</sub> | ✓ | ✓ |
|
||||
| long long int llround ( double x ) <br><sub>Round to nearest integer value.</sub> | ✓ | ✓ |
|
||||
| long int lrint ( double x ) <br><sub>Round input to nearest integer value.</sub> | ✓ | ✓ |
|
||||
| long int lround ( double x ) <br><sub>Round to nearest integer value.</sub> | ✓ | ✓ |
|
||||
| double modf ( double x, double *iptr ) <br><sub>Break down the input argument into fractional and integral parts.</sub> | ✓ | ✓ |
|
||||
| double nextafter ( double x, double y ) <br><sub>Returns next representable double-precision floating-point value after argument.</sub> | ✓ | ✓ |
|
||||
| double norm3d ( double a, double b, double c ) <br><sub>Calculate the square root of the sum of squares of three coordinates of the argument.</sub> | ✓ | ✓ |
|
||||
| double norm4d ( double a, double b, double c, double d ) <br><sub>Calculate the square root of the sum of squares of four coordinates of the argument.</sub> | ✓ | ✓ |
|
||||
| double normcdf ( double y ) <br><sub>Calculate the standard normal cumulative distribution function.</sub> | ✓ | ✓ |
|
||||
| double normcdfinv ( double y ) <br><sub>Calculate the inverse of the standard normal cumulative distribution function.</sub> | ✓ | ✓ |
|
||||
| double rcbrt ( double x ) <br><sub>Calculate the reciprocal cube root function.</sub> | ✓ | ✓ |
|
||||
| double remquo ( double x, double y, int *quo ) <br><sub>Compute double-precision floating-point remainder and part of quotient.</sub> | ✓ | ✓ |
|
||||
| double rhypot ( double x, double y ) <br><sub>Calculate one over the square root of the sum of squares of two arguments.</sub> | ✓ | ✓ |
|
||||
| double rint ( double x ) <br><sub>Round input to nearest integer value in floating-point.</sub> | ✓ | ✓ |
|
||||
| double rnorm3d ( double a, double b, double c ) <br><sub>Calculate one over the square root of the sum of squares of three coordinates of the argument.</sub> | ✓ | ✓ |
|
||||
| double rnorm4d ( double a, double b, double c, double d ) <br><sub>Calculate one over the square root of the sum of squares of four coordinates of the argument.</sub> | ✓ | ✓ |
|
||||
| double rnorm ( int dim, const double *a ) <br><sub>Calculate the reciprocal of square root of the sum of squares of any number of coordinates.</sub> | ✓ | ✓ |
|
||||
| double scalbln ( double x, long int n ) <br><sub>Scale floating-point input by integer power of two.</sub> | ✓ | ✓ |
|
||||
| void sincos ( double x, double *sptr, double *cptr ) <br><sub>Calculate the sine and cosine of the first input argument.</sub> | ✓ | ✓ |
|
||||
| void sincospi ( double x, double *sptr, double *cptr ) <br><sub>Calculate the sine and cosine of the first input argument multiplied by PI.</sub> | ✓ | ✓ |
|
||||
| double y0 ( double x ) <br><sub>Calculate the value of the Bessel function of the second kind of order 0 for the input argument.</sub> | ✓ | ✓ |
|
||||
| double y1 ( double x ) <br><sub>Calculate the value of the Bessel function of the second kind of order 1 for the input argument.</sub> | ✓ | ✓ |
|
||||
| double yn ( int n, double x ) <br><sub>Calculate the value of the Bessel function of the second kind of order n for the input argument.</sub> | ✓ | ✓ |
|
||||
|
||||
|
||||
|
||||
<sub><b id="f2"><sup>[1]</sup></b> __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers.</sub> [↩](#a2)
|
||||
|
||||
### Integer Intrinsics
|
||||
Following is the list of supported integer intrinsics. Note that intrinsics are supported on device only.
|
||||
|
||||
| **Function** |
|
||||
| --- |
|
||||
| unsigned int __brev ( unsigned int x ) <br><sub>Reverse the bit order of a 32 bit unsigned integer.</sub> |
|
||||
| unsigned long long int __brevll ( unsigned long long int x ) <br><sub>Reverse the bit order of a 64 bit unsigned integer. </sub> |
|
||||
| int __clz ( int x ) <br><sub>Return the number of consecutive high-order zero bits in a 32 bit integer.</sub> |
|
||||
| unsigned int __clz(unsigned int x) <br><sub>Return the number of consecutive high-order zero bits in 32 bit unsigned integer.</sub> |
|
||||
| int __clzll ( long long int x ) <br><sub>Count the number of consecutive high-order zero bits in a 64 bit integer.</sub> |
|
||||
| unsigned int __clzll(long long int x) <br><sub>Return the number of consecutive high-order zero bits in 64 bit signed integer.</sub> |
|
||||
| unsigned int __ffs(unsigned int x) <br><sub>Find the position of least significant bit set to 1 in a 32 bit unsigned integer.<sup id="a3">[1](#f3)</sup></sub> |
|
||||
| unsigned int __ffs(int x) <br><sub>Find the position of least significant bit set to 1 in a 32 bit signed integer.</sub> |
|
||||
| unsigned int __ffsll(unsigned long long int x) <br><sub>Find the position of least significant bit set to 1 in a 64 bit unsigned integer.<sup>[1](#f3)</sup></sub> |
|
||||
| unsigned int __ffsll(long long int x) <br><sub>Find the position of least significant bit set to 1 in a 64 bit signed integer.</sub> |
|
||||
| unsigned int __popc ( unsigned int x ) <br><sub>Count the number of bits that are set to 1 in a 32 bit integer.</sub> |
|
||||
| int __popcll ( unsigned long long int x )<br><sub>Count the number of bits that are set to 1 in a 64 bit integer.</sub> |
|
||||
| int __mul24 ( int x, int y )<br><sub>Multiply two 24bit integers.</sub> |
|
||||
| unsigned int __umul24 ( unsigned int x, unsigned int y )<br><sub>Multiply two 24bit unsigned integers.</sub> |
|
||||
<sub><b id="f3"><sup>[1]</sup></b>
|
||||
The HIP-Clang implementation of __ffs() and __ffsll() contains code to add a constant +1 to produce the ffs result format.
|
||||
For the cases where this overhead is not acceptable and programmer is willing to specialize for the platform,
|
||||
HIP-Clang provides __lastbit_u32_u32(unsigned int input) and __lastbit_u32_u64(unsigned long long int input).
|
||||
The index returned by __lastbit_ instructions starts at -1, while for ffs the index starts at 0.
|
||||
|
||||
### Floating-point Intrinsics
|
||||
Following is the list of supported floating-point intrinsics. Note that intrinsics are supported on device only.
|
||||
|
||||
| **Function** |
|
||||
| --- |
|
||||
| float __cosf ( float x ) <br><sub>Calculate the fast approximate cosine of the input argument.</sub> |
|
||||
| float __expf ( float x ) <br><sub>Calculate the fast approximate base e exponential of the input argument.</sub> |
|
||||
| float __frsqrt_rn ( float x ) <br><sub>Compute `1 / √x` in round-to-nearest-even mode.</sub> |
|
||||
| float __fsqrt_rn ( float x ) <br><sub>Compute `√x` in round-to-nearest-even mode.</sub> |
|
||||
| float __log10f ( float x ) <br><sub>Calculate the fast approximate base 10 logarithm of the input argument.</sub> |
|
||||
| float __log2f ( float x ) <br><sub>Calculate the fast approximate base 2 logarithm of the input argument.</sub> |
|
||||
| float __logf ( float x ) <br><sub>Calculate the fast approximate base e logarithm of the input argument.</sub> |
|
||||
| float __powf ( float x, float y ) <br><sub>Calculate the fast approximate of x<sup>y</sup>.</sub> |
|
||||
| float __sinf ( float x ) <br><sub>Calculate the fast approximate sine of the input argument.</sub> |
|
||||
| float __tanf ( float x ) <br><sub>Calculate the fast approximate tangent of the input argument.</sub> |
|
||||
| double __dsqrt_rn ( double x ) <br><sub>Compute `√x` in round-to-nearest-even mode.</sub> |
|
||||
|
||||
## Texture Functions
|
||||
The supported Texture functions are listed in header files [texture_functions.h](https://github.com/ROCm-Developer-Tools/HIP/blob/main/include/hip/hcc_detail/texture_functions.h) and [texture_indirect_functions.h](https://github.com/ROCm-Developer-Tools/HIP/blob/main/include/hip/hcc_detail/texture_indirect_functions.h).
|
||||
|
||||
## Surface Functions
|
||||
Surface functions are not supported.
|
||||
|
||||
## Timer Functions
|
||||
HIP provides the following built-in functions for reading a high-resolution timer from the device.
|
||||
```
|
||||
clock_t clock()
|
||||
long long int clock64()
|
||||
```
|
||||
Returns the value of counter that is incremented every clock cycle on device. Difference in values returned provides the cycles used.
|
||||
|
||||
## Atomic Functions
|
||||
|
||||
Atomic functions execute as read-modify-write operations residing in global or shared memory. No other device or thread can observe or modify the memory location during an atomic operation. If multiple instructions from different devices or threads target the same memory location, the instructions are serialized in an undefined order.
|
||||
|
||||
HIP supports the following atomic operations.
|
||||
|
||||
| **Function** | **Supported in HIP** | **Supported in CUDA** |
|
||||
| --- | --- | --- |
|
||||
| int atomicAdd(int* address, int val) | ✓ | ✓ |
|
||||
| unsigned int atomicAdd(unsigned int* address,unsigned int val) | ✓ | ✓ |
|
||||
| unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ |
|
||||
| float atomicAdd(float* address, float val) | ✓ | ✓ |
|
||||
| int atomicSub(int* address, int val) | ✓ | ✓ |
|
||||
| unsigned int atomicSub(unsigned int* address,unsigned int val) | ✓ | ✓ |
|
||||
| int atomicExch(int* address, int val) | ✓ | ✓ |
|
||||
| unsigned int atomicExch(unsigned int* address,unsigned int val) | ✓ | ✓ |
|
||||
| unsigned long long int atomicExch(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ |
|
||||
| float atomicExch(float* address, float val) | ✓ | ✓ |
|
||||
| int atomicMin(int* address, int val) | ✓ | ✓ |
|
||||
| unsigned int atomicMin(unsigned int* address,unsigned int val) | ✓ | ✓ |
|
||||
| unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ |
|
||||
| int atomicMax(int* address, int val) | ✓ | ✓ |
|
||||
| unsigned int atomicMax(unsigned int* address,unsigned int val) | ✓ | ✓ |
|
||||
| unsigned long long int atomicMax(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ |
|
||||
| unsigned int atomicInc(unsigned int* address)| ✗ | ✓ |
|
||||
| unsigned int atomicDec(unsigned int* address)| ✗ | ✓ |
|
||||
| int atomicCAS(int* address, int compare, int val) | ✓ | ✓ |
|
||||
| unsigned int atomicCAS(unsigned int* address,unsigned int compare,unsigned int val) | ✓ | ✓ |
|
||||
| unsigned long long int atomicCAS(unsigned long long int* address,unsigned long long int compare,unsigned long long int val) | ✓ | ✓ |
|
||||
| int atomicAnd(int* address, int val) | ✓ | ✓ |
|
||||
| unsigned int atomicAnd(unsigned int* address,unsigned int val) | ✓ | ✓ |
|
||||
| unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ |
|
||||
| int atomicOr(int* address, int val) | ✓ | ✓ |
|
||||
| unsigned int atomicOr(unsigned int* address,unsigned int val) | ✓ | ✓ |
|
||||
| unsigned long long int atomicOr(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ |
|
||||
| int atomicXor(int* address, int val) | ✓ | ✓ |
|
||||
| unsigned int atomicXor(unsigned int* address,unsigned int val) | ✓ | ✓ |
|
||||
| unsigned long long int atomicXor(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ |
|
||||
|
||||
### Caveats and Features Under-Development:
|
||||
|
||||
- HIP enables atomic operations on 32-bit integers. Additionally, it supports an atomic float add. AMD hardware, however, implements the float add using a CAS loop, so this function may not perform efficiently.
|
||||
|
||||
## Warp Cross-Lane Functions
|
||||
|
||||
Warp cross-lane functions operate across all lanes in a warp. The hardware guarantees that all warp lanes will execute in lockstep, so additional synchronization is unnecessary, and the instructions use no shared memory.
|
||||
|
||||
Note that Nvidia and AMD devices have different warp sizes, so portable code should use the warpSize built-ins to query the warp size. Hipified code from the Cuda path requires careful review to ensure it doesn’t assume a waveSize of 32. "Wave-aware" code that assumes a waveSize of 32 will run on a wave-64 machine, but it will utilize only half of the machine resources. WarpSize built-ins should only be used in device functions and its value depends on GPU arch. Users should not assume warpSize to be a compile-time constant. Host functions should use hipGetDeviceProperties to get the default warp size of a GPU device:
|
||||
|
||||
```
|
||||
hipDeviceProp_t props;
|
||||
hipGetDeviceProperties(&props, deviceID);
|
||||
int w = props.warpSize;
|
||||
// implement portable algorithm based on w (rather than assume 32 or 64)
|
||||
```
|
||||
|
||||
Note that assembly kernels may be built for a warp size which is different than the default warp size.
|
||||
|
||||
### Warp Vote and Ballot Functions
|
||||
|
||||
```
|
||||
int __all(int predicate)
|
||||
int __any(int predicate)
|
||||
uint64_t __ballot(int predicate)
|
||||
```
|
||||
|
||||
Threads in a warp are referred to as *lanes* and are numbered from 0 to warpSize - 1. For these functions, each warp lane contributes a 1-bit value (the predicate), which is efficiently broadcast to all lanes in the warp. The 32-bit int predicate from each lane reduces to a 1-bit value: 0 (predicate = 0) or 1 (predicate != 0). `__any` and `__all` provide a summary view of the predicates that the other warp lanes contribute:
|
||||
|
||||
- `__any()` returns 1 if any warp lane contributes a nonzero predicate, or 0 otherwise
|
||||
- `__all()` returns 1 if all other warp lanes contribute nonzero predicates, or 0 otherwise
|
||||
|
||||
Applications can test whether the target platform supports the any/all instruction using the `hasWarpVote` device property or the HIP_ARCH_HAS_WARP_VOTE compiler define.
|
||||
|
||||
`__ballot` provides a bit mask containing the 1-bit predicate value from each lane. The nth bit of the result contains the 1 bit contributed by the nth warp lane. Note that HIP's `__ballot` function supports a 64-bit return value (compared with Cuda’s 32 bits). Code ported from Cuda should support the larger warp sizes that the HIP version of this instruction supports. Applications can test whether the target platform supports the ballot instruction using the `hasWarpBallot` device property or the HIP_ARCH_HAS_WARP_BALLOT compiler define.
|
||||
|
||||
|
||||
### Warp Shuffle Functions
|
||||
|
||||
Half-float shuffles are not supported. The default width is warpSize---see [Warp Cross-Lane Functions](#warp-cross-lane-functions). Applications should not assume the warpSize is 32 or 64.
|
||||
|
||||
```
|
||||
int __shfl (int var, int srcLane, int width=warpSize);
|
||||
float __shfl (float var, int srcLane, int width=warpSize);
|
||||
int __shfl_up (int var, unsigned int delta, int width=warpSize);
|
||||
float __shfl_up (float var, unsigned int delta, int width=warpSize);
|
||||
int __shfl_down (int var, unsigned int delta, int width=warpSize);
|
||||
float __shfl_down (float var, unsigned int delta, int width=warpSize);
|
||||
int __shfl_xor (int var, int laneMask, int width=warpSize);
|
||||
float __shfl_xor (float var, int laneMask, int width=warpSize);
|
||||
|
||||
```
|
||||
|
||||
## Cooperative Groups Functions
|
||||
|
||||
Cooperative groups is a mechanism for forming and communicating between groups of threads at
|
||||
a granularity different than the block. This feature was introduced in Cuda 9.
|
||||
|
||||
HIP does not support any of the kernel language cooperative groups
|
||||
types or functions.
|
||||
|
||||
|
||||
| **Function** | **Supported in HIP** | **Supported in CUDA** |
|
||||
| --- | --- | --- |
|
||||
| `void thread_group.sync()` | | ✓ |
|
||||
| `unsigned thread_group.size()` | | ✓ |
|
||||
| `unsigned thread_group.thread_rank()` | | ✓ |
|
||||
| `bool thread_group.is_valid()` | | ✓ |
|
||||
| `thread_group tiled_partition(thread_group, size)` | | ✓ |
|
||||
| `thread_block_tile<N> tiled_partition<N>(thread_group)` | | ✓ |
|
||||
| `thread_block this_thread_block()` | | ✓ |
|
||||
| `T thread_block_tile.shfl()` | | ✓ |
|
||||
| `T thread_block_tile.shfl_down()` | | ✓ |
|
||||
| `T thread_block_tile.shfl_up()` | | ✓ |
|
||||
| `T thread_block_tile.shfl_xor()` | | ✓ |
|
||||
| `T thread_block_tile.any()` | | ✓ |
|
||||
| `T thread_block_tile.all()` | | ✓ |
|
||||
| `T thread_block_tile.ballot()` | | ✓ |
|
||||
| `T thread_block_tile.match_any()` | | ✓ |
|
||||
| `T thread_block_tile.match_all()` | | ✓ |
|
||||
| `coalesced_group coalesced_threads()` | | ✓ |
|
||||
| `grid_group this_grid()` | | ✓ |
|
||||
| `void grid_group.sync()` | | ✓ |
|
||||
| `unsigned grid_group.size()` | | ✓ |
|
||||
| `unsigned grid_group.thread_rank()` | | ✓ |
|
||||
| `bool grid_group.is_valid()` | | ✓ |
|
||||
| `multi_grid_group this_multi_grid()` | | ✓ |
|
||||
| `void multi_grid_group.sync()` | | ✓ |
|
||||
| `unsigned multi_grid_group.size()` | | ✓ |
|
||||
| `unsigned multi_grid_group.thread_rank()` | | ✓ |
|
||||
| `bool multi_grid_group.is_valid()` | | ✓ |
|
||||
|
||||
## Warp Matrix Functions
|
||||
|
||||
Warp matrix functions allow a warp to cooperatively operate on small matrices
|
||||
whose elements are spread over the lanes in an unspecified manner. This feature
|
||||
was introduced in Cuda 9.
|
||||
|
||||
HIP does not support any of the kernel language warp matrix
|
||||
types or functions.
|
||||
|
||||
| **Function** | **Supported in HIP** | **Supported in CUDA** |
|
||||
| --- | --- | --- |
|
||||
| `void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned lda)` | | ✓ |
|
||||
| `void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned lda, layout_t layout)` | | ✓ |
|
||||
| `void store_matrix_sync(T* mptr, fragment<...> &a, unsigned lda, layout_t layout)` | | ✓ |
|
||||
| `void fill_fragment(fragment<...> &a, const T &value)` | | ✓ |
|
||||
| `void mma_sync(fragment<...> &d, const fragment<...> &a, const fragment<...> &b, const fragment<...> &c , bool sat)` | | ✓ |
|
||||
|
||||
## Independent Thread Scheduling
|
||||
|
||||
The hardware support for independent thread scheduling introduced in certain architectures
|
||||
supporting Cuda allows threads to progress independently of each other and enables
|
||||
intra-warp synchronizations that were previously not allowed.
|
||||
|
||||
HIP does not support this type of scheduling.
|
||||
|
||||
## Profiler Counter Function
|
||||
|
||||
The Cuda `__prof_trigger()` instruction is not supported.
|
||||
|
||||
## Assert
|
||||
|
||||
The assert function is under development.
|
||||
HIP does support an "abort" call which will terminate the process execution from inside the kernel.
|
||||
|
||||
## Printf
|
||||
|
||||
The printf function is under development.
|
||||
|
||||
## Device-Side Dynamic Global Memory Allocation
|
||||
|
||||
Device-side dynamic global memory allocation is under development. HIP now includes a preliminary
|
||||
implementation of malloc and free that can be called from device functions.
|
||||
|
||||
## `__launch_bounds__`
|
||||
|
||||
|
||||
GPU multiprocessors have a fixed pool of resources (primarily registers and shared memory) which are shared by the actively running warps. Using more resources can increase IPC of the kernel but reduces the resources available for other warps and limits the number of warps that can be simultaneously running. Thus GPUs have a complex relationship between resource usage and performance.
|
||||
|
||||
__launch_bounds__ allows the application to provide usage hints that influence the resources (primarily registers) used by the generated code. It is a function attribute that must be attached to a __global__ function:
|
||||
|
||||
```
|
||||
__global__ void __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_WARPS_PER_EU) MyKernel(...) ...
|
||||
MyKernel(hipGridLaunch lp, ...)
|
||||
...
|
||||
```
|
||||
|
||||
__launch_bounds__ supports two parameters:
|
||||
- MAX_THREADS_PER_BLOCK - The programmer guarantees that the kernel will be launched with threads less than MAX_THREADS_PER_BLOCK. (On NVCC this maps to the .maxntid PTX directive). If no launch_bounds is specified, MAX_THREADS_PER_BLOCK is the maximum block size supported by the device (typically 1024 or larger). Specifying MAX_THREADS_PER_BLOCK less than the maximum effectively allows the compiler to use more resources than a default unconstrained compilation that supports all possible block sizes at launch time.
|
||||
The threads-per-block is the product of (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z).
|
||||
- MIN_WARPS_PER_EU - directs the compiler to minimize resource usage so that the requested number of warps can be simultaneously active on a multi-processor. Since active warps compete for the same fixed pool of resources, the compiler must reduce resources required by each warp(primarily registers). MIN_WARPS_PER_EU is optional and defaults to 1 if not specified. Specifying a MIN_WARPS_PER_EU greater than the default 1 effectively constrains the compiler's resource usage.
|
||||
|
||||
### Compiler Impact
|
||||
The compiler uses these parameters as follows:
|
||||
- The compiler uses the hints only to manage register usage, and does not automatically reduce shared memory or other resources.
|
||||
- Compilation fails if compiler cannot generate a kernel which meets the requirements of the specified launch bounds.
|
||||
- From MAX_THREADS_PER_BLOCK, the compiler derives the maximum number of warps/block that can be used at launch time.
|
||||
Values of MAX_THREADS_PER_BLOCK less than the default allow the compiler to use a larger pool of registers: each warp uses registers, and this hint constrains the launch to a warps/block size which is less than the maximum.
|
||||
- From MIN_WARPS_PER_EU, the compiler derives a maximum number of registers that can be used by the kernel (to meet the required #simultaneous active blocks).
|
||||
If MIN_WARPS_PER_EU is 1, then the kernel can use all registers supported by the multiprocessor.
|
||||
- The compiler ensures that the registers used in the kernel is less than both allowed maximums, typically by spilling registers (to shared or global memory), or by using more instructions.
|
||||
- The compiler may use heuristics to increase register usage, or may simply be able to avoid spilling. The MAX_THREADS_PER_BLOCK is particularly useful in these cases, since it allows the compiler to use more registers and avoid situations where the compiler constrains the register usage (potentially spilling) to meet the requirements of a large block size that is never used at launch time.
|
||||
|
||||
|
||||
### CU and EU Definitions
|
||||
A compute unit (CU) is responsible for executing the waves of a work-group. It is composed of one or more execution units (EU) which are responsible for executing waves. An EU can have enough resources to maintain the state of more than one executing wave. This allows an EU to hide latency by switching between waves in a similar way to simultaneous multithreading on a CPU. In order to allow the state for multiple waves to fit on an EU, the resources used by a single wave have to be limited. Limiting such resources can allow greater latency hiding, but can result in having to spill some register state to memory. This attribute allows an advanced developer to tune the number of waves that are capable of fitting within the resources of an EU. It can be used to ensure at least a certain number will fit to help hide latency, and can also be used to ensure no more than a certain number will fit to limit cache thrashing.
|
||||
|
||||
### Porting from CUDA __launch_bounds
|
||||
CUDA defines a __launch_bounds which is also designed to control occupancy:
|
||||
```
|
||||
__launch_bounds(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MULTIPROCESSOR)
|
||||
```
|
||||
|
||||
- The second parameter of __launch_bounds must be converted to the format used by __hip_launch_bounds, which uses warps and execution-units rather than blocks and multi-processors (this conversion is performed automatically by hipify tools).
|
||||
```
|
||||
MIN_WARPS_PER_EXECUTION_UNIT = (MIN_BLOCKS_PER_MULTIPROCESSOR * MAX_THREADS_PER_BLOCK) / 32
|
||||
```
|
||||
|
||||
The key differences in the interface are:
|
||||
- Warps (rather than blocks):
|
||||
The developer is trying to tell the compiler to control resource utilization to guarantee some amount of active Warps/EU for latency hiding. Specifying active warps in terms of blocks appears to hide the micro-architectural details of the warp size, but makes the interface more confusing since the developer ultimately needs to compute the number of warps to obtain the desired level of control.
|
||||
- Execution Units (rather than multiProcessor):
|
||||
The use of execution units rather than multiprocessors provides support for architectures with multiple execution units/multi-processor. For example, the AMD GCN architecture has 4 execution units per multiProcessor. The hipDeviceProps has a field executionUnitsPerMultiprocessor.
|
||||
Platform-specific coding techniques such as #ifdef can be used to specify different launch_bounds for NVCC and HIP-Clang platforms, if desired.
|
||||
|
||||
|
||||
### maxregcount
|
||||
Unlike nvcc, HIP-Clang does not support the "--maxregcount" option. Instead, users are encouraged to use the hip_launch_bounds directive since the parameters are more intuitive and portable than
|
||||
micro-architecture details like registers, and also the directive allows per-kernel control rather than an entire file. hip_launch_bounds works on both HIP-Clang and nvcc targets.
|
||||
|
||||
|
||||
## Register Keyword
|
||||
The register keyword is deprecated in C++, and is silently ignored by both nvcc and HIP-Clang. You can pass the option `-Wdeprecated-register` to the compiler to enable a warning message.
|
||||
|
||||
## Pragma Unroll
|
||||
|
||||
Unroll with a bounds that is known at compile-time is supported. For example:
|
||||
|
||||
```
|
||||
#pragma unroll 16 /* hint to compiler to unroll next loop by 16 */
|
||||
for (int i=0; i<16; i++) ...
|
||||
```
|
||||
|
||||
```
|
||||
#pragma unroll 1 /* tell compiler to never unroll the loop */
|
||||
for (int i=0; i<16; i++) ...
|
||||
```
|
||||
|
||||
|
||||
```
|
||||
#pragma unroll /* hint to compiler to completely unroll next loop. */
|
||||
for (int i=0; i<16; i++) ...
|
||||
```
|
||||
|
||||
|
||||
## In-Line Assembly
|
||||
|
||||
GCN ISA in-line assembly is supported. For example:
|
||||
|
||||
```
|
||||
asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i]));
|
||||
```
|
||||
|
||||
We insert the GCN isa into the kernel using `asm()` Assembler statement.
|
||||
`volatile` keyword is used so that the optimizers must not change the number of volatile operations or change their order of execution relative to other volatile operations.
|
||||
`v_mac_f32_e32` is the GCN instruction, for more information please refer - [AMD GCN3 ISA architecture manual](http://gpuopen.com/compute-product/amd-gcn3-isa-architecture-manual/)
|
||||
Index for the respective operand in the ordered fashion is provided by `%` followed by position in the list of operands
|
||||
`"v"` is the constraint code (for target-specific AMDGPU) for 32-bit VGPR register, for more info please refer - [Supported Constraint Code List for AMDGPU](https://llvm.org/docs/LangRef.html#supported-constraint-code-list)
|
||||
Output constraints are specified by an `"="` prefix as shown above ("=v"). This indicates that the assembly will write to this operand, and the operand will then be made available as a return value of the asm expression. Input constraints do not have a prefix - just the constraint code. The constraint string of `"0"` says to use the assigned register for output as an input as well (it being the 0'th constraint).
|
||||
|
||||
## C++ Support
|
||||
The following C++ features are not supported:
|
||||
- Run-time-type information (RTTI)
|
||||
- Virtual functions
|
||||
- Try/catch
|
||||
|
||||
## Kernel Compilation
|
||||
hipcc now supports compiling C++/HIP kernels to binary code objects.
|
||||
The file format for binary is `.co` which means Code Object. The following command builds the code object using `hipcc`.
|
||||
|
||||
`hipcc --genco --offload-arch=[TARGET GPU] [INPUT FILE] -o [OUTPUT FILE]`
|
||||
|
||||
```
|
||||
[TARGET GPU] = GPU architecture
|
||||
[INPUT FILE] = Name of the file containing kernels
|
||||
[OUTPUT FILE] = Name of the generated code object file
|
||||
```
|
||||
|
||||
Note: When using binary code objects, the number of arguments to the kernel is different on the HIP-Clang and NVCC paths. Refer to the sample in samples/0_Intro/module_api for differences in the arguments to be passed to the kernel.
|
||||
|
||||
## gfx-arch-specific-kernel
|
||||
Clang defined '__gfx*__' macros can be used to execute gfx arch specific codes inside the kernel. Refer to the sample 14_gpu_arch in samples/2_Cookbook.
|
||||
@@ -1,187 +0,0 @@
|
||||
## What is HIP logging for? ###
|
||||
|
||||
HIP provides a logging mechanism, which is a convenient way of printing important information so as to trace HIP API and runtime codes during the execution of a HIP application.
|
||||
It assists HIP development team in the development of HIP runtime, and is useful for HIP application developers as well.
|
||||
Depending on the setting of logging level and logging mask, HIP logging will print different kinds of information, for different types of functionalities such as HIP APIs, executed kernels, queue commands and queue contents, etc.
|
||||
|
||||
## HIP Logging Level:
|
||||
|
||||
By default, HIP logging is disabled; it can be enabled via the environment setting,
|
||||
- AMD_LOG_LEVEL
|
||||
|
||||
The value of the setting controls different logging level,
|
||||
|
||||
```
|
||||
enum LogLevel {
|
||||
LOG_NONE = 0,
|
||||
LOG_ERROR = 1,
|
||||
LOG_WARNING = 2,
|
||||
LOG_INFO = 3,
|
||||
LOG_DEBUG = 4
|
||||
};
|
||||
```
|
||||
|
||||
## HIP Logging Mask:
|
||||
|
||||
Logging mask is designed to print types of functionalities during the execution of HIP application.
|
||||
It can be set as one of the following values,
|
||||
|
||||
```
|
||||
enum LogMask {
|
||||
LOG_API = 0x00000001, //!< API call
|
||||
LOG_CMD = 0x00000002, //!< Kernel and Copy Commands and Barriers
|
||||
LOG_WAIT = 0x00000004, //!< Synchronization and waiting for commands to finish
|
||||
LOG_AQL = 0x00000008, //!< Decode and display AQL packets
|
||||
LOG_QUEUE = 0x00000010, //!< Queue commands and queue contents
|
||||
LOG_SIG = 0x00000020, //!< Signal creation, allocation, pool
|
||||
LOG_LOCK = 0x00000040, //!< Locks and thread-safety code.
|
||||
LOG_KERN = 0x00000080, //!< kernel creations and arguments, etc.
|
||||
LOG_COPY = 0x00000100, //!< Copy debug
|
||||
LOG_COPY2 = 0x00000200, //!< Detailed copy debug
|
||||
LOG_RESOURCE = 0x00000400, //!< Resource allocation, performance-impacting events.
|
||||
LOG_INIT = 0x00000800, //!< Initialization and shutdown
|
||||
LOG_MISC = 0x00001000, //!< misc debug, not yet classified
|
||||
LOG_AQL2 = 0x00002000, //!< Show raw bytes of AQL packet
|
||||
LOG_CODE = 0x00004000, //!< Show code creation debug
|
||||
LOG_CMD2 = 0x00008000, //!< More detailed command info, including barrier commands
|
||||
LOG_LOCATION = 0x00010000, //!< Log message location
|
||||
LOG_ALWAYS = 0xFFFFFFFF, //!< Log always even mask flag is zero
|
||||
};
|
||||
```
|
||||
|
||||
Once AMD_LOG_LEVEL is set, logging mask is set as default with the value 0x7FFFFFFF.
|
||||
However, for different purposes of logging functionalities, the logging mask can be defined as well via the environment variable,
|
||||
|
||||
- AMD_LOG_MASK
|
||||
|
||||
## HIP Logging command:
|
||||
|
||||
To print HIP logging information, the function is defined as
|
||||
```
|
||||
#define ClPrint(level, mask, format, ...)
|
||||
do {
|
||||
if (AMD_LOG_LEVEL >= level) {
|
||||
if (AMD_LOG_MASK & mask || mask == amd::LOG_ALWAYS) {
|
||||
if (AMD_LOG_MASK & amd::LOG_LOCATION) {
|
||||
amd::log_printf(level, __FILENAME__, __LINE__, format, ##__VA_ARGS__);
|
||||
} else {
|
||||
amd::log_printf(level, "", 0, format, ##__VA_ARGS__);
|
||||
}
|
||||
}
|
||||
}
|
||||
} while (false)
|
||||
```
|
||||
|
||||
So in HIP code, call the ClPrint() function with proper input variables as needed, for example,
|
||||
```
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Initializing HSA stack.");
|
||||
```
|
||||
|
||||
## HIP Logging Example:
|
||||
|
||||
Below is an example to enable HIP logging and get logging information during execution of hipinfo,
|
||||
|
||||
```
|
||||
user@user-test:~/hip/bin$ export AMD_LOG_LEVEL=4
|
||||
user@user-test:~/hip/bin$ ./hipinfo
|
||||
|
||||
:3:rocdevice.cpp :453 : 23647210092: Initializing HSA stack.
|
||||
:3:comgrctx.cpp :33 : 23647639336: Loading COMGR library.
|
||||
:3:rocdevice.cpp :203 : 23647687108: Numa select cpu agent[0]=0x13407c0(fine=0x13409a0,coarse=0x1340ad0) for gpu agent=0x1346150
|
||||
:4:runtime.cpp :82 : 23647698669: init
|
||||
:3:hip_device_runtime.cpp :473 : 23647698869: 5617 : [7fad295dd840] hipGetDeviceCount: Returned hipSuccess
|
||||
:3:hip_device_runtime.cpp :502 : 23647698990: 5617 : [7fad295dd840] hipSetDevice ( 0 )
|
||||
:3:hip_device_runtime.cpp :507 : 23647699042: 5617 : [7fad295dd840] hipSetDevice: Returned hipSuccess
|
||||
--------------------------------------------------------------------------------
|
||||
device# 0
|
||||
:3:hip_device.cpp :150 : 23647699276: 5617 : [7fad295dd840] hipGetDeviceProperties ( 0x7ffdbe7db730, 0 )
|
||||
:3:hip_device.cpp :237 : 23647699335: 5617 : [7fad295dd840] hipGetDeviceProperties: Returned hipSuccess
|
||||
Name: Device 7341
|
||||
pciBusID: 3
|
||||
pciDeviceID: 0
|
||||
pciDomainID: 0
|
||||
multiProcessorCount: 11
|
||||
maxThreadsPerMultiProcessor: 2560
|
||||
isMultiGpuBoard: 0
|
||||
clockRate: 1900 Mhz
|
||||
memoryClockRate: 875 Mhz
|
||||
memoryBusWidth: 0
|
||||
clockInstructionRate: 1000 Mhz
|
||||
totalGlobalMem: 7.98 GB
|
||||
maxSharedMemoryPerMultiProcessor: 64.00 KB
|
||||
totalConstMem: 8573157376
|
||||
sharedMemPerBlock: 64.00 KB
|
||||
canMapHostMemory: 1
|
||||
regsPerBlock: 0
|
||||
warpSize: 32
|
||||
l2CacheSize: 0
|
||||
computeMode: 0
|
||||
maxThreadsPerBlock: 1024
|
||||
maxThreadsDim.x: 1024
|
||||
maxThreadsDim.y: 1024
|
||||
maxThreadsDim.z: 1024
|
||||
maxGridSize.x: 2147483647
|
||||
maxGridSize.y: 2147483647
|
||||
maxGridSize.z: 2147483647
|
||||
major: 10
|
||||
minor: 12
|
||||
concurrentKernels: 1
|
||||
cooperativeLaunch: 0
|
||||
cooperativeMultiDeviceLaunch: 0
|
||||
arch.hasGlobalInt32Atomics: 1
|
||||
arch.hasGlobalFloatAtomicExch: 1
|
||||
arch.hasSharedInt32Atomics: 1
|
||||
arch.hasSharedFloatAtomicExch: 1
|
||||
arch.hasFloatAtomicAdd: 1
|
||||
arch.hasGlobalInt64Atomics: 1
|
||||
arch.hasSharedInt64Atomics: 1
|
||||
arch.hasDoubles: 1
|
||||
arch.hasWarpVote: 1
|
||||
arch.hasWarpBallot: 1
|
||||
arch.hasWarpShuffle: 1
|
||||
arch.hasFunnelShift: 0
|
||||
arch.hasThreadFenceSystem: 1
|
||||
arch.hasSyncThreadsExt: 0
|
||||
arch.hasSurfaceFuncs: 0
|
||||
arch.has3dGrid: 1
|
||||
arch.hasDynamicParallelism: 0
|
||||
gcnArch: 1012
|
||||
isIntegrated: 0
|
||||
maxTexture1D: 65536
|
||||
maxTexture2D.width: 16384
|
||||
maxTexture2D.height: 16384
|
||||
maxTexture3D.width: 2048
|
||||
maxTexture3D.height: 2048
|
||||
maxTexture3D.depth: 2048
|
||||
isLargeBar: 0
|
||||
:3:hip_device_runtime.cpp :471 : 23647701557: 5617 : [7fad295dd840] hipGetDeviceCount ( 0x7ffdbe7db714 )
|
||||
:3:hip_device_runtime.cpp :473 : 23647701608: 5617 : [7fad295dd840] hipGetDeviceCount: Returned hipSuccess
|
||||
:3:hip_peer.cpp :76 : 23647701731: 5617 : [7fad295dd840] hipDeviceCanAccessPeer ( 0x7ffdbe7db728, 0, 0 )
|
||||
:3:hip_peer.cpp :60 : 23647701784: 5617 : [7fad295dd840] canAccessPeer: Returned hipSuccess
|
||||
:3:hip_peer.cpp :77 : 23647701831: 5617 : [7fad295dd840] hipDeviceCanAccessPeer: Returned hipSuccess
|
||||
peers:
|
||||
:3:hip_peer.cpp :76 : 23647701921: 5617 : [7fad295dd840] hipDeviceCanAccessPeer ( 0x7ffdbe7db728, 0, 0 )
|
||||
:3:hip_peer.cpp :60 : 23647701965: 5617 : [7fad295dd840] canAccessPeer: Returned hipSuccess
|
||||
:3:hip_peer.cpp :77 : 23647701998: 5617 : [7fad295dd840] hipDeviceCanAccessPeer: Returned hipSuccess
|
||||
non-peers: device#0
|
||||
|
||||
:3:hip_memory.cpp :345 : 23647702191: 5617 : [7fad295dd840] hipMemGetInfo ( 0x7ffdbe7db718, 0x7ffdbe7db720 )
|
||||
:3:hip_memory.cpp :360 : 23647702243: 5617 : [7fad295dd840] hipMemGetInfo: Returned hipSuccess
|
||||
memInfo.total: 7.98 GB
|
||||
memInfo.free: 7.98 GB (100%)
|
||||
```
|
||||
|
||||
## HIP Logging Tips:
|
||||
|
||||
- HIP logging works for both release and debug version of HIP application.
|
||||
|
||||
- Logging function with different logging level can be called in the code as needed.
|
||||
|
||||
- Information with logging level less than or equal to AMD_LOG_LEVEL will be printed.
|
||||
|
||||
- If you need to save the HIP logging output information in a file, just specify the file on the command line when running the application at the terminal, for example,
|
||||
|
||||
```
|
||||
user@user-test:~/hip/bin$ ./hipinfo > ~/hip_log.txt
|
||||
```
|
||||
|
||||
@@ -1,289 +0,0 @@
|
||||
# Porting CUDA Driver API
|
||||
|
||||
## Introduction to the CUDA Driver and Runtime APIs
|
||||
CUDA provides a separate CUDA Driver and Runtime APIs. The two APIs have significant overlap in functionality:
|
||||
- Both APIs support events, streams, memory management, memory copy, and error handling.
|
||||
- Both APIs deliver similar performance.
|
||||
- Driver API calls begin with the prefix `cu` while Runtime API calls begin with the prefix `cuda`. For example, the Driver API contains `cuEventCreate` while the Runtime API contains `cudaEventCreate`, with similar functionality.
|
||||
- The Driver API defines a different but largely overlapping error code space than the Runtime API, and uses a different coding convention. For example, Driver API defines `CUDA_ERROR_INVALID_VALUE` while the Runtime API defines `cudaErrorInvalidValue`
|
||||
|
||||
|
||||
The Driver API offers two additional pieces of functionality not provided by the Runtime API: cuModule and cuCtx APIs.
|
||||
|
||||
### cuModule API
|
||||
The Module section of the Driver API provides additional control over how and when accelerator code objects are loaded.
|
||||
For example, the driver API allows code objects to be loaded from files or memory pointers.
|
||||
Symbols for kernels or global data can be extracted from the loaded code objects.
|
||||
In contrast, the Runtime API automatically loads and (if necessary) compiles all of the kernels from an executable binary when run.
|
||||
In this mode, NVCC must be used to compile kernel code so the automatic loading can function correctly.
|
||||
|
||||
Both Driver and Runtime APIs define a function for launching kernels (called `cuLaunchKernel` or `cudaLaunchKernel`).
|
||||
The kernel arguments and the execution configuration (grid dimensions, group dimensions, dynamic shared memory, and stream) are passed as arguments to the launch function.
|
||||
The Runtime additionally provides the `<<< >>>` syntax for launching kernels, which resembles a special function call and is easier to use than explicit launch API (in particular with respect to handling of kernel arguments).
|
||||
However, this syntax is not standard C++ and is available only when NVCC is used to compile the host code.
|
||||
|
||||
The Module features are useful in an environment which generates the code objects directly, such as a new accelerator language front-end.
|
||||
Here, NVCC is not used. Instead, the environment may have a different kernel language or different compilation flow.
|
||||
Other environments have many kernels and do not want them to be all loaded automatically.
|
||||
The Module functions can be used to load the generated code objects and launch kernels.
|
||||
As we will see below, HIP defines a Module API which provides similar explicit control over code object management.
|
||||
|
||||
### cuCtx API
|
||||
The Driver API defines "Context" and "Devices" as separate entities.
|
||||
Contexts contain a single device, and a device can theoretically have multiple contexts.
|
||||
Each context contains a set of streams and events specific to the context.
|
||||
Historically contexts also defined a unique address space for the GPU, though this may no longer be the case in Unified Memory platforms (since the CPU and all the devices in the same process share a single unified address space).
|
||||
The Context APIs also provide a mechanism to switch between devices, which allowed a single CPU thread to send commands to different GPUs.
|
||||
HIP as well as recent versions of the CUDA Runtime provide other mechanisms to accomplish this feat - for example using streams or `cudaSetDevice`.
|
||||
|
||||
The CUDA Runtime API unifies the Context API with the Device API. This simplifies the APIs and has little loss of functionality since each Context can contain a single device, and the benefits of multiple contexts have been replaced with other interfaces.
|
||||
HIP provides a context API to facilitate easy porting from existing Driver codes.
|
||||
In HIP, the Ctx functions largely provide an alternate syntax for changing the active device.
|
||||
|
||||
Most new applications will prefer to use `hipSetDevice` or the stream APIs, therefore HIP has marked hipCtx APIs as **deprecated**. Support for these APIs may not be available in future releases. For more details on deprecated APIs please refer to [HIP deprecated APIs](https://github.com/ROCm-Developer-Tools/HIP/tree/master/docs/markdown/hip_deprecated_api_list.md).
|
||||
|
||||
## HIP Module and Ctx APIs
|
||||
|
||||
Rather than present two separate APIs, HIP extends the HIP API with new APIs for Modules and Ctx control.
|
||||
|
||||
### hipModule API
|
||||
|
||||
Like the CUDA Driver API, the Module API provides additional control over how code is loaded, including options to load code from files or from in-memory pointers.
|
||||
NVCC and HIP-Clang target different architectures and use different code object formats: NVCC is `cubin` or `ptx` files, while the HIP-Clang path is the `hsaco` format.
|
||||
The external compilers which generate these code objects are responsible for generating and loading the correct code object for each platform.
|
||||
Notably, there is not a fat binary format that can contain code for both NVCC and HIP-Clang platforms. The following table summarizes the formats used on each platform:
|
||||
|
||||
| Format | APIs | NVCC | HIP-CLANG |
|
||||
| --- | --- | --- | --- |
|
||||
| Code Object | hipModuleLoad, hipModuleLoadData | .cubin or PTX text | .hsaco |
|
||||
| Fat Binary | hipModuleLoadFatBin | .fatbin | .hip_fatbin |
|
||||
|
||||
`hipcc` uses HIP-Clang or NVCC to compile host codes. Both of these may embed code objects into the final executable, and these code objects will be automatically loaded when the application starts.
|
||||
The hipModule API can be used to load additional code objects, and in this way provides an extended capability to the automatically loaded code objects.
|
||||
HIP-Clang allows both of these capabilities to be used together, if desired. Of course it is possible to create a program with no kernels and thus no automatic loading.
|
||||
|
||||
|
||||
### hipCtx API
|
||||
HIP provides a `Ctx` API as a thin layer over the existing Device functions. This Ctx API can be used to set the current context, or to query properties of the device associated with the context.
|
||||
The current context is implicitly used by other APIs such as `hipStreamCreate`.
|
||||
|
||||
### hipify translation of CUDA Driver API
|
||||
The HIPIFY tools convert CUDA Driver APIs for streams, events, modules, devices, memory management, context, profiler to the equivalent HIP driver calls. For example, `cuEventCreate` will be translated to `hipEventCreate`.
|
||||
HIPIFY tools also convert error codes from the Driver namespace and coding convention to the equivalent HIP error code. Thus, HIP unifies the APIs for these common functions.
|
||||
|
||||
The memory copy API requires additional explanation. The CUDA driver API includes the memory direction in the name of the API (i.e. `cuMemcpyHtoD`) while the CUDA runtime API provides a single memory copy API with a parameter that specifies the direction and additionally supports a "default" direction where the runtime determines the direction automatically.
|
||||
HIP provides APIs with both styles: for example, `hipMemcpyHtoD` as well as `hipMemcpy`.
|
||||
The first flavor may be faster in some cases since it avoids the host overhead of detecting the different memory directions.
|
||||
|
||||
HIP defines a single error space, and uses camel-case for all errors (i.e. `hipErrorInvalidValue`).
|
||||
|
||||
#### Address Spaces
|
||||
HIP-Clang defines a process-wide address space where the CPU and all devices allocate addresses from a single unified pool.
|
||||
Thus addresses may be shared between contexts, and unlike the original CUDA definition a new context does not create a new address space for the device.
|
||||
|
||||
#### Using hipModuleLaunchKernel
|
||||
`hipModuleLaunchKernel` is `cuLaunchKernel` in HIP world. It takes the same arguments as `cuLaunchKernel`.
|
||||
|
||||
#### Additional Information
|
||||
- HIP-Clang creates a primary context when the HIP API is called. So in a pure driver API code, HIP-Clang will create a primary context while HIP/NVCC will have empty context stack.
|
||||
HIP-Clang will push primary context to context stack when it is empty. This can have subtle differences on applications which mix the runtime and driver APIs.
|
||||
|
||||
### hip-clang Implementation Notes
|
||||
#### .hip_fatbin
|
||||
hip-clang links device code from different translation units together. For each device target, a code object is generated. Code objects for different device targets are bundled by clang-offload-bundler as one fatbinary, which is embedded as a global symbol `__hip_fatbin` in the .hip_fatbin section of the ELF file of the executable or shared object.
|
||||
|
||||
#### Initialization and Termination Functions
|
||||
hip-clang generates initialization and termination functions for each translation unit for host code compilation. The initialization functions call `__hipRegisterFatBinary` to register the fatbinary embedded in the ELF file. They also call `__hipRegisterFunction` and `__hipRegisterVar` to register kernel functions and device side global variables. The termination functions call `__hipUnregisterFatBinary`.
|
||||
hip-clang emits a global variable `__hip_gpubin_handle` of void** type with linkonce linkage and initial value 0 for each host translation unit. Each initialization function checks `__hip_gpubin_handle` and registers the fatbinary only if `__hip_gpubin_handle` is 0, and saves the return value of `__hipRegisterFatBinary` to `__hip_gpubin_handle`. This is to guarantee that the fatbinary is only registered once. A similar check is done in the termination functions.
|
||||
|
||||
#### Kernel Launching
|
||||
hip-clang supports kernel launching by CUDA `<<<>>>` syntax, hipLaunchKernel, and hipLaunchKernelGGL. The latter two are macros which expand to CUDA `<<<>>>` syntax.
|
||||
|
||||
When the executable or shared library is loaded by the dynamic linker, the initialization functions are called. In the initialization functions, when `__hipRegisterFatBinary` is called, the code objects containing all kernels are loaded; when `__hipRegisterFunction` is called, the stub functions are associated with the corresponding kernels in code objects.
|
||||
|
||||
hip-clang implements two sets of kernel launching APIs.
|
||||
|
||||
By default, in the host code, for the `<<<>>>` statement, hip-clang first emits call of hipConfigureCall to set up the threads and grids, then emits call of the stub function with the given arguments. In the stub function, hipSetupArgument is called for each kernel argument, then hipLaunchByPtr is called with a function pointer to the stub function. In hipLaunchByPtr, the real kernel associated with the stub function is launched.
|
||||
|
||||
If HIP program is compiled with -fhip-new-launch-api, in the host code, for the `<<<>>>` statement, hip-clang first emits call of `__hipPushCallConfiguration` to save the grid dimension, block dimension, shared memory usage and stream to a stack, then emits call of the stub function with the given arguments. In the stub function, `__hipPopCallConfiguration` is called to get the saved grid dimension, block dimension, shared memory usage and stream, then hipLaunchKernel is called with a function pointer to the stub function. In hipLaunchKernel, the real kernel associated with the stub function is launched.
|
||||
|
||||
### NVCC Implementation Notes
|
||||
|
||||
#### Interoperation between HIP and CUDA Driver
|
||||
CUDA applications may want to mix CUDA driver code with HIP code (see example below). This table shows the type equivalence to enable this interaction.
|
||||
|
||||
|**HIP Type** |**CU Driver Type**|**CUDA Runtime Type**|
|
||||
| ---- | ---- | ---- |
|
||||
| hipModule_t | CUmodule | |
|
||||
| hipFunction_t | CUfunction | |
|
||||
| hipCtx_t | CUcontext | |
|
||||
| hipDevice_t | CUdevice | |
|
||||
| hipStream_t | CUstream | cudaStream_t |
|
||||
| hipEvent_t | CUevent | cudaEvent_t |
|
||||
| hipArray | CUarray | cudaArray |
|
||||
|
||||
#### Compilation Options
|
||||
The `hipModule_t` interface does not support `cuModuleLoadDataEx` function, which is used to control PTX compilation options.
|
||||
HIP-Clang does not use PTX and does not support these compilation options.
|
||||
In fact, HIP-Clang code objects always contain fully compiled ISA and do not require additional compilation as a part of the load step.
|
||||
The corresponding HIP function `hipModuleLoadDataEx` behaves as `hipModuleLoadData` on HIP-Clang path (compilation options are not used) and as `cuModuleLoadDataEx` on NVCC path.
|
||||
For example (CUDA):
|
||||
```
|
||||
CUmodule module;
|
||||
void *imagePtr = ...; // Somehow populate data pointer with code object
|
||||
|
||||
const int numOptions = 1;
|
||||
CUJit_option options[numOptions];
|
||||
void * optionValues[numOptions];
|
||||
|
||||
options[0] = CU_JIT_MAX_REGISTERS;
|
||||
unsigned maxRegs = 15;
|
||||
optionValues[0] = (void*)(&maxRegs);
|
||||
|
||||
cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues);
|
||||
|
||||
CUfunction k;
|
||||
cuModuleGetFunction(&k, module, "myKernel");
|
||||
```
|
||||
HIP:
|
||||
```
|
||||
hipModule_t module;
|
||||
void *imagePtr = ...; // Somehow populate data pointer with code object
|
||||
|
||||
const int numOptions = 1;
|
||||
hipJitOption options[numOptions];
|
||||
void * optionValues[numOptions];
|
||||
|
||||
options[0] = hipJitOptionMaxRegisters;
|
||||
unsigned maxRegs = 15;
|
||||
optionValues[0] = (void*)(&maxRegs);
|
||||
|
||||
// hipModuleLoadData(module, imagePtr) will be called on HIP-Clang path, JIT options will not be used, and
|
||||
// cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues) will be called on NVCC path
|
||||
hipModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues);
|
||||
|
||||
hipFunction_t k;
|
||||
hipModuleGetFunction(&k, module, "myKernel");
|
||||
```
|
||||
|
||||
The below sample shows how to use `hipModuleGetFunction`.
|
||||
|
||||
```
|
||||
#include<hip_runtime.h>
|
||||
#include<hip_runtime_api.h>
|
||||
#include<iostream>
|
||||
#include<fstream>
|
||||
#include<vector>
|
||||
|
||||
#define LEN 64
|
||||
#define SIZE LEN<<2
|
||||
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
#define fileName "vcpy_isa.co"
|
||||
#endif
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVIDIA__
|
||||
#define fileName "vcpy_isa.ptx"
|
||||
#endif
|
||||
|
||||
#define kernel_name "hello_world"
|
||||
|
||||
int main(){
|
||||
float *A, *B;
|
||||
hipDeviceptr_t Ad, Bd;
|
||||
A = new float[LEN];
|
||||
B = new float[LEN];
|
||||
|
||||
for(uint32_t i=0;i<LEN;i++){
|
||||
A[i] = i*1.0f;
|
||||
B[i] = 0.0f;
|
||||
std::cout<<A[i] << " "<<B[i]<<std::endl;
|
||||
}
|
||||
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVIDIA__
|
||||
hipInit(0);
|
||||
hipDevice_t device;
|
||||
hipCtx_t context;
|
||||
hipDeviceGet(&device, 0);
|
||||
hipCtxCreate(&context, 0, device);
|
||||
#endif
|
||||
|
||||
hipMalloc((void**)&Ad, SIZE);
|
||||
hipMalloc((void**)&Bd, SIZE);
|
||||
|
||||
hipMemcpyHtoD(Ad, A, SIZE);
|
||||
hipMemcpyHtoD(Bd, B, SIZE);
|
||||
hipModule_t Module;
|
||||
hipFunction_t Function;
|
||||
hipModuleLoad(&Module, fileName);
|
||||
hipModuleGetFunction(&Function, Module, kernel_name);
|
||||
|
||||
std::vector<void*>argBuffer(2);
|
||||
memcpy(&argBuffer[0], &Ad, sizeof(void*));
|
||||
memcpy(&argBuffer[1], &Bd, sizeof(void*));
|
||||
|
||||
size_t size = argBuffer.size()*sizeof(void*);
|
||||
|
||||
void *config[] = {
|
||||
HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
|
||||
HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
|
||||
HIP_LAUNCH_PARAM_END
|
||||
};
|
||||
|
||||
hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config);
|
||||
|
||||
hipMemcpyDtoH(B, Bd, SIZE);
|
||||
for(uint32_t i=0;i<LEN;i++){
|
||||
std::cout<<A[i]<<" - "<<B[i]<<std::endl;
|
||||
}
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVIDIA__
|
||||
hipCtxDetach(context);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
## HIP Module and Texture Driver API
|
||||
|
||||
HIP supports texture driver APIs; however, the texture reference should be declared in host scope. The following code explains the use of a texture reference for the __HIP_PLATFORM_AMD__ platform.
|
||||
|
||||
```
|
||||
// Code to generate code object
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
extern texture<float, 2, hipReadModeElementType> tex;
|
||||
|
||||
__global__ void tex2dKernel(hipLaunchParm lp, float* outputData,
|
||||
int width,
|
||||
int height)
|
||||
{
|
||||
int x = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
outputData[y*width + x] = tex2D(tex, x, y);
|
||||
}
|
||||
|
||||
```
|
||||
```
|
||||
// Host code:
|
||||
|
||||
texture<float, 2, hipReadModeElementType> tex;
|
||||
|
||||
void myFunc ()
|
||||
{
|
||||
// ...
|
||||
|
||||
textureReference* texref;
|
||||
hipModuleGetTexRef(&texref, Module1, "tex");
|
||||
hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap);
|
||||
hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap);
|
||||
hipTexRefSetFilterMode(texref, hipFilterModePoint);
|
||||
hipTexRefSetFlags(texref, 0);
|
||||
hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1);
|
||||
hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT);
|
||||
|
||||
// ...
|
||||
}
|
||||
```
|
||||
@@ -1,593 +0,0 @@
|
||||
# HIP Porting Guide
|
||||
In addition to providing a portable C++ programming environment for GPUs, HIP is designed to ease
|
||||
the porting of existing CUDA code into the HIP environment. This section describes the available tools
|
||||
and provides practical suggestions on how to port CUDA code and work through common issues.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
<!-- toc -->
|
||||
|
||||
- [Porting a New CUDA Project](#porting-a-new-cuda-project)
|
||||
* [General Tips](#general-tips)
|
||||
* [Scanning existing CUDA code to scope the porting effort](#scanning-existing-cuda-code-to-scope-the-porting-effort)
|
||||
* [Converting a project "in-place"](#converting-a-project-in-place)
|
||||
* [CUDA to HIP Math Library Equivalents](#library-equivalents)
|
||||
- [Distinguishing Compiler Modes](#distinguishing-compiler-modes)
|
||||
* [Identifying HIP Target Platform](#identifying-hip-target-platform)
|
||||
* [Identifying the Compiler: hip-clang, or nvcc](#identifying-the-compiler-hip-clang-or-nvcc)
|
||||
* [Identifying Current Compilation Pass: Host or Device](#identifying-current-compilation-pass-host-or-device)
|
||||
* [Compiler Defines: Summary](#compiler-defines-summary)
|
||||
- [Identifying Architecture Features](#identifying-architecture-features)
|
||||
* [HIP_ARCH Defines](#hip_arch-defines)
|
||||
* [Device-Architecture Properties](#device-architecture-properties)
|
||||
* [Table of Architecture Properties](#table-of-architecture-properties)
|
||||
- [Finding HIP](#finding-hip)
|
||||
- [Identifying HIP Runtime](#identifying-hip-runtime)
|
||||
- [hipLaunchKernel](#hiplaunchkernel)
|
||||
- [Compiler Options](#compiler-options)
|
||||
- [Linking Issues](#linking-issues)
|
||||
* [Linking With hipcc](#linking-with-hipcc)
|
||||
* [-lm Option](#-lm-option)
|
||||
- [Linking Code With Other Compilers](#linking-code-with-other-compilers)
|
||||
* [libc++ and libstdc++](#libc-and-libstdc)
|
||||
* [HIP Headers (hip_runtime.h, hip_runtime_api.h)](#hip-headers-hip_runtimeh-hip_runtime_apih)
|
||||
* [Using a Standard C++ Compiler](#using-a-standard-c-compiler)
|
||||
+ [cuda.h](#cudah)
|
||||
* [Choosing HIP File Extensions](#choosing-hip-file-extensions)
|
||||
- [Workarounds](#workarounds)
|
||||
* [warpSize](#warpsize)
|
||||
* [Kernel launch with group size > 256](#kernel-launch-with-group-size--256)
|
||||
- [memcpyToSymbol](#memcpytosymbol)
|
||||
- [threadfence_system](#threadfence_system)
|
||||
* [Textures and Cache Control](#textures-and-cache-control)
|
||||
- [More Tips](#more-tips)
|
||||
* [HIP Logging](#hip-logging)
|
||||
* [Debugging hipcc](#debugging-hipcc)
|
||||
* [What Does This Error Mean?](#what-does-this-error-mean)
|
||||
+ [/usr/include/c++/v1/memory:5172:15: error: call to implicitly deleted default constructor of 'std::__1::bad_weak_ptr' throw bad_weak_ptr();](#usrincludecv1memory517215-error-call-to-implicitly-deleted-default-constructor-of-std__1bad_weak_ptr-throw-bad_weak_ptr)
|
||||
* [Editor Highlighting](#editor-highlighting)
|
||||
|
||||
|
||||
<!-- tocstop -->
|
||||
|
||||
## Porting a New CUDA Project
|
||||
|
||||
### General Tips
|
||||
- Starting the port on a CUDA machine is often the easiest approach, since you can incrementally port pieces of the code to HIP while leaving the rest in CUDA. (Recall that on CUDA machines HIP is just a thin layer over CUDA, so the two code types can interoperate on nvcc platforms.) Also, the HIP port can be compared with the original CUDA code for function and performance.
|
||||
- Once the CUDA code is ported to HIP and is running on the CUDA machine, compile the HIP code using the HIP compiler on an AMD machine.
|
||||
- HIP ports can replace CUDA versions: HIP can deliver the same performance as a native CUDA implementation, with the benefit of portability to both Nvidia and AMD architectures as well as a path to future C++ standard support. You can handle platform-specific features through conditional compilation or by adding them to the open-source HIP infrastructure.
|
||||
- Use **[bin/hipconvertinplace-perl.sh](https://github.com/ROCm-Developer-Tools/HIP/blob/master/bin/hipconvertinplace-perl.sh)** to hipify all code files in the CUDA source directory.
|
||||
|
||||
### Scanning existing CUDA code to scope the porting effort
|
||||
The hipexamine-perl.sh tool will scan a source directory to determine which files contain CUDA code and how much of that code can be automatically hipified.
|
||||
```
|
||||
> cd examples/rodinia_3.0/cuda/kmeans
|
||||
> $HIP_DIR/bin/hipexamine-perl.sh .
|
||||
info: hipify ./kmeans.h =====>
|
||||
info: hipify ./unistd.h =====>
|
||||
info: hipify ./kmeans.c =====>
|
||||
info: hipify ./kmeans_cuda_kernel.cu =====>
|
||||
info: converted 40 CUDA->HIP refs( dev:0 mem:0 kern:0 builtin:37 math:0 stream:0 event:0 err:0 def:0 tex:3 other:0 ) warn:0 LOC:185
|
||||
info: hipify ./getopt.h =====>
|
||||
info: hipify ./kmeans_cuda.cu =====>
|
||||
info: converted 49 CUDA->HIP refs( dev:3 mem:32 kern:2 builtin:0 math:0 stream:0 event:0 err:0 def:0 tex:12 other:0 ) warn:0 LOC:311
|
||||
info: hipify ./rmse.c =====>
|
||||
info: hipify ./cluster.c =====>
|
||||
info: hipify ./getopt.c =====>
|
||||
info: hipify ./kmeans_clustering.c =====>
|
||||
info: TOTAL-converted 89 CUDA->HIP refs( dev:3 mem:32 kern:2 builtin:37 math:0 stream:0 event:0 err:0 def:0 tex:15 other:0 ) warn:0 LOC:3607
|
||||
kernels (1 total) : kmeansPoint(1)
|
||||
```
|
||||
|
||||
hipexamine-perl scans each code file (cpp, c, h, hpp, etc.) found in the specified directory:
|
||||
|
||||
* Files with no CUDA code (e.g., kmeans.h) print a one-line summary that just lists the source file name.
|
||||
* Files with CUDA code print a summary of what was found - for example the kmeans_cuda_kernel.cu file:
|
||||
```
|
||||
info: hipify ./kmeans_cuda_kernel.cu =====>
|
||||
info: converted 40 CUDA->HIP refs( dev:0 mem:0 kern:0 builtin:37 math:0 stream:0 event:0
|
||||
```
|
||||
* Interesting information in kmeans_cuda_kernel.cu :
|
||||
* How many CUDA calls were converted to HIP (40)
|
||||
* Breakdown of the CUDA functionality used (dev:0 mem:0 etc). This file uses many CUDA builtins (37) and texture functions (3).
|
||||
* Warning for code that looks like CUDA API but was not converted (0 in this file).
|
||||
* Count Lines-of-Code (LOC) - 185 for this file.
|
||||
|
||||
* hipexamine-perl also presents a summary at the end of the process for the statistics collected across all files. This has similar format to the per-file reporting, and also includes a list of all kernels which have been called. An example from above:
|
||||
|
||||
```shell
|
||||
info: TOTAL-converted 89 CUDA->HIP refs( dev:3 mem:32 kern:2 builtin:37 math:0 stream:0 event:0 err:0 def:0 tex:15 other:0 ) warn:0 LOC:3607
|
||||
kernels (1 total) : kmeansPoint(1)
|
||||
```
|
||||
|
||||
### Converting a project "in-place"
|
||||
|
||||
```shell
|
||||
> hipify-perl --inplace
|
||||
```
|
||||
|
||||
For each input file FILE, this script will:
|
||||
- If "FILE.prehip" file does not exist, copy the original code to a new file with extension ".prehip". Then hipify the code file.
|
||||
- If "FILE.prehip" file exists, hipify FILE.prehip and save to FILE.
|
||||
|
||||
This is useful for testing improvements to the hipify toolset.
|
||||
|
||||
|
||||
The [hipconvertinplace-perl.sh](https://github.com/ROCm-Developer-Tools/HIP/blob/master/bin/hipconvertinplace-perl.sh) script will perform inplace conversion for all code files in the specified directory.
|
||||
This can be quite handy when dealing with an existing CUDA code base since the script preserves the existing directory structure
|
||||
and filenames - and includes work. After converting in-place, you can review the code to add additional parameters to
|
||||
directory names.
|
||||
|
||||
|
||||
```shell
|
||||
> hipconvertinplace-perl.sh MY_SRC_DIR
|
||||
```
|
||||
|
||||
### Library Equivalents
|
||||
|
||||
| CUDA Library | ROCm Library | Comment |
|
||||
|------- | --------- | ----- |
|
||||
| cuBLAS | rocBLAS | Basic Linear Algebra Subroutines
|
||||
| cuFFT | rocFFT | Fast Fourier Transform Library
|
||||
| cuSPARSE | rocSPARSE | Sparse BLAS + SPMV
|
||||
| cuSolver | rocSOLVER | Lapack library
|
||||
| AMG-X | rocALUTION | Sparse iterative solvers and preconditioners with Geometric and Algebraic MultiGrid
|
||||
| Thrust | rocThrust | C++ parallel algorithms library
|
||||
| CUB | rocPRIM | Low Level Optimized Parallel Primitives
|
||||
| cuDNN | MIOpen | Deep learning Solver Library
|
||||
| cuRAND | rocRAND | Random Number Generator Library
|
||||
| EIGEN | EIGEN – HIP port | C++ template library for linear algebra: matrices, vectors, numerical solvers,
|
||||
| NCCL | RCCL | Communications Primitives Library based on the MPI equivalents
|
||||
|
||||
|
||||
|
||||
## Distinguishing Compiler Modes
|
||||
|
||||
|
||||
### Identifying HIP Target Platform
|
||||
All HIP projects target either AMD or NVIDIA platform. The platform affects which headers are included and which libraries are used for linking.
|
||||
|
||||
- `HIP_PLATFORM_AMD` is defined if the HIP platform targets AMD.
|
||||
Note, `HIP_PLATFORM_HCC` was previously defined if the HIP platform targeted AMD, it is deprecated.
|
||||
|
||||
- `HIP_PLATFORM_NVIDIA` is defined if the HIP platform targets NVIDIA.
|
||||
Note, `HIP_PLATFORM_NVCC` was previously defined if the HIP platform targeted NVIDIA, it is deprecated.
|
||||
|
||||
### Identifying the Compiler: hip-clang or nvcc
|
||||
Often, it's useful to know whether the underlying compiler is HIP-Clang or nvcc. This knowledge can guard platform-specific code or aid in platform-specific performance tuning.
|
||||
|
||||
```
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
// Compiled with HIP-Clang
|
||||
#endif
|
||||
```
|
||||
|
||||
```
|
||||
#ifdef __HIP_PLATFORM_NVIDIA__
|
||||
// Compiled with nvcc
|
||||
// Could be compiling with CUDA language extensions enabled (for example, a ".cu" file)
|
||||
// Could be in pass-through mode to an underlying host compiler (for example, a ".cpp" file)
|
||||
|
||||
```
|
||||
|
||||
```
|
||||
#ifdef __CUDACC__
|
||||
// Compiled with nvcc (CUDA language extensions enabled)
|
||||
```
|
||||
|
||||
HIP-Clang directly generates the host code (using the Clang x86 target) without passing the code to another host compiler. Thus, it has no equivalent of the `__CUDACC__` define.
|
||||
|
||||
|
||||
### Identifying Current Compilation Pass: Host or Device
|
||||
|
||||
nvcc makes two passes over the code: one for host code and one for device code.
|
||||
HIP-Clang will have multiple passes over the code: one for the host code, and one for each architecture on the device code.
|
||||
`__HIP_DEVICE_COMPILE__` is set to a nonzero value when the compiler (HIP-Clang or nvcc) is compiling code for a device inside a `__global__` kernel or for a device function. `__HIP_DEVICE_COMPILE__` can replace #ifdef checks on the `__CUDA_ARCH__` define.
|
||||
|
||||
```
|
||||
// #ifdef __CUDA_ARCH__
|
||||
#if __HIP_DEVICE_COMPILE__
|
||||
```
|
||||
|
||||
Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 1 or undefined, and it doesn't represent the feature capability of the target device.
|
||||
|
||||
### Compiler Defines: Summary
|
||||
|Define | HIP-Clang | nvcc | Other (GCC, ICC, Clang, etc.)
|
||||
|--- | --- | --- |---|
|
||||
|HIP-related defines:|
|
||||
|`__HIP_PLATFORM_AMD__`| Defined | Undefined | Defined if targeting AMD platform; undefined otherwise |
|
||||
|`__HIP_PLATFORM_NVIDIA__`| Undefined | Defined | Defined if targeting NVIDIA platform; undefined otherwise |
|
||||
|`__HIP_DEVICE_COMPILE__` | 1 if compiling for device; undefined if compiling for host |1 if compiling for device; undefined if compiling for host | Undefined
|
||||
|`__HIPCC__` | Defined | Defined | Undefined
|
||||
|`__HIP_ARCH_*` |0 or 1 depending on feature support (see below) | 0 or 1 depending on feature support (see below) | 0
|
||||
|nvcc-related defines:|
|
||||
|`__CUDACC__` | Defined if source code is compiled by nvcc; undefined otherwise | Undefined
|
||||
|`__NVCC__` | Undefined | Defined | Undefined
|
||||
|`__CUDA_ARCH__` | Undefined | Unsigned representing compute capability (e.g., "130") if in device code; 0 if in host code | Undefined
|
||||
|hip-clang-related defines:|
|
||||
|`__HIP__` | Defined | Undefined | Undefined
|
||||
|HIP-Clang common defines:|
|
||||
|`__clang__` | Defined | Defined | Undefined | Defined if using Clang; otherwise undefined
|
||||
|
||||
## Identifying Architecture Features
|
||||
|
||||
### HIP_ARCH Defines
|
||||
|
||||
Some CUDA code tests `__CUDA_ARCH__` for a specific value to determine whether the machine supports a certain architectural feature. For instance,
|
||||
|
||||
```
|
||||
#if (__CUDA_ARCH__ >= 130)
|
||||
// doubles are supported
|
||||
```
|
||||
This type of code requires special attention, since AMD and CUDA devices have different architectural capabilities. Moreover, you can't determine the presence of a feature using a simple comparison against an architecture's version number. HIP provides a set of defines and device properties to query whether a specific architectural feature is supported.
|
||||
|
||||
The `__HIP_ARCH_*` defines can replace comparisons of `__CUDA_ARCH__` values:
|
||||
```
|
||||
//#if (__CUDA_ARCH__ >= 130) // non-portable
|
||||
if __HIP_ARCH_HAS_DOUBLES__ { // portable HIP feature query
|
||||
// doubles are supported
|
||||
}
|
||||
```
|
||||
|
||||
For host code, the `__HIP_ARCH_*` defines are set to 0. You should only use the `__HIP_ARCH_*` defines in device code.
|
||||
|
||||
### Device-Architecture Properties
|
||||
|
||||
Host code should query the architecture feature flags in the device properties that hipGetDeviceProperties returns, rather than testing the "major" and "minor" fields directly:
|
||||
|
||||
```
|
||||
hipGetDeviceProperties(&deviceProp, device);
|
||||
//if ((deviceProp.major == 1 && deviceProp.minor < 2)) // non-portable
|
||||
if (deviceProp.arch.hasSharedInt32Atomics) { // portable HIP feature query
|
||||
// has shared int32 atomic operations ...
|
||||
}
|
||||
```
|
||||
|
||||
### Table of Architecture Properties
|
||||
The table below shows the full set of architectural properties that HIP supports.
|
||||
|
||||
|Define (use only in device code) | Device Property (run-time query) | Comment |
|
||||
|------- | --------- | ----- |
|
||||
|32-bit atomics:||
|
||||
|`__HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__` | hasGlobalInt32Atomics |32-bit integer atomics for global memory
|
||||
|`__HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__` | hasGlobalFloatAtomicExch |32-bit float atomic exchange for global memory
|
||||
|`__HIP_ARCH_HAS_SHARED_INT32_ATOMICS__` | hasSharedInt32Atomics |32-bit integer atomics for shared memory
|
||||
|`__HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__` | hasSharedFloatAtomicExch |32-bit float atomic exchange for shared memory
|
||||
|`__HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__` | hasFloatAtomicAdd |32-bit float atomic add in global and shared memory
|
||||
|64-bit atomics: | |
|
||||
|`__HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__` | hasGlobalInt64Atomics |64-bit integer atomics for global memory
|
||||
|`__HIP_ARCH_HAS_SHARED_INT64_ATOMICS__` | hasSharedInt64Atomics |64-bit integer atomics for shared memory
|
||||
|Doubles: | |
|
||||
|`__HIP_ARCH_HAS_DOUBLES__` | hasDoubles |Double-precision floating point
|
||||
|Warp cross-lane operations: | |
|
||||
|`__HIP_ARCH_HAS_WARP_VOTE__` | hasWarpVote |Warp vote instructions (any, all)
|
||||
|`__HIP_ARCH_HAS_WARP_BALLOT__` | hasWarpBallot |Warp ballot instructions
|
||||
|`__HIP_ARCH_HAS_WARP_SHUFFLE__` | hasWarpShuffle |Warp shuffle operations (shfl\_\*)
|
||||
|`__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__` | hasFunnelShift |Funnel shift two input words into one
|
||||
|Sync: | |
|
||||
|`__HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__` | hasThreadFenceSystem |threadfence\_system
|
||||
|`__HIP_ARCH_HAS_SYNC_THREAD_EXT__` | hasSyncThreadsExt |syncthreads\_count, syncthreads\_and, syncthreads\_or
|
||||
|Miscellaneous: | |
|
||||
|`__HIP_ARCH_HAS_SURFACE_FUNCS__` | hasSurfaceFuncs |
|
||||
|`__HIP_ARCH_HAS_3DGRID__` | has3dGrid | Grids and groups are 3D
|
||||
|`__HIP_ARCH_HAS_DYNAMIC_PARALLEL__` | hasDynamicParallelism |
|
||||
|
||||
|
||||
## Finding HIP
|
||||
|
||||
Makefiles can use the following syntax to conditionally provide a default HIP_PATH if one does not exist:
|
||||
|
||||
```
|
||||
HIP_PATH ?= $(shell hipconfig --path)
|
||||
```
|
||||
|
||||
## Identifying HIP Runtime
|
||||
|
||||
HIP can depend on rocclr, or cuda as runtime
|
||||
|
||||
- AMD platform
|
||||
On AMD platform, HIP uses Radeon Open Compute Common Language Runtime, called ROCclr.
|
||||
ROCclr is a virtual device interface through which HIP runtimes interact with different backends, which allows the runtimes to work on Linux as well as Windows without much effort.
|
||||
|
||||
- NVIDIA platform
|
||||
On Nvidia platform, HIP is just a thin layer on top of CUDA.
|
||||
On non-AMD platform, HIP runtime determines if cuda is available and can be used. If available, HIP_PLATFORM is set to nvidia and underneath CUDA path is used.
|
||||
|
||||
|
||||
## hipLaunchKernel
|
||||
|
||||
hipLaunchKernel is a variadic macro which accepts as parameters the launch configurations (grid dims, group dims, stream, dynamic shared size) followed by a variable number of kernel arguments.
|
||||
This sequence is then expanded into the appropriate kernel launch syntax depending on the platform.
|
||||
While this can be a convenient single-line kernel launch syntax, the macro implementation can cause issues when nested inside other macros. For example, consider the following:
|
||||
|
||||
```
|
||||
// Will cause compile error:
|
||||
#define MY_LAUNCH(command, doTrace) \
|
||||
{\
|
||||
if (doTrace) printf ("TRACE: %s\n", #command); \
|
||||
(command); /* The nested ( ) will cause compile error */\
|
||||
}
|
||||
|
||||
MY_LAUNCH (hipLaunchKernel(vAdd, dim3(1024), dim3(1), 0, 0, Ad), true, "firstCall");
|
||||
```
|
||||
|
||||
Avoid nesting macro parameters inside parentheses - here's an alternative that will work:
|
||||
|
||||
```
|
||||
#define MY_LAUNCH(command, doTrace) \
|
||||
{\
|
||||
if (doTrace) printf ("TRACE: %s\n", #command); \
|
||||
command;\
|
||||
}
|
||||
|
||||
MY_LAUNCH (hipLaunchKernel(vAdd, dim3(1024), dim3(1), 0, 0, Ad), true, "firstCall");
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
## Compiler Options
|
||||
|
||||
hipcc is a portable compiler driver that will call nvcc or HIP-Clang (depending on the target system) and attach all required include and library options. It passes options through to the target compiler. Tools that call hipcc must ensure the compiler options are appropriate for the target compiler.
|
||||
The `hipconfig` script may be helpful in identifying the target platform, compiler and runtime. It can also help set options appropriately.
|
||||
|
||||
### Compiler options supported on AMD platforms
|
||||
|
||||
Here are the main compiler options supported on AMD platforms by HIP-Clang.
|
||||
|
||||
| Option | Description |
|
||||
| ------ | ----------- |
|
||||
| --amdgpu-target=<gpu_arch> | [DEPRECATED] This option is being replaced by `--offload-arch=<target>`. Generate code for the given GPU target. Supported targets are gfx701, gfx801, gfx802, gfx803, gfx900, gfx906, gfx908, gfx1010, gfx1011, gfx1012, gfx1030, gfx1031. This option could appear multiple times on the same command line to generate a fat binary for multiple targets. |
|
||||
| --fgpu-rdc | Generate relocatable device code, which allows kernels or device functions calling device functions in different translation units. |
|
||||
| -ggdb | Equivalent to `-g` plus tuning for GDB. This is recommended when using ROCm's GDB to debug GPU code. |
|
||||
| --gpu-max-threads-per-block=<num> | Generate code to support up to the specified number of threads per block. |
|
||||
| -O<n> | Specify the optimization level. |
|
||||
| --offload-arch=<target> | Specify the AMD GPU [target ID](https://clang.llvm.org/docs/ClangOffloadBundlerFileFormat.html#target-id). |
|
||||
| -save-temps | Save the compiler generated intermediate files. |
|
||||
| -v | Show the compilation steps. |
|
||||
|
||||
## Linking Issues
|
||||
|
||||
### Linking With hipcc
|
||||
|
||||
hipcc adds the necessary libraries for HIP as well as for the accelerator compiler (nvcc or AMD compiler). We recommend linking with hipcc since it automatically links the binary to the necessary HIP runtime libraries. It also has knowledge on how to link and to manage the GPU objects.
|
||||
|
||||
### -lm Option
|
||||
|
||||
hipcc adds -lm by default to the link command.
|
||||
|
||||
|
||||
## Linking Code With Other Compilers
|
||||
|
||||
CUDA code often uses nvcc for accelerator code (defining and launching kernels, typically defined in .cu or .cuh files).
|
||||
It also uses a standard compiler (g++) for the rest of the application. nvcc is a preprocessor that employs a standard host compiler (gcc) to generate the host code.
|
||||
Code compiled using this tool can employ only the intersection of language features supported by both nvcc and the host compiler.
|
||||
In some cases, you must take care to ensure the data types and alignment of the host compiler are identical to those of the device compiler. Only some host compilers are supported---for example, recent nvcc versions lack Clang host-compiler capability.
|
||||
|
||||
HIP-Clang generates both device and host code using the same Clang-based compiler. The code uses the same API as gcc, which allows code generated by different gcc-compatible compilers to be linked together. For example, code compiled using HIP-Clang can link with code compiled using "standard" compilers (such as gcc, ICC and Clang). Take care to ensure all compilers use the same standard C++ header and library formats.
|
||||
|
||||
|
||||
### libc++ and libstdc++
|
||||
|
||||
hipcc links to libstdc++ by default. This provides better compatibility between g++ and HIP.
|
||||
|
||||
If you pass "--stdlib=libc++" to hipcc, hipcc will use the libc++ library. Generally, libc++ provides a broader set of C++ features while libstdc++ is the standard for more compilers (notably including g++).
|
||||
|
||||
When cross-linking C++ code, any C++ functions that use types from the C++ standard library (including std::string, std::vector and other containers) must use the same standard-library implementation. They include the following:
|
||||
|
||||
- Functions or kernels defined in HIP-Clang that are called from a standard compiler
|
||||
- Functions defined in a standard compiler that are called from HIP-Clang.
|
||||
|
||||
Applications with these interfaces should use the default libstdc++ linking.
|
||||
|
||||
Applications which are compiled entirely with hipcc, and which benefit from advanced C++ features not supported in libstdc++, and which do not require portability to nvcc, may choose to use libc++.
|
||||
|
||||
|
||||
### HIP Headers (hip_runtime.h, hip_runtime_api.h)
|
||||
|
||||
The hip_runtime.h and hip_runtime_api.h files define the types, functions and enumerations needed to compile a HIP program:
|
||||
|
||||
- hip_runtime_api.h: defines all the HIP runtime APIs (e.g., hipMalloc) and the types required to call them. A source file that is only calling HIP APIs but neither defines nor launches any kernels can include hip_runtime_api.h. hip_runtime_api.h uses no custom hc language features and can be compiled using a standard C++ compiler.
|
||||
- hip_runtime.h: includes hip_runtime_api.h. It additionally provides the types and defines required to create and launch kernels. hip_runtime.h can be compiled using a standard C++ compiler but will expose a subset of the available functions.
|
||||
|
||||
CUDA has slightly different contents for these two files. In some cases you may need to convert hipified code to include the richer hip_runtime.h instead of hip_runtime_api.h.
|
||||
|
||||
### Using a Standard C++ Compiler
|
||||
You can compile hip\_runtime\_api.h using a standard C or C++ compiler (e.g., gcc or ICC). The HIP include paths and defines (`__HIP_PLATFORM_AMD__` or `__HIP_PLATFORM_NVIDIA__`) must be passed to the standard compiler; hipconfig returns the necessary options:
|
||||
```
|
||||
> hipconfig --cpp_config
|
||||
-D__HIP_PLATFORM_AMD__ -I/home/user1/hip/include
|
||||
```
|
||||
|
||||
You can capture the hipconfig output and pass it to the standard compiler; below is a sample makefile syntax:
|
||||
|
||||
```
|
||||
CPPFLAGS += $(shell $(HIP_PATH)/bin/hipconfig --cpp_config)
|
||||
```
|
||||
|
||||
nvcc includes some headers by default. However, HIP does not include default headers, and instead all required files must be explicitly included.
|
||||
Specifically, files that call HIP run-time APIs or define HIP kernels must explicitly include the appropriate HIP headers.
|
||||
If the compilation process reports that it cannot find necessary APIs (for example, "error: identifier hipSetDevice is undefined"),
|
||||
ensure that the file includes hip_runtime.h (or hip_runtime_api.h, if appropriate).
|
||||
The hipify-perl script automatically converts "cuda_runtime.h" to "hip_runtime.h," and it converts "cuda_runtime_api.h" to "hip_runtime_api.h", but it may miss nested headers or macros.
|
||||
|
||||
#### cuda.h
|
||||
|
||||
The HIP-Clang path provides an empty cuda.h file. Some existing CUDA programs include this file but don't require any of the functions.
|
||||
|
||||
### Choosing HIP File Extensions
|
||||
|
||||
Many existing CUDA projects use the ".cu" and ".cuh" file extensions to indicate code that should be run through the nvcc compiler.
|
||||
For quick HIP ports, leaving these file extensions unchanged is often easier, as it minimizes the work required to change file names in the directory and #include statements in the files.
|
||||
|
||||
For new projects or ports which can be re-factored, we recommend the use of the extension ".hip.cpp" for source files, and
|
||||
".hip.h" or ".hip.hpp" for header files.
|
||||
This indicates that the code is standard C++ code, but also provides a unique indication for make tools to
|
||||
run hipcc when appropriate.
|
||||
|
||||
## Workarounds
|
||||
|
||||
### warpSize
|
||||
Code should not assume a warp size of 32 or 64. See [Warp Cross-Lane Functions](hip_kernel_language.md#warp-cross-lane-functions) for information on how to write portable wave-aware code.
|
||||
|
||||
### Kernel launch with group size > 256
|
||||
Kernel code should use ``` __attribute__((amdgpu_flat_work_group_size(<min>,<max>)))```.
|
||||
|
||||
For example:
|
||||
```
|
||||
__global__ void dot(double *a,double *b,const int n) __attribute__((amdgpu_flat_work_group_size(1, 512)))
|
||||
```
|
||||
|
||||
## memcpyToSymbol
|
||||
|
||||
HIP support for hipMemcpyToSymbol is complete. This feature allows a kernel
|
||||
to define a device-side data symbol which can be accessed on the host side. The symbol
|
||||
can be in `__constant__` or device space.
|
||||
|
||||
Note that the symbol name needs to be encased in the HIP_SYMBOL macro, as shown in the code example below. This also applies to hipMemcpyFromSymbol, hipGetSymbolAddress, and hipGetSymbolSize.
|
||||
|
||||
For example:
|
||||
|
||||
Device Code:
|
||||
```
|
||||
#include<hip/hip_runtime.h>
|
||||
#include<hip/hip_runtime_api.h>
|
||||
#include<iostream>
|
||||
|
||||
#define HIP_ASSERT(status) \
|
||||
assert(status == hipSuccess)
|
||||
|
||||
#define LEN 512
|
||||
#define SIZE 2048
|
||||
|
||||
__constant__ int Value[LEN];
|
||||
|
||||
__global__ void Get(hipLaunchParm lp, int *Ad)
|
||||
{
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
Ad[tid] = Value[tid];
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
int *A, *B, *Ad;
|
||||
A = new int[LEN];
|
||||
B = new int[LEN];
|
||||
for(unsigned i=0;i<LEN;i++)
|
||||
{
|
||||
A[i] = -1*i;
|
||||
B[i] = 0;
|
||||
}
|
||||
|
||||
HIP_ASSERT(hipMalloc((void**)&Ad, SIZE));
|
||||
|
||||
HIP_ASSERT(hipMemcpyToSymbol(HIP_SYMBOL(Value), A, SIZE, 0, hipMemcpyHostToDevice));
|
||||
hipLaunchKernel(Get, dim3(1,1,1), dim3(LEN,1,1), 0, 0, Ad);
|
||||
HIP_ASSERT(hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost));
|
||||
|
||||
for(unsigned i=0;i<LEN;i++)
|
||||
{
|
||||
assert(A[i] == B[i]);
|
||||
}
|
||||
std::cout<<"Passed"<<std::endl;
|
||||
}
|
||||
```
|
||||
|
||||
## CU_POINTER_ATTRIBUTE_MEMORY_TYPE
|
||||
To get a pointer's memory type in HIP/HIP-Clang, use the hipPointerGetAttributes API. The first parameter of the API is a hipPointerAttribute_t, which has 'memoryType' as a member variable. 'memoryType' indicates whether the input pointer is allocated on the device or the host.
|
||||
|
||||
For example:
|
||||
```
|
||||
double * ptr;
|
||||
hipMalloc(reinterpret_cast<void**>(&ptr), sizeof(double));
|
||||
hipPointerAttribute_t attr;
|
||||
hipPointerGetAttributes(&attr, ptr); /*attr.memoryType will have value as hipMemoryTypeDevice*/
|
||||
|
||||
double* ptrHost;
|
||||
hipHostMalloc(&ptrHost, sizeof(double));
|
||||
hipPointerAttribute_t attr;
|
||||
hipPointerGetAttributes(&attr, ptrHost); /*attr.memoryType will have value as hipMemoryTypeHost*/
|
||||
```
|
||||
|
||||
## threadfence_system
|
||||
Threadfence_system makes all device memory writes, all writes to mapped host memory, and all writes to peer memory visible to CPU and other GPU devices.
|
||||
Some implementations can provide this behavior by flushing the GPU L2 cache.
|
||||
HIP/HIP-Clang does not provide this functionality. As a workaround, users can set the environment variable `HSA_DISABLE_CACHE=1` to disable the GPU L2 cache. This will affect all accesses and for all kernels and so may have a performance impact.
|
||||
|
||||
### Textures and Cache Control
|
||||
|
||||
Compute programs sometimes use textures either to access dedicated texture caches or to use the texture-sampling hardware for interpolation and clamping. The former approach uses simple point samplers with linear interpolation, essentially only reading a single point. The latter approach uses the sampler hardware to interpolate and combine multiple samples. AMD hardware, as well as recent competing hardware, has a unified texture/L1 cache, so it no longer has a dedicated texture cache. But the nvcc path often caches global loads in the L2 cache, and some programs may benefit from explicit control of the L1 cache contents. We recommend the __ldg instruction for this purpose.
|
||||
|
||||
AMD compilers currently load all data into both the L1 and L2 caches, so __ldg is treated as a no-op.
|
||||
|
||||
We recommend the following for functional portability:
|
||||
|
||||
- For programs that use textures only to benefit from improved caching, use the __ldg instruction
|
||||
- Programs that use texture object and reference APIs, work well on HIP
|
||||
|
||||
|
||||
## More Tips
|
||||
|
||||
### HIP Logging
|
||||
|
||||
On an AMD platform, set the AMD_LOG_LEVEL environment variable to log HIP application execution information.
|
||||
|
||||
The value of the setting controls the logging level:
|
||||
|
||||
```
|
||||
enum LogLevel {
|
||||
LOG_NONE = 0,
|
||||
LOG_ERROR = 1,
|
||||
LOG_WARNING = 2,
|
||||
LOG_INFO = 3,
|
||||
LOG_DEBUG = 4
|
||||
};
|
||||
```
|
||||
|
||||
Logging mask is used to print types of functionalities during the execution of HIP application.
|
||||
It can be set as one of the following values,
|
||||
|
||||
```
|
||||
enum LogMask {
|
||||
LOG_API = 0x00000001, //!< API call
|
||||
LOG_CMD = 0x00000002, //!< Kernel and Copy Commands and Barriers
|
||||
LOG_WAIT = 0x00000004, //!< Synchronization and waiting for commands to finish
|
||||
LOG_AQL = 0x00000008, //!< Decode and display AQL packets
|
||||
LOG_QUEUE = 0x00000010, //!< Queue commands and queue contents
|
||||
LOG_SIG = 0x00000020, //!< Signal creation, allocation, pool
|
||||
LOG_LOCK = 0x00000040, //!< Locks and thread-safety code.
|
||||
LOG_KERN = 0x00000080, //!< kernel creations and arguments, etc.
|
||||
LOG_COPY = 0x00000100, //!< Copy debug
|
||||
LOG_COPY2 = 0x00000200, //!< Detailed copy debug
|
||||
LOG_RESOURCE = 0x00000400, //!< Resource allocation, performance-impacting events.
|
||||
LOG_INIT = 0x00000800, //!< Initialization and shutdown
|
||||
LOG_MISC = 0x00001000, //!< misc debug, not yet classified
|
||||
LOG_AQL2 = 0x00002000, //!< Show raw bytes of AQL packet
|
||||
LOG_CODE = 0x00004000, //!< Show code creation debug
|
||||
LOG_CMD2 = 0x00008000, //!< More detailed command info, including barrier commands
|
||||
LOG_LOCATION = 0x00010000, //!< Log message location
|
||||
LOG_ALWAYS = 0xFFFFFFFF, //!< Log always even mask flag is zero
|
||||
};
|
||||
```
|
||||
|
||||
### Debugging hipcc
|
||||
To see the detailed commands that hipcc issues, set the environment variable HIPCC_VERBOSE to 1. Doing so will print to stderr the HIP-clang (or nvcc) commands that hipcc generates.
|
||||
|
||||
```
|
||||
export HIPCC_VERBOSE=1
|
||||
make
|
||||
...
|
||||
hipcc-cmd: /opt/hcc/bin/hcc -hc -I/opt/hcc/include -stdlib=libc++ -I../../../../hc/include -I../../../../include/amd_detail/cuda -I../../../../include -x c++ -I../../common -O3 -c backprop_cuda.cu
|
||||
```
|
||||
|
||||
### What Does This Error Mean?
|
||||
|
||||
#### /usr/include/c++/v1/memory:5172:15: error: call to implicitly deleted default constructor of 'std::__1::bad_weak_ptr' throw bad_weak_ptr();
|
||||
|
||||
If you pass a ".cu" file, hcc will attempt to compile it as a CUDA language file. You must tell hcc that it's in fact a C++ file: use the "-x c++" option.
|
||||
|
||||
|
||||
### Editor Highlighting
|
||||
See the utils/vim or utils/gedit directories to add handy highlighting to hip files.
|
||||
|
||||
|
||||
@@ -1,150 +0,0 @@
|
||||
# HIP Programming Guide
|
||||
|
||||
## Host Memory
|
||||
|
||||
### Introduction
|
||||
hipHostMalloc allocates pinned host memory which is mapped into the address space of all GPUs in the system.
|
||||
There are two use cases for this host memory:
|
||||
- Faster HostToDevice and DeviceToHost Data Transfers:
|
||||
The runtime tracks the hipHostMalloc allocations and can avoid some of the setup required for regular unpinned memory. For exact measurements on a specific system, experiment with --unpinned and --pinned switches for the hipBusBandwidth tool.
|
||||
- Zero-Copy GPU Access:
|
||||
GPU can directly access the host memory over the CPU/GPU interconnect, without need to copy the data. This avoids the need for the copy, but during the kernel access each memory access must traverse the interconnect, which can be tens of times slower than accessing the GPU's local device memory. Zero-copy memory can be a good choice when the memory accesses are infrequent (perhaps only once). Zero-copy memory is typically "Coherent" and thus not cached by the GPU but this can be overridden if desired and is explained in more detail below.
|
||||
|
||||
### Memory allocation flags
|
||||
hipHostMalloc always sets the hipHostMallocPortable and hipHostMallocMapped flags. Both usage models described above use the same allocation flags, and the difference is in how the surrounding code uses the host memory.
|
||||
|
||||
hipHostMallocNumaUser is the flag to allow host memory allocation to follow numa policy set by user.
|
||||
|
||||
See the hipHostMalloc API for more information.
|
||||
|
||||
### Numa-aware host memory allocation
|
||||
Numa policy determines how memory is allocated.
|
||||
Target of Numa policy is to select a CPU that is closest to each GPU.
|
||||
Numa distance is a measure of how far apart the GPU and CPU devices are.
|
||||
|
||||
By default, each GPU selects a Numa CPU node that has the least Numa distance between them, that is, host memory will be automatically allocated closest on the memory pool of Numa node of the current GPU device. Using hipSetDevice API to a different GPU will still be able to access the host allocation, but can have longer Numa distance.
|
||||
|
||||
### Managed memory allocation
|
||||
Managed memory, except the `__managed__` keyword, is supported in HIP combined host/device compilation.
|
||||
The allocation will be automatically managed by AMD HMM (Heterogeneous Memory Management).
|
||||
|
||||
In a HIP application, a capability check should be performed before calling the managed memory API hipMallocManaged.
|
||||
|
||||
For example,
|
||||
```
|
||||
int managed_memory = 0;
|
||||
HIPCHECK(hipDeviceGetAttribute(&managed_memory,
|
||||
hipDeviceAttributeManagedMemory,p_gpuDevice));
|
||||
|
||||
if (!managed_memory ) {
|
||||
printf ("info: managed memory access not supported on the device %d\n Skipped\n", p_gpuDevice);
|
||||
}
|
||||
else {
|
||||
HIPCHECK(hipSetDevice(p_gpuDevice));
|
||||
HIPCHECK(hipMallocManaged(&Hmm, N * sizeof(T)));
|
||||
. . .
|
||||
}
|
||||
```
|
||||
For more details on managed memory APIs, please refer to the documentation HIP-API.pdf.
|
||||
|
||||
### HIP Stream Memory Operations
|
||||
|
||||
HIP supports Stream Memory Operations to enable direct synchronization between network nodes and the GPU. The following new APIs have been added:
|
||||
hipStreamWaitValue32
|
||||
hipStreamWaitValue64
|
||||
hipStreamWriteValue32
|
||||
hipStreamWriteValue64
|
||||
|
||||
For more details, please check the documentation HIP-API.pdf.
|
||||
|
||||
### Coherency Controls
|
||||
ROCm defines two coherency options for host memory:
|
||||
- Coherent memory : Supports fine-grain synchronization while the kernel is running. For example, a kernel can perform atomic operations that are visible to the host CPU or to other (peer) GPUs. Synchronization instructions include threadfence_system and C++11-style atomic operations. However, coherent memory cannot be cached by the GPU and thus may have lower performance.
|
||||
- Non-coherent memory : Can be cached by GPU, but cannot support synchronization while the kernel is running. Non-coherent memory can be optionally synchronized only at command (end-of-kernel or copy command) boundaries. This memory is appropriate for high-performance access when fine-grain synchronization is not required.
|
||||
|
||||
HIP provides the developer with controls to select which type of memory is used via allocation flags passed to hipHostMalloc and the HIP_HOST_COHERENT environment variable. By default, the environment variable HIP_HOST_COHERENT is set to 0 in HIP.
|
||||
- hipHostMallocCoherent=0, hipHostMallocNonCoherent=0: Use HIP_HOST_COHERENT environment variable,
|
||||
- If HIP_HOST_COHERENT is defined as 1, the host memory allocation is coherent.
|
||||
- If HIP_HOST_COHERENT is not defined, or defined as 0, the host memory allocation is non-coherent.
|
||||
- hipHostMallocCoherent=1, hipHostMallocNonCoherent=0: The host memory allocation will be coherent. HIP_HOST_COHERENT env variable is ignored.
|
||||
- hipHostMallocCoherent=0, hipHostMallocNonCoherent=1: The host memory allocation will be non-coherent. HIP_HOST_COHERENT env variable is ignored.
|
||||
- hipHostMallocCoherent=1, hipHostMallocNonCoherent=1: Illegal.
|
||||
|
||||
### Visibility of Zero-Copy Host Memory
|
||||
Coherent host memory is automatically visible at synchronization points.
|
||||
Non-coherent
|
||||
|
||||
| HIP API | Synchronization Effect | Fence | Coherent Host Memory Visibility | Non-Coherent Host Memory Visibility|
|
||||
| --- | --- | --- | --- | --- |
|
||||
| hipStreamSynchronize | host waits for all commands in the specified stream to complete | system-scope release | yes | yes |
|
||||
| hipDeviceSynchronize | host waits for all commands in all streams on the specified device to complete | system-scope release | yes | yes |
|
||||
| hipEventSynchronize | host waits for the specified event to complete | device-scope release | yes | depends - see below|
|
||||
| hipStreamWaitEvent | stream waits for the specified event to complete | none | yes | no |
|
||||
|
||||
|
||||
### hipEventSynchronize
|
||||
Developers can control the release scope for hipEvents:
|
||||
- By default, the GPU performs a device-scope acquire and release operation with each recorded event. This will make host and device memory visible to other commands executing on the same device.
|
||||
|
||||
A stronger system-level fence can be specified when the event is created with hipEventCreateWithFlags:
|
||||
- hipEventReleaseToSystem : Perform a system-scope release operation when the event is recorded. This will make both Coherent and Non-Coherent host memory visible to other agents in the system, but may involve heavyweight operations such as cache flushing. Coherent memory will typically use lighter-weight in-kernel synchronization mechanisms such as an atomic operation and thus does not need to use hipEventReleaseToSystem.
|
||||
- hipEventDisableTiming: Events created with this flag would not record profiling data and provide best performance if used for synchronization.
|
||||
|
||||
Note, for HIP Events used in kernel dispatch using hipExtLaunchKernelGGL/hipExtLaunchKernel, events passed in the API are not explicitly recorded and should only be used to get elapsed time for that specific launch.
|
||||
In case events are used across multiple dispatches, for example, start and stop events from different hipExtLaunchKernelGGL/hipExtLaunchKernel calls, they will be treated as invalid unrecorded events, HIP will throw error "hipErrorInvalidHandle" from hipEventElapsedTime.
|
||||
|
||||
### Summary and Recommendations:
|
||||
|
||||
- Coherent host memory is the default and is the easiest to use since the memory is visible to the CPU at typical synchronization points. This memory allows in-kernel synchronization commands such as threadfence_system to work transparently.
|
||||
- HIP/ROCm also supports the ability to cache host memory in the GPU using the "Non-Coherent" host memory allocations. This can provide performance benefit, but care must be taken to use the correct synchronization.
|
||||
|
||||
## Device-Side Malloc
|
||||
|
||||
HIP-Clang currently doesn't support device-side malloc and free.
|
||||
|
||||
## Use of Long Double Type
|
||||
|
||||
In HIP-Clang, long double type is 80-bit extended precision format for x86_64, which is not supported by AMDGPU. HIP-Clang treats long double type as IEEE double type for AMDGPU. Using long double type in HIP source code will not cause issue as long as data of long double type is not transferred between host and device. However, long double type should not be used as kernel argument type.
|
||||
|
||||
## Use of _Float16 Type
|
||||
|
||||
If a host function is to be used between clang (or hipcc) and gcc for x86_64, i.e. its definition is compiled by one compiler but the caller is compiled by a different compiler, _Float16 or aggregates containing _Float16 should not be used as function argument or return type. This is due to lack of stable ABI for _Float16 on x86_64. Passing _Float16 or aggregates containing _Float16 between clang and gcc could cause undefined behavior.
|
||||
|
||||
## FMA and contractions
|
||||
|
||||
By default HIP-Clang assumes -ffp-contract=fast-honor-pragmas.
|
||||
Users can use '#pragma clang fp contract(on|off|fast)' to control fp contraction of a block of code.
|
||||
For x86_64, FMA is off by default since the generic x86_64 target does not
|
||||
support FMA by default. To turn on FMA on x86_64, either use -mfma or -march=native
|
||||
on CPU's supporting FMA.
|
||||
|
||||
When contractions are enabled and the CPU has not enabled FMA instructions, the
|
||||
GPU can produce different numerical results than the CPU for expressions that
|
||||
can be contracted. Tolerance should be used for floating point comparisons.
|
||||
|
||||
## Math functions with special rounding modes
|
||||
|
||||
HIP does not support math functions with rounding modes ru (round up), rd (round down), and rz (round towards zero). HIP only supports math function with rounding mode rn (round to nearest). The math functions with postfixes _ru, _rd and _rz are implemented in the same way as math functions with postfix _rn. They serve as a workaround to get programs using them compiled.
|
||||
|
||||
## Creating Static Libraries
|
||||
|
||||
HIP-Clang supports generating two types of static libraries. The first type of static library does not export device functions, and only exports and launches host functions within the same library. The advantage of this type is the ability to link with a non-hipcc compiler such as gcc. The second type exports device functions to be linked by other code objects. However this requires using hipcc as the linker.
|
||||
|
||||
In addition, the first type of library contains host objects with device code embedded as fat binaries. It is generated using the flag --emit-static-lib. The second type of library contains relocatable device objects and is generated using ar.
|
||||
|
||||
Here is an example to create and use static libraries:
|
||||
- Type 1 using --emit-static-lib:
|
||||
```
|
||||
hipcc hipOptLibrary.cpp --emit-static-lib -fPIC -o libHipOptLibrary.a
|
||||
gcc test.cpp -L. -lhipOptLibrary -L/path/to/hip/lib -lamdhip64 -o test.out
|
||||
```
|
||||
- Type 2 using system ar:
|
||||
```
|
||||
hipcc hipDevice.cpp -c -fgpu-rdc -o hipDevice.o
|
||||
ar rcsD libHipDevice.a hipDevice.o
|
||||
hipcc libHipDevice.a test.cpp -fgpu-rdc -o test.out
|
||||
```
|
||||
|
||||
For more information, please see samples/2_Cookbook/15_static_library/host_functions and samples/2_Cookbook/15_static_library/device_functions.
|
||||
|
||||
## [Supported Clang Options](clang_options.md)
|
||||
@@ -1,40 +0,0 @@
|
||||
# Table Comparing Syntax for Different Compute APIs
|
||||
|
||||
|Term|CUDA|HIP|HC|C++AMP|OpenCL|
|
||||
|---|---|---|---|---|---|
|
||||
|Device|`int deviceId`|`int deviceId`|`hc::accelerator`|`concurrency::`<br>`accelerator`|`cl_device`
|
||||
|Queue|`cudaStream_t`|`hipStream_t`|`hc::`<br>`accelerator_view`|`concurrency::`<br>`accelerator_view`|`cl_command_queue`
|
||||
|Event|`cudaEvent_t`|`hipEvent_t`|`hc::`<br>`completion_future`|`concurrency::`<br>`completion_future`|`cl_event`
|
||||
|Memory|`void *`|`void *`|`void *`; `hc::array`; `hc::array_view`|`concurrency::array`;<br>`concurrency::array_view`|`cl_mem`
|
||||
|||||
|
||||
| |grid|grid|extent|extent|NDRange
|
||||
| |block|block|tile|tile|work-group
|
||||
| |thread|thread|thread|thread|work-item
|
||||
| |warp|warp|wavefront|N/A|sub-group
|
||||
|||||
|
||||
|Thread-<br>index | threadIdx.x | hipThreadIdx_x | t_idx.local[0] | t_idx.local[0] | get_local_id(0) |
|
||||
|Block-<br>index | blockIdx.x | hipBlockIdx_x | t_idx.tile[0] | t_idx.tile[0] | get_group_id(0) |
|
||||
|Block-<br>dim | blockDim.x | hipBlockDim_x | t_ext.tile_dim[0]| t_idx.tile_dim0 | get_local_size(0) |
|
||||
|Grid-dim | gridDim.x | hipGridDim_x | t_ext[0]| t_ext[0] | get_global_size(0) |
|
||||
|||||
|
||||
|Device Kernel|`__global__`|`__global__`|lambda inside `hc::`<br>`parallel_for_each` or [[hc]]|`restrict(amp)`|`__kernel`
|
||||
|Device Function|`__device__`|`__device__`|`[[hc]]` (detected automatically in many case)|`restrict(amp)`|Implied in device compilation
|
||||
|Host Function|`__host__` (default)|`__host__` (default)|`[[cpu]]` (default)|`restrict(cpu)` (default)|Implied in host compilation.
|
||||
|Host + Device Function|`__host__` `__device__`|`__host__` `__device__`| `[[hc]]` `[[cpu]]`|`restrict(amp,cpu)`|No equivalent
|
||||
|Kernel Launch|`<<< >>>`|`hipLaunchKernel`|`hc::`<br>`parallel_for_each`|`concurrency::`<br>`parallel_for_each`|`clEnqueueNDRangeKernel`
|
||||
||||||
|
||||
|Global Memory|`__global__`|`__global__`|Unnecessary / Implied|Unnecessary / Implied|`__global`
|
||||
|Group Memory|`__shared__`|`__shared__`|`tile_static`|`tile_static`|`__local`
|
||||
|Constant|`__constant__`|`__constant__`|Unnecessary / Implied|Unnecessary / Implied|`__constant`
|
||||
||||||
|
||||
||`__syncthreads`|`__syncthreads`|`tile_static.barrier()`|`t_idx.barrier()`|`barrier(CLK_LOCAL_MEMFENCE)`
|
||||
|Atomic Builtins|`atomicAdd`|`atomicAdd`|`hc::atomic_fetch_add`|`concurrency::`<br>`atomic_fetch_add`|`atomic_add`
|
||||
|Precise Math|`cos(f)`|`cos(f)`|`hc::`<br>`precise_math::cos(f)`|`concurrency::`<br>`precise_math::cos(f)`|`cos(f)`
|
||||
|Fast Math|`__cos(f)`|`__cos(f)`|`hc::`<br>`fast_math::cos(f)`|`concurrency::`<br>`fast_math::cos(f)`|`native_cos(f)`
|
||||
|Vector|`float4`|`float4`|`hc::`<br>`short_vector::float4`|`concurrency::`<br>`graphics::float_4`|`float4`
|
||||
|
||||
### Notes
|
||||
1. For HC and C++AMP, assume a captured _tiled_ext_ named "t_ext" and captured _extent_ named "ext". These languages use captured variables to pass information to the kernel rather than using special built-in functions so the exact variable name may vary.
|
||||
2. The indexing functions (starting with `thread-index`) show the terminology for a 1D grid. Some APIs use reverse order of xyz / 012 indexing for 3D grids.
|
||||
3. HC allows tile dimensions to be specified at runtime while C++AMP requires that tile dimensions be specified at compile-time. Thus hc syntax for tile dims is `t_ext.tile_dim[0]` while C++AMP is t_ext.tile_dim0.
|
||||
|
||||
@@ -1,20 +0,0 @@
|
||||
# Terms used in HIP Documentation
|
||||
|
||||
- host, host cpu : Executes the HIP runtime API and is capable of initiating kernel launches to one or more devices.
|
||||
- default device : Each host thread maintains a "default device".
|
||||
Most HIP runtime APIs (including memory allocation, copy commands, kernel launches) do not accept an explicit device
|
||||
argument but instead implicitly use the default device.
|
||||
The default device can be set with hipSetDevice.
|
||||
|
||||
- "active host thread" - the thread which is running the HIP APIs.
|
||||
|
||||
- HIP-Clang - Heterogeneous AMDGPU Compiler, with its capability to compile HIP programs on AMD platform (https://github.com/RadeonOpenCompute/llvm-project).
|
||||
|
||||
- ROCclr - a virtual device interface through which compute runtimes interact with different backends such as ROCr on Linux or PAL on Windows.
|
||||
The ROCclr (https://github.com/ROCm-Developer-Tools/ROCclr) is an abstraction layer allowing runtimes to work on both OSes without much effort.
|
||||
|
||||
- hipify tools - tools to convert CUDA code to portable C++ code (https://github.com/ROCm-Developer-Tools/HIPIFY).
|
||||
|
||||
- hipconfig - tool to report various configuration properties of the target platform.
|
||||
|
||||
- nvcc = nvcc compiler, do not capitalize.
|
||||
@@ -1,67 +0,0 @@
|
||||
# ROCm Code Object tooling
|
||||
|
||||
ROCm compiler generated code objects (executables, object files, and shared object libraries) can be examined and code objects extracted with the following tools.
|
||||
|
||||
## URI syntax:
|
||||
|
||||
ROCm Code Objects can be listed/accessed using the following URI syntax:
|
||||
```
|
||||
code_object_uri ::== file_uri | memory_uri
|
||||
file_uri ::== file:// extract_file [ range_specifier ]
|
||||
memory_uri ::== memory:// process_id range_specifier
|
||||
range_specifier ::== [ # | ? ] offset= number & size= number
|
||||
extract_file ::== URI_ENCODED_OS_FILE_PATH
|
||||
process_id ::== DECIMAL_NUMBER
|
||||
number ::== HEX_NUMBER | DECIMAL_NUMBER | OCTAL_NUMBER
|
||||
```
|
||||
Example: file://dir1/dir2/hello_world#offset=133&size=14472
|
||||
memory://1234#offset=0x20000&size=3000
|
||||
|
||||
|
||||
## List available ROCm Code Objects: roc-obj-ls
|
||||
|
||||
Use this tool to list available ROCm code objects. Code objects are listed using URI syntax.
|
||||
|
||||
Usage: roc-obj-ls [-v|h] executable...
|
||||
List the URIs of the code objects embedded in the specified host executables.
|
||||
-v Verbose output (includes Entry ID)
|
||||
-h Show this help message
|
||||
|
||||
|
||||
## Extract ROCm Code Objects: roc-obj-extract
|
||||
|
||||
Extracts available ROCm code objects from specified URI.
|
||||
|
||||
Usage: roc-obj-extract [-o|v|h] URI...
|
||||
- URIs can be read from STDIN, one per line.
|
||||
- From the URIs specified, extracts code objects into files named: <executable_name>-[pid<number>]-offset<number>-size<number>.co
|
||||
|
||||
Options:
|
||||
-o <path> Path for output. If "-" specified, code object is printed to STDOUT.
|
||||
-v Verbose output (includes Entry ID).
|
||||
-h Show this help message
|
||||
|
||||
Note, when specifying a URI argument to roc-obj-extract, if cut and pasting the output from roc-obj-ls you need to escape the '&' character or your shell will interpret it as the option to run the command as a background process.
|
||||
As an example, if roc-obj-ls generates a URI like this ```file://my_exe#offset=24576&size=46816```, you need to use the following argument to roc-obj-extract: ```file://my_exe#offset=24576\&size=46816```
|
||||
|
||||
## Examples:
|
||||
|
||||
### Dump all code objects to current directory:
|
||||
roc-obj-ls <exe> | roc-obj-extract
|
||||
|
||||
### Dump the ISA for gfx906:
|
||||
roc-obj-ls -v <exe> | grep "gfx906" | awk '{print $2}' | roc-obj-extract -o - | llvm-objdump -d - > <exe>.gfx906.isa
|
||||
|
||||
### Check the e_flags of the gfx908 code object:
|
||||
roc-obj-ls -v <exe> | grep "gfx908" | awk '{print $2}' | roc-obj-extract -o - | llvm-readelf -h - | grep Flags
|
||||
|
||||
### Disassemble the fourth code object:
|
||||
roc-obj-ls <exe> | sed -n 4p | roc-obj-extract -o - | llvm-objdump -d -
|
||||
|
||||
### Sort embedded code objects by size:
|
||||
for uri in $(roc-obj-ls <exe>); do printf "%d: %s\n" "$(roc-obj-extract -o - "$uri" | wc -c)" "$uri"; done | sort -n
|
||||
|
||||
### Compare disassembly of gfx803 and gfx900 code objects:
|
||||
dis() { roc-obj-ls -v <exe> | grep "$1" | awk '{print $2}' | roc-obj-extract -o - | llvm-objdump -d -; }
|
||||
diff <(dis gfx803) <(dis gfx900)
|
||||
|
||||
@@ -1,277 +0,0 @@
|
||||
# Copyright (C) 2020-2021 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
@PACKAGE_INIT@
|
||||
include(CheckCXXCompilerFlag)
|
||||
# Try to load CMake's CMakeFindDependencyMacro module. When the module is not
# available, define a local find_dependency() macro instead.
include(CMakeFindDependencyMacro OPTIONAL RESULT_VARIABLE _CMakeFindDependencyMacro_FOUND)
if(NOT _CMakeFindDependencyMacro_FOUND)
  # find_dependency(<dep> [<version>])
  #
  # Calls find_package(<dep>) unless <dep>_FOUND is already true, forwarding
  # the EXACT / QUIET / REQUIRED flags of the package currently being found
  # (${CMAKE_FIND_PACKAGE_NAME}). When the dependency is missing, the current
  # package is marked as not found and return() is invoked; because this is a
  # macro, that return() takes effect in the file that called it.
  macro(find_dependency dep)
    if(NOT ${dep}_FOUND)
      # Optional second argument: the version to request.
      set(cmake_fd_version)
      if(${ARGC} GREATER 1)
        set(cmake_fd_version ${ARGV1})
      endif()

      # Mirror the find flags of the enclosing package.
      set(cmake_fd_required_arg)
      if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED)
        set(cmake_fd_required_arg REQUIRED)
      endif()
      set(cmake_fd_quiet_arg)
      if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY)
        set(cmake_fd_quiet_arg QUIET)
      endif()
      set(cmake_fd_exact_arg)
      if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION_EXACT)
        set(cmake_fd_exact_arg EXACT)
      endif()

      find_package(${dep} ${cmake_fd_version} ${cmake_fd_exact_arg} ${cmake_fd_quiet_arg} ${cmake_fd_required_arg})

      # Accept either <dep>_FOUND or the upper-case <DEP>_FOUND as success.
      string(TOUPPER ${dep} cmake_dep_upper)
      if(NOT ${dep}_FOUND AND NOT ${cmake_dep_upper}_FOUND)
        set(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE "${CMAKE_FIND_PACKAGE_NAME} could not be found because dependency ${dep} could not be found.")
        set(${CMAKE_FIND_PACKAGE_NAME}_FOUND False)
        return()
      endif()

      # Clear the helper variables so they do not leak into the caller.
      set(cmake_fd_exact_arg)
      set(cmake_fd_quiet_arg)
      set(cmake_fd_required_arg)
      set(cmake_fd_version)
    endif()
  endmacro()
endif()
|
||||
|
||||
# Use a single parallel job unless HIP_CLANG_NUM_PARALLEL_JOBS was already
# defined before this file was loaded.
if(NOT DEFINED HIP_CLANG_NUM_PARALLEL_JOBS)
  set(HIP_CLANG_NUM_PARALLEL_JOBS 1)
endif()

# Values substituted into this template when it is configured.
set(HIP_COMPILER "@HIP_COMPILER@")
set(HIP_RUNTIME "@HIP_RUNTIME@")

# Installation directories of the package.
# NOTE(review): set_and_check() is presumably provided by @PACKAGE_INIT@ - confirm.
set_and_check(hip_INCLUDE_DIR "@PACKAGE_INCLUDE_INSTALL_DIR@")
set_and_check(hip_INCLUDE_DIRS "${hip_INCLUDE_DIR}")
set_and_check(hip_LIB_INSTALL_DIR "@PACKAGE_LIB_INSTALL_DIR@")
set_and_check(hip_BIN_INSTALL_DIR "@PACKAGE_BIN_INSTALL_DIR@")

# Tools located in the bin directory.
set_and_check(hip_HIPCC_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipcc")
set_and_check(hip_HIPCONFIG_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipconfig")
|
||||
|
||||
# Resolve the platform-specific installation root.
if(NOT WIN32)
  # Linux: ROCM_PATH from the environment takes precedence. Otherwise an
  # already defined ROCM_PATH is kept, and /opt/rocm is the default.
  # If HIP is not installed under ROCm, this is needed to find HSA, assuming
  # HSA is under ROCm.
  if(DEFINED ENV{ROCM_PATH})
    set(ROCM_PATH "$ENV{ROCM_PATH}")
  elseif(NOT DEFINED ROCM_PATH)
    set(ROCM_PATH /opt/rocm)
  endif()
else()
  # Windows: HIP_PATH comes from the HIP_PATH environment variable, then from
  # HIP_DIR, and finally from the default SDK location.
  if(DEFINED ENV{HIP_PATH})
    set(HIP_PATH "$ENV{HIP_PATH}")
  elseif(DEFINED ENV{HIP_DIR})
    set(HIP_PATH "$ENV{HIP_DIR}")
  else()
    set(HIP_PATH "C:/Program Files/AMD HIP SDK/hip")
  endif()
endif()
|
||||
|
||||
# Locate the clang toolchain used for HIP and its builtin include directory.
if(HIP_COMPILER STREQUAL "clang")
  # Default toolchain root; may be refined from the compiler path below.
  if(NOT WIN32)
    set(HIP_CLANG_ROOT "${ROCM_PATH}/llvm")
  else()
    set(HIP_CLANG_ROOT "${HIP_PATH}")
  endif()

  # Fall back to the C++ compiler when no HIP compiler was given.
  if(NOT HIP_CXX_COMPILER)
    set(HIP_CXX_COMPILER ${CMAKE_CXX_COMPILER})
  endif()

  if(HIP_CXX_COMPILER MATCHES ".*hipcc")
    # Ask hipcc for its version text and take the parent directory of the
    # reported "InstalledDir:" as the toolchain root.
    execute_process(
      COMMAND ${HIP_CXX_COMPILER} --version
      OUTPUT_STRIP_TRAILING_WHITESPACE
      OUTPUT_VARIABLE HIP_CLANG_CXX_COMPILER_VERSION_OUTPUT)
    if(HIP_CLANG_CXX_COMPILER_VERSION_OUTPUT MATCHES "InstalledDir:[ \t]*([^\n]*)")
      get_filename_component(HIP_CLANG_ROOT "${CMAKE_MATCH_1}" DIRECTORY)
    endif()
  elseif(HIP_CXX_COMPILER MATCHES ".*clang\\+\\+")
    # The root is two directory levels above the clang++ path.
    get_filename_component(HIP_CLANG_ROOT "${HIP_CXX_COMPILER}" DIRECTORY)
    get_filename_component(HIP_CLANG_ROOT "${HIP_CLANG_ROOT}" DIRECTORY)
  endif()

  # Search only the versioned clang resource directories for stddef.h.
  file(GLOB HIP_CLANG_INCLUDE_SEARCH_PATHS ${HIP_CLANG_ROOT}/lib/clang/*/include)
  find_path(HIP_CLANG_INCLUDE_PATH stddef.h
    HINTS ${HIP_CLANG_INCLUDE_SEARCH_PATHS}
    NO_DEFAULT_PATH)

  if(NOT WIN32)
    find_dependency(AMDDeviceLibs)
  endif()

  # Default GPU architectures; both are cache entries.
  set(AMDGPU_TARGETS "gfx900;gfx906;gfx908" CACHE STRING "AMD GPU targets to compile for")
  set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU targets to compile for")
endif()
|
||||
|
||||
# amd_comgr is looked up on non-Windows platforms only.
if(NOT WIN32)
  find_dependency(amd_comgr)
endif()

# Load the exported target definitions that sit next to this file.
include("${CMAKE_CURRENT_LIST_DIR}/hip-targets.cmake")

# Use find_dependency to locate the packages the exported targets rely on.
# This lets the CMake-generated xxxx-targets file supply the linker libraries
# without worrying about other transitive dependencies.
if(NOT WIN32)
  find_dependency(hsa-runtime64)
  find_dependency(Threads)
endif()

# get_filename_component cannot resolve the symlinks if called from
# /opt/rocm/lib/hip, so resolve the real path of this directory first and then
# go three levels up from it.
get_filename_component(_DIR "${CMAKE_CURRENT_LIST_DIR}" REALPATH)
get_filename_component(_IMPORT_PREFIX "${_DIR}/../../../" REALPATH)
|
||||
|
||||
# Locate the HSA headers. Windows doesn't need HSA.
if(NOT WIN32)
  # If HSA is not under ROCm, provide CMAKE_PREFIX_PATH=<HSA_PATH>.
  find_path(HSA_HEADER hsa/hsa.h
    PATHS
      "${_IMPORT_PREFIX}/../include"
      /opt/rocm/include
  )

  # On failure find_path() stores "HSA_HEADER-NOTFOUND" in HSA_HEADER, which
  # if() treats as false. Test the variable itself: the literal
  # `HSA_HEADER-NOTFOUND` is a constant that is always false, so a check
  # written against it can never report the missing header.
  if(NOT HSA_HEADER)
    message(FATAL_ERROR "HSA header not found! ROCM_PATH environment not set")
  endif()
endif()
|
||||
|
||||
# Platform defines for consumers of hip::host.
# Right now this is only supported for amd platforms.
set_property(TARGET hip::host PROPERTY
  INTERFACE_COMPILE_DEFINITIONS "__HIP_PLATFORM_HCC__=1;__HIP_PLATFORM_AMD__=1")

if(HIP_RUNTIME MATCHES "rocclr")
  # The runtime library exposes the HIP include directory plus the directory
  # stored in HSA_HEADER, both as regular and as system include directories.
  set_property(TARGET hip::amdhip64 PROPERTY
    INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include;${HSA_HEADER}")
  set_property(TARGET hip::amdhip64 PROPERTY
    INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include;${HSA_HEADER}")

  # Report the type of the imported runtime target.
  get_target_property(amdhip64_type hip::amdhip64 TYPE)
  message(STATUS "hip::amdhip64 is ${amdhip64_type}")

  if(NOT WIN32)
    # Device code uses the include directory one level above the import prefix.
    set_property(TARGET hip::device PROPERTY
      INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/../include")
    set_property(TARGET hip::device PROPERTY
      INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/../include")
  endif()
endif()
|
||||
|
||||
if(HIP_COMPILER STREQUAL "clang")
|
||||
get_property(compilePropIsSet TARGET hip::device PROPERTY INTERFACE_COMPILE_OPTIONS SET)
|
||||
|
||||
if (NOT compilePropIsSet AND HIP_CXX_COMPILER MATCHES ".*clang\\+\\+")
|
||||
set_property(TARGET hip::device APPEND PROPERTY
|
||||
INTERFACE_COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:SHELL:-mllvm;-amdgpu-early-inline-all=true;-mllvm;-amdgpu-function-calls=false>"
|
||||
)
|
||||
endif()
|
||||
|
||||
if (NOT compilePropIsSet)
|
||||
set_property(TARGET hip::device APPEND PROPERTY
|
||||
INTERFACE_COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:SHELL:-x hip>"
|
||||
)
|
||||
if (NOT EXISTS ${AMD_DEVICE_LIBS_PREFIX}/amdgcn/bitcode)
|
||||
# This path is to support an older build of the device library
|
||||
# TODO: To be removed in the future.
|
||||
if(WIN32)
|
||||
set_property(TARGET hip::device APPEND PROPERTY
|
||||
INTERFACE_COMPILE_OPTIONS -xhip -fms-extensions -fms-compatibility
|
||||
)
|
||||
set_property(TARGET hip::device APPEND PROPERTY
|
||||
INTERFACE_COMPILE_OPTIONS "--hip-device-lib-path=${HIP_PATH}/lib/bitcode"
|
||||
)
|
||||
else()
|
||||
set_property(TARGET hip::device APPEND PROPERTY
|
||||
INTERFACE_COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:--hip-device-lib-path=${AMD_DEVICE_LIBS_PREFIX}/lib>"
|
||||
)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set_property(TARGET hip::device APPEND PROPERTY
|
||||
INTERFACE_LINK_LIBRARIES "$<$<LINK_LANGUAGE:CXX>:--hip-link>"
|
||||
)
|
||||
|
||||
set_property(TARGET hip::device APPEND PROPERTY
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${HIP_CLANG_INCLUDE_PATH}/.."
|
||||
)
|
||||
|
||||
set_property(TARGET hip::device APPEND PROPERTY
|
||||
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_CLANG_INCLUDE_PATH}/.."
|
||||
)
|
||||
|
||||
foreach(GPU_TARGET ${GPU_TARGETS})
|
||||
if (NOT compilePropIsSet)
|
||||
set_property(TARGET hip::device APPEND PROPERTY
|
||||
INTERFACE_COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:--offload-arch=${GPU_TARGET}>"
|
||||
)
|
||||
endif()
|
||||
set_property(TARGET hip::device APPEND PROPERTY
|
||||
INTERFACE_LINK_LIBRARIES "$<$<LINK_LANGUAGE:CXX>:--offload-arch=${GPU_TARGET}>"
|
||||
)
|
||||
endforeach()
|
||||
# Add support for parallel build and link.
# The variable is named without ${} so that an empty or undefined
# CMAKE_CXX_COMPILER_ID makes the test false instead of producing a malformed
# if() expression.
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
  check_cxx_compiler_flag("-parallel-jobs=1" HIP_CLANG_SUPPORTS_PARALLEL_JOBS)
endif()
if(HIP_CLANG_NUM_PARALLEL_JOBS GREATER 1)
  if(HIP_CLANG_SUPPORTS_PARALLEL_JOBS)
    if(NOT compilePropIsSet)
      set_property(TARGET hip::device APPEND PROPERTY
        INTERFACE_COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CXX>:-parallel-jobs=${HIP_CLANG_NUM_PARALLEL_JOBS};-Wno-format-nonliteral>"
      )
    endif()
    set_property(TARGET hip::device APPEND PROPERTY
      INTERFACE_LINK_LIBRARIES "$<$<LINK_LANGUAGE:CXX>:-parallel-jobs=${HIP_CLANG_NUM_PARALLEL_JOBS}>"
    )
  else()
    message("clang compiler doesn't support parallel jobs")
  endif()
endif()
|
||||
|
||||
# Add support for __fp16 and _Float16, explicitly link with compiler-rt.
# The same static builtins archive is appended to both hip::host and hip::device.
set_property(TARGET hip::host APPEND PROPERTY
  INTERFACE_LINK_LIBRARIES "$<$<LINK_LANGUAGE:CXX>:${HIP_CLANG_INCLUDE_PATH}/../lib/linux/libclang_rt.builtins-x86_64.a>"
)
set_property(TARGET hip::device APPEND PROPERTY
  INTERFACE_LINK_LIBRARIES "$<$<LINK_LANGUAGE:CXX>:${HIP_CLANG_INCLUDE_PATH}/../lib/linux/libclang_rt.builtins-x86_64.a>"
)
|
||||
endif()
|
||||
|
||||
# Library variables: both names resolve to the imported host and device targets.
set(hip_LIBRARIES hip::host hip::device)
set(hip_LIBRARY ${hip_LIBRARIES})

# Upper-case aliases of the lower-case hip_* variables.
set(HIP_INCLUDE_DIR ${hip_INCLUDE_DIR})
set(HIP_INCLUDE_DIRS ${hip_INCLUDE_DIRS})
set(HIP_LIB_INSTALL_DIR ${hip_LIB_INSTALL_DIR})
set(HIP_BIN_INSTALL_DIR ${hip_BIN_INSTALL_DIR})
set(HIP_LIBRARIES ${hip_LIBRARIES})
set(HIP_LIBRARY ${hip_LIBRARY})
set(HIP_HIPCC_EXECUTABLE ${hip_HIPCC_EXECUTABLE})
set(HIP_HIPCONFIG_EXECUTABLE ${hip_HIPCONFIG_EXECUTABLE})
|
||||
|
||||
@@ -1,132 +0,0 @@
|
||||
# Copyright (C) 2021 Kitware, Inc. All Rights Reserved.
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
@PACKAGE_INIT@
|
||||
# Use CMake's own find_dependency() when the module is available; otherwise
# define a local fallback with the same call shape.
include(CMakeFindDependencyMacro OPTIONAL RESULT_VARIABLE _CMakeFindDependencyMacro_FOUND)
if(NOT _CMakeFindDependencyMacro_FOUND)
  # find_dependency(<dep> [<version>])
  #
  # Calls find_package(<dep>) forwarding the EXACT / QUIET / REQUIRED settings
  # of the package currently being found. If the dependency is not found, the
  # current package is marked as not found and the including file returns.
  # This is a macro on purpose: return() must leave the caller's file, and the
  # cmake_fd_* helper variables live in (and are cleared from) the caller scope.
  macro(find_dependency dep)
    if(NOT ${dep}_FOUND)
      set(cmake_fd_version)
      if(${ARGC} GREATER 1)
        set(cmake_fd_version ${ARGV1})
      endif()
      set(cmake_fd_exact_arg)
      if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION_EXACT)
        set(cmake_fd_exact_arg EXACT)
      endif()
      set(cmake_fd_quiet_arg)
      if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY)
        set(cmake_fd_quiet_arg QUIET)
      endif()
      set(cmake_fd_required_arg)
      if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED)
        set(cmake_fd_required_arg REQUIRED)
      endif()
      find_package(${dep} ${cmake_fd_version}
        ${cmake_fd_exact_arg}
        ${cmake_fd_quiet_arg}
        ${cmake_fd_required_arg}
      )
      # Accept either <dep>_FOUND or the upper-case <DEP>_FOUND spelling.
      string(TOUPPER ${dep} cmake_dep_upper)
      if(NOT ${dep}_FOUND AND NOT ${cmake_dep_upper}_FOUND)
        set(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE "${CMAKE_FIND_PACKAGE_NAME} could not be found because dependency ${dep} could not be found.")
        set(${CMAKE_FIND_PACKAGE_NAME}_FOUND False)
        return()
      endif()
      # Clear the helper variables so they do not leak into the caller.
      set(cmake_fd_version)
      set(cmake_fd_required_arg)
      set(cmake_fd_quiet_arg)
      set(cmake_fd_exact_arg)
    endif()
  endmacro()
endif()
|
||||
|
||||
# Values substituted when this template is configured.
set(HIP_COMPILER "@HIP_COMPILER@")
set(HIP_RUNTIME "@HIP_RUNTIME@")

# Packages this config depends on.
find_dependency(AMDDeviceLibs)
find_dependency(amd_comgr)

# Targets file located next to this config file.
include("${CMAKE_CURRENT_LIST_DIR}/hip-lang-targets.cmake")
|
||||
|
||||
# get_filename_component cannot resolve the symlinks if called from
# /opt/rocm/lib/hip, so resolve this file's real directory first and then go
# three levels up from it.
get_filename_component(_DIR "${CMAKE_CURRENT_LIST_DIR}" REALPATH)
get_filename_component(_IMPORT_PREFIX "${_DIR}/../../../" REALPATH)

# Needs _IMPORT_PREFIX to be set: look for the clang resource include directory
# (any version) under the sibling llvm tree.
file(GLOB HIP_CLANG_INCLUDE_SEARCH_PATHS "${_IMPORT_PREFIX}/../llvm/lib/clang/*/include")
find_path(HIP_CLANG_INCLUDE_PATH __clang_cuda_math.h
  HINTS ${HIP_CLANG_INCLUDE_SEARCH_PATHS}
  NO_DEFAULT_PATH)
# Keep the parent of the include directory that was found.
get_filename_component(HIP_CLANG_INCLUDE_PATH "${HIP_CLANG_INCLUDE_PATH}" DIRECTORY)
|
||||
|
||||
# If HIP is not installed under ROCm, ROCM_PATH is needed to find HSA,
# assuming HSA is under ROCm.
if(DEFINED ENV{ROCM_PATH})
  set(ROCM_PATH "$ENV{ROCM_PATH}")
endif()

# Candidate include directories for hsa/hsa.h, in search order. ROCM_PATH is
# only added when it is set, so an empty value never turns into "/include".
set(_hip_hsa_search_paths "${_IMPORT_PREFIX}/../include")
if(ROCM_PATH)
  list(APPEND _hip_hsa_search_paths "${ROCM_PATH}/include")
endif()
list(APPEND _hip_hsa_search_paths /opt/rocm/include)

# If HSA is not under ROCm then provide CMAKE_PREFIX_PATH=<HSA_PATH>
find_path(HSA_HEADER hsa/hsa.h
  PATHS
    ${_hip_hsa_search_paths}
)
unset(_hip_hsa_search_paths)

# find_path() stores "HSA_HEADER-NOTFOUND" in HSA_HEADER on failure, and if()
# treats a value ending in -NOTFOUND as false. The previous test,
# if(HSA_HEADER-NOTFOUND), looked up a variable of that name, which is never
# defined, so the error could not trigger.
if(NOT HSA_HEADER)
  message(FATAL_ERROR "HSA header not found! ROCM_PATH environment not set")
endif()
|
||||
|
||||
# Usage requirements of the imported hip-lang targets. Every generator
# expression is restricted to the HIP language.

# Device target: headers next to the install prefix plus the clang directory.
set_target_properties(hip-lang::device PROPERTIES
  INTERFACE_INCLUDE_DIRECTORIES "$<$<COMPILE_LANGUAGE:HIP>:${_IMPORT_PREFIX}/../include;${HIP_CLANG_INCLUDE_PATH}>"
  INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "$<$<COMPILE_LANGUAGE:HIP>:${_IMPORT_PREFIX}/../include;${HIP_CLANG_INCLUDE_PATH}>"
)

# Runtime library target: HIP headers, the HSA header directory and the
# __HIP_ROCclr__ definition.
set_target_properties(hip-lang::amdhip64 PROPERTIES
  INTERFACE_COMPILE_DEFINITIONS "$<$<COMPILE_LANGUAGE:HIP>:__HIP_ROCclr__=1>"
  INTERFACE_INCLUDE_DIRECTORIES "$<$<COMPILE_LANGUAGE:HIP>:${_IMPORT_PREFIX}/include;${HSA_HEADER}>"
  INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "$<$<COMPILE_LANGUAGE:HIP>:${_IMPORT_PREFIX}/include;${HSA_HEADER}>"
)
set_target_properties(hip-lang::device PROPERTIES
  INTERFACE_COMPILE_DEFINITIONS "$<$<COMPILE_LANGUAGE:HIP>:__HIP_ROCclr__=1>"
)

# LLVM backend options for device compilation.
set_property(TARGET hip-lang::device APPEND PROPERTY
  INTERFACE_COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:HIP>:SHELL:-mllvm;-amdgpu-early-inline-all=true;-mllvm;-amdgpu-function-calls=false>"
)

# When the amdgcn/bitcode directory is absent, pass the device library path
# explicitly.
if(NOT EXISTS "${AMD_DEVICE_LIBS_PREFIX}/amdgcn/bitcode")
  set_property(TARGET hip-lang::device APPEND PROPERTY
    INTERFACE_COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:HIP>:--hip-device-lib-path=${AMD_DEVICE_LIBS_PREFIX}/lib>"
  )
endif()

set_property(TARGET hip-lang::device APPEND PROPERTY
  INTERFACE_LINK_OPTIONS "$<$<LINK_LANGUAGE:HIP>:--hip-link>"
)

# Add support for __fp16 and _Float16, explicitly link with compiler-rt
set_property(TARGET hip-lang::device APPEND PROPERTY
  INTERFACE_LINK_LIBRARIES "$<$<LINK_LANGUAGE:HIP>:${HIP_CLANG_INCLUDE_PATH}/lib/linux/libclang_rt.builtins-x86_64.a>"
)

# Approved by CMake to use this name. This is used so that HIP can
# change the name of the target and not require any modifications in CMake
set(_CMAKE_HIP_DEVICE_RUNTIME_TARGET "hip-lang::device")
|
||||
@@ -1,508 +0,0 @@
|
||||
#!/usr/bin/python
|
||||
# Copyright (C) 2019-2021 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
import os, sys, re
|
||||
|
||||
PROF_HEADER = "hip_prof_str.h"
|
||||
OUTPUT = PROF_HEADER
|
||||
REC_MAX_LEN = 1024
|
||||
|
||||
# Messages and errors control
|
||||
verbose = 0
|
||||
errexit = 0
|
||||
inp_file = 'none'
|
||||
line_num = -1
|
||||
|
||||
# Verbose message
def message(msg):
  """Write msg and a newline to stdout, but only when the module-level
  'verbose' flag is set."""
  if verbose:
    sys.stdout.write(msg + '\n')
|
||||
|
||||
# Error / warning reporting
def error(msg):
  """Report msg on both stdout and stderr.

  When a file is being parsed (module-level line_num != -1) the current file
  name and line number are appended. The message is labelled ' Error: ' when
  the module-level 'errexit' flag is set and ' Warning: ' otherwise. This
  function never terminates the program; see fatal() for that.
  """
  if line_num != -1:
    msg += ", file '" + inp_file + "', line (" + str(line_num) + ")"
  if errexit:
    msg = " Error: " + msg
  else:
    msg = " Warning: " + msg

  sys.stdout.write(msg + '\n')
  # The stderr copy is prefixed with the script name.
  sys.stderr.write(sys.argv[0] + msg + '\n')
|
||||
|
||||
def fatal(msg):
  """Report msg through error() and exit with status 1 when the module-level
  'errexit' flag is set; otherwise return to the caller after reporting."""
  error(msg)
  if errexit:
    sys.exit(1)
|
||||
|
||||
#############################################################
|
||||
# Normalizing API name
def filtr_api_name(name):
  """Return name with trailing whitespace removed (leading whitespace is kept)."""
  return re.sub(r'\s*$', r'', name)
|
||||
|
||||
# Normalizing API arguments
def filtr_api_args(args_str):
  """Return a canonical form of a C argument-list string.

  Steps, in order: strip leading and trailing whitespace, remove whitespace
  around commas, collapse whitespace runs to one space, attach '*' runs to the
  preceding type followed by a single space, and drop the 'enum ' / 'struct '
  keywords.
  """
  args_str = re.sub(r'^\s*', r'', args_str)
  args_str = re.sub(r'\s*$', r'', args_str)
  args_str = re.sub(r'\s*,\s*', r',', args_str)
  args_str = re.sub(r'\s+', r' ', args_str)
  args_str = re.sub(r'\s*(\*+)\s*', r'\1 ', args_str)
  # \b keeps the keyword removal from eating the tail of an identifier that
  # merely ends in "enum" or "struct" (e.g. "my_struct x").
  args_str = re.sub(r'\b(enum|struct) ', '', args_str)
  return args_str
|
||||
|
||||
# Normalizing types
def norm_api_types(type_str):
  """Return type_str with 'uint32_t' replaced by 'unsigned int' and a type that
  is exactly 'unsigned' expanded to 'unsigned int'."""
  type_str = re.sub(r'uint32_t', r'unsigned int', type_str)
  type_str = re.sub(r'^unsigned$', r'unsigned int', type_str)
  return type_str
|
||||
|
||||
# Creating a list of arguments [(type, name), ...]
def list_api_args(args_str):
  """Split a C argument-list string into a list of (type, name) tuples.

  The string is first normalized with filtr_api_args(). A lone 'void' entry is
  skipped and a trailing default value ('= value') is removed. The last
  whitespace-separated token of each entry is the name and everything before
  it is the type, normalized with norm_api_types(). An entry that cannot be
  split is reported through fatal() and omitted from the result.
  """
  args_str = filtr_api_args(args_str)
  args_list = []
  if args_str != '':
    for arg_pair in args_str.split(','):
      if arg_pair == 'void':
        continue
      arg_pair = re.sub(r'\s*=\s*\S+$', '', arg_pair)
      # Raw string: the pattern holds regex escapes, not string escapes.
      m = re.match(r"^(.*)\s(\S+)$", arg_pair)
      if m:
        arg_type = norm_api_types(m.group(1))
        arg_name = m.group(2)
        args_list.append((arg_type, arg_name))
      else:
        fatal("bad args: args_str: '" + args_str + "' arg_pair: '" + arg_pair + "'")
  return args_list
|
||||
|
||||
# Creating arguments string "type0, type1, ..."
def filtr_api_types(args_str):
  """Return the argument types of args_str joined into one string.

  Every type is followed by ', ' (the last one included); the result is only
  used to compare two signatures with each other.
  """
  return ''.join(arg_type + ', ' for arg_type, _ in list_api_args(args_str))
|
||||
|
||||
# Creating options list [opt0, opt1, ...]
def filtr_api_opts(args_str):
  """Return the list of argument names found in args_str, in order."""
  return [arg_name for _, arg_name in list_api_args(args_str)]
|
||||
#############################################################
|
||||
# Parsing API header
# hipError_t hipSetupArgument(const void* arg, size_t size, size_t offset);
def parse_api(inp_file_p, out):
  """Collect public API declarations from a header file.

  inp_file_p - path of the API header to read
  out        - map filled as [<api name>] => <raw argument string>

  A declaration starts on a line beginning with 'hipError_t' or 'const char*'
  and may span several lines, which are accumulated into one record.
  Declarations that directly follow a line starting with a hidden-visibility
  attribute, or that sit inside 'namespace hip_impl {', are logged but not
  recorded. Parsing stops at the first completed declaration whose record
  contains 'Texture'.

  Updates the module-level inp_file / line_num used by error(); line_num is
  reset to -1 when parsing is finished.
  """
  global inp_file
  global line_num
  inp_file = inp_file_p

  # Raw strings: these are regex escapes, not string escapes.
  beg_pattern = re.compile(r"^(hipError_t|const char\s*\*)\s+([^\(]+)\(")
  api_pattern = re.compile(r"^(hipError_t|const char\s*\*)\s+([^\(]+)\(([^\)]*)\)")
  end_pattern = re.compile("Texture")
  hidden_pattern = re.compile(r'__attribute__\(\(visibility\("hidden"\)\)\)')
  nms_open_pattern = re.compile(r'namespace hip_impl {')
  nms_close_pattern = re.compile(r'}')

  found = 0       # 1 while a public declaration is being accumulated
  hidden = 0      # 1 when the previous processed line was a hidden attribute
  nms_level = 0   # nesting depth inside 'namespace hip_impl'
  record = ""     # accumulated text of the current declaration
  line_num = -1

  with open(inp_file, 'r') as inp:
    for line in inp:
      record += re.sub(r'^\s+', r' ', line[:-1])
      line_num += 1

      if len(record) > REC_MAX_LEN:
        fatal("bad record \"" + record + "\"")

      m = beg_pattern.match(line)
      if m:
        name = m.group(2)
        if hidden != 0:
          message("api: " + name + " - hidden")
        elif nms_level != 0:
          message("api: " + name + " - hip_impl")
        else:
          message("api: " + name)
          found = 1

      if found != 0:
        # Default-argument macros are not part of the signature.
        record = re.sub(r"\s__dparm\([^\)]*\)", '', record)
        m = api_pattern.match(record)
        if m:
          found = 0
          if end_pattern.search(record):
            break
          out[filtr_api_name(m.group(2))] = m.group(3)
        else:
          # Declaration not complete yet: keep accumulating the record.
          continue

      hidden = 0
      if hidden_pattern.match(line):
        hidden = 1

      if nms_open_pattern.match(line):
        nms_level += 1
      if (nms_level > 0) and nms_close_pattern.match(line):
        nms_level -= 1
      if nms_level < 0:
        fatal("nms level < 0")

      record = ""

  line_num = -1
|
||||
#############################################################
|
||||
# Parsing API implementation
# hipError_t hipSetupArgument(const void* arg, size_t size, size_t offset) {
#   HIP_INIT_CB(hipSetupArgument, arg, size, offset);
# inp_file - input implementation source file
# api_map - input public API map [<api name>] => <api args>
# out - output map [<api name>] => [opt0, opt1, ...]
def parse_content(inp_file_p, api_map, out):
  """Scan one implementation source file for API definitions.

  For every definition whose name is in api_map and whose argument types match
  the reference ("etalon") signature, out[<api name>] is set to the list of
  argument names. A definition initialised through a HIP_INIT*_API macro but
  absent from api_map is registered in out with an empty list. A valid public
  API whose body closes before any HIP_INIT*_API macro is removed from both
  maps and remembered as out['.' + <api name>].

  Returns the file content as read when out is non-empty at the end, and ''
  otherwise. Updates the module-level inp_file / line_num used by error().
  """
  global inp_file
  global line_num
  inp_file = inp_file_p

  # API definition begin pattern
  beg_pattern = re.compile(r"^(hipError_t|const char\s*\*)\s+[^\(]+\(")
  # API definition complete pattern
  api_pattern = re.compile(r"^(hipError_t|const char\s*\*)\s+([^\(]+)\(([^\)]*)\)\s*{")
  # API init macro pattern
  init_pattern = re.compile(r"^\s*HIP_INIT[_\w]*_API\(([^,]+)(,|\))")

  # API name
  api_name = ""
  # Valid public API found flag
  api_valid = 0

  # Input file content
  content = ''
  # Current record, accumulating several API definition related lines
  record = ''
  # Current input file line number
  line_num = -1
  # API search state: 0 - idle, 1 - definition begun, 2 - definition complete
  found = 0

  # Reading input file
  with open(inp_file, 'r') as inp:
    for line in inp:
      # Accumulating record
      record += re.sub(r'^\s+', r' ', line[:-1])
      line_num += 1

      if len(record) > REC_MAX_LEN:
        fatal("bad record \"" + record + "\"")
        break

      # Looking for API begin
      if beg_pattern.match(record):
        found = 1

      # Matching complete API definition
      if found == 1:
        record = re.sub(r"\s__dparm\([^\)]*\)", '', record)
        m = api_pattern.match(record)
        # Checking if complete API matched
        if m:
          found = 2
          api_name = filtr_api_name(m.group(2))
          # Checking if API name is in the API map
          if api_name in api_map:
            # Getting API arguments
            api_args = m.group(3)
            # Getting etalon arguments from the API map
            eta_args = api_map[api_name]
            if eta_args == '':
              eta_args = api_args
              api_map[api_name] = eta_args
            # Normalizing API arguments
            api_types = filtr_api_types(api_args)
            # Normalizing etalon arguments
            eta_types = filtr_api_types(eta_args)
            if api_types == eta_types:
              # API is already found
              if api_name in out:
                fatal("API redefined \"" + api_name + "\", record \"" + record + "\"")
              # Set valid public API found flag
              api_valid = 1
              # Set output API map with API arguments list
              out[api_name] = filtr_api_opts(api_args)
            else:
              # Warning about mismatched API, possible non public overloaded version
              api_diff = '\t\t' + inp_file + " line(" + str(line_num) + ")\n\t\tapi: " + api_types + "\n\t\teta: " + eta_types
              message("\t" + api_name + ' args mismatch:\n' + api_diff + '\n')

      # API found action
      if found == 2:
        # Looking for INIT macro
        m = init_pattern.match(line)
        if m:
          found = 0
          if api_valid == 1:
            api_valid = 0
            message("\t" + api_name)
          else:
            # Registering dummy API for non public API if the name in INIT is not NONE
            init_name = m.group(1)
            # Ignore if it is initialized as NONE
            if init_name != 'NONE':
              # Check if init name matching API name
              if init_name != api_name:
                fatal("init name mismatch: '" + init_name + "' <> '" + api_name + "'")
              # If init name is not in public API map then it is private API
              # else it was not identified and will be checked on finish
              if init_name not in api_map:
                if init_name in out:
                  fatal("API reinit \"" + api_name + "\", record \"" + record + "\"")
                out[init_name] = []
        elif re.search('}', line):
          found = 0
          # Expect INIT macro for valid public API
          if api_valid == 1:
            api_valid = 0
            if api_name in out:
              del out[api_name]
              del api_map[api_name]
              out['.' + api_name] = 1
            else:
              fatal("API is not in out \"" + api_name + "\", record \"" + record + "\"")

      # Keep the record only while a definition is still being accumulated.
      if found != 1:
        record = ""
      content += line

  line_num = -1

  if len(out) != 0:
    return content
  else:
    return ''
|
||||
|
||||
# src path walk
def parse_src(api_map, src_path, src_patt, out):
  """Run parse_content() on every matching file under the given directories.

  api_map  - public API map, passed through to parse_content()
  src_path - ':'-separated list of directories (all whitespace is removed)
  src_patt - regular expression searched in each file name
  out      - output map, passed through to parse_content()

  A file is written back with the content returned by parse_content()
  whenever that content is not empty.
  """
  pattern = re.compile(src_patt)
  src_path = re.sub(r'\s', '', src_path)
  for src_dir in src_path.split(':'):
    message("Parsing " + src_dir + " for '" + src_patt + "'")
    for root, dirs, files in os.walk(src_dir):
      for fnm in files:
        if pattern.search(fnm):
          file = os.path.join(root, fnm)
          message(file)
          content = parse_content(file, api_map, out)
          if content != '':
            # 'with' closes the file even if the write fails.
            with open(file, 'w') as f:
              f.write(content)
|
||||
#############################################################
|
||||
# Generating profiling primitives header
# api_map - public API map [<api name>] => [(type, name), ...]
# opts_map - opts map [<api name>] => [opt0, opt1, ...]
def generate_prof_header(f, api_map, opts_map):
  """Write the generated profiling header to the open text file f.

  Sections written, in order: dummy INIT macros for APIs present in opts_map
  but not in api_map, the hip_api_id_t enumeration, hip_api_name(), the
  hip_api_data_t structure, the per-API INIT_<name>_CB_ARGS_DATA macros and
  the optional hipApiString() function.
  """
  # Private API list
  priv_lst = []

  f.write('// automatically generated sources\n')
  f.write('#ifndef _HIP_PROF_STR_H\n')
  f.write('#define _HIP_PROF_STR_H\n')
  f.write('#define HIP_PROF_VER 1\n')

  # Generating dummy macro for non-public API
  f.write('\n// Dummy API primitives\n')
  f.write('#define INIT_NONE_CB_ARGS_DATA(cb_data) {};\n')
  for name in opts_map:
    if name not in api_map:
      opts_lst = opts_map[name]
      if len(opts_lst) != 0:
        fatal("bad dummy API \"" + name + "\", args: " + str(opts_lst))
      f.write('#define INIT_' + name + '_CB_ARGS_DATA(cb_data) {};\n')
      priv_lst.append(name)

  for name in priv_lst:
    message("Private: " + name)

  # Generating the callbacks ID enumeration
  f.write('\n// HIP API callbacks ID enumaration\n')
  f.write('enum hip_api_id_t {\n')
  cb_id = 0
  for name in api_map.keys():
    f.write(' HIP_API_ID_' + name + ' = ' + str(cb_id) + ',\n')
    cb_id += 1
  f.write(' HIP_API_ID_NUMBER = ' + str(cb_id) + ',\n')
  f.write('\n')
  # Private APIs and NONE all share the HIP_API_ID_NUMBER value.
  f.write(' HIP_API_ID_NONE = HIP_API_ID_NUMBER,\n')
  for name in priv_lst:
    f.write(' HIP_API_ID_' + name + ' = HIP_API_ID_NUMBER,\n')
  f.write('};\n')

  # Generating the function that maps an API ID to its name
  f.write('\n// Return HIP API string\n')
  f.write('static inline const char* hip_api_name(const uint32_t id) {\n')
  f.write(' switch(id) {\n')
  for name in api_map.keys():
    f.write(' case HIP_API_ID_' + name + ': return "' + name + '";\n')
  f.write(' };\n')
  f.write(' return "unknown";\n')
  f.write('};\n')

  # Generating the callbacks data structure
  f.write('\n// HIP API callbacks data structure\n')
  f.write(
    'typedef struct hip_api_data_t {\n' +
    ' uint64_t correlation_id;\n' +
    ' uint32_t phase;\n' +
    ' union {\n'
  )
  for name, args in api_map.items():
    if len(args) != 0:
      f.write(' struct {\n')
      for arg_tuple in args:
        # hipLimit_t is written with an explicit 'enum' keyword.
        if arg_tuple[0] == "hipLimit_t":
          f.write(' enum ' + arg_tuple[0] + ' ' + arg_tuple[1] + ';\n')
        else:
          f.write(' ' + arg_tuple[0] + ' ' + arg_tuple[1] + ';\n')
      f.write(' } ' + name + ';\n')
  f.write(
    ' } args;\n' +
    '} hip_api_data_t;\n'
  )

  # Generating the callbacks args data filling macros
  f.write('\n// HIP API callbacks args data filling macros\n')
  for name, args in api_map.items():
    f.write('// ' + name + str(args) + '\n')
    f.write('#define INIT_' + name + '_CB_ARGS_DATA(cb_data) { \\\n')
    if name in opts_map:
      opts_list = opts_map[name]
      if len(args) != len(opts_list):
        fatal("\"" + name + "\" API args and opts mismatch, args: " + str(args) + ", opts: " + str(opts_list))
      # Pair each declared field (type, name) with the argument name used in
      # the implementation. zip() stops at the shorter list, so a mismatch
      # already reported above as a warning cannot raise IndexError here.
      for arg_tuple, arg_name in zip(args, opts_list):
        fld_name = arg_tuple[1]
        f.write(' cb_data.args.' + name + '.' + fld_name + ' = ' + arg_name + '; \\\n')
    f.write('};\n')
  f.write('#define INIT_CB_ARGS_DATA(cb_id, cb_data) INIT_##cb_id##_CB_ARGS_DATA(cb_data)\n')

  # Generating the method for the API string, name and parameters
  f.write('\n')
  f.write('#if HIP_PROF_HIP_API_STRING\n')
  f.write('#include <sstream>\n')
  f.write('#include <string>\n')
  f.write('// HIP API string method, method name and parameters\n')
  f.write('const char* hipApiString(hip_api_id_t id, const hip_api_data_t* data) {\n')
  f.write(' std::ostringstream oss;\n')
  f.write(' switch (id) {\n')
  for name, args in api_map.items():
    f.write(' case HIP_API_ID_' + name + ':\n')
    f.write(' oss << "' + name + '("')
    for ind, arg_tuple in enumerate(args):
      arg_name = arg_tuple[1]
      if ind != 0:
        f.write(' << ","')
      f.write('\n << " ' + arg_name + '=" << data->args.' + name + '.' + arg_name)
    f.write('\n << ")";\n')
    f.write(' break;\n')
  f.write(' default: oss << "unknown";\n')
  f.write(' };\n')
  f.write(' return strdup(oss.str().c_str());\n')
  f.write('};\n')
  f.write('#endif // HIP_PROF_HIP_API_STRING\n')

  f.write('#endif // _HIP_PROF_STR_H\n')
|
||||
|
||||
#############################################################
|
||||
# main
|
||||
# Usage
|
||||
# Optional flags: '-v' (verbose messages) and, after it, '-e' (make fatal()
# exit with status 1 instead of only warning).
if (len(sys.argv) > 1) and (sys.argv[1] == '-v'):
  verbose = 1
  sys.argv.pop(1)

if (len(sys.argv) > 1) and (sys.argv[1] == '-e'):
  errexit = 1
  sys.argv.pop(1)

if (len(sys.argv) < 3):
  fatal ("Usage: " + sys.argv[0] + " [-v] <input HIP API .h file> <patched srcs path>\n" +
         " -v - verbose messages\n" +
         " example:\n" +
         " $ hipap.py hip/include/hip/amd_detail/hip_runtime_api.h hip/src")
  # Without '-e' fatal() returns; nothing useful can be done without the
  # positional arguments, so stop here instead of failing on sys.argv[1].
  sys.exit(1)

# API header file given as an argument
api_hfile = sys.argv[1]
if not os.path.isfile(api_hfile):
  fatal("input file '" + api_hfile + "' not found")

# Srcs directory given as an argument
src_pat = r"\.cpp$"
src_dir = sys.argv[2]
if not os.path.isdir(src_dir):
  fatal("src directory '" + src_dir + "' not found")

# Optional third argument: output header path
if len(sys.argv) > 3: OUTPUT = sys.argv[3]

# API declaration map
api_map = {
  'hipHccModuleLaunchKernel': '',
  'hipExtModuleLaunchKernel': ''
}
# API options map
opts_map = {}

# Parsing API header
parse_api(api_hfile, api_map)

# Parsing sources
parse_src(api_map, src_dir, src_pat, opts_map)

# Checking for non-conformant APIs ('.'-prefixed keys mark a missing INIT macro)
for name in list(opts_map.keys()):
  m = re.match(r'\.(\S*)', name)
  if m:
    message("Init missing: " + m.group(1))
    del opts_map[name]

# Converting api map to map of lists
# Checking for not found APIs
not_found = 0
if len(opts_map) != 0:
  for name in api_map.keys():
    args_str = api_map[name]
    args_list = list_api_args(args_str)
    api_map[name] = args_list
    if name not in opts_map:
      error("implementation not found: " + name)
      not_found += 1
if not_found != 0:
  fatal(str(not_found) + " API calls missing in interception layer")

# Generating output header file
with open(OUTPUT, 'w') as f:
  generate_prof_header(f, api_map, opts_map)

# Successful exit
sys.exit(0)
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Ссылка в новой задаче
Block a user