Merge 'hipamd/amd-staging' into amd-staging

Change-Id: I1bbfe6261643fce1c3ce407452ac545f0111e893


[ROCm/clr commit: 854fb9dc95]
这个提交包含在:
Maneesh Gupta
2023-03-26 15:42:30 +00:00
当前提交 5847fc78a0
修改 142 个文件,包含 69129 行新增0 行删除
+10
查看文件
@@ -0,0 +1,10 @@
# clang-format configuration for C++ sources: start from the Google style and
# apply the overrides listed below.
Language: Cpp
BasedOnStyle: Google
AlignEscapedNewlinesLeft: false
AlignOperands: false
ColumnLimit: 100
AlwaysBreakTemplateDeclarations: false
DerivePointerAlignment: false
IndentFunctionDeclarationAfterType: false
MaxEmptyLinesToKeep: 2
SortIncludes: false
+20
查看文件
@@ -0,0 +1,20 @@
# Set the default behavior, in case people don't have core.autocrlf set.
* text=auto
# Explicitly declare text files you want to always be normalized and converted
# to have LF line endings on checkout.
*.c text eol=lf
*.cpp text eol=lf
*.cc text eol=lf
*.h text eol=lf
*.hpp text eol=lf
*.txt text eol=lf
# Define files to support auto-remove trailing white space
# Need to run the command below, before add modified file(s) to the staging area
# git config filter.trimspace.clean 'sed -e "s/[[:space:]]*$//g"'
# Every pattern below must name the same filter ("trimspace") that the command
# above configures; a differently named filter is never run.
*.cpp filter=trimspace
*.c filter=trimspace
*.h filter=trimspace
*.hpp filter=trimspace
*.md filter=trimspace
+16
查看文件
@@ -0,0 +1,16 @@
# Ignore every dot-file and dot-directory, except .gitignore itself.
.*
!.gitignore
# Ignore object files, .exe binaries and .swp files anywhere in the tree.
*.o
*.exe
*.swp
# Ignore these directories/files by name.
lib
packages
build
# Ignore these entries under bin/.
bin/hipInfo
bin/hipBusBandwidth
bin/hipDispatchLatency
bin/hipify-clang
tags
# Ignore these entries inside the sample directories.
samples/1_Utils/hipInfo/hipInfo
samples/1_Utils/hipBusBandwidth/hipBusBandwidth
samples/1_Utils/hipDispatchLatency/hipDispatchLatency
+480
查看文件
@@ -0,0 +1,480 @@
# Copyright (c) 2016 - 2021 Advanced Micro Devices, Inc. All Rights Reserved.
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# Minimum CMake version required to configure this project.
cmake_minimum_required(VERSION 3.16.8)
# Top-level project; no LANGUAGES are listed, so CMake's default languages apply.
project(hip)
# Provides CMAKE_INSTALL_BINDIR / LIBDIR / INCLUDEDIR, used throughout this file.
include(GNUInstallDirs)
# sample command for hip-rocclr runtime, you'll need to have rocclr built
# ROCM_PATH is the path where ROCM is installed
# For shared lib of hip-rocclr runtime
# For release version
# cmake -DHIP_COMMON_DIR="$HIP_DIR" -DAMD_OPENCL_PATH=$OPENCL_DIR -DROCCLR_PATH=$ROCCLR_DIR -DCMAKE_PREFIX_PATH="<ROCM_PATH>/" -DCMAKE_INSTALL_PREFIX=</where/to/install/hip> ..
# For debug version
# cmake -DHIP_COMMON_DIR="$HIP_DIR" -DAMD_OPENCL_PATH=$OPENCL_DIR -DROCCLR_PATH=$ROCCLR_DIR -DCMAKE_PREFIX_PATH="<ROCM_PATH>/" -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=</where/to/install/hip> ..
# For static lib of hip-rocclr runtime
# For release version
# cmake -DHIP_COMMON_DIR="$HIP_DIR" -DAMD_OPENCL_PATH=$OPENCL_DIR -DROCCLR_PATH=$ROCCLR_DIR -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="<ROCM_PATH>/" -DCMAKE_INSTALL_PREFIX=</where/to/install/hip> ..
# For debug version
# cmake -DHIP_COMMON_DIR="$HIP_DIR" -DAMD_OPENCL_PATH=$OPENCL_DIR -DROCCLR_PATH=$ROCCLR_DIR -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Debug -DCMAKE_PREFIX_PATH="<ROCM_PATH>/" -DCMAKE_INSTALL_PREFIX=</where/to/install/hip> ..
# If you don't specify CMAKE_INSTALL_PREFIX, hip-rocclr runtime will be installed to "<ROCM_PATH>" (which defaults to /opt/rocm).
# By default, CMake will search for a folder named vdi or ROCclr relative to the current path. Specify -DROCCLR_PATH=$ROCCLR_DIR if rocclr source is in obscure location.
# By default, CMake will search for a folder named opencl or ROCm-OpenCL-Runtime relative to the current path. Specify -DAMD_OPENCL_PATH=$OPENCL_DIR if opencl source is in obscure location.
# Make the CMake modules shipped with the common HIP sources discoverable.
list(APPEND CMAKE_MODULE_PATH ${HIP_COMMON_DIR}/cmake)

# Installed binaries must locate libraries that live outside HIP (for example
# libhsa-runtime in the install/lib path):
#  - directories used at link time are carried over into the install RPATH;
#  - <CMAKE_PREFIX_PATH>/<libdir> is added explicitly as an RPATH entry.
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
set(CMAKE_INSTALL_RPATH "${CMAKE_PREFIX_PATH}/${CMAKE_INSTALL_LIBDIR}")
#############################
# Options
#############################
option(BUILD_HIPIFY_CLANG "Enable building the CUDA->HIP converter" OFF)
option(__HIP_ENABLE_PCH "Enable/Disable pre-compiled hip headers" ON)
option(HIP_OFFICIAL_BUILD "Enable/Disable for mainline/staging builds" OFF)
option(FILE_REORG_BACKWARD_COMPATIBILITY "Enable File Reorg with backward compatibility" ON)
set(HIPCC_BIN_DIR "" CACHE STRING "HIPCC and HIPCONFIG binary directories")

# Numeric (0/1) mirror of __HIP_ENABLE_PCH; emitted into hip_version.h as
# __HIP_HAS_GET_PCH further down in this file.
set(_pchStatus 0)
if(__HIP_ENABLE_PCH)
  set(_pchStatus 1)
endif()

message(STATUS "HIPCC_BIN_DIR found at ${HIPCC_BIN_DIR}")
message(STATUS "HIP_COMMON_DIR found at ${HIP_COMMON_DIR}")

# Frequently used locations inside the common HIP source tree.
set(HIP_COMMON_BIN_DIR ${HIP_COMMON_DIR}/bin)
set(HIP_COMMON_INCLUDE_DIR ${HIP_COMMON_DIR}/include)
set(__HIPCONFIG_EXECUTABLE__ ${HIP_COMMON_BIN_DIR}/hipconfig)
#############################
# Setup config generation
#############################
string(TIMESTAMP _timestamp UTC)

# Accumulators for the generated .hipVersion and .hipInfo files.
set(_buildInfo "# Auto-generated by cmake on ${_timestamp} UTC\n")
set(_versionInfo "# Auto-generated by cmake\n")

# add_to_config(<configfile-var> <variable>)
#   Appends one "<variable>=<current value of variable>\n" line to the string
#   held in <configfile-var>. Implemented as a macro so the accumulator is
#   modified directly in the caller's scope.
macro(add_to_config _configfile _variable)
  string(APPEND ${_configfile} "${_variable}=${${_variable}}\n")
endmacro()
#############################
# Setup version information
#############################
find_package(Perl REQUIRED)

# Determine HIP_BASE_VERSION
# The first three lines of <HIP_COMMON_DIR>/VERSION that start with a digit
# are taken as major, minor and patch.
set(ENV{HIP_PATH} "")
file(STRINGS ${HIP_COMMON_DIR}/VERSION VERSION_LIST REGEX "^[0-9]+")
list(GET VERSION_LIST 0 HIP_VERSION_MAJOR)
list(GET VERSION_LIST 1 HIP_VERSION_MINOR)
list(GET VERSION_LIST 2 HIP_VERSION_PATCH)
set(HIP_VERSION_GITDATE 0)

find_package(Git)

# FIXME: Two different version strings used.
# Below we use UNIX commands, not compatible with Windows.
if(GIT_FOUND)
  # use the commit date, instead of build date
  execute_process(COMMAND ${GIT_EXECUTABLE} show -s --format=%ct
    RESULT_VARIABLE git_result
    OUTPUT_VARIABLE git_output
    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
    OUTPUT_STRIP_TRAILING_WHITESPACE)
  if(git_result EQUAL 0)
    set(HIP_VERSION_UNIXDATE ${git_output})
  endif()

  # get date information based on UTC
  # use the last two digits of year + week number + day in the week as HIP_VERSION_GITDATE
  # NOTE(review): HIP_VERSION_UNIXDATE is left unset when the git command above
  # fails (e.g. sources not in a git checkout); gmtime() then receives no
  # argument. Confirm whether that fallback is intended.
  execute_process(COMMAND ${PERL_EXECUTABLE} "-MPOSIX=strftime" "-le" "print strftime \'%y%W%w\',gmtime(${HIP_VERSION_UNIXDATE})"
    RESULT_VARIABLE git_result
    OUTPUT_VARIABLE git_output
    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
    OUTPUT_STRIP_TRAILING_WHITESPACE)
  if(git_result EQUAL 0)
    set(HIP_VERSION_GITDATE ${git_output})
  endif()

  # get commit short hash
  execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD
    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
    RESULT_VARIABLE git_result
    OUTPUT_VARIABLE git_output
    OUTPUT_STRIP_TRAILING_WHITESPACE)
  if(git_result EQUAL 0)
    set(HIP_VERSION_GITHASH ${git_output})
  endif()

  set(HIP_VERSION_BUILD_ID 0)
  set(HIP_VERSION_BUILD_NAME "")

  # Non-official builds replace the VERSION-file patch level with the git date.
  if(NOT DEFINED ENV{HIP_OFFICIAL_BUILD} AND NOT HIP_OFFICIAL_BUILD)
    set(HIP_VERSION_PATCH ${HIP_VERSION_GITDATE})
  endif()

  # Packaging patch level: "<patch>.<ROCM_LIBPATCH_VERSION>" when that
  # environment variable is set, otherwise "<patch>-<githash>". The hash is
  # only appended when it was actually obtained, so a failed `git rev-parse`
  # no longer yields a version ending in a dangling "-".
  if(DEFINED ENV{ROCM_LIBPATCH_VERSION})
    set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_PATCH}.$ENV{ROCM_LIBPATCH_VERSION})
  elseif(DEFINED HIP_VERSION_GITHASH)
    set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_PATCH}-${HIP_VERSION_GITHASH})
  else()
    set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_PATCH})
  endif()
else()
  set(HIP_VERSION_BUILD_ID 0)
  set(HIP_VERSION_BUILD_NAME "")
  # FIXME: Some parts depend on this being set.
  set(HIP_PACKAGING_VERSION_PATCH "0")
endif()
## Debian package specific variables
# Release suffix comes from the environment when present, else "local".
set(CPACK_DEBIAN_PACKAGE_RELEASE "local")
if(DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE})
  set(CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE})
endif()
message (STATUS "Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}" )

## RPM package specific variables
set(CPACK_RPM_PACKAGE_RELEASE "local")
if(DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE})
  set(CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE})
endif()

## 'dist' breaks manual builds on debian systems due to empty Provides
# Append the literal "%{?dist}" macro only when rpm ran successfully and
# evaluated it to a non-empty string.
execute_process(
  COMMAND rpm --eval %{?dist}
  RESULT_VARIABLE PROC_RESULT
  OUTPUT_VARIABLE EVAL_RESULT
  OUTPUT_STRIP_TRAILING_WHITESPACE)
if(PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "")
  string(APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}")
endif()
message(STATUS "CPACK_RPM_PACKAGE_RELEASE: ${CPACK_RPM_PACKAGE_RELEASE}")

# Record the version-related values, in this order, for the .hipVersion file.
foreach(_hip_version_key IN ITEMS
    HIP_PACKAGING_VERSION_PATCH
    CPACK_DEBIAN_PACKAGE_RELEASE
    CPACK_RPM_PACKAGE_RELEASE
    HIP_VERSION_MAJOR
    HIP_VERSION_MINOR
    HIP_VERSION_PATCH
    HIP_VERSION_GITHASH)
  add_to_config(_versionInfo ${_hip_version_key})
endforeach()
# Components of HIP_LIB_VERSION_STRING.
set(HIP_LIB_VERSION_MAJOR ${HIP_VERSION_MAJOR})
set(HIP_LIB_VERSION_MINOR ${HIP_VERSION_MINOR})

# Patch component, in order of preference:
#   1. ROCM_PATCH_VERSION when it is set to a non-false value,
#   2. "<patch>-<githash>" when the git hash is known,
#   3. the plain patch level.
# The variable is tested by name: `if(${ROCM_PATCH_VERSION})` would expand the
# value first and then re-interpret that value as a variable name or constant.
if(ROCM_PATCH_VERSION)
  set(HIP_LIB_VERSION_PATCH ${ROCM_PATCH_VERSION})
elseif(DEFINED HIP_VERSION_GITHASH)
  set(HIP_LIB_VERSION_PATCH ${HIP_VERSION_PATCH}-${HIP_VERSION_GITHASH})
else()
  set(HIP_LIB_VERSION_PATCH ${HIP_VERSION_PATCH})
endif()
set(HIP_LIB_VERSION_STRING "${HIP_LIB_VERSION_MAJOR}.${HIP_LIB_VERSION_MINOR}.${HIP_LIB_VERSION_PATCH}")

# When ROCM_RPATH is present in the environment it replaces the install RPATH
# configured earlier, is used for build-tree binaries as well, and link-path
# derived RPATH entries are turned off.
if(DEFINED ENV{ROCM_RPATH})
  set(CMAKE_INSTALL_RPATH "$ENV{ROCM_RPATH}")
  set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
  set(CMAKE_SKIP_BUILD_RPATH TRUE)
  set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE)
endif()
# overwrite HIP_VERSION_PATCH for packaging
set(HIP_VERSION ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_PACKAGING_VERSION_PATCH})

# Remove when CI is updated
# ("rocclr" is accepted as another spelling of the "amd" platform.)
if(HIP_PLATFORM STREQUAL "rocclr")
  set(HIP_PLATFORM "amd")
endif()

#############################
# Configure variables
#############################
# Determine HIP_PLATFORM
# Precedence: an already-defined HIP_PLATFORM, then the HIP_PLATFORM
# environment variable, then the output of `hipconfig --platform`.
if(NOT DEFINED HIP_PLATFORM)
  if(DEFINED ENV{HIP_PLATFORM})
    set(HIP_PLATFORM $ENV{HIP_PLATFORM} CACHE STRING "HIP Platform")
  else()
    execute_process(
      COMMAND ${__HIPCONFIG_EXECUTABLE__} --platform
      OUTPUT_VARIABLE HIP_PLATFORM
      OUTPUT_STRIP_TRAILING_WHITESPACE)
  endif()
endif()
message(STATUS "HIP Platform: " ${HIP_PLATFORM})

# Map the platform to its runtime and compiler; any other value is fatal.
if(HIP_PLATFORM STREQUAL "amd")
  set(HIP_RUNTIME "rocclr" CACHE STRING "HIP Runtime")
  set(HIP_COMPILER "clang" CACHE STRING "HIP Compiler")
elseif(HIP_PLATFORM STREQUAL "nvidia")
  set(HIP_RUNTIME "cuda" CACHE STRING "HIP Runtime")
  set(HIP_COMPILER "nvcc" CACHE STRING "HIP Compiler")
else()
  message(FATAL_ERROR "Unexpected HIP_PLATFORM: " ${HIP_PLATFORM})
endif()
message(STATUS "HIP Runtime: " ${HIP_RUNTIME})
message(STATUS "HIP Compiler: " ${HIP_COMPILER})

# Both values end up in the generated .hipInfo file.
add_to_config(_buildInfo HIP_RUNTIME)
add_to_config(_buildInfo HIP_COMPILER)
# Set default build type
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release")
endif()

if(NOT DEFINED ROCM_PATH)
  set(ROCM_PATH "/opt/rocm" CACHE STRING "Default ROCM installation directory.")
endif()
message(STATUS "ROCM Installation path(ROCM_PATH): ${ROCM_PATH}")

# Determine HIP install path
# A default prefix is only provided on UNIX; it replaces CMake's built-in
# default when the user did not choose CMAKE_INSTALL_PREFIX explicitly.
if(UNIX)
  set(HIP_DEFAULT_INSTALL_PREFIX "${ROCM_PATH}")
endif()
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
  set(CMAKE_INSTALL_PREFIX ${HIP_DEFAULT_INSTALL_PREFIX} CACHE PATH "Installation path for HIP" FORCE)
endif()

if(DEV_LOG_ENABLE MATCHES "yes")
  add_definitions(-DDEV_LOG_ENABLE)
endif()

# Set default install path as "${ROCM_PATH}", can override the path from cmake build.
set(CPACK_INSTALL_PREFIX ${HIP_DEFAULT_INSTALL_PREFIX} CACHE PATH "Package Installation path for HIP")

# Refuse to continue unless the install prefix is an absolute path.
if(NOT IS_ABSOLUTE ${CMAKE_INSTALL_PREFIX})
  message(FATAL_ERROR "Don't know where to install HIP. Please specify absolute path using -DCMAKE_INSTALL_PREFIX")
endif()
message(STATUS "HIP will be installed in: " ${CMAKE_INSTALL_PREFIX})

# set the installation path for the installer package
set(CPACK_SET_DESTDIR ON CACHE BOOL "Installer package will install hip to CMAKE_INSTALL_PREFIX instead of CPACK_PACKAGING_INSTALL_PREFIX")
if(NOT CPACK_SET_DESTDIR)
  set(CPACK_PACKAGING_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "Default installation path of hcc installer package")
endif()
#############################
# Build steps
#############################
# Absolute install locations, derived from the prefix and the GNUInstallDirs
# relative directories.
set(BIN_INSTALL_DIR     ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR})
set(LIB_INSTALL_DIR     ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR})
set(INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR})

# CMake package configuration directories, all placed under <libdir>/cmake.
set(CONFIG_PACKAGE_INSTALL_DIR      ${LIB_INSTALL_DIR}/cmake/hip)
set(CONFIG_LANG_PACKAGE_INSTALL_DIR ${LIB_INSTALL_DIR}/cmake/hip-lang)
set(CONFIG_RTC_PACKAGE_INSTALL_DIR  ${LIB_INSTALL_DIR}/cmake/hiprtc)

# Build clang hipify if enabled
if(BUILD_HIPIFY_CLANG)
  add_subdirectory(hipify-clang)
endif()
# Generate hip_version.h
# The header text is assembled here and written into the build tree; it is
# installed from there later in this file.
set(_versionInfoHeader
"// Auto-generated by cmake\n
#ifndef HIP_VERSION_H
#define HIP_VERSION_H\n
#define HIP_VERSION_MAJOR ${HIP_VERSION_MAJOR}
#define HIP_VERSION_MINOR ${HIP_VERSION_MINOR}
#define HIP_VERSION_PATCH ${HIP_VERSION_PATCH}
#define HIP_VERSION_GITHASH \"${HIP_VERSION_GITHASH}\"
#define HIP_VERSION_BUILD_ID ${HIP_VERSION_BUILD_ID}
#define HIP_VERSION_BUILD_NAME \"${HIP_VERSION_BUILD_NAME}\"
#define HIP_VERSION (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)\n
#define __HIP_HAS_GET_PCH ${_pchStatus}\n
#endif\n
")
# The content is passed quoted: an unquoted expansion is split on ';' into
# separate arguments, which file(WRITE) joins back without the separator, so
# any ';' in a value would silently vanish from the generated file.
file(WRITE "${PROJECT_BINARY_DIR}/include/hip/hip_version.h" "${_versionInfoHeader}")

# The runtime sources are only built for the rocclr runtime.
if(HIP_RUNTIME STREQUAL "rocclr")
  add_subdirectory(src)
endif()

# Generate .hipInfo
file(WRITE "${PROJECT_BINARY_DIR}/.hipInfo" "${_buildInfo}")

# Generate .hipVersion
file(WRITE "${PROJECT_BINARY_DIR}/.hipVersion" "${_versionInfo}")
# Build doxygen documentation
# Defines a `doc` target when doxygen is available. HIP_PATH is injected into
# doxygen's environment through `cmake -E env` rather than the shell-only
# `VAR=value command` form, so the target does not depend on which shell (if
# any) the generator uses to run commands. VERBATIM keeps argument escaping
# identical across platforms.
find_program(DOXYGEN_EXE doxygen)
if(DOXYGEN_EXE)
  add_custom_target(doc
    COMMAND ${CMAKE_COMMAND} -E env "HIP_PATH=${CMAKE_CURRENT_SOURCE_DIR}"
            ${DOXYGEN_EXE} ${CMAKE_CURRENT_SOURCE_DIR}/docs/doxygen-input/doxy.cfg
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/docs
    VERBATIM)
endif()
#############################
# Install steps
#############################

# Install .hipInfo
install(FILES ${PROJECT_BINARY_DIR}/.hipInfo DESTINATION ${CMAKE_INSTALL_LIBDIR})

# Install .hipVersion
install(FILES ${PROJECT_BINARY_DIR}/.hipVersion DESTINATION ${CMAKE_INSTALL_BINDIR})

# Install src, bin, include & cmake if necessary
# `test <a> -ef <b>` is run at configure time; the tree is only installed when
# its result is not 0, i.e. when the install prefix was not reported to be the
# same location as this source directory.
execute_process(
  COMMAND test ${CMAKE_INSTALL_PREFIX} -ef ${CMAKE_CURRENT_SOURCE_DIR}
  RESULT_VARIABLE INSTALL_SOURCE)

if(NOT ${INSTALL_SOURCE} EQUAL 0)
  if(WIN32)
    install(DIRECTORY ${HIP_COMMON_BIN_DIR} DESTINATION . USE_SOURCE_PERMISSIONS)

    # Debug builds also ship the .pdb/.ilk files found under the src build
    # directory, skipping the listed sub-directories.
    if (CMAKE_BUILD_TYPE STREQUAL "Debug")
      install(DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/src/"
        DESTINATION ${CMAKE_INSTALL_BINDIR}
        FILES_MATCHING
          PATTERN "*.pdb"
          PATTERN "*.ilk"
          PATTERN "CMakeFiles" EXCLUDE
          PATTERN "hip_rtc_gen" EXCLUDE
          PATTERN "libelf" EXCLUDE
          PATTERN "loader" EXCLUDE
          PATTERN "pal" EXCLUDE
          PATTERN "libamdhsacode" EXCLUDE)
    endif()

    # not required for flat folder structure
    # The following two lines will be removed after upstream updation
    install(CODE "MESSAGE(\"Removing ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}\")")
    install(CODE "file(REMOVE_RECURSE ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR})")
  else()
    # Exclude .bat files on Linux.
    #Hip bin files moved to /opt/rocm/bin and the file permission need to set properly
    install(DIRECTORY ${HIP_COMMON_BIN_DIR}
      DESTINATION .
      USE_SOURCE_PERMISSIONS
      DIRECTORY_PERMISSIONS
        OWNER_READ OWNER_WRITE OWNER_EXECUTE
        GROUP_READ GROUP_EXECUTE
        WORLD_READ WORLD_EXECUTE
      PATTERN *.bat EXCLUDE)
  endif()

  # Headers: this project's own include/ tree plus the common hip/ headers.
  install(DIRECTORY include DESTINATION .)
  install(DIRECTORY ${HIP_COMMON_INCLUDE_DIR}/hip/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hip/)

  # CMake modules from the common HIP sources.
  if(WIN32)
    install(DIRECTORY ${HIP_COMMON_DIR}/cmake DESTINATION .)
  else()
    install(DIRECTORY ${HIP_COMMON_DIR}/cmake/ DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR})
  endif()
endif()
# Install generated headers
# FIXME: Associate with individual targets.
if(HIP_PLATFORM STREQUAL "amd")
  install(FILES ${PROJECT_BINARY_DIR}/include/hip/amd_detail/hip_prof_str.h
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hip/amd_detail)
  install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/bin DESTINATION . USE_SOURCE_PERMISSIONS)
endif()

install(FILES ${PROJECT_BINARY_DIR}/include/hip/hip_version.h
  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hip)

# Install the prebuilt hipcc/hipconfig binaries when HIPCC_BIN_DIR was given.
# The expansion is quoted: HIPCC_BIN_DIR defaults to the empty string, and an
# unquoted empty expansion removes the left operand of STREQUAL entirely,
# leaving the condition to be parsed as `NOT STREQUAL ""`.
if(NOT "${HIPCC_BIN_DIR}" STREQUAL "")
  file(TO_CMAKE_PATH "${HIPCC_BIN_DIR}" HIPCC_BIN_DIR)
  set(hipcc_bin ${HIPCC_BIN_DIR}/hipcc.bin)
  set(hipconfig_bin ${HIPCC_BIN_DIR}/hipconfig.bin)
  # On Windows the binaries carry an additional .exe suffix.
  if(WIN32)
    set(hipcc_bin ${hipcc_bin}.exe)
    set(hipconfig_bin ${hipconfig_bin}.exe)
  endif()
  install(PROGRAMS ${hipcc_bin} DESTINATION bin)
  install(PROGRAMS ${hipconfig_bin} DESTINATION bin)
endif()
#############################
# hip-config
#############################
include(CMakePackageConfigHelpers)

# Expand hip-config.cmake.in into the build tree; the three *_INSTALL_DIR
# variables are passed as PATH_VARS for use inside the template.
configure_package_config_file(
  hip-config.cmake.in
  ${CMAKE_CURRENT_BINARY_DIR}/hip-config.cmake
  INSTALL_DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR}
  PATH_VARS
    LIB_INSTALL_DIR
    INCLUDE_INSTALL_DIR
    BIN_INSTALL_DIR)

# Companion version file; the package version uses the git date as its third
# component and is compatible within the same major version.
write_basic_package_version_file(
  ${CMAKE_CURRENT_BINARY_DIR}/hip-config-version.cmake
  VERSION "${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_VERSION_GITDATE}"
  COMPATIBILITY SameMajorVersion)

install(
  FILES
    ${CMAKE_CURRENT_BINARY_DIR}/hip-config.cmake
    ${CMAKE_CURRENT_BINARY_DIR}/hip-config-version.cmake
  DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR})

# Packaging invokes UNIX commands, which are not available on Windows.
if(NOT WIN32)
  add_subdirectory(packaging)
endif()
#############################
# Code formatting
#############################
# Target: clangformat
# When clang-format is found, `clangformat` reformats in place (-i, using the
# style file) every *.cpp, *.hpp and *.h file collected by the recursive glob
# at configure time.
find_program(CLANGFORMAT_EXE clang-format PATHS ${HCC_HOME}/bin)
if(CLANGFORMAT_EXE)
  file(GLOB_RECURSE FORMAT_SOURCE_FILE_LIST *.cpp *.hpp *.h)
  add_custom_target(clangformat
    COMMAND ${CLANGFORMAT_EXE} -style=file -i ${FORMAT_SOURCE_FILE_LIST}
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
#############################
# Testing steps
#############################
# HIT is not compatible with Windows
if(NOT WIN32)
  set(HIP_ROOT_DIR ${CMAKE_CURRENT_BINARY_DIR})
  set(HIP_SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR})

  # Stage headers, cmake modules and bin scripts into the build tree.
  # Every copy stores its exit status in RUN_HIT, so the value tested below is
  # that of the most recent copy that ran.
  if(HIP_PLATFORM STREQUAL "nvidia")
    execute_process(
      COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/include" "${HIP_ROOT_DIR}/include"
      RESULT_VARIABLE RUN_HIT
      ERROR_QUIET)
  endif()
  execute_process(
    COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_COMMON_INCLUDE_DIR}/hip/" "${HIP_ROOT_DIR}/include/hip/"
    RESULT_VARIABLE RUN_HIT
    ERROR_QUIET)
  execute_process(
    COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_COMMON_DIR}/cmake" "${HIP_ROOT_DIR}/cmake"
    RESULT_VARIABLE RUN_HIT
    ERROR_QUIET)
  if(${RUN_HIT} EQUAL 0)
    execute_process(
      COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_COMMON_BIN_DIR}" "${HIP_ROOT_DIR}/bin"
      RESULT_VARIABLE RUN_HIT
      ERROR_QUIET)
  endif()

  # Catch tests are no longer built from here; otherwise the HIT test
  # infrastructure is included when staging succeeded.
  if(HIP_CATCH_TEST EQUAL "1")
    message(STATUS "Building of catch tests through hipamd is no longer supported. Testing targets will not be available. catch tests have been moved to an independent github project hip-tests. Please refer to hip-tests Readme for build instructions! ")
  elseif(${RUN_HIT} EQUAL 0)
    set(CMAKE_MODULE_PATH "${HIP_ROOT_DIR}/cmake" ${CMAKE_MODULE_PATH})
    include(${HIP_COMMON_DIR}/tests/hit/HIT.cmake)
    include(${HIP_COMMON_DIR}/tests/Tests.cmake)
  else()
    message(STATUS "Testing targets will not be available. To enable them please ensure that the HIP installation directory is writeable. Use -DCMAKE_INSTALL_PREFIX to specify a suitable location")
  endif()
endif()
#############################
# Code analysis
#############################
# Target: clang
# `analyze` runs hipcc in --analyze mode over src/*.cpp from HIP_SRC_PATH.
# NOTE(review): HIP_SRC_PATH is only assigned inside the NOT WIN32 testing
# section of this file - confirm this target is not expected on Windows.
if(HIP_HIPCC_EXECUTABLE)
  add_custom_target(analyze
    COMMAND ${HIP_HIPCC_EXECUTABLE}
            -fvisibility=hidden -fvisibility-inlines-hidden
            --analyze --analyzer-outputtext
            -isystem ${ROCM_PATH}/${CMAKE_INSTALL_INCLUDEDIR}
            -Wno-unused-command-line-argument
            -I${ROCM_PATH}/${CMAKE_INSTALL_INCLUDEDIR}
            -c src/*.cpp -Iinclude/ -I./
    WORKING_DIRECTORY ${HIP_SRC_PATH})
  if(CPPCHECK_EXE)
    add_dependencies(analyze cppcheck)
  endif()
endif()

#File reorg Backward compatibility function
if(NOT WIN32 AND FILE_REORG_BACKWARD_COMPATIBILITY)
  include(hip-backward-compat.cmake)
endif()
+62
查看文件
@@ -0,0 +1,62 @@
## Prerequisites
- Install mesa-common-dev
- Either build or install [COMGR](https://github.com/RadeonOpenCompute/ROCm-CompilerSupport), [CLANG](https://github.com/RadeonOpenCompute/llvm-project) and [Device Library](https://github.com/RadeonOpenCompute/ROCm-Device-Libs)
## Branch of repository
Before getting the HIP source code, set the variable HIP_BRANCH to the desired repository branch.
For example, for ROCm5.0 release branch, set
```
export HIP_BRANCH=rocm-5.0.x
```
ROCm5.1 release branch, set
```
export HIP_BRANCH=rocm-5.1.x
```
The same format applies to future branches.
## Getting the source code
```bash
git clone -b $HIP_BRANCH https://github.com/ROCm-Developer-Tools/hipamd.git
git clone -b $HIP_BRANCH https://github.com/ROCm-Developer-Tools/hip.git
git clone -b $HIP_BRANCH https://github.com/ROCm-Developer-Tools/ROCclr.git
git clone -b $HIP_BRANCH https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime.git
```
## Set the environment variables
```bash
export HIPAMD_DIR="$(readlink -f hipamd)"
export HIP_DIR="$(readlink -f hip)"
export ROCCLR_DIR="$(readlink -f ROCclr)"
export OPENCL_DIR="$(readlink -f ROCm-OpenCL-Runtime)"
```
## Build HIPAMD
The commands to build hipamd are as follows:
```bash
cd "$HIPAMD_DIR"
mkdir -p build; cd build
cmake -DHIP_COMMON_DIR=$HIP_DIR -DAMD_OPENCL_PATH=$OPENCL_DIR -DROCCLR_PATH=$ROCCLR_DIR -DCMAKE_PREFIX_PATH="<ROCM_PATH>/" ..
make -j$(nproc)
sudo make install
```
Please note that HIP_COMMON_DIR must point to the hip common ([HIP](https://github.com/ROCm-Developer-Tools/HIP/)) source code.
By default, the release version of hipamd is built, and hip is installed to the default path <ROCM_PATH> (/opt/rocm unless ROCM_PATH is overridden).
Developers can use the cmake option CMAKE_INSTALL_PREFIX to define the path where hip is expected to be installed. The commands to build are as follows:
```bash
cd "$HIPAMD_DIR"
mkdir -p build; cd build
cmake -DHIP_COMMON_DIR=$HIP_DIR -DAMD_OPENCL_PATH=$OPENCL_DIR -DROCCLR_PATH=$ROCCLR_DIR -DCMAKE_PREFIX_PATH="<ROCM_PATH>/" -DCMAKE_INSTALL_PREFIX=$PWD/install ..
make -j$(nproc)
sudo make install
```
After installation, make sure HIP_PATH is pointed to the path where hip is installed.
+20
查看文件
@@ -0,0 +1,20 @@
Copyright (c) 2008 - 2022 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
+24
查看文件
@@ -0,0 +1,24 @@
## What is this repository for? ###
This repository provides [HIP](https://github.com/ROCm-Developer-Tools/HIP) implementation specifically for AMD platform.
## DISCLAIMER
The information presented in this document is for informational purposes only and may contain technical inaccuracies, omissions, and typographical errors. The information contained herein is subject to change and may be rendered inaccurate for many reasons, including but not limited to product and roadmap changes, component and motherboard version changes, new model and/or product releases, product differences between differing manufacturers, software changes, BIOS flashes, firmware upgrades, or the like. Any computer system has risks of security vulnerabilities that cannot be completely prevented or mitigated. AMD assumes no obligation to update or otherwise correct or revise this information. However, AMD reserves the right to revise this information and to make changes from time to time to the content hereof without obligation of AMD to notify any person of such revisions or changes. THIS INFORMATION IS PROVIDED “AS IS.” AMD MAKES NO REPRESENTATIONS OR WARRANTIES WITH RESPECT TO THE CONTENTS HEREOF AND ASSUMES NO RESPONSIBILITY FOR ANY INACCURACIES, ERRORS, OR OMISSIONS THAT MAY APPEAR IN THIS INFORMATION. AMD SPECIFICALLY DISCLAIMS ANY IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR ANY PARTICULAR PURPOSE. IN NO EVENT WILL AMD BE LIABLE TO ANY PERSON FOR ANY RELIANCE, DIRECT, INDIRECT, SPECIAL, OR OTHER CONSEQUENTIAL DAMAGES ARISING FROM THE USE OF ANY INFORMATION CONTAINED HEREIN, EVEN IF AMD IS EXPRESSLY ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies.
© 2021 Advanced Micro Devices, Inc. All Rights Reserved.
## Repository branches:
The hipamd repository maintains several branches. The branches that are of importance are:
* Main branch: This is the stable branch. It is up to date with the latest release branch, for example, if the latest HIP release is rocm-4.4, main branch will be the repository based on this release.
* Develop branch: This is the default branch, on which new features are still under development and visible. While this may be of interest to many, it should be noted that this branch and the features under development might not be stable.
* Release branches. These are branches corresponding to each ROCM release, listed with release tags, such as rocm-4.4, etc.
## Release tagging:
hipamd releases typically follow a naming convention tied to each ROCM release to help differentiate them.
* rocm x.yy: These are the stable releases based on the ROCM release.
This type of release is typically made once a month.
+265
查看文件
@@ -0,0 +1,265 @@
#!/bin/bash
#| Usage: roc-obj [-h] [-t REGEXP] [-o OUTDIR] [-I REPLACE-STRING|-i] [-d]
#| EXECUTABLE... [: [SUFFIX COMMAND [ARGS...] ;]...]
#|
#| Wrapper for roc-obj-ls and roc-obj-extract which extracts code objects
#| embedded in each EXECUTABLE and optionally applies COMMANDs to them.
#|
#| If the POSIX extended regular expression REGEXP is specified, only embedded
#| code objects whose Target ID matches REGEXP are extracted; otherwise all
#| code objects are extracted.
#|
#| If the directory path OUTDIR is specified, it is created if it does not
#| already exist, and the code objects are extracted into it; otherwise they
#| are extracted into the current working directory.
#|
#| The extracted files are named by appending a ":" followed by the Target ID
#| of the extracted code object to the input filename EXECUTABLE they were
#| extracted from.
#|
#| If the list of EXECUTABLE arguments is terminated with ":" then after all
#| selected files are successfully extracted, zero or more additional embedded
#| command-lines, separated by ";", are read from the command-line starting
#| after the ":". These must specify a SUFFIX used to name the output of the
#| corresponding COMMAND, along with the COMMAND name and any ARGS to it.
#|
#| Then each COMMAND is executed, as if by a POSIX "execvp" function, once for
#| each embedded code object that was created in OUTDIR. (Note: Typically this
#| means the user must ensure the commands are present in at least one
#| directory of the "PATH" environment variable.) For each execution of
#| COMMAND:
#|
#| If REPLACE-STRING is specified, all instances of REPLACE-STRING in ARGS are
#| replaced with the file path of the extracted code object before executing
#| COMMAND.
#|
#| The standard input is redirected from the extracted code object.
#|
#| If SUFFIX is "-" the standard output is not redirected. If SUFFIX is "!" the
#| standard output is redirected to /dev/null. Otherwise, the standard output
#| is redirected to files named by the file path of the extracted code object
#| with SUFFIX appended.
#|
#| Note: The executables roc-obj-ls, roc-obj-extract, and llvm-objdump (in the
#| case of disassembly requested using the -d flag) are searched for in a
#| unique way. A series of directories are searched, some conditionally, until
#| a suitable executable is found. If all directories are searched without
#| finding the executable, an error occurs. The first directory searched is the
#| one containing the hard-link to the roc-obj being executed, known as the
#| "base directory". Next, if the environment variable HIP_CLANG_PATH is set,
#| it is searched; otherwise, the base directory path is appended with
#| "../../llvm/bin" and it is searched. Finally, the PATH is searched as if by
#| a POSIX "execvp" function.
#|
#| Option Descriptions:
#| -h, --help print this help text and exit
#| -t, --target-id only extract code objects from EXECUTABLE whose Target ID
#| matches the POSIX extended regular expression REGEXP
#| -o, --outdir set the output directory, which is created if it
#| does not exist
#| -I, --replace-string replace all occurrences of the literal string
#| REPLACE-STRING in ARGS with the input filename
#| -i, --replace equivalent to -I{}
#| -d, --disassemble disassemble extracted code objects; equivalent to
#| : .s llvm-objdump -d - ;
#|
#| Example Usage:
#|
#| Extract all code objects embedded in a.so:
#| $ roc-obj a.so
#|
#| Extract all code objects embedded in a.so, b.so, and c.so:
#| $ roc-obj a.so b.so c.so
#|
#| Extract all code objects embedded in a.so with "gfx9" in their Target ID:
#| $ roc-obj -t gfx9 a.so
#|
#| Extract all code objects embedded in a.so into output/ (creating it if needed):
#| $ roc-obj -o output/ a.so
#|
#| Extract all code objects embedded in a.so with "gfx9" in their Target ID
#| into output/ (creating it if needed):
#| $ roc-obj -t gfx9 -o output/ a.so
#|
#| Extract all code objects embedded in a.so, and then disassemble each of them
#| to files ending with .s:
#| $ roc-obj -d a.so
#|
#| Extract all code objects embedded in a.so, and count the number of bytes in
#| each, writing the results to files ending with .count:
#| $ roc-obj a.so : .count wc -c
#|
#| Extract all code objects embedded in a.so, and inspect their ELF headers
#| using llvm-readelf (which will not read from standard input), writing to
#| files ending with .hdr:
#| $ roc-obj -I'{}' a.so : .hdr llvm-readelf -h '{}'
#|
#| Extract all code objects embedded in a.so, and then extract each of their
#| .text sections using llvm-objcopy (which won't read from standard input
#| or write to standard output):
#| $ roc-obj -I'{}' a.so : ! llvm-objcopy -O binary --only-section=.text '{}' '{}.text'
#|
#| Extract all code objects embedded in a.so, b.so, and c.so with target
#| feature xnack disabled into directory out/. Then, for each:
#| Write the size in bytes into a file ending with .count, and
#| Write a textual description of the ELF headers to a file ending with .hdr, and
#| Extract the .text section to a file ending with .text
#| $ roc-obj -I'{}' -t xnack- -o out/ a.so b.so c.so : \
#| .count wc -c \;
#| .hdr llvm-readelf -h '{}' \;
#| ! llvm-objcopy -O binary --only-section=.text '{}' '{}.text'
# Strict mode: exit on an unhandled command failure (-e), treat expansion of
# an unset variable as an error (-u), and make a pipeline fail when any of its
# stages fails (-o pipefail).
set -euo pipefail
# Print the help text to standard output.
#
# The help text is embedded in this script: every line that begins with "#|"
# is emitted with that marker, and at most one space following it, removed.
usage() {
  local -r help_line_filter='s/^#[|] ?(.*)$/\1/p'
  sed -n -E "$help_line_filter" "$0"
}
# Print the help text and terminate the script with the given exit status.
#
# $1: exit status. A non-zero status sends the help text to standard error,
#     a zero status sends it to standard output.
usage_then_exit() {
  local -r status="$1"; shift
  if (( status )); then
    usage >&2
  else
    usage >&1
  fi
  exit "$status"
}
# Report a fatal error and terminate the script with exit status 1.
#
# All arguments are joined with spaces and written to standard error after an
# "error: " prefix.
fail() {
  local -r message="$*"
  printf 'error: %s\n' "$message" 1>&2
  exit 1
}
# Locate one of the helper executables this script depends on.
#
# ROCm tools are not necessarily on the PATH and there is no single, unified
# ROCm "bin/" directory, so a fixed list of directories is probed first:
#   1. BASE_DIR
#   2. HIP_CLANG_PATH when it is set and non-empty, otherwise
#      BASE_DIR/../../llvm/bin
# and only then is the PATH consulted.
#
# This lookup is only used for roc-obj-ls, roc-obj-extract, and "shortcut"
# options like -d; the user can still run any copy of llvm-* by invoking it
# explicitly with a full path, e.g. : /path/to/llvm-* ... ;
#
# $1: name of the executable to find.
# Prints the path (or the bare name when it was found via PATH) without a
# trailing newline; calls fail when the executable cannot be found.
find_rocm_executable_or_fail() {
  local -r command="$1"; shift
  local file
  local searched=()
  local -r second_dir="${HIP_CLANG_PATH:-"$BASE_DIR/../../llvm/bin"}"
  for dir in "$BASE_DIR" "$second_dir"; do
    file="$dir/$command"
    if [[ ! -x $file ]]; then
      # Remember the miss so the error message can list every directory tried.
      searched+=("$dir")
      continue
    fi
    printf "%s" "$file"
    return
  done
  if ! hash "$command" 2>/dev/null; then
    fail could not find "$command" in "${searched[*]}" or PATH
  fi
  printf "%s" "$command"
}
# Extract the embedded code objects of the executable file given as the first
# argument into OPT_OUTDIR, filtering them via OPT_TARGET_ID.
#
# roc-obj-ls is expected to print one record per code object made of three
# whitespace-separated fields; the first two are joined with "." to name the
# output file and the third is the URI handed to roc-obj-extract. The filter
# OPT_TARGET_ID is matched against the second field.
#
# Deletes any resulting files which are empty, and prints the paths of the
# remaining files, one per line.
extract() {
  local -r executable="$1"; shift
  local prefix
  prefix="$(basename -- "$executable")"
  # We want the shell to split the result of roc-obj-ls on whitespace, as
  # neither the Target ID nor the URI can have embedded spaces.
  # shellcheck disable=SC2046
  set -- $("$ROC_OBJ_LS" -- "$executable" | awk "\$2~/$OPT_TARGET_ID/")
  # Validate the record shape up front. Checking after the loop can never
  # fire, because the loop only ends once every field has been consumed.
  if (( $# % 3 )); then
    fail expected three fields per code object from roc-obj-ls
  fi
  local output uri
  while (( $# )); do
    output="$prefix:$1.$2"
    uri="$3"
    shift 3
    [[ -n $OPT_OUTDIR ]] && output="$OPT_OUTDIR/$output"
    "$ROC_OBJ_EXTRACT" -o - -- "$uri" >"$output"
    if [[ -s $output ]]; then
      printf '%s\n' "$output"
    else
      # "--" keeps a file name that starts with "-" from being read as an
      # option.
      rm -- "$output"
    fi
  done
}
# Run a command over a list of inputs, naming output files with the supplied
# suffix and applying OPT_REPLACE_STRING if needed.
#
# Arguments are of the form:
# $suffix $command $args... ; $inputs
#
# For every input the command is run with its standard input redirected from
# that input. Standard output goes to:
#   suffix "-"     -> the caller's standard output
#   suffix "!"     -> /dev/null
#   anything else  -> a file named "<input><suffix>"
#
# Every occurrence of OPT_REPLACE_STRING inside each argument is replaced by
# the input path.
run_command() {
  local -r suffix="$1"; shift
  local -r command="$1"; shift
  local args=()
  while (( $# )); do
    local arg="$1"; shift
    [[ $arg == ';' ]] && break
    args+=("$arg")
  done
  local inputs=("$@")
  # Keep the loop variables out of the global scope.
  local input output
  for input in "${inputs[@]}"; do
    case "$suffix" in
      '-') output=/dev/stdout;;
      '!') output=/dev/null;;
      *) output="$input$suffix";;
    esac
    # The pattern is quoted so the replace string is matched literally, as the
    # help text promises, instead of being interpreted as a glob pattern.
    "$command" "${args[@]//"$OPT_REPLACE_STRING"/$input}" <"$input" >"$output"
  done
}
# Entry point, called with the non-option command-line arguments:
#   EXECUTABLE... [: [SUFFIX COMMAND [ARGS...] ;]...]
#
# 1. Creates OPT_OUTDIR when one was requested.
# 2. Extracts the code objects of every EXECUTABLE listed before the first ":"
#    and collects the paths of the extracted files.
# 3. Runs each ";"-separated command that follows the ":" over all extracted
#    files.
# 4. Disassembles the extracted files when OPT_DISASSEMBLE is set.
main() {
  if [[ -n $OPT_OUTDIR ]]; then
    mkdir -p "$OPT_OUTDIR"
  fi
  local inputs=()
  while (( $# )); do
    local executable="$1"; shift
    [[ $executable == : ]] && break
    # Append the file paths extracted from $executable to $inputs
    readarray -t -O "${#inputs[@]}" inputs < <(extract "$executable")
  done
  (( ${#inputs[@]} )) || fail no executables specified
  while (( $# )); do
    local suffix="$1"; shift
    local command="$1"; shift
    local args=()
    while (( $# )); do
      local arg="$1"; shift
      [[ $arg == \; ]] && break
      args+=("$arg")
    done
    run_command "$suffix" "$command" "${args[@]}" \; "${inputs[@]}"
  done
  # An "if" is used rather than "(( ... )) && ..." because this is the last
  # command of the function: the short-circuit form would make main, and
  # therefore the whole script, finish with status 1 whenever -d was not given.
  if (( OPT_DISASSEMBLE )); then
    run_command .s "$OBJDUMP" -d - \; "${inputs[@]}"
  fi
}
# Option defaults; overwritten by the command-line parsing below and frozen
# afterwards.
OPT_TARGET_ID=''
OPT_OUTDIR=''
OPT_REPLACE_STRING=''
OPT_DISASSEMBLE=0
# "getopt -T" succeeds only for the legacy getopt, so a zero status here means
# the enhanced (util-linux) implementation is missing.
! getopt -T || fail util-linux enhanced getopt required
# The long option names must match both the help text and the case statement
# below: --replace-string takes an argument, --replace does not.
getopt="$(getopt -o +ht:o:I:id \
  --long help,target-id:,outdir:,replace-string:,replace,disassemble \
  -n roc-obj -- "$@")"
# Replace the positional parameters with getopt's normalized, quoted output.
eval set -- "$getopt"
unset getopt
while true; do
  case "$1" in
    -h | --help) usage_then_exit 0;;
    # Escape "/" so the expression can be embedded between awk's regex
    # delimiters.
    -t | --target-id) OPT_TARGET_ID="${2//\//\\\/}"; shift 2;;
    -o | --outdir) OPT_OUTDIR="$2"; shift 2;;
    -I | --replace-string) OPT_REPLACE_STRING="$2"; shift 2;;
    -i | --replace) OPT_REPLACE_STRING='{}'; shift;;
    -d | --disassemble) OPT_DISASSEMBLE=1; shift;;
    --) shift; break;;
    *) usage_then_exit 1;;
  esac
done
readonly -- OPT_TARGET_ID OPT_OUTDIR OPT_REPLACE_STRING OPT_DISASSEMBLE
# BASE_DIR is the directory holding this script once symbolic links are
# resolved. The expected install location is ROCM_PATH/hip/bin/roc-obj, which
# makes BASE_DIR equal to ROCM_PATH/hip/bin.
BASE_DIR="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"
# llvm-objdump is only needed, and therefore only looked up, for -d.
if (( OPT_DISASSEMBLE )); then
  OBJDUMP="$(find_rocm_executable_or_fail llvm-objdump)"
fi
ROC_OBJ_LS="$(find_rocm_executable_or_fail roc-obj-ls)"
ROC_OBJ_EXTRACT="$(find_rocm_executable_or_fail roc-obj-extract)"
readonly -- BASE_DIR OBJDUMP ROC_OBJ_LS ROC_OBJ_EXTRACT
# Hand the remaining (non-option) arguments to the entry point.
main "$@"
+244
查看文件
@@ -0,0 +1,244 @@
#!/usr/bin/perl
# Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
use strict;
use File::Copy;
use File::Spec;
use File::Basename;
use File::Which;
use Cwd 'realpath';
use Getopt::Std;
use List::Util qw(max);
use URI::Encode;
# File-level state shared by the option handling and the per-URI loop below.

# Text after the "#" or "?" of the URI being processed (e.g. "offset=1&size=2").
my $extract_range_specifier;
# Process id taken from a "memory://" URI.
my $extract_pid;
# File to read from: the path of a "file://" URI, or /proc/<pid>/mem.
my $extract_file;
# Base name (including directory, if any) of the code object file to write.
my $output_file;
# Value of the -o option.
my $output_path;
# Offset and size of the region to extract, taken from the range specifier.
my $extract_offset;
my $extract_size;
# Result of probing the process of a "memory://" URI with signal 0.
my $pid_running;
# Set by -v: print progress messages to STDOUT.
my $verbose=0;
# Number of errors seen so far; used as the exit status.
my $error=0;
# Set when "-o -" was given: the code object goes to STDOUT.
my $output_to_stdout=0;
# Print the help text (usage line, options, URI grammar, examples and notes)
# to STDOUT, then exit with the current error count as the exit status.
sub usage {
  my @help_text = (
    "Usage: $0 [-o|v|h] URI... \n",
    " URIs can be read from STDIN, one per line.\n",
    " From the URIs specified, extracts code objects into files named: ",
    "<executable_name>-[pid<number>]-offset<number>-size<number>.co\n\n",
    "Options:\n",
    " -o <path> \tPath for output. If \"-\" specified, code object is printed to STDOUT.\n",
    " -v \tVerbose output to STDOUT.\n",
    " -h \tShow this help message.\n",
    "\nURI syntax:\n",
    "\tcode_object_uri ::== file_uri | memory_uri\n",
    "\tfile_uri ::== \"file://\" extract_file [ range_specifier ]\n",
    "\tmemory_uri ::== \"memory://\" process_id range_specifier\n",
    "\trange_specifier ::== range_delimiter range_attribute [\"&\" range_attribute]\n",
    "\trange_delimiter ::== \"#\" | \"?\"\n",
    "\trange_attribute ::== [\"offset=\" number | \"size=\" number ]\n",
    "\textract_file ::== URI_ENCODED_OS_FILE_PATH\n",
    "\tprocess_id ::== DECIMAL_NUMBER\n",
    "\tnumber ::== HEX_NUMBER \| DECIMAL_NUMBER \| OCTAL_NUMBER\n\n",
    "\tExample: file://dir1/dir2/hello_world#offset=133&size=14472 \n",
    "\t memory://1234#offset=0x20000&size=3000\n\n",
    " NOTES:\n\n",
    "\tWhen specifying a URI in a shell command you will need to escape the \'&\' character in the range_specifier.\n",
    "\tIf \"size=\" is not specified, the default is the remainder of the file from the given offset.\n\n",
  );
  print(@help_text);
  exit($error);
}
# Parse the command-line switches: -v (verbose), -h (help) and -o <path>.
my %options=();
getopts('vho:', \%options);
usage() if (defined $options{h});
$verbose = 1 if (defined $options{v});
if (defined $options{o}) {
  $output_path = $options{o};
  if ($output_path ne "-") {
    # Anything other than "-" has to be an existing directory.
    (-d $output_path) || die("Error: Path \'$output_path\' cannot be found.\n");
  } else {
    $output_to_stdout=1;
  }
}
# Read URIs from STDIN only when none were given as arguments and STDIN is not
# a terminal -- otherwise the caller's stdin would be consumed by accident.
if (!@ARGV && !(-t STDIN)) {
  push @ARGV, <STDIN>;
}
# Without any URI at this point there is nothing to do: report and show help.
if (!@ARGV) {
  print(STDERR "Error: No arguments.\n"); $error++;
  usage();
}
# The extraction itself is delegated to dd, so it must be installed.
my $dd_cmd = which("dd");
die("Error: Can't find dd command\n") unless (-f $dd_cmd);
# Convert a range attribute value into the number used for arithmetic and
# handed to dd. Hexadecimal values ("0x...") are converted to decimal, because
# Perl numifies such strings to 0 and GNU dd reads "0x" as a zero multiplier.
# Any other value is returned unchanged; a missing or empty value yields 0.
sub range_value_to_number {
  my ($text) = @_;
  return 0 if (!defined($text) || $text eq "");
  return hex($text) if ($text =~ /^0[xX][0-9a-fA-F]+$/);
  return $text;
}

# Process every URI: validate it, work out the region to extract and run dd.
# Errors are counted in $error and processing continues with the next URI.
foreach my $uri_str(@ARGV) {
  chomp $uri_str;

  # Reset the per-URI state so nothing leaks over from the previous URI (for
  # example a memory URI followed by a file URI without a range specifier).
  $extract_range_specifier = undef;
  $extract_pid = undef;
  $extract_file = undef;
  $output_file = undef;
  $extract_offset = 0;
  $extract_size = 0;
  $pid_running = 0;

  # Split at the first "://" only, keeping the remainder intact.
  my ($uri_protocol, $specs) = split(/:\/\//, $uri_str, 2);
  my $obj_uri_encode = URI::Encode->new();
  my $decoded_extract_file;
  my $is_memory_uri = 0;

  if (lc($uri_protocol) eq "file") {
    # expect file path, optionally followed by "#" or "?" and a range specifier.
    ($extract_file, $extract_range_specifier) = split(/[#?]/, $specs, 2);

    # decode the file name. URIs may have file/path names with non-alphanumeric
    # characters, which will be encoded with %. We need to decode these.
    $decoded_extract_file = $obj_uri_encode->decode($extract_file);

    # verify file exists:
    if (! -e $decoded_extract_file) {
      print(STDERR "Error: can't find file: $decoded_extract_file\n"); $error++;
      next;
    }

    # use the output_path if specified, otherwise use current working dir.
    if (defined($output_path) && $output_path ne "") {
      $output_file = File::Spec->catfile($output_path, basename($decoded_extract_file));
    } else {
      $output_file = basename($decoded_extract_file);
    }
  } elsif (lc($uri_protocol) eq "memory") {
    $is_memory_uri = 1;

    # expect memory specifier: a process id followed by a range specifier.
    ($extract_pid, $extract_range_specifier) = split(/[#?]/, $specs, 2);

    # verify pid is currently running (signal 0 only probes the process).
    $pid_running = kill 0, $extract_pid;
    if (! $pid_running) {
      print(STDERR "Error: PID: $extract_pid is NOT running\n"); $error++;
      next;
    }

    # get pid filename:
    $extract_file = "/proc/$extract_pid/mem";

    # verify file exists:
    if (! -e $extract_file) {
      print(STDERR "Error: can't find file: $extract_file\n"); $error++;
      next;
    }

    # for extracting from a pid, make the output file in the current dir/path
    # with the pid value as a name.
    $output_file = "pid${extract_pid}";

    # the decoded name is what the remaining checks and dd operate on.
    $decoded_extract_file = $extract_file;
  } else {
    # error, unrecognized Code Object URI
    print(STDERR "Error: \'$uri_protocol\' is not recognized as a supported code object URI.\n"); $error++;
    next;
  }

  # it is valid to not give a range specifier in a file URI, in which case the
  # entire file will be extracted.
  my $size_specified = 0;
  if (defined($extract_range_specifier) && $extract_range_specifier ne "") {
    foreach my $attribute (split(/[&]/, $extract_range_specifier)) {
      my ($str, $value) = split(/=/, $attribute);
      if ($str eq "size") {
        $extract_size = $value;
        $size_specified = 1;
      } elsif ($str eq "offset") {
        $extract_offset = $value;
      }
    }
  }

  if (!$size_specified) {
    # A memory URI has no meaningful "rest of file", so it needs an explicit
    # size (and therefore a range specifier).
    if ($is_memory_uri) {
      print(STDERR "Error: must specify a Range Specifier (offset and size) for a memory URI: $uri_str\n"); $error++;
      next;
    }
    # "size" not specified. default to rest of file (total size - offset)
    $extract_size = (-s $decoded_extract_file) - range_value_to_number($extract_offset);
  }

  my $offset_num = range_value_to_number($extract_offset);
  my $size_num = range_value_to_number($extract_size);

  # We should have at least a valid size to extract; ignore cases with size=0.
  if ($size_num != 0) {
    print("Reading input file \"$extract_file\" ...\n") if ($verbose);

    # only if this is a File URI.
    if (!$is_memory_uri) {
      # verify that offset+size does not exceed file size (a negative size
      # means the offset alone is already past the end of the file):
      my $file_size = -s $decoded_extract_file;
      my $end = int($offset_num) + int($size_num);
      if ( $size_num < 0 || $end > $file_size ) {
        print(STDERR "Error: requested offset($extract_offset) + size($extract_size) exceeds file size($file_size) for file \"$decoded_extract_file\".\n"); $error++;
        next;
      }
    }

    # make sure the input can be opened for reading before running dd; the
    # handle itself is not needed, so it is closed right away.
    open(my $input_fp, "<", $decoded_extract_file) || die $!;
    close($input_fp);

    # extract the code object. dd is started without a shell (list form), so
    # file names need no quoting. Without an "of=" operand dd writes to STDOUT.
    my @dd_args = ("if=$decoded_extract_file");
    if (!$output_to_stdout) {
      push(@dd_args, "of=${output_file}-offset${extract_offset}-size${extract_size}.co");
    }
    push(@dd_args, "skip=$offset_num", "count=$size_num", "bs=1", "status=none");

    my $dd_cmd_str = join(" ", $dd_cmd, @dd_args);
    print("DD Command: $dd_cmd_str\n") if ($verbose);

    my $dd_ret = system($dd_cmd, @dd_args);
    if ($dd_ret != 0) {
      print(STDERR "Error: DD command ($dd_cmd_str) failed with RC: $dd_ret\n"); $error++;
    }

    print("Extract request: file: $extract_file offset: $extract_offset size: $extract_size\n") if ($verbose);
  } else {
    print("Warning: trying to extract from $extract_file at offset=$extract_offset with size=0. Nothing to extract.\n") if ($verbose);
  }
} # end of for each (URI) argument

# The exit status is the number of errors, capped so that a multiple of 256
# errors cannot wrap around to a "success" status of 0.
exit($error > 255 ? 255 : $error);
+192
查看文件
@@ -0,0 +1,192 @@
#!/usr/bin/perl
# Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
use strict;
use File::Copy;
use File::Spec;
use File::Basename;
use File::Which;
use Cwd 'realpath';
use Getopt::Std;
use List::Util qw(max);
use URI::Encode;
# Print the help text to STDOUT and exit with status 0.
# All three switches accepted by the option parser (-v, -d, -h) are listed.
sub usage {
  print("Usage: $0 [-v|d|h] executable...\n");
  print("List the URIs of the code objects embedded in the specified host executables.\n");
  print("-v \tVerbose output (includes Entry ID)\n");
  print("-d \tDebug output\n");
  print("-h \tShow this help message\n");
  exit;
}
# Read one qword (8 bytes) from a file handle and decode it as an unsigned
# 64-bit little-endian integer.
# 1st arg: file handle to read from.
# 2nd arg: reference to the scalar that receives the decoded value.
# Dies when fewer than 8 bytes could be read.
sub readq {
  my $handle = shift;
  my $dest_ref = shift;
  my $raw;
  my $got = read($handle, $raw, 8);
  ($got == 8) or die("Error: Failed to read 8 bytes\n");
  $$dest_ref = unpack("Q<", $raw);
}
# Round an address up to the next multiple of an alignment; an address that is
# already a multiple is returned unchanged.
# 1st arg: address to move.
# 2nd arg: alignment requirement/boundary.
# Returns the aligned address.
sub align_up {
  my ($address, $alignment) = @_;
  my $blocks = int(($address + ($alignment - 1)) / $alignment);
  return $blocks * $alignment;
}
# Parse the command-line switches: -v (verbose), -h (help) and -d (debug).
my %options=();
getopts('vhd', \%options);
usage() if (defined $options{h});
my ($verbose, $debug) = @options{qw(v d)};
# Running number of the bundle being listed; starts at 1.
my $num_bundles = 1;
# Boundary that the start of each following bundle is rounded up to.
my $bundle_alignment = 4096;
# objdump is used to find the bundle section, so it must be installed.
my $objdump = which("objdump");
die("Error: Can't find objdump command\n") unless (-f $objdump);
# for each argument (which should be an executable): find its ".hip_fatbin"
# section with objdump, walk the offload bundles stored in it and print one
# line per bundle entry (code object) with the bundle number, the entry ID and
# a "file://" URI carrying the absolute offset and size of the entry.
foreach my $executable_file(@ARGV) {
  # debug message
  print("Reading input file \"$executable_file\" ...\n") if ($debug);

  # verify/open file specified.
  open (INPUT_FP, "<", $executable_file) || die("Error: failed to open file: $executable_file\n");
  binmode INPUT_FP;

  # kernel section information: in the "objdump -h" listing, column 3 holds
  # the section size and column 6 the file offset, both in hex.
  my $escaped_name=quotemeta($executable_file);
  my $bundle_section_name = ".hip_fatbin";
  my $bundle_section_size = hex(`$objdump -h $escaped_name | grep $bundle_section_name | awk '{print \$3}'`);
  my $bundle_section_offset = hex(`$objdump -h $escaped_name | grep $bundle_section_name | awk '{print \$6}'`);
  $bundle_section_size or die("Error: No kernel section found\n");
  my $bundle_section_end = $bundle_section_offset + $bundle_section_size;

  if ($debug) {
    printf("Code Objects Bundle section size: %x\n",$bundle_section_size);
    printf("Code Objects Bundle section offset: %x\n",$bundle_section_offset);
    printf("Code Objects Bundle section end: %x\n\n",$bundle_section_end);
  }

  my $current_bundle_offset = $bundle_section_offset;
  printf("Current Bundle offset: 0x%X\n",$current_bundle_offset) if ($debug);

  # move fp to current_bundle_offset.
  seek(INPUT_FP, $current_bundle_offset, 0);

  while ($current_bundle_offset < $bundle_section_end) {
    # skip OFFLOAD_BUNDLER_MAGIC_STR
    my $magic_str;
    my $read_bytes = read(INPUT_FP, $magic_str, 24);
    if (($read_bytes != 24) || ($magic_str ne "__CLANG_OFFLOAD_BUNDLE__")) {
      print(STDERR "Error: Offload bundle magic string not detected\n") if ($debug);
      last;
    }

    # read number of bundle entries, which are code objects.
    my $num_codeobjects;
    readq(\*INPUT_FP,\$num_codeobjects);

    # end of the last entry seen so far; stays at the bundle start when the
    # bundle has no entries.
    my $end_of_current_bundle = $current_bundle_offset;

    # Column Header.
    printf("%-8s%-40s%35s\n","Bundle#","Entry ID:","URI:") if ($verbose);

    # for each Bundle entry (code object) ....
    for (my $iter = 0; $iter < $num_codeobjects; $iter++) {
      print("\nEntry #$iter\n") if $debug;

      # read bundle entry (code object) offset
      my $entry_offset;
      readq(\*INPUT_FP,\$entry_offset);
      printf("entry_offset: 0x%X\n",$entry_offset) if $debug;

      # read bundle entry (code object) size
      my $entry_size;
      readq(\*INPUT_FP,\$entry_size);
      printf("entry_size: 0x%X\n",$entry_size) if $debug;

      # read triple size
      my $triple_size;
      readq(\*INPUT_FP,\$triple_size);
      printf("triple_size: 0x%X\n",$triple_size) if $debug;

      # read triple string
      my $triple;
      my $triple_bytes = read(INPUT_FP, $triple, $triple_size);
      $triple_bytes == $triple_size or die("Error: Fail to parse triple\n");
      print("triple: $triple\n") if $debug;

      # the bundle entry's offset is relative to the start of its bundle, so
      # add the bundle's own offset to get a position in the file.
      my $abs_offset = int($current_bundle_offset + $entry_offset);

      # and we need to keep track of where we are in the current bundle.
      $end_of_current_bundle = int($abs_offset + $entry_size);

      printf("abs_offset: 0x%X\n",$abs_offset) if $debug;

      my $obj_uri_encode = URI::Encode->new();
      my $encoded_executable_file = $obj_uri_encode->encode($executable_file);

      printf("%-8s%-40s%35s%s%s%s%s%s%s\n",$num_bundles,$triple,"file:\/\/",$encoded_executable_file,"\#offset=",$abs_offset, "\&size=",$entry_size);

      printf("end_of_current_bundle: 0x%X\n",$end_of_current_bundle) if $debug;
      # The encoded name is passed as an argument rather than interpolated
      # into the format, because URI encoding introduces "%" characters.
      printf("Hex values: file:\/\/%s#offset=0x%X\&size=0x%X\n", $encoded_executable_file, $abs_offset, $entry_size) if $debug;
    } # End of for each Bundle entry (code object) ...

    printf("\n") if ($verbose);

    # we've finished listing this current bundle ...
    printf("current_bundle_offset: %x \n",$current_bundle_offset) if ($debug);
    printf("bundle_section_end: %x \n", $bundle_section_end) if ($debug);

    # the next bundle starts at the next alignment boundary.
    my $next_bundle_offset = align_up($end_of_current_bundle,$bundle_alignment);

    # A bundle without entries that sits on an alignment boundary would yield
    # the same offset again; stop instead of re-reading it forever.
    if ($next_bundle_offset <= $current_bundle_offset) {
      print(STDERR "Error: Offload bundle does not advance the bundle offset\n") if ($debug);
      last;
    }
    $current_bundle_offset = $next_bundle_offset;
    printf("Adjusting for alignment of next bundle: current_bundle_offset: %x \n\n\n", $current_bundle_offset) if ($debug);

    # seek to the start of the next bundle:
    seek(INPUT_FP, $current_bundle_offset, 0);

    # increment the number of bundles listed.
    $num_bundles = $num_bundles+1;
  } # End of while loop

  # done with this executable.
  close(INPUT_FP);
} # End of for each command line argument

exit(0);
@@ -0,0 +1,39 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* Template for a deprecated wrapper header. The placeholders written between
   at-signs are presumably filled in by the build system when the wrapper is
   generated -- TODO confirm against the build scripts. */
/* Include guard; its name is supplied per generated header. */
#ifndef @include_guard@
#define @include_guard@

/* Emit a deprecation notice at compile time: a warning directive when
   __GNUC__ is defined, a message pragma otherwise. */
#if defined(__GNUC__)
#warning "This file is deprecated. Use header files from HIP_PATH/include, where HIP_PATH corresponds to HIP install path"
#else
#pragma message("This file is deprecated. Use header files from HIP_PATH/include, where HIP_PATH corresponds to HIP install path")
#endif

/* Placeholders, in order: the include statement(s), then an optional opening
   check, file contents and closing directive. */
@include_statements@

@hashzero_check@
@file_contents@
@hash_endif@

#endif
@@ -0,0 +1,261 @@
# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
cmake_minimum_required(VERSION 3.16.8)

# Root of the build tree for this directory.
set(HIP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR})
# Everything generated by this file (wrapper headers and symlinks) is placed
# below wrapper_dir inside the build tree.
set(HIP_WRAPPER_DIR ${HIP_BUILD_DIR}/wrapper_dir)
set(HIP_WRAPPER_INC_DIR ${HIP_WRAPPER_DIR}/include/hip)
set(HIP_WRAPPER_BIN_DIR ${HIP_WRAPPER_DIR}/bin)
set(HIP_WRAPPER_LIB_DIR ${HIP_WRAPPER_DIR}/lib)
set(HIP_WRAPPER_CMAKE_DIR ${HIP_WRAPPER_DIR}/cmake)
set(HIP_WRAPPER_FINDHIP_DIR ${HIP_WRAPPER_DIR}/FindHIP)
# Source-tree locations. HIP_SRC_PATH is not set in this file; it is expected
# to be provided by whoever includes it.
set(HIP_SRC_INC_DIR ${HIP_SRC_PATH}/include/hip)
set(HIP_SRC_BIN_DIR ${HIP_SRC_PATH}/bin)
# Name of the info file that gets a symlink in the wrapper lib directory.
set(HIP_INFO_FILE ".hipInfo")
# Generate a wrapper header that also embeds a verbatim copy of the original
# header inside an "#if 0" block, because some components grep the file for
# values instead of preprocessing it.
#
# input_file: path of the original header. The wrapper gets the same file name
#             and is written to HIP_WRAPPER_INC_DIR.
#
# The include_guard and include_statements values used by the template are
# taken from the caller's variables.
function(set_file_contents input_file)
  get_filename_component(wrapper_name ${input_file} NAME)
  file(READ ${input_file} file_contents)
  set(hashzero_check [=[
#if 0
/* The following is a copy of the original file for the benefit of build systems which grep for values
* in this file rather than preprocess it. This is just for backward compatibility */]=])
  set(hash_endif "#endif")
  configure_file(${HIP_SRC_PATH}/header_template.hpp.in ${HIP_WRAPPER_INC_DIR}/${wrapper_name})
endfunction()
# Generate the deprecated wrapper headers from the header template.
#
# One wrapper is written to HIP_WRAPPER_INC_DIR for every *.h file found in
# HIP_BUILD_DIR/include/hip, and one to the matching sub-directory for every
# file found in HIP_SRC_INC_DIR/amd_detail and HIP_SRC_INC_DIR/nvidia_detail.
# Each wrapper gets its own include guard and an #include of the real header,
# given as a path relative to the wrapper. hip_version.h additionally embeds a
# copy of the original file (see set_file_contents).
function(generate_wrapper_header)
  # create the sub-directories of the wrapper include directory
  file(MAKE_DIRECTORY ${HIP_WRAPPER_INC_DIR}/amd_detail)
  file(MAKE_DIRECTORY ${HIP_WRAPPER_INC_DIR}/nvidia_detail)

  # headers directly in include/hip
  file(GLOB include_files ${HIP_BUILD_DIR}/include/hip/*.h)
  foreach(header_file ${include_files})
    # include guard: upper-cased file name without extension
    get_filename_component(guard_stem "${header_file}" NAME_WE)
    string(TOUPPER "${guard_stem}" guard_stem)
    set(include_guard "HIP_WRAPPER_INCLUDE_HIP_${guard_stem}_H")
    # #include statement pointing at the real header
    get_filename_component(file_name "${header_file}" NAME)
    set(include_statements "#include \"../../../${CMAKE_INSTALL_INCLUDEDIR}/hip/${file_name}\"\n")
    # Quoted so the file name is compared as a string and never dereferenced
    # as a variable name.
    if("${file_name}" STREQUAL "hip_version.h")
      set_file_contents("${header_file}")
    else()
      configure_file(${HIP_SRC_PATH}/header_template.hpp.in ${HIP_WRAPPER_INC_DIR}/${file_name})
    endif()
  endforeach()

  # headers in include/hip/amd_detail and include/hip/nvidia_detail; both are
  # handled the same way and differ only in the directory name.
  foreach(detail_dir amd_detail nvidia_detail)
    string(TOUPPER "${detail_dir}" detail_tag)
    file(GLOB include_files ${HIP_SRC_INC_DIR}/${detail_dir}/*)
    foreach(header_file ${include_files})
      # include guard; quoting keeps TOUPPER valid even for a file whose name
      # has no stem.
      get_filename_component(guard_stem "${header_file}" NAME_WE)
      string(TOUPPER "${guard_stem}" guard_stem)
      set(include_guard "HIP_WRAPPER_INCLUDE_HIP_${detail_tag}_${guard_stem}_H")
      # #include statement; one directory level deeper than the headers above
      get_filename_component(file_name "${header_file}" NAME)
      set(include_statements "#include \"../../../../${CMAKE_INSTALL_INCLUDEDIR}/hip/${detail_dir}/${file_name}\"\n")
      configure_file(${HIP_SRC_PATH}/header_template.hpp.in ${HIP_WRAPPER_INC_DIR}/${detail_dir}/${file_name})
    endforeach()
  endforeach()
endfunction()
# Create, at build time, symlinks in HIP_WRAPPER_BIN_DIR that point at the
# binaries under CMAKE_INSTALL_BINDIR.
#
# A custom target named link_<file name> is added for:
#   - every file found in HIP_SRC_BIN_DIR, plus ".hipVersion";
#   - every file found in HIP_BUILD_DIR/bin, except that files ending in
#     ".bat" are skipped on non-Windows hosts.
function(create_binary_symlink)
  file(MAKE_DIRECTORY ${HIP_WRAPPER_BIN_DIR})
  # all binaries from the source tree
  file(GLOB binary_files ${HIP_SRC_BIN_DIR}/*)
  # .hipVersion gets a symlink as well
  list(APPEND binary_files ".hipVersion")

  foreach(binary_file ${binary_files})
    get_filename_component(file_name ${binary_file} NAME)
    add_custom_target(link_${file_name} ALL
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
      COMMAND ${CMAKE_COMMAND} -E create_symlink
      ../../${CMAKE_INSTALL_BINDIR}/${file_name} ${HIP_WRAPPER_BIN_DIR}/${file_name})
  endforeach()

  unset(binary_files)
  # all binaries from the build tree
  file(GLOB binary_files ${HIP_BUILD_DIR}/bin/*)
  foreach(binary_file ${binary_files})
    get_filename_component(file_name ${binary_file} NAME)
    # The dot is escaped so only a real ".bat" extension matches; an unescaped
    # "." would also skip names that merely end in "bat".
    if(WIN32 OR NOT "${file_name}" MATCHES "\\.bat$")
      add_custom_target(link_${file_name} ALL
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
        COMMAND ${CMAKE_COMMAND} -E create_symlink
        ../../${CMAKE_INSTALL_BINDIR}/${file_name} ${HIP_WRAPPER_BIN_DIR}/${file_name})
    endif()
  endforeach()
endfunction()
# create_library_symlink()
#
# Registers always-built custom targets (link_<name>) that create relative
# symlinks in ${HIP_WRAPPER_LIB_DIR} pointing at ../../${CMAKE_INSTALL_LIBDIR}.
# With BUILD_SHARED_LIBS the unversioned, major-versioned and fully-versioned
# names of libamdhip64, libhiprtc-builtins and libhiprtc are linked; otherwise
# only libamdhip64.a. ${HIP_INFO_FILE} is linked in both cases.
function(create_library_symlink)
  file(MAKE_DIRECTORY ${HIP_WRAPPER_LIB_DIR})

  if(NOT BUILD_SHARED_LIBS)
    set(library_files "libamdhip64.a")
  else()
    set(library_files "")
    foreach(so_base "libamdhip64.so" "libhiprtc-builtins.so" "libhiprtc.so")
      list(APPEND library_files
        "${so_base}"
        "${so_base}.${HIP_LIB_VERSION_MAJOR}"
        "${so_base}.${HIP_LIB_VERSION_STRING}")
    endforeach()
  endif()

  foreach(link_name ${library_files})
    add_custom_target(link_${link_name} ALL
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
      COMMAND ${CMAKE_COMMAND} -E create_symlink
      ../../${CMAKE_INSTALL_LIBDIR}/${link_name} ${HIP_WRAPPER_LIB_DIR}/${link_name})
  endforeach()

  # The .hipInfo file lives next to the libraries and gets its own link target.
  add_custom_target(link_${HIP_INFO_FILE} ALL
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
    COMMAND ${CMAKE_COMMAND} -E create_symlink
    ../../${CMAKE_INSTALL_LIBDIR}/${HIP_INFO_FILE} ${HIP_WRAPPER_LIB_DIR}/${HIP_INFO_FILE})
endfunction()
# create_cmake_symlink()
#
# Registers always-built custom targets (link_<name>) that symlink the CMake
# package files found in the build tree into the wrapper directories:
#   ${HIP_BUILD_DIR}/hip-config*            -> ${HIP_WRAPPER_CMAKE_DIR}/hip
#   ${HIP_BUILD_DIR}/src/hip-lang-config*   -> ${HIP_WRAPPER_CMAKE_DIR}/hip-lang
#   ${HIP_BUILD_DIR}/hiprtc-config*         -> ${HIP_WRAPPER_CMAKE_DIR}/hiprtc
#   ${HIP_BUILD_DIR}/cmake/FindHIP/*.cmake  -> ${HIP_WRAPPER_FINDHIP_DIR}/FindHIP
#   ${HIP_BUILD_DIR}/cmake/*.cmake          -> ${HIP_WRAPPER_FINDHIP_DIR}
# Every link is relative and resolves into ${CMAKE_INSTALL_LIBDIR}/cmake/...
function(create_cmake_symlink)
  # hip package configuration
  file(MAKE_DIRECTORY ${HIP_WRAPPER_CMAKE_DIR}/hip)
  file(GLOB hip_cfg_paths ${HIP_BUILD_DIR}/hip-config*)
  foreach(cfg_path ${hip_cfg_paths})
    get_filename_component(cfg_name ${cfg_path} NAME)
    add_custom_target(link_${cfg_name} ALL
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
      COMMAND ${CMAKE_COMMAND} -E create_symlink
      ../../../../${CMAKE_INSTALL_LIBDIR}/cmake/hip/${cfg_name} ${HIP_WRAPPER_CMAKE_DIR}/hip/${cfg_name})
  endforeach()

  # hip-lang package configuration
  file(MAKE_DIRECTORY ${HIP_WRAPPER_CMAKE_DIR}/hip-lang)
  file(GLOB hip_lang_cfg_paths ${HIP_BUILD_DIR}/src/hip-lang-config*)
  foreach(cfg_path ${hip_lang_cfg_paths})
    get_filename_component(cfg_name ${cfg_path} NAME)
    add_custom_target(link_${cfg_name} ALL
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
      COMMAND ${CMAKE_COMMAND} -E create_symlink
      ../../../../${CMAKE_INSTALL_LIBDIR}/cmake/hip-lang/${cfg_name} ${HIP_WRAPPER_CMAKE_DIR}/hip-lang/${cfg_name})
  endforeach()

  # hiprtc package configuration
  file(MAKE_DIRECTORY ${HIP_WRAPPER_CMAKE_DIR}/hiprtc)
  file(GLOB hiprtc_cfg_paths ${HIP_BUILD_DIR}/hiprtc-config*)
  foreach(cfg_path ${hiprtc_cfg_paths})
    get_filename_component(cfg_name ${cfg_path} NAME)
    add_custom_target(link_${cfg_name} ALL
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
      COMMAND ${CMAKE_COMMAND} -E create_symlink
      ../../../../${CMAKE_INSTALL_LIBDIR}/cmake/hiprtc/${cfg_name} ${HIP_WRAPPER_CMAKE_DIR}/hiprtc/${cfg_name})
  endforeach()

  # FindHIP helper scripts
  file(MAKE_DIRECTORY ${HIP_WRAPPER_FINDHIP_DIR}/FindHIP)
  file(GLOB findhip_helper_paths ${HIP_BUILD_DIR}/cmake/FindHIP/*.cmake)
  foreach(module_path ${findhip_helper_paths})
    get_filename_component(module_name ${module_path} NAME)
    add_custom_target(link_${module_name} ALL
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
      COMMAND ${CMAKE_COMMAND} -E create_symlink
      ../../../${CMAKE_INSTALL_LIBDIR}/cmake/hip/FindHIP/${module_name} ${HIP_WRAPPER_FINDHIP_DIR}/FindHIP/${module_name})
  endforeach()

  # Top-level *.cmake modules from the build tree's cmake directory
  file(GLOB findhip_module_paths ${HIP_BUILD_DIR}/cmake/*.cmake)
  foreach(module_path ${findhip_module_paths})
    get_filename_component(module_name ${module_path} NAME)
    add_custom_target(link_${module_name} ALL
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
      COMMAND ${CMAKE_COMMAND} -E create_symlink
      ../../${CMAKE_INSTALL_LIBDIR}/cmake/hip/${module_name} ${HIP_WRAPPER_FINDHIP_DIR}/${module_name})
  endforeach()
endfunction()
# Generate the wrapper headers from the template and install them.
generate_wrapper_header()
install(DIRECTORY ${HIP_WRAPPER_INC_DIR} DESTINATION hip/include COMPONENT dev)

# Symlinks to the binaries.
create_binary_symlink()
install(DIRECTORY ${HIP_WRAPPER_BIN_DIR} DESTINATION hip COMPONENT dev)

option(BUILD_SHARED_LIBS "Build the shared library" ON)

# Symlinks to the library files; the links themselves are installed only for
# the amd platform.
create_library_symlink()
if(HIP_PLATFORM STREQUAL "amd")
  if(NOT BUILD_SHARED_LIBS)
    install(FILES ${HIP_WRAPPER_LIB_DIR}/libamdhip64.a DESTINATION hip/lib COMPONENT binary)
  else()
    install(FILES
      ${HIP_WRAPPER_LIB_DIR}/libamdhip64.so
      ${HIP_WRAPPER_LIB_DIR}/libamdhip64.so.${HIP_LIB_VERSION_MAJOR}
      ${HIP_WRAPPER_LIB_DIR}/libamdhip64.so.${HIP_LIB_VERSION_STRING}
      ${HIP_WRAPPER_LIB_DIR}/libhiprtc-builtins.so
      ${HIP_WRAPPER_LIB_DIR}/libhiprtc-builtins.so.${HIP_LIB_VERSION_MAJOR}
      ${HIP_WRAPPER_LIB_DIR}/libhiprtc-builtins.so.${HIP_LIB_VERSION_STRING}
      ${HIP_WRAPPER_LIB_DIR}/libhiprtc.so
      ${HIP_WRAPPER_LIB_DIR}/libhiprtc.so.${HIP_LIB_VERSION_MAJOR}
      ${HIP_WRAPPER_LIB_DIR}/libhiprtc.so.${HIP_LIB_VERSION_STRING}
      DESTINATION hip/lib COMPONENT binary)
  endif() # BUILD_SHARED_LIBS
endif() # HIP_PLATFORM amd

# The hipInfo link is installed regardless of platform.
install(FILES ${HIP_WRAPPER_LIB_DIR}/${HIP_INFO_FILE} DESTINATION hip/lib COMPONENT binary)

# Symlinks to the CMake package files.
create_cmake_symlink()
install(DIRECTORY ${HIP_WRAPPER_CMAKE_DIR} DESTINATION hip/lib COMPONENT binary)
install(DIRECTORY ${HIP_WRAPPER_FINDHIP_DIR}/ DESTINATION hip/cmake COMPONENT dev)
+266
查看文件
@@ -0,0 +1,266 @@
# Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
cmake_minimum_required(VERSION 3.3)
@PACKAGE_INIT@
include(CheckCXXCompilerFlag)
# Prefer the find_dependency() shipped with CMake. OPTIONAL keeps a missing module
# from being an error; RESULT_VARIABLE records whether the include succeeded.
include(CMakeFindDependencyMacro OPTIONAL RESULT_VARIABLE _CMakeFindDependencyMacro_FOUND)
if (NOT _CMakeFindDependencyMacro_FOUND)
# Fallback find_dependency(<dep> [<version>]), defined only when the module above
# could not be included.
#
# Calls find_package(<dep>) unless <dep>_FOUND is already true, forwarding the
# optional version plus the EXACT / QUIET / REQUIRED flags of the package that is
# currently being found (${CMAKE_FIND_PACKAGE_NAME}).
#
# If the dependency is not found, sets <pkg>_NOT_FOUND_MESSAGE and <pkg>_FOUND
# and calls return(). Because this is a macro, that return() leaves the file
# that invoked find_dependency(), not just the macro.
macro(find_dependency dep)
if (NOT ${dep}_FOUND)
# Optional second argument: requested version of the dependency.
set(cmake_fd_version)
if (${ARGC} GREATER 1)
set(cmake_fd_version ${ARGV1})
endif()
# Mirror the find flags of the enclosing find_package() call.
set(cmake_fd_exact_arg)
if(${CMAKE_FIND_PACKAGE_NAME}_FIND_VERSION_EXACT)
set(cmake_fd_exact_arg EXACT)
endif()
set(cmake_fd_quiet_arg)
if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY)
set(cmake_fd_quiet_arg QUIET)
endif()
set(cmake_fd_required_arg)
if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED)
set(cmake_fd_required_arg REQUIRED)
endif()
find_package(${dep} ${cmake_fd_version}
${cmake_fd_exact_arg}
${cmake_fd_quiet_arg}
${cmake_fd_required_arg}
)
# Accept either <dep>_FOUND or the upper-case <DEP>_FOUND as success.
string(TOUPPER ${dep} cmake_dep_upper)
if (NOT ${dep}_FOUND AND NOT ${cmake_dep_upper}_FOUND)
set(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE "${CMAKE_FIND_PACKAGE_NAME} could not be found because dependency ${dep} could not be found.")
set(${CMAKE_FIND_PACKAGE_NAME}_FOUND False)
return()
endif()
# A macro has no scope of its own, so clear the temporaries explicitly.
set(cmake_fd_version)
set(cmake_fd_required_arg)
set(cmake_fd_quiet_arg)
set(cmake_fd_exact_arg)
endif()
endmacro()
endif()
# Prefix placed in front of interface compile options: "SHELL:" from CMake 3.12
# onwards, empty on older versions.
if(CMAKE_VERSION VERSION_LESS 3.12)
  set(_HIP_SHELL "")
else()
  set(_HIP_SHELL "SHELL:")
endif()
# hip_add_interface_compile_flags(<target> <flag>...)
#
# Appends the given flags to <target>'s INTERFACE_COMPILE_OPTIONS, wrapped in a
# generator expression so they only apply when compiling CXX sources. The flags
# are prefixed with ${_HIP_SHELL} as it is set at call time.
function(hip_add_interface_compile_flags TARGET)
  set(cxx_only_options "$<$<COMPILE_LANGUAGE:CXX>:${_HIP_SHELL}${ARGN}>")
  set_property(TARGET ${TARGET} APPEND PROPERTY
    INTERFACE_COMPILE_OPTIONS "${cxx_only_options}")
endfunction()
# hip_add_interface_link_flags(<target> <flag>...)
#
# Appends the given flags to <target>'s INTERFACE_LINK_LIBRARIES. From CMake
# 3.20 onwards they are wrapped in $<LINK_LANGUAGE:CXX> so they only apply to
# CXX links; older versions receive them unconditionally.
function(hip_add_interface_link_flags TARGET)
  if(CMAKE_VERSION VERSION_LESS 3.20)
    set(link_items "${ARGN}")
  else()
    set(link_items "$<$<LINK_LANGUAGE:CXX>:${ARGN}>")
  endif()
  set_property(TARGET ${TARGET} APPEND PROPERTY
    INTERFACE_LINK_LIBRARIES "${link_items}")
endfunction()
# Default to a single parallel job unless the consumer already chose a value.
if(NOT DEFINED HIP_CLANG_NUM_PARALLEL_JOBS)
set(HIP_CLANG_NUM_PARALLEL_JOBS 1)
endif()
# Both values are substituted when this template is configured.
set(HIP_COMPILER "@HIP_COMPILER@")
set(HIP_RUNTIME "@HIP_RUNTIME@")
# NOTE: If hip-config is invoked from /opt/rocm-ver/hip/lib/cmake/hip/
# then PACKAGE_PREFIX_DIR resolves to /opt/rocm-ver/hip, which exists for backward compatibility.
# The following ensures PACKAGE_PREFIX_DIR resolves to /opt/rocm-ver:
# First find the real path to the hip-config file with symlinks resolved
# Real Path : /opt/rocm-ver/lib/cmake/hip/hip-config.cmake
# Then go up 4 levels to get PACKAGE_PREFIX_DIR
# PACKAGE_PREFIX_DIR : /opt/rocm-ver
# TODO: once file reorg backward compatibility is turned off this can be removed.
if(IS_SYMLINK ${CMAKE_CURRENT_LIST_FILE})
get_filename_component(CONFIG_FILE_PATH "${CMAKE_CURRENT_LIST_FILE}" REALPATH)
get_filename_component(PACKAGE_PREFIX_DIR "${CONFIG_FILE_PATH}/../../../../" ABSOLUTE)
endif()
# end of TODO
# Keep a copy of the prefix under a HIP-specific name; it is reused later in this file.
set(HIP_PACKAGE_PREFIX_DIR ${PACKAGE_PREFIX_DIR})
# Install locations, substituted at configure time and validated by set_and_check().
set_and_check( hip_INCLUDE_DIR "@PACKAGE_INCLUDE_INSTALL_DIR@" )
set_and_check( hip_INCLUDE_DIRS "${hip_INCLUDE_DIR}" )
set_and_check( hip_LIB_INSTALL_DIR "@PACKAGE_LIB_INSTALL_DIR@" )
set_and_check( hip_BIN_INSTALL_DIR "@PACKAGE_BIN_INSTALL_DIR@" )
# The hipcc/hipconfig launchers are the .bat scripts on Windows.
if(WIN32)
set_and_check(hip_HIPCC_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipcc.bat")
set_and_check(hip_HIPCONFIG_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipconfig.bat")
else()
set_and_check(hip_HIPCC_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipcc")
set_and_check(hip_HIPCONFIG_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipconfig")
endif()
# Platform-specific root discovery.
#   Windows: HIP_PATH comes from the HIP_PATH environment variable, else from
#            HIP_DIR, else from the prefix of the package that was found.
#   Linux:   ROCM_PATH comes from the ROCM_PATH environment variable, else it
#            keeps an existing value, else it defaults to the package prefix.
if(WIN32)
  if(DEFINED ENV{HIP_PATH})
    file(TO_CMAKE_PATH "$ENV{HIP_PATH}" HIP_PATH)
  elseif(DEFINED ENV{HIP_DIR})
    file(TO_CMAKE_PATH "$ENV{HIP_DIR}" HIP_DIR)
    # The clang root lookup later in this file reads HIP_PATH, not HIP_DIR.
    # Without this fallback HIP_PATH stays undefined on this branch and the
    # lookup degrades to "/bin/clang.exe" and "/../lc". An explicitly provided
    # HIP_PATH is left untouched.
    if(NOT DEFINED HIP_PATH)
      set(HIP_PATH "${HIP_DIR}")
    endif()
  else()
    # using the HIP found
    set(HIP_PATH ${PACKAGE_PREFIX_DIR})
  endif()
else()
  # Linux
  # If HIP is not installed under ROCm, need this to find HSA assuming HSA is under ROCm
  if(DEFINED ENV{ROCM_PATH})
    set(ROCM_PATH "$ENV{ROCM_PATH}")
  endif()
  # set a default path for ROCM_PATH
  if(NOT DEFINED ROCM_PATH)
    set(ROCM_PATH ${PACKAGE_PREFIX_DIR})
  endif()
endif()
# clang-specific setup: locate the compiler root, pick the compiler used for
# later queries, pull in the device libraries and declare the GPU target lists.
if(HIP_COMPILER STREQUAL "clang")
if(WIN32)
# Using SDK folder
file(TO_CMAKE_PATH "${HIP_PATH}" HIP_CLANG_ROOT)
if (NOT EXISTS "${HIP_CLANG_ROOT}/bin/clang.exe")
# if using install folder
file(TO_CMAKE_PATH "${HIP_PATH}/../lc" HIP_CLANG_ROOT)
endif()
else()
set(HIP_CLANG_ROOT "${ROCM_PATH}/llvm")
endif()
# Fall back to the project's C++ compiler unless HIP_CXX_COMPILER was provided.
if(NOT HIP_CXX_COMPILER)
set(HIP_CXX_COMPILER ${CMAKE_CXX_COMPILER})
endif()
if(NOT WIN32)
find_dependency(AMDDeviceLibs)
endif()
# Cache entries: values already present in the cache are not overwritten.
# GPU_TARGETS defaults to whatever AMDGPU_TARGETS evaluates to here.
set(AMDGPU_TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx1030" CACHE STRING "AMD GPU targets to compile for")
set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU targets to compile for")
endif() # HIP_COMPILER check
# amd_comgr is looked up before the exported targets are loaded (non-Windows only).
if(NOT WIN32)
find_dependency(amd_comgr)
endif()
# Load the exported targets file that sits next to this config file.
include( "${CMAKE_CURRENT_LIST_DIR}/hip-targets.cmake" )
#Using find_dependency to locate the dependency for the packages
#This makes the cmake generated file xxxx-targets to supply the linker libraries
# without worrying other transitive dependencies
if(NOT WIN32)
find_dependency(hsa-runtime64)
find_dependency(Threads)
endif()
# Use the prefix saved earlier in this file as the import prefix for the
# include directories set below.
set(_IMPORT_PREFIX ${HIP_PACKAGE_PREFIX_DIR})
# Right now this is only supported for amd platforms
set_target_properties(hip::host PROPERTIES
INTERFACE_COMPILE_DEFINITIONS "__HIP_PLATFORM_HCC__=1;__HIP_PLATFORM_AMD__=1"
)
if(HIP_RUNTIME MATCHES "rocclr")
# Point the runtime target at <prefix>/include, also registering it as a system
# include directory.
set_target_properties(hip::amdhip64 PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
)
# Report the target type of hip::amdhip64.
get_target_property(amdhip64_type hip::amdhip64 TYPE)
message(STATUS "hip::amdhip64 is ${amdhip64_type}")
if(NOT WIN32)
# Same include directories for hip::device (non-Windows only).
set_target_properties(hip::device PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
)
endif()
endif()
# Device compile/link options for hip::device and the compiler-rt builtins
# library for hip::host / hip::device, applied when the HIP compiler is clang.
if(HIP_COMPILER STREQUAL "clang")
  # Snapshot taken once, before anything below is appended. When the imported
  # target already carries INTERFACE_COMPILE_OPTIONS, no compile options are
  # added here; link options are still added.
  get_property(compilePropIsSet TARGET hip::device PROPERTY INTERFACE_COMPILE_OPTIONS SET)

  if(NOT compilePropIsSet AND HIP_CXX_COMPILER MATCHES ".*clang\\+\\+")
    hip_add_interface_compile_flags(hip::device -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false)
  endif()

  if(NOT compilePropIsSet)
    hip_add_interface_compile_flags(hip::device -x hip)
  endif()

  hip_add_interface_link_flags(hip::device --hip-link)

  # One --offload-arch per requested GPU target, for both compile and link.
  foreach(GPU_TARGET ${GPU_TARGETS})
    if(NOT compilePropIsSet)
      hip_add_interface_compile_flags(hip::device --offload-arch=${GPU_TARGET})
    endif()
    hip_add_interface_link_flags(hip::device --offload-arch=${GPU_TARGET})
  endforeach()

  # Add support for parallel build and link.
  # The compiler id is quoted: it is empty when the consuming project has not
  # enabled CXX, and an unquoted empty expansion leaves if() with a dangling
  # STREQUAL, which is a hard configure error.
  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
    check_cxx_compiler_flag("-parallel-jobs=1" HIP_CLANG_SUPPORTS_PARALLEL_JOBS)
  endif()

  if(HIP_CLANG_NUM_PARALLEL_JOBS GREATER 1)
    # Tested by name so an undefined result (compiler is not Clang) is simply false.
    if(HIP_CLANG_SUPPORTS_PARALLEL_JOBS)
      if(NOT compilePropIsSet)
        hip_add_interface_compile_flags(hip::device -parallel-jobs=${HIP_CLANG_NUM_PARALLEL_JOBS} -Wno-format-nonliteral)
      endif()
      hip_add_interface_link_flags(hip::device -parallel-jobs=${HIP_CLANG_NUM_PARALLEL_JOBS})
    else()
      message("clang compiler doesn't support parallel jobs")
    endif()
  endif()

  # Use HIP_CXX option -print-libgcc-file-name --rtlib=compiler-rt
  # To fetch the compiler rt library file name.
  execute_process(
    COMMAND ${HIP_CXX_COMPILER} -print-libgcc-file-name --rtlib=compiler-rt
    OUTPUT_VARIABLE CLANGRT_BUILTINS
    OUTPUT_STRIP_TRAILING_WHITESPACE
    RESULT_VARIABLE CLANGRT_BUILTINS_FETCH_EXIT_CODE)

  # Add support for __fp16 and _Float16, explicitly link with compiler-rt
  if("${CLANGRT_BUILTINS_FETCH_EXIT_CODE}" STREQUAL "0")
    # CLANG_RT Builtins found Successfully Set interface link libraries property
    set_property(TARGET hip::host APPEND PROPERTY INTERFACE_LINK_LIBRARIES "${CLANGRT_BUILTINS}")
    set_property(TARGET hip::device APPEND PROPERTY INTERFACE_LINK_LIBRARIES "${CLANGRT_BUILTINS}")
  else()
    message(STATUS "clangrt builtins lib not found: ${CLANGRT_BUILTINS_FETCH_EXIT_CODE}")
  endif() # CLANGRT_BUILTINS_FETCH_EXIT_CODE Check
endif() # HIP_COMPILER Check
# Result variables, published under both the lower-case hip_* and the
# upper-case HIP_* spelling.

# Libraries: the host and device interface targets.
set(hip_LIBRARIES hip::host hip::device)
set(HIP_LIBRARIES ${hip_LIBRARIES})
set(hip_LIBRARY ${hip_LIBRARIES})
set(HIP_LIBRARY ${hip_LIBRARY})

# Directories.
set(HIP_INCLUDE_DIR ${hip_INCLUDE_DIR})
set(HIP_INCLUDE_DIRS ${hip_INCLUDE_DIRS})
set(HIP_LIB_INSTALL_DIR ${hip_LIB_INSTALL_DIR})
set(HIP_BIN_INSTALL_DIR ${hip_BIN_INSTALL_DIR})

# Tools.
set(HIP_HIPCC_EXECUTABLE ${hip_HIPCC_EXECUTABLE})
set(HIP_HIPCONFIG_EXECUTABLE ${hip_HIPCONFIG_EXECUTABLE})
@@ -0,0 +1,348 @@
/*
Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H
#include <hip/hip_common.h>
#include <hip/driver_types.h>
#include <hip/amd_detail/amd_hip_vector_types.h>
#ifdef __cplusplus
// Runtime entry point that builds a channel descriptor from four component
// arguments (x, y, z, w) and a format kind.
extern "C" HIP_PUBLIC_API
hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f);

// Descriptor for a single float-kind component as wide as unsigned short.
static inline hipChannelFormatDesc hipCreateChannelDescHalf() {
    const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindFloat);
}

// Same descriptor as hipCreateChannelDescHalf().
static inline hipChannelFormatDesc hipCreateChannelDescHalf1() {
    const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindFloat);
}
// Descriptor for two float-kind components, each as wide as unsigned short.
// Both x and y carry the width, matching every other two-component descriptor
// in this header (uchar2, short2, float2, ...). Passing the width for x only
// would describe the same single-component format as hipCreateChannelDescHalf1().
static inline hipChannelFormatDesc hipCreateChannelDescHalf2() {
    int e = (int)sizeof(unsigned short) * 8;
    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);
}
// Primary template: any type without a specialization below gets an all-zero
// descriptor of kind hipChannelFormatKindNone.
template <typename T>
static inline hipChannelFormatDesc hipCreateChannelDesc() {
    return hipCreateChannelDesc(0, 0, 0, 0, hipChannelFormatKindNone);
}

// ---- char-sized element types --------------------------------------------
// Each specialization passes the element type's bit width for every populated
// component and 0 for the rest.

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<char>() {
    const int bits = static_cast<int>(sizeof(char)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindSigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<signed char>() {
    const int bits = static_cast<int>(sizeof(signed char)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindSigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<unsigned char>() {
    const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<uchar1>() {
    const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<char1>() {
    const int bits = static_cast<int>(sizeof(signed char)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindSigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<uchar2>() {
    const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
    return hipCreateChannelDesc(bits, bits, 0, 0, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<char2>() {
    const int bits = static_cast<int>(sizeof(signed char)) * 8;
    return hipCreateChannelDesc(bits, bits, 0, 0, hipChannelFormatKindSigned);
}

#ifndef __GNUC__  // vector3 is the same as vector4
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<uchar3>() {
    const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
    return hipCreateChannelDesc(bits, bits, bits, 0, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<char3>() {
    const int bits = static_cast<int>(sizeof(signed char)) * 8;
    return hipCreateChannelDesc(bits, bits, bits, 0, hipChannelFormatKindSigned);
}
#endif

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<uchar4>() {
    const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
    return hipCreateChannelDesc(bits, bits, bits, bits, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<char4>() {
    const int bits = static_cast<int>(sizeof(signed char)) * 8;
    return hipCreateChannelDesc(bits, bits, bits, bits, hipChannelFormatKindSigned);
}
// ---- short-sized element types -------------------------------------------
// Each specialization passes the element type's bit width for every populated
// component and 0 for the rest.

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<unsigned short>() {
    const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<signed short>() {
    const int bits = static_cast<int>(sizeof(signed short)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindSigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<ushort1>() {
    const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<short1>() {
    const int bits = static_cast<int>(sizeof(signed short)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindSigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<ushort2>() {
    const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
    return hipCreateChannelDesc(bits, bits, 0, 0, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<short2>() {
    const int bits = static_cast<int>(sizeof(signed short)) * 8;
    return hipCreateChannelDesc(bits, bits, 0, 0, hipChannelFormatKindSigned);
}

#ifndef __GNUC__
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<ushort3>() {
    const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
    return hipCreateChannelDesc(bits, bits, bits, 0, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<short3>() {
    const int bits = static_cast<int>(sizeof(signed short)) * 8;
    return hipCreateChannelDesc(bits, bits, bits, 0, hipChannelFormatKindSigned);
}
#endif

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<ushort4>() {
    const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
    return hipCreateChannelDesc(bits, bits, bits, bits, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<short4>() {
    const int bits = static_cast<int>(sizeof(signed short)) * 8;
    return hipCreateChannelDesc(bits, bits, bits, bits, hipChannelFormatKindSigned);
}
// ---- int-sized element types ---------------------------------------------
// Each specialization passes the element type's bit width for every populated
// component and 0 for the rest.

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<unsigned int>() {
    const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<signed int>() {
    const int bits = static_cast<int>(sizeof(signed int)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindSigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<uint1>() {
    const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<int1>() {
    const int bits = static_cast<int>(sizeof(signed int)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindSigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<uint2>() {
    const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
    return hipCreateChannelDesc(bits, bits, 0, 0, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<int2>() {
    const int bits = static_cast<int>(sizeof(signed int)) * 8;
    return hipCreateChannelDesc(bits, bits, 0, 0, hipChannelFormatKindSigned);
}

#ifndef __GNUC__
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<uint3>() {
    const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
    return hipCreateChannelDesc(bits, bits, bits, 0, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<int3>() {
    const int bits = static_cast<int>(sizeof(signed int)) * 8;
    return hipCreateChannelDesc(bits, bits, bits, 0, hipChannelFormatKindSigned);
}
#endif

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<uint4>() {
    const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
    return hipCreateChannelDesc(bits, bits, bits, bits, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<int4>() {
    const int bits = static_cast<int>(sizeof(signed int)) * 8;
    return hipCreateChannelDesc(bits, bits, bits, bits, hipChannelFormatKindSigned);
}
// ---- float element types -------------------------------------------------
// Each specialization passes the bit width of float for every populated
// component and 0 for the rest, with kind hipChannelFormatKindFloat.

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<float>() {
    const int bits = static_cast<int>(sizeof(float)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindFloat);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<float1>() {
    const int bits = static_cast<int>(sizeof(float)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindFloat);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<float2>() {
    const int bits = static_cast<int>(sizeof(float)) * 8;
    return hipCreateChannelDesc(bits, bits, 0, 0, hipChannelFormatKindFloat);
}

#ifndef __GNUC__
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<float3>() {
    const int bits = static_cast<int>(sizeof(float)) * 8;
    return hipCreateChannelDesc(bits, bits, bits, 0, hipChannelFormatKindFloat);
}
#endif

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<float4>() {
    const int bits = static_cast<int>(sizeof(float)) * 8;
    return hipCreateChannelDesc(bits, bits, bits, bits, hipChannelFormatKindFloat);
}
// ---- long-sized element types --------------------------------------------
// Each specialization passes the element type's bit width for every populated
// component and 0 for the rest.

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<unsigned long>() {
    const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<signed long>() {
    const int bits = static_cast<int>(sizeof(signed long)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindSigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<ulong1>() {
    const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<long1>() {
    const int bits = static_cast<int>(sizeof(signed long)) * 8;
    return hipCreateChannelDesc(bits, 0, 0, 0, hipChannelFormatKindSigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<ulong2>() {
    const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
    return hipCreateChannelDesc(bits, bits, 0, 0, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<long2>() {
    const int bits = static_cast<int>(sizeof(signed long)) * 8;
    return hipCreateChannelDesc(bits, bits, 0, 0, hipChannelFormatKindSigned);
}

#ifndef __GNUC__
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<ulong3>() {
    const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
    return hipCreateChannelDesc(bits, bits, bits, 0, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<long3>() {
    const int bits = static_cast<int>(sizeof(signed long)) * 8;
    return hipCreateChannelDesc(bits, bits, bits, 0, hipChannelFormatKindSigned);
}
#endif

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<ulong4>() {
    const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
    return hipCreateChannelDesc(bits, bits, bits, bits, hipChannelFormatKindUnsigned);
}

template <>
inline hipChannelFormatDesc hipCreateChannelDesc<long4>() {
    const int bits = static_cast<int>(sizeof(signed long)) * 8;
    return hipCreateChannelDesc(bits, bits, bits, bits, hipChannelFormatKindSigned);
}
#else
/* C declaration of the descriptor factory, used when this header is not
 * compiled as C++: same five parameters as the C++ declaration, spelled with
 * explicit struct/enum tags. */
struct hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
enum hipChannelFormatKind f);
#endif
#endif
文件差异内容过多而无法显示 加载差异
文件差异内容过多而无法显示 加载差异
@@ -0,0 +1,293 @@
/**
* MIT License
*
* Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*!\file
* \brief hip_bfloat16.h provides struct for hip_bfloat16 typedef
*/
#ifndef _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BFLOAT16_H_
#define _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BFLOAT16_H_
#include "host_defines.h"
#if defined(__HIPCC_RTC__)
#define __HOST_DEVICE__ __device__
#else
#define __HOST_DEVICE__ __host__ __device__
#endif
#if __cplusplus < 201103L || !defined(__HIPCC__)
// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only
// include a minimal definition of hip_bfloat16
#include <stdint.h>
/*! \brief Struct to represent a 16 bit brain floating point number.
 *
 * Minimal definition used for C compilers, C++ compilers below C++11 and
 * host-only compilers: it only carries the raw 16-bit storage and provides no
 * conversions or operators.
 */
typedef struct
{
uint16_t data;
} hip_bfloat16;
#else // __cplusplus < 201103L || !defined(__HIPCC__)
#include <hip/hip_runtime.h>
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wshadow"
// Full C++ definition of the 16-bit brain floating point type. The value is
// stored in `data` as the upper 16 bits of the corresponding IEEE float bit
// pattern (see operator float()). Conversions from float either round to
// nearest-even or truncate.
struct hip_bfloat16
{
// Raw 16-bit storage.
__hip_uint16_t data;
// Tag type: passing `truncate` selects the truncating conversion overloads.
enum truncate_t
{
truncate
};
__HOST_DEVICE__ hip_bfloat16() = default;
// round upper 16 bits of IEEE float to convert to bfloat16
explicit __HOST_DEVICE__ hip_bfloat16(float f)
: data(float_to_bfloat16(f))
{
}
// truncate (no rounding) the IEEE float to convert to bfloat16
explicit __HOST_DEVICE__ hip_bfloat16(float f, truncate_t)
: data(truncate_float_to_bfloat16(f))
{
}
// zero extend lower 16 bits of bfloat16 to convert to IEEE float
__HOST_DEVICE__ operator float() const
{
union
{
uint32_t int32;
float fp32;
} u = {uint32_t(data) << 16};
return u.fp32;
}
// Assignment from float uses the rounding conversion.
__HOST_DEVICE__ hip_bfloat16 &operator=(const float& f)
{
data = float_to_bfloat16(f);
return *this;
}
// Named factory equivalent to the rounding constructor.
static __HOST_DEVICE__ hip_bfloat16 round_to_bfloat16(float f)
{
hip_bfloat16 output;
output.data = float_to_bfloat16(f);
return output;
}
// Named factory equivalent to the truncating constructor.
static __HOST_DEVICE__ hip_bfloat16 round_to_bfloat16(float f, truncate_t)
{
hip_bfloat16 output;
output.data = truncate_float_to_bfloat16(f);
return output;
}
private:
// Converts a float to bfloat16 bits, rounding to nearest-even; Inf is passed
// through and NaN stays NaN.
static __HOST_DEVICE__ __hip_uint16_t float_to_bfloat16(float f)
{
union
{
float fp32;
uint32_t int32;
} u = {f};
if(~u.int32 & 0x7f800000)
{
// When the exponent bits are not all 1s, then the value is zero, normal,
// or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
// 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
// This causes the bfloat16's mantissa to be incremented by 1 if the 16
// least significant bits of the float mantissa are greater than 0x8000,
// or if they are equal to 0x8000 and the least significant bit of the
// bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
// the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
// has the value 0x7f, then incrementing it causes it to become 0x00 and
// the exponent is incremented by one, which is the next higher FP value
// to the unrounded bfloat16 value. When the bfloat16 value is subnormal
// with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
// to a normal value with an exponent of 0x01 and a mantissa of 0x00.
// When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
// incrementing it causes it to become an exponent of 0xFF and a mantissa
// of 0x00, which is Inf, the next higher value to the unrounded value.
u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even
}
else if(u.int32 & 0xffff)
{
// When all of the exponent bits are 1, the value is Inf or NaN.
// Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
// mantissa bit. Quiet NaN is indicated by the most significant mantissa
// bit being 1. Signaling NaN is indicated by the most significant
// mantissa bit being 0 but some other bit(s) being 1. If any of the
// lower 16 bits of the mantissa are 1, we set the least significant bit
// of the bfloat16 mantissa, in order to preserve signaling NaN in case
// the bfloat16's mantissa bits are all 0.
u.int32 |= 0x10000; // Preserve signaling NaN
}
return __hip_uint16_t(u.int32 >> 16);
}
// Truncate instead of rounding, preserving SNaN
static __HOST_DEVICE__ __hip_uint16_t truncate_float_to_bfloat16(float f)
{
union
{
float fp32;
uint32_t int32;
} u = {f};
// Keep the upper 16 bits. The OR-ed term is 1 only when the exponent bits are
// all 1s and some of the dropped lower 16 bits are set, so a NaN whose payload
// lives only in the dropped bits does not turn into Inf.
return __hip_uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
}
};
#pragma clang diagnostic pop
// Plain struct with the same single member as hip_bfloat16; it is the
// reference layout for the size/offset check that follows.
typedef struct
{
__hip_uint16_t data;
} hip_bfloat16_public;
// The C++ type must stay standard-layout and trivial so it remains
// interchangeable with the plain C definition of hip_bfloat16.
static_assert(__hip_internal::is_standard_layout<hip_bfloat16>{},
"hip_bfloat16 is not a standard layout type, and thus is "
"incompatible with C.");
static_assert(__hip_internal::is_trivial<hip_bfloat16>{},
"hip_bfloat16 is not a trivial type, and thus is "
"incompatible with C.");
#if !defined(__HIPCC_RTC__)
// Size and member offset must match the plain public struct.
static_assert(sizeof(hip_bfloat16) == sizeof(hip_bfloat16_public)
                  && offsetof(hip_bfloat16, data) == offsetof(hip_bfloat16_public, data),
              "internal hip_bfloat16 does not match public hip_bfloat16");

// Streams a hip_bfloat16 by writing its float conversion.
inline std::ostream& operator<<(std::ostream& os, const hip_bfloat16& bf16)
{
    os << static_cast<float>(bf16);
    return os;
}
#endif
// Unary plus: yields the operand unchanged.
inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a)
{
    return a;
}
// Unary minus: flips the top (sign) bit of the 16-bit representation.
inline __HOST_DEVICE__ hip_bfloat16 operator-(hip_bfloat16 a)
{
    hip_bfloat16 negated = a;
    negated.data ^= 0x8000;
    return negated;
}
// Binary arithmetic: each operator converts both operands to float, applies
// the float operation, and converts the result back to hip_bfloat16.
inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a, hip_bfloat16 b)
{
    const float sum = float(a) + float(b);
    return hip_bfloat16(sum);
}
inline __HOST_DEVICE__ hip_bfloat16 operator-(hip_bfloat16 a, hip_bfloat16 b)
{
    const float difference = float(a) - float(b);
    return hip_bfloat16(difference);
}
inline __HOST_DEVICE__ hip_bfloat16 operator*(hip_bfloat16 a, hip_bfloat16 b)
{
    const float product = float(a) * float(b);
    return hip_bfloat16(product);
}
inline __HOST_DEVICE__ hip_bfloat16 operator/(hip_bfloat16 a, hip_bfloat16 b)
{
    const float quotient = float(a) / float(b);
    return hip_bfloat16(quotient);
}
// Ordering and equality, all evaluated on the float values of the operands.
inline __HOST_DEVICE__ bool operator<(hip_bfloat16 a, hip_bfloat16 b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return lhs < rhs;
}
inline __HOST_DEVICE__ bool operator==(hip_bfloat16 a, hip_bfloat16 b)
{
    const float lhs = float(a);
    const float rhs = float(b);
    return lhs == rhs;
}
// a > b is evaluated as b < a.
inline __HOST_DEVICE__ bool operator>(hip_bfloat16 a, hip_bfloat16 b)
{
    return float(b) < float(a);
}
// Less-than-or-equal, evaluated on the float values of the operands.
// Compared directly with `<=` rather than as `!(a > b)`: the negated form
// yields true when either operand is NaN, whereas an ordered comparison
// involving NaN must be false.
inline __HOST_DEVICE__ bool operator<=(hip_bfloat16 a, hip_bfloat16 b)
{
    return float(a) <= float(b);
}
// Inequality, evaluated on the float values of the operands.
inline __HOST_DEVICE__ bool operator!=(hip_bfloat16 a, hip_bfloat16 b)
{
    return float(a) != float(b);
}
// Greater-than-or-equal, evaluated on the float values of the operands.
// Compared directly with `>=` rather than as `!(a < b)`: the negated form
// yields true when either operand is NaN, whereas an ordered comparison
// involving NaN must be false.
inline __HOST_DEVICE__ bool operator>=(hip_bfloat16 a, hip_bfloat16 b)
{
    return float(a) >= float(b);
}
// Compound assignments: apply the matching binary operator, store the result
// in `a`, and return a reference to `a`.
inline __HOST_DEVICE__ hip_bfloat16& operator+=(hip_bfloat16& a, hip_bfloat16 b)
{
    a = a + b;
    return a;
}
inline __HOST_DEVICE__ hip_bfloat16& operator-=(hip_bfloat16& a, hip_bfloat16 b)
{
    a = a - b;
    return a;
}
inline __HOST_DEVICE__ hip_bfloat16& operator*=(hip_bfloat16& a, hip_bfloat16 b)
{
    a = a * b;
    return a;
}
inline __HOST_DEVICE__ hip_bfloat16& operator/=(hip_bfloat16& a, hip_bfloat16 b)
{
    a = a / b;
    return a;
}
// Pre-increment: adds hip_bfloat16(1.0f) to `a` and returns `a`.
inline __HOST_DEVICE__ hip_bfloat16& operator++(hip_bfloat16& a)
{
    a += hip_bfloat16(1.0f);
    return a;
}
// Pre-decrement: subtracts hip_bfloat16(1.0f) from `a` and returns `a`.
inline __HOST_DEVICE__ hip_bfloat16& operator--(hip_bfloat16& a)
{
    a -= hip_bfloat16(1.0f);
    return a;
}
// Post-increment: increments `a` and returns the value it held before.
inline __HOST_DEVICE__ hip_bfloat16 operator++(hip_bfloat16& a, int)
{
    const hip_bfloat16 previous = a;
    ++a;
    return previous;
}
// Post-decrement: decrements `a` and returns the value it held before.
inline __HOST_DEVICE__ hip_bfloat16 operator--(hip_bfloat16& a, int)
{
    const hip_bfloat16 previous = a;
    --a;
    return previous;
}
namespace std
{
    // Bit-level classification of a hip_bfloat16. Masking with 0x7fff drops the
    // sign bit; 0x7f80 is the pattern with every exponent bit set and a zero
    // mantissa.

    // True when the exponent bits are all ones and the mantissa is zero.
    constexpr __HOST_DEVICE__ bool isinf(hip_bfloat16 a)
    {
        return (a.data & 0x7fff) == 0x7f80;
    }
    // True when the exponent bits are all ones and the mantissa is nonzero.
    constexpr __HOST_DEVICE__ bool isnan(hip_bfloat16 a)
    {
        return (a.data & 0x7fff) > 0x7f80;
    }
    // True when every bit other than the sign bit is zero.
    constexpr __HOST_DEVICE__ bool iszero(hip_bfloat16 a)
    {
        return (a.data & 0x7fff) == 0;
    }
}
#endif // __cplusplus < 201103L || !defined(__HIPCC__)
#endif // _HIP_BFLOAT16_H_
@@ -0,0 +1,32 @@
/*
Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H
// __HIP_CLANG_ONLY__ is 1 when both __clang__ and __HIP__ are defined, and 0
// otherwise.
#if defined(__clang__) && defined(__HIP__)
#define __HIP_CLANG_ONLY__ 1
#else
#define __HIP_CLANG_ONLY__ 0
#endif
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMMON_H
@@ -0,0 +1,314 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
#include "hip/amd_detail/amd_hip_vector_types.h"
// __HOST_DEVICE__ qualifies every helper in this header: `__device__` only
// when __HIPCC_RTC__ is defined, `__host__ __device__` otherwise. The math
// header is included only in the non-RTC case.
#if defined(__HIPCC_RTC__)
#define __HOST_DEVICE__ __device__
#else
#define __HOST_DEVICE__ __host__ __device__
// TODO: Clang has a bug which allows device functions to call std functions
// when std functions are introduced into default namespace by using statement.
// math.h may be included after this bug is fixed.
#if __cplusplus
#include <cmath>
#else
#include "math.h"
#endif
#endif // !defined(__HIPCC_RTC__)
#if __cplusplus
// Generates unary operator- for `type`: negates both the x and y members.
#define COMPLEX_NEG_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type operator-(const type& op) { \
type ret; \
ret.x = -op.x; \
ret.y = -op.y; \
return ret; \
}
// Generates operator== for `type`: true when both x and y compare equal.
#define COMPLEX_EQ_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline bool operator==(const type& lhs, const type& rhs) { \
return lhs.x == rhs.x && lhs.y == rhs.y; \
}
// Generates operator!= for `type` as the negation of operator==.
#define COMPLEX_NE_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline bool operator!=(const type& lhs, const type& rhs) { \
return !(lhs == rhs); \
}
// Generates binary operator+ for `type`: component-wise sum of x and y.
#define COMPLEX_ADD_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type operator+(const type& lhs, const type& rhs) { \
type ret; \
ret.x = lhs.x + rhs.x; \
ret.y = lhs.y + rhs.y; \
return ret; \
}
// Generates binary operator- for `type`: component-wise difference of x and y.
#define COMPLEX_SUB_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type operator-(const type& lhs, const type& rhs) { \
type ret; \
ret.x = lhs.x - rhs.x; \
ret.y = lhs.y - rhs.y; \
return ret; \
}
// Generates operator* for `type` as a complex product, with x as the real
// part and y as the imaginary part.
#define COMPLEX_MUL_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type operator*(const type& lhs, const type& rhs) { \
type ret; \
ret.x = lhs.x * rhs.x - lhs.y * rhs.y; \
ret.y = lhs.x * rhs.y + lhs.y * rhs.x; \
return ret; \
}
// Generates operator/ for `type` as a complex quotient: the numerator is
// lhs times the conjugate of rhs, and both components are divided by
// rhs.x^2 + rhs.y^2.
#define COMPLEX_DIV_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type operator/(const type& lhs, const type& rhs) { \
type ret; \
ret.x = (lhs.x * rhs.x + lhs.y * rhs.y); \
ret.y = (rhs.x * lhs.y - lhs.x * rhs.y); \
ret.x = ret.x / (rhs.x * rhs.x + rhs.y * rhs.y); \
ret.y = ret.y / (rhs.x * rhs.x + rhs.y * rhs.y); \
return ret; \
}
// Generates operator+= for `type`: adds rhs to lhs component-wise in place.
#define COMPLEX_ADD_PREOP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type& operator+=(type& lhs, const type& rhs) { \
lhs.x += rhs.x; \
lhs.y += rhs.y; \
return lhs; \
}
// Generates operator-= for `type`: subtracts rhs from lhs component-wise in place.
#define COMPLEX_SUB_PREOP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type& operator-=(type& lhs, const type& rhs) { \
lhs.x -= rhs.x; \
lhs.y -= rhs.y; \
return lhs; \
}
// Generates operator*= for `type`: complex product stored into lhs. A copy of
// lhs is taken first because lhs.x is overwritten before lhs.y is computed.
#define COMPLEX_MUL_PREOP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) { \
type temp{lhs}; \
lhs.x = rhs.x * temp.x - rhs.y * temp.y; \
lhs.y = rhs.y * temp.x + rhs.x * temp.y; \
return lhs; \
}
// Generates operator/= for `type`: complex quotient computed into a temporary
// and then assigned to lhs.
#define COMPLEX_DIV_PREOP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) { \
type temp; \
temp.x = (lhs.x*rhs.x + lhs.y * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); \
temp.y = (lhs.y * rhs.x - lhs.x * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); \
lhs = temp; \
return lhs; \
}
// Generates operator*(type, type1): multiplies both x and y of the complex
// left operand by the scalar right operand. Only the complex-on-the-left
// order is generated.
#define COMPLEX_SCALAR_PRODUCT(type, type1) \
__HOST_DEVICE__ static inline type operator*(const type& lhs, type1 rhs) { \
type ret; \
ret.x = lhs.x * rhs; \
ret.y = lhs.y * rhs; \
return ret; \
}
#endif
// Single-precision complex number: x holds the real part, y the imaginary part.
typedef float2 hipFloatComplex;
// Returns the real part (the x member) of z.
__HOST_DEVICE__ static inline float hipCrealf(hipFloatComplex z) {
    const float real_part = z.x;
    return real_part;
}
// Returns the imaginary part (the y member) of z.
__HOST_DEVICE__ static inline float hipCimagf(hipFloatComplex z) {
    const float imag_part = z.y;
    return imag_part;
}
// Builds a hipFloatComplex whose x member is `a` and whose y member is `b`.
__HOST_DEVICE__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
    hipFloatComplex result;
    result.x = a;
    result.y = b;
    return result;
}
// Complex conjugate: keeps x and negates y.
__HOST_DEVICE__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
    hipFloatComplex conjugate;
    conjugate.y = -z.y;
    conjugate.x = z.x;
    return conjugate;
}
// Squared magnitude of z: x*x + y*y.
__HOST_DEVICE__ static inline float hipCsqabsf(hipFloatComplex z) {
    const float norm_squared = z.x * z.x + z.y * z.y;
    return norm_squared;
}
// Complex sum p + q, computed component-wise.
__HOST_DEVICE__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
    const float re = p.x + q.x;
    const float im = p.y + q.y;
    return make_hipFloatComplex(re, im);
}
// Complex difference p - q, computed component-wise.
__HOST_DEVICE__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
    const float re = p.x - q.x;
    const float im = p.y - q.y;
    return make_hipFloatComplex(re, im);
}
// Complex product p * q.
__HOST_DEVICE__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
    const float re = p.x * q.x - p.y * q.y;
    const float im = p.y * q.x + p.x * q.y;
    return make_hipFloatComplex(re, im);
}
// Complex quotient p / q: p times the conjugate of q, divided by the squared
// magnitude of q.
__HOST_DEVICE__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
    const float denominator = hipCsqabsf(q);
    hipFloatComplex quotient;
    quotient.x = (p.x * q.x + p.y * q.y) / denominator;
    quotient.y = (p.y * q.x - p.x * q.y) / denominator;
    return quotient;
}
// Magnitude of z: the square root of its squared magnitude.
__HOST_DEVICE__ static inline float hipCabsf(hipFloatComplex z) {
    const float norm_squared = hipCsqabsf(z);
    return sqrtf(norm_squared);
}
// Double-precision complex number: x holds the real part, y the imaginary part.
typedef double2 hipDoubleComplex;
// Returns the real part (the x member) of z.
__HOST_DEVICE__ static inline double hipCreal(hipDoubleComplex z) {
    const double real_part = z.x;
    return real_part;
}
// Returns the imaginary part (the y member) of z.
__HOST_DEVICE__ static inline double hipCimag(hipDoubleComplex z) {
    const double imag_part = z.y;
    return imag_part;
}
// Builds a hipDoubleComplex whose x member is `a` and whose y member is `b`.
__HOST_DEVICE__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
    hipDoubleComplex result;
    result.x = a;
    result.y = b;
    return result;
}
// Complex conjugate: keeps x and negates y.
__HOST_DEVICE__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
    hipDoubleComplex conjugate;
    conjugate.y = -z.y;
    conjugate.x = z.x;
    return conjugate;
}
// Squared magnitude of z: x*x + y*y.
__HOST_DEVICE__ static inline double hipCsqabs(hipDoubleComplex z) {
    const double norm_squared = z.x * z.x + z.y * z.y;
    return norm_squared;
}
// Complex sum p + q, computed component-wise.
__HOST_DEVICE__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
    const double re = p.x + q.x;
    const double im = p.y + q.y;
    return make_hipDoubleComplex(re, im);
}
// Complex difference p - q, computed component-wise.
__HOST_DEVICE__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
    const double re = p.x - q.x;
    const double im = p.y - q.y;
    return make_hipDoubleComplex(re, im);
}
// Complex product p * q.
__HOST_DEVICE__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
    const double re = p.x * q.x - p.y * q.y;
    const double im = p.y * q.x + p.x * q.y;
    return make_hipDoubleComplex(re, im);
}
// Complex quotient p / q: p times the conjugate of q, divided by the squared
// magnitude of q.
__HOST_DEVICE__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
    const double denominator = hipCsqabs(q);
    hipDoubleComplex quotient;
    quotient.x = (p.x * q.x + p.y * q.y) / denominator;
    quotient.y = (p.y * q.x - p.x * q.y) / denominator;
    return quotient;
}
// Magnitude of z: the square root of its squared magnitude.
__HOST_DEVICE__ static inline double hipCabs(hipDoubleComplex z) {
    const double norm_squared = hipCsqabs(z);
    return sqrt(norm_squared);
}
#if __cplusplus
// Operator overloads for hipFloatComplex: negation, equality, inequality,
// the four arithmetic operators and their compound-assignment forms.
COMPLEX_NEG_OP_OVERLOAD(hipFloatComplex)
COMPLEX_EQ_OP_OVERLOAD(hipFloatComplex)
COMPLEX_NE_OP_OVERLOAD(hipFloatComplex)
COMPLEX_ADD_OP_OVERLOAD(hipFloatComplex)
COMPLEX_SUB_OP_OVERLOAD(hipFloatComplex)
COMPLEX_MUL_OP_OVERLOAD(hipFloatComplex)
COMPLEX_DIV_OP_OVERLOAD(hipFloatComplex)
COMPLEX_ADD_PREOP_OVERLOAD(hipFloatComplex)
COMPLEX_SUB_PREOP_OVERLOAD(hipFloatComplex)
COMPLEX_MUL_PREOP_OVERLOAD(hipFloatComplex)
COMPLEX_DIV_PREOP_OVERLOAD(hipFloatComplex)
// hipFloatComplex * scalar, one overload per scalar type listed below.
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned short)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed short)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned int)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed int)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, float)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned long)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed long)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, double)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed long long)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned long long)
// Operator overloads for hipDoubleComplex: negation, equality, inequality,
// the four arithmetic operators and their compound-assignment forms.
COMPLEX_NEG_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_EQ_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_NE_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_ADD_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_SUB_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_MUL_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_DIV_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_ADD_PREOP_OVERLOAD(hipDoubleComplex)
COMPLEX_SUB_PREOP_OVERLOAD(hipDoubleComplex)
COMPLEX_MUL_PREOP_OVERLOAD(hipDoubleComplex)
COMPLEX_DIV_PREOP_OVERLOAD(hipDoubleComplex)
// hipDoubleComplex * scalar, one overload per scalar type listed below.
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned short)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed short)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned int)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed int)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, float)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed long)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, double)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed long long)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long)
#endif
// hipComplex is an alias for the single-precision complex type.
typedef hipFloatComplex hipComplex;
// Builds a hipComplex with real part x and imaginary part y by forwarding to
// make_hipFloatComplex.
__HOST_DEVICE__ static inline hipComplex make_hipComplex(float x, float y) {
    const hipComplex result = make_hipFloatComplex(x, y);
    return result;
}
// Converts a double-precision complex to single precision by casting each
// component to float.
__HOST_DEVICE__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
    const float re = (float)z.x;
    const float im = (float)z.y;
    return make_hipFloatComplex(re, im);
}
// Converts a single-precision complex to double precision by casting each
// component to double.
__HOST_DEVICE__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
    const double re = (double)z.x;
    const double im = (double)z.y;
    return make_hipDoubleComplex(re, im);
}
// Complex multiply-add: returns p * q + r. Each component is accumulated in
// two steps, starting from the matching component of r.
__HOST_DEVICE__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
    // Real part: (p.x*q.x + r.x) - p.y*q.y
    float re = (p.x * q.x) + r.x;
    re = -(p.y * q.y) + re;
    // Imaginary part: (q.x*p.y + r.y) + p.x*q.y
    float im = (q.x * p.y) + r.y;
    im = (p.x * q.y) + im;
    return make_hipComplex(re, im);
}
// Complex multiply-add in double precision: returns p * q + r. Each component
// is accumulated in two steps, starting from the matching component of r.
__HOST_DEVICE__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
                                                       hipDoubleComplex r) {
    // Real part: (p.x*q.x + r.x) - p.y*q.y
    double re = (p.x * q.x) + r.x;
    re = -(p.y * q.y) + re;
    // Imaginary part: (q.x*p.y + r.y) + p.x*q.y
    double im = (q.x * p.y) + r.y;
    im = (p.x * q.y) + im;
    return make_hipDoubleComplex(re, im);
}
#endif //HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
@@ -0,0 +1,708 @@
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* @file amd_detail/hip_cooperative_groups.h
*
* @brief Device side implementation of `Cooperative Group` feature.
*
* Defines new types and device API wrappers related to `Cooperative Group`
* feature, which the programmer can directly use in his kernel(s) in order to
* make use of this feature.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wc++98-compat"
#pragma clang diagnostic ignored "-Wsign-conversion"
#pragma clang diagnostic ignored "-Wunused-parameter"
#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
#pragma clang diagnostic ignored "-Wpadded"
#if __cplusplus
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/hip_cooperative_groups_helper.h>
#endif
// __hip_abort(): stops execution by issuing a `trap` instruction.
#define __hip_abort() \
  { asm("trap;"); }
// __hip_assert(COND): calls __hip_abort() when COND is false. Expands to
// nothing when NDEBUG is defined.
// COND is parenthesised before being negated so that a compound condition is
// negated as a whole: without the parentheses `__hip_assert(a && b)` would
// expand to `!a && b`.
#if defined(NDEBUG)
#define __hip_assert(COND)
#else
#define __hip_assert(COND) \
  {                        \
    if (!(COND)) {         \
      __hip_abort();       \
    }                      \
  }
#endif
namespace cooperative_groups {
/** \brief The base type of all cooperative group types
 *
 *  \details Stores the properties shared by every cooperative group object:
 *           the group type, its size and a lane mask.
 */
class thread_group {
 protected:
  uint32_t _type;  // Type of this thread_group
  uint32_t _size;  // Total number of threads in the thread_group
  uint64_t _mask;  // Lane mask for coalesced and tiled partitioned group types;
                   // the LSB represents lane 0 and the MSB represents lane 63

  // Stores the group type, size and mask. This generic group is constructed
  // directly only when the group is supposed to contain just the calling
  // thread (through the API `this_thread()`); in every other case it is the
  // base sub-object of a derived group type.
  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = 0, uint64_t mask = 0)
      : _type(type), _size(size), _mask(mask) {}

  // Tiling state of a group.
  struct _tiled_info {
    bool is_tiled;
    unsigned int size;
  };

  // Membership state used by the coalesced and tiled group types.
  struct _coalesced_info {
    lane_mask member_mask;
    unsigned int size;
    struct _tiled_info tiled_info;
  } coalesced_info;

  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend class thread_block;

 public:
  // Total number of threads in the group. The value is stored at construction
  // time, so it serves every derived cooperative group type.
  __CG_QUALIFIER__ uint32_t size() const { return _size; }
  // Type of this group, as stored at construction time.
  __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
  // Rank of the calling thread within [0, size())
  __CG_QUALIFIER__ uint32_t thread_rank() const;
  // Is this cooperative group type valid?
  __CG_QUALIFIER__ bool is_valid() const;
  // Synchronizes the threads in the thread group
  __CG_QUALIFIER__ void sync() const;
};
/** \brief The multi-grid cooperative group type
 *
 *  \details An inter-device cooperative group: the participating threads span
 *           multiple devices, all running the (same) kernel.
 */
class multi_grid_group : public thread_group {
  // this_multi_grid() is the only function allowed to construct an object of
  // this class and to access its resources.
  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();

 protected:
  // Constructs the multi-grid thread group (through the API this_multi_grid()).
  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
      : thread_group(internal::cg_multi_grid, size) {}

 public:
  // Number of invocations participating in this multi-grid group, in other
  // words the number of GPUs.
  __CG_QUALIFIER__ uint32_t num_grids() {
    return internal::multi_grid::num_grids();
  }
  // Rank of this invocation: an ID in [0, num_grids()) identifying the GPU
  // this kernel is running on.
  __CG_QUALIFIER__ uint32_t grid_rank() {
    return internal::multi_grid::grid_rank();
  }
  // The three members below forward to the internal::multi_grid helpers.
  __CG_QUALIFIER__ uint32_t thread_rank() const {
    return internal::multi_grid::thread_rank();
  }
  __CG_QUALIFIER__ bool is_valid() const {
    return internal::multi_grid::is_valid();
  }
  __CG_QUALIFIER__ void sync() const {
    internal::multi_grid::sync();
  }
};
/** \brief User exposed API to construct a `multi_grid_group` object
 *
 *  \details A `multi_grid_group` cannot be constructed directly by the user;
 *           it has to be obtained through this function.
 */
__CG_QUALIFIER__ multi_grid_group this_multi_grid() {
  const uint32_t total_threads = internal::multi_grid::size();
  return multi_grid_group(total_threads);
}
/** \brief The grid cooperative group type
 *
 *  \details An inter-workgroup cooperative group: the participating threads
 *           span multiple workgroups running the (same) kernel on the same
 *           device.
 */
class grid_group : public thread_group {
  // this_grid() is the only function allowed to construct an object of this
  // class and to access its resources.
  friend __CG_QUALIFIER__ grid_group this_grid();

 protected:
  // Constructs the grid thread group (through the API this_grid()).
  explicit __CG_QUALIFIER__ grid_group(uint32_t size)
      : thread_group(internal::cg_grid, size) {}

 public:
  // The three members below forward to the internal::grid helpers.
  __CG_QUALIFIER__ uint32_t thread_rank() const {
    return internal::grid::thread_rank();
  }
  __CG_QUALIFIER__ bool is_valid() const {
    return internal::grid::is_valid();
  }
  __CG_QUALIFIER__ void sync() const {
    internal::grid::sync();
  }
};
/** \brief User exposed API to construct a `grid_group` object
 *
 *  \details A `grid_group` cannot be constructed directly by the user; it has
 *           to be obtained through this function.
 */
__CG_QUALIFIER__ grid_group this_grid() {
  const uint32_t total_threads = internal::grid::size();
  return grid_group(total_threads);
}
/** \brief The workgroup (thread-block in CUDA terminology) cooperative group
 *         type
 *
 *  \details An intra-workgroup cooperative group: its threads are exactly the
 *           threads of the currently executing `workgroup`.
 */
class thread_block : public thread_group {
  // Only these friend functions are allowed to construct an object of this
  // class and to access its resources.
  friend __CG_QUALIFIER__ thread_block this_thread_block();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
                                                       unsigned int tile_size);

 protected:
  // Constructs a workgroup thread group (through the API this_thread_block()).
  explicit __CG_QUALIFIER__ thread_block(uint32_t size)
      : thread_group(internal::cg_workgroup, size) {}

  // Builds a generic thread_group of type cg_tiled_group with `tile_size`
  // threads. A tile size that is zero, larger than the wavefront size, or not
  // a power of two triggers __hip_assert.
  __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
    const bool invalid_size = (tile_size == 0) || (tile_size > __AMDGCN_WAVEFRONT_SIZE) ||
                              ((tile_size & (tile_size - 1)) != 0);
    if (invalid_size) {
      __hip_assert(false && "invalid tile size")
    }

    thread_group tile(internal::cg_tiled_group, tile_size);
    tile.coalesced_info.tiled_info.is_tiled = true;
    tile.coalesced_info.tiled_info.size = tile_size;
    return tile;
  }

 public:
  // 3-dimensional block index within the grid
  __CG_QUALIFIER__ dim3 group_index() {
    return internal::workgroup::group_index();
  }
  // 3-dimensional thread index within the block
  __CG_QUALIFIER__ dim3 thread_index() {
    return internal::workgroup::thread_index();
  }
  // The three members below forward to the internal::workgroup helpers.
  __CG_QUALIFIER__ uint32_t thread_rank() const {
    return internal::workgroup::thread_rank();
  }
  __CG_QUALIFIER__ bool is_valid() const {
    return internal::workgroup::is_valid();
  }
  __CG_QUALIFIER__ void sync() const {
    internal::workgroup::sync();
  }
};
/** \brief User exposed API to construct a `thread_block` object
 *
 *  \details A `thread_block` cannot be constructed directly by the user; it
 *           has to be obtained through this function.
 */
__CG_QUALIFIER__ thread_block this_thread_block() {
  const uint32_t total_threads = internal::workgroup::size();
  return thread_block(total_threads);
}
/** \brief The tiled_group cooperative group type
 *
 *  \details Represents one tiled thread group in a wavefront.
 *           This group type also supports sub-wave level intrinsics.
 */
class tiled_group : public thread_group {
 private:
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
                                                      unsigned int tile_size);

  // Partitions this tile into tiles of `tile_size` threads. A tile size that
  // is zero, larger than the wavefront size, or not a power of two triggers
  // __hip_assert. When this tile is not larger than `tile_size` a copy of this
  // group is returned.
  __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
    const bool invalid_size = (tile_size == 0) || (tile_size > __AMDGCN_WAVEFRONT_SIZE) ||
                              ((tile_size & (tile_size - 1)) != 0);
    if (invalid_size) {
      __hip_assert(false && "invalid tile size")
    }

    if (size() <= tile_size) {
      return *this;
    }

    tiled_group sub_tile(tile_size);
    sub_tile.coalesced_info.tiled_info.is_tiled = true;
    return sub_tile;
  }

 protected:
  // Builds a tile of `tileSize` threads and records the tiling state.
  explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
      : thread_group(internal::cg_tiled_group, tileSize) {
    coalesced_info.tiled_info.is_tiled = true;
    coalesced_info.tiled_info.size = tileSize;
  }

 public:
  // Number of threads in this tile.
  __CG_QUALIFIER__ unsigned int size() const {
    return coalesced_info.tiled_info.size;
  }

  // Workgroup thread rank masked with (tile size - 1).
  __CG_QUALIFIER__ unsigned int thread_rank() const {
    const unsigned int rank_mask = coalesced_info.tiled_info.size - 1;
    return internal::workgroup::thread_rank() & rank_mask;
  }

  // Forwards to internal::tiled_group::sync().
  __CG_QUALIFIER__ void sync() const {
    internal::tiled_group::sync();
  }
};
/** \brief The coalesced_group cooperative group type
 *
 *  \details Represents an active thread group in a wavefront.
 *           This group type also supports sub-wave level intrinsics.
 */
class coalesced_group : public thread_group {
 private:
  friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size);
  friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size);

  // Partitions this group into tiles of `tile_size` threads and returns the
  // tile containing the calling thread. Returns an empty group (mask 0) when
  // `tile_size` is zero, larger than this group, or not a power of two.
  __CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

    if (!tile_size || (tile_size > size()) || !pow2) {
      return coalesced_group(0);
    }

    // If a tiled group is passed to be partitioned further into a
    // coalesced_group, prepare a mask for further partitioning it so that it
    // stays coalesced.
    if (coalesced_info.tiled_info.is_tiled) {
      unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
      unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
      // NOTE(review): this shift assumes lane_mask is exactly
      // __AMDGCN_WAVEFRONT_SIZE bits wide - confirm for 32-lane wavefronts.
      lane_mask member_mask = static_cast<lane_mask>(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);

      member_mask <<= (__lane_id() & ~(tile_size - 1));
      coalesced_group coalesced_tile = coalesced_group(member_mask);
      coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
      return coalesced_tile;
    }

    // Here the parent coalesced_group is not partitioned: walk the active
    // lanes, skip those belonging to earlier tiles, and collect the next
    // `tile_size` active lanes.
    lane_mask member_mask = 0;
    unsigned int tile_rank = 0;
    int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;

    for (unsigned int i = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) {
      // The single-bit probe is built in lane_mask width; shifting a plain
      // `int` 1 is undefined once i reaches the width of int.
      lane_mask active = coalesced_info.member_mask & (static_cast<lane_mask>(1) << i);
      // Make sure the lane is active
      if (active) {
        if (lanes_to_skip <= 0 && tile_rank < tile_size) {
          // Prepare a member_mask that is appropriate for a tile
          member_mask |= active;
          tile_rank++;
        }
        lanes_to_skip--;
      }
    }
    return coalesced_group(member_mask);
  }

 protected:
  // Builds a coalesced group from the mask of participating lanes.
  explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
      : thread_group(internal::cg_coalesced_group) {
    coalesced_info.member_mask = member_mask;                    // Which threads are active
    coalesced_info.size = __popcll(coalesced_info.member_mask);  // How many threads are active
    coalesced_info.tiled_info.is_tiled = false;                  // Not a partitioned group
  }

 public:
  // Number of threads in this group (set bits of the member mask).
  __CG_QUALIFIER__ unsigned int size() const {
    return coalesced_info.size;
  }

  // Rank of the calling thread, obtained from
  // internal::coalesced_group::masked_bit_count() on the member mask.
  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
  }

  // Forwards to internal::coalesced_group::sync().
  __CG_QUALIFIER__ void sync() const {
    internal::coalesced_group::sync();
  }

  // Reads `var` from the group member of rank `srcRank` (taken modulo the
  // group size). When the group does not cover the whole wavefront, the rank
  // is first translated to a lane with __fns64/__fns32.
  template <class T>
  __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");

    srcRank = srcRank % static_cast<int>(size());

    int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank
             : (__AMDGCN_WAVEFRONT_SIZE == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
                                               : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }

  // Reads `var` from the member `lane_delta` positions after the caller. When
  // no such member exists the caller reads its own value.
  template <class T>
  __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");

    // Note: The cuda implementation appears to use the remainder of lane_delta
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.

    if (size() == __AMDGCN_WAVEFRONT_SIZE) {
      return __shfl_down(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
    }

    int lane;
    if (__AMDGCN_WAVEFRONT_SIZE == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    }
    else {
      lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    }

    if (lane == -1) {
      lane = __lane_id();
    }

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }

  // Reads `var` from the member `lane_delta` positions before the caller. When
  // no such member exists the caller reads its own value.
  template <class T>
  __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");

    // Note: The cuda implementation appears to use the remainder of lane_delta
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.

    if (size() == __AMDGCN_WAVEFRONT_SIZE) {
      return __shfl_up(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
    }

    int lane;
    if (__AMDGCN_WAVEFRONT_SIZE == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
    }
    else {
      // Plain `else` (as in shfl_down) so that `lane` is always assigned.
      lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
    }

    if (lane == -1) {
      lane = __lane_id();
    }

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }
};
/** \brief User exposed API to create coalesced groups.
 *
 *  \details A collective operation that groups all active lanes into a new
 *           thread group; the member mask is the value returned by
 *           __builtin_amdgcn_read_exec().
 */
__CG_QUALIFIER__ coalesced_group coalesced_threads() {
  cooperative_groups::coalesced_group active_lanes(__builtin_amdgcn_read_exec());
  return active_lanes;
}
/**
 *  Implementation of all publicly exposed base class APIs
 */
// Dispatches on the stored group type and forwards to thread_rank() of the
// matching derived group type. An unknown type triggers __hip_assert and
// yields -1 converted to uint32_t.
__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
  if (this->_type == internal::cg_multi_grid) {
    return static_cast<const multi_grid_group*>(this)->thread_rank();
  }
  if (this->_type == internal::cg_grid) {
    return static_cast<const grid_group*>(this)->thread_rank();
  }
  if (this->_type == internal::cg_workgroup) {
    return static_cast<const thread_block*>(this)->thread_rank();
  }
  if (this->_type == internal::cg_tiled_group) {
    return static_cast<const tiled_group*>(this)->thread_rank();
  }
  if (this->_type == internal::cg_coalesced_group) {
    return static_cast<const coalesced_group*>(this)->thread_rank();
  }
  __hip_assert(false && "invalid cooperative group type")
  return -1;
}
// Dispatches on the stored group type and forwards to is_valid() of the
// matching derived group type. An unknown type triggers __hip_assert and
// yields false.
__CG_QUALIFIER__ bool thread_group::is_valid() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->is_valid());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->is_valid());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->is_valid());
    }
    case internal::cg_tiled_group:
    case internal::cg_coalesced_group: {
      // tiled_group and coalesced_group declare no is_valid() of their own, so
      // calling is_valid() through a downcast would resolve back to this very
      // function and recurse without end. Answer directly instead.
      // NOTE(review): `true` is chosen because these groups are formed from
      // threads that are already executing - confirm the intended semantics.
      return true;
    }
    default: {
      __hip_assert(false && "invalid cooperative group type")
      return false;
    }
  }
}
// Dispatches on the stored group type and forwards to sync() of the matching
// derived group type. An unknown type triggers __hip_assert.
__CG_QUALIFIER__ void thread_group::sync() const {
  switch (this->_type) {
    case internal::cg_multi_grid:
      static_cast<const multi_grid_group*>(this)->sync();
      return;
    case internal::cg_grid:
      static_cast<const grid_group*>(this)->sync();
      return;
    case internal::cg_workgroup:
      static_cast<const thread_block*>(this)->sync();
      return;
    case internal::cg_tiled_group:
      static_cast<const tiled_group*>(this)->sync();
      return;
    case internal::cg_coalesced_group:
      static_cast<const coalesced_group*>(this)->sync();
      return;
    default: {
      __hip_assert(false && "invalid cooperative group type")
    }
  }
}
/**
 *  Implementation of publicly exposed `wrapper` APIs on top of basic
 *  cooperative group type APIs. Each wrapper forwards to the member function
 *  of the same purpose on the group passed in.
 */
// Returns g.size().
template <class CGTy>
__CG_QUALIFIER__ uint32_t group_size(CGTy const& g) {
  return g.size();
}

// Returns g.thread_rank().
template <class CGTy>
__CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) {
  return g.thread_rank();
}

// Returns g.is_valid().
template <class CGTy>
__CG_QUALIFIER__ bool is_valid(CGTy const& g) {
  return g.is_valid();
}

// Calls g.sync().
template <class CGTy>
__CG_QUALIFIER__ void sync(CGTy const& g) {
  g.sync();
}
// Base helper for tiles whose size is a compile-time constant: exposes the
// tile size and the calling thread's rank inside the tile.
template <unsigned int tileSize> class tile_base {
protected:
// Tile size, fixed by the template argument.
_CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
public:
// Rank of the thread within this tile
// Obtained by masking the workgroup-wide thread rank with (numThreads - 1).
// NOTE(review): the mask equals "rank modulo numThreads" only when numThreads
// is a power of 2; this class does not check that itself - confirm callers do.
_CG_STATIC_CONST_DECL_ unsigned int thread_rank() {
return (internal::workgroup::thread_rank() & (numThreads - 1));
}
// Number of threads within this tile
__CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; }
};
/**
 * Adds synchronization and shuffle operations to tile_base for a tile of
 * `size` threads. The size is validated at compile time through
 * is_valid_tile_size, and every shuffle passes the tile size as the width
 * argument of the underlying __shfl* call.
 */
template <unsigned int size>
class thread_block_tile_base : public tile_base<size> {
  static_assert(is_valid_tile_size<size>::value,
                "Tile size is either not a power of 2 or greater than the wavefront size");
  using tile_base<size>::numThreads;

 public:
  /** Synchronizes via internal::tiled_group::sync(). */
  __CG_STATIC_QUALIFIER__ void sync() { internal::tiled_group::sync(); }

  /** Forwards to __shfl(var, srcRank, numThreads). */
  template <class T>
  __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return __shfl(var, srcRank, numThreads);
  }

  /** Forwards to __shfl_down(var, lane_delta, numThreads). */
  template <class T>
  __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return __shfl_down(var, lane_delta, numThreads);
  }

  /** Forwards to __shfl_up(var, lane_delta, numThreads). */
  template <class T>
  __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return __shfl_up(var, lane_delta, numThreads);
  }

  /** Forwards to __shfl_xor(var, laneMask, numThreads). */
  template <class T>
  __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return __shfl_xor(var, laneMask, numThreads);
  }
};
/** \brief Group type - thread_block_tile
 *
 *  \details Represents one tile of a thread group. Combines the compile-time
 *           sized operations of thread_block_tile_base with the runtime
 *           tiled_group state; size(), sync() and thread_rank() are taken
 *           from the compile-time base.
 */
template <unsigned int tileSize, class ParentCGTy = void>
class thread_block_tile_type : public thread_block_tile_base<tileSize>, public tiled_group {
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
  friend class thread_block_tile_type<tileSize, ParentCGTy>;
  using tbtBase = thread_block_tile_base<numThreads>;

 protected:
  // Builds the tiled_group base with the tile size and records the tile
  // metadata in coalesced_info.
  __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
    coalesced_info.tiled_info.is_tiled = true;
    coalesced_info.tiled_info.size = numThreads;
  }

 public:
  using tbtBase::size;
  using tbtBase::sync;
  using tbtBase::thread_rank;
};
/** \brief User exposed API to partition groups.
 *
 *  \details A collective operation that partitions the parent group into a
 *           one-dimensional, row-major tiling of subgroups. The parent is
 *           down-cast according to its cg_type(): tiled and coalesced groups
 *           use their own new_tiled_group(); every other type is handled as
 *           a thread_block.
 */
__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
  if (parent.cg_type() == internal::cg_tiled_group) {
    return static_cast<const tiled_group*>(&parent)->new_tiled_group(tile_size);
  }
  if (parent.cg_type() == internal::cg_coalesced_group) {
    return static_cast<const coalesced_group*>(&parent)->new_tiled_group(tile_size);
  }
  // Fallback: treat the parent as a thread_block.
  return static_cast<const thread_block*>(&parent)->new_tiled_group(tile_size);
}
// Overloads for statically known parent types; each one forwards straight to
// the parent's own new_tiled_group().

// Thread block type overload
__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
                                              unsigned int tile_size) {
  return parent.new_tiled_group(tile_size);
}

// Tiled group overload
__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
                                             unsigned int tile_size) {
  return parent.new_tiled_group(tile_size);
}

// If a coalesced group is passed to be partitioned, it should remain coalesced
__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
                                                 unsigned int tile_size) {
  return parent.new_tiled_group(tile_size);
}
// Forward declaration of the user-facing tile class.
template <unsigned int size, class ParentCGTy> class thread_block_tile;
namespace impl {
template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;
// Internal layer derived from thread_block_tile_type. Both constructors
// ignore the value of their argument and default-construct the
// thread_block_tile_type base; the argument only selects the overload.
template <unsigned int size, class ParentCGTy>
class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGTy> {
protected:
// Construct from any other thread_block_tile_internal instantiation.
template <unsigned int tbtSize, class tbtParentT>
__CG_QUALIFIER__ thread_block_tile_internal(
const thread_block_tile_internal<tbtSize, tbtParentT>& g)
: thread_block_tile_type<size, ParentCGTy>() {}
// Construct from a thread_block parent.
__CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
: thread_block_tile_type<size, ParentCGTy>() {}
};
} // namespace impl
// Tile of `size` threads that remembers the type of the group it was
// partitioned from (ParentCGTy).
template <unsigned int size, class ParentCGTy>
class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCGTy> {
protected:
// Protected: only derived classes construct a tile from its parent group.
__CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
: impl::thread_block_tile_internal<size, ParentCGTy>(g) {}
public:
// Conversion to the parent-agnostic form thread_block_tile<size, void>.
__CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
return thread_block_tile<size, void>(*this);
}
};
// Specialization for ParentCGTy = void: the parent-agnostic tile type.
template <unsigned int size>
class thread_block_tile<size, void> : public impl::thread_block_tile_internal<size, void> {
// All thread_block_tile instantiations are friends of this specialization.
template <unsigned int, class ParentCGTy> friend class thread_block_tile;
protected:
public:
// Constructible from a tile of the same size with any parent type.
template <class ParentCGTy>
__CG_QUALIFIER__ thread_block_tile(const thread_block_tile<size, ParentCGTy>& g)
: impl::thread_block_tile_internal<size, void>(g) {}
};
// Redeclaration that supplies the default ParentCGTy = void.
template <unsigned int size, class ParentCGTy = void> class thread_block_tile;
namespace impl {
// Helper used by the templated tiled_partition(); only the thread_block
// parent specialization is defined in this part of the file.
template <unsigned int size, class ParentCGTy = void> struct tiled_partition_internal;
template <unsigned int size>
struct tiled_partition_internal<size, thread_block> : public thread_block_tile<size, thread_block> {
// Public constructor that forwards to thread_block_tile's protected one.
__CG_QUALIFIER__ tiled_partition_internal(const thread_block& g)
: thread_block_tile<size, thread_block>(g) {}
};
} // namespace impl
/** \brief User exposed API to partition groups.
 *
 *  \details Constructs a templated tile object whose class is derived from
 *           thread_group. The tile size of the new thread group is the
 *           template argument and is therefore fixed at compile time; it is
 *           checked against is_valid_tile_size.
 */
template <unsigned int size, class ParentCGTy>
__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
  static_assert(is_valid_tile_size<size>::value,
                "Tiled partition with size > wavefront size. Currently not supported ");
  // The helper derives from thread_block_tile<size, ParentCGTy>, so it
  // converts to the declared return type.
  return impl::tiled_partition_internal<size, ParentCGTy>(g);
}
} // namespace cooperative_groups
#pragma clang diagnostic pop
#endif // __cplusplus
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
文件差异内容过多而无法显示 加载差异
@@ -0,0 +1,59 @@
/*
Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef AMD_HIP_MATH_CONSTANTS_H
#define AMD_HIP_MATH_CONSTANTS_H

// Single-precision math constants for HIP.

// Special values, given as IEEE-754 binary32 bit patterns.
#define HIP_INF_F __int_as_float(0x7f800000U)         // +infinity
#define HIP_NAN_F __int_as_float(0x7fffffffU)         // quiet NaN
#define HIP_MIN_DENORM_F __int_as_float(0x00000001U)  // smallest positive denormal
#define HIP_MAX_NORMAL_F __int_as_float(0x7f7fffffU)  // largest finite value
#define HIP_NEG_ZERO_F __int_as_float(0x80000000U)    // -0.0

// Basic values.
#define HIP_ZERO_F 0.0F
#define HIP_ONE_F 1.0F

// Roots and fractions.
#define HIP_SQRT_HALF_F 0.707106781F        // sqrt(1/2)
#define HIP_SQRT_HALF_HI_F 0.707106781F     // sqrt(1/2), high part
#define HIP_SQRT_HALF_LO_F 1.210161749e-08F // sqrt(1/2), low-order correction
#define HIP_SQRT_TWO_F 1.414213562F         // sqrt(2)
#define HIP_THIRD_F 0.333333333F            // 1/3

// Multiples and functions of pi.
#define HIP_PIO4_F 0.785398163F             // pi/4
#define HIP_PIO2_F 1.570796327F             // pi/2
#define HIP_3PIO4_F 2.356194490F            // 3*pi/4
#define HIP_2_OVER_PI_F 0.636619772F        // 2/pi
#define HIP_SQRT_2_OVER_PI_F 0.797884561F   // sqrt(2/pi)
#define HIP_PI_F 3.141592654F               // pi

// Logarithms.
#define HIP_L2E_F 1.442695041F              // log2(e)
#define HIP_L2T_F 3.321928094F              // log2(10)
#define HIP_LG2_F 0.301029996F              // log10(2)
#define HIP_LGE_F 0.434294482F              // log10(e)
#define HIP_LN2_F 0.693147181F              // ln(2)
#define HIP_LNT_F 2.302585093F              // ln(10)
#define HIP_LNPI_F 1.144729886F             // ln(pi)

// Powers of two and range limits.
#define HIP_TWO_TO_M126_F 1.175494351e-38F  // 2^-126
#define HIP_TWO_TO_126_F 8.507059173e37F    // 2^126
#define HIP_NORM_HUGE_F 3.402823466e38F     // largest finite float
#define HIP_TWO_TO_23_F 8388608.0F          // 2^23
#define HIP_TWO_TO_24_F 16777216.0F         // 2^24
#define HIP_TWO_TO_31_F 2147483648.0F       // 2^31
#define HIP_TWO_TO_32_F 4294967296.0F       // 2^32

// remquo support: number of quotient bits and the matching low-bit mask.
// The mask must be built from HIP_REMQUO_BITS_F; the former spelling
// HIPRT_REMQUO_BITS_F is not defined in this header, so any use of the mask
// expanded to an undeclared identifier.
#define HIP_REMQUO_BITS_F 3U
#define HIP_REMQUO_MASK_F (~((~0U)<<HIP_REMQUO_BITS_F))

// NOTE(review): presumably the argument magnitude beyond which trig functions
// lose precision - confirm against the math library that consumes it.
#define HIP_TRIG_PLOSS_F 105615.0F

#endif
@@ -0,0 +1,435 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* @file amd_detail/hip_runtime.h
* @brief Contains definitions of APIs for HIP runtime.
*/
//#pragma once
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
#include <hip/amd_detail/amd_hip_common.h>
// Library build-information queries, declared with C linkage.
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief Query the installed library build name.
*
* This function can be used even when the library is not initialized.
*
* @returns Returns a string describing the build version of the library. The
* string is owned by the library.
*/
const char* amd_dbgapi_get_build_name();
/**
* @brief Query the installed library git hash.
*
* This function can be used even when the library is not initialized.
*
* @returns Returns git hash of the library.
*/
const char* amd_dbgapi_get_git_hash();
/**
* @brief Query the installed library build ID.
*
* This function can be used even when the library is not initialized.
*
* @returns Returns build ID of the library.
*/
size_t amd_dbgapi_get_build_id();
#ifdef __cplusplus
} /* extern "C" */
#endif
//---
// Top part of file can be compiled with any compiler
// Outside of hipRTC builds pull the integer and math definitions from the
// standard headers (C++ or C flavour depending on the compiler mode).
#if !defined(__HIPCC_RTC__)
//#include <cstring>
#if __cplusplus
#include <cmath>
#include <cstdint>
#else
#include <math.h>
#include <string.h>
#include <stddef.h>
#endif // __cplusplus
#else
// hipRTC build: no standard headers are included here, so the fixed-width
// integer typedefs are declared by hand and re-exported into namespace std.
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef signed int int32_t;
typedef signed long long int64_t;
namespace std {
using ::uint32_t;
using ::uint64_t;
using ::int32_t;
using ::int64_t;
}
#endif // !defined(__HIPCC_RTC__)
#if __HIP_CLANG_ONLY__
// Provide __align__(x) as the GNU-style aligned attribute unless it is
// already defined.
#if !defined(__align__)
#define __align__(x) __attribute__((aligned(x)))
#endif
// Alias of the CUDA success code onto the HIP one.
#define CUDA_SUCCESS hipSuccess
#if !defined(__HIPCC_RTC__)
#include <hip/hip_runtime_api.h>
// Declared here, defined elsewhere.
extern int HIP_TRACE_API;
#endif // !defined(__HIPCC_RTC__)
#ifdef __cplusplus
#include <hip/amd_detail/hip_ldg.h>
#endif
#include <hip/amd_detail/amd_hip_atomic.h>
#include <hip/amd_detail/host_defines.h>
#include <hip/amd_detail/amd_device_functions.h>
#include <hip/amd_detail/amd_surface_functions.h>
#include <hip/amd_detail/texture_fetch_functions.h>
#include <hip/amd_detail/texture_indirect_functions.h>
// TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
// Map the older __KALMAR_ACCELERATOR__ macro onto __HCC_ACCELERATOR__ when
// only the former is defined.
#if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
#define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
#endif
// Feature tests:
// The __HIP_ARCH_HAS_* macros below are defined only for device compilation;
// each expands to (1) when the feature is reported as available and (0)
// otherwise.
#if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
// Device compile and not host compile:
// 32-bit Atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)
// 64-bit Atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (1)
// Doubles
#define __HIP_ARCH_HAS_DOUBLES__ (1)
// warp cross-lane operations:
#define __HIP_ARCH_HAS_WARP_VOTE__ (1)
#define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
// sync
#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
// misc
#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
#define __HIP_ARCH_HAS_3DGRID__ (1)
#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
#endif /* Device feature flags */
// __launch_bounds__(maxThreads) and __launch_bounds__(maxThreads, minBlocks)
// are mapped onto AMDGPU function attributes.
// One-argument form: flat work-group size range [1, requiredMaxThreadsPerBlock].
#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
// Two-argument form: additionally passes the second argument to
// amdgpu_waves_per_eu.
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
// Argument-count dispatch: select_impl_ always yields its third argument, so
// one user argument selects launch_bounds_impl0 and two select
// launch_bounds_impl1.
#define select_impl_(_1, _2, impl_, ...) impl_
#define __launch_bounds__(...) \
select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0, )(__VA_ARGS__)
#if !defined(__HIPCC_RTC__)
// Host-side definition: always returns a null pointer.
__host__ inline void* __get_dynamicgroupbaseptr() {
  return nullptr;
}
#endif  // !defined(__HIPCC_RTC__)
// End doxygen API:
/**
* @}
*/
//
// hip-clang functions
//
#if !defined(__HIPCC_RTC__)
// HIP_KERNEL_NAME passes its arguments through unchanged.
#define HIP_KERNEL_NAME(...) __VA_ARGS__
// HIP_SYMBOL(X) expands to X itself.
#define HIP_SYMBOL(X) X
// NOTE(review): presumably kept for source compatibility with older kernel
// signatures - confirm before removing.
typedef int hipLaunchParm;
/**
 * Recursion terminator: selected once n equals the number of tuple elements,
 * i.e. every argument slot has already been filled.
 */
template <std::size_t n, typename... Ts,
          typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>&, void*) {}

/**
 * Stores the address of the n-th element of `formals` into `_vargs[n]` and
 * recurses for element n + 1.
 *
 * @param formals Tuple holding the kernel arguments; it must outlive every
 *                use of the pointers written to `_vargs`.
 * @param _vargs  Output array with at least sizeof...(Ts) slots.
 */
template <std::size_t n, typename... Ts,
          typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
void pArgs(const std::tuple<Ts...>& formals, void** _vargs) {
  using ArgT = typename std::tuple_element<n, std::tuple<Ts...>>::type;

  static_assert(!std::is_reference<ArgT>::value,
                "A __global__ function cannot have a reference as one of its "
                "arguments.");
#if defined(HIP_STRICT)
  static_assert(std::is_trivially_copyable<ArgT>::value,
                "Only TriviallyCopyable types can be arguments to a __global__ "
                "function");
#endif

  // The launch API takes non-const void*, hence the const_cast.
  const void* slot = reinterpret_cast<const void*>(&std::get<n>(formals));
  _vargs[n] = const_cast<void*>(slot);

  pArgs<n + 1>(formals, _vargs);
}
/**
 * Checks at compile time that the number of supplied arguments matches the
 * kernel's parameter count, then converts the supplied values to the
 * kernel's formal parameter types.
 *
 * @param kernel  Kernel function pointer; used only to deduce Formals.
 * @param actuals Tuple of the values supplied by the caller.
 * @return Tuple of the formal parameter types, built from `actuals`.
 */
template <typename... Formals, typename... Actuals>
std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...),
                                             std::tuple<Actuals...> actuals) {
  static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch");
  return std::tuple<Formals...>{std::move(actuals)};
}
// hipLaunchKernelGGL comes in two flavours, chosen by
// HIP_TEMPLATE_KERNEL_LAUNCH: a function template that packs the arguments
// and calls hipLaunchKernel, or a macro that expands to a <<<...>>> launch.
#if defined(HIP_TEMPLATE_KERNEL_LAUNCH)
template <typename... Args, typename F = void (*)(Args...)>
void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
constexpr size_t count = sizeof...(Args);
// Copy the arguments into a tuple, convert them to the kernel's formal
// parameter types, then collect one pointer per argument.
auto tup_ = std::tuple<Args...>{args...};
auto tup = validateArgsCountType(kernel, tup_);
// NOTE(review): for a kernel without parameters count is 0, which makes this
// a zero-length array - confirm that is acceptable for the compilers in use.
void* _Args[count];
pArgs<0>(tup, _Args);
auto k = reinterpret_cast<void*>(kernel);
// The hipError_t result of hipLaunchKernel is not propagated to the caller.
hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
}
#else
// Macro flavour: the first four variadic arguments are the launch
// configuration, the remainder are the kernel arguments.
#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \
do { \
kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__); \
} while (0)
#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
#endif
#include <hip/hip_runtime_api.h>
#endif // !defined(__HIPCC_RTC__)
// Device-library work-item queries; the argument selects the dimension.
extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(uint);
extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(uint);
extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(uint);
extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(uint);

// Stateless functors wrapping the queries above. Each narrows the size_t
// result to 32 bits; they serve as the F parameter of __HIP_Coordinates.

/** Functor returning __ockl_get_group_id(x). */
struct __HIP_BlockIdx {
  __device__ std::uint32_t operator()(std::uint32_t x) const noexcept {
    return static_cast<std::uint32_t>(__ockl_get_group_id(x));
  }
};

/** Functor returning __ockl_get_local_size(x). */
struct __HIP_BlockDim {
  __device__ std::uint32_t operator()(std::uint32_t x) const noexcept {
    return static_cast<std::uint32_t>(__ockl_get_local_size(x));
  }
};

/** Functor returning __ockl_get_num_groups(x). */
struct __HIP_GridDim {
  __device__ std::uint32_t operator()(std::uint32_t x) const noexcept {
    return static_cast<std::uint32_t>(__ockl_get_num_groups(x));
  }
};

/** Functor returning __ockl_get_local_id(x). */
struct __HIP_ThreadIdx {
  __device__ std::uint32_t operator()(std::uint32_t x) const noexcept {
    return static_cast<std::uint32_t>(__ockl_get_local_id(x));
  }
};
// hipRTC builds do not include hip_runtime_api.h in this header (see the
// guarded includes), so a device-only dim3 is declared here instead.
#if defined(__HIPCC_RTC__)
typedef struct dim3 {
uint32_t x; ///< x
uint32_t y; ///< y
uint32_t z; ///< z
#ifdef __cplusplus
// Every component defaults to 1.
constexpr __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
#endif
} dim3;
#endif // defined(__HIPCC_RTC__)
// Three-component coordinate object whose .x/.y/.z members evaluate the
// functor F with index 0/1/2 each time they are read.
template <typename F>
struct __HIP_Coordinates {
// Value type produced by the functor.
using R = decltype(F{}(0));
struct __X {
__device__ operator R() const noexcept { return F{}(0); }
// Returns the current value plus rhs; nothing is stored.
__device__ R operator+=(const R& rhs) { return F{}(0) + rhs; }
};
struct __Y {
__device__ operator R() const noexcept { return F{}(1); }
// Returns the current value plus rhs; nothing is stored.
__device__ R operator+=(const R& rhs) { return F{}(1) + rhs; }
};
struct __Z {
__device__ operator R() const noexcept { return F{}(2); }
// Returns the current value plus rhs; nothing is stored.
__device__ R operator+=(const R& rhs) { return F{}(2) + rhs; }
};
static constexpr __X x{};
static constexpr __Y y{};
static constexpr __Z z{};
#ifdef __cplusplus
// Snapshot of all three components as a dim3.
__device__ operator dim3() const { return dim3(x, y, z); }
#endif
};
// Out-of-class definitions of the static constexpr members x, y and z of
// __HIP_Coordinates. They carry the weak attribute except when _MSC_VER is
// defined.
template <typename F>
#if !defined(_MSC_VER)
__attribute__((weak))
#endif
constexpr typename __HIP_Coordinates<F>::__X __HIP_Coordinates<F>::x;
template <typename F>
#if !defined(_MSC_VER)
__attribute__((weak))
#endif
constexpr typename __HIP_Coordinates<F>::__Y __HIP_Coordinates<F>::y;
template <typename F>
#if !defined(_MSC_VER)
__attribute__((weak))
#endif
constexpr typename __HIP_Coordinates<F>::__Z __HIP_Coordinates<F>::z;
extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_size(uint);

// Products of the grid-dimension and block-dimension components of the same
// axis, in either operand order, are answered directly by
// __ockl_get_global_size(axis) instead of multiplying the two operands.

// x axis
inline __device__ std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__X,
                                          __HIP_Coordinates<__HIP_BlockDim>::__X) noexcept {
  return static_cast<std::uint32_t>(__ockl_get_global_size(0));
}
inline __device__ std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__X,
                                          __HIP_Coordinates<__HIP_GridDim>::__X) noexcept {
  return static_cast<std::uint32_t>(__ockl_get_global_size(0));
}

// y axis
inline __device__ std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Y,
                                          __HIP_Coordinates<__HIP_BlockDim>::__Y) noexcept {
  return static_cast<std::uint32_t>(__ockl_get_global_size(1));
}
inline __device__ std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Y,
                                          __HIP_Coordinates<__HIP_GridDim>::__Y) noexcept {
  return static_cast<std::uint32_t>(__ockl_get_global_size(1));
}

// z axis
inline __device__ std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Z,
                                          __HIP_Coordinates<__HIP_BlockDim>::__Z) noexcept {
  return static_cast<std::uint32_t>(__ockl_get_global_size(2));
}
inline __device__ std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Z,
                                          __HIP_Coordinates<__HIP_GridDim>::__Z) noexcept {
  return static_cast<std::uint32_t>(__ockl_get_global_size(2));
}
// The CUDA-style coordinate variables, one __HIP_Coordinates instance per
// query functor.
static constexpr __HIP_Coordinates<__HIP_BlockDim> blockDim{};
static constexpr __HIP_Coordinates<__HIP_BlockIdx> blockIdx{};
static constexpr __HIP_Coordinates<__HIP_GridDim> gridDim{};
static constexpr __HIP_Coordinates<__HIP_ThreadIdx> threadIdx{};
// Macro spellings of the same queries; each expands to the device-library
// call for one dimension (0 = x, 1 = y, 2 = z).
extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(uint);
#define hipThreadIdx_x (__ockl_get_local_id(0))
#define hipThreadIdx_y (__ockl_get_local_id(1))
#define hipThreadIdx_z (__ockl_get_local_id(2))
extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(uint);
#define hipBlockIdx_x (__ockl_get_group_id(0))
#define hipBlockIdx_y (__ockl_get_group_id(1))
#define hipBlockIdx_z (__ockl_get_group_id(2))
extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(uint);
#define hipBlockDim_x (__ockl_get_local_size(0))
#define hipBlockDim_y (__ockl_get_local_size(1))
#define hipBlockDim_z (__ockl_get_local_size(2))
extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(uint);
#define hipGridDim_x (__ockl_get_num_groups(0))
#define hipGridDim_y (__ockl_get_num_groups(1))
#define hipGridDim_z (__ockl_get_num_groups(2))
#include <hip/amd_detail/amd_math_functions.h>
#if __HIP_HCC_COMPAT_MODE__
// Define HCC work item functions in terms of HIP builtin variables.
// __DEFINE_HCC_FUNC(name, var) generates hc_get_<name>(i), which returns
// var.x for i == 0, var.y for i == 1 and var.z for any other i. The macro is
// saved and restored with push_macro/pop_macro so it does not leak out of
// this header.
#pragma push_macro("__DEFINE_HCC_FUNC")
#define __DEFINE_HCC_FUNC(hc_fun,hip_var) \
inline __device__ __attribute__((always_inline)) uint hc_get_##hc_fun(uint i) { \
if (i==0) \
return hip_var.x; \
else if(i==1) \
return hip_var.y; \
else \
return hip_var.z; \
}
__DEFINE_HCC_FUNC(workitem_id, threadIdx)
__DEFINE_HCC_FUNC(group_id, blockIdx)
__DEFINE_HCC_FUNC(group_size, blockDim)
__DEFINE_HCC_FUNC(num_groups, gridDim)
#pragma pop_macro("__DEFINE_HCC_FUNC")
extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(uint);

/**
 * Returns __ockl_get_global_id(dim), narrowed to uint.
 *
 * @param dim Dimension index passed on to the device-library query.
 */
inline __device__ __attribute__((always_inline)) uint hc_get_workitem_absolute_id(int dim) {
  return static_cast<uint>(__ockl_get_global_id(dim));
}
#endif
// Pull in clang's CUDA wrapper headers, unless clang's own HIP runtime
// wrapper has already been included.
#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
#if !defined(__HIPCC_RTC__)
// Support std::complex.
#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
// __CUDA__ is defined only while the wrapper headers are included; the
// previous definition (if any) is restored by pop_macro.
#pragma push_macro("__CUDA__")
#define __CUDA__
#include <__clang_cuda_math_forward_declares.h>
#include <__clang_cuda_complex_builtins.h>
// Workaround for using libc++ with HIP-Clang.
// The following headers requires clang include path before standard C++ include path.
// However libc++ include path requires to be before clang include path.
// To workaround this, we pass -isystem with the parent directory of clang include
// path instead of the clang include path itself.
#include <include/cuda_wrappers/algorithm>
#include <include/cuda_wrappers/complex>
#include <include/cuda_wrappers/new>
#undef __CUDA__
#pragma pop_macro("__CUDA__")
#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
#endif // !defined(__HIPCC_RTC__)
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
#endif // __HIP_CLANG_ONLY__
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_H
@@ -0,0 +1,194 @@
/*
Copyright (c) 2022 - Present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#ifndef HIP_INCLUDE_HIP_HIP_RUNTIME_PT_API_H
#define HIP_INCLUDE_HIP_HIP_RUNTIME_PT_API_H
#if (defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && !(defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__))
/// hipStreamPerThread implementation
// When HIP_API_PER_THREAD_DEFAULT_STREAM is defined, __HIP_API_SPT(api)
// appends the _spt suffix to the API name; otherwise it leaves the name
// unchanged.
#if defined(HIP_API_PER_THREAD_DEFAULT_STREAM)
#define __HIP_STREAM_PER_THREAD
#define __HIP_API_SPT(api) api ## _spt
#else
#define __HIP_API_SPT(api) api
#endif
// With per-thread streams enabled, every API listed below is redirected by
// macro to its *_spt counterpart declared later in this header.
#if defined(__HIP_STREAM_PER_THREAD)
// Memory APIs
#define hipMemcpy __HIP_API_SPT(hipMemcpy)
#define hipMemcpyToSymbol __HIP_API_SPT(hipMemcpyToSymbol)
#define hipMemcpyFromSymbol __HIP_API_SPT(hipMemcpyFromSymbol)
#define hipMemcpy2D __HIP_API_SPT(hipMemcpy2D)
#define hipMemcpy2DFromArray __HIP_API_SPT(hipMemcpy2DFromArray)
#define hipMemcpy3D __HIP_API_SPT(hipMemcpy3D)
#define hipMemset __HIP_API_SPT(hipMemset)
#define hipMemset2D __HIP_API_SPT(hipMemset2D)
#define hipMemset3D __HIP_API_SPT(hipMemset3D)
#define hipMemcpyAsync __HIP_API_SPT(hipMemcpyAsync)
#define hipMemset3DAsync __HIP_API_SPT(hipMemset3DAsync)
#define hipMemset2DAsync __HIP_API_SPT(hipMemset2DAsync)
#define hipMemsetAsync __HIP_API_SPT(hipMemsetAsync)
#define hipMemcpy3DAsync __HIP_API_SPT(hipMemcpy3DAsync)
#define hipMemcpy2DAsync __HIP_API_SPT(hipMemcpy2DAsync)
#define hipMemcpyFromSymbolAsync __HIP_API_SPT(hipMemcpyFromSymbolAsync)
#define hipMemcpyToSymbolAsync __HIP_API_SPT(hipMemcpyToSymbolAsync)
#define hipMemcpyFromArray __HIP_API_SPT(hipMemcpyFromArray)
#define hipMemcpy2DToArray __HIP_API_SPT(hipMemcpy2DToArray)
#define hipMemcpy2DFromArrayAsync __HIP_API_SPT(hipMemcpy2DFromArrayAsync)
#define hipMemcpy2DToArrayAsync __HIP_API_SPT(hipMemcpy2DToArrayAsync)
// Stream APIs
#define hipStreamSynchronize __HIP_API_SPT(hipStreamSynchronize)
#define hipStreamQuery __HIP_API_SPT(hipStreamQuery)
#define hipStreamGetFlags __HIP_API_SPT(hipStreamGetFlags)
#define hipStreamGetPriority __HIP_API_SPT(hipStreamGetPriority)
#define hipStreamWaitEvent __HIP_API_SPT(hipStreamWaitEvent)
#define hipStreamAddCallback __HIP_API_SPT(hipStreamAddCallback)
#define hipLaunchHostFunc __HIP_API_SPT(hipLaunchHostFunc)
// Event APIs
#define hipEventRecord __HIP_API_SPT(hipEventRecord)
// Launch APIs
#define hipLaunchKernel __HIP_API_SPT(hipLaunchKernel)
#define hipLaunchCooperativeKernel __HIP_API_SPT(hipLaunchCooperativeKernel)
// Graph APIs
#define hipGraphLaunch __HIP_API_SPT(hipGraphLaunch)
#define hipStreamBeginCapture __HIP_API_SPT(hipStreamBeginCapture)
#define hipStreamEndCapture __HIP_API_SPT(hipStreamEndCapture)
#define hipStreamIsCapturing __HIP_API_SPT(hipStreamIsCapturing)
#define hipStreamGetCaptureInfo __HIP_API_SPT(hipStreamGetCaptureInfo)
#define hipStreamGetCaptureInfo_v2 __HIP_API_SPT(hipStreamGetCaptureInfo_v2)
#endif
// Declarations of the *_spt entry points that the macros above redirect to.
// They are declared unconditionally, with C linkage.
#ifdef __cplusplus
extern "C" {
#endif
// Memory APIs
hipError_t hipMemcpy_spt(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
hipError_t hipMemcpyToSymbol_spt(const void* symbol, const void* src, size_t sizeBytes,
size_t offset, hipMemcpyKind kind);
hipError_t hipMemcpyFromSymbol_spt(void* dst, const void* symbol,size_t sizeBytes,
size_t offset, hipMemcpyKind kind);
hipError_t hipMemcpy2D_spt(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
size_t height, hipMemcpyKind kind);
hipError_t hipMemcpy2DFromArray_spt( void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset,
size_t hOffset, size_t width, size_t height, hipMemcpyKind kind);
hipError_t hipMemcpy3D_spt(const struct hipMemcpy3DParms* p);
hipError_t hipMemset_spt(void* dst, int value, size_t sizeBytes);
hipError_t hipMemsetAsync_spt(void* dst, int value, size_t sizeBytes, hipStream_t stream);
hipError_t hipMemset2D_spt(void* dst, size_t pitch, int value, size_t width, size_t height);
hipError_t hipMemset2DAsync_spt(void* dst, size_t pitch, int value,
size_t width, size_t height, hipStream_t stream);
hipError_t hipMemset3DAsync_spt(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent, hipStream_t stream);
hipError_t hipMemset3D_spt(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent );
hipError_t hipMemcpyAsync_spt(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
hipStream_t stream);
// NOTE(review): unlike hipMemcpy3D_spt above, this declaration names
// hipMemcpy3DParms without the `struct` keyword - confirm the typedef is
// visible to C consumers of this header.
hipError_t hipMemcpy3DAsync_spt(const hipMemcpy3DParms* p, hipStream_t stream);
hipError_t hipMemcpy2DAsync_spt(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
size_t height, hipMemcpyKind kind, hipStream_t stream);
hipError_t hipMemcpyFromSymbolAsync_spt(void* dst, const void* symbol, size_t sizeBytes,
size_t offset, hipMemcpyKind kind, hipStream_t stream);
hipError_t hipMemcpyToSymbolAsync_spt(const void* symbol, const void* src, size_t sizeBytes,
size_t offset, hipMemcpyKind kind, hipStream_t stream);
hipError_t hipMemcpyFromArray_spt(void* dst, hipArray_const_t src, size_t wOffsetSrc, size_t hOffset,
size_t count, hipMemcpyKind kind);
hipError_t hipMemcpy2DToArray_spt(hipArray* dst, size_t wOffset, size_t hOffset, const void* src,
size_t spitch, size_t width, size_t height, hipMemcpyKind kind);
hipError_t hipMemcpy2DFromArrayAsync_spt(void* dst, size_t dpitch, hipArray_const_t src,
size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height,
hipMemcpyKind kind, hipStream_t stream);
hipError_t hipMemcpy2DToArrayAsync_spt(hipArray* dst, size_t wOffset, size_t hOffset, const void* src,
size_t spitch, size_t width, size_t height, hipMemcpyKind kind,
hipStream_t stream);
// Stream APIs
hipError_t hipStreamQuery_spt(hipStream_t stream);
hipError_t hipStreamSynchronize_spt(hipStream_t stream);
hipError_t hipStreamGetPriority_spt(hipStream_t stream, int* priority);
hipError_t hipStreamWaitEvent_spt(hipStream_t stream, hipEvent_t event, unsigned int flags);
hipError_t hipStreamGetFlags_spt(hipStream_t stream, unsigned int* flags);
hipError_t hipStreamAddCallback_spt(hipStream_t stream, hipStreamCallback_t callback, void* userData,
unsigned int flags);
// Event APIs
// The default stream argument is only offered to C++ callers; C has no
// default arguments.
#ifdef __cplusplus
hipError_t hipEventRecord_spt(hipEvent_t event, hipStream_t stream = NULL);
#else
hipError_t hipEventRecord_spt(hipEvent_t event, hipStream_t stream);
#endif
// Launch APIs
hipError_t hipLaunchCooperativeKernel_spt(const void* f,
dim3 gridDim, dim3 blockDim,
void **kernelParams, uint32_t sharedMemBytes, hipStream_t hStream);
hipError_t hipLaunchKernel_spt(const void* function_address,
dim3 numBlocks,
dim3 dimBlocks,
void** args,
size_t sharedMemBytes, hipStream_t stream);
// Graph APIs
hipError_t hipGraphLaunch_spt(hipGraphExec_t graphExec, hipStream_t stream);
hipError_t hipStreamBeginCapture_spt(hipStream_t stream, hipStreamCaptureMode mode);
hipError_t hipStreamEndCapture_spt(hipStream_t stream, hipGraph_t* pGraph);
hipError_t hipStreamIsCapturing_spt(hipStream_t stream, hipStreamCaptureStatus* pCaptureStatus);
hipError_t hipStreamGetCaptureInfo_spt(hipStream_t stream, hipStreamCaptureStatus* pCaptureStatus,
unsigned long long* pId);
hipError_t hipStreamGetCaptureInfo_v2_spt(hipStream_t stream, hipStreamCaptureStatus* captureStatus_out,
unsigned long long* id_out, hipGraph_t* graph_out,
const hipGraphNode_t** dependencies_out,
size_t* numDependencies_out);
// Host callback enqueue (grouped with the stream APIs in the macro list).
hipError_t hipLaunchHostFunc_spt(hipStream_t stream, hipHostFn_t fn, void* userData);
#ifdef __cplusplus
}
#endif // extern "C"
#endif //(defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)) && !(defined(__HIP_PLATFORM_NVCC__) || defined(__HIP_PLATFORM_NVIDIA__))
#endif //HIP_INCLUDE_HIP_HIP_RUNTIME_PT_API_H
@@ -0,0 +1,570 @@
/*
Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#ifdef __cplusplus
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
/**
 * @brief Fast ("unsafe") single-precision atomic add.
 *
 * Atomically adds \p value to the float stored at \p addr using relaxed
 * memory ordering and returns the value that was stored there beforehand.
 *
 * @note Dedicated code paths exist only for gfx940 (flat hardware FP add) and
 * gfx90a (LDS / global hardware FP add, plus a plain load/store sequence for
 * private addresses). Every other target falls back to the generic relaxed
 * atomic fetch-add, at agent scope when __hip_atomic_fetch_add is available.
 *
 * The hardware floating-point atomics may handle rounding and subnormal
 * values differently than non-atomic floating-point arithmetic.
 *
 * The operation is not always safe; behavior is undefined unless all of the
 * following conditions are met:
 *
 * - \p addr is at least 4-byte aligned.
 * - If \p addr is a global segment address, it lies in a coarse-grained
 *   allocation. Global addresses in fine-grained allocations are not supported.
 *
 * @param [in,out] addr Pointer to the value to be incremented by \p value.
 * @param [in] value Amount added to the value at \p addr.
 * @return Value stored at \p addr before the addition.
 */
__device__ inline float unsafeAtomicAdd(float* addr, float value) {
#if defined(__gfx940__) && __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f32)
  // gfx940: a single flat FP32 atomic add serves every address kind.
  return __builtin_amdgcn_flat_atomic_fadd_f32(addr, value);
#elif defined(__gfx90a__) && __has_builtin(__builtin_amdgcn_is_shared) && \
    __has_builtin(__builtin_amdgcn_is_private) && \
    __has_builtin(__builtin_amdgcn_ds_atomic_fadd_f32) && \
    __has_builtin(__builtin_amdgcn_global_atomic_fadd_f32)
  // gfx90a: choose the instruction that matches the segment addr points into.
  if (__builtin_amdgcn_is_shared((const __attribute__((address_space(0))) void*)addr)) {
    return __builtin_amdgcn_ds_atomic_fadd_f32(addr, value);
  }
  if (__builtin_amdgcn_is_private((const __attribute__((address_space(0))) void*)addr)) {
    // Private address: an ordinary read-modify-write is used instead of an atomic.
    const float previous = *addr;
    *addr = previous + value;
    return previous;
  }
  return __builtin_amdgcn_global_atomic_fadd_f32(addr, value);
#elif __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}
/**
 * @brief Unsafe single-precision atomic max.
 *
 * Relaxed read-modify-write: if \p val is greater than the float stored at
 * \p addr, the stored value is replaced by \p val. Implemented as a
 * compare-and-swap loop (agent scope when the __hip_atomic_* builtins exist).
 *
 * @note This operation is currently identical to that performed by
 * atomicMax and is included for completeness.
 *
 * @param [in,out] addr Pointer to the value to be updated.
 * @param [in] val Candidate maximum.
 * @return The last value observed at \p addr: the value that was replaced, or
 *         the value that made the update unnecessary.
 */
__device__ inline float unsafeAtomicMax(float* addr, float val) {
#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
  float observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  // A failed exchange writes the current contents back into `observed`, so the
  // loop condition is re-checked against the freshest value.
  while (observed < val) {
    if (__hip_atomic_compare_exchange_strong(addr, &observed, val, __ATOMIC_RELAXED,
                                             __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)) {
      break;
    }
  }
  return observed;
#else
  // Generic fallback: run the CAS on the 32-bit pattern of the float.
  unsigned int* bits_addr = (unsigned int*)addr;
  unsigned int observed_bits = __atomic_load_n(bits_addr, __ATOMIC_RELAXED);
  while (__uint_as_float(observed_bits) < val) {
    if (__atomic_compare_exchange_n(bits_addr, &observed_bits, __float_as_uint(val), false,
                                    __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
      break;
    }
  }
  return __uint_as_float(observed_bits);
#endif
}
/**
 * @brief Unsafe single-precision atomic min.
 *
 * Relaxed read-modify-write: if \p val is less than the float stored at
 * \p addr, the stored value is replaced by \p val. Implemented as a
 * compare-and-swap loop (agent scope when the __hip_atomic_* builtins exist).
 *
 * @note This operation is currently identical to that performed by
 * atomicMin and is included for completeness.
 *
 * @param [in,out] addr Pointer to the value to be updated.
 * @param [in] val Candidate minimum.
 * @return The last value observed at \p addr: the value that was replaced, or
 *         the value that made the update unnecessary.
 */
__device__ inline float unsafeAtomicMin(float* addr, float val) {
#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
  float observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  // A failed exchange writes the current contents back into `observed`, so the
  // loop condition is re-checked against the freshest value.
  while (observed > val) {
    if (__hip_atomic_compare_exchange_strong(addr, &observed, val, __ATOMIC_RELAXED,
                                             __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)) {
      break;
    }
  }
  return observed;
#else
  // Generic fallback: run the CAS on the 32-bit pattern of the float.
  unsigned int* bits_addr = (unsigned int*)addr;
  unsigned int observed_bits = __atomic_load_n(bits_addr, __ATOMIC_RELAXED);
  while (__uint_as_float(observed_bits) > val) {
    if (__atomic_compare_exchange_n(bits_addr, &observed_bits, __float_as_uint(val), false,
                                    __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
      break;
    }
  }
  return __uint_as_float(observed_bits);
#endif
}
/**
 * @brief Unsafe double precision rmw atomic add.
 *
 * Performs a relaxed read-modify-write double precision atomic add with
 * device memory scope. The original value at \p addr is returned and
 * the value at \p addr is updated to the original value plus \p value.
 *
 * @note A dedicated hardware path is used only on the gfx90a and gfx940
 * targets (flat FP64 atomic add). Other devices continue to use safe atomics.
 *
 * It can be used to generate code that uses fast hardware floating point atomic
 * operations which may handle rounding and subnormal values differently than
 * non-atomic floating point operations.
 *
 * The operation is not always safe and can have undefined behavior unless
 * the following conditions are met:
 *
 * - \p addr is at least 8 byte aligned
 * - If \p addr is a global segment address, it is in a coarse grain allocation.
 * Passing in global segment addresses in fine grain allocations will result in
 * undefined behavior and is not supported.
 *
 * @param [in,out] addr Pointer to value to be updated.
 * @param [in] value Value by which the contents of \p addr are incremented.
 * @return Original value contained in \p addr.
 */
__device__ inline double unsafeAtomicAdd(double* addr, double value) {
#if (defined(__gfx90a__) || defined(__gfx940__)) && \
    __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64)
  return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value);
#elif __has_builtin(__hip_atomic_fetch_add)
  // Compiler builtins are not preprocessor macros, so availability must be
  // probed with __has_builtin; a defined() test is always false and would
  // silently skip this agent-scope path.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}
/**
 * @brief Fast ("unsafe") double-precision atomic max.
 *
 * Relaxed read-modify-write: the double stored at \p addr is replaced by
 * \p val when \p val is greater. The previously stored value is returned.
 *
 * @note A dedicated hardware path (flat FP64 atomic max) is used only on the
 * gfx90a and gfx940 targets. Other devices use a compare-and-swap loop.
 *
 * The hardware floating-point atomics may handle rounding and subnormal
 * values differently than non-atomic floating-point operations.
 *
 * The operation is not always safe; behavior is undefined unless all of the
 * following conditions are met:
 *
 * - \p addr is at least 8-byte aligned.
 * - If \p addr is a global segment address, it lies in a coarse-grained
 *   allocation. Global addresses in fine-grained allocations are not supported.
 *
 * @param [in,out] addr Pointer to the value to be updated.
 * @param [in] val Candidate maximum.
 * @return Original value contained at \p addr.
 */
__device__ inline double unsafeAtomicMax(double* addr, double val) {
#if (defined(__gfx90a__) || defined(__gfx940__)) && \
    __has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64)
  return __builtin_amdgcn_flat_atomic_fmax_f64(addr, val);
#elif __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
  double observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  // A failed exchange refreshes `observed`, so the comparison is repeated
  // against the latest contents of *addr.
  while (observed < val) {
    if (__hip_atomic_compare_exchange_strong(addr, &observed, val, __ATOMIC_RELAXED,
                                             __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)) {
      break;
    }
  }
  return observed;
#else
  // Generic fallback: run the CAS on the 64-bit pattern of the double.
  unsigned long long* bits_addr = (unsigned long long*)addr;
  unsigned long long observed_bits = __atomic_load_n(bits_addr, __ATOMIC_RELAXED);
  while (__longlong_as_double(observed_bits) < val) {
    if (__atomic_compare_exchange_n(bits_addr, &observed_bits, __double_as_longlong(val), false,
                                    __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
      break;
    }
  }
  return __longlong_as_double(observed_bits);
#endif
}
/**
 * @brief Fast ("unsafe") double-precision atomic min.
 *
 * Relaxed read-modify-write: the double stored at \p addr is replaced by
 * \p val when \p val is less. The previously stored value is returned.
 *
 * @note A dedicated hardware path (flat FP64 atomic min) is used only on the
 * gfx90a and gfx940 targets. Other devices use a compare-and-swap loop.
 *
 * The hardware floating-point atomics may handle rounding and subnormal
 * values differently than non-atomic floating-point operations.
 *
 * The operation is not always safe; behavior is undefined unless all of the
 * following conditions are met:
 *
 * - \p addr is at least 8-byte aligned.
 * - If \p addr is a global segment address, it lies in a coarse-grained
 *   allocation. Global addresses in fine-grained allocations are not supported.
 *
 * @param [in,out] addr Pointer to the value to be updated.
 * @param [in] val Candidate minimum.
 * @return Original value contained at \p addr.
 */
__device__ inline double unsafeAtomicMin(double* addr, double val) {
#if (defined(__gfx90a__) || defined(__gfx940__)) && \
    __has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64)
  return __builtin_amdgcn_flat_atomic_fmin_f64(addr, val);
#elif __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
  double observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  // A failed exchange refreshes `observed`, so the comparison is repeated
  // against the latest contents of *addr.
  while (observed > val) {
    if (__hip_atomic_compare_exchange_strong(addr, &observed, val, __ATOMIC_RELAXED,
                                             __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)) {
      break;
    }
  }
  return observed;
#else
  // Generic fallback: run the CAS on the 64-bit pattern of the double.
  unsigned long long* bits_addr = (unsigned long long*)addr;
  unsigned long long observed_bits = __atomic_load_n(bits_addr, __ATOMIC_RELAXED);
  while (__longlong_as_double(observed_bits) > val) {
    if (__atomic_compare_exchange_n(bits_addr, &observed_bits, __double_as_longlong(val), false,
                                    __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
      break;
    }
  }
  return __longlong_as_double(observed_bits);
#endif
}
/**
 * @brief Safe floating point rmw atomic add.
 *
 * Performs a relaxed read-modify-write floating point atomic add with
 * device memory scope. The original value at \p addr is returned and
 * the value at \p addr is updated to the original value plus \p value.
 *
 * @note This operation ensures that, on all targets, we produce safe atomics.
 * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
 *
 * @param [in,out] addr Pointer to value to be incremented by \p value.
 * @param [in] value Value by which the contents of \p addr are incremented.
 * @return Original value contained in \p addr.
 */
__device__ inline float safeAtomicAdd(float* addr, float value) {
#if defined(__gfx908__) || \
    (defined(__gfx90a__) && !__has_builtin(__hip_atomic_fetch_add))
  // On gfx908, we can generate unsafe FP32 atomic add that does not follow all
  // IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead.
  // On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to
  // force a CAS loop here.
  float old_val;
#if __has_builtin(__hip_atomic_load)
  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_load)
  old_val = __uint_as_float(__atomic_load_n(reinterpret_cast<unsigned int*>(addr), __ATOMIC_RELAXED));
#endif // __has_builtin(__hip_atomic_load)
  float expected, temp;
  do {
    temp = expected = old_val;
#if __has_builtin(__hip_atomic_compare_exchange_strong)
    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
    // The generic __atomic_*_n builtins only accept integer or pointer
    // operands, so the exchange is performed on the 32-bit pattern of the
    // float (same approach as the min/max fallbacks in this file).
    unsigned int expected_bits = __float_as_uint(expected);
    __atomic_compare_exchange_n(reinterpret_cast<unsigned int*>(addr), &expected_bits,
                                __float_as_uint(old_val + value), false,
                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
    expected = __uint_as_float(expected_bits);
#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
    // On success `expected` is untouched (loop ends); on failure it now holds
    // the current contents of *addr and the addition is retried.
    old_val = expected;
  } while (__float_as_uint(temp) != __float_as_uint(old_val));
  return old_val;
#elif defined(__gfx90a__)
  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
  // atomics will produce safe CAS loops, but are otherwise not different than
  // agent-scope atomics. This logic is only applicable for gfx90a, and should
  // not be assumed on other architectures.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#elif __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}
/**
 * @brief Safe single-precision atomic max.
 *
 * Relaxed read-modify-write: if \p val is greater than the float stored at
 * \p addr, the stored value is replaced by \p val. Always implemented as a
 * compare-and-swap loop (agent scope when the __hip_atomic_* builtins exist).
 *
 * @note This operation ensures that, on all targets, we produce safe atomics.
 * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
 *
 * @param [in,out] addr Pointer to the value to be updated.
 * @param [in] val Candidate maximum.
 * @return The last value observed at \p addr: the value that was replaced, or
 *         the value that made the update unnecessary.
 */
__device__ inline float safeAtomicMax(float* addr, float val) {
#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
  float observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  // A failed exchange writes the current contents back into `observed`, so the
  // loop condition is re-checked against the freshest value.
  while (observed < val) {
    if (__hip_atomic_compare_exchange_strong(addr, &observed, val, __ATOMIC_RELAXED,
                                             __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)) {
      break;
    }
  }
  return observed;
#else
  // Generic fallback: run the CAS on the 32-bit pattern of the float.
  unsigned int* bits_addr = (unsigned int*)addr;
  unsigned int observed_bits = __atomic_load_n(bits_addr, __ATOMIC_RELAXED);
  while (__uint_as_float(observed_bits) < val) {
    if (__atomic_compare_exchange_n(bits_addr, &observed_bits, __float_as_uint(val), false,
                                    __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
      break;
    }
  }
  return __uint_as_float(observed_bits);
#endif
}
/**
 * @brief Safe single-precision atomic min.
 *
 * Relaxed read-modify-write: if \p val is less than the float stored at
 * \p addr, the stored value is replaced by \p val. Always implemented as a
 * compare-and-swap loop (agent scope when the __hip_atomic_* builtins exist).
 *
 * @note This operation ensures that, on all targets, we produce safe atomics.
 * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
 *
 * @param [in,out] addr Pointer to the value to be updated.
 * @param [in] val Candidate minimum.
 * @return The last value observed at \p addr: the value that was replaced, or
 *         the value that made the update unnecessary.
 */
__device__ inline float safeAtomicMin(float* addr, float val) {
#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
  float observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  // A failed exchange writes the current contents back into `observed`, so the
  // loop condition is re-checked against the freshest value.
  while (observed > val) {
    if (__hip_atomic_compare_exchange_strong(addr, &observed, val, __ATOMIC_RELAXED,
                                             __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)) {
      break;
    }
  }
  return observed;
#else
  // Generic fallback: run the CAS on the 32-bit pattern of the float.
  unsigned int* bits_addr = (unsigned int*)addr;
  unsigned int observed_bits = __atomic_load_n(bits_addr, __ATOMIC_RELAXED);
  while (__uint_as_float(observed_bits) > val) {
    if (__atomic_compare_exchange_n(bits_addr, &observed_bits, __float_as_uint(val), false,
                                    __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
      break;
    }
  }
  return __uint_as_float(observed_bits);
#endif
}
/**
 * @brief Safe double precision rmw atomic add.
 *
 * Performs a relaxed read-modify-write double precision atomic add with
 * device memory scope. The original value at \p addr is returned and
 * the value at \p addr is updated to the original value plus \p value.
 *
 * @note This operation ensures that, on all targets, we produce safe atomics.
 * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
 *
 * @param [in,out] addr Pointer to value to be incremented by \p value.
 * @param [in] value Value by which the contents of \p addr are incremented.
 * @return Original value contained in \p addr.
 */
__device__ inline double safeAtomicAdd(double* addr, double value) {
#if (defined(__gfx90a__) || defined(__gfx940__)) && \
    __has_builtin(__hip_atomic_fetch_add)
  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
  // atomics will produce safe CAS loops, but are otherwise not different than
  // agent-scope atomics. The same path is selected for gfx940 by the condition
  // above; this logic should not be assumed on other architectures.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#elif defined(__gfx90a__)
  // On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to
  // force a CAS loop here.
  double old_val;
#if __has_builtin(__hip_atomic_load)
  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_load)
  old_val = __longlong_as_double(__atomic_load_n(reinterpret_cast<unsigned long long*>(addr), __ATOMIC_RELAXED));
#endif // __has_builtin(__hip_atomic_load)
  double expected, temp;
  do {
    temp = expected = old_val;
#if __has_builtin(__hip_atomic_compare_exchange_strong)
    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
    // The generic __atomic_*_n builtins only accept integer or pointer
    // operands, so the exchange is performed on the 64-bit pattern of the
    // double (same approach as the min/max fallbacks in this file).
    long long expected_bits = __double_as_longlong(expected);
    __atomic_compare_exchange_n(reinterpret_cast<long long*>(addr), &expected_bits,
                                __double_as_longlong(old_val + value), false,
                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
    expected = __longlong_as_double(expected_bits);
#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
    // On success `expected` is untouched (loop ends); on failure it now holds
    // the current contents of *addr and the addition is retried.
    old_val = expected;
  } while (__double_as_longlong(temp) != __double_as_longlong(old_val));
  return old_val;
#else // !defined(__gfx90a__)
#if __has_builtin(__hip_atomic_fetch_add)
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_fetch_add)
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_fetch_add)
#endif
}
/**
 * @brief Safe double-precision atomic max.
 *
 * Relaxed read-modify-write: the double stored at \p addr is replaced by
 * \p val when \p val is greater. Private addresses (when detectable) are
 * handled with a plain load/fmax/store; everything else uses a
 * compare-and-swap loop.
 *
 * @note This operation ensures that, on all targets, we produce safe atomics.
 * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
 *
 * @param [in,out] addr Pointer to the value to be updated.
 * @param [in] val Candidate maximum.
 * @return Original value contained at \p addr.
 */
__device__ inline double safeAtomicMax(double* addr, double val) {
#if __has_builtin(__builtin_amdgcn_is_private)
  // Private address: an ordinary read-modify-write is used instead of an atomic.
  if (__builtin_amdgcn_is_private((const __attribute__((address_space(0))) void*)addr)) {
    const double previous = *addr;
    *addr = __builtin_fmax(previous, val);
    return previous;
  }
#endif
#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
  double observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  // A failed exchange refreshes `observed`, so the comparison is repeated
  // against the latest contents of *addr.
  while (observed < val) {
    if (__hip_atomic_compare_exchange_strong(addr, &observed, val, __ATOMIC_RELAXED,
                                             __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)) {
      break;
    }
  }
  return observed;
#else
  // Generic fallback: run the CAS on the 64-bit pattern of the double.
  unsigned long long* bits_addr = (unsigned long long*)addr;
  unsigned long long observed_bits = __atomic_load_n(bits_addr, __ATOMIC_RELAXED);
  while (__longlong_as_double(observed_bits) < val) {
    if (__atomic_compare_exchange_n(bits_addr, &observed_bits, __double_as_longlong(val), false,
                                    __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
      break;
    }
  }
  return __longlong_as_double(observed_bits);
#endif
}
/**
 * @brief Safe double-precision atomic min.
 *
 * Relaxed read-modify-write: the double stored at \p addr is replaced by
 * \p val when \p val is less. Private addresses (when detectable) are
 * handled with a plain load/fmin/store; everything else uses a
 * compare-and-swap loop.
 *
 * @note This operation ensures that, on all targets, we produce safe atomics.
 * This will be the case even when -munsafe-fp-atomics is passed into the compiler.
 *
 * @param [in,out] addr Pointer to the value to be updated.
 * @param [in] val Candidate minimum.
 * @return Original value contained at \p addr.
 */
__device__ inline double safeAtomicMin(double* addr, double val) {
#if __has_builtin(__builtin_amdgcn_is_private)
  // Private address: an ordinary read-modify-write is used instead of an atomic.
  if (__builtin_amdgcn_is_private((const __attribute__((address_space(0))) void*)addr)) {
    const double previous = *addr;
    *addr = __builtin_fmin(previous, val);
    return previous;
  }
#endif
#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
  double observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  // A failed exchange refreshes `observed`, so the comparison is repeated
  // against the latest contents of *addr.
  while (observed > val) {
    if (__hip_atomic_compare_exchange_strong(addr, &observed, val, __ATOMIC_RELAXED,
                                             __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)) {
      break;
    }
  }
  return observed;
#else
  // Generic fallback: run the CAS on the 64-bit pattern of the double.
  unsigned long long* bits_addr = (unsigned long long*)addr;
  unsigned long long observed_bits = __atomic_load_n(bits_addr, __ATOMIC_RELAXED);
  while (__longlong_as_double(observed_bits) > val) {
    if (__atomic_compare_exchange_n(bits_addr, &observed_bits, __double_as_longlong(val), false,
                                    __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
      break;
    }
  }
  return __longlong_as_double(observed_bits);
#endif
}
#pragma clang diagnostic pop
#endif
文件差异内容过多而无法显示 加载差异
文件差异内容过多而无法显示 加载差异
@@ -0,0 +1,362 @@
/*
Copyright (c) 2018 - 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_SURFACE_FUNCTIONS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_SURFACE_FUNCTIONS_H
#if defined(__cplusplus)
#include <hip/surface_types.h>
#include <hip/hip_vector_types.h>
#include <hip/amd_detail/ockl_image.h>
// Declares a local variable `i` that reinterprets the in-scope `surfObj` handle as a
// pointer to unsigned int qualified with ADDRESS_SPACE_CONSTANT. Intended to be expanded
// as the first statement of a function that has a `surfObj` parameter; the expansion
// already ends with a semicolon.
#define __HIP_SURFACE_OBJECT_PARAMETERS_INIT \
unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)surfObj;
// Trait: `value` is true when T is one of the scalar element types accepted by the
// surface read/write functions: char, unsigned char, short, unsigned short, int,
// unsigned int or float. Any other type (including signed char) yields false.
template <typename T>
struct __hip_is_isurf_channel_type {
 private:
  // The accepted types grouped by width; each flag is a compile-time constant.
  static constexpr bool is_byte_channel_ =
      std::is_same<T, char>::value || std::is_same<T, unsigned char>::value;
  static constexpr bool is_short_channel_ =
      std::is_same<T, short>::value || std::is_same<T, unsigned short>::value;
  static constexpr bool is_word_channel_ = std::is_same<T, int>::value ||
      std::is_same<T, unsigned int>::value || std::is_same<T, float>::value;

 public:
  static constexpr bool value = is_byte_channel_ || is_short_channel_ || is_word_channel_;
};
// Specialization for HIP vector types: accepted when the element type T is itself an
// accepted channel type and the vector has between 1 and 4 components.
template <typename T, unsigned int rank>
struct __hip_is_isurf_channel_type<HIP_vector_type<T, rank>> {
  static constexpr bool value =
      __hip_is_isurf_channel_type<T>::value && (rank >= 1) && (rank <= 4);
};
// CUDA surface functions take a byte address for x, while HIP needs a pixel address.
// Scales the incoming x down twice: once by a factor looked up from the image channel
// data type (`format`) and once by a factor looked up from the channel order (`order`).
// In both tables an entry of 3 means "divide by 3"; any other entry is used as a
// right-shift amount. Neither index is range-checked.
static __HOST_DEVICE__ __forceinline__ int __hipGetPixelAddr(int x, int format, int order) {
  /*
   * Table index == hsa_ext_image_channel_type_t value:
   *    0 SNORM_INT8          1 SNORM_INT16         2 UNORM_INT8
   *    3 UNORM_INT16         4 UNORM_INT24         5 UNORM_SHORT_555
   *    6 UNORM_SHORT_565     7 UNORM_SHORT_101010  8 SIGNED_INT8
   *    9 SIGNED_INT16       10 SIGNED_INT32       11 UNSIGNED_INT8
   *   12 UNSIGNED_INT16     13 UNSIGNED_INT32     14 HALF_FLOAT
   *   15 FLOAT
   */
  static const int kChannelTypeScale[] = { 0, 1, 0, 1, 3, 1, 1, 1, 0, 1, 2, 0, 1, 2, 1, 2 };
  const int typeScale = kChannelTypeScale[format];
  if (typeScale == 3) {
    x = x / typeScale;
  } else {
    x = x >> typeScale;
  }

  /*
   * Table index == hsa_ext_image_channel_order_t value:
   *    0 A       1 R       2 RX      3 RG      4 RGX     5 RA      6 RGB
   *    7 RGBX    8 RGBA    9 BGRA   10 ARGB   11 ABGR   12 SRGB   13 SRGBX
   *   14 SRGBA  15 SBGRA  16 INTENSITY  17 LUMINANCE  18 DEPTH  19 DEPTH_STENCIL
   */
  static const int kChannelOrderScale[] = { 0, 0, 1, 1, 3, 1, 3, 2, 2, 2,
                                            2, 2, 3, 2, 2, 2, 0, 0, 0, 0 };
  const int orderScale = kChannelOrderScale[order];
  if (orderScale == 3) {
    return x / orderScale;
  }
  return x >> orderScale;
}
// Scalar overload: converts t to float and stores it in lane x of a native float4.
// Lanes y, z and w are not written.
template <typename T, typename std::enable_if<std::is_scalar<T>::value>::type* = nullptr>
static __HOST_DEVICE__ __forceinline__ float4::Native_vec_ __hipMapToNativeFloat4(const T& t) {
  float4::Native_vec_ packed;
  packed.x = static_cast<float>(t);
  return packed;
}
// One-component vector overload (selected when sizeof(T) equals the size of one
// T::value_type): converts t.x to float into lane x. Lanes y, z and w are not written.
template <typename T,
          typename std::enable_if<!std::is_scalar<T>::value &&
                                  sizeof(T) / sizeof(typename T::value_type) == 1>::type* = nullptr>
static __HOST_DEVICE__ __forceinline__ float4::Native_vec_ __hipMapToNativeFloat4(const T& t) {
  float4::Native_vec_ packed;
  packed.x = static_cast<float>(t.x);
  return packed;
}
// Two-component vector overload: converts t.x and t.y to float into lanes x and y.
// Lanes z and w are not written.
template <typename T,
          typename std::enable_if<!std::is_scalar<T>::value &&
                                  sizeof(T) / sizeof(typename T::value_type) == 2>::type* = nullptr>
static __HOST_DEVICE__ __forceinline__ float4::Native_vec_ __hipMapToNativeFloat4(const T& t) {
  float4::Native_vec_ packed;
  packed.x = static_cast<float>(t.x);
  packed.y = static_cast<float>(t.y);
  return packed;
}
// Three-component vector overload: converts t.x, t.y and t.z to float into lanes
// x, y and z. Lane w is not written.
template <typename T,
          typename std::enable_if<!std::is_scalar<T>::value &&
                                  sizeof(T) / sizeof(typename T::value_type) == 3>::type* = nullptr>
static __HOST_DEVICE__ __forceinline__ float4::Native_vec_ __hipMapToNativeFloat4(const T& t) {
  float4::Native_vec_ packed;
  packed.x = static_cast<float>(t.x);
  packed.y = static_cast<float>(t.y);
  packed.z = static_cast<float>(t.z);
  return packed;
}
// Four-component vector overload: converts every component of t to float and fills
// all four lanes of the native float4.
template <typename T,
          typename std::enable_if<!std::is_scalar<T>::value &&
                                  sizeof(T) / sizeof(typename T::value_type) == 4>::type* = nullptr>
static __HOST_DEVICE__ __forceinline__ float4::Native_vec_ __hipMapToNativeFloat4(const T& t) {
  float4::Native_vec_ packed;
  packed.x = static_cast<float>(t.x);
  packed.y = static_cast<float>(t.y);
  packed.z = static_cast<float>(t.z);
  packed.w = static_cast<float>(t.w);
  return packed;
}
// Scalar overload: returns lane x of the native float4 converted to T.
template <typename T>
static __HOST_DEVICE__ __forceinline__
    typename std::enable_if<std::is_scalar<T>::value, const T>::type
    __hipMapFromNativeFloat4(const float4::Native_vec_& u) {
  return static_cast<T>(u.x);
}
// One-component vector overload: builds a T whose x is lane x of u converted to
// T::value_type.
template <typename T>
static __HOST_DEVICE__ __forceinline__
    typename std::enable_if<!std::is_scalar<T>::value &&
                                sizeof(T) / sizeof(typename T::value_type) == 1,
                            const T>::type
    __hipMapFromNativeFloat4(const float4::Native_vec_& u) {
  using Elem = typename T::value_type;
  T unpacked;
  unpacked.x = static_cast<Elem>(u.x);
  return unpacked;
}
// Two-component vector overload: builds a T from lanes x and y of u, each converted
// to T::value_type.
template <typename T>
static __HOST_DEVICE__ __forceinline__
    typename std::enable_if<!std::is_scalar<T>::value &&
                                sizeof(T) / sizeof(typename T::value_type) == 2,
                            const T>::type
    __hipMapFromNativeFloat4(const float4::Native_vec_& u) {
  using Elem = typename T::value_type;
  T unpacked;
  unpacked.x = static_cast<Elem>(u.x);
  unpacked.y = static_cast<Elem>(u.y);
  return unpacked;
}
// Three-component vector overload: builds a T from lanes x, y and z of u, each
// converted to T::value_type.
template <typename T>
static __HOST_DEVICE__ __forceinline__
    typename std::enable_if<!std::is_scalar<T>::value &&
                                sizeof(T) / sizeof(typename T::value_type) == 3,
                            const T>::type
    __hipMapFromNativeFloat4(const float4::Native_vec_& u) {
  using Elem = typename T::value_type;
  T unpacked;
  unpacked.x = static_cast<Elem>(u.x);
  unpacked.y = static_cast<Elem>(u.y);
  unpacked.z = static_cast<Elem>(u.z);
  return unpacked;
}
// Four-component vector overload: builds a T from all four lanes of u, each
// converted to T::value_type.
template <typename T>
static __HOST_DEVICE__ __forceinline__
    typename std::enable_if<!std::is_scalar<T>::value &&
                                sizeof(T) / sizeof(typename T::value_type) == 4,
                            const T>::type
    __hipMapFromNativeFloat4(const float4::Native_vec_& u) {
  using Elem = typename T::value_type;
  T unpacked;
  unpacked.x = static_cast<Elem>(u.x);
  unpacked.y = static_cast<Elem>(u.y);
  unpacked.z = static_cast<Elem>(u.z);
  unpacked.w = static_cast<Elem>(u.w);
  return unpacked;
}
// Reads one element from a 1D surface into *data.
// x is converted with __hipGetPixelAddr before the image load; boundaryMode is
// accepted for signature compatibility but is not consulted by this implementation.
template <typename T,
          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf1Dread(T* data, hipSurfaceObject_t surfObj, int x,
                                                  int boundaryMode = hipBoundaryModeZero) {
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  const int channelType = __ockl_image_channel_data_type_1D(i);
  const int channelOrder = __ockl_image_channel_order_1D(i);
  const int pixelX = __hipGetPixelAddr(x, channelType, channelOrder);
  *data = __hipMapFromNativeFloat4<T>(__ockl_image_load_1D(i, pixelX));
}
// Writes data to a 1D surface. x is converted with __hipGetPixelAddr and data is
// widened to a native float4 before the image store.
template <typename T,
          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf1Dwrite(T data, hipSurfaceObject_t surfObj, int x) {
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  const int channelType = __ockl_image_channel_data_type_1D(i);
  const int channelOrder = __ockl_image_channel_order_1D(i);
  const int pixelX = __hipGetPixelAddr(x, channelType, channelOrder);
  __ockl_image_store_1D(i, pixelX, __hipMapToNativeFloat4(data));
}
// Reads one element from a 2D surface into *data. Only x is converted with
// __hipGetPixelAddr; y is passed through unchanged.
template <typename T,
          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t surfObj, int x,
                                                  int y) {
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  const int channelType = __ockl_image_channel_data_type_2D(i);
  const int channelOrder = __ockl_image_channel_order_2D(i);
  const int pixelX = __hipGetPixelAddr(x, channelType, channelOrder);
  *data = __hipMapFromNativeFloat4<T>(__ockl_image_load_2D(i, int2(pixelX, y).data));
}
// Writes data to a 2D surface. Only x is converted with __hipGetPixelAddr; y is
// passed through unchanged. data is widened to a native float4 before the store.
template <typename T,
          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t surfObj, int x,
                                                   int y) {
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  const int channelType = __ockl_image_channel_data_type_2D(i);
  const int channelOrder = __ockl_image_channel_order_2D(i);
  const int pixelX = __hipGetPixelAddr(x, channelType, channelOrder);
  __ockl_image_store_2D(i, int2(pixelX, y).data, __hipMapToNativeFloat4(data));
}
// Reads one element from a 3D surface into *data. Only x is converted with
// __hipGetPixelAddr; y and z are passed through, and the fourth coordinate is 0.
template <typename T,
          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t surfObj, int x,
                                                  int y, int z) {
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  const int channelType = __ockl_image_channel_data_type_3D(i);
  const int channelOrder = __ockl_image_channel_order_3D(i);
  const int pixelX = __hipGetPixelAddr(x, channelType, channelOrder);
  *data = __hipMapFromNativeFloat4<T>(__ockl_image_load_3D(i, int4(pixelX, y, z, 0).data));
}
// Writes data to a 3D surface. Only x is converted with __hipGetPixelAddr; y and z
// are passed through, and the fourth coordinate is 0.
template <typename T,
          typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf3Dwrite(T data, hipSurfaceObject_t surfObj, int x,
                                                   int y, int z) {
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  const int channelType = __ockl_image_channel_data_type_3D(i);
  const int channelOrder = __ockl_image_channel_order_3D(i);
  const int pixelX = __hipGetPixelAddr(x, channelType, channelOrder);
  __ockl_image_store_3D(i, int4(pixelX, y, z, 0).data, __hipMapToNativeFloat4(data));
}
// Loads one element from a layered 1D surface and stores it, converted to T, through `data`.
// `layer` is forwarded as the last argument of __ockl_image_load_lod_1D; x is first
// adjusted via __hipGetPixelAddr using the image's channel data type and order.
template <
    typename T,
    typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int layer) {
  // NOTE(review): this macro is expected to introduce the image handle `i` from surfObj.
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  const int pixelX = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i),
                                       __ockl_image_channel_order_1D(i));
  auto texel = __ockl_image_load_lod_1D(i, pixelX, layer);
  *data = __hipMapFromNativeFloat4<T>(texel);
}
// Stores `data` into a layered 1D surface.
// `layer` is forwarded to __ockl_image_store_lod_1D right after the coordinate; x is first
// adjusted via __hipGetPixelAddr using the image's channel data type and order.
template <
    typename T,
    typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x, int layer) {
  // NOTE(review): this macro is expected to introduce the image handle `i` from surfObj.
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  const int pixelX = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i),
                                       __ockl_image_channel_order_1D(i));
  auto texel = __hipMapToNativeFloat4(data);
  __ockl_image_store_lod_1D(i, pixelX, layer, texel);
}
// Loads one element from a layered 2D surface and stores it, converted to T, through `data`.
// `layer` is forwarded as the last argument of __ockl_image_load_lod_2D; x is first
// adjusted via __hipGetPixelAddr using the image's channel data type and order.
template <
    typename T,
    typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int y, int layer) {
  // NOTE(review): this macro is expected to introduce the image handle `i` from surfObj.
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  const int pixelX = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i),
                                       __ockl_image_channel_order_2D(i));
  auto texel = __ockl_image_load_lod_2D(i, int2(pixelX, y).data, layer);
  *data = __hipMapFromNativeFloat4<T>(texel);
}
// Stores `data` into a layered 2D surface.
// `layer` is forwarded to __ockl_image_store_lod_2D right after the coordinate; x is first
// adjusted via __hipGetPixelAddr using the image's channel data type and order.
template <
    typename T,
    typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int layer) {
  // NOTE(review): this macro is expected to introduce the image handle `i` from surfObj.
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  const int pixelX = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i),
                                       __ockl_image_channel_order_2D(i));
  auto texel = __hipMapToNativeFloat4(data);
  __ockl_image_store_lod_2D(i, int2(pixelX, y).data, layer, texel);
}
// Loads one element from a cubemap surface and stores it, converted to T, through `data`.
// `face` is forwarded as the last argument of __ockl_image_load_CM; x is first adjusted
// via __hipGetPixelAddr using the 2D channel data type / order queries.
template <
    typename T,
    typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject_t surfObj, int x, int y, int face) {
  // NOTE(review): this macro is expected to introduce the image handle `i` from surfObj.
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  const int pixelX = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i),
                                       __ockl_image_channel_order_2D(i));
  auto texel = __ockl_image_load_CM(i, int2(pixelX, y).data, face);
  *data = __hipMapFromNativeFloat4<T>(texel);
}
// Stores `data` into a cubemap surface.
// `face` is forwarded to __ockl_image_store_CM right after the coordinate; x is first
// adjusted via __hipGetPixelAddr using the 2D channel data type / order queries.
template <
    typename T,
    typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int face) {
  // NOTE(review): this macro is expected to introduce the image handle `i` from surfObj.
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  const int pixelX = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i),
                                       __ockl_image_channel_order_2D(i));
  auto texel = __hipMapToNativeFloat4(data);
  __ockl_image_store_CM(i, int2(pixelX, y).data, face, texel);
}
// Loads one element from a layered cubemap surface and stores it, converted to T,
// through `data`. `face` and `layer` are forwarded, in that order, as the trailing
// arguments of __ockl_image_load_lod_CM; x is first adjusted via __hipGetPixelAddr.
template <
    typename T,
    typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int y, int face,
                                                              int layer) {
  // NOTE(review): this macro is expected to introduce the image handle `i` from surfObj.
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  const int pixelX = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i),
                                       __ockl_image_channel_order_2D(i));
  auto texel = __ockl_image_load_lod_CM(i, int2(pixelX, y).data, face, layer);
  *data = __hipMapFromNativeFloat4<T>(texel);
}
// Stores the element pointed to by `data` into a layered cubemap surface.
//
// Unlike the other surf*write functions in this header, this one receives the value by
// pointer; the signature is kept as-is so existing callers continue to compile. The
// pointee (not the pointer itself) is what gets converted with __hipMapToNativeFloat4,
// matching what every sibling write function passes to that helper.
//
// `face` and `layer` are forwarded, in that order, to __ockl_image_store_lod_CM right
// after the coordinate; x is first adjusted via __hipGetPixelAddr using the 2D channel
// data type / order queries.
template <
    typename T,
    typename std::enable_if<__hip_is_isurf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surfCubemapLayeredwrite(T* data, hipSurfaceObject_t surfObj, int x, int y, int face,
                                                               int layer) {
  // NOTE(review): this macro is expected to introduce the image handle `i` from surfObj.
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
  // Dereference: the value to store is *data, not the address held in `data`.
  auto tmp = __hipMapToNativeFloat4(*data);
  __ockl_image_store_lod_CM(i, int2(x, y).data, face, layer, tmp);
}
#endif
#endif
@@ -0,0 +1,503 @@
/*
Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-identifier"
#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
#pragma clang diagnostic ignored "-Wsign-conversion"
#pragma clang diagnostic ignored "-Wold-style-cast"
#pragma clang diagnostic ignored "-Wc++98-compat"
#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
// Unsigned wrapper around __builtin_amdgcn_ds_bpermute: the 32-bit payload is handed to
// the builtin as an int and the int result is returned with the same bit pattern.
__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
  const int moved = __builtin_amdgcn_ds_bpermute(index, static_cast<int>(src));
  return static_cast<unsigned>(moved);
}
// Float wrapper around __builtin_amdgcn_ds_bpermute: the float's bits are copied into an
// int, passed through the builtin, and the resulting bits are copied back into a float.
__device__ static inline float __hip_ds_bpermutef(int index, float src) {
  int bits;
  __builtin_memcpy(&bits, &src, sizeof(bits));
  bits = __builtin_amdgcn_ds_bpermute(index, bits);
  float result;
  __builtin_memcpy(&result, &bits, sizeof(result));
  return result;
}
// Unsigned wrapper around __builtin_amdgcn_ds_permute: the 32-bit payload is handed to
// the builtin as an int and the int result is returned with the same bit pattern.
__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
  const int moved = __builtin_amdgcn_ds_permute(index, static_cast<int>(src));
  return static_cast<unsigned>(moved);
}
// Float wrapper around __builtin_amdgcn_ds_permute: the float's bits are copied into an
// int, passed through the builtin, and the resulting bits are copied back into a float.
__device__ static inline float __hip_ds_permutef(int index, float src) {
  int bits;
  __builtin_memcpy(&bits, &src, sizeof(bits));
  bits = __builtin_amdgcn_ds_permute(index, bits);
  float result;
  __builtin_memcpy(&result, &bits, sizeof(result));
  return result;
}
// Function-style front ends for the swizzle helpers: `pattern` is moved into a template
// argument of __hip_ds_swizzle_N / __hip_ds_swizzlef_N, so it must be a compile-time constant.
#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
// Unsigned wrapper around __builtin_amdgcn_ds_swizzle with the swizzle pattern supplied as
// a template argument; the payload keeps its bit pattern across the int round trip.
template <int pattern>
__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
  const int moved = __builtin_amdgcn_ds_swizzle(static_cast<int>(src), pattern);
  return static_cast<unsigned>(moved);
}
// Float wrapper around __builtin_amdgcn_ds_swizzle with the swizzle pattern supplied as a
// template argument; the float's bits are copied to an int and back around the builtin.
template <int pattern>
__device__ static inline float __hip_ds_swizzlef_N(float src) {
  int bits;
  __builtin_memcpy(&bits, &src, sizeof(bits));
  bits = __builtin_amdgcn_ds_swizzle(bits, pattern);
  float result;
  __builtin_memcpy(&result, &bits, sizeof(result));
  return result;
}
// Function-style front end for __hip_move_dpp_N: every control operand is moved into a
// template argument, so dpp_ctrl, row_mask, bank_mask and bound_ctrl must be compile-time
// constants; only `src` is a run-time value.
#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
__hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
// Forwards `src` and the four template-supplied control operands to
// __builtin_amdgcn_mov_dpp and returns the builtin's result unchanged.
template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__ static inline int __hip_move_dpp_N(int src) {
  const int moved = __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
  return moved;
}
// Compile-time wavefront width, taken from the compiler-provided __AMDGCN_WAVEFRONT_SIZE
// macro. Used below as the default `width` of every __shfl* overload.
static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
// Returns `var` as held by lane `src_lane` of the caller's `width`-sized segment.
//
// var      - value contributed by the calling lane.
// src_lane - lane to read from, relative to the start of the caller's segment. Values
//            outside [0, width) are wrapped into the segment (src_lane modulo width),
//            so the read never leaves the caller's segment.
// width    - segment size; the masking arithmetic below assumes a power of two.
//
// All other __shfl overloads delegate to this one, so they inherit the same wrapping.
__device__
inline
int __shfl(int var, int src_lane, int width = warpSize) {
    const int self = __lane_id();
    // (self & ~(width-1)) is the first lane of the caller's segment; the masked src_lane
    // is the offset inside it. Without the mask an out-of-range src_lane would address a
    // lane belonging to a different segment.
    const int index = (src_lane & (width - 1)) + (self & ~(width - 1));
    // ds_bpermute takes a byte address, hence lane index * 4.
    return __builtin_amdgcn_ds_bpermute(index << 2, var);
}
// unsigned int flavour of __shfl: routes the bit pattern through the int overload.
__device__
inline
unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
    const int shuffled = __shfl(static_cast<int>(var), src_lane, width);
    return static_cast<unsigned int>(shuffled);
}
// float flavour of __shfl: the float's bits are copied into an int, shuffled with the
// int overload, and copied back into a float.
__device__
inline
float __shfl(float var, int src_lane, int width = warpSize) {
    int bits;
    __builtin_memcpy(&bits, &var, sizeof(bits));
    bits = __shfl(bits, src_lane, width);
    float result;
    __builtin_memcpy(&result, &bits, sizeof(result));
    return result;
}
// double flavour of __shfl: the value is split into two ints, each is shuffled with the
// int overload, and the pair is reassembled (element 1 goes to the upper 32 bits).
__device__
inline
double __shfl(double var, int src_lane, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const int lo = __shfl(halves[0], src_lane, width);
    const int hi = __shfl(halves[1], src_lane, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
// long flavour of __shfl. Without _MSC_VER, long is asserted to be 64-bit and is shuffled
// as two ints (element 1 goes to the upper 32 bits); with _MSC_VER, long is asserted to be
// int-sized and the int overload is used directly.
__device__
inline
long __shfl(long var, int src_lane, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const int lo = __shfl(halves[0], src_lane, width);
    const int hi = __shfl(halves[1], src_lane, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    const int narrow = __shfl(static_cast<int>(var), src_lane, width);
    return static_cast<long>(narrow);
    #endif
}
// unsigned long flavour of __shfl. Without _MSC_VER the value is asserted to be 64-bit and
// is shuffled as two unsigned ints (element 1 goes to the upper 32 bits); with _MSC_VER it
// is asserted to be unsigned-int-sized and the unsigned int overload is used directly.
__device__
inline
unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
    #ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const unsigned int lo = __shfl(halves[0], src_lane, width);
    const unsigned int hi = __shfl(halves[1], src_lane, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    unsigned long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
    #else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    const unsigned int narrow = __shfl(static_cast<unsigned int>(var), src_lane, width);
    return static_cast<unsigned long>(narrow);
    #endif
}
// long long flavour of __shfl: the value is split into two ints, each is shuffled with the
// int overload, and the pair is reassembled (element 1 goes to the upper 32 bits).
__device__
inline
long long __shfl(long long var, int src_lane, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const int lo = __shfl(halves[0], src_lane, width);
    const int hi = __shfl(halves[1], src_lane, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
// unsigned long long flavour of __shfl: the value is split into two unsigned ints, each is
// shuffled with the unsigned int overload, and the pair is reassembled (element 1 goes to
// the upper 32 bits).
__device__
inline
unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const unsigned int lo = __shfl(halves[0], src_lane, width);
    const unsigned int hi = __shfl(halves[1], src_lane, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    unsigned long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
// Returns `var` from the lane `lane_delta` positions below the caller. If that lane would
// fall before the first lane of the caller's `width`-sized segment, the caller reads its
// own value instead.
__device__
inline
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
    const int self = __lane_id();
    const int segment_base = self & ~(width - 1);
    int index = self - lane_delta;
    if (index < segment_base) {
        index = self;
    }
    // ds_bpermute takes a byte address, hence lane index * 4.
    return __builtin_amdgcn_ds_bpermute(index << 2, var);
}
// unsigned int flavour of __shfl_up: routes the bit pattern through the int overload.
__device__
inline
unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    const int shuffled = __shfl_up(static_cast<int>(var), lane_delta, width);
    return static_cast<unsigned int>(shuffled);
}
// float flavour of __shfl_up: the float's bits are copied into an int, shuffled with the
// int overload, and copied back into a float.
__device__
inline
float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
    int bits;
    __builtin_memcpy(&bits, &var, sizeof(bits));
    bits = __shfl_up(bits, lane_delta, width);
    float result;
    __builtin_memcpy(&result, &bits, sizeof(result));
    return result;
}
// double flavour of __shfl_up: the value is split into two ints, each is shuffled with the
// int overload, and the pair is reassembled (element 1 goes to the upper 32 bits).
__device__
inline
double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const int lo = __shfl_up(halves[0], lane_delta, width);
    const int hi = __shfl_up(halves[1], lane_delta, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
// long flavour of __shfl_up. Without _MSC_VER, long is asserted to be 64-bit and is
// shuffled as two ints (element 1 goes to the upper 32 bits); with _MSC_VER, long is
// asserted to be int-sized and the int overload is used directly.
__device__
inline
long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const int lo = __shfl_up(halves[0], lane_delta, width);
    const int hi = __shfl_up(halves[1], lane_delta, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    const int narrow = __shfl_up(static_cast<int>(var), lane_delta, width);
    return static_cast<long>(narrow);
    #endif
}
// unsigned long flavour of __shfl_up. Without _MSC_VER the value is asserted to be 64-bit
// and is shuffled as two unsigned ints (element 1 goes to the upper 32 bits); with
// _MSC_VER it is asserted to be unsigned-int-sized and that overload is used directly.
__device__
inline
unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const unsigned int lo = __shfl_up(halves[0], lane_delta, width);
    const unsigned int hi = __shfl_up(halves[1], lane_delta, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    unsigned long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
    #else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    const unsigned int narrow = __shfl_up(static_cast<unsigned int>(var), lane_delta, width);
    return static_cast<unsigned long>(narrow);
    #endif
}
// long long flavour of __shfl_up: the value is split into two ints, each is shuffled with
// the int overload, and the pair is reassembled (element 1 goes to the upper 32 bits).
__device__
inline
long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const int lo = __shfl_up(halves[0], lane_delta, width);
    const int hi = __shfl_up(halves[1], lane_delta, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
// unsigned long long flavour of __shfl_up: the value is split into two unsigned ints, each
// is shuffled with the unsigned int overload, and the pair is reassembled (element 1 goes
// to the upper 32 bits).
__device__
inline
unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const unsigned int lo = __shfl_up(halves[0], lane_delta, width);
    const unsigned int hi = __shfl_up(halves[1], lane_delta, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    unsigned long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
// Returns `var` from the lane `lane_delta` positions above the caller. If the caller's
// offset inside its `width`-sized segment plus `lane_delta` reaches `width`, the caller
// reads its own value instead.
__device__
inline
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
    const int self = __lane_id();
    int index = self + lane_delta;
    const int reach = (int)((self & (width - 1)) + lane_delta);
    if (reach >= width) {
        index = self;
    }
    // ds_bpermute takes a byte address, hence lane index * 4.
    return __builtin_amdgcn_ds_bpermute(index << 2, var);
}
// unsigned int flavour of __shfl_down: routes the bit pattern through the int overload.
__device__
inline
unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    const int shuffled = __shfl_down(static_cast<int>(var), lane_delta, width);
    return static_cast<unsigned int>(shuffled);
}
// float flavour of __shfl_down: the float's bits are copied into an int, shuffled with the
// int overload, and copied back into a float.
__device__
inline
float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
    int bits;
    __builtin_memcpy(&bits, &var, sizeof(bits));
    bits = __shfl_down(bits, lane_delta, width);
    float result;
    __builtin_memcpy(&result, &bits, sizeof(result));
    return result;
}
// double flavour of __shfl_down: the value is split into two ints, each is shuffled with
// the int overload, and the pair is reassembled (element 1 goes to the upper 32 bits).
__device__
inline
double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const int lo = __shfl_down(halves[0], lane_delta, width);
    const int hi = __shfl_down(halves[1], lane_delta, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
// long flavour of __shfl_down. Without _MSC_VER, long is asserted to be 64-bit and is
// shuffled as two ints (element 1 goes to the upper 32 bits); with _MSC_VER, long is
// asserted to be int-sized and the int overload is used directly.
__device__
inline
long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const int lo = __shfl_down(halves[0], lane_delta, width);
    const int hi = __shfl_down(halves[1], lane_delta, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    const int narrow = __shfl_down(static_cast<int>(var), lane_delta, width);
    return static_cast<long>(narrow);
    #endif
}
// unsigned long flavour of __shfl_down. Without _MSC_VER the value is asserted to be
// 64-bit and is shuffled as two unsigned ints (element 1 goes to the upper 32 bits); with
// _MSC_VER it is asserted to be unsigned-int-sized and that overload is used directly.
__device__
inline
unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const unsigned int lo = __shfl_down(halves[0], lane_delta, width);
    const unsigned int hi = __shfl_down(halves[1], lane_delta, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    unsigned long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
    #else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    const unsigned int narrow = __shfl_down(static_cast<unsigned int>(var), lane_delta, width);
    return static_cast<unsigned long>(narrow);
    #endif
}
// long long flavour of __shfl_down: the value is split into two ints, each is shuffled
// with the int overload, and the pair is reassembled (element 1 goes to the upper 32 bits).
__device__
inline
long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const int lo = __shfl_down(halves[0], lane_delta, width);
    const int hi = __shfl_down(halves[1], lane_delta, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
// unsigned long long flavour of __shfl_down: the value is split into two unsigned ints,
// each is shuffled with the unsigned int overload, and the pair is reassembled (element 1
// goes to the upper 32 bits).
__device__
inline
unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const unsigned int lo = __shfl_down(halves[0], lane_delta, width);
    const unsigned int hi = __shfl_down(halves[1], lane_delta, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    unsigned long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
// Returns `var` from the lane whose id is the caller's lane id XOR `lane_mask`. If that
// lane id is at or beyond the end of the caller's `width`-sized segment, the caller reads
// its own value instead.
__device__
inline
int __shfl_xor(int var, int lane_mask, int width = warpSize) {
    const int self = __lane_id();
    const int segment_end = (self + width) & ~(width - 1);
    int index = self ^ lane_mask;
    if (index >= segment_end) {
        index = self;
    }
    // ds_bpermute takes a byte address, hence lane index * 4.
    return __builtin_amdgcn_ds_bpermute(index << 2, var);
}
// unsigned int flavour of __shfl_xor: routes the bit pattern through the int overload.
__device__
inline
unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
    const int shuffled = __shfl_xor(static_cast<int>(var), lane_mask, width);
    return static_cast<unsigned int>(shuffled);
}
// float flavour of __shfl_xor: the float's bits are copied into an int, shuffled with the
// int overload, and copied back into a float.
__device__
inline
float __shfl_xor(float var, int lane_mask, int width = warpSize) {
    int bits;
    __builtin_memcpy(&bits, &var, sizeof(bits));
    bits = __shfl_xor(bits, lane_mask, width);
    float result;
    __builtin_memcpy(&result, &bits, sizeof(result));
    return result;
}
// double flavour of __shfl_xor: the value is split into two ints, each is shuffled with
// the int overload, and the pair is reassembled (element 1 goes to the upper 32 bits).
__device__
inline
double __shfl_xor(double var, int lane_mask, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const int lo = __shfl_xor(halves[0], lane_mask, width);
    const int hi = __shfl_xor(halves[1], lane_mask, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
// long flavour of __shfl_xor. Without _MSC_VER, long is asserted to be 64-bit and is
// shuffled as two ints (element 1 goes to the upper 32 bits); with _MSC_VER, long is
// asserted to be int-sized and the int overload is used directly.
__device__
inline
long __shfl_xor(long var, int lane_mask, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const int lo = __shfl_xor(halves[0], lane_mask, width);
    const int hi = __shfl_xor(halves[1], lane_mask, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
    #else
    static_assert(sizeof(long) == sizeof(int), "");
    const int narrow = __shfl_xor(static_cast<int>(var), lane_mask, width);
    return static_cast<long>(narrow);
    #endif
}
// unsigned long flavour of __shfl_xor. Without _MSC_VER the value is asserted to be 64-bit
// and is shuffled as two unsigned ints (element 1 goes to the upper 32 bits); with
// _MSC_VER it is asserted to be unsigned-int-sized and that overload is used directly.
__device__
inline
unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
{
    #ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const unsigned int lo = __shfl_xor(halves[0], lane_mask, width);
    const unsigned int hi = __shfl_xor(halves[1], lane_mask, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    unsigned long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
    #else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    const unsigned int narrow = __shfl_xor(static_cast<unsigned int>(var), lane_mask, width);
    return static_cast<unsigned long>(narrow);
    #endif
}
// long long flavour of __shfl_xor: the value is split into two ints, each is shuffled with
// the int overload, and the pair is reassembled (element 1 goes to the upper 32 bits).
__device__
inline
long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const int lo = __shfl_xor(halves[0], lane_mask, width);
    const int hi = __shfl_xor(halves[1], lane_mask, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
// unsigned long long flavour of __shfl_xor: the value is split into two unsigned ints,
// each is shuffled with the unsigned int overload, and the pair is reassembled (element 1
// goes to the upper 32 bits).
__device__
inline
unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
{
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    const unsigned int lo = __shfl_xor(halves[0], lane_mask, width);
    const unsigned int hi = __shfl_xor(halves[1], lane_mask, width);

    const uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    unsigned long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
#pragma clang diagnostic pop
#endif
@@ -0,0 +1,30 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
// Documentation-only pseudo-concepts. Both macros exist so that constraints can be
// written in signatures without affecting compilation.
// NOTE(review): preprocessor macros are not scoped by the surrounding namespace, so these
// are visible to every translation unit that includes this header. `requires` is also a
// keyword since C++20 - confirm this header is never included by code that uses real
// requires-clauses.
namespace hip_impl  // Documentation only.
{
// Swallows its arguments: `requires(...)` expands to nothing.
#define requires(...)
// Lets a template parameter be spelled `FunctionalProcedure F`; expands to `typename`.
#define FunctionalProcedure typename
}  // namespace hip_impl
@@ -0,0 +1,131 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* @file amd_detail/device_library_decls.h
* @brief Contains declarations for types and functions in device library.
* Uses int64_t and uint64_t instead of long, long long, unsigned
* long and unsigned long long types for device library API
* declarations.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_LIBRARY_DECLS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_LIBRARY_DECLS_H
#include "hip/amd_detail/host_defines.h"
// Short aliases for the unsigned integer types; several of them are used in the
// device-library declarations in this header.
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned int uint;
typedef unsigned long ulong;
typedef unsigned long long ullong;
// Declarations only: each function below has C linkage and is defined outside this header
// (in the device library, per the file comment). Functions marked __attribute__((const))
// are declared as having no side effects and depending only on their arguments; functions
// marked __attribute__((convergent)) must not have their call sites made control-dependent
// on additional conditions by the optimizer.

// --- __ockl_* wavefront predicates and lane query ---
extern "C" __device__ __attribute__((const)) bool __ockl_wfany_i32(int);
extern "C" __device__ __attribute__((const)) bool __ockl_wfall_i32(int);
extern "C" __device__ uint __ockl_activelane_u32(void);
// --- __ockl_* integer arithmetic helpers ---
extern "C" __device__ __attribute__((const)) uint __ockl_mul24_u32(uint, uint);
extern "C" __device__ __attribute__((const)) int __ockl_mul24_i32(int, int);
extern "C" __device__ __attribute__((const)) uint __ockl_mul_hi_u32(uint, uint);
extern "C" __device__ __attribute__((const)) int __ockl_mul_hi_i32(int, int);
extern "C" __device__ __attribute__((const)) uint __ockl_sadd_u32(uint, uint, uint);
// --- __ockl_clz_* for 8/16/32/64-bit unsigned operands ---
extern "C" __device__ __attribute__((const)) uchar __ockl_clz_u8(uchar);
extern "C" __device__ __attribute__((const)) ushort __ockl_clz_u16(ushort);
extern "C" __device__ __attribute__((const)) uint __ockl_clz_u32(uint);
extern "C" __device__ __attribute__((const)) uint64_t __ockl_clz_u64(uint64_t);
// --- __ocml_* single-precision rounding and min/max ---
extern "C" __device__ __attribute__((const)) float __ocml_floor_f32(float);
extern "C" __device__ __attribute__((const)) float __ocml_rint_f32(float);
extern "C" __device__ __attribute__((const)) float __ocml_ceil_f32(float);
extern "C" __device__ __attribute__((const)) float __ocml_trunc_f32(float);
extern "C" __device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
extern "C" __device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
// --- __ocml_cvtrt{n,p,z}_<dst>_<src> conversions ---
// NOTE(review): the rtn/rtp/rtz suffix presumably selects the rounding direction
// (toward -inf / +inf / zero) - confirm against the OCML documentation.
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_f64(double);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_f64(double);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_f64(double);
extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s32(int);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s32(int);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s32(int);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u32(uint32_t);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u32(uint32_t);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u32(uint32_t);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s64(int64_t);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s64(int64_t);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s64(int64_t);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u64(uint64_t);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u64(uint64_t);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u64(uint64_t);
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_s64(int64_t);
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_s64(int64_t);
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_s64(int64_t);
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_u64(uint64_t);
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_u64(uint64_t);
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_u64(uint64_t);
// --- __ockl_gws_* / lane / grid / multi-grid queries and synchronisation ---
extern "C" __device__ __attribute__((convergent)) void __ockl_gws_init(uint nwm1, uint rid);
extern "C" __device__ __attribute__((convergent)) void __ockl_gws_barrier(uint nwm1, uint rid);
extern "C" __device__ __attribute__((const)) uint32_t __ockl_lane_u32();
extern "C" __device__ __attribute__((const)) int __ockl_grid_is_valid(void);
extern "C" __device__ __attribute__((convergent)) void __ockl_grid_sync(void);
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_num_grids(void);
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_grid_rank(void);
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void);
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void);
extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);
// --- atomics and __ockl_wgred_* reductions ---
extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float);
extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_add_i32(int a);
extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_and_i32(int a);
extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_or_i32(int a);
// --- __ockl_fprintf_* message construction ---
// Each call takes and returns a 64-bit message descriptor; `is_last` flags the final
// append of a message (NOTE(review): exact protocol is defined by the device library).
extern "C" __device__ uint64_t __ockl_fprintf_stderr_begin();
extern "C" __device__ uint64_t __ockl_fprintf_append_args(uint64_t msg_desc, uint32_t num_args,
uint64_t value0, uint64_t value1,
uint64_t value2, uint64_t value3,
uint64_t value4, uint64_t value5,
uint64_t value6, uint32_t is_last);
extern "C" __device__ uint64_t __ockl_fprintf_append_string_n(uint64_t msg_desc, const char* data,
uint64_t length, uint32_t is_last);
// Introduce the local address space: `__local` qualifies a type with address space 3.
#define __local __attribute__((address_space(3)))
#ifdef __HIP_DEVICE_COMPILE__
// Reinterprets the integer `x` as a pointer into address space 3. Only available when
// compiling device code. No validation is performed on `x`.
__device__ inline static __local void* __to_local(unsigned x) { return (__local void*)x; }
#endif //__HIP_DEVICE_COMPILE__
// Using hip.amdgcn.bc - sync threads
// Flag value 0x01 and the unsigned type used to carry memory-fence flags.
#define __CLK_LOCAL_MEM_FENCE 0x01
typedef unsigned __cl_mem_fence_flags;
#endif
@@ -0,0 +1,218 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include "concepts.hpp"
#include "helpers.hpp"
#include "program_state.hpp"
#include "hip_runtime_api.h"
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <tuple>
#include <type_traits>
#include <utility>
// Forward declarations of launch entry points that take an explicit
// hip_impl::program_state as their last parameter. Only the declarations appear here;
// the definitions live elsewhere.

// Launches the kernels described by `launchParamsList` (numDevices entries) with `flags`.
hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
unsigned int flags, hip_impl::program_state& ps);
// Cooperative launch of kernel `f` with the given grid/block dimensions, argument array,
// dynamic shared memory size and stream.
hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
dim3 blockDim, void** args,
size_t sharedMem, hipStream_t stream,
hip_impl::program_state& ps);
// Cooperative launch across multiple devices, one hipLaunchParams entry per device.
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
int numDevices,
unsigned int flags,
hip_impl::program_state& ps);
#pragma GCC visibility push(hidden)
namespace hip_impl {
/**
 * Rounds `x` up to the nearest multiple of `y`.
 *
 * Preconditions: `x >= 0` and `y > 0` (a zero `y` divides by zero).
 *
 * The result is derived from the remainder `x % y` rather than from
 * `x + y - 1`, so no intermediate value exceeds the final result. This keeps
 * the answer correct whenever the rounded value itself is representable in
 * `T`, including narrow types and values close to the maximum of `T`.
 *
 * @param x non-negative value to round up.
 * @param y positive multiple to round to.
 * @return the smallest multiple of `y` that is greater than or equal to `x`.
 */
template <typename T, typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
inline T round_up_to_next_multiple_nonnegative(T x, T y) {
    const T remainder = static_cast<T>(x % y);
    if (remainder == 0) return x;  // already aligned
    return static_cast<T>(x + (y - remainder));
}
// Recursion terminator for the index-driven make_kernarg overloads: chosen
// when `n` equals the number of tuple elements, i.e. every argument has been
// packed. The accumulated buffer is returned unchanged; the tuple and the
// size/alignment table are not inspected.
template <std::size_t n, typename... Ts,
          typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
inline hip_impl::kernarg make_kernarg(const std::tuple<Ts...>& /*formals*/,
                                      const kernargs_size_align& /*size_align*/,
                                      hip_impl::kernarg kernarg) {
    return kernarg;
}
// Packs tuple element `n` into the kernarg buffer and recurses on `n + 1`.
//
// The buffer is grown so the element starts at the next offset that is a
// multiple of `size_align.alignment(n)`, then `size_align.size(n)` bytes of
// the element are copied to that offset.
//
// Compile-time checks: the element type must not be a reference, and when
// HIP_STRICT is defined it must also be trivially copyable.
template <std::size_t n, typename... Ts,
          typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
inline hip_impl::kernarg make_kernarg(const std::tuple<Ts...>& formals,
                                      const kernargs_size_align& size_align,
                                      hip_impl::kernarg kernarg) {
    using Arg = typename std::tuple_element<n, std::tuple<Ts...>>::type;

    static_assert(!std::is_reference<Arg>{},
                  "A __global__ function cannot have a reference as one of its "
                  "arguments.");
#if defined(HIP_STRICT)
    static_assert(std::is_trivially_copyable<Arg>{},
                  "Only TriviallyCopyable types can be arguments to a __global__ "
                  "function");
#endif

    // Aligned start offset of this argument inside the buffer.
    const auto arg_offset =
        round_up_to_next_multiple_nonnegative(kernarg.size(), size_align.alignment(n));
    const auto arg_bytes = size_align.size(n);

    kernarg.resize(arg_offset + arg_bytes);
    std::memcpy(kernarg.data() + arg_offset, &std::get<n>(formals), arg_bytes);

    return make_kernarg<n + 1>(formals, size_align, std::move(kernarg));
}
// Entry point: builds the kernarg buffer for `kernel` from the caller's
// actual arguments.
//
// The actuals are first converted to the kernel's formal parameter types (by
// constructing a tuple of formals from the tuple of actuals); the formals are
// then packed one by one using the size/alignment table that the program
// state returns for the kernel's address. A kernel without parameters yields
// an empty buffer.
template <typename... Formals, typename... Actuals>
inline hip_impl::kernarg make_kernarg(void (*kernel)(Formals...),
                                      std::tuple<Actuals...> actuals) {
    static_assert(sizeof...(Formals) == sizeof...(Actuals),
                  "The count of formal arguments must match the count of actuals.");

    if (sizeof...(Formals) == 0) return {};

    std::tuple<Formals...> to_formals{std::move(actuals)};

    hip_impl::kernarg buffer;
    buffer.reserve(sizeof(to_formals));  // capacity hint only

    const auto kernel_address = reinterpret_cast<std::uintptr_t>(kernel);
    auto& ps = hip_impl::get_program_state();
    return make_kernarg<0>(to_formals, ps.get_kernargs_size_align(kernel_address),
                           std::move(buffer));
}
HIP_INTERNAL_EXPORTED_API hsa_agent_t target_agent(hipStream_t stream);
// Looks up the kernel descriptor registered for `function_address` on the
// agent associated with `stream`, then launches it through
// hipModuleLaunchKernel with the given grid/block extents, dynamic shared
// memory size and `kernarg` as the "extra" launch configuration (the plain
// kernel-parameter array is passed as nullptr).
//
// This function returns void, so whatever hipModuleLaunchKernel returns is
// not propagated to the caller.
inline __attribute__((visibility("hidden")))
void hipLaunchKernelGGLImpl(std::uintptr_t function_address,
                            const dim3& numBlocks,
                            const dim3& dimBlocks,
                            std::uint32_t sharedMemBytes,
                            hipStream_t stream,
                            void** kernarg) {
    auto& program_state = hip_impl::get_program_state();
    const auto& kd = program_state.kernel_descriptor(function_address, target_agent(stream));

    hipModuleLaunchKernel(kd,
                          numBlocks.x, numBlocks.y, numBlocks.z,
                          dimBlocks.x, dimBlocks.y, dimBlocks.z,
                          sharedMemBytes, stream, nullptr, kernarg);
}
} // Namespace hip_impl.
// Typed convenience wrapper: initialises the runtime, resolves `kernel` to
// its kernel descriptor for target_agent(0), and forwards to
// hipModuleOccupancyMaxPotentialBlockSize, returning its result.
template <class T>
inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, T kernel,
                                                    size_t dynSharedMemPerBlk = 0,
                                                    int blockSizeLimit = 0) {
    using namespace hip_impl;

    hip_impl::hip_init();

    const auto kernel_address = reinterpret_cast<std::uintptr_t>(kernel);
    auto f = get_program_state().kernel_descriptor(kernel_address, target_agent(0));

    return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, f, dynSharedMemPerBlk,
                                                   blockSizeLimit);
}
// Same as hipOccupancyMaxPotentialBlockSize, with an extra `flags` argument.
// After runtime initialisation, any value other than hipOccupancyDefault is
// rejected with hipErrorNotSupported; otherwise the call resolves `kernel`
// to its descriptor for target_agent(0) and forwards to
// hipModuleOccupancyMaxPotentialBlockSize.
template <class T>
inline hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
                                                             T kernel,
                                                             size_t dynSharedMemPerBlk = 0,
                                                             int blockSizeLimit = 0,
                                                             unsigned int flags = 0) {
    using namespace hip_impl;

    hip_impl::hip_init();

    if (flags != hipOccupancyDefault) {
        return hipErrorNotSupported;
    }

    const auto kernel_address = reinterpret_cast<std::uintptr_t>(kernel);
    auto f = get_program_state().kernel_descriptor(kernel_address, target_agent(0));

    return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, f, dynSharedMemPerBlk,
                                                   blockSizeLimit);
}
// Launches `kernel` with the given grid/block extents, dynamic shared memory
// size and stream. The call arguments are packed into a kernarg buffer via
// hip_impl::make_kernarg and handed to the launch as a
// {BUFFER_POINTER, data, BUFFER_SIZE, &size, END} configuration array.
template <typename... Args, typename F = void (*)(Args...)>
inline void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
                               std::uint32_t sharedMemBytes, hipStream_t stream,
                               Args... args) {
    hip_impl::hip_init();

    auto packed_args =
        hip_impl::make_kernarg(kernel, std::tuple<Args...>{std::move(args)...});
    std::size_t packed_size = packed_args.size();

    // `packed_args` and `packed_size` must outlive the launch call below,
    // because the configuration array only stores pointers to them.
    void* launch_config[]{HIP_LAUNCH_PARAM_BUFFER_POINTER, packed_args.data(),
                          HIP_LAUNCH_PARAM_BUFFER_SIZE, &packed_size,
                          HIP_LAUNCH_PARAM_END};

    const auto kernel_address = reinterpret_cast<std::uintptr_t>(kernel);
    hip_impl::hipLaunchKernelGGLImpl(kernel_address, numBlocks, dimBlocks, sharedMemBytes,
                                     stream, &launch_config[0]);
}
// Typed convenience overload: initialises the runtime and forwards to the
// hipLaunchCooperativeKernel overload that additionally takes the program
// state, passing the kernel as an untyped pointer.
template <typename F>
inline __attribute__((visibility("hidden")))
hipError_t hipLaunchCooperativeKernel(F f, dim3 gridDim, dim3 blockDim, void** args,
                                      size_t sharedMem, hipStream_t stream) {
    hip_impl::hip_init();

    void* const kernel_address = reinterpret_cast<void*>(f);
    return hipLaunchCooperativeKernel(kernel_address, gridDim, blockDim, args, sharedMem,
                                      stream, hip_impl::get_program_state());
}
// Convenience overload: initialises the runtime and forwards to the
// hipLaunchCooperativeKernelMultiDevice overload that additionally takes the
// program state, returning its result.
inline __attribute__((visibility("hidden")))
hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
                                                 int numDevices, unsigned int flags) {
    hip_impl::hip_init();
    return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags,
                                                 hip_impl::get_program_state());
}
#pragma GCC visibility pop
@@ -0,0 +1,67 @@
#pragma once
#include <stdint.h>
#include <hc_defines.h>
#define GRID_LAUNCH_VERSION 20
// Extern definitions
namespace hc{
class completion_future;
class accelerator_view;
}
// Three-component extent (x, y, z) used for both grid and group dimensions.
typedef struct gl_dim3 {
    int x;
    int y;
    int z;

    // Every component defaults to 1. The constructor takes unsigned 32-bit
    // values and converts them to the signed `int` members.
    gl_dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1)
        : x(static_cast<int>(_x)), y(static_cast<int>(_y)), z(static_cast<int>(_z)) {}
} gl_dim3;
// Barrier-bit selector; this is the type of grid_launch_parm::barrier_bit.
// The enumerators keep their implicit values 0, 1 and 2, spelled out here.
typedef enum gl_barrier_bit {
    barrier_bit_queue_default = 0,
    barrier_bit_none = 1,
    barrier_bit_wait = 2,
} gl_barrier_bit;
// grid_launch_parm bundles the information used to launch a kernel: grid and
// group extents, dynamic group memory size, barrier/fence settings, and the
// optional accelerator view and completion future.
typedef struct grid_launch_parm
{
//! Grid dimensions.
gl_dim3 grid_dim;
//! Group dimensions.
gl_dim3 group_dim;
//! Amount of dynamic group memory to use with the kernel launch.
//! This memory is in addition to the amount used statically in the kernel.
unsigned int dynamic_group_mem_bytes;
//! Controls the setting of the barrier bit on a per-packet basis;
//! see the gl_barrier_bit description.
//! Placeholder: it is not used to control packet dispatch yet.
enum gl_barrier_bit barrier_bit;
//! Value of the packet fences to apply to the launch.
//! These correspond to the value of bits 9:14 in the AQL packet;
//! see HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE and hsa_fence_scope_t.
unsigned int launch_fence;
//! Pointer to the accelerator_view where the kernel should execute.
//! If NULL, the default view on the default accelerator is used.
hc::accelerator_view *av;
//! Pointer to the completion_future used to track the status of the command.
//! If NULL, the command does not write status. In this case,
//! synchronization can be enforced with queue-level waits or
//! by waiting on younger commands.
hc::completion_future *cf;
// Defaulted constructor: gl_dim3 members run their own default constructor;
// the remaining members are not given explicit initial values here.
grid_launch_parm() = default;
} grid_launch_parm;
extern void init_grid_launch(grid_launch_parm *gl);
@@ -0,0 +1,50 @@
#pragma once
#include "grid_launch.h"
#include "hc.hpp"
// C++ extension of grid_launch_parm that adds annotated serialization hooks.
// Only the six grid/group extents are serialized; the `av` and `cf` pointers
// (and the remaining fields) are not.
class grid_launch_parm_cxx : public grid_launch_parm
{
public:
grid_launch_parm_cxx() = default;
// customized serialization: don't need av and cf in kernel
// Appends grid_dim.{x,y,z} followed by group_dim.{x,y,z}, each as
// sizeof(int) bytes. This order matches the parameter order of the
// "user_deserialize" constructor below, so the two must be kept in sync.
__attribute__((annotate("serialize")))
void __cxxamp_serialize(Kalmar::Serialize& s) const {
s.Append(sizeof(int), &grid_dim.x);
s.Append(sizeof(int), &grid_dim.y);
s.Append(sizeof(int), &grid_dim.z);
s.Append(sizeof(int), &group_dim.x);
s.Append(sizeof(int), &group_dim.y);
s.Append(sizeof(int), &group_dim.z);
}
// Counterpart of __cxxamp_serialize: rebuilds the object from the six
// serialized extents. Only grid_dim and group_dim are assigned; the other
// inherited fields are not set by this constructor.
__attribute__((annotate("user_deserialize")))
grid_launch_parm_cxx(int grid_dim_x, int grid_dim_y, int grid_dim_z,
int group_dim_x, int group_dim_y, int group_dim_z) {
grid_dim.x = grid_dim_x;
grid_dim.y = grid_dim_y;
grid_dim.z = grid_dim_z;
group_dim.x = group_dim_x;
group_dim.y = group_dim_y;
group_dim.z = group_dim_z;
}
};
// Resets `*lp` to default launch parameters: unit grid and group extents, no
// dynamic group memory, the queue-default barrier bit, a launch fence of -1,
// the default accelerator view, and no completion future.
//
// NOTE(review): grid_launch.h declares `init_grid_launch`, whereas this
// definition is named `grid_launch_init` - confirm which name callers expect.
extern inline void grid_launch_init(grid_launch_parm *lp) {
    lp->grid_dim.z = 1;
    lp->grid_dim.y = 1;
    lp->grid_dim.x = 1;

    lp->group_dim.z = 1;
    lp->group_dim.y = 1;
    lp->group_dim.x = 1;

    lp->dynamic_group_mem_bytes = 0;
    lp->barrier_bit = barrier_bit_queue_default;
    lp->launch_fence = -1;

    // TODO - set to NULL?
    // The view is a function-local static: it is created on the first call
    // and every initialised parameter block points at the same object.
    static hc::accelerator_view default_view = hc::accelerator().get_default_view();
    lp->av = &default_view;

    lp->cf = NULL;
}
@@ -0,0 +1,26 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#if GENERIC_GRID_LAUNCH == 1
#include "macro_based_grid_launch.hpp"
#endif // GENERIC_GRID_LAUNCH
@@ -0,0 +1,137 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include "concepts.hpp"
#include <functional>   // For std::reference_wrapper.
#include <type_traits>  // For std::conditional, std::decay, std::enable_if,
                        // std::false_type, std::result_of and std::true_type.
#include <utility>      // For std::declval.
#ifdef __has_include // Check if __has_include is present
# if __has_include(<version>) // Check for version header
# include <version>
# if defined(__cpp_lib_is_invocable) && !defined(HIP_HAS_INVOCABLE)
# define HIP_HAS_INVOCABLE __cpp_lib_is_invocable
# endif
# if defined(__cpp_lib_result_of_sfinae) && !defined(HIP_HAS_RESULT_OF_SFINAE)
# define HIP_HAS_RESULT_OF_SFINAE __cpp_lib_result_of_sfinae
# endif
# endif
#endif
#ifndef HIP_HAS_INVOCABLE
#define HIP_HAS_INVOCABLE 0
#endif
#ifndef HIP_HAS_RESULT_OF_SFINAE
#define HIP_HAS_RESULT_OF_SFINAE 0
#endif
// Backports of the C++14 `_t` alias templates, injected into namespace std
// when compiling with a pre-C++14 language level.
// NOTE(review): adding declarations to namespace std is not sanctioned by the
// C++ standard; the TODO below already asks for their removal.
// NOTE(review): the outer `< 201406L` guard is implied by the inner
// `< 201402L` guard, so only the inner condition is effective.
namespace std { // TODO: these should be removed as soon as possible.
#if (__cplusplus < 201406L)
#if (__cplusplus < 201402L)
// Shorthand for `typename enable_if<cond, T>::type`.
template <bool cond, typename T = void>
using enable_if_t = typename enable_if<cond, T>::type;
// Shorthand for `typename conditional<cond, T, U>::type`.
template <bool cond, typename T, typename U>
using conditional_t = typename conditional<cond, T, U>::type;
// Shorthand for `typename decay<T>::type`.
template <typename T>
using decay_t = typename decay<T>::type;
// Shorthand for `typename result_of<F(Ts...)>::type`. Note that this takes
// the callable and argument types separately, unlike the standard
// std::result_of_t, which takes a single `F(Ts...)` type.
template <FunctionalProcedure F, typename... Ts>
using result_of_t = typename result_of<F(Ts...)>::type;
// Shorthand for `typename remove_reference<T>::type`.
template <typename T>
using remove_reference_t = typename remove_reference<T>::type;
#endif
#endif
} // namespace std
namespace hip_impl {
// Maps any pack of types to `void`; used as the SFINAE hook in the
// is_callable_impl partial specializations.
//
// The mapping is routed through a helper class template instead of a bare
// `using void_t_ = void;` alias: before the resolution of CWG issue 1558,
// compilers were allowed to ignore unused alias-template arguments, in which
// case an ill-formed argument would not cause a substitution failure.
// Naming `make_void_<Ts...>::type` forces every argument to be substituted.
template <typename...>
struct make_void_ {
    using type = void;
};

template <typename... Ts>
using void_t_ = typename make_void_<Ts...>::type;
// is_callable_impl<F(Ts...)> derives from std::true_type when an F can be
// invoked with arguments of types Ts..., and from std::false_type otherwise.
// Any instantiation that is not of the form F(Ts...) uses the primary
// template, which is std::false_type in all three branches below.
//
// The implementation is selected by the feature macros defined earlier in
// this header:
//   HIP_HAS_INVOCABLE        - delegate to std::is_invocable.
//   HIP_HAS_RESULT_OF_SFINAE - detect via a SFINAE-friendly std::result_of.
//   otherwise                - detect via the declaration-only simple_invoke
//                              overload set below.
#if HIP_HAS_INVOCABLE
// The primary template is defined (as false) rather than only declared, so
// that a query with a non-F(Ts...) type gives `false`, as it does in the
// other branches, instead of an incomplete-type error.
template <typename, typename = void>
struct is_callable_impl : std::false_type {};

template <FunctionalProcedure F, typename... Ts>
struct is_callable_impl<F(Ts...)> : std::is_invocable<F, Ts...> {};
#elif HIP_HAS_RESULT_OF_SFINAE
template <typename, typename = void>
struct is_callable_impl : std::false_type {};

// Chosen only when std::result_of<F(Ts...)>::type names a type.
template <FunctionalProcedure F, typename... Ts>
struct is_callable_impl<F(Ts...), void_t_<typename std::result_of<F(Ts...)>::type > > : std::true_type {};
#else
// simple_invoke: declaration-only overload set (never defined, only used in
// unevaluated decltype contexts). Each overload is viable exactly when the
// expression in its trailing return type is well-formed.

// Pointer to data member applied to an object/reference.
template <class Base, class T, class Derived>
auto simple_invoke(T Base::*pmd, Derived&& ref)
    -> decltype(static_cast<Derived&&>(ref).*pmd);

// Pointer to data member applied through a dereferenceable pointer-like.
template <class PMD, class Pointer>
auto simple_invoke(PMD&& pmd, Pointer&& ptr)
    -> decltype((*static_cast<Pointer&&>(ptr)).*static_cast<PMD&&>(pmd));

// Pointer to data member applied to a std::reference_wrapper.
template <class Base, class T, class Derived>
auto simple_invoke(T Base::*pmd, const std::reference_wrapper<Derived>& ref)
    -> decltype(ref.get().*pmd);

// Pointer to member function called on an object/reference.
template <class Base, class T, class Derived, class... Args>
auto simple_invoke(T Base::*pmf, Derived&& ref, Args&&... args)
    -> decltype((static_cast<Derived&&>(ref).*pmf)(static_cast<Args&&>(args)...));

// Pointer to member function called through a dereferenceable pointer-like.
template <class PMF, class Pointer, class... Args>
auto simple_invoke(PMF&& pmf, Pointer&& ptr, Args&&... args)
    -> decltype(((*static_cast<Pointer&&>(ptr)).*static_cast<PMF&&>(pmf))(static_cast<Args&&>(args)...));

// Pointer to member function called on a std::reference_wrapper.
template <class Base, class T, class Derived, class... Args>
auto simple_invoke(T Base::*pmf, const std::reference_wrapper<Derived>& ref, Args&&... args)
    -> decltype((ref.get().*pmf)(static_cast<Args&&>(args)...));

// Ordinary callable invoked with function-call syntax.
template<class F, class... Ts>
auto simple_invoke(F&& f, Ts&&... xs)
    -> decltype(f(static_cast<Ts&&>(xs)...));

template <typename, typename = void>
struct is_callable_impl : std::false_type {};

// Chosen only when some simple_invoke overload accepts (F, Ts...).
template <FunctionalProcedure F, typename... Ts>
struct is_callable_impl<F(Ts...), void_t_<decltype(simple_invoke(std::declval<F>(), std::declval<Ts>()...))> >
    : std::true_type {};
#endif

// Public trait: is_callable<F(Ts...)>::value tells whether F is callable
// with arguments of types Ts... .
template <typename Call>
struct is_callable : is_callable_impl<Call> {};
// Preprocessor helpers for dispatching a macro on its argument count.
//
// count_macro_args_impl_hip_ selects its 33rd argument (`_n`); the 32
// placeholders `_0` .. `_31` in front of it absorb the leading empty argument
// and any caller-supplied arguments.
#define count_macro_args_impl_hip_(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, \
_14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, \
_26, _27, _28, _29, _30, _31, _n, ...) \
_n
// Expands to the number of arguments passed (0 to 31): the arguments push the
// descending literal list 31..0 to the right, so the literal that lands in
// the `_n` slot equals the argument count. The `, ##__VA_ARGS__` form relies
// on the GNU extension that drops the preceding comma when no arguments are
// given.
#define count_macro_args_hip_(...) \
count_macro_args_impl_hip_(, ##__VA_ARGS__, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, \
19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, \
0)
// Token-pastes `macro` and `arg_cnt` into a single identifier.
#define overloaded_macro_expand_hip_(macro, arg_cnt) macro##arg_cnt
// Extra indirection so `arg_cnt` is fully macro-expanded before pasting.
#define overload_macro_impl_hip_(macro, arg_cnt) overloaded_macro_expand_hip_(macro, arg_cnt)
// Invokes `macro<N>(args...)`, where N is the number of arguments supplied;
// for example, two arguments dispatch to `macro2(a, b)`.
#define overload_macro_hip_(macro, ...) \
overload_macro_impl_hip_(macro, count_macro_args_hip_(__VA_ARGS__))(__VA_ARGS__)
} // namespace hip_impl
@@ -0,0 +1,222 @@
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* @file amd_detail/hip_cooperative_groups_helper.h
*
* @brief Device side implementation of cooperative group feature.
*
* Defines helper constructs and APIs which aid the types and device API
* wrappers defined within `amd_detail/hip_cooperative_groups.h`.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
#if __cplusplus
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_device_functions.h>
#endif
#if !defined(__align__)
#define __align__(x) __attribute__((aligned(x)))
#endif
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
#pragma clang diagnostic ignored "-Wc++98-compat"
#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
#pragma clang diagnostic ignored "-Wshorten-64-to-32"
// Qualifier macros for the cooperative-groups helpers. Each one is defined
// only if it has not already been defined.

// Qualifiers for non-static device functions.
#if !defined(__CG_QUALIFIER__)
#define __CG_QUALIFIER__ __device__ __forceinline__
#endif
// Qualifiers for static device functions (used by the helpers in the
// `internal` namespaces of this header).
#if !defined(__CG_STATIC_QUALIFIER__)
#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
#endif
// Specifier sequence for static compile-time constants.
#if !defined(_CG_STATIC_CONST_DECL_)
#define _CG_STATIC_CONST_DECL_ static constexpr
#endif
// Integer type used for per-lane bit masks: `unsigned int` when
// __AMDGCN_WAVEFRONT_SIZE is 32, `unsigned long long int` otherwise
// (including when the macro is undefined, since it then evaluates to 0 here).
#if __AMDGCN_WAVEFRONT_SIZE == 32
using lane_mask = unsigned int;
#else
using lane_mask = unsigned long long int;
#endif
namespace cooperative_groups {
/* Global scope */
// Compile-time predicate: `value` is true exactly when `size` is a power of
// two (1, 2, 4, ...).
//
// Zero is rejected explicitly: `0 & (0 - 1)` is also 0, so the bit trick on
// its own would classify 0 as a power of two and, through is_valid_tile_size,
// let a zero tile size pass validation.
template <unsigned int size>
using is_power_of_2 =
    std::integral_constant<bool, (size != 0) && ((size & (size - 1)) == 0)>;
// True when `size` does not exceed the wavefront size the code is being
// compiled for (__AMDGCN_WAVEFRONT_SIZE).
template <unsigned int size>
using is_valid_wavefront = std::integral_constant<bool, (size <= __AMDGCN_WAVEFRONT_SIZE)>;
// True when `size` satisfies both is_power_of_2 and is_valid_wavefront.
template <unsigned int size>
using is_valid_tile_size =
std::integral_constant<bool, is_power_of_2<size>::value && is_valid_wavefront<size>::value>;
// True for integral and floating-point types (that is, arithmetic types),
// false for everything else.
template <typename T>
using is_valid_type = std::integral_constant<bool, std::is_arithmetic<T>::value>;
namespace internal {
/** \brief Identifies the kind of a cooperative group.
 *
 * The enumerators keep their implicit values 0 through 5, spelled out here.
 */
typedef enum {
    cg_invalid = 0,
    cg_multi_grid = 1,
    cg_grid = 2,
    cg_workgroup = 3,
    cg_tiled_group = 4,
    cg_coalesced_group = 5
} group_type;
/**
 * Helpers for the multi-grid cooperative group type.
 *
 * Each function is a thin wrapper that forwards to the matching
 * `__ockl_multi_grid_*` routine and converts the result to the return type.
 */
namespace multi_grid {

__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
    const auto grids = __ockl_multi_grid_num_grids();
    return static_cast<uint32_t>(grids);
}

__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
    const auto rank = __ockl_multi_grid_grid_rank();
    return static_cast<uint32_t>(rank);
}

__CG_STATIC_QUALIFIER__ uint32_t size() {
    const auto threads = __ockl_multi_grid_size();
    return static_cast<uint32_t>(threads);
}

__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
    const auto rank = __ockl_multi_grid_thread_rank();
    return static_cast<uint32_t>(rank);
}

__CG_STATIC_QUALIFIER__ bool is_valid() {
    const auto valid = __ockl_multi_grid_is_valid();
    return static_cast<bool>(valid);
}

__CG_STATIC_QUALIFIER__ void sync() {
    __ockl_multi_grid_sync();
}

}  // namespace multi_grid
/**
 * Helpers for the grid cooperative group type.
 */
namespace grid {

// Number of threads in the grid: the product over all three dimensions of
// (block extent * grid extent), converted to uint32_t.
__CG_STATIC_QUALIFIER__ uint32_t size() {
    const auto threads_z = blockDim.z * gridDim.z;
    const auto threads_y = blockDim.y * gridDim.y;
    const auto threads_x = blockDim.x * gridDim.x;
    return static_cast<uint32_t>(threads_z * threads_y * threads_x);
}

// Linear rank of the calling thread within the grid: all threads of the
// workgroups that precede this one, plus the thread's rank inside its own
// workgroup.
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
    // Linear index of the workgroup that contains the current thread.
    const uint32_t workgroup_id = static_cast<uint32_t>(
        (blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x) + (blockIdx.x));

    // Threads contained in all workgroups that come before this one.
    const uint32_t threads_before = static_cast<uint32_t>(
        workgroup_id * (blockDim.x * blockDim.y * blockDim.z));

    // Linear index of the thread inside its own workgroup.
    const uint32_t rank_in_workgroup = static_cast<uint32_t>(
        (threadIdx.z * blockDim.y * blockDim.x) + (threadIdx.y * blockDim.x) + (threadIdx.x));

    return threads_before + rank_in_workgroup;
}

__CG_STATIC_QUALIFIER__ bool is_valid() {
    const auto valid = __ockl_grid_is_valid();
    return static_cast<bool>(valid);
}

__CG_STATIC_QUALIFIER__ void sync() {
    __ockl_grid_sync();
}

}  // namespace grid
/**
 * Helpers for the `workgroup` cooperative group type (thread_block in CUDA
 * terminology).
 */
namespace workgroup {

// blockIdx as a dim3 of uint32_t components.
__CG_STATIC_QUALIFIER__ dim3 group_index() {
    const uint32_t gx = static_cast<uint32_t>(blockIdx.x);
    const uint32_t gy = static_cast<uint32_t>(blockIdx.y);
    const uint32_t gz = static_cast<uint32_t>(blockIdx.z);
    return dim3(gx, gy, gz);
}

// threadIdx as a dim3 of uint32_t components.
__CG_STATIC_QUALIFIER__ dim3 thread_index() {
    const uint32_t tx = static_cast<uint32_t>(threadIdx.x);
    const uint32_t ty = static_cast<uint32_t>(threadIdx.y);
    const uint32_t tz = static_cast<uint32_t>(threadIdx.z);
    return dim3(tx, ty, tz);
}

// Number of threads in the workgroup.
__CG_STATIC_QUALIFIER__ uint32_t size() {
    return static_cast<uint32_t>(blockDim.x * blockDim.y * blockDim.z);
}

// Linear rank of the calling thread within its workgroup (x varies fastest).
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
    return static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
                                 (threadIdx.y * blockDim.x) + (threadIdx.x));
}

// Always reports the group as valid.
__CG_STATIC_QUALIFIER__ bool is_valid() {
    // TODO(mahesha) any functionality need to be added here? I believe not
    return true;
}

__CG_STATIC_QUALIFIER__ void sync() {
    __syncthreads();
}

}  // namespace workgroup
// Helpers for the tiled-group cooperative group type.
namespace tiled_group {
// Enforces ordering of memory instructions by issuing an acquire-release
// fence with the "agent" scope string.
__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }
} // namespace tiled_group
// Helpers for the coalesced-group cooperative group type.
namespace coalesced_group {

// Enforces ordering of memory instructions by issuing an acquire-release
// fence with the "agent" scope string.
__CG_STATIC_QUALIFIER__ void sync() {
    __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent");
}

// Masked bit count.
//
// For each thread, returns the number of bits set in `x` that belong to
// lanes which come before the current thread, added to `add`.
// With a 32-wide wavefront a single mbcnt_lo covers the whole mask; otherwise
// the low half goes through mbcnt_lo and its result is fed as the addend to
// mbcnt_hi for the upper 32 bits.
__CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
#if __AMDGCN_WAVEFRONT_SIZE == 32
    return __builtin_amdgcn_mbcnt_lo(x, add);
#else
    const unsigned int low_count = __builtin_amdgcn_mbcnt_lo(static_cast<lane_mask>(x), add);
    return __builtin_amdgcn_mbcnt_hi(static_cast<lane_mask>(x >> 32), low_count);
#endif
}

}  // namespace coalesced_group
} // namespace internal
} // namespace cooperative_groups
#pragma clang diagnostic pop
#endif // __cplusplus
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
@@ -0,0 +1,254 @@
#pragma once
#if defined(__cplusplus)
#include <cstring>
#endif
// Plain aggregate holding the 16-bit storage of a single half value.
struct __half_raw {
unsigned short x;
};
// Plain aggregate holding the 16-bit storage of a pair of half values
// (`x` first, `y` second).
struct __half2_raw {
unsigned short x;
unsigned short y;
};
#if defined(__cplusplus)
struct __half;
__half __float2half(float);
float __half2float(__half);
// BEGIN STRUCT __HALF
// Host-side half-precision value stored as 16 raw bits. Conversions from and
// to float go through __float2half / __half2float.
struct __half {
protected:
    unsigned short __x;  // raw 16-bit pattern

public:
    // CREATORS
    __half() = default;

    // Adopts the bit pattern stored in `x`.
    __half(const __half_raw& x) : __x(x.x) {}

#if !defined(__HIP_NO_HALF_CONVERSIONS__)
    // Converting constructors; a double is first narrowed to float.
    __half(float x) : __x(__float2half(x).__x) {}
    __half(double x) : __x(__float2half(static_cast<float>(x)).__x) {}
#endif

    __half(const __half&) = default;
    __half(__half&&) = default;
    ~__half() = default;

    // MANIPULATORS
    __half& operator=(const __half&) = default;
    __half& operator=(__half&&) = default;

    // Adopts the bit pattern stored in `x`.
    __half& operator=(const __half_raw& x) {
        __x = x.x;
        return *this;
    }

#if !defined(__HIP_NO_HALF_CONVERSIONS__)
    __half& operator=(float x) {
        const __half converted = __float2half(x);
        __x = converted.__x;
        return *this;
    }

    // Narrows to float, then reuses the float assignment.
    __half& operator=(double x) {
        const float narrowed = static_cast<float>(x);
        return operator=(narrowed);
    }
#endif

    // ACCESSORS
    operator float() const { return __half2float(*this); }

    // Exposes the raw bit pattern.
    operator __half_raw() const {
        __half_raw raw{__x};
        return raw;
    }
};
// END STRUCT __HALF
// BEGIN STRUCT __HALF2
struct __half2 {
public:
__half x;
__half y;
// CREATORS
__half2() = default;
__half2(const __half2_raw& ix)
:
x{reinterpret_cast<const __half&>(ix.x)},
y{reinterpret_cast<const __half&>(ix.y)}
{}
__half2(const __half& ix, const __half& iy) : x{ix}, y{iy} {}
__half2(const __half2&) = default;
__half2(__half2&&) = default;
~__half2() = default;
// MANIPULATORS
__half2& operator=(const __half2&) = default;
__half2& operator=(__half2&&) = default;
__half2& operator=(const __half2_raw& ix)
{
x = reinterpret_cast<const __half_raw&>(ix.x);
y = reinterpret_cast<const __half_raw&>(ix.y);
return *this;
}
// ACCESSORS
operator __half2_raw() const
{
return __half2_raw{
reinterpret_cast<const unsigned short&>(x),
reinterpret_cast<const unsigned short&>(y)};
}
};
// END STRUCT __HALF2
// Converts a float to the bit pattern of a half, truncating the mantissa.
//
// Outputs besides the return value:
//   sgn - the sign of `flt`, already positioned at the half sign bit (0x8000
//         or 0).
//   rem - the discarded part of the value, left-aligned in 32 bits, so that
//         0x80000000 corresponds to exactly half a unit in the last place.
//         Callers use it to apply their rounding mode.
inline unsigned short __internal_float2half(float flt, unsigned int& sgn, unsigned int& rem) {
    unsigned int bits = 0;
    std::memcpy(&bits, &flt, sizeof(flt));

    unsigned int magnitude = bits & 0x7fffffffU;
    sgn = (bits >> 16) & 0x8000U;

    // NaN/+Inf/-Inf
    if (magnitude >= 0x7f800000U) {
        rem = 0;
        if (magnitude == 0x7f800000U) {
            return static_cast<unsigned short>(sgn | 0x7c00U);  // signed infinity
        }
        return static_cast<unsigned short>(0x7fffU);  // NaN, sign dropped
    }

    // Overflows: clamp to the largest finite half and flag an exact tie.
    if (magnitude > 0x477fefffU) {
        rem = 0x80000000U;
        return static_cast<unsigned short>(sgn | 0x7bffU);
    }

    // Normal numbers: rebias the exponent and drop 13 mantissa bits.
    if (magnitude >= 0x38800000U) {
        rem = magnitude << 19;
        magnitude -= 0x38000000U;
        return static_cast<unsigned short>(sgn | (magnitude >> 13));
    }

    // +0/-0 (and magnitudes too small to reach the smallest half denormal)
    if (magnitude < 0x33000001U) {
        rem = magnitude;
        return static_cast<unsigned short>(sgn);
    }

    // Denormal numbers: shift the mantissa (with its implicit leading one)
    // right by an amount that depends on the exponent.
    const unsigned int biased_exponent = magnitude >> 23;
    const unsigned int shift = 0x7eU - biased_exponent;
    const unsigned int significand = (magnitude & 0x7fffffU) | 0x800000U;
    rem = significand << (32 - shift);
    return static_cast<unsigned short>(sgn | (significand >> shift));
}
// float -> half, rounding to nearest with ties going to the even bit
// pattern: the truncated result is bumped when the discarded remainder is
// above one half, or exactly one half while the result's lowest bit is set.
inline __half __float2half(float x) {
    unsigned int sign_bits = 0;
    unsigned int remainder = 0;

    __half_raw raw;
    raw.x = __internal_float2half(x, sign_bits, remainder);

    const bool above_half = remainder > 0x80000000U;
    const bool odd_tie = (remainder == 0x80000000U) && ((raw.x & 0x1) != 0);
    if (above_half || odd_tie) {
        ++raw.x;
    }
    return raw;
}
// Round-to-nearest float -> half conversion; simply forwards to __float2half.
inline
__half __float2half_rn(float x) { return __float2half(x); }
// float -> half, rounding toward zero: the truncated result from
// __internal_float2half is returned as is, and the remainder is ignored.
inline __half __float2half_rz(float x) {
    unsigned int sign_bits = 0;
    unsigned int remainder = 0;

    __half_raw raw;
    raw.x = __internal_float2half(x, sign_bits, remainder);
    return raw;
}
// float -> half, rounding down: the truncated result is bumped by one bit
// pattern only for negative inputs that left a non-zero remainder (which
// increases their magnitude).
inline __half __float2half_rd(float x) {
    unsigned int sign_bits = 0;
    unsigned int remainder = 0;

    __half_raw raw;
    raw.x = __internal_float2half(x, sign_bits, remainder);

    const bool inexact = remainder != 0;
    const bool negative = sign_bits != 0;
    if (inexact && negative) {
        ++raw.x;
    }
    return raw;
}
// float -> half, rounding up: the truncated result is bumped by one bit
// pattern only for non-negative inputs that left a non-zero remainder.
inline __half __float2half_ru(float x) {
    unsigned int sign_bits = 0;
    unsigned int remainder = 0;

    __half_raw raw;
    raw.x = __internal_float2half(x, sign_bits, remainder);

    const bool inexact = remainder != 0;
    const bool negative = sign_bits != 0;
    if (inexact && !negative) {
        ++raw.x;
    }
    return raw;
}
// Converts `x` with round-to-nearest and places the result in both halves.
inline __half2 __float2half2_rn(float x) {
    const __half converted = __float2half_rn(x);
    return __half2{converted, converted};
}
// Converts `x` and `y` with round-to-nearest into the first and second half
// of the result, respectively.
inline __half2 __floats2half2_rn(float x, float y) {
    const __half first = __float2half_rn(x);
    const __half second = __float2half_rn(y);
    return __half2{first, second};
}
// Expands the bit pattern of a half (`x`) into the equivalent float.
//
// - Infinities keep their sign.
// - Every NaN input becomes one positive NaN pattern (all mantissa bits set).
// - Half denormals are renormalised, since they are normal numbers in float.
//
// The final bit copy uses std::memcpy: <cstring> only guarantees the name in
// namespace std, and this matches the float -> half direction.
inline float __internal_half2float(unsigned short x) {
    static_assert(sizeof(float) == sizeof(unsigned int),
                  "float and unsigned int must have the same size");

    unsigned int sign = ((x >> 15) & 1);
    unsigned int exponent = ((x >> 10) & 0x1f);
    unsigned int mantissa = ((x & 0x3ff) << 13);

    if (exponent == 0x1fU) { /* NaN or Inf */
        if (mantissa) {
            // NaN: drop the sign and saturate the mantissa.
            sign = 0;
            mantissa = 0x7fffffU;
        }
        exponent = 0xffU;
    } else if (!exponent) { /* Denorm or Zero */
        if (mantissa) {
            unsigned int msb;
            exponent = 0x71U;
            do {
                msb = (mantissa & 0x400000U);
                mantissa <<= 1; /* normalize */
                --exponent;
            } while (!msb);
            mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
        }
    } else {
        exponent += 0x70U; /* rebias: 127 - 15 */
    }

    const unsigned int bits = ((sign << 31) | (exponent << 23) | mantissa);
    float result;
    std::memcpy(&result, &bits, sizeof(bits));
    return result;
}
// half -> float: extracts the raw bit pattern and expands it.
inline float __half2float(__half x) {
    const __half_raw raw = static_cast<__half_raw>(x);
    return __internal_half2float(raw.x);
}
// Converts the `x` half of a __half2 to float.
inline float __low2float(__half2 x) {
    const __half2_raw raw = static_cast<__half2_raw>(x);
    return __internal_half2float(raw.x);
}
// Converts the `y` half of a __half2 to float.
inline float __high2float(__half2 x) {
    const __half2_raw raw = static_cast<__half2_raw>(x);
    return __internal_half2float(raw.y);
}
// Unprefixed convenience aliases; define HIP_NO_HALF to suppress them.
#if !defined(HIP_NO_HALF)
using half = __half;
using half2 = __half2;
#endif
#endif // defined(__cplusplus)
@@ -0,0 +1,96 @@
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
// /*
// Half Math Functions
// */
#if !defined(__HIPCC_RTC__)
#include "host_defines.h"
#endif
#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
extern "C"
{
__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
__device__ _Float16 __ocml_cos_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
__device__ __attribute__((const))
_Float16 __ocml_fma_f16(_Float16, _Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);
__device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
__device__ _Float16 __ocml_sin_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
typedef short __2i16 __attribute__((ext_vector_type(2)));
#if defined(__clang__) && defined(__HIP__)
__device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b, float c, bool s);
#endif
__device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
__device__ __2f16 __ocml_cos_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
__device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
__device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
__device__ __2f16 __ocml_sin_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
}
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
// TODO: remove these once they are available in the clang header
// __clang_hip_libdevice_declares.h.
// These declarations are made unconditionally, i.e. also when
// __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ is set and the guarded block above
// is skipped.
extern "C" {
__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
}
@@ -0,0 +1,100 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_LDG_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_LDG_H
#if __HIP_CLANG_ONLY__
#include "amd_hip_vector_types.h"
#include "host_defines.h"
// __ldg overload set: every overload simply returns, by value, the object its pointer
// argument refers to (a plain dereference; no special load instruction is requested here).
// Character types.
__device__ static inline char __ldg(const char* addr) { return *addr; }
__device__ static inline char2 __ldg(const char2* addr) { return *addr; }
__device__ static inline char4 __ldg(const char4* addr) { return *addr; }
__device__ static inline signed char __ldg(const signed char* addr) { return *addr; }
__device__ static inline unsigned char __ldg(const unsigned char* addr) { return *addr; }
// 16-bit and 32-bit integer types.
__device__ static inline short __ldg(const short* addr) { return *addr; }
__device__ static inline short2 __ldg(const short2* addr) { return *addr; }
__device__ static inline short4 __ldg(const short4* addr) { return *addr; }
__device__ static inline unsigned short __ldg(const unsigned short* addr) { return *addr; }
__device__ static inline int __ldg(const int* addr) { return *addr; }
__device__ static inline int2 __ldg(const int2* addr) { return *addr; }
__device__ static inline int4 __ldg(const int4* addr) { return *addr; }
__device__ static inline unsigned int __ldg(const unsigned int* addr) { return *addr; }
// long / long long types.
__device__ static inline long __ldg(const long* addr) { return *addr; }
__device__ static inline unsigned long __ldg(const unsigned long* addr) { return *addr; }
__device__ static inline long long __ldg(const long long* addr) { return *addr; }
__device__ static inline longlong2 __ldg(const longlong2* addr) { return *addr; }
__device__ static inline unsigned long long __ldg(const unsigned long long* addr) { return *addr; }
// Unsigned vector types.
__device__ static inline uchar2 __ldg(const uchar2* addr) { return *addr; }
__device__ static inline uchar4 __ldg(const uchar4* addr) { return *addr; }
__device__ static inline ushort2 __ldg(const ushort2* addr) { return *addr; }
__device__ static inline uint2 __ldg(const uint2* addr) { return *addr; }
__device__ static inline uint4 __ldg(const uint4* addr) { return *addr; }
__device__ static inline ulonglong2 __ldg(const ulonglong2* addr) { return *addr; }
// Floating-point types.
__device__ static inline float __ldg(const float* addr) { return *addr; }
__device__ static inline float2 __ldg(const float2* addr) { return *addr; }
__device__ static inline float4 __ldg(const float4* addr) { return *addr; }
__device__ static inline double __ldg(const double* addr) { return *addr; }
__device__ static inline double2 __ldg(const double2* addr) { return *addr; }
#endif // __HIP_CLANG_ONLY__
#endif // HIP_LDG_H
文件差异内容过多而无法显示 加载差异
@@ -0,0 +1,77 @@
/*
Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_PROF_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_PROF_H
// HIP ROCclr operation IDs.
// The enumerators rely on implicit numbering starting at zero; kHipVdiOpIdNumber comes
// last and therefore equals the number of operation IDs that precede it.
enum HipVdiOpId {
  kHipVdiOpIdDispatch,  // 0
  kHipVdiOpIdCopy,      // 1
  kHipVdiOpIdBarrier,   // 2
  kHipVdiOpIdNumber     // 3
};
// Types of ROCclr commands.
// These are the command-kind ids that hipGetCmdName() accepts.
// NOTE(review): the numeric values look like OpenCL cl_command_type codes — confirm.
enum HipVdiCommandKind {
  kHipVdiCommandKernel = 0x11F0,
  kHipVdiMemcpyDeviceToHost = 0x11F3,
  kHipVdiMemcpyHostToDevice = 0x11F4,
  kHipVdiMemcpyDeviceToDevice = 0x11F5,
  kHipVdiMemcpyDeviceToHostRect = 0x1201,
  kHipVdiMemcpyHostToDeviceRect = 0x1202,
  kHipVdiMemcpyDeviceToDeviceRect = 0x1203,
  kHipVdiFillMemory = 0x1207,
  // Historical misspellings ("kHipHipVdi", "kHipVid"), kept as aliases with the same
  // values so that existing code using them keeps compiling.
  kHipHipVdiMemcpyHostToDevice = kHipVdiMemcpyHostToDevice,
  kHipVidMemcpyDeviceToHostRect = kHipVdiMemcpyDeviceToHostRect,
};
/**
* @brief Initializes activity callback
*
* Both callbacks are passed as untyped pointers; the signatures they are expected to have
* are not visible in this header.
*
* @param [in] id_callback Event ID callback function
* @param [in] op_callback Event operation callback function
* @param [in] arg Arguments passed into callback
*
* @returns None
*/
void hipInitActivityCallback(void* id_callback, void* op_callback, void* arg);
/**
* @brief Enables or disables the activity callback for one operation
*
* @param [in] op Operation, which will trigger a callback (@see HipVdiOpId)
* @param [in] enable Enable state for the callback
*
* @returns True if successful
*/
bool hipEnableActivityCallback(uint32_t op, bool enable);
/**
* @brief Returns the description string for the operation kind
*
* @param [in] id Command kind id (@see HipVdiCommandKind)
*
* @returns A pointer to a const string with the command description
*/
const char* hipGetCmdName(uint32_t id);
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_PROF_H
@@ -0,0 +1,184 @@
/*
Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
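/**
* @file amd_detail/host_defines.h
* @brief Function/memory-space qualifier macros (__host__, __device__, __global__, ...)
*        and a small set of type-trait helpers (namespace __hip_internal) for HIP headers.
*/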
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HOST_DEFINES_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HOST_DEFINES_H
// The following macro should be removed once upstream has been updated.
// It is defined here as a workaround for a rocThrust build failure.
#define HIP_INCLUDE_HIP_HCC_DETAIL_HOST_DEFINES_H
// Add guard to Generic Grid Launch method
#ifndef GENERIC_GRID_LAUNCH
#define GENERIC_GRID_LAUNCH 1
#endif
#if defined(__clang__) && defined(__HIP__)
// Self-contained stand-ins for a few <cstdint>, <type_traits> and <iosfwd> facilities.
// Everything here is declared locally rather than pulled from the standard headers
// (presumably so the HIP headers can be used without them — confirm).
namespace __hip_internal {
// Fixed-width integer names. The mapping assumes the usual data model in which
// char/short/int/long long are 8/16/32/64 bits wide.
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;
typedef signed long long int64_t;

// Compile-time constant wrapper: carries `__v` of type `_Tp` as `value`.
template <class _Tp, _Tp __v> struct integral_constant {
  static constexpr const _Tp value = __v;
  typedef _Tp value_type;
  typedef integral_constant type;
  constexpr operator value_type() const { return value; }
  constexpr value_type operator()() const { return value; }
};
// Out-of-class definition so that `value` can be odr-used before C++17.
template <class _Tp, _Tp __v> constexpr const _Tp integral_constant<_Tp, __v>::value;

// Boolean constants. true_type/false_type are declared once, via bool_constant; the
// original declared the same two typedefs twice.
template <bool B> using bool_constant = integral_constant<bool, B>;
typedef bool_constant<true> true_type;
typedef bool_constant<false> false_type;

// SFINAE helper: `type` exists only when the condition is true.
template <bool __B, class __T = void> struct enable_if {};
template <class __T> struct enable_if<true, __T> { typedef __T type; };

// Maps a bool to false_type / true_type.
template<bool _B> struct true_or_false_type : public false_type {};
template<> struct true_or_false_type<true> : public true_type {};

// is_integral: true only for the exact (cv-unqualified) types listed below.
template <class _Tp> struct is_integral : public false_type {};
template <> struct is_integral<bool> : public true_type {};
template <> struct is_integral<char> : public true_type {};
template <> struct is_integral<signed char> : public true_type {};
template <> struct is_integral<unsigned char> : public true_type {};
template <> struct is_integral<wchar_t> : public true_type {};
template <> struct is_integral<short> : public true_type {};
template <> struct is_integral<unsigned short> : public true_type {};
template <> struct is_integral<int> : public true_type {};
template <> struct is_integral<unsigned int> : public true_type {};
template <> struct is_integral<long> : public true_type {};
template <> struct is_integral<unsigned long> : public true_type {};
template <> struct is_integral<long long> : public true_type {};
template <> struct is_integral<unsigned long long> : public true_type {};

// is_arithmetic: the integral types above plus every type is_floating_point accepts.
template <class _Tp> struct is_arithmetic : public false_type {};
template <> struct is_arithmetic<bool> : public true_type {};
template <> struct is_arithmetic<char> : public true_type {};
template <> struct is_arithmetic<signed char> : public true_type {};
template <> struct is_arithmetic<unsigned char> : public true_type {};
template <> struct is_arithmetic<wchar_t> : public true_type {};
template <> struct is_arithmetic<short> : public true_type {};
template <> struct is_arithmetic<unsigned short> : public true_type {};
template <> struct is_arithmetic<int> : public true_type {};
template <> struct is_arithmetic<unsigned int> : public true_type {};
template <> struct is_arithmetic<long> : public true_type {};
template <> struct is_arithmetic<unsigned long> : public true_type {};
template <> struct is_arithmetic<long long> : public true_type {};
template <> struct is_arithmetic<unsigned long long> : public true_type {};
template <> struct is_arithmetic<float> : public true_type {};
template <> struct is_arithmetic<double> : public true_type {};
// long double was missing although is_floating_point<long double> is true; without this
// specialization is_signed<long double> also evaluated to false.
template <> struct is_arithmetic<long double> : public true_type {};

template<typename _Tp> struct is_floating_point : public false_type {};
template<> struct is_floating_point<float> : public true_type {};
template<> struct is_floating_point<double> : public true_type {};
template<> struct is_floating_point<long double> : public true_type {};

template <typename __T, typename __U> struct is_same : public false_type {};
template <typename __T> struct is_same<__T, __T> : public true_type {};

// is_signed: false for non-arithmetic types; otherwise true exactly when -1 converted to
// _Tp compares less than 0 converted to _Tp.
template<typename _Tp, bool = is_arithmetic<_Tp>::value>
struct is_signed : public false_type {};
template<typename _Tp>
struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> {};

// Forward declarations only; no definitions of these stream templates are given here.
template<typename _CharT> struct char_traits;
template<typename _CharT, typename _Traits = char_traits<_CharT>> class basic_istream;
template<typename _CharT, typename _Traits = char_traits<_CharT>> class basic_ostream;
typedef basic_istream<char> istream;
typedef basic_ostream<char> ostream;

// Layout/triviality queries implemented with the compiler built-ins.
template<typename _Tp>
struct is_standard_layout
: public integral_constant<bool, __is_standard_layout(_Tp)>
{ };
template<typename _Tp>
struct is_trivial
: public integral_constant<bool, __is_trivial(_Tp)>
{ };
}
// Global-scope names for the fixed-width integer types declared in __hip_internal.
// Unsigned variants.
using __hip_uint8_t = __hip_internal::uint8_t;
using __hip_uint16_t = __hip_internal::uint16_t;
using __hip_uint32_t = __hip_internal::uint32_t;
using __hip_uint64_t = __hip_internal::uint64_t;
// Signed variants.
using __hip_int8_t = __hip_internal::int8_t;
using __hip_int16_t = __hip_internal::int16_t;
using __hip_int32_t = __hip_internal::int32_t;
using __hip_int64_t = __hip_internal::int64_t;
// Qualifier macros for the clang HIP compilation path.
// The five execution/memory-space qualifiers are defined here only when
// __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ is not set (presumably clang's own HIP runtime
// wrapper header provides them otherwise — confirm).
#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
#define __host__ __attribute__((host))
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
// Define __noinline__ as a macro unless the compiler reports the
// cuda_noinline_keyword feature.
#if !defined(__has_feature) || !__has_feature(cuda_noinline_keyword)
#define __noinline__ __attribute__((noinline))
#endif
#define __forceinline__ inline __attribute__((always_inline))
// __hip_img_chk__ expands to an `unavailable` attribute when __HIP_NO_IMAGE_SUPPORT is
// set, and to nothing otherwise.
#if __HIP_NO_IMAGE_SUPPORT
#define __hip_img_chk__ __attribute__((unavailable("The image/texture API not supported on the device")))
#else
#define __hip_img_chk__
#endif
#else
// Any compiler that is not clang compiling HIP (the `defined(__clang__) && defined(__HIP__)`
// test that opens this conditional failed).
/**
* Function and kernel markers.
* All qualifiers expand to nothing here, except __forceinline__, which becomes plain `inline`.
*/
#define __host__
#define __device__
#define __global__
#define __noinline__
#define __forceinline__ inline
#define __shared__
#define __constant__
#define __hip_img_chk__
#endif
#endif
@@ -0,0 +1,102 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <hsa/hsa.h>
#include <cstdint>
#include <functional>
#include <string>
namespace hip_impl {
// Queries HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS for symbol `x` and returns it.
// The status of the query is not checked; nullptr is returned if nothing is written.
inline void* address(hsa_executable_symbol_t x) {
    void* variable_address = nullptr;
    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
                                   &variable_address);
    return variable_address;
}
// Queries HSA_EXECUTABLE_SYMBOL_INFO_AGENT for symbol `x` and returns it.
// The status of the query is not checked; a value-initialized handle is returned if
// nothing is written.
inline hsa_agent_t agent(hsa_executable_symbol_t x) {
    hsa_agent_t owning_agent = {};
    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_AGENT, &owning_agent);
    return owning_agent;
}
// Queries HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE for symbol `x`.
// The status of the query is not checked; 0 is returned if nothing is written.
inline std::uint32_t group_size(hsa_executable_symbol_t x) {
    std::uint32_t group_segment_bytes = 0u;
    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
                                   &group_segment_bytes);
    return group_segment_bytes;
}
// Returns the first ISA that hsa_agent_iterate_isas reports for agent `x`, or a
// value-initialized hsa_isa_t if the callback is never invoked.
inline hsa_isa_t isa(hsa_agent_t x) {
    hsa_isa_t first_isa = {};
    // Captureless callback: store the visited ISA through the user-data pointer and
    // return HSA_STATUS_INFO_BREAK so that only the first one is picked.
    const auto take_first = [](hsa_isa_t candidate, void* out) {
        *static_cast<hsa_isa_t*>(out) = candidate;
        return HSA_STATUS_INFO_BREAK;
    };
    hsa_agent_iterate_isas(x, take_first, &first_isa);
    return first_isa;
}
// Queries HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT for symbol `x`.
// The status of the query is not checked; 0 is returned if nothing is written.
inline std::uint64_t kernel_object(hsa_executable_symbol_t x) {
    std::uint64_t object_handle = 0u;
    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
                                   &object_handle);
    return object_handle;
}
// Returns the name of symbol `x` as a std::string.
// The length is queried first (HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH), a string of that
// many characters is allocated, and the name is then written into its buffer
// (HSA_EXECUTABLE_SYMBOL_INFO_NAME). Query status codes are not checked.
// If the reported length is 0 (including the case where the length query writes nothing),
// an empty string is returned without touching its buffer: taking front() of an empty
// std::string, as the previous code did, is undefined behaviour.
inline std::string name(hsa_executable_symbol_t x) {
    std::uint32_t sz = 0u;
    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz);
    std::string r(sz, '\0');
    if (!r.empty()) {
        hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r[0]);
    }
    return r;
}
// Queries HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE for symbol `x`.
// The status of the query is not checked; 0 is returned if nothing is written.
inline std::uint32_t private_size(hsa_executable_symbol_t x) {
    std::uint32_t private_segment_bytes = 0u;
    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
                                   &private_segment_bytes);
    return private_segment_bytes;
}
// Queries HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE for symbol `x`.
// The status of the query is not checked; 0 is returned if nothing is written.
inline std::uint32_t size(hsa_executable_symbol_t x) {
    std::uint32_t variable_bytes = 0;
    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
                                   &variable_bytes);
    return variable_bytes;
}
// Queries HSA_EXECUTABLE_SYMBOL_INFO_TYPE for symbol `x` and returns the symbol kind.
// The status of the query is not checked; a value-initialized kind is returned if
// nothing is written.
inline hsa_symbol_kind_t type(hsa_executable_symbol_t x) {
    hsa_symbol_kind_t symbol_kind = {};
    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &symbol_kind);
    return symbol_kind;
}
} // namespace hip_impl
@@ -0,0 +1,798 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include "concepts.hpp"
#include "helpers.hpp"
#include "hc.hpp"
#include "hip/hip_ext.h"
#include "hip_runtime.h"
#include <functional>
#include <iostream>
#include <stdexcept>
#include <type_traits>
#include <utility>
namespace hip_impl {
namespace {
// Empty tag types used as the first argument of grid_launch_hip_impl_ to pick an overload.
// is_new_grid_launch_t chooses between them depending on whether the kernel is callable
// with the supplied argument types.
struct New_grid_launch_tag {};
struct Old_grid_launch_tag {};
// Scope guard: the two-argument constructor invokes `ctor` immediately and stores `dtor`;
// the destructor invokes the stored `dtor`.
// NOTE(review): copy and move operations are defaulted, so every copy (and every
// moved-from object) invokes the stored callable again on destruction — confirm callers
// never rely on copies.
template <typename Ctor, typename Dtor>
class RAII_guard {
public:
    RAII_guard() = default;
    RAII_guard(const RAII_guard&) = default;
    RAII_guard(RAII_guard&&) = default;

    // Runs the entry action right away and keeps the exit action for later.
    RAII_guard(const Ctor& ctor, Dtor dtor) : on_exit_{std::move(dtor)} {
        ctor();
    }

    RAII_guard& operator=(const RAII_guard&) = default;
    RAII_guard& operator=(RAII_guard&&) = default;

    // Runs the stored exit action.
    ~RAII_guard() {
        on_exit_();
    }

private:
    Dtor on_exit_;
};
// Factory that deduces the callable types and returns a RAII_guard built from them;
// `ctor` is invoked by the guard's constructor, `dtor` by its destructor.
template <typename C, typename D>
RAII_guard<C, D> make_RAII_guard(const C& ctor, D dtor) {
    using guard_type = RAII_guard<C, D>;
    return guard_type{ctor, std::move(dtor)};
}
// Tag selector: New_grid_launch_tag when is_callable<F(Ts...)>{} evaluates to true,
// Old_grid_launch_tag otherwise. FunctionalProcedure and is_callable are not defined in
// this part of the file (presumably they come from "concepts.hpp"/"helpers.hpp" — confirm).
template <FunctionalProcedure F, typename... Ts>
using is_new_grid_launch_t = typename std::conditional<is_callable<F(Ts...)>{}, New_grid_launch_tag,
Old_grid_launch_tag>::type;
} // namespace
// TODO: - dispatch rank should be derived from the domain dimensions passed
// in, and not always assumed to be 3;
// Launches `k` on the accelerator view `acc_v`.
// A 3-D hc::extent is built from num_blocks * dim_blocks per axis, listed in z, y, x order,
// and tiled by dim_blocks (same order) together with group_mem_bytes; the result is handed
// to hc::parallel_for_each. Any std::exception is reported on std::cerr and then passed to
// hip_throw.
template <FunctionalProcedure K, typename... Ts>
requires(Domain<K> ==
{Ts...}) inline void grid_launch_hip_impl_(New_grid_launch_tag, dim3 num_blocks,
dim3 dim_blocks, int group_mem_bytes,
const hc::accelerator_view& acc_v, K k) {
const auto d =
hc::extent<3>{num_blocks.z * dim_blocks.z, num_blocks.y * dim_blocks.y,
num_blocks.x * dim_blocks.x}
.tile_with_dynamic(dim_blocks.z, dim_blocks.y, dim_blocks.x, group_mem_bytes);
try {
hc::parallel_for_each(acc_v, d, k);
} catch (std::exception& ex) {
std::cerr << "Failed in " << __func__ << ", with exception: " << ex.what() << std::endl;
hip_throw(ex);
}
}
// TODO: these are workarounds, they should be removed.
// Declarations of helpers that are defined outside this header and used by the
// stream-based launch path:
//  - lock_stream_hip_ takes the stream and an opaque handle, both by reference, and
//    returns the hc::accelerator_view to launch on;
//  - print_prelaunch_trace_ receives the kernel name, the two dim3 launch dimensions, the
//    group memory size and the stream;
//  - unlock_stream_hip_ receives the stream, the opaque handle, the kernel name and a
//    pointer to the accelerator view.
hc::accelerator_view lock_stream_hip_(hipStream_t&, void*&);
void print_prelaunch_trace_(const char*, dim3, dim3, int, hipStream_t);
void unlock_stream_hip_(hipStream_t, void*, const char*, hc::accelerator_view*);
// Stream-based launch with a kernel name.
// lock_stream_hip_ yields the accelerator view for `stream`; a scope guard then calls
// print_prelaunch_trace_ immediately and unlock_stream_hip_ when this function exits
// (normally or by exception). The launch itself is delegated to the accelerator_view
// overload. Any std::exception is reported on std::cerr and then passed to hip_throw.
template <FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {Ts...}) inline void grid_launch_hip_impl_(New_grid_launch_tag,
dim3 num_blocks, dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream,
const char* kernel_name, K k) {
void* lck_stream = nullptr;
auto acc_v = lock_stream_hip_(stream, lck_stream);
// std::bind copies its arguments now, so the guard's exit action uses the values of
// stream and lck_stream as they are after lock_stream_hip_ returned.
auto stream_guard =
make_RAII_guard(std::bind(print_prelaunch_trace_, kernel_name, num_blocks, dim_blocks,
group_mem_bytes, stream),
std::bind(unlock_stream_hip_, stream, lck_stream, kernel_name, &acc_v));
try {
grid_launch_hip_impl_(New_grid_launch_tag{}, std::move(num_blocks), std::move(dim_blocks),
group_mem_bytes, acc_v, std::move(k));
} catch (std::exception& ex) {
std::cerr << "Failed in " << __func__ << ", with exception: " << ex.what() << std::endl;
hip_throw(ex);
}
}
// Old-style launch (kernel domain starts with hipLaunchParm) without a kernel name:
// forwards every argument to the New_grid_launch_tag implementation.
// NOTE(review): no New_grid_launch_tag overload taking (dim3, dim3, int, hipStream_t, K)
// is visible in this part of the file; the visible ones take an hc::accelerator_view or a
// hipStream_t plus kernel name — confirm which overload this call is meant to reach.
template <FunctionalProcedure K, typename... Ts>
requires(Domain<K> ==
{hipLaunchParm, Ts...}) inline void grid_launch_hip_impl_(Old_grid_launch_tag,
dim3 num_blocks, dim3 dim_blocks,
int group_mem_bytes,
hipStream_t stream, K k) {
grid_launch_hip_impl_(New_grid_launch_tag{}, std::move(num_blocks), std::move(dim_blocks),
group_mem_bytes, std::move(stream), std::move(k));
}
// Old-style launch (kernel domain starts with hipLaunchParm) with a kernel name:
// forwards every argument unchanged to the New_grid_launch_tag stream/kernel-name
// implementation.
template <FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {hipLaunchParm, Ts...}) inline void grid_launch_hip_impl_(
Old_grid_launch_tag, dim3 num_blocks, dim3 dim_blocks, int group_mem_bytes, hipStream_t stream,
const char* kernel_name, K k) {
grid_launch_hip_impl_(New_grid_launch_tag{}, std::move(num_blocks), std::move(dim_blocks),
group_mem_bytes, std::move(stream), kernel_name, std::move(k));
}
// Launch entry point with a kernel name. Participates in overload resolution only when K
// is not a function type (std::enable_if_t on !std::is_function<K>). The implementation is
// chosen by tag dispatch on is_new_grid_launch_t<K, Ts...>.
template <FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {Ts...}) inline std::enable_if_t<
!std::is_function<K>::value> grid_launch_hip_(dim3 num_blocks, dim3 dim_blocks,
int group_mem_bytes, hipStream_t stream,
const char* kernel_name, K k) {
grid_launch_hip_impl_(is_new_grid_launch_t<K, Ts...>{}, std::move(num_blocks),
std::move(dim_blocks), group_mem_bytes, std::move(stream), kernel_name,
std::move(k));
}
// Launch entry point without a kernel name. Participates in overload resolution only when
// K is not a function type (std::enable_if_t on !std::is_function<K>). The implementation
// is chosen by tag dispatch on is_new_grid_launch_t<K, Ts...>.
template <FunctionalProcedure K, typename... Ts>
requires(Domain<K> == {Ts...}) inline std::enable_if_t<
!std::is_function<K>::value> grid_launch_hip_(dim3 num_blocks, dim3 dim_blocks,
int group_mem_bytes, hipStream_t stream, K k) {
grid_launch_hip_impl_(is_new_grid_launch_t<K, Ts...>{}, std::move(num_blocks),
std::move(dim_blocks), group_mem_bytes, std::move(stream), std::move(k));
}
// TODO: these are temporary and purposefully noisy and disruptive.
// Builds the functor struct name by token pasting:
//   HIP_kernel_functor_name_begin_<k>_HIP_kernel_functor_name_end_<n>
// `k` is the function name and `n` a numeric suffix supplied by the
// make_kernel_functor_hip_* macros. Because both are operands of ##, they are pasted
// as written, without prior macro expansion.
#define make_kernel_name_hip(k, n) \
HIP_kernel_functor_name_begin##_##k##_##HIP_kernel_functor_name_end##_##n
// Functor generator for 28 kernel arguments (30 macro arguments in total).
// Expands to a struct, named via make_kernel_name_hip(function_name, 28), that holds one
// std::decay_t copy of each of p0..p27 and whose [[hc]] call operator passes them, in
// order, to kernel_name. The tiled_index argument of the call operator is unused.
#define make_kernel_functor_hip_30(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \
p22, p23, p24, p25, p26, p27) \
struct make_kernel_name_hip(function_name, 28) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
std::decay_t<decltype(p10)> _p10_; \
std::decay_t<decltype(p11)> _p11_; \
std::decay_t<decltype(p12)> _p12_; \
std::decay_t<decltype(p13)> _p13_; \
std::decay_t<decltype(p14)> _p14_; \
std::decay_t<decltype(p15)> _p15_; \
std::decay_t<decltype(p16)> _p16_; \
std::decay_t<decltype(p17)> _p17_; \
std::decay_t<decltype(p18)> _p18_; \
std::decay_t<decltype(p19)> _p19_; \
std::decay_t<decltype(p20)> _p20_; \
std::decay_t<decltype(p21)> _p21_; \
std::decay_t<decltype(p22)> _p22_; \
std::decay_t<decltype(p23)> _p23_; \
std::decay_t<decltype(p24)> _p24_; \
std::decay_t<decltype(p25)> _p25_; \
std::decay_t<decltype(p26)> _p26_; \
std::decay_t<decltype(p27)> _p27_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { \
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
_p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, \
_p22_, _p23_, _p24_, _p25_, _p26_, _p27_); \
} \
}
// Functor generator for 27 kernel arguments (29 macro arguments in total).
// Expands to a struct, named via make_kernel_name_hip(function_name, 27), that holds one
// std::decay_t copy of each of p0..p26 and whose [[hc]] call operator passes them, in
// order, to kernel_name. The tiled_index argument of the call operator is unused.
#define make_kernel_functor_hip_29(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \
p22, p23, p24, p25, p26) \
struct make_kernel_name_hip(function_name, 27) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
std::decay_t<decltype(p10)> _p10_; \
std::decay_t<decltype(p11)> _p11_; \
std::decay_t<decltype(p12)> _p12_; \
std::decay_t<decltype(p13)> _p13_; \
std::decay_t<decltype(p14)> _p14_; \
std::decay_t<decltype(p15)> _p15_; \
std::decay_t<decltype(p16)> _p16_; \
std::decay_t<decltype(p17)> _p17_; \
std::decay_t<decltype(p18)> _p18_; \
std::decay_t<decltype(p19)> _p19_; \
std::decay_t<decltype(p20)> _p20_; \
std::decay_t<decltype(p21)> _p21_; \
std::decay_t<decltype(p22)> _p22_; \
std::decay_t<decltype(p23)> _p23_; \
std::decay_t<decltype(p24)> _p24_; \
std::decay_t<decltype(p25)> _p25_; \
std::decay_t<decltype(p26)> _p26_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { \
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
_p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, \
_p22_, _p23_, _p24_, _p25_, _p26_); \
} \
}
// Functor generator for 26 kernel arguments (28 macro arguments in total).
// Expands to a struct, named via make_kernel_name_hip(function_name, 26), that holds one
// std::decay_t copy of each of p0..p25 and whose [[hc]] call operator passes them, in
// order, to kernel_name. The tiled_index argument of the call operator is unused.
#define make_kernel_functor_hip_28(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \
p22, p23, p24, p25) \
struct make_kernel_name_hip(function_name, 26) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
std::decay_t<decltype(p10)> _p10_; \
std::decay_t<decltype(p11)> _p11_; \
std::decay_t<decltype(p12)> _p12_; \
std::decay_t<decltype(p13)> _p13_; \
std::decay_t<decltype(p14)> _p14_; \
std::decay_t<decltype(p15)> _p15_; \
std::decay_t<decltype(p16)> _p16_; \
std::decay_t<decltype(p17)> _p17_; \
std::decay_t<decltype(p18)> _p18_; \
std::decay_t<decltype(p19)> _p19_; \
std::decay_t<decltype(p20)> _p20_; \
std::decay_t<decltype(p21)> _p21_; \
std::decay_t<decltype(p22)> _p22_; \
std::decay_t<decltype(p23)> _p23_; \
std::decay_t<decltype(p24)> _p24_; \
std::decay_t<decltype(p25)> _p25_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { \
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
_p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, \
_p22_, _p23_, _p24_, _p25_); \
} \
}
// Functor generator for 25 kernel arguments (27 macro arguments in total).
// Expands to a struct, named via make_kernel_name_hip(function_name, 25), that holds one
// std::decay_t copy of each of p0..p24 and whose [[hc]] call operator passes them, in
// order, to kernel_name. The tiled_index argument of the call operator is unused.
#define make_kernel_functor_hip_27(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \
p22, p23, p24) \
struct make_kernel_name_hip(function_name, 25) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
std::decay_t<decltype(p10)> _p10_; \
std::decay_t<decltype(p11)> _p11_; \
std::decay_t<decltype(p12)> _p12_; \
std::decay_t<decltype(p13)> _p13_; \
std::decay_t<decltype(p14)> _p14_; \
std::decay_t<decltype(p15)> _p15_; \
std::decay_t<decltype(p16)> _p16_; \
std::decay_t<decltype(p17)> _p17_; \
std::decay_t<decltype(p18)> _p18_; \
std::decay_t<decltype(p19)> _p19_; \
std::decay_t<decltype(p20)> _p20_; \
std::decay_t<decltype(p21)> _p21_; \
std::decay_t<decltype(p22)> _p22_; \
std::decay_t<decltype(p23)> _p23_; \
std::decay_t<decltype(p24)> _p24_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { \
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
_p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, \
_p22_, _p23_, _p24_); \
} \
}
// Functor generator for 24 kernel arguments (26 macro arguments in total).
// Expands to a struct, named via make_kernel_name_hip(function_name, 24), that holds one
// std::decay_t copy of each of p0..p23 and whose [[hc]] call operator passes them, in
// order, to kernel_name. The tiled_index argument of the call operator is unused.
#define make_kernel_functor_hip_26(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \
p22, p23) \
struct make_kernel_name_hip(function_name, 24) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
std::decay_t<decltype(p10)> _p10_; \
std::decay_t<decltype(p11)> _p11_; \
std::decay_t<decltype(p12)> _p12_; \
std::decay_t<decltype(p13)> _p13_; \
std::decay_t<decltype(p14)> _p14_; \
std::decay_t<decltype(p15)> _p15_; \
std::decay_t<decltype(p16)> _p16_; \
std::decay_t<decltype(p17)> _p17_; \
std::decay_t<decltype(p18)> _p18_; \
std::decay_t<decltype(p19)> _p19_; \
std::decay_t<decltype(p20)> _p20_; \
std::decay_t<decltype(p21)> _p21_; \
std::decay_t<decltype(p22)> _p22_; \
std::decay_t<decltype(p23)> _p23_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { \
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
_p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, \
_p22_, _p23_); \
} \
}
// Functor generator for 23 kernel arguments (25 macro arguments in total).
// Expands to a struct, named via make_kernel_name_hip(function_name, 23), that holds one
// std::decay_t copy of each of p0..p22 and whose [[hc]] call operator passes them, in
// order, to kernel_name. The tiled_index argument of the call operator is unused.
// NOTE(review): unlike the generators for 22-24 and 26-30 macro arguments in this file,
// this call operator is additionally marked __attribute__((used, flatten)) — confirm
// whether the difference is intentional.
#define make_kernel_functor_hip_25(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \
p22) \
struct make_kernel_name_hip(function_name, 23) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
std::decay_t<decltype(p10)> _p10_; \
std::decay_t<decltype(p11)> _p11_; \
std::decay_t<decltype(p12)> _p12_; \
std::decay_t<decltype(p13)> _p13_; \
std::decay_t<decltype(p14)> _p14_; \
std::decay_t<decltype(p15)> _p15_; \
std::decay_t<decltype(p16)> _p16_; \
std::decay_t<decltype(p17)> _p17_; \
std::decay_t<decltype(p18)> _p18_; \
std::decay_t<decltype(p19)> _p19_; \
std::decay_t<decltype(p20)> _p20_; \
std::decay_t<decltype(p21)> _p21_; \
std::decay_t<decltype(p22)> _p22_; \
__attribute__((used, flatten)) void operator()(const hc::tiled_index<3>&) const [[hc]] { \
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
_p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, \
_p22_); \
} \
}
// Functor generator for 22 kernel arguments (24 macro arguments in total).
// Expands to a struct, named via make_kernel_name_hip(function_name, 22), that holds one
// std::decay_t copy of each of p0..p21 and whose [[hc]] call operator passes them, in
// order, to kernel_name. The tiled_index argument of the call operator is unused.
#define make_kernel_functor_hip_24(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21) \
struct make_kernel_name_hip(function_name, 22) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
std::decay_t<decltype(p10)> _p10_; \
std::decay_t<decltype(p11)> _p11_; \
std::decay_t<decltype(p12)> _p12_; \
std::decay_t<decltype(p13)> _p13_; \
std::decay_t<decltype(p14)> _p14_; \
std::decay_t<decltype(p15)> _p15_; \
std::decay_t<decltype(p16)> _p16_; \
std::decay_t<decltype(p17)> _p17_; \
std::decay_t<decltype(p18)> _p18_; \
std::decay_t<decltype(p19)> _p19_; \
std::decay_t<decltype(p20)> _p20_; \
std::decay_t<decltype(p21)> _p21_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { \
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
_p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_); \
} \
}
// Functor generator for 21 kernel arguments (23 macro arguments in total).
// Expands to a struct, named via make_kernel_name_hip(function_name, 21), that holds one
// std::decay_t copy of each of p0..p20 and whose [[hc]] call operator passes them, in
// order, to kernel_name. The tiled_index argument of the call operator is unused.
#define make_kernel_functor_hip_23(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20) \
struct make_kernel_name_hip(function_name, 21) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
std::decay_t<decltype(p10)> _p10_; \
std::decay_t<decltype(p11)> _p11_; \
std::decay_t<decltype(p12)> _p12_; \
std::decay_t<decltype(p13)> _p13_; \
std::decay_t<decltype(p14)> _p14_; \
std::decay_t<decltype(p15)> _p15_; \
std::decay_t<decltype(p16)> _p16_; \
std::decay_t<decltype(p17)> _p17_; \
std::decay_t<decltype(p18)> _p18_; \
std::decay_t<decltype(p19)> _p19_; \
std::decay_t<decltype(p20)> _p20_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { \
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
_p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_); \
} \
}
// make_kernel_functor_hip_22: function_name, kernel_name and 20 kernel arguments (p0..p19).
// Expands to a struct, named by make_kernel_name_hip(function_name, 20), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_22(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19) \
struct make_kernel_name_hip(function_name, 20) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
std::decay_t<decltype(p10)> _p10_; \
std::decay_t<decltype(p11)> _p11_; \
std::decay_t<decltype(p12)> _p12_; \
std::decay_t<decltype(p13)> _p13_; \
std::decay_t<decltype(p14)> _p14_; \
std::decay_t<decltype(p15)> _p15_; \
std::decay_t<decltype(p16)> _p16_; \
std::decay_t<decltype(p17)> _p17_; \
std::decay_t<decltype(p18)> _p18_; \
std::decay_t<decltype(p19)> _p19_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { \
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
_p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_); \
} \
}
// make_kernel_functor_hip_21: function_name, kernel_name and 19 kernel arguments (p0..p18).
// Expands to a struct, named by make_kernel_name_hip(function_name, 19), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_21(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9, p10, p11, p12, p13, p14, p15, p16, p17, p18) \
struct make_kernel_name_hip(function_name, 19) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
std::decay_t<decltype(p10)> _p10_; \
std::decay_t<decltype(p11)> _p11_; \
std::decay_t<decltype(p12)> _p12_; \
std::decay_t<decltype(p13)> _p13_; \
std::decay_t<decltype(p14)> _p14_; \
std::decay_t<decltype(p15)> _p15_; \
std::decay_t<decltype(p16)> _p16_; \
std::decay_t<decltype(p17)> _p17_; \
std::decay_t<decltype(p18)> _p18_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { \
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
_p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_); \
} \
}
// make_kernel_functor_hip_20: function_name, kernel_name and 18 kernel arguments (p0..p17).
// Expands to a struct, named by make_kernel_name_hip(function_name, 18), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_20(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9, p10, p11, p12, p13, p14, p15, p16, p17) \
struct make_kernel_name_hip(function_name, 18) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
std::decay_t<decltype(p10)> _p10_; \
std::decay_t<decltype(p11)> _p11_; \
std::decay_t<decltype(p12)> _p12_; \
std::decay_t<decltype(p13)> _p13_; \
std::decay_t<decltype(p14)> _p14_; \
std::decay_t<decltype(p15)> _p15_; \
std::decay_t<decltype(p16)> _p16_; \
std::decay_t<decltype(p17)> _p17_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { \
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
_p12_, _p13_, _p14_, _p15_, _p16_, _p17_); \
} \
}
// make_kernel_functor_hip_19: function_name, kernel_name and 17 kernel arguments (p0..p16).
// Expands to a struct, named by make_kernel_name_hip(function_name, 17), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_19(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9, p10, p11, p12, p13, p14, p15, p16) \
struct make_kernel_name_hip(function_name, 17) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
std::decay_t<decltype(p10)> _p10_; \
std::decay_t<decltype(p11)> _p11_; \
std::decay_t<decltype(p12)> _p12_; \
std::decay_t<decltype(p13)> _p13_; \
std::decay_t<decltype(p14)> _p14_; \
std::decay_t<decltype(p15)> _p15_; \
std::decay_t<decltype(p16)> _p16_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { \
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
_p12_, _p13_, _p14_, _p15_, _p16_); \
} \
}
// make_kernel_functor_hip_18: function_name, kernel_name and 16 kernel arguments (p0..p15).
// Expands to a struct, named by make_kernel_name_hip(function_name, 16), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_18(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9, p10, p11, p12, p13, p14, p15) \
struct make_kernel_name_hip(function_name, 16) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
std::decay_t<decltype(p10)> _p10_; \
std::decay_t<decltype(p11)> _p11_; \
std::decay_t<decltype(p12)> _p12_; \
std::decay_t<decltype(p13)> _p13_; \
std::decay_t<decltype(p14)> _p14_; \
std::decay_t<decltype(p15)> _p15_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { \
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
_p12_, _p13_, _p14_, _p15_); \
} \
}
// make_kernel_functor_hip_17: function_name, kernel_name and 15 kernel arguments (p0..p14).
// Expands to a struct, named by make_kernel_name_hip(function_name, 15), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_17(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9, p10, p11, p12, p13, p14) \
struct make_kernel_name_hip(function_name, 15) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
std::decay_t<decltype(p10)> _p10_; \
std::decay_t<decltype(p11)> _p11_; \
std::decay_t<decltype(p12)> _p12_; \
std::decay_t<decltype(p13)> _p13_; \
std::decay_t<decltype(p14)> _p14_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { \
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
_p12_, _p13_, _p14_); \
} \
}
// make_kernel_functor_hip_16: function_name, kernel_name and 14 kernel arguments (p0..p13).
// Expands to a struct, named by make_kernel_name_hip(function_name, 14), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_16(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9, p10, p11, p12, p13) \
struct make_kernel_name_hip(function_name, 14) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
std::decay_t<decltype(p10)> _p10_; \
std::decay_t<decltype(p11)> _p11_; \
std::decay_t<decltype(p12)> _p12_; \
std::decay_t<decltype(p13)> _p13_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { \
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
_p12_, _p13_); \
} \
}
// make_kernel_functor_hip_15: function_name, kernel_name and 13 kernel arguments (p0..p12).
// Expands to a struct, named by make_kernel_name_hip(function_name, 13), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_15(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9, p10, p11, p12) \
struct make_kernel_name_hip(function_name, 13) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
std::decay_t<decltype(p10)> _p10_; \
std::decay_t<decltype(p11)> _p11_; \
std::decay_t<decltype(p12)> _p12_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { \
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
_p12_); \
} \
}
// make_kernel_functor_hip_14: function_name, kernel_name and 12 kernel arguments (p0..p11).
// Expands to a struct, named by make_kernel_name_hip(function_name, 12), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_14(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9, p10, p11) \
struct make_kernel_name_hip(function_name, 12) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
std::decay_t<decltype(p10)> _p10_; \
std::decay_t<decltype(p11)> _p11_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { \
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_); \
} \
}
// make_kernel_functor_hip_13: function_name, kernel_name and 11 kernel arguments (p0..p10).
// Expands to a struct, named by make_kernel_name_hip(function_name, 11), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_13(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9, p10) \
struct make_kernel_name_hip(function_name, 11) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
std::decay_t<decltype(p10)> _p10_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { \
kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_); \
} \
}
// make_kernel_functor_hip_12: function_name, kernel_name and 10 kernel arguments (p0..p9).
// Expands to a struct, named by make_kernel_name_hip(function_name, 10), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_12(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
p9) \
struct make_kernel_name_hip(function_name, 10) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
std::decay_t<decltype(p9)> _p9_; \
void operator()(const hc::tiled_index<3>&) const \
[[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_); } \
}
// make_kernel_functor_hip_11: function_name, kernel_name and 9 kernel arguments (p0..p8).
// Expands to a struct, named by make_kernel_name_hip(function_name, 9), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_11(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8) \
struct make_kernel_name_hip(function_name, 9) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
std::decay_t<decltype(p8)> _p8_; \
void operator()(const hc::tiled_index<3>&) const \
[[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_); } \
}
// make_kernel_functor_hip_10: function_name, kernel_name and 8 kernel arguments (p0..p7).
// Expands to a struct, named by make_kernel_name_hip(function_name, 8), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_10(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7) \
struct make_kernel_name_hip(function_name, 8) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
std::decay_t<decltype(p7)> _p7_; \
void operator()(const hc::tiled_index<3>&) const \
[[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_); } \
}
// make_kernel_functor_hip_9: function_name, kernel_name and 7 kernel arguments (p0..p6).
// Expands to a struct, named by make_kernel_name_hip(function_name, 7), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_9(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6) \
struct make_kernel_name_hip(function_name, 7) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
std::decay_t<decltype(p6)> _p6_; \
void operator()(const hc::tiled_index<3>&) const \
[[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_); } \
}
// make_kernel_functor_hip_8: function_name, kernel_name and 6 kernel arguments (p0..p5).
// Expands to a struct, named by make_kernel_name_hip(function_name, 6), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_8(function_name, kernel_name, p0, p1, p2, p3, p4, p5) \
struct make_kernel_name_hip(function_name, 6) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
std::decay_t<decltype(p5)> _p5_; \
void operator()(const hc::tiled_index<3>&) const \
[[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_); } \
}
// make_kernel_functor_hip_7: function_name, kernel_name and 5 kernel arguments (p0..p4).
// Expands to a struct, named by make_kernel_name_hip(function_name, 5), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_7(function_name, kernel_name, p0, p1, p2, p3, p4) \
struct make_kernel_name_hip(function_name, 5) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
std::decay_t<decltype(p4)> _p4_; \
void operator()(const hc::tiled_index<3>&) const \
[[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_); } \
}
// make_kernel_functor_hip_6: function_name, kernel_name and 4 kernel arguments (p0..p3).
// Expands to a struct, named by make_kernel_name_hip(function_name, 4), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3) \
struct make_kernel_name_hip(function_name, 4) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
std::decay_t<decltype(p3)> _p3_; \
void operator()(const hc::tiled_index<3>&) const \
[[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_); } \
}
// make_kernel_functor_hip_5: function_name, kernel_name and 3 kernel arguments (p0..p2).
// Expands to a struct, named by make_kernel_name_hip(function_name, 3), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2) \
struct make_kernel_name_hip(function_name, 3) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
std::decay_t<decltype(p2)> _p2_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { kernel_name(_p0_, _p1_, _p2_); } \
}
// make_kernel_functor_hip_4: function_name, kernel_name and 2 kernel arguments (p0, p1).
// Expands to a struct, named by make_kernel_name_hip(function_name, 2), holding one member of
// type std::decay_t<decltype(pN)> per argument; its const [[hc]] operator() ignores the
// tiled_index it receives and calls kernel_name with the stored members in order.
#define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1) \
struct make_kernel_name_hip(function_name, 2) { \
std::decay_t<decltype(p0)> _p0_; \
std::decay_t<decltype(p1)> _p1_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { kernel_name(_p0_, _p1_); } \
}
// fofo(f, n): token-pastes the literal identifiers kernel_prefix_hip and kernel_suffix_hip around
// f and n, yielding the single identifier kernel_prefix_hip<f>kernel_suffix_hip<n>.
// NOTE(review): not referenced by the neighbouring macros; looks like a leftover helper - confirm
// whether anything still uses it.
#define fofo(f, n) kernel_prefix_hip##f##kernel_suffix_hip##n
// make_kernel_functor_hip_3: function_name, kernel_name and 1 kernel argument (p0).
// Expands to a struct, named by make_kernel_name_hip(function_name, 1), holding a single member of
// type std::decay_t<decltype(p0)>; its const [[hc]] operator() ignores the tiled_index it
// receives and calls kernel_name with that member.
#define make_kernel_functor_hip_3(function_name, kernel_name, p0) \
struct make_kernel_name_hip(function_name, 1) { \
std::decay_t<decltype(p0)> _p0_; \
void operator()(const hc::tiled_index<3>&) const [[hc]] { kernel_name(_p0_); } \
}
// make_kernel_functor_hip_2: function_name and kernel_name only (no kernel arguments).
// Expands to a member-less struct, named by make_kernel_name_hip(function_name, 0), whose [[hc]]
// operator() ignores the tiled_index it receives and calls kernel_name with a default-constructed
// hipLaunchParm. The call operator is const-qualified, matching every other
// make_kernel_functor_hip_N functor, so the object can be invoked through a const reference.
#define make_kernel_functor_hip_2(function_name, kernel_name) \
struct make_kernel_name_hip(function_name, 0) { \
void operator()(const hc::tiled_index<3>&) const [[hc]] { return kernel_name(hipLaunchParm{}); } \
}
// make_kernel_functor_hip_1 / make_kernel_functor_hip_0: accept any arguments and expand to
// nothing, so no functor struct is generated for these two suffixes.
#define make_kernel_functor_hip_1(...)
#define make_kernel_functor_hip_0(...)
// Entry point of the functor-generator family: hands the prefix make_kernel_functor_hip_ plus all
// of its arguments to overload_macro_hip_ (defined outside this section).
// NOTE(review): presumably overload_macro_hip_ appends the argument count to the prefix to select
// make_kernel_functor_hip_N - confirm against its definition.
#define make_kernel_functor_hip_(...) overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__)
// hipLaunchNamedKernelGGL: inside a do/while(0) scope, expands make_kernel_functor_hip_ to define
// a functor type for (function_name, kernel_name, kernel arguments...), brace-initializes an
// instance named hip_kernel_functor_impl_ from the kernel arguments, and passes it to
// hip_impl::grid_launch_hip_ together with num_blocks, dim_blocks, group_mem_bytes, stream and
// the stringized kernel_name.
// NOTE(review): __VA_ARGS__ is used here without the `##` comma-swallowing form that the
// hipLaunchKernelGGL / hipLaunchKernel wrappers use; confirm that a launch with no kernel
// arguments expands as intended.
#define hipLaunchNamedKernelGGL(function_name, kernel_name, num_blocks, dim_blocks, \
group_mem_bytes, stream, ...) \
do { \
make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__) \
hip_kernel_functor_impl_{__VA_ARGS__}; \
hip_impl::grid_launch_hip_(num_blocks, dim_blocks, group_mem_bytes, stream, #kernel_name, \
hip_kernel_functor_impl_); \
} while (0)
// hipLaunchKernelGGL: forwards to hipLaunchNamedKernelGGL with the fixed function_name `unnamed`.
// `##__VA_ARGS__` (GNU extension) removes the preceding comma when no kernel arguments are given.
#define hipLaunchKernelGGL(kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...) \
do { \
hipLaunchNamedKernelGGL(unnamed, kernel_name, num_blocks, dim_blocks, group_mem_bytes, \
stream, ##__VA_ARGS__); \
} while (0)
// hipLaunchKernel: forwards to hipLaunchKernelGGL, inserting a default-constructed hipLaunchParm{}
// as the first kernel argument ahead of any caller-supplied arguments.
#define hipLaunchKernel(kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...) \
do { \
hipLaunchKernelGGL(kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, \
hipLaunchParm{}, ##__VA_ARGS__); \
} while (0)
} // namespace hip_impl
@@ -0,0 +1,694 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include "host_defines.h"
#if defined(__cplusplus)
extern "C" {
#endif
// DOT FUNCTIONS
// Prototypes of the __ockl_{s,u}dot{2,4,8} device functions; only declared when
// __HIP_CLANG_ONLY__ is nonzero. Every one is __device__ and __attribute__((const)).
// The dot2/dot4 variants take two native vector operands, the dot8 variants two scalars; all take
// a third value of the return type and a trailing bool.
// NOTE(review): presumably the third argument is an accumulator and the bool a clamp/saturate
// flag - confirm against the device-library documentation.
#if __HIP_CLANG_ONLY__
__device__ __attribute__((const)) int __ockl_sdot2(HIP_vector_base<short, 2>::Native_vec_,
                                                   HIP_vector_base<short, 2>::Native_vec_,
                                                   int, bool);
__device__ __attribute__((const)) unsigned int __ockl_udot2(
    HIP_vector_base<unsigned short, 2>::Native_vec_,
    HIP_vector_base<unsigned short, 2>::Native_vec_,
    unsigned int, bool);
__device__ __attribute__((const)) int __ockl_sdot4(HIP_vector_base<char, 4>::Native_vec_,
                                                   HIP_vector_base<char, 4>::Native_vec_,
                                                   int, bool);
__device__ __attribute__((const)) unsigned int __ockl_udot4(
    HIP_vector_base<unsigned char, 4>::Native_vec_,
    HIP_vector_base<unsigned char, 4>::Native_vec_,
    unsigned int, bool);
__device__ __attribute__((const)) int __ockl_sdot8(int, int, int, bool);
__device__ __attribute__((const)) unsigned int __ockl_udot8(unsigned int, unsigned int,
                                                            unsigned int, bool);
#endif
#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
// BEGIN FLOAT
// Single-precision __ocml_* device-function prototypes, one per line. Each keeps the GCC-style
// `const` or `pure` function attribute it was declared with.
__device__ __attribute__((const)) float __ocml_acos_f32(float);
__device__ __attribute__((pure)) float __ocml_acosh_f32(float);
__device__ __attribute__((const)) float __ocml_asin_f32(float);
__device__ __attribute__((pure)) float __ocml_asinh_f32(float);
__device__ __attribute__((const)) float __ocml_atan2_f32(float, float);
__device__ __attribute__((const)) float __ocml_atan_f32(float);
__device__ __attribute__((pure)) float __ocml_atanh_f32(float);
__device__ __attribute__((pure)) float __ocml_cbrt_f32(float);
__device__ __attribute__((const)) float __ocml_ceil_f32(float);
// copysign / cos / native_cos / cosh (single precision). Each prototype carries exactly one
// __device__ specifier, matching the corresponding __ocml_*_f64 declarations; copysign is
// `const`, cosh is `pure`, and the two cosine variants carry no const/pure attribute.
__device__ __attribute__((const)) float __ocml_copysign_f32(float, float);
__device__ float __ocml_cos_f32(float);
__device__ float __ocml_native_cos_f32(float);
__device__ __attribute__((pure)) float __ocml_cosh_f32(float);
// Single-precision __ocml_* prototypes, cospi through fmin, one per line. Functions without a
// const/pure attribute in the original declaration remain without one here.
__device__ float __ocml_cospi_f32(float);
__device__ float __ocml_i0_f32(float);
__device__ float __ocml_i1_f32(float);
__device__ __attribute__((pure)) float __ocml_erfc_f32(float);
__device__ __attribute__((pure)) float __ocml_erfcinv_f32(float);
__device__ __attribute__((pure)) float __ocml_erfcx_f32(float);
__device__ __attribute__((pure)) float __ocml_erf_f32(float);
__device__ __attribute__((pure)) float __ocml_erfinv_f32(float);
__device__ __attribute__((pure)) float __ocml_exp10_f32(float);
__device__ __attribute__((pure)) float __ocml_native_exp10_f32(float);
__device__ __attribute__((pure)) float __ocml_exp2_f32(float);
__device__ __attribute__((pure)) float __ocml_exp_f32(float);
__device__ __attribute__((pure)) float __ocml_native_exp_f32(float);
__device__ __attribute__((pure)) float __ocml_expm1_f32(float);
__device__ __attribute__((const)) float __ocml_fabs_f32(float);
__device__ __attribute__((const)) float __ocml_fdim_f32(float, float);
__device__ __attribute__((const)) float __ocml_floor_f32(float);
__device__ __attribute__((const)) float __ocml_fma_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
__device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
// Single-precision fmod prototype: one __device__ specifier plus the `const` attribute, matching
// the __ocml_fmod_f64 declaration.
__device__ __attribute__((const)) float __ocml_fmod_f32(float, float);
// Single-precision __ocml_* prototypes, frexp through y1, one per line. Pointer parameters
// (frexp, modf, remquo, sincos, sincospi) are qualified with address_space(5).
__device__ float __ocml_frexp_f32(float, __attribute__((address_space(5))) int*);
__device__ __attribute__((const)) float __ocml_hypot_f32(float, float);
__device__ __attribute__((const)) int __ocml_ilogb_f32(float);
__device__ __attribute__((const)) int __ocml_isfinite_f32(float);
__device__ __attribute__((const)) int __ocml_isinf_f32(float);
__device__ __attribute__((const)) int __ocml_isnan_f32(float);
__device__ float __ocml_j0_f32(float);
__device__ float __ocml_j1_f32(float);
__device__ __attribute__((const)) float __ocml_ldexp_f32(float, int);
__device__ float __ocml_lgamma_f32(float);
__device__ __attribute__((pure)) float __ocml_log10_f32(float);
__device__ __attribute__((pure)) float __ocml_native_log10_f32(float);
__device__ __attribute__((pure)) float __ocml_log1p_f32(float);
__device__ __attribute__((pure)) float __ocml_log2_f32(float);
__device__ __attribute__((pure)) float __ocml_native_log2_f32(float);
__device__ __attribute__((const)) float __ocml_logb_f32(float);
__device__ __attribute__((pure)) float __ocml_log_f32(float);
__device__ __attribute__((pure)) float __ocml_native_log_f32(float);
__device__ float __ocml_modf_f32(float, __attribute__((address_space(5))) float*);
__device__ __attribute__((const)) float __ocml_nearbyint_f32(float);
__device__ __attribute__((const)) float __ocml_nextafter_f32(float, float);
__device__ __attribute__((const)) float __ocml_len3_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_len4_f32(float, float, float, float);
__device__ __attribute__((pure)) float __ocml_ncdf_f32(float);
__device__ __attribute__((pure)) float __ocml_ncdfinv_f32(float);
__device__ __attribute__((pure)) float __ocml_pow_f32(float, float);
__device__ __attribute__((pure)) float __ocml_pown_f32(float, int);
__device__ __attribute__((pure)) float __ocml_rcbrt_f32(float);
__device__ __attribute__((const)) float __ocml_remainder_f32(float, float);
__device__ float __ocml_remquo_f32(float, float, __attribute__((address_space(5))) int*);
__device__ __attribute__((const)) float __ocml_rhypot_f32(float, float);
__device__ __attribute__((const)) float __ocml_rint_f32(float);
__device__ __attribute__((const)) float __ocml_rlen3_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_rlen4_f32(float, float, float, float);
__device__ __attribute__((const)) float __ocml_round_f32(float);
__device__ __attribute__((pure)) float __ocml_rsqrt_f32(float);
__device__ __attribute__((const)) float __ocml_scalb_f32(float, float);
__device__ __attribute__((const)) float __ocml_scalbn_f32(float, int);
__device__ __attribute__((const)) int __ocml_signbit_f32(float);
__device__ float __ocml_sincos_f32(float, __attribute__((address_space(5))) float*);
__device__ float __ocml_sincospi_f32(float, __attribute__((address_space(5))) float*);
__device__ float __ocml_sin_f32(float);
__device__ float __ocml_native_sin_f32(float);
__device__ __attribute__((pure)) float __ocml_sinh_f32(float);
__device__ float __ocml_sinpi_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_f32(float);
__device__ __attribute__((const)) float __ocml_native_sqrt_f32(float);
__device__ float __ocml_tan_f32(float);
__device__ __attribute__((pure)) float __ocml_tanh_f32(float);
__device__ float __ocml_tgamma_f32(float);
__device__ __attribute__((const)) float __ocml_trunc_f32(float);
__device__ float __ocml_y0_f32(float);
__device__ float __ocml_y1_f32(float);
// BEGIN INTRINSICS
// Single-precision add/sub/mul/div/sqrt/fma prototypes with an _rte/_rtn/_rtp/_rtz suffix.
// All are __device__ and __attribute__((const)).
__device__ __attribute__((const)) float __ocml_add_rte_f32(float, float);
__device__ __attribute__((const)) float __ocml_add_rtn_f32(float, float);
__device__ __attribute__((const)) float __ocml_add_rtp_f32(float, float);
__device__ __attribute__((const)) float __ocml_add_rtz_f32(float, float);
__device__ __attribute__((const)) float __ocml_sub_rte_f32(float, float);
__device__ __attribute__((const)) float __ocml_sub_rtn_f32(float, float);
__device__ __attribute__((const)) float __ocml_sub_rtp_f32(float, float);
__device__ __attribute__((const)) float __ocml_sub_rtz_f32(float, float);
__device__ __attribute__((const)) float __ocml_mul_rte_f32(float, float);
__device__ __attribute__((const)) float __ocml_mul_rtn_f32(float, float);
__device__ __attribute__((const)) float __ocml_mul_rtp_f32(float, float);
__device__ __attribute__((const)) float __ocml_mul_rtz_f32(float, float);
__device__ __attribute__((const)) float __ocml_div_rte_f32(float, float);
__device__ __attribute__((const)) float __ocml_div_rtn_f32(float, float);
__device__ __attribute__((const)) float __ocml_div_rtp_f32(float, float);
__device__ __attribute__((const)) float __ocml_div_rtz_f32(float, float);
__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float);
__device__ __attribute__((const)) float __ocml_fma_rte_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);
// END INTRINSICS
// END FLOAT
// BEGIN DOUBLE
// Double-precision __ocml_* device-function prototypes, one per line. Each keeps the `const` or
// `pure` attribute (or absence of one) of its original declaration; pointer parameters are
// qualified with address_space(5).
__device__ __attribute__((const)) double __ocml_acos_f64(double);
__device__ __attribute__((pure)) double __ocml_acosh_f64(double);
__device__ __attribute__((const)) double __ocml_asin_f64(double);
__device__ __attribute__((pure)) double __ocml_asinh_f64(double);
__device__ __attribute__((const)) double __ocml_atan2_f64(double, double);
__device__ __attribute__((const)) double __ocml_atan_f64(double);
__device__ __attribute__((pure)) double __ocml_atanh_f64(double);
__device__ __attribute__((pure)) double __ocml_cbrt_f64(double);
__device__ __attribute__((const)) double __ocml_ceil_f64(double);
__device__ __attribute__((const)) double __ocml_copysign_f64(double, double);
__device__ double __ocml_cos_f64(double);
__device__ __attribute__((pure)) double __ocml_cosh_f64(double);
__device__ double __ocml_cospi_f64(double);
__device__ double __ocml_i0_f64(double);
__device__ double __ocml_i1_f64(double);
__device__ __attribute__((pure)) double __ocml_erfc_f64(double);
__device__ __attribute__((pure)) double __ocml_erfcinv_f64(double);
__device__ __attribute__((pure)) double __ocml_erfcx_f64(double);
__device__ __attribute__((pure)) double __ocml_erf_f64(double);
__device__ __attribute__((pure)) double __ocml_erfinv_f64(double);
__device__ __attribute__((pure)) double __ocml_exp10_f64(double);
__device__ __attribute__((pure)) double __ocml_exp2_f64(double);
__device__ __attribute__((pure)) double __ocml_exp_f64(double);
__device__ __attribute__((pure)) double __ocml_expm1_f64(double);
__device__ __attribute__((const)) double __ocml_fabs_f64(double);
__device__ __attribute__((const)) double __ocml_fdim_f64(double, double);
__device__ __attribute__((const)) double __ocml_floor_f64(double);
__device__ __attribute__((const)) double __ocml_fma_f64(double, double, double);
__device__ __attribute__((const)) double __ocml_fmax_f64(double, double);
__device__ __attribute__((const)) double __ocml_fmin_f64(double, double);
__device__ __attribute__((const)) double __ocml_fmod_f64(double, double);
__device__ double __ocml_frexp_f64(double, __attribute__((address_space(5))) int*);
__device__ __attribute__((const)) double __ocml_hypot_f64(double, double);
__device__ __attribute__((const)) int __ocml_ilogb_f64(double);
__device__ __attribute__((const)) int __ocml_isfinite_f64(double);
__device__ __attribute__((const)) int __ocml_isinf_f64(double);
__device__ __attribute__((const)) int __ocml_isnan_f64(double);
__device__ double __ocml_j0_f64(double);
__device__ double __ocml_j1_f64(double);
__device__ __attribute__((const)) double __ocml_ldexp_f64(double, int);
__device__ double __ocml_lgamma_f64(double);
__device__ __attribute__((pure)) double __ocml_log10_f64(double);
__device__ __attribute__((pure)) double __ocml_log1p_f64(double);
__device__ __attribute__((pure)) double __ocml_log2_f64(double);
__device__ __attribute__((const)) double __ocml_logb_f64(double);
__device__ __attribute__((pure)) double __ocml_log_f64(double);
__device__ double __ocml_modf_f64(double, __attribute__((address_space(5))) double*);
__device__ __attribute__((const)) double __ocml_nearbyint_f64(double);
__device__ __attribute__((const)) double __ocml_nextafter_f64(double, double);
__device__ __attribute__((const)) double __ocml_len3_f64(double, double, double);
__device__ __attribute__((const)) double __ocml_len4_f64(double, double, double, double);
__device__ __attribute__((pure)) double __ocml_ncdf_f64(double);
__device__ __attribute__((pure)) double __ocml_ncdfinv_f64(double);
__device__ __attribute__((pure)) double __ocml_pow_f64(double, double);
__device__ __attribute__((pure)) double __ocml_pown_f64(double, int);
__device__ __attribute__((pure)) double __ocml_rcbrt_f64(double);
__device__ __attribute__((const)) double __ocml_remainder_f64(double, double);
__device__ double __ocml_remquo_f64(double, double, __attribute__((address_space(5))) int*);
__device__ __attribute__((const)) double __ocml_rhypot_f64(double, double);
__device__ __attribute__((const)) double __ocml_rint_f64(double);
__device__ __attribute__((const)) double __ocml_rlen3_f64(double, double, double);
__device__ __attribute__((const)) double __ocml_rlen4_f64(double, double, double, double);
__device__ __attribute__((const)) double __ocml_round_f64(double);
__device__ __attribute__((pure)) double __ocml_rsqrt_f64(double);
__device__ __attribute__((const)) double __ocml_scalb_f64(double, double);
__device__ __attribute__((const)) double __ocml_scalbn_f64(double, int);
__device__ __attribute__((const)) int __ocml_signbit_f64(double);
__device__ double __ocml_sincos_f64(double, __attribute__((address_space(5))) double*);
__device__ double __ocml_sincospi_f64(double, __attribute__((address_space(5))) double*);
__device__ double __ocml_sin_f64(double);
__device__ __attribute__((pure)) double __ocml_sinh_f64(double);
__device__ double __ocml_sinpi_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_f64(double);
__device__ double __ocml_tan_f64(double);
__device__ __attribute__((pure)) double __ocml_tanh_f64(double);
__device__ double __ocml_tgamma_f64(double);
__device__ __attribute__((const)) double __ocml_trunc_f64(double);
__device__ double __ocml_y0_f64(double);
__device__ double __ocml_y1_f64(double);
// BEGIN INTRINSICS
// Double-precision add/sub/mul/div/sqrt/fma prototypes with an _rte/_rtn/_rtp/_rtz suffix.
// All are __device__ and __attribute__((const)).
__device__ __attribute__((const)) double __ocml_add_rte_f64(double, double);
__device__ __attribute__((const)) double __ocml_add_rtn_f64(double, double);
__device__ __attribute__((const)) double __ocml_add_rtp_f64(double, double);
__device__ __attribute__((const)) double __ocml_add_rtz_f64(double, double);
__device__ __attribute__((const)) double __ocml_sub_rte_f64(double, double);
__device__ __attribute__((const)) double __ocml_sub_rtn_f64(double, double);
__device__ __attribute__((const)) double __ocml_sub_rtp_f64(double, double);
__device__ __attribute__((const)) double __ocml_sub_rtz_f64(double, double);
__device__ __attribute__((const)) double __ocml_mul_rte_f64(double, double);
__device__ __attribute__((const)) double __ocml_mul_rtn_f64(double, double);
__device__ __attribute__((const)) double __ocml_mul_rtp_f64(double, double);
__device__ __attribute__((const)) double __ocml_mul_rtz_f64(double, double);
__device__ __attribute__((const)) double __ocml_div_rte_f64(double, double);
__device__ __attribute__((const)) double __ocml_div_rtn_f64(double, double);
__device__ __attribute__((const)) double __ocml_div_rtp_f64(double, double);
__device__ __attribute__((const)) double __ocml_div_rtz_f64(double, double);
__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double);
__device__ __attribute__((const)) double __ocml_fma_rte_f64(double, double, double);
__device__ __attribute__((const)) double __ocml_fma_rtn_f64(double, double, double);
__device__ __attribute__((const)) double __ocml_fma_rtp_f64(double, double, double);
__device__ __attribute__((const)) double __ocml_fma_rtz_f64(double, double, double);
// END INTRINSICS
// END DOUBLE
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
#if defined(__cplusplus)
} // extern "C"
#endif
@@ -0,0 +1,175 @@
/*
Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <hip/hip_vector_types.h>
// C-linkage declarations of the OCKL image entry points used by the HIP
// texture headers. In every signature `i` is a pointer to an image descriptor
// and `s` (where present) a pointer to a sampler descriptor, both expressed as
// unsigned int words in the address space named by ADDRESS_SPACE_CONSTANT.
// The name suffixes (1D, 1Da, 1Db, 2D, 2Da, 2Dad, 2Dd, 3D, CM, CMa) select the
// image kind -- presumably "a" = array/layered, "b" = buffer, "d" = depth,
// "CM" = cubemap; confirm against the device-library documentation.
extern "C" {
// Qualifier that places the pointee in address space 4.
#define ADDRESS_SPACE_CONSTANT __attribute__((address_space(4)))
// Loads: image descriptor plus integer coordinate(s) `c`; the CM variants also
// take an int `f`. All return a four-component float native vector.
__device__ float4::Native_vec_ __ockl_image_load_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c);
__device__ float4::Native_vec_ __ockl_image_load_1Db(unsigned int ADDRESS_SPACE_CONSTANT*i, int c);
__device__ float4::Native_vec_ __ockl_image_load_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_load_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_load_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_load_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_load_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f);
__device__ float4::Native_vec_ __ockl_image_load_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f);
// Loads with an additional int `l` argument (the "lod" variants).
__device__ float4::Native_vec_ __ockl_image_load_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l);
__device__ float4::Native_vec_ __ockl_image_load_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l);
__device__ float4::Native_vec_ __ockl_image_load_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l);
__device__ float4::Native_vec_ __ockl_image_load_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l);
__device__ float4::Native_vec_ __ockl_image_load_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l);
__device__ float4::Native_vec_ __ockl_image_load_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, int l);
__device__ float4::Native_vec_ __ockl_image_load_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, int l);
// Stores: same coordinate arguments as the loads plus the four-component
// float value `p`; they return nothing.
__device__ void __ockl_image_store_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, float4::Native_vec_ p);
__device__ void __ockl_image_store_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p);
__device__ void __ockl_image_store_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p);
__device__ void __ockl_image_store_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p);
__device__ void __ockl_image_store_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p);
__device__ void __ockl_image_store_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, float4::Native_vec_ p);
__device__ void __ockl_image_store_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, float4::Native_vec_ p);
// Stores with an additional int `l` argument (the "lod" variants).
__device__ void __ockl_image_store_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, int l, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, int l, float4::Native_vec_ p);
// Samples: image descriptor, sampler descriptor and float coordinate(s) `c`.
__device__ float4::Native_vec_ __ockl_image_sample_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c);
__device__ float4::Native_vec_ __ockl_image_sample_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_sample_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_sample_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_sample_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_sample_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_sample_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
// Samples taking two extra arguments `dx` and `dy` (the "grad" variants).
// Note that no CM/CMa grad variant is declared here.
__device__ float4::Native_vec_ __ockl_image_sample_grad_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float dx, float dy);
__device__ float4::Native_vec_ __ockl_image_sample_grad_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float dx, float dy);
__device__ float4::Native_vec_ __ockl_image_sample_grad_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy);
__device__ float4::Native_vec_ __ockl_image_sample_grad_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy);
__device__ float4::Native_vec_ __ockl_image_sample_grad_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float4::Native_vec_ dx, float4::Native_vec_ dy);
// Samples taking one extra float argument `l` (the "lod" variants).
__device__ float4::Native_vec_ __ockl_image_sample_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float l);
__device__ float4::Native_vec_ __ockl_image_sample_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l);
__device__ float4::Native_vec_ __ockl_image_sample_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l);
__device__ float4::Native_vec_ __ockl_image_sample_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
__device__ float4::Native_vec_ __ockl_image_sample_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
__device__ float4::Native_vec_ __ockl_image_sample_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
__device__ float4::Native_vec_ __ockl_image_sample_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
// 2D gathers, one entry point per component (r, g, b, a).
__device__ float4::Native_vec_ __ockl_image_gather4r_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_gather4g_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_gather4b_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_gather4a_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
// Queries returning the image's channel data type as an int.
__device__ int __ockl_image_channel_data_type_1D(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_1Db(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_2D(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_2Dad(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_2Dd(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_3D(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_CM(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i);
// Queries returning the image's channel order as an int.
__device__ int __ockl_image_channel_order_1D(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_1Db(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_2D(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_2Dad(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_2Dd(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_3D(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_CM(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i);
};
@@ -0,0 +1,107 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <hsa/amd_hsa_kernel_code.h>
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <hsa/hsa_ven_amd_loader.h>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <hip/hip_common.h>
// Forward declaration only: the record is never defined in this header and is
// used solely through pointers.
struct ihipModuleSymbol_t;
// Handle type: a pointer to the opaque ihipModuleSymbol_t record.
using hipFunction_t = ihipModuleSymbol_t*;
namespace hip_impl {
// This section contains internal APIs that
// need to be exported
#ifdef __GNUC__
#pragma GCC visibility push (default)
#endif
struct kernarg_impl;

// Byte buffer exposed through data()/size() and sized with reserve()/resize().
// The storage lives behind an opaque kernarg_impl pointer whose definition is
// not part of this header. Declaring the move constructor suppresses the
// implicit copy operations, so objects of this class can be moved but not
// copied.
class kernarg {
    // Opaque implementation object.
    kernarg_impl* impl;

  public:
    // Construction, move construction and destruction (defined out of line).
    kernarg();
    kernarg(kernarg&&);
    ~kernarg();

    // Pointer to the first byte of the buffer and the number of bytes in it.
    std::uint8_t* data();
    std::size_t size();

    // Capacity / size adjustment, each taking a byte count.
    void reserve(std::size_t);
    void resize(std::size_t);
};
class kernargs_size_align;
class program_state_impl;

// Facade over an opaque program_state_impl object (defined elsewhere) that
// gives access to kernel descriptors, kernel-argument layout information,
// executable loading and global-symbol lookup.
//
// The class holds a raw implementation pointer and has a user-provided
// destructor, so it is made fully non-copyable: both copy construction and
// copy assignment are deleted. Without the deleted assignment operator the
// compiler would still generate a member-wise copy assignment that merely
// duplicates the raw pointer.
class program_state {
  public:
    program_state();
    ~program_state();
    program_state(const program_state&) = delete;
    program_state& operator=(const program_state&) = delete;

    // Looks up the kernel handle for the given address value and HSA agent.
    hipFunction_t kernel_descriptor(std::uintptr_t,
                                    hsa_agent_t);

    // Returns the kernel-argument size/alignment accessor for the given
    // address value.
    kernargs_size_align get_kernargs_size_align(std::uintptr_t);

    // Loads a code object given as (pointer, byte count) into the supplied
    // executable for the supplied agent and returns an executable handle.
    hsa_executable_t load_executable(const char*, const size_t,
                                     hsa_executable_t,
                                     hsa_agent_t);
    // Same argument list as load_executable; presumably avoids copying the
    // input buffer, as the name suggests -- confirm against the definition.
    hsa_executable_t load_executable_no_copy(const char*, const size_t,
                                             hsa_executable_t,
                                             hsa_agent_t);

    // Returns the address associated with the global symbol called `name`.
    void* global_addr_by_name(const char* name);

  private:
    // agent_globals_impl is granted access to the implementation pointer.
    friend class agent_globals_impl;
    program_state_impl* impl;
};
// Accessor for per-argument size and alignment information, identified by an
// opaque handle. There is no public way to set the handle; only
// program_state::get_kernargs_size_align is befriended and can reach it.
class kernargs_size_align {
    friend kernargs_size_align program_state::get_kernargs_size_align(std::uintptr_t);

    // Opaque pointer to the underlying record; never dereferenced here.
    const void* handle;

  public:
    // Size and alignment queries for index n (defined out of line).
    std::size_t size(std::size_t n) const;
    std::size_t alignment(std::size_t n) const;

    // Exposes the stored opaque handle unchanged.
    const void* getHandle() const { return handle; }
};
#ifdef __GNUC__
#pragma GCC visibility pop
#endif
// Returns a reference to a function-local static program_state, which is
// constructed the first time the function is called. The function is inline
// with hidden visibility -- presumably so that every shared object including
// this header keeps its own instance; confirm with the runtime's design notes.
inline __attribute__((visibility("hidden"))) program_state& get_program_state() {
    static program_state instance;
    return instance;
}
} // Namespace hip_impl.
@@ -0,0 +1,388 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#if defined(__cplusplus)
#include <hip/hip_vector_types.h>
#include <hip/hip_texture_types.h>
#include <hip/amd_detail/ockl_image.h>
#if !defined(__HIPCC_RTC__)
#include <type_traits>
#endif // !defined(__HIPCC_RTC__)
// Declares the two locals every texture-reference fetch function works with.
// It requires a variable `t` with a `textureObject` member to be in scope and
// introduces:
//   i - t.textureObject cast to a pointer to unsigned int in the
//       ADDRESS_SPACE_CONSTANT address space (used as the image descriptor),
//   s - i advanced by HIP_SAMPLER_OBJECT_OFFSET_DWORD elements (used as the
//       sampler descriptor).
// The expansion already ends with a semicolon, so writing
// `TEXTURE_PARAMETERS_INIT;` simply adds an empty statement.
#define TEXTURE_PARAMETERS_INIT \
unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)t.textureObject; \
unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;
// Trait whose `value` is true exactly when T is one of the scalar types
// accepted as a texture channel: char, unsigned char, short, unsigned short,
// int, unsigned int or float.
template <typename T>
struct __hip_is_tex_channel_type {
    static constexpr bool value =
        // character types
        std::is_same<T, char>::value || std::is_same<T, unsigned char>::value ||
        // short types
        std::is_same<T, short>::value || std::is_same<T, unsigned short>::value ||
        // int types
        std::is_same<T, int>::value || std::is_same<T, unsigned int>::value ||
        // floating point
        std::is_same<T, float>::value;
};
// Vector case: HIP_vector_type<T, rank> is a texture channel type when its
// element type T is one and the vector has 1, 2 or 4 components.
template <typename T, unsigned int rank>
struct __hip_is_tex_channel_type<HIP_vector_type<T, rank>> {
    static constexpr bool value =
        __hip_is_tex_channel_type<T>::value && (rank == 1 || rank == 2 || rank == 4);
};
// Trait whose `value` is true exactly when T is one of the scalar types for
// which the normalized-float read mode is accepted: char, unsigned char,
// short or unsigned short.
template <typename T>
struct __hip_is_tex_normalized_channel_type {
    static constexpr bool value =
        // character types
        std::is_same<T, char>::value || std::is_same<T, unsigned char>::value ||
        // short types
        std::is_same<T, short>::value || std::is_same<T, unsigned short>::value;
};
// Vector case: HIP_vector_type<T, rank> qualifies for normalized-float reads
// when its element type T does and the vector has 1, 2 or 4 components.
template <typename T, unsigned int rank>
struct __hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>> {
    static constexpr bool value =
        __hip_is_tex_normalized_channel_type<T>::value && (rank == 1 || rank == 2 || rank == 4);
};
// Maps a texture element type T and a read mode to the type returned by the
// texture-reference fetch functions.
//
// This primary template is the fallback for unsupported combinations. The
// __hip_tex_ret_t alias always passes `bool` as the third argument, so
// reaching the primary template makes the static_assert compare bool with
// void and fail with "Invalid channel type!".
template <typename T, hipTextureReadMode readMode, typename Enable = void>
struct __hip_tex_ret {
    static_assert(std::is_same<Enable, void>::value, "Invalid channel type!");
};

// Shorthand for the mapped return type; instantiates __hip_tex_ret with
// Enable = bool.
template <typename T, hipTextureReadMode readMode>
using __hip_tex_ret_t = typename __hip_tex_ret<T, readMode, bool>::type;
// Element-type read mode: any T accepted by __hip_is_tex_channel_type is
// returned unchanged.
template <typename T>
struct __hip_tex_ret<T,
                     hipReadModeElementType,
                     typename std::enable_if<__hip_is_tex_channel_type<T>::value, bool>::type> {
    typedef T type;
};
// Element-type read mode, vector case: the result is a vector of the same
// rank whose element type is the mapped return type of the scalar T.
template <typename T, unsigned int rank>
struct __hip_tex_ret<
    HIP_vector_type<T, rank>,
    hipReadModeElementType,
    typename std::enable_if<__hip_is_tex_channel_type<HIP_vector_type<T, rank>>::value, bool>::type> {
    typedef HIP_vector_type<__hip_tex_ret_t<T, hipReadModeElementType>, rank> type;
};
// Normalized-float read mode: any T accepted by
// __hip_is_tex_normalized_channel_type is returned as float.
template <typename T>
struct __hip_tex_ret<
    T,
    hipReadModeNormalizedFloat,
    typename std::enable_if<__hip_is_tex_normalized_channel_type<T>::value, bool>::type> {
    typedef float type;
};
// Normalized-float read mode, vector case: the result is a vector of the same
// rank whose element type is the mapped (normalized) return type of T.
template <typename T, unsigned int rank>
struct __hip_tex_ret<
    HIP_vector_type<T, rank>,
    hipReadModeNormalizedFloat,
    typename std::enable_if<__hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>::value, bool>::type> {
    typedef HIP_vector_type<__hip_tex_ret_t<T, hipReadModeNormalizedFloat>, rank> type;
};
// Reads element x of a 1D texture reference with __ockl_image_load_1Db (no
// sampler is involved) and converts the four-component result to the mapped
// return type with mapFrom.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1Dfetch(
    texture<T, hipTextureType1D, readMode> t, int x)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    auto texel = __ockl_image_load_1Db(i, x);
    return mapFrom<result_t>(texel);
}
// Samples a 1D texture reference at coordinate x with __ockl_image_sample_1D
// and converts the four-component result to the mapped return type.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1D(
    texture<T, hipTextureType1D, readMode> t, float x)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    auto sampled = __ockl_image_sample_1D(i, s, x);
    return mapFrom<result_t>(sampled);
}
// Samples a 2D texture reference at (x, y) with __ockl_image_sample_2D and
// converts the four-component result to the mapped return type.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2D(
    texture<T, hipTextureType2D, readMode> t, float x, float y)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    const auto coord = float2(x, y).data;  // coordinates packed as a native vector
    auto sampled = __ockl_image_sample_2D(i, s, coord);
    return mapFrom<result_t>(sampled);
}
// Samples a layered 1D texture reference with __ockl_image_sample_1Da. The
// coordinate x and the layer index are packed together into one float2, so
// the integer layer is converted to float on the way.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayered(
    texture<T, hipTextureType1DLayered, readMode> t, float x, int layer)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    const auto coord = float2(x, layer).data;
    auto sampled = __ockl_image_sample_1Da(i, s, coord);
    return mapFrom<result_t>(sampled);
}
// Samples a layered 2D texture reference with __ockl_image_sample_2Da. The
// coordinates and the layer index are packed as (x, y, layer, 0) in a float4.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayered(
    texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    const auto coord = float4(x, y, layer, 0.0f).data;
    auto sampled = __ockl_image_sample_2Da(i, s, coord);
    return mapFrom<result_t>(sampled);
}
// Samples a 3D texture reference with __ockl_image_sample_3D. The coordinates
// are packed as (x, y, z, 0) in a float4.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3D(
    texture<T, hipTextureType3D, readMode> t, float x, float y, float z)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    const auto coord = float4(x, y, z, 0.0f).data;
    auto sampled = __ockl_image_sample_3D(i, s, coord);
    return mapFrom<result_t>(sampled);
}
// Samples a cubemap texture reference with __ockl_image_sample_CM. The
// coordinates are packed as (x, y, z, 0) in a float4.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemap(
    texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    const auto coord = float4(x, y, z, 0.0f).data;
    auto sampled = __ockl_image_sample_CM(i, s, coord);
    return mapFrom<result_t>(sampled);
}
// Samples a 1D texture reference at coordinate x, forwarding `level` as the
// extra argument of __ockl_image_sample_lod_1D.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLod(
    texture<T, hipTextureType1D, readMode> t, float x, float level)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    auto sampled = __ockl_image_sample_lod_1D(i, s, x, level);
    return mapFrom<result_t>(sampled);
}
// Samples a 2D texture reference at (x, y), forwarding `level` as the extra
// argument of __ockl_image_sample_lod_2D.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLod(
    texture<T, hipTextureType2D, readMode> t, float x, float y, float level)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    const auto coord = float2(x, y).data;
    auto sampled = __ockl_image_sample_lod_2D(i, s, coord, level);
    return mapFrom<result_t>(sampled);
}
// Samples a layered 1D texture reference with __ockl_image_sample_lod_1Da.
// The coordinate and layer index are packed as (x, layer) in a float2, and
// `level` is forwarded as the extra argument.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredLod(
    texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float level)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    const auto coord = float2(x, layer).data;
    auto sampled = __ockl_image_sample_lod_1Da(i, s, coord, level);
    return mapFrom<result_t>(sampled);
}
// Samples a layered 2D texture reference with __ockl_image_sample_lod_2Da.
// The coordinates and layer index are packed as (x, y, layer, 0) in a float4,
// and `level` is forwarded as the extra argument.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredLod(
    texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float level)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    const auto coord = float4(x, y, layer, 0.0f).data;
    auto sampled = __ockl_image_sample_lod_2Da(i, s, coord, level);
    return mapFrom<result_t>(sampled);
}
// Samples a 3D texture reference with __ockl_image_sample_lod_3D. The
// coordinates are packed as (x, y, z, 0) in a float4, and `level` is forwarded
// as the extra argument.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DLod(
    texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float level)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    const auto coord = float4(x, y, z, 0.0f).data;
    auto sampled = __ockl_image_sample_lod_3D(i, s, coord, level);
    return mapFrom<result_t>(sampled);
}
// Samples a cubemap texture reference with __ockl_image_sample_lod_CM. The
// coordinates are packed as (x, y, z, 0) in a float4, and `level` is forwarded
// as the extra argument.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLod(
    texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float level)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    const auto coord = float4(x, y, z, 0.0f).data;
    auto sampled = __ockl_image_sample_lod_CM(i, s, coord, level);
    return mapFrom<result_t>(sampled);
}
// Samples a layered cubemap texture reference with __ockl_image_sample_CMa.
// The layer index travels in the fourth component: (x, y, z, layer).
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayered(
    texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    const auto coord = float4(x, y, z, layer).data;
    auto sampled = __ockl_image_sample_CMa(i, s, coord);
    return mapFrom<result_t>(sampled);
}
// Samples a layered cubemap texture reference with
// __ockl_image_sample_lod_CMa. The layer index travels in the fourth
// component, (x, y, z, layer), and `level` is forwarded as the extra argument.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayeredLod(
    texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float level)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    const auto coord = float4(x, y, z, layer).data;
    auto sampled = __ockl_image_sample_lod_CMa(i, s, coord, level);
    return mapFrom<result_t>(sampled);
}
// Placeholder for gradient sampling of a cubemap texture reference. The
// matching __ockl_image_sample_grad_CM entry point is missing from the device
// libraries, so nothing is sampled: every coordinate and derivative argument
// is ignored and an empty-initialised result is returned.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapGrad(
    texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
{
    TEXTURE_PARAMETERS_INIT;
    // TODO missing in device libs. Intended implementation once available:
    //   auto tmp = __ockl_image_sample_grad_CM(i, s,
    //                                          float4(x, y, z, 0.0f).data,
    //                                          float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data,
    //                                          float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
    //   return mapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
    return {};
}
// Placeholder for gradient sampling of a layered cubemap texture reference.
// The matching __ockl_image_sample_grad_CMa entry point is missing from the
// device libraries, so nothing is sampled: every coordinate, layer and
// derivative argument is ignored and an empty-initialised result is returned.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayeredGrad(
    texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
{
    TEXTURE_PARAMETERS_INIT;
    // TODO missing in device libs. Intended implementation once available:
    //   auto tmp = __ockl_image_sample_grad_CMa(i, s,
    //                                           float4(x, y, z, layer).data,
    //                                           float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data,
    //                                           float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
    //   return mapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
    return {};
}
// Samples a 1D texture reference at coordinate x, forwarding the scalar
// derivatives dPdx and dPdy to __ockl_image_sample_grad_1D.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DGrad(
    texture<T, hipTextureType1D, readMode> t, float x, float dPdx, float dPdy)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    auto sampled = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
    return mapFrom<result_t>(sampled);
}
// Samples a 2D texture reference at (x, y) with __ockl_image_sample_grad_2D.
// The two derivative vectors are rebuilt component by component as float2
// values before being passed on.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DGrad(
    texture<T, hipTextureType2D, readMode> t, float x, float y, float2 dPdx, float2 dPdy)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    const auto coord = float2(x, y).data;
    const auto ddx = float2(dPdx.x, dPdx.y).data;
    const auto ddy = float2(dPdy.x, dPdy.y).data;
    auto sampled = __ockl_image_sample_grad_2D(i, s, coord, ddx, ddy);
    return mapFrom<result_t>(sampled);
}
// Samples a layered 1D texture reference with __ockl_image_sample_grad_1Da.
// The coordinate and layer index are packed as (x, layer) in a float2; the
// scalar derivatives are forwarded unchanged.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredGrad(
    texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float dPdx, float dPdy)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    const auto coord = float2(x, layer).data;
    auto sampled = __ockl_image_sample_grad_1Da(i, s, coord, dPdx, dPdy);
    return mapFrom<result_t>(sampled);
}
// Samples a layered 2D texture reference with __ockl_image_sample_grad_2Da.
// The coordinates and layer index are packed as (x, y, layer, 0) in a float4;
// the derivative vectors are rebuilt as float2 values.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredGrad(
    texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    const auto coord = float4(x, y, layer, 0.0f).data;
    const auto ddx = float2(dPdx.x, dPdx.y).data;
    const auto ddy = float2(dPdy.x, dPdy.y).data;
    auto sampled = __ockl_image_sample_grad_2Da(i, s, coord, ddx, ddy);
    return mapFrom<result_t>(sampled);
}
// Samples a 3D texture reference with __ockl_image_sample_grad_3D. The
// coordinates and both derivative vectors are passed as float4 values whose
// fourth component is forced to 0.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DGrad(
    texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
{
    using result_t = __hip_tex_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    const auto coord = float4(x, y, z, 0.0f).data;
    const auto ddx = float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data;
    const auto ddy = float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data;
    auto sampled = __ockl_image_sample_grad_3D(i, s, coord, ddx, ddy);
    return mapFrom<result_t>(sampled);
}
// Maps a texture element type T and a read mode to the type returned by
// tex2Dgather.
//
// This primary template is the fallback for unsupported combinations. The
// __hip_tex2dgather_ret_t alias always passes `bool` as the third argument,
// so reaching the primary template makes the static_assert compare bool with
// void and fail with "Invalid channel type!".
template <typename T, hipTextureReadMode readMode, typename Enable = void>
struct __hip_tex2dgather_ret {
    static_assert(std::is_same<Enable, void>::value, "Invalid channel type!");
};

// Shorthand for the mapped gather return type; instantiates
// __hip_tex2dgather_ret with Enable = bool.
template <typename T, hipTextureReadMode readMode>
using __hip_tex2dgather_ret_t = typename __hip_tex2dgather_ret<T, readMode, bool>::type;
// Element-type read mode: a gather on a texture of channel type T yields a
// four-component vector of T.
template <typename T>
struct __hip_tex2dgather_ret<T,
                             hipReadModeElementType,
                             typename std::enable_if<__hip_is_tex_channel_type<T>::value, bool>::type> {
    typedef HIP_vector_type<T, 4> type;
};
// Element-type read mode, vector case: whatever the rank of the texture's
// vector type, a gather yields a four-component vector of its element type T.
template <typename T, unsigned int rank>
struct __hip_tex2dgather_ret<
    HIP_vector_type<T, rank>,
    hipReadModeElementType,
    typename std::enable_if<__hip_is_tex_channel_type<HIP_vector_type<T, rank>>::value, bool>::type> {
    typedef HIP_vector_type<T, 4> type;
};
// Normalized-float read mode: any T accepted by
// __hip_is_tex_normalized_channel_type gathers into a float4.
template <typename T>
struct __hip_tex2dgather_ret<
    T,
    hipReadModeNormalizedFloat,
    typename std::enable_if<__hip_is_tex_normalized_channel_type<T>::value, bool>::type> {
    typedef float4 type;
};
// Gathers one component from a 2D texture reference at (x, y).
// `comp` picks the entry point: 1 -> gather4g, 2 -> gather4b, 3 -> gather4a;
// every other value (including the default 0) -> gather4r. The
// four-component result is converted to the mapped gather return type.
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex2dgather_ret_t<T, readMode> tex2Dgather(
    texture<T, hipTextureType2D, readMode> t, float x, float y, int comp=0)
{
    using result_t = __hip_tex2dgather_ret_t<T, readMode>;
    TEXTURE_PARAMETERS_INIT;
    const auto coord = float2(x, y).data;

    if (comp == 1) {
        auto green = __ockl_image_gather4g_2D(i, s, coord);
        return mapFrom<result_t>(green);
    }
    if (comp == 2) {
        auto blue = __ockl_image_gather4b_2D(i, s, coord);
        return mapFrom<result_t>(blue);
    }
    if (comp == 3) {
        auto alpha = __ockl_image_gather4a_2D(i, s, coord);
        return mapFrom<result_t>(alpha);
    }

    // Any other selector falls back to the first component.
    auto red = __ockl_image_gather4r_2D(i, s, coord);
    return mapFrom<result_t>(red);
}
#endif
@@ -0,0 +1,503 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#if defined(__cplusplus)
#include <hip/hip_vector_types.h>
#include <hip/hip_texture_types.h>
#include <hip/amd_detail/ockl_image.h>
#if !defined(__HIPCC_RTC__)
#include <type_traits>
#endif // !defined(__HIPCC_RTC__)
// Declares the two locals every texture-object fetch function works with.
// It requires a variable named `textureObject` to be in scope and introduces:
//   i - textureObject cast to a pointer to unsigned int in the
//       ADDRESS_SPACE_CONSTANT address space (used as the image descriptor),
//   s - i advanced by HIP_SAMPLER_OBJECT_OFFSET_DWORD elements (used as the
//       sampler descriptor).
// The expansion already ends with a semicolon, which is why the functions
// below invoke the macro without one.
#define TEXTURE_OBJECT_PARAMETERS_INIT \
unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)textureObject; \
unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;
// Trait whose `value` is true exactly when T is one of the scalar types the
// texture-object fetch functions accept: char, unsigned char, short,
// unsigned short, int, unsigned int or float.
template <typename T>
struct __hip_is_itex_channel_type {
    static constexpr bool value =
        // character types
        std::is_same<T, char>::value || std::is_same<T, unsigned char>::value ||
        // short types
        std::is_same<T, short>::value || std::is_same<T, unsigned short>::value ||
        // int types
        std::is_same<T, int>::value || std::is_same<T, unsigned int>::value ||
        // floating point
        std::is_same<T, float>::value;
};
// Vector case: HIP_vector_type<T, rank> is accepted when its element type T
// is and the vector has 1, 2 or 4 components.
template <typename T, unsigned int rank>
struct __hip_is_itex_channel_type<HIP_vector_type<T, rank>> {
    static constexpr bool value =
        __hip_is_itex_channel_type<T>::value && (rank == 1 || rank == 2 || rank == 4);
};
// Reads element x of a 1D texture object with __ockl_image_load_1Db (no
// sampler is involved) and converts the four-component result to T.
// Only participates in overload resolution for accepted channel types.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1Dfetch(hipTextureObject_t textureObject, int x)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto texel = __ockl_image_load_1Db(i, x);
    return mapFrom<T>(texel);
}

// Out-parameter form: stores the result of tex1Dfetch<T>(textureObject, x)
// through ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1Dfetch(T *ptr, hipTextureObject_t textureObject, int x)
{
    *ptr = tex1Dfetch<T>(textureObject, x);
}
// Samples a 1D texture object at coordinate x with __ockl_image_sample_1D and
// converts the four-component result to T.
// Only participates in overload resolution for accepted channel types.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1D(hipTextureObject_t textureObject, float x)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto sampled = __ockl_image_sample_1D(i, s, x);
    return mapFrom<T>(sampled);
}

// Out-parameter form: stores the result of tex1D<T>(textureObject, x) through
// ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1D(T *ptr, hipTextureObject_t textureObject, float x)
{
    *ptr = tex1D<T>(textureObject, x);
}
// Samples a 2D texture object at (x, y) with __ockl_image_sample_2D and
// converts the four-component result to T.
// Only participates in overload resolution for accepted channel types.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2D(hipTextureObject_t textureObject, float x, float y)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    const auto coord = float2(x, y).data;
    auto sampled = __ockl_image_sample_2D(i, s, coord);
    return mapFrom<T>(sampled);
}

// Out-parameter form: stores the result of tex2D<T>(textureObject, x, y)
// through ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2D(T *ptr, hipTextureObject_t textureObject, float x, float y)
{
    *ptr = tex2D<T>(textureObject, x, y);
}
// Samples a 3D texture object with __ockl_image_sample_3D. The coordinates
// are packed as (x, y, z, 0) in a float4 and the result is converted to T.
// Only participates in overload resolution for accepted channel types.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex3D(hipTextureObject_t textureObject, float x, float y, float z)
{
    TEXTURE_OBJECT_PARAMETERS_INIT
    const auto coord = float4(x, y, z, 0.0f).data;
    auto sampled = __ockl_image_sample_3D(i, s, coord);
    return mapFrom<T>(sampled);
}

// Out-parameter form: stores the result of tex3D<T>(textureObject, x, y, z)
// through ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex3D(T *ptr, hipTextureObject_t textureObject, float x, float y, float z)
{
    *ptr = tex3D<T>(textureObject, x, y, z);
}
// Calls __ockl_image_sample_1Da with (x, layer) packed into a float2 and
// converts the result to T via mapFrom<T>.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DLayered(hipTextureObject_t textureObject, float x, int layer)
{
    // Macro defined elsewhere; the call below relies on it providing `i` and `s`.
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto coord = float2(x, layer);
    auto texel = __ockl_image_sample_1Da(i, s, coord.data);
    return mapFrom<T>(texel);
}
// Pointer-output form of tex1DLayered: stores
// tex1DLayered<T>(textureObject, x, layer) in *ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1DLayered(T *ptr, hipTextureObject_t textureObject, float x, int layer)
{
    T sampled = tex1DLayered<T>(textureObject, x, layer);
    *ptr = sampled;
}
// Calls __ockl_image_sample_2Da with (x, y, layer) packed into a float4 whose
// fourth lane is 0.0f, then converts the result to T via mapFrom<T>.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DLayered(hipTextureObject_t textureObject, float x, float y, int layer)
{
    // Macro defined elsewhere; the call below relies on it providing `i` and `s`.
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto coord = float4(x, y, layer, 0.0f);
    auto texel = __ockl_image_sample_2Da(i, s, coord.data);
    return mapFrom<T>(texel);
}
// Pointer-output form of tex2DLayered: stores
// tex2DLayered<T>(textureObject, x, y, layer) in *ptr.
//
// Fix: the body previously forwarded to tex1DLayered<T> with four arguments,
// which is the wrong function (and matches no tex1DLayered overload).
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2DLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer)
{
    *ptr = tex2DLayered<T>(textureObject, x, y, layer);
}
// Calls __ockl_image_sample_CM with (x, y, z) packed into a float4 whose
// fourth lane is 0.0f, then converts the result to T via mapFrom<T>.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemap(hipTextureObject_t textureObject, float x, float y, float z)
{
    // Macro defined elsewhere; the call below relies on it providing `i` and `s`.
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto coord = float4(x, y, z, 0.0f);
    auto texel = __ockl_image_sample_CM(i, s, coord.data);
    return mapFrom<T>(texel);
}
// Pointer-output form of texCubemap: stores
// texCubemap<T>(textureObject, x, y, z) in *ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemap(T *ptr, hipTextureObject_t textureObject, float x, float y, float z)
{
    T sampled = texCubemap<T>(textureObject, x, y, z);
    *ptr = sampled;
}
// Calls __ockl_image_sample_CMa with (x, y, z, layer) packed into a float4
// and converts the result to T via mapFrom<T>.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapLayered(hipTextureObject_t textureObject, float x, float y, float z, int layer)
{
    // Macro defined elsewhere; the call below relies on it providing `i` and `s`.
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto coord = float4(x, y, z, layer);
    auto texel = __ockl_image_sample_CMa(i, s, coord.data);
    return mapFrom<T>(texel);
}
// Pointer-output form of texCubemapLayered: stores
// texCubemapLayered<T>(textureObject, x, y, z, layer) in *ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemapLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer)
{
    T sampled = texCubemapLayered<T>(textureObject, x, y, z, layer);
    *ptr = sampled;
}
// Gathers one channel at (x, y) using the __ockl_image_gather4{r,g,b,a}_2D
// builtins and converts the result to T via mapFrom<T>.
//
// comp selects the channel: 0 -> r, 1 -> g, 2 -> b, 3 -> a. Any other value
// falls back to r, the same channel as the default argument comp = 0.
//
// Fix: the mapping used to be shifted by one (1 -> r, 2 -> g, 3 -> b and
// everything else, including the default comp = 0, -> a), so a default call
// gathered the alpha channel instead of the first one.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject, float x, float y, int comp = 0)
{
    // Macro defined elsewhere; the calls below rely on it providing `i` and `s`.
    TEXTURE_OBJECT_PARAMETERS_INIT
    switch (comp) {
    case 1: {
        auto tmp = __ockl_image_gather4g_2D(i, s, float2(x, y).data);
        return mapFrom<T>(tmp);
    }
    case 2: {
        auto tmp = __ockl_image_gather4b_2D(i, s, float2(x, y).data);
        return mapFrom<T>(tmp);
    }
    case 3: {
        auto tmp = __ockl_image_gather4a_2D(i, s, float2(x, y).data);
        return mapFrom<T>(tmp);
    }
    default: {
        auto tmp = __ockl_image_gather4r_2D(i, s, float2(x, y).data);
        return mapFrom<T>(tmp);
    }
    }
}
// Pointer-output form of tex2Dgather: stores
// tex2Dgather<T>(textureObject, x, y, comp) in *ptr.
//
// Fix: the body previously forwarded to texCubemapLayered<T> with
// (textureObject, x, y, comp), which is an unrelated function with a
// different argument list.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2Dgather(T *ptr, hipTextureObject_t textureObject, float x, float y, int comp = 0)
{
    *ptr = tex2Dgather<T>(textureObject, x, y, comp);
}
// Calls __ockl_image_sample_lod_1D with coordinate x and the given level,
// then converts the result to T via mapFrom<T>.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DLod(hipTextureObject_t textureObject, float x, float level)
{
    // Macro defined elsewhere; the call below relies on it providing `i` and `s`.
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto texel = __ockl_image_sample_lod_1D(i, s, x, level);
    return mapFrom<T>(texel);
}
// Pointer-output form of tex1DLod: stores
// tex1DLod<T>(textureObject, x, level) in *ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1DLod(T *ptr, hipTextureObject_t textureObject, float x, float level)
{
    T sampled = tex1DLod<T>(textureObject, x, level);
    *ptr = sampled;
}
// Calls __ockl_image_sample_lod_2D with the coordinate (x, y) and the given
// level, then converts the result to T via mapFrom<T>.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DLod(hipTextureObject_t textureObject, float x, float y, float level)
{
    // Macro defined elsewhere; the call below relies on it providing `i` and `s`.
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto coord = float2(x, y);
    auto texel = __ockl_image_sample_lod_2D(i, s, coord.data, level);
    return mapFrom<T>(texel);
}
// Pointer-output form of tex2DLod: stores
// tex2DLod<T>(textureObject, x, y, level) in *ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2DLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float level)
{
    T sampled = tex2DLod<T>(textureObject, x, y, level);
    *ptr = sampled;
}
// Calls __ockl_image_sample_lod_3D with (x, y, z) packed into a float4 whose
// fourth lane is 0.0f, plus the given level, then converts the result to T
// via mapFrom<T>.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex3DLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
{
    // Macro defined elsewhere; the call below relies on it providing `i` and `s`.
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto coord = float4(x, y, z, 0.0f);
    auto texel = __ockl_image_sample_lod_3D(i, s, coord.data, level);
    return mapFrom<T>(texel);
}
// Pointer-output form of tex3DLod: stores
// tex3DLod<T>(textureObject, x, y, z, level) in *ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex3DLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
{
    T sampled = tex3DLod<T>(textureObject, x, y, z, level);
    *ptr = sampled;
}
// Calls __ockl_image_sample_lod_1Da with (x, layer) packed into a float2 and
// the given level, then converts the result to T via mapFrom<T>.
//
// Fix: the body previously called the non-LOD builtin __ockl_image_sample_1Da
// and silently ignored the `level` argument. It now follows the same
// __ockl_image_sample_lod_* pattern as the other *Lod functions in this file.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureObject, float x, int layer, float level)
{
    // Macro defined elsewhere; the call below relies on it providing `i` and `s`.
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
    return mapFrom<T>(tmp);
}
// Pointer-output form of tex1DLayeredLod: stores
// tex1DLayeredLod<T>(textureObject, x, layer, level) in *ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1DLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, int layer, float level)
{
    T sampled = tex1DLayeredLod<T>(textureObject, x, layer, level);
    *ptr = sampled;
}
// Calls __ockl_image_sample_lod_2Da with (x, y, layer) packed into a float4
// whose fourth lane is 0.0f, plus the given level, then converts the result
// to T via mapFrom<T>.
//
// Fix: the body previously called the non-LOD builtin __ockl_image_sample_2Da
// and silently ignored the `level` argument. It now follows the same
// __ockl_image_sample_lod_* pattern as the other *Lod functions in this file.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DLayeredLod(hipTextureObject_t textureObject, float x, float y, int layer, float level)
{
    // Macro defined elsewhere; the call below relies on it providing `i` and `s`.
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_lod_2Da(i, s, float4(x, y, layer, 0.0f).data, level);
    return mapFrom<T>(tmp);
}
// Pointer-output form of tex2DLayeredLod: stores
// tex2DLayeredLod<T>(textureObject, x, y, layer, level) in *ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2DLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float level)
{
    T sampled = tex2DLayeredLod<T>(textureObject, x, y, layer, level);
    *ptr = sampled;
}
// Calls __ockl_image_sample_lod_CM with (x, y, z) packed into a float4 whose
// fourth lane is 0.0f, plus the given level, then converts the result to T
// via mapFrom<T>.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
{
    // Macro defined elsewhere; the call below relies on it providing `i` and `s`.
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto coord = float4(x, y, z, 0.0f);
    auto texel = __ockl_image_sample_lod_CM(i, s, coord.data, level);
    return mapFrom<T>(texel);
}
// Pointer-output form of texCubemapLod: stores
// texCubemapLod<T>(textureObject, x, y, z, level) in *ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemapLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
{
    T sampled = texCubemapLod<T>(textureObject, x, y, z, level);
    *ptr = sampled;
}
// Placeholder for gradient-based cubemap sampling.
// No sampling is performed: every argument is ignored and a value-initialized
// T ({}) is returned. The intended implementation is kept below as a comment
// because, per the TODO, the required builtin is not available in device libs.
template <
typename T,
typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
{
TEXTURE_OBJECT_PARAMETERS_INIT
// TODO missing in device libs.
// auto tmp = __ockl_image_sample_grad_CM(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
// return mapFrom<T>(tmp);
// Stub result until the builtin above exists.
return {};
}
// Pointer-output form of texCubemapGrad: stores
// texCubemapGrad<T>(textureObject, x, y, z, dPdx, dPdy) in *ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemapGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
{
    T sampled = texCubemapGrad<T>(textureObject, x, y, z, dPdx, dPdy);
    *ptr = sampled;
}
// Calls __ockl_image_sample_lod_CMa with (x, y, z, layer) packed into a
// float4 and the given level, then converts the result to T via mapFrom<T>.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapLayeredLod(hipTextureObject_t textureObject, float x, float y, float z, int layer, float level)
{
    // Macro defined elsewhere; the call below relies on it providing `i` and `s`.
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto coord = float4(x, y, z, layer);
    auto texel = __ockl_image_sample_lod_CMa(i, s, coord.data, level);
    return mapFrom<T>(texel);
}
// Pointer-output form of texCubemapLayeredLod: stores
// texCubemapLayeredLod<T>(textureObject, x, y, z, layer, level) in *ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemapLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float level)
{
    T sampled = texCubemapLayeredLod<T>(textureObject, x, y, z, layer, level);
    *ptr = sampled;
}
// Calls __ockl_image_sample_grad_1D with coordinate x and the derivatives
// dPdx / dPdy, then converts the result to T via mapFrom<T>.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DGrad(hipTextureObject_t textureObject, float x, float dPdx, float dPdy)
{
    // Macro defined elsewhere; the call below relies on it providing `i` and `s`.
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto texel = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
    return mapFrom<T>(texel);
}
// Pointer-output form of tex1DGrad: stores
// tex1DGrad<T>(textureObject, x, dPdx, dPdy) in *ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1DGrad(T *ptr, hipTextureObject_t textureObject, float x, float dPdx, float dPdy)
{
    T sampled = tex1DGrad<T>(textureObject, x, dPdx, dPdy);
    *ptr = sampled;
}
// Calls __ockl_image_sample_grad_2D with the coordinate (x, y) and the two
// float2 derivatives (rebuilt from their .x/.y members), then converts the
// result to T via mapFrom<T>.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DGrad(hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy)
{
    // Macro defined elsewhere; the call below relies on it providing `i` and `s`.
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto coord = float2(x, y);
    auto ddx = float2(dPdx.x, dPdx.y);
    auto ddy = float2(dPdy.x, dPdy.y);
    auto texel = __ockl_image_sample_grad_2D(i, s, coord.data, ddx.data, ddy.data);
    return mapFrom<T>(texel);
}
// Pointer-output form of tex2DGrad: stores
// tex2DGrad<T>(textureObject, x, y, dPdx, dPdy) in *ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2DGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy)
{
    T sampled = tex2DGrad<T>(textureObject, x, y, dPdx, dPdy);
    *ptr = sampled;
}
// Calls __ockl_image_sample_grad_3D with (x, y, z) and the two derivative
// vectors, each packed into a float4 whose fourth lane is 0.0f (the .w lane
// of dPdx/dPdy is not forwarded), then converts the result to T via mapFrom<T>.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex3DGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
{
    // Macro defined elsewhere; the call below relies on it providing `i` and `s`.
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto coord = float4(x, y, z, 0.0f);
    auto ddx = float4(dPdx.x, dPdx.y, dPdx.z, 0.0f);
    auto ddy = float4(dPdy.x, dPdy.y, dPdy.z, 0.0f);
    auto texel = __ockl_image_sample_grad_3D(i, s, coord.data, ddx.data, ddy.data);
    return mapFrom<T>(texel);
}
// Pointer-output form of tex3DGrad: stores
// tex3DGrad<T>(textureObject, x, y, z, dPdx, dPdy) in *ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex3DGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
{
    T sampled = tex3DGrad<T>(textureObject, x, y, z, dPdx, dPdy);
    *ptr = sampled;
}
// Calls __ockl_image_sample_grad_1Da with (x, layer) packed into a float2 and
// the scalar derivatives dPdx / dPdy, then converts the result to T via
// mapFrom<T>.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DLayeredGrad(hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy)
{
    // Macro defined elsewhere; the call below relies on it providing `i` and `s`.
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto coord = float2(x, layer);
    auto texel = __ockl_image_sample_grad_1Da(i, s, coord.data, dPdx, dPdy);
    return mapFrom<T>(texel);
}
// Pointer-output form of tex1DLayeredGrad: stores
// tex1DLayeredGrad<T>(textureObject, x, layer, dPdx, dPdy) in *ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1DLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy)
{
    T sampled = tex1DLayeredGrad<T>(textureObject, x, layer, dPdx, dPdy);
    *ptr = sampled;
}
// Calls __ockl_image_sample_grad_2Da with (x, y, layer) packed into a float4
// whose fourth lane is 0.0f and the two float2 derivatives (rebuilt from
// their .x/.y members), then converts the result to T via mapFrom<T>.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DLayeredGrad(hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
{
    // Macro defined elsewhere; the call below relies on it providing `i` and `s`.
    TEXTURE_OBJECT_PARAMETERS_INIT
    auto coord = float4(x, y, layer, 0.0f);
    auto ddx = float2(dPdx.x, dPdx.y);
    auto ddy = float2(dPdy.x, dPdy.y);
    auto texel = __ockl_image_sample_grad_2Da(i, s, coord.data, ddx.data, ddy.data);
    return mapFrom<T>(texel);
}
// Pointer-output form of tex2DLayeredGrad: stores
// tex2DLayeredGrad<T>(textureObject, x, y, layer, dPdx, dPdy) in *ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2DLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
{
    T sampled = tex2DLayeredGrad<T>(textureObject, x, y, layer, dPdx, dPdy);
    *ptr = sampled;
}
// Placeholder for gradient-based layered-cubemap sampling.
// No sampling is performed: every argument is ignored and a value-initialized
// T ({}) is returned. The intended implementation is kept below as a comment
// because, per the TODO, the required builtin is not available in device libs.
template <
typename T,
typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapLayeredGrad(hipTextureObject_t textureObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
{
TEXTURE_OBJECT_PARAMETERS_INIT
// TODO missing in device libs.
// auto tmp = __ockl_image_sample_grad_CMa(i, s, float4(x, y, z, layer).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
// return mapFrom<T>(tmp);
// Stub result until the builtin above exists.
return {};
}
// Pointer-output form of texCubemapLayeredGrad: stores
// texCubemapLayeredGrad<T>(textureObject, x, y, z, layer, dPdx, dPdy) in *ptr.
template <typename T,
          typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemapLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
{
    T sampled = texCubemapLayeredGrad<T>(textureObject, x, y, z, layer, dPdx, dPdy);
    *ptr = sampled;
}
#endif
+1
查看文件
@@ -0,0 +1 @@
amd_detail
+1
查看文件
@@ -0,0 +1 @@
nvidia_detail
@@ -0,0 +1,28 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_CHANNEL_DESCRIPTOR_H
#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_CHANNEL_DESCRIPTOR_H
#include "channel_descriptor.h"
#endif
@@ -0,0 +1,75 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_ATOMICS_H
#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_ATOMICS_H
/**
 * Compare-and-swap emulation of atomicMax for float.
 *
 * Stores val into *addr if val is greater than the stored value.
 * Returns the value that was stored in *addr before this call's update, or
 * the value observed when no update was needed (stored value not less than
 * val, or a NaN operand makes the comparison false).
 *
 * Fix: the old loop kept iterating after a successful swap because the stale
 * `value` still compared less than `val`; it issued a second, failing CAS and
 * returned the new contents instead of the previous value.
 */
__device__ inline float atomicMax(float* addr, float val) {
  unsigned int* uaddr = (unsigned int*)addr;
  unsigned int observed = *uaddr;
  while (__uint_as_float(observed) < val) {
    const unsigned int assumed = observed;
    observed = atomicCAS(uaddr, assumed, __float_as_uint(val));
    if (observed == assumed) {
      break;  // swap succeeded; `observed` holds the previous contents
    }
  }
  return __uint_as_float(observed);
}
/**
 * Compare-and-swap emulation of atomicMax for double.
 *
 * Stores val into *addr if val is greater than the stored value.
 * Returns the value that was stored in *addr before this call's update, or
 * the value observed when no update was needed.
 *
 * Fix: the old loop kept iterating after a successful swap because the stale
 * `value` still compared less than `val`; it issued a second, failing CAS and
 * returned the new contents instead of the previous value.
 */
__device__ inline double atomicMax(double* addr, double val) {
  unsigned long long* uaddr = (unsigned long long*)addr;
  unsigned long long observed = *uaddr;
  while (__longlong_as_double(observed) < val) {
    const unsigned long long assumed = observed;
    observed = atomicCAS(uaddr, assumed, __double_as_longlong(val));
    if (observed == assumed) {
      break;  // swap succeeded; `observed` holds the previous contents
    }
  }
  return __longlong_as_double(observed);
}
/**
 * Compare-and-swap emulation of atomicMin for float.
 *
 * Stores val into *addr if val is less than the stored value.
 * Returns the value that was stored in *addr before this call's update, or
 * the value observed when no update was needed (stored value not greater than
 * val, or a NaN operand makes the comparison false).
 *
 * Fix: the old loop kept iterating after a successful swap because the stale
 * `value` still compared greater than `val`; it issued a second, failing CAS
 * and returned the new contents instead of the previous value.
 */
__device__ inline float atomicMin(float* addr, float val) {
  unsigned int* uaddr = (unsigned int*)addr;
  unsigned int observed = *uaddr;
  while (__uint_as_float(observed) > val) {
    const unsigned int assumed = observed;
    observed = atomicCAS(uaddr, assumed, __float_as_uint(val));
    if (observed == assumed) {
      break;  // swap succeeded; `observed` holds the previous contents
    }
  }
  return __uint_as_float(observed);
}
/**
 * Compare-and-swap emulation of atomicMin for double.
 *
 * Stores val into *addr if val is less than the stored value.
 * Returns the value that was stored in *addr before this call's update, or
 * the value observed when no update was needed.
 *
 * Fix: the old loop kept iterating after a successful swap because the stale
 * `value` still compared greater than `val`; it issued a second, failing CAS
 * and returned the new contents instead of the previous value.
 */
__device__ inline double atomicMin(double* addr, double val) {
  unsigned long long* uaddr = (unsigned long long*)addr;
  unsigned long long observed = *uaddr;
  while (__longlong_as_double(observed) > val) {
    const unsigned long long assumed = observed;
    observed = atomicCAS(uaddr, assumed, __double_as_longlong(val));
    if (observed == assumed) {
      break;  // swap succeeded; `observed` holds the previous contents
    }
  }
  return __longlong_as_double(observed);
}
#endif
@@ -0,0 +1,119 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COMPLEX_H
#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COMPLEX_H
#include "cuComplex.h"
/* HIP name for CUDA's single-precision complex type. */
typedef cuFloatComplex hipFloatComplex;

/* Forwards to cuCrealf(z). */
__device__ __host__ static inline float hipCrealf(hipFloatComplex z) {
  const float result = cuCrealf(z);
  return result;
}
/* Forwards to cuCimagf(z). */
__device__ __host__ static inline float hipCimagf(hipFloatComplex z) {
  const float result = cuCimagf(z);
  return result;
}
/* Builds a hipFloatComplex from (a, b) by forwarding to make_cuFloatComplex. */
__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
  const hipFloatComplex result = make_cuFloatComplex(a, b);
  return result;
}
/* Forwards to cuConjf(z). */
__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
  const hipFloatComplex result = cuConjf(z);
  return result;
}
/* Returns cuCabsf(z) multiplied by itself. */
__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) {
  const float magnitude = cuCabsf(z);
  return magnitude * magnitude;
}
/* Forwards to cuCaddf(p, q). */
__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
  const hipFloatComplex result = cuCaddf(p, q);
  return result;
}
/* Forwards to cuCsubf(p, q). */
__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
  const hipFloatComplex result = cuCsubf(p, q);
  return result;
}
/* Forwards to cuCmulf(p, q). */
__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
  const hipFloatComplex result = cuCmulf(p, q);
  return result;
}
/* Forwards to cuCdivf(p, q). */
__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
  const hipFloatComplex result = cuCdivf(p, q);
  return result;
}
/* Forwards to cuCabsf(z). */
__device__ __host__ static inline float hipCabsf(hipFloatComplex z) {
  const float result = cuCabsf(z);
  return result;
}
/* HIP name for CUDA's double-precision complex type. */
typedef cuDoubleComplex hipDoubleComplex;

/* Forwards to cuCreal(z). */
__device__ __host__ static inline double hipCreal(hipDoubleComplex z) {
  const double result = cuCreal(z);
  return result;
}
/* Forwards to cuCimag(z). */
__device__ __host__ static inline double hipCimag(hipDoubleComplex z) {
  const double result = cuCimag(z);
  return result;
}
/* Builds a hipDoubleComplex from (a, b) by forwarding to make_cuDoubleComplex. */
__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
  const hipDoubleComplex result = make_cuDoubleComplex(a, b);
  return result;
}
/* Forwards to cuConj(z). */
__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
  const hipDoubleComplex result = cuConj(z);
  return result;
}
/* Returns cuCabs(z) multiplied by itself. */
__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) {
  const double magnitude = cuCabs(z);
  return magnitude * magnitude;
}
/* Forwards to cuCadd(p, q). */
__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
  const hipDoubleComplex result = cuCadd(p, q);
  return result;
}
/* Forwards to cuCsub(p, q). */
__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
  const hipDoubleComplex result = cuCsub(p, q);
  return result;
}
/* Forwards to cuCmul(p, q). */
__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
  const hipDoubleComplex result = cuCmul(p, q);
  return result;
}
/* Forwards to cuCdiv(p, q). */
__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
  const hipDoubleComplex result = cuCdiv(p, q);
  return result;
}
/* Forwards to cuCabs(z). */
__device__ __host__ static inline double hipCabs(hipDoubleComplex z) {
  const double result = cuCabs(z);
  return result;
}
/* hipComplex is the same underlying type as hipFloatComplex (cuFloatComplex). */
typedef cuFloatComplex hipComplex;

/*
 * Builds a hipComplex from (x, y) by forwarding to make_cuComplex.
 * Follows the make_hip<Type> naming used by make_hipFloatComplex and
 * make_hipDoubleComplex in this header.
 */
__device__ __host__ static inline hipComplex make_hipComplex(float x, float y) {
  return make_cuComplex(x, y);
}

/*
 * Original spelling of the hipComplex constructor, kept so existing callers
 * continue to compile. Same result as make_hipComplex(x, y).
 */
__device__ __host__ static inline hipComplex make_Complex(float x, float y) {
  return make_cuComplex(x, y);
}
/* Forwards to cuComplexDoubleToFloat(z). */
__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
  const hipFloatComplex narrowed = cuComplexDoubleToFloat(z);
  return narrowed;
}
/* Forwards to cuComplexFloatToDouble(z). */
__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
  const hipDoubleComplex widened = cuComplexFloatToDouble(z);
  return widened;
}
/* Forwards to cuCfmaf(p, q, r). */
__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
  const hipComplex result = cuCfmaf(p, q, r);
  return result;
}
/* Forwards to cuCfma(p, q, r). */
__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p,
                                                           hipDoubleComplex q,
                                                           hipDoubleComplex r) {
  const hipDoubleComplex result = cuCfma(p, q, r);
  return result;
}
#endif
@@ -0,0 +1,12 @@
#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H
#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H
// Include CUDA headers
#include <cuda_runtime.h>
#include <cooperative_groups.h>
// Include HIP wrapper headers around CUDA
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#endif // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H
@@ -0,0 +1,62 @@
/*
Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef NVIDIA_HIP_MATH_CONSTANTS_H
#define NVIDIA_HIP_MATH_CONSTANTS_H
#include <math_constants.h>
// Single-precision math constants: each HIP_<NAME>_F macro is a plain alias
// for the CUDART_<NAME>_F macro of the same name (the file includes
// <math_constants.h> above for those definitions).
#define HIP_INF_F CUDART_INF_F
#define HIP_NAN_F CUDART_NAN_F
#define HIP_MIN_DENORM_F CUDART_MIN_DENORM_F
#define HIP_MAX_NORMAL_F CUDART_MAX_NORMAL_F
#define HIP_NEG_ZERO_F CUDART_NEG_ZERO_F
#define HIP_ZERO_F CUDART_ZERO_F
#define HIP_ONE_F CUDART_ONE_F
#define HIP_SQRT_HALF_F CUDART_SQRT_HALF_F
#define HIP_SQRT_HALF_HI_F CUDART_SQRT_HALF_HI_F
#define HIP_SQRT_HALF_LO_F CUDART_SQRT_HALF_LO_F
#define HIP_SQRT_TWO_F CUDART_SQRT_TWO_F
#define HIP_THIRD_F CUDART_THIRD_F
#define HIP_PIO4_F CUDART_PIO4_F
#define HIP_PIO2_F CUDART_PIO2_F
#define HIP_3PIO4_F CUDART_3PIO4_F
#define HIP_2_OVER_PI_F CUDART_2_OVER_PI_F
#define HIP_SQRT_2_OVER_PI_F CUDART_SQRT_2_OVER_PI_F
#define HIP_PI_F CUDART_PI_F
#define HIP_L2E_F CUDART_L2E_F
#define HIP_L2T_F CUDART_L2T_F
#define HIP_LG2_F CUDART_LG2_F
#define HIP_LGE_F CUDART_LGE_F
#define HIP_LN2_F CUDART_LN2_F
#define HIP_LNT_F CUDART_LNT_F
#define HIP_LNPI_F CUDART_LNPI_F
#define HIP_TWO_TO_M126_F CUDART_TWO_TO_M126_F
#define HIP_TWO_TO_126_F CUDART_TWO_TO_126_F
#define HIP_NORM_HUGE_F CUDART_NORM_HUGE_F
#define HIP_TWO_TO_23_F CUDART_TWO_TO_23_F
#define HIP_TWO_TO_24_F CUDART_TWO_TO_24_F
#define HIP_TWO_TO_31_F CUDART_TWO_TO_31_F
#define HIP_TWO_TO_32_F CUDART_TWO_TO_32_F
#define HIP_REMQUO_BITS_F CUDART_REMQUO_BITS_F
#define HIP_REMQUO_MASK_F CUDART_REMQUO_MASK_F
#define HIP_TRIG_PLOSS_F CUDART_TRIG_PLOSS_F
#endif
@@ -0,0 +1,124 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_H
#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_H
#include <cuda_runtime.h>
#include <hip/hip_runtime_api.h>
// HIP_KERNEL_NAME expands to its arguments unchanged.
#define HIP_KERNEL_NAME(...) __VA_ARGS__
typedef int hipLaunchParm;
// Expands to a CUDA triple-chevron launch of kernelName with the given grid,
// block, dynamic shared memory size and stream; the remaining arguments are
// passed to the kernel. Wrapped in do/while(0) so it acts as one statement.
#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \
do { \
kernelName<<<numBlocks, numThreads, memPerBlock, streamId>>>(__VA_ARGS__); \
} while (0)
// Public launch macro: parenthesizes the kernel name and forwards everything
// else to hipLaunchKernelGGLInternal.
#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
// HIP spelling of cudaReadModeElementType.
#define hipReadModeElementType cudaReadModeElementType
// Feature-test macros, defined only when __CUDA_ARCH__ is defined. Each one
// expands to a comparison of __CUDA_ARCH__ against a minimum architecture
// value, so it can be used directly in #if / if conditions.
#ifdef __CUDA_ARCH__
// 32-bit Atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (__CUDA_ARCH__ >= 110)
#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (__CUDA_ARCH__ >= 110)
#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (__CUDA_ARCH__ >= 120)
#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (__CUDA_ARCH__ >= 120)
#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (__CUDA_ARCH__ >= 200)
// 64-bit Atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (__CUDA_ARCH__ >= 200)
// NOTE(review): the shared-memory threshold (120) is lower than the
// global-memory one (200) above -- confirm this is intended.
#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (__CUDA_ARCH__ >= 120)
// Doubles
#define __HIP_ARCH_HAS_DOUBLES__ (__CUDA_ARCH__ >= 120)
// warp cross-lane operations:
#define __HIP_ARCH_HAS_WARP_VOTE__ (__CUDA_ARCH__ >= 120)
#define __HIP_ARCH_HAS_WARP_BALLOT__ (__CUDA_ARCH__ >= 200)
#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (__CUDA_ARCH__ >= 300)
#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (__CUDA_ARCH__ >= 350)
// sync
#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (__CUDA_ARCH__ >= 200)
#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (__CUDA_ARCH__ >= 200)
// misc
#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (__CUDA_ARCH__ >= 200)
#define __HIP_ARCH_HAS_3DGRID__ (__CUDA_ARCH__ >= 200)
#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (__CUDA_ARCH__ >= 350)
#endif
#ifdef __CUDACC__
#include "nvidia_hip_atomics.h"
#include "nvidia_hip_unsafe_atomics.h"
// HIP spellings of the CUDA built-in index/dimension variables.
#define hipThreadIdx_x threadIdx.x
#define hipThreadIdx_y threadIdx.y
#define hipThreadIdx_z threadIdx.z
#define hipBlockIdx_x blockIdx.x
#define hipBlockIdx_y blockIdx.y
#define hipBlockIdx_z blockIdx.z
#define hipBlockDim_x blockDim.x
#define hipBlockDim_y blockDim.y
#define hipBlockDim_z blockDim.z
#define hipGridDim_x gridDim.x
#define hipGridDim_y gridDim.y
#define hipGridDim_z gridDim.z
// Expands to the address of X. The expansion is not parenthesized, so any
// operator applied directly to HIP_SYMBOL(X) binds to `&X` as written.
#define HIP_SYMBOL(X) &X
/**
 * Map HIP_DYNAMIC_SHARED to "extern __shared__" for compatibility with old HIP applications.
 * To be removed in a future release.
 * The expansion already ends with ';'.
 */
#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
// Expands to nothing on this backend.
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
#ifdef __HIP_DEVICE_COMPILE__
// abort_(): executes the PTX "trap" instruction.
#define abort_() \
    { asm("trap;"); }
// Device-side replacement for assert(): calls abort_() when COND is false.
// COND is parenthesized before negation so that compound expressions such as
// assert(a == b) test !(a == b) rather than (!a) == b.
#undef assert
#define assert(COND) \
    { \
        if (!(COND)) { \
            abort_(); \
        } \
    }
#endif
// __clock() / __clock64() expand to calls to clock() / clock64().
#define __clock() clock()
#define __clock64() clock64()
#endif
#endif
文件差异内容过多而无法显示 加载差异
@@ -0,0 +1,6 @@
#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_TEXTURE_TYPES_H
#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_TEXTURE_TYPES_H
#include <texture_types.h>
#endif
@@ -0,0 +1,100 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_UNSAFE_ATOMICS_H
#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_UNSAFE_ATOMICS_H
/* Forwards to atomicAdd(addr, value) and returns its result. */
__device__ inline float unsafeAtomicAdd(float* addr, float value) {
  const float result = atomicAdd(addr, value);
  return result;
}
/*
 * Atomic add for double.
 *
 * When __CUDA_ARCH__ >= 600 this forwards to atomicAdd(double*, double).
 * Otherwise it emulates the add with a 64-bit compare-and-swap loop and
 * returns the value stored in *addr before the add.
 *
 * Fixes in the CAS path:
 *  - the loop used to compare __double_as_longlong(expected) with
 *    __double_as_longlong(old_val); both operands are integers, so that went
 *    through a lossy integer-to-double conversion and could end the loop even
 *    though the CAS had failed. The raw bit patterns are now compared.
 *  - the return used to convert the 64-bit integer bit pattern numerically to
 *    double; it is now reinterpreted with __longlong_as_double.
 */
__device__ inline double unsafeAtomicAdd(double* addr, double value) {
#if __CUDA_ARCH__ < 600
  unsigned long long* addr_cast = (unsigned long long*)addr;
  unsigned long long old_val = *addr_cast;
  unsigned long long expected;
  do {
    expected = old_val;
    old_val = atomicCAS(addr_cast, expected,
                        __double_as_longlong(value + __longlong_as_double(expected)));
  } while (expected != old_val);  // retry until the CAS saw the value we assumed
  return __longlong_as_double(old_val);
#else
  return atomicAdd(addr, value);
#endif
}
/* Forwards to atomicMax(addr, value) and returns its result. */
__device__ inline float unsafeAtomicMax(float* addr, float value) {
  const float result = atomicMax(addr, value);
  return result;
}
/* Forwards to atomicMax(addr, val) and returns its result. */
__device__ inline double unsafeAtomicMax(double* addr, double val) {
  const double result = atomicMax(addr, val);
  return result;
}
/* Forwards to atomicMin(addr, value) and returns its result. */
__device__ inline float unsafeAtomicMin(float* addr, float value) {
  const float result = atomicMin(addr, value);
  return result;
}
/* Forwards to atomicMin(addr, val) and returns its result. */
__device__ inline double unsafeAtomicMin(double* addr, double val) {
  const double result = atomicMin(addr, val);
  return result;
}
/* Forwards to atomicAdd(addr, value) and returns its result. */
__device__ inline float safeAtomicAdd(float* addr, float value) {
  const float result = atomicAdd(addr, value);
  return result;
}
/*
 * Atomic add for double.
 *
 * When __CUDA_ARCH__ >= 600 this forwards to atomicAdd(double*, double).
 * Otherwise it emulates the add with a 64-bit compare-and-swap loop and
 * returns the value stored in *addr before the add.
 *
 * Fixes in the CAS path:
 *  - the loop used to compare __double_as_longlong(expected) with
 *    __double_as_longlong(old_val); both operands are integers, so that went
 *    through a lossy integer-to-double conversion and could end the loop even
 *    though the CAS had failed. The raw bit patterns are now compared.
 *  - the return used to convert the 64-bit integer bit pattern numerically to
 *    double; it is now reinterpreted with __longlong_as_double.
 */
__device__ inline double safeAtomicAdd(double* addr, double value) {
#if __CUDA_ARCH__ < 600
  unsigned long long* addr_cast = (unsigned long long*)addr;
  unsigned long long old_val = *addr_cast;
  unsigned long long expected;
  do {
    expected = old_val;
    old_val = atomicCAS(addr_cast, expected,
                        __double_as_longlong(value + __longlong_as_double(expected)));
  } while (expected != old_val);  // retry until the CAS saw the value we assumed
  return __longlong_as_double(old_val);
#else
  return atomicAdd(addr, value);
#endif
}
/* Forwards to atomicMax(addr, value) and returns its result. */
__device__ inline float safeAtomicMax(float* addr, float value) {
  const float result = atomicMax(addr, value);
  return result;
}
/* Forwards to atomicMax(addr, val) and returns its result. */
__device__ inline double safeAtomicMax(double* addr, double val) {
  const double result = atomicMax(addr, val);
  return result;
}
/* Forwards to atomicMin(addr, value) and returns its result. */
__device__ inline float safeAtomicMin(float* addr, float value) {
  const float result = atomicMin(addr, value);
  return result;
}
/* Forwards to atomicMin(addr, val) and returns its result. */
__device__ inline double safeAtomicMin(double* addr, double val) {
  const double result = atomicMin(addr, val);
  return result;
}
#endif
@@ -0,0 +1,172 @@
/*
Copyright (c) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIPRTC_H
#define HIPRTC_H
#include <cuda.h>
#include <nvrtc.h>
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
#include <stdlib.h>
#if !defined(_WIN32)
#pragma GCC visibility push(default)
#endif
/*
 * Result codes returned by the hiprtc* wrappers in this header.
 * The helpers hiprtcResultTonvrtcResult / nvrtcResultTohiprtcResult translate
 * each enumerator to and from the NVRTC_* code with the matching name.
 */
typedef enum hiprtcResult {
HIPRTC_SUCCESS = 0,
HIPRTC_ERROR_OUT_OF_MEMORY = 1,
HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
HIPRTC_ERROR_INVALID_INPUT = 3,
HIPRTC_ERROR_INVALID_PROGRAM = 4,
HIPRTC_ERROR_INVALID_OPTION = 5,
HIPRTC_ERROR_COMPILATION = 6,
HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
HIPRTC_ERROR_INTERNAL_ERROR = 11
} hiprtcResult;
/*
 * Translate a hiprtcResult into the nvrtcResult of the same name.
 * Any value without an explicit mapping yields NVRTC_ERROR_INTERNAL_ERROR.
 */
inline static nvrtcResult hiprtcResultTonvrtcResult(hiprtcResult result) {
    nvrtcResult converted;
    switch (result) {
        case HIPRTC_SUCCESS:
            converted = NVRTC_SUCCESS;
            break;
        case HIPRTC_ERROR_OUT_OF_MEMORY:
            converted = NVRTC_ERROR_OUT_OF_MEMORY;
            break;
        case HIPRTC_ERROR_PROGRAM_CREATION_FAILURE:
            converted = NVRTC_ERROR_PROGRAM_CREATION_FAILURE;
            break;
        case HIPRTC_ERROR_INVALID_INPUT:
            converted = NVRTC_ERROR_INVALID_INPUT;
            break;
        case HIPRTC_ERROR_INVALID_PROGRAM:
            converted = NVRTC_ERROR_INVALID_PROGRAM;
            break;
        case HIPRTC_ERROR_INVALID_OPTION:
            converted = NVRTC_ERROR_INVALID_OPTION;
            break;
        case HIPRTC_ERROR_COMPILATION:
            converted = NVRTC_ERROR_COMPILATION;
            break;
        case HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE:
            converted = NVRTC_ERROR_BUILTIN_OPERATION_FAILURE;
            break;
        case HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION:
            converted = NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION;
            break;
        case HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION:
            converted = NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION;
            break;
        case HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID:
            converted = NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID;
            break;
        /* The explicit internal-error code and every unknown code share one result. */
        case HIPRTC_ERROR_INTERNAL_ERROR:
        default:
            converted = NVRTC_ERROR_INTERNAL_ERROR;
            break;
    }
    return converted;
}
/*
 * Translate an nvrtcResult into the hiprtcResult of the same name.
 * Any value without an explicit mapping yields HIPRTC_ERROR_INTERNAL_ERROR.
 */
inline static hiprtcResult nvrtcResultTohiprtcResult(nvrtcResult result) {
    hiprtcResult converted;
    switch (result) {
        case NVRTC_SUCCESS:
            converted = HIPRTC_SUCCESS;
            break;
        case NVRTC_ERROR_OUT_OF_MEMORY:
            converted = HIPRTC_ERROR_OUT_OF_MEMORY;
            break;
        case NVRTC_ERROR_PROGRAM_CREATION_FAILURE:
            converted = HIPRTC_ERROR_PROGRAM_CREATION_FAILURE;
            break;
        case NVRTC_ERROR_INVALID_INPUT:
            converted = HIPRTC_ERROR_INVALID_INPUT;
            break;
        case NVRTC_ERROR_INVALID_PROGRAM:
            converted = HIPRTC_ERROR_INVALID_PROGRAM;
            break;
        case NVRTC_ERROR_INVALID_OPTION:
            converted = HIPRTC_ERROR_INVALID_OPTION;
            break;
        case NVRTC_ERROR_COMPILATION:
            converted = HIPRTC_ERROR_COMPILATION;
            break;
        case NVRTC_ERROR_BUILTIN_OPERATION_FAILURE:
            converted = HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE;
            break;
        case NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION:
            converted = HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION;
            break;
        case NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION:
            converted = HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION;
            break;
        case NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID:
            converted = HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID;
            break;
        /* The explicit internal-error code and every unknown code share one result. */
        case NVRTC_ERROR_INTERNAL_ERROR:
        default:
            converted = HIPRTC_ERROR_INTERNAL_ERROR;
            break;
    }
    return converted;
}
/* Returns nvrtcGetErrorString() for the NVRTC code that `result` maps to. */
inline static const char* hiprtcGetErrorString(hiprtcResult result) {
return nvrtcGetErrorString(hiprtcResultTonvrtcResult(result));
}
/* Forwards `major` and `minor` to nvrtcVersion() and converts its status to a hiprtcResult. */
inline static hiprtcResult hiprtcVersion(int* major, int* minor) {
return nvrtcResultTohiprtcResult(nvrtcVersion(major, minor));
}
/* A hiprtcProgram is the nvrtcProgram handle itself, so it can be passed straight to nvrtc* calls. */
typedef nvrtcProgram hiprtcProgram;
/* Forwards to nvrtcAddNameExpression(prog, name_expression) and converts its status to a hiprtcResult. */
inline static hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog, const char* name_expression) {
return nvrtcResultTohiprtcResult(nvrtcAddNameExpression(prog, name_expression));
}
/*
 * Forwards to nvrtcCompileProgram(prog, numOptions, options) and converts its
 * status to a hiprtcResult. The option strings are passed through unchanged.
 */
inline static hiprtcResult hiprtcCompileProgram(hiprtcProgram prog, int numOptions, const char** options) {
return nvrtcResultTohiprtcResult(nvrtcCompileProgram(prog, numOptions, options));
}
/*
 * Forwards all arguments, in the same order, to nvrtcCreateProgram() and
 * converts its status to a hiprtcResult.
 */
inline static hiprtcResult hiprtcCreateProgram(hiprtcProgram* prog, const char* src, const char* name,
int numHeaders, const char** headers, const char** includeNames) {
return nvrtcResultTohiprtcResult(
nvrtcCreateProgram(prog, src, name, numHeaders, headers, includeNames));
}
/* Forwards to nvrtcDestroyProgram(prog) and converts its status to a hiprtcResult. */
inline static hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog) {
return nvrtcResultTohiprtcResult(nvrtcDestroyProgram(prog));
}
/*
 * Forwards to nvrtcGetLoweredName(prog, name_expression, lowered_name) and
 * converts its status to a hiprtcResult.
 */
inline static hiprtcResult hiprtcGetLoweredName(hiprtcProgram prog, const char* name_expression,
const char** lowered_name) {
return nvrtcResultTohiprtcResult(nvrtcGetLoweredName(prog, name_expression, lowered_name));
}
/* Forwards to nvrtcGetProgramLog(prog, log) and converts its status to a hiprtcResult. */
inline static hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log) {
return nvrtcResultTohiprtcResult(nvrtcGetProgramLog(prog, log));
}
/* Forwards to nvrtcGetProgramLogSize(prog, logSizeRet) and converts its status to a hiprtcResult. */
inline static hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog, size_t* logSizeRet) {
return nvrtcResultTohiprtcResult(nvrtcGetProgramLogSize(prog, logSizeRet));
}
/*
 * Forwards to nvrtcGetPTX(prog, code) and converts its status to a
 * hiprtcResult; in this header the "code" is whatever nvrtcGetPTX writes.
 */
inline static hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code) {
return nvrtcResultTohiprtcResult(nvrtcGetPTX(prog, code));
}
/* Forwards to nvrtcGetPTXSize(prog, codeSizeRet) and converts its status to a hiprtcResult. */
inline static hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet) {
return nvrtcResultTohiprtcResult(nvrtcGetPTXSize(prog, codeSizeRet));
}
#if !defined(_WIN32)
#pragma GCC visibility pop
#endif
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif // HIPRTC_H
+135
查看文件
@@ -0,0 +1,135 @@
#!/bin/bash
# Copyright (c) 2017 - 2021 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# Parse command-line options
# Option strings
# Short and long option specs handed to getopt; a trailing ':' marks a long
# option that takes a value.
SHORT=h
LONG=help,opencl:,hip:,rocclr:

# Let getopt normalise the command line and stop if it rejects it.
if ! OPTS=$(getopt --options $SHORT --long $LONG --name "$0" -- "$@"); then
    echo "Failed to parse options...exiting." >&2
    exit 1
fi
# Print the expected command line to stdout and terminate the script with
# exit status 1.
usage() {
echo "Usage: $0 --hip <PATH to the hip common src> --opencl <PATH to the opencl src> --rocclr <PATH to the rocclr src>" ;
exit 1;
}
# Running without any argument just shows the usage text.
if [ $# -eq 0 ]; then
    usage
fi

# Replace the positional parameters with getopt's normalised list.
eval set -- "$OPTS"

# Walk the normalised options and store each path in its variable.
while :; do
    case "$1" in
        --hip )     HIP_DIR="$2";    shift 2 ;;
        --rocclr )  ROCCLR_DIR="$2"; shift 2 ;;
        --opencl )  OPENCL_DIR="$2"; shift 2 ;;
        -h | --help )
            usage
            shift
            ;;
        -- )
            # getopt's end-of-options marker.
            shift
            break
            ;;
        *)
            echo "Internal error!"
            exit 1
            ;;
    esac
done
# Scratch directory for the build, created fresh for this run.
BUILD_ROOT="$(mktemp -d)"
# Directory that contains this script; passed to cmake as the source directory.
SRC_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Directory the script was started from; the built packages are copied here.
WORKING_DIR=$PWD
# Parallel make flag using the number of online processors.
DASH_JAY="-j $(getconf _NPROCESSORS_ONLN)"
# First word of the NAME= entry in /etc/os-release with the quotes removed.
OS_NAME="$(awk -F '=' '/^NAME/{print $2}' /etc/os-release | awk '{print $1}' | tr -d '"')"
# Fall back to /opt/rocm when ROCM_PATH is unset or empty.
: "${ROCM_PATH:=/opt/rocm}"
# Write the message in $1 followed by a period to stderr; when no argument is
# given the text "Died" is used instead.
err() {
echo "${1-Died}." >&2
}
# Report $1 through err and terminate the script with exit status 1.
die() {
err "$1"
exit 1
}
# Wrapper around the pushd builtin that discards its stdout output.
# `command` bypasses this function so the call does not recurse.
pushd () {
command pushd "$@" > /dev/null
}
# Wrapper around the popd builtin that discards its stdout output.
# `command` bypasses this function so the call does not recurse.
popd () {
command popd "$@" > /dev/null
}
# Install the distribution packages used by the build and packaging steps.
# Only the OS_NAME values "Ubuntu" and "CentOS" are handled; any other value
# installs nothing.
function setupENV()
{
    case "$OS_NAME" in
        Ubuntu)
            sudo apt-get update
            sudo apt-get install dpkg-dev rpm doxygen libelf-dev rename liburi-encode-perl \
                libfile-basedir-perl libfile-copy-recursive-perl libfile-listing-perl
            ;;
        CentOS)
            yum install dpkg-dev rpm-build doxygen elfutils-libelf-devel prename \
                perl-URI-Encode perl-File-Listing perl-File-BaseDir
            ;;
    esac
}
# Configure, build, package and install HIP inside $BUILD_ROOT.
#
# Reads:   BUILD_ROOT, SRC_ROOT, HIP_DIR, OPENCL_DIR, ROCCLR_DIR, ROCM_PATH,
#          DASH_JAY, OS_NAME, WORKING_DIR
# Sets:    HIP_BUILD_DIR
# Returns: 0 when every step succeeded, otherwise the status of the first
#          failing step, so that `buildHIP || die ...` actually triggers.
# $BUILD_ROOT is removed in every case once it has been entered.
function buildHIP()
{
    local status=0
    HIP_BUILD_DIR="$BUILD_ROOT/hip_build"

    pushd "$BUILD_ROOT" || return 1
    if mkdir "$HIP_BUILD_DIR" && pushd "$HIP_BUILD_DIR"; then
        # Stop at the first failing step instead of building on a broken tree.
        # $DASH_JAY is left unquoted on purpose: it holds two words ("-j N").
        cmake "$SRC_ROOT" -DHIP_COMMON_DIR="$HIP_DIR" -DAMD_OPENCL_PATH="$OPENCL_DIR" \
              -DROCCLR_PATH="$ROCCLR_DIR" -DCMAKE_PREFIX_PATH="$ROCM_PATH" \
              -DCMAKE_BUILD_TYPE=Release \
            && make $DASH_JAY \
            && make package \
            || status=$?

        if [ "$status" -eq 0 ]; then
            if [ "$OS_NAME" == "Ubuntu" ]
            then
                cp hip-*.deb "$WORKING_DIR" \
                    && sudo dpkg -i -B hip-dev*.deb hip-runtime-amd*.deb hip-sample*.deb hip-doc*.deb \
                    || status=$?
            elif [ "$OS_NAME" == "CentOS" ]
            then
                cp hip-*.rpm "$WORKING_DIR" \
                    && sudo rpm -ivh --replacefiles --force hip-devel*.rpm hip-runtime-amd*.rpm \
                            hip-sample*.rpm hip-doc*.rpm \
                    || status=$?
            fi
        fi
        popd
    else
        status=1
    fi
    popd

    rm -rf "$BUILD_ROOT"
    return "$status"
}
# Main sequence: install prerequisites, then build and install the packages.
# Each step aborts the script through die when the function returns non-zero.
echo "Preparing build environment"
setupENV || die "setupENV failed"
echo "Building and installing HIP packages"
buildHIP || die "buildHIP failed"
echo "Finished building HIP packages"
@@ -0,0 +1,251 @@
# Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
cmake_minimum_required(VERSION 3.16.8)
# Components that CPack turns into separate packages.
set(CPACK_COMPONENTS_ALL binary dev doc samples runtime-nvidia)
############### Install required files for all components ###############
# Build one RPM / DEB package per component instead of a single package.
set(CPACK_RPM_COMPONENT_INSTALL ON)
set(CPACK_DEB_COMPONENT_INSTALL ON)
# License: the license file is installed with the "binary" component and the
# RPM license tag is set to MIT.
set(CPACK_RESOURCE_FILE_LICENSE ${hip_SOURCE_DIR}/LICENSE.txt)
install(FILES ${CPACK_RESOURCE_FILE_LICENSE} DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT binary)
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
# Begin binary files install
if(HIP_PLATFORM STREQUAL "amd" )
  if(BUILD_SHARED_LIBS)
    # For each runtime library install the unversioned name, the
    # major-version name and the full-version name, in that order.
    foreach(hip_shared_lib amdhip64 hiprtc hiprtc-builtins)
      foreach(hip_so_suffix so so.${HIP_LIB_VERSION_MAJOR} so.${HIP_LIB_VERSION_STRING})
        install(FILES ${CMAKE_BINARY_DIR}/lib/lib${hip_shared_lib}.${hip_so_suffix}
                DESTINATION ${CMAKE_INSTALL_LIBDIR}
                COMPONENT binary)
      endforeach()
    endforeach()
  else()
    install(FILES ${CMAKE_BINARY_DIR}/lib/libamdhip64.a
            DESTINATION ${CMAKE_INSTALL_LIBDIR}
            COMPONENT binary)
  endif() # End BUILD_SHARED_LIBS

  # TODO: This does not belong in the BINARY package.
  # Keeping it as is for now.
  install(FILES ${CMAKE_BINARY_DIR}/.hipInfo
          DESTINATION ${CMAKE_INSTALL_LIBDIR}
          COMPONENT binary)

  # CMake package files for hip.
  install(FILES
            ${CMAKE_BINARY_DIR}/hip-config.cmake
            ${CMAKE_BINARY_DIR}/hip-config-version.cmake
          DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hip
          COMPONENT binary)
  install(EXPORT hip-targets
          FILE hip-targets.cmake
          NAMESPACE hip::
          DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hip
          COMPONENT binary)

  # CMake package files for hip-lang.
  install(FILES
            ${CMAKE_BINARY_DIR}/src/hip-lang-config.cmake
            ${CMAKE_BINARY_DIR}/src/hip-lang-config-version.cmake
          DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hip-lang
          COMPONENT binary)
  install(EXPORT hip-lang-targets
          FILE hip-lang-targets.cmake
          NAMESPACE hip-lang::
          DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hip-lang
          COMPONENT binary)

  # CMake package files for hiprtc.
  install(FILES
            ${CMAKE_BINARY_DIR}/hiprtc-config.cmake
            ${CMAKE_BINARY_DIR}/hiprtc-config-version.cmake
          DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hiprtc
          COMPONENT binary)
  install(EXPORT hiprtc-targets
          FILE hiprtc-targets.cmake
          NAMESPACE hiprtc::
          DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hiprtc
          COMPONENT binary)
endif() # End HIP_PLATFORM = "amd"
# End binary files install
# Begin dev files install
# bin/ from the common HIP sources. The non-Windows install additionally sets
# the directory permissions and leaves out *.bat files.
if(WIN32)
  install(
    DIRECTORY ${HIP_COMMON_DIR}/bin
    DESTINATION .
    COMPONENT dev
    USE_SOURCE_PERMISSIONS)
else()
  install(
    DIRECTORY ${HIP_COMMON_DIR}/bin
    DESTINATION .
    COMPONENT dev
    USE_SOURCE_PERMISSIONS
    DIRECTORY_PERMISSIONS
      OWNER_READ OWNER_WRITE OWNER_EXECUTE
      GROUP_READ GROUP_EXECUTE
      WORLD_READ WORLD_EXECUTE
    PATTERN *.bat EXCLUDE)
endif()

# bin/ from this source tree.
install(
  DIRECTORY ${hip_SOURCE_DIR}/bin
  DESTINATION .
  COMPONENT dev
  USE_SOURCE_PERMISSIONS
  DIRECTORY_PERMISSIONS
    OWNER_READ OWNER_WRITE OWNER_EXECUTE
    GROUP_READ GROUP_EXECUTE
    WORLD_READ WORLD_EXECUTE)

# Headers: the common include tree, then the amd_detail and nvidia_detail
# directories from this source tree.
install(
  DIRECTORY ${HIP_COMMON_DIR}/include
  DESTINATION .
  COMPONENT dev)
install(
  DIRECTORY ${hip_SOURCE_DIR}/include/hip/amd_detail
  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hip
  COMPONENT dev)
install(
  DIRECTORY ${hip_SOURCE_DIR}/include/hip/nvidia_detail
  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hip
  COMPONENT dev)

# Headers and the version file taken from the build tree.
install(
  FILES ${CMAKE_BINARY_DIR}/include/hip/amd_detail/hip_prof_str.h
  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hip/amd_detail
  COMPONENT dev)
install(
  FILES ${CMAKE_BINARY_DIR}/include/hip/hip_version.h
  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hip
  COMPONENT dev)
install(
  FILES ${CMAKE_BINARY_DIR}/.hipVersion
  DESTINATION ${CMAKE_INSTALL_BINDIR}
  COMPONENT dev)

# Contents of the common cmake/ directory (trailing slash: contents only).
install(
  DIRECTORY ${HIP_COMMON_DIR}/cmake/
  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hip
  COMPONENT dev)
# End dev files install
# Begin doc files install
# The API documentation is only built and installed when doxygen is found.
find_program(DOXYGEN_EXE doxygen)
if(DOXYGEN_EXE)
  # Run the doxygen that find_program located, with HIP_PATH set in its
  # environment. `cmake -E env` sets the variable without relying on shell
  # `VAR=value cmd` syntax, and VERBATIM keeps argument escaping consistent
  # across generators.
  add_custom_target(build_doxygen ALL
    COMMAND ${CMAKE_COMMAND} -E env "HIP_PATH=${HIP_COMMON_DIR}"
            "${DOXYGEN_EXE}" "${HIP_COMMON_DIR}/docs/doxygen-input/doxy.cfg"
    VERBATIM)
  install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/RuntimeAPI/html
          DESTINATION ${CMAKE_INSTALL_DOCDIR}/RuntimeAPI COMPONENT doc)
endif()
# End doc files install
# Begin samples files install
# The whole samples directory from the common HIP sources goes into the
# "samples" component under <datadir>/hip.
install(DIRECTORY ${HIP_COMMON_DIR}/samples DESTINATION ${CMAKE_INSTALL_DATADIR}/hip COMPONENT samples)
# End samples files install
##################################
# Packaging steps COMMON Variables
##################################
set(CPACK_SET_DESTDIR TRUE)
set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
set(CPACK_PACKAGE_CONTACT "HIP Support <hip.support@amd.com>")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP:Heterogenous-computing Interface for Portability")
set(CPACK_PACKAGE_VERSION_MAJOR ${HIP_VERSION_MAJOR})
set(CPACK_PACKAGE_VERSION_MINOR ${HIP_VERSION_MINOR})
set(CPACK_PACKAGE_VERSION_PATCH ${HIP_VERSION_PATCH})
# Note: the composed version string uses HIP_PACKAGING_VERSION_PATCH, not
# HIP_VERSION_PATCH as CPACK_PACKAGE_VERSION_PATCH above does.
set(CPACK_PACKAGE_VERSION ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_PACKAGING_VERSION_PATCH})
# Cache entry, so the generator list can be overridden on the command line.
set(CPACK_GENERATOR "TGZ;DEB;RPM" CACHE STRING "Package types to build")
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
if (CPACK_RPM_PACKAGE_RELEASE MATCHES "local" )
# When building locally the default value will cause a build failure:
# debug symbol packaging requires SOURCE_DIR to be small.
set(CPACK_RPM_BUILD_SOURCE_DIRS_PREFIX ${CPACK_INSTALL_PREFIX})
endif()
# NOTE(review): the leading space in " no" looks deliberate (a bare "no" is a
# false constant in CMake) — confirm before normalising it.
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
set(CPACK_SOURCE_GENERATOR "TGZ")
# Begin binary packaging settings
set(CPACK_BINARY_DEB "ON")
set(CPACK_BINARY_RPM "ON")

# Package name and description ("binary" component -> hip-runtime-amd).
set(CPACK_DEBIAN_BINARY_PACKAGE_NAME "hip-runtime-amd")
set(CPACK_RPM_BINARY_PACKAGE_NAME "hip-runtime-amd")
set(CPACK_COMPONENT_BINARY_DESCRIPTION
    "HIP:Heterogenous-computing Interface for Portability [RUNTIME - AMD]")
set(CPACK_RPM_BINARY_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")

# Maintainer scripts that soft-link the hip-target files; generated once and
# registered with both the DEB and the RPM package.
if(FILE_REORG_BACKWARD_COMPATIBILITY)
  configure_file(hip-runtime-amd.postinst ${CMAKE_CURRENT_BINARY_DIR}/binary/postinst @ONLY)
  configure_file(hip-runtime-amd.prerm ${CMAKE_CURRENT_BINARY_DIR}/binary/prerm @ONLY)
  set(CPACK_DEBIAN_BINARY_PACKAGE_CONTROL_EXTRA
      "${CMAKE_CURRENT_BINARY_DIR}/binary/postinst;${CMAKE_CURRENT_BINARY_DIR}/binary/prerm")
  set(CPACK_RPM_BINARY_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/binary/postinst")
  set(CPACK_RPM_BINARY_PRE_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/binary/prerm")
endif()

# DEB relationships.
set(CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS
    "hsa-rocr-dev (>= 1.3), rocminfo, comgr (>= 2.0), rocm-llvm, libc6, rocm-core")
set(CPACK_DEBIAN_BINARY_PACKAGE_PROVIDES "hip-rocclr (= ${CPACK_PACKAGE_VERSION})")
set(CPACK_DEBIAN_BINARY_PACKAGE_REPLACES "hip-rocclr (= ${CPACK_PACKAGE_VERSION})")

# RPM relationships; HIP_BASE_VERSION is the package version with every '-'
# replaced by '_'.
string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
set(CPACK_RPM_BINARY_PACKAGE_REQUIRES
    "hsa-rocr-dev >= 1.3, rocminfo, comgr >= 2.0, rocm-llvm, rocm-core")
set(CPACK_RPM_BINARY_PACKAGE_PROVIDES "hip-rocclr = ${HIP_BASE_VERSION}")
set(CPACK_RPM_BINARY_PACKAGE_OBSOLETES "hip-rocclr = ${HIP_BASE_VERSION}")
# End binary packaging settings
# Begin dev packaging settings ("dev" component -> hip-dev on DEB, hip-devel on RPM)
set(CPACK_DEV_DEB "ON")
set(CPACK_DEV_RPM "ON")
set(CPACK_DEBIAN_DEV_PACKAGE_NAME "hip-dev")
set(CPACK_RPM_DEV_PACKAGE_NAME "hip-devel")
set(CPACK_COMPONENT_DEV_DESCRIPTION "HIP: Heterogenous-computing Interface for Portability [DEVELOPMENT]")
# Generate the maintainer scripts from their templates; @ONLY restricts
# substitution to @VAR@ references so shell ${...} text is left untouched.
configure_file(hip-devel.postinst ${CMAKE_CURRENT_BINARY_DIR}/dev/postinst @ONLY)
configure_file(hip-devel.prerm ${CMAKE_CURRENT_BINARY_DIR}/dev/prerm @ONLY)
set(CPACK_DEBIAN_DEV_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_BINARY_DIR}/dev/postinst;${CMAKE_CURRENT_BINARY_DIR}/dev/prerm")
set(CPACK_DEBIAN_DEV_PACKAGE_DEPENDS "perl (>= 5.0), liburi-encode-perl, libfile-basedir-perl, libfile-copy-recursive-perl, libfile-listing-perl, libfile-which-perl, libc6, file, rocm-core")
set(CPACK_DEBIAN_DEV_PACKAGE_PROVIDES "hip-base")
set(CPACK_DEBIAN_DEV_PACKAGE_REPLACES "hip-base")
# The same generated scripts are used for the RPM package.
set(CPACK_RPM_DEV_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/dev/postinst")
set(CPACK_RPM_DEV_PRE_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/dev/prerm")
set(CPACK_RPM_DEV_PACKAGE_REQUIRES "perl >= 5.0, perl-File-Which, perl-File-Listing, perl-File-BaseDir, perl-URI-Encode, file, rocm-core")
set(CPACK_RPM_DEV_PACKAGE_PROVIDES "hip-base")
set(CPACK_RPM_DEV_PACKAGE_OBSOLETES "hip-base")
# End dev packaging settings
# Begin doc packaging settings ("doc" component -> hip-doc)
set(CPACK_DOC_DEB "ON")
set(CPACK_DOC_RPM "ON")
set(CPACK_DEBIAN_DOC_PACKAGE_NAME "hip-doc")
set(CPACK_RPM_DOC_PACKAGE_NAME "hip-doc")
set(CPACK_COMPONENT_DOC_DESCRIPTION "HIP: Heterogenous-computing Interface for Portability [DOCUMENTATION]")
# Depends on the dev package of exactly this version and release.
set(CPACK_DEBIAN_DOC_PACKAGE_DEPENDS "hip-dev (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), rocm-core")
set(CPACK_DEBIAN_DOC_PACKAGE_PROVIDES "hip-doc")
# HIP_BASE_VERSION is the package version with every '-' replaced by '_'.
string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
set(CPACK_RPM_DOC_PACKAGE_REQUIRES "hip-devel = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, rocm-core")
# End doc packaging settings
# Begin samples packaging settings ("samples" component -> hip-samples)
set(CPACK_SAMPLES_DEB "ON")
set(CPACK_SAMPLES_RPM "ON")
set(CPACK_DEBIAN_SAMPLES_PACKAGE_NAME "hip-samples")
set(CPACK_RPM_SAMPLES_PACKAGE_NAME "hip-samples")
set(CPACK_COMPONENT_SAMPLES_DESCRIPTION "HIP: Heterogenous-computing Interface for Portability [SAMPLES]")
# Depends on the dev package of exactly this version and release.
set(CPACK_DEBIAN_SAMPLES_PACKAGE_DEPENDS "hip-dev (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), rocm-core")
set(CPACK_DEBIAN_SAMPLES_PACKAGE_PROVIDES "hip-samples")
# HIP_BASE_VERSION is expected to have been computed earlier in this file.
set(CPACK_RPM_SAMPLES_PACKAGE_REQUIRES "hip-devel = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, rocm-core")
# End samples packaging settings
# Begin runtime-nvidia packaging settings ("runtime-nvidia" component -> hip-runtime-nvidia)
# The component name is upper-cased, hyphen included, inside the variable names.
set(CPACK_RUNTIME-NVIDIA_DEB "ON")
set(CPACK_RUNTIME-NVIDIA_RPM "ON")
set(CPACK_DEBIAN_RUNTIME-NVIDIA_PACKAGE_NAME "hip-runtime-nvidia")
set(CPACK_RPM_RUNTIME-NVIDIA_PACKAGE_NAME "hip-runtime-nvidia")
set(CPACK_COMPONENT_RUNTIME-NVIDIA_DESCRIPTION "HIP: Heterogenous-computing Interface for Portability [RUNTIME-NVIDIA]")
set(CPACK_DEBIAN_RUNTIME-NVIDIA_PACKAGE_DEPENDS "cuda (>= 7.5), rocm-core")
# The package provides and replaces/obsoletes the package name hip-nvcc.
set(CPACK_DEBIAN_RUNTIME-NVIDIA_PACKAGE_PROVIDES "hip-nvcc")
set(CPACK_DEBIAN_RUNTIME-NVIDIA_PACKAGE_REPLACES "hip-nvcc")
set(CPACK_RPM_RUNTIME-NVIDIA_PACKAGE_PROVIDES "hip-nvcc")
set(CPACK_RPM_RUNTIME-NVIDIA_PACKAGE_OBSOLETES "hip-nvcc")
set(CPACK_RPM_RUNTIME-NVIDIA_PACKAGE_REQUIRES "cuda >= 7.5, rocm-core")
# Remove the dependency on rocm-core unless -DROCM_DEP_ROCMCORE=ON was given
# to cmake. The regex also removes the ", " separator in front of the entry.
if(NOT ROCM_DEP_ROCMCORE)
  foreach(hip_dep_var
      CPACK_RPM_BINARY_PACKAGE_REQUIRES
      CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS
      CPACK_RPM_DEV_PACKAGE_REQUIRES
      CPACK_DEBIAN_DEV_PACKAGE_DEPENDS
      CPACK_RPM_DOC_PACKAGE_REQUIRES
      CPACK_DEBIAN_DOC_PACKAGE_DEPENDS
      CPACK_RPM_SAMPLES_PACKAGE_REQUIRES
      CPACK_DEBIAN_SAMPLES_PACKAGE_DEPENDS
      CPACK_RPM_RUNTIME-NVIDIA_PACKAGE_REQUIRES
      CPACK_DEBIAN_RUNTIME-NVIDIA_PACKAGE_DEPENDS)
    # Quote the input so the dependency string is always passed as exactly
    # one argument, even if it is empty or contains a ';'.
    string(REGEX REPLACE ",? ?rocm-core" "" ${hip_dep_var} "${${hip_dep_var}}")
  endforeach()
endif()
# Must come after every CPACK_* variable above has been set.
include(CPack)
+77
查看文件
@@ -0,0 +1,77 @@
#!/bin/bash
# Copyright (c) 2016 - 2021 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# Print $1 followed by a period to stderr ("Died." when no argument is given)
# and terminate the script with exit status 1.
function die {
echo "${1-Died}." >&2
exit 1
}
# Remove the temporary directory named by $workdir and everything in it.
function cleanup {
rm -rf "$workdir"
}
# parse arguments: $1 is the HIP source directory, $2 the HTML output directory
hip_srcdir=$1
html_destdir=$2
# Both arguments are required; abort when either one is missing.
if [ -z "$hip_srcdir" ] || [ -z "$html_destdir" ]; then
    die "Invalid arguments!"
fi
# Temporary directory holding the grip settings; removed again on exit.
workdir=$(mktemp -d)
trap cleanup EXIT

# Environment read by grip, plus a settings file that points its cache at
# <html_destdir>/asset.
export GRIPURL=$hip_srcdir
export GRIPHOME=$workdir
printf "CACHE_DIRECTORY = '%s/asset'\n" "$html_destdir" > $workdir/settings.py

# Output directories for the top-level pages and for docs/markdown.
mkdir -p $html_destdir $html_destdir/docs/markdown
# convert all md files to html
pushd "$hip_srcdir" || die "Cannot enter $hip_srcdir"
for f in *.md docs/markdown/*.md; do
    grip --export --no-inline "$f" "$html_destdir/${f%.*}.html"
done
popd

# Post-process the generated pages. For every page, in this order:
#   1. turn absolute $GRIPURL links into relative links,
#   2. drop the ".md - Grip" suffix from the document title,
#   3. point links at the .html page instead of the .md source.
# The dot in ".md" is escaped so that only a literal ".md" matches; an
# unescaped '.' would also rewrite text such as 'amd"'.
pushd "$html_destdir" || die "Cannot enter $html_destdir"
for f in *.html; do
    sed -i -e "s?$GRIPURL/??g" \
           -e 's?\.md - Grip??g' \
           -e 's?\.md"?.html"?g' \
           -e 's?\.md#?.html#?g' "$f"
done
# Pages under docs/markdown sit two levels below the root, hence ../../
for f in docs/markdown/*.html; do
    sed -i -e "s?$GRIPURL/?../../?g" \
           -e 's?\.md - Grip??g' \
           -e 's?\.md"?.html"?g' \
           -e 's?\.md#?.html#?g' "$f"
done

# replace github.io links
sed -i "s?http://rocm-developer-tools.github.io/HIP?docs/RuntimeAPI/html/index.html?g" README.html
sed -i "s?http://rocm-developer-tools.github.io/HIP?docs/RuntimeAPI/html/?g" RELEASE.html
popd
exit 0
+38
查看文件
@@ -0,0 +1,38 @@
#!/bin/bash
# Copyright (c) 2016 - 2021 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
ROCMDIR=@ROCM_PATH@

# Create the legacy aliases hcc_detail -> amd_detail and
# nvcc_detail -> nvidia_detail inside the include directory given as $1.
# Full paths are used instead of `cd`, so a missing directory makes ln fail
# rather than creating the links in the caller's working directory.
# -n replaces an existing alias instead of creating a link inside the
# directory it points to.
# The following will be removed after upstream updation
link_legacy_detail_dirs() {
    ln -r -s -f -n "$1/amd_detail" "$1/hcc_detail"
    ln -r -s -f -n "$1/nvidia_detail" "$1/nvcc_detail"
}

HIPINCDIR="$ROCMDIR/@CMAKE_INSTALL_INCLUDEDIR@/hip"
link_legacy_detail_dirs "$HIPINCDIR"

#FILE_REORG_BACKWARD_COMPATIBILITY
# Same aliases in the old <rocm>/hip/include/hip location, when it exists.
HIPINCDIR="$ROCMDIR/hip/include/hip"
if [ -d "$HIPINCDIR" ]; then
    link_legacy_detail_dirs "$HIPINCDIR"
fi
+41
查看文件
@@ -0,0 +1,41 @@
#!/bin/bash
# Copyright (c) 2016 - 2021 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
ROCMDIR=@ROCM_PATH@

# Remove the legacy aliases from the installed include directory.
# Full paths are used instead of `cd`, so nothing is ever removed from the
# caller's working directory, and -f keeps an already-missing link quiet.
HIPINCDIR="$ROCMDIR/@CMAKE_INSTALL_INCLUDEDIR@/hip"
[ -d "$HIPINCDIR" ] || exit 0
rm -f "$HIPINCDIR/hcc_detail" "$HIPINCDIR/nvcc_detail"

#FILE_REORG_BACKWARD_COMPATIBILITY
# Backward compatibility code, to be removed later: same cleanup for the old
# <rocm>/hip/include/hip location.
HIPDIR="$ROCMDIR/hip"
HIPINCDIR="$ROCMDIR/hip/include/hip"
[ -d "$HIPINCDIR" ] || exit 0
rm -f "$HIPINCDIR/hcc_detail" "$HIPINCDIR/nvcc_detail"

# Drop <rocm>/hip itself only when nothing is left inside it.
[ -d "$HIPDIR" ] || exit 0
rmdir --ignore-fail-on-non-empty "$HIPDIR"
+53
查看文件
@@ -0,0 +1,53 @@
#!/bin/bash
# Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
ROCMDIR=@ROCM_PATH@
ROCMCMAKEDIR=$ROCMDIR/@CMAKE_INSTALL_LIBDIR@/cmake
HIPCMAKEDIR=$ROCMDIR/hip/lib/cmake

# For the CMake package named $1 (hip, hip-lang or hiprtc), create
# $HIPCMAKEDIR/$1 and fill it with relative soft-links to every
# "$1-targets*" file found in $ROCMCMAKEDIR/$1.
# A glob replaces `ls | grep`, and full paths replace `cd`, so a failure can
# never leave links in the caller's working directory.
link_target_files() {
    mkdir -p "$HIPCMAKEDIR/$1" || return 1
    for f in "$ROCMCMAKEDIR/$1/$1-targets"*; do
        # An unmatched glob stays literal; skip it.
        [ -e "$f" ] || continue
        ln -s -r -f "$f" "$HIPCMAKEDIR/$1/$(basename "$f")"
    done
}

for pkg in hip hip-lang hiprtc; do
    link_target_files "$pkg"
done
+66
查看文件
@@ -0,0 +1,66 @@
#!/bin/bash
# Copyright (c) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# Pre-remove helper: delete the hip-targets*, hip-lang-targets* and
# hiprtc-targets* entries from the $ROCMDIR/hip/lib/cmake/<package>
# directories, then remove each directory if it ended up empty.
ROCMDIR="@ROCM_PATH@"
HIPDIR="$ROCMDIR/hip"
HIPCMAKEDIR="$ROCMDIR/hip/lib/cmake/hip"
HIPLANGCMAKEDIR="$ROCMDIR/hip/lib/cmake/hip-lang"
HIPRTCCMAKEDIR="$ROCMDIR/hip/lib/cmake/hiprtc"

# Nothing to clean up when the installation tree is not there at all.
if [ ! -d "$ROCMDIR" ] || [ ! -d "$HIPDIR" ]; then
    exit 0
fi

# remove_target_links <directory> <file-prefix>
#   Removes every entry of <directory> whose name starts with <file-prefix>
#   and then removes <directory> itself when it is empty. A missing
#   directory is skipped without affecting the other directories (the script
#   used to exit at the first missing one, leaving the rest uncleaned).
remove_target_links() {
    local dir=$1 prefix=$2 entry
    [ -d "$dir" ] || return 0
    for entry in "$dir/$prefix"*; do
        # -e follows symlinks, so also test -L to catch links whose target
        # is already gone; an unmatched glob fails both tests and is skipped.
        [ -e "$entry" ] || [ -L "$entry" ] || continue
        rm -f -- "$entry"
    done
    rmdir --ignore-fail-on-non-empty "$dir"
    return 0
}

# Remove soft-links to hip-target
remove_target_links "$HIPCMAKEDIR" hip-targets
# Remove soft-links to hip-lang-target
remove_target_links "$HIPLANGCMAKEDIR" hip-lang-targets
# Remove soft-links to hiprtc-target
remove_target_links "$HIPRTCCMAKEDIR" hiprtc-targets
+27
查看文件
@@ -0,0 +1,27 @@
#!/bin/bash
# Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# Point /opt/rocm at the configured ROCm installation directory, provided
# that directory exists.
ROCMDIR="@ROCM_PATH@"
if [ -d "$ROCMDIR" ] ; then
    # -n: when /opt/rocm is already a symlink to a directory, replace the link
    # itself. Without it ln follows the old link and creates the new one
    # inside the directory it points to.
    ln -s -f -n "$ROCMDIR" /opt/rocm
fi
+24
查看文件
@@ -0,0 +1,24 @@
#!/bin/bash
# Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# Drop /opt/rocm, but only when it is a symbolic link; anything else found at
# that path is left untouched.
ROCM_LINK="/opt/rocm"
if test -L "$ROCM_LINK"; then
    unlink "$ROCM_LINK"
fi
+319
查看文件
@@ -0,0 +1,319 @@
# Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# NOTE(review): later parts of this file use features newer than 3.5.1
# (e.g. execute_process(COMMAND_ECHO) and find_package(Python3)); the declared
# minimum looks stale -- confirm before raising it, since it also sets policies.
cmake_minimum_required(VERSION 3.5.1)

include(GNUInstallDirs)

set(VERSION_MAJOR_AMDHIP ${HIP_VERSION_MAJOR})
set(VERSION_MINOR_AMDHIP ${HIP_VERSION_MINOR})

# Declare the options before anything reads them. BUILD_SHARED_LIBS used to be
# declared after the sanitizer block below, so on a fresh configure that block
# saw it unset and chose -static-libsan even though the default build is shared.
option(DISABLE_DIRECT_DISPATCH "Disable Direct Dispatch" OFF)
option(BUILD_SHARED_LIBS "Build the shared library" ON)

# AddressSanitizer instrumentation, applied to every C/C++ target in scope.
if(ADDRESS_SANITIZER)
  set(ASAN_LINKER_FLAGS "-fsanitize=address")
  set(ASAN_COMPILER_FLAGS "-fno-omit-frame-pointer -fsanitize=address")
  # Non-GNU compilers additionally get an explicit shared/static sanitizer
  # runtime selection that follows the library type being built.
  if(NOT CMAKE_COMPILER_IS_GNUCC)
    if(BUILD_SHARED_LIBS)
      set(ASAN_COMPILER_FLAGS "${ASAN_COMPILER_FLAGS} -shared-libsan")
      set(ASAN_LINKER_FLAGS "${ASAN_LINKER_FLAGS} -shared-libsan")
    else()
      set(ASAN_LINKER_FLAGS "${ASAN_LINKER_FLAGS} -static-libsan")
    endif()
  endif()
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ASAN_COMPILER_FLAGS}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ASAN_COMPILER_FLAGS}")
  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${ASAN_LINKER_FLAGS} -s -Wl,--build-id=sha1")
  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${ASAN_LINKER_FLAGS} -Wl,--build-id=sha1")
endif()

# With GCC, warnings are errors, except for uses of deprecated declarations.
if(CMAKE_COMPILER_IS_GNUCC)
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-error=deprecated-declarations")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations")
endif()

# Make the find modules shipped next to this file visible, then locate ROCclr.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
find_package(ROCclr)
# Create the amdhip64 library target: a shared library by default, or a static
# archive that embeds the rocclr object files.
if(NOT BUILD_SHARED_LIBS)
  add_library(amdhip64 STATIC $<TARGET_OBJECTS:rocclr>)
else()
  add_library(amdhip64 SHARED)
  # Strip the library after linking in Release builds.
  # Windows doesn't have a strip utility, so CMAKE_STRIP won't be set.
  if((CMAKE_BUILD_TYPE STREQUAL "Release") AND NOT ("${CMAKE_STRIP}" STREQUAL ""))
    add_custom_command(TARGET amdhip64 POST_BUILD
      COMMAND ${CMAKE_STRIP} $<TARGET_FILE:amdhip64>)
  endif()
endif()

# The file name follows the pointer width of the build.
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
  set(amdhip_output_name "amdhip64")
else()
  set(amdhip_output_name "amdhip32")
endif()

set_target_properties(amdhip64 PROPERTIES
  CXX_STANDARD 17
  CXX_STANDARD_REQUIRED ON
  CXX_EXTENSIONS OFF
  POSITION_INDEPENDENT_CODE ON
  # Workaround for many places in the HIP project
  # having hardcoded references to build/lib/libamdhip64.so
  LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
  ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
  OUTPUT_NAME "${amdhip_output_name}")

# Shared-library versioning is skipped on Windows because
# HIP_LIB_VERSION_STRING and HIP_LIB_VERSION_MAJOR are currently
# not populated there.
if(NOT WIN32 AND BUILD_SHARED_LIBS)
  set_target_properties(amdhip64 PROPERTIES
    VERSION ${HIP_LIB_VERSION_STRING}
    SOVERSION ${HIP_LIB_VERSION_MAJOR})
endif()
# Sources compiled on every platform.
target_sources(amdhip64 PRIVATE
  cl_gl.cpp
  fixme.cpp
  hip_activity.cpp
  hip_code_object.cpp
  hip_context.cpp
  hip_device_runtime.cpp
  hip_device.cpp
  hip_error.cpp
  hip_event.cpp
  hip_event_ipc.cpp
  hip_fatbin.cpp
  hip_global.cpp
  hip_graph_internal.cpp
  hip_graph.cpp
  hip_hmm.cpp
  hip_intercept.cpp
  hip_memory.cpp
  hip_mempool.cpp
  hip_mempool_impl.cpp
  hip_module.cpp
  hip_peer.cpp
  hip_platform.cpp
  hip_profile.cpp
  hip_stream_ops.cpp
  hip_stream.cpp
  hip_surface.cpp
  hip_texture.cpp
  hip_gl.cpp
  hip_vm.cpp)

if(WIN32)
  # Direct3D interop and the Windows runtime entry source.
  target_sources(amdhip64 PRIVATE
    cl_d3d9.cpp
    cl_d3d10.cpp
    cl_d3d11.cpp
    hip_runtime.cpp)
  # The DLL's export list comes from the module-definition file.
  if(BUILD_SHARED_LIBS)
    target_sources(amdhip64 PRIVATE amdhip.def)
  endif()
  # Configured resource script compiled into the library.
  configure_file(hip_hcc_in.rc.in hip_hcc_info.rc @ONLY)
  target_sources(amdhip64 PRIVATE hip_hcc_info.rc)
elseif(BUILD_SHARED_LIBS)
  # Elsewhere the exported symbols are controlled by a linker version script;
  # LINK_DEPENDS relinks the library whenever that script changes.
  target_link_libraries(amdhip64 PRIVATE "-Wl,--version-script=${CMAKE_CURRENT_LIST_DIR}/hip_hcc.map.in")
  set_target_properties(amdhip64 PROPERTIES LINK_DEPENDS "${CMAKE_CURRENT_LIST_DIR}/hip_hcc.map.in")
endif()
# Private build settings of the runtime: header search paths, platform
# define and system libraries.
target_include_directories(amdhip64 PRIVATE
  ${HIP_COMMON_INCLUDE_DIR}
  ${PROJECT_SOURCE_DIR}/include
  ${PROJECT_BINARY_DIR}/include)

target_compile_definitions(amdhip64 PRIVATE __HIP_PLATFORM_AMD__)

target_link_libraries(amdhip64 PRIVATE ${OPENGL_LIBRARIES} ${CMAKE_DL_LIBS})

# A static amdhip64 cannot link against rocclr: doing so would require
# exporting rocclr too and having hipcc pass it to the linker. In that case
# only rocclr's compile definitions and include directories are borrowed.
if(NOT BUILD_SHARED_LIBS)
  target_compile_definitions(amdhip64 PRIVATE $<TARGET_PROPERTY:rocclr,COMPILE_DEFINITIONS>)
  target_include_directories(amdhip64 PRIVATE $<TARGET_PROPERTY:rocclr,INCLUDE_DIRECTORIES>)
else()
  target_link_libraries(amdhip64 PRIVATE rocclr)
endif()

if(DISABLE_DIRECT_DISPATCH)
  target_compile_definitions(amdhip64 PRIVATE DISABLE_DIRECT_DISPATCH)
endif()
# Pre-compiled header support for online compilation (short-term solution).
# When __HIP_ENABLE_PCH is set, hip_embed_pch.sh runs at configure time and
# ${CMAKE_BINARY_DIR}/hip_pch.o is added to the library's sources.
if(__HIP_ENABLE_PCH)
  find_package(LLVM REQUIRED CONFIG
    PATHS
      ${ROCM_PATH}/llvm)

  # find_package(LLVM) reports the lib/cmake/llvm directory, but the script
  # wants the LLVM root, three levels up -- unless the caller already set it.
  if(NOT DEFINED HIP_LLVM_ROOT)
    set(HIP_LLVM_ROOT "${LLVM_DIR}/../../..")
  endif()

  set(embed_pch_cmdline "${CMAKE_CURRENT_SOURCE_DIR}/hip_embed_pch.sh ${HIP_COMMON_INCLUDE_DIR} ${PROJECT_BINARY_DIR}/include ${PROJECT_SOURCE_DIR}/include ${HIP_LLVM_ROOT}")
  execute_process(
    COMMAND sh -c "${embed_pch_cmdline}"
    COMMAND_ECHO STDERR
    RESULT_VARIABLE EMBED_PCH_RC
    WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
  # Any result other than 0 aborts the configure step.
  if(EMBED_PCH_RC AND NOT EMBED_PCH_RC EQUAL 0)
    message(FATAL_ERROR "Failed to embed PCH")
  endif()

  target_compile_definitions(amdhip64 PRIVATE __HIP_ENABLE_PCH)
  target_sources(amdhip64 PRIVATE ${CMAKE_BINARY_DIR}/hip_pch.o)
endif()
# hiprtc lives in its own subdirectory. HIPRTC_OBJECTS is cleared first.
# NOTE(review): presumably the subdirectory fills HIPRTC_OBJECTS for the link
# step below -- confirm in hiprtc/CMakeLists.txt.
set(HIPRTC_OBJECTS)
add_subdirectory(hiprtc)

# Shared non-Windows builds link the hiprtc objects into amdhip64, enable the
# RTC code paths, and install the hiprtc-builtins target.
if(NOT WIN32 AND BUILD_SHARED_LIBS)
  target_link_libraries(amdhip64 PRIVATE ${HIPRTC_OBJECTS})
  target_compile_definitions(amdhip64 PRIVATE __HIP_ENABLE_RTC)
  add_dependencies(amdhip64 hiprtc-builtins)
  install(TARGETS hiprtc-builtins
    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
endif()
#############################
# Profiling API support
#############################
# Generates the profiling API macros/structures header (hip_prof_str.h) with
# hip_prof_gen.py and, when prof_protocol.h can be found, enables the
# roctracer integration in amdhip64.
option(USE_PROF_API "Enable roctracer integration" ON)

# Enable profiling API
if(USE_PROF_API)
  set(PROF_API_STR "${PROJECT_BINARY_DIR}/include/hip/amd_detail/hip_prof_str.h")
  # Use PROJECT_SOURCE_DIR, like every other include path in this file:
  # CMAKE_SOURCE_DIR points at the top of the whole build and is wrong as soon
  # as this project is configured as a subdirectory of another one.
  set(PROF_API_STR_IN "${PROJECT_SOURCE_DIR}/include/hip/amd_detail/hip_prof_str.h")
  set(PROF_API_HDR "${HIP_COMMON_INCLUDE_DIR}/hip/hip_runtime_api.h")
  set(PROF_API_SRC "${CMAKE_CURRENT_SOURCE_DIR}")
  set(PROF_API_GEN "${CMAKE_CURRENT_SOURCE_DIR}/hip_prof_gen.py")
  set(PROF_API_LOG "${PROJECT_BINARY_DIR}/hip_prof_gen.log.txt")

  # The generator needs Python 3 with the CppHeaderParser module; probe for
  # the module at configure time so the failure is explicit.
  find_package(Python3 COMPONENTS Interpreter REQUIRED)
  execute_process(COMMAND ${Python3_EXECUTABLE} -c "import CppHeaderParser"
    RESULT_VARIABLE CPP_HEADER_PARSER
    OUTPUT_QUIET)
  # Quoted so a non-numeric result (e.g. an error string containing spaces)
  # cannot break the if() argument list.
  if(NOT "${CPP_HEADER_PARSER}" EQUAL 0)
    message(FATAL_ERROR "\
The \"CppHeaderParser\" Python3 package is not installed. \
Please install it using the following command: \"pip3 install CppHeaderParser\".\
")
  endif()

  add_custom_command(OUTPUT ${PROF_API_STR}
    COMMAND ${Python3_EXECUTABLE} ${PROF_API_GEN} -v -t --priv ${PROF_API_HDR} ${PROF_API_SRC} ${PROF_API_STR_IN} ${PROF_API_STR}
    DEPENDS ${PROF_API_STR_IN} ${PROF_API_HDR} ${PROF_API_GEN}
    COMMENT "Generating profiling primitives: ${PROF_API_STR}"
    VERBATIM)
  add_custom_target(gen-prof-api-str-header ALL
    DEPENDS ${PROF_API_STR}
    SOURCES ${PROF_API_HDR})
  # The generated header is installed through the target's PUBLIC_HEADER set.
  set_target_properties(amdhip64 PROPERTIES PUBLIC_HEADER ${PROF_API_STR})

  find_path(PROF_API_HEADER_DIR prof_protocol.h
    HINTS
      ${PROF_API_HEADER_PATH}
    PATHS
      ${ROCM_PATH}/roctracer
    PATH_SUFFIXES
      include/ext)
  if(NOT PROF_API_HEADER_DIR)
    message(WARNING "Profiling API header not found. Disabling roctracer integration. Use -DPROF_API_HEADER_PATH=<path to prof_protocol.h header>")
  else()
    target_compile_definitions(amdhip64 PUBLIC USE_PROF_API=1)
    target_include_directories(amdhip64 PUBLIC ${PROF_API_HEADER_DIR})
    message(STATUS "Profiling API: ${PROF_API_HEADER_DIR}")
  endif()

  # Make sure the header is generated before amdhip64 is compiled.
  add_dependencies(amdhip64 gen-prof-api-str-header)
endif()
# After each amdhip64 link, copy .hipInfo into the build tree's lib directory
# and copy the project's include directory into the build tree.
add_custom_command(TARGET amdhip64 POST_BUILD
  COMMAND ${CMAKE_COMMAND} -E copy
          ${PROJECT_BINARY_DIR}/.hipInfo
          ${PROJECT_BINARY_DIR}/lib/.hipInfo
  COMMAND ${CMAKE_COMMAND} -E copy_directory
          ${PROJECT_SOURCE_DIR}/include
          ${PROJECT_BINARY_DIR}/include)

# Interface targets layered on the runtime: host forwards to amdhip64 and
# device forwards to host.
add_library(host INTERFACE)
add_library(device INTERFACE)
target_link_libraries(host INTERFACE amdhip64)
target_link_libraries(device INTERFACE host)
# Packaging currently assumes the HIP runtime always lands in ${ROCM_PATH}/lib.
# That assumption does not hold everywhere: some distros (CentOS, for one) use
# lib64, and CMake's own choice of library directory follows the distro. The
# result can be a mismatch between where HIP is installed and where CMake
# thinks it is.
#
# The same three targets are installed twice so that they are part of both
# export sets: hip-targets (hip:: namespace) and hip-lang-targets (hip-lang::).
install(TARGETS amdhip64 host device
  EXPORT hip-targets
  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
install(EXPORT hip-targets
  DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR}
  NAMESPACE hip::)

install(TARGETS amdhip64 host device
  EXPORT hip-lang-targets
  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
install(EXPORT hip-lang-targets
  DESTINATION ${CONFIG_LANG_PACKAGE_INSTALL_DIR}
  NAMESPACE hip-lang::)
# Outside Windows, generate and install the hip-lang package config file and
# its version file next to the exported hip-lang targets.
if(NOT WIN32)
  include(CMakePackageConfigHelpers)

  set(hip_lang_config "${CMAKE_CURRENT_BINARY_DIR}/hip-lang-config.cmake")
  set(hip_lang_config_version "${CMAKE_CURRENT_BINARY_DIR}/hip-lang-config-version.cmake")

  configure_package_config_file(
    ${HIP_COMMON_DIR}/hip-lang-config.cmake.in
    ${hip_lang_config}
    INSTALL_DESTINATION ${CONFIG_LANG_PACKAGE_INSTALL_DIR}
    PATH_VARS LIB_INSTALL_DIR INCLUDE_INSTALL_DIR BIN_INSTALL_DIR)

  # Packages are treated as compatible within the same major version.
  write_basic_package_version_file(
    ${hip_lang_config_version}
    VERSION "${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_VERSION_GITDATE}"
    COMPATIBILITY SameMajorVersion)

  install(
    FILES
      ${hip_lang_config}
      ${hip_lang_config_version}
    DESTINATION
      ${CONFIG_LANG_PACKAGE_INSTALL_DIR}/
  )
endif()
+135
查看文件
@@ -0,0 +1,135 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
// This header file is partially copied from
// https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/BinaryFormat/ELF.h
// AMDGPU OS/ABI identifiers: HSA compatible compute kernels, plus the PAL and
// Mesa3D variants.
// NOTE(review): presumably compared against the ELF header's OS/ABI byte, as in
// the LLVM header this file was copied from -- confirm at the use sites.
enum { ELFOSABI_AMDGPU_HSA = 64, ELFOSABI_AMDGPU_PAL = 65, ELFOSABI_AMDGPU_MESA3D = 66 };
// ABI version values for the HSA OS/ABI. The numeric value is the version
// suffix minus two (V2 = 0 ... V5 = 3).
enum {
ELFABIVERSION_AMDGPU_HSA_V2 = 0,
ELFABIVERSION_AMDGPU_HSA_V3 = 1,
ELFABIVERSION_AMDGPU_HSA_V4 = 2,
ELFABIVERSION_AMDGPU_HSA_V5 = 3
};
// AMDGPU specific e_flags
//
// Layout, as given by the values below: the low byte (EF_AMDGPU_MACH) holds
// the processor id, and the bits above it hold the xnack / sramecc feature
// settings, whose encoding differs between the V3 and V4 HSA ABI versions.
enum : unsigned {
// Processor selection mask: every EF_AMDGPU_MACH_* value below fits in it.
EF_AMDGPU_MACH = 0x0ff,
// AMDGPU processors
EF_AMDGPU_MACH_NONE = 0x000,
EF_AMDGPU_MACH_R600_R600 = 0x001,
EF_AMDGPU_MACH_R600_R630 = 0x002,
EF_AMDGPU_MACH_R600_RS880 = 0x003,
EF_AMDGPU_MACH_R600_RV670 = 0x004,
EF_AMDGPU_MACH_R600_RV710 = 0x005,
EF_AMDGPU_MACH_R600_RV730 = 0x006,
EF_AMDGPU_MACH_R600_RV770 = 0x007,
EF_AMDGPU_MACH_R600_CEDAR = 0x008,
EF_AMDGPU_MACH_R600_CYPRESS = 0x009,
EF_AMDGPU_MACH_R600_JUNIPER = 0x00a,
EF_AMDGPU_MACH_R600_REDWOOD = 0x00b,
EF_AMDGPU_MACH_R600_SUMO = 0x00c,
EF_AMDGPU_MACH_R600_BARTS = 0x00d,
EF_AMDGPU_MACH_R600_CAICOS = 0x00e,
EF_AMDGPU_MACH_R600_CAYMAN = 0x00f,
EF_AMDGPU_MACH_R600_TURKS = 0x010,
// Reserved range following the last assigned R600 id.
EF_AMDGPU_MACH_R600_RESERVED_FIRST = 0x011,
EF_AMDGPU_MACH_R600_RESERVED_LAST = 0x01f,
// First/last R600-based processors.
EF_AMDGPU_MACH_R600_FIRST = EF_AMDGPU_MACH_R600_R600,
EF_AMDGPU_MACH_R600_LAST = EF_AMDGPU_MACH_R600_TURKS,
// AMDGCN-based processors.
EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020,
EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021,
EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022,
EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023,
EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024,
EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025,
EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026,
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X27 = 0x027,
EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028,
EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029,
EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b,
EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d,
EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e,
EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031,
EF_AMDGPU_MACH_AMDGCN_GFX90C = 0x032,
EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033,
EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034,
EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035,
EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
EF_AMDGPU_MACH_AMDGCN_GFX1031 = 0x037,
EF_AMDGPU_MACH_AMDGCN_GFX1032 = 0x038,
EF_AMDGPU_MACH_AMDGCN_GFX1033 = 0x039,
EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a,
EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b,
EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c,
EF_AMDGPU_MACH_AMDGCN_GFX1035 = 0x03d,
EF_AMDGPU_MACH_AMDGCN_GFX1034 = 0x03e,
EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f,
EF_AMDGPU_MACH_AMDGCN_GFX940 = 0x040,
EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041,
EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x042,
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X43 = 0x043,
EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044,
EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045,
EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x046,
EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x047,
// First/last AMDGCN-based processors.
// NOTE: _LAST names the highest assigned id and has to be updated whenever a
// new processor id is appended above.
EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX1102,
// Indicates if the "xnack" target feature is enabled for all code contained
// in the object.
//
// Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3.
EF_AMDGPU_FEATURE_XNACK_V3 = 0x100,
// Indicates if the "sramecc" target feature is enabled for all code
// contained in the object.
//
// Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3.
EF_AMDGPU_FEATURE_SRAMECC_V3 = 0x200,
// XNACK selection mask for the EF_AMDGPU_FEATURE_XNACK_*_V4 values: the four
// settings below are the four possible values of the two bits it covers.
// Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4.
EF_AMDGPU_FEATURE_XNACK_V4 = 0x300,
EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 = 0x000,
EF_AMDGPU_FEATURE_XNACK_ANY_V4 = 0x100,
EF_AMDGPU_FEATURE_XNACK_OFF_V4 = 0x200,
EF_AMDGPU_FEATURE_XNACK_ON_V4 = 0x300,
// SRAMECC selection mask for EF_AMDGPU_FEATURE_SRAMECC_* values.
// Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4.
EF_AMDGPU_FEATURE_SRAMECC_V4 = 0xc00,
EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4 = 0x000,
EF_AMDGPU_FEATURE_SRAMECC_ANY_V4 = 0x400,
EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 = 0x800,
EF_AMDGPU_FEATURE_SRAMECC_ON_V4 = 0xc00,
};
+446
查看文件
@@ -0,0 +1,446 @@
EXPORTS
hipChooseDevice
hipCtxCreate
hipCtxDestroy
hipCtxDisablePeerAccess
hipCtxEnablePeerAccess
hipCtxGetApiVersion
hipCtxGetCacheConfig
hipCtxGetCurrent
hipCtxGetDevice
hipCtxGetFlags
hipCtxGetSharedMemConfig
hipCtxPopCurrent
hipCtxPushCurrent
hipCtxSetCacheConfig
hipCtxSetCurrent
hipCtxSetSharedMemConfig
hipCtxSynchronize
hipDeviceCanAccessPeer
hipDeviceComputeCapability
hipDeviceDisablePeerAccess
hipDeviceEnablePeerAccess
hipDeviceGet
hipDeviceGetAttribute
hipDeviceGetByPCIBusId
hipDeviceGetCacheConfig
hipDeviceGetStreamPriorityRange
hipDeviceGetLimit
hipDeviceGetName
hipDeviceGetUuid
hipDeviceGetPCIBusId
hipDeviceGetSharedMemConfig
hipDeviceGetP2PAttribute
hipDevicePrimaryCtxGetState
hipDevicePrimaryCtxRelease
hipDevicePrimaryCtxReset
hipDevicePrimaryCtxRetain
hipDevicePrimaryCtxSetFlags
hipDeviceReset
hipDeviceSetCacheConfig
hipDeviceSetSharedMemConfig
hipDeviceSynchronize
hipDeviceTotalMem
hipDriverGetVersion
hipEventCreate
hipEventCreateWithFlags
hipEventDestroy
hipEventElapsedTime
hipEventQuery
hipEventRecord
hipEventSynchronize
hipExtGetLinkTypeAndHopCount
hipExtLaunchMultiKernelMultiDevice
hipExtMallocWithFlags
hipExtModuleLaunchKernel
hipExtLaunchKernel
hipFree
hipFreeArray
hipFuncSetAttribute
hipFuncSetCacheConfig
hipFuncSetSharedMemConfig
hipGetDevice
hipGetDeviceCount
hipGetDeviceProperties
hipGetErrorName
hipGetErrorString
hipGetLastError
hipMemAllocHost
hipHostAlloc
hipHostFree
hipHostGetDevicePointer
hipHostGetFlags
hipHostMalloc
hipHostRegister
hipHostUnregister
hipInit
hipIpcCloseMemHandle
hipIpcGetMemHandle
hipIpcOpenMemHandle
hipIpcGetEventHandle
hipIpcOpenEventHandle
hipMalloc
hipMalloc3D
hipMalloc3DArray
hipMallocManaged
hipDeviceGetDefaultMemPool
hipDeviceSetMemPool
hipDeviceGetMemPool
hipMallocAsync
hipFreeAsync
hipMemPoolTrimTo
hipMemPoolSetAttribute
hipMemPoolGetAttribute
hipMemPoolSetAccess
hipMemPoolGetAccess
hipMemPoolCreate
hipMemPoolDestroy
hipMallocFromPoolAsync
hipMemPoolExportToShareableHandle
hipMemPoolImportFromShareableHandle
hipMemPoolExportPointer
hipMemPoolImportPointer
hipArrayCreate
hipArray3DCreate
hipArrayDestroy
hipArrayGetInfo
hipArrayGetDescriptor
hipArray3DGetDescriptor
hipMallocArray
hipMemAdvise
hipMemAllocPitch
hipMallocPitch
hipMemcpy
hipMemcpyWithStream
hipMemcpyParam2D
hipMemcpy2D
hipMemcpy2DAsync
hipMemcpy2DToArray
hipMemcpy2DToArrayAsync
hipMemcpy3D
hipMemcpy3DAsync
hipDrvMemcpy3D
hipDrvMemcpy3DAsync
hipMemcpyAsync
hipMemcpyDtoD
hipMemcpyDtoDAsync
hipMemcpyDtoH
hipMemcpyDtoHAsync
hipMemcpyFromSymbol
hipMemcpyFromSymbolAsync
hipMemcpyHtoD
hipMemcpyHtoDAsync
hipMemcpyPeer
hipMemcpyPeerAsync
hipMemcpyToArray
hipMemcpyFromArray
hipMemcpyToSymbol
hipMemcpyToSymbolAsync
hipMemGetAddressRange
hipGetSymbolAddress
hipGetSymbolSize
hipMemGetInfo
hipMemPrefetchAsync
hipMemPtrGetInfo
hipMemRangeGetAttribute
hipMemRangeGetAttributes
hipMemset
hipMemsetAsync
hipMemsetD8
hipMemsetD8Async
hipMemsetD16
hipMemsetD16Async
hipMemsetD32
hipMemsetD32Async
hipMemset2D
hipMemset2DAsync
hipMemset3D
hipMemset3DAsync
hipModuleGetFunction
hipModuleGetGlobal
hipModuleGetTexRef
hipModuleLaunchKernel
hipModuleLaunchKernelExt
hipModuleLaunchCooperativeKernel
hipModuleLaunchCooperativeKernelMultiDevice
hipLaunchCooperativeKernel
hipLaunchCooperativeKernelMultiDevice
hipHccModuleLaunchKernel
hipModuleLoad
hipModuleLoadData
hipModuleLoadDataEx
hipModuleUnload
hipModuleOccupancyMaxPotentialBlockSize
hipModuleOccupancyMaxPotentialBlockSizeWithFlags
hipModuleOccupancyMaxActiveBlocksPerMultiprocessor
hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
hipOccupancyMaxPotentialBlockSize
hipOccupancyMaxActiveBlocksPerMultiprocessor
hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
hipFuncGetAttribute
hipFuncGetAttributes
hipPeekAtLastError
hipPointerGetAttributes
hipProfilerStart
hipProfilerStop
hipRuntimeGetVersion
hipGetDeviceFlags
hipSetDevice
hipSetDeviceFlags
hipStreamAddCallback
hipStreamAttachMemAsync
hipStreamCreate
hipStreamCreateWithFlags
hipStreamCreateWithPriority
hipStreamDestroy
hipStreamGetDevice
hipStreamGetFlags
hipStreamQuery
hipStreamSynchronize
hipStreamWaitEvent
__hipPopCallConfiguration
__hipPushCallConfiguration
__hipRegisterFatBinary
__hipRegisterFunction
__hipRegisterVar
__hipRegisterSurface
__hipRegisterTexture
__hipRegisterManagedVar
__hipUnregisterFatBinary
hipConfigureCall
hipSetupArgument
hipLaunchByPtr
hipLaunchKernel
hipRegisterTracerCallback
hipApiName
hipKernelNameRef
hipBindTexture
hipBindTexture2D
hipBindTextureToArray
hipBindTextureToMipmappedArray
hipGetTextureAlignmentOffset
hipGetTextureReference
hipUnbindTexture
hipCreateChannelDesc
hipCreateTextureObject
hipDestroyTextureObject
hipGetChannelDesc
hipGetTextureObjectResourceDesc
hipGetTextureObjectResourceViewDesc
hipGetTextureObjectTextureDesc
hipTexRefGetAddress
hipTexRefGetAddressMode
hipTexRefGetArray
hipTexRefGetBorderColor
hipTexRefGetFilterMode
hipTexRefGetFlags
hipTexRefGetFormat
hipTexRefGetMaxAnisotropy
hipTexRefGetMipmapFilterMode
hipTexRefGetMipmapLevelBias
hipTexRefGetMipmapLevelClamp
hipTexRefGetMipmappedArray
hipTexRefSetAddress
hipTexRefSetAddress2D
hipTexRefSetAddressMode
hipTexRefSetArray
hipTexRefSetBorderColor
hipTexRefSetFilterMode
hipTexRefSetFlags
hipTexRefSetFormat
hipTexRefSetMaxAnisotropy
hipTexRefSetMipmapFilterMode
hipTexRefSetMipmapLevelBias
hipTexRefSetMipmapLevelClamp
hipTexRefSetMipmappedArray
; hipProfilerStart and hipProfilerStop are exported once, earlier in this list;
; the duplicate entries that used to follow here were dropped.
hipCreateSurfaceObject
hipDestroySurfaceObject
hipGetCmdName
hipMipmappedArrayCreate
hipMallocMipmappedArray
hipMipmappedArrayDestroy
hipFreeMipmappedArray
hipMipmappedArrayGetLevel
hipGetMipmappedArrayLevel
hipMallocHost
hipFreeHost
hipTexObjectCreate
hipTexObjectDestroy
hipTexObjectGetResourceDesc
hipTexObjectGetResourceViewDesc
hipTexObjectGetTextureDesc
hipExtStreamCreateWithCUMask
hipStreamGetPriority
hipMemcpy2DFromArray
hipMemcpy2DFromArrayAsync
hipDrvMemcpy2DUnaligned
hipMemcpyAtoH
hipMemcpyHtoA
hipMemcpyParam2DAsync
__gnu_h2f_ieee
__gnu_f2h_ieee
hipExtStreamGetCUMask
hipImportExternalMemory
hipExternalMemoryGetMappedBuffer
hipDestroyExternalMemory
hipGraphCreate
hipGraphDestroy
hipGraphAddKernelNode
hipGraphAddMemsetNode
hipGraphAddMemcpyNode
hipGraphAddMemcpyNode1D
hipGraphInstantiate
hipGraphLaunch
hipStreamIsCapturing
hipStreamBeginCapture
hipStreamEndCapture
hipGraphExecDestroy
hipPointerGetAttribute
hipDrvPointerGetAttributes
hipImportExternalSemaphore
hipSignalExternalSemaphoresAsync
hipWaitExternalSemaphoresAsync
hipDestroyExternalSemaphore
hipGLGetDevices
hipGraphicsGLRegisterBuffer
hipGraphicsGLRegisterImage
hipGraphicsMapResources
hipGraphicsResourceGetMappedPointer
hipGraphicsSubResourceGetMappedArray
hipGraphicsUnmapResources
hipGraphicsUnregisterResource
hipGraphGetNodes
hipGraphGetRootNodes
hipGraphKernelNodeGetParams
hipGraphKernelNodeSetParams
hipGraphKernelNodeSetAttribute
hipGraphKernelNodeGetAttribute
hipGraphMemcpyNodeGetParams
hipGraphMemcpyNodeSetParams
hipGraphMemsetNodeGetParams
hipGraphMemsetNodeSetParams
hipGraphAddDependencies
hipGraphExecKernelNodeSetParams
hipGraphAddEmptyNode
hipStreamGetCaptureInfo
hipStreamGetCaptureInfo_v2
hipStreamUpdateCaptureDependencies
hipGraphRemoveDependencies
hipGraphGetEdges
hipGraphNodeGetDependencies
hipGraphNodeGetDependentNodes
hipGraphNodeGetType
hipGraphDestroyNode
hipGraphClone
hipGraphNodeFindInClone
hipGraphAddChildGraphNode
hipGraphChildGraphNodeGetGraph
hipGraphExecChildGraphNodeSetParams
hipGraphAddMemcpyNodeFromSymbol
hipGraphMemcpyNodeSetParamsFromSymbol
hipGraphExecMemcpyNodeSetParamsFromSymbol
hipGraphAddMemcpyNodeToSymbol
hipGraphMemcpyNodeSetParamsToSymbol
hipGraphExecMemcpyNodeSetParamsToSymbol
hipGraphExecMemcpyNodeSetParams
hipGraphMemcpyNodeSetParams1D
hipGraphExecMemcpyNodeSetParams1D
hipGraphAddEventRecordNode
hipGraphEventRecordNodeGetEvent
hipGraphEventRecordNodeSetEvent
hipGraphExecEventRecordNodeSetEvent
hipGraphAddEventWaitNode
hipGraphEventWaitNodeGetEvent
hipGraphEventWaitNodeSetEvent
hipGraphExecEventWaitNodeSetEvent
hipGraphAddHostNode
hipGraphHostNodeGetParams
hipGraphHostNodeSetParams
hipGraphExecHostNodeSetParams
hipGraphExecUpdate
hipGraphInstantiateWithFlags
hipGraphExecMemsetNodeSetParams
hipDeviceGetGraphMemAttribute
hipDeviceSetGraphMemAttribute
hipDeviceGraphMemTrim
amd_dbgapi_get_build_name
amd_dbgapi_get_git_hash
amd_dbgapi_get_build_id
hipThreadExchangeStreamCaptureMode
hipMemAddressFree
hipMemAddressReserve
hipMemCreate
hipMemExportToShareableHandle
hipMemGetAccess
hipMemGetAllocationGranularity
hipMemGetAllocationPropertiesFromHandle
hipMemImportFromShareableHandle
hipMemMap
hipMemMapArrayAsync
hipMemRelease
hipMemRetainAllocationHandle
hipMemSetAccess
hipMemUnmap
hipMemcpy_spt
hipMemcpyAsync_spt
hipStreamSynchronize_spt
hipMemcpyToSymbol_spt
hipMemcpyFromSymbol_spt
hipMemcpy2D_spt
hipMemcpy2DToArray_spt
hipMemcpy2DFromArray_spt
hipMemcpy3D_spt
hipMemset_spt
hipMemset2D_spt
hipMemset3D_spt
hipStreamQuery_spt
hipStreamGetFlags_spt
hipStreamGetPriority_spt
hipStreamWaitEvent_spt
hipEventRecord_spt
hipLaunchKernel_spt
hipLaunchCooperativeKernel_spt
hipStreamWriteValue32
hipStreamWriteValue64
hipStreamWaitValue32
hipStreamWaitValue64
hipDeviceSetLimit
hipGetStreamDeviceId
hipGraphLaunch_spt
hipStreamBeginCapture_spt
hipStreamEndCapture_spt
hipStreamIsCapturing_spt
hipStreamGetCaptureInfo_spt
hipStreamGetCaptureInfo_v2_spt
hipStreamAddCallback_spt
hipMemsetAsync_spt
hipMemset2DAsync_spt
hipMemset3DAsync_spt
hipMemcpy3DAsync_spt
hipMemcpy2DAsync_spt
hipMemcpyFromSymbolAsync_spt
hipMemcpyToSymbolAsync_spt
hipMemcpyFromArray_spt
; hipMemcpy2DToArray_spt is exported once, earlier in this list; the duplicate
; entry that used to follow here was dropped.
hipMemcpy2DFromArrayAsync_spt
hipMemcpy2DToArrayAsync_spt
hipDrvGetErrorName
hipDrvGetErrorString
hipUserObjectCreate
hipUserObjectRelease
hipUserObjectRetain
hipGraphRetainUserObject
hipGraphReleaseUserObject
hipLaunchHostFunc
hipLaunchHostFunc_spt
hipGraphDebugDotPrint
hipGraphKernelNodeCopyAttributes
hipGraphNodeGetEnabled
hipGraphNodeSetEnabled
hipGraphUpload
hipGraphAddMemAllocNode
hipGraphMemAllocNodeGetParams
hipGraphAddMemFreeNode
hipGraphMemFreeNodeGetParams
文件差异内容过多而无法显示 加载差异
文件差异内容过多而无法显示 加载差异
+854
查看文件
@@ -0,0 +1,854 @@
/* Copyright (c) 2012 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifdef _WIN32
#include "top.hpp"
#include "cl_d3d9_amd.hpp"
#include "platform/command.hpp"
#include <cstring>
#include <utility>
// Surface formats identified by a four-character code. Each macro builds the
// code with MAKEFOURCC and casts it to D3DFORMAT so it can be compared with
// D3DSURFACE_DESC::Format and used as a switch label further down in this file.
#define D3DFMT_NV_12 static_cast<D3DFORMAT>(MAKEFOURCC('N', 'V', '1', '2'))
#define D3DFMT_P010 static_cast<D3DFORMAT>(MAKEFOURCC('P', '0', '1', '0'))
#define D3DFMT_YV_12 static_cast<D3DFORMAT>(MAKEFOURCC('Y', 'V', '1', '2'))
#define D3DFMT_YUY2 static_cast<D3DFORMAT>(MAKEFOURCC('Y', 'U', 'Y', '2'))
/*! \brief Returns the OpenCL GPU devices that can interoperate with the supplied
 *  DirectX 9 media adapters.
 *
 *  \param platform            Must be NULL or the AMD platform.
 *  \param num_media_adapters  Number of entries in \a media_adapters_type and
 *                             \a media_adapters; must be non-zero.
 *  \param media_adapters_type Adapter type of each entry of \a media_adapters.
 *  \param media_adapters      Array of IDirect3DDevice9Ex pointers.
 *  \param media_adapter_set   CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR or
 *                             CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR; both are
 *                             handled identically here.
 *  \param num_entries         Capacity of \a devices.
 *  \param devices             Output array, may be NULL. Slots beyond the number
 *                             of compatible devices are zero-filled.
 *  \param num_devices         Receives the number of compatible devices (written
 *                             through not_null()).
 *
 *  \return CL_SUCCESS on success, otherwise an OpenCL error code
 *          (CL_INVALID_PLATFORM, CL_INVALID_VALUE, CL_DEVICE_NOT_FOUND or the
 *          error reported by clGetDeviceIDs).
 */
RUNTIME_ENTRY(cl_int, clGetDeviceIDsFromDX9MediaAdapterKHR,
              (cl_platform_id platform, cl_uint num_media_adapters,
               cl_dx9_media_adapter_type_khr* media_adapters_type, void* media_adapters,
               cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries,
               cl_device_id* devices, cl_uint* num_devices)) {
  cl_int errcode;
  // Accept an array of DX9 devices here, as the spec describes an array of
  // num_media_adapters entries.
  IDirect3DDevice9Ex** d3d9_device = static_cast<IDirect3DDevice9Ex**>(media_adapters);
  cl_uint num_gpu_devices = 0;
  static const bool VALIDATE_ONLY = true;

  if (platform != NULL && platform != AMD_PLATFORM) {
    LogWarning("\"platrform\" is not a valid AMD platform");
    return CL_INVALID_PLATFORM;
  }

  // Check that the input parameters are correct
  if ((num_media_adapters == 0) || (media_adapters_type == NULL) || (media_adapters == NULL) ||
      (media_adapter_set != CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR &&
       media_adapter_set != CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR) ||
      (num_entries == 0 && devices != NULL)) {
    return CL_INVALID_VALUE;
  }

  // Count the GPU devices
  errcode = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 0, NULL, &num_gpu_devices);
  if (errcode != CL_SUCCESS && errcode != CL_DEVICE_NOT_FOUND) {
    return CL_INVALID_VALUE;
  }

  if (!num_gpu_devices) {
    *not_null(num_devices) = 0;
    return CL_DEVICE_NOT_FOUND;
  }

  switch (media_adapter_set) {
    case CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR:
    case CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR: {
      // The vector owns the temporary device list, so every exit path frees it.
      std::vector<cl_device_id> gpu_devices(num_gpu_devices);
      errcode =
          clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, num_gpu_devices, gpu_devices.data(), NULL);
      if (errcode != CL_SUCCESS) {
        break;
      }

      std::vector<amd::Device*> compatible_devices;
      for (cl_uint i = 0; i < num_gpu_devices; ++i) {
        cl_device_id device = gpu_devices[i];
        if (!is_valid(device)) {
          continue;
        }

        // Since there can be multiple DX9 adapters passed in the array we need to
        // validate interoperability with each of them.
        for (cl_uint j = 0; j < num_media_adapters; ++j) {
          // Only D3D9Ex adapters are accepted. The adapter type is looked up with
          // the adapter index j; indexing it with the GPU index would read past
          // the end of media_adapters_type when there are more GPUs than adapters.
          if (media_adapters_type[j] != CL_ADAPTER_D3D9EX_KHR) {
            continue;
          }

          void* external_device[amd::Context::DeviceFlagIdx::LastDeviceFlagIdx] = {};
          external_device[amd::Context::DeviceFlagIdx::D3D9DeviceEXKhrIdx] = d3d9_device[j];

          if (as_amd(device)->bindExternalDevice(amd::Context::Flags::D3D9DeviceEXKhr,
                                                 external_device, NULL, VALIDATE_ONLY)) {
            compatible_devices.push_back(as_amd(device));
            // One compatible adapter is enough; stop so that the device is not
            // reported more than once.
            break;
          }
        }
      }

      if (compatible_devices.size() == 0) {
        *not_null(num_devices) = 0;
        errcode = CL_DEVICE_NOT_FOUND;
        break;
      }

      // devices may legitimately be NULL when the caller only wants the count.
      if (devices != NULL) {
        auto it = compatible_devices.cbegin();
        cl_uint compatible_count = std::min(num_entries, (cl_uint)compatible_devices.size());

        while (compatible_count--) {
          *devices++ = as_cl(*it++);
          --num_entries;
        }
        // Zero-fill the slots the caller provided but we did not use.
        while (num_entries--) {
          *devices++ = (cl_device_id)0;
        }
      }

      *not_null(num_devices) = (cl_uint)compatible_devices.size();
    } break;

    default:
      LogWarning("\"d3d9_device_set\" is invalid");
      errcode = CL_INVALID_VALUE;
  }

  return errcode;
}
RUNTIME_EXIT
/*! \brief Creates an OpenCL 2D image from one plane of a DirectX 9 media surface.
 *
 *  \param context      A valid OpenCL context.
 *  \param flags        CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY or CL_MEM_READ_WRITE;
 *                      0 is treated as CL_MEM_READ_WRITE.
 *  \param adapter_type CL_ADAPTER_D3D9_KHR, CL_ADAPTER_D3D9EX_KHR or
 *                      CL_ADAPTER_DXVA_KHR.
 *  \param surface_info Pointer to a cl_dx9_surface_info_khr describing the surface.
 *  \param plane        Plane of the surface to share. Must be 0 for non-planar
 *                      formats, 0..1 for NV12/P010 and 0..2 for YV12.
 *  \param errcode_ret  Receives the error code (written through not_null()).
 *
 *  \return The new memory object, or a NULL cl_mem on failure.
 */
RUNTIME_ENTRY_RET(cl_mem, clCreateFromDX9MediaSurfaceKHR,
                  (cl_context context, cl_mem_flags flags,
                   cl_dx9_media_adapter_type_khr adapter_type, void* surface_info, cl_uint plane,
                   cl_int* errcode_ret)) {
  cl_mem clMemObj = NULL;
  cl_dx9_surface_info_khr* cl_surf_info = NULL;

  if (!is_valid(context)) {
    *not_null(errcode_ret) = CL_INVALID_CONTEXT;
    LogWarning("invalid parameter \"context\"");
    return clMemObj;
  }

  if (!flags) flags = CL_MEM_READ_WRITE;
  if (!(((flags & CL_MEM_READ_ONLY) == CL_MEM_READ_ONLY) ||
        ((flags & CL_MEM_WRITE_ONLY) == CL_MEM_WRITE_ONLY) ||
        ((flags & CL_MEM_READ_WRITE) == CL_MEM_READ_WRITE))) {
    *not_null(errcode_ret) = CL_INVALID_VALUE;
    LogWarning("invalid parameter \"flags\"");
    return clMemObj;
  }

  if ((adapter_type != CL_ADAPTER_D3D9_KHR) && (adapter_type != CL_ADAPTER_D3D9EX_KHR) &&
      (adapter_type != CL_ADAPTER_DXVA_KHR)) {
    *not_null(errcode_ret) = CL_INVALID_VALUE;
    return clMemObj;
  }

  if (!surface_info) {
    *not_null(errcode_ret) = CL_INVALID_VALUE;
    LogWarning("parameter \"surface_info\" is a NULL pointer");
    return clMemObj;
  }

  cl_surf_info = (cl_dx9_surface_info_khr*)surface_info;
  IDirect3DSurface9* pD3D9Resource = cl_surf_info->resource;
  if (!pD3D9Resource) {
    *not_null(errcode_ret) = CL_INVALID_VALUE;
    LogWarning("parameter \"pD3DResource\" is a NULL pointer");
    return clMemObj;
  }

  // Desc is only meaningful when GetDesc succeeds, so do not read it otherwise.
  D3DSURFACE_DESC Desc;
  if (D3D_OK != pD3D9Resource->GetDesc(&Desc)) {
    *not_null(errcode_ret) = CL_INVALID_D3D9_RESOURCE_KHR;
    LogWarning("cannot query the description of the D3D9 surface");
    return clMemObj;
  }

  // Number of planes this runtime maps for the format: two for NV12/P010,
  // three for YV12, one for everything else.
  cl_uint planeCount = 1;
  if ((Desc.Format == D3DFMT_NV_12) || (Desc.Format == D3DFMT_P010)) {
    planeCount = 2;
  } else if (Desc.Format == D3DFMT_YV_12) {
    planeCount = 3;
  }

  if (planeCount == 1) {
    if (plane != 0) {
      *not_null(errcode_ret) = CL_INVALID_VALUE;
      LogWarning("The plane has to be Zero if the surface format is non-planar !");
      return clMemObj;
    }
  } else if (plane >= planeCount) {
    // Later stages only describe the planes counted above.
    *not_null(errcode_ret) = CL_INVALID_VALUE;
    LogWarning("invalid parameter \"plane\" for the planar surface format");
    return clMemObj;
  }

  // Check for image support: one image-capable device in the context is enough.
  const std::vector<amd::Device*>& devices = as_amd(context)->devices();
  bool supportPass = false;
  for (const auto& it : devices) {
    if (it->info().imageSupport_) {
      supportPass = true;
      break;
    }
  }
  if (!supportPass) {
    *not_null(errcode_ret) = CL_INVALID_OPERATION;
    LogWarning("there are no devices in context to support images");
    return (cl_mem)0;
  }

  // Hand over to the helper that wraps the surface in a 2D image object
  return amd::clCreateImage2DFromD3D9ResourceAMD(*as_amd(context), flags, adapter_type,
                                                 cl_surf_info, plane, errcode_ret);
}
RUNTIME_EXIT
// Entry point for acquiring DX9 media surfaces. It forwards all arguments
// unchanged to amd::clEnqueueAcquireExtObjectsAMD and tags the request with
// CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR; the helper's result is returned as is.
RUNTIME_ENTRY(cl_int, clEnqueueAcquireDX9MediaSurfacesKHR,
(cl_command_queue command_queue, cl_uint num_objects, const cl_mem* mem_objects,
cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event)) {
return amd::clEnqueueAcquireExtObjectsAMD(command_queue, num_objects, mem_objects,
num_events_in_wait_list, event_wait_list, event,
CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR);
}
RUNTIME_EXIT
// Entry point for releasing DX9 media surfaces. It forwards all arguments
// unchanged to amd::clEnqueueReleaseExtObjectsAMD and tags the request with
// CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR; the helper's result is returned as is.
RUNTIME_ENTRY(cl_int, clEnqueueReleaseDX9MediaSurfacesKHR,
(cl_command_queue command_queue, cl_uint num_objects, const cl_mem* mem_objects,
cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event)) {
return amd::clEnqueueReleaseExtObjectsAMD(command_queue, num_objects, mem_objects,
num_events_in_wait_list, event_wait_list, event,
CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR);
}
RUNTIME_EXIT
//
//
// namespace amd
//
//
namespace amd {
/*! @}
* \addtogroup CL-D3D9 interop helper functions
* @{
*/
//
// Class D3D9Object implementation
//
// Registry of the surfaces already used for interop. initD3D9Object() appends
// one pair per successfully initialised object: first = the caller's surface
// info and plane, second = the surface/handle actually shared with OpenCL.
std::vector<std::pair<TD3D9RESINFO, TD3D9RESINFO>> D3D9Object::resources_;
// Monitor taken (through ScopedLock) by initD3D9Object() while it reads and
// updates resources_.
Monitor D3D9Object::resLock_;
//
// clCreateImage2DFromD3D9ResourceAMD
//
/*! \brief Wraps one plane of a D3D9 surface in a new Image2DD3D9 memory object.
 *
 *  \param amdContext   Context that owns the new image.
 *  \param flags        Memory flags forwarded to the Image2DD3D9 constructor.
 *  \param adapter_type Adapter type forwarded to D3D9Object::initD3D9Object.
 *  \param surface_info Surface description forwarded to D3D9Object::initD3D9Object.
 *  \param plane        Plane of the surface to share.
 *  \param errcode_ret  Receives the result code (written through not_null()).
 *
 *  \return The image as a cl_mem, or a NULL cl_mem when initialisation,
 *          allocation or creation fails.
 */
cl_mem clCreateImage2DFromD3D9ResourceAMD(Context& amdContext, cl_mem_flags flags,
                                          cl_dx9_media_adapter_type_khr adapter_type,
                                          cl_dx9_surface_info_khr* surface_info, cl_uint plane,
                                          int* errcode_ret) {
  // Collect the D3D9-side description first; nothing is allocated on failure.
  D3D9Object obj;
  cl_int errcode = D3D9Object::initD3D9Object(amdContext, adapter_type, surface_info, plane, obj);
  if (CL_SUCCESS != errcode) {
    *not_null(errcode_ret) = errcode;
    return (cl_mem)0;
  }

  Image2DD3D9* pImage2DD3D9 = new (amdContext) Image2DD3D9(amdContext, flags, obj);
  if (!pImage2DD3D9) {
    *not_null(errcode_ret) = CL_OUT_OF_HOST_MEMORY;
    return (cl_mem)0;
  }

  if (!pImage2DD3D9->create()) {
    *not_null(errcode_ret) = CL_MEM_OBJECT_ALLOCATION_FAILURE;
    // Drop the reference taken by construction so the object is not leaked.
    pImage2DD3D9->release();
    return (cl_mem)0;
  }

  *not_null(errcode_ret) = CL_SUCCESS;
  return as_cl<Memory>(pImage2DD3D9);
}
//
// Helper function SyncD3D9Objects
//
/*! \brief Waits until the D3D9 work queued so far has been processed.
 *
 *  Only the first entry of \a memObjects is consulted: its D3D9 event query is
 *  issued and then polled with a flush until Direct3D stops reporting it as
 *  pending. Each missing prerequisite (empty list, NULL memory object, no
 *  interop object, no D3D9 object, no query) is logged and ends the call early.
 *
 *  \param memObjects Memory objects that are about to be acquired.
 */
void SyncD3D9Objects(std::vector<amd::Memory*>& memObjects) {
  // front() on an empty vector is undefined behaviour, so reject that first.
  if (memObjects.empty()) {
    LogWarning("\nEmpty memory object list\n");
    return;
  }

  Memory* mem = memObjects.front();
  if (!mem) {
    LogWarning("\nNULL memory object\n");
    return;
  }

  InteropObject* interop = mem->getInteropObj();
  if (!interop) {
    LogWarning("\nNULL interop object\n");
    return;
  }

  D3D9Object* d3d9Obj = interop->asD3D9Object();
  if (!d3d9Obj) {
    LogWarning("\nNULL D3D9 object\n");
    return;
  }

  IDirect3DQuery9* query = d3d9Obj->getQuery();
  if (!query) {
    LogWarning("\nNULL IDirect3DQuery9\n");
    return;
  }

  ScopedLock sl(d3d9Obj->getResLock());
  query->Issue(D3DISSUE_END);

  BOOL data = FALSE;
  // GetData reports S_FALSE while the query is still pending. Poll only for that
  // value: spinning on "not S_OK" would never terminate once the call starts
  // failing (for example after the device is lost).
  while (S_FALSE == query->GetData(&data, sizeof(BOOL), D3DGETDATA_FLUSH)) {
  }
}
//
// Class D3D9Object implementation
//
/*! \brief Returns the size in bytes of one element of the given surface plane.
 *
 *  \param d3d9Format Direct3D 9 format of the surface.
 *  \param plane      Plane index; only consulted for NV12 and P010.
 *
 *  \return The element size, or 0 for formats this helper does not size and for
 *          an NV12/P010 plane other than 0 or 1. Formats without a case label
 *          additionally trigger _ASSERT.
 */
size_t D3D9Object::getElementBytes(D3DFORMAT d3d9Format, cl_uint plane) {
  // Start at 0 so that a path which assigns nothing (an unexpected plane of a
  // planar format) returns a defined value instead of an indeterminate one.
  size_t bytesPerPixel = 0;

  switch (d3d9Format) {
    case D3DFMT_UNKNOWN:
    case D3DFMT_UYVY:
    case D3DFMT_DXT1:
    case D3DFMT_DXT2:
    case D3DFMT_DXT3:
    case D3DFMT_DXT4:
    case D3DFMT_DXT5:
    case D3DFMT_VERTEXDATA:
    case D3DFMT_D32:
    case D3DFMT_D15S1:
    case D3DFMT_D24S8:
    case D3DFMT_D24X8:
    case D3DFMT_D24X4S4:
    case D3DFMT_D16:
    case D3DFMT_INDEX16:
    case D3DFMT_INDEX32:
    case D3DFMT_MULTI2_ARGB8:
    case D3DFMT_CxV8U8:
      // Less than 1 byte per pixel - needs special consideration
      bytesPerPixel = 0;
      break;

    case D3DFMT_R3G3B2:
    case D3DFMT_P8:
    case D3DFMT_A8:
    case D3DFMT_L8:
    case D3DFMT_A4L4:
      bytesPerPixel = 1;
      break;

    case D3DFMT_R16F:
    case D3DFMT_R5G6B5:
    case D3DFMT_X1R5G5B5:
    case D3DFMT_A1R5G5B5:
    case D3DFMT_A4R4G4B4:
    case D3DFMT_A8R3G3B2:
    case D3DFMT_X4R4G4B4:
    case D3DFMT_A8P8:
    case D3DFMT_A8L8:
    case D3DFMT_V8U8:
    case D3DFMT_L6V5U5:
    case D3DFMT_D16_LOCKABLE:
    case D3DFMT_L16:
      bytesPerPixel = 2;
      break;

    case D3DFMT_R8G8B8:
    case D3DFMT_D24FS8:
      bytesPerPixel = 3;
      break;

    case D3DFMT_D32F_LOCKABLE:
    case D3DFMT_A8R8G8B8:
    case D3DFMT_R32F:
    case D3DFMT_X8R8G8B8:
    case D3DFMT_A2B10G10R10:
    case D3DFMT_A8B8G8R8:
    case D3DFMT_X8B8G8R8:
    case D3DFMT_G16R16:
    case D3DFMT_A2R10G10B10:
    case D3DFMT_Q8W8V8U8:
    case D3DFMT_X8L8V8U8:
    case D3DFMT_V16U16:
    case D3DFMT_A2W10V10U10:
    case D3DFMT_R8G8_B8G8:
    case D3DFMT_G8R8_G8B8:
    case D3DFMT_G16R16F:
    case D3DFMT_YUY2:
      bytesPerPixel = 4;
      break;

    case D3DFMT_G32R32F:
    case D3DFMT_A16B16G16R16:
    case D3DFMT_A16B16G16R16F:
    case D3DFMT_Q16W16V16U16:
      bytesPerPixel = 8;
      break;

    case D3DFMT_A32B32G32R32F:
      bytesPerPixel = 16;
      break;

    // D3DFMT_D32_LOCKABLE and D3DFMT_S8_LOCKABLE (D3D9Ex only) have no case
    // label here and therefore fall into the default branch.

    case D3DFMT_NV_12:
      if (plane == 0) {
        bytesPerPixel = 1;
      } else if (plane == 1) {
        bytesPerPixel = 2;
      }  // any other plane is not expected here; the result stays 0
      break;

    case D3DFMT_P010:
      if (plane == 0) {
        bytesPerPixel = 2;
      } else if (plane == 1) {
        bytesPerPixel = 4;
      }  // any other plane is not expected here; the result stays 0
      break;

    case D3DFMT_YV_12:
      bytesPerPixel = 1;
      break;

    default:
      bytesPerPixel = 0;
      _ASSERT(FALSE);
      break;
  }

  return bytesPerPixel;
}
/*! \brief Fills a D3D9 object descriptor from a surface description.
 *
 *  Pool, resource type, usage and format are copied verbatim. The object size
 *  and the surface rectangle depend on the format and on the requested plane:
 *  NV12/P010 describe planes 0 and 1, YV12 describes planes 0 to 2, and every
 *  other format describes the whole surface (with the width halved for YUY2).
 *  For a plane outside those ranges the size and the plane-specific rectangle
 *  fields are left untouched.
 *
 *  \param objDesc Descriptor to fill.
 *  \param resDesc Surface description reported by Direct3D.
 *  \param plane   Plane of the surface being described.
 */
void setObjDesc(amd::D3D9ObjDesc_t& objDesc, D3DSURFACE_DESC& resDesc, cl_uint plane) {
  const auto surfaceWidth = resDesc.Width;
  const auto surfaceHeight = resDesc.Height;

  objDesc.d3dPool_ = resDesc.Pool;
  objDesc.resType_ = resDesc.Type;
  objDesc.usage_ = resDesc.Usage;
  objDesc.d3dFormat_ = resDesc.Format;

  const bool isBiPlanar = (resDesc.Format == D3DFMT_NV_12) || (resDesc.Format == D3DFMT_P010);

  if (isBiPlanar) {
    objDesc.surfRect_.left = 0;
    objDesc.surfRect_.top = 0;
    if (plane == 0 || plane == 1) {
      // Plane 1 is half the size of plane 0 in both directions
      const bool fullSize = (plane == 0);
      objDesc.objSize_.Height = fullSize ? surfaceHeight : surfaceHeight / 2;
      objDesc.objSize_.Width = fullSize ? surfaceWidth : surfaceWidth / 2;
      // Both planes use the same rectangle
      objDesc.surfRect_.right = surfaceWidth;
      objDesc.surfRect_.bottom = 3 * surfaceHeight / 2;
    }
    return;
  }

  if (resDesc.Format == D3DFMT_YV_12) {
    objDesc.surfRect_.left = 0;
    switch (plane) {
      case 0:
        objDesc.objSize_.Height = surfaceHeight;
        objDesc.objSize_.Width = surfaceWidth;
        objDesc.surfRect_.top = 0;
        objDesc.surfRect_.right = surfaceWidth - 1;
        objDesc.surfRect_.bottom = surfaceHeight - 1;
        break;
      case 1:
        objDesc.objSize_.Height = surfaceHeight / 2;
        objDesc.objSize_.Width = surfaceWidth / 2;
        objDesc.surfRect_.top = surfaceHeight;
        objDesc.surfRect_.right = surfaceWidth / 2 - 1;
        objDesc.surfRect_.bottom = 3 * surfaceHeight / 2 - 1;
        break;
      case 2:
        objDesc.objSize_.Height = surfaceHeight / 2;
        objDesc.objSize_.Width = surfaceWidth / 2;
        objDesc.surfRect_.top = 3 * surfaceHeight / 2;
        objDesc.surfRect_.right = surfaceWidth / 2 - 1;
        objDesc.surfRect_.bottom = 2 * surfaceHeight - 1;
        break;
      default:
        // Planes above 2 are not expected for YV12; nothing else is written
        break;
    }
    return;
  }

  // All remaining formats describe the whole surface
  objDesc.objSize_.Height = surfaceHeight;
  objDesc.objSize_.Width = surfaceWidth;
  objDesc.surfRect_.left = 0;
  objDesc.surfRect_.top = 0;
  objDesc.surfRect_.right = surfaceWidth - 1;
  objDesc.surfRect_.bottom = surfaceHeight - 1;
  if (resDesc.Format == D3DFMT_YUY2) {
    objDesc.objSize_.Width >>= 1;
  }
}
// Initialises `obj` from the caller's surface description and records the
// surface in resources_.
//
// Parameters:
//   amdContext   - context used to check that the resulting CL image format is supported
//   adapter_type - adapter type; D3D9 and DXVA adapters are rejected below
//   cl_surf_info - surface pointer and optional shared handle supplied by the caller
//   plane        - plane of the surface to share
//   obj          - object to initialise
//
// Returns CL_SUCCESS, CL_INVALID_DX9_MEDIA_ADAPTER_KHR,
// CL_INVALID_D3D9_RESOURCE_KHR or CL_INVALID_IMAGE_FORMAT_DESCRIPTOR.
int D3D9Object::initD3D9Object(const Context& amdContext,
cl_dx9_media_adapter_type_khr adapter_type,
cl_dx9_surface_info_khr* cl_surf_info, cl_uint plane,
D3D9Object& obj) {
// resources_ is shared by all D3D9Object instances; hold the lock for the whole call
ScopedLock sl(resLock_);
IDirect3DDevice9Ex* pDev9Ex = NULL;
cl_int errcode = CL_SUCCESS;
// Check if this resource has already been used for interop
IDirect3DSurface9* pD3D9res = cl_surf_info->resource;
HANDLE shared_handle = cl_surf_info->shared_handle;
if ((adapter_type == CL_ADAPTER_D3D9_KHR) || (adapter_type == CL_ADAPTER_DXVA_KHR)) {
return CL_INVALID_DX9_MEDIA_ADAPTER_KHR; // Not supported yet
}
// The same (surface, plane) pair may only be registered once
for (const auto& it : resources_) {
if (it.first.surfInfo.resource == cl_surf_info->resource && it.first.surfPlane == plane) {
return CL_INVALID_D3D9_RESOURCE_KHR;
}
}
HRESULT hr;
D3DQUERYTYPE desc = D3DQUERYTYPE_EVENT;
D3DSURFACE_DESC resDesc;
if (D3D_OK != pD3D9res->GetDesc(&resDesc)) {
return CL_INVALID_D3D9_RESOURCE_KHR;
}
// Obtain the owning device as IDirect3DDevice9Ex and create the event query
// that the synchronisation code in this file polls.
// NOTE(review): the result of CreateQuery is not checked, so obj.pQuery_ may be
// left unset on failure - confirm that every user tolerates that.
hr = pD3D9res->GetContainer(IID_IDirect3DDevice9Ex, (void**)&pDev9Ex);
if (hr == D3D_OK) {
pDev9Ex->CreateQuery(desc, &(obj.pQuery_));
} else {
return CL_INVALID_D3D9_RESOURCE_KHR; // d3d9ex should be supported
}
obj.handleShared_ = shared_handle;
obj.surfPlane_ = plane;
obj.surfInfo_ = *cl_surf_info;
obj.adapterType_ = adapter_type;
// Init defaults
setObjDesc(obj.objDescOrig_, resDesc, plane);
obj.objDesc_ = obj.objDescOrig_;
// Handling for the case where the caller passed no shared handle:
// first look for another plane of the same surface that is already registered
// and reuse its shared surface; otherwise create a new shared surface.
if (NULL == shared_handle) {
bool found = false;
for (const auto& it : resources_) {
if (it.first.surfInfo.resource == cl_surf_info->resource &&
it.first.surfPlane != plane) {
obj.handleShared_ = it.second.surfInfo.shared_handle;
obj.pD3D9Res_ = it.second.surfInfo.resource;
obj.pD3D9Res_->AddRef();
obj.objDesc_ = obj.objDescOrig_;
found = true;
break;
}
}
if (!found) {
obj.handleShared_ = 0;
// Create a surface with the same dimensions, format and pool; the call also
// fills in obj.handleShared_.
hr = pDev9Ex->CreateOffscreenPlainSurface(resDesc.Width, resDesc.Height, resDesc.Format,
resDesc.Pool, &obj.pD3D9Res_, &obj.handleShared_);
if (D3D_OK != hr) {
// NOTE(review): execution continues after this failure (the original surface
// is still AddRef'ed below) and the error is only returned at the end -
// confirm that this is intended.
errcode = CL_INVALID_D3D9_RESOURCE_KHR;
}
}
// put the original info into the obj
obj.pD3D9ResOrig_ = pD3D9res;
obj.pD3D9ResOrig_->AddRef(); // addRef in case lost the resource
} else {
// Share the original resource
obj.pD3D9ResOrig_ = NULL;
obj.pD3D9Res_ = pD3D9res;
obj.pD3D9Res_->AddRef();
}
// Release the Ex interface
if (pDev9Ex) pDev9Ex->Release();
// Check for CL format compatibility
// NOTE(review): this early return leaves the references taken above in obj;
// presumably the D3D9Object destructor releases them - verify.
if (obj.objDesc_.resType_ == D3DRTYPE_SURFACE) {
cl_image_format clFmt = obj.getCLFormatFromD3D9(obj.objDesc_.d3dFormat_, plane);
amd::Image::Format imageFormat(clFmt);
if (!imageFormat.isSupported(amdContext)) {
return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
}
}
// Register the (original, shared) pair only when everything above succeeded
TD3D9RESINFO d3d9ObjOri = {*cl_surf_info, plane};
TD3D9RESINFO d3d9ObjShared = {{obj.pD3D9Res_, obj.handleShared_}, plane};
if (errcode == CL_SUCCESS) {
resources_.push_back({d3d9ObjOri, d3d9ObjShared});
}
return errcode;
}
/*! \brief Classifies the format of the original surface.
 *
 *  \return 1 for NV12 and P010, 2 for YV12, 3 for YUY2 and 0 for any other format.
 */
cl_uint D3D9Object::getMiscFlag() {
  const auto origFormat = objDescOrig_.d3dFormat_;

  if (origFormat == D3DFMT_NV_12 || origFormat == D3DFMT_P010) {
    return 1;
  }
  if (origFormat == D3DFMT_YV_12) {
    return 2;
  }
  if (origFormat == D3DFMT_YUY2) {
    return 3;
  }
  return 0;
}
// Convenience overload: translates this object's own format (objDesc_.d3dFormat_)
// for its own plane (surfPlane_) by calling the two-argument overload.
cl_image_format D3D9Object::getCLFormatFromD3D9() {
return getCLFormatFromD3D9(objDesc_.d3dFormat_, surfPlane_);
}
/*! \brief Translates a Direct3D 9 surface format into an OpenCL image format.
 *
 *  \param d3d9Fmt Direct3D 9 format to translate.
 *  \param plane   Plane index; only consulted for NV12 and P010, where plane 0
 *                 maps to CL_R and plane 1 to CL_RG.
 *
 *  \return The matching image format. Both fields are 0 for formats without a
 *          mapping (which also trigger _ASSERT); for NV12/P010 with a plane
 *          other than 0 or 1 only the channel order stays 0.
 */
cl_image_format D3D9Object::getCLFormatFromD3D9(D3DFORMAT d3d9Fmt, cl_uint plane) {
  cl_image_format result;
  result.image_channel_order = 0;
  result.image_channel_data_type = 0;

  // Writes both fields of the result in one go
  auto assign = [&result](cl_channel_order order, cl_channel_type type) {
    result.image_channel_order = order;
    result.image_channel_data_type = type;
  };

  // Two-plane formats: the data type is fixed, the channel order follows the plane
  auto assignBiPlanar = [&result, plane](cl_channel_type type) {
    result.image_channel_data_type = type;
    if (plane == 0) {
      result.image_channel_order = CL_R;
    } else if (plane == 1) {
      result.image_channel_order = CL_RG;
    }
  };

  switch (d3d9Fmt) {
    // Single channel
    case D3DFMT_R32F:
      assign(CL_R, CL_FLOAT);
      break;
    case D3DFMT_R16F:
      assign(CL_R, CL_HALF_FLOAT);
      break;
    case D3DFMT_L16:
      assign(CL_R, CL_UNORM_INT16);
      break;
    case D3DFMT_A8:
      assign(CL_A, CL_UNORM_INT8);
      break;
    case D3DFMT_L8:
      assign(CL_R, CL_UNORM_INT8);
      break;

    // Two channels
    case D3DFMT_G32R32F:
      assign(CL_RG, CL_FLOAT);
      break;
    case D3DFMT_G16R16F:
      assign(CL_RG, CL_HALF_FLOAT);
      break;
    case D3DFMT_G16R16:
      assign(CL_RG, CL_UNORM_INT16);
      break;
    case D3DFMT_A8L8:
      assign(CL_RG, CL_UNORM_INT8);
      break;

    // Four channels
    case D3DFMT_A32B32G32R32F:
      assign(CL_RGBA, CL_FLOAT);
      break;
    case D3DFMT_A16B16G16R16F:
      assign(CL_RGBA, CL_HALF_FLOAT);
      break;
    case D3DFMT_A16B16G16R16:
      assign(CL_RGBA, CL_UNORM_INT16);
      break;
    case D3DFMT_A8B8G8R8:
    case D3DFMT_X8B8G8R8:
      assign(CL_RGBA, CL_UNORM_INT8);
      break;
    case D3DFMT_A8R8G8B8:
    case D3DFMT_X8R8G8B8:
      assign(CL_BGRA, CL_UNORM_INT8);
      break;

    // FOURCC formats
    case D3DFMT_NV_12:
      assignBiPlanar(CL_UNORM_INT8);
      break;
    case D3DFMT_P010:
      assignBiPlanar(CL_UNORM_INT16);
      break;
    case D3DFMT_YV_12:
      assign(CL_R, CL_UNORM_INT8);
      break;
    case D3DFMT_YUY2:
      assign(CL_RGBA, CL_UNSIGNED_INT8);
      break;

    default:
      // No OpenCL mapping exists for any other format (packed RGB, palettised,
      // bump-map, compressed DXT, depth/stencil, vertex/index data, the D3D9Ex
      // additions, ...): assert and hand back the zeroed descriptor.
      _ASSERT(FALSE);
      break;
  }

  return result;
}
bool D3D9Object::copyOrigToShared() {
// Don't copy if there is no orig
if (NULL == getD3D9ResOrig()) return true;
IDirect3DDevice9Ex* d3dDev;
HRESULT hr;
ScopedLock sl(getResLock());
IDirect3DSurface9* srcSurf = getD3D9ResOrig();
IDirect3DSurface9* dstSurf = getD3D9Resource();
hr = getD3D9Resource()->GetContainer(IID_IDirect3DDevice9Ex, (void**)&d3dDev);
if (hr != D3D_OK || !d3dDev) {
LogError("\nCannot get D3D9 device from D3D9 surface\n");
return false;
}
hr = d3dDev->StretchRect(srcSurf, NULL, dstSurf, NULL, D3DTEXF_NONE);
if (hr != D3D_OK) {
LogError("\ncopy original surface to shared surface failed\n");
return false;
}
// Flush D3D queues and make sure D3D stuff is finished
pQuery_->Issue(D3DISSUE_END);
BOOL data;
while ((D3D_OK != pQuery_->GetData(&data, sizeof(BOOL), D3DGETDATA_FLUSH)) && (data != TRUE)) {
}
if (d3dDev) d3dDev->Release();
return true;
}
/*! \brief Copies the shared surface back into the caller's original surface.
 *
 *  Unlike copyOrigToShared(), this function does not wait for the copy to be
 *  processed.
 *
 *  \return true when there is no original surface (nothing to copy) or when the
 *          copy was issued successfully; false when the device cannot be
 *          obtained or StretchRect fails.
 */
bool D3D9Object::copySharedToOrig() {
  // Don't copy if there is no orig
  if (NULL == getD3D9ResOrig()) return true;

  ScopedLock sl(getResLock());

  IDirect3DDevice9Ex* d3dDev = NULL;
  HRESULT hr = getD3D9Resource()->GetContainer(IID_IDirect3DDevice9Ex, (void**)&d3dDev);
  if (hr != D3D_OK || !d3dDev) {
    LogError("\nCannot get D3D9 device from D3D9 surface\n");
    return false;
  }

  hr = d3dDev->StretchRect(getD3D9Resource(), NULL, getD3D9ResOrig(), NULL, D3DTEXF_NONE);
  // GetContainer handed out a reference; give it back on every path, including
  // the failure path below.
  d3dDev->Release();
  if (hr != D3D_OK) {
    LogError("\ncopy shared surface to original surface failed\n");
    return false;
  }

  return true;
}
/*! \brief Points deviceMemories_ at the storage that directly follows this object
 *  and zeroes one DeviceMemory slot per device of the context.
 */
void Image2DD3D9::initDeviceMemory() {
  char* const trailingStorage = reinterpret_cast<char*>(this) + sizeof(Image2DD3D9);
  deviceMemories_ = reinterpret_cast<DeviceMemory*>(trailingStorage);

  const size_t slotBytes = context_().devices().size() * sizeof(DeviceMemory);
  memset(deviceMemories_, 0, slotBytes);
}
} // namespace amd
#endif //_WIN32
文件差异内容过多而无法显示 加载差异
+398
查看文件
@@ -0,0 +1,398 @@
/* Copyright (c) 2010 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef CL_GL_AMD_HPP_
#define CL_GL_AMD_HPP_
#ifdef _WIN32
#include <windows.h>
#else //!_WIN32
#include <dlfcn.h>
#endif //!_WIN32
#include <GL/gl.h>
#include <GL/glext.h>
#include "CL/cl_gl.h"
#ifndef _WIN32
#include <GL/glx.h>
#endif //!_WIN32
#include <EGL/egl.h>
#include <EGL/eglext.h>
#include <EGL/eglplatform.h>
#include "platform/context.hpp"
#include "platform/command.hpp"
namespace amd
{
/*! \brief Holds the description of the OpenGL object a CL object was created from.
 *
 *  The values are captured once by the constructor and exposed through
 *  read-only accessors.
 */
class GLObject : public InteropObject {
 protected:
  cl_gl_object_type clGLType_;  //!< CL GL object type
  GLenum glTarget_;             //!< GL target
  GLuint gluiName_;             //!< GL object name
  GLint gliMipLevel_;           //!< Mip level
  GLenum glInternalFormat_;     //!< GL internal format
  GLint gliWidth_;              //!< Width (also returned by getGLSize())
  GLint gliHeight_;             //!< Height
  GLint gliDepth_;              //!< Depth
  GLenum glCubemapFace_;        //!< Cubemap face
  GLsizei glNumSamples_;        //!< Number of samples

 public:
  //! Stores every argument in the member of the same name
  GLObject(GLenum glTarget, GLuint gluiName, GLint gliMipLevel, GLenum glInternalFormat,
           GLint gliWidth, GLint gliHeight, GLint gliDepth, cl_gl_object_type clGLType,
           GLenum glCubemapFace, GLsizei glNumSamples)
      : clGLType_(clGLType),
        glTarget_(glTarget),
        gluiName_(gluiName),
        gliMipLevel_(gliMipLevel),
        glInternalFormat_(glInternalFormat),
        gliWidth_(gliWidth),
        gliHeight_(gliHeight),
        gliDepth_(gliDepth),
        glCubemapFace_(glCubemapFace),
        glNumSamples_(glNumSamples) {}

  virtual ~GLObject() {}

  //! Identifies this interop object as a GL object
  virtual GLObject* asGLObject() { return this; }

  // Read-only access to the stored GL description
  GLenum getGLTarget() const { return glTarget_; }
  GLuint getGLName() const { return gluiName_; }
  GLint getGLMipLevel() const { return gliMipLevel_; }
  GLenum getGLInternalFormat() const { return glInternalFormat_; }
  //! Returns the same member as getGLWidth()
  GLint getGLSize() const { return gliWidth_; }
  GLint getGLWidth() const { return gliWidth_; }
  GLint getGLHeight() const { return gliHeight_; }
  GLint getGLDepth() const { return gliDepth_; }
  cl_gl_object_type getCLGLObjectType() const { return clGLType_; }
  GLenum getCubemapFace() const { return glCubemapFace_; }
  GLsizei getNumSamples() const { return glNumSamples_; }
};
/*! \brief CL buffer created from a GL buffer object.
 *
 *  Derives from Buffer, which carries the CL side, and from GLObject, which
 *  carries the GL side.
 */
class BufferGL : public Buffer, public GLObject {
 protected:
  //! Initializes the device memory array which is nested
  //  after the 'BufferGL' object in memory layout.
  virtual void initDeviceMemory();

 public:
  //! Forwards the arguments to both base classes and registers this object as
  //! its own interop object
  BufferGL(Context& amdContext, cl_mem_flags clFlags, size_t uiSizeInBytes, GLenum glTarget,
           GLuint gluiName)
      : Buffer(amdContext, clFlags, uiSizeInBytes),
        GLObject(glTarget, gluiName,
                 0,                      // Mipmap level default
                 GL_ARRAY_BUFFER,        // Just init to some value
                 (GLint)uiSizeInBytes,   // stored as the width, i.e. what getGLSize() returns
                 1,                      // height
                 1,                      // depth
                 CL_GL_OBJECT_BUFFER,
                 0,                      // cubemap face
                 0) {                    // number of samples
    setInteropObj(this);
  }

  virtual ~BufferGL() {}

  virtual BufferGL* asBufferGL() { return this; }
};
/*! \brief CL image created from a GL texture or renderbuffer.
 *
 *  Derives from Image, which carries the CL side, and from GLObject, which
 *  carries the GL side.
 */
class ImageGL : public Image, public GLObject {
 public:
  //! Forwards the arguments to both base classes and registers this object as
  //! its own interop object
  // NOTE(review): the last Image argument is computed as
  // elementSize * width * depth. If that parameter is the slice pitch, `height`
  // would be expected instead of `depth` - confirm against the Image constructor.
  ImageGL(Context& amdContext, cl_mem_object_type clType, cl_mem_flags clFlags,
          const Format& format, size_t width, size_t height, size_t depth, GLenum glTarget,
          GLuint gluiName, GLint gliMipLevel, GLenum glInternalFormat,
          cl_gl_object_type clGLType, GLsizei numSamples, GLenum glCubemapFace = 0)
      : Image(amdContext, clType, clFlags, format, width, height, depth,
              Format(format).getElementSize() * width,
              Format(format).getElementSize() * width * depth),
        GLObject(glTarget, gluiName, gliMipLevel, glInternalFormat, static_cast<GLint>(width),
                 static_cast<GLint>(height), static_cast<GLint>(depth), clGLType, glCubemapFace,
                 numSamples) {
    setInteropObj(this);
  }

  virtual ~ImageGL() {}

 protected:
  //! Initializes the device memory array which is nested
  //  after the 'ImageGL' object in memory layout.
  virtual void initDeviceMemory();
};
// Pointer type for eglGetCurrentContext, used on every platform.
typedef EGLContext (*PFN_eglGetCurrentContext) ();
// Platform layer: defines the calling convention (APICALL), the symbol lookup
// function (GETPROCADDRESS), the name of the GL extension loader
// (API_GETPROCADDR) and pointer types for the WGL (Windows) or X11/GLX
// (other platforms) functions stored in GLFunctions.
#ifdef _WIN32
#define APICALL WINAPI
#define GETPROCADDRESS GetProcAddress
#define API_GETPROCADDR "wglGetProcAddress"
#define FCN_STR_TYPE LPCSTR
typedef PROC (WINAPI* PFN_xxxGetProcAddress) (LPCSTR fcnName);
typedef HGLRC (APICALL* PFN_wglCreateContext) (HDC hdc);
typedef HGLRC (APICALL* PFN_wglGetCurrentContext) (void);
typedef HDC (APICALL* PFN_wglGetCurrentDC) (void);
typedef BOOL (APICALL* PFN_wglDeleteContext) (HGLRC hglrc);
typedef BOOL (APICALL* PFN_wglMakeCurrent) (HDC hdc, HGLRC hglrc);
typedef BOOL (APICALL* PFN_wglShareLists) (HGLRC hglrc1, HGLRC hglrc2);
#else //!_WIN32
// Outside Windows the Windows-only names used above (APICALL, WINAPI, PROC,
// HMODULE) are defined as empty or as void* so the shared declarations compile.
#define APICALL // __stdcall //??? todo odintsov
#define API_GETPROCADDR "glXGetProcAddress"
#define GETPROCADDRESS dlsym
#define FCN_STR_TYPE const GLubyte*
#define WINAPI
#define PROC void*
typedef void* (*PFN_xxxGetProcAddress) (const GLubyte* procName);
// X11 typedef
typedef Display* (*PFNXOpenDisplay)(_Xconst char* display_name );
typedef int (*PFNXCloseDisplay)(Display* display );
//glx typedefs
typedef GLXDrawable (*PFNglXGetCurrentDrawable)();
typedef Display* (*PFNglXGetCurrentDisplay)();
typedef GLXContext (*PFNglXGetCurrentContext)( void );
typedef XVisualInfo* (*PFNglXChooseVisual)(Display *dpy, int screen, int *attribList);
typedef GLXContext(*PFNglXCreateContext)(Display* dpy,XVisualInfo* vis,GLXContext shareList,Bool direct);
typedef void(*PFNglXDestroyContext)(Display* dpy, GLXContext ctx);
typedef Bool(*PFNglXMakeCurrent)( Display* dpy, GLXDrawable drawable, GLXContext ctx);
typedef void* HMODULE;
#endif //!_WIN32
// First expansion of gl_functions.hpp: each GLPREFIX entry becomes a function
// pointer typedef named PFN_<function>.
// NOTE(review): gl_functions.hpp presumably #undefs GLPREFIX after use, since
// the macro is defined a second time inside GLFunctions - confirm.
#define GLPREFIX(rtype, fcn, dclargs) \
typedef rtype (APICALL* PFN_##fcn) dclargs;
// Declare prototypes for GL functions
#include "gl_functions.hpp"
// Holds the dynamically resolved GL / WGL / GLX / EGL entry points together
// with the original, internal and temporary GL context handles declared below.
class GLFunctions
{
public:
//! Locks any access to the virtual GPUs
// NOTE(review): judging by its name and members this stack object sets up the
// internal GL environment for its lifetime; the constructor and destructor are
// defined elsewhere - confirm.
class SetIntEnv : public amd::StackObject {
public:
//! Default constructor
SetIntEnv(GLFunctions* env);
//! Destructor
~SetIntEnv();
//! Checks if the environment setup was successful
bool isValid() const { return isValid_; }
private:
GLFunctions* env_; //!< GL environment
bool isValid_; //!< If TRUE, then it's a valid setup
};
private:
HMODULE libHandle_;
int missed_; // Indicates how many GL functions not init'ed, if any
amd::Monitor lock_;
// EGL state, present on every platform
EGLDisplay eglDisplay_;
EGLContext eglOriginalContext_;
EGLContext eglInternalContext_;
EGLContext eglTempContext_;
bool isEGL_;
PFN_eglGetCurrentContext eglGetCurrentContext_;
#ifdef _WIN32
// WGL state (Windows)
HGLRC hOrigGLRC_;
HDC hDC_;
HGLRC hIntGLRC_; // handle for internal GLRC to access shared context
HDC tempDC_;
HGLRC tempGLRC_;
public:
PFN_wglCreateContext wglCreateContext_;
PFN_wglGetCurrentContext wglGetCurrentContext_;
PFN_wglGetCurrentDC wglGetCurrentDC_;
PFN_wglDeleteContext wglDeleteContext_;
PFN_wglMakeCurrent wglMakeCurrent_;
PFN_wglShareLists wglShareLists_;
#else
public:
// GLX state (non-Windows); note that these members are public
Display* Dpy_;
GLXDrawable Drawable_;
GLXContext origCtx_;
Display* intDpy_;
Window intDrawable_;
GLXContext intCtx_;
Display* tempDpy_;
GLXDrawable tempDrawable_;
GLXContext tempCtx_;
//pointers to X11 functions
PFNXOpenDisplay XOpenDisplay_;
PFNXCloseDisplay XCloseDisplay_;
//pointers to GLX functions
PFNglXGetCurrentDrawable glXGetCurrentDrawable_;
PFNglXGetCurrentDisplay glXGetCurrentDisplay_;
PFNglXGetCurrentContext glXGetCurrentContext_;
PFNglXChooseVisual glXChooseVisual_;
PFNglXCreateContext glXCreateContext_;
PFNglXDestroyContext glXDestroyContext_;
PFNglXMakeCurrent glXMakeCurrent_;
#endif
public:
GLFunctions(HMODULE h, bool isEGL);
~GLFunctions();
bool update(intptr_t hglrc);
// Returns true when info.hCtx_ is non-null and equals the GL context that is
// current on the calling thread (queried through EGL, WGL or GLX).
// NOTE(review): only the EGL branch checks its function pointer for null before
// calling it - confirm the WGL/GLX pointers are always resolved.
bool IsCurrentGlContext(const amd::Context::Info& info) const {
if (isEGL_) {
return ((info.hCtx_ != nullptr) && (eglGetCurrentContext_ != nullptr) &&
(info.hCtx_ == eglGetCurrentContext_()));
} else {
#ifdef _WIN32
return ((info.hCtx_ != nullptr) && (info.hCtx_ == wglGetCurrentContext_()));
#else
return ((info.hCtx_ != nullptr) && (info.hCtx_ == glXGetCurrentContext_()));
#endif // _WIN32
}
}
void WaitCurrentGlContext(const amd::Context::Info& info) const;
// Query CL-GL context association
// True when the EGL display/context pair, or the platform's display (or DC) and
// original context pair, are both set.
bool isAssociated() const
{
if (isEGL_ && eglDisplay_ && eglOriginalContext_) return true;
#ifdef _WIN32
if(hDC_ && hOrigGLRC_) return true;
#else //!_WIN32
if(Dpy_ && origCtx_) return true;
#endif //!_WIN32
return false;
}
bool isEGL() const
{
return isEGL_;
}
// Accessor methods
// Note: the EGL accessors are declared in the non-Windows branch only.
#ifdef _WIN32
HGLRC getOrigGLRC() const {return hOrigGLRC_;}
HDC getDC() const {return hDC_;}
HGLRC getIntGLRC() const {return hIntGLRC_;}
#else //!_WIN32
Display* getDpy() const {return Dpy_;}
GLXDrawable getDrawable() const {return Drawable_;}
GLXContext getOrigCtx() const {return origCtx_;}
Display* getIntDpy() const {return intDpy_;}
GLXDrawable getIntDrawable() const {return intDrawable_;}
GLXContext getIntCtx() const {return intCtx_;}
EGLDisplay getEglDpy() const { return eglDisplay_; }
EGLContext getEglOrigCtx() const { return eglOriginalContext_; }
#endif //!_WIN32
// Initialize GL dynamic library and function pointers
bool init(intptr_t hdc, intptr_t hglrc);
// Return true if successful, false - if error occurred
bool setIntEnv();
bool restoreEnv();
amd::Monitor& getLock() { return lock_; }
PFN_xxxGetProcAddress GetProcAddress_;
// Second expansion of gl_functions.hpp: each GLPREFIX entry becomes a member
// named <function>_ of type PFN_<function>.
#define GLPREFIX(rtype, fcn, dclargs) \
PFN_##fcn fcn##_;
// Declare pointers to GL functions
#include "gl_functions.hpp"
};
//! Functions for executing the GL related stuff
cl_mem clCreateFromGLBufferAMD(Context& amdContext, cl_mem_flags flags,
GLuint bufobj, cl_int* errcode_ret);
cl_mem clCreateFromGLTextureAMD(Context& amdContext, cl_mem_flags flags,
GLenum target, GLint miplevel, GLuint texture, int* errcode_ret);
cl_mem clCreateFromGLRenderbufferAMD(Context& amdContext, cl_mem_flags flags,
GLuint renderbuffer, int* errcode_ret);
bool
getCLFormatFromGL(
const Context& amdContext,
GLint gliInternalFormat,
cl_image_format* pclImageFormat,
int* piBytesPerPixel,
cl_mem_flags flags
);
} //namespace amd
#endif //CL_GL_AMD_HPP_
@@ -0,0 +1,51 @@
# Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# FindROCclr
# ----------
# Locates the ROCclr sources by searching for top.hpp and, when found, loads
# the ROCclr CMake module from the "cmake" directory next to the include dir.
#
# Input:
#   ROCCLR_PATH        - optional hint pointing at the ROCclr root.
# Output:
#   ROCCLR_INCLUDE_DIR - cache entry: directory that contains top.hpp.
#   ROCclr_FOUND       - set by find_package_handle_standard_args.

# Nothing to do when an earlier invocation already located ROCclr.
if(ROCCLR_FOUND OR ROCclr_FOUND)
  return()
endif()

find_path(ROCCLR_INCLUDE_DIR top.hpp
  HINTS
    ${ROCCLR_PATH}
  PATHS
    # gerrit repo name
    ${CMAKE_SOURCE_DIR}/vdi
    ${CMAKE_SOURCE_DIR}/../vdi
    ${CMAKE_SOURCE_DIR}/../../vdi
    # github repo name
    ${CMAKE_SOURCE_DIR}/ROCclr
    ${CMAKE_SOURCE_DIR}/../ROCclr
    ${CMAKE_SOURCE_DIR}/../../ROCclr
    # jenkins repo name
    ${CMAKE_SOURCE_DIR}/rocclr
    ${CMAKE_SOURCE_DIR}/../rocclr
    ${CMAKE_SOURCE_DIR}/../../rocclr
  PATH_SUFFIXES
    include)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(ROCclr
  REQUIRED_VARS ROCCLR_INCLUDE_DIR
  FAIL_MESSAGE "\nROCclr not found")
mark_as_advanced(ROCCLR_INCLUDE_DIR)

# Only load the ROCclr module when the headers were actually located.
# Without this guard a non-REQUIRED find_package(ROCclr) that fails would
# still run include(ROCclr) with a "-NOTFOUND" module path and abort the
# configure step instead of just reporting ROCclr as not found.
if(ROCclr_FOUND)
  list(APPEND CMAKE_MODULE_PATH "${ROCCLR_INCLUDE_DIR}/../cmake")
  include(ROCclr)
endif()
+40
查看文件
@@ -0,0 +1,40 @@
/* Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "vdi_common.hpp"
#ifdef _WIN32
#include <windows.h>
#include <d3d9.h>
#include <d3d10_1.h>
#include <CL/cl_d3d10.h>
#include <CL/cl_d3d11.h>
#include <CL/cl_dx9_media_sharing.h>
#endif
#include <CL/cl_icd.h>
// Storage for the static ICD dispatch array, initialised with {0}.
cl_icd_dispatch amd::ICDDispatchedObject::icdVendorDispatch_[] = {0};
// The single platform object, initialised with the dispatch array above.
amd::PlatformIDS amd::PlatformID::Platform = {amd::ICDDispatchedObject::icdVendorDispatch_};
// clGetDeviceIDs entry point: ignores every argument and reports CL_SUCCESS.
// NOTE(review): neither `devices` nor `num_devices` is written - presumably a
// placeholder so the symbol exists for linking; confirm before relying on it.
RUNTIME_ENTRY(cl_int, clGetDeviceIDs,
(cl_platform_id platform, cl_device_type device_type, cl_uint num_entries,
cl_device_id* devices, cl_uint* num_devices)) {
return CL_SUCCESS;
}
RUNTIME_EXIT
+26
查看文件
@@ -0,0 +1,26 @@
/* Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "platform/activity.hpp"
#include <hip/hip_runtime_api.h>
// C-linkage helper: converts `op` to cl_command_type and returns whatever
// getOclCommandKindString() reports for it.
extern "C" const char* hipGetCmdName(unsigned op) {
  const auto kind = static_cast<cl_command_type>(op);
  return getOclCommandKindString(kind);
}
+910
查看文件
@@ -0,0 +1,910 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "hip_code_object.hpp"
#include "amd_hsa_elf.hpp"
#include <cstring>
#include <hip/driver_types.h>
#include "hip/hip_runtime_api.h"
#include "hip/hip_runtime.h"
#include "hip_internal.hpp"
#include "platform/program.hpp"
#include <elf/elf.hpp>
hipError_t ihipFree(void* ptr);
// forward declaration of methods required for managed variables
hipError_t ihipMallocManaged(void** ptr, size_t size, unsigned int align = 0);
namespace {
// Compile-time length of a NUL-terminated string; the terminator is not counted.
size_t constexpr strLiteralLength(char const* str) {
return *str ? 1 + strLiteralLength(str + 1) : 0;
}
// Magic string expected at the very start of a clang offload bundle.
constexpr char const* CLANG_OFFLOAD_BUNDLER_MAGIC_STR = "__CLANG_OFFLOAD_BUNDLE__";
// Offload-kind prefixes accepted in bundle entry ids.
constexpr char const* OFFLOAD_KIND_HIP = "hip";
constexpr char const* OFFLOAD_KIND_HIPV4 = "hipv4";
constexpr char const* OFFLOAD_KIND_HCC = "hcc";
// Target-triple prefix; callers append one more '-' before the processor name.
constexpr char const* AMDGCN_TARGET_TRIPLE = "amdgcn-amd-amdhsa-";
// ClangOFFLOADBundle info.
// Number of characters in the magic string (no terminator).
static constexpr size_t bundle_magic_string_size =
strLiteralLength(CLANG_OFFLOAD_BUNDLER_MAGIC_STR);
// Clang Offload bundler description & Header.
// One bundle entry descriptor. bundleEntryId is a variable-length field:
// bundleEntryIdSize bytes of id text follow, and the next descriptor starts
// right after them (see the descriptor walk in extractCodeObjectFromFatBinary).
struct __ClangOffloadBundleInfo {
// Added to the address of the bundle header to locate the code object image.
uint64_t offset;
// Size of the code object image in bytes.
uint64_t size;
uint64_t bundleEntryIdSize;
const char bundleEntryId[1];
};
// Header at the start of the bundle: magic, entry count, first descriptor.
// NOTE(review): `magic` is declared one byte shorter than the magic string;
// numOfCodeObjects presumably still lands right after the full magic thanks to
// uint64_t alignment padding - confirm before changing either field.
struct __ClangOffloadBundleHeader {
const char magic[bundle_magic_string_size - 1];
uint64_t numOfCodeObjects;
__ClangOffloadBundleInfo desc[1];
};
} // namespace
namespace hip {
// Returns true when the first bundle_magic_string_size bytes at `data` equal
// the clang offload bundler magic string.
bool CodeObject::IsClangOffloadMagicBundle(const void* data) {
  const std::string candidate(static_cast<const char*>(data), bundle_magic_string_size);
  return candidate == CLANG_OFFLOAD_BUNDLER_MAGIC_STR;
}
// Thin forwarder to amd::Elf::getElfSize() for the image at `emi`.
uint64_t CodeObject::ElfSize(const void* emi) {
  const uint64_t elf_size = amd::Elf::getElfSize(emi);
  return elf_size;
}
// Maps the EF_AMDGPU_MACH field of an ELF e_flags word to a processor name.
//
// On a known machine value the three output parameters are filled in
// (processor name, whether the processor has an XNACK setting, whether it has
// an SRAM-ECC setting) and true is returned. For an unknown value the outputs
// are left untouched and false is returned.
static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupported,
                        bool& sramEccSupported) {
  struct MachInfo {
    uint32_t mach;     // EF_AMDGPU_MACH_* value
    const char* name;  // processor name
    bool xnack;        // value reported through xnackSupported
    bool sramecc;      // value reported through sramEccSupported
  };
  static const MachInfo kKnownMachs[] = {
      {EF_AMDGPU_MACH_AMDGCN_GFX700, "gfx700", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX701, "gfx701", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX702, "gfx702", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX703, "gfx703", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX704, "gfx704", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX705, "gfx705", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX801, "gfx801", true, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX802, "gfx802", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX803, "gfx803", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX805, "gfx805", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX810, "gfx810", true, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX900, "gfx900", true, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX902, "gfx902", true, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX904, "gfx904", true, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX906, "gfx906", true, true},
      {EF_AMDGPU_MACH_AMDGCN_GFX908, "gfx908", true, true},
      {EF_AMDGPU_MACH_AMDGCN_GFX909, "gfx909", true, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX90A, "gfx90a", true, true},
      {EF_AMDGPU_MACH_AMDGCN_GFX90C, "gfx90c", true, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX940, "gfx940", true, true},
      {EF_AMDGPU_MACH_AMDGCN_GFX1010, "gfx1010", true, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX1011, "gfx1011", true, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX1012, "gfx1012", true, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX1013, "gfx1013", true, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX1030, "gfx1030", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX1031, "gfx1031", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX1032, "gfx1032", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX1033, "gfx1033", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX1034, "gfx1034", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX1035, "gfx1035", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX1036, "gfx1036", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX1100, "gfx1100", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX1101, "gfx1101", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX1102, "gfx1102", false, false},
      {EF_AMDGPU_MACH_AMDGCN_GFX1103, "gfx1103", false, false},
  };

  const uint32_t mach = EFlags & EF_AMDGPU_MACH;
  for (const MachInfo& entry : kKnownMachs) {
    if (entry.mach != mach) continue;
    xnackSupported = entry.xnack;
    sramEccSupported = entry.sramecc;
    proc_name = entry.name;
    return true;
  }
  return false;
}
// Derives the triple target id ("amdgcn-amd-amdhsa--<proc>[:sramecc±][:xnack±]")
// from the ELF header of a code object.
//
// code_object : start of the ELF image; a null pointer yields false.
// target_id   : receives the id. It is assigned as soon as the processor name
//               is known, so it may be modified even when false is returned.
// Returns false when the image is not an AMDGPU HSA ELF, the machine value is
// unknown, or the ABI version is V2 or unrecognised; true otherwise.
static bool getTripleTargetIDFromCodeObject(const void* code_object, std::string& target_id) {
  if (!code_object) return false;
  const Elf64_Ehdr* ehdr = reinterpret_cast<const Elf64_Ehdr*>(code_object);
  if (ehdr->e_machine != EM_AMDGPU) return false;
  if (ehdr->e_ident[EI_OSABI] != ELFOSABI_AMDGPU_HSA) return false;

  bool isXnackSupported{false}, isSramEccSupported{false};
  std::string proc_name;
  if (!getProcName(ehdr->e_flags, proc_name, isXnackSupported, isSramEccSupported)) return false;
  target_id = std::string(AMDGCN_TARGET_TRIPLE) + '-' + proc_name;

  switch (ehdr->e_ident[EI_ABIVERSION]) {
    case ELFABIVERSION_AMDGPU_HSA_V2: {
      LogPrintfInfo("[Code Object V2, target id:%s]", target_id.c_str());
      return false;
    }
    case ELFABIVERSION_AMDGPU_HSA_V3: {
      LogPrintfInfo("[Code Object V3, target id:%s]", target_id.c_str());
      // V3 stores each feature as a single on/off bit, so a setting is only
      // emitted for processors that have the feature at all.
      if (isSramEccSupported) {
        if (ehdr->e_flags & EF_AMDGPU_FEATURE_SRAMECC_V3)
          target_id += ":sramecc+";
        else
          target_id += ":sramecc-";
      }
      if (isXnackSupported) {
        if (ehdr->e_flags & EF_AMDGPU_FEATURE_XNACK_V3)
          target_id += ":xnack+";
        else
          target_id += ":xnack-";
      }
      break;
    }
    case ELFABIVERSION_AMDGPU_HSA_V4:
    case ELFABIVERSION_AMDGPU_HSA_V5: {
      // The ABI version is an enumerated value, not a bit mask: compare for
      // equality. A bitwise AND against the V4 value is also non-zero for V5
      // (LLVM defines them as 2 and 3), which mislabelled V5 objects as V4.
      if (ehdr->e_ident[EI_ABIVERSION] == ELFABIVERSION_AMDGPU_HSA_V4) {
        LogPrintfInfo("[Code Object V4, target id:%s]", target_id.c_str());
      } else {
        LogPrintfInfo("[Code Object V5, target id:%s]", target_id.c_str());
      }
      // V4+ uses a multi-bit field per feature; only explicit OFF / ON values
      // add a setting, any other value leaves the target id unqualified.
      unsigned co_sram_value = (ehdr->e_flags) & EF_AMDGPU_FEATURE_SRAMECC_V4;
      if (co_sram_value == EF_AMDGPU_FEATURE_SRAMECC_OFF_V4)
        target_id += ":sramecc-";
      else if (co_sram_value == EF_AMDGPU_FEATURE_SRAMECC_ON_V4)
        target_id += ":sramecc+";
      unsigned co_xnack_value = (ehdr->e_flags) & EF_AMDGPU_FEATURE_XNACK_V4;
      if (co_xnack_value == EF_AMDGPU_FEATURE_XNACK_OFF_V4)
        target_id += ":xnack-";
      else if (co_xnack_value == EF_AMDGPU_FEATURE_XNACK_ON_V4)
        target_id += ":xnack+";
      break;
    }
    default: {
      return false;
    }
  }
  return true;
}
// Strips the prefix 'consume_' from the front of 'input'.
// Example: input = "amdgcn-amd-amdhsa--gfx908", consume_ = "amdgcn-amd-amdhsa--"
//          -> input becomes "gfx908" and true is returned.
// When 'input' does not start with 'consume_' it is left unchanged and false
// is returned.
static bool consume(std::string& input, std::string consume_) {
  const std::string::size_type prefix_len = consume_.size();
  const bool has_prefix = input.compare(0, prefix_len, consume_) == 0;
  if (has_prefix) {
    input.erase(0, prefix_len);
  }
  return has_prefix;
}
// Splits 'input' at the first occurrence of 'trim'.
// Returns the text before the separator; 'input' keeps the remainder,
// separator included.
// Example: input = "gfx908:sramecc+", trim = ':'
//          -> returns "gfx908", input becomes ":sramecc+".
// When the separator is absent the whole string is returned and 'input'
// becomes empty.
static std::string trimName(std::string& input, char trim) {
  const std::string::size_type cut = input.find(trim);
  if (cut == std::string::npos) {
    std::string whole;
    whole.swap(input);  // hand back everything, leave input empty
    return whole;
  }
  std::string head(input, 0, cut);
  input.erase(0, cut);
  return head;
}
// Reads one feature setting from the front of 'input'.
//
// When 'input' starts with 'feature' (e.g. ":xnack"), the feature name and the
// single character that follows it are removed from 'input' and that character
// is returned (expected to be '+' or '-').
// When 'input' does not start with 'feature', 'input' is left unchanged and
// ' ' is returned, meaning "feature not specified".
// When the feature name is present but nothing follows it, 'input' is emptied
// and '\0' is returned so the caller can reject the id; previously this case
// threw std::out_of_range from substr(1) on an empty string.
static char getFeatureValue(std::string& input, std::string feature) {
  const std::string::size_type name_len = feature.size();
  if (input.compare(0, name_len, feature) != 0) {
    return ' ';
  }
  if (input.size() == name_len) {
    // Feature name with no '+' / '-' after it: malformed target id.
    input.clear();
    return '\0';
  }
  const char res = input[name_len];
  input.erase(0, name_len + 1);
  return res;
}
// Parses "<processor>[:sramecc<v>][:xnack<v>]" from the front of 'input'.
// 'processor' receives the text before the first ':'; 'sramecc_value' and
// 'xnack_value' receive '+', '-' or ' ' (setting absent). Parsed text is
// removed from 'input'. Returns false as soon as a setting carries any other
// value character; in that case later outputs are not written.
static bool getTargetIDValue(std::string& input, std::string& processor, char& sramecc_value,
                             char& xnack_value) {
  const auto isKnownSetting = [](char c) { return c == ' ' || c == '+' || c == '-'; };

  processor = trimName(input, ':');

  sramecc_value = getFeatureValue(input, std::string(":sramecc"));
  if (!isKnownSetting(sramecc_value)) {
    return false;
  }

  xnack_value = getFeatureValue(input, std::string(":xnack"));
  return isKnownSetting(xnack_value);
}
// Produces the triple target id for one bundle entry.
//
// bundled_co_entry_id : entry id from the bundle, "<offload kind>-<triple target id>".
// code_object         : start of the entry's image; only consulted for the
//                       "hip" and "hcc" offload kinds.
// co_triple_target_id : receives the result.
// Returns false for unknown offload kinds, for images whose ELF header cannot
// be turned into a target id, and for "hipv4" ids with nothing after the kind.
static bool getTripleTargetID(std::string bundled_co_entry_id, const void* code_object,
                              std::string& co_triple_target_id) {
  std::string offload_kind = trimName(bundled_co_entry_id, '-');
  if (offload_kind != OFFLOAD_KIND_HIPV4 && offload_kind != OFFLOAD_KIND_HIP &&
      offload_kind != OFFLOAD_KIND_HCC)
    return false;

  if (offload_kind != OFFLOAD_KIND_HIPV4)
    return getTripleTargetIDFromCodeObject(code_object, co_triple_target_id);

  // trimName() leaves the '-' separator at the front of the remainder, or an
  // empty string when the id had no separator at all. substr(1) on an empty
  // string throws std::out_of_range, so reject such ids explicitly.
  if (bundled_co_entry_id.empty()) return false;

  // For code object V4 onwards the bundled code object entry ID correctly
  // specifies the target triple.
  co_triple_target_id = bundled_co_entry_id.substr(1);
  return true;
}
// Decides whether a code object built for 'co_triple_target_id' can run on an
// agent whose ISA is 'agent_triple_target_id'.
//
// Identical ids are compatible. Otherwise both ids must parse completely as
// "amdgcn-amd-amdhsa--<processor>[:sramecc±][:xnack±]", the processors must
// match, and every setting the code object specifies must equal the agent's.
// A setting the code object leaves out is not compared.
static bool isCodeObjectCompatibleWithDevice(std::string co_triple_target_id,
                                             std::string agent_triple_target_id) {
  // Primitive check
  if (co_triple_target_id == agent_triple_target_id) return true;

  struct ParsedTarget {
    std::string processor;
    char sramecc;
    char xnack;
  };

  const std::string triple_prefix = std::string(AMDGCN_TARGET_TRIPLE) + '-';

  // Parses one id in place; succeeds only when nothing is left over.
  const auto parse = [&triple_prefix](std::string& id, ParsedTarget& out) {
    if (!consume(id, triple_prefix)) return false;
    if (!getTargetIDValue(id, out.processor, out.sramecc, out.xnack)) return false;
    return id.empty();
  };

  ParsedTarget co;
  if (!parse(co_triple_target_id, co)) return false;

  ParsedTarget agent;
  if (!parse(agent_triple_target_id, agent)) return false;

  // Compatibility rules
  if (agent.processor != co.processor) return false;
  if (co.sramecc != ' ' && co.sramecc != agent.sramecc) return false;
  if (co.xnack != ' ' && co.xnack != agent.xnack) return false;
  return true;
}
// This will be moved to COMGR eventually
// Maps the file behind 'fdesc' and picks one code object per requested device.
//   fdesc / fsize : descriptor and size of the bundle file; a negative
//                   descriptor yields hipErrorFileNotFound.
//   image         : receives the address of the mapping (offset 0).
//   device_names  : target ids to find code objects for.
//   code_objs     : receives {binary_ptr, binary_size} per device.
// Returns hipErrorInvalidValue when mapping fails, otherwise the result of
// extractCodeObjectFromFatBinary().
hipError_t CodeObject::ExtractCodeObjectFromFile(
    amd::Os::FileDesc fdesc, size_t fsize, const void** image,
    const std::vector<std::string>& device_names,
    std::vector<std::pair<const void*, size_t>>& code_objs) {
  if (fdesc < 0) {
    return hipErrorFileNotFound;
  }

  // Map the file to memory, with offset 0.
  // The file will be unmapped in ModuleUnload.
  const bool mapped = amd::Os::MemoryMapFileDesc(fdesc, fsize, 0, image);
  if (!mapped) {
    return hipErrorInvalidValue;
  }

  // Retrieve code_objs{binary_image, binary_size} for the devices.
  return extractCodeObjectFromFatBinary(*image, device_names, code_objs);
}
// This will be moved to COMGR eventually
// In-memory counterpart of ExtractCodeObjectFromFile(): obtains a URI for
// 'data' and then picks one code object per requested device.
// Returns hipErrorInvalidValue when the URI cannot be produced, otherwise the
// result of extractCodeObjectFromFatBinary().
hipError_t CodeObject::ExtractCodeObjectFromMemory(
    const void* data, const std::vector<std::string>& device_names,
    std::vector<std::pair<const void*, size_t>>& code_objs, std::string& uri) {
  // Get the URI from memory
  const bool have_uri = amd::Os::GetURIFromMemory(data, 0, uri);
  if (!have_uri) {
    return hipErrorInvalidValue;
  }
  return extractCodeObjectFromFatBinary(data, device_names, code_objs);
}
// This will be moved to COMGR eventually
// Walks a clang offload bundle and selects, for every requested agent, the
// first bundled code object that is compatible with it.
//   data                    : start of the bundle (must begin with the magic).
//   agent_triple_target_ids : one triple target id per device.
//   code_objs               : gets one {image, size} entry appended per device;
//                             entries stay {nullptr, 0} when nothing matches.
// Returns hipErrorInvalidKernelFile when the magic is missing, hipSuccess when
// every slot was filled, and hipErrorNoBinaryForGpu (after logging the devices
// and the bundle contents) otherwise.
hipError_t CodeObject::extractCodeObjectFromFatBinary(
    const void* data, const std::vector<std::string>& agent_triple_target_ids,
    std::vector<std::pair<const void*, size_t>>& code_objs) {
  const std::string magic(static_cast<const char*>(data), bundle_magic_string_size);
  if (magic != CLANG_OFFLOAD_BUNDLER_MAGIC_STR) {
    return hipErrorInvalidKernelFile;
  }

  // Initialize code objects: one empty placeholder per device.
  const size_t device_count = agent_triple_target_ids.size();
  code_objs.insert(code_objs.end(), device_count,
                   std::pair<const void*, size_t>(nullptr, 0));

  const auto* header = reinterpret_cast<const __ClangOffloadBundleHeader*>(data);
  const uintptr_t bundle_base = reinterpret_cast<uintptr_t>(header);

  // Descriptors are variable-length: the next one begins right after the
  // entry-id bytes of the current one.
  const auto nextDesc = [](const __ClangOffloadBundleInfo* d) {
    return reinterpret_cast<const __ClangOffloadBundleInfo*>(
        reinterpret_cast<uintptr_t>(&d->bundleEntryId[0]) + d->bundleEntryIdSize);
  };

  size_t unresolved = code_objs.size();
  const __ClangOffloadBundleInfo* desc = &header->desc[0];
  for (uint64_t idx = 0; idx < header->numOfCodeObjects; ++idx, desc = nextDesc(desc)) {
    const void* image = reinterpret_cast<const void*>(bundle_base + desc->offset);
    const size_t image_size = desc->size;

    // Every slot already has a code object: stop scanning.
    if (unresolved == 0) break;

    const std::string entry_id(desc->bundleEntryId, desc->bundleEntryIdSize);
    std::string co_triple_target_id;
    if (!getTripleTargetID(entry_id, image, co_triple_target_id)) continue;

    for (size_t dev = 0; dev < device_count; ++dev) {
      auto& slot = code_objs[dev];
      if (slot.first) continue;  // first compatible entry wins
      if (!isCodeObjectCompatibleWithDevice(co_triple_target_id,
                                            agent_triple_target_ids[dev])) {
        continue;
      }
      slot = std::make_pair(image, image_size);
      --unresolved;
    }
  }

  if (unresolved == 0) {
    return hipSuccess;
  }

  // Failure report: list what was asked for and what the bundle contains.
  LogPrintfError("%s",
                 "hipErrorNoBinaryForGpu: Unable to find code object for all current devices!");
  LogPrintfError("%s", " Devices:");
  for (size_t dev = 0; dev < device_count; ++dev) {
    LogPrintfError(" %s - [%s]", agent_triple_target_ids[dev].c_str(),
                   ((code_objs[dev].first) ? "Found" : "Not Found"));
  }

  LogPrintfError("%s", " Bundled Code Objects:");
  const __ClangOffloadBundleInfo* entry = &header->desc[0];
  for (uint64_t idx = 0; idx < header->numOfCodeObjects; ++idx, entry = nextDesc(entry)) {
    const std::string entry_id(entry->bundleEntryId, entry->bundleEntryIdSize);
    const void* image = reinterpret_cast<const void*>(bundle_base + entry->offset);
    std::string co_triple_target_id;
    if (getTripleTargetID(entry_id, image, co_triple_target_id)) {
      LogPrintfError(" %s - [code object targetID is %s]", entry_id.c_str(),
                     co_triple_target_id.c_str());
    } else {
      LogPrintfError(" %s - [Unsupported]", entry_id.c_str());
    }
  }
  LogPrintfError("hipErrorNoBinaryForGpu: Unable to find code object for all current devices! - %d",hipErrorNoBinaryForGpu);
  return hipErrorNoBinaryForGpu;
}
// Loads a dynamic code object for the current device.
// Creates the FatBinaryInfo for (fname, image), extracts the fat binary for
// g_devices[ihipGetDevice()], builds the program, then registers the global
// variables and functions found in it. Each step runs under
// IHIP_RETURN_ONFAIL; hipSuccess is returned when all steps succeed.
hipError_t DynCO::loadCodeObject(const char* fname, const void* image) {
amd::ScopedLock lock(dclock_);
// Number of devices = 1 in dynamic code object
// NOTE(review): fb_info_ is assigned without checking for a previous value;
// presumably this is called once per DynCO - confirm.
fb_info_ = new FatBinaryInfo(fname, image);
std::vector<hip::Device*> devices = {g_devices[ihipGetDevice()]};
IHIP_RETURN_ONFAIL(fb_info_->ExtractFatBinary(devices));
// No Lazy loading for DynCO
IHIP_RETURN_ONFAIL(fb_info_->BuildProgram(ihipGetDevice()));
// Define Global variables
IHIP_RETURN_ONFAIL(populateDynGlobalVars());
// Define Global functions
IHIP_RETURN_ONFAIL(populateDynGlobalFuncs());
return hipSuccess;
}
// Dynamic Code Object
// Releases everything the object owns: the managed allocation of every
// DVK_Managed variable, all Var and Function objects, and the FatBinaryInfo.
DynCO::~DynCO() {
  amd::ScopedLock lock(dclock_);

  for (auto& entry : vars_) {
    auto* var = entry.second;
    if (var->getVarKind() == Var::DVK_Managed) {
      hipError_t err = ihipFree(var->getManagedVarPtr());
      assert(err == hipSuccess);
    }
    delete var;
  }
  vars_.clear();

  for (auto& entry : functions_) {
    auto* func = entry.second;
    delete func;
  }
  functions_.clear();

  delete fb_info_;
}
// Looks up 'var_name' among the module's variables and asks it for its
// DeviceVar on this object's device.
// Returns hipErrorNotFound (after logging) when the name is unknown,
// otherwise whatever Var::getDeviceVar() reports.
hipError_t DynCO::getDeviceVar(DeviceVar** dvar, std::string var_name) {
  amd::ScopedLock lock(dclock_);
  CheckDeviceIdMatch();

  const auto found = vars_.find(var_name);
  if (found != vars_.end()) {
    return found->second->getDeviceVar(dvar, device_id_, module());
  }

  LogPrintfError("Cannot find the Var: %s ", var_name.c_str());
  return hipErrorNotFound;
}
// Resolves 'func_name' to a hipFunction_t for this module.
// Returns hipErrorInvalidValue for a null 'hfunc', hipErrorNotFound (after
// logging) for an unknown name, otherwise the result of Function::getDynFunc().
hipError_t DynCO::getDynFunc(hipFunction_t* hfunc, std::string func_name) {
  amd::ScopedLock lock(dclock_);
  CheckDeviceIdMatch();

  if (hfunc == nullptr) {
    return hipErrorInvalidValue;
  }

  const auto found = functions_.find(func_name);
  if (found != functions_.end()) {
    /* See if this could be solved */
    return found->second->getDynFunc(hfunc, module());
  }

  LogPrintfError("Cannot find the function: %s ", func_name.c_str());
  return hipErrorNotFound;
}
// Sets up one managed variable of a dynamic module.
//
// 'managedVar' is the variable name without the ".managed" suffix. The
// function reads size and initial value from the "<name>.managed" symbol,
// allocates managed memory of that size, records the allocation on the Var,
// copies the initial value into it and finally stores the managed pointer
// into the "<name>" device symbol.
//
// Returns hipSuccess, the failing status of a lookup / copy,
// hipErrorNotFound when "<name>" is not a known variable, or
// hipErrorInvalidResourceHandle when no null stream is available.
hipError_t DynCO::initDynManagedVars(const std::string& managedVar) {
  amd::ScopedLock lock(dclock_);
  DeviceVar* dvar = nullptr;
  void* pointer = nullptr;
  hipError_t status = hipSuccess;

  // To get size of the managed variable
  status = getDeviceVar(&dvar, managedVar + ".managed");
  if (status != hipSuccess) {
    ClPrint(amd::LOG_ERROR, amd::LOG_API, "Status %d, failed to get .managed device variable:%s",
            status, managedVar.c_str());
    return status;
  }

  // Make sure the variable that will own the allocation exists before
  // allocating anything; the iterator used to be dereferenced unchecked.
  auto it = vars_.find(managedVar);
  if (it == vars_.end()) {
    LogPrintfError("Cannot find the Var: %s ", managedVar.c_str());
    return hipErrorNotFound;
  }

  // Allocate managed memory for these symbols
  status = ihipMallocManaged(&pointer, dvar->size());
  if (status != hipSuccess) {
    ClPrint(amd::LOG_ERROR, amd::LOG_API, "Status %d, failed to allocate managed memory", status);
    guarantee(false, "Error during allocation of managed memory!");
  }

  // Update as managed variable and set managed memory pointer and size
  it->second->setManagedVarInfo(pointer, dvar->size());

  // Copy the initial value of the managed variable into the managed memory
  hip::Stream* stream = hip::getNullStream();
  if (stream != nullptr) {
    status = ihipMemcpy(pointer, reinterpret_cast<address>(dvar->device_ptr()), dvar->size(),
                        hipMemcpyDeviceToDevice, *stream);
    if (status != hipSuccess) {
      ClPrint(amd::LOG_ERROR, amd::LOG_API, "Status %d, failed to copy device ptr:%s", status,
              managedVar.c_str());
      return status;
    }
  } else {
    ClPrint(amd::LOG_ERROR, amd::LOG_API, "Host Queue is NULL");
    return hipErrorInvalidResourceHandle;
  }

  // Get device ptr to initialize with managed memory pointer
  status = getDeviceVar(&dvar, managedVar);
  if (status != hipSuccess) {
    ClPrint(amd::LOG_ERROR, amd::LOG_API, "Status %d, failed to get managed device variable:%s",
            status, managedVar.c_str());
    return status;
  }

  // Copy managed memory pointer to the managed device variable
  status = ihipMemcpy(reinterpret_cast<address>(dvar->device_ptr()), &pointer, dvar->size(),
                      hipMemcpyHostToDevice, *stream);
  if (status != hipSuccess) {
    ClPrint(amd::LOG_ERROR, amd::LOG_API, "Status %d, failed to copy device ptr:%s", status,
            managedVar.c_str());
    return status;
  }
  return status;
}
// Registers every global variable found in the module's code object and
// initialises the managed ones.
//
// Returns hipErrorSharedObjectSymbolNotFound when the variable names cannot
// be read from the code object, the first failing status of
// initDynManagedVars(), or hipSuccess.
hipError_t DynCO::populateDynGlobalVars() {
  amd::ScopedLock lock(dclock_);
  std::vector<std::string> var_names;
  const std::string managedVarExt = ".managed";

  // For Dynamic Modules there is only one hipFatBinaryDevInfo_
  device::Program* dev_program = fb_info_->GetProgram(ihipGetDevice())
                                     ->getDeviceProgram(*hip::getCurrentDevice()->devices()[0]);
  if (!dev_program->getGlobalVarFromCodeObj(&var_names)) {
    LogPrintfError("Could not get Global vars from Code Obj for Module: 0x%x \n", module());
    return hipErrorSharedObjectSymbolNotFound;
  }

  // Register all names first: initDynManagedVars() looks up both "<name>"
  // and "<name>.managed" in vars_.
  for (auto& elem : var_names) {
    vars_.insert(
        std::make_pair(elem, new Var(elem, Var::DeviceVarKind::DVK_Variable, 0, 0, 0, nullptr)));
  }

  for (auto& elem : var_names) {
    // Only names that END with ".managed" qualify: the suffix is stripped by
    // length below, which would mangle a name that merely contains it.
    const bool hasManagedSuffix =
        elem.size() >= managedVarExt.size() &&
        elem.compare(elem.size() - managedVarExt.size(), managedVarExt.size(), managedVarExt) == 0;
    if (!hasManagedSuffix) continue;

    const std::string managedVar = elem.substr(0, elem.size() - managedVarExt.size());
    // Stop at the first failure; previously a later success overwrote an
    // earlier error and the module load was reported as successful.
    hipError_t err = initDynManagedVars(managedVar);
    if (err != hipSuccess) {
      return err;
    }
  }
  return hipSuccess;
}
// Registers a Function object for every global function name reported by the
// module's device program.
// Returns hipErrorSharedObjectSymbolNotFound when the names cannot be read,
// hipSuccess otherwise.
hipError_t DynCO::populateDynGlobalFuncs() {
  amd::ScopedLock lock(dclock_);

  device::Program* dev_program = fb_info_->GetProgram(ihipGetDevice())
                                     ->getDeviceProgram(*hip::getCurrentDevice()->devices()[0]);

  // Get all the global func names from COMGR
  std::vector<std::string> func_names;
  const bool have_names = dev_program->getGlobalFuncFromCodeObj(&func_names);
  if (!have_names) {
    LogPrintfError("Could not get Global Funcs from Code Obj for Module: 0x%x \n", module());
    return hipErrorSharedObjectSymbolNotFound;
  }

  for (auto& name : func_names) {
    functions_.insert(std::make_pair(name, new Function(name)));
  }
  return hipSuccess;
}
// Static Code Object
// Nothing to set up beyond member default construction.
StatCO::StatCO() = default;
// Deletes every registered Function and Var and empties both maps.
StatCO::~StatCO() {
  amd::ScopedLock lock(sclock_);

  for (auto& entry : functions_) {
    auto* func = entry.second;
    delete func;
  }
  functions_.clear();

  for (auto& entry : vars_) {
    auto* var = entry.second;
    delete var;
  }
  vars_.clear();
}
// Creates the FatBinaryInfo for 'data' on first use and extracts the fat
// binary for all devices. When 'programs' is already set nothing happens.
// Returns hipSuccess, or the failing status of ExtractFatBinary().
hipError_t StatCO::digestFatBinary(const void* data, FatBinaryInfo*& programs) {
  amd::ScopedLock lock(sclock_);

  if (programs == nullptr) {
    // Create a new fat binary object and extract the fat binary for all devices.
    programs = new FatBinaryInfo(nullptr, data);
    IHIP_RETURN_ONFAIL(programs->ExtractFatBinary(g_devices));
  }
  return hipSuccess;
}
// Returns the address of the modules_ slot keyed by 'data', creating the slot
// if needed. When 'initialized' is true the fat binary is digested into that
// slot first; failure is only checked through assert.
FatBinaryInfo** StatCO::addFatBinary(const void* data, bool initialized) {
  amd::ScopedLock lock(sclock_);

  FatBinaryInfo*& slot = modules_[data];
  if (initialized) {
    hipError_t err = digestFatBinary(data, slot);
    assert(err == hipSuccess);
  }
  return &slot;
}
// Unregisters everything that belongs to 'module' (the address of a slot in
// modules_): its variables, managed variables, functions and finally the
// FatBinaryInfo itself.
// Returns hipSuccess, or the failing status of getStatDeviceVar() - in that
// case the function returns early and the remaining containers are untouched.
hipError_t StatCO::removeFatBinary(FatBinaryInfo** module) {
amd::ScopedLock lock(sclock_);
// 1) Variables: delete the Var and drop the map entry.
auto vit = vars_.begin();
while (vit != vars_.end()) {
if (vit->second->moduleInfo() == module) {
delete vit->second;
vit = vars_.erase(vit);
} else {
++vit;
}
}
// 2) Managed variables: free the per-device pointer on every device, then
// drop the list entry.
// NOTE(review): the Var itself is not deleted here - presumably it is owned
// elsewhere; confirm.
auto it = managedVars_.begin();
while (it != managedVars_.end()) {
if ((*it)->moduleInfo() == module) {
for (auto dev : g_devices) {
DeviceVar* dvar = nullptr;
IHIP_RETURN_ONFAIL((*it)->getStatDeviceVar(&dvar, dev->deviceId()));
// free also deletes the device ptr
hipError_t err = ihipFree(dvar->device_ptr());
assert(err == hipSuccess);
}
it = managedVars_.erase(it);
} else {
++it;
}
}
// 3) Functions: delete the Function and drop the map entry.
auto fit = functions_.begin();
while (fit != functions_.end()) {
if (fit->second->moduleInfo() == module) {
delete fit->second;
fit = functions_.erase(fit);
} else {
++fit;
}
}
// 4) The module slot itself: matched by address because 'module' points at
// the mapped value inside modules_.
auto mit = modules_.begin();
while (mit != modules_.end()) {
if (&mit->second == module) {
delete mit->second;
mit = modules_.erase(mit);
} else {
++mit;
}
}
return hipSuccess;
}
// Registers 'func' under the host function pointer 'hostFunction'.
// A duplicate key is only logged (developer log); the insert is still
// attempted and hipSuccess is returned either way.
hipError_t StatCO::registerStatFunction(const void* hostFunction, Function* func) {
  amd::ScopedLock lock(sclock_);

  const bool already_known = functions_.find(hostFunction) != functions_.end();
  if (already_known) {
    DevLogPrintfError("hostFunctionPtr: 0x%x already exists", hostFunction);
  }
  functions_.insert(std::make_pair(hostFunction, func));
  return hipSuccess;
}
// Returns the registered name for 'hostFunction', or nullptr when the pointer
// is unknown. The returned C string belongs to the Function object.
const char* StatCO::getStatFuncName(const void* hostFunction) {
  amd::ScopedLock lock(sclock_);

  const auto found = functions_.find(hostFunction);
  return (found != functions_.end()) ? found->second->name().c_str() : nullptr;
}
// Resolves 'hostFunction' to a hipFunction_t for 'deviceId'.
// Returns hipErrorInvalidSymbol for an unknown host pointer, otherwise the
// result of Function::getStatFunc().
hipError_t StatCO::getStatFunc(hipFunction_t* hfunc, const void* hostFunction, int deviceId) {
  amd::ScopedLock lock(sclock_);

  const auto found = functions_.find(hostFunction);
  if (found != functions_.end()) {
    return found->second->getStatFunc(hfunc, deviceId);
  }
  return hipErrorInvalidSymbol;
}
// Fetches the attributes of the function registered under 'hostFunction' for
// 'deviceId'. Returns hipErrorInvalidSymbol for an unknown host pointer,
// otherwise the result of Function::getStatFuncAttr().
hipError_t StatCO::getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction,
                                   int deviceId) {
  amd::ScopedLock lock(sclock_);

  const auto found = functions_.find(hostFunction);
  if (found != functions_.end()) {
    return found->second->getStatFuncAttr(func_attr, deviceId);
  }
  return hipErrorInvalidSymbol;
}
// Registers 'var' under the host variable pointer 'hostVar'.
// Unlike registerStatFunction(), a duplicate key is rejected with
// hipErrorInvalidSymbol and nothing is inserted.
hipError_t StatCO::registerStatGlobalVar(const void* hostVar, Var* var) {
  amd::ScopedLock lock(sclock_);

  const bool already_known = vars_.find(hostVar) != vars_.end();
  if (already_known) {
    return hipErrorInvalidSymbol;
  }
  vars_.insert(std::make_pair(hostVar, var));
  return hipSuccess;
}
// Looks up the device address and size of the variable registered under
// 'hostVar' for 'deviceId'.
// Returns hipErrorInvalidSymbol for an unknown host pointer, the failing
// status of getStatDeviceVar(), or hipSuccess with both outputs written.
hipError_t StatCO::getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr,
                                    size_t* size_ptr) {
  amd::ScopedLock lock(sclock_);

  const auto found = vars_.find(hostVar);
  if (found == vars_.end()) {
    return hipErrorInvalidSymbol;
  }

  DeviceVar* device_var = nullptr;
  IHIP_RETURN_ONFAIL(found->second->getStatDeviceVar(&device_var, deviceId));

  *dev_ptr = device_var->device_ptr();
  *size_ptr = device_var->size();
  return hipSuccess;
}
// Appends 'var' to the list of managed variables; always returns hipSuccess.
// NOTE(review): unlike the other StatCO methods this one does not take
// sclock_ - confirm whether callers serialise access themselves.
hipError_t StatCO::registerStatManagedVar(Var* var) {
  managedVars_.push_back(var);
  return hipSuccess;
}
// Writes the managed-memory pointer of every registered managed variable into
// its device symbol on 'deviceId'. Runs once per device: after a fully
// successful pass the device is flagged and later calls return immediately.
//
// Returns hipSuccess, the failing status of getStatDeviceVar() / ihipMemcpy(),
// or hipErrorInvalidResourceHandle when the device has no null stream.
// On any failure the device is NOT flagged as initialised, so a later call
// retries; previously a failed copy could be overwritten by a later success
// and the device was flagged regardless.
hipError_t StatCO::initStatManagedVarDevicePtr(int deviceId) {
  amd::ScopedLock lock(sclock_);

  const auto state = managedVarsDevicePtrInitalized_.find(deviceId);
  if (state != managedVarsDevicePtrInitalized_.end() && state->second) {
    return hipSuccess;
  }

  for (auto var : managedVars_) {
    DeviceVar* dvar = nullptr;
    IHIP_RETURN_ONFAIL(var->getStatDeviceVar(&dvar, deviceId));

    hip::Stream* stream = g_devices.at(deviceId)->NullStream();
    if (stream == nullptr) {
      ClPrint(amd::LOG_ERROR, amd::LOG_API, "Host Queue is NULL");
      return hipErrorInvalidResourceHandle;
    }

    // Propagate the first copy failure instead of continuing with the next
    // variable and losing the error.
    IHIP_RETURN_ONFAIL(ihipMemcpy(reinterpret_cast<address>(dvar->device_ptr()),
                                  var->getManagedVarPtr(), dvar->size(),
                                  hipMemcpyHostToDevice, *stream));
  }

  managedVarsDevicePtrInitalized_[deviceId] = true;
  return hipSuccess;
}
}; // namespace hip
+168
查看文件
@@ -0,0 +1,168 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_CODE_OBJECT_HPP
#define HIP_CODE_OBJECT_HPP
#include "hip_global.hpp"
#include <cstring>
#include <unordered_map>
#include "hip/hip_runtime.h"
#include "hip/hip_runtime_api.h"
#include "hip_internal.hpp"
#include "device/device.hpp"
#include "platform/program.hpp"
//Forward Declaration for friend usage
class PlatformState;
namespace hip {
// Base class shared by dynamically loaded (DynCO) and statically registered (StatCO) code
// objects. It only offers static helpers; the constructor is protected so that only derived
// classes can be instantiated.
class CodeObject {
public:
virtual ~CodeObject() {}
// Adds a device program for `deviceId` to module `hmod` from the binary at
// [binary_ptr, binary_ptr + binary_size).
static hipError_t add_program(int deviceId, hipModule_t hmod, const void* binary_ptr,
size_t binary_size);
// Builds module `hmod` for the given devices.
static hipError_t build_module(hipModule_t hmod, const std::vector<amd::Device*>& devices);
// Given a file descriptor and file size, extracts the code objects for the corresponding
// devices. Returns code_objs{binary_ptr, binary_size}, which can be used to determine the
// file offset.
static hipError_t ExtractCodeObjectFromFile(amd::Os::FileDesc fdesc, size_t fsize,
const void ** image, const std::vector<std::string>& device_names,
std::vector<std::pair<const void*, size_t>>& code_objs);
// Given a pointer to memory, extracts the code objects for the corresponding devices.
// Returns code_objs{binary_ptr, binary_size} and a uniform resource indicator (uri).
static hipError_t ExtractCodeObjectFromMemory(const void* data,
const std::vector<std::string>& device_names,
std::vector<std::pair<const void*, size_t>>& code_objs,
std::string& uri);
// Size of the ELF image pointed to by `emi` (presumably in bytes - confirm in the definition).
static uint64_t ElfSize(const void* emi);
// Reports whether `data` starts with the clang offload bundle magic, judging by the name;
// the definition is not visible here.
static bool IsClangOffloadMagicBundle(const void* data);
protected:
// Given a pointer to an image or file, extracts the code objects
// for the corresponding devices.
static hipError_t extractCodeObjectFromFatBinary(const void*,
const std::vector<std::string>&,
std::vector<std::pair<const void*, size_t>>&);
CodeObject() {}
private:
friend const std::vector<hipModule_t>& modules();
};
// Dynamic code object: a module loaded at run time from a file or an in-memory image.
// The device that was current at construction time is remembered in device_id_.
class DynCO : public CodeObject {
// Monitor intended to guard this object's state (see its label).
amd::Monitor dclock_{"Guards Dynamic Code object", true};
public:
// Captures the calling thread's current device; no fat binary is attached yet.
DynCO() : device_id_(ihipGetDevice()), fb_info_(nullptr) {}
virtual ~DynCO();
// Loads the code object and its data from `fname`, or from `image` when one is supplied.
hipError_t loadCodeObject(const char* fname, const void* image=nullptr);
// Module handle for the calling thread's current device. Dereferences fb_info_, which is
// nullptr until set elsewhere (presumably by loadCodeObject - confirm).
hipModule_t module() const { return fb_info_->Module(ihipGetDevice()); };
// Gets global variables/functions from a dynamically loaded code object.
hipError_t getDynFunc(hipFunction_t* hfunc, std::string func_name);
hipError_t getDeviceVar(DeviceVar** dvar, std::string var_name);
// Writes the managed pointer and size of variable `name` to the outputs when `name` is known
// and is of kind Var::DVK_Managed. Otherwise the outputs are left untouched.
// Returns hipSuccess in both cases, so callers must pre-initialize *pointer to detect a miss.
hipError_t getManagedVarPointer(std::string name, void** pointer, size_t* size_ptr) const {
auto it = vars_.find(name);
if (it != vars_.end() && it->second->getVarKind() == Var::DVK_Managed) {
*pointer = it->second->getManagedVarPtr();
*size_ptr = it->second->getSize();
}
return hipSuccess;
}
// Checks that the module is used on the same device it was loaded on; a mismatch is reported
// through guarantee(false, ...).
inline void CheckDeviceIdMatch() const {
if (device_id_ != ihipGetDevice()) {
guarantee(false, "Device mismatch from where this module is loaded");
}
}
private:
// Device that was current when this object was constructed.
int device_id_;
// Fat binary backing this module; nullptr after construction.
FatBinaryInfo* fb_info_;
// Maps for variables/functions, keyed by their std::string name.
std::unordered_map<std::string, Function*> functions_;
std::unordered_map<std::string, Var*> vars_;
// Populate global variables/functions from a code object (at module load).
hipError_t populateDynGlobalFuncs();
hipError_t populateDynGlobalVars();
hipError_t initDynManagedVars(const std::string& managedVar);
};
// Static code object: bookkeeping for fat binaries, functions and variables that are
// registered through the __hipRegister* entry points.
class StatCO: public CodeObject {
// Monitor taken by the member functions that access the maps below.
amd::Monitor sclock_{"Guards Static Code object", true};
public:
StatCO();
virtual ~StatCO();
// Add/remove/digest fat binaries passed to us from "__hipRegisterFatBinary".
FatBinaryInfo** addFatBinary(const void* data, bool initialized);
hipError_t removeFatBinary(FatBinaryInfo** module);
hipError_t digestFatBinary(const void* data, FatBinaryInfo*& programs);
// Register variables/functions given to us from __hipRegister[Var/Func/ManagedVar].
hipError_t registerStatFunction(const void* hostFunction, Function* func);
hipError_t registerStatGlobalVar(const void* hostVar, Var* var);
hipError_t registerStatManagedVar(Var *var);
// Retrieve variables/functions for a given host-side pointer (const void*), unless stated
// otherwise.
const char* getStatFuncName(const void* hostFunction);
hipError_t getStatFunc(hipFunction_t* hfunc, const void* hostFunction, int deviceId);
hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId);
hipError_t getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr,
size_t* size_ptr);
// A managed variable is a defined symbol in the code object; the pointer to the allocated
// managed memory has to be copied to the address of that symbol.
hipError_t initStatManagedVarDevicePtr(int deviceId);
private:
friend class ::PlatformState;
// Populated during __hipRegisterFatBinary
std::unordered_map<const void*, FatBinaryInfo*> modules_;
// Populated during __hipRegisterFuncs
std::unordered_map<const void*, Function*> functions_;
// Populated during __hipRegisterVars
std::unordered_map<const void*, Var*> vars_;
// Populated during __hipRegisterManagedVar
std::vector<Var*> managedVars_;
// Per-device flag: true once initStatManagedVarDevicePtr() has run for that device id.
std::unordered_map<int, bool> managedVarsDevicePtrInitalized_;
};
}; // namespace hip
#endif /* HIP_CODE_OBJECT_HPP */
+402
查看文件
@@ -0,0 +1,402 @@
/* Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include <hip/hip_runtime.h>
#include "hip_internal.hpp"
#include "hip_platform.hpp"
#include "platform/runtime.hpp"
#include "utils/flags.hpp"
#include "utils/versions.hpp"
std::vector<hip::Device*> g_devices;
namespace hip {
thread_local TlsAggregator tls;
amd::Context* host_context = nullptr;
// One-time HIP runtime start-up; must be called exactly once, from the HIP_INIT macro.
//
// Steps performed:
//   1. Flags the runtime as HIP and selects the direct-dispatch default (enabled on Linux
//      unless DISABLE_DIRECT_DISPATCH is set or the user overrode AMD_DIRECT_DISPATCH).
//   2. Initializes the amd runtime.
//   3. Creates one context plus hip::Device per GPU and appends it to g_devices. A GPU whose
//      context cannot be created is skipped.
//   4. Creates the host context spanning all GPUs and publishes it in host_context.
//   5. Initializes PlatformState.
//
// Returns false when the amd runtime fails to initialize or a hip::Device cannot be created.
bool init() {
  amd::IS_HIP = true;
  GPU_NUM_MEM_DEPENDENCY = 0;
#if DISABLE_DIRECT_DISPATCH
  constexpr bool kDirectDispatch = false;
#else
  constexpr bool kDirectDispatch = IS_LINUX;
#endif
  AMD_DIRECT_DISPATCH = flagIsDefault(AMD_DIRECT_DISPATCH) ? kDirectDispatch : AMD_DIRECT_DISPATCH;
  if (!amd::Runtime::init()) {
    return false;
  }
  ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Direct Dispatch: %d", AMD_DIRECT_DISPATCH);

  const std::vector<amd::Device*>& devices = amd::Device::getDevices(CL_DEVICE_TYPE_GPU, false);
  for (unsigned int i = 0; i < devices.size(); i++) {
    const std::vector<amd::Device*> singleDevice(1, devices[i]);
    amd::Context* context = new amd::Context(singleDevice, amd::Context::Info());
    if (!context) return false;

    // Enable active wait on the device by default
    devices[i]->SetActiveWait(true);

    if (CL_SUCCESS != context->create(nullptr)) {
      // This GPU is unusable; drop its context and carry on with the others.
      context->release();
    } else {
      auto hipDevice = new Device(context, i);
      if ((hipDevice == nullptr) || !hipDevice->Create()) {
        return false;
      }
      g_devices.push_back(hipDevice);
    }
  }

  amd::Context* hContext = new amd::Context(devices, amd::Context::Info());
  if (!hContext) return false;

  if (CL_SUCCESS != hContext->create(nullptr)) {
    // The context has just been released, so it must not be published: leave host_context
    // null instead of storing a dangling pointer.
    hContext->release();
    hContext = nullptr;
    ClPrint(amd::LOG_ERROR, amd::LOG_INIT, "Failed to create the host context");
  }
  host_context = hContext;

  PlatformState::instance().init();
  return true;
}
// Returns the device stored in this thread's TLS slot (tls.device_).
Device* getCurrentDevice() {
return tls.device_;
}
void setCurrentDevice(unsigned int index) {
assert(index<g_devices.size());
tls.device_ = g_devices[index];
uint32_t preferredNumaNode = (tls.device_)->devices()[0]->getPreferredNumaNode();
amd::Os::setPreferredNumaNode(preferredNumaNode);
}
// Translates a public stream handle into the internal hip::Stream.
// A null handle maps to the current device's null stream. For a stream that is not flagged
// hipStreamNonBlocking, iHipWaitActiveStreams() is invoked first, restricted to the null
// stream.
hip::Stream* getStream(hipStream_t stream) {
  if (stream == nullptr) {
    return getNullStream();
  }
  hip::Stream* const userStream = reinterpret_cast<hip::Stream*>(stream);
  const bool isBlocking = !(userStream->Flags() & hipStreamNonBlocking);
  if (isBlocking) {
    constexpr bool WaitNullStreamOnly = true;
    iHipWaitActiveStreams(userStream, WaitNullStreamOnly);
  }
  return userStream;
}
// ================================================================================================
// Returns the null stream of the device whose context is `ctx`.
// The host context is not tied to a device (pure SVM allocation with system memory access),
// so for it the calling thread's current device is used. Unknown contexts yield nullptr.
hip::Stream* getNullStream(amd::Context& ctx) {
  amd::Context* const wanted = &ctx;
  for (auto& device : g_devices) {
    if (device->asContext() == wanted) {
      return device->NullStream();
    }
  }
  // It should not matter which device the runtime selects for the host context.
  return (hip::host_context == wanted) ? getNullStream() : nullptr;
}
// ================================================================================================
// Returns the id of the device whose context is `ctx`, or -1 when no device in g_devices
// owns that context.
int getDeviceID(amd::Context& ctx) {
  int foundId = -1;
  for (auto& device : g_devices) {
    if (device->asContext() == &ctx) {
      foundId = device->deviceId();
      break;
    }
  }
  return foundId;
}
// ================================================================================================
// Returns the null stream of the calling thread's current device, or nullptr when the thread
// has no current device.
hip::Stream* getNullStream() {
  Device* const current = getCurrentDevice();
  if (current == nullptr) {
    return nullptr;
  }
  return current->NullStream();
}
};
using namespace hip;
// Public entry point for explicit initialization. The actual work is done by HIP_INIT_API;
// this function only validates that `flags` is zero.
// Returns hipErrorInvalidValue for a non-zero `flags`, hipSuccess otherwise.
hipError_t hipInit(unsigned int flags) {
  HIP_INIT_API(hipInit, flags);
  const hipError_t status = (flags == 0) ? hipSuccess : hipErrorInvalidValue;
  HIP_RETURN(status);
}
// Hands out the primary context of `device` as a new context handle and pushes it onto the
// calling thread's context stack. `flags` is accepted but not used.
//
// Returns hipErrorInvalidValue when `ctx` is null or `device` is out of range (a negative
// ordinal becomes a huge size_t and is rejected by the same comparison), hipSuccess otherwise.
hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device) {
  HIP_INIT_API(hipCtxCreate, ctx, flags, device);

  // The output pointer is written unconditionally below, so reject null up front.
  if (ctx == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  if (static_cast<size_t>(device) >= g_devices.size()) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  *ctx = reinterpret_cast<hipCtx_t>(g_devices[device]);

  // Increment ref count for device primary context
  g_devices[device]->retain();
  tls.ctxt_stack_.push(g_devices[device]);

  HIP_RETURN(hipSuccess);
}
// Replaces the top of the calling thread's context stack with `ctx`.
// The current top (if any) is always popped. When `ctx` is non-null it also becomes the
// thread's current device and is pushed as the new top. Always returns hipSuccess.
hipError_t hipCtxSetCurrent(hipCtx_t ctx) {
  HIP_INIT_API(hipCtxSetCurrent, ctx);

  const bool hasNewContext = (ctx != nullptr);
  if (hasNewContext) {
    hip::tls.device_ = reinterpret_cast<hip::Device*>(ctx);
  }

  if (!tls.ctxt_stack_.empty()) {
    tls.ctxt_stack_.pop();
  }

  if (hasNewContext) {
    tls.ctxt_stack_.push(hip::getCurrentDevice());
  }

  HIP_RETURN(hipSuccess);
}
// Writes the calling thread's current device, viewed as a context handle, to `*ctx`.
// Returns hipErrorInvalidValue when `ctx` is null, hipSuccess otherwise.
hipError_t hipCtxGetCurrent(hipCtx_t* ctx) {
  HIP_INIT_API(hipCtxGetCurrent, ctx);

  // Guard the output pointer; it was previously dereferenced unchecked.
  if (ctx == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  *ctx = reinterpret_cast<hipCtx_t>(hip::getCurrentDevice());

  HIP_RETURN(hipSuccess);
}
// Reports the shared-memory bank configuration, which this implementation hard-codes to
// hipSharedMemBankSizeFourByte.
// Returns hipErrorInvalidValue when `pConfig` is null, hipSuccess otherwise.
hipError_t hipCtxGetSharedMemConfig(hipSharedMemConfig* pConfig) {
  HIP_INIT_API(hipCtxGetSharedMemConfig, pConfig);

  // Guard the output pointer; it was previously dereferenced unchecked.
  if (pConfig == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  *pConfig = hipSharedMemBankSizeFourByte;

  HIP_RETURN(hipSuccess);
}
// Writes the HIP_VERSION macro to `*runtimeVersion`.
// Returns hipErrorInvalidValue when the output pointer is null, hipSuccess otherwise.
// NOTE(review): how HIP_VERSION is composed from major/minor is defined outside this file -
// consult the version header rather than relying on a formula here.
hipError_t hipRuntimeGetVersion(int *runtimeVersion) {
  HIP_INIT_API_NO_RETURN(hipRuntimeGetVersion, runtimeVersion);

  if (runtimeVersion != nullptr) {
    *runtimeVersion = HIP_VERSION;
    HIP_RETURN(hipSuccess);
  }

  HIP_RETURN(hipErrorInvalidValue);
}
// Releases a context handle obtained from hipCtxCreate().
// If `ctx` is the top of the calling thread's context stack it is popped. If it matches an
// entry of g_devices, that device's reference count is decremented; nothing is removed from
// g_devices itself.
// Returns hipErrorInvalidValue for a null handle, hipSuccess otherwise (also when `ctx` does
// not match any known device).
hipError_t hipCtxDestroy(hipCtx_t ctx) {
  HIP_INIT_API(hipCtxDestroy, ctx);

  hip::Device* const dev = reinterpret_cast<hip::Device*>(ctx);
  if (dev == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  // Need to remove the ctx of calling thread if it is the top one
  const bool isTopOfStack = !tls.ctxt_stack_.empty() && tls.ctxt_stack_.top() == dev;
  if (isTopOfStack) {
    tls.ctxt_stack_.pop();
  }

  for (hip::Device* candidate : g_devices) {
    if (candidate == dev) {
      // Decrement ref count for device primary context
      dev->release();
    }
  }

  HIP_RETURN(hipSuccess);
}
// Pops the top of the calling thread's context stack and, when `ctx` is non-null, hands the
// popped context back through it.
// Returns hipErrorInvalidContext when the stack is empty, hipSuccess otherwise.
hipError_t hipCtxPopCurrent(hipCtx_t* ctx) {
  HIP_INIT_API(hipCtxPopCurrent, ctx);

  if (tls.ctxt_stack_.empty()) {
    DevLogError("Context Stack empty \n");
    HIP_RETURN(hipErrorInvalidContext);
  }

  hip::Device** const poppedOut = reinterpret_cast<hip::Device**>(ctx);
  if (poppedOut != nullptr) {
    *poppedOut = tls.ctxt_stack_.top();
  }
  tls.ctxt_stack_.pop();

  HIP_RETURN(hipSuccess);
}
// Makes `ctx` the calling thread's current device and pushes it onto the thread's context
// stack.
// Returns hipErrorInvalidContext for a null handle, hipSuccess otherwise.
hipError_t hipCtxPushCurrent(hipCtx_t ctx) {
  HIP_INIT_API(hipCtxPushCurrent, ctx);

  hip::Device* const pushed = reinterpret_cast<hip::Device*>(ctx);
  if (pushed != nullptr) {
    hip::tls.device_ = pushed;
    // getCurrentDevice() reads back the TLS slot that was just assigned.
    tls.ctxt_stack_.push(hip::getCurrentDevice());
    HIP_RETURN(hipSuccess);
  }

  HIP_RETURN(hipErrorInvalidContext);
}
// Writes the HIP_VERSION macro to `*driverVersion`; this implementation reports the same
// value as hipRuntimeGetVersion().
// Returns hipErrorInvalidValue when the output pointer is null, hipSuccess otherwise.
hipError_t hipDriverGetVersion(int* driverVersion) {
  HIP_INIT_API_NO_RETURN(hipDriverGetVersion, driverVersion);

  if (driverVersion != nullptr) {
    *driverVersion = HIP_VERSION;
    HIP_RETURN(hipSuccess);
  }

  HIP_RETURN(hipErrorInvalidValue);
}
// Writes the id of the calling thread's current device to `*device`.
//
// Returns hipErrorInvalidValue when `device` is null, hipErrorInvalidContext when the thread
// has no current device, and hipSuccess otherwise.
// The hipErrorInvalidContext return used to sit after an if/else in which both arms return,
// so it was unreachable, and the current device was dereferenced without a check.
hipError_t hipCtxGetDevice(hipDevice_t* device) {
  HIP_INIT_API(hipCtxGetDevice, device);

  if (device == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  hip::Device* const current = hip::getCurrentDevice();
  if (current == nullptr) {
    HIP_RETURN(hipErrorInvalidContext);
  }

  *device = current->deviceId();
  HIP_RETURN(hipSuccess);
}
// Not implemented: trips assert() where assertions are enabled, otherwise reports
// hipErrorNotSupported. `apiVersion` is never written.
// Both arguments are now passed to HIP_INIT_API so the API trace shows `ctx` as well; it was
// previously omitted.
hipError_t hipCtxGetApiVersion(hipCtx_t ctx, int* apiVersion) {
  HIP_INIT_API(hipCtxGetApiVersion, ctx, apiVersion);

  assert(0 && "Unimplemented");

  HIP_RETURN(hipErrorNotSupported);
}
// Not implemented: trips assert() where assertions are enabled, otherwise reports
// hipErrorNotSupported. `cacheConfig` is never written.
hipError_t hipCtxGetCacheConfig(hipFuncCache_t* cacheConfig) {
HIP_INIT_API(hipCtxGetCacheConfig, cacheConfig);
assert(0 && "Unimplemented");
HIP_RETURN(hipErrorNotSupported);
}
// Not implemented: trips assert() where assertions are enabled, otherwise reports
// hipErrorNotSupported. `cacheConfig` is ignored.
hipError_t hipCtxSetCacheConfig(hipFuncCache_t cacheConfig) {
HIP_INIT_API(hipCtxSetCacheConfig, cacheConfig);
assert(0 && "Unimplemented");
HIP_RETURN(hipErrorNotSupported);
}
// Not implemented: trips assert() where assertions are enabled, otherwise reports
// hipErrorNotSupported. `config` is ignored.
hipError_t hipCtxSetSharedMemConfig(hipSharedMemConfig config) {
HIP_INIT_API(hipCtxSetSharedMemConfig, config);
assert(0 && "Unimplemented");
HIP_RETURN(hipErrorNotSupported);
}
// Not implemented: trips assert() where assertions are enabled, otherwise reports
// hipErrorNotSupported.
hipError_t hipCtxSynchronize(void) {
// The API takes no parameters; the literal 1 is a placeholder argument for the macro.
HIP_INIT_API(hipCtxSynchronize, 1);
assert(0 && "Unimplemented");
HIP_RETURN(hipErrorNotSupported);
}
// Not implemented: trips assert() where assertions are enabled, otherwise reports
// hipErrorNotSupported. `flags` is never written.
hipError_t hipCtxGetFlags(unsigned int* flags) {
HIP_INIT_API(hipCtxGetFlags, flags);
assert(0 && "Unimplemented");
HIP_RETURN(hipErrorNotSupported);
}
// Reports the state of the primary context of device `dev`.
// `*flags` (optional) is always set to 0; `*active` (optional) is set to 1 when the device
// reports an active status and to 0 otherwise.
// Returns hipErrorInvalidDevice for an out-of-range `dev`, hipSuccess otherwise.
hipError_t hipDevicePrimaryCtxGetState(hipDevice_t dev, unsigned int* flags, int* active) {
  HIP_INIT_API(hipDevicePrimaryCtxGetState, dev, flags, active);

  const bool devInRange = static_cast<unsigned int>(dev) < g_devices.size();
  if (!devInRange) {
    HIP_RETURN(hipErrorInvalidDevice);
  }

  if (flags != nullptr) {
    *flags = 0;
  }

  if (active != nullptr) {
    if (g_devices[dev]->GetActiveStatus()) {
      *active = 1;
    } else {
      *active = 0;
    }
  }

  HIP_RETURN(hipSuccess);
}
// Validates `dev` only; no reference count is changed here.
// Returns hipErrorInvalidDevice for an out-of-range `dev`, hipSuccess otherwise.
hipError_t hipDevicePrimaryCtxRelease(hipDevice_t dev) {
HIP_INIT_API(hipDevicePrimaryCtxRelease, dev);
// A negative ordinal wraps to a large unsigned value and is rejected by the same comparison.
if (static_cast<unsigned int>(dev) >= g_devices.size()) {
HIP_RETURN(hipErrorInvalidDevice);
}
HIP_RETURN(hipSuccess);
}
// Hands out the primary context of device `dev` through `*pctx`. No reference count is
// changed here.
// Returns hipErrorInvalidDevice for an out-of-range `dev` (checked first), then
// hipErrorInvalidValue for a null `pctx`, and hipSuccess otherwise.
hipError_t hipDevicePrimaryCtxRetain(hipCtx_t* pctx, hipDevice_t dev) {
  HIP_INIT_API(hipDevicePrimaryCtxRetain, pctx, dev);

  const bool devInRange = static_cast<unsigned int>(dev) < g_devices.size();
  if (!devInRange) {
    HIP_RETURN(hipErrorInvalidDevice);
  }
  if (!pctx) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  hip::Device* const primary = g_devices[dev];
  *pctx = reinterpret_cast<hipCtx_t>(primary);

  HIP_RETURN(hipSuccess);
}
// Performs no work beyond HIP_INIT_API: `dev` is not validated and hipSuccess is always
// returned.
hipError_t hipDevicePrimaryCtxReset(hipDevice_t dev) {
HIP_INIT_API(hipDevicePrimaryCtxReset, dev);
HIP_RETURN(hipSuccess);
}
// Never changes any flag: returns hipErrorInvalidDevice for an out-of-range `dev` and
// hipErrorContextAlreadyInUse for every valid device. `flags` is ignored.
hipError_t hipDevicePrimaryCtxSetFlags(hipDevice_t dev, unsigned int flags) {
  HIP_INIT_API(hipDevicePrimaryCtxSetFlags, dev, flags);

  const bool devInRange = static_cast<unsigned int>(dev) < g_devices.size();
  if (devInRange) {
    HIP_RETURN(hipErrorContextAlreadyInUse);
  }

  HIP_RETURN(hipErrorInvalidDevice);
}
+944
查看文件
@@ -0,0 +1,944 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <hip/driver_types.h>
#include <hip/texture_types.h>
namespace hip
{
// Maps a HIP array format plus texture read mode to the OpenCL channel data type.
// In hipReadModeNormalizedFloat the 8- and 16-bit integer formats map to their UNORM/SNORM
// counterparts; 32-bit integer, half and float formats map the same way in both modes.
// Any other read mode or format yields a value-initialized cl_channel_type.
inline
cl_channel_type getCLChannelType(const hipArray_Format hipFormat,
                                 const hipTextureReadMode hipReadMode) {
  const bool elementType = (hipReadMode == hipReadModeElementType);
  const bool normalized = (hipReadMode == hipReadModeNormalizedFloat);
  if (!elementType && !normalized) {
    // error scenario
    return {};
  }

  switch (hipFormat) {
    case HIP_AD_FORMAT_UNSIGNED_INT8:
      return elementType ? CL_UNSIGNED_INT8 : CL_UNORM_INT8;
    case HIP_AD_FORMAT_SIGNED_INT8:
      return elementType ? CL_SIGNED_INT8 : CL_SNORM_INT8;
    case HIP_AD_FORMAT_UNSIGNED_INT16:
      return elementType ? CL_UNSIGNED_INT16 : CL_UNORM_INT16;
    case HIP_AD_FORMAT_SIGNED_INT16:
      return elementType ? CL_SIGNED_INT16 : CL_SNORM_INT16;
    case HIP_AD_FORMAT_UNSIGNED_INT32:
      return CL_UNSIGNED_INT32;
    case HIP_AD_FORMAT_SIGNED_INT32:
      return CL_SIGNED_INT32;
    case HIP_AD_FORMAT_HALF:
      return CL_HALF_FLOAT;
    case HIP_AD_FORMAT_FLOAT:
      return CL_FLOAT;
  }

  // error scenario
  return {};
}
// Maps a channel count to the OpenCL channel order: 1 -> CL_R, 2 -> CL_RG, 4 -> CL_RGBA
// (CL_sRGBA when `sRGB` equals 1). Other counts yield a value-initialized cl_channel_order.
inline
cl_channel_order getCLChannelOrder(const unsigned int hipNumChannels,
                                   const int sRGB) {
  if (hipNumChannels == 1) {
    return CL_R;
  }
  if (hipNumChannels == 2) {
    return CL_RG;
  }
  if (hipNumChannels == 4) {
    if (sRGB == 1) {
      return CL_sRGBA;
    }
    return CL_RGBA;
  }

  // error scenario
  return {};
}
// Derives the OpenCL image type from array extents and flags.
//   hipArrayDefault: width only -> 1D, width+height -> 2D, width+height+depth -> 3D.
//   hipArrayLayered: width+depth -> 1D array, width+height+depth -> 2D array.
// Every other combination (including a zero width) yields CL_MEM_OBJECT_ALLOCATION_FAILURE.
inline
cl_mem_object_type getCLMemObjectType(const unsigned int hipWidth,
                                      const unsigned int hipHeight,
                                      const unsigned int hipDepth,
                                      const unsigned int flags) {
  const bool hasWidth = (hipWidth != 0);
  const bool hasHeight = (hipHeight != 0);
  const bool hasDepth = (hipDepth != 0);

  if (hasWidth) {
    if (flags == hipArrayDefault) {
      if (hasHeight) {
        return hasDepth ? CL_MEM_OBJECT_IMAGE3D : CL_MEM_OBJECT_IMAGE2D;
      }
      if (!hasDepth) {
        return CL_MEM_OBJECT_IMAGE1D;
      }
    } else if (flags == hipArrayLayered) {
      if (hasDepth) {
        return hasHeight ? CL_MEM_OBJECT_IMAGE2D_ARRAY : CL_MEM_OBJECT_IMAGE1D_ARRAY;
      }
    }
  }

  // error scenario. ShouldNotReachHere()
  return CL_MEM_OBJECT_ALLOCATION_FAILURE;
}
// Maps a HIP texture address mode to the OpenCL addressing mode:
// Wrap -> REPEAT, Clamp -> CLAMP_TO_EDGE, Mirror -> MIRRORED_REPEAT, Border -> CLAMP.
// Unknown values yield a value-initialized cl_addressing_mode.
inline
cl_addressing_mode getCLAddressingMode(const hipTextureAddressMode hipAddressMode) {
  if (hipAddressMode == hipAddressModeWrap) {
    return CL_ADDRESS_REPEAT;
  }
  if (hipAddressMode == hipAddressModeClamp) {
    return CL_ADDRESS_CLAMP_TO_EDGE;
  }
  if (hipAddressMode == hipAddressModeMirror) {
    return CL_ADDRESS_MIRRORED_REPEAT;
  }
  if (hipAddressMode == hipAddressModeBorder) {
    return CL_ADDRESS_CLAMP;
  }

  // error scenario
  return {};
}
// Maps a HIP texture filter mode to the OpenCL filter mode:
// Point -> NEAREST, Linear -> LINEAR. Unknown values yield a value-initialized cl_filter_mode.
inline
cl_filter_mode getCLFilterMode(const hipTextureFilterMode hipFilterMode) {
  if (hipFilterMode == hipFilterModePoint) {
    return CL_FILTER_NEAREST;
  }
  if (hipFilterMode == hipFilterModeLinear) {
    return CL_FILTER_LINEAR;
  }

  // error scenario
  return {};
}
// Maps a HIP resource type to the OpenCL image type:
// Linear -> IMAGE1D_BUFFER, Pitch2D -> IMAGE2D. Other resource types yield a
// value-initialized cl_mem_object_type.
inline
cl_mem_object_type getCLMemObjectType(const hipResourceType hipResType) {
  if (hipResType == hipResourceTypeLinear) {
    return CL_MEM_OBJECT_IMAGE1D_BUFFER;
  }
  if (hipResType == hipResourceTypePitch2D) {
    return CL_MEM_OBJECT_IMAGE2D;
  }

  // error scenario
  return {};
}
// Maps an OpenCL channel data type back to the HIP array format; this is the inverse of
// getCLChannelType(). Normalized types map to the integer format of the same width and
// signedness.
//
// CL_HALF_FLOAT, CL_UNORM_INT16 and CL_SNORM_INT16 are all produced by getCLChannelType() but
// used to fall into the default and come back as HIP_AD_FORMAT_UNSIGNED_INT8, i.e. with the
// wrong element width. They now round-trip.
// Unrecognized types still map to HIP_AD_FORMAT_UNSIGNED_INT8.
inline
hipArray_Format getCL2hipArrayFormat(const cl_channel_type type) {
  switch (type) {
    case CL_SNORM_INT8:
    case CL_SIGNED_INT8:
      return HIP_AD_FORMAT_SIGNED_INT8;
    case CL_UNORM_INT16:
    case CL_UNSIGNED_INT16:
      return HIP_AD_FORMAT_UNSIGNED_INT16;
    case CL_SNORM_INT16:
    case CL_SIGNED_INT16:
      return HIP_AD_FORMAT_SIGNED_INT16;
    case CL_SIGNED_INT32:
      return HIP_AD_FORMAT_SIGNED_INT32;
    case CL_UNSIGNED_INT32:
      return HIP_AD_FORMAT_UNSIGNED_INT32;
    case CL_HALF_FLOAT:
      return HIP_AD_FORMAT_HALF;
    case CL_FLOAT:
      return HIP_AD_FORMAT_FLOAT;
    case CL_UNSIGNED_INT8:
    case CL_UNORM_INT8:
    case CL_UNORM_INT_101010:
    default:
      return HIP_AD_FORMAT_UNSIGNED_INT8;
  }
}
// Size in bytes of one element of `array`: bytes per channel (1, 2 or 4, from the array
// format) multiplied by the number of channels. An unknown format yields 0.
inline
size_t getElementSize(const hipArray_const_t array) {
  int bytesPerChannel = 0;
  switch (array->Format) {
    case HIP_AD_FORMAT_UNSIGNED_INT8:
    case HIP_AD_FORMAT_SIGNED_INT8:
      bytesPerChannel = 1;
      break;
    case HIP_AD_FORMAT_UNSIGNED_INT16:
    case HIP_AD_FORMAT_SIGNED_INT16:
    case HIP_AD_FORMAT_HALF:
      bytesPerChannel = 2;
      break;
    case HIP_AD_FORMAT_UNSIGNED_INT32:
    case HIP_AD_FORMAT_SIGNED_INT32:
    case HIP_AD_FORMAT_FLOAT:
      bytesPerChannel = 4;
      break;
  }

  if (bytesPerChannel == 0) {
    // error scenario
    return {};
  }
  return bytesPerChannel * array->NumChannels;
}
// Builds the channel format descriptor for `numChannels` channels of `arrayFormat`.
// Each populated channel gets the bit width of the format (8, 16 or 32) and the descriptor
// kind is signed, unsigned or float accordingly. Only 1, 2 and 4 channels are supported.
// Unsupported channel counts or formats yield a value-initialized descriptor.
//
// The format and the channel count are decoded in two independent switches. The previous
// nested switches had no `break`, so an unsupported channel count only reached the final
// `return {}` by falling through every later case.
inline
hipChannelFormatDesc getChannelFormatDesc(int numChannels,
                                          hipArray_Format arrayFormat) {
  int bits = 0;
  hipChannelFormatKind kind = hipChannelFormatKindUnsigned;

  switch (arrayFormat) {
    case HIP_AD_FORMAT_UNSIGNED_INT8:
      bits = 8;
      kind = hipChannelFormatKindUnsigned;
      break;
    case HIP_AD_FORMAT_SIGNED_INT8:
      bits = 8;
      kind = hipChannelFormatKindSigned;
      break;
    case HIP_AD_FORMAT_UNSIGNED_INT16:
      bits = 16;
      kind = hipChannelFormatKindUnsigned;
      break;
    case HIP_AD_FORMAT_SIGNED_INT16:
      bits = 16;
      kind = hipChannelFormatKindSigned;
      break;
    case HIP_AD_FORMAT_UNSIGNED_INT32:
      bits = 32;
      kind = hipChannelFormatKindUnsigned;
      break;
    case HIP_AD_FORMAT_SIGNED_INT32:
      bits = 32;
      kind = hipChannelFormatKindSigned;
      break;
    case HIP_AD_FORMAT_HALF:
      bits = 16;
      kind = hipChannelFormatKindFloat;
      break;
    case HIP_AD_FORMAT_FLOAT:
      bits = 32;
      kind = hipChannelFormatKindFloat;
      break;
    default:
      // error scenario
      return {};
  }

  switch (numChannels) {
    case 1:
      return {bits, 0, 0, 0, kind};
    case 2:
      return {bits, bits, 0, 0, kind};
    case 4:
      return {bits, bits, bits, bits, kind};
    default:
      break;
  }

  // error scenario
  return {};
}
// Counts the channels of `desc` that have a non-zero bit width.
inline
unsigned int getNumChannels(const hipChannelFormatDesc& desc) {
  unsigned int populated = 0;
  if (desc.x != 0) ++populated;
  if (desc.y != 0) ++populated;
  if (desc.z != 0) ++populated;
  if (desc.w != 0) ++populated;
  return populated;
}
// Validates a channel format descriptor. It is accepted when
//   - the x channel is populated,
//   - every other populated channel has the same bit width as x, and
//   - no channel after the first zero channel (among y, z) has a positive width.
inline
bool CheckArrayFormat(const hipChannelFormatDesc& desc) {
  if (desc.x == 0) {
    return false;
  }

  // All populated channels must share the width of x.
  const int trailing[] = {desc.y, desc.z, desc.w};
  for (const int bits : trailing) {
    if (bits != 0 && bits != desc.x) {
      return false;
    }
  }

  // The bit channel description should not allow any channels after a zero channel
  if (desc.y == 0) {
    return (desc.z <= 0) && (desc.w <= 0);
  }
  if (desc.z == 0) {
    return desc.w <= 0;
  }
  return true;
}
// Derives the HIP array format from a channel format descriptor, using its kind (`desc.f`)
// and the bit width of the x channel. Unsupported kind/width pairs yield a value-initialized
// hipArray_Format.
//
// Every inner switch now ends in an explicit `break`. Previously an unsupported width fell
// through into the switches of the following kinds and reached the error return only
// because none of them matched.
inline
hipArray_Format getArrayFormat(const hipChannelFormatDesc& desc) {
  switch (desc.f) {
    case hipChannelFormatKindUnsigned:
      switch (desc.x) {
        case 8:
          return HIP_AD_FORMAT_UNSIGNED_INT8;
        case 16:
          return HIP_AD_FORMAT_UNSIGNED_INT16;
        case 32:
          return HIP_AD_FORMAT_UNSIGNED_INT32;
        default:
          break;
      }
      break;
    case hipChannelFormatKindSigned:
      switch (desc.x) {
        case 8:
          return HIP_AD_FORMAT_SIGNED_INT8;
        case 16:
          return HIP_AD_FORMAT_SIGNED_INT16;
        case 32:
          return HIP_AD_FORMAT_SIGNED_INT32;
        default:
          break;
      }
      break;
    case hipChannelFormatKindFloat:
      switch (desc.x) {
        case 16:
          return HIP_AD_FORMAT_HALF;
        case 32:
          return HIP_AD_FORMAT_FLOAT;
        default:
          break;
      }
      break;
    default:
      break;
  }

  // error scenario
  return {};
}
// Returns the channel count (1, 2 or 4) encoded in the suffix of a resource view format.
// Formats not listed below yield a value-initialized int, i.e. 0.
inline
int getNumChannels(const hipResourceViewFormat hipFormat) {
switch (hipFormat) {
// Single-channel formats.
case hipResViewFormatUnsignedChar1:
case hipResViewFormatSignedChar1:
case hipResViewFormatUnsignedShort1:
case hipResViewFormatSignedShort1:
case hipResViewFormatUnsignedInt1:
case hipResViewFormatSignedInt1:
case hipResViewFormatHalf1:
case hipResViewFormatFloat1:
return 1;
// Two-channel formats.
case hipResViewFormatUnsignedChar2:
case hipResViewFormatSignedChar2:
case hipResViewFormatUnsignedShort2:
case hipResViewFormatSignedShort2:
case hipResViewFormatUnsignedInt2:
case hipResViewFormatSignedInt2:
case hipResViewFormatHalf2:
case hipResViewFormatFloat2:
return 2;
// Four-channel formats.
case hipResViewFormatUnsignedChar4:
case hipResViewFormatSignedChar4:
case hipResViewFormatUnsignedShort4:
case hipResViewFormatSignedShort4:
case hipResViewFormatUnsignedInt4:
case hipResViewFormatSignedInt4:
case hipResViewFormatHalf4:
case hipResViewFormatFloat4:
return 4;
default:
break;
}
// Error scenario: unsupported format.
return {};
}
// Returns the per-channel HIP array format of a resource view format; the channel count in
// the format's suffix is ignored. Formats not listed below yield a value-initialized
// hipArray_Format.
inline
hipArray_Format getArrayFormat(const hipResourceViewFormat hipFormat) {
switch (hipFormat) {
case hipResViewFormatUnsignedChar1:
case hipResViewFormatUnsignedChar2:
case hipResViewFormatUnsignedChar4:
return HIP_AD_FORMAT_UNSIGNED_INT8;
case hipResViewFormatSignedChar1:
case hipResViewFormatSignedChar2:
case hipResViewFormatSignedChar4:
return HIP_AD_FORMAT_SIGNED_INT8;
case hipResViewFormatUnsignedShort1:
case hipResViewFormatUnsignedShort2:
case hipResViewFormatUnsignedShort4:
return HIP_AD_FORMAT_UNSIGNED_INT16;
case hipResViewFormatSignedShort1:
case hipResViewFormatSignedShort2:
case hipResViewFormatSignedShort4:
return HIP_AD_FORMAT_SIGNED_INT16;
case hipResViewFormatUnsignedInt1:
case hipResViewFormatUnsignedInt2:
case hipResViewFormatUnsignedInt4:
return HIP_AD_FORMAT_UNSIGNED_INT32;
case hipResViewFormatSignedInt1:
case hipResViewFormatSignedInt2:
case hipResViewFormatSignedInt4:
return HIP_AD_FORMAT_SIGNED_INT32;
case hipResViewFormatHalf1:
case hipResViewFormatHalf2:
case hipResViewFormatHalf4:
return HIP_AD_FORMAT_HALF;
case hipResViewFormatFloat1:
case hipResViewFormatFloat2:
case hipResViewFormatFloat4:
return HIP_AD_FORMAT_FLOAT;
default:
break;
}
// Error scenario: unsupported format.
return {};
}
// Derives the resource view format from a channel format descriptor. The result is selected
// by the descriptor kind (`desc.f`), the bit width of the x channel and the number of
// populated channels (1, 2 or 4). Every unsupported combination yields a value-initialized
// hipResourceViewFormat.
//
// The three nested levels of switch had no `break`, so unsupported inputs reached the error
// return only by falling through every later case. Results are unchanged, but there is no
// fall-through any more.
inline
hipResourceViewFormat getResourceViewFormat(const hipChannelFormatDesc& desc) {
  // Picks the 1-, 2- or 4-channel variant of one element type.
  const auto pickByChannels = [&desc](hipResourceViewFormat one, hipResourceViewFormat two,
                                      hipResourceViewFormat four) -> hipResourceViewFormat {
    switch (getNumChannels(desc)) {
      case 1:
        return one;
      case 2:
        return two;
      case 4:
        return four;
      default:
        // error scenario
        return {};
    }
  };

  switch (desc.f) {
    case hipChannelFormatKindUnsigned:
      switch (desc.x) {
        case 8:
          return pickByChannels(hipResViewFormatUnsignedChar1, hipResViewFormatUnsignedChar2,
                                hipResViewFormatUnsignedChar4);
        case 16:
          return pickByChannels(hipResViewFormatUnsignedShort1, hipResViewFormatUnsignedShort2,
                                hipResViewFormatUnsignedShort4);
        case 32:
          return pickByChannels(hipResViewFormatUnsignedInt1, hipResViewFormatUnsignedInt2,
                                hipResViewFormatUnsignedInt4);
        default:
          break;
      }
      break;
    case hipChannelFormatKindSigned:
      switch (desc.x) {
        case 8:
          return pickByChannels(hipResViewFormatSignedChar1, hipResViewFormatSignedChar2,
                                hipResViewFormatSignedChar4);
        case 16:
          return pickByChannels(hipResViewFormatSignedShort1, hipResViewFormatSignedShort2,
                                hipResViewFormatSignedShort4);
        case 32:
          return pickByChannels(hipResViewFormatSignedInt1, hipResViewFormatSignedInt2,
                                hipResViewFormatSignedInt4);
        default:
          break;
      }
      break;
    case hipChannelFormatKindFloat:
      switch (desc.x) {
        case 16:
          return pickByChannels(hipResViewFormatHalf1, hipResViewFormatHalf2,
                                hipResViewFormatHalf4);
        case 32:
          return pickByChannels(hipResViewFormatFloat1, hipResViewFormatFloat2,
                                hipResViewFormatFloat4);
        default:
          break;
      }
      break;
    default:
      break;
  }

  // error scenario
  return {};
}
// Builds a hipTextureDesc from a texture reference by copying the sampling state field by
// field; fields not copied here stay zero-initialized.
inline
hipTextureDesc getTextureDesc(const textureReference* texRef) {
  hipTextureDesc result = {};

  // Addressing and filtering.
  std::memcpy(result.addressMode, texRef->addressMode, sizeof(result.addressMode));
  result.filterMode = texRef->filterMode;
  result.readMode = texRef->readMode;
  result.sRGB = texRef->sRGB;
  result.normalizedCoords = texRef->normalized;
  result.maxAnisotropy = texRef->maxAnisotropy;

  // Mipmap sampling.
  result.mipmapFilterMode = texRef->mipmapFilterMode;
  result.mipmapLevelBias = texRef->mipmapLevelBias;
  result.minMipmapLevelClamp = texRef->minMipmapLevelClamp;
  result.maxMipmapLevelClamp = texRef->maxMipmapLevelClamp;

  return result;
}
// Builds a resource view descriptor covering `array` with the given view `format`.
// The extents are taken from the array; mip level and layer ranges are fixed at [0, 0].
inline
hipResourceViewDesc getResourceViewDesc(hipArray_const_t array,
                                        const hipResourceViewFormat format) {
  hipResourceViewDesc view = {};
  view.format = format;

  // Extents come straight from the array.
  view.width = array->width;
  view.height = array->height;
  view.depth = array->depth;

  // Only the base mip level and the first layer are exposed.
  view.firstMipmapLevel = 0;
  view.lastMipmapLevel = 0;
  view.firstLayer = 0;
  view.lastLayer = 0; /* TODO add hipArray::numLayers */

  return view;
}
// Builds a resource view descriptor covering the mipmapped `array` with the given view
// `format`. The extents are taken from the array; mip level and layer ranges are fixed at
// [0, 0].
inline
hipResourceViewDesc getResourceViewDesc(hipMipmappedArray_const_t array,
                                        const hipResourceViewFormat format) {
  hipResourceViewDesc view = {};
  view.format = format;

  // Extents come straight from the array.
  view.width = array->width;
  view.height = array->height;
  view.depth = array->depth;

  // Only the base mip level and the first layer are exposed.
  view.firstMipmapLevel = 0;
  view.lastMipmapLevel = 0; /* TODO add hipMipmappedArray::numMipLevels */
  view.firstLayer = 0;
  view.lastLayer = 0; /* TODO add hipArray::numLayers */

  return view;
}
inline
std::pair<hipMemoryType, hipMemoryType> getMemoryType(const hipMemcpyKind kind) {
switch (kind) {
case hipMemcpyHostToHost:
return {hipMemoryTypeHost, hipMemoryTypeHost};
case hipMemcpyHostToDevice:
return {hipMemoryTypeHost, hipMemoryTypeDevice};
case hipMemcpyDeviceToHost:
return {hipMemoryTypeDevice, hipMemoryTypeHost};
case hipMemcpyDeviceToDevice:
return {hipMemoryTypeDevice, hipMemoryTypeDevice};
case hipMemcpyDefault:
return {hipMemoryTypeUnified, hipMemoryTypeUnified};
}
//error scenario
return {};
}
// Widens a 2D copy descriptor into the 3D driver descriptor: all 2D fields are carried over,
// the Z offsets, LODs and src/dst heights are 0, and the depth is 1.
inline
HIP_MEMCPY3D getDrvMemcpy3DDesc(const hip_Memcpy2D& desc2D) {
  HIP_MEMCPY3D out = {};

  // Source side.
  out.srcXInBytes = desc2D.srcXInBytes;
  out.srcY = desc2D.srcY;
  out.srcZ = 0;
  out.srcLOD = 0;
  out.srcMemoryType = desc2D.srcMemoryType;
  out.srcHost = desc2D.srcHost;
  out.srcDevice = desc2D.srcDevice;
  out.srcArray = desc2D.srcArray;
  out.srcPitch = desc2D.srcPitch;
  out.srcHeight = 0;

  // Destination side.
  out.dstXInBytes = desc2D.dstXInBytes;
  out.dstY = desc2D.dstY;
  out.dstZ = 0;
  out.dstLOD = 0;
  out.dstMemoryType = desc2D.dstMemoryType;
  out.dstHost = desc2D.dstHost;
  out.dstDevice = desc2D.dstDevice;
  out.dstArray = desc2D.dstArray;
  out.dstPitch = desc2D.dstPitch;
  out.dstHeight = 0;

  // Extent: a single slice.
  out.WidthInBytes = desc2D.WidthInBytes;
  out.Height = desc2D.Height;
  out.Depth = 1;

  return out;
}
// Converts runtime-API 3D copy parameters into the driver descriptor.
// For each side an array (if set) is applied first and a pointer (if set) second, so the
// pointer's memory type wins when both are present. X offsets and the copy width are given
// in elements whenever an array takes part and are scaled to bytes here; when both sides are
// arrays the destination's element size scales the width.
inline
HIP_MEMCPY3D getDrvMemcpy3DDesc(const hipMemcpy3DParms& desc) {
  HIP_MEMCPY3D out = {};

  const bool srcIsArray = (desc.srcArray != nullptr);
  const bool dstIsArray = (desc.dstArray != nullptr);

  out.WidthInBytes = desc.extent.width;
  out.Height = desc.extent.height;
  out.Depth = desc.extent.depth;

  out.srcXInBytes = desc.srcPos.x;
  out.srcY = desc.srcPos.y;
  out.srcZ = desc.srcPos.z;
  out.srcLOD = 0;

  out.dstXInBytes = desc.dstPos.x;
  out.dstY = desc.dstPos.y;
  out.dstZ = desc.dstPos.z;
  out.dstLOD = 0;

  if (srcIsArray) {
    out.srcMemoryType = hipMemoryTypeArray;
    out.srcArray = desc.srcArray;
    // When referring to array memory, hipPos::x is in elements.
    out.srcXInBytes *= getElementSize(desc.srcArray);
  }

  if (desc.srcPtr.ptr != nullptr) {
    out.srcMemoryType = hip::getMemoryType(desc.kind).first;
    out.srcHost = desc.srcPtr.ptr;
    out.srcDevice = desc.srcPtr.ptr;
    out.srcPitch = desc.srcPtr.pitch;
    out.srcHeight = desc.srcPtr.ysize;
  }

  if (dstIsArray) {
    out.dstMemoryType = hipMemoryTypeArray;
    out.dstArray = desc.dstArray;
    // When referring to array memory, hipPos::x is in elements.
    out.dstXInBytes *= getElementSize(desc.dstArray);
  }

  if (desc.dstPtr.ptr != nullptr) {
    out.dstMemoryType = getMemoryType(desc.kind).second;
    out.dstHost = desc.dstPtr.ptr;
    out.dstDevice = desc.dstPtr.ptr;
    out.dstPitch = desc.dstPtr.pitch;
    out.dstHeight = desc.dstPtr.ysize;
  }

  // If a HIP array is participating in the copy, the extent is defined in terms of that
  // array's elements. The destination array takes precedence when both sides are arrays.
  if (dstIsArray) {
    out.WidthInBytes *= getElementSize(desc.dstArray);
  } else if (srcIsArray) {
    out.WidthInBytes *= getElementSize(desc.srcArray);
  }

  return out;
}
// Driver-API resource type -> runtime-API resource type.
// The conversion is a plain cast: the two enums should be isomorphic.
inline hipResourceType getResourceType(const HIPresourcetype resType) {
  const auto runtimeType = static_cast<hipResourceType>(resType);
  return runtimeType;
}
// Runtime-API resource type -> driver-API resource type.
// The conversion is a plain cast: the two enums should be isomorphic.
inline HIPresourcetype getResourceType(const hipResourceType resType) {
  const auto driverType = static_cast<HIPresourcetype>(resType);
  return driverType;
}
// Converts a driver-API resource descriptor into its runtime-API counterpart.
//
// Only the union member selected by the resource type is populated. The result is
// zero-initialized up front so that an unrecognized resource type (the default case) or any
// member not assigned below is returned with a deterministic value instead of
// indeterminate stack contents.
inline hipResourceDesc getResourceDesc(const HIP_RESOURCE_DESC& resDesc) {
  hipResourceDesc desc = {};
  desc.resType = getResourceType(resDesc.resType);

  switch (desc.resType) {
    case hipResourceTypeArray:
      desc.res.array.array = resDesc.res.array.hArray;
      break;
    case hipResourceTypeMipmappedArray:
      desc.res.mipmap.mipmap = resDesc.res.mipmap.hMipmappedArray;
      break;
    case hipResourceTypeLinear:
      desc.res.linear.devPtr = resDesc.res.linear.devPtr;
      // The driver API stores channel count and format separately; the runtime API folds
      // them into one channel format descriptor.
      desc.res.linear.desc =
          getChannelFormatDesc(resDesc.res.linear.numChannels, resDesc.res.linear.format);
      desc.res.linear.sizeInBytes = resDesc.res.linear.sizeInBytes;
      break;
    case hipResourceTypePitch2D:
      desc.res.pitch2D.devPtr = resDesc.res.pitch2D.devPtr;
      desc.res.pitch2D.desc =
          getChannelFormatDesc(resDesc.res.pitch2D.numChannels, resDesc.res.pitch2D.format);
      desc.res.pitch2D.width = resDesc.res.pitch2D.width;
      desc.res.pitch2D.height = resDesc.res.pitch2D.height;
      desc.res.pitch2D.pitchInBytes = resDesc.res.pitch2D.pitchInBytes;
      break;
    default:
      // Unknown resource type: only resType is meaningful, the rest stays zeroed.
      break;
  }

  return desc;
}
// Converts a runtime-API resource descriptor into its driver-API counterpart.
//
// Only the union member selected by the resource type is populated. The result is
// zero-initialized up front so that an unrecognized resource type (the default case) and
// every member this function does not assign are returned with a deterministic value
// instead of indeterminate stack contents.
inline HIP_RESOURCE_DESC getResourceDesc(const hipResourceDesc& resDesc) {
  HIP_RESOURCE_DESC desc = {};
  desc.resType = getResourceType(resDesc.resType);

  switch (desc.resType) {
    case HIP_RESOURCE_TYPE_ARRAY:
      desc.res.array.hArray = resDesc.res.array.array;
      break;
    case HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY:
      desc.res.mipmap.hMipmappedArray = resDesc.res.mipmap.mipmap;
      break;
    case HIP_RESOURCE_TYPE_LINEAR:
      desc.res.linear.devPtr = resDesc.res.linear.devPtr;
      // The runtime API's channel format descriptor is split into the separate channel
      // count and format fields used by the driver API.
      desc.res.linear.numChannels = getNumChannels(resDesc.res.linear.desc);
      desc.res.linear.format = getArrayFormat(resDesc.res.linear.desc);
      desc.res.linear.sizeInBytes = resDesc.res.linear.sizeInBytes;
      break;
    case HIP_RESOURCE_TYPE_PITCH2D:
      desc.res.pitch2D.devPtr = resDesc.res.pitch2D.devPtr;
      desc.res.pitch2D.numChannels = getNumChannels(resDesc.res.pitch2D.desc);
      desc.res.pitch2D.format = getArrayFormat(resDesc.res.pitch2D.desc);
      desc.res.pitch2D.width = resDesc.res.pitch2D.width;
      desc.res.pitch2D.height = resDesc.res.pitch2D.height;
      desc.res.pitch2D.pitchInBytes = resDesc.res.pitch2D.pitchInBytes;
      break;
    default:
      // Unknown resource type: only resType is meaningful, the rest stays zeroed.
      break;
  }

  return desc;
}
// Driver-API texture address mode -> runtime-API texture address mode.
// The conversion is a plain cast: the two enums should be isomorphic.
inline hipTextureAddressMode getAddressMode(const HIPaddress_mode mode) {
  const auto runtimeMode = static_cast<hipTextureAddressMode>(mode);
  return runtimeMode;
}
// Runtime-API texture address mode -> driver-API texture address mode.
// The conversion is a plain cast: the two enums should be isomorphic.
inline HIPaddress_mode getAddressMode(const hipTextureAddressMode mode) {
  const auto driverMode = static_cast<HIPaddress_mode>(mode);
  return driverMode;
}
// Driver-API texture filter mode -> runtime-API texture filter mode.
// The conversion is a plain cast: the two enums should be isomorphic.
inline hipTextureFilterMode getFilterMode(const HIPfilter_mode mode) {
  const auto runtimeMode = static_cast<hipTextureFilterMode>(mode);
  return runtimeMode;
}
// Runtime-API texture filter mode -> driver-API texture filter mode.
// The conversion is a plain cast: the two enums should be isomorphic.
inline HIPfilter_mode getFilterMode(const hipTextureFilterMode mode) {
  const auto driverMode = static_cast<HIPfilter_mode>(mode);
  return driverMode;
}
// Extracts the texture read mode from driver-API texture flags: element type when
// HIP_TRSF_READ_AS_INTEGER is set, normalized float otherwise.
inline hipTextureReadMode getReadMode(const unsigned int flags) {
  return (flags & HIP_TRSF_READ_AS_INTEGER) ? hipReadModeElementType
                                            : hipReadModeNormalizedFloat;
}
// Encodes a runtime-API texture read mode as driver-API texture flag bits:
// HIP_TRSF_READ_AS_INTEGER for element-type reads, no bits for any other mode.
inline unsigned int getReadMode(const hipTextureReadMode mode) {
  return (mode == hipReadModeElementType) ? HIP_TRSF_READ_AS_INTEGER : 0;
}
// Returns 1 when the HIP_TRSF_SRGB bit is set in the driver-API texture flags, 0 otherwise.
inline int getsRGB(const unsigned int flags) {
  return (flags & HIP_TRSF_SRGB) ? 1 : 0;
}
// Encodes the runtime-API sRGB setting as driver-API texture flag bits. Only the exact
// value 1 produces HIP_TRSF_SRGB; every other value produces no bits.
inline unsigned int getsRGB(const int sRGB) {
  return (sRGB == 1) ? HIP_TRSF_SRGB : 0;
}
// Returns 1 when the HIP_TRSF_NORMALIZED_COORDINATES bit is set in the driver-API texture
// flags, 0 otherwise.
inline int getNormalizedCoords(const unsigned int flags) {
  return (flags & HIP_TRSF_NORMALIZED_COORDINATES) ? 1 : 0;
}
// Encodes the runtime-API normalized-coordinates setting as driver-API texture flag bits.
// Only the exact value 1 produces HIP_TRSF_NORMALIZED_COORDINATES; every other value
// produces no bits.
inline unsigned int getNormalizedCoords(const int normalizedCoords) {
  return (normalizedCoords == 1) ? HIP_TRSF_NORMALIZED_COORDINATES : 0;
}
// Converts a driver-API texture descriptor into a runtime-API texture descriptor.
// The read mode, sRGB and normalized-coordinates settings are all decoded from the driver
// descriptor's single flags word.
inline hipTextureDesc getTextureDesc(const HIP_TEXTURE_DESC& texDesc) {
  hipTextureDesc desc;

  // One address mode per texture dimension.
  for (int dim = 0; dim < 3; ++dim) {
    desc.addressMode[dim] = getAddressMode(texDesc.addressMode[dim]);
  }

  // Sampling behaviour.
  desc.filterMode = getFilterMode(texDesc.filterMode);
  desc.mipmapFilterMode = getFilterMode(texDesc.mipmapFilterMode);
  desc.maxAnisotropy = texDesc.maxAnisotropy;

  // Settings packed into the driver flags word.
  const unsigned int driverFlags = texDesc.flags;
  desc.readMode = getReadMode(driverFlags);
  desc.sRGB = getsRGB(driverFlags);
  desc.normalizedCoords = getNormalizedCoords(driverFlags);

  // Mipmap level selection.
  desc.mipmapLevelBias = texDesc.mipmapLevelBias;
  desc.minMipmapLevelClamp = texDesc.minMipmapLevelClamp;
  desc.maxMipmapLevelClamp = texDesc.maxMipmapLevelClamp;

  std::memcpy(desc.borderColor, texDesc.borderColor, sizeof(desc.borderColor));

  return desc;
}
// Converts a runtime-API texture descriptor into a driver-API texture descriptor.
// The read mode, sRGB and normalized-coordinates settings are packed into the driver
// descriptor's single flags word.
//
// The result is zero-initialized so that any member of HIP_TEXTURE_DESC this function does
// not assign explicitly is returned as zero rather than indeterminate stack contents.
inline HIP_TEXTURE_DESC getTextureDesc(const hipTextureDesc& texDesc) {
  HIP_TEXTURE_DESC desc = {};

  desc.addressMode[0] = getAddressMode(texDesc.addressMode[0]);
  desc.addressMode[1] = getAddressMode(texDesc.addressMode[1]);
  desc.addressMode[2] = getAddressMode(texDesc.addressMode[2]);
  desc.filterMode = getFilterMode(texDesc.filterMode);

  // Build the flags word from the three independent settings.
  desc.flags = 0;
  desc.flags |= getReadMode(texDesc.readMode);
  desc.flags |= getsRGB(texDesc.sRGB);
  desc.flags |= getNormalizedCoords(texDesc.normalizedCoords);

  desc.maxAnisotropy = texDesc.maxAnisotropy;
  desc.mipmapFilterMode = getFilterMode(texDesc.mipmapFilterMode);
  desc.mipmapLevelBias = texDesc.mipmapLevelBias;
  desc.minMipmapLevelClamp = texDesc.minMipmapLevelClamp;
  desc.maxMipmapLevelClamp = texDesc.maxMipmapLevelClamp;
  std::memcpy(desc.borderColor, texDesc.borderColor, sizeof(desc.borderColor));

  return desc;
}
// Driver-API resource view format -> runtime-API resource view format.
// The conversion is a plain cast: the two enums should be isomorphic.
inline hipResourceViewFormat getResourceViewFormat(const HIPresourceViewFormat format) {
  const auto runtimeFormat = static_cast<hipResourceViewFormat>(format);
  return runtimeFormat;
}
// Runtime-API resource view format -> driver-API resource view format.
// The conversion is a plain cast: the two enums should be isomorphic.
inline HIPresourceViewFormat getResourceViewFormat(const hipResourceViewFormat format) {
  const auto driverFormat = static_cast<HIPresourceViewFormat>(format);
  return driverFormat;
}
// Converts a driver-API resource view descriptor into a runtime-API one. The format goes
// through getResourceViewFormat(); every other field is copied as-is.
inline hipResourceViewDesc getResourceViewDesc(const HIP_RESOURCE_VIEW_DESC& resViewDesc) {
  hipResourceViewDesc view;

  view.format = getResourceViewFormat(resViewDesc.format);

  // Dimensions of the view.
  view.width = resViewDesc.width;
  view.height = resViewDesc.height;
  view.depth = resViewDesc.depth;

  // Mipmap level range.
  view.firstMipmapLevel = resViewDesc.firstMipmapLevel;
  view.lastMipmapLevel = resViewDesc.lastMipmapLevel;

  // Layer range.
  view.firstLayer = resViewDesc.firstLayer;
  view.lastLayer = resViewDesc.lastLayer;

  return view;
}
// Converts a runtime-API resource view descriptor into a driver-API one. The format goes
// through getResourceViewFormat(); every other field is copied as-is.
//
// The result is zero-initialized so that any member of HIP_RESOURCE_VIEW_DESC this function
// does not assign explicitly is returned as zero rather than indeterminate stack contents.
inline HIP_RESOURCE_VIEW_DESC getResourceViewDesc(const hipResourceViewDesc& resViewDesc) {
  HIP_RESOURCE_VIEW_DESC desc = {};

  desc.format = getResourceViewFormat(resViewDesc.format);
  desc.width = resViewDesc.width;
  desc.height = resViewDesc.height;
  desc.depth = resViewDesc.depth;
  desc.firstMipmapLevel = resViewDesc.firstMipmapLevel;
  desc.lastMipmapLevel = resViewDesc.lastMipmapLevel;
  desc.firstLayer = resViewDesc.firstLayer;
  desc.lastLayer = resViewDesc.lastLayer;

  return desc;
}
// Size of one element described by a channel format descriptor: the x component's width
// divided by 8, multiplied by the number of channels.
// NOTE(review): this uses only desc.x, so it presumably assumes all channels share the
// width of the x channel - confirm.
inline size_t getElementSize(const hipChannelFormatDesc &desc) {
  const auto perChannel = desc.x / 8;
  return perChannel * getNumChannels(desc);
}
};
+382
查看文件
@@ -0,0 +1,382 @@
/* Copyright (c) 2018 - 2022 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include <hip/hip_runtime.h>
#include "hip_internal.hpp"
#include "hip_mempool_impl.hpp"
namespace hip {
// ================================================================================================
// Returns this device's null (default) stream.
//
// The stream is created on first use unless skip_alloc is true, in which case nullptr is
// returned when no null stream exists yet. Whenever a stream is returned,
// iHipWaitActiveStreams() is called on it first.
hip::Stream* Device::NullStream(bool skip_alloc) {
  const bool allocate_if_missing = !skip_alloc;

  if (allocate_if_missing && null_stream_ == nullptr) {
    null_stream_ = new Stream(this, Stream::Priority::Normal, 0, true);
  }

  if (null_stream_ != nullptr) {
    // Wait for all active streams before executing commands on the default stream.
    iHipWaitActiveStreams(null_stream_);
    return null_stream_;
  }

  return nullptr;
}
// ================================================================================================
// Creates the default and graph memory pools of the device and makes the default pool the
// current one. Returns false if either pool pointer is null after allocation, true
// otherwise.
bool Device::Create() {
  // Default memory pool.
  default_mem_pool_ = new MemoryPool(this);
  if (!default_mem_pool_) {
    return false;
  }

  // Graph memory pool.
  graph_mem_pool_ = new MemoryPool(this);
  if (!graph_mem_pool_) {
    return false;
  }

  // Use maximum value to hold memory, because current implementation doesn't support VM.
  // Note: the call for the threshold is always successful, so its result is not examined.
  uint64_t release_threshold = std::numeric_limits<uint64_t>::max();
  auto threshold_status =
      graph_mem_pool_->SetAttribute(hipMemPoolAttrReleaseThreshold, &release_threshold);

  // Right after device creation the current pool is the default pool.
  current_mem_pool_ = default_mem_pool_;
  return true;
}
// ================================================================================================
// Registers a memory pool with this device while holding the device lock.
// A pool that is already registered is left untouched.
void Device::AddMemoryPool(MemoryPool* pool) {
  amd::ScopedLock lock(lock_);

  const bool already_tracked = (mem_pools_.find(pool) != mem_pools_.end());
  if (already_tracked) {
    return;
  }
  mem_pools_.insert(pool);
}
// ================================================================================================
// Unregisters a memory pool from this device while holding the device lock.
// Nothing happens when the pool is not registered.
void Device::RemoveMemoryPool(MemoryPool* pool) {
  amd::ScopedLock lock(lock_);

  auto position = mem_pools_.find(pool);
  if (position == mem_pools_.end()) {
    return;
  }
  mem_pools_.erase(position);
}
// ================================================================================================
// Offers the memory object to each registered pool in turn, under the device lock.
// Returns true as soon as one pool's FreeMemory() accepts it, false if none does.
bool Device::FreeMemory(amd::Memory* memory, Stream* stream) {
  amd::ScopedLock lock(lock_);

  // Search for memory in the entire list of pools
  for (auto pool_it = mem_pools_.begin(); pool_it != mem_pools_.end(); ++pool_it) {
    auto pool = *pool_it;
    const bool released = pool->FreeMemory(memory, stream);
    if (released) {
      return true;
    }
  }
  return false;
}
// ================================================================================================
// Calls ReleaseFreedMemory(stream) on every registered pool, under the device lock.
void Device::ReleaseFreedMemory(Stream* stream) {
  amd::ScopedLock lock(lock_);

  for (auto pool_it = mem_pools_.begin(); pool_it != mem_pools_.end(); ++pool_it) {
    auto pool = *pool_it;
    pool->ReleaseFreedMemory(stream);
  }
}
// ================================================================================================
// Notifies every registered pool, under the device lock, that the given stream has been
// destroyed by calling RemoveStream(stream) on each of them.
void Device::RemoveStreamFromPools(Stream* stream) {
  amd::ScopedLock lock(lock_);

  for (auto pool_it = mem_pools_.begin(); pool_it != mem_pools_.end(); ++pool_it) {
    auto pool = *pool_it;
    pool->RemoveStream(stream);
  }
}
// ================================================================================================
// Resets the device: releases and deletes every registered memory pool, restores the
// device flags to hipDeviceScheduleSpin, destroys all streams of this device, purges the
// memory object map for the underlying device and finally re-creates the pools via Create().
void Device::Reset() {
{
// The pool list is only touched while the device lock is held.
amd::ScopedLock lock(lock_);
auto it = mem_pools_.begin();
while (it != mem_pools_.end()) {
// Advance the iterator before the pool is deleted so `it` never refers to the element
// being destroyed.
// NOTE(review): presumably the MemoryPool destructor unregisters itself from mem_pools_
// (which would invalidate `current`) - confirm against MemoryPool.
auto current = it++;
(*current)->ReleaseAllMemory();
delete *current;
}
mem_pools_.clear();
}
flags_ = hipDeviceScheduleSpin;
hip::Stream::destroyAllStreams(deviceId_);
amd::MemObjMap::Purge(devices()[0]);
// The return value of Create() is not checked here.
Create();
}
// ================================================================================================
// Drops the device's references to the default memory pool, the graph memory pool and the
// null stream, in that order, skipping any that was never created.
Device::~Device() {
  if (default_mem_pool_) {
    default_mem_pool_->release();
  }

  if (graph_mem_pool_) {
    graph_mem_pool_->release();
  }

  if (null_stream_) {
    null_stream_->release();
  }
}
}
// Deletes every device object stored in g_devices. The container itself is not modified,
// so its entries must not be used after this call.
void ihipDestroyDevice() {
  for (auto entry = g_devices.begin(); entry != g_devices.end(); ++entry) {
    delete *entry;
  }
}
// Internal helper: validates a device ordinal and returns it as a device handle.
//   device   - output handle; hipErrorInvalidValue when null.
//   deviceId - ordinal; hipErrorInvalidDevice when negative or not below g_devices.size().
hipError_t ihipDeviceGet(hipDevice_t* device, int deviceId) {
  if (device == nullptr) {
    return hipErrorInvalidValue;
  }

  const bool ordinal_in_range =
      (deviceId >= 0) && (static_cast<size_t>(deviceId) < g_devices.size());
  if (!ordinal_in_range) {
    return hipErrorInvalidDevice;
  }

  // The handle is simply the ordinal.
  *device = deviceId;
  return hipSuccess;
}
// Public entry point: performs the API prologue and forwards to ihipDeviceGet().
hipError_t hipDeviceGet(hipDevice_t* device, int deviceId) {
  HIP_INIT_API(hipDeviceGet, device, deviceId);

  const hipError_t status = ihipDeviceGet(device, deviceId);
  HIP_RETURN(status);
}
// Reports the global memory size of a device.
//   bytes  - output; hipErrorInvalidValue when null.
//   device - device handle; hipErrorInvalidDevice when out of range (checked first).
hipError_t hipDeviceTotalMem (size_t *bytes, hipDevice_t device) {
  HIP_INIT_API(hipDeviceTotalMem, bytes, device);

  const bool device_in_range =
      (device >= 0) && (static_cast<size_t>(device) < g_devices.size());
  if (!device_in_range) {
    HIP_RETURN(hipErrorInvalidDevice);
  }
  if (bytes == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  // The first underlying device of the HIP device supplies the info block.
  *bytes = g_devices[device]->devices()[0]->info().globalMemSize_;
  HIP_RETURN(hipSuccess);
}
// Reports the major and minor version of a device, taken from the ISA of its first
// underlying device.
//   major, minor - outputs; hipErrorInvalidValue when either is null.
//   device       - device handle; hipErrorInvalidDevice when out of range (checked first).
hipError_t hipDeviceComputeCapability(int *major, int *minor, hipDevice_t device) {
  HIP_INIT_API(hipDeviceComputeCapability, major, minor, device);

  const bool device_in_range =
      (device >= 0) && (static_cast<size_t>(device) < g_devices.size());
  if (!device_in_range) {
    HIP_RETURN(hipErrorInvalidDevice);
  }
  if (!(major != nullptr && minor != nullptr)) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  const auto& targetIsa = g_devices[device]->devices()[0]->isa();
  *major = targetIsa.versionMajor();
  *minor = targetIsa.versionMinor();
  HIP_RETURN(hipSuccess);
}
// Public entry point: performs the API prologue and forwards to ihipDeviceGetCount().
hipError_t hipDeviceGetCount(int* count) {
  HIP_INIT_API(hipDeviceGetCount, count);

  const hipError_t status = ihipDeviceGetCount(count);
  HIP_RETURN(status);
}
// Internal helper: stores the number of entries in g_devices into *count.
// Returns hipErrorInvalidValue for a null output pointer and hipErrorNoDevice when the
// stored count is below 1; the count is written in the latter case as well.
hipError_t ihipDeviceGetCount(int* count) {
  if (count == nullptr) {
    return hipErrorInvalidValue;
  }

  // Get all available devices
  const int available = g_devices.size();
  *count = available;

  return (available < 1) ? hipErrorNoDevice : hipSuccess;
}
// Copies the board name of a device into a caller-supplied buffer, truncating it when the
// buffer is too small. The result is always NUL-terminated.
//   name   - destination buffer; hipErrorInvalidValue when null.
//   len    - size of the buffer in bytes; hipErrorInvalidValue when <= 0.
//   device - device handle; hipErrorInvalidDevice when out of range (checked first).
hipError_t hipDeviceGetName(char *name, int len, hipDevice_t device) {
  HIP_INIT_API(hipDeviceGetName, (void*)name, len, device);

  const bool device_in_range =
      (device >= 0) && (static_cast<size_t>(device) < g_devices.size());
  if (!device_in_range) {
    HIP_RETURN(hipErrorInvalidDevice);
  }
  if (name == nullptr || len <= 0) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  const auto& info = g_devices[device]->devices()[0]->info();
  const size_t boardNameLen = ::strlen(info.boardName_);

  // len is known to be positive here, so the conversion to size_t is lossless.
  const size_t capacity = static_cast<size_t>(len);

  // One byte of the buffer is reserved for the terminator: copy the whole name when it
  // fits, otherwise only the first (capacity - 1) characters.
  size_t copyLen = boardNameLen;
  if (capacity <= boardNameLen + 1) {
    copyLen = capacity - 1;
  }

  ::memcpy(name, info.boardName_, copyLen);
  name[copyLen] = '\0';
  HIP_RETURN(hipSuccess);
}
// Copies the UUID of a device into *uuid.
//   uuid   - output; hipErrorInvalidValue when null.
//   device - device handle; hipErrorInvalidDevice when out of range (checked first).
// At most 16 bytes are copied with strncpy, so copying stops at the first NUL byte of the
// source and the remainder of the 16 bytes is zero-filled.
hipError_t hipDeviceGetUuid(hipUUID* uuid, hipDevice_t device) {
  HIP_INIT_API(hipDeviceGetUuid, reinterpret_cast<void*>(uuid), device);

  const bool device_in_range =
      (device >= 0) && (static_cast<size_t>(device) < g_devices.size());
  if (!device_in_range) {
    HIP_RETURN(hipErrorInvalidDevice);
  }
  if (uuid == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  const auto& info = g_devices[device]->devices()[0]->info();
  ::strncpy(uuid->bytes, info.uuid_, 16);
  HIP_RETURN(hipSuccess);
}
// Internal helper: fills a hipDeviceProp_t from the info block and ISA of the first
// underlying device of the given HIP device.
//   props  - output; hipErrorInvalidValue when null.
//   device - device handle; hipErrorInvalidDevice when it is not a valid index into
//            g_devices (a negative handle becomes a huge unsigned value and is rejected).
// The properties are assembled in a zero-initialized local and copied to *props only at
// the end, so every field not assigned below is reported as 0.
hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, hipDevice_t device) {
  if (props == nullptr) {
    return hipErrorInvalidValue;
  }
  if (unsigned(device) >= g_devices.size()) {
    return hipErrorInvalidDevice;
  }

  auto* deviceHandle = g_devices[device]->devices()[0];
  // Upper bound used to clamp 64-bit device limits that are reported through int fields.
  constexpr auto int32_max = static_cast<uint64_t>(std::numeric_limits<int32_t>::max());

  hipDeviceProp_t deviceProps = {0};
  const auto& info = deviceHandle->info();
  const auto& isa = deviceHandle->isa();

  // At most 128 bytes of the board name are copied; the destination was zeroed above.
  ::strncpy(deviceProps.name, info.boardName_, 128);
  deviceProps.totalGlobalMem = info.globalMemSize_;
  deviceProps.sharedMemPerBlock = info.localMemSizePerCU_;
  deviceProps.regsPerBlock = info.availableRegistersPerCU_;
  deviceProps.warpSize = info.wavefrontWidth_;
  deviceProps.maxThreadsPerBlock = info.maxWorkGroupSize_;
  deviceProps.maxThreadsDim[0] = info.maxWorkItemSizes_[0];
  deviceProps.maxThreadsDim[1] = info.maxWorkItemSizes_[1];
  deviceProps.maxThreadsDim[2] = info.maxWorkItemSizes_[2];
  deviceProps.maxGridSize[0] = int32_max;
  deviceProps.maxGridSize[1] = int32_max;
  deviceProps.maxGridSize[2] = int32_max;
  // The device info values are multiplied by 1000 before being reported.
  deviceProps.clockRate = info.maxEngineClockFrequency_ * 1000;
  deviceProps.memoryClockRate = info.maxMemoryClockFrequency_ * 1000;
  deviceProps.memoryBusWidth = info.globalMemChannels_;
  deviceProps.totalConstMem = std::min(info.maxConstantBufferSize_, int32_max);
  deviceProps.major = isa.versionMajor();
  deviceProps.minor = isa.versionMinor();
  deviceProps.multiProcessorCount = info.maxComputeUnits_;
  deviceProps.l2CacheSize = info.l2CacheSize_;
  deviceProps.maxThreadsPerMultiProcessor = info.maxThreadsPerCU_;
  deviceProps.computeMode = 0;
  deviceProps.clockInstructionRate = info.timeStampFrequency_;

  // Architecture feature flags (fixed values, not queried from the device).
  deviceProps.arch.hasGlobalInt32Atomics = 1;
  deviceProps.arch.hasGlobalFloatAtomicExch = 1;
  deviceProps.arch.hasSharedInt32Atomics = 1;
  deviceProps.arch.hasSharedFloatAtomicExch = 1;
  deviceProps.arch.hasFloatAtomicAdd = 1;
  deviceProps.arch.hasGlobalInt64Atomics = 1;
  deviceProps.arch.hasSharedInt64Atomics = 1;
  deviceProps.arch.hasDoubles = 1;
  deviceProps.arch.hasWarpVote = 1;
  deviceProps.arch.hasWarpBallot = 1;
  deviceProps.arch.hasWarpShuffle = 1;
  deviceProps.arch.hasFunnelShift = 0;
  deviceProps.arch.hasThreadFenceSystem = 1;
  deviceProps.arch.hasSyncThreadsExt = 0;
  deviceProps.arch.hasSurfaceFuncs = 0;
  deviceProps.arch.has3dGrid = 1;
  deviceProps.arch.hasDynamicParallelism = 0;

  deviceProps.concurrentKernels = 1;
  deviceProps.pciDomainID = info.pciDomainID;
  deviceProps.pciBusID = info.deviceTopology_.pcie.bus;
  deviceProps.pciDeviceID = info.deviceTopology_.pcie.device;
  deviceProps.maxSharedMemoryPerMultiProcessor = info.localMemSizePerCU_;
  deviceProps.canMapHostMemory = 1;
  // FIXME: This should be removed, targets can have character names as well.
  deviceProps.gcnArch = isa.versionMajor() * 100 + isa.versionMinor() * 10 + isa.versionStepping();
  // Bounded copy: a target id longer than the destination is truncated instead of
  // overrunning the fixed-size gcnArchName buffer.
  snprintf(deviceProps.gcnArchName, sizeof(deviceProps.gcnArchName), "%s", isa.targetId());

  // Cooperative launch capabilities; all multi-device variants share one device flag.
  deviceProps.cooperativeLaunch = info.cooperativeGroups_;
  deviceProps.cooperativeMultiDeviceLaunch = info.cooperativeMultiDeviceGroups_;
  deviceProps.cooperativeMultiDeviceUnmatchedFunc = info.cooperativeMultiDeviceGroups_;
  deviceProps.cooperativeMultiDeviceUnmatchedGridDim = info.cooperativeMultiDeviceGroups_;
  deviceProps.cooperativeMultiDeviceUnmatchedBlockDim = info.cooperativeMultiDeviceGroups_;
  deviceProps.cooperativeMultiDeviceUnmatchedSharedMem = info.cooperativeMultiDeviceGroups_;

  // Texture limits, each clamped to the int32 range.
  deviceProps.maxTexture1DLinear = std::min(16 * info.imageMaxBufferSize_, int32_max); // Max pixel size is 16 bytes
  deviceProps.maxTexture1D = std::min(info.image1DMaxWidth_, int32_max);
  deviceProps.maxTexture2D[0] = std::min(info.image2DMaxWidth_, int32_max);
  deviceProps.maxTexture2D[1] = std::min(info.image2DMaxHeight_, int32_max);
  deviceProps.maxTexture3D[0] = std::min(info.image3DMaxWidth_, int32_max);
  deviceProps.maxTexture3D[1] = std::min(info.image3DMaxHeight_, int32_max);
  deviceProps.maxTexture3D[2] = std::min(info.image3DMaxDepth_, int32_max);

  deviceProps.hdpMemFlushCntl = info.hdpMemFlushCntl;
  deviceProps.hdpRegFlushCntl = info.hdpRegFlushCntl;

  deviceProps.memPitch = std::min(info.maxMemAllocSize_, int32_max);
  deviceProps.textureAlignment = info.imageBaseAddressAlignment_;
  deviceProps.texturePitchAlignment = info.imagePitchAlignment_;
  deviceProps.kernelExecTimeoutEnabled = 0;
  deviceProps.ECCEnabled = info.errorCorrectionSupport_ ? 1 : 0;
  deviceProps.isLargeBar = info.largeBar_ ? 1 : 0;
  deviceProps.asicRevision = info.asicRevision_;

  // HMM capabilities
  deviceProps.managedMemory = info.hmmSupported_;
  deviceProps.concurrentManagedAccess = info.hmmSupported_;
  deviceProps.directManagedMemAccessFromHost = info.hmmDirectHostAccess_;
  deviceProps.pageableMemoryAccess = info.hmmCpuMemoryAccessible_;
  deviceProps.pageableMemoryAccessUsesHostPageTables = info.hostUnifiedMemory_;

  *props = deviceProps;
  return hipSuccess;
}
// Public entry point: performs the API prologue and forwards to ihipGetDeviceProperties().
hipError_t hipGetDeviceProperties(hipDeviceProp_t* props, hipDevice_t device) {
  HIP_INIT_API(hipGetDeviceProperties, props, device);

  const hipError_t status = ihipGetDeviceProperties(props, device);
  HIP_RETURN(status);
}
@@ -0,0 +1,632 @@
/* Copyright (c) 2018 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include <hip/hip_runtime.h>
#include "hip_internal.hpp"
// Selects the device that best matches a set of requested properties.
//   device     - output ordinal; hipErrorInvalidValue when null.
//   properties - requested minimums; hipErrorInvalidValue when null. A field equal to 0
//                means "no requirement" and is ignored.
//
// A device is a candidate only when it satisfies every requested (non-zero) field, i.e.
// its value is >= the requested value. Among candidates, the first one with the highest
// number of satisfied fields wins. When no device qualifies, or nothing was requested,
// *device is 0.
//
// Devices whose properties cannot be queried are skipped, rather than being compared
// through a zero-filled property structure.
hipError_t hipChooseDevice(int* device, const hipDeviceProp_t* properties) {
  HIP_INIT_API(hipChooseDevice, device, properties);

  if (device == nullptr || properties == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  *device = 0;
  cl_uint maxMatchedCount = 0;
  int count = 0;
  HIP_RETURN_ONFAIL(ihipDeviceGetCount(&count));

  for (cl_int i = 0; i < count; ++i) {
    hipDeviceProp_t currentProp = {0};
    if (ihipGetDeviceProperties(&currentProp, i) != hipSuccess) {
      // Nothing reliable to compare against for this device.
      continue;
    }

    cl_uint validPropCount = 0;  // number of fields the caller actually requested
    cl_uint matchedCount = 0;    // number of requested fields this device satisfies

    // Accounts for one property: ignored when not requested (0), otherwise counted as
    // requested and, if the device offers at least the requested amount, as matched.
    auto tally = [&validPropCount, &matchedCount](const auto& wanted, const auto& available) {
      if (wanted != 0) {
        validPropCount++;
        if (available >= wanted) {
          matchedCount++;
        }
      }
    };

    tally(properties->major, currentProp.major);
    tally(properties->minor, currentProp.minor);
    tally(properties->totalGlobalMem, currentProp.totalGlobalMem);
    tally(properties->sharedMemPerBlock, currentProp.sharedMemPerBlock);
    tally(properties->maxThreadsPerBlock, currentProp.maxThreadsPerBlock);
    tally(properties->totalConstMem, currentProp.totalConstMem);
    tally(properties->multiProcessorCount, currentProp.multiProcessorCount);
    tally(properties->maxThreadsPerMultiProcessor, currentProp.maxThreadsPerMultiProcessor);
    tally(properties->memoryClockRate, currentProp.memoryClockRate);
    tally(properties->memoryBusWidth, currentProp.memoryBusWidth);
    tally(properties->l2CacheSize, currentProp.l2CacheSize);
    tally(properties->regsPerBlock, currentProp.regsPerBlock);
    tally(properties->maxSharedMemoryPerMultiProcessor,
          currentProp.maxSharedMemoryPerMultiProcessor);
    tally(properties->warpSize, currentProp.warpSize);

    // Only a device satisfying every requested field may replace the current choice, and
    // only when it satisfies strictly more fields than the best candidate so far.
    if (validPropCount == matchedCount) {
      *device = matchedCount > maxMatchedCount ? i : *device;
      maxMatchedCount = std::max(matchedCount, maxMatchedCount);
    }
  }

  HIP_RETURN(hipSuccess);
}
// Queries a single attribute of a device.
//   pi     - output; hipErrorInvalidValue when null.
//   attr   - attribute selector; hipErrorInvalidValue when not handled by the switch below.
//   device - device ordinal; hipErrorInvalidDevice when negative or >= the device count.
// Most attributes are read from a freshly queried hipDeviceProp_t; a few are read directly
// from the info block of the first underlying device.
hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) {
HIP_INIT_API(hipDeviceGetAttribute, pi, attr, device);
if (pi == nullptr) {
HIP_RETURN(hipErrorInvalidValue);
}
int count = 0;
HIP_RETURN_ONFAIL(ihipDeviceGetCount(&count));
if (device < 0 || device >= count) {
HIP_RETURN(hipErrorInvalidDevice);
}
//FIXME: should we cache the props, or just select from deviceHandle->info_?
hipDeviceProp_t prop = {0};
HIP_RETURN_ONFAIL(ihipGetDeviceProperties(&prop, device));
// Upper bound used to clamp size_t properties before they are stored into the int output.
constexpr auto int32_max = static_cast<uint64_t>(std::numeric_limits<int32_t>::max());
switch (attr) {
case hipDeviceAttributeMaxThreadsPerBlock:
*pi = prop.maxThreadsPerBlock;
break;
case hipDeviceAttributeMaxBlockDimX:
*pi = prop.maxThreadsDim[0];
break;
case hipDeviceAttributeMaxBlockDimY:
*pi = prop.maxThreadsDim[1];
break;
case hipDeviceAttributeMaxBlockDimZ:
*pi = prop.maxThreadsDim[2];
break;
case hipDeviceAttributeMaxGridDimX:
*pi = prop.maxGridSize[0];
break;
case hipDeviceAttributeMaxGridDimY:
*pi = prop.maxGridSize[1];
break;
case hipDeviceAttributeMaxGridDimZ:
*pi = prop.maxGridSize[2];
break;
case hipDeviceAttributeMaxSharedMemoryPerBlock:
*pi = prop.sharedMemPerBlock;
break;
case hipDeviceAttributeTotalConstantMemory:
// size_t to int casting
*pi = std::min(prop.totalConstMem, int32_max);
break;
case hipDeviceAttributeWarpSize:
*pi = prop.warpSize;
break;
case hipDeviceAttributeMaxRegistersPerBlock:
*pi = prop.regsPerBlock;
break;
case hipDeviceAttributeClockRate:
*pi = prop.clockRate;
break;
case hipDeviceAttributeWallClockRate:
// Read directly from the device info block rather than from prop.
*pi = g_devices[device]->devices()[0]->info().wallClockFrequency_;
break;
case hipDeviceAttributeMemoryClockRate:
*pi = prop.memoryClockRate;
break;
case hipDeviceAttributeMemoryBusWidth:
*pi = prop.memoryBusWidth;
break;
case hipDeviceAttributeMultiprocessorCount:
*pi = prop.multiProcessorCount;
break;
case hipDeviceAttributeComputeMode:
*pi = prop.computeMode;
break;
case hipDeviceAttributeL2CacheSize:
*pi = prop.l2CacheSize;
break;
case hipDeviceAttributeMaxThreadsPerMultiProcessor:
*pi = prop.maxThreadsPerMultiProcessor;
break;
case hipDeviceAttributeComputeCapabilityMajor:
*pi = prop.major;
break;
case hipDeviceAttributeComputeCapabilityMinor:
*pi = prop.minor;
break;
case hipDeviceAttributePciBusId:
*pi = prop.pciBusID;
break;
case hipDeviceAttributeConcurrentKernels:
*pi = prop.concurrentKernels;
break;
case hipDeviceAttributePciDeviceId:
*pi = prop.pciDeviceID;
break;
case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor:
*pi = prop.maxSharedMemoryPerMultiProcessor;
break;
case hipDeviceAttributeIsMultiGpuBoard:
*pi = prop.isMultiGpuBoard;
break;
case hipDeviceAttributeCooperativeLaunch:
*pi = prop.cooperativeLaunch;
break;
case hipDeviceAttributeCooperativeMultiDeviceLaunch:
*pi = prop.cooperativeMultiDeviceLaunch;
break;
case hipDeviceAttributeIntegrated:
*pi = prop.integrated;
break;
case hipDeviceAttributeMaxTexture1DWidth:
*pi = prop.maxTexture1D;
break;
case hipDeviceAttributeMaxTexture2DWidth:
*pi = prop.maxTexture2D[0];
break;
case hipDeviceAttributeMaxTexture2DHeight:
*pi = prop.maxTexture2D[1];
break;
case hipDeviceAttributeMaxTexture3DWidth:
*pi = prop.maxTexture3D[0];
break;
case hipDeviceAttributeMaxTexture3DHeight:
*pi = prop.maxTexture3D[1];
break;
case hipDeviceAttributeMaxTexture3DDepth:
*pi = prop.maxTexture3D[2];
break;
case hipDeviceAttributeHdpMemFlushCntl:
// The output is reinterpreted as a pointer slot: a pointer value, not an int, is written
// through pi.
// NOTE(review): the caller must therefore pass storage large enough for a pointer - confirm
// against the public API documentation.
*reinterpret_cast<unsigned int**>(pi) = prop.hdpMemFlushCntl;
break;
case hipDeviceAttributeHdpRegFlushCntl:
// Same pointer-sized write as for hipDeviceAttributeHdpMemFlushCntl.
*reinterpret_cast<unsigned int**>(pi) = prop.hdpRegFlushCntl;
break;
case hipDeviceAttributeMaxPitch:
// size_t to int casting
*pi = std::min(prop.memPitch, int32_max);
break;
case hipDeviceAttributeTextureAlignment:
*pi = prop.textureAlignment;
break;
case hipDeviceAttributeTexturePitchAlignment:
*pi = prop.texturePitchAlignment;
break;
case hipDeviceAttributeKernelExecTimeout:
*pi = prop.kernelExecTimeoutEnabled;
break;
case hipDeviceAttributeCanMapHostMemory:
*pi = prop.canMapHostMemory;
break;
case hipDeviceAttributeEccEnabled:
*pi = prop.ECCEnabled;
break;
case hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc:
*pi = prop.cooperativeMultiDeviceUnmatchedFunc;
break;
case hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim:
*pi = prop.cooperativeMultiDeviceUnmatchedGridDim;
break;
case hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim:
*pi = prop.cooperativeMultiDeviceUnmatchedBlockDim;
break;
case hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem:
*pi = prop.cooperativeMultiDeviceUnmatchedSharedMem;
break;
case hipDeviceAttributeAsicRevision:
*pi = prop.asicRevision;
break;
case hipDeviceAttributeManagedMemory:
*pi = prop.managedMemory;
break;
case hipDeviceAttributeDirectManagedMemAccessFromHost:
*pi = prop.directManagedMemAccessFromHost;
break;
case hipDeviceAttributeConcurrentManagedAccess:
*pi = prop.concurrentManagedAccess;
break;
case hipDeviceAttributePageableMemoryAccess:
*pi = prop.pageableMemoryAccess;
break;
case hipDeviceAttributePageableMemoryAccessUsesHostPageTables:
*pi = prop.pageableMemoryAccessUsesHostPageTables;
break;
case hipDeviceAttributeUnifiedAddressing:
// HIP runtime always uses SVM for host memory allocations.
// Note: Host registered memory isn't covered by this feature
// and still requires hipMemHostGetDevicePointer() call
*pi = true;
break;
case hipDeviceAttributeCanUseStreamWaitValue:
// hipStreamWaitValue64() and hipStreamWaitValue32() support
*pi = g_devices[device]->devices()[0]->info().aqlBarrierValue_;
break;
case hipDeviceAttributeImageSupport:
*pi = static_cast<int>(g_devices[device]->devices()[0]->info().imageSupport_);
break;
case hipDeviceAttributePhysicalMultiProcessorCount:
*pi = g_devices[device]->devices()[0]->info().maxPhysicalComputeUnits_;
break;
case hipDeviceAttributeFineGrainSupport:
*pi = static_cast<int>(g_devices[device]->devices()[0]->isFineGrainSupported());
break;
case hipDeviceAttributeMemoryPoolsSupported:
*pi = HIP_MEM_POOL_SUPPORT;
break;
case hipDeviceAttributeVirtualMemoryManagementSupported:
*pi = static_cast<int>(g_devices[device]->devices()[0]->info().virtualMemoryManagement_);
break;
default:
// Attribute not handled by this implementation.
HIP_RETURN(hipErrorInvalidValue);
}
HIP_RETURN(hipSuccess);
}
// Finds the device whose PCI domain, bus and device ids match a "domain:bus:device" string
// of hexadecimal fields.
//   device      - output ordinal; hipErrorInvalidValue when null.
//   pciBusIdstr - id string; hipErrorInvalidValue when null, when it does not parse into
//                 three fields, or when no device matches.
hipError_t hipDeviceGetByPCIBusId(int* device, const char*pciBusIdstr) {
  HIP_INIT_API(hipDeviceGetByPCIBusId, device, pciBusIdstr);

  if (device == nullptr || pciBusIdstr == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  int wantedDomain = -1;
  int wantedBus = -1;
  int wantedDevice = -1;

  // %x stores through unsigned int*, hence the casts on the int locals.
  const int parsedFields =
      sscanf (pciBusIdstr, "%04x:%02x:%02x", reinterpret_cast<unsigned int*>(&wantedDomain),
              reinterpret_cast<unsigned int*>(&wantedBus),
              reinterpret_cast<unsigned int*>(&wantedDevice));
  if (parsedFields != 0x3) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  int count = 0;
  HIP_RETURN_ONFAIL(ihipDeviceGetCount(&count));

  for (cl_int ordinal = 0; ordinal < count; ordinal++) {
    hipDevice_t handle;
    hipDeviceProp_t deviceProps;
    HIP_RETURN_ONFAIL(ihipDeviceGet(&handle, ordinal));
    HIP_RETURN_ONFAIL(ihipGetDeviceProperties(&deviceProps, handle));

    const bool sameBus = (wantedBus == deviceProps.pciBusID);
    const bool sameDomain = (wantedDomain == deviceProps.pciDomainID);
    const bool sameDevice = (wantedDevice == deviceProps.pciDeviceID);
    if (sameBus && sameDomain && sameDevice) {
      // First match wins.
      *device = ordinal;
      HIP_RETURN(hipSuccess);
    }
  }

  // No device carries the requested PCI ids.
  HIP_RETURN(hipErrorInvalidValue);
}
// Reports the cache configuration of the current device.
//
// The value written is always a value-initialized hipFuncCache_t; nothing is
// queried from the device. Returns hipErrorInvalidValue for a null pointer.
hipError_t hipDeviceGetCacheConfig ( hipFuncCache_t * cacheConfig ) {
  HIP_INIT_API(hipDeviceGetCacheConfig, cacheConfig);

  if (cacheConfig != nullptr) {
    *cacheConfig = hipFuncCache_t();
    HIP_RETURN(hipSuccess);
  }

  HIP_RETURN(hipErrorInvalidValue);
}
// Reads a resource limit of the current device.
//
// pValue - receives the limit value.
// limit  - which limit to query.
//
// hipLimitMallocHeapSize reports the device's totalGlobalMem property and
// hipLimitStackSize reports the device's StackSize(). A null pValue or a
// limit at/above hipLimitRange yields hipErrorInvalidValue; any other limit
// is logged and yields hipErrorUnsupportedLimit.
hipError_t hipDeviceGetLimit ( size_t* pValue, hipLimit_t limit ) {
  HIP_INIT_API(hipDeviceGetLimit, pValue, limit);

  if (pValue == nullptr || limit >= hipLimitRange) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  if (limit == hipLimitMallocHeapSize) {
    hipDeviceProp_t deviceProps;
    HIP_RETURN_ONFAIL(ihipGetDeviceProperties(&deviceProps, ihipGetDevice()));
    *pValue = deviceProps.totalGlobalMem;
  } else if (limit == hipLimitStackSize) {
    *pValue = hip::getCurrentDevice()->devices()[0]->StackSize();
  } else {
    LogPrintfError("UnsupportedLimit = %d is passed", limit);
    HIP_RETURN(hipErrorUnsupportedLimit);
  }

  HIP_RETURN(hipSuccess);
}
// Formats the PCI location of a device as "<domain>:<bus>:<device>.0" (hex).
//
// pciBusId - destination buffer.
// len      - size of the destination buffer in bytes.
// device   - device ordinal to describe.
//
// The device ordinal is validated first (hipErrorInvalidDevice), then the
// buffer (hipErrorInvalidValue when null or shorter than 13 bytes).
hipError_t hipDeviceGetPCIBusId ( char* pciBusId, int len, int device ) {
  HIP_INIT_API(hipDeviceGetPCIBusId, (void*)pciBusId, len, device);

  int deviceCount;
  HIP_RETURN_ONFAIL(ihipDeviceGetCount(&deviceCount));
  const bool ordinalInRange = (device >= 0) && (device < deviceCount);
  if (!ordinalInRange) {
    HIP_RETURN(hipErrorInvalidDevice);
  }

  // "dddd:bb:dd.0" is 12 characters, plus one byte for the NUL terminator.
  constexpr int kMinBufferLen = 13;
  if (pciBusId == nullptr || len < kMinBufferLen) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  hipDeviceProp_t props;
  HIP_RETURN_ONFAIL(ihipGetDeviceProperties(&props, device));

  snprintf (pciBusId, len, "%04x:%02x:%02x.0", props.pciDomainID, props.pciBusID,
            props.pciDeviceID);

  HIP_RETURN(hipSuccess);
}
// Reports the shared-memory bank configuration of the current device.
//
// The answer is fixed: hipSharedMemBankSizeFourByte is always written.
// Returns hipErrorInvalidValue when pConfig is null.
hipError_t hipDeviceGetSharedMemConfig ( hipSharedMemConfig * pConfig ) {
  HIP_INIT_API(hipDeviceGetSharedMemConfig, pConfig);

  if (pConfig != nullptr) {
    *pConfig = hipSharedMemBankSizeFourByte;
    HIP_RETURN(hipSuccess);
  }

  HIP_RETURN(hipErrorInvalidValue);
}
// Resets the current device by calling Reset() on the object returned by
// hip::getCurrentDevice(), then reports hipSuccess.
// NOTE(review): the current-device pointer is not null-checked here —
// presumably HIP_INIT_API guarantees one exists; confirm.
hipError_t hipDeviceReset ( void ) {
HIP_INIT_API(hipDeviceReset);
hip::getCurrentDevice()->Reset();
HIP_RETURN(hipSuccess);
}
// Stub for setting the device cache configuration.
// The cacheConfig argument is only passed to HIP_INIT_API; the call always
// returns hipErrorNotSupported.
hipError_t hipDeviceSetCacheConfig ( hipFuncCache_t cacheConfig ) {
HIP_INIT_API(hipDeviceSetCacheConfig, cacheConfig);
// No way to set cache config yet.
HIP_RETURN(hipErrorNotSupported);
}
// Changes a resource limit on the current device.
//
// limit - which limit to change.
// value - requested new value.
//
// hipLimitStackSize is forwarded to UpdateStackSize() and
// hipLimitMallocHeapSize to UpdateInitialHeapSize(); a false result from
// either becomes hipErrorInvalidValue. A limit at/above hipLimitRange is
// hipErrorInvalidValue; any other limit is logged and rejected with
// hipErrorUnsupportedLimit.
hipError_t hipDeviceSetLimit ( hipLimit_t limit, size_t value ) {
  HIP_INIT_API(hipDeviceSetLimit, limit, value);

  if (limit >= hipLimitRange) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  bool accepted = false;
  if (limit == hipLimitStackSize) {
    // The device decides whether the requested stack size is acceptable.
    accepted = hip::getCurrentDevice()->devices()[0]->UpdateStackSize(value);
  } else if (limit == hipLimitMallocHeapSize) {
    accepted = hip::getCurrentDevice()->devices()[0]->UpdateInitialHeapSize(value);
  } else {
    LogPrintfError("UnsupportedLimit = %d is passed", limit);
    HIP_RETURN(hipErrorUnsupportedLimit);
  }

  if (!accepted) {
    HIP_RETURN(hipErrorInvalidValue);
  }
  HIP_RETURN(hipSuccess);
}
// Stub for setting the shared-memory bank configuration.
// The config argument is only passed to HIP_INIT_API; the call always
// returns hipErrorNotSupported.
hipError_t hipDeviceSetSharedMemConfig ( hipSharedMemConfig config ) {
HIP_INIT_API(hipDeviceSetSharedMemConfig, config);
// No way to set the shared memory config yet.
HIP_RETURN(hipErrorNotSupported);
}
// Blocks the caller until work on the current device has finished.
//
// The null stream is finished first, then syncNonBlockingStreams() is called
// for the current device id. Returns hipErrorOutOfMemory when no null stream
// could be obtained and hipErrorStreamCaptureUnsupported while a stream
// capture is ongoing.
hipError_t hipDeviceSynchronize ( void ) {
  HIP_INIT_API(hipDeviceSynchronize);

  // Obtain the null stream before the capture check, as the original order did.
  hip::Stream* nullStream = hip::getNullStream();
  if (nullStream == nullptr) {
    HIP_RETURN(hipErrorOutOfMemory);
  }

  const bool captureActive = (hip::Stream::StreamCaptureOngoing() == true);
  if (captureActive) {
    HIP_RETURN(hipErrorStreamCaptureUnsupported);
  }

  nullStream->finish();
  hip::Stream::syncNonBlockingStreams(hip::getCurrentDevice()->deviceId());

  HIP_RETURN(hipSuccess);
}
int ihipGetDevice() {
hip::Device* device = hip::getCurrentDevice();
if(device == nullptr){
return -1;
}
return device->deviceId();
}
// Returns the ordinal of the current device through deviceId.
//
// Fails with hipErrorInvalidValue when deviceId is null and with
// hipErrorNoDevice when there is no current device.
hipError_t hipGetDevice ( int* deviceId ) {
  HIP_INIT_API(hipGetDevice, deviceId);

  if (deviceId == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  const int current = ihipGetDevice();
  if (current == -1) {
    HIP_RETURN(hipErrorNoDevice);
  }

  *deviceId = current;
  HIP_RETURN(hipSuccess);
}
// Public entry point for querying the device count.
// Forwards count to ihipDeviceGetCount() and returns its result; argument
// validation, if any, happens in that helper (not visible here).
// Uses HIP_INIT_API_NO_RETURN rather than HIP_INIT_API.
hipError_t hipGetDeviceCount ( int* count ) {
HIP_INIT_API_NO_RETURN(hipGetDeviceCount, count);
HIP_RETURN(ihipDeviceGetCount(count));
}
// Returns the flags stored on the current device through flags.
// Fails with hipErrorInvalidValue when flags is null.
hipError_t hipGetDeviceFlags ( unsigned int* flags ) {
  HIP_INIT_API(hipGetDeviceFlags, flags);

  if (flags != nullptr) {
    *flags = hip::getCurrentDevice()->getFlags();
    HIP_RETURN(hipSuccess);
  }

  HIP_RETURN(hipErrorInvalidValue);
}
// Makes the given ordinal the current device.
// Returns hipErrorInvalidDevice when the ordinal is outside g_devices.
hipError_t hipSetDevice ( int device ) {
  HIP_INIT_API(hipSetDevice, device);

  // The unsigned cast turns negative ordinals into large values, so a single
  // comparison rejects both negative and too-large inputs.
  if (static_cast<unsigned int>(device) >= g_devices.size()) {
    HIP_RETURN(hipErrorInvalidDevice);
  }

  hip::setCurrentDevice(device);
  HIP_RETURN(hipSuccess);
}
// Applies scheduling flags to the current device.
//
// flags - combination of hipDeviceSchedule* / hipDeviceMapHost /
//         hipDeviceLmemResizeToMax bits.
//
// Returns hipErrorInvalidValue when more than one scheduling mode is requested
// or when an unsupported bit is set. On success the device's active-wait mode
// is updated and only the scheduling bits of flags are stored on the device.
hipError_t hipSetDeviceFlags ( unsigned int flags ) {
  HIP_INIT_API(hipSetDeviceFlags, flags);

  constexpr uint32_t supportedFlags =
      hipDeviceScheduleMask | hipDeviceMapHost | hipDeviceLmemResizeToMax;
  constexpr uint32_t mutualExclusiveFlags =
      hipDeviceScheduleSpin | hipDeviceScheduleYield | hipDeviceScheduleBlockingSync;

  // Exactly one scheduling mode (or none, i.e. Auto) may be requested.
  const uint32_t scheduleFlag = flags & hipDeviceScheduleMask;
  const uint32_t requestedMode = scheduleFlag & mutualExclusiveFlags;
  const bool singleMode = (requestedMode == hipDeviceScheduleSpin) ||
                          (requestedMode == hipDeviceScheduleYield) ||
                          (requestedMode == hipDeviceScheduleBlockingSync) ||
                          (requestedMode == hipDeviceScheduleAuto);
  if (!singleMode) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  if ((flags & ~supportedFlags) != 0) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  amd::Device* device = hip::getCurrentDevice()->devices()[0];
  switch (scheduleFlag) {
    case hipDeviceScheduleBlockingSync:
      device->SetActiveWait(false);
      break;
    case hipDeviceScheduleSpin:
    case hipDeviceScheduleYield:
      // Both options map to the same active-wait mode because of the
      // multithreaded runtime.
      device->SetActiveWait(true);
      break;
    case hipDeviceScheduleAuto: {
      // Behavior differs from the spec because of the multithreaded runtime:
      // active wait is used unless there are at least as many devices as
      // hardware threads.
      const bool devicesCoverThreads =
          hip::host_context->devices().size() >= std::thread::hardware_concurrency();
      device->SetActiveWait(!devicesCoverThreads);
      break;
    }
    default:
      break;
  }

  hip::getCurrentDevice()->setFlags(flags & hipDeviceScheduleMask);
  HIP_RETURN(hipSuccess);
}
// Unimplemented API stub.
// The assert(0) aborts when assertions are enabled; when they are compiled
// out the call returns hipErrorNotSupported. The arguments are only passed to
// HIP_INIT_API.
hipError_t hipSetValidDevices ( int* device_arr, int len ) {
HIP_INIT_API(hipSetValidDevices, device_arr, len);
assert(0 && "Unimplemented");
HIP_RETURN(hipErrorNotSupported);
}
+209
查看文件
@@ -0,0 +1,209 @@
#!/bin/bash
# Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# Print the command-line help text to stdout. Always returns 0.
printUsage() {
    cat <<EOF

Usage: $(basename "$0") HIP_BUILD_INC_DIR HIP_INC_DIR HIP_AMD_INC_DIR LLVM_DIR [option] [RTC_LIB_OUTPUT]

Options:
 -p, --generate_pch Generate pre-compiled header (default)
 -r, --generate_rtc Generate preprocessor expansion (hiprtc_header.o)
 -h, --help Prints this help


EOF
    return 0
}
# With no arguments at all, show the usage text and exit successfully.
if [ "$1" == "" ]; then
    printUsage
    exit 0
fi

# Positional arguments 1-4 are the include directories and the LLVM root.
HIP_BUILD_INC_DIR="$1"
HIP_INC_DIR="$2"
HIP_AMD_INC_DIR="$3"
LLVM_DIR="$4"

# By default, generate pch
TARGET="generatepch"

# Argument 5, when present, selects what to generate. Every branch below
# either exits or breaks out of the loop, so at most one option is consumed.
while [ "$5" != "" ];
do
    case "$5" in
        -h | --help )
            printUsage ; exit 0 ;;
        -p | --generate_pch )
            TARGET="generatepch" ; break ;;
        -r | --generate_rtc )
            TARGET="generatertc" ; break ;;
        *)
            # Report the offending option itself ($5), not LLVM_DIR ($4).
            echo " UNEXPECTED ERROR Parm : [$5] ">&2 ; exit 20 ;;
    esac
    shift 1
done
# Allow hiprtc lib name to be set by argument 6; otherwise pick a
# platform-specific default (a .dll name under cygwin, a .so name elsewhere).
if [[ "$6" != "" ]]; then
rtc_shared_lib_out="$6"
else
if [[ "$OSTYPE" == cygwin ]]; then
rtc_shared_lib_out=hiprtc-builtins64.dll
else
rtc_shared_lib_out=libhiprtc-builtins.so
fi
fi
# Windows-like shells (cygwin or msys) keep temporary files in the current
# directory; every other platform uses /tmp.
# NOTE(review): the default library name above checks only cygwin while this
# check also accepts msys — confirm whether msys should get the .dll name too.
if [[ "$OSTYPE" == cygwin || "$OSTYPE" == msys ]]; then
isWindows=1
tmpdir=.
else
isWindows=0
tmpdir=/tmp
fi
# Write the HIP attribute macro definitions (__device__, __host__,
# __global__, __constant__, __shared__, __launch_bounds__) to a file.
# Expected first argument $1 to be output file name.
# The here-document delimiter is unquoted, so the shell processes the text
# before writing it: each backslash-newline pair is removed, which places
# every multi-line #define on a single line in the output file.
create_hip_macro_file() {
cat >$1 <<EOF
#define __device__ __attribute__((device))
#define __host__ __attribute__((host))
#define __global__ __attribute__((global))
#define __constant__ __attribute__((constant))
#define __shared__ __attribute__((shared))
#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
#define select_impl_(_1, _2, impl_, ...) impl_
#define __launch_bounds__(...) \
select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
EOF
}
# Build hip_pch.o in the current directory: an object file that embeds two
# precompiled headers, hip_wave32.pch (preprocessed with
# --cuda-gpu-arch=gfx1030) and hip_wave64.pch (preprocessed without an
# explicit arch).
#
# Reads the globals tmpdir, LLVM_DIR, HIP_INC_DIR, HIP_BUILD_INC_DIR and
# HIP_AMD_INC_DIR. Work files live in a per-process directory
# ($tmpdir/hip_pch.<pid>), which is removed only when every step of the &&
# chain succeeds. `set -x` is enabled before the compile steps and is not
# switched off again inside this function.
generate_pch() {
tmp=$tmpdir/hip_pch.$$
mkdir -p $tmp
create_hip_macro_file $tmp/hip_macros.h
# Header that gets preprocessed and then turned into the PCH.
cat >$tmp/hip_pch.h <<EOF
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
EOF
# Assembly stub for llvm-mc. $tmp is expanded by the shell inside this
# here-document, so the .incbin paths point into the temporary directory.
cat >$tmp/hip_pch.mcin <<EOF
.type __hip_pch_wave32,@object
.section .hip_pch_wave32,"aMS",@progbits,1
.data
.globl __hip_pch_wave32
.globl __hip_pch_wave32_size
.p2align 3
__hip_pch_wave32:
.incbin "$tmp/hip_wave32.pch"
__hip_pch_wave32_size:
.long __hip_pch_wave32_size - __hip_pch_wave32
.type __hip_pch_wave64,@object
.section .hip_pch_wave64,"aMS",@progbits,1
.data
.globl __hip_pch_wave64
.globl __hip_pch_wave64_size
.p2align 3
__hip_pch_wave64:
.incbin "$tmp/hip_wave64.pch"
__hip_pch_wave64_size:
.long __hip_pch_wave64_size - __hip_pch_wave64
EOF
set -x
# Each variant runs three steps: preprocess the header, append the macro
# definitions to the preprocessed text, then emit the PCH from that text.
# For gfx10/Navi devices
$LLVM_DIR/bin/clang -O3 --rocm-path=$HIP_INC_DIR/.. -std=c++17 -nogpulib -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only --cuda-gpu-arch=gfx1030 -x hip $tmp/hip_pch.h -E >$tmp/pch_wave32.cui &&
cat $tmp/hip_macros.h >> $tmp/pch_wave32.cui &&
$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave32.pch -x hip-cpp-output - <$tmp/pch_wave32.cui &&
# For other devices
$LLVM_DIR/bin/clang -O3 --rocm-path=$HIP_INC_DIR/.. -std=c++17 -nogpulib -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch_wave64.cui &&
cat $tmp/hip_macros.h >> $tmp/pch_wave64.cui &&
$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave64.pch -x hip-cpp-output - <$tmp/pch_wave64.cui &&
$LLVM_DIR/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj &&
rm -rf $tmp
}
# Build the hiprtc builtins shared library ($rtc_shared_lib_out), which
# embeds the preprocessed HIP runtime headers under the symbols
# __hipRTC_header and __hipRTC_header_size.
#
# Reads the globals tmpdir, isWindows, rtc_shared_lib_out, LLVM_DIR,
# HIP_INC_DIR, HIP_BUILD_INC_DIR and HIP_AMD_INC_DIR. Work files live in a
# per-process directory ($tmpdir/hip_rtc.<pid>), which is removed only when
# every step of the && chain succeeds. `set -x` is enabled before the compile
# steps and is not switched off again inside this function.
generate_rtc_header() {
tmp=$tmpdir/hip_rtc.$$
mkdir -p $tmp
local macroFile="$tmp/hip_macros.h"
local headerFile="$tmp/hipRTC_header.h"
local mcinFile="$tmp/hipRTC_header.mcin"
create_hip_macro_file $macroFile
# Header to preprocess. CHAR_BIT and INT_MAX are temporarily defined from
# compiler builtins around the includes and restored afterwards.
cat >$headerFile <<EOF
#pragma push_macro("CHAR_BIT")
#pragma push_macro("INT_MAX")
#define CHAR_BIT __CHAR_BIT__
#define INT_MAX __INTMAX_MAX__
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#pragma pop_macro("CHAR_BIT")
#pragma pop_macro("INT_MAX")
EOF
echo "// Automatically generated script for HIP RTC." > $mcinFile
# The .type directives are emitted only on non-Windows hosts.
if [[ $isWindows -eq 0 ]]; then
echo " .type __hipRTC_header,@object" >> $mcinFile
echo " .type __hipRTC_header_size,@object" >> $mcinFile
fi
# $tmp is expanded by the shell inside this here-document, so .incbin points
# at the preprocessed header written below.
cat >>$mcinFile <<EOF
.section .hipRTC_header,"a"
.globl __hipRTC_header
.globl __hipRTC_header_size
.p2align 3
__hipRTC_header:
.incbin "$tmp/hiprtc"
__hipRTC_header_size:
.long __hipRTC_header_size - __hipRTC_header
EOF
set -x
# Steps: preprocess the header, append the macro definitions, assemble the
# stub into an object, link it as a shared library, and finally compile the
# preprocessed text to bitcode. The bitcode ($tmp/tmp.bc) is not used
# afterwards; presumably that last step is a sanity check — confirm.
$LLVM_DIR/bin/clang -O3 --rocm-path=$HIP_INC_DIR/.. -std=c++14 -nogpulib --hip-version=4.4 -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only -D__HIPCC_RTC__ -x hip $tmp/hipRTC_header.h -E -o $tmp/hiprtc &&
cat $macroFile >> $tmp/hiprtc &&
$LLVM_DIR/bin/llvm-mc -o $tmp/hiprtc_header.o $tmp/hipRTC_header.mcin --filetype=obj &&
$LLVM_DIR/bin/clang $tmp/hiprtc_header.o -o $rtc_shared_lib_out -shared &&
$LLVM_DIR/bin/clang -O3 --rocm-path=$HIP_INC_DIR/.. -std=c++14 -nogpulib -nogpuinc -emit-llvm -c -o $tmp/tmp.bc --cuda-device-only -D__HIPCC_RTC__ --offload-arch=gfx906 -x hip-cpp-output $tmp/hiprtc &&
rm -rf $tmp
}
# Run the generator selected while parsing the command line.
case $TARGET in
    (generatertc) generate_rtc_header ;;
    (generatepch) generate_pch ;;
    # No `die` helper exists in this script, so report the problem directly
    # on stderr and exit with a failure status.
    (*) echo "Invalid target $TARGET" >&2 ; exit 1 ;;
esac
+382
查看文件
@@ -0,0 +1,382 @@
/* Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include <hip/hip_runtime.h>
#include "hip_internal.hpp"
// Returns the calling thread's last recorded error and resets it to
// hipSuccess.
hipError_t hipGetLastError()
{
  HIP_INIT_API(hipGetLastError);

  // A plain `return` is used instead of HIP_RETURN so the freshly cleared
  // thread-local value is left as hipSuccess.
  const hipError_t previous = hip::tls.last_error_;
  hip::tls.last_error_ = hipSuccess;
  return previous;
}
// Returns the calling thread's last recorded error (hip::tls.last_error_).
// Unlike hipGetLastError() this function does not assign hipSuccess to the
// stored value; the value is handed to HIP_RETURN.
hipError_t hipPeekAtLastError()
{
HIP_INIT_API(hipPeekAtLastError);
hipError_t err = hip::tls.last_error_;
HIP_RETURN(err);
}
// Maps a hipError_t value to the spelling of its enumerator.
//
// Every handled enumerator returns its own identifier as a string literal;
// any value without a case returns "hipErrorUnknown".
const char *ihipGetErrorName(hipError_t hip_error)
{
// Expands to a case label returning the enumerator's name as text.
#define HIP_ERROR_NAME_CASE(code) \
  case code:                      \
    return #code

  switch (hip_error) {
    HIP_ERROR_NAME_CASE(hipSuccess);
    HIP_ERROR_NAME_CASE(hipErrorInvalidValue);
    HIP_ERROR_NAME_CASE(hipErrorOutOfMemory);
    HIP_ERROR_NAME_CASE(hipErrorNotInitialized);
    HIP_ERROR_NAME_CASE(hipErrorDeinitialized);
    HIP_ERROR_NAME_CASE(hipErrorProfilerDisabled);
    HIP_ERROR_NAME_CASE(hipErrorProfilerNotInitialized);
    HIP_ERROR_NAME_CASE(hipErrorProfilerAlreadyStarted);
    HIP_ERROR_NAME_CASE(hipErrorProfilerAlreadyStopped);
    HIP_ERROR_NAME_CASE(hipErrorInvalidConfiguration);
    HIP_ERROR_NAME_CASE(hipErrorInvalidSymbol);
    HIP_ERROR_NAME_CASE(hipErrorInvalidDevicePointer);
    HIP_ERROR_NAME_CASE(hipErrorInvalidMemcpyDirection);
    HIP_ERROR_NAME_CASE(hipErrorInsufficientDriver);
    HIP_ERROR_NAME_CASE(hipErrorMissingConfiguration);
    HIP_ERROR_NAME_CASE(hipErrorPriorLaunchFailure);
    HIP_ERROR_NAME_CASE(hipErrorInvalidDeviceFunction);
    HIP_ERROR_NAME_CASE(hipErrorNoDevice);
    HIP_ERROR_NAME_CASE(hipErrorInvalidDevice);
    HIP_ERROR_NAME_CASE(hipErrorInvalidPitchValue);
    HIP_ERROR_NAME_CASE(hipErrorInvalidImage);
    HIP_ERROR_NAME_CASE(hipErrorInvalidContext);
    HIP_ERROR_NAME_CASE(hipErrorContextAlreadyCurrent);
    HIP_ERROR_NAME_CASE(hipErrorMapFailed);
    HIP_ERROR_NAME_CASE(hipErrorUnmapFailed);
    HIP_ERROR_NAME_CASE(hipErrorArrayIsMapped);
    HIP_ERROR_NAME_CASE(hipErrorAlreadyMapped);
    HIP_ERROR_NAME_CASE(hipErrorNoBinaryForGpu);
    HIP_ERROR_NAME_CASE(hipErrorAlreadyAcquired);
    HIP_ERROR_NAME_CASE(hipErrorNotMapped);
    HIP_ERROR_NAME_CASE(hipErrorNotMappedAsArray);
    HIP_ERROR_NAME_CASE(hipErrorNotMappedAsPointer);
    HIP_ERROR_NAME_CASE(hipErrorECCNotCorrectable);
    HIP_ERROR_NAME_CASE(hipErrorUnsupportedLimit);
    HIP_ERROR_NAME_CASE(hipErrorContextAlreadyInUse);
    HIP_ERROR_NAME_CASE(hipErrorPeerAccessUnsupported);
    HIP_ERROR_NAME_CASE(hipErrorInvalidKernelFile);
    HIP_ERROR_NAME_CASE(hipErrorInvalidGraphicsContext);
    HIP_ERROR_NAME_CASE(hipErrorInvalidSource);
    HIP_ERROR_NAME_CASE(hipErrorFileNotFound);
    HIP_ERROR_NAME_CASE(hipErrorSharedObjectSymbolNotFound);
    HIP_ERROR_NAME_CASE(hipErrorSharedObjectInitFailed);
    HIP_ERROR_NAME_CASE(hipErrorOperatingSystem);
    HIP_ERROR_NAME_CASE(hipErrorInvalidHandle);
    HIP_ERROR_NAME_CASE(hipErrorIllegalState);
    HIP_ERROR_NAME_CASE(hipErrorNotFound);
    HIP_ERROR_NAME_CASE(hipErrorNotReady);
    HIP_ERROR_NAME_CASE(hipErrorIllegalAddress);
    HIP_ERROR_NAME_CASE(hipErrorLaunchOutOfResources);
    HIP_ERROR_NAME_CASE(hipErrorLaunchTimeOut);
    HIP_ERROR_NAME_CASE(hipErrorPeerAccessAlreadyEnabled);
    HIP_ERROR_NAME_CASE(hipErrorPeerAccessNotEnabled);
    HIP_ERROR_NAME_CASE(hipErrorSetOnActiveProcess);
    HIP_ERROR_NAME_CASE(hipErrorContextIsDestroyed);
    HIP_ERROR_NAME_CASE(hipErrorAssert);
    HIP_ERROR_NAME_CASE(hipErrorHostMemoryAlreadyRegistered);
    HIP_ERROR_NAME_CASE(hipErrorHostMemoryNotRegistered);
    HIP_ERROR_NAME_CASE(hipErrorLaunchFailure);
    HIP_ERROR_NAME_CASE(hipErrorNotSupported);
    HIP_ERROR_NAME_CASE(hipErrorUnknown);
    HIP_ERROR_NAME_CASE(hipErrorRuntimeMemory);
    HIP_ERROR_NAME_CASE(hipErrorRuntimeOther);
    HIP_ERROR_NAME_CASE(hipErrorCooperativeLaunchTooLarge);
    HIP_ERROR_NAME_CASE(hipErrorStreamCaptureUnsupported);
    HIP_ERROR_NAME_CASE(hipErrorStreamCaptureInvalidated);
    HIP_ERROR_NAME_CASE(hipErrorStreamCaptureMerge);
    HIP_ERROR_NAME_CASE(hipErrorStreamCaptureUnmatched);
    HIP_ERROR_NAME_CASE(hipErrorStreamCaptureUnjoined);
    HIP_ERROR_NAME_CASE(hipErrorStreamCaptureIsolation);
    HIP_ERROR_NAME_CASE(hipErrorStreamCaptureImplicit);
    HIP_ERROR_NAME_CASE(hipErrorCapturedEvent);
    HIP_ERROR_NAME_CASE(hipErrorStreamCaptureWrongThread);
    HIP_ERROR_NAME_CASE(hipErrorGraphExecUpdateFailure);
    HIP_ERROR_NAME_CASE(hipErrorTbd);
    default:
      // Values without a dedicated case share the "unknown" name.
      return "hipErrorUnknown";
  }

#undef HIP_ERROR_NAME_CASE
}
// Maps a hipError_t value to a human-readable description.
//
// hipErrorUnknown and every value without a dedicated case return
// "unknown error".
const char *ihipGetErrorString(hipError_t hip_error) {
  // clang-format off
  switch (hip_error) {
    case hipSuccess:                          return "no error";
    case hipErrorInvalidValue:                return "invalid argument";
    case hipErrorOutOfMemory:                 return "out of memory";
    case hipErrorNotInitialized:              return "initialization error";
    case hipErrorDeinitialized:               return "driver shutting down";
    case hipErrorProfilerDisabled:            return "profiler disabled while using external profiling tool";
    case hipErrorProfilerNotInitialized:      return "profiler is not initialized";
    case hipErrorProfilerAlreadyStarted:      return "profiler already started";
    case hipErrorProfilerAlreadyStopped:      return "profiler already stopped";
    case hipErrorInvalidConfiguration:        return "invalid configuration argument";
    case hipErrorInvalidPitchValue:           return "invalid pitch argument";
    case hipErrorInvalidSymbol:               return "invalid device symbol";
    case hipErrorInvalidDevicePointer:        return "invalid device pointer";
    case hipErrorInvalidMemcpyDirection:      return "invalid copy direction for memcpy";
    case hipErrorInsufficientDriver:          return "driver version is insufficient for runtime version";
    case hipErrorMissingConfiguration:        return "__global__ function call is not configured";
    case hipErrorPriorLaunchFailure:          return "unspecified launch failure in prior launch";
    case hipErrorInvalidDeviceFunction:       return "invalid device function";
    case hipErrorNoDevice:                    return "no ROCm-capable device is detected";
    case hipErrorInvalidDevice:               return "invalid device ordinal";
    case hipErrorInvalidImage:                return "device kernel image is invalid";
    case hipErrorInvalidContext:              return "invalid device context";
    case hipErrorContextAlreadyCurrent:       return "context is already current context";
    case hipErrorMapFailed:                   return "mapping of buffer object failed";
    case hipErrorUnmapFailed:                 return "unmapping of buffer object failed";
    case hipErrorArrayIsMapped:               return "array is mapped";
    case hipErrorAlreadyMapped:               return "resource already mapped";
    case hipErrorNoBinaryForGpu:              return "no kernel image is available for execution on the device";
    case hipErrorAlreadyAcquired:             return "resource already acquired";
    case hipErrorNotMapped:                   return "resource not mapped";
    case hipErrorNotMappedAsArray:            return "resource not mapped as array";
    case hipErrorNotMappedAsPointer:          return "resource not mapped as pointer";
    case hipErrorECCNotCorrectable:           return "uncorrectable ECC error encountered";
    case hipErrorUnsupportedLimit:            return "limit is not supported on this architecture";
    case hipErrorContextAlreadyInUse:         return "exclusive-thread device already in use by a different thread";
    case hipErrorPeerAccessUnsupported:       return "peer access is not supported between these two devices";
    case hipErrorInvalidKernelFile:           return "invalid kernel file";
    case hipErrorInvalidGraphicsContext:      return "invalid OpenGL or DirectX context";
    case hipErrorInvalidSource:               return "device kernel image is invalid";
    case hipErrorFileNotFound:                return "file not found";
    case hipErrorSharedObjectSymbolNotFound:  return "shared object symbol not found";
    case hipErrorSharedObjectInitFailed:      return "shared object initialization failed";
    case hipErrorOperatingSystem:             return "OS call failed or operation not supported on this OS";
    case hipErrorInvalidHandle:               return "invalid resource handle";
    case hipErrorIllegalState:                return "the operation cannot be performed in the present state";
    case hipErrorNotFound:                    return "named symbol not found";
    case hipErrorNotReady:                    return "device not ready";
    case hipErrorIllegalAddress:              return "an illegal memory access was encountered";
    case hipErrorLaunchOutOfResources:        return "too many resources requested for launch";
    case hipErrorLaunchTimeOut:               return "the launch timed out and was terminated";
    case hipErrorPeerAccessAlreadyEnabled:    return "peer access is already enabled";
    case hipErrorPeerAccessNotEnabled:        return "peer access has not been enabled";
    case hipErrorSetOnActiveProcess:          return "cannot set while device is active in this process";
    case hipErrorContextIsDestroyed:          return "context is destroyed";
    case hipErrorAssert:                      return "device-side assert triggered";
    case hipErrorHostMemoryAlreadyRegistered: return "part or all of the requested memory range is already mapped";
    case hipErrorHostMemoryNotRegistered:     return "pointer does not correspond to a registered memory region";
    case hipErrorLaunchFailure:               return "unspecified launch failure";
    case hipErrorCooperativeLaunchTooLarge:   return "too many blocks in cooperative launch";
    case hipErrorNotSupported:                return "operation not supported";
    case hipErrorStreamCaptureUnsupported:    return "operation not permitted when stream is capturing";
    case hipErrorStreamCaptureInvalidated:    return "operation failed due to a previous error during capture";
    case hipErrorStreamCaptureMerge:          return "operation would result in a merge of separate capture sequences";
    case hipErrorStreamCaptureUnmatched:      return "capture was not ended in the same stream as it began";
    case hipErrorStreamCaptureUnjoined:       return "capturing stream has unjoined work";
    case hipErrorStreamCaptureIsolation:      return "dependency created on uncaptured work in another stream";
    case hipErrorStreamCaptureImplicit:       return "operation would make the legacy stream depend on a capturing blocking stream";
    case hipErrorCapturedEvent:               return "operation not permitted on an event last recorded in a capturing stream";
    case hipErrorStreamCaptureWrongThread:    return "attempt to terminate a thread-local capture sequence from another thread";
    case hipErrorGraphExecUpdateFailure:      return "the graph update was not performed because it included changes which violated constraints specific to instantiated graph update";
    case hipErrorRuntimeMemory:               return "runtime memory call returned error";
    case hipErrorRuntimeOther:                return "runtime call other than memory returned error";
    case hipErrorUnknown:
    default:                                  return "unknown error";
  }
  // clang-format on
}
// Public wrapper: returns whatever ihipGetErrorName() returns for hip_error.
const char* hipGetErrorName(hipError_t hip_error)
{
return ihipGetErrorName(hip_error);
}
// Public wrapper: returns whatever ihipGetErrorString() returns for hip_error.
const char *hipGetErrorString(hipError_t hip_error)
{
return ihipGetErrorString(hip_error);
}
// Driver-style variant of hipGetErrorName().
//
// hip_error - error code to describe.
// errStr    - receives a pointer to the enumerator name.
//
// Returns hipErrorInvalidValue when errStr is null, or when the code is not
// hipErrorUnknown itself yet only resolves to the "hipErrorUnknown" fallback
// name (i.e. the code is unrecognized). *errStr is written in the latter case
// as well.
hipError_t hipDrvGetErrorName(hipError_t hip_error, const char** errStr)
{
  if (errStr == nullptr) {
    return hipErrorInvalidValue;
  }

  *errStr = ihipGetErrorName(hip_error);

  const bool unrecognized =
      (hip_error != hipErrorUnknown) && (strcmp(*errStr, "hipErrorUnknown") == 0);
  return unrecognized ? hipErrorInvalidValue : hipSuccess;
}
// Driver-style variant of hipGetErrorString().
//
// hip_error - error code to describe.
// errStr    - receives a pointer to the description text.
//
// Returns hipErrorInvalidValue when errStr is null, or when the code is not
// hipErrorUnknown itself yet only resolves to the "unknown error" fallback
// text. *errStr is written in the latter case as well.
hipError_t hipDrvGetErrorString(hipError_t hip_error, const char** errStr)
{
  if (errStr == nullptr) {
    return hipErrorInvalidValue;
  }

  *errStr = ihipGetErrorString(hip_error);

  const bool unrecognized =
      (hip_error != hipErrorUnknown) && (strcmp(*errStr, "unknown error") == 0);
  return unrecognized ? hipErrorInvalidValue : hipSuccess;
}
+429
查看文件
@@ -0,0 +1,429 @@
/* Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include <hip/hip_runtime.h>
#include "hip_event.hpp"
#if !defined(_MSC_VER)
#include <unistd.h>
#endif
namespace hip {
// Monitor intended to serialize access to the global event set below.
static amd::Monitor eventSetLock{"Guards global event set"};
// File-local set of hipEvent_t handles.
// NOTE(review): presumably the set of currently live events used for handle
// validation — confirm against the functions that insert and erase.
static std::unordered_set<hipEvent_t> eventSet;
// Reports whether the recorded event has completed.
//
// If the event's status is not yet CL_COMPLETE its command queue is notified
// first. The HW status is consulted next (not all ROCclr modes support it),
// and the event's own status is the fallback when HW status says "not ready".
bool Event::ready(eventType type) {
  const bool pendingOnHost = (event_->status() != CL_COMPLETE);
  if (pendingOnHost) {
    event_->notifyCmdQueue();
  }

  return CheckHwEvent(type) || (event_->status() == CL_COMPLETE);
}
// Reports whether the recorded event has completed.
//
// The HW status is checked first (not all ROCclr modes support it); the
// event's own status is consulted only when that check says "not ready".
bool EventDD::ready(eventType type) {
  if (CheckHwEvent(type)) {
    return true;
  }

  // FIXME: Remove status check entirely
  return event_->status() == CL_COMPLETE;
}
// Non-blocking completion check.
//
// Returns hipSuccess when the event was never recorded (event_ is null) or
// has completed, and hipErrorNotReady otherwise. Holds the event lock for the
// duration of the call.
hipError_t Event::query() {
  amd::ScopedLock lock(lock_);

  if (event_ != nullptr && !ready(Query)) {
    return hipErrorNotReady;
  }
  return hipSuccess;
}
// Blocks until the recorded event has completed.
//
// Returns hipSuccess when the event was never recorded (event_ is null) or
// once the wait has finished. If creating the helper command fails, the
// failure status is returned instead of dereferencing a null command.
// Holds the event lock for the duration of the call.
hipError_t Event::synchronize() {
  amd::ScopedLock lock(lock_);

  // If event is not recorded, event_ is null, hence return hipSuccess
  if (event_ == nullptr) {
    return hipSuccess;
  }

  // Check HW status of the ROCclr event. Note: not all ROCclr modes support HW status
  static constexpr bool kWaitCompletion = true;
  if (g_devices[deviceId()]->devices()[0]->IsHwEventReady(*event_, kWaitCompletion)) {
    return hipSuccess;
  }

  if (event_->HwEvent() == nullptr) {
    // No HW event is attached, so wait on the event itself.
    event_->awaitCompletion();
    return hipSuccess;
  }

  // A HW event exists but is not ready: enqueue a new record command on the
  // same queue and wait for that command's HW event.
  amd::Command* command = nullptr;
  hipError_t status = recordCommand(command, event_->command().queue(), flags);
  if (status != hipSuccess) {
    return status;
  }
  if (command == nullptr) {
    return hipErrorOutOfMemory;
  }

  command->enqueue();
  g_devices[deviceId()]->devices()[0]->IsHwEventReady(command->event(), kWaitCompletion);
  command->release();
  return hipSuccess;
}
// Waits on the recorded event via awaitCompletion() and returns its result.
// event_ is dereferenced without a null check, so the caller must ensure the
// event has been recorded.
bool Event::awaitEventCompletion() {
return event_->awaitCompletion();
}
// Asks the device whether the HW event is ready, passing `true` as the second
// argument (the same argument Event::synchronize() names kWaitCompletion).
// event_ is dereferenced without a null check, so the caller must ensure the
// event has been recorded.
bool EventDD::awaitEventCompletion() {
return g_devices[deviceId()]->devices()[0]->IsHwEventReady(*event_, true);
}
// Computes the time between this event (the start) and eStop, storing the
// result in ms.
//
// Returns:
//   hipErrorInvalidHandle - either event was never recorded (event_ is null)
//                           or either was created with hipEventDisableTiming.
//   hipErrorNotReady      - either event has not completed yet.
//   hipSuccess            - ms holds the result.
//
// Timestamps are divided by 1000000.f; presumably they are nanoseconds, which
// would make the result milliseconds — confirm against profilingInfo().
hipError_t Event::elapsedTime(Event& eStop, float& ms) {
amd::ScopedLock startLock(lock_);
// Same object passed as start and stop: validate it, report zero, and avoid
// taking a second lock on it below.
if (this == &eStop) {
ms = 0.f;
if (event_ == nullptr) {
return hipErrorInvalidHandle;
}
if (flags & hipEventDisableTiming) {
return hipErrorInvalidHandle;
}
if (!ready(ElapsedTime)) {
return hipErrorNotReady;
}
return hipSuccess;
}
// Distinct objects: the stop event's lock is taken after this event's lock.
amd::ScopedLock stopLock(eStop.lock());
if (event_ == nullptr || eStop.event() == nullptr) {
return hipErrorInvalidHandle;
}
if ((flags | eStop.flags) & hipEventDisableTiming) {
return hipErrorInvalidHandle;
}
if (!ready(ElapsedTime) || !eStop.ready(ElapsedTime)) {
return hipErrorNotReady;
}
if (event_ == eStop.event_) {
// Events are the same, which indicates the stream is empty and likely
// eventRecord is called on another stream. For such cases insert and measure a
// marker.
amd::Command* command = new amd::Marker(*event_->command().queue(), kMarkerDisableFlush);
command->enqueue();
command->awaitCompletion();
// Elapsed time runs from this event's end timestamp to the marker's end.
ms = static_cast<float>(static_cast<int64_t>(command->event().profilingInfo().end_) - time(false)) /
1000000.f;
command->release();
} else {
// Note: with direct dispatch eStop.ready() relies on HW event, but CPU status can be delayed.
// Hence for now make sure CPU status is updated by calling awaitCompletion();
awaitEventCompletion();
eStop.awaitEventCompletion();
if (unrecorded_ && eStop.isUnRecorded()) {
// Both the events are not recorded, just need the end and start of stop event
ms = static_cast<float>(eStop.time(false) - eStop.time(true)) / 1000000.f;
} else {
// Normal case: stop event's end timestamp minus this event's end timestamp.
ms = static_cast<float>(eStop.time(false) - time(false)) / 1000000.f;
}
}
return hipSuccess;
}
// Returns a profiling timestamp of the recorded event.
//
// getStartTs - true selects the start timestamp, false the end timestamp.
//
// The event must have been recorded (asserted).
int64_t Event::time(bool getStartTs) const {
  assert(event_ != nullptr);

  const auto& profiling = event_->profilingInfo();
  if (getStartTs) {
    return static_cast<int64_t>(profiling.start_);
  }
  return static_cast<int64_t>(profiling.end_);
}
// Returns a timestamp of the recorded event, preferring the HW event time.
//
// getStartTs - true selects the start timestamp, false the end timestamp.
//
// When the device reports a zero start or end time the base-class
// implementation (profiling info) is used instead. The event must have been
// recorded (asserted).
int64_t EventDD::time(bool getStartTs) const {
  uint64_t hwStart = 0;
  uint64_t hwEnd = 0;
  assert(event_ != nullptr);
  g_devices[deviceId()]->devices()[0]->getHwEventTime(*event_, &hwStart, &hwEnd);

  const bool hwTimesValid = (hwStart != 0) && (hwEnd != 0);
  if (!hwTimesValid) {
    // FIXME: This is only needed if the command had to wait CL_COMPLETE status
    return Event::time(getStartTs);
  }

  return static_cast<int64_t>(getStartTs ? hwStart : hwEnd);
}
// Creates the marker command that makes `stream` wait for this event.
//
// command - receives the newly created marker.
// stream  - stream the marker is created on.
//
// The marker's wait list contains event_ when one has been recorded and is
// empty otherwise. Returns hipErrorOutOfMemory when the marker pointer is
// null.
hipError_t Event::streamWaitCommand(amd::Command*& command, hip::Stream* stream) {
  amd::Command::EventWaitList dependencies;
  if (event_ != nullptr) {
    dependencies.push_back(event_);
  }

  command = new amd::Marker(*stream, kMarkerDisableFlush, dependencies);
  return (command == nullptr) ? hipErrorOutOfMemory : hipSuccess;
}
// Enqueues an already-built stream-wait command and returns hipSuccess.
// The stream parameter is not used by this implementation.
hipError_t Event::enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command) {
command->enqueue();
return hipSuccess;
}
// Makes `stream` wait until this event has completed.
//
// Nothing is enqueued when the event was never recorded, was recorded on the
// same stream, or is already complete. Returns hipErrorLaunchOutOfResources
// when the event's queue cannot be notified, and otherwise the status of
// building/enqueueing the wait command. The flags parameter is not used here.
hipError_t Event::streamWait(hipStream_t stream, uint flags) {
  hip::Stream* waitingStream = hip::getStream(stream);

  // Access to event_ object must be lock protected
  amd::ScopedLock lock(lock_);

  const bool nothingToWaitFor = (event_ == nullptr) ||
                                (event_->command().queue() == waitingStream) ||
                                ready(StreamWait);
  if (nothingToWaitFor) {
    return hipSuccess;
  }

  if (!event_->notifyCmdQueue()) {
    return hipErrorLaunchOutOfResources;
  }

  amd::Command* waitCommand = nullptr;
  hipError_t result = streamWaitCommand(waitCommand, waitingStream);
  if (result == hipSuccess) {
    result = enqueueStreamWaitCommand(stream, waitCommand);
  }
  if (result != hipSuccess) {
    return result;
  }

  waitCommand->release();
  return hipSuccess;
}
// Builds the command used to record this event on `stream` when the caller did not supply one.
// `ext_flags`, when non-zero, overrides the flags the event was created with for the purpose of
// selecting the release scope. A caller-provided `command` is left untouched.
hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream,
                                uint32_t ext_flags) {
  if (command != nullptr) {
    return hipSuccess;
  }

  const auto effectiveFlags = (ext_flags == 0) ? flags : ext_flags;
  int32_t scope = effectiveFlags & (hipEventReleaseToDevice | hipEventReleaseToSystem |
                                    hipEventDisableSystemFence);
  if (scope & hipEventDisableSystemFence) {
    scope = amd::Device::kCacheStateIgnore;
  } else {
    scope = amd::Device::kCacheStateInvalid;
  }

  // Always submit a EventMarker.
  command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, scope);
  return hipSuccess;
}
// Enqueues `command` and binds its event to this hip event.
// If the command's event is already the bound one, nothing else changes; otherwise the previous
// event (if any) is released and `unrecorded_` is updated from `record`.
hipError_t Event::enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record) {
  command->enqueue();

  if (event_ != &command->event()) {
    if (event_ != nullptr) {
      event_->release();
    }
    event_ = &command->event();
    unrecorded_ = !record;
  }
  return hipSuccess;
}
// Records this event on `stream`.
//
// command - optional pre-built command to bind; when nullptr, recordCommand() creates one.
// record  - true when called on behalf of hipEventRecord; false marks the event as "unrecorded".
//
// Returns the first failure reported by recordCommand() or enqueueRecordCommand().
hipError_t Event::addMarker(hipStream_t stream, amd::Command* command, bool record) {
  hip::Stream* hip_stream = hip::getStream(stream);
  // Keep the lock always at the beginning of this to avoid a race. SWDEV-277847
  amd::ScopedLock lock(lock_);
  hipError_t status = recordCommand(command, hip_stream);
  if (status != hipSuccess) {
    // Propagate the failure: nothing was enqueued, so reporting success would make the caller
    // believe the event has been recorded.
    return status;
  }
  return enqueueRecordCommand(stream, command, record);
}
// ================================================================================================
// Tells whether `event` is a handle this runtime knows about.
// A null handle is considered valid; any other handle must be present in eventSet.
bool isValid(hipEvent_t event) {
  // NULL event is always valid
  if (event == nullptr) {
    return true;
  }
  amd::ScopedLock lock(eventSetLock);
  return eventSet.find(event) != eventSet.end();
}
} // namespace hip
// ================================================================================================
// Internal event factory shared by the public create/open entry points.
// Validates `flags`, instantiates the matching hip::Event flavour, stores the handle in `*event`
// and registers it in hip::eventSet. `event` must be non-null (callers check).
hipError_t ihipEventCreateWithFlags(hipEvent_t* event, unsigned flags) {
  const unsigned releaseFlags =
      (hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventDisableSystemFence);
  const unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming |
                                  hipEventReleaseToDevice | hipEventReleaseToSystem |
                                  hipEventInterprocess | hipEventDisableSystemFence;

  // Counts the set bits by repeatedly clearing the lowest one.
  auto countSetBits = [](unsigned int bits) {
    unsigned int count = 0;
    while (bits != 0) {
      bits &= bits - 1;
      ++count;
    }
    return count;
  };

  // can't set any unsupported flags.
  if ((flags & ~supportedFlags) != 0) {
    return hipErrorInvalidValue;
  }
  // can set only one of the release flags.
  if (countSetBits(flags & releaseFlags) > 1) {
    return hipErrorInvalidValue;
  }
  // if hipEventInterprocess flag is set, then hipEventDisableTiming flag also must be set
  if ((flags & hipEventInterprocess) && !(flags & hipEventDisableTiming)) {
    return hipErrorInvalidValue;
  }

  hip::Event* created = nullptr;
  if (flags & hipEventInterprocess) {
    created = new hip::IPCEvent();
  } else if (AMD_DIRECT_DISPATCH) {
    created = new hip::EventDD(flags);
  } else {
    created = new hip::Event(flags);
  }
  if (created == nullptr) {
    return hipErrorOutOfMemory;
  }

  *event = reinterpret_cast<hipEvent_t>(created);
  amd::ScopedLock lock(hip::eventSetLock);
  hip::eventSet.insert(*event);
  return hipSuccess;
}
// Public API: creates an event with the given creation flags and returns its handle in `*event`.
// Returns hipErrorInvalidValue when `event` is null or the flag combination is rejected by
// ihipEventCreateWithFlags().
hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned flags) {
  HIP_INIT_API(hipEventCreateWithFlags, event, flags);
  if (event == nullptr) {
    // Leave through HIP_RETURN like every other exit of an initialized API call, so the error
    // goes through the same reporting path as the remaining entry points in this file.
    HIP_RETURN(hipErrorInvalidValue);
  }
  HIP_RETURN(ihipEventCreateWithFlags(event, flags), *event);
}
// Public API: creates an event with default flags (0) and returns its handle in `*event`.
// Returns hipErrorInvalidValue when `event` is null.
hipError_t hipEventCreate(hipEvent_t* event) {
  HIP_INIT_API(hipEventCreate, event);
  if (event == nullptr) {
    // Leave through HIP_RETURN like every other exit of an initialized API call, so the error
    // goes through the same reporting path as the remaining entry points in this file.
    HIP_RETURN(hipErrorInvalidValue);
  }
  HIP_RETURN(ihipEventCreateWithFlags(event, 0), *event);
}
// Public API: destroys an event created by hipEventCreate*/hipIpcOpenEventHandle.
//
// Returns hipErrorInvalidHandle for a null handle and hipErrorContextIsDestroyed when the handle
// is not (or no longer) registered in hip::eventSet. On success the event is detached from the
// stream it was captured on, if any, and deleted.
hipError_t hipEventDestroy(hipEvent_t event) {
  HIP_INIT_API(hipEventDestroy, event);
  if (event == nullptr) {
    HIP_RETURN(hipErrorInvalidHandle);
  }

  amd::ScopedLock lock(hip::eventSetLock);
  if (hip::eventSet.erase(event) == 0) {
    // Use HIP_RETURN (not a bare return) so this exit is reported like all the others.
    HIP_RETURN(hipErrorContextIsDestroyed);
  }

  hip::Event* e = reinterpret_cast<hip::Event*>(event);
  if (e->GetCaptureStream() != nullptr) {
    reinterpret_cast<hip::Stream*>(e->GetCaptureStream())->EraseCaptureEvent(event);
  }
  delete e;
  HIP_RETURN(hipSuccess);
}
// Public API: stores in `*ms` the time elapsed between `start` and `stop`.
// Rejects a null output pointer (hipErrorInvalidValue), null handles and events that belong to
// different devices (hipErrorInvalidHandle).
hipError_t hipEventElapsedTime(float* ms, hipEvent_t start, hipEvent_t stop) {
  HIP_INIT_API(hipEventElapsedTime, ms, start, stop);

  if (ms == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  const bool missingHandle = (start == nullptr) || (stop == nullptr);
  if (missingHandle) {
    HIP_RETURN(hipErrorInvalidHandle);
  }

  auto* startEvent = reinterpret_cast<hip::Event*>(start);
  auto* stopEvent = reinterpret_cast<hip::Event*>(stop);
  if (startEvent->deviceId() != stopEvent->deviceId()) {
    HIP_RETURN(hipErrorInvalidHandle);
  }

  HIP_RETURN(startEvent->elapsedTime(*stopEvent, *ms), "Elapsed Time = ", *ms);
}
// Shared implementation of hipEventRecord and hipEventRecord_spt.
// Fails with hipErrorInvalidHandle for a null event or when the event's device differs from the
// device of `stream`; otherwise records the event on the stream.
hipError_t hipEventRecord_common(hipEvent_t event, hipStream_t stream) {
  STREAM_CAPTURE(hipEventRecord, stream, event);

  if (event == nullptr) {
    return hipErrorInvalidHandle;
  }

  auto* evt = reinterpret_cast<hip::Event*>(event);
  hip::Stream* targetStream = hip::getStream(stream);
  const bool sameDevice =
      (g_devices[evt->deviceId()]->devices()[0] == &targetStream->device());
  if (!sameDevice) {
    return hipErrorInvalidHandle;
  }

  return evt->addMarker(stream, nullptr, true);
}
// Public API: records `event` on `stream`.
// Thin wrapper: API-entry bookkeeping plus hipEventRecord_common(), which does the validation
// and the actual recording.
hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) {
HIP_INIT_API(hipEventRecord, event, stream);
HIP_RETURN(hipEventRecord_common(event, stream));
}
// Per-thread-default-stream variant of hipEventRecord.
// Note that it is traced under the hipEventRecord API id (see HIP_INIT_API below).
// NOTE(review): PER_THREAD_DEFAULT_STREAM presumably rewrites a null `stream` to the calling
// thread's default stream before recording - confirm against the macro definition.
hipError_t hipEventRecord_spt(hipEvent_t event, hipStream_t stream) {
HIP_INIT_API(hipEventRecord, event, stream);
PER_THREAD_DEFAULT_STREAM(stream);
HIP_RETURN(hipEventRecord_common(event, stream));
}
// Public API: blocks until `event` has completed.
// Returns hipErrorInvalidHandle for a null handle, otherwise the result of Event::synchronize().
hipError_t hipEventSynchronize(hipEvent_t event) {
  HIP_INIT_API(hipEventSynchronize, event);

  if (event == nullptr) {
    HIP_RETURN(hipErrorInvalidHandle);
  }

  auto* evt = reinterpret_cast<hip::Event*>(event);
  HIP_RETURN(evt->synchronize());
}
// Internal query helper: hipErrorInvalidHandle for a null handle, otherwise whatever
// Event::query() reports for the event behind the handle.
hipError_t ihipEventQuery(hipEvent_t event) {
  if (event != nullptr) {
    return reinterpret_cast<hip::Event*>(event)->query();
  }
  return hipErrorInvalidHandle;
}
// Public API: non-blocking completion query for `event`.
// Thin wrapper: API-entry bookkeeping plus ihipEventQuery(), which validates the handle.
hipError_t hipEventQuery(hipEvent_t event) {
HIP_INIT_API(hipEventQuery, event);
HIP_RETURN(ihipEventQuery(event));
}
+253
查看文件
@@ -0,0 +1,253 @@
/* Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef HIP_EVENT_H
#define HIP_EVENT_H
#include "hip_internal.hpp"
#include "thread/monitor.hpp"
// Internal structure for stream callback handler
// Internal structure for stream callback handler.
// Abstract base: derived classes store the user function and invoke it from callback().
class StreamCallback {
 protected:
  void* userData_;  // Opaque pointer forwarded to the user function

 public:
  StreamCallback(void* userData) : userData_(userData) {}

  // Objects of derived types are handled through StreamCallback pointers, so the destructor
  // must be virtual for a delete through the base pointer to be well defined.
  virtual ~StreamCallback() = default;

  // Invokes the user-provided function.
  virtual void CL_CALLBACK callback() = 0;
};
class StreamAddCallback : public StreamCallback {
hipStreamCallback_t callBack_;
hipStream_t stream_;
public:
StreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData)
: StreamCallback(userData) {
stream_ = stream;
callBack_ = callback;
}
void CL_CALLBACK callback() {
hipError_t status = hipSuccess;
callBack_(stream_, status, userData_);
}
};
// Callback holder for host functions with the hipHostFn_t signature (user data only).
class LaunchHostFuncCallback : public StreamCallback {
  hipHostFn_t callBack_;  // User function to invoke

 public:
  LaunchHostFuncCallback(hipHostFn_t callback, void* userData)
      : StreamCallback(userData), callBack_(callback) {}

  // Calls the user function with the stored user data.
  void CL_CALLBACK callback() {
    callBack_(userData_);
  }
};
// Completion callback registered on commands through amd::Command::setCallback(); the
// StreamCallback object to run is passed as `user_data`. Defined outside this header.
void CL_CALLBACK ihipStreamCallback(cl_event event, cl_int command_exec_status, void* user_data);
namespace hip {
// Number of signal slots kept per IPC event; record operations use them as a ring
// (slot = index % IPC_SIGNALS_PER_EVENT).
#define IPC_SIGNALS_PER_EVENT 32
// Control block of an interprocess event. It lives in a shared-memory mapping that every
// process opening the event maps, so the member layout must stay identical across processes.
typedef struct ihipIpcEventShmem_s {
// Number of IPCEvent objects attached to the mapping (1 on creation, +1 per OpenHandle,
// decremented when an IPCEvent is destroyed).
std::atomic<int> owners;
// Device id of the event that created/recorded the IPC event.
std::atomic<int> owners_device_id;
// Process id stored by GetHandle(); OpenHandle() refuses to open from this same process.
std::atomic<int> owners_process_id;
// Index of the most recently published record; -1 until the first record.
std::atomic<int> read_index;
// Index handed to the next record operation; starts at 0.
std::atomic<int> write_index;
// Per-slot state: set to 1 when a record is queued, written back to 0 by the device once the
// record has completed.
uint32_t signal[IPC_SIGNALS_PER_EVENT];
} ihipIpcEventShmem_t;
// Marker command used to record hip events. Compared with a plain amd::Marker it always has
// profiling enabled, and carries the requested event scope.
class EventMarker : public amd::Marker {
public:
// stream       - queue the marker is created for
// disableFlush - forwarded unchanged to amd::Marker
// markerTs     - stored in profilingInfo_.marker_ts_
// scope        - passed to setEventScope(); defaults to amd::Device::kCacheStateInvalid
EventMarker(amd::HostQueue& stream, bool disableFlush, bool markerTs = false,
int32_t scope = amd::Device::kCacheStateInvalid)
: amd::Marker(stream, disableFlush) {
// The profiling fields are configured first and clear() is called afterwards; keep this order.
profilingInfo_.enabled_ = true;
profilingInfo_.callback_ = nullptr;
profilingInfo_.marker_ts_ = markerTs;
profilingInfo_.clear();
setEventScope(scope);
}
};
// Reason an event's readiness is being checked. Query selects the forced-wait HW check in
// Event::CheckHwEvent(); StreamWait is what Event::streamWait() passes to ready().
// NOTE(review): ElapsedTime is presumably used by the elapsed-time path - confirm at the caller.
enum eventType { Query, StreamWait, ElapsedTime };
class Event {
/// event recorded on stream where capture is active
bool onCapture_;
/// capture stream where event is recorded
hipStream_t captureStream_ = nullptr;
/// Previous captured nodes before event record
std::vector<hipGraphNode_t> nodesPrevToRecorded_;
protected:
bool CheckHwEvent(eventType type) {
bool ready;
if (type == Query) {
ready = g_devices[deviceId()]->devices()[0]->IsHwEventReadyForcedWait(*event_);
} else {
ready = g_devices[deviceId()]->devices()[0]->IsHwEventReady(*event_);
}
return ready;
}
public:
Event(unsigned int flags) : flags(flags), lock_("hipEvent_t", true),
event_(nullptr), unrecorded_(false), stream_(nullptr) {
// No need to init event_ here as addMarker does that
onCapture_ = false;
device_id_ = hip::getCurrentDevice()->deviceId(); // Created in current device ctx
}
virtual ~Event() {
if (event_ != nullptr) {
event_->release();
}
}
unsigned int flags;
virtual hipError_t query();
virtual hipError_t synchronize();
hipError_t elapsedTime(Event& eStop, float& ms);
virtual hipError_t streamWaitCommand(amd::Command*& command, hip::Stream* stream);
virtual hipError_t enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command);
virtual hipError_t streamWait(hipStream_t stream, uint flags);
virtual hipError_t recordCommand(amd::Command*& command, amd::HostQueue* stream,
uint32_t flags = 0);
virtual hipError_t enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record);
hipError_t addMarker(hipStream_t stream, amd::Command* command, bool record);
void BindCommand(amd::Command& command, bool record) {
amd::ScopedLock lock(lock_);
if (event_ != nullptr) {
event_->release();
}
event_ = &command.event();
unrecorded_ = !record;
command.retain();
}
bool isUnRecorded() const { return unrecorded_; }
amd::Monitor& lock() { return lock_; }
const int deviceId() const { return device_id_; }
void setDeviceId(int id) { device_id_ = id; }
amd::Event* event() { return event_; }
/// End capture on this event
void EndCapture() {
onCapture_ = false;
captureStream_ = nullptr;
}
/// Start capture when waited on this event
void StartCapture(hipStream_t stream) {
onCapture_ = true;
captureStream_ = stream;
}
/// Get capture status of the graph
bool GetCaptureStatus() const { return onCapture_; }
/// Get capture stream where event is recorded
hipStream_t GetCaptureStream() const { return captureStream_; }
/// Set capture stream where event is recorded
void SetCaptureStream(hipStream_t stream) { captureStream_ = stream; }
/// Returns previous captured nodes before event record
std::vector<hipGraphNode_t> GetNodesPrevToRecorded() const { return nodesPrevToRecorded_; }
/// Set last captured graph node before event record
void SetNodesPrevToRecorded(std::vector<hipGraphNode_t>& graphNode) {
nodesPrevToRecorded_ = graphNode;
}
virtual hipError_t GetHandle(ihipIpcEventHandle_t* handle) {
return hipErrorInvalidConfiguration;
}
virtual hipError_t OpenHandle(ihipIpcEventHandle_t* handle) {
return hipErrorInvalidConfiguration;
}
virtual bool awaitEventCompletion();
virtual bool ready(eventType type);
virtual int64_t time(bool getStartTs) const;
protected:
amd::Monitor lock_;
hip::Stream* stream_;
amd::Event* event_;
int device_id_;
//! Flag to indicate hipEventRecord has not been called. This is needed for
//! hip*ModuleLaunchKernel API which takes start and stop events so no
//! hipEventRecord is called. Cleanup needed once those APIs are deprecated.
bool unrecorded_;
};
class EventDD : public Event {
public:
EventDD(unsigned int flags) : Event(flags) {}
virtual ~EventDD() {}
virtual bool awaitEventCompletion();
virtual bool ready(eventType type);
virtual int64_t time(bool getStartTs) const;
};
class IPCEvent : public Event {
// IPC Events
struct ihipIpcEvent_t {
std::string ipc_name_;
int ipc_fd_;
ihipIpcEventShmem_t* ipc_shmem_;
ihipIpcEvent_t() : ipc_name_("dummy"), ipc_fd_(0), ipc_shmem_(nullptr) {}
void setipcname(const char* name) { ipc_name_ = std::string(name); }
};
ihipIpcEvent_t ipc_evt_;
public:
~IPCEvent() {
if (ipc_evt_.ipc_shmem_) {
int owners = --ipc_evt_.ipc_shmem_->owners;
// Make sure event is synchronized
hipError_t status = synchronize();
status = ihipHostUnregister(&ipc_evt_.ipc_shmem_->signal);
if (!amd::Os::MemoryUnmapFile(ipc_evt_.ipc_shmem_, sizeof(hip::ihipIpcEventShmem_t))) {
// print hipErrorInvalidHandle;
}
}
}
IPCEvent() : Event(hipEventInterprocess) {}
bool createIpcEventShmemIfNeeded();
hipError_t GetHandle(ihipIpcEventHandle_t* handle);
hipError_t OpenHandle(ihipIpcEventHandle_t* handle);
hipError_t synchronize();
hipError_t query();
hipError_t streamWaitCommand(amd::Command*& command, hip::Stream* stream);
hipError_t enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command);
hipError_t streamWait(hipStream_t stream, uint flags);
hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue, uint32_t flags = 0);
hipError_t enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record);
};
}; // namespace hip
// Payload handed to the stream-wait callback of an IPC event. It is aggregate-initialized
// (read index first, control block second), so the member order must not change.
struct CallbackData {
// Value of read_index sampled when the wait was enqueued
int previous_read_index;
// Shared control block of the IPC event being waited on
hip::ihipIpcEventShmem_t* shmem;
};
#endif // HIP_EVENT_H
+250
查看文件
@@ -0,0 +1,250 @@
/* Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include <hip/hip_runtime.h>
#include "hip_event.hpp"
#if !defined(_MSC_VER)
#include <unistd.h>
#else
#include <io.h>
#endif
// ================================================================================================
hipError_t ihipEventCreateWithFlags(hipEvent_t* event, unsigned flags);
namespace hip {
// Lazily creates, maps and initializes the shared-memory control block of this IPC event and
// registers its signal array with the runtime.
//
// Returns true when the control block is available (already existing or newly created) and
// false when a unique name could not be generated, the mapping failed or the host registration
// failed. On failure ipc_shmem_ is left null so a later call starts from scratch.
bool IPCEvent::createIpcEventShmemIfNeeded() {
  if (ipc_evt_.ipc_shmem_) {
    // ipc_shmem_ already created, no need to create it again
    return true;
  }

  char name_template[] = "/tmp/eventXXXXXX";
#if !defined(_MSC_VER)
  // mkstemp() is used only to obtain a unique suffix; the descriptor itself is never used, so
  // close it right away instead of carrying it across the failure paths below.
  // NOTE(review): the temporary file created under /tmp is not removed here.
  int temp_fd = mkstemp(name_template);
  if (temp_fd == -1) {
    return false;
  }
  close(temp_fd);
#else
  if (_mktemp_s(name_template, sizeof(name_template)) != 0) {
    return false;
  }
#endif

  // Turn "/tmp/eventXXXXXX" into "/hip_eventXXXXXX".
  ipc_evt_.ipc_name_ = name_template;
  ipc_evt_.ipc_name_.replace(0, 5, "/hip_");
  if (!amd::Os::MemoryMapFileTruncated(
          ipc_evt_.ipc_name_.c_str(),
          const_cast<const void**>(reinterpret_cast<void**>(&(ipc_evt_.ipc_shmem_))),
          sizeof(hip::ihipIpcEventShmem_t))) {
    return false;
  }

  ipc_evt_.ipc_shmem_->owners = 1;
  ipc_evt_.ipc_shmem_->read_index = -1;
  ipc_evt_.ipc_shmem_->write_index = 0;
  for (uint32_t sig_idx = 0; sig_idx < IPC_SIGNALS_PER_EVENT; ++sig_idx) {
    ipc_evt_.ipc_shmem_->signal[sig_idx] = 0;
  }

  // device sets 0 to this ptr when the ipc event is completed
  hipError_t status = ihipHostRegister(&ipc_evt_.ipc_shmem_->signal,
                                       sizeof(uint32_t) * IPC_SIGNALS_PER_EVENT, 0);
  if (status != hipSuccess) {
    // Undo the mapping: keeping ipc_shmem_ set would make the next call report success and
    // the destructor unregister memory that was never registered.
    (void)amd::Os::MemoryUnmapFile(ipc_evt_.ipc_shmem_, sizeof(hip::ihipIpcEventShmem_t));
    ipc_evt_.ipc_shmem_ = nullptr;
    return false;
  }
  return true;
}
// Non-blocking completion check of the most recently published record.
// Returns hipErrorNotReady while the signal slot of that record is still non-zero and
// hipSuccess otherwise, including when no control block exists or nothing was recorded yet.
hipError_t IPCEvent::query() {
  if (ipc_evt_.ipc_shmem_) {
    int prev_read_idx = ipc_evt_.ipc_shmem_->read_index;
    // read_index is -1 until the first record is published. Without this guard the slot index
    // below would be -1, i.e. a read before the start of signal[]. synchronize() applies the
    // same check.
    if (prev_read_idx >= 0) {
      int offset = (prev_read_idx % IPC_SIGNALS_PER_EVENT);
      if (ipc_evt_.ipc_shmem_->read_index < prev_read_idx + IPC_SIGNALS_PER_EVENT &&
          ipc_evt_.ipc_shmem_->signal[offset] != 0) {
        return hipErrorNotReady;
      }
    }
  }
  return hipSuccess;
}
// Blocks (sleep-polling) until the most recently published record has completed.
// Returns immediately when no control block exists or nothing was recorded yet.
hipError_t IPCEvent::synchronize() {
  if (!ipc_evt_.ipc_shmem_) {
    return hipSuccess;
  }

  const int publishedIdx = ipc_evt_.ipc_shmem_->read_index;
  if (publishedIdx < 0) {
    return hipSuccess;
  }

  const int slot = publishedIdx % IPC_SIGNALS_PER_EVENT;
  for (;;) {
    const bool slotStillOurs =
        (ipc_evt_.ipc_shmem_->read_index < publishedIdx + IPC_SIGNALS_PER_EVENT);
    if (!slotStillOurs || (ipc_evt_.ipc_shmem_->signal[slot] == 0)) {
      break;
    }
    amd::Os::sleep(1);
  }
  return hipSuccess;
}
// Creates (but does not enqueue) a plain marker on `stream`; the actual waiting is done by the
// callback attached in enqueueStreamWaitCommand(). The marker is returned through `command`.
hipError_t IPCEvent::streamWaitCommand(amd::Command*& command, hip::Stream* stream) {
  command = new amd::Marker(*stream, false);
  return (command != NULL) ? hipSuccess : hipErrorOutOfMemory;
}
// Attaches the wait callback (WaitThenDecrementSignal) to `command`, enqueues the command and
// waits for it to complete. The reference held on `command` is released on every path.
//
// Returns hipErrorInvalidHandle when the callback cannot be attached, hipSuccess otherwise.
// The shared control block must exist; streamWait() only gets here after query() reported a
// pending record.
hipError_t IPCEvent::enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command) {
  CallbackData* t = new CallbackData{ipc_evt_.ipc_shmem_->read_index, ipc_evt_.ipc_shmem_};
  StreamCallback* cbo = new StreamAddCallback(
      stream, reinterpret_cast<hipStreamCallback_t>(WaitThenDecrementSignal), t);
  if (!command->setCallback(CL_COMPLETE, ihipStreamCallback, cbo)) {
    // The callback will never run, so nobody else can free these two objects.
    delete static_cast<StreamAddCallback*>(cbo);
    delete t;
    command->release();
    return hipErrorInvalidHandle;
  }
  command->enqueue();
  // Wait first and drop our reference afterwards: the command must not be touched once the
  // reference that keeps it alive for this function has been given up.
  command->awaitCompletion();
  command->release();
  return hipSuccess;
}
// Makes `stream` wait for the most recent record of this IPC event.
// Nothing is enqueued when query() already reports completion. `flags` is not used.
hipError_t IPCEvent::streamWait(hipStream_t stream, uint flags) {
  hip::Stream* hip_stream = hip::getStream(stream);
  amd::ScopedLock lock(lock_);

  if (query() == hipSuccess) {
    return hipSuccess;
  }

  amd::Command* waitCmd;
  const hipError_t created = streamWaitCommand(waitCmd, hip_stream);
  if (created != hipSuccess) {
    return created;
  }
  return enqueueStreamWaitCommand(stream, waitCmd);
}
// Builds the command used to record this IPC event on `stream`.
// While the event is flagged as unrecorded a plain marker is created; otherwise the generic
// Event implementation is used (called without `flags`, i.e. with its default).
hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* stream, uint32_t flags) {
  if (!isUnRecorded()) {
    return Event::recordCommand(command, stream);
  }
  command = new amd::Marker(*stream, kMarkerDisableFlush);
  return hipSuccess;
}
// Enqueues the record command of this IPC event.
//
// When the event is not flagged as unrecorded this defers to Event::enqueueRecordCommand().
// Otherwise it reserves the next signal slot in the shared control block, enqueues `command`,
// queues a stream write that resets the slot to 0 once the record completes, and finally
// publishes the new index through read_index.
//
// Returns hipErrorInvalidValue when the shared control block cannot be created, the error of
// the stream write when that fails, hipSuccess otherwise.
hipError_t IPCEvent::enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record) {
  if (!isUnRecorded()) {
    return Event::enqueueRecordCommand(stream, command, record);
  }

  // Everything below dereferences ipc_shmem_, so a failed creation must stop here.
  if (!createIpcEventShmemIfNeeded()) {
    return hipErrorInvalidValue;
  }

  int write_index = ipc_evt_.ipc_shmem_->write_index++;
  int offset = write_index % IPC_SIGNALS_PER_EVENT;
  // Wait until the slot's previous user has been completed by the device.
  while (ipc_evt_.ipc_shmem_->signal[offset] != 0) {
    amd::Os::sleep(1);
  }
  // Lock signal.
  ipc_evt_.ipc_shmem_->signal[offset] = 1;
  ipc_evt_.ipc_shmem_->owners_device_id = deviceId();

  command->enqueue();

  // device writes 0 to signal after the hipEventRecord command is completed
  // the signal value is checked by WaitThenDecrementSignal cb
  hipError_t status = ihipStreamOperation(stream, ROCCLR_COMMAND_STREAM_WRITE_VALUE,
                                          &(ipc_evt_.ipc_shmem_->signal[offset]), 0, 0, 0,
                                          sizeof(uint32_t));
  if (status != hipSuccess) {
    return status;
  }

  // Update read index to indicate new signal.
  // NOTE(review): compare_exchange_weak stores the observed value into `expected` on failure,
  // so after the first failed attempt the exchange no longer insists on write_index - 1.
  int expected = write_index - 1;
  while (!ipc_evt_.ipc_shmem_->read_index.compare_exchange_weak(expected, write_index)) {
    amd::Os::sleep(1);
  }
  return hipSuccess;
}
// Exports this IPC event: makes sure the shared control block exists, stamps it with the
// owning device and process ids and writes the shared-memory name into `handle`.
// Returns hipErrorInvalidValue when the control block cannot be created.
hipError_t IPCEvent::GetHandle(ihipIpcEventHandle_t* handle) {
  if (!createIpcEventShmemIfNeeded()) {
    return hipErrorInvalidValue;
  }
  ipc_evt_.ipc_shmem_->owners_device_id = deviceId();
  ipc_evt_.ipc_shmem_->owners_process_id = amd::Os::getProcessId();

  memset(handle->shmem_name, 0, HIP_IPC_HANDLE_SIZE);
  // Copy at most HIP_IPC_HANDLE_SIZE - 1 characters so the name can never run past the zeroed
  // buffer and always stays NUL terminated.
  ipc_evt_.ipc_name_.copy(handle->shmem_name, HIP_IPC_HANDLE_SIZE - 1);
  return hipSuccess;
}
// Attaches this IPC event to the shared control block named in `handle`.
//
// Returns hipErrorInvalidValue when the shared memory cannot be mapped,
// hipErrorInvalidContext when the handle was exported by this very process, and otherwise the
// result of registering the signal array with the runtime.
hipError_t IPCEvent::OpenHandle(ihipIpcEventHandle_t* handle) {
  ipc_evt_.ipc_name_ = handle->shmem_name;
  if (!amd::Os::MemoryMapFileTruncated(ipc_evt_.ipc_name_.c_str(),
                                       (const void**)&(ipc_evt_.ipc_shmem_),
                                       sizeof(ihipIpcEventShmem_t))) {
    return hipErrorInvalidValue;
  }

  if (amd::Os::getProcessId() == ipc_evt_.ipc_shmem_->owners_process_id.load()) {
    // If this is in the same process, return error.
    // Drop the mapping first: this object never became an owner, so leaving ipc_shmem_ set
    // would make the destructor decrement `owners` and unregister memory it never registered.
    (void)amd::Os::MemoryUnmapFile(ipc_evt_.ipc_shmem_, sizeof(ihipIpcEventShmem_t));
    ipc_evt_.ipc_shmem_ = nullptr;
    return hipErrorInvalidContext;
  }

  ipc_evt_.ipc_shmem_->owners += 1;
  // device sets 0 to this ptr when the ipc event is completed
  hipError_t status = hipSuccess;
  status = ihipHostRegister(&ipc_evt_.ipc_shmem_->signal,
                            sizeof(uint32_t) * IPC_SIGNALS_PER_EVENT, 0);
  return status;
}
} // namespace hip
// ================================================================================================
// Public API: exports `event` as an interprocess handle written to `*handle`.
// Returns hipErrorInvalidValue when either argument is null, otherwise the result of the
// event's GetHandle().
hipError_t hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event) {
  HIP_INIT_API(hipIpcGetEventHandle, handle, event);

  const bool missingArgument = (handle == nullptr) || (event == nullptr);
  if (missingArgument) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  auto* evt = reinterpret_cast<hip::Event*>(event);
  auto* internalHandle = reinterpret_cast<ihipIpcEventHandle_t*>(handle);
  HIP_RETURN(evt->GetHandle(internalHandle));
}
// Public API: creates a local event attached to the interprocess event described by `handle`.
// The event is created with hipEventDisableTiming | hipEventInterprocess and then opened.
// Returns hipErrorInvalidValue for a null `event`, or the first error from creation/opening.
hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle) {
  HIP_INIT_API(hipIpcOpenEventHandle, event, handle);

  if (event == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }

  const hipError_t createStatus =
      ihipEventCreateWithFlags(event, hipEventDisableTiming | hipEventInterprocess);
  if (createStatus != hipSuccess) {
    HIP_RETURN(createStatus);
  }

  auto* evt = reinterpret_cast<hip::Event*>(*event);
  auto* internalHandle = reinterpret_cast<ihipIpcEventHandle_t*>(&handle);
  HIP_RETURN(evt->OpenHandle(internalHandle));
}
+345
查看文件
@@ -0,0 +1,345 @@
#include "hip_fatbin.hpp"
#include <unordered_map>
#include "hip_code_object.hpp"
namespace hip {
// Unloads and releases the device program, if one was created.
FatBinaryDeviceInfo::~FatBinaryDeviceInfo() {
  if (program_ == nullptr) {
    return;
  }
  program_->unload();
  program_->release();
  program_ = nullptr;
}
// Describes a fat binary given by file name and/or in-memory image.
// Nothing is opened or mapped here; one (initially null) per-device slot is reserved for every
// entry of g_devices.
FatBinaryInfo::FatBinaryInfo(const char* fname, const void* image) : fdesc_(amd::Os::FDescInit()),
                             fsize_(0), foffset_(0), image_(image), image_mapped_(false),
                             uri_(std::string()) {
  fname_ = (fname != nullptr) ? std::string(fname) : std::string();
  fatbin_dev_info_.resize(g_devices.size(), nullptr);
}
// Destroys the per-device entries, then unmaps the image (when this object mapped it) and
// closes the file handle (when one is open), and finally resets the bookkeeping members.
FatBinaryInfo::~FatBinaryInfo() {
  // Deleting a null entry is a no-op, so no explicit check is needed.
  for (auto* dev_info : fatbin_dev_info_) {
    delete dev_info;
  }

  if (fdesc_ > 0) {
    const bool mustUnmap = fsize_ && image_mapped_;
    if (mustUnmap && !amd::Os::MemoryUnmapFile(image_, fsize_)) {
      guarantee(false, "Cannot unmap file");
    }
    if (!amd::Os::CloseFileHandle(fdesc_)) {
      guarantee(false, "Cannot close file");
    }
  }

  fname_ = std::string();
  fdesc_ = amd::Os::FDescInit();
  fsize_ = 0;
  image_ = nullptr;
  uri_ = std::string();
}
// Locates, for every device in `devices`, the code object to use from this fat binary and
// creates the matching FatBinaryDeviceInfo/amd::Program entries in fatbin_dev_info_.
//
// The image is taken from image_ when set, otherwise the file fname_ is opened and mapped.
// Images that are not clang offload bundles are used as they are for every device; bundles are
// queried through COMGR, once per unique ISA name.
//
// Returns hipSuccess, hipErrorFileNotFound (file cannot be opened), hipErrorOutOfMemory or
// hipErrorInvalidValue (missing input, empty file, mapping failure, COMGR failure).
hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vector<hip::Device*>& devices) {
  amd_comgr_data_t data_object;
  // Tracks whether data_object holds a handle that may be released.
  bool data_object_created = false;
  amd_comgr_status_t comgr_status = AMD_COMGR_STATUS_SUCCESS;
  hipError_t hip_status = hipSuccess;
  amd_comgr_code_object_info_t* query_list_array = nullptr;

  // If image was passed as a pointer to our hipMod* api, we can try to extract the file name
  // if it was mapped by the app. Otherwise use the COMGR data API.
  if (fname_.size() == 0) {
    if (image_ == nullptr) {
      LogError("Both Filename and image cannot be null");
      return hipErrorInvalidValue;
    }

    if (!amd::Os::FindFileNameFromAddress(image_, &fname_, &foffset_)) {
      fname_ = std::string("");
      foffset_ = 0;
    }
  }

  // If file name & path are available (or it is passed to you), then get the file desc to use
  // COMGR file slice APIs.
  if (fname_.size() > 0) {
    // Get File Handle & size of the file.
    if (!amd::Os::GetFileHandle(fname_.c_str(), &fdesc_, &fsize_)) {
      return hipErrorFileNotFound;
    }

    // If the file name exists but the file size is 0, the something wrong with the file or its path
    if (fsize_ == 0) {
      return hipErrorInvalidValue;
    }

    // If image_ is nullptr, then file path is passed via hipMod* APIs, so map the file.
    if (image_ == nullptr) {
      if (!amd::Os::MemoryMapFileDesc(fdesc_, fsize_, foffset_, &image_)) {
        LogError("Cannot map the file descriptor");
        amd::Os::CloseFileHandle(fdesc_);
        // The handle is closed; forget it so the destructor does not close it a second time.
        fdesc_ = amd::Os::FDescInit();
        fsize_ = 0;
        return hipErrorInvalidValue;
      }
      // Remember that this object owns the mapping, but only once the mapping succeeded, so
      // the cleanup code unmaps exactly what was mapped here.
      image_mapped_ = true;
    }
  }

  // At this line, image should be a valid ptr.
  guarantee(image_ != nullptr, "Image cannot be nullptr, file did not map for some reason");

  do {
    // If the image ptr is not clang offload bundle then just directly point the image.
    if (!CodeObject::IsClangOffloadMagicBundle(image_)) {
      for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
        fatbin_dev_info_[devices[dev_idx]->deviceId()]
          = new FatBinaryDeviceInfo(image_, CodeObject::ElfSize(image_), 0);
        fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_
          = new amd::Program(*devices[dev_idx]->asContext());
        if (fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ == nullptr) {
          hip_status = hipErrorOutOfMemory;
          break;
        }
      }
      break;
    }

    // Create a data object, if it fails return error
    if ((comgr_status = amd_comgr_create_data(AMD_COMGR_DATA_KIND_FATBIN, &data_object))
                        != AMD_COMGR_STATUS_SUCCESS) {
      LogPrintfError("Creating data object failed with status %d ", comgr_status);
      hip_status = hipErrorInvalidValue;
      break;
    }
    data_object_created = true;

#if !defined(_WIN32)
    // Using the file descriptor and file size, map the data object.
    if (fdesc_ > 0) {
      guarantee(fsize_ > 0, "Cannot have a file size of 0");
      if ((comgr_status = amd_comgr_set_data_from_file_slice(data_object, fdesc_, foffset_,
                          fsize_)) != AMD_COMGR_STATUS_SUCCESS) {
        LogPrintfError("Setting data from file slice failed with status %d ", comgr_status);
        hip_status = hipErrorInvalidValue;
        break;
      }
    } else
#endif
    if (image_ != nullptr) {
      // Using the image ptr, map the data object.
      if ((comgr_status = amd_comgr_set_data(data_object, 4096,
                          reinterpret_cast<const char*>(image_))) != AMD_COMGR_STATUS_SUCCESS) {
        LogPrintfError("Setting data from file slice failed with status %d ", comgr_status);
        hip_status = hipErrorInvalidValue;
        break;
      }
    } else {
      guarantee(false, "Cannot have both fname_ and image_ as nullptr");
    }

    // Find the unique number of ISAs needed for this COMGR query.
    // Maps ISA name -> (code object size, code object offset).
    std::unordered_map<std::string, std::pair<size_t, size_t>> unique_isa_names;
    for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
      std::string device_name = devices[dev_idx]->devices()[0]->isa().isaName();
      if (unique_isa_names.cend() == unique_isa_names.find(device_name)) {
        unique_isa_names.insert({device_name, std::make_pair<size_t, size_t>(0,0)});
      }
    }

    // Create a query list using COMGR info for unique ISAs.
    // Walk the map exactly one element per query entry; the key strings are owned by the map,
    // which outlives the lookup below.
    query_list_array = new amd_comgr_code_object_info_t[unique_isa_names.size()];
    size_t query_idx = 0;
    for (const auto& isa_entry : unique_isa_names) {
      query_list_array[query_idx].isa = isa_entry.first.c_str();
      query_list_array[query_idx].size = 0;
      query_list_array[query_idx].offset = 0;
      ++query_idx;
    }

    // Look up the code object info passing the query list.
    if ((comgr_status = amd_comgr_lookup_code_object(data_object, query_list_array,
                        unique_isa_names.size())) != AMD_COMGR_STATUS_SUCCESS) {
      LogPrintfError("Setting data from file slice failed with status %d ", comgr_status);
      hip_status = hipErrorInvalidValue;
      break;
    }

    // Store the (size, offset) answers back into the map.
    for (size_t isa_idx = 0; isa_idx < unique_isa_names.size(); ++isa_idx) {
      auto unique_it = unique_isa_names.find(query_list_array[isa_idx].isa);
      guarantee(unique_isa_names.cend() != unique_it, "Cannot find unique isa");
      unique_it->second = std::pair<size_t, size_t>
                            (static_cast<size_t>(query_list_array[isa_idx].size),
                             static_cast<size_t>(query_list_array[isa_idx].offset));
    }

    // Create the per-device entries pointing into the bundle.
    for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
      std::string device_name = devices[dev_idx]->devices()[0]->isa().isaName();
      auto dev_it = unique_isa_names.find(device_name);
      guarantee(unique_isa_names.cend() != dev_it,
                "Cannot find the device name in the unique device name");
      fatbin_dev_info_[devices[dev_idx]->deviceId()]
        = new FatBinaryDeviceInfo(reinterpret_cast<address>(const_cast<void*>(image_))
                                  + dev_it->second.second, dev_it->second.first,
                                  dev_it->second.second);
      fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_
        = new amd::Program(*devices[dev_idx]->asContext());
    }
  } while(0);

  delete[] query_list_array;

  // Clean up file and memory resouces if hip_status failed for some reason.
  if (hip_status != hipSuccess && hip_status != hipErrorInvalidKernelFile) {
    if (image_mapped_) {
      if (!amd::Os::MemoryUnmapFile(image_, fsize_)) {
        guarantee(false, "Cannot unmap the file");
      }
      image_ = nullptr;
      image_mapped_ = false;
    }

    if (fdesc_ > 0) {
      guarantee(fsize_ > 0, "Size has to greater than 0 too");
      if (!amd::Os::CloseFileHandle(fdesc_)) {
        guarantee(false, "Cannot close the file handle");
      }
      fdesc_ = 0;
      fsize_ = 0;
    }

    // Only release a data object that was actually created; the failure paths taken before
    // amd_comgr_create_data() leave data_object uninitialized.
    // NOTE(review): the data object is not released on the success path - confirm whether
    // that is intentional.
    if (data_object_created &&
        (comgr_status = amd_comgr_release_data(data_object)) != AMD_COMGR_STATUS_SUCCESS) {
      LogPrintfError("Releasing COMGR data failed with status %d ", comgr_status);
      return hipErrorInvalidValue;
    }
  }
  return hip_status;
}
// Extracts, for every device in `devices`, the code object to use from this fat binary and
// creates the matching FatBinaryDeviceInfo/amd::Program entries in fatbin_dev_info_.
//
// Delegates to ExtractFatBinaryUsingCOMGR() unless HIP_USE_RUNTIME_UNBUNDLER is set. With the
// runtime unbundler the code objects are taken from the file fname_ when a name is known,
// otherwise from the in-memory image_.
//
// Returns hipSuccess, hipErrorFileNotFound, hipErrorInvalidImage (empty file),
// hipErrorInvalidValue (neither file name nor image), hipErrorOutOfMemory, or the error
// reported by the code object extraction.
hipError_t FatBinaryInfo::ExtractFatBinary(const std::vector<hip::Device*>& devices) {
  if (!HIP_USE_RUNTIME_UNBUNDLER) {
    return ExtractFatBinaryUsingCOMGR(devices);
  }

  hipError_t hip_error = hipSuccess;
  std::vector<std::pair<const void*, size_t>> code_objs;

  // Copy device names for Extract Code object File
  std::vector<std::string> device_names;
  device_names.reserve(devices.size());
  for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
    device_names.push_back(devices[dev_idx]->devices()[0]->isa().isaName());
  }

  // We are given file name, get the file desc and file size
  if (fname_.size() > 0) {
    // Get File Handle & size of the file.
    if (!amd::Os::GetFileHandle(fname_.c_str(), &fdesc_, &fsize_)) {
      return hipErrorFileNotFound;
    }
    if (fsize_ == 0) {
      return hipErrorInvalidImage;
    }

    // Extract the code object from file
    hip_error = CodeObject::ExtractCodeObjectFromFile(fdesc_, fsize_, &image_,
                device_names, code_objs);
  } else if (image_ != nullptr) {
    // We are directly given image pointer directly, try to extract file desc & file Size
    hip_error = CodeObject::ExtractCodeObjectFromMemory(image_,
                device_names, code_objs, uri_);
  } else {
    return hipErrorInvalidValue;
  }

  if (hip_error == hipErrorNoBinaryForGpu) {
    LogPrintfError("hipErrorNoBinaryForGpu: Couldn't find binary for current devices! - %d",hip_error);
    return hip_error;
  }

  if (hip_error == hipErrorInvalidKernelFile) {
    for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
      // the image type is no CLANG_OFFLOAD_BUNDLER, image for current device directly passed
      fatbin_dev_info_[devices[dev_idx]->deviceId()]
        = new FatBinaryDeviceInfo(image_, CodeObject::ElfSize(image_), 0);
    }
  } else if (hip_error == hipSuccess) {
    for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
      // Calculate the offset wrt binary_image and the original image
      size_t offset_l
        = (reinterpret_cast<address>(const_cast<void*>(code_objs[dev_idx].first))
            - reinterpret_cast<address>(const_cast<void*>(image_)));

      fatbin_dev_info_[devices[dev_idx]->deviceId()]
        = new FatBinaryDeviceInfo(code_objs[dev_idx].first, code_objs[dev_idx].second, offset_l);
    }
  } else {
    // Any other extraction failure created no per-device entries; stop here instead of
    // dereferencing the null entries in the loop below.
    return hip_error;
  }

  for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
    fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_
       = new amd::Program(*devices[dev_idx]->asContext());
    if (fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ == NULL) {
      return hipErrorOutOfMemory;
    }
  }

  return hipSuccess;
}
// Adds the device program for `device_id` to that device's amd::Program, once.
// Returns hipErrorInvalidKernelFile when no binary was extracted for the device or the runtime
// rejects it; hipSuccess when the program was added now or on an earlier call.
hipError_t FatBinaryInfo::AddDevProgram(const int device_id) {
  // Device Id bounds Check
  DeviceIdCheck(device_id);

  FatBinaryDeviceInfo* dev_info = fatbin_dev_info_[device_id];
  if (dev_info == nullptr) {
    return hipErrorInvalidKernelFile;
  }

  // If fat binary was already added, skip this step and return success
  if (dev_info->add_dev_prog_) {
    return hipSuccess;
  }

  amd::Context* ctx = g_devices[device_id]->asContext();
  const auto add_result = dev_info->program_->addDeviceProgram(*ctx->devices()[0],
                                                               dev_info->binary_image_,
                                                               dev_info->binary_size_, false,
                                                               nullptr, nullptr, fdesc_,
                                                               dev_info->binary_offset_, uri_);
  if (CL_SUCCESS != add_result) {
    return hipErrorInvalidKernelFile;
  }
  dev_info->add_dev_prog_ = true;
  return hipSuccess;
}
// Makes the program for `device_id` ready to use: registers the device code
// object if that has not happened yet, builds the program once, and then
// loads it (load() is invoked on every call).
//
// Propagates any failure from AddDevProgram(); returns
// hipErrorSharedObjectInitFailed when the build or the load fails.
hipError_t FatBinaryInfo::BuildProgram(const int device_id) {
  DeviceIdCheck(device_id);
  IHIP_RETURN_ONFAIL(AddDevProgram(device_id));

  FatBinaryDeviceInfo* const dev_info = fatbin_dev_info_[device_id];

  // The build step is remembered per device so it only runs the first time.
  if (!dev_info->prog_built_) {
    const auto build_status =
        dev_info->program_->build(g_devices[device_id]->devices(), nullptr, nullptr, nullptr,
                                  kOptionChangeable, kNewDevProg);
    if (build_status != CL_SUCCESS) {
      return hipErrorSharedObjectInitFailed;
    }
    dev_info->prog_built_ = true;
  }

  const bool loaded = dev_info->program_->load();
  return loaded ? hipSuccess : hipErrorSharedObjectInitFailed;
}
} //namespace : hip
+90
查看文件
@@ -0,0 +1,90 @@
#ifndef HIP_FAT_BINARY_HPP
#define HIP_FAT_BINARY_HPP
#include "hip/hip_runtime.h"
#include "hip/hip_runtime_api.h"
#include "hip_internal.hpp"
#include "platform/program.hpp"
namespace hip {
// Per-device view of a fat binary: where the device's code object lives
// inside the original image, the amd::Program created for it, and two flags
// tracking how far the add/build sequence has progressed.
class FatBinaryDeviceInfo {
  // FatBinaryInfo reads and updates every field below directly.
  friend class FatBinaryInfo;

 public:
  FatBinaryDeviceInfo(const void* binary_image, size_t binary_size, size_t binary_offset)
      : binary_image_(binary_image), binary_size_(binary_size), binary_offset_(binary_offset) {}

  ~FatBinaryDeviceInfo();

 private:
  const void* binary_image_;         // start of this device's code object
  size_t binary_size_;               // size of that code object
  size_t binary_offset_;             // offset of the code object from the original image
  amd::Program* program_ = nullptr;  // exposed to callers reinterpreted as hipModule_t

  // Progress flags
  bool add_dev_prog_ = false;  // set once the code object was added to program_
  bool prog_built_ = false;    // set once program_ was built
};
// Describes one fat binary (given either as a file name or as an in-memory
// image) together with the per-device information extracted from it.
class FatBinaryInfo {
 public:
  FatBinaryInfo(const char* fname, const void* image);
  ~FatBinaryInfo();

  // Loads Fat binary from file or image, unbundles COs for devices.
  hipError_t ExtractFatBinaryUsingCOMGR(const std::vector<hip::Device*>& devices);
  hipError_t ExtractFatBinary(const std::vector<hip::Device*>& devices);

  hipError_t AddDevProgram(const int device_id);
  hipError_t BuildProgram(const int device_id);

  // Bounds check: device_id must index an existing per-device entry.
  inline void DeviceIdCheck(const int device_id) const {
    guarantee(device_id >= 0, "Invalid DeviceId less than 0");
    guarantee(static_cast<size_t>(device_id) < fatbin_dev_info_.size(), "Invalid DeviceId, greater than no of fatbin device info!");
  }

  // Accessors. Each one validates device_id before indexing.
  amd::Program* GetProgram(int device_id) {
    DeviceIdCheck(device_id);
    FatBinaryDeviceInfo* const entry = fatbin_dev_info_[device_id];
    return entry->program_;
  }

  hipModule_t Module(int device_id) const {
    DeviceIdCheck(device_id);
    FatBinaryDeviceInfo* const entry = fatbin_dev_info_[device_id];
    return reinterpret_cast<hipModule_t>(as_cl(entry->program_));
  }

  // Same value as Module(), delivered through an out-parameter.
  hipError_t GetModule(int device_id, hipModule_t* hmod) const {
    *hmod = Module(device_id);
    return hipSuccess;
  }

 private:
  std::string fname_;        // File name
  amd::Os::FileDesc fdesc_;  // File descriptor
  size_t fsize_;             // Total file size
  size_t foffset_;           // File Offset where the fat binary is present.

  // Even when file is passed image will be mmapped till ~destructor.
  const void* image_;  // Image
  bool image_mapped_;  // flag to detect if image is mapped

  // Only used for FBs where image is directly passed
  std::string uri_;  // Uniform resource indicator

  // Per Device Info, like corresponding binary ptr, size.
  std::vector<FatBinaryDeviceInfo*> fatbin_dev_info_;
};
}; /* namespace hip */
#endif /* HIP_FAT_BINARY_HPP */
+877
查看文件
@@ -0,0 +1,877 @@
/* Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include <hip/hip_runtime_api.h>
#include <hip/hiprtc.h>
// Streams the enumerator name of a hipTextureFilterMode.
// Values without a dedicated case print as "hipFilterModePoint".
inline std::ostream& operator<<(std::ostream& os, const hipTextureFilterMode& s) {
  switch (s) {
    case hipFilterModePoint:
      return os << "hipFilterModePoint";
    case hipFilterModeLinear:
      return os << "hipFilterModeLinear";
    default:
      return os << "hipFilterModePoint";
  }
}
// Streams the enumerator name of a hipTextureReadMode.
// Values without a dedicated case print as "hipReadModeElementType".
inline std::ostream& operator<<(std::ostream& os, const hipTextureReadMode& s) {
  switch (s) {
    case hipReadModeElementType:
      return os << "hipReadModeElementType";
    case hipReadModeNormalizedFloat:
      return os << "hipReadModeNormalizedFloat";
    default:
      return os << "hipReadModeElementType";
  }
}
// Streams the enumerator name of a hipTextureAddressMode.
// Values without a dedicated case print as "hipAddressModeWrap".
inline std::ostream& operator<<(std::ostream& os, const hipTextureAddressMode& s) {
  switch (s) {
    case hipAddressModeWrap:
      return os << "hipAddressModeWrap";
    case hipAddressModeClamp:
      return os << "hipAddressModeClamp";
    case hipAddressModeMirror:
      return os << "hipAddressModeMirror";
    case hipAddressModeBorder:
      return os << "hipAddressModeBorder";
    default:
      return os << "hipAddressModeWrap";
  }
}
// Streams the enumerator name of a hipMemcpyKind.
// Values without a dedicated case print as "hipMemcpyDefault".
inline std::ostream& operator<<(std::ostream& os, const hipMemcpyKind& s) {
  switch (s) {
    case hipMemcpyHostToHost:
      return os << "hipMemcpyHostToHost";
    case hipMemcpyHostToDevice:
      return os << "hipMemcpyHostToDevice";
    case hipMemcpyDeviceToHost:
      return os << "hipMemcpyDeviceToHost";
    case hipMemcpyDeviceToDevice:
      return os << "hipMemcpyDeviceToDevice";
    case hipMemcpyDefault:
      return os << "hipMemcpyDefault";
    default:
      return os << "hipMemcpyDefault";
  }
}
// Streams the enumerator name of a hipChannelFormatKind.
// Values without a dedicated case print as "hipChannelFormatKindNone".
inline std::ostream& operator<<(std::ostream& os, const hipChannelFormatKind& s) {
  switch (s) {
    case hipChannelFormatKindSigned:
      os << "hipChannelFormatKindSigned";
      break;
    case hipChannelFormatKindUnsigned:
      // Previously printed "hipMemcpyHostToDevice", a name from a different enum.
      os << "hipChannelFormatKindUnsigned";
      break;
    case hipChannelFormatKindFloat:
      os << "hipChannelFormatKindFloat";
      break;
    case hipChannelFormatKindNone:
      os << "hipChannelFormatKindNone";
      break;
    default:
      os << "hipChannelFormatKindNone";
  }
  return os;
}
// Streams the enumerator name of a hipArray_Format.
// Values without a dedicated case print as "HIP_AD_FORMAT_FLOAT".
inline std::ostream& operator<<(std::ostream& os, const hipArray_Format& s) {
  switch (s) {
    case HIP_AD_FORMAT_UNSIGNED_INT8:
      return os << "HIP_AD_FORMAT_UNSIGNED_INT8";
    case HIP_AD_FORMAT_UNSIGNED_INT16:
      return os << "HIP_AD_FORMAT_UNSIGNED_INT16";
    case HIP_AD_FORMAT_UNSIGNED_INT32:
      return os << "HIP_AD_FORMAT_UNSIGNED_INT32";
    case HIP_AD_FORMAT_SIGNED_INT8:
      return os << "HIP_AD_FORMAT_SIGNED_INT8";
    case HIP_AD_FORMAT_SIGNED_INT16:
      return os << "HIP_AD_FORMAT_SIGNED_INT16";
    case HIP_AD_FORMAT_SIGNED_INT32:
      return os << "HIP_AD_FORMAT_SIGNED_INT32";
    case HIP_AD_FORMAT_HALF:
      return os << "HIP_AD_FORMAT_HALF";
    case HIP_AD_FORMAT_FLOAT:
      return os << "HIP_AD_FORMAT_FLOAT";
    default:
      return os << "HIP_AD_FORMAT_FLOAT";
  }
}
// Streams the enumerator name of a hipResourceViewFormat.
// Values without a dedicated case print as "hipResViewFormatNone".
inline std::ostream& operator<<(std::ostream& os, const hipResourceViewFormat& s) {
  switch (s) {
    case hipResViewFormatNone:
      return os << "hipResViewFormatNone";

    // 8-bit channels
    case hipResViewFormatUnsignedChar1:
      return os << "hipResViewFormatUnsignedChar1";
    case hipResViewFormatUnsignedChar2:
      return os << "hipResViewFormatUnsignedChar2";
    case hipResViewFormatUnsignedChar4:
      return os << "hipResViewFormatUnsignedChar4";
    case hipResViewFormatSignedChar1:
      return os << "hipResViewFormatSignedChar1";
    case hipResViewFormatSignedChar2:
      return os << "hipResViewFormatSignedChar2";
    case hipResViewFormatSignedChar4:
      return os << "hipResViewFormatSignedChar4";

    // 16-bit integer channels
    case hipResViewFormatUnsignedShort1:
      return os << "hipResViewFormatUnsignedShort1";
    case hipResViewFormatUnsignedShort2:
      return os << "hipResViewFormatUnsignedShort2";
    case hipResViewFormatUnsignedShort4:
      return os << "hipResViewFormatUnsignedShort4";
    case hipResViewFormatSignedShort1:
      return os << "hipResViewFormatSignedShort1";
    case hipResViewFormatSignedShort2:
      return os << "hipResViewFormatSignedShort2";
    case hipResViewFormatSignedShort4:
      return os << "hipResViewFormatSignedShort4";

    // 32-bit integer channels
    case hipResViewFormatUnsignedInt1:
      return os << "hipResViewFormatUnsignedInt1";
    case hipResViewFormatUnsignedInt2:
      return os << "hipResViewFormatUnsignedInt2";
    case hipResViewFormatUnsignedInt4:
      return os << "hipResViewFormatUnsignedInt4";
    case hipResViewFormatSignedInt1:
      return os << "hipResViewFormatSignedInt1";
    case hipResViewFormatSignedInt2:
      return os << "hipResViewFormatSignedInt2";
    case hipResViewFormatSignedInt4:
      return os << "hipResViewFormatSignedInt4";

    // Floating-point channels
    case hipResViewFormatHalf1:
      return os << "hipResViewFormatHalf1";
    case hipResViewFormatHalf2:
      return os << "hipResViewFormatHalf2";
    case hipResViewFormatHalf4:
      return os << "hipResViewFormatHalf4";
    case hipResViewFormatFloat1:
      return os << "hipResViewFormatFloat1";
    case hipResViewFormatFloat2:
      return os << "hipResViewFormatFloat2";
    case hipResViewFormatFloat4:
      return os << "hipResViewFormatFloat4";

    // Block-compressed formats
    case hipResViewFormatUnsignedBlockCompressed1:
      return os << "hipResViewFormatUnsignedBlockCompressed1";
    case hipResViewFormatUnsignedBlockCompressed2:
      return os << "hipResViewFormatUnsignedBlockCompressed2";
    case hipResViewFormatUnsignedBlockCompressed3:
      return os << "hipResViewFormatUnsignedBlockCompressed3";
    case hipResViewFormatUnsignedBlockCompressed4:
      return os << "hipResViewFormatUnsignedBlockCompressed4";
    case hipResViewFormatSignedBlockCompressed4:
      return os << "hipResViewFormatSignedBlockCompressed4";
    case hipResViewFormatUnsignedBlockCompressed5:
      return os << "hipResViewFormatUnsignedBlockCompressed5";
    case hipResViewFormatSignedBlockCompressed5:
      return os << "hipResViewFormatSignedBlockCompressed5";
    case hipResViewFormatUnsignedBlockCompressed6H:
      return os << "hipResViewFormatUnsignedBlockCompressed6H";
    case hipResViewFormatSignedBlockCompressed6H:
      return os << "hipResViewFormatSignedBlockCompressed6H";
    case hipResViewFormatUnsignedBlockCompressed7:
      return os << "hipResViewFormatUnsignedBlockCompressed7";

    default:
      return os << "hipResViewFormatNone";
  }
}
// Streams the enumerator name of a hipFunction_attribute.
// Values without a dedicated case print as "HIP_FUNC_ATTRIBUTE_MAX".
inline std::ostream& operator<<(std::ostream& os, const hipFunction_attribute& s) {
  switch (s) {
    case HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK:
      return os << "HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK";
    case HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES:
      return os << "HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES";
    case HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES:
      return os << "HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES";
    case HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES:
      return os << "HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES";
    case HIP_FUNC_ATTRIBUTE_NUM_REGS:
      return os << "HIP_FUNC_ATTRIBUTE_NUM_REGS";
    case HIP_FUNC_ATTRIBUTE_PTX_VERSION:
      return os << "HIP_FUNC_ATTRIBUTE_PTX_VERSION";
    case HIP_FUNC_ATTRIBUTE_BINARY_VERSION:
      return os << "HIP_FUNC_ATTRIBUTE_BINARY_VERSION";
    case HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA:
      return os << "HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA";
    case HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES:
      return os << "HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES";
    case HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT:
      return os << "HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT";
    case HIP_FUNC_ATTRIBUTE_MAX:
      return os << "HIP_FUNC_ATTRIBUTE_MAX";
    default:
      return os << "HIP_FUNC_ATTRIBUTE_MAX";
  }
}
// Streams the enumerator name of a hiprtcResult.
// Values without a dedicated case print as "HIPRTC_ERROR_INTERNAL_ERROR".
inline std::ostream& operator<<(std::ostream& os, const hiprtcResult& s) {
  switch (s) {
    case HIPRTC_SUCCESS:
      os << "HIPRTC_SUCCESS";
      break;
    case HIPRTC_ERROR_OUT_OF_MEMORY:
      os << "HIPRTC_ERROR_OUT_OF_MEMORY";
      break;
    case HIPRTC_ERROR_PROGRAM_CREATION_FAILURE:
      os << "HIPRTC_ERROR_PROGRAM_CREATION_FAILURE";
      break;
    case HIPRTC_ERROR_INVALID_INPUT:
      os << "HIPRTC_ERROR_INVALID_INPUT";
      break;
    case HIPRTC_ERROR_INVALID_PROGRAM:
      os << "HIPRTC_ERROR_INVALID_PROGRAM";
      break;
    case HIPRTC_ERROR_INVALID_OPTION:
      os << "HIPRTC_ERROR_INVALID_OPTION";
      break;
    case HIPRTC_ERROR_COMPILATION:
      os << "HIPRTC_ERROR_COMPILATION";
      break;
    case HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE:
      os << "HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE";
      break;
    case HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION:
      os << "HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION";
      break;
    case HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION:
      // The leading 'H' was missing from this string.
      os << "HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION";
      break;
    case HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID:
      os << "HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID";
      break;
    case HIPRTC_ERROR_INTERNAL_ERROR:
      os << "HIPRTC_ERROR_INTERNAL_ERROR";
      break;
    default:
      os << "HIPRTC_ERROR_INTERNAL_ERROR";
  }
  return os;
}
// Streams the enumerator name of a hipJitOption.
// Values without a dedicated case print as "HIPRTC_JIT_MAX_REGISTERS".
inline std::ostream& operator<<(std::ostream& os, const hipJitOption& s) {
  switch (s) {
    case HIPRTC_JIT_MAX_REGISTERS:
      os << "HIPRTC_JIT_MAX_REGISTERS";
      break;
    case HIPRTC_JIT_THREADS_PER_BLOCK:
      os << "HIPRTC_JIT_THREADS_PER_BLOCK";
      break;
    case HIPRTC_JIT_WALL_TIME:
      os << "HIPRTC_JIT_WALL_TIME";
      break;
    case HIPRTC_JIT_INFO_LOG_BUFFER:
      os << "HIPRTC_JIT_INFO_LOG_BUFFER";
      break;
    case HIPRTC_JIT_INFO_LOG_BUFFER_SIZE_BYTES:
      // Previously printed the ERROR_LOG name, making the two options
      // indistinguishable in the output.
      os << "HIPRTC_JIT_INFO_LOG_BUFFER_SIZE_BYTES";
      break;
    case HIPRTC_JIT_ERROR_LOG_BUFFER:
      os << "HIPRTC_JIT_ERROR_LOG_BUFFER";
      break;
    case HIPRTC_JIT_ERROR_LOG_BUFFER_SIZE_BYTES:
      os << "HIPRTC_JIT_ERROR_LOG_BUFFER_SIZE_BYTES";
      break;
    case HIPRTC_JIT_OPTIMIZATION_LEVEL:
      os << "HIPRTC_JIT_OPTIMIZATION_LEVEL";
      break;
    case HIPRTC_JIT_TARGET_FROM_HIPCONTEXT:
      os << "HIPRTC_JIT_TARGET_FROM_HIPCONTEXT";
      break;
    case HIPRTC_JIT_TARGET:
      os << "HIPRTC_JIT_TARGET";
      break;
    case HIPRTC_JIT_FALLBACK_STRATEGY:
      os << "HIPRTC_JIT_FALLBACK_STRATEGY";
      break;
    case HIPRTC_JIT_GENERATE_DEBUG_INFO:
      os << "HIPRTC_JIT_GENERATE_DEBUG_INFO";
      break;
    case HIPRTC_JIT_CACHE_MODE:
      os << "HIPRTC_JIT_CACHE_MODE";
      break;
    case HIPRTC_JIT_NEW_SM3X_OPT:
      os << "HIPRTC_JIT_NEW_SM3X_OPT";
      break;
    case HIPRTC_JIT_FAST_COMPILE:
      os << "HIPRTC_JIT_FAST_COMPILE";
      break;
    case HIPRTC_JIT_GLOBAL_SYMBOL_NAMES:
      os << "HIPRTC_JIT_GLOBAL_SYMBOL_NAMES";
      break;
    case HIPRTC_JIT_GLOBAL_SYMBOL_ADDRESS:
      os << "HIPRTC_JIT_GLOBAL_SYMBOL_ADDRESS";
      break;
    case HIPRTC_JIT_GLOBAL_SYMBOL_COUNT:
      os << "HIPRTC_JIT_GLOBAL_SYMBOL_COUNT";
      break;
    case HIPRTC_JIT_LTO:
      os << "HIPRTC_JIT_LTO";
      break;
    case HIPRTC_JIT_FTZ:
      os << "HIPRTC_JIT_FTZ";
      break;
    case HIPRTC_JIT_PREC_DIV:
      os << "HIPRTC_JIT_PREC_DIV";
      break;
    case HIPRTC_JIT_PREC_SQRT:
      os << "HIPRTC_JIT_PREC_SQRT";
      break;
    case HIPRTC_JIT_FMA:
      os << "HIPRTC_JIT_FMA";
      break;
    case HIPRTC_JIT_NUM_OPTIONS:
      os << "HIPRTC_JIT_NUM_OPTIONS";
      break;
    default:
      os << "HIPRTC_JIT_MAX_REGISTERS";
  }
  return os;
}
// Streams the enumerator name of a hipFuncCache_t.
// Values without a dedicated case print as "hipFuncCachePreferNone".
inline std::ostream& operator<<(std::ostream& os, const hipFuncCache_t& s) {
  switch (s) {
    case hipFuncCachePreferNone:
      return os << "hipFuncCachePreferNone";
    case hipFuncCachePreferShared:
      return os << "hipFuncCachePreferShared";
    case hipFuncCachePreferL1:
      return os << "hipFuncCachePreferL1";
    case hipFuncCachePreferEqual:
      return os << "hipFuncCachePreferEqual";
    default:
      return os << "hipFuncCachePreferNone";
  }
}
// Streams the enumerator name of a hipSharedMemConfig.
// Values without a dedicated case print as "hipSharedMemBankSizeDefault".
inline std::ostream& operator<<(std::ostream& os, const hipSharedMemConfig& s) {
  switch (s) {
    case hipSharedMemBankSizeDefault:
      return os << "hipSharedMemBankSizeDefault";
    case hipSharedMemBankSizeFourByte:
      return os << "hipSharedMemBankSizeFourByte";
    case hipSharedMemBankSizeEightByte:
      return os << "hipSharedMemBankSizeEightByte";
    default:
      return os << "hipSharedMemBankSizeDefault";
  }
}
// Streams the enumerator name of a hipDataType.
// Values without a dedicated case print as "HIP_R_16F".
inline std::ostream& operator<<(std::ostream& os, const hipDataType& s) {
  switch (s) {
    case HIP_R_16F:
      return os << "HIP_R_16F";
    case HIP_R_32F:
      return os << "HIP_R_32F";
    case HIP_R_64F:
      return os << "HIP_R_64F";
    case HIP_C_16F:
      return os << "HIP_C_16F";
    case HIP_C_32F:
      return os << "HIP_C_32F";
    case HIP_C_64F:
      return os << "HIP_C_64F";
    default:
      return os << "HIP_R_16F";
  }
}
// Streams the enumerator name of a hipLibraryPropertyType.
// Values without a dedicated case print as "HIP_LIBRARY_MAJOR_VERSION".
inline std::ostream& operator<<(std::ostream& os, const hipLibraryPropertyType& s) {
  switch (s) {
    case HIP_LIBRARY_MAJOR_VERSION:
      return os << "HIP_LIBRARY_MAJOR_VERSION";
    case HIP_LIBRARY_MINOR_VERSION:
      return os << "HIP_LIBRARY_MINOR_VERSION";
    case HIP_LIBRARY_PATCH_LEVEL:
      return os << "HIP_LIBRARY_PATCH_LEVEL";
    default:
      return os << "HIP_LIBRARY_MAJOR_VERSION";
  }
}
// Streams whatever hip_api_name() yields for the given API id.
inline std::ostream& operator<<(std::ostream& os, const hip_api_id_t& s) {
  return os << hip_api_name(s);
}
// Pointer overload: prints the pointee, or the text "nullptr" for a null pointer.
inline std::ostream& operator<<(std::ostream& os, const hip_api_id_t* s) {
  if (s == nullptr) {
    return os << "nullptr";
  }
  return os << *s;
}
// Streams a hipTextureDesc as a brace-delimited, comma-separated list:
// {{addressMode[0..2]},filterMode,readMode,sRGB,{borderColor[0..3]},
//  normalizedCoords,mipmapFilterMode,mipmapLevelBias,minMipmapLevelClamp,
//  maxMipmapLevelClamp}
inline std::ostream& operator<<(std::ostream& os, const hipTextureDesc& s) {
  os << '{';

  // Address modes form a nested group.
  os << '{' << s.addressMode[0] << ',' << s.addressMode[1] << ',' << s.addressMode[2] << '}'
     << ',';

  os << s.filterMode << ',' << s.readMode << ',' << s.sRGB << ',';

  // Border colour components form a second nested group.
  os << '{' << s.borderColor[0] << ',' << s.borderColor[1] << ',' << s.borderColor[2] << ','
     << s.borderColor[3] << '}' << ',';

  os << s.normalizedCoords << ',' << s.mipmapFilterMode << ',' << s.mipmapLevelBias << ','
     << s.minMipmapLevelClamp << ',' << s.maxMipmapLevelClamp;

  os << '}';
  return os;
}
// Pointer overload: prints the pointee, or the text "nullptr" for a null pointer.
inline std::ostream& operator<<(std::ostream& os, const hipTextureDesc* s) {
  if (s == nullptr) {
    return os << "nullptr";
  }
  return os << *s;
}
// Streams a dim3 as "{x,y,z}".
inline std::ostream& operator<<(std::ostream& os, const dim3& s) {
  os << '{' << s.x << ',' << s.y << ',' << s.z << '}';
  return os;
}
// Pointer overload: prints the pointee, or the text "nullptr" for a null pointer.
inline std::ostream& operator<<(std::ostream& os, const dim3* s) {
  if (s == nullptr) {
    return os << "nullptr";
  }
  return os << *s;
}
// Streams a hipChannelFormatDesc as "{x,y,z,w,f}".
inline std::ostream& operator<<(std::ostream& os, const hipChannelFormatDesc& s) {
  os << '{' << s.x << ',' << s.y << ',' << s.z << ',' << s.w << ',' << s.f << '}';
  return os;
}
// Pointer overload: prints the pointee, or the text "nullptr" for a null pointer.
inline std::ostream& operator<<(std::ostream& os, const hipChannelFormatDesc* s) {
  if (s == nullptr) {
    return os << "nullptr";
  }
  return os << *s;
}
// Streams a hipMipmappedArray as "{data,desc,width,height,depth}".
inline std::ostream& operator<<(std::ostream& os, const hipMipmappedArray& s) {
  os << '{' << s.data << ',' << s.desc << ',';
  os << s.width << ',' << s.height << ',' << s.depth << '}';
  return os;
}
// Pointer overload: prints the pointee, or the text "nullptr" for a null pointer.
inline std::ostream& operator<<(std::ostream& os, const hipMipmappedArray* s) {
  if (s == nullptr) {
    return os << "nullptr";
  }
  return os << *s;
}
// Streams a hipResourceDesc as "{resType,{<fields>}}", where <fields> are the
// members of the `res` union alternative selected by resType:
//   hipResourceTypeLinear          devPtr,desc,sizeInBytes
//   hipResourceTypePitch2D         devPtr,desc,width,height,pitchInBytes
//   hipResourceTypeArray           array
//   hipResourceTypeMipmappedArray  mipmap
// For any other resType the inner group is left empty.
inline std::ostream& operator<<(std::ostream& os, const hipResourceDesc& s) {
  os << '{'
     << s.resType
     << ','
     << '{';
  switch (s.resType) {
    case hipResourceTypeLinear:
      os << s.res.linear.devPtr
         << ','
         << s.res.linear.desc
         << ','
         << s.res.linear.sizeInBytes;
      break;
    case hipResourceTypePitch2D:
      os << s.res.pitch2D.devPtr
         << ','
         << s.res.pitch2D.desc
         << ','
         << s.res.pitch2D.width
         << ','
         << s.res.pitch2D.height
         << ','
         << s.res.pitch2D.pitchInBytes;
      break;
    case hipResourceTypeArray:
      os << s.res.array.array;
      break;
    case hipResourceTypeMipmappedArray:
      os << s.res.mipmap.mipmap;
      break;
    default:
      break;
  }
  // Close both the inner group and the outer struct; previously only one
  // brace was written, leaving the output unbalanced.
  os << '}'
     << '}';
  return os;
}
// Pointer overload: prints the pointee, or the text "nullptr" for a null pointer.
inline std::ostream& operator<<(std::ostream& os, const hipResourceDesc* s) {
  if (s == nullptr) {
    return os << "nullptr";
  }
  return os << *s;
}
// Streams a hipArray as
// "{data,desc,type,width,height,depth,Format,NumChannels,isDrv,textureType}".
inline std::ostream& operator<<(std::ostream& os, const hipArray& s) {
  os << '{' << s.data << ',' << s.desc << ',' << s.type << ',';
  os << s.width << ',' << s.height << ',' << s.depth << ',';
  os << s.Format << ',' << s.NumChannels << ',' << s.isDrv << ',' << s.textureType << '}';
  return os;
}
// Pointer overload: prints the pointee, or the text "nullptr" for a null pointer.
inline std::ostream& operator<<(std::ostream& os, const hipArray* s) {
  if (s == nullptr) {
    return os << "nullptr";
  }
  return os << *s;
}
// Streams a textureReference as a brace-delimited, comma-separated list:
// {normalized,readMode,filterMode,{addressMode[0..2]},channelDesc,sRGB,
//  maxAnisotropy,mipmapFilterMode,mipmapLevelBias,minMipmapLevelClamp,
//  maxMipmapLevelClamp,textureObject}
inline std::ostream& operator<<(std::ostream& os, const textureReference& s) {
  os << '{';
  os << s.normalized << ',' << s.readMode << ',' << s.filterMode << ',';

  // Address modes form a nested group.
  os << '{' << s.addressMode[0] << ',' << s.addressMode[1] << ',' << s.addressMode[2] << '}'
     << ',';

  os << s.channelDesc << ',' << s.sRGB << ',' << s.maxAnisotropy << ',';
  os << s.mipmapFilterMode << ',' << s.mipmapLevelBias << ',' << s.minMipmapLevelClamp << ','
     << s.maxMipmapLevelClamp << ',';
  os << s.textureObject;
  os << '}';
  return os;
}
// Pointer overload: prints the pointee, or the text "nullptr" for a null pointer.
inline std::ostream& operator<<(std::ostream& os, const textureReference* s) {
  if (s == nullptr) {
    return os << "nullptr";
  }
  return os << *s;
}
// Streams whatever hipGetErrorName() yields for the given error code.
inline std::ostream& operator<<(std::ostream& os, const hipError_t& s) {
  return os << hipGetErrorName(s);
}
// Pointer overload: prints the pointee, or the text "nullptr" for a null pointer.
inline std::ostream& operator<<(std::ostream& os, const hipError_t* s) {
  if (s == nullptr) {
    return os << "nullptr";
  }
  return os << *s;
}
// Streams a hipResourceViewDesc as
// "{format,width,height,depth,firstMipmapLevel,lastMipmapLevel,firstLayer,lastLayer}".
inline std::ostream& operator<<(std::ostream& os, const hipResourceViewDesc& s) {
  os << '{' << s.format << ',';
  os << s.width << ',' << s.height << ',' << s.depth << ',';
  os << s.firstMipmapLevel << ',' << s.lastMipmapLevel << ',';
  os << s.firstLayer << ',' << s.lastLayer << '}';
  return os;
}
// Pointer overload: prints the pointee, or the text "nullptr" for a null pointer.
inline std::ostream& operator<<(std::ostream& os, const hipResourceViewDesc* s) {
  if (s == nullptr) {
    return os << "nullptr";
  }
  return os << *s;
}
// Streams a HIP_ARRAY_DESCRIPTOR as "{Width,Height,Format,NumChannels}".
inline std::ostream& operator<<(std::ostream& os, const HIP_ARRAY_DESCRIPTOR& s) {
  os << '{' << s.Width << ',' << s.Height << ',' << s.Format << ',' << s.NumChannels << '}';
  return os;
}
// Pointer overload: prints the pointee, or the text "nullptr" for a null pointer.
inline std::ostream& operator<<(std::ostream& os, const HIP_ARRAY_DESCRIPTOR* s) {
  if (s == nullptr) {
    return os << "nullptr";
  }
  return os << *s;
}
// Streams a HIP_ARRAY3D_DESCRIPTOR as "{Width,Height,Depth,Format,NumChannels,Flags}".
inline std::ostream& operator<<(std::ostream& os, const HIP_ARRAY3D_DESCRIPTOR& s) {
  os << '{' << s.Width << ',' << s.Height << ',' << s.Depth << ',';
  os << s.Format << ',' << s.NumChannels << ',' << s.Flags << '}';
  return os;
}
// Pointer overload: prints the pointee, or the text "nullptr" for a null pointer.
inline std::ostream& operator<<(std::ostream& os, const HIP_ARRAY3D_DESCRIPTOR* s) {
  if (s == nullptr) {
    return os << "nullptr";
  }
  return os << *s;
}
// Streams a hipExtent as "{width,height,depth}".
inline std::ostream& operator<<(std::ostream& os, const hipExtent& s) {
  os << '{' << s.width << ',' << s.height << ',' << s.depth << '}';
  return os;
}
// Placeholder printer for hipIpcEventHandle_t: nothing is written for the
// handle and the stream is returned unchanged.
inline std::ostream& operator<<(std::ostream& os, const hipIpcEventHandle_t& s) {
// TODO: print the handle contents.
return os;
}
// Placeholder printer for a hipIpcEventHandle_t pointer: nothing is written
// (not even for a null pointer) and the stream is returned unchanged.
inline std::ostream& operator<<(std::ostream& os, const hipIpcEventHandle_t* s) {
// TODO: print the handle contents.
return os;
}
+758
查看文件
@@ -0,0 +1,758 @@
/* Copyright (c) 2010 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "top.hpp"
#include "hip/hip_runtime.h"
#include "hip_internal.hpp"
#include "cl_gl_amd.hpp"
#include "cl_common.hpp"
#include <GL/gl.h>
#include <GL/glext.h>
#include "hip_conversions.hpp"
namespace amd {
// Flag handed to std::call_once so that setupGLInteropOnce() runs a single time.
static std::once_flag interopOnce;
}
// Sets up GL context association with amd context.
// NOTE: Refer to Context setup code in OCLTestImp.cpp
void setupGLInteropOnce() {
amd::Context* amdContext = hip::getCurrentDevice()->asContext();
//current context will be read in amdContext->create
cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
(cl_context_properties)AMD_PLATFORM,
ROCCLR_HIP_GL_CONTEXT_KHR,
(cl_context_properties) nullptr,
#ifdef _WIN32
ROCCLR_HIP_WGL_HDC_KHR,
(cl_context_properties) nullptr,
#else
ROCCLR_HIP_GLX_DISPLAY_KHR,
(cl_context_properties) nullptr,
#endif
0};
amd::Context::Info info;
if (CL_SUCCESS != amd::Context::checkProperties(properties, &info)) {
LogError("Context setup failed \n");
return;
}
amdContext->setInfo(info);
if (CL_SUCCESS != amdContext->create(properties)) {
LogError("Context setup failed \n");
}
}
// Appends the first `num_objects` entries of `mem_objects`, viewed as
// amd::Memory*, to `interopObjects`.
//
// Returns hipErrorUnknown when exactly one of "count is zero" and "array is
// null" holds, and hipErrorInvalidResourceHandle as soon as an entry is null
// or has no interop object. Entries appended before a failing one stay in
// `interopObjects`.
static inline hipError_t hipSetInteropObjects(int num_objects, void** mem_objects,
                                              std::vector<amd::Memory*>& interopObjects) {
  const bool countIsZero = (num_objects == 0);
  const bool arrayIsNull = (mem_objects == nullptr);
  if (countIsZero != arrayIsNull) {
    return hipErrorUnknown;
  }

  for (int idx = 0; idx < num_objects; ++idx) {
    void* const handle = mem_objects[idx];
    if (handle == nullptr) {
      return hipErrorInvalidResourceHandle;
    }

    amd::Memory* const memory = reinterpret_cast<amd::Memory*>(handle);
    if (memory->getInteropObj() == nullptr) {
      return hipErrorInvalidResourceHandle;
    }

    interopObjects.push_back(memory);
  }

  return hipSuccess;
}
// NOTE: This method corresponds to OpenCL functionality in clGetGLContextInfoKHR()
//
// Reports which HIP devices can be bound to the current GL context.
//
//   pHipDeviceCount  [out] number of entries written to pHipDevices
//   pHipDevices      [out] array receiving HIP device indices
//   hipDeviceCount   capacity of pHipDevices; clamped to the number of HIP devices
//   deviceList       hipGLDeviceListCurrentFrame reports the first matching
//                    device only; hipGLDeviceListAll reports every matching
//                    device; hipGLDeviceListNextFrame is not supported
//
// Returns hipErrorNotSupported, hipErrorInvalidValue, hipErrorNoDevice (no
// device matched) or hipSuccess.
hipError_t hipGLGetDevices(unsigned int* pHipDeviceCount, int* pHipDevices,
                           unsigned int hipDeviceCount, hipGLDeviceList deviceList) {
  HIP_INIT_API(hipGLGetDevices, pHipDeviceCount, pHipDevices, hipDeviceCount, deviceList);
  std::call_once(amd::interopOnce, setupGLInteropOnce);
  static const bool VALIDATE_ONLY = true;
  if (deviceList == hipGLDeviceListNextFrame) {
    LogError(" hipGLDeviceListNextFrame not supported yet.\n");
    HIP_RETURN(hipErrorNotSupported);
  }
  if (pHipDeviceCount == nullptr || pHipDevices == nullptr || hipDeviceCount == 0) {
    LogError(" Invalid Argument \n");
    HIP_RETURN(hipErrorInvalidValue);
  }
  // Never look past the devices that actually exist.
  hipDeviceCount = std::min(hipDeviceCount, static_cast<unsigned int>(g_devices.size()));
  amd::Context::Info info = hip::getCurrentDevice()->asContext()->info();
  if (!(info.flags_ & amd::Context::GLDeviceKhr)) {
    LogError("Failed : Invalid Shared Group Reference \n");
    HIP_RETURN(hipErrorInvalidValue);
  }
  // Refresh the stored GL context handle from the calling thread.
  amd::GLFunctions* glenv = hip::getCurrentDevice()->asContext()->glenv();
  if (glenv != nullptr) {
#ifdef _WIN32
    info.hCtx_ = glenv->wglGetCurrentContext_();
#else
    info.hCtx_ = glenv->glXGetCurrentContext_();
#endif
    hip::getCurrentDevice()->asContext()->setInfo(info);
    glenv->update(reinterpret_cast<intptr_t>(info.hCtx_));
  }
  *pHipDeviceCount = 0;
  switch (deviceList) {
    case hipGLDeviceListCurrentFrame:
      for (unsigned int i = 0; i < hipDeviceCount; ++i) {
        const std::vector<amd::Device*>& devices = g_devices[i]->devices();
        if (devices.size() > 0 &&
            devices[0]->bindExternalDevice(info.flags_, info.hDev_, info.hCtx_, VALIDATE_ONLY)) {
          pHipDevices[0] = static_cast<int>(i);
          *pHipDeviceCount = 1;
          break;
        }
      }
      break;
    case hipGLDeviceListAll: {
      unsigned int foundDeviceCount = 0;
      // Collect every matching device. The loop used to stop at the first
      // match, which made this mode identical to hipGLDeviceListCurrentFrame.
      // foundDeviceCount can never exceed hipDeviceCount, so pHipDevices is
      // not overrun.
      for (unsigned int i = 0; i < hipDeviceCount; ++i) {
        const std::vector<amd::Device*>& devices = g_devices[i]->devices();
        if (devices.size() > 0 &&
            devices[0]->bindExternalDevice(info.flags_, info.hDev_, info.hCtx_, VALIDATE_ONLY)) {
          pHipDevices[foundDeviceCount++] = static_cast<int>(i);
        }
      }
      *pHipDeviceCount = foundDeviceCount;
    } break;
    default:
      LogWarning("Invalid deviceList value");
      HIP_RETURN(hipErrorInvalidValue);
  }
  HIP_RETURN(*pHipDeviceCount > 0 ? hipSuccess : hipErrorNoDevice);
}
// Drains pending GL errors, logging a warning for each one consumed.
// Draining stops when glGetError_ reports GL_NO_ERROR or returns the same
// code twice in a row (so a repeating error cannot loop forever).
static inline void clearGLErrors(const amd::Context& amdContext) {
  GLenum previous = GL_NO_ERROR;
  for (GLenum current = amdContext.glenv()->glGetError_();
       current != GL_NO_ERROR && current != previous;
       current = amdContext.glenv()->glGetError_()) {
    previous = current;
    LogWarning("GL error");
  }
}
// Consumes all pending GL errors, logging a warning for each, and returns the
// last one seen (GL_NO_ERROR when none was pending).
static inline GLenum checkForGLError(const amd::Context& amdContext) {
  GLenum lastSeen = GL_NO_ERROR;
  for (;;) {
    const GLenum current = amdContext.glenv()->glGetError_();
    if (current == GL_NO_ERROR) {
      break;
    }
    lastSeen = current;  // Just return the last GL error
    LogWarning("Check GL error");
  }
  return lastSeen;
}
// Creates a hipArray describing a view of the image behind a graphics
// resource at mip level `mipLevel`, stores it in `*array` and records it in
// hip::hipArraySet.
//
//   array       [out] receives the newly allocated hipArray
//   resource    graphics resource; treated as an amd::Memory that must be an image
//   arrayIndex  only 0 is implemented (asserted)
//   mipLevel    mip level passed to createView()
//
// Returns hipErrorInvalidValue when `array` or `resource` is null, when the
// resource is not an image, or when the view cannot be created.
hipError_t hipGraphicsSubResourceGetMappedArray(hipArray_t* array, hipGraphicsResource_t resource,
                                                unsigned int arrayIndex, unsigned int mipLevel) {
  HIP_INIT_API(hipGraphicsSubResourceGetMappedArray, array, resource, arrayIndex, mipLevel);
  amd::Context& amdContext = *(hip::getCurrentDevice()->asContext());
  if (array == nullptr || resource == nullptr) {
    LogError("invalid array/resource");
    HIP_RETURN(hipErrorInvalidValue);
  }
  amd::Image* image = (reinterpret_cast<amd::Memory*>(resource))->asImage();
  if (image == nullptr) {
    LogError("invalid resource/image");
    HIP_RETURN(hipErrorInvalidValue);
  }
  // arrayIndex higher than zero not implemented
  assert(arrayIndex == 0);
  amd::Image* view = image->createView(amdContext, image->getImageFormat(), nullptr, mipLevel, 0);
  // Bail out before touching the view (and before allocating the hipArray)
  // if it could not be created; it was previously dereferenced unchecked.
  if (view == nullptr) {
    LogError("failed to create image view");
    HIP_RETURN(hipErrorInvalidValue);
  }
  hipArray* myarray = new hipArray();
  myarray->data = as_cl<amd::Memory>(view);
  myarray->width = view->getWidth();
  myarray->height = view->getHeight();
  myarray->depth = view->getDepth();
  const cl_mem_object_type image_type =
      hip::getCLMemObjectType(myarray->width, myarray->height, myarray->depth, hipArrayDefault);
  myarray->type = image_type;
  // Format information is taken from the parent image.
  amd::Image::Format f = image->getImageFormat();
  myarray->Format = hip::getCL2hipArrayFormat(f.image_channel_data_type);
  myarray->desc = hip::getChannelFormatDesc(f.getNumChannels(), myarray->Format);
  myarray->NumChannels = hip::getNumChannels(myarray->desc);
  myarray->isDrv = 0;
  myarray->textureType = 0;
  *array = myarray;
  {
    // Register the array in the global set under its lock.
    amd::ScopedLock lock(hip::hipArraySetLock);
    hip::hipArraySet.insert(*array);
  }
  HIP_RETURN(hipSuccess);
}
hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint image, GLenum target,
unsigned int flags) {
HIP_INIT_API(hipGraphicsGLRegisterImage, resource, image, target, flags);
if (!((flags == hipGraphicsRegisterFlagsNone) || (flags & hipGraphicsRegisterFlagsReadOnly) ||
(flags & hipGraphicsRegisterFlagsWriteDiscard) ||
(flags & hipGraphicsRegisterFlagsSurfaceLoadStore) ||
(flags & hipGraphicsRegisterFlagsTextureGather))) {
LogError("invalid parameter \"flags\"");
HIP_RETURN(hipErrorInvalidValue);
}
if (resource == nullptr) {
LogError("invalid resource");
HIP_RETURN(hipErrorInvalidValue);
}
GLint miplevel = 0;
amd::Context& amdContext = *(hip::getCurrentDevice()->asContext());
if (amdContext.glenv() == nullptr) {
LogError("invalid context, gl interop not initialized");
HIP_RETURN(hipErrorInvalidValue);
}
amd::GLFunctions::SetIntEnv ie(amdContext.glenv());
if (!ie.isValid()) {
LogWarning("\"amdContext\" is not created from GL context or share list \n");
HIP_RETURN(hipErrorUnknown);
}
amd::ImageGL* pImageGL = NULL;
GLenum glErr;
GLenum glTarget = 0;
GLenum glInternalFormat;
cl_image_format clImageFormat;
uint dim = 1;
cl_mem_object_type clType;
cl_gl_object_type clGLType;
GLsizei numSamples = 1;
GLint gliTexWidth = 1;
GLint gliTexHeight = 1;
GLint gliTexDepth = 1;
// Verify GL texture object
clearGLErrors(amdContext);
if ((GL_FALSE == amdContext.glenv()->glIsTexture_(image)) ||
(GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_()))) {
LogWarning("\"texture\" is not a GL texture object");
HIP_RETURN(hipErrorUnknown);
}
bool isImage = true;
// Check target value validity
switch (target) {
case GL_TEXTURE_BUFFER:
glTarget = GL_TEXTURE_BUFFER;
dim = 1;
clType = CL_MEM_OBJECT_IMAGE1D_BUFFER;
clGLType = CL_GL_OBJECT_TEXTURE_BUFFER;
isImage = false;
break;
case GL_TEXTURE_1D:
glTarget = GL_TEXTURE_1D;
dim = 1;
clType = CL_MEM_OBJECT_IMAGE1D;
clGLType = CL_GL_OBJECT_TEXTURE1D;
break;
case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
glTarget = GL_TEXTURE_CUBE_MAP;
dim = 2;
clType = CL_MEM_OBJECT_IMAGE2D;
clGLType = CL_GL_OBJECT_TEXTURE2D;
break;
case GL_TEXTURE_1D_ARRAY:
glTarget = GL_TEXTURE_1D_ARRAY;
dim = 2;
clType = CL_MEM_OBJECT_IMAGE1D_ARRAY;
clGLType = CL_GL_OBJECT_TEXTURE1D_ARRAY;
break;
case GL_TEXTURE_2D:
glTarget = GL_TEXTURE_2D;
dim = 2;
clType = CL_MEM_OBJECT_IMAGE2D;
clGLType = CL_GL_OBJECT_TEXTURE2D;
break;
case GL_TEXTURE_2D_MULTISAMPLE:
glTarget = GL_TEXTURE_2D_MULTISAMPLE;
dim = 2;
clType = CL_MEM_OBJECT_IMAGE2D;
clGLType = CL_GL_OBJECT_TEXTURE2D;
break;
case GL_TEXTURE_RECTANGLE_ARB:
glTarget = GL_TEXTURE_RECTANGLE_ARB;
dim = 2;
clType = CL_MEM_OBJECT_IMAGE2D;
clGLType = CL_GL_OBJECT_TEXTURE2D;
break;
case GL_TEXTURE_2D_ARRAY:
glTarget = GL_TEXTURE_2D_ARRAY;
dim = 3;
clType = CL_MEM_OBJECT_IMAGE2D_ARRAY;
clGLType = CL_GL_OBJECT_TEXTURE2D_ARRAY;
break;
case GL_TEXTURE_3D:
glTarget = GL_TEXTURE_3D;
dim = 3;
clType = CL_MEM_OBJECT_IMAGE3D;
clGLType = CL_GL_OBJECT_TEXTURE3D;
break;
default:
// wrong value
LogWarning("invalid \"target\" value");
HIP_RETURN(hipErrorInvalidValue);
break;
}
amdContext.glenv()->glBindTexture_(glTarget, image);
// Check if size is available - data store is created
if (isImage) {
// Check mipmap level for "texture" name
GLint gliTexBaseLevel;
GLint gliTexMaxLevel;
clearGLErrors(amdContext);
amdContext.glenv()->glGetTexParameteriv_(glTarget, GL_TEXTURE_BASE_LEVEL, &gliTexBaseLevel);
if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
LogWarning("Cannot get base mipmap level of a GL \"texture\" object");
HIP_RETURN(hipErrorInvalidValue);
}
clearGLErrors(amdContext);
amdContext.glenv()->glGetTexParameteriv_(glTarget, GL_TEXTURE_MAX_LEVEL, &gliTexMaxLevel);
if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
LogWarning("Cannot get max mipmap level of a GL \"texture\" object");
HIP_RETURN(hipErrorInvalidValue);
}
if ((gliTexBaseLevel > miplevel) || (miplevel > gliTexMaxLevel)) {
LogWarning("\"miplevel\" is not a valid mipmap level of the GL \"texture\" object");
HIP_RETURN(hipErrorInvalidValue);
}
// Get GL texture format and check if it's compatible with CL format
clearGLErrors(amdContext);
amdContext.glenv()->glGetTexLevelParameteriv_(target, miplevel, GL_TEXTURE_INTERNAL_FORMAT,
(GLint*)&glInternalFormat);
if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
LogWarning("Cannot get internal format of \"miplevel\" of GL \"texture\" object");
HIP_RETURN(hipErrorInvalidValue);
}
amdContext.glenv()->glGetTexLevelParameteriv_(target, miplevel, GL_TEXTURE_SAMPLES,
(GLint*)&numSamples);
if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
LogWarning("Cannot get numbers of samples of GL \"texture\" object");
HIP_RETURN(hipErrorInvalidValue);
}
if (numSamples > 1) {
LogWarning("MSAA \"texture\" object is not suppoerted for the device");
HIP_RETURN(hipErrorInvalidValue);
}
// Now get CL format from GL format and bytes per pixel
int iBytesPerPixel = 0;
if (!getCLFormatFromGL(amdContext, glInternalFormat, &clImageFormat, &iBytesPerPixel,
0)) { //clFlags)) {
LogWarning("\"texture\" format does not map to an appropriate CL image format");
HIP_RETURN(hipErrorInvalidValue);
}
switch (dim) {
case 3:
clearGLErrors(amdContext);
amdContext.glenv()->glGetTexLevelParameteriv_(target, miplevel, GL_TEXTURE_DEPTH,
&gliTexDepth);
if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
LogWarning("Cannot get the depth of \"miplevel\" of GL \"texure\"");
HIP_RETURN(hipErrorInvalidValue);
}
// Fall trough to process other dimensions...
case 2:
clearGLErrors(amdContext);
amdContext.glenv()->glGetTexLevelParameteriv_(target, miplevel, GL_TEXTURE_HEIGHT,
&gliTexHeight);
if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
LogWarning("Cannot get the height of \"miplevel\" of GL \"texure\"");
HIP_RETURN(hipErrorInvalidValue);
}
// Fall trough to process other dimensions...
case 1:
clearGLErrors(amdContext);
amdContext.glenv()->glGetTexLevelParameteriv_(target, miplevel, GL_TEXTURE_WIDTH,
&gliTexWidth);
if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
LogWarning("Cannot get the width of \"miplevel\" of GL \"texure\"");
HIP_RETURN(hipErrorInvalidValue);
}
break;
default:
LogWarning("invalid \"target\" value");
HIP_RETURN(hipErrorInvalidValue);
}
} else {
GLint size;
// In case target is GL_TEXTURE_BUFFER
GLint backingBuffer;
clearGLErrors(amdContext);
amdContext.glenv()->glGetTexLevelParameteriv_(
glTarget, 0, GL_TEXTURE_BUFFER_DATA_STORE_BINDING, &backingBuffer);
if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
LogWarning("Cannot get backing buffer for GL \"texture buffer\" object");
HIP_RETURN(hipErrorInvalidValue);
}
amdContext.glenv()->glBindBuffer_(glTarget, backingBuffer);
// Get GL texture format and check if it's compatible with CL format
clearGLErrors(amdContext);
amdContext.glenv()->glGetIntegerv_(GL_TEXTURE_BUFFER_FORMAT_EXT,
reinterpret_cast<GLint*>(&glInternalFormat));
if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
LogWarning("Cannot get internal format of \"miplevel\" of GL \"texture\" object");
HIP_RETURN(hipErrorInvalidValue);
}
// Now get CL format from GL format and bytes per pixel
int iBytesPerPixel = 0;
if (!getCLFormatFromGL(amdContext, glInternalFormat, &clImageFormat, &iBytesPerPixel,
flags)) {
LogWarning("\"texture\" format does not map to an appropriate CL image format");
HIP_RETURN(hipErrorInvalidValue);
}
clearGLErrors(amdContext);
amdContext.glenv()->glGetBufferParameteriv_(glTarget, GL_BUFFER_SIZE, &size);
if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
LogWarning("Cannot get internal format of \"miplevel\" of GL \"texture\" object");
HIP_RETURN(hipErrorInvalidValue);
}
gliTexWidth = size / iBytesPerPixel;
}
size_t imageSize = (clType == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? static_cast<size_t>(gliTexHeight)
: static_cast<size_t>(gliTexDepth);
if (!amd::Image::validateDimensions(
amdContext.devices(), clType, static_cast<size_t>(gliTexWidth),
static_cast<size_t>(gliTexHeight), static_cast<size_t>(gliTexDepth), imageSize)) {
LogWarning("The GL \"texture\" data store is not created or out of supported dimensions");
HIP_RETURN(hipErrorInvalidValue);
}
target = (glTarget == GL_TEXTURE_CUBE_MAP) ? target : 0;
pImageGL = new (amdContext)
amd::ImageGL(amdContext, clType, flags, clImageFormat, static_cast<size_t>(gliTexWidth),
static_cast<size_t>(gliTexHeight), static_cast<size_t>(gliTexDepth), glTarget,
image, 0, glInternalFormat, clGLType, numSamples, target);
if (!pImageGL) {
LogWarning("Cannot create class ImageGL - out of memory?");
HIP_RETURN(hipErrorUnknown);
}
if (!pImageGL->create()) {
pImageGL->release();
HIP_RETURN(hipErrorUnknown);
}
// Create interop object
if (pImageGL->getInteropObj() == nullptr) {
LogWarning("cannot create object of class BufferGL");
pImageGL->release();
HIP_RETURN(hipErrorUnknown);
}
// Fixme: If more than one device is present in the context, we choose the first device.
// We should come up with a more elegant solution to handle this.
assert(amdContext.devices().size() == 1);
const amd::Device& dev = *(amdContext.devices()[0]);
device::Memory* mem = pImageGL->getDeviceMemory(dev);
if (nullptr == mem) {
LogPrintfError("Can't allocate memory size - 0x%08X bytes!", pImageGL->getSize());
pImageGL->release();
HIP_RETURN(hipErrorUnknown);
}
mem->processGLResource(device::Memory::GLDecompressResource);
*resource = reinterpret_cast<hipGraphicsResource*>(pImageGL);
HIP_RETURN(hipSuccess);
}
// Registers the OpenGL buffer object `buffer` for use from HIP and hands back
// an interop handle through `resource`.
//
// Parameters:
//   resource - out: receives the created amd::BufferGL, reinterpreted as a
//              hipGraphicsResource*. Must not be null.
//   buffer   - name of a GL buffer object whose data store already exists.
//   flags    - hipGraphicsRegisterFlagsNone, or a value that has the ReadOnly
//              or WriteDiscard bit set.
//
// Returns:
//   hipSuccess                    - the buffer was registered.
//   hipErrorInvalidValue          - bad flags, null `resource`, or the current
//                                   context has no GL environment.
//   hipErrorInvalidResourceHandle - `buffer` is not a GL buffer, its size
//                                   cannot be queried, or its size is zero.
//   hipErrorUnknown               - the GL environment could not be entered, or
//                                   the interop object / device memory could
//                                   not be created.
hipError_t hipGraphicsGLRegisterBuffer(hipGraphicsResource** resource, GLuint buffer,
                                       unsigned int flags) {
  HIP_INIT_API(hipGraphicsGLRegisterBuffer, resource, buffer, flags);

  if (!((flags == hipGraphicsRegisterFlagsNone) || (flags & hipGraphicsRegisterFlagsReadOnly) ||
        (flags & hipGraphicsRegisterFlagsWriteDiscard))) {
    LogError("invalid parameter \"flags\"");
    HIP_RETURN(hipErrorInvalidValue);
  }
  if (resource == nullptr) {
    LogError("invalid resource");
    HIP_RETURN(hipErrorInvalidValue);
  }

  amd::BufferGL* pBufferGL = nullptr;
  const GLenum glTarget = GL_ARRAY_BUFFER;
  GLint gliSize = 0;

  amd::Context& amdContext = *(hip::getCurrentDevice()->asContext());
  if (amdContext.glenv() == nullptr) {
    LogError("invalid context, gl interop not initialized");
    HIP_RETURN(hipErrorInvalidValue);
  }

  // Add this scope to bound the scoped lock
  {
    amd::GLFunctions::SetIntEnv ie(amdContext.glenv());
    if (!ie.isValid()) {
      LogWarning("\"amdContext\" is not created from GL context or share list \n");
      HIP_RETURN(hipErrorUnknown);
    }

    // Verify GL buffer object
    clearGLErrors(amdContext);
    if ((GL_FALSE == amdContext.glenv()->glIsBuffer_(buffer)) ||
        (GL_NO_ERROR != amdContext.glenv()->glGetError_())) {
      LogWarning("\"buffer\" is not a GL buffer object \n");
      HIP_RETURN(hipErrorInvalidResourceHandle);
    }

    // Check if size is available - data store is created
    amdContext.glenv()->glBindBuffer_(glTarget, buffer);
    clearGLErrors(amdContext);
    amdContext.glenv()->glGetBufferParameteriv_(glTarget, GL_BUFFER_SIZE, &gliSize);
    if (GL_NO_ERROR != amdContext.glenv()->glGetError_()) {
      LogWarning("cannot get the GL buffer size \n");
      HIP_RETURN(hipErrorInvalidResourceHandle);
    }
    if (gliSize == 0) {
      LogWarning("the GL buffer's data store is not created \n");
      HIP_RETURN(hipErrorInvalidResourceHandle);
    }
  }  // Release scoped lock

  // Now create BufferGL object
  pBufferGL = new (amdContext) amd::BufferGL(amdContext, flags, gliSize, 0, buffer);
  if (!pBufferGL) {
    LogWarning("cannot create object of class BufferGL");
    HIP_RETURN(hipErrorUnknown);
  }
  if (!pBufferGL->create()) {
    pBufferGL->release();
    HIP_RETURN(hipErrorUnknown);
  }

  // Create interop object. From here on every failure path must drop the
  // reference taken above, otherwise the BufferGL object is leaked.
  if (pBufferGL->getInteropObj() == nullptr) {
    LogWarning("cannot create object of class BufferGL");
    pBufferGL->release();
    HIP_RETURN(hipErrorUnknown);
  }

  // Fixme: If more than one device is present in the context, we choose the first device.
  // We should come up with a more elegant solution to handle this.
  assert(amdContext.devices().size() == 1);
  const amd::Device& dev = *(amdContext.devices()[0]);

  device::Memory* mem = pBufferGL->getDeviceMemory(dev);
  if (nullptr == mem) {
    // %zX matches the size_t argument; plain %X expects an unsigned int.
    LogPrintfError("Can't allocate memory size - 0x%08zX bytes!",
                   static_cast<size_t>(pBufferGL->getSize()));
    pBufferGL->release();
    HIP_RETURN(hipErrorUnknown);
  }
  mem->processGLResource(device::Memory::GLDecompressResource);

  *resource = reinterpret_cast<hipGraphicsResource*>(pBufferGL);
  HIP_RETURN(hipSuccess);
}
// Enqueues an "acquire GL objects" command for `count` registered graphics
// resources on `stream`.
//
// Steps, in order: the current context must have a GL environment; glFinish is
// issued and checked for GL errors; the stream is resolved and its context must
// be associated with GL; the resources are converted to amd::Memory objects by
// hipSetInteropObjects; finally an amd::AcquireExtObjectsCommand is validated
// and enqueued.
//
// Returns hipSuccess once the command is enqueued, the error reported by
// hipSetInteropObjects if the conversion fails, and hipErrorUnknown for every
// other failure.
hipError_t hipGraphicsMapResources(int count, hipGraphicsResource_t* resources,
                                   hipStream_t stream) {
  HIP_INIT_API(hipGraphicsMapResources, count, resources, stream);

  amd::Context* const ctx = hip::getCurrentDevice()->asContext();
  if (ctx == nullptr) {
    HIP_RETURN(hipErrorUnknown);
  }
  if (ctx->glenv() == nullptr) {
    HIP_RETURN(hipErrorUnknown);
  }

  // Flush outstanding GL work before the objects are acquired.
  clearGLErrors(*ctx);
  ctx->glenv()->glFinish_();
  const bool finishFailed = (checkForGLError(*ctx) != GL_NO_ERROR);
  if (finishFailed) {
    HIP_RETURN(hipErrorUnknown);
  }

  hip::Stream* const queue = hip::getStream(stream);
  if (queue == nullptr) {
    HIP_RETURN(hipErrorUnknown);
  }

  auto* const queueGlEnv = queue->context().glenv();
  const bool glAssociated = (queueGlEnv != nullptr) && queueGlEnv->isAssociated();
  if (!glAssociated) {
    LogWarning("\"amdContext\" is not created from GL context or share list");
    HIP_RETURN(hipErrorUnknown);
  }

  std::vector<amd::Memory*> interopMems;
  const hipError_t status =
      hipSetInteropObjects(count, reinterpret_cast<void**>(resources), interopMems);
  if (status != hipSuccess) {
    HIP_RETURN(status);
  }

  // Build the acquire command with an empty wait list and enqueue it.
  amd::Command::EventWaitList noDependencies;
  amd::AcquireExtObjectsCommand* acquireCmd = new amd::AcquireExtObjectsCommand(
      *queue, noDependencies, count, interopMems, CL_COMMAND_ACQUIRE_GL_OBJECTS);
  if (acquireCmd == nullptr) {
    HIP_RETURN(hipErrorUnknown);
  }

  // Make sure we have memory for the command execution
  if (!acquireCmd->validateMemory()) {
    delete acquireCmd;
    HIP_RETURN(hipErrorUnknown);
  }

  acquireCmd->enqueue();

  // The command's reference is dropped only when its event maps to a null
  // handle.
  if (as_cl(&acquireCmd->event()) == nullptr) {
    acquireCmd->release();
  }

  HIP_RETURN(hipSuccess);
}
// Reports the device address and byte size of a registered graphics resource.
//
// Parameters:
//   devPtr   - out: receives the device virtual address of the resource.
//   size     - out: receives the size reported by the amd::Memory object.
//   resource - handle produced by one of the hipGraphicsGLRegister* calls; it
//              is reinterpreted as an amd::Memory*.
//
// Returns:
//   hipSuccess                    - both outputs were written.
//   hipErrorInvalidValue          - `devPtr` or `size` is null.
//   hipErrorInvalidResourceHandle - `resource` is null.
//   hipErrorUnknown               - the current context has no GL environment,
//                                   or no device memory backs the resource.
//
// The outputs are only written on success.
hipError_t hipGraphicsResourceGetMappedPointer(void** devPtr, size_t* size,
                                               hipGraphicsResource_t resource) {
  HIP_INIT_API(hipGraphicsResourceGetMappedPointer, devPtr, size, resource);

  // Reject null arguments up front instead of dereferencing them below.
  if (devPtr == nullptr || size == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }
  if (resource == nullptr) {
    HIP_RETURN(hipErrorInvalidResourceHandle);
  }

  amd::Context* amdContext = hip::getCurrentDevice()->asContext();
  if (!amdContext || !amdContext->glenv()) {
    HIP_RETURN(hipErrorUnknown);
  }

  // Fixme: If more than one device is present in the context, we choose the first device.
  // We should come up with a more elegant solution to handle this.
  assert(amdContext->devices().size() == 1);
  const auto it = amdContext->devices().cbegin();
  amd::Device* curDev = *it;

  amd::Memory* amdMem = reinterpret_cast<amd::Memory*>(resource);

  // Interop resources don't have svm allocations they are added to
  // amd::MemObjMap using device virtual address during creation.
  device::Memory* mem = amdMem->getDeviceMemory(*curDev);
  if (mem == nullptr) {
    HIP_RETURN(hipErrorUnknown);
  }

  *size = amdMem->getSize();
  *devPtr = reinterpret_cast<void*>(static_cast<uintptr_t>(mem->virtualAddress()));

  HIP_RETURN(hipSuccess);
}
// Enqueues a "release GL objects" command for `count` registered graphics
// resources on `stream`.
//
// The stream is validated and resolved first, then drained with finish()
// before the resources are converted by hipSetInteropObjects and an
// amd::ReleaseExtObjectsCommand is validated and enqueued.
//
// Returns:
//   hipSuccess                 - the release command was enqueued.
//   hipErrorContextIsDestroyed - `stream` is not a valid stream.
//   hipErrorUnknown            - the stream could not be resolved, or the
//                                command could not be created / validated.
//   (other)                    - whatever hipSetInteropObjects reports.
hipError_t hipGraphicsUnmapResources(int count, hipGraphicsResource_t* resources,
                                     hipStream_t stream) {
  HIP_INIT_API(hipGraphicsUnmapResources, count, resources, stream);

  if (!hip::isValid(stream)) {
    HIP_RETURN(hipErrorContextIsDestroyed);
  }

  // Resolve the stream once and check it BEFORE using it; the lookup may
  // yield nullptr and must not be dereferenced unchecked.
  hip::Stream* hip_stream = hip::getStream(stream);
  if (nullptr == hip_stream) {
    HIP_RETURN(hipErrorUnknown);
  }

  // Wait for the current host queue
  hip_stream->finish();

  std::vector<amd::Memory*> memObjects;
  hipError_t err = hipSetInteropObjects(count, reinterpret_cast<void**>(resources), memObjects);
  if (err != hipSuccess) {
    HIP_RETURN(err);
  }

  amd::Command::EventWaitList nullWaitList;

  // Now create command and enqueue
  amd::ReleaseExtObjectsCommand* command = new amd::ReleaseExtObjectsCommand(
      *hip_stream, nullWaitList, count, memObjects, CL_COMMAND_RELEASE_GL_OBJECTS);
  if (command == nullptr) {
    HIP_RETURN(hipErrorUnknown);
  }

  // Make sure we have memory for the command execution
  if (!command->validateMemory()) {
    delete command;
    HIP_RETURN(hipErrorUnknown);
  }

  command->enqueue();

  if (as_cl(&command->event()) == nullptr) {
    command->release();
  }

  HIP_RETURN(hipSuccess);
}
// Destroys the interop object behind `resource` and always reports hipSuccess.
//
// The handle is reinterpreted as an amd::BufferGL* and deleted directly.
// NOTE(review): the registration routines in this file dispose of the same
// objects with release(), and a handle may also originate from the image
// registration path - confirm that a plain delete through BufferGL* is
// intended here.
hipError_t hipGraphicsUnregisterResource(hipGraphicsResource_t resource) {
  HIP_INIT_API(hipGraphicsUnregisterResource, resource);

  delete reinterpret_cast<amd::BufferGL*>(resource);

  HIP_RETURN(hipSuccess);
}
+235
查看文件
@@ -0,0 +1,235 @@
#include "hip_global.hpp"
#include "hip/hip_runtime.h"
#include "hip_internal.hpp"
#include "hip_code_object.hpp"
#include "platform/program.hpp"
#include <hip/hip_version.h>
// Returns the build name string this runtime was compiled with
// (the HIP_VERSION_BUILD_NAME macro).
const char* amd_dbgapi_get_build_name(void) {
  const char* const buildName = HIP_VERSION_BUILD_NAME;
  return buildName;
}
// Returns the git hash string this runtime was compiled with
// (the HIP_VERSION_GITHASH macro).
const char* amd_dbgapi_get_git_hash() {
  const char* const gitHash = HIP_VERSION_GITHASH;
  return gitHash;
}
// Returns the numeric build id this runtime was compiled with
// (the HIP_VERSION_BUILD_ID macro).
size_t amd_dbgapi_get_build_id() {
  const size_t buildId = HIP_VERSION_BUILD_ID;
  return buildId;
}
#ifdef __HIP_ENABLE_PCH
// Embedded precompiled-header blobs and their sizes, one per wavefront width.
extern const char __hip_pch_wave32[];
extern const char __hip_pch_wave64[];
extern unsigned __hip_pch_wave32_size;
extern unsigned __hip_pch_wave64_size;

// Selects the precompiled header that matches the current device.
//
// Parameters:
//   pch  - out: receives a pointer to the selected PCH blob.
//   size - out: receives the size of that blob.
//
// The wave32 blob is chosen when the current device reports warpSize == 32;
// in every other case the wave64 blob is chosen. If the device or its
// properties cannot be queried, the failure is logged and the wave64 blob is
// returned, so that `deviceProp` is never read uninitialized. Nothing is
// written when either output pointer is null.
void __hipGetPCH(const char** pch, unsigned int *size) {
  if (pch == nullptr || size == nullptr) {
    return;
  }

  hipDeviceProp_t deviceProp;
  int deviceId = 0;

  hipError_t error = hipGetDevice(&deviceId);
  if (error == hipSuccess) {
    error = hipGetDeviceProperties(&deviceProp, deviceId);
  }
  if (error != hipSuccess) {
    LogPrintfError("Cannot query device for PCH selection, error: %d \n",
                   static_cast<int>(error));
  }

  // Only trust warpSize when both queries succeeded.
  const bool useWave32 = (error == hipSuccess) && (deviceProp.warpSize == 32);
  if (useWave32) {
    *pch = __hip_pch_wave32;
    *size = __hip_pch_wave32_size;
  } else {
    *pch = __hip_pch_wave64;
    *size = __hip_pch_wave64_size;
  }
}
#endif
namespace hip {
//Device Vars
// Creates the per-device object for the global variable `name` of module
// `hmod` on device index `deviceId`.
//
// The module handle is converted to an amd::Program, the device program for
// the first device of g_devices[deviceId] is looked up, and
// createGlobalVarObj fills in the memory object, device pointer and size.
// Any failure is logged and reported through guarantee(false, ...).
// For symbols with a non-zero size the device pointer is registered in
// amd::MemObjMap.
DeviceVar::DeviceVar(std::string name,
                     hipModule_t hmod,
                     int deviceId) :
    shadowVptr(nullptr), name_(name),
    amd_mem_obj_(nullptr), device_ptr_(nullptr),
    size_(0) {
  amd::Program* program = as_amd(reinterpret_cast<cl_program>(hmod));
  device::Program* dev_program =
      program->getDeviceProgram(*g_devices.at(deviceId)->devices()[0]);
  if (dev_program == nullptr) {
    // %p is the matching conversion for a pointer-typed handle; %x expects an
    // unsigned int and mismatches a pointer argument.
    LogPrintfError("Cannot get Device Program for module: %p \n",
                   reinterpret_cast<void*>(hmod));
    guarantee(false, "Cannot get Device Program");
  }

  if (!dev_program->createGlobalVarObj(&amd_mem_obj_, &device_ptr_, &size_, name.c_str())) {
    LogPrintfError("Cannot create Global Var obj for symbol: %s \n", name.c_str());
    guarantee(false, "Cannot create GlobalVar Obj");
  }

  // Handle size 0 symbols: they are not expected to have backing memory and
  // are not registered in the memory-object map.
  if (size_ != 0) {
    if (amd_mem_obj_ == nullptr || device_ptr_ == nullptr) {
      LogPrintfError("Cannot get memory for creating device Var: %s", name.c_str());
      guarantee(false, "Cannot get memory for creating device var");
    }
    amd::MemObjMap::AddMemObj(device_ptr_, amd_mem_obj_);
  }
}
// Tears down the per-device variable: unregisters and releases the memory
// object when one exists, unbinds and deletes the texture reference stored in
// shadowVptr when one exists, then resets the pointer and size fields.
DeviceVar::~DeviceVar() {
  if (amd_mem_obj_ != nullptr) {
    amd::MemObjMap::RemoveMemObj(device_ptr_);
    amd_mem_obj_->release();
  }

  if (shadowVptr != nullptr) {
    // A non-null shadowVptr is treated as an owned textureReference.
    auto* boundTexture = reinterpret_cast<textureReference*>(shadowVptr);
    (void)ihipUnbindTexture(boundTexture);  // result intentionally ignored
    delete boundTexture;
    shadowVptr = nullptr;
  }

  size_ = 0;
  device_ptr_ = nullptr;
}
//Device Functions
// Creates the per-device function object for symbol `name` of module `hmod`.
//
// The module handle is converted to an amd::Program, the symbol is looked up
// by name, and an amd::Kernel is created for it. A missing symbol or a null
// kernel is logged and reported through guarantee(false, ...).
DeviceFunc::DeviceFunc(std::string name, hipModule_t hmod)
    : dflock_("function lock"), name_(name), kernel_(nullptr) {
  amd::Program* const owner = as_amd(reinterpret_cast<cl_program>(hmod));

  const amd::Symbol* const entry = owner->findSymbol(name.c_str());
  if (!entry) {
    LogPrintfError("Cannot find Symbol with name: %s \n", name.c_str());
    guarantee(false, "Cannot find Symbol");
  }

  kernel_ = new amd::Kernel(*owner, *entry, name);
  if (!kernel_) {
    LogPrintfError("Cannot create kernel with name: %s \n", name.c_str());
    guarantee(false, "Cannot Create kernel");
  }
}
// Drops this object's reference on the kernel, if one was created.
DeviceFunc::~DeviceFunc() {
  if (kernel_ == nullptr) {
    return;
  }
  kernel_->release();
}
//Abstract functions
// Records the function name and its owning module table, and sizes the
// per-device slot vector to one entry per element of g_devices.
Function::Function(const std::string& name, FatBinaryInfo** modules)
    : name_(name), modules_(modules) {
  const size_t deviceCount = g_devices.size();
  dFunc_.resize(deviceCount);
}
// Deletes every per-device DeviceFunc (empty slots hold nullptr, which delete
// accepts), then clears the name and the module table pointer.
Function::~Function() {
  for (size_t slot = 0; slot < dFunc_.size(); ++slot) {
    delete dFunc_[slot];
  }
  name_.clear();
  modules_ = nullptr;
}
// Returns, through `hfunc`, the function handle for the current device in the
// dynamically loaded module `hmod`, creating the DeviceFunc on first use.
//
// The current device index is read once and validated with the same
// guarantees Var::getDeviceVar applies, so that it is never used as an
// out-of-range index into dFunc_.
//
// Always returns hipSuccess; invalid state is reported through guarantee().
hipError_t Function::getDynFunc(hipFunction_t* hfunc, hipModule_t hmod) {
  guarantee((dFunc_.size() == g_devices.size()), "dFunc Size mismatch");

  const int deviceId = ihipGetDevice();
  guarantee((deviceId >= 0), "Invalid DeviceId, less than zero");
  guarantee((static_cast<size_t>(deviceId) < dFunc_.size()),
            "Invalid DeviceId, greater than no of code objects");

  if (dFunc_[deviceId] == nullptr) {
    dFunc_[deviceId] = new DeviceFunc(name_, hmod);
  }
  *hfunc = dFunc_[deviceId]->asHipFunction();

  return hipSuccess;
}
// Returns, through `hfunc`, the function handle for device `deviceId` of the
// statically registered module.
//
// The module is built and fetched for that device first (any failure is
// returned via IHIP_RETURN_ONFAIL); the DeviceFunc is then created on first
// use. Returns hipSuccess otherwise.
hipError_t Function::getStatFunc(hipFunction_t* hfunc, int deviceId) {
  guarantee(modules_ != nullptr, "Module not initialized");

  hipModule_t builtModule = nullptr;
  IHIP_RETURN_ONFAIL((*modules_)->BuildProgram(deviceId));
  IHIP_RETURN_ONFAIL((*modules_)->GetModule(deviceId, &builtModule));

  DeviceFunc*& slot = dFunc_[deviceId];
  if (slot == nullptr) {
    slot = new DeviceFunc(name_, builtModule);
  }

  *hfunc = slot->asHipFunction();
  return hipSuccess;
}
// Fills `func_attr` with the attributes of this function on device `deviceId`.
//
// The module is built and fetched for that device (failures are returned via
// IHIP_RETURN_ONFAIL) and the DeviceFunc is created on first use. The values
// come from the kernel's work-group info for that device; cacheModeCA,
// constSizeBytes and preferredShmemCarveout are set to 0 and ptxVersion to 30.
// Returns hipSuccess otherwise.
hipError_t Function::getStatFuncAttr(hipFuncAttributes* func_attr, int deviceId) {
  guarantee((modules_ != nullptr), "Module not initialized");

  hipModule_t builtModule = nullptr;
  IHIP_RETURN_ONFAIL((*modules_)->BuildProgram(deviceId));
  IHIP_RETURN_ONFAIL((*modules_)->GetModule(deviceId, &builtModule));

  DeviceFunc*& slot = dFunc_[deviceId];
  if (slot == nullptr) {
    slot = new DeviceFunc(name_, builtModule);
  }

  const std::vector<amd::Device*>& gpus = amd::Device::getDevices(CL_DEVICE_TYPE_GPU, false);
  amd::Kernel* const kern = slot->kernel();
  const device::Kernel::WorkGroupInfo* const info =
      kern->getDeviceKernel(*gpus[deviceId])->workGroupInfo();

  // Values derived from the kernel and its work-group info.
  func_attr->sharedSizeBytes = static_cast<int>(info->localMemSize_);
  func_attr->binaryVersion = static_cast<int>(kern->signature().version());
  func_attr->localSizeBytes = info->privateMemSize_;
  func_attr->maxDynamicSharedSizeBytes =
      static_cast<int>(info->availableLDSSize_ - info->localMemSize_);
  func_attr->maxThreadsPerBlock = static_cast<int>(info->size_);
  func_attr->numRegs = static_cast<int>(info->usedVGPRs_);

  // Fixed values.
  func_attr->cacheModeCA = 0;
  func_attr->constSizeBytes = 0;
  func_attr->preferredShmemCarveout = 0;
  func_attr->ptxVersion = 30;

  return hipSuccess;
}
//Abstract Vars
// Constructs a variable described by size, type and norm. The managed pointer
// is null and the alignment is 0. The per-device slot vector gets one entry
// per element of g_devices.
Var::Var(const std::string& name, DeviceVarKind dVarKind, size_t size, int type, int norm,
         FatBinaryInfo** modules)
    : name_(name),
      dVarKind_(dVarKind),
      size_(size),
      type_(type),
      norm_(norm),
      modules_(modules),
      managedVarPtr_(nullptr),
      align_(0) {
  const size_t deviceCount = g_devices.size();
  dVar_.resize(deviceCount);
}
// Constructs a variable described by a managed pointer, its size and its
// alignment. type_ and norm_ are set to 0. The per-device slot vector gets one
// entry per element of g_devices.
//
// The initializers are listed in member declaration order (the order in which
// C++ actually runs them), which keeps the list truthful and avoids -Wreorder.
Var::Var(const std::string& name, DeviceVarKind dVarKind, void *pointer, size_t size,
         unsigned align, FatBinaryInfo** modules)
    : name_(name),
      dVarKind_(dVarKind),
      size_(size),
      type_(0),
      norm_(0),
      modules_(modules),
      managedVarPtr_(pointer),
      align_(align) {
  dVar_.resize(g_devices.size());
}
// Deletes every per-device DeviceVar (empty slots hold nullptr, which delete
// accepts) and clears the module table pointer.
Var::~Var() {
  for (size_t slot = 0; slot < dVar_.size(); ++slot) {
    delete dVar_[slot];
  }
  modules_ = nullptr;
}
// Returns, through `dvar`, the DeviceVar for device `deviceId` in the
// dynamically loaded module `hmod`, creating it on first use.
//
// The device index and the slot vector size are validated with guarantee().
// Always returns hipSuccess.
hipError_t Var::getDeviceVar(DeviceVar** dvar, int deviceId, hipModule_t hmod) {
  guarantee((deviceId >= 0), "Invalid DeviceId, less than zero");
  guarantee((static_cast<size_t>(deviceId) < g_devices.size()),
            "Invalid DeviceId, greater than no of code objects");
  guarantee((dVar_.size() == g_devices.size()),
            "Device Var not initialized to size");

  DeviceVar*& slot = dVar_[deviceId];
  if (slot == nullptr) {
    slot = new DeviceVar(name_, hmod, deviceId);
  }

  *dvar = slot;
  return hipSuccess;
}
// Returns, through `dvar`, the DeviceVar for device `deviceId` of the
// statically registered module.
//
// On first use for a device the module is built and fetched (failures are
// returned via IHIP_RETURN_ONFAIL) and the DeviceVar is created. The module
// table is checked with guarantee() before it is dereferenced, matching
// Function::getStatFunc; it defaults to nullptr in both constructors.
// Returns hipSuccess otherwise.
hipError_t Var::getStatDeviceVar(DeviceVar** dvar, int deviceId) {
  guarantee((deviceId >= 0) , "Invalid DeviceId, less than zero");
  guarantee((static_cast<size_t>(deviceId) < g_devices.size()),
            "Invalid DeviceId, greater than no of code objects");

  if (dVar_[deviceId] == nullptr) {
    // Only the lazy-creation path needs the module table.
    guarantee((modules_ != nullptr), "Module not initialized");

    hipModule_t hmod = nullptr;
    IHIP_RETURN_ONFAIL((*modules_)->BuildProgram(deviceId));
    IHIP_RETURN_ONFAIL((*modules_)->GetModule(deviceId, &hmod));
    dVar_[deviceId] = new DeviceVar(name_, hmod, deviceId);
  }

  *dvar = dVar_[deviceId];
  return hipSuccess;
}
}; //namespace: hip
+128
查看文件
@@ -0,0 +1,128 @@
#ifndef HIP_GLOBAL_HPP
#define HIP_GLOBAL_HPP
#include <vector>
#include <string>
#include "hip/hip_runtime_api.h"
#include "hip/hip_runtime.h"
#include "hip_internal.hpp"
#include "hip_fatbin.hpp"
#include "platform/program.hpp"
namespace hip {
//Forward Declaration
class CodeObject;
//Device Structures
// Per-device instance of a global variable of a module.
// The constructor resolves the variable on one device and records its memory
// object, device pointer and size; the destructor releases them again.
class DeviceVar {
public:
  // name: symbol name; hmod: module handle; deviceId: index into g_devices.
  DeviceVar(std::string name, hipModule_t hmod, int deviceId);
  ~DeviceVar();

  // Accessors for device ptr and size, populated during constructor.
  hipDeviceptr_t device_ptr() const { return device_ptr_; }
  size_t size() const { return size_; }
  // Returns a copy of the variable name.
  std::string name() const { return name_; }

  // Opaque pointer, null after construction. The destructor treats a non-null
  // value as an owned textureReference*, unbinds it and deletes it.
  void* shadowVptr;

private:
  std::string name_;             // Name of the variable
  amd::Memory* amd_mem_obj_;     // Memory object backing the variable
  hipDeviceptr_t device_ptr_;    // Device pointer of the variable
  size_t size_;                  // Size of the variable
};
// Per-device instance of a function (kernel) of a module.
// The constructor looks the symbol up in the module's program and creates the
// amd::Kernel; the destructor releases that kernel.
class DeviceFunc {
public:
  // name: symbol name; hmod: module handle the symbol is looked up in.
  DeviceFunc(std::string name, hipModule_t hmod);
  ~DeviceFunc();

  // Monitor constructed with the name "function lock".
  amd::Monitor dflock_;

  // Converts DeviceFunc to hipFunction_t (used by app) and vice versa.
  // Both directions are plain pointer reinterpretations.
  hipFunction_t asHipFunction() { return reinterpret_cast<hipFunction_t>(this); }
  static DeviceFunc* asFunction(hipFunction_t f) { return reinterpret_cast<DeviceFunc*>(f); }

  // Accessors for kernel_ and name_, populated during constructor.
  std::string name() const { return name_; }
  amd::Kernel* kernel() const { return kernel_; }

private:
  std::string name_;       // Name of the function (not a unique identifier)
  amd::Kernel* kernel_;    // Kernel created from the program symbol
};
//Abstract Structures
// Device-independent description of a function: its name, the module table it
// belongs to, and one lazily created DeviceFunc slot per device.
class Function {
public:
  // modules is optional and defaults to nullptr; the getStat* members require
  // it to be set.
  Function(const std::string& name, FatBinaryInfo** modules=nullptr);
  // Deletes all per-device DeviceFunc objects.
  ~Function();

  // Return the DeviceFunc handle for the current device in the dynamically
  // loaded module hmod, creating it on first use.
  hipError_t getDynFunc(hipFunction_t* hfunc, hipModule_t hmod);

  // Return the DeviceFunc handle / attributes for deviceId.
  // Generate/build the module if not already done so.
  hipError_t getStatFunc(hipFunction_t *hfunc, int deviceId);
  hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, int deviceId);

  // Resizes the per-device slot vector.
  void resize_dFunc(size_t size) { dFunc_.resize(size); }
  FatBinaryInfo** moduleInfo() { return modules_; }
  const std::string& name() const { return name_; }

private:
  std::vector<DeviceFunc*> dFunc_;    // DeviceFunc object per device
  std::string name_;                  // Name of the function (not a unique identifier)
  FatBinaryInfo** modules_;           // Static module where it is referenced
};
// Device-independent description of a variable: its name, kind and size, the
// module table it belongs to, and one lazily created DeviceVar slot per device.
class Var {
public:
  // Types of variable
  enum DeviceVarKind {
    DVK_Variable = 0,
    DVK_Surface,
    DVK_Texture,
    DVK_Managed
  };

  // Describes a variable by size, type and norm; the managed pointer is set to
  // null and the alignment to 0.
  Var(const std::string& name, DeviceVarKind dVarKind, size_t size, int type, int norm,
      FatBinaryInfo** modules = nullptr);

  // Describes a variable by a managed pointer, size and alignment; type and
  // norm are set to 0.
  Var(const std::string& name, DeviceVarKind dVarKind, void *pointer, size_t size, unsigned align,
      FatBinaryInfo** modules = nullptr);

  // Deletes all per-device DeviceVar objects.
  ~Var();

  // Return the DeviceVar for deviceId in the dynamically loaded module hmod,
  // creating it on first use.
  hipError_t getDeviceVar(DeviceVar** dvar, int deviceId, hipModule_t hmod);

  // Return the DeviceVar for deviceId of the static module.
  // Generate/build the module if not already done so.
  hipError_t getStatDeviceVar(DeviceVar** dvar, int deviceId);

  // Resizes the per-device slot vector.
  void resize_dVar(size_t size) { dVar_.resize(size); }

  FatBinaryInfo** moduleInfo() { return modules_; };
  DeviceVarKind getVarKind() const { return dVarKind_; }
  size_t getSize() const { return size_; }

  void* getManagedVarPtr() { return managedVarPtr_; };
  // Stores the managed pointer and size and switches the kind to DVK_Managed.
  void setManagedVarInfo(void* pointer, size_t size) {
    managedVarPtr_ = pointer;
    size_ = size;
    dVarKind_ = DVK_Managed;
  }

private:
  std::vector<DeviceVar*> dVar_;    // DeviceVar object per device
  std::string name_;                // Variable name (not a unique identifier)
  DeviceVarKind dVarKind_;          // Variable kind
  size_t size_;                     // Size of the variable
  int type_;                        // Type (set to 0 by the managed-pointer constructor)
  int norm_;                        // Norm (set to 0 by the managed-pointer constructor)
  FatBinaryInfo** modules_;         // Static module where it is referenced
  void *managedVarPtr_;             // Managed memory pointer with size_ & align_
  unsigned int align_;              // Managed memory alignment
};
}; //namespace: hip
#endif /* HIP_GLOBAL_HPP */
文件差异内容过多而无法显示 加载差异

某些文件未显示,因为此 diff 中更改的文件太多 显示更多