From 186f95ea30b49d5fb347de8cc5610261d322e138 Mon Sep 17 00:00:00 2001 From: Tao Sang Date: Fri, 27 Nov 2020 20:13:14 -0500 Subject: [PATCH] Remove hip-hcc codes: Part one Remove hip-hcc codes from hip code base Simplify hip CMakeLists.txt to exclude hip-hcc Simplify cmake cmd for hip-rocclr building Some minor fixes Change-Id: I1ae357ecfd638d6c25bca293c1724b026be21ecd --- CMakeLists.txt | 265 +-- INSTALL.md | 3 +- bin/hipconfig | 30 +- include/hip/nvcc_detail/hip_runtime_api.h | 0 lpl_ca/CMakeLists.txt | 3 - lpl_ca/ca.hpp | 3 +- lpl_ca/clara/clara.hpp | 2 +- {src => lpl_ca}/code_object_bundle.inl | 0 src/AMDGPUPTNote.h | 45 - src/AMDGPURuntimeMetadata.h | 290 --- src/device_util.h | 136 -- src/env.cpp | 109 - src/env.h | 27 - src/functional_grid_launch.inl | 60 - src/grid_launch.cpp | 29 - src/h2f.cpp | 70 - src/hip_clang.cpp | 533 ----- src/hip_context.cpp | 332 --- src/hip_device.cpp | 648 ------ src/hip_error.cpp | 61 - src/hip_event.cpp | 483 ---- src/hip_fatbin.cpp | 91 - src/hip_fatbin.h | 58 - src/hip_hcc.cpp | 2548 -------------------- src/hip_hcc_internal.h | 1102 --------- src/hip_intercept.cpp | 53 - src/hip_memory.cpp | 2560 --------------------- src/hip_module.cpp | 1757 -------------- src/hip_peer.cpp | 231 -- src/hip_prof_api.h | 200 -- src/hip_stream.cpp | 296 --- src/hip_surface.cpp | 87 - src/hip_surface.h | 32 - src/hip_texture.cpp | 851 ------- src/hip_texture.h | 37 - src/hip_util.h | 38 - src/hiprtc.cpp | 634 ----- src/macro_based_grid_launch.inl | 97 - src/program_state.cpp | 101 - src/program_state.inl | 1001 -------- src/trace_helper.h | 125 - 41 files changed, 37 insertions(+), 14991 deletions(-) mode change 100755 => 100644 include/hip/nvcc_detail/hip_runtime_api.h rename {src => lpl_ca}/code_object_bundle.inl (100%) delete mode 100644 src/AMDGPUPTNote.h delete mode 100644 src/AMDGPURuntimeMetadata.h delete mode 100644 src/device_util.h delete mode 100644 src/env.cpp delete mode 100644 src/env.h delete mode 100644 src/functional_grid_launch.inl delete mode 100644 src/grid_launch.cpp delete mode 100644 src/h2f.cpp delete mode 100644 src/hip_clang.cpp delete mode 100644 src/hip_context.cpp delete mode 100644 src/hip_device.cpp delete mode 100644 src/hip_error.cpp delete mode 100644 src/hip_event.cpp delete mode 100644 src/hip_fatbin.cpp delete mode 100644 src/hip_fatbin.h delete mode 100644 src/hip_hcc.cpp delete mode 100644 src/hip_hcc_internal.h delete mode 100644 src/hip_intercept.cpp delete mode 100644 src/hip_memory.cpp delete mode 100644 src/hip_module.cpp delete mode 100644 src/hip_peer.cpp delete mode 100644 src/hip_prof_api.h delete mode 100644 src/hip_stream.cpp delete mode 100644 src/hip_surface.cpp delete mode 100644 src/hip_surface.h delete mode 100644 src/hip_texture.cpp delete mode 100644 src/hip_texture.h delete mode 100644 src/hip_util.h delete mode 100644 src/hiprtc.cpp delete mode 100644 src/macro_based_grid_launch.inl delete mode 100644 src/program_state.cpp delete mode 100644 src/program_state.inl delete mode 100644 src/trace_helper.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 117f4d0a9a..0b0fe84f81 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,12 +1,18 @@ cmake_minimum_required(VERSION 3.4.3) project(hip) -# sample command for hip-hcc -# cmake -DHIP_RUNTIME=hcc .. -# sample command for hip-rocclr, you'll need to have rocclr installed -# cmake .. -# cmake -DHIP_COMPILER=clang .. -# cmake -DHIP_COMPILER=clang -DHIP_RUNTIME=rocclr .. -# cmake -DHIP_COMPILER=clang -DHIP_RUNTIME=rocclr -DOPENCL_DIR=/path/to/opencl/api/opencl -DCMAKE_PREFIX_PATH=/path/to/rocclr/build/or/install/directory .. + +# sample command for hip-rocclr runtime, you'll need to have rocclr built +# For shared lib of hip-rocclr runtime +# For release version +# cmake -DCMAKE_PREFIX_PATH="$ROCclr_DIR/build;/opt/rocm/" -DCMAKE_INSTALL_PREFIX= .. +# For debug version +# cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_PREFIX_PATH="$ROCclr_DIR/build;/opt/rocm/" -DCMAKE_INSTALL_PREFIX= .. +# For static lib of hip-rocclr runtime +# For release version +# cmake -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="$ROCclr_DIR/build;/opt/rocm/" -DCMAKE_INSTALL_PREFIX= .. +# For debug version +# cmake -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Debug -DCMAKE_PREFIX_PATH="$ROCclr_DIR/build;/opt/rocm/" -DCMAKE_INSTALL_PREFIX= .. +# If you don't specify CMAKE_INSTALL_PREFIX, hip-rocclr runtime will be installed to "/opt/rocm/hip". set(BUILD_SHARED_LIBS ON CACHE BOOL "Build shared library (.so) or static lib (.a) ") @@ -134,7 +140,7 @@ add_to_config(_versionInfo HIP_VERSION_PATCH) set (HIP_LIB_VERSION_MAJOR ${HIP_VERSION_MAJOR}) set (HIP_LIB_VERSION_MINOR ${HIP_VERSION_MINOR}) if (${ROCM_PATCH_VERSION} ) - set (HIP_LIB_VERSION_PATCH ${ROCM_PATCH_VERSION}) + set (HIP_LIB_VERSION_PATCH ${ROCM_PATCH_VERSION}) else () set (HIP_LIB_VERSION_PATCH ${HIP_VERSION_PATCH}) endif () @@ -145,15 +151,6 @@ if (DEFINED ENV{ROCM_RPATH}) set (CMAKE_SKIP_BUILD_RPATH TRUE) endif () -if(CMAKE_CXX_COMPILER MATCHES ".*hcc") - set(HIP_COMPILER "hcc" CACHE STRING "HIP Compiler") - set(HIP_PLATFORM "amd" CACHE STRING "HIP Platform") - set(HIP_RUNTIME "hcc" CACHE STRING "HIP Runtime") - get_filename_component(CXX_PATH ${CMAKE_CXX_COMPILER} DIRECTORY) - get_filename_component(CXX_PATH ${CXX_PATH} DIRECTORY) - set(HCC_HOME "${CXX_PATH}" CACHE PATH "Path to which hcc has been installed") -endif() - # overwrite HIP_VERSION_PATCH for packaging set(HIP_VERSION ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_PACKAGING_VERSION_PATCH}) @@ -177,68 +174,21 @@ endif() message(STATUS "HIP Platform: " ${HIP_PLATFORM}) if(HIP_PLATFORM STREQUAL "nvidia") - set(HIP_COMPILER "nvcc" CACHE STRING "HIP Compiler") set(HIP_RUNTIME "cuda" CACHE STRING "HIP Runtime") -endif() - -# default runtime is rocclr -if(NOT DEFINED HIP_RUNTIME) - if(NOT DEFINED ENV{HIP_RUNTIME}) - set(HIP_RUNTIME "rocclr" CACHE STRING "HIP Runtime") - else() - set(HIP_RUNTIME $ENV{HIP_RUNTIME} CACHE STRING "HIP Compiler") - endif() + set(HIP_COMPILER "nvcc" CACHE STRING "HIP Compiler") +elseif(HIP_PLATFORM STREQUAL "amd") + set(HIP_RUNTIME "rocclr" CACHE STRING "HIP Runtime") + set(HIP_COMPILER "clang" CACHE STRING "HIP Compiler") +else() + message(FATAL_ERROR "Unexpected HIP_PLATFORM: " ${HIP_PLATFORM}) endif() message(STATUS "HIP Runtime: " ${HIP_RUNTIME}) -add_to_config(_buildInfo HIP_RUNTIME) - -# Determine HIP_COMPILER -# Either hcc or clang; default is clang -if(NOT DEFINED HIP_COMPILER) - if(NOT DEFINED ENV{HIP_COMPILER}) - if(HIP_RUNTIME STREQUAL "hcc") - set(HIP_COMPILER "hcc" CACHE STRING "HIP Compiler") - else() - set(HIP_COMPILER "clang" CACHE STRING "HIP Compiler") - endif() - else() - set(HIP_COMPILER $ENV{HIP_COMPILER} CACHE STRING "HIP Compiler") - endif() -endif() - message(STATUS "HIP Compiler: " ${HIP_COMPILER}) + +add_to_config(_buildInfo HIP_RUNTIME) add_to_config(_buildInfo HIP_COMPILER) -########### Determine HCC_HOME If compiler is hcc ################## - -if(HIP_COMPILER STREQUAL "hcc") - # Determine HCC_HOME - if(NOT DEFINED HCC_HOME) - if(NOT DEFINED ENV{HCC_HOME}) - set(HCC_HOME "/opt/rocm/hcc" CACHE PATH "Path to which HCC has been installed") - else() - set(HCC_HOME $ENV{HCC_HOME} CACHE PATH "Path to which HCC has been installed") - endif() - endif() - if(IS_ABSOLUTE ${HCC_HOME} AND EXISTS ${HCC_HOME} AND IS_DIRECTORY ${HCC_HOME}) - execute_process(COMMAND ${HCC_HOME}/bin/hcc --version - OUTPUT_VARIABLE HCC_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - string(REGEX REPLACE ".*based on HCC " "" HCC_VERSION ${HCC_VERSION}) - string(REGEX REPLACE " .*" "" HCC_VERSION ${HCC_VERSION}) - message(STATUS "Looking for HCC in: " ${HCC_HOME} ". Found version: " ${HCC_VERSION}) - else() - message(FATAL_ERROR "Don't know where to find HCC. Please specify abolute path using -DHCC_HOME") - endif() - add_to_config(_buildInfo HCC_VERSION) - string(REPLACE "-" ";" HCC_VERSION_LIST ${HCC_VERSION}) - list(GET HCC_VERSION_LIST 0 HCC_PACKAGE_VERSION) - string(REPLACE "." ";" HCC_VERSION_LIST ${HCC_PACKAGE_VERSION}) - list(GET HCC_VERSION_LIST 0 HCC_VERSION_MAJOR) - list(GET HCC_VERSION_LIST 1 HCC_VERSION_MINOR) -endif() - ############ If HIP_PLATFORM is amd, HSA_PATH has to be defined ################## if(HIP_PLATFORM STREQUAL "amd") @@ -256,7 +206,7 @@ if(HIP_PLATFORM STREQUAL "amd") message(FATAL_ERROR "Don't know where to find HSA runtime. Please specify absolute path using -DHSA_PATH") endif() endif() -message(STATUS "\nHSA runtime in: " ${HSA_PATH}) +message(STATUS "HSA runtime in: " ${HSA_PATH}) # Set default build type if(NOT CMAKE_BUILD_TYPE) @@ -268,13 +218,7 @@ if (UNIX) set(HIP_DEFAULT_INSTALL_PREFIX "/opt/rocm/hip") endif() if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) - if(CMAKE_BUILD_TYPE MATCHES Debug) - set(CMAKE_INSTALL_PREFIX ${CMAKE_CURRENT_SOURCE_DIR} CACHE PATH "Installation path for HIP" FORCE) - elseif(CMAKE_BUILD_TYPE MATCHES Release) - set(CMAKE_INSTALL_PREFIX ${HIP_DEFAULT_INSTALL_PREFIX} CACHE PATH "Installation path for HIP" FORCE) - else() - message(FATAL_ERROR "Invalid CMAKE_BUILD_TYPE specified. Valid values are Debug and Release") - endif() + set(CMAKE_INSTALL_PREFIX ${HIP_DEFAULT_INSTALL_PREFIX} CACHE PATH "Installation path for HIP" FORCE) endif() if(DEV_LOG_ENABLE MATCHES "yes") @@ -341,129 +285,6 @@ if(HIP_RUNTIME STREQUAL "rocclr") set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -fPIC ${HCC_CXX_FLAGS} -I${HSA_PATH}/include") endif() -# Build hip_hcc if runtime is hcc -if(HIP_RUNTIME STREQUAL "hcc") - ############################# - # Profiling API support - ############################# - # Generate profiling API macros/structures header - if(USE_PROF_API EQUAL 1) - set(PROF_API_STR "${PROJECT_BINARY_DIR}/include/hip/hcc_detail/hip_prof_str.h") - set(PROF_API_HDR "${CMAKE_CURRENT_SOURCE_DIR}/include/hip/hcc_detail/hip_runtime_api.h") - set(PROF_API_SRC "${CMAKE_CURRENT_SOURCE_DIR}/src") - set(PROF_API_GEN "${CMAKE_CURRENT_SOURCE_DIR}/hip_prof_gen.py") - set(PROF_API_LOG "${PROJECT_BINARY_DIR}/hip_prof_gen.log.txt") - set(PROF_API_CMD "${PROF_API_GEN} -v ${OPT_PROF_API} ${PROF_API_HDR} ${PROF_API_SRC} ${PROF_API_STR} >${PROF_API_LOG}") - MESSAGE(STATUS "Generating profiling promitives: ${PROF_API_STR}") - execute_process(COMMAND sh -c "rm -f ${PROF_API_STR}; ${PROF_API_CMD}") - set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${PROF_API_GEN} ${PROF_API_HDR} ${PROF_API_STR}) - - # Enable profiling API - find_path(PROF_API_HEADER_DIR prof_protocol.h - HINTS - ${PROF_API_HEADER_PATH} - PATHS - /opt/rocm/roctracer - PATH_SUFFIXES - include/ext - ) - if(NOT PROF_API_HEADER_DIR) - MESSAGE(WARNING "Profiling API header not found. Disabling roctracer integration. Use -DPROF_API_HEADER_PATH=") - else() - add_definitions(-DUSE_PROF_API=1) - include_directories(${PROF_API_HEADER_DIR}) - MESSAGE(STATUS "Profiling API: ${PROF_API_HEADER_DIR}") - endif() - endif() - - include_directories(${PROJECT_BINARY_DIR}/include) - include_directories(${PROJECT_SOURCE_DIR}/include) - set(HIP_HCC_BUILD_FLAGS) - - # Add HIP_VERSION to CMAKE__FLAGS - set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_GITDATE}") - - # Add remaining flags - set(HCC_CXX_FLAGS "-Xlinker --enable-new-dtags -hc -fno-gpu-rdc --amdgpu-target=gfx803 --amdgpu-target=gfx900 --amdgpu-target=gfx906 --amdgpu-target=gfx908 ") - set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -fPIC ${HCC_CXX_FLAGS} -I${HSA_PATH}/include") - - # Set compiler and compiler flags - set(CMAKE_CXX_COMPILER "${HCC_HOME}/bin/hcc") - set(CMAKE_C_COMPILER "${HCC_HOME}/bin/hcc") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HCC_BUILD_FLAGS}") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}") - - set(SOURCE_FILES_RUNTIME - src/program_state.cpp - src/hip_clang.cpp - src/hip_hcc.cpp - src/hip_context.cpp - src/hip_device.cpp - src/hip_error.cpp - src/hip_event.cpp - src/hip_fatbin.cpp - src/hip_memory.cpp - src/hip_peer.cpp - src/hip_stream.cpp - src/hip_module.cpp - src/hip_db.cpp - src/grid_launch.cpp - src/hip_texture.cpp - src/hip_surface.cpp - src/hip_intercept.cpp - src/env.cpp - src/h2f.cpp) - - add_library(hip_hcc SHARED ${SOURCE_FILES_RUNTIME}) - add_library(hip_hcc_static STATIC ${SOURCE_FILES_RUNTIME}) - - ## Set the VERSION and SOVERSION values - set_property ( TARGET hip_hcc PROPERTY VERSION "${HIP_LIB_VERSION_STRING}" ) - set_property ( TARGET hip_hcc PROPERTY SOVERSION "${HIP_LIB_VERSION_MAJOR}" ) - - target_link_libraries(hip_hcc PRIVATE hc_am) - target_link_libraries(hip_hcc_static PRIVATE hc_am) - - add_library(hiprtc SHARED src/hiprtc.cpp) - target_compile_options(hiprtc PRIVATE -DDISABLE_REDUCED_GPU_BLOB_COPY) - set_property ( TARGET hiprtc PROPERTY VERSION "${HIP_LIB_VERSION_STRING}" ) - set_property ( TARGET hiprtc PROPERTY SOVERSION "${HIP_LIB_VERSION_MAJOR}" ) - - target_include_directories( - hiprtc SYSTEM - PRIVATE ${PROJECT_SOURCE_DIR}/include ${HSA_PATH}/include) - - set_target_properties(hip_hcc PROPERTIES CXX_VISIBILITY_PRESET hidden) - set_target_properties(hip_hcc PROPERTIES VISIBILITY_INLINES_HIDDEN 1) - set_target_properties(hiprtc PROPERTIES CXX_VISIBILITY_PRESET hidden) - set_target_properties(hiprtc PROPERTIES VISIBILITY_INLINES_HIDDEN 1) - - find_package(amd_comgr REQUIRED CONFIG - PATHS - /opt/rocm/ - PATH_SUFFIXES - cmake/amd_comgr - lib/cmake/amd_comgr - ) - MESSAGE(STATUS "Code Object Manager found at ${amd_comgr_DIR}.") - - target_link_libraries(hip_hcc PRIVATE amd_comgr) - target_link_libraries(hip_hcc_static PRIVATE amd_comgr) - - string(REPLACE " " ";" HCC_CXX_FLAGS_LIST ${HCC_CXX_FLAGS}) - foreach(TARGET hip_hcc hip_hcc_static) - target_include_directories(${TARGET} SYSTEM INTERFACE $/include>;${HSA_PATH}/include) - endforeach() - add_library(host INTERFACE) - target_link_libraries(host INTERFACE hip_hcc) - add_library(device INTERFACE) - if(HIP_COMPILER STREQUAL "hcc") - target_link_libraries(device INTERFACE host hcc::hccrt hcc::hc_am) - else() - target_link_libraries(device INTERFACE host) - endif() -endif() - if(HIP_PLATFORM STREQUAL "amd") add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/lpl_ca) endif() @@ -488,10 +309,6 @@ endif() ############################# # Install steps ############################# -# Install hip_hcc if runtime is hcc -if(HIP_RUNTIME STREQUAL "hcc") - install(TARGETS hip_hcc_static hip_hcc hiprtc DESTINATION lib) -endif() # Install .hipInfo install(FILES ${PROJECT_BINARY_DIR}/.hipInfo DESTINATION lib) @@ -503,9 +320,7 @@ install(FILES ${PROJECT_BINARY_DIR}/.hipVersion DESTINATION bin) execute_process(COMMAND test ${CMAKE_INSTALL_PREFIX} -ef ${CMAKE_CURRENT_SOURCE_DIR} RESULT_VARIABLE INSTALL_SOURCE) if(NOT ${INSTALL_SOURCE} EQUAL 0) - if(HIP_RUNTIME STREQUAL "hcc") - install(DIRECTORY src DESTINATION .) - elseif(HIP_RUNTIME STREQUAL "rocclr") + if(HIP_RUNTIME STREQUAL "rocclr") install(DIRECTORY rocclr DESTINATION .) endif() install(DIRECTORY bin DESTINATION . USE_SOURCE_PERMISSIONS) @@ -522,13 +337,6 @@ install(DIRECTORY ${PROJECT_BINARY_DIR}/include/hip DESTINATION include FILES_MATCHING PATTERN "*.h*") -if(HIP_RUNTIME STREQUAL "hcc") - install(TARGETS hip_hcc_static hip_hcc host device EXPORT hip-targets DESTINATION ${LIB_INSTALL_DIR}) - install(EXPORT hip-targets DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} NAMESPACE hip::) -elseif(HIP_RUNTIME STREQUAL "rocclr") -# install(TARGETS hip_on_rocclr host device EXPORT hip-targets DESTINATION ${LIB_INSTALL_DIR}) -endif() - ############################# # hip-config ############################# @@ -577,22 +385,7 @@ if (BUILD_HIPIFY_CLANG) add_dependencies(pkg_hip_base hipify-clang) endif() -if(HIP_RUNTIME STREQUAL "hcc") - message("HCC Package\n") - # Package: hip_hcc - set(BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/packages/hip_hcc) - configure_file(packaging/hip-hcc.txt ${BUILD_DIR}/CMakeLists.txt @ONLY) - configure_file(packaging/hip-hcc.postinst ${BUILD_DIR}/postinst @ONLY) - configure_file(packaging/hip-hcc.prerm ${BUILD_DIR}/prerm @ONLY) - add_custom_target(pkg_hip_hcc COMMAND ${CMAKE_COMMAND} . - COMMAND rm -rf *.deb *.rpm *.tar.gz - COMMAND make package - COMMAND cp *.deb ${PROJECT_BINARY_DIR} - COMMAND cp *.rpm ${PROJECT_BINARY_DIR} - COMMAND cp *.tar.gz ${PROJECT_BINARY_DIR} - WORKING_DIRECTORY ${BUILD_DIR} - DEPENDS hip_hcc hip_hcc_static hiprtc) -elseif(HIP_RUNTIME STREQUAL "rocclr") +if(HIP_RUNTIME STREQUAL "rocclr") set(BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/rocclr) configure_file(packaging/hip-rocclr.txt ${BUILD_DIR}/CMakeLists.txt @ONLY) configure_file(packaging/hip-rocclr.postinst ${BUILD_DIR}/postinst @ONLY) @@ -645,11 +438,7 @@ if(POLICY CMP0037) cmake_policy(SET CMP0037 OLD) endif() -if(HIP_RUNTIME STREQUAL "hcc") - add_custom_target(package - WORKING_DIRECTORY ${PROJECT_BINARY_DIR} - DEPENDS pkg_hip_base pkg_hip_hcc pkg_hip_nvcc pkg_hip_doc pkg_hip_samples) -elseif(HIP_RUNTIME STREQUAL "rocclr") +if(HIP_RUNTIME STREQUAL "rocclr") add_custom_target(package WORKING_DIRECTORY ${PROJECT_BINARY_DIR} DEPENDS pkg_hip_base hip_on_rocclr pkg_hip_nvcc pkg_hip_doc pkg_hip_samples) diff --git a/INSTALL.md b/INSTALL.md index 74df7372a9..7254799d3e 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -95,9 +95,10 @@ git clone -b rocm-3.10.x https://github.com/ROCm-Developer-Tools/HIP.git export HIP_DIR="$(readlink -f HIP)" cd "$HIP_DIR" mkdir -p build; cd build -cmake -DCMAKE_BUILD_TYPE=Release -DHIP_COMPILER=clang -DHIP_PLATFORM=rocclr -DCMAKE_PREFIX_PATH="$ROCclr_DIR/build;/opt/rocm/" -DCMAKE_INSTALL_PREFIX= .. +cmake -DCMAKE_PREFIX_PATH="$ROCclr_DIR/build;/opt/rocm/" -DCMAKE_INSTALL_PREFIX= .. make -j sudo make install +Note: If you don't specify CMAKE_INSTALL_PREFIX, hip-rocclr runtime will be installed to "/opt/rocm/hip". ``` ## Default paths and environment variables diff --git a/bin/hipconfig b/bin/hipconfig index 63eb87c58c..b0c446380c 100755 --- a/bin/hipconfig +++ b/bin/hipconfig @@ -8,7 +8,7 @@ use 5.006; use v5.10.1; use Getopt::Long; use Cwd; -# Return name of HIP compiler - either 'nvcc' or 'hcc' +# Return name of HIP compiler - either 'clang' or 'nvcc' # use Getopt::Long; use File::Basename; @@ -34,9 +34,9 @@ if ($p_help) { print " --path, -p : print HIP_PATH (use env var if set, else determine from hipconfig path)\n"; print " --rocmpath, -R : print ROCM_PATH (use env var if set, else determine from hip path or /opt/rocm)\n"; print " --cpp_config, -C : print C++ compiler options\n"; - print " --compiler, -c : print compiler (hcc or clang or nvcc)\n"; + print " --compiler, -c : print compiler (clang or nvcc)\n"; print " --platform, -P : print platform (amd or nvidia)\n"; - print " --runtime, -r : print runtime (hcc or rocclr)\n"; + print " --runtime, -r : print runtime (rocclr or cuda)\n"; print " --hipclangpath, -l : print HIP_CLANG_PATH\n"; print " --full, -f : print full config\n"; print " --version, -v : print hip version\n"; @@ -86,7 +86,6 @@ if (-e "$HIP_PATH/../bin/rocm_agent_enumerator") { $ROCM_PATH=$ENV{'ROCM_PATH'} // "/opt/rocm"; } $CUDA_PATH=$ENV{'CUDA_PATH'} // '/usr/local/cuda'; -$HCC_HOME=$ENV{'HCC_HOME'} // "$ROCM_PATH/hcc"; $HSA_PATH=$ENV{'HSA_PATH'} // "$ROCM_PATH/hsa"; $HIP_CLANG_PATH=$ENV{'HIP_CLANG_PATH'} // "$ROCM_PATH/llvm/bin"; # HIP_ROCCLR_HOME is used by Windows builds @@ -120,8 +119,6 @@ if (defined $HIP_RUNTIME and $HIP_RUNTIME eq "rocclr" and !defined $HIP_ROCCLR_H if (not defined $HIP_PLATFORM) { if (can_run("$HIP_CLANG_PATH/clang++") or can_run("clang++")) { $HIP_PLATFORM = "amd"; - } elsif (can_run("$HCC_HOME/bin/hcc") or can_run("hcc")) { - $HIP_PLATFORM = "amd"; } elsif (can_run("$CUDA_PATH/bin/nvcc") or can_run("nvcc")) { $HIP_PLATFORM = "nvidia"; $HIP_COMPILER = "nvcc"; @@ -132,9 +129,6 @@ if (not defined $HIP_PLATFORM) { } } -if ($HIP_COMPILER eq "hcc") { - $CPP_CONFIG = " -D__HIP_PLATFORM_HCC__= -I$HIP_PATH/include -I$HCC_HOME/include -I$HSA_PATH/include"; -} if ($HIP_COMPILER eq "clang") { # Windows does not have clang at linux default path if (defined $HIP_ROCCLR_HOME and (-e "$HIP_ROCCLR_HOME/bin/clang" or -e "$HIP_ROCCLR_HOME/bin/clang.exe")) { @@ -216,20 +210,6 @@ if (!$printed or $p_full) { if ($HIP_PLATFORM eq "amd") { print "\n" ; - if ($HIP_COMPILER eq "hcc") - { - print "== hcc\n"; - print ("HSA_PATH : $HSA_PATH\n"); - print ("HCC_HOME : $HCC_HOME\n"); - system("$HCC_HOME/bin/hcc --version"); - system("$HCC_HOME/bin/llc --version"); - print ("HCC-cxxflags : "); - system("$HCC_HOME/bin/hcc-config --cxxflags"); - printf("\n"); - print ("HCC-ldflags : "); - system("$HCC_HOME/bin/hcc-config --ldflags"); - printf("\n"); - } if ($HIP_COMPILER eq "clang") { print "== hip-clang\n"; @@ -243,6 +223,8 @@ if (!$printed or $p_full) { print ("hip-clang-ldflags : "); system("$HIP_PATH/bin/hipcc --ldflags"); printf("\n"); + } else { + print ("Unexpected HIP_COMPILER: $HIP_COMPILER\n"); } } if ($HIP_PLATFORM eq "nvidia") { @@ -256,7 +238,7 @@ if (!$printed or $p_full) { print "=== Environment Variables\n"; system("echo PATH=\$PATH"); - system("env | egrep '^HIP|^HSA|^HCC|^CUDA|^LD_LIBRARY_PATH'"); + system("env | egrep '^HIP|^HSA|^CUDA|^LD_LIBRARY_PATH'"); print "\n" ; diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h old mode 100755 new mode 100644 diff --git a/lpl_ca/CMakeLists.txt b/lpl_ca/CMakeLists.txt index 346173bc09..45e04ecde2 100644 --- a/lpl_ca/CMakeLists.txt +++ b/lpl_ca/CMakeLists.txt @@ -13,8 +13,6 @@ set_target_properties( CXX_EXTENSIONS OFF RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) target_include_directories(lpl - PUBLIC - ${PROJECT_SOURCE_DIR}/src PRIVATE $) @@ -34,7 +32,6 @@ set_target_properties( CXX_EXTENSIONS OFF RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) target_include_directories(ca SYSTEM PUBLIC ${HSA_PATH}/include) -target_include_directories(ca PUBLIC ${PROJECT_SOURCE_DIR}/src) find_package(hsa-runtime64 REQUIRED CONFIG PATHS diff --git a/lpl_ca/ca.hpp b/lpl_ca/ca.hpp index 2d691cd38a..9936a3f8aa 100644 --- a/lpl_ca/ca.hpp +++ b/lpl_ca/ca.hpp @@ -2,8 +2,7 @@ #include "common.hpp" -#include "../src/code_object_bundle.inl" - +#include "code_object_bundle.inl" #include "clara/clara.hpp" #include diff --git a/lpl_ca/clara/clara.hpp b/lpl_ca/clara/clara.hpp index 10b70da644..5df0b86574 100644 --- a/lpl_ca/clara/clara.hpp +++ b/lpl_ca/clara/clara.hpp @@ -637,7 +637,7 @@ struct BoundFlagRefBase : BoundRefBase { auto isFlag() const -> bool override { return true; } auto setValue(std::string const& arg) -> ParserResult override { - bool flag; + bool flag = 0; auto result = convertInto(arg, flag); if (result) setFlag(flag); return result; diff --git a/src/code_object_bundle.inl b/lpl_ca/code_object_bundle.inl similarity index 100% rename from src/code_object_bundle.inl rename to lpl_ca/code_object_bundle.inl diff --git a/src/AMDGPUPTNote.h b/src/AMDGPUPTNote.h deleted file mode 100644 index f2a656f2d7..0000000000 --- a/src/AMDGPUPTNote.h +++ /dev/null @@ -1,45 +0,0 @@ -//===-- AMDGPUNoteType.h - AMDGPU ELF PT_NOTE section info-------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// Enums and constants for AMDGPU PT_NOTE sections. -/// -// -//===----------------------------------------------------------------------===// -// -#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPTNOTE_H -#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPTNOTE_H - -namespace AMDGPU { - -namespace ElfNote { - -const char SectionName[] = ".note"; - -const char NoteName[] = "AMD"; - -// TODO: Move this enum to include/llvm/Support so it can be used in tools? -enum NoteType { - NT_AMDGPU_HSA_CODE_OBJECT_VERSION = 1, - NT_AMDGPU_HSA_HSAIL = 2, - NT_AMDGPU_HSA_ISA = 3, - NT_AMDGPU_HSA_PRODUCER = 4, - NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5, - NT_AMDGPU_HSA_EXTENSION = 6, - NT_AMDGPU_HSA_RUNTIME_METADATA_V_1 = 7, // deprecated since 12/14/16. - NT_AMDGPU_HSA_RUNTIME_METADATA_V_2 = 8, - NT_AMDGPU_HSA_RUNTIME_METADATA = NT_AMDGPU_HSA_RUNTIME_METADATA_V_2, - NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101, - NT_AMDGPU_HSA_HLDEBUG_TARGET = 102 -}; -} // namespace ElfNote -} // namespace AMDGPU - -#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUNOTETYPE_H diff --git a/src/AMDGPURuntimeMetadata.h b/src/AMDGPURuntimeMetadata.h deleted file mode 100644 index cc42f473f7..0000000000 --- a/src/AMDGPURuntimeMetadata.h +++ /dev/null @@ -1,290 +0,0 @@ -//===-- AMDGPURuntimeMetadata.h - AMDGPU Runtime Metadata -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// Enums and structure types used by runtime metadata. -/// -/// Runtime requests certain information (metadata) about kernels to be able -/// to execute the kernels and answer the queries about the kernels. -/// The metadata is represented as a note element in the .note ELF section of a -/// binary (code object). The desc field of the note element is a YAML string -/// consisting of key-value pairs. Each key is a string. Each value can be -/// an integer, a string, or an YAML sequence. There are 3 levels of YAML maps. -/// At the beginning of the YAML string is the module level YAML map. A -/// kernel-level YAML map is in the amd.Kernels sequence. A -/// kernel-argument-level map is in the amd.Args sequence. -/// -/// The format should be kept backward compatible. New enum values and bit -/// fields should be appended at the end. It is suggested to bump up the -/// revision number whenever the format changes and document the change -/// in the revision in this header. -/// -// -//===----------------------------------------------------------------------===// -// -#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H -#define LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H - -#include -#include -#include - -namespace AMDGPU { -namespace RuntimeMD { - -// Version and revision of runtime metadata -const unsigned char MDVersion = 2; -const unsigned char MDRevision = 1; - -// Name of keys for runtime metadata. -namespace KeyName { - -// Runtime metadata version -const char MDVersion[] = "amd.MDVersion"; - -// Instruction set architecture information -const char IsaInfo[] = "amd.IsaInfo"; -// Wavefront size -const char IsaInfoWavefrontSize[] = "amd.IsaInfoWavefrontSize"; -// Local memory size in bytes -const char IsaInfoLocalMemorySize[] = "amd.IsaInfoLocalMemorySize"; -// Number of execution units per compute unit -const char IsaInfoEUsPerCU[] = "amd.IsaInfoEUsPerCU"; -// Maximum number of waves per execution unit -const char IsaInfoMaxWavesPerEU[] = "amd.IsaInfoMaxWavesPerEU"; -// Maximum flat work group size -const char IsaInfoMaxFlatWorkGroupSize[] = "amd.IsaInfoMaxFlatWorkGroupSize"; -// SGPR allocation granularity -const char IsaInfoSGPRAllocGranule[] = "amd.IsaInfoSGPRAllocGranule"; -// Total number of SGPRs -const char IsaInfoTotalNumSGPRs[] = "amd.IsaInfoTotalNumSGPRs"; -// Addressable number of SGPRs -const char IsaInfoAddressableNumSGPRs[] = "amd.IsaInfoAddressableNumSGPRs"; -// VGPR allocation granularity -const char IsaInfoVGPRAllocGranule[] = "amd.IsaInfoVGPRAllocGranule"; -// Total number of VGPRs -const char IsaInfoTotalNumVGPRs[] = "amd.IsaInfoTotalNumVGPRs"; -// Addressable number of VGPRs -const char IsaInfoAddressableNumVGPRs[] = "amd.IsaInfoAddressableNumVGPRs"; - -// Language -const char Language[] = "amd.Language"; -// Language version -const char LanguageVersion[] = "amd.LanguageVersion"; - -// Kernels -const char Kernels[] = "amd.Kernels"; -// Kernel name -const char KernelName[] = "amd.KernelName"; -// Kernel arguments -const char Args[] = "amd.Args"; -// Kernel argument size in bytes -const char ArgSize[] = "amd.ArgSize"; -// Kernel argument alignment -const char ArgAlign[] = "amd.ArgAlign"; -// Kernel argument type name -const char ArgTypeName[] = "amd.ArgTypeName"; -// Kernel argument name -const char ArgName[] = "amd.ArgName"; -// Kernel argument kind -const char ArgKind[] = "amd.ArgKind"; -// Kernel argument value type -const char ArgValueType[] = "amd.ArgValueType"; -// Kernel argument address qualifier -const char ArgAddrQual[] = "amd.ArgAddrQual"; -// Kernel argument access qualifier -const char ArgAccQual[] = "amd.ArgAccQual"; -// Kernel argument is const qualified -const char ArgIsConst[] = "amd.ArgIsConst"; -// Kernel argument is restrict qualified -const char ArgIsRestrict[] = "amd.ArgIsRestrict"; -// Kernel argument is volatile qualified -const char ArgIsVolatile[] = "amd.ArgIsVolatile"; -// Kernel argument is pipe qualified -const char ArgIsPipe[] = "amd.ArgIsPipe"; -// Required work group size -const char ReqdWorkGroupSize[] = "amd.ReqdWorkGroupSize"; -// Work group size hint -const char WorkGroupSizeHint[] = "amd.WorkGroupSizeHint"; -// Vector type hint -const char VecTypeHint[] = "amd.VecTypeHint"; -// Kernel index for device enqueue -const char KernelIndex[] = "amd.KernelIndex"; -// No partial work groups -const char NoPartialWorkGroups[] = "amd.NoPartialWorkGroups"; -// Prinf function call information -const char PrintfInfo[] = "amd.PrintfInfo"; -// The actual kernel argument access qualifier -const char ArgActualAcc[] = "amd.ArgActualAcc"; -// Alignment of pointee type -const char ArgPointeeAlign[] = "amd.ArgPointeeAlign"; - -} // end namespace KeyName - -namespace KernelArg { - -enum Kind : uint8_t { - ByValue = 0, - GlobalBuffer = 1, - DynamicSharedPointer = 2, - Sampler = 3, - Image = 4, - Pipe = 5, - Queue = 6, - HiddenGlobalOffsetX = 7, - HiddenGlobalOffsetY = 8, - HiddenGlobalOffsetZ = 9, - HiddenNone = 10, - HiddenPrintfBuffer = 11, - HiddenDefaultQueue = 12, - HiddenCompletionAction = 13, -}; - -enum ValueType : uint16_t { - Struct = 0, - I8 = 1, - U8 = 2, - I16 = 3, - U16 = 4, - F16 = 5, - I32 = 6, - U32 = 7, - F32 = 8, - I64 = 9, - U64 = 10, - F64 = 11, -}; - -// Avoid using 'None' since it conflicts with a macro in X11 header file. -enum AccessQualifer : uint8_t { - AccNone = 0, - ReadOnly = 1, - WriteOnly = 2, - ReadWrite = 3, -}; - -enum AddressSpaceQualifer : uint8_t { - Private = 0, - Global = 1, - Constant = 2, - Local = 3, - Generic = 4, - Region = 5, -}; - -} // end namespace KernelArg - -// Invalid values are used to indicate an optional key should not be emitted. -const uint8_t INVALID_ADDR_QUAL = 0xff; -const uint8_t INVALID_ACC_QUAL = 0xff; -const uint32_t INVALID_KERNEL_INDEX = ~0U; - -namespace KernelArg { - -// In-memory representation of kernel argument information. -struct Metadata { - uint32_t Size = 0; - uint32_t Align = 0; - uint32_t PointeeAlign = 0; - uint8_t Kind = 0; - uint16_t ValueType = 0; - std::string TypeName; - std::string Name; - uint8_t AddrQual = INVALID_ADDR_QUAL; - uint8_t AccQual = INVALID_ACC_QUAL; - uint8_t IsVolatile = 0; - uint8_t IsConst = 0; - uint8_t IsRestrict = 0; - uint8_t IsPipe = 0; - - Metadata() = default; -}; - -} // end namespace KernelArg - -namespace Kernel { - -// In-memory representation of kernel information. -struct Metadata { - std::string Name; - std::string Language; - std::vector LanguageVersion; - std::vector ReqdWorkGroupSize; - std::vector WorkGroupSizeHint; - std::string VecTypeHint; - uint32_t KernelIndex = INVALID_KERNEL_INDEX; - uint8_t NoPartialWorkGroups = 0; - std::vector Args; - - Metadata() = default; -}; - -} // end namespace Kernel - -namespace IsaInfo { - -/// \brief In-memory representation of instruction set architecture -/// information. -struct Metadata { - /// \brief Wavefront size. - unsigned WavefrontSize = 0; - /// \brief Local memory size in bytes. - unsigned LocalMemorySize = 0; - /// \brief Number of execution units per compute unit. - unsigned EUsPerCU = 0; - /// \brief Maximum number of waves per execution unit. - unsigned MaxWavesPerEU = 0; - /// \brief Maximum flat work group size. - unsigned MaxFlatWorkGroupSize = 0; - /// \brief SGPR allocation granularity. - unsigned SGPRAllocGranule = 0; - /// \brief Total number of SGPRs. - unsigned TotalNumSGPRs = 0; - /// \brief Addressable number of SGPRs. - unsigned AddressableNumSGPRs = 0; - /// \brief VGPR allocation granularity. - unsigned VGPRAllocGranule = 0; - /// \brief Total number of VGPRs. - unsigned TotalNumVGPRs = 0; - /// \brief Addressable number of VGPRs. - unsigned AddressableNumVGPRs = 0; - - Metadata() = default; -}; - -} // end namespace IsaInfo - -namespace Program { - -// In-memory representation of program information. -struct Metadata { - std::vector MDVersionSeq; - IsaInfo::Metadata IsaInfo; - std::vector PrintfInfo; - std::vector Kernels; - - explicit Metadata() = default; - - // Construct from an YAML string. - explicit Metadata(const std::string& YAML); - - // Convert to YAML string. - std::string toYAML(); - - // Convert from YAML string. - static Metadata fromYAML(const std::string& S); -}; - -} // end namespace Program - -} // end namespace RuntimeMD -} // end namespace AMDGPU - -#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H diff --git a/src/device_util.h b/src/device_util.h deleted file mode 100644 index 84dbbf71ed..0000000000 --- a/src/device_util.h +++ /dev/null @@ -1,136 +0,0 @@ -/* -Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#ifndef DEVICE_UTIL_H -#define DEVICE_UTIL_H - -#include - -/* - Heap size computation for malloc and free device functions. -*/ - -#define HIP_SQRT_2 1.41421356237 -#define HIP_SQRT_PI 1.77245385091 - -#define __hip_erfinva3 -0.140543331 -#define __hip_erfinva2 0.914624893 -#define __hip_erfinva1 -1.645349621 -#define __hip_erfinva0 0.886226899 - -#define __hip_erfinvb4 0.012229801 -#define __hip_erfinvb3 -0.329097515 -#define __hip_erfinvb2 1.442710462 -#define __hip_erfinvb1 -2.118377725 -#define __hip_erfinvb0 1 - -#define __hip_erfinvc3 1.641345311 -#define __hip_erfinvc2 3.429567803 -#define __hip_erfinvc1 -1.62490649 -#define __hip_erfinvc0 -1.970840454 - -#define __hip_erfinvd2 1.637067800 -#define __hip_erfinvd1 3.543889200 -#define __hip_erfinvd0 1 - -#define HIP_PI 3.14159265358979323846 - -__device__ float __hip_erfinvf(float x); -__device__ double __hip_erfinv(double x); - -__device__ float __hip_j0f(float x); -__device__ double __hip_j0(double x); - -__device__ float __hip_j1f(float x); -__device__ double __hip_j1(double x); - -__device__ float __hip_y0f(float x); -__device__ double __hip_y0(double x); - -__device__ float __hip_y1f(float x); -__device__ double __hip_y1(double x); - -__device__ float __hip_jnf(int n, float x); -__device__ double __hip_jn(int n, double x); - -__device__ float __hip_ynf(int n, float x); -__device__ double __hip_yn(int n, double x); - -__device__ float __hip_precise_cosf(float x); -__device__ float __hip_precise_exp10f(float x); -__device__ float __hip_precise_expf(float x); -__device__ float __hip_precise_frsqrt_rn(float x); -__device__ float __hip_precise_fsqrt_rd(float x); -__device__ float __hip_precise_fsqrt_rn(float x); -__device__ float __hip_precise_fsqrt_ru(float x); -__device__ float __hip_precise_fsqrt_rz(float x); -__device__ float __hip_precise_log10f(float x); -__device__ float __hip_precise_log2f(float x); -__device__ float __hip_precise_logf(float x); -__device__ float __hip_precise_powf(float base, float exponent); -__device__ void __hip_precise_sincosf(float x, float* s, float* c); -__device__ float __hip_precise_sinf(float x); -__device__ float __hip_precise_tanf(float x); -// Double Precision Math -__device__ double __hip_precise_dsqrt_rd(double x); -__device__ double __hip_precise_dsqrt_rn(double x); -__device__ double __hip_precise_dsqrt_ru(double x); -__device__ double __hip_precise_dsqrt_rz(double x); - - -// Float Fast Math -__device__ float __hip_fast_exp10f(float x); -__device__ float __hip_fast_expf(float x); -__device__ float __hip_fast_frsqrt_rn(float x); -__device__ float __hip_fast_fsqrt_rn(float x); -__device__ float __hip_fast_fsqrt_ru(float x); -__device__ float __hip_fast_fsqrt_rz(float x); -__device__ float __hip_fast_log10f(float x); -__device__ float __hip_fast_logf(float x); -__device__ float __hip_fast_powf(float base, float exponent); -__device__ void __hip_fast_sincosf(float x, float* s, float* c); -__device__ float __hip_fast_tanf(float x); -// Double Precision Math -__device__ double __hip_fast_dsqrt_rd(double x); -__device__ double __hip_fast_dsqrt_rn(double x); -__device__ double __hip_fast_dsqrt_ru(double x); -__device__ double __hip_fast_dsqrt_rz(double x); - -float __hip_host_j0f(float x); -double __hip_host_j0(double x); - -float __hip_host_j1f(float x); -double __hip_host_j1(double x); - -float __hip_host_y0f(float x); -double __hip_host_y1(double x); - -float __hip_host_y1f(float x); -double __hip_host_y1(double x); - -float __hip_host_jnf(int n, float x); -double __hip_host_jn(int n, double x); - -float __hip_host_ynf(int n, float x); -double __hip_host_yn(int n, double x); - -#endif diff --git a/src/env.cpp b/src/env.cpp deleted file mode 100644 index a3e0bfbed4..0000000000 --- a/src/env.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "hip_hcc_internal.h" -#include "trace_helper.h" -#include "env.h" - -//--- -// Read environment variables. -void ihipReadEnv_I(int* var_ptr, const char* var_name1, const char* var_name2, - const char* description) { - char* env = getenv(var_name1); - - // Check second name if first not defined, used to allow HIP_ or CUDA_ env vars. - if ((env == NULL) && strcmp(var_name2, "0")) { - env = getenv(var_name2); - } - - // Default is set when variable is initialized (at top of this file), so only override if we - // find an environment variable. - if (env) { - long int v = strtol(env, NULL, 0); - *var_ptr = (int)(v); - } - if (HIP_PRINT_ENV) { - printf("%-30s = %2d : %s\n", var_name1, *var_ptr, description); - } -} - - -void ihipReadEnv_S(std::string* var_ptr, const char* var_name1, const char* var_name2, - const char* description) { - char* env = getenv(var_name1); - - // Check second name if first not defined, used to allow HIP_ or CUDA_ env vars. - if ((env == NULL) && strcmp(var_name2, "0")) { - env = getenv(var_name2); - } - - if (env) { - *static_cast(var_ptr) = env; - } - if (HIP_PRINT_ENV) { - printf("%-30s = %s : %s\n", var_name1, var_ptr->c_str(), description); - } -} - - -void ihipReadEnv_Callback(void* var_ptr, const char* var_name1, const char* var_name2, - const char* description, - std::string (*setterCallback)(void* var_ptr, const char* env)) { - char* env = getenv(var_name1); - - // Check second name if first not defined, used to allow HIP_ or CUDA_ env vars. - if ((env == NULL) && strcmp(var_name2, "0")) { - env = getenv(var_name2); - } - - std::string var_string = "0"; - if (env) { - var_string = setterCallback(var_ptr, env); - } - if (HIP_PRINT_ENV) { - printf("%-30s = %s : %s\n", var_name1, var_string.c_str(), description); - } -} - - -void tokenize(const std::string& s, char delim, std::vector* tokens) { - std::stringstream ss; - ss.str(s); - std::string item; - while (getline(ss, item, delim)) { - item.erase(std::remove(item.begin(), item.end(), ' '), item.end()); // remove whitespace. - tokens->push_back(item); - } -} - -void trim(std::string* s) { - // trim whitespace from beginning and end: - const char* t = "\t\n\r\f\v"; - s->erase(0, s->find_first_not_of(t)); - s->erase(s->find_last_not_of(t) + 1); -} - -static void ltrim(std::string* s) { - // trim whitespace from beginning - const char* t = "\t\n\r\f\v"; - s->erase(0, s->find_first_not_of(t)); -} diff --git a/src/env.h b/src/env.h deleted file mode 100644 index e574f158fb..0000000000 --- a/src/env.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -extern void HipReadEnv(); - - -#define READ_ENV_I(_build, _ENV_VAR, _ENV_VAR2, _description) \ - ihipReadEnv_I(&_ENV_VAR, #_ENV_VAR, #_ENV_VAR2, _description); - -#define READ_ENV_S(_build, _ENV_VAR, _ENV_VAR2, _description) \ - ihipReadEnv_S(&_ENV_VAR, #_ENV_VAR, #_ENV_VAR2, _description); - -#define READ_ENV_C(_build, _ENV_VAR, _ENV_VAR2, _description, _callback) \ - ihipReadEnv_Callback(&_ENV_VAR, #_ENV_VAR, #_ENV_VAR2, _description, _callback); - - -extern void ihipReadEnv_I(int* var_ptr, const char* var_name1, const char* var_name2, - const char* description); -extern void ihipReadEnv_S(std::string* var_ptr, const char* var_name1, const char* var_name2, - const char* description); -extern void ihipReadEnv_Callback(void* var_ptr, const char* var_name1, const char* var_name2, - const char* description, - std::string (*setterCallback)(void* var_ptr, const char* env)); - - -// String functions: -extern void trim(std::string* s); -extern void tokenize(const std::string& s, char delim, std::vector* tokens); diff --git a/src/functional_grid_launch.inl b/src/functional_grid_launch.inl deleted file mode 100644 index a99a15db28..0000000000 --- a/src/functional_grid_launch.inl +++ /dev/null @@ -1,60 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "hip/hcc_detail/program_state.hpp" - -#include "hip/hip_runtime_api.h" - -// Internal header, do not percolate upwards. -#include "hip_hcc_internal.h" -#include "hc.hpp" -#include "trace_helper.h" - -#include -#include -#include -#include - -#include - -using namespace hc; -using namespace std; - -namespace hip_impl -{ - HIP_INTERNAL_EXPORTED_API hsa_agent_t target_agent(hipStream_t stream) - { - if (stream) { - return *static_cast( - stream->locked_getAv()->get_hsa_agent()); - } - GET_TLS(); - if (ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) { - return ihipGetDevice( - ihipGetTlsDefaultCtx()->getDevice()->_deviceId)->_hsaAgent; - } - else { - return *static_cast( - accelerator{}.get_default_view().get_hsa_agent()); - } - } -} diff --git a/src/grid_launch.cpp b/src/grid_launch.cpp deleted file mode 100644 index 5c2b147281..0000000000 --- a/src/grid_launch.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "hip/hcc_detail/grid_launch_GGL.hpp" - -#if __hcc_workweek__ >= 17481 -#include "functional_grid_launch.inl" -#else -#include "macro_based_grid_launch.inl" -#endif \ No newline at end of file diff --git a/src/h2f.cpp b/src/h2f.cpp deleted file mode 100644 index 84d067166f..0000000000 --- a/src/h2f.cpp +++ /dev/null @@ -1,70 +0,0 @@ -/* -Copyright (c) 2018 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include -#include - -// conversion routines between float and half precision -static inline std::uint32_t f32_as_u32(float f) { union { float f; std::uint32_t u; } v; v.f = f; return v.u; } -static inline float u32_as_f32(std::uint32_t u) { union { float f; std::uint32_t u; } v; v.u = u; return v.f; } -static inline int clamp_int(int i, int l, int h) { return std::min(std::max(i, l), h); } - -// half to float, the f16 is in the low 16 bits of the input argument a -static inline float __convert_half_to_float(std::uint32_t a) noexcept { - std::uint32_t u = ((a << 13) + 0x70000000U) & 0x8fffe000U; - std::uint32_t v = f32_as_u32(u32_as_f32(u) * 0x1.0p+112f) + 0x38000000U; - u = (a & 0x7fff) != 0 ? v : u; - return u32_as_f32(u) * 0x1.0p-112f; -} - -// float to half with nearest even rounding -// The lower 16 bits of the result is the bit pattern for the f16 -static inline std::uint32_t __convert_float_to_half(float a) noexcept { - std::uint32_t u = f32_as_u32(a); - int e = static_cast((u >> 23) & 0xff) - 127 + 15; - std::uint32_t m = ((u >> 11) & 0xffe) | ((u & 0xfff) != 0); - std::uint32_t i = 0x7c00 | (m != 0 ? 0x0200 : 0); - std::uint32_t n = ((std::uint32_t)e << 12) | m; - std::uint32_t s = (u >> 16) & 0x8000; - int b = clamp_int(1-e, 0, 13); - std::uint32_t d = (0x1000 | m) >> b; - d |= (d << b) != (0x1000 | m); - std::uint32_t v = e < 1 ? d : n; - v = (v >> 2) + (((v & 0x7) == 3) | ((v & 0x7) > 5)); - v = e > 30 ? 0x7c00 : v; - v = e == 143 ? i : v; - return s | v; -} - -// On machines without fp16 instructions, clang lowers llvm.convert.from.fp16 -// to call of this function. -extern "C" __attribute__((visibility("default"))) -float __gnu_h2f_ieee(unsigned short h){ - return __convert_half_to_float((std::uint32_t) h); -} - -// On machines without fp16 instructions, clang lowers llvm.convert.to.fp16 -// to call of this function. -extern "C" __attribute__((visibility("default"))) -unsigned short __gnu_f2h_ieee(float f){ - return (unsigned short)__convert_float_to_half(f); -} diff --git a/src/hip_clang.cpp b/src/hip_clang.cpp deleted file mode 100644 index 7a2e72a20e..0000000000 --- a/src/hip_clang.cpp +++ /dev/null @@ -1,533 +0,0 @@ -/* -Copyright (c) 2018 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include -#include -#include - -#include "hip/hip_runtime.h" -#include "hip_hcc_internal.h" -#include "hip_fatbin.h" -#include "trace_helper.h" -#include "program_state.inl" - -#ifdef __GNUC__ -#pragma GCC visibility push (default) -#endif - -extern "C" std::vector* -__hipRegisterFatBinary(const void* data) -{ - hip_impl::hip_init(); - - tprintf(DB_FB, "Enter __hipRegisterFatBinary(%p)\n", data); - const __CudaFatBinaryWrapper* fbwrapper = reinterpret_cast(data); - if (fbwrapper->magic != __hipFatMAGIC2 || fbwrapper->version != 1) { - return nullptr; - } - - const __ClangOffloadBundleHeader* header = fbwrapper->binary; - std::string magic(reinterpret_cast(header), sizeof(CLANG_OFFLOAD_BUNDLER_MAGIC) - 1); - if (magic.compare(CLANG_OFFLOAD_BUNDLER_MAGIC)) { - return nullptr; - } - - auto modules = new std::vector(g_deviceCnt); - if (!modules) { - return nullptr; - } - - const __ClangOffloadBundleDesc* desc = &header->desc[0]; - for (uint64_t i = 0; i < header->numBundles; ++i, - desc = reinterpret_cast( - reinterpret_cast(&desc->triple[0]) + desc->tripleSize)) { - - std::string triple{&desc->triple[0], sizeof(AMDGCN_AMDHSA_TRIPLE) - 1}; - if (triple.compare(AMDGCN_AMDHSA_TRIPLE)) - continue; - - std::string target{&desc->triple[sizeof(AMDGCN_AMDHSA_TRIPLE)], - desc->tripleSize - sizeof(AMDGCN_AMDHSA_TRIPLE)}; - tprintf(DB_FB, "Found bundle for %s\n", target.c_str()); - - for (int deviceId = 0; deviceId < g_deviceCnt; ++deviceId) { - hsa_agent_t agent = g_allAgents[deviceId + 1]; - - char name[64] = {}; - hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name); - if (target.compare(name)) { - continue; - } - - ihipModule_t* module = new ihipModule_t; - if (!module) { - continue; - } - - hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, nullptr, - &module->executable); - - std::string image{reinterpret_cast( - reinterpret_cast(header) + desc->offset), desc->size}; - if (HIP_DUMP_CODE_OBJECT) - __hipDumpCodeObject(image); - module->executable = hip_impl::get_program_state().load_executable_no_copy( - reinterpret_cast(header) + desc->offset, desc->size, - module->executable, agent); - - if (module->executable.handle) { - hip_impl::program_state_impl::read_kernarg_metadata(image, module->kernargs); - modules->at(deviceId) = module; - - tprintf(DB_FB, "Loaded code object for %s, args size=%ld\n", name, module->kernargs.size()); - } else { - fprintf(stderr, "Failed to load code object for %s\n", name); - abort(); - } - } - } - - for (int deviceId = 0; deviceId < g_deviceCnt; ++deviceId) { - hsa_agent_t agent = g_allAgents[deviceId + 1]; - - char name[64] = {}; - hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name); - if (!(*modules)[deviceId]) { - fprintf(stderr, "No device code bundle for %s\n", name); - abort(); - } - } - - tprintf(DB_FB, "__hipRegisterFatBinary succeeds and returns %p\n", modules); - return modules; -} - -std::map> g_functions; - -extern "C" void __hipRegisterFunction( - std::vector* modules, - const void* hostFunction, - char* deviceFunction, - const char* deviceName, - unsigned int threadLimit, - uint3* tid, - uint3* bid, - dim3* blockDim, - dim3* gridDim, - int* wSize) -{ - HIP_INIT_API(NONE, modules, hostFunction, deviceFunction, deviceName); - std::vector functions(g_deviceCnt); - - assert(modules && modules->size() >= g_deviceCnt); - for (int deviceId = 0; deviceId < g_deviceCnt; ++deviceId) { - hipFunction_t function; - hsa_agent_t agent = g_allAgents[deviceId + 1]; - if ((hipSuccess == hipModuleGetFunctionEx(&function, modules->at(deviceId), deviceName, &agent) || - // With code-object-v3, we need to match the kernel descriptor symbol name - (hipSuccess == hipModuleGetFunctionEx( - &function, modules->at(deviceId), - (std::string(deviceName) + std::string(".kd")).c_str(), - &agent - ))) && function != nullptr) { - functions[deviceId] = function; - } - else { - tprintf(DB_FB, "__hipRegisterFunction cannot find kernel %s for" - " device %d\n", deviceName, deviceId); - } - } - - g_functions.insert(std::make_pair(hostFunction, std::move(functions))); -} - -static inline const char* hsa_strerror(hsa_status_t status) { - const char* str = nullptr; - if (hsa_status_string(status, &str) == HSA_STATUS_SUCCESS) { - return str; - } - return "Unknown error"; -} - -struct RegisteredVar { -public: - RegisteredVar(): size_(0), devicePtr_(nullptr) {} - ~RegisteredVar() {} - - static inline const char* hsa_strerror(hsa_status_t status) { - const char* str = nullptr; - if (hsa_status_string(status, &str) == HSA_STATUS_SUCCESS) { - return str; - } - return "Unknown error"; -} - -hipDeviceptr_t getdeviceptr() const { return devicePtr_; }; - size_t getvarsize() const { return size_; }; - - size_t size_; // Size of the variable - hipDeviceptr_t devicePtr_; //Device Memory Address of the variable. -}; - -struct DeviceVar { - void* shadowVptr; - std::string hostVar; - size_t size; - std::vector* modules; - std::vector rvars; - bool dyn_undef; -}; - -std::unordered_multimap g_vars; - -//The logic follows PlatformState::getGlobalVar in ROCclr RT -static DeviceVar* findVar(std::string hostVar, int deviceId, hipModule_t hmod) { - DeviceVar* dvar = nullptr; - if (hmod != nullptr) { - // If module is provided, then get the var only from that module - auto var_range = g_vars.equal_range(hostVar); - for (auto it = var_range.first; it != var_range.second; ++it) { - if ((*it->second.modules)[deviceId] == hmod) { - dvar = &(it->second); - break; - } - } - } else { - // If var count is < 2, return the var - if (g_vars.count(hostVar) < 2) { - auto it = g_vars.find(hostVar); - dvar = ((it == g_vars.end()) ? nullptr : &(it->second)); - } else { - // If var count is > 2, return the original var, - // if original var count != 1, return g_vars.end()/Invalid - size_t orig_global_count = 0; - auto var_range = g_vars.equal_range(hostVar); - for (auto it = var_range.first; it != var_range.second; ++it) { - // when dyn_undef is set, it is a shadow var - if (it->second.dyn_undef == false) { - ++orig_global_count; - dvar = &(it->second); - } - } - dvar = ((orig_global_count == 1) ? dvar : nullptr); - } - } - return dvar; -} - -hipError_t ihipGetGlobalVar(hipDeviceptr_t* dev_ptr, size_t* size_ptr, - const char* hostVar, hipModule_t hmod) { - GET_TLS(); - auto ctx = ihipGetTlsDefaultCtx(); - - if (!ctx) return hipErrorInvalidValue; - - auto device = ctx->getDevice(); - - if (!device) return hipErrorInvalidValue; - - ihipDevice_t* currentDevice = ihipGetDevice(device->_deviceId); - - if (!currentDevice) return hipErrorInvalidValue; - - int deviceId = device->_deviceId; - - DeviceVar* dvar = findVar(std::string(hostVar), deviceId, hmod); - if (dvar == nullptr) return hipErrorInvalidValue; - - if (dvar->rvars[deviceId].getdeviceptr() == nullptr) return hipErrorInvalidValue; - - *size_ptr = dvar->rvars[deviceId].getvarsize(); - *dev_ptr = dvar->rvars[deviceId].getdeviceptr(); - return hipSuccess; -} - -static bool createGlobalVarObj(const hsa_executable_t& hsaExecutable, const hsa_agent_t& hasAgent, - const char* global_name, void** device_pptr, size_t* bytes) { - hsa_status_t status = HSA_STATUS_SUCCESS; - hsa_symbol_kind_t sym_type; - hsa_executable_symbol_t global_symbol; - std::string buildLog; - - /* Find HSA Symbol by name */ - status = hsa_executable_get_symbol_by_name(hsaExecutable, global_name, &hasAgent, - &global_symbol); - if (status != HSA_STATUS_SUCCESS) { - buildLog += "Error: Failed to find the Symbol by Name: "; - buildLog += hsa_strerror(status); - tprintf(DB_FB, "createGlobalVarObj: %s\n", buildLog.c_str()); - return false; - } - - /* Find HSA Symbol Type */ - status = hsa_executable_symbol_get_info(global_symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, - &sym_type); - if (status != HSA_STATUS_SUCCESS) { - buildLog += "Error: Failed to find the Symbol Type : "; - buildLog += hsa_strerror(status); - tprintf(DB_FB, "createGlobalVarObj: %s\n", buildLog.c_str()); - return false; - } - - /* Make sure symbol type is VARIABLE */ - if (sym_type != HSA_SYMBOL_KIND_VARIABLE) { - buildLog += "Error: Symbol is not of type VARIABLE : "; - buildLog += hsa_strerror(status); - tprintf(DB_FB, "createGlobalVarObj: %s\n", buildLog.c_str()); - return false; - } - - /* Retrieve the size of the variable */ - status = hsa_executable_symbol_get_info(global_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, bytes); - - if (status != HSA_STATUS_SUCCESS) { - buildLog += "Error: Failed to retrieve the Symbol Size : "; - buildLog += hsa_strerror(status); - tprintf(DB_FB, "createGlobalVarObj: %s\n", buildLog.c_str()); - return false; - } - - /* Find HSA Symbol Address */ - status = hsa_executable_symbol_get_info(global_symbol, - HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, device_pptr); - if (status != HSA_STATUS_SUCCESS) { - buildLog += "Error: Failed to find the Symbol Address : "; - buildLog += hsa_strerror(status); - tprintf(DB_FB, "createGlobalVarObj: %s\n", buildLog.c_str()); - return false; - } else { - tprintf(DB_FB, "createGlobalVarObj: var %s : device=%p, size=%zu\n", global_name, *device_pptr, *bytes); - } - - return true; -} - -// Registers a device-side global variable. -// For each global variable in device code, there is a corresponding shadow -// global variable in host code. The shadow host variable is used to keep -// track of the value of the device side global variable between kernel -// executions. -// The basic logic is taken from ROCclr RT, but there is much difference. -extern "C" void __hipRegisterVar( - std::vector* modules, // The device modules containing code object - char* var, // The shadow variable in host code - char* hostVar, // Variable name in host code - const char* deviceVar, // Variable name in device code - int ext, // Whether this variable is external - int size, // Size of the variable - int constant, // Whether this variable is constant - int global) // Unknown, always 0 -{ - HIP_INIT_API(__hipRegisterVar, modules, var, hostVar, deviceVar, ext, size, constant, global); - - DeviceVar dvar{var, std::string{ hostVar }, static_cast(size), modules, - std::vector{ g_deviceCnt }, false }; - - for (int deviceId = 0; deviceId < g_deviceCnt; deviceId++) { - auto device = ihipGetDevice(deviceId); - if(!device) { - continue; - } - hsa_executable_t& executable = (*modules)[deviceId]->executable; - hsa_agent_t& agent = g_allAgents[deviceId + 1]; - size_t bytes = 0; - hipDeviceptr_t devicePtr = nullptr; - - bool success = createGlobalVarObj(executable, agent, hostVar, &devicePtr, &bytes); - if(!success) { - return; - } - dvar.rvars[deviceId].devicePtr_ = devicePtr; - dvar.rvars[deviceId].size_ = bytes; - - hc::AmPointerInfo ptrInfo(nullptr, devicePtr, devicePtr, bytes, device->_acc, true, false); - hc::am_memtracker_add(devicePtr, ptrInfo); - - #if USE_APP_PTR_FOR_CTX - hc::am_memtracker_update(devicePtr, device->_deviceId, 0u, ihipGetTlsDefaultCtx()); - #else - hc::am_memtracker_update(devicePtr, device->_deviceId, 0u); - #endif - } - g_vars.insert(std::make_pair(std::string(hostVar), dvar)); -} - -extern "C" void __hipUnregisterFatBinary(std::vector* modules) -{ - std::for_each(modules->begin(), modules->end(), [](hipModule_t module){ delete module; }); - delete modules; -} - -hipError_t hipConfigureCall( - dim3 gridDim, - dim3 blockDim, - size_t sharedMem, - hipStream_t stream) -{ - GET_TLS(); - auto ctx = ihipGetTlsDefaultCtx(); - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - - crit->_execStack.push(ihipExec_t{gridDim, blockDim, sharedMem, stream}); - return hipSuccess; -} - - -extern "C" hipError_t __hipPushCallConfiguration( - dim3 gridDim, - dim3 blockDim, - size_t sharedMem, - hipStream_t stream) -{ - GET_TLS(); - auto ctx = ihipGetTlsDefaultCtx(); - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - - crit->_execStack.push(ihipExec_t{gridDim, blockDim, sharedMem, stream}); - return hipSuccess; -} - -extern "C" hipError_t __hipPopCallConfiguration( - dim3 *gridDim, - dim3 *blockDim, - size_t *sharedMem, - hipStream_t *stream) -{ - GET_TLS(); - auto ctx = ihipGetTlsDefaultCtx(); - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - - ihipExec_t exec; - exec = std::move(crit->_execStack.top()); - crit->_execStack.pop(); - - *gridDim = exec._gridDim; - *blockDim = exec._blockDim; - *sharedMem = exec._sharedMem; - *stream = exec._hStream; - - return hipSuccess; -} - -int getCurrentDeviceId() -{ - GET_TLS(); - - int deviceId = 0; - auto ctx = ihipGetTlsDefaultCtx(); - - if(!ctx) return deviceId; - - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - - if(crit->_execStack.size() != 0) - { - auto &exec = crit->_execStack.top(); - - if (exec._hStream) { - deviceId = exec._hStream->getDevice()->_deviceId; - } else if (ctx->getDevice()) { - deviceId = ctx->getDevice()->_deviceId; - } - } else if (ctx->getDevice()) { - deviceId = ctx->getDevice()->_deviceId; - } - return deviceId; -} - -hipFunction_t ihipGetDeviceFunction(const void *hostFunction) -{ - int deviceId = getCurrentDeviceId(); - auto it = g_functions.find(hostFunction); - if (it == g_functions.end() || !it->second[deviceId]) { - return nullptr; - } - return it->second[deviceId]; -} - -hipError_t hipSetupArgument( - const void *arg, - size_t size, - size_t offset) -{ - HIP_INIT_API(hipSetupArgument, arg, size, offset); - auto ctx = ihipGetTlsDefaultCtx(); - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - auto& arguments = crit->_execStack.top()._arguments; - - if (arguments.size() < offset + size) { - arguments.resize(offset + size); - } - - ::memcpy(&arguments[offset], arg, size); - return hipSuccess; -} - -hipError_t hipLaunchByPtr(const void *hostFunction) -{ - HIP_INIT_API(hipLaunchByPtr, hostFunction); - ihipExec_t exec; - { - auto ctx = ihipGetTlsDefaultCtx(); - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - exec = std::move(crit->_execStack.top()); - crit->_execStack.pop(); - } - - int deviceId; - if (exec._hStream) { - deviceId = exec._hStream->getDevice()->_deviceId; - } - else if (ihipGetTlsDefaultCtx() && ihipGetTlsDefaultCtx()->getDevice()) { - deviceId = ihipGetTlsDefaultCtx()->getDevice()->_deviceId; - } - else { - deviceId = 0; - } - - hipError_t e = hipSuccess; - decltype(g_functions)::iterator it; - if ((it = g_functions.find(hostFunction)) == g_functions.end() || - !it->second[deviceId]) { - e = hipErrorUnknown; - fprintf(stderr, "hipLaunchByPtr cannot find kernel with stub address %p" - " for device %d!\n", hostFunction, deviceId); - abort(); - } else { - size_t size = exec._arguments.size(); - void *extra[] = { - HIP_LAUNCH_PARAM_BUFFER_POINTER, &exec._arguments[0], - HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, - HIP_LAUNCH_PARAM_END - }; - - e = hipModuleLaunchKernel(it->second[deviceId], - exec._gridDim.x, exec._gridDim.y, exec._gridDim.z, - exec._blockDim.x, exec._blockDim.y, exec._blockDim.z, - exec._sharedMem, exec._hStream, nullptr, extra); - } - - return ihipLogStatus(e); -} -#ifdef __GNUC__ -#pragma GCC visibility pop -#endif diff --git a/src/hip_context.cpp b/src/hip_context.cpp deleted file mode 100644 index 300877f8b5..0000000000 --- a/src/hip_context.cpp +++ /dev/null @@ -1,332 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -//--- -// Driver initialization and reporting: - -#include - -#include "hip/hip_runtime.h" -#include "hip_hcc_internal.h" -#include "trace_helper.h" - -void ihipCtxStackUpdate() { - GET_TLS(); - if (tls->ctxStack.empty()) { - tls->ctxStack.push(ihipGetTlsDefaultCtx()); - } -} - -hipError_t hipInit(unsigned int flags) { - HIP_INIT_API(hipInit, flags); - - hipError_t e = hipSuccess; - - // Flags must be 0 - if (flags != 0) { - e = hipErrorInvalidValue; - } - - return ihipLogStatus(e); -} - -hipError_t hipCtxCreate(hipCtx_t* ctx, unsigned int flags, hipDevice_t device) { - HIP_INIT_API(hipCtxCreate, ctx, flags, device); // FIXME - review if we want to init - hipError_t e = hipSuccess; - auto deviceHandle = ihipGetDevice(device); - { - // Obtain mutex access to the device critical data, release by destructor - LockedAccessor_DeviceCrit_t deviceCrit(deviceHandle->criticalData()); - auto ictx = new ihipCtx_t(deviceHandle, g_deviceCnt, flags); - *ctx = ictx; - ihipSetTlsDefaultCtx(*ctx); - tls->ctxStack.push(*ctx); - tls->getPrimaryCtx = false; - deviceCrit->addContext(ictx); - } - - return ihipLogStatus(e); -} - -hipError_t hipDeviceGet(hipDevice_t* device, int deviceId) { - HIP_INIT_API(hipDeviceGet, device, deviceId); // FIXME - review if we want to init - - auto deviceHandle = ihipGetDevice(deviceId); - - hipError_t e = hipSuccess; - if (deviceHandle == NULL) { - e = hipErrorInvalidDevice; - } else { - *device = deviceId; - } - - return ihipLogStatus(e); -}; - -hipError_t hipDriverGetVersion(int* driverVersion) { - HIP_INIT_API(hipDriverGetVersion, driverVersion); - hipError_t e = hipSuccess; - if (driverVersion) { - *driverVersion = 4; - } else { - e = hipErrorInvalidValue; - } - - return ihipLogStatus(e); -} - -hipError_t hipRuntimeGetVersion(int* runtimeVersion) { - HIP_INIT_API(hipRuntimeGetVersion, runtimeVersion); - hipError_t e = hipSuccess; - if (runtimeVersion) { - *runtimeVersion = HIP_VERSION_PATCH; - } else { - e = hipErrorInvalidValue; - } - - return ihipLogStatus(e); -} - -hipError_t hipCtxDestroy(hipCtx_t ctx) { - HIP_INIT_API(hipCtxDestroy, ctx); - hipError_t e = hipSuccess; - ihipCtx_t* currentCtx = ihipGetTlsDefaultCtx(); - ihipCtx_t* primaryCtx = ((ihipDevice_t*)ctx->getDevice())->_primaryCtx; - if (primaryCtx == ctx) { - e = hipErrorInvalidValue; - } else { - if (currentCtx == ctx) { - // need to destroy the ctx associated with calling thread - tls->ctxStack.pop(); - } - { - auto deviceHandle = ctx->getWriteableDevice(); - deviceHandle->locked_removeContext(ctx); - ctx->locked_reset(); - } - delete ctx; // As per CUDA docs , attempting to access ctx from those threads which has - // this ctx as current, will result in the error HIP_ERROR_CONTEXT_IS_DESTROYED. - } - - return ihipLogStatus(e); -} - -hipError_t hipCtxPopCurrent(hipCtx_t* ctx) { - HIP_INIT_API(hipCtxPopCurrent, ctx); - hipError_t e = hipSuccess; - ihipCtx_t* currentCtx = ihipGetTlsDefaultCtx(); - auto deviceHandle = currentCtx->getDevice(); - *ctx = currentCtx; - - if (!tls->ctxStack.empty()) { - tls->ctxStack.pop(); - } - - if (!tls->ctxStack.empty()) { - currentCtx = tls->ctxStack.top(); - } else { - currentCtx = deviceHandle->_primaryCtx; - } - - ihipSetTlsDefaultCtx(currentCtx); // TOD0 - Shall check for NULL? - return ihipLogStatus(e); -} - -hipError_t hipCtxPushCurrent(hipCtx_t ctx) { - HIP_INIT_API(hipCtxPushCurrent, ctx); - hipError_t e = hipSuccess; - if (ctx != NULL) { // TODO- is this check needed? - ihipSetTlsDefaultCtx(ctx); - tls->ctxStack.push(ctx); - tls->getPrimaryCtx = false; - } else { - e = hipErrorInvalidContext; - } - return ihipLogStatus(e); -} - -hipError_t hipCtxGetCurrent(hipCtx_t* ctx) { - HIP_INIT_API(hipCtxGetCurrent, ctx); - hipError_t e = hipSuccess; - if ((tls->getPrimaryCtx) || tls->ctxStack.empty()) { - *ctx = ihipGetTlsDefaultCtx(); - } else { - *ctx = tls->ctxStack.top(); - } - return ihipLogStatus(e); -} - -hipError_t hipCtxSetCurrent(hipCtx_t ctx) { - HIP_INIT_API(hipCtxSetCurrent, ctx); - hipError_t e = hipSuccess; - if (ctx == NULL) { - tls->ctxStack.pop(); - } else { - ihipSetTlsDefaultCtx(ctx); - tls->ctxStack.push(ctx); - tls->getPrimaryCtx = false; - } - return ihipLogStatus(e); -} - -hipError_t hipCtxGetDevice(hipDevice_t* device) { - HIP_INIT_API(hipCtxGetDevice, device); - hipError_t e = hipSuccess; - - ihipCtx_t* ctx = ihipGetTlsDefaultCtx(); - - if (ctx == nullptr) { - e = hipErrorInvalidContext; - // TODO *device = nullptr; - } else { - auto deviceHandle = ctx->getDevice(); - *device = deviceHandle->_deviceId; - } - return ihipLogStatus(e); -} - -hipError_t hipCtxGetApiVersion(hipCtx_t ctx, int* apiVersion) { - HIP_INIT_API(hipCtxGetApiVersion, apiVersion); - - if (apiVersion) { - *apiVersion = 4; - } - - return ihipLogStatus(hipSuccess); -} - -hipError_t hipCtxGetCacheConfig(hipFuncCache_t* cacheConfig) { - HIP_INIT_API(hipCtxGetCacheConfig, cacheConfig); - - *cacheConfig = hipFuncCachePreferNone; - - return ihipLogStatus(hipSuccess); -} - -hipError_t hipCtxSetCacheConfig(hipFuncCache_t cacheConfig) { - HIP_INIT_API(hipCtxSetCacheConfig, cacheConfig); - - // Nop, AMD does not support variable cache configs. - - return ihipLogStatus(hipSuccess); -} - -hipError_t hipCtxSetSharedMemConfig(hipSharedMemConfig config) { - HIP_INIT_API(hipCtxSetSharedMemConfig, config); - - // Nop, AMD does not support variable shared mem configs. - - return ihipLogStatus(hipSuccess); -} - -hipError_t hipCtxGetSharedMemConfig(hipSharedMemConfig* pConfig) { - HIP_INIT_API(hipCtxGetSharedMemConfig, pConfig); - - *pConfig = hipSharedMemBankSizeFourByte; - - return ihipLogStatus(hipSuccess); -} - -hipError_t hipCtxSynchronize(void) { - HIP_INIT_API(hipCtxSynchronize, 1); - return ihipLogStatus(ihipSynchronize(tls)); // TODO Shall check validity of ctx? -} - -hipError_t hipCtxGetFlags(unsigned int* flags) { - HIP_INIT_API(hipCtxGetFlags, flags); - hipError_t e = hipSuccess; - ihipCtx_t* tempCtx; - tempCtx = ihipGetTlsDefaultCtx(); - *flags = tempCtx->_ctxFlags; - return ihipLogStatus(e); -} - -hipError_t hipDevicePrimaryCtxGetState(hipDevice_t dev, unsigned int* flags, int* active) { - HIP_INIT_API(hipDevicePrimaryCtxGetState, dev, flags, active); - hipError_t e = hipSuccess; - auto deviceHandle = ihipGetDevice(dev); - - if (deviceHandle == NULL) { - e = hipErrorInvalidDevice; - } - - ihipCtx_t* tempCtx; - tempCtx = ihipGetTlsDefaultCtx(); - ihipCtx_t* primaryCtx = deviceHandle->_primaryCtx; - if (tempCtx == primaryCtx) { - *active = 1; - *flags = tempCtx->_ctxFlags; - } else { - *active = 0; - *flags = primaryCtx->_ctxFlags; - } - return ihipLogStatus(e); -} - -hipError_t hipDevicePrimaryCtxRelease(hipDevice_t dev) { - HIP_INIT_API(hipDevicePrimaryCtxRelease, dev); - hipError_t e = hipSuccess; - auto deviceHandle = ihipGetDevice(dev); - - if (deviceHandle == NULL) { - e = hipErrorInvalidDevice; - } - return ihipLogStatus(e); -} - -hipError_t hipDevicePrimaryCtxRetain(hipCtx_t* pctx, hipDevice_t dev) { - HIP_INIT_API(hipDevicePrimaryCtxRetain, pctx, dev); - hipError_t e = hipSuccess; - auto deviceHandle = ihipGetDevice(dev); - - if (deviceHandle == NULL) { - e = hipErrorInvalidDevice; - } - *pctx = deviceHandle->_primaryCtx; - return ihipLogStatus(e); -} - -hipError_t hipDevicePrimaryCtxReset(hipDevice_t dev) { - HIP_INIT_API(hipDevicePrimaryCtxReset, dev); - hipError_t e = hipSuccess; - auto deviceHandle = ihipGetDevice(dev); - - if (deviceHandle == NULL) { - e = hipErrorInvalidDevice; - } - ihipCtx_t* primaryCtx = deviceHandle->_primaryCtx; - primaryCtx->locked_reset(); - return ihipLogStatus(e); -} - -hipError_t hipDevicePrimaryCtxSetFlags(hipDevice_t dev, unsigned int flags) { - HIP_INIT_API(hipDevicePrimaryCtxSetFlags, dev, flags); - hipError_t e = hipSuccess; - auto deviceHandle = ihipGetDevice(dev); - - if (deviceHandle == NULL) { - e = hipErrorInvalidDevice; - } else { - e = hipErrorContextAlreadyInUse; - } - return ihipLogStatus(e); -} diff --git a/src/hip_device.cpp b/src/hip_device.cpp deleted file mode 100644 index f7d6b3ac79..0000000000 --- a/src/hip_device.cpp +++ /dev/null @@ -1,648 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "hip/hip_runtime.h" -#include "hip_hcc_internal.h" -#include "trace_helper.h" -#include "device_util.h" - -//------------------------------------------------------------------------------------------------- -// Devices -//------------------------------------------------------------------------------------------------- -// TODO - does this initialize HIP runtime? -hipError_t hipGetDevice(int* deviceId) { - HIP_INIT_API(hipGetDevice, deviceId); - - hipError_t e = hipSuccess; - if (deviceId == nullptr) - return ihipLogStatus(hipErrorInvalidValue); - - auto ctx = ihipGetTlsDefaultCtx(); - - if (ctx == nullptr) { - e = hipErrorInvalidDevice; // TODO, check error code. - *deviceId = -1; - } else { - *deviceId = ctx->getDevice()->_deviceId; - } - - return ihipLogStatus(e); -} - -// TODO - does this initialize HIP runtime? -hipError_t ihipGetDeviceCount(int* count) { - hipError_t e = hipSuccess; - - if (count != nullptr) { - *count = g_deviceCnt; - - if (*count > 0) { - e = hipSuccess; - } else { - e = hipErrorNoDevice; - } - } else { - e = hipErrorInvalidValue; - } - return e; -} - -hipError_t hipGetDeviceCount(int* count) { - HIP_INIT_API(hipGetDeviceCount, count); - return ihipLogStatus(ihipGetDeviceCount(count)); -} - -hipError_t hipDeviceSetCacheConfig(hipFuncCache_t cacheConfig) { - HIP_INIT_API(hipDeviceSetCacheConfig, cacheConfig); - - // Nop, AMD does not support variable cache configs. - - return ihipLogStatus(hipSuccess); -} - -hipError_t hipDeviceGetCacheConfig(hipFuncCache_t* cacheConfig) { - HIP_INIT_API(hipDeviceGetCacheConfig, cacheConfig); - - if (cacheConfig == nullptr) { - return ihipLogStatus(hipErrorInvalidValue); - } - - *cacheConfig = hipFuncCachePreferNone; - - return ihipLogStatus(hipSuccess); -} - -hipError_t hipDeviceGetLimit(size_t* pValue, hipLimit_t limit) { - HIP_INIT_API(hipDeviceGetLimit, pValue, limit); - if (pValue == nullptr) { - return ihipLogStatus(hipErrorInvalidValue); - } -#if __HIP_ENABLE_DEVICE_MALLOC__ - if (limit == hipLimitMallocHeapSize) { - *pValue = (size_t)__HIP_SIZE_OF_HEAP; - return ihipLogStatus(hipSuccess); - } -#endif - return ihipLogStatus(hipErrorUnsupportedLimit); -} - -hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t cacheConfig) { - HIP_INIT_API(hipFuncSetCacheConfig, cacheConfig); - - // Nop, AMD does not support variable cache configs. - - return ihipLogStatus(hipSuccess); -} - -hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config) { - HIP_INIT_API(hipDeviceSetSharedMemConfig, config); - - // Nop, AMD does not support variable shared mem configs. - - return ihipLogStatus(hipSuccess); -} - -hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* pConfig) { - HIP_INIT_API(hipDeviceGetSharedMemConfig, pConfig); - - *pConfig = hipSharedMemBankSizeFourByte; - - return ihipLogStatus(hipSuccess); -} - -hipError_t hipSetDevice(int deviceId) { - HIP_INIT_API(hipSetDevice, deviceId); - if ((deviceId < 0) || (deviceId >= g_deviceCnt)) { - return ihipLogStatus(hipErrorInvalidDevice); - } else { - ihipSetTlsDefaultCtx(ihipGetPrimaryCtx(deviceId)); - tls->getPrimaryCtx = true; - return ihipLogStatus(hipSuccess); - } -} - -hipError_t hipDeviceSynchronize(void) { - HIP_INIT_SPECIAL_API(hipDeviceSynchronize, TRACE_SYNC); - return ihipLogStatus(ihipSynchronize(tls)); -} - -hipError_t hipDeviceReset(void) { - HIP_INIT_API(hipDeviceReset, ); - - auto* ctx = ihipGetTlsDefaultCtx(); - - // TODO-HCC - // This function currently does a user-level cleanup of known resources. - // It could benefit from KFD support to perform a more "nuclear" clean that would include any - // associated kernel resources and page table entries. - -#if 0 - if (ctx) { - // Release ctx resources (streams and memory): - ctx->locked_reset(); - } -#endif - if (ctx) { - ihipDevice_t* deviceHandle = ctx->getWriteableDevice(); - deviceHandle->locked_reset(); - } - - return ihipLogStatus(hipSuccess); -} - - -hipError_t ihipDeviceSetState(TlsData *tls) { - hipError_t e = hipErrorInvalidContext; - auto* ctx = ihipGetTlsDefaultCtx(); - - if (ctx) { - ihipDevice_t* deviceHandle = ctx->getWriteableDevice(); - if (deviceHandle->_state == 0) { - deviceHandle->_state = 1; - } - e = hipSuccess; - } - - return e; -} - - -hipError_t ihipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) { - hipError_t e = hipSuccess; - - if (pi == nullptr) { - return hipErrorInvalidValue; - } - - auto* hipDevice = ihipGetDevice(device); - hipDeviceProp_t* prop = &hipDevice->_props; - if (hipDevice) { - switch (attr) { - case hipDeviceAttributeMaxThreadsPerBlock: - *pi = prop->maxThreadsPerBlock; - break; - case hipDeviceAttributeMaxBlockDimX: - *pi = prop->maxThreadsDim[0]; - break; - case hipDeviceAttributeMaxBlockDimY: - *pi = prop->maxThreadsDim[1]; - break; - case hipDeviceAttributeMaxBlockDimZ: - *pi = prop->maxThreadsDim[2]; - break; - case hipDeviceAttributeMaxGridDimX: - *pi = prop->maxGridSize[0]; - break; - case hipDeviceAttributeMaxGridDimY: - *pi = prop->maxGridSize[1]; - break; - case hipDeviceAttributeMaxGridDimZ: - *pi = prop->maxGridSize[2]; - break; - case hipDeviceAttributeMaxSharedMemoryPerBlock: - *pi = prop->sharedMemPerBlock; - break; - case hipDeviceAttributeTotalConstantMemory: - *pi = prop->totalConstMem; - break; - case hipDeviceAttributeWarpSize: - *pi = prop->warpSize; - break; - case hipDeviceAttributeMaxRegistersPerBlock: - *pi = prop->regsPerBlock; - break; - case hipDeviceAttributeClockRate: - *pi = prop->clockRate; - break; - case hipDeviceAttributeMemoryClockRate: - *pi = prop->memoryClockRate; - break; - case hipDeviceAttributeMemoryBusWidth: - *pi = prop->memoryBusWidth; - break; - case hipDeviceAttributeMultiprocessorCount: - *pi = prop->multiProcessorCount; - break; - case hipDeviceAttributeComputeMode: - *pi = prop->computeMode; - break; - case hipDeviceAttributeL2CacheSize: - *pi = prop->l2CacheSize; - break; - case hipDeviceAttributeMaxThreadsPerMultiProcessor: - *pi = prop->maxThreadsPerMultiProcessor; - break; - case hipDeviceAttributeComputeCapabilityMajor: - *pi = prop->major; - break; - case hipDeviceAttributeComputeCapabilityMinor: - *pi = prop->minor; - break; - case hipDeviceAttributePciBusId: - *pi = prop->pciBusID; - break; - case hipDeviceAttributeConcurrentKernels: - *pi = prop->concurrentKernels; - break; - case hipDeviceAttributePciDeviceId: - *pi = prop->pciDeviceID; - break; - case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor: - *pi = prop->maxSharedMemoryPerMultiProcessor; - break; - case hipDeviceAttributeIsMultiGpuBoard: - *pi = prop->isMultiGpuBoard; - break; - case hipDeviceAttributeIntegrated: - *pi = prop->integrated; - break; - case hipDeviceAttributeMaxTexture1DWidth: - *pi = prop->maxTexture1D; - break; - case hipDeviceAttributeMaxTexture2DWidth: - *pi = prop->maxTexture2D[0]; - break; - case hipDeviceAttributeMaxTexture2DHeight: - *pi = prop->maxTexture2D[1]; - break; - case hipDeviceAttributeMaxTexture3DWidth: - *pi = prop->maxTexture3D[0]; - break; - case hipDeviceAttributeMaxTexture3DHeight: - *pi = prop->maxTexture3D[1]; - break; - case hipDeviceAttributeMaxTexture3DDepth: - *pi = prop->maxTexture3D[2]; - break; - case hipDeviceAttributeHdpMemFlushCntl: - { - uint32_t** hdp = reinterpret_cast(pi); - *hdp = prop->hdpMemFlushCntl; - } - break; - case hipDeviceAttributeHdpRegFlushCntl: - { - uint32_t** hdp = reinterpret_cast(pi); - *hdp = prop->hdpRegFlushCntl; - } - break; - case hipDeviceAttributeCooperativeLaunch: - *pi = prop->cooperativeLaunch; - break; - case hipDeviceAttributeCooperativeMultiDeviceLaunch: - *pi = prop->cooperativeMultiDeviceLaunch; - break; - case hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc: - *pi = prop->cooperativeMultiDeviceUnmatchedFunc; - break; - case hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim: - *pi = prop->cooperativeMultiDeviceUnmatchedGridDim; - break; - case hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim: - *pi = prop->cooperativeMultiDeviceUnmatchedBlockDim; - break; - case hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem: - *pi = prop->cooperativeMultiDeviceUnmatchedSharedMem; - break; - case hipDeviceAttributeMaxPitch: - *pi = prop->memPitch; - break; - case hipDeviceAttributeTextureAlignment: - *pi = prop->textureAlignment; - break; - case hipDeviceAttributeTexturePitchAlignment: - *pi = prop->texturePitchAlignment; - break; - case hipDeviceAttributeKernelExecTimeout: - *pi = prop->kernelExecTimeoutEnabled; - break; - case hipDeviceAttributeCanMapHostMemory: - *pi = prop->canMapHostMemory; - break; - case hipDeviceAttributeEccEnabled: - *pi = prop->ECCEnabled; - break; - default: - e = hipErrorInvalidValue; - break; - } - } else { - e = hipErrorInvalidDevice; - } - return e; -} - -hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) { - HIP_INIT_API(hipDeviceGetAttribute, pi, attr, device); - if ((device < 0) || (device >= g_deviceCnt)) { - return ihipLogStatus(hipErrorInvalidDevice); - } - return ihipLogStatus(ihipDeviceGetAttribute(pi, attr, device)); -} - -hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, int device) { - hipError_t e; - - if (props != nullptr) { - auto* hipDevice = ihipGetDevice(device); - if (hipDevice) { - // copy saved props - *props = hipDevice->_props; - e = hipSuccess; - } else { - e = hipErrorInvalidDevice; - } - } else { - e = hipErrorInvalidDevice; - } - - return e; -} - -hipError_t hipGetDeviceProperties(hipDeviceProp_t* props, int device) { - HIP_INIT_API(hipGetDeviceProperties, props, device); - if ((device < 0) || (device >= g_deviceCnt)) { - return ihipLogStatus(hipErrorInvalidDevice); - } - return ihipLogStatus(ihipGetDeviceProperties(props, device)); -} - -hipError_t hipSetDeviceFlags(unsigned int flags) { - HIP_INIT_API(hipSetDeviceFlags, flags); - - hipError_t e = hipSuccess; - - auto* ctx = ihipGetTlsDefaultCtx(); - - // TODO : does this really OR in the flags or replaces previous flags: - // TODO : Review error handling behavior for this function, it often returns - // ErrorSetOnActiveProcess - if (ctx) { - auto* deviceHandle = ctx->getDevice(); - if (deviceHandle->_state == 0) { - ctx->_ctxFlags = ctx->_ctxFlags | flags; - if (flags & hipDeviceScheduleMask) { - switch (hipDeviceScheduleMask) { - case hipDeviceScheduleAuto: - case hipDeviceScheduleSpin: - case hipDeviceScheduleYield: - case hipDeviceScheduleBlockingSync: - e = hipSuccess; - break; - default: - e = hipSuccess; // TODO - should this be error? Map to Auto? - // e = hipErrorInvalidValue; - break; - } - } - - unsigned supportedFlags = - hipDeviceScheduleMask | hipDeviceMapHost | hipDeviceLmemResizeToMax; - - if (flags & (~supportedFlags)) { - e = hipErrorInvalidValue; - } - } else { - e = hipErrorSetOnActiveProcess; - } - } else { - e = hipErrorInvalidDevice; - } - - return ihipLogStatus(e); -}; - -hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device) { - HIP_INIT_API(hipDeviceComputeCapability, major, minor, device); - hipError_t e = hipSuccess; - if ((device < 0) || (device >= g_deviceCnt)) { - e = hipErrorInvalidDevice; - } else { - e = ihipDeviceGetAttribute(major, hipDeviceAttributeComputeCapabilityMajor, device); - e = ihipDeviceGetAttribute(minor, hipDeviceAttributeComputeCapabilityMinor, device); - } - return ihipLogStatus(e); -} - -hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device) { - // Cast to void* here to avoid printing garbage in debug modes. - HIP_INIT_API(hipDeviceGetName, (void*)name, len, device); - hipError_t e = hipSuccess; - if ((device < 0) || (device >= g_deviceCnt)) { - e = hipErrorInvalidDevice; - } else { - auto deviceHandle = ihipGetDevice(device); - int nameLen = strlen(deviceHandle->_props.name); - if (nameLen <= len) memcpy(name, deviceHandle->_props.name, nameLen); - } - return ihipLogStatus(e); -} - -hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, int device) { - // Cast to void* here to avoid printing garbage in debug modes. - HIP_INIT_API(hipDeviceGetPCIBusId, (void*)pciBusId, len, device); - hipError_t e = hipErrorInvalidValue; - if ((device < 0) || (device >= g_deviceCnt)) { - e = hipErrorInvalidDevice; - } else { - if ((pciBusId != nullptr) && (len > 0)) { - auto deviceHandle = ihipGetDevice(device); - int retVal = - snprintf(pciBusId, len, "%04x:%02x:%02x.0", deviceHandle->_props.pciDomainID, - deviceHandle->_props.pciBusID, deviceHandle->_props.pciDeviceID); - if (retVal > 0 && retVal < len) { - e = hipSuccess; - } - } - } - return ihipLogStatus(e); -} - -hipError_t hipDeviceTotalMem(size_t* bytes, hipDevice_t device) { - HIP_INIT_API(hipDeviceTotalMem, bytes, device); - hipError_t e = hipSuccess; - if ((device < 0) || (device >= g_deviceCnt)) { - e = hipErrorInvalidDevice; - } else { - auto deviceHandle = ihipGetDevice(device); - *bytes = deviceHandle->_props.totalGlobalMem; - } - return ihipLogStatus(e); -} - -hipError_t hipDeviceGetByPCIBusId(int* device, const char* pciBusId) { - HIP_INIT_API(hipDeviceGetByPCIBusId, device, pciBusId); - hipDeviceProp_t tempProp; - int deviceCount = 0; - hipError_t e = hipErrorInvalidValue; - if ((device != nullptr) && (pciBusId != nullptr)) { - int pciBusID = -1; - int pciDeviceID = -1; - int pciDomainID = -1; - int len = 0; - len = sscanf(pciBusId, "%04x:%02x:%02x", &pciDomainID, &pciBusID, &pciDeviceID); - if (len == 3) { - ihipGetDeviceCount(&deviceCount); - for (int i = 0; i < deviceCount; i++) { - ihipGetDeviceProperties(&tempProp, i); - if (tempProp.pciBusID == pciBusID) { - *device = i; - e = hipSuccess; - break; - } - } - } - } - return ihipLogStatus(e); -} - -hipError_t hipChooseDevice(int* device, const hipDeviceProp_t* prop) { - HIP_INIT_API(hipChooseDevice, device, prop); - hipDeviceProp_t tempProp; - hipError_t e = hipSuccess; - if ((device == NULL) || (prop == NULL)) { - e = hipErrorInvalidValue; - } - if (e == hipSuccess) { - int deviceCount; - int inPropCount = 0; - int matchedPropCount = 0; - ihipGetDeviceCount(&deviceCount); - *device = 0; - for (int i = 0; i < deviceCount; i++) { - ihipGetDeviceProperties(&tempProp, i); - if (prop->major != 0) { - inPropCount++; - if (tempProp.major >= prop->major) { - matchedPropCount++; - } - if (prop->minor != 0) { - inPropCount++; - if (tempProp.minor >= prop->minor) { - matchedPropCount++; - } - } - } - if (prop->totalGlobalMem != 0) { - inPropCount++; - if (tempProp.totalGlobalMem >= prop->totalGlobalMem) { - matchedPropCount++; - } - } - if (prop->sharedMemPerBlock != 0) { - inPropCount++; - if (tempProp.sharedMemPerBlock >= prop->sharedMemPerBlock) { - matchedPropCount++; - } - } - if (prop->maxThreadsPerBlock != 0) { - inPropCount++; - if (tempProp.maxThreadsPerBlock >= prop->maxThreadsPerBlock) { - matchedPropCount++; - } - } - if (prop->totalConstMem != 0) { - inPropCount++; - if (tempProp.totalConstMem >= prop->totalConstMem) { - matchedPropCount++; - } - } - if (prop->multiProcessorCount != 0) { - inPropCount++; - if (tempProp.multiProcessorCount >= prop->multiProcessorCount) { - matchedPropCount++; - } - } - if (prop->maxThreadsPerMultiProcessor != 0) { - inPropCount++; - if (tempProp.maxThreadsPerMultiProcessor >= prop->maxThreadsPerMultiProcessor) { - matchedPropCount++; - } - } - if (prop->memoryClockRate != 0) { - inPropCount++; - if (tempProp.memoryClockRate >= prop->memoryClockRate) { - matchedPropCount++; - } - } - if (inPropCount == matchedPropCount) { - *device = i; - } -#if 0 - else{ - e= hipErrorInvalidValue; - } -#endif - } - } - return ihipLogStatus(e); -} - -#define HSA_ERROR_CHECK(hsa_error_code) \ - if ((hsa_error_code != HSA_STATUS_SUCCESS) && (hsa_error_code != HSA_STATUS_INFO_BREAK)) { \ - return ihipLogStatus(hipErrorRuntimeOther); \ - } - -hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype, uint32_t* hopcount) { - HIP_INIT_API(hipExtGetLinkTypeAndHopCount, device1, device2, linktype, hopcount); - - if ((device1 < 0) || (device1 >= g_deviceCnt) || (device2 < 0) || (device2 >= g_deviceCnt)) { - return ihipLogStatus(hipErrorInvalidDevice); - } else { - auto device1Handle = ihipGetDevice(device1); - auto device2Handle = ihipGetDevice(device2); - - const auto& find_pool = [](hsa_amd_memory_pool_t pool, void* data) { - bool allowed; - hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &allowed); - if (allowed) { - hsa_amd_segment_t segment; - hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); - if (HSA_AMD_SEGMENT_GLOBAL != segment) return HSA_STATUS_SUCCESS; - - uint32_t flags; - hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags); - if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { - *((hsa_amd_memory_pool_t*)data) = pool; - return HSA_STATUS_INFO_BREAK; - } - } - return HSA_STATUS_SUCCESS; - }; - - hsa_status_t err; - hsa_amd_memory_pool_t pool; - err = hsa_amd_agent_iterate_memory_pools(device2Handle->_hsaAgent, find_pool, (void*)&pool); - HSA_ERROR_CHECK(err); - - hsa_amd_memory_pool_link_info_t link_info; - err = hsa_amd_agent_memory_pool_get_info(device1Handle->_hsaAgent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, &link_info); - HSA_ERROR_CHECK(err); - *linktype = link_info.link_type; - - if (link_info.numa_distance < 30) - *hopcount = 1; - else - *hopcount = 2; - return ihipLogStatus(hipSuccess); - } -} diff --git a/src/hip_error.cpp b/src/hip_error.cpp deleted file mode 100644 index 0f876b4f26..0000000000 --- a/src/hip_error.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "hip/hip_runtime.h" -#include "hip_hcc_internal.h" -#include "trace_helper.h" - -//------------------------------------------------------------------------------------------------- -//------------------------------------------------------------------------------------------------- -// Error Handling -//--- - -hipError_t hipGetLastError() { - HIP_INIT_API(hipGetLastError); - - // Return last error, but then reset the state: - hipError_t e = ihipLogStatus(tls->lastHipError); - tls->lastHipError = hipSuccess; - return e; -} - -hipError_t hipPeekAtLastError() { - HIP_INIT_API(hipPeekAtLastError); - - // peek at last error, but don't reset it. - return ihipLogStatus(tls->lastHipError); -} - -const char* hipGetErrorName(hipError_t hip_error) { - HIP_INIT_API(hipGetErrorName, hip_error); - - return ihipErrorString(hip_error); -} - -const char* hipGetErrorString(hipError_t hip_error) { - HIP_INIT_API(hipGetErrorString, hip_error); - - // TODO - return a message explaining the error. - // TODO - This should be set up to return the same string reported in the the doxygen comments, - // somehow. - return hipGetErrorName(hip_error); -} diff --git a/src/hip_event.cpp b/src/hip_event.cpp deleted file mode 100644 index c626f7956d..0000000000 --- a/src/hip_event.cpp +++ /dev/null @@ -1,483 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "hip/hip_runtime.h" -#include "hip_hcc_internal.h" -#include "trace_helper.h" - -#include // errno, ENOENT -#include // O_RDWR, O_CREATE -#include // shm_open, shm_unlink, mmap, munmap, PROT_READ, PROT_WRITE, MAP_SHARED, MAP_FAILED -#include // ftruncate, close - -namespace { - - inline - const char* hsa_to_string(hsa_status_t err) noexcept - { - const char* r{}; - - if (hsa_status_string(err, &r) == HSA_STATUS_SUCCESS) return r; - - return "Unknown."; - } - - template - inline - void throwing_result_check(hsa_status_t res, const char (&file)[m], - const char (&function)[n], int line) { - if (res == HSA_STATUS_SUCCESS) return; - - throw std::runtime_error{"Failed in file " + (file + - (", in function \"" + (function + - ("\", on line " + std::to_string(line))))) + - ", with error: " + hsa_to_string(res)}; - } - - template - inline - void throwing_retval_check(int good, int retval, const char (&file)[m], - const char (&function)[n], int line) { - if (retval == good) return; - - throw std::runtime_error{"Failed in file " + (file + - (", in function \"" + (function + - ("\", on line " + std::to_string(line))))) + - ", with error: " + strerror(retval)}; - } - - template - inline - void throwing_msg_check(bool bad, const char (&msg)[o], - const char (&file)[m], - const char (&function)[n], int line) { - if (!bad) return; - - throw std::runtime_error{"Failed in file " + (file + - (", in function \"" + (function + - ("\", on line " + std::to_string(line))))) + - ", with error: " + msg}; - } - - template - inline - void throwing_errno_check(bool bad, const char (&file)[m], - const char (&function)[n], int line) { - if (!bad) return; - - throw std::runtime_error{"Failed in file " + (file + - (", in function \"" + (function + - ("\", on line " + std::to_string(line))))) + - ", with error: " + strerror(errno)}; - } - -} // Unnamed namespace. - -//------------------------------------------------------------------------------------------------- -//------------------------------------------------------------------------------------------------- -// Events -//--- - - -ihipEvent_t::ihipEvent_t(unsigned flags) : _criticalData(this) { - _flags = flags; - GET_TLS(); - auto ctx = ihipGetTlsDefaultCtx(); - _deviceId = ctx == nullptr ? -1 : ctx->getDevice()->_deviceId; -}; - - -// Attach to an existing completion future: -void ihipEvent_t::attachToCompletionFuture(const hc::completion_future* cf, hipStream_t stream, - ihipEventType_t eventType) { - LockedAccessor_EventCrit_t crit(_criticalData); - crit->_eventData.marker(*cf); - crit->_eventData._type = eventType; - crit->_eventData._stream = stream; - crit->_eventData._state = hipEventStatusRecording; -} - - -static void createIpcEventShmemIfNeeded(ihipEventData_t &ecd) { - if (!ecd._ipc_name.empty()) return; - - // create random shmem name - char name_template[] = "/tmp/eventXXXXXX"; - int temp_fd = mkstemp(name_template); - throwing_errno_check(-1 == temp_fd, __FILE__, __func__, __LINE__); - - // copy shmem name into event data, reformat to use a single slash - ecd._ipc_name = name_template; - ecd._ipc_name.replace(0, 5, "/hip_"); - - // open shmem - ecd._ipc_fd = shm_open(ecd._ipc_name.c_str(), O_RDWR | O_CREAT, 0777); - throwing_errno_check(ecd._ipc_fd < 0, __FILE__, __func__, __LINE__); - - // size it - throwing_retval_check(0, ftruncate(ecd._ipc_fd, sizeof(ihipIpcEventShmem_t)), __FILE__, __func__, __LINE__); - - // mmap it - ecd._ipc_shmem = (ihipIpcEventShmem_t*)mmap(0, sizeof(ihipIpcEventShmem_t), PROT_READ | PROT_WRITE, MAP_SHARED, ecd._ipc_fd, 0); - throwing_errno_check(NULL == ecd._ipc_shmem, __FILE__, __func__, __LINE__); - - // initialize shared state - ecd._ipc_shmem->owners = 1; - ecd._ipc_shmem->read_index = -1; - ecd._ipc_shmem->write_index = 0; - for (int i=0; i < IPC_SIGNALS_PER_EVENT; i++) { - ecd._ipc_shmem->signal[i] = 0; - } - - // remove temp file - throwing_errno_check(-1 == close(temp_fd), __FILE__, __func__, __LINE__); - throwing_errno_check(-1 == unlink(name_template), __FILE__, __func__, __LINE__); -} - - -static std::pair refreshEventStatus(ihipEventData_t &ecd) { - if (ecd._state == hipEventStatusRecording && ecd.marker().is_ready()) { - if ((ecd._type == hipEventTypeIndependent) || - (ecd._type == hipEventTypeStopCommand)) { - ecd._timestamp = ecd.marker().get_end_tick(); - } else if (ecd._type == hipEventTypeStartCommand) { - ecd._timestamp = ecd.marker().get_begin_tick(); - } else { - ecd._timestamp = 0; - assert(0); // TODO - move to debug assert - } - - ecd._state = hipEventStatusComplete; - - return std::pair(ecd._state, - ecd._timestamp); - } - - // Not complete path here: - return std::pair(ecd._state, ecd._timestamp); -} - - -hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags) { - hipError_t e = hipSuccess; - - unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming | - hipEventReleaseToDevice | hipEventReleaseToSystem | - hipEventInterprocess; - const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem); - - const bool illegalFlags = - (flags & ~supportedFlags) || // can't set any unsupported flags. - (flags & releaseFlags) == releaseFlags; // can't set both release flags - - if (event && !illegalFlags) { - *event = new ihipEvent_t(flags); - } else { - e = hipErrorInvalidValue; - } - - return e; -} - -hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned flags) { - HIP_INIT_API(hipEventCreateWithFlags, event, flags); - - return ihipLogStatus(ihipEventCreate(event, flags)); -} - -hipError_t hipEventCreate(hipEvent_t* event) { - HIP_INIT_API(hipEventCreate, event); - - return ihipLogStatus(ihipEventCreate(event, 0)); -} - -hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) { - HIP_INIT_SPECIAL_API(hipEventRecord, TRACE_SYNC, event, stream); - if (!event) return ihipLogStatus(hipErrorInvalidHandle); - stream = ihipSyncAndResolveStream(stream); - LockedAccessor_EventCrit_t eCrit(event->criticalData()); - auto &ecd{eCrit->_eventData}; - if (ecd._state == hipEventStatusUnitialized) return ihipLogStatus(hipErrorInvalidHandle); - if (HIP_SYNC_NULL_STREAM && stream->isDefaultStream()) { - // TODO-HIP_SYNC_NULL_STREAM : can remove this code when HIP_SYNC_NULL_STREAM = 0 - // If default stream , then wait on all queues. - ihipCtx_t* ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true, true); - ecd.marker(hc::completion_future()); // reset event - ecd._stream = stream; - ecd._timestamp = hc::get_system_ticks(); - ecd._state = hipEventStatusComplete; - // TODO handle IPC case? - } - else { - // Record the event in the stream: - ecd.marker(stream->locked_recordEvent(event)); - ecd._stream = stream; - ecd._timestamp = 0; - ecd._state = hipEventStatusRecording; - if (event->_flags & hipEventInterprocess) { - createIpcEventShmemIfNeeded(ecd); - int write_index = ecd._ipc_shmem->write_index++; // fetch add - int offset = write_index % IPC_SIGNALS_PER_EVENT; - // While event still valid and still locked, spin. - while (ecd._ipc_shmem->signal[offset] != 0) { - // TODO backoff - } - // Lock signal. - ecd._ipc_shmem->signal[offset] = 1; - // forward signal state from local signal to IPC signal via host callback - // create callback that can be passed to hsa_amd_signal_async_handler - // this function decrements the IPC signal by 1 to indicate completion - std::atomic *signal = &ecd._ipc_shmem->signal[offset]; - auto t{new std::function{[=]() { - signal->store(0); - }}}; - // register above callback with HSA runtime to be called when local signal - // is decremented from 1 to 0 by CP - auto local_signal = *reinterpret_cast(eCrit->_eventData.marker().get_native_handle()); - hsa_amd_signal_async_handler(local_signal, HSA_SIGNAL_CONDITION_LT, 1, - [](hsa_signal_value_t x, void* p) { - (*static_cast(p))(); - delete static_cast(p); - return false; - }, t); - // Update read index to indicate new signal. - int expected = write_index-1; - while (!ecd._ipc_shmem->read_index.compare_exchange_weak(expected, write_index)) { - throwing_msg_check( - expected >= write_index, - "IPC event record update read index failure", - __FILE__, __func__, __LINE__); - expected = write_index-1; - } - } - } - return ihipLogStatus(hipSuccess); -} - - -hipError_t hipEventDestroy(hipEvent_t event) { - HIP_INIT_API(hipEventDestroy, event); - - if (event) { - { - LockedAccessor_EventCrit_t crit(event->criticalData()); - auto &ecd{crit->_eventData}; - if (ecd._ipc_shmem) { - int owners = --ecd._ipc_shmem->owners; - throwing_errno_check(-1 == munmap(ecd._ipc_shmem, sizeof(ihipIpcEventShmem_t)), __FILE__, __func__, __LINE__); - throwing_errno_check(-1 == close(ecd._ipc_fd), __FILE__, __func__, __LINE__); - if (0 == owners) - throwing_errno_check(-1 == shm_unlink(ecd._ipc_name.c_str()), __FILE__, __func__, __LINE__); - } - } - delete event; - return ihipLogStatus(hipSuccess); - } else { - return ihipLogStatus(hipErrorInvalidHandle); - } -} - -hipError_t hipEventSynchronize(hipEvent_t event) { - HIP_INIT_SPECIAL_API(hipEventSynchronize, TRACE_SYNC, event); - - if (!event) return ihipLogStatus(hipErrorInvalidHandle); - - if (!(event->_flags & hipEventReleaseToSystem)) { - tprintf(DB_WARN, - "hipEventSynchronize on event without system-scope fence ; consider creating with " - "hipEventReleaseToSystem\n"); - } - - auto ecd = event->locked_copyCrit(); - - if (event->_flags & hipEventInterprocess) { - // this is an IPC event - int previous_read_index = ecd._ipc_shmem->read_index; - if (previous_read_index >= 0) { - // we have at least one recorded event, so proceed - int offset = previous_read_index % IPC_SIGNALS_PER_EVENT; - // While event still valid and still locked, spin. - while (ecd._ipc_shmem->read_index < previous_read_index+IPC_SIGNALS_PER_EVENT && ecd._ipc_shmem->signal[offset] != 0) { - // TODO backoff - } - } - return ihipLogStatus(hipSuccess); - } - - if (ecd._state == hipEventStatusUnitialized) { - return ihipLogStatus(hipErrorInvalidHandle); - } else if (ecd._state == hipEventStatusCreated) { - // Created but not actually recorded on any device: - return ihipLogStatus(hipSuccess); - } else if (HIP_SYNC_NULL_STREAM && (ecd._stream->isDefaultStream())) { - auto* ctx = ihipGetTlsDefaultCtx(); - // TODO-HIP_SYNC_NULL_STREAM - can remove this code - ctx->locked_syncDefaultStream(true, true); - return ihipLogStatus(hipSuccess); - } else { - ecd.marker().wait((event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked - : hc::hcWaitModeActive); - return ihipLogStatus(hipSuccess); - } -} - -hipError_t hipEventElapsedTime(float* ms, hipEvent_t start, hipEvent_t stop) { - HIP_INIT_API(hipEventElapsedTime, ms, start, stop); - - if (ms == nullptr) return ihipLogStatus(hipErrorInvalidValue); - if ((start == nullptr) || (stop == nullptr) || - (start->_deviceId != stop->_deviceId)) - return ihipLogStatus(hipErrorInvalidHandle); - - *ms = 0.0f; - auto startEcd = start->locked_copyCrit(); - auto stopEcd = stop->locked_copyCrit(); - - if ((start->_flags & hipEventDisableTiming) || - (startEcd._state == hipEventStatusUnitialized) || - (startEcd._state == hipEventStatusCreated) || - (stop->_flags & hipEventDisableTiming) || - (stopEcd._state == hipEventStatusUnitialized) || - (stopEcd._state == hipEventStatusCreated)) { - // Both events must be at least recorded else return hipErrorInvalidHandle - return ihipLogStatus(hipErrorInvalidHandle); - } - - // Refresh status, if still recording... - - auto startStatus = refreshEventStatus(startEcd); // pair < state, timestamp > - auto stopStatus = refreshEventStatus(stopEcd); // pair < state, timestamp > - - if ((startStatus.first == hipEventStatusComplete) && - (stopStatus.first == hipEventStatusComplete)) { - // Common case, we have good information for both events. 'second' is the timestamp: - int64_t tickDiff = (stopStatus.second - startStatus.second); - uint64_t freqHz; - hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz); - if (freqHz) { - *ms = ((double)(tickDiff) / (double)(freqHz)) * 1000.0f; - return ihipLogStatus(hipSuccess); - } else { - *ms = 0.0f; - return ihipLogStatus(hipErrorInvalidValue); - } - } else if ((startStatus.first == hipEventStatusRecording) || - (stopStatus.first == hipEventStatusRecording)) { - return ihipLogStatus(hipErrorNotReady); - } else { - assert(0); // TODO should we return hipErrorUnknown ? - } - - return ihipLogStatus(hipSuccess); -} - -hipError_t hipEventQuery(hipEvent_t event) { - HIP_INIT_SPECIAL_API(hipEventQuery, TRACE_QUERY, event); - - if (!event) return ihipLogStatus(hipErrorInvalidHandle); - - if (!(event->_flags & hipEventReleaseToSystem)) { - tprintf(DB_WARN, - "hipEventQuery on event without system-scope fence ; consider creating with " - "hipEventReleaseToSystem\n"); - } - - auto ecd = event->locked_copyCrit(); - - // this event is either from an ipc handle, or the owner of a local ipc event - if (event->_flags & hipEventInterprocess) { - if (ecd._ipc_shmem) { - int previous_read_index = ecd._ipc_shmem->read_index; - int offset = previous_read_index % IPC_SIGNALS_PER_EVENT; - if (ecd._ipc_shmem->read_index < previous_read_index+IPC_SIGNALS_PER_EVENT && ecd._ipc_shmem->signal[offset] != 0) { - return ihipLogStatus(hipErrorNotReady); - } - else { - return ihipLogStatus(hipSuccess); - } - } - } - // normal event - else { - if (ecd._state == hipEventStatusRecording && !ecd.marker().is_ready()) { - return ihipLogStatus(hipErrorNotReady); - } - } - - return ihipLogStatus(hipSuccess); -} - -hipError_t hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event) -{ - HIP_INIT_API(hipIpcGetEventHandle, handle, event); - -#if USE_IPC && ATOMIC_INT_LOCK_FREE == 2 - if (!handle) return ihipLogStatus(hipErrorInvalidHandle); - if (!event) return ihipLogStatus(hipErrorInvalidHandle); - if (!(event->_flags & hipEventInterprocess)) return ihipLogStatus(hipErrorInvalidHandle); - if (!(event->_flags & hipEventDisableTiming)) return ihipLogStatus(hipErrorInvalidHandle); - - LockedAccessor_EventCrit_t crit(event->criticalData()); - - auto &ecd{crit->_eventData}; - createIpcEventShmemIfNeeded(ecd); - // copy name into handle - ihipIpcEventHandle_t* iHandle = (ihipIpcEventHandle_t*)handle; - memset(iHandle->shmem_name, 0, HIP_IPC_HANDLE_SIZE); - ecd._ipc_name.copy(iHandle->shmem_name, std::string::npos); - - return ihipLogStatus(hipSuccess); -#else - return ihipLogStatus(hipErrorNotSupported); -#endif -} - -hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle) -{ - HIP_INIT_API(hipIpcOpenEventHandle, event, &handle); - -#if USE_IPC && ATOMIC_INT_LOCK_FREE == 2 - if (!event) return ihipLogStatus(hipErrorInvalidHandle); - - // create a new event with timing disabled, per spec - auto hip_status = ihipEventCreate(event, hipEventDisableTiming | hipEventInterprocess); - if (hip_status != hipSuccess) return ihipLogStatus(hip_status); - - LockedAccessor_EventCrit_t crit((*event)->criticalData()); - auto &ecd{crit->_eventData}; - ihipIpcEventHandle_t* iHandle = (ihipIpcEventHandle_t*)&handle; - ecd._ipc_name = iHandle->shmem_name; - // open shmem - ecd._ipc_fd = shm_open(ecd._ipc_name.c_str(), O_RDWR, 0777); - throwing_errno_check(ecd._ipc_fd < 0, __FILE__, __func__, __LINE__); - // mmap it - ecd._ipc_shmem = (ihipIpcEventShmem_t*)mmap(0, sizeof(ihipIpcEventShmem_t), PROT_READ | PROT_WRITE, MAP_SHARED, ecd._ipc_fd, 0); - throwing_errno_check(NULL == ecd._ipc_shmem, __FILE__, __func__, __LINE__); - // update shared state - ecd._ipc_shmem->owners += 1; - - return ihipLogStatus(hipSuccess); -#else - return ihipLogStatus(hipErrorNotSupported); -#endif -} diff --git a/src/hip_fatbin.cpp b/src/hip_fatbin.cpp deleted file mode 100644 index 5aab0b7101..0000000000 --- a/src/hip_fatbin.cpp +++ /dev/null @@ -1,91 +0,0 @@ -/* -Copyright (c) 2018 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ -#include -#include - -#include "hip_fatbin.h" -#include "hip/hip_runtime.h" -#include "hip_hcc_internal.h" -#include "trace_helper.h" - -void __hipDumpCodeObject(const std::string& image) { - char fname[30]; - static std::atomic index; - sprintf(fname, "__hip_dump_code_object%04d.o", index++); - tprintf(DB_FB, "Dump code object %s\n", fname); - std::ofstream ofs; - ofs.open(fname, std::ios::binary); - ofs << image; - ofs.close(); -} - -// Returns a pointer to the code object in the fatbin. The pointer should not -// be freed. -const void* __hipExtractCodeObjectFromFatBinary(const void* data, - const char* agent_name) -{ - hip_impl::hip_init(); - - tprintf(DB_FB, "Enter __hipExtractCodeObjectFromFatBinary(%p, \"%s\")\n", - data, agent_name); - - const __ClangOffloadBundleHeader* header - = reinterpret_cast(data); - std::string magic(reinterpret_cast(header), - sizeof(CLANG_OFFLOAD_BUNDLER_MAGIC) - 1); - if (magic.compare(CLANG_OFFLOAD_BUNDLER_MAGIC)) { - return nullptr; - } - - const __ClangOffloadBundleDesc* desc = &header->desc[0]; - for (uint64_t i = 0; i < header->numBundles; ++i, - desc = reinterpret_cast( - reinterpret_cast(&desc->triple[0]) + desc->tripleSize)) { - - std::string triple{&desc->triple[0], sizeof(AMDGCN_AMDHSA_TRIPLE) - 1}; - if (triple.compare(AMDGCN_AMDHSA_TRIPLE)) - continue; - - std::string target{&desc->triple[sizeof(AMDGCN_AMDHSA_TRIPLE)], - desc->tripleSize - sizeof(AMDGCN_AMDHSA_TRIPLE)}; - tprintf(DB_FB, "Found hip-clang bundle for %s\n", target.c_str()); - if (target.compare(agent_name)) { - continue; - } - - auto *codeobj = reinterpret_cast( - reinterpret_cast(header) + desc->offset); - if (HIP_DUMP_CODE_OBJECT) - __hipDumpCodeObject(std::string{codeobj, desc->size}); - - tprintf(DB_FB, "__hipExtractCodeObjectFromFatBinary succeeds and returns %p\n", - codeobj); - return codeobj; - } - - // hipcc --genco for HCC generates fat binaries with different triple strings. - // It will reach here and return a null pointer. The fat binary itself will - // be handled in a different place. - tprintf(DB_FB, "No hip-clang device code bundle for %s\n", agent_name); - return nullptr; -} - diff --git a/src/hip_fatbin.h b/src/hip_fatbin.h deleted file mode 100644 index 7b4a063c68..0000000000 --- a/src/hip_fatbin.h +++ /dev/null @@ -1,58 +0,0 @@ -/* -Copyright (c) 2018 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ -#ifndef HIP_SRC_HIP_FATBIN_H -#define HIP_SRC_HIP_FATBIN_H - -#include "hip/hip_runtime.h" -#include "hip_hcc_internal.h" - -// hip-clang fatbin format -constexpr unsigned __hipFatMAGIC2 = 0x48495046; // "HIPF" - -#define CLANG_OFFLOAD_BUNDLER_MAGIC "__CLANG_OFFLOAD_BUNDLE__" -#define AMDGCN_AMDHSA_TRIPLE "hip-amdgcn-amd-amdhsa" - -struct __ClangOffloadBundleDesc { - uint64_t offset; - uint64_t size; - uint64_t tripleSize; - const char triple[1]; -}; - -struct __ClangOffloadBundleHeader { - const char magic[sizeof(CLANG_OFFLOAD_BUNDLER_MAGIC) - 1]; - uint64_t numBundles; - __ClangOffloadBundleDesc desc[1]; -}; - -struct __CudaFatBinaryWrapper { - unsigned int magic; - unsigned int version; - __ClangOffloadBundleHeader* binary; - void* unused; -}; - -const void* __hipExtractCodeObjectFromFatBinary(const void* data, - const char* agent_name); -void __hipDumpCodeObject(const std::string& image); - -#endif // HIP_SRC_HIP_FATBIN_H diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp deleted file mode 100644 index 4b8b2ae9fd..0000000000 --- a/src/hip_hcc.cpp +++ /dev/null @@ -1,2548 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/** - * @file hip_hcc.cpp - * - * Contains definitions for functions that are large enough that we don't want to inline them - * everywhere. This file is compiled and linked into apps running HIP / HCC path. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include "hsa/hsa_ext_amd.h" -#include "hsa/hsa_ext_image.h" -#include "hip/hip_runtime.h" -#include "hip_hcc_internal.h" -#include "hip/hip_ext.h" -#include "trace_helper.h" -#include "env.h" - -// TODO - create a stream-based debug interface as an additional option for tprintf -#define DB_PEER_CTX 0 - - -//================================================================================================= -// Global variables: -//================================================================================================= -const int release = 1; - -const char* API_COLOR = KGRN; -const char* API_COLOR_END = KNRM; - -int HIP_LAUNCH_BLOCKING = 0; -std::string HIP_LAUNCH_BLOCKING_KERNELS; -std::vector g_hipLaunchBlockingKernels; -int HIP_API_BLOCKING = 0; - - -int HIP_PRINT_ENV = 0; -int HIP_TRACE_API = 0; -std::string HIP_TRACE_API_COLOR("green"); - -// TODO - DB_START/STOP need more testing. -std::string HIP_DB_START_API; -std::string HIP_DB_STOP_API; -int HIP_DB = 0; -int HIP_VISIBLE_DEVICES = 0; -int HIP_WAIT_MODE = 0; - -int HIP_FORCE_P2P_HOST = 0; -int HIP_FAIL_SOC = 0; -int HIP_DENY_PEER_ACCESS = 0; - -int HIP_HIDDEN_FREE_MEM = 256; -// Force async copies to actually use the synchronous copy interface. -int HIP_FORCE_SYNC_COPY = 0; - -// TODO - set these to 0 and 1 -int HIP_EVENT_SYS_RELEASE = 0; -int HIP_HOST_COHERENT = 1; - -int HIP_SYNC_HOST_ALLOC = 1; -int HIP_SYNC_FREE = 0; - - -int HIP_INIT_ALLOC = -1; -int HIP_SYNC_STREAM_WAIT = 0; -int HIP_FORCE_NULL_STREAM = 0; - -int HIP_DUMP_CODE_OBJECT = 0; - - -#if (__hcc_workweek__ >= 17300) -// Make sure we have required bug fix in HCC -// Perform resolution on the GPU: -// Chicken bit to sync on host to implement null stream. -// If 0, null stream synchronization is performed on the GPU -int HIP_SYNC_NULL_STREAM = 0; -#else -int HIP_SYNC_NULL_STREAM = 1; -#endif - -// HIP needs to change some behavior based on HCC_OPT_FLUSH : -#if (__hcc_workweek__ >= 17296) -int HCC_OPT_FLUSH = 1; -#else -#warning "HIP disabled HCC_OPT_FLUSH since HCC version does not yet support" -int HCC_OPT_FLUSH = 0; -#endif - -// Array of pointers to devices. -ihipDevice_t** g_deviceArray; - - -bool g_visible_device = false; -unsigned g_deviceCnt; -std::vector g_hip_visible_devices; -hsa_agent_t g_cpu_agent; -hsa_agent_t* g_allAgents; // CPU agents + all the visible GPU agents. -unsigned g_numLogicalThreads; -bool g_initDeviceFound = false; - -std::atomic g_lastShortTid(1); - -// Indexed by short-tid: -// -std::vector g_dbStartTriggers; -std::vector g_dbStopTriggers; - -//================================================================================================= -// Top-level "free" functions: -//================================================================================================= -uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::string& apiStr) { - auto apiSeqNum = tls->tidInfo.apiSeqNum(); - auto tid = tls->tidInfo.tid(); - - if ((tid < g_dbStartTriggers.size()) && (apiSeqNum >= g_dbStartTriggers[tid].nextTrigger())) { - printf("info: resume profiling at %lu\n", apiSeqNum); - g_dbStartTriggers.pop_back(); - }; - if ((tid < g_dbStopTriggers.size()) && (apiSeqNum >= g_dbStopTriggers[tid].nextTrigger())) { - printf("info: stop profiling at %lu\n", apiSeqNum); - g_dbStopTriggers.pop_back(); - }; - - fullStr->reserve(16 + apiStr.length()); - *fullStr = std::to_string(tid) + "."; - *fullStr += std::to_string(apiSeqNum); - *fullStr += " "; - *fullStr += apiStr; - - uint64_t apiStartTick = getTicks(); - - - if (COMPILE_HIP_DB && HIP_TRACE_API) { - fprintf(stderr, "%s<tidInfo.pid(), fullStr->c_str(), apiStartTick, - API_COLOR_END); - } - - return apiStartTick; -} - - -static inline bool ihipIsValidDevice(unsigned deviceIndex) { - // deviceIndex is unsigned so always > 0 - return (deviceIndex < g_deviceCnt); -} - - -ihipDevice_t* ihipGetDevice(int deviceIndex) { - if (ihipIsValidDevice(deviceIndex)) { - return g_deviceArray[deviceIndex]; - } else { - return NULL; - } -} - -ihipCtx_t* ihipGetPrimaryCtx(unsigned deviceIndex) { - ihipDevice_t* device = ihipGetDevice(deviceIndex); - return device ? device->getPrimaryCtx() : NULL; -}; - -hipError_t ihipSynchronize(TlsData *tls) { - ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits - // for all activity to finish. - - return (hipSuccess); -} - -TlsData* tls_get_ptr() { - static thread_local TlsData data; - return &data; -} - -//================================================================================================= -// ihipStream_t: -//================================================================================================= -TidInfo::TidInfo() : _apiSeqNum(0) { - _shortTid = g_lastShortTid.fetch_add(1); - _pid = getpid(); - - if (COMPILE_HIP_DB && HIP_TRACE_API) { - std::stringstream tid_ss; - std::stringstream tid_ss_num; - tid_ss_num << std::this_thread::get_id(); - tid_ss << std::hex << std::stoull(tid_ss_num.str()); - - // cannot use tprintf here since it will recurse back into TlsData constructor -#if COMPILE_HIP_DB - if (HIP_DB & (1 << DB_API)) { - char msgStr[1000]; - snprintf(msgStr, sizeof(msgStr), - "HIP initialized short_tid#%d (maps to full_tid: 0x%s)\n", - tid(), tid_ss.str().c_str()); - fprintf(stderr, " %ship-%s pid:%d tid:%d:%s%s", dbName[DB_API]._color, - dbName[DB_API]._shortName, pid(), tid(), msgStr, KNRM); - } -#endif - }; -} - -//================================================================================================= -// ihipStream_t: -//================================================================================================= -//--- -ihipStream_t::ihipStream_t(ihipCtx_t* ctx, hc::accelerator_view av, unsigned int flags) - : _id(0), // will be set by add function. - _flags(flags), - _ctx(ctx), - _criticalData(this, av) { - unsigned schedBits = ctx->_ctxFlags & hipDeviceScheduleMask; - - switch (schedBits) { - case hipDeviceScheduleAuto: - _scheduleMode = Auto; - break; - case hipDeviceScheduleSpin: - _scheduleMode = Spin; - break; - case hipDeviceScheduleYield: - _scheduleMode = Yield; - break; - case hipDeviceScheduleBlockingSync: - _scheduleMode = Yield; - break; - default: - _scheduleMode = Auto; - }; -}; - - -//--- -ihipStream_t::~ihipStream_t() { - GET_TLS(); - for (auto mem : coopMemsTracker) { - hip_internal::ihipHostFree(tls, mem->mgs); - hip_internal::ihipHostFree(tls, mem); - } -} - - -hc::hcWaitMode ihipStream_t::waitMode() const { - hc::hcWaitMode waitMode = hc::hcWaitModeActive; - - if (_scheduleMode == Auto) { - if (g_deviceCnt > g_numLogicalThreads) { - waitMode = hc::hcWaitModeActive; - } else { - waitMode = hc::hcWaitModeBlocked; - } - } else if (_scheduleMode == Spin) { - waitMode = hc::hcWaitModeActive; - } else if (_scheduleMode == Yield) { - waitMode = hc::hcWaitModeBlocked; - } else { - assert(0); // bad wait mode. - } - - if (HIP_WAIT_MODE == 1) { - waitMode = hc::hcWaitModeBlocked; - } else if (HIP_WAIT_MODE == 2) { - waitMode = hc::hcWaitModeActive; - } - - return waitMode; -} - -// Wait for all kernel and data copy commands in this stream to complete. -// This signature should be used in routines that already have locked the stream mutex -void ihipStream_t::wait(LockedAccessor_StreamCrit_t& crit) { - tprintf(DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str()); - - crit->_av.wait(waitMode()); -} - -//--- -// Wait for all kernel and data copy commands in this stream to complete. -inline void ihipStream_t::locked_wait(bool& waited) { - // create a marker while holding stream lock, - // but release lock prior to waiting on the marker - hc::completion_future marker; - { - LockedAccessor_StreamCrit_t crit(_criticalData); - // skipping marker since stream is empty - if (crit->_av.get_is_empty()) { - waited = false; - return; - } - marker = crit->_av.create_marker(hc::no_scope); - } - - marker.wait(waitMode()); - waited = true; - return; -}; - -void ihipStream_t::locked_wait() { - bool waited; - locked_wait(waited); -}; - -typedef struct { - int previous_read_index; - ihipIpcEventShmem_t *shmem; - hsa_signal_t signal; -} callback_data_t; - -static void WaitThenDecrementSignal(callback_data_t *data) { - int offset = data->previous_read_index % IPC_SIGNALS_PER_EVENT; - // While event valid and locked, spin. - while (data->shmem->read_index < data->previous_read_index+IPC_SIGNALS_PER_EVENT && data->shmem->signal[offset] != 0) { - } - hsa_signal_store_relaxed(data->signal, 0); - delete data; -} - -// Causes current stream to wait for specified event to complete: -// Note this does not provide any kind of host serialization. -void ihipStream_t::locked_streamWaitEvent(ihipEventData_t& ecd) { - LockedAccessor_StreamCrit_t crit(_criticalData); - - // if event is an IPC event, it doesn't have a marker - // we use a host callback to block stream with a signal wait - if (ecd._ipc_shmem) { - // create first marker - auto cf = crit->_av.create_marker(hc::no_scope); - // get its signal - auto signal = *reinterpret_cast(cf.get_native_handle()); - // increment its signal value - hsa_signal_add_relaxed(signal, 1); - - // create callback that can be passed to hsa_amd_signal_async_handler - // this function will host wait on IPC signal, then sets first packet's signal to 0 to indicate completion - auto t{new callback_data_t{ecd._ipc_shmem->read_index, ecd._ipc_shmem, signal}}; - - // register above callback with HSA runtime to be called when first packet's signal - // is decremented from 2 to 1 by CP (or it is already at 1) - // the HSA async handler is single threaded, so we can't block, therefore use a detached thread - hsa_amd_signal_async_handler(signal, HSA_SIGNAL_CONDITION_EQ, 1, - [](hsa_signal_value_t x, void* p) { - std::thread(WaitThenDecrementSignal, static_cast(p)).detach(); - return false; - }, t); - - // create additional marker that blocks on the first one - crit->_av.create_blocking_marker(cf, hc::accelerator_scope); - } - else { - crit->_av.create_blocking_marker(ecd.marker(), hc::accelerator_scope); - } -} - - -// Create a marker in this stream. -// Save state in the event so it can track the status of the event. -hc::completion_future ihipStream_t::locked_recordEvent(hipEvent_t event) { - auto scopeFlag = hc::accelerator_scope; - // The env var HIP_EVENT_SYS_RELEASE sets the default, - // The explicit flags override the env var (if specified) - if (event->_flags & hipEventReleaseToSystem) { - scopeFlag = hc::system_scope; - } else if (event->_flags & hipEventReleaseToDevice) { - scopeFlag = hc::accelerator_scope; - } else { - scopeFlag = HIP_EVENT_SYS_RELEASE ? hc::system_scope : hc::accelerator_scope; - } - - // Lock the stream to prevent simultaneous access - LockedAccessor_StreamCrit_t crit(_criticalData); - return crit->_av.create_marker(scopeFlag); -}; - -//============================================================================= - - -//------------------------------------------------------------------------------------------------- - - -//--- -const ihipDevice_t* ihipStream_t::getDevice() const { return _ctx->getDevice(); }; - - -ihipCtx_t* ihipStream_t::getCtx() const { return _ctx; }; - - -//-- -// Lock the stream to prevent other threads from intervening. -LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand() { - LockedAccessor_StreamCrit_t crit(_criticalData, false /*no unlock at destruction*/); - - - return crit; -} - - -//--- -// Must be called after kernel finishes, this releases the lock on the stream so other commands can -// submit. -void ihipStream_t::lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av, bool unlockPostponed) { - bool blockThisKernel = false; - - if (!g_hipLaunchBlockingKernels.empty()) { - std::string kernelNameString(kernelName); - for (auto o = g_hipLaunchBlockingKernels.begin(); o != g_hipLaunchBlockingKernels.end(); - o++) { - if ((*o == kernelNameString)) { - // printf ("force blocking for kernel %s\n", o->c_str()); - blockThisKernel = true; - } - } - } - - if (HIP_LAUNCH_BLOCKING || blockThisKernel) { - // TODO - fix this so it goes through proper stream::wait() call.// direct wait OK since we - // know the stream is locked. - av->wait(hc::hcWaitModeActive); - tprintf(DB_SYNC, "%s LAUNCH_BLOCKING for kernel '%s' completion\n", ToString(this).c_str(), - kernelName); - } - - // if unlockPostponed is true then this stream will be unlocked later (e.g., see hipExtLaunchMultiKernelMultiDevice for a sample call) - if (!unlockPostponed) { - _criticalData.unlock(); // paired with lock from lockopen_preKernelCommand. - } -}; - - -//============================================================================= -// Recompute the peercnt and the packed _peerAgents whenever a peer is added or deleted. -// The packed _peerAgents can efficiently be used on each memory allocation. -template <> -void ihipCtxCriticalBase_t::recomputePeerAgents() { - _peerCnt = 0; - std::for_each(_peers.begin(), _peers.end(), [this](ihipCtx_t* ctx) { - _peerAgents[_peerCnt++] = ctx->getDevice()->_hsaAgent; - }); -} - - -template <> -bool ihipCtxCriticalBase_t::isPeerWatcher(const ihipCtx_t* peer) { - auto match = std::find_if(_peers.begin(), _peers.end(), [=](const ihipCtx_t* d) { - return d->getDeviceNum() == peer->getDeviceNum(); - }); - - return (match != std::end(_peers)); -} - - -template <> -bool ihipCtxCriticalBase_t::addPeerWatcher(const ihipCtx_t* thisCtx, - ihipCtx_t* peerWatcher) { - auto match = std::find(_peers.begin(), _peers.end(), peerWatcher); - if (match == std::end(_peers)) { - // Not already a peer, let's update the list: - tprintf(DB_COPY, "addPeerWatcher. Allocations on %s now visible to peerWatcher %s.\n", - thisCtx->toString().c_str(), peerWatcher->toString().c_str()); - _peers.push_back(peerWatcher); - recomputePeerAgents(); - return true; - } - - // If we get here - peer was already on list, silently ignore. - return false; -} - - -template <> -bool ihipCtxCriticalBase_t::removePeerWatcher(const ihipCtx_t* thisCtx, - ihipCtx_t* peerWatcher) { - auto match = std::find(_peers.begin(), _peers.end(), peerWatcher); - if (match != std::end(_peers)) { - // Found a valid peer, let's remove it. - tprintf( - DB_COPY, - "removePeerWatcher. Allocations on %s no longer visible to former peerWatcher %s.\n", - thisCtx->toString().c_str(), peerWatcher->toString().c_str()); - _peers.remove(peerWatcher); - recomputePeerAgents(); - return true; - } else { - return false; - } -} - - -template <> -void ihipCtxCriticalBase_t::resetPeerWatchers(ihipCtx_t* thisCtx) { - tprintf(DB_COPY, "resetPeerWatchers for context=%s\n", thisCtx->toString().c_str()); - _peers.clear(); - _peerCnt = 0; - addPeerWatcher(thisCtx, thisCtx); // peer-list always contains self agent. -} - - -template <> -void ihipCtxCriticalBase_t::printPeerWatchers(FILE* f) const { - for (auto iter = _peers.begin(); iter != _peers.end(); iter++) { - fprintf(f, "%s ", (*iter)->toString().c_str()); - }; -} - - -template <> -void ihipCtxCriticalBase_t::addStream(ihipStream_t* stream) { - stream->_id = _streams.size(); - _streams.push_back(stream); - tprintf(DB_SYNC, " addStream: %s\n", ToString(stream).c_str()); -} - -template <> -void ihipDeviceCriticalBase_t::addContext(ihipCtx_t* ctx) { - _ctxs.push_back(ctx); - tprintf(DB_SYNC, " addContext: %s\n", ToString(ctx).c_str()); -} - -//============================================================================= - -//================================================================================================= -// ihipDevice_t -//================================================================================================= -ihipDevice_t::ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator& acc) - : _deviceId(deviceId), _acc(acc), _state(0), _criticalData(this) { - hsa_agent_t* agent = static_cast(acc.get_hsa_agent()); - if (agent) { - int err; - err = hsa_agent_get_info( - *agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &_computeUnits); - if (err != HSA_STATUS_SUCCESS) { - _computeUnits = 1; - } - err = hsa_agent_get_info( - *agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_DRIVER_NODE_ID, &_driver_node_id); - if (err != HSA_STATUS_SUCCESS){ - _driver_node_id = 0; - } - - _hsaAgent = *agent; - } else { - _hsaAgent.handle = static_cast(-1); - } - - initProperties(&_props); - - - _primaryCtx = new ihipCtx_t(this, deviceCnt, hipDeviceMapHost); -} - - -ihipDevice_t::~ihipDevice_t() { - delete _primaryCtx; - _primaryCtx = NULL; -} - -void ihipDevice_t::locked_removeContext(ihipCtx_t* c) { - LockedAccessor_DeviceCrit_t crit(_criticalData); - - crit->ctxs().remove(c); - tprintf(DB_SYNC, " locked_removeContext: %s\n", ToString(c).c_str()); -} - - -void ihipDevice_t::locked_reset() { - // Obtain mutex access to the device critical data, release by destructor - LockedAccessor_DeviceCrit_t crit(_criticalData); - - - //--- - // Wait for pending activity to complete? TODO - check if this is required behavior: - tprintf(DB_SYNC, "locked_reset waiting for activity to complete.\n"); - - // Reset and remove streams: - // Delete all created streams including the default one. - for (auto ctxI = crit->const_ctxs().begin(); ctxI != crit->const_ctxs().end(); ctxI++) { - ihipCtx_t* ctx = *ctxI; - (*ctxI)->locked_reset(); - tprintf(DB_SYNC, " ctx cleanup %s\n", ToString(ctx).c_str()); - - delete ctx; - } - // Clear the list. - crit->ctxs().clear(); - - - // reset _primaryCtx - _primaryCtx->locked_reset(); - tprintf(DB_SYNC, " _primaryCtx cleanup %s\n", ToString(_primaryCtx).c_str()); - // Reset and release all memory stored in the tracker: - // Reset will remove peer mapping so don't need to do this explicitly. - // FIXME - This is clearly a non-const action! Is this a context reset or a device reset - - // maybe should reference count? - - _state = 0; - am_memtracker_reset(_acc); - - // FIXME - Calling am_memtracker_reset is really bad since it destroyed all buffers allocated by - // the HCC runtime as well such as the printf buffer. Re-initialze the printf buffer as a - // workaround for now. -#ifdef HC_FEATURE_PRINTF - Kalmar::getContext()->initPrintfBuffer(); -#endif -}; - -#define ErrorCheck(x) error_check(x, __LINE__, __FILE__) - -void error_check(hsa_status_t hsa_error_code, int line_num, std::string str) { - if ((hsa_error_code != HSA_STATUS_SUCCESS) && (hsa_error_code != HSA_STATUS_INFO_BREAK)) { - printf("HSA reported error!\n In file: %s\nAt line: %d\n", str.c_str(), line_num); - } -} - - -//--- -// Helper for initProperties -// Determines if the given agent is of type HSA_DEVICE_TYPE_GPU and counts it. -static hsa_status_t countGpuAgents(hsa_agent_t agent, void* data) { - if (data == NULL) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - hsa_device_type_t device_type; - hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); - if (status != HSA_STATUS_SUCCESS) { - return status; - } - if (device_type == HSA_DEVICE_TYPE_GPU) { - (*static_cast(data))++; - } - return HSA_STATUS_SUCCESS; -} - - -hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data) { - if (data == NULL) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - - hsa_device_type_t hsa_device_type; - hsa_status_t hsa_error_code = - hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &hsa_device_type); - if (hsa_error_code != HSA_STATUS_SUCCESS) { - return hsa_error_code; - } - - if (hsa_device_type == HSA_DEVICE_TYPE_GPU) { - *((hsa_agent_t*)data) = agent; - return HSA_STATUS_INFO_BREAK; - } - - return HSA_STATUS_SUCCESS; -} - -hsa_status_t GetDevicePool(hsa_amd_memory_pool_t pool, void* data) { - if (NULL == data) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - - hsa_status_t err; - hsa_amd_segment_t segment; - uint32_t flag; - - err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); - ErrorCheck(err); - if (HSA_AMD_SEGMENT_GLOBAL != segment) return HSA_STATUS_SUCCESS; - err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); - ErrorCheck(err); - *((hsa_amd_memory_pool_t*)data) = pool; - return HSA_STATUS_SUCCESS; -} - - -int checkAccess(hsa_agent_t agent, hsa_amd_memory_pool_t pool) { - hsa_status_t err; - hsa_amd_memory_pool_access_t access; - err = hsa_amd_agent_memory_pool_get_info(agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, - &access); - ErrorCheck(err); - return access; -} - -hsa_status_t get_pool_info(hsa_amd_memory_pool_t pool, void* data) { - hsa_status_t err; - hipDeviceProp_t* p_prop = reinterpret_cast(data); - uint32_t region_segment; - - // Get pool segment - err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, ®ion_segment); - ErrorCheck(err); - - switch (region_segment) { - case HSA_REGION_SEGMENT_READONLY: - err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, - &(p_prop->totalConstMem)); - break; - case HSA_REGION_SEGMENT_GROUP: - err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, - &(p_prop->maxSharedMemoryPerMultiProcessor)); - break; - default: - break; - } - return err; -} - - -// Determines if the given agent is of type HSA_DEVICE_TYPE_GPU and counts it. -static hsa_status_t findCpuAgent(hsa_agent_t agent, void* data) { - hsa_device_type_t device_type; - hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); - if (status != HSA_STATUS_SUCCESS) { - return status; - } - if (device_type == HSA_DEVICE_TYPE_CPU) { - (*static_cast(data)) = agent; - return HSA_STATUS_INFO_BREAK; - } - - return HSA_STATUS_SUCCESS; -} - - -#define DeviceErrorCheck(x) \ - if (x != HSA_STATUS_SUCCESS) { \ - return hipErrorInvalidDevice; \ - } - -//--- -// Initialize properties for the device. -// Call this once when the ihipDevice_t is created: -hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) { - hipError_t e = hipSuccess; - hsa_status_t err; - - memset(prop, 0, sizeof(hipDeviceProp_t)); - - if (_hsaAgent.handle == -1) { - return hipErrorInvalidDevice; - } - - // Iterates over the agents to determine Multiple GPU devices - // using the countGpuAgents callback. - //! @bug : on HCC, isMultiGpuBoard returns True if system contains multiple GPUS (rather than if - //! GPU is on a multi-ASIC board) - int gpuAgentsCount = 0; - err = hsa_iterate_agents(countGpuAgents, &gpuAgentsCount); - if (err == HSA_STATUS_INFO_BREAK) { - err = HSA_STATUS_SUCCESS; - } - DeviceErrorCheck(err); - prop->isMultiGpuBoard = 0 ? gpuAgentsCount < 2 : 1; - - // Get agent name - - err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, - &(prop->name)); - DeviceErrorCheck(err); - - char archName[256]; - err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_NAME, &archName); - - prop->gcnArch = atoi(archName + 3); - DeviceErrorCheck(err); - - // Get agent node - uint32_t node; - err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_NODE, &node); - DeviceErrorCheck(err); - - // Get wavefront size - err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &prop->warpSize); - DeviceErrorCheck(err); - - // Get max total number of work-items in a workgroup - err = - hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &prop->maxThreadsPerBlock); - DeviceErrorCheck(err); - - // Get max number of work-items of each dimension of a work-group - uint16_t work_group_max_dim[3]; - err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, work_group_max_dim); - DeviceErrorCheck(err); - for (int i = 0; i < 3; i++) { - prop->maxThreadsDim[i] = work_group_max_dim[i]; - } - - hsa_dim3_t grid_max_dim; - err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim); - DeviceErrorCheck(err); - prop->maxGridSize[0] = (int)((grid_max_dim.x == UINT32_MAX) ? (INT32_MAX) : grid_max_dim.x); - prop->maxGridSize[1] = (int)((grid_max_dim.y == UINT32_MAX) ? (INT32_MAX) : grid_max_dim.y); - prop->maxGridSize[2] = (int)((grid_max_dim.z == UINT32_MAX) ? (INT32_MAX) : grid_max_dim.z); - - // Get Max clock frequency - err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, - &prop->clockRate); - prop->clockRate *= 1000.0; // convert Mhz to Khz. - DeviceErrorCheck(err); - - uint64_t counterHz; - err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &counterHz); - DeviceErrorCheck(err); - prop->clockInstructionRate = counterHz / 1000; - - // Get Agent BDFID (bus/device/function ID) - uint16_t bdf_id = 1; - err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &bdf_id); - DeviceErrorCheck(err); - - // BDFID is 16bit uint: [8bit - BusID | 5bit - Device ID | 3bit - FunctionID] - prop->pciDeviceID = (bdf_id >> 3) & 0x1F; - prop->pciBusID = (bdf_id >> 8) & 0xFF; - - err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_DOMAIN, &prop->pciDomainID); - DeviceErrorCheck(err); - - // Masquerade as a 3.0-level device. This will change as more HW functions are properly - // supported. Application code should use the arch.has* to do detailed feature detection. - prop->major = 3; - prop->minor = 0; - - // Get number of Compute Unit - err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, - &(prop->multiProcessorCount)); - DeviceErrorCheck(err); - - // TODO-hsart - this appears to return 0? - uint32_t cache_size[4]; - err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_CACHE_SIZE, cache_size); - DeviceErrorCheck(err); - prop->l2CacheSize = cache_size[1]; - - /* Computemode for HSA Devices is always : cudaComputeModeDefault */ - prop->computeMode = 0; - - prop->isLargeBar = _acc.has_cpu_accessible_am() ? 1 : 0; - - // Get Max Threads Per Multiprocessor - uint32_t max_waves_per_cu; - err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, - &max_waves_per_cu); - DeviceErrorCheck(err); - prop->maxThreadsPerMultiProcessor = prop->warpSize * max_waves_per_cu; - - // Get memory properties - err = hsa_amd_agent_iterate_memory_pools(_hsaAgent, get_pool_info, prop); - if (err == HSA_STATUS_INFO_BREAK) { - err = HSA_STATUS_SUCCESS; - } - DeviceErrorCheck(err); - - // Get the size of the pool we are using for Accelerator Memory allocations: - hsa_region_t* am_region = static_cast(_acc.get_hsa_am_region()); - err = hsa_region_get_info(*am_region, HSA_REGION_INFO_SIZE, &prop->totalGlobalMem); - DeviceErrorCheck(err); - // Current GPUs allow a workgroup to use all of LDS in a CU, so these two are equal. - prop->sharedMemPerBlock = prop->maxSharedMemoryPerMultiProcessor; - - // Get Max memory clock frequency - err = - hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY, - &prop->memoryClockRate); - DeviceErrorCheck(err); - prop->memoryClockRate *= 1000.0; // convert Mhz to Khz. - - // Get global memory bus width in bits - err = hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_BUS_WIDTH, - &prop->memoryBusWidth); - DeviceErrorCheck(err); - - // Set feature flags - these are all mandatory for HIP on HCC path: - // Some features are under-development and future revs may support flags that are currently 0. - // Reporting of these flags should be synchronized with the HIP_ARCH* compile-time defines in - // hip_runtime.h - - prop->arch.hasGlobalInt32Atomics = 1; - prop->arch.hasGlobalFloatAtomicExch = 1; - prop->arch.hasSharedInt32Atomics = 1; - prop->arch.hasSharedFloatAtomicExch = 1; - prop->arch.hasFloatAtomicAdd = 1; // supported with CAS loop, but is supported - prop->arch.hasGlobalInt64Atomics = 1; - prop->arch.hasSharedInt64Atomics = 1; - prop->arch.hasDoubles = 1; - prop->arch.hasWarpVote = 1; - prop->arch.hasWarpBallot = 1; - prop->arch.hasWarpShuffle = 1; - prop->arch.hasFunnelShift = 0; // TODO-hcc - prop->arch.hasThreadFenceSystem = 1; - prop->arch.hasSyncThreadsExt = 0; // TODO-hcc - prop->arch.hasSurfaceFuncs = 0; // TODO-hcc - prop->arch.has3dGrid = 1; - prop->arch.hasDynamicParallelism = 0; - - prop->concurrentKernels = - 1; // All ROCm hardware supports executing multiple kernels concurrently - - prop->canMapHostMemory = 1; // All ROCm devices can map host memory - prop->totalConstMem = 16384; -#if 0 - // TODO - code broken below since it always returns 1. - // Are the flags part of the context or part of the device? - if ( _device_flags | hipDeviceMapHost) { - prop->canMapHostMemory = 1; - } else { - prop->canMapHostMemory = 0; - } -#endif - // Get profile - hsa_profile_t agent_profile; - err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_PROFILE, &agent_profile); - DeviceErrorCheck(err); - if(agent_profile == HSA_PROFILE_FULL) { - prop->integrated = 1; - } - - // Enable the cooperative group for GPUs that support all the required features - err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES, - &prop->cooperativeLaunch); - DeviceErrorCheck(err); - prop->cooperativeMultiDeviceLaunch = prop->cooperativeLaunch; - - prop->cooperativeMultiDeviceUnmatchedFunc = prop->cooperativeMultiDeviceLaunch; - prop->cooperativeMultiDeviceUnmatchedGridDim = prop->cooperativeMultiDeviceLaunch; - prop->cooperativeMultiDeviceUnmatchedBlockDim = prop->cooperativeMultiDeviceLaunch; - prop->cooperativeMultiDeviceUnmatchedSharedMem = prop->cooperativeMultiDeviceLaunch; - - err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS, - &prop->maxTexture1D); - DeviceErrorCheck(err); - - err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS, - prop->maxTexture2D); - DeviceErrorCheck(err); - - err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS, - prop->maxTexture3D); - DeviceErrorCheck(err); - - // Get Agent HDP Flush Register Memory - hsa_amd_hdp_flush_t hdpinfo; - err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_HDP_FLUSH, &hdpinfo); - DeviceErrorCheck(err); - - prop->hdpMemFlushCntl = hdpinfo.HDP_MEM_FLUSH_CNTL; - prop->hdpRegFlushCntl = hdpinfo.HDP_REG_FLUSH_CNTL; - - prop->memPitch = INT_MAX; //Maximum pitch in bytes allowed by memory copies (hardcoded 128 bytes in hipMallocPitch) - prop->textureAlignment = 0; //Alignment requirement for textures - prop->texturePitchAlignment = IMAGE_PITCH_ALIGNMENT; //Alignment requirment for texture pitch - prop->kernelExecTimeoutEnabled = 0; //no run time limit for running kernels on device - - hsa_isa_t isa; - err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AGENT_INFO_ISA, &isa); - DeviceErrorCheck(err); - std::size_t isa_sz = 0u; - hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &isa_sz); - std::string isa_name(isa_sz, '\0'); - hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, &isa_name.front()); - if (isa_name.find("sram-ecc") != std::string::npos) - prop->ECCEnabled = 1; //Device has ECC support Enabled - else - prop->ECCEnabled = 0; //Device has ECC support disabled - - prop->tccDriver = 0; // valid only for nvcc platform - return e; -} - - -//================================================================================================= -// ihipCtx_t -//================================================================================================= -ihipCtx_t::ihipCtx_t(ihipDevice_t* device, unsigned deviceCnt, unsigned flags) - : _ctxFlags(flags), _device(device), _criticalData(this, deviceCnt) { - // locked_reset(); - LockedAccessor_CtxCrit_t crit(_criticalData); - _defaultStream = new ihipStream_t(this, getDevice()->_acc.get_default_view(), hipStreamDefault); - crit->addStream(_defaultStream); - - - // Reset peer list to just me: - crit->resetPeerWatchers(this); - tprintf(DB_SYNC, "created ctx with defaultStream=%p (%s)\n", _defaultStream, - ToString(_defaultStream).c_str()); -}; - - -ihipCtx_t::~ihipCtx_t() { - if (_defaultStream) { - delete _defaultStream; - _defaultStream = NULL; - } -} -// Reset the device - this is called from hipDeviceReset. -// Device may be reset multiple times, and may be reset after init. -void ihipCtx_t::locked_reset() { - // Obtain mutex access to the device critical data, release by destructor - LockedAccessor_CtxCrit_t crit(_criticalData); - - - //--- - // Wait for pending activity to complete? TODO - check if this is required behavior: - tprintf(DB_SYNC, "locked_reset waiting for activity to complete.\n"); - - // Reset and remove streams: - // Delete all created streams including the default one. - for (auto streamI = crit->const_streams().begin(); streamI != crit->const_streams().end(); - streamI++) { - ihipStream_t* stream = *streamI; - (*streamI)->locked_wait(); - tprintf(DB_SYNC, " delete %s\n", ToString(stream).c_str()); - - delete stream; - } - // Clear the list. - crit->streams().clear(); - - - // Create a fresh default stream and add it: - _defaultStream = new ihipStream_t(this, getDevice()->_acc.get_default_view(), hipStreamDefault); - crit->addStream(_defaultStream); - -#if 0 - // Reset peer list to just me: - crit->resetPeerWatchers(this); - - // Reset and release all memory stored in the tracker: - // Reset will remove peer mapping so don't need to do this explicitly. - // FIXME - This is clearly a non-const action! Is this a context reset or a device reset - maybe should reference count? - ihipDevice_t *device = getWriteableDevice(); - device->_state = 0; - am_memtracker_reset(device->_acc); -#endif -}; - - -//--- -std::string ihipCtx_t::toString() const { - std::ostringstream ss; - ss << this; - return ss.str(); -}; - - -//---- - - -//================================================================================================= -// Utility functions, these are not part of the public HIP API -//================================================================================================= - -//================================================================================================= - - -// This called for submissions that are sent to the null/default stream. This routine ensures -// that this new command waits for activity in the other streams to complete before proceeding. -// -// HIP_SYNC_NULL_STREAM=0 does all dependency resolutiokn on the GPU -// HIP_SYNC_NULL_STREAM=1 s legacy non-optimal mode which conservatively waits on host. -// -// If waitOnSelf is set, this additionally waits for the default stream to empty. -// In new HIP_SYNC_NULL_STREAM=0 mode, this enqueues a marker which causes the default stream to -// wait for other activity, but doesn't actually block the host. If host blocking is desired, the -// caller should set syncHost. -// -// syncToHost causes host to wait for the stream to finish. -// Note HIP_SYNC_NULL_STREAM=1 path always sync to Host. -void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf, bool syncHost) { - LockedAccessor_CtxCrit_t crit(_criticalData); - - tprintf(DB_SYNC, "syncDefaultStream \n"); - - // Vector of ops sent to each stream that will complete before ops sent to null stream: - std::vector depOps; - - bool last_stream_waited = false; - for (auto streamI = crit->const_streams().begin(); streamI != crit->const_streams().end(); - streamI++) { - ihipStream_t* stream = *streamI; - - // Don't wait for streams that have "opted-out" of syncing with NULL stream. - // And - don't wait for the NULL stream, unless waitOnSelf specified. - bool waitThisStream = (!(stream->_flags & hipStreamNonBlocking)) && - (waitOnSelf || (stream != _defaultStream)); - - if (HIP_SYNC_NULL_STREAM) { - if (waitThisStream) { - last_stream_waited = false; - stream->locked_wait(last_stream_waited); - } - } else { - if (waitThisStream) { - LockedAccessor_StreamCrit_t streamCrit(stream->criticalData()); - - // The last marker will provide appropriate visibility: - if (!streamCrit->_av.get_is_empty()) { - depOps.push_back(streamCrit->_av.create_marker(hc::accelerator_scope)); - tprintf(DB_SYNC, " push marker to wait for stream=%s\n", - ToString(stream).c_str()); - } else { - tprintf(DB_SYNC, " skipped stream=%s since it is empty\n", - ToString(stream).c_str()); - } - } - } - } - - - // Enqueue a barrier to wait on all the barriers we sent above: - if (!HIP_SYNC_NULL_STREAM && !depOps.empty()) { - LockedAccessor_StreamCrit_t defaultStreamCrit(_defaultStream->criticalData()); - tprintf(DB_SYNC, " null-stream wait on %zu non-empty streams. sync_host=%d\n", - depOps.size(), syncHost); - hc::completion_future defaultCf = defaultStreamCrit->_av.create_blocking_marker( - depOps.begin(), depOps.end(), hc::accelerator_scope); - if (syncHost) { - defaultCf.wait(); // TODO - account for active or blocking here. - } - } - else if ( (HIP_SYNC_NULL_STREAM && !last_stream_waited) || - (!HIP_SYNC_NULL_STREAM && depOps.empty()) ) { - // This catches all the conditions where the printf buffer - // need to be explicitly flushed - if (syncHost) { - Kalmar::getContext()->flushPrintfBuffer(); - } - } - - tprintf(DB_SYNC, " syncDefaultStream depOps=%zu\n", depOps.size()); -} - - -//--- -void ihipCtx_t::locked_removeStream(ihipStream_t* s) { - LockedAccessor_CtxCrit_t crit(_criticalData); - - crit->streams().remove(s); -} - - -//--- -// Heavyweight synchronization that waits on all streams, ignoring hipStreamNonBlocking flag. -void ihipCtx_t::locked_waitAllStreams() { - LockedAccessor_CtxCrit_t crit(_criticalData); - - tprintf(DB_SYNC, "waitAllStream\n"); - bool need_printf_flush = false; - for (auto streamI = crit->const_streams().begin(); streamI != crit->const_streams().end(); - streamI++) { - bool waited; - (*streamI)->locked_wait(waited); - need_printf_flush = !waited; - } - - // When synchronizing with the last stream, if we didn't do an explicit wait, - // then we need to an extra flush to the printf buffer - if (need_printf_flush) { - Kalmar::getContext()->flushPrintfBuffer(); - } -} - - -std::string HIP_DB_string(unsigned db) { - std::string dbStr; - bool first = true; - for (int i = 0; i < DB_MAX_FLAG; i++) { - if (db & (1 << i)) { - if (!first) { - dbStr += "+"; - }; - dbStr += dbName[i]._color; - dbStr += dbName[i]._shortName; - dbStr += KNRM; - first = false; - }; - } - - return dbStr; -} - -// Callback used to process HIP_DB input, supports either -// integer or flag names separated by + -std::string HIP_DB_callback(void* var_ptr, const char* envVarString) { - int* var_ptr_int = static_cast(var_ptr); - - std::string e(envVarString); - trim(&e); - if (!e.empty() && isdigit(e.c_str()[0])) { - long int v = strtol(envVarString, NULL, 0); - *var_ptr_int = (int)(v); - } else { - *var_ptr_int = 0; - std::vector tokens; - tokenize(e, '+', &tokens); - for (auto t = tokens.begin(); t != tokens.end(); t++) { - for (int i = 0; i < DB_MAX_FLAG; i++) { - if (!strcmp(t->c_str(), dbName[i]._shortName)) { - *var_ptr_int |= (1 << i); - } // TODO - else throw error? - } - } - } - - return HIP_DB_string(*var_ptr_int); - ; -} - - -// Callback used to process list of visible devices. -std::string HIP_VISIBLE_DEVICES_callback(void* var_ptr, const char* envVarString) { - // Parse the string stream of env and store the device ids to g_hip_visible_devices global - // variable - std::string str = envVarString; - std::istringstream ss(str); - std::string device_id; - // Clean up the defult value - g_hip_visible_devices.clear(); - g_visible_device = true; - // Read the visible device numbers - while (std::getline(ss, device_id, ',')) { - if (atoi(device_id.c_str()) >= 0) { - g_hip_visible_devices.push_back(atoi(device_id.c_str())); - } else { // Any device number after invalid number will not present - break; - } - } - - std::string valueString; - // Print out the number of ids - for (int i = 0; i < g_hip_visible_devices.size(); i++) { - valueString += std::to_string((g_hip_visible_devices[i])); - valueString += ' '; - } - - return valueString; -} - - -// TODO - change last arg to pointer. -void parseTrigger(std::string triggerString, std::vector& profTriggers) { - std::vector tidApiTokens; - tokenize(std::string(triggerString), ',', &tidApiTokens); - for (auto t = tidApiTokens.begin(); t != tidApiTokens.end(); t++) { - std::vector oneToken; - // std::cout << "token=" << *t << "\n"; - tokenize(std::string(*t), '.', &oneToken); - int tid = 1; - uint64_t apiTrigger = 0; - if (oneToken.size() == 1) { - // the case with just apiNum - apiTrigger = std::strtoull(oneToken[0].c_str(), nullptr, 0); - } else if (oneToken.size() == 2) { - // the case with tid.apiNum - tid = std::strtoul(oneToken[0].c_str(), nullptr, 0); - apiTrigger = std::strtoull(oneToken[1].c_str(), nullptr, 0); - } else { - throw ihipException(hipErrorRuntimeOther); // TODO -> bad env var? - } - - if (tid > 10000) { - throw ihipException(hipErrorRuntimeOther); // TODO -> bad env var? - } else { - profTriggers.resize(tid + 1); - // std::cout << "tid:" << tid << " add: " << apiTrigger << "\n"; - profTriggers[tid].add(apiTrigger); - } - } - - - for (int tid = 1; tid < profTriggers.size(); tid++) { - profTriggers[tid].sort(); - profTriggers[tid].print(tid); - } -} - - -void HipReadEnv() { - /* - * Environment variables - */ - g_hip_visible_devices.push_back(0); /* Set the default value of visible devices */ - READ_ENV_I(release, HIP_PRINT_ENV, 0, "Print HIP environment variables."); - //-- READ HIP_PRINT_ENV env first, since it has impact on later env var reading - - // TODO: In HIP/hcc, this variable blocks after both kernel commmands and data transfer. Maybe - // should be bit-mask for each command type? - READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, - "Make HIP kernel launches 'host-synchronous', so they block until any kernel " - "launches. Alias: CUDA_LAUNCH_BLOCKING."); - READ_ENV_S(release, HIP_LAUNCH_BLOCKING_KERNELS, 0, - "Comma-separated list of kernel names to make host-synchronous, so they block until " - "completed."); - if (!HIP_LAUNCH_BLOCKING_KERNELS.empty()) { - tokenize(HIP_LAUNCH_BLOCKING_KERNELS, ',', &g_hipLaunchBlockingKernels); - } - READ_ENV_I(release, HIP_API_BLOCKING, 0, - "Make HIP APIs 'host-synchronous', so they block until completed. Impacts " - "hipMemcpyAsync, hipMemsetAsync."); - - READ_ENV_I(release, HIP_HIDDEN_FREE_MEM, 0, - "Amount of memory to hide from the free memory reported by hipMemGetInfo, specified " - "in MB. Impacts hipMemGetInfo."); - - READ_ENV_C(release, HIP_DB, 0, - "Print debug info. Bitmask (HIP_DB=0xff) or flags separated by '+' " - "(HIP_DB=api+sync+mem+copy+fatbin)", - HIP_DB_callback); - if ((HIP_DB & (1 << DB_API)) && (HIP_TRACE_API == 0)) { - // Set HIP_TRACE_API default before we read it, so it is printed correctly. - HIP_TRACE_API = 1; - } - - - READ_ENV_I(release, HIP_TRACE_API, 0, - "Trace each HIP API call. Print function name and return code to stderr as program " - "executes."); - READ_ENV_S(release, HIP_TRACE_API_COLOR, 0, - "Color to use for HIP_API. None/Red/Green/Yellow/Blue/Magenta/Cyan/White"); - READ_ENV_S(release, HIP_DB_START_API, 0, - "Comma-separated list of tid.api_seq_num for when to start debug and profiling."); - READ_ENV_S(release, HIP_DB_STOP_API, 0, - "Comma-separated list of tid.api_seq_num for when to stop debug and profiling."); - - READ_ENV_C(release, HIP_VISIBLE_DEVICES, CUDA_VISIBLE_DEVICES, - "Only devices whose index is present in the sequence are visible to HIP " - "applications and they are enumerated in the order of sequence.", - HIP_VISIBLE_DEVICES_callback); - - - READ_ENV_I(release, HIP_WAIT_MODE, 0, - "Force synchronization mode. 1= force yield, 2=force spin, 0=defaults specified in " - "application"); - READ_ENV_I(release, HIP_FORCE_P2P_HOST, 0, - "Force use of host/staging copy for peer-to-peer copies.1=always use copies, " - "2=always return false for hipDeviceCanAccessPeer"); - READ_ENV_I(release, HIP_FORCE_SYNC_COPY, 0, - "Force all copies (even hipMemcpyAsync) to use sync copies"); - READ_ENV_I(release, HIP_FAIL_SOC, 0, - "Fault on Sub-Optimal-Copy, rather than use a slower but functional implementation. " - " Bit 0x1=Fail on async copy with unpinned memory. Bit 0x2=Fail peer copy rather " - "than use staging buffer copy"); - - READ_ENV_I(release, HIP_SYNC_HOST_ALLOC, 0, - "Sync before and after all host memory allocations. May help stability"); - READ_ENV_I(release, HIP_INIT_ALLOC, 0, - "If not -1, initialize allocated memory to specified byte"); - READ_ENV_I(release, HIP_SYNC_NULL_STREAM, 0, "Synchronize on host for null stream submissions"); - READ_ENV_I(release, HIP_FORCE_NULL_STREAM, 0, - "Force all stream allocations to secretly return the null stream"); - - READ_ENV_I(release, HIP_SYNC_STREAM_WAIT, 0, "hipStreamWaitEvent will synchronize to host"); - - READ_ENV_I(release, HIP_SYNC_FREE, 0, - "Force all calls to hipFree to sync all devices and all streams"); - - READ_ENV_I(release, HIP_HOST_COHERENT, 0, - "If set, all host memory will be allocated as fine-grained system memory. This " - "allows threadfence_system to work but prevents host memory from being cached on " - "GPU which may have performance impact."); - - - READ_ENV_I(release, HCC_OPT_FLUSH, 0, - "When set, use agent-scope fence operations rather than system-scope fence " - "operationsflush when possible. This flag controls both HIP and HCC behavior."); - READ_ENV_I(release, HIP_EVENT_SYS_RELEASE, 0, - "If set, event are created with hipEventReleaseToSystem by default. If 0, events " - "are created with hipEventReleaseToDevice by default. The defaults can be " - "overridden by specifying hipEventReleaseToSystem or hipEventReleaseToDevice flag " - "when creating the event."); - - READ_ENV_I(release, HIP_DUMP_CODE_OBJECT, 0, - "If set, dump code object as __hip_dump_code_object[nnnn].o in the current directory," - "where nnnn is the index number."); - - // Some flags have both compile-time and runtime flags - generate a warning if user enables the - // runtime flag but the compile-time flag is disabled. - if (HIP_DB && !COMPILE_HIP_DB) { - fprintf(stderr, - "warning: env var HIP_DB=0x%x but COMPILE_HIP_DB=0. (perhaps enable " - "COMPILE_HIP_DB in src code before compiling?)\n", - HIP_DB); - } - - if (HIP_TRACE_API && !COMPILE_HIP_TRACE_API) { - fprintf(stderr, - "warning: env var HIP_TRACE_API=0x%x but COMPILE_HIP_TRACE_API=0. (perhaps enable " - "COMPILE_HIP_TRACE_API in src code before compiling?)\n", - HIP_DB); - } - if (HIP_TRACE_API) { - HIP_DB |= 0x1; - } - - if (HIP_DB) { - fprintf(stderr, "HIP_DB=0x%x [%s]\n", HIP_DB, HIP_DB_string(HIP_DB).c_str()); - } - - std::transform(HIP_TRACE_API_COLOR.begin(), HIP_TRACE_API_COLOR.end(), - HIP_TRACE_API_COLOR.begin(), ::tolower); - - if (HIP_TRACE_API_COLOR == "none") { - API_COLOR = ""; - API_COLOR_END = ""; - } else if (HIP_TRACE_API_COLOR == "red") { - API_COLOR = KRED; - } else if (HIP_TRACE_API_COLOR == "green") { - API_COLOR = KGRN; - } else if (HIP_TRACE_API_COLOR == "yellow") { - API_COLOR = KYEL; - } else if (HIP_TRACE_API_COLOR == "blue") { - API_COLOR = KBLU; - } else if (HIP_TRACE_API_COLOR == "magenta") { - API_COLOR = KMAG; - } else if (HIP_TRACE_API_COLOR == "cyan") { - API_COLOR = KCYN; - } else if (HIP_TRACE_API_COLOR == "white") { - API_COLOR = KWHT; - } else { - fprintf(stderr, - "warning: env var HIP_TRACE_API_COLOR=%s must be " - "None/Red/Green/Yellow/Blue/Magenta/Cyan/White", - HIP_TRACE_API_COLOR.c_str()); - }; - - parseTrigger(HIP_DB_START_API, g_dbStartTriggers); - parseTrigger(HIP_DB_STOP_API, g_dbStopTriggers); -}; - - -//--- -// Function called one-time at initialization time to construct a table of all GPU devices. -// HIP/CUDA uses integer "deviceIds" - these are indexes into this table. -// AMP maintains a table of accelerators, but some are emulated - ie for debug or CPU. -// This function creates a vector with only the GPU accelerators. -// It is called with C++11 call_once, which provided thread-safety. -void ihipInit() { - - HipReadEnv(); - - - /* - * Build a table of valid compute devices. - */ - auto accs = hc::accelerator::get_all(); - - int deviceCnt = 0; - for (int i = 0; i < accs.size(); i++) { - if (!accs[i].get_is_emulated()) { - deviceCnt++; - } - }; - - // Make sure the hip visible devices are within the deviceCnt range - for (int i = 0; i < g_hip_visible_devices.size(); i++) { - if ((g_hip_visible_devices[i] >= deviceCnt) ||(g_hip_visible_devices[i] < 0)){ - // Make sure any DeviceID after invalid DeviceID will be erased. - g_hip_visible_devices.resize(i); - break; - } - } - - hsa_status_t err = hsa_iterate_agents(findCpuAgent, &g_cpu_agent); - if (err != HSA_STATUS_INFO_BREAK) { - // didn't find a CPU. - g_initDeviceFound = false; - return; - } - - g_deviceArray = new ihipDevice_t*[deviceCnt]; - g_deviceCnt = 0; - - if(g_visible_device) { - for (int i = 0; i < g_hip_visible_devices.size(); i++) { - int devIndex = g_hip_visible_devices[i]; - if (!accs[devIndex+1].get_is_emulated()) { - g_deviceArray[g_deviceCnt] = new ihipDevice_t(g_deviceCnt, deviceCnt, accs[devIndex+1]); - g_deviceCnt++; - } - } - }else { - for (int i = 0; i < accs.size(); i++) { - if (!accs[i].get_is_emulated()) { - g_deviceArray[g_deviceCnt] = new ihipDevice_t(g_deviceCnt, deviceCnt, accs[i]); - g_deviceCnt++; - } - } - } - - g_allAgents = static_cast(malloc((g_deviceCnt + 1) * sizeof(hsa_agent_t))); - g_allAgents[0] = g_cpu_agent; - for (int i = 0; i < g_deviceCnt; i++) { - g_allAgents[i + 1] = g_deviceArray[i]->_hsaAgent; - } - - - g_numLogicalThreads = std::thread::hardware_concurrency(); - - // If HIP_VISIBLE_DEVICES is not set, make sure all devices are initialized - if (!g_visible_device) { - assert(deviceCnt == g_deviceCnt); - } - - tprintf(DB_SYNC, "pid=%u %-30s g_numLogicalThreads=%u\n", getpid(), "", - g_numLogicalThreads); - - g_initDeviceFound = true; -} - -namespace hip_impl { -hipError_t hip_init() { - static std::once_flag hip_initialized; - std::call_once(hip_initialized, ihipInit); - if (g_initDeviceFound) { - ihipCtxStackUpdate(); - return hipSuccess; - } - else { - return hipErrorInsufficientDriver; - } -} -} - -hipError_t ihipStreamSynchronize(TlsData *tls, hipStream_t stream) { - hipError_t e = hipSuccess; - - if (stream == hipStreamNull) { - ihipCtx_t* ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true /*waitOnSelf*/, true /*syncToHost*/); - } else { - // note this does not synchornize with the NULL stream: - bool waited; - stream->locked_wait(waited); - if (!waited) { - Kalmar::getContext()->flushPrintfBuffer(); - } - e = hipSuccess; - } - - return e; -} - -//--- -// Get the stream to use for a command submission. -// -// If stream==NULL synchronize appropriately with other streams and return the default av for the -// device. If stream is valid, return the AV to use. -hipStream_t ihipSyncAndResolveStream(hipStream_t stream, bool lockAcquired) { - if (stream == hipStreamNull) { - // Submitting to NULL stream, call locked_syncDefaultStream to wait for all other streams: - GET_TLS(); - ihipCtx_t* ctx = ihipGetTlsDefaultCtx(); - tprintf(DB_SYNC, "ihipSyncAndResolveStream %s wait on default stream\n", - ToString(stream).c_str()); - -#ifndef HIP_API_PER_THREAD_DEFAULT_STREAM - ctx->locked_syncDefaultStream(false, false); -#endif - return ctx->_defaultStream; - } else { - // Submitting to a "normal" stream, just wait for null stream: - if (!(stream->_flags & hipStreamNonBlocking)) { - if (HIP_SYNC_NULL_STREAM) { - tprintf(DB_SYNC, "ihipSyncAndResolveStream %s host-wait on default stream\n", - ToString(stream).c_str()); - stream->getCtx()->_defaultStream->locked_wait(); - } else { - ihipStream_t* defaultStream = stream->getCtx()->_defaultStream; - - - bool needGatherMarker = false; // used to gather together other markers. - hc::completion_future dcf; - { - LockedAccessor_StreamCrit_t defaultStreamCrit(defaultStream->criticalData()); - // TODO - could call create_blocking_marker(queue) or uses existing marker. - if (!defaultStreamCrit->_av.get_is_empty()) { - needGatherMarker = true; - - tprintf(DB_SYNC, " %s adding marker to default %s for dependency\n", - ToString(stream).c_str(), ToString(defaultStream).c_str()); - dcf = defaultStreamCrit->_av.create_marker(hc::accelerator_scope); - } else { - tprintf(DB_SYNC, " %s skipping marker since default stream is empty\n", - ToString(stream).c_str()); - } - } - - if (needGatherMarker) { - // ensure any commands sent to this stream wait on the NULL stream before - // continuing - if (!lockAcquired) { - LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData()); - // TODO - could be "noret" version of create_blocking_marker - thisStreamCrit->_av.create_blocking_marker(dcf, hc::accelerator_scope); - } else { - // this stream is already locked (e.g., call from hipExtLaunchMultiKernelMultiDevice) - stream->criticalData()._av.create_blocking_marker(dcf, hc::accelerator_scope); - } - tprintf( - DB_SYNC, - " %s adding marker to wait for freshly recorded default-stream marker \n", - ToString(stream).c_str()); - } - } - } - - return stream; - } -} - -void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp, - const hipStream_t stream) { - if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || - (COMPILE_HIP_DB & HIP_TRACE_API)) { - GET_TLS(); - std::stringstream os; - os << tls->tidInfo.pid() << " " << tls->tidInfo.tid() << "." << tls->tidInfo.apiSeqNum() << " hipLaunchKernel '" - << kernelName << "'" - << " gridDim:" << lp->grid_dim << " groupDim:" << lp->group_dim << " sharedMem:+" - << lp->dynamic_group_mem_bytes << " " << *stream; - - if (COMPILE_HIP_DB && HIP_TRACE_API) { - std::string fullStr; - recordApiTrace(tls, &fullStr, os.str()); - } - } -} - -// Called just before a kernel is launched from hipLaunchKernel. -// Allows runtime to track some information about the stream. -hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm* lp, - const char* kernelNameStr, bool lockAcquired) { - if (stream == nullptr || stream != stream->getCtx()->_defaultStream) { - stream = ihipSyncAndResolveStream(stream, lockAcquired); - } - lp->grid_dim.x = grid.x; - lp->grid_dim.y = grid.y; - lp->grid_dim.z = grid.z; - lp->group_dim.x = block.x; - lp->group_dim.y = block.y; - lp->group_dim.z = block.z; - lp->barrier_bit = barrier_bit_queue_default; - - if (!lockAcquired) stream->lockopen_preKernelCommand(); - auto &crit = stream->criticalData(); - lp->av = &(crit._av); - lp->cf = nullptr; - auto acq = (HCC_OPT_FLUSH && !crit._last_op_was_a_copy) ? - HSA_FENCE_SCOPE_AGENT : HSA_FENCE_SCOPE_SYSTEM; - auto rel = HCC_OPT_FLUSH ? - HSA_FENCE_SCOPE_AGENT : HSA_FENCE_SCOPE_SYSTEM; - lp->launch_fence = (acq << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | - (rel << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); - crit._last_op_was_a_copy = false; - ihipPrintKernelLaunch(kernelNameStr, lp, stream); - - return (stream); -} - - -hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm* lp, - const char* kernelNameStr, bool lockAcquired) { - return ihipPreLaunchKernel(stream, dim3(grid), block, lp, kernelNameStr, lockAcquired); -} - - -hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm* lp, - const char* kernelNameStr, bool lockAcquired) { - return ihipPreLaunchKernel(stream, grid, dim3(block), lp, kernelNameStr, lockAcquired); -} - - -hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm* lp, - const char* kernelNameStr, bool lockAcquired) { - return ihipPreLaunchKernel(stream, dim3(grid), dim3(block), lp, kernelNameStr, lockAcquired); -} - - -//--- -// Called after kernel finishes execution. -// This releases the lock on the stream. -void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp, bool unlockPostponed) { - tprintf(DB_SYNC, "ihipPostLaunchKernel, unlocking stream\n"); - - stream->lockclose_postKernelCommand(kernelName, lp.av, unlockPostponed); -} - -//================================================================================================= -// HIP API Implementation -// -// Implementor notes: -// _ All functions should call HIP_INIT_API as first action: -// HIP_INIT_API(); -// -// - ALl functions should use ihipLogStatus to return error code (not return error directly). -//================================================================================================= -// -//--- - -//------------------------------------------------------------------------------------------------- - - -const char* ihipErrorString(hipError_t hip_error) { - switch (hip_error) { - case hipSuccess: - return "hipSuccess"; - case hipErrorOutOfMemory: - return "hipErrorOutOfMemory"; - case hipErrorNotInitialized: - return "hipErrorNotInitialized"; - case hipErrorDeinitialized: - return "hipErrorDeinitialized"; - case hipErrorProfilerDisabled: - return "hipErrorProfilerDisabled"; - case hipErrorProfilerNotInitialized: - return "hipErrorProfilerNotInitialized"; - case hipErrorProfilerAlreadyStarted: - return "hipErrorProfilerAlreadyStarted"; - case hipErrorProfilerAlreadyStopped: - return "hipErrorProfilerAlreadyStopped"; - case hipErrorInsufficientDriver: - return "hipErrorInsufficientDriver"; - case hipErrorInvalidImage: - return "hipErrorInvalidImage"; - case hipErrorInvalidContext: - return "hipErrorInvalidContext"; - case hipErrorContextAlreadyCurrent: - return "hipErrorContextAlreadyCurrent"; - case hipErrorMapFailed: - return "hipErrorMapFailed"; - case hipErrorUnmapFailed: - return "hipErrorUnmapFailed"; - case hipErrorArrayIsMapped: - return "hipErrorArrayIsMapped"; - case hipErrorAlreadyMapped: - return "hipErrorAlreadyMapped"; - case hipErrorNoBinaryForGpu: - return "hipErrorNoBinaryForGpu"; - case hipErrorAlreadyAcquired: - return "hipErrorAlreadyAcquired"; - case hipErrorNotMapped: - return "hipErrorNotMapped"; - case hipErrorNotMappedAsArray: - return "hipErrorNotMappedAsArray"; - case hipErrorNotMappedAsPointer: - return "hipErrorNotMappedAsPointer"; - case hipErrorECCNotCorrectable: - return "hipErrorECCNotCorrectable"; - case hipErrorUnsupportedLimit: - return "hipErrorUnsupportedLimit"; - case hipErrorContextAlreadyInUse: - return "hipErrorContextAlreadyInUse"; - case hipErrorPeerAccessUnsupported: - return "hipErrorPeerAccessUnsupported"; - case hipErrorInvalidKernelFile: - return "hipErrorInvalidKernelFile"; - case hipErrorInvalidGraphicsContext: - return "hipErrorInvalidGraphicsContext"; - case hipErrorInvalidSource: - return "hipErrorInvalidSource"; - case hipErrorFileNotFound: - return "hipErrorFileNotFound"; - case hipErrorSharedObjectSymbolNotFound: - return "hipErrorSharedObjectSymbolNotFound"; - case hipErrorSharedObjectInitFailed: - return "hipErrorSharedObjectInitFailed"; - case hipErrorOperatingSystem: - return "hipErrorOperatingSystem"; - case hipErrorSetOnActiveProcess: - return "hipErrorSetOnActiveProcess"; - case hipErrorInvalidHandle: - return "hipErrorInvalidHandle"; - case hipErrorNotFound: - return "hipErrorNotFound"; - case hipErrorIllegalAddress: - return "hipErrorIllegalAddress"; - case hipErrorInvalidSymbol: - return "hipErrorInvalidSymbol"; - case hipErrorMissingConfiguration: - return "hipErrorMissingConfiguration"; - case hipErrorLaunchFailure: - return "hipErrorLaunchFailure"; - case hipErrorCooperativeLaunchTooLarge: - return "hipErrorCooperativeLaunchTooLarge"; - case hipErrorPriorLaunchFailure: - return "hipErrorPriorLaunchFailure"; - case hipErrorLaunchTimeOut: - return "hipErrorLaunchTimeOut"; - case hipErrorLaunchOutOfResources: - return "hipErrorLaunchOutOfResources"; - case hipErrorInvalidDeviceFunction: - return "hipErrorInvalidDeviceFunction"; - case hipErrorInvalidConfiguration: - return "hipErrorInvalidConfiguration"; - case hipErrorInvalidDevice: - return "hipErrorInvalidDevice"; - case hipErrorInvalidValue: - return "hipErrorInvalidValue"; - case hipErrorInvalidDevicePointer: - return "hipErrorInvalidDevicePointer"; - case hipErrorInvalidMemcpyDirection: - return "hipErrorInvalidMemcpyDirection"; - case hipErrorUnknown: - return "hipErrorUnknown"; - case hipErrorNotReady: - return "hipErrorNotReady"; - case hipErrorNoDevice: - return "hipErrorNoDevice"; - case hipErrorPeerAccessAlreadyEnabled: - return "hipErrorPeerAccessAlreadyEnabled"; - case hipErrorPeerAccessNotEnabled: - return "hipErrorPeerAccessNotEnabled"; - case hipErrorRuntimeMemory: - return "hipErrorRuntimeMemory"; - case hipErrorRuntimeOther: - return "hipErrorRuntimeOther"; - case hipErrorHostMemoryAlreadyRegistered: - return "hipErrorHostMemoryAlreadyRegistered"; - case hipErrorHostMemoryNotRegistered: - return "hipErrorHostMemoryNotRegistered"; - case hipErrorAssert: - return "hipErrorAssert"; - case hipErrorNotSupported: - return "hipErrorNotSupported"; - case hipErrorTbd: - return "hipErrorTbd"; - default: - return "hipErrorUnknown"; - }; -}; - - -// Returns true if copyEngineCtx can see the memory allocated on dstCtx and srcCtx. -// The peer-list for a context controls which contexts have access to the memory allocated on that -// context. So we check dstCtx's and srcCtx's peerList to see if the both include thisCtx. -bool ihipStream_t::canSeeMemory(const ihipCtx_t* copyEngineCtx, const hc::AmPointerInfo* dstPtrInfo, - const hc::AmPointerInfo* srcPtrInfo) { - if (copyEngineCtx == nullptr) { - return false; - } - - // Make sure this is a device-to-device copy with all memory available to the requested copy - // engine - // - // TODO - pointer-info stores a deviceID not a context,may have some unusual side-effects here: - if (dstPtrInfo->_sizeBytes == 0) { - return false; - } else if (dstPtrInfo->_appId != -1) { -#if USE_APP_PTR_FOR_CTX - ihipCtx_t* dstCtx = static_cast(dstPtrInfo->_appPtr); -#else - ihipCtx_t* dstCtx = ihipGetPrimaryCtx(dstPtrInfo->_appId); -#endif - if (copyEngineCtx != dstCtx) { - // Only checks peer list if contexts are different - LockedAccessor_CtxCrit_t ctxCrit(dstCtx->criticalData()); -#if DB_PEER_CTX - std::cerr << "checking peer : copyEngineCtx =" << copyEngineCtx << " dstCtx =" << dstCtx - << " peerCnt=" << ctxCrit->peerCnt() << "\n"; -#endif - if (!ctxCrit->isPeerWatcher(copyEngineCtx)) { - return false; - }; - } - } - - - // TODO - pointer-info stores a deviceID not a context,may have some unusual side-effects here: - if (srcPtrInfo->_sizeBytes == 0) { - return false; - } else if (srcPtrInfo->_appId != -1) { -#if USE_APP_PTR_FOR_CTX - ihipCtx_t* srcCtx = static_cast(srcPtrInfo->_appPtr); -#else - ihipCtx_t* srcCtx = ihipGetPrimaryCtx(srcPtrInfo->_appId); -#endif - if (copyEngineCtx != srcCtx) { - // Only checks peer list if contexts are different - LockedAccessor_CtxCrit_t ctxCrit(srcCtx->criticalData()); -#if DB_PEER_CTX - std::cerr << "checking peer : copyEngineCtx =" << copyEngineCtx << " srcCtx =" << srcCtx - << " peerCnt=" << ctxCrit->peerCnt() << "\n"; -#endif - if (!ctxCrit->isPeerWatcher(copyEngineCtx)) { - return false; - }; - } - } - - return true; -}; - - -#define CASE_STRING(X) \ - case X: \ - return #X; \ - break; - -const char* hipMemcpyStr(unsigned memKind) { - switch (memKind) { - CASE_STRING(hipMemcpyHostToHost); - CASE_STRING(hipMemcpyHostToDevice); - CASE_STRING(hipMemcpyDeviceToHost); - CASE_STRING(hipMemcpyDeviceToDevice); - CASE_STRING(hipMemcpyDefault); - default: - return ("unknown memcpyKind"); - }; -} - -const char* hcMemcpyStr(hc::hcCommandKind memKind) { - using namespace hc; - switch (memKind) { - CASE_STRING(hcMemcpyHostToHost); - CASE_STRING(hcMemcpyHostToDevice); - CASE_STRING(hcMemcpyDeviceToHost); - CASE_STRING(hcMemcpyDeviceToDevice); - // CASE_STRING(hcMemcpyDefault); - default: - return ("unknown memcpyKind"); - }; -} - - -// Resolve hipMemcpyDefault to a known type. -unsigned ihipStream_t::resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem) { - hipMemcpyKind kind = hipMemcpyDefault; - - if (srcInDeviceMem && dstInDeviceMem) { - kind = hipMemcpyDeviceToDevice; - } - if (srcInDeviceMem && !dstInDeviceMem) { - kind = hipMemcpyDeviceToHost; - } - if (!srcInDeviceMem && !dstInDeviceMem) { - kind = hipMemcpyHostToHost; - } - if (!srcInDeviceMem && dstInDeviceMem) { - kind = hipMemcpyHostToDevice; - } - - assert(kind != hipMemcpyDefault); - return kind; -} - - -// hipMemKind must be "resolved" to a specific direction - cannot be default. -void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, - const hc::AmPointerInfo* dstPtrInfo, - const hc::AmPointerInfo* srcPtrInfo, - hc::hcCommandKind* hcCopyDir, ihipCtx_t** copyDevice, - bool* forceUnpinnedCopy) { - // Ignore what the user tells us and always resolve the direction: - // Some apps apparently rely on this. - hipMemKind = resolveMemcpyDirection(srcPtrInfo->_isInDeviceMem, dstPtrInfo->_isInDeviceMem); - - - switch (hipMemKind) { - case hipMemcpyHostToHost: - *hcCopyDir = hc::hcMemcpyHostToHost; - break; - case hipMemcpyHostToDevice: - *hcCopyDir = hc::hcMemcpyHostToDevice; - break; - case hipMemcpyDeviceToHost: - *hcCopyDir = hc::hcMemcpyDeviceToHost; - break; - case hipMemcpyDeviceToDevice: - *hcCopyDir = hc::hcMemcpyDeviceToDevice; - break; - default: - throw ihipException(hipErrorRuntimeOther); - }; - - if (srcPtrInfo->_isInDeviceMem) { - *copyDevice = ihipGetPrimaryCtx(srcPtrInfo->_appId); - } else if (dstPtrInfo->_isInDeviceMem) { - *copyDevice = ihipGetPrimaryCtx(dstPtrInfo->_appId); - } else { - *copyDevice = nullptr; - } - - *forceUnpinnedCopy = false; - if (canSeeMemory(*copyDevice, dstPtrInfo, srcPtrInfo)) { - if (HIP_FORCE_P2P_HOST & 0x1) { - *forceUnpinnedCopy = true; - tprintf(DB_COPY, - "Copy engine (dev:%d agent=0x%lx) can see src and dst but " - "HIP_FORCE_P2P_HOST=0, forcing copy through staging buffers.\n", - *copyDevice ? (*copyDevice)->getDeviceNum() : -1, - *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0); - - } else { - tprintf(DB_COPY, "Copy engine (dev:%d agent=0x%lx) can see src and dst.\n", - *copyDevice ? (*copyDevice)->getDeviceNum() : -1, - *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0); - } - } else { - *forceUnpinnedCopy = true; - tprintf(DB_COPY, - "Copy engine(dev:%d agent=0x%lx) cannot see both host and device pointers - " - "forcing copy with unpinned engine.\n", - *copyDevice ? (*copyDevice)->getDeviceNum() : -1, - *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0); - if (HIP_FAIL_SOC & 0x2) { - fprintf(stderr, - "HIP_FAIL_SOC: P2P: copy engine(dev:%d agent=0x%lx) cannot see both host and " - "device pointers - forcing copy with unpinned engine.\n", - *copyDevice ? (*copyDevice)->getDeviceNum() : -1, - *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0); - throw ihipException(hipErrorRuntimeOther); - } - } -} - - -void printPointerInfo(unsigned dbFlag, const char* tag, const void* ptr, - const hc::AmPointerInfo& ptrInfo) { - tprintf(dbFlag, - " %s=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d " - "registered=%d allocSeqNum=%zu, appAllocationFlags=%x, appPtr=%p\n", - tag, ptr, ptrInfo._hostPointer, ptrInfo._devicePointer, ptrInfo._sizeBytes, - ptrInfo._appId, ptrInfo._sizeBytes != 0, ptrInfo._isInDeviceMem, !ptrInfo._isAmManaged, - ptrInfo._allocSeqNum, ptrInfo._appAllocationFlags, ptrInfo._appPtr); -} - - -// the pointer-info as returned by HC refers to the allocation -// This routine modifies the pointer-info so it appears to refer to the specific ptr and sizeBytes. -// TODO -remove this when HCC uses HSA pointer info functions directly. -void tailorPtrInfo(hc::AmPointerInfo* ptrInfo, const void* ptr, size_t sizeBytes) { - const char* ptrc = static_cast(ptr); - if (ptrInfo->_sizeBytes == 0) { - // invalid ptrInfo, don't modify - return; - } else if (ptrInfo->_isInDeviceMem) { - assert(ptrInfo->_devicePointer != nullptr); - std::ptrdiff_t diff = ptrc - static_cast(ptrInfo->_devicePointer); - - // TODO : assert-> runtime assert that only appears in debug mode - assert(diff >= 0); - assert(diff <= ptrInfo->_sizeBytes); - - ptrInfo->_devicePointer = const_cast(ptr); - - if (ptrInfo->_hostPointer != nullptr) { - ptrInfo->_hostPointer = static_cast(ptrInfo->_hostPointer) + diff; - } - - } else { - assert(ptrInfo->_hostPointer != nullptr); - std::ptrdiff_t diff = ptrc - static_cast(ptrInfo->_hostPointer); - - // TODO : assert-> runtime assert that only appears in debug mode - assert(diff >= 0); - assert(diff <= ptrInfo->_sizeBytes); - - ptrInfo->_hostPointer = const_cast(ptr); - - if (ptrInfo->_devicePointer != nullptr) { - ptrInfo->_devicePointer = static_cast(ptrInfo->_devicePointer) + diff; - } - } - - assert(sizeBytes <= ptrInfo->_sizeBytes); - ptrInfo->_sizeBytes = sizeBytes; -}; - - -bool getTailoredPtrInfo(const char* tag, hc::AmPointerInfo* ptrInfo, const void* ptr, - size_t sizeBytes) { - bool tracked = (hc::am_memtracker_getinfo(ptrInfo, ptr) == AM_SUCCESS); - printPointerInfo(DB_COPY, tag, ptr, *ptrInfo); - - if (tracked) { - tailorPtrInfo(ptrInfo, ptr, sizeBytes); - printPointerInfo(DB_COPY, " mod", ptr, *ptrInfo); - } - - return tracked; -}; - - -// TODO : For registered and host memory, if the portable flag is set, we need to recognize that and -// perform appropriate copy operation. What can happen now is that Portable memory is mapped into -// multiple devices but Peer access is not enabled. i The peer detection logic doesn't see that the -// memory is already mapped and so tries to use an unpinned copy algorithm. If this is PinInPlace, -// then an error can occur. Need to track Portable flag correctly or use new RT functionality to -// query the peer status for the pointer. -// -// TODO - remove kind parm from here or use it below? -void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind, - bool resolveOn) { - ihipCtx_t* ctx = this->getCtx(); - const ihipDevice_t* device = ctx->getDevice(); - - if (device == NULL) { - throw ihipException(hipErrorInvalidDevice); - } - - hc::accelerator acc; -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo dstPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0); - hc::AmPointerInfo srcPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); - hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); -#endif - bool dstTracked = getTailoredPtrInfo(" dst", &dstPtrInfo, dst, sizeBytes); - bool srcTracked = getTailoredPtrInfo(" src", &srcPtrInfo, src, sizeBytes); - - - // Some code in HCC and in printPointerInfo uses _sizeBytes==0 as an indication ptr is not - // valid, so check it here: - if (!dstTracked) { - assert(dstPtrInfo._sizeBytes == 0); - } - if (!srcTracked) { - assert(srcPtrInfo._sizeBytes == 0); - } - - - hc::hcCommandKind hcCopyDir; - ihipCtx_t* copyDevice; - bool forceUnpinnedCopy; - resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, ©Device, - &forceUnpinnedCopy); - - { - LockedAccessor_StreamCrit_t crit(_criticalData); - tprintf(DB_COPY, - "copySync copyDev:%d dst=%p (phys_dev:%d, isDevMem:%d) src=%p(phys_dev:%d, " - "isDevMem:%d) sz=%zu dir=%s forceUnpinnedCopy=%d\n", - copyDevice ? copyDevice->getDeviceNum() : -1, dst, dstPtrInfo._appId, - dstPtrInfo._isInDeviceMem, src, srcPtrInfo._appId, srcPtrInfo._isInDeviceMem, - sizeBytes, hcMemcpyStr(hcCopyDir), forceUnpinnedCopy); - printPointerInfo(DB_COPY, " dst", dst, dstPtrInfo); - printPointerInfo(DB_COPY, " src", src, srcPtrInfo); - - - crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, - copyDevice ? ©Device->getDevice()->_acc : nullptr, - forceUnpinnedCopy); - } -} - -bool ihipStream_t::locked_copy2DSync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind, - bool resolveOn) { - bool retStatus = true; - ihipCtx_t* ctx = this->getCtx(); - const ihipDevice_t* device = ctx->getDevice(); - - if (device == NULL) { - throw ihipException(hipErrorInvalidDevice); - } - size_t sizeBytes = width*height; - hc::accelerator acc; -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo dstPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0); - hc::AmPointerInfo srcPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); - hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); -#endif - bool dstTracked = getTailoredPtrInfo(" dst", &dstPtrInfo, dst, sizeBytes); - bool srcTracked = getTailoredPtrInfo(" src", &srcPtrInfo, src, sizeBytes); - - // Some code in HCC and in printPointerInfo uses _sizeBytes==0 as an indication ptr is not - // // valid, so check it here: - if (!dstTracked) { - assert(dstPtrInfo._sizeBytes == 0); - } - if (!srcTracked) { - assert(srcPtrInfo._sizeBytes == 0); - } - - - hc::hcCommandKind hcCopyDir; - ihipCtx_t* copyDevice; - bool forceUnpinnedCopy; - resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, ©Device, - &forceUnpinnedCopy); - - { - LockedAccessor_StreamCrit_t crit(_criticalData); - tprintf(DB_COPY, - "copy2DSync copyDev:%d dst=%p (phys_dev:%d, isDevMem:%d) src=%p(phys_dev:%d, " - "isDevMem:%d) sz=%zu dir=%s forceUnpinnedCopy=%d\n", - copyDevice ? copyDevice->getDeviceNum() : -1, dst, dstPtrInfo._appId, - dstPtrInfo._isInDeviceMem, src, srcPtrInfo._appId, srcPtrInfo._isInDeviceMem, - sizeBytes, hcMemcpyStr(hcCopyDir), forceUnpinnedCopy); - printPointerInfo(DB_COPY, " dst", dst, dstPtrInfo); - printPointerInfo(DB_COPY, " src", src, srcPtrInfo); -#if (__hcc_workweek__ >= 19101) - if(!crit->_av.copy2d_ext(src, dst, width, height, srcPitch, dstPitch, hcCopyDir, srcPtrInfo, dstPtrInfo, - copyDevice ? ©Device->getDevice()->_acc : nullptr, - forceUnpinnedCopy)) { - tprintf(DB_COPY,"locked_copy2DSync failed to use SDMA\n"); - retStatus = false; - } -#else - crit->_av.copy2d_ext(src, dst, width, height, srcPitch, dstPitch, hcCopyDir, srcPtrInfo, dstPtrInfo, - copyDevice ? ©Device->getDevice()->_acc : nullptr, - forceUnpinnedCopy); -#endif - } - return retStatus; -} - -void ihipStream_t::addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes) { -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo ptrInfo(NULL, ptr, ptr, sizeBytes, acc, true, false); -#else - hc::AmPointerInfo ptrInfo(NULL, ptr, sizeBytes, acc, true, false); -#endif - hc::am_memtracker_add(ptr, ptrInfo); -} - -void ihipStream_t::lockedSymbolCopySync(hc::accelerator& acc, void* dst, void* src, - size_t sizeBytes, size_t offset, unsigned kind) { - if (kind == hipMemcpyHostToHost) { - acc.memcpy_symbol(dst, (void*)src, sizeBytes, offset, Kalmar::hcMemcpyHostToHost); - } - if (kind == hipMemcpyHostToDevice) { - acc.memcpy_symbol(dst, (void*)src, sizeBytes, offset); - } - if (kind == hipMemcpyDeviceToDevice) { - acc.memcpy_symbol(dst, (void*)src, sizeBytes, offset, Kalmar::hcMemcpyDeviceToDevice); - } - if (kind == hipMemcpyDeviceToHost) { - acc.memcpy_symbol((void*)src, (void*)dst, sizeBytes, offset, Kalmar::hcMemcpyDeviceToHost); - } -} - -void ihipStream_t::lockedSymbolCopyAsync(hc::accelerator& acc, void* dst, void* src, - size_t sizeBytes, size_t offset, unsigned kind) { - // TODO - review - this looks broken , should not be adding pointers to tracker dynamically: - if (kind == hipMemcpyHostToDevice) { -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo srcPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); -#endif - bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); - if (srcTracked) { - addSymbolPtrToTracker(acc, dst, sizeBytes); - locked_getAv()->copy_async((void*)src, dst, sizeBytes); - } else { - LockedAccessor_StreamCrit_t crit(_criticalData); - this->wait(crit); - acc.memcpy_symbol(dst, (void*)src, sizeBytes, offset); - } - } - if (kind == hipMemcpyDeviceToHost) { -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo dstPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); -#endif - bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS); - if (dstTracked) { - addSymbolPtrToTracker(acc, src, sizeBytes); - locked_getAv()->copy_async((void*)src, dst, sizeBytes); - } else { - LockedAccessor_StreamCrit_t crit(_criticalData); - this->wait(crit); - acc.memcpy_symbol((void*)src, (void*)dst, sizeBytes, offset, - Kalmar::hcMemcpyDeviceToHost); - } - } -} - - -void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind) { - const ihipCtx_t* ctx = this->getCtx(); - - if ((ctx == nullptr) || (ctx->getDevice() == nullptr)) { - tprintf(DB_COPY, "locked_copyAsync bad ctx or device\n"); - throw ihipException(hipErrorInvalidDevice); - } - - if (kind == hipMemcpyHostToHost) { - tprintf(DB_COPY, "locked_copyAsync: H2H with memcpy"); - - // TODO - consider if we want to perhaps use the GPU SDMA engines anyway, to avoid the - // host-side sync here and keep everything flowing on the GPU. - /* As this is a CPU op, we need to wait until all - the commands in current stream are finished. - */ - LockedAccessor_StreamCrit_t crit(_criticalData); - this->wait(crit); - - memcpy(dst, src, sizeBytes); - - } else { - hc::accelerator acc; -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo dstPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0); - hc::AmPointerInfo srcPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); - hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); -#endif - tprintf(DB_COPY, "copyASync dst=%p src=%p, sz=%zu\n", dst, src, sizeBytes); - bool dstTracked = getTailoredPtrInfo(" dst", &dstPtrInfo, dst, sizeBytes); - bool srcTracked = getTailoredPtrInfo(" src", &srcPtrInfo, src, sizeBytes); - - - hc::hcCommandKind hcCopyDir; - ihipCtx_t* copyDevice; - bool forceUnpinnedCopy; - resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, ©Device, - &forceUnpinnedCopy); - tprintf(DB_COPY, " copyDev:%d dir=%s forceUnpinnedCopy=%d\n", - copyDevice ? copyDevice->getDeviceNum() : -1, hcMemcpyStr(hcCopyDir), - forceUnpinnedCopy); - - // "tracked" really indicates if the pointer's virtual address is available in the GPU - // address space. If both pointers are not tracked, we need to fall back to a sync copy. - if (dstTracked && srcTracked && !forceUnpinnedCopy && - copyDevice /*code below assumes this is !nullptr*/) { - LockedAccessor_StreamCrit_t crit(_criticalData); - - // Perform fast asynchronous copy - we know copyDevice != NULL based on check above - try { - if (HIP_FORCE_SYNC_COPY) { - crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, - ©Device->getDevice()->_acc, forceUnpinnedCopy); - - } else { - crit->_av.copy_async_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, - ©Device->getDevice()->_acc); - } - } catch (Kalmar::runtime_exception) { - throw ihipException(hipErrorRuntimeOther); - }; - - - if (HIP_API_BLOCKING) { - tprintf(DB_SYNC, "%s LAUNCH_BLOCKING for completion of hipMemcpyAsync(sz=%zu)\n", - ToString(this).c_str(), sizeBytes); - this->wait(crit); - } - - } else { - if (HIP_FAIL_SOC & 0x1) { - fprintf(stderr, - "HIP_FAIL_SOC failed, async_copy requested but could not be completed " - "since src or dst not accesible to copy agent\n"); - fprintf(stderr, - "copyASync copyDev:%d dst=%p (phys_dev:%d, isDevMem:%d) " - "src=%p(phys_dev:%d, isDevMem:%d) sz=%zu dir=%s forceUnpinnedCopy=%d\n", - copyDevice ? copyDevice->getDeviceNum() : -1, dst, dstPtrInfo._appId, - dstPtrInfo._isInDeviceMem, src, srcPtrInfo._appId, - srcPtrInfo._isInDeviceMem, sizeBytes, hcMemcpyStr(hcCopyDir), - forceUnpinnedCopy); - fprintf( - stderr, - " dst=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n", - dst, dstPtrInfo._hostPointer, dstPtrInfo._devicePointer, dstPtrInfo._sizeBytes, - dstPtrInfo._appId, dstTracked, dstPtrInfo._isInDeviceMem); - fprintf( - stderr, - " src=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n", - src, srcPtrInfo._hostPointer, srcPtrInfo._devicePointer, srcPtrInfo._sizeBytes, - srcPtrInfo._appId, srcTracked, srcPtrInfo._isInDeviceMem); - throw ihipException(hipErrorRuntimeOther); - } - // Perform slow synchronous copy: - LockedAccessor_StreamCrit_t crit(_criticalData); - - - crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, - copyDevice ? ©Device->getDevice()->_acc : nullptr, - forceUnpinnedCopy); - } - } -} - -bool ihipStream_t::locked_copy2DAsync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind) -{ - bool retStatus = true; - const ihipCtx_t* ctx = this->getCtx(); - - if ((ctx == nullptr) || (ctx->getDevice() == nullptr)) { - tprintf(DB_COPY, "locked_copy2DAsync bad ctx or device\n"); - throw ihipException(hipErrorInvalidDevice); - } - hc::accelerator acc; - size_t sizeBytes = width*height; -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo dstPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0); - hc::AmPointerInfo srcPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); - hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); -#endif - tprintf(DB_COPY, "copy2DAsync dst=%p src=%p, sz=%zu\n", dst, src, sizeBytes); - bool dstTracked = getTailoredPtrInfo(" dst", &dstPtrInfo, dst, sizeBytes); - bool srcTracked = getTailoredPtrInfo(" src", &srcPtrInfo, src, sizeBytes); - - - hc::hcCommandKind hcCopyDir; - ihipCtx_t* copyDevice; - bool forceUnpinnedCopy; - resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, ©Device, - &forceUnpinnedCopy); - tprintf(DB_COPY, " copyDev:%d dir=%s forceUnpinnedCopy=%d\n", - copyDevice ? copyDevice->getDeviceNum() : -1, hcMemcpyStr(hcCopyDir), - forceUnpinnedCopy); - if (dstTracked && srcTracked && !forceUnpinnedCopy && - copyDevice /*code below assumes this is !nullptr*/) { - LockedAccessor_StreamCrit_t crit(_criticalData); - - try { - if (HIP_FORCE_SYNC_COPY) { -#if (__hcc_workweek__ >= 19101) - if(!crit->_av.copy2d_ext(src, dst, width, height, srcPitch, dstPitch, hcCopyDir, srcPtrInfo, dstPtrInfo, - ©Device->getDevice()->_acc, - forceUnpinnedCopy)){ - tprintf(DB_COPY,"locked_copy2DASync with HIP_FORCE_SYNC_COPY failed to use SDMA\n"); - retStatus = false; - } -#else - crit->_av.copy2d_ext(src, dst, width, height, srcPitch, dstPitch, hcCopyDir, srcPtrInfo, dstPtrInfo, - ©Device->getDevice()->_acc, - forceUnpinnedCopy); -#endif - - } else { - const auto& future = crit->_av.copy2d_async_ext(src, dst, width, height, srcPitch, dstPitch, hcCopyDir, srcPtrInfo, dstPtrInfo, - ©Device->getDevice()->_acc); - if(!future.valid()) { - tprintf(DB_COPY, "locked_copy2DAsync failed to use SDMA\n"); - retStatus = false; - } - } - } catch (Kalmar::runtime_exception) { - throw ihipException(hipErrorRuntimeOther); - }; - - if (HIP_API_BLOCKING) { - tprintf(DB_SYNC, "%s LAUNCH_BLOCKING for completion of hipMemcpy2DAsync(sz=%zu)\n", - ToString(this).c_str(), sizeBytes); - this->wait(crit); - } - - } else { - //Do sync 2D copy - LockedAccessor_StreamCrit_t crit(_criticalData); -#if (__hcc_workweek__ >= 19101) - if(!crit->_av.copy2d_ext(src, dst, width, height, srcPitch, dstPitch, hcCopyDir, srcPtrInfo, dstPtrInfo, - copyDevice ? ©Device->getDevice()->_acc : nullptr, - forceUnpinnedCopy)){ - tprintf(DB_COPY, "locked_copy2DAsync Sync copy failed to use SDMA\n"); - retStatus = false; - } -#else - crit->_av.copy2d_ext(src, dst, width, height, srcPitch, dstPitch, hcCopyDir, srcPtrInfo, dstPtrInfo, - copyDevice ? ©Device->getDevice()->_acc : nullptr, - forceUnpinnedCopy); -#endif - } - return retStatus; -} - -hipError_t hipProfilerStart() { - HIP_INIT_API(hipProfilerStart); - return ihipLogStatus(hipSuccess); -}; - - -hipError_t hipProfilerStop() { - HIP_INIT_API(hipProfilerStop); - return ihipLogStatus(hipSuccess); -}; - -//// TODO - add identifier numbers for streams and devices to help with debugging. -// TODO - add a contect sequence number for debug. Print operator<< ctx:0.1 (device.ctx) - -namespace hip_impl { - std::unordered_set& get_all_gpuarch() { - static std::unordered_set r{}; - static std::once_flag init; - std::call_once(init, []() { - for (int i=0; i < g_deviceCnt; i++){ - r.insert("hcc-amdgcn-amd-amdhsa--gfx"+std::to_string(g_deviceArray[i]->_props.gcnArch)); - }}); - return r; - } - - std::vector all_hsa_agents() { - std::vector r{}; - std::vector visible_accelerators; - for (int i=0; i < g_deviceCnt; i++) - visible_accelerators.push_back(g_deviceArray[i]->_acc); - for (auto&& acc : visible_accelerators) { - const auto agent = acc.get_hsa_agent(); - - if (!agent || !acc.is_hsa_accelerator()) continue; - - r.emplace_back(*static_cast(agent)); - } - return r; - } - - [[noreturn]] - void hip_throw(const std::exception& ex) { - #if defined(__cpp_exceptions) - if (auto rte = dynamic_cast(&ex)) throw *rte; - if (auto lge = dynamic_cast(&ex)) throw *lge; - throw ex; - #else - std::cerr << ex.what() << std::endl; - std::terminate(); - #endif - } - -} // Namespace hip_impl. diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h deleted file mode 100644 index cf3b7f6d45..0000000000 --- a/src/hip_hcc_internal.h +++ /dev/null @@ -1,1102 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#ifndef HIP_SRC_HIP_HCC_INTERNAL_H -#define HIP_SRC_HIP_HCC_INTERNAL_H - -#include -#include -#include -#include - -#include "hsa/hsa_ext_amd.h" -#include "hip/hip_runtime.h" -#include "hip_prof_api.h" -#include "hip_util.h" -#include "env.h" -#include - -#if (__hcc_workweek__ < 16354) -#error("This version of HIP requires a newer version of HCC."); -#endif - -// Use the __appPtr field in the am memtracker to store the context. -// Requires a bug fix in HCC -#if defined(__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE) and \ - (__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE != 0) -#define USE_APP_PTR_FOR_CTX 1 -#endif - - -#define USE_IPC 1 - -//--- -// Environment variables: - -// Intended to distinguish whether an environment variable should be visible only in debug mode, or -// in debug+release. -// static const int debug = 0; -extern const int release; - -// TODO - this blocks both kernels and memory ops. Perhaps should have separate env var for -// kernels? -extern int HIP_LAUNCH_BLOCKING; -extern int HIP_API_BLOCKING; - -extern int HIP_PRINT_ENV; -// extern int HIP_TRACE_API; -extern int HIP_ATP; -extern int HIP_DB; -extern int HIP_STAGING_SIZE; /* size of staging buffers, in KB */ -extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creation */ -extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */ -extern int HIP_FORCE_P2P_HOST; - -extern int HIP_HOST_COHERENT; - -extern int HIP_HIDDEN_FREE_MEM; -//--- -// Chicken bits for disabling functionality to work around potential issues: -extern int HIP_SYNC_HOST_ALLOC; -extern int HIP_SYNC_STREAM_WAIT; - -extern int HIP_SYNC_NULL_STREAM; -extern int HIP_INIT_ALLOC; -extern int HIP_FORCE_NULL_STREAM; - -extern int HIP_SYNC_FREE; - -extern int HIP_DUMP_CODE_OBJECT; - -// TODO - remove when this is standard behavior. -extern int HCC_OPT_FLUSH; - -#define IMAGE_PITCH_ALIGNMENT 256 -template inline T alignDown(T value, size_t alignment) { - return (T)(value & ~(alignment - 1)); -} - -template inline T* alignDown(T* value, size_t alignment) { - return (T*)alignDown((intptr_t)value, alignment); -} - -template inline T alignUp(T value, size_t alignment) { - return alignDown((T)(value + alignment - 1), alignment); -} - -template inline T* alignUp(T* value, size_t alignment) { - return (T*)alignDown((intptr_t)(value + alignment - 1), alignment); -} - -size_t getNumChannels(hsa_ext_image_channel_order_t channelOrder) { - switch (channelOrder) { - case HSA_EXT_IMAGE_CHANNEL_ORDER_RG: - return 2; - case HSA_EXT_IMAGE_CHANNEL_ORDER_RGB: - return 3; - case HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA: - return 4; - case HSA_EXT_IMAGE_CHANNEL_ORDER_R: - default: - return 1; - } -} - -size_t getElementSize(hsa_ext_image_channel_order_t channelOrder, hsa_ext_image_channel_type_t channelType) { - size_t bytesPerPixel = getNumChannels(channelOrder); - switch (channelType) { - case HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: - case HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8: - break; - - case HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32: - case HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: - case HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT: - bytesPerPixel *= 4; - break; - - default: - bytesPerPixel *= 2; - break; - } - return bytesPerPixel; -} - -// Class to assign a short TID to each new thread, for HIP debugging purposes. -class TidInfo { - public: - TidInfo(); - - int tid() const { return _shortTid; }; - pid_t pid() const { return _pid; }; - uint64_t incApiSeqNum() { return ++_apiSeqNum; }; - uint64_t apiSeqNum() const { return _apiSeqNum; }; - - private: - int _shortTid; - pid_t _pid; - - // monotonically increasing API sequence number for this threa. - uint64_t _apiSeqNum; -}; - -struct ProfTrigger { - static const uint64_t MAX_TRIGGER = std::numeric_limits::max(); - - void print(int tid) { - std::cout << "Enabling tracing for "; - for (auto iter = _profTrigger.begin(); iter != _profTrigger.end(); iter++) { - std::cout << "tid:" << tid << "." << *iter << ","; - } - std::cout << "\n"; - }; - - uint64_t nextTrigger() { return _profTrigger.empty() ? MAX_TRIGGER : _profTrigger.back(); }; - void add(uint64_t trigger) { _profTrigger.push_back(trigger); }; - void sort() { std::sort(_profTrigger.begin(), _profTrigger.end(), std::greater()); }; - - private: - std::vector _profTrigger; -}; - - -//--- -// Extern TLS -// Use a single struct to hold all TLS data. Attempt to reduce TLS accesses. -struct TlsData { - explicit TlsData() { - lastHipError = hipSuccess; - getPrimaryCtx = true; - defaultCtx = nullptr; - } - - hipError_t lastHipError; - TidInfo tidInfo; - // This is the implicit context used by all HIP commands. - // It can be set by hipSetDevice or by the CTX manipulation commands: - ihipCtx_t* defaultCtx; - // Stack of contexts - std::stack ctxStack; - bool getPrimaryCtx; -}; -TlsData* tls_get_ptr(); -#define GET_TLS() TlsData *tls = tls_get_ptr() - -extern std::vector g_dbStartTriggers; -extern std::vector g_dbStopTriggers; - -//--- -// Forward defs: -class ihipStream_t; -class ihipDevice_t; -class ihipCtx_t; -struct ihipEventData_t; - -// Color defs for debug messages: -#define KNRM "\x1B[0m" -#define KRED "\x1B[31m" -#define KGRN "\x1B[32m" -#define KYEL "\x1B[33m" -#define KBLU "\x1B[34m" -#define KMAG "\x1B[35m" -#define KCYN "\x1B[36m" -#define KWHT "\x1B[37m" - -extern const char* API_COLOR; -extern const char* API_COLOR_END; - - -// If set, thread-safety is enforced on all event/stream/ctx/device functions. -// Can disable for performance or functional experiments - in this case -// the code uses a dummy "no-op" mutex. -#define EVENT_THREAD_SAFE 1 - -#define STREAM_THREAD_SAFE 1 - -#define CTX_THREAD_SAFE 1 - -#define DEVICE_THREAD_SAFE 1 - - -// Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set. -// May be set to 0 to remove debug if checks - possible code size and performance difference? -#define COMPILE_HIP_DB 1 - - -// Compile HIP tracing capability. -// 0x1 = print a string at function entry with arguments. -// 0x2 = prints a simple message with function name + return code when function exits. -// 0x3 = print both. -// Must be enabled at runtime with HIP_TRACE_API -#define COMPILE_HIP_TRACE_API 0x3 - -//--- -// HIP Trace modes - use with HIP_TRACE_API=... -#define TRACE_ALL 0 // 0x01 -#define TRACE_KCMD 1 // 0x02, kernel command -#define TRACE_MCMD 2 // 0x04, memory command -#define TRACE_MEM 3 // 0x08, memory allocation or deallocation. -#define TRACE_SYNC 4 // 0x10, synchronization (host or hipStreamWaitEvent) -#define TRACE_QUERY 5 // 0x20, hipEventRecord, hipEventQuery, hipStreamQuery - - -//--- -// HIP_DB Debug flags: -#define DB_API 0 /* 0x01 - shortcut to enable HIP_TRACE_API on single switch */ -#define DB_SYNC 1 /* 0x02 - trace synchronization pieces */ -#define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */ -#define DB_COPY 3 /* 0x08 - trace memory copy and peer commands. . */ -#define DB_WARN 4 /* 0x10 - warn about sub-optimal or shady behavior */ -#define DB_FB 5 /* 0x20 - trace loading fat binary */ -#define DB_MAX_FLAG 6 -// When adding a new debug flag, also add to the char name table below. -// -// - -struct DbName { - const char* _color; - const char* _shortName; -}; - -// This table must be kept in-sync with the defines above. -static const DbName dbName[] = { - {KGRN, "api"}, // not used, - {KYEL, "sync"}, {KCYN, "mem"}, {KMAG, "copy"}, {KRED, "warn"}, - {KBLU, "fatbin"}, -}; - - -#if COMPILE_HIP_DB -#define tprintf(trace_level, ...) \ - { \ - if (HIP_DB & (1 << (trace_level))) { \ - GET_TLS(); \ - char msgStr[1000]; \ - snprintf(msgStr, sizeof(msgStr), __VA_ARGS__); \ - fprintf(stderr, " %ship-%s pid:%d tid:%d:%s%s", dbName[trace_level]._color, \ - dbName[trace_level]._shortName, tls->tidInfo.pid(), tls->tidInfo.tid(), msgStr, KNRM); \ - } \ - } -#else -/* Compile to empty code */ -#define tprintf(trace_level, ...) -#endif - - -static inline uint64_t getTicks() { return hc::get_system_ticks(); } - -//--- -extern uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::string& apiStr); - -#if (COMPILE_HIP_TRACE_API & 0x1) -#define API_TRACE(forceTrace, ...) \ - GET_TLS(); \ - uint64_t hipApiStartTick = 0; \ - { \ - tls->tidInfo.incApiSeqNum(); \ - if (forceTrace || \ - (COMPILE_HIP_DB && (HIP_TRACE_API & (1 << TRACE_ALL)))) { \ - std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')'; \ - std::string fullStr; \ - hipApiStartTick = recordApiTrace(tls, &fullStr, apiStr); \ - } \ - } - -#else -// Swallow API_TRACE -#define API_TRACE(IS_CMD, ...) GET_TLS(); tls->tidInfo.incApiSeqNum(); -#endif - -#define ihipGetTlsDefaultCtx() iihipGetTlsDefaultCtx(tls) -#define ihipSetTlsDefaultCtx(ctx) tls->defaultCtx = ctx - -#define HIP_SET_DEVICE() ihipDeviceSetState(tls); - -// This macro should be called at the beginning of every HIP API. -// It initializes the hip runtime (exactly once), and -// generates a trace string that can be output to stderr or to ATP file. -#define HIP_INIT_API(cid, ...) \ - hip_impl::hip_init(); \ - API_TRACE(0, __VA_ARGS__); \ - HIP_CB_SPAWNER_OBJECT(cid); - - -// Like above, but will trace with a specified "special" bit. -// Replace HIP_INIT_API with this call inside HIP APIs that launch work on the GPU: -// kernel launches, copy commands, memory sets, etc. -#define HIP_INIT_SPECIAL_API(cid, tbit, ...) \ - hip_impl::hip_init(); \ - API_TRACE((HIP_TRACE_API & (1 << tbit)), __VA_ARGS__); \ - HIP_CB_SPAWNER_OBJECT(cid); - - -// This macro should be called at the end of every HIP API, and only at the end of top-level hip -// APIS (not internal hip) It has dual function: logs the last error returned for use by -// hipGetLastError, and also prints the closing message when the debug trace is enabled. -#define ihipLogStatus(hipStatus) \ - ({ \ - hipError_t localHipStatus = hipStatus; /*local copy so hipStatus only evaluated once*/ \ - tls->lastHipError = localHipStatus; \ - \ - if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API & (1 << TRACE_ALL)) { \ - auto ticks = getTicks() - hipApiStartTick; \ - fprintf(stderr, " %ship-api pid:%d tid:%d.%lu %-30s ret=%2d (%s)>> +%lu ns%s\n", \ - (localHipStatus == 0) ? API_COLOR : KRED, tls->tidInfo.pid(), tls->tidInfo.tid(), \ - tls->tidInfo.apiSeqNum(), __func__, localHipStatus, \ - ihipErrorString(localHipStatus), ticks, API_COLOR_END); \ - } \ - localHipStatus; \ - }) - - -class ihipException : public std::exception { - public: - explicit ihipException(hipError_t e) : _code(e){}; - - hipError_t _code; -}; - - -#ifdef __cplusplus -extern "C" { -#endif - - -#ifdef __cplusplus -} -#endif - -const hipStream_t hipStreamNull = 0x0; - - -/** - * HIP IPC Mem Handle Size - */ -#define HIP_IPC_MEM_RESERVED_SIZE 24 -class ihipIpcMemHandle_t { - public: -#if USE_IPC - hsa_amd_ipc_memory_t ipc_handle; ///< ipc memory handle on ROCr -#endif - size_t psize; - char reserved[HIP_IPC_MEM_RESERVED_SIZE]; -}; - -/** - * HIP IPC Event Handle Size - */ -#define HIP_IPC_EVENT_RESERVED_SIZE 32 -class ihipIpcEventHandle_t { - public: -#if USE_IPC - char shmem_name[HIP_IPC_HANDLE_SIZE]; -#endif -}; - -struct ihipModule_t { - std::string fileName; - hsa_executable_t executable = {}; - hsa_code_object_reader_t coReader = {}; - std::string hash; - std::unordered_map< - std::string, std::vector>> kernargs; - - ~ihipModule_t() { - if (executable.handle) hsa_executable_destroy(executable); - if (coReader.handle) hsa_code_object_reader_destroy(coReader); - } -}; - - -//--- -// Used to remove lock, for performance or stimulating bugs. -class FakeMutex { - public: - void lock() {} - bool try_lock() { return true; } - void unlock() {} -}; - -#if EVENT_THREAD_SAFE -typedef std::mutex EventMutex; -#else -#warning "Stream thread-safe disabled" -typedef FakeMutex EventMutex; -#endif - -#if STREAM_THREAD_SAFE -typedef std::mutex StreamMutex; -#else -#warning "Stream thread-safe disabled" -typedef FakeMutex StreamMutex; -#endif - -// Pair Device and Ctx together, these could also be toggled separately if desired. -#if CTX_THREAD_SAFE -typedef std::mutex CtxMutex; -#else -typedef FakeMutex CtxMutex; -#warning "Ctx thread-safe disabled" -#endif - -#if DEVICE_THREAD_SAFE -typedef std::mutex DeviceMutex; -#else -typedef FakeMutex DeviceMutex; -#warning "Device thread-safe disabled" -#endif - -// -//--- -// Protects access to the member _data with a lock acquired on contruction/destruction. -// T must contain a _mutex field which meets the BasicLockable requirements (lock/unlock) -template -class LockedAccessor { - public: - LockedAccessor(T& criticalData, bool autoUnlock = true) - : _criticalData(&criticalData), - _autoUnlock(autoUnlock) - - { - tprintf(DB_SYNC, "locking criticalData=%p for %s..\n", _criticalData, - ToString(_criticalData->_parent).c_str()); - _criticalData->_mutex.lock(); - }; - - ~LockedAccessor() { - if (_autoUnlock) { - tprintf(DB_SYNC, "auto-unlocking criticalData=%p for %s...\n", _criticalData, - ToString(_criticalData->_parent).c_str()); - _criticalData->_mutex.unlock(); - } - } - - void unlock() { - tprintf(DB_SYNC, "unlocking criticalData=%p for %s...\n", _criticalData, - ToString(_criticalData->_parent).c_str()); - _criticalData->_mutex.unlock(); - } - - // Syntactic sugar so -> can be used to get the underlying type. - T* operator->() { return _criticalData; }; - - private: - T* _criticalData; - bool _autoUnlock; -}; - - -template -struct LockedBase { - // Experts-only interface for explicit locking. - // Most uses should use the lock-accessor. - void lock() { _mutex.lock(); } - void unlock() { _mutex.unlock(); } - bool try_lock() { return _mutex.try_lock(); } - - MUTEX_TYPE _mutex; -}; - - -template -class ihipStreamCriticalBase_t : public LockedBase { -public: - ihipStreamCriticalBase_t(ihipStream_t* parentStream, hc::accelerator_view av) - : _parent{parentStream}, _av{av}, _last_op_was_a_copy{false} - {} - - ~ihipStreamCriticalBase_t() {} - - ihipStreamCriticalBase_t* mlock() { - LockedBase::lock(); - return this; - }; - - void munlock() { - tprintf(DB_SYNC, "munlocking criticalData=%p for %s...\n", this, - ToString(this->_parent).c_str()); - LockedBase::unlock(); - }; - - ihipStreamCriticalBase_t* mtry_lock() { - bool gotLock = LockedBase::try_lock(); - tprintf(DB_SYNC, "mtry_locking=%d criticalData=%p for %s...\n", gotLock, this, - ToString(this->_parent).c_str()); - return gotLock ? this : nullptr; - }; - - ihipStream_t* _parent; - hc::accelerator_view _av; - bool _last_op_was_a_copy; -}; - - -// if HIP code needs to acquire locks for both ihipCtx_t and ihipStream_t, it should first acquire -// the lock for the ihipCtx_t and then for the individual streams. The locks should not be acquired -// in reverse order or deadlock may occur. In some cases, it may be possible to reduce the range -// where the locks must be held. HIP routines should avoid acquiring and releasing the same lock -// during the execution of a single HIP API. Another option is to use try_lock in the innermost lock -// query. - - -typedef ihipStreamCriticalBase_t ihipStreamCritical_t; -typedef LockedAccessor LockedAccessor_StreamCrit_t; - -// do not change these two structs without changing the device library -struct mg_sync { - uint w0; - uint w1; -}; - -struct mg_info { - struct mg_sync *mgs; - uint grid_id; - uint num_grids; - ulong prev_sum; - ulong all_sum; -}; - -//--- -// Internal stream structure. -class ihipStream_t { - public: - enum ScheduleMode { Auto, Spin, Yield }; - typedef uint64_t SeqNum_t; - - // TODOD -make av a reference to avoid shared_ptr overhead? - ihipStream_t(ihipCtx_t* ctx, hc::accelerator_view av, unsigned int flags); - ~ihipStream_t(); - - // kind is hipMemcpyKind - void locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind, - bool resolveOn = true); - - bool locked_copy2DSync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind, - bool resolveOn = true); - - void locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind); - - bool locked_copy2DAsync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind); - - void lockedSymbolCopySync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes, - size_t offset, unsigned kind); - void lockedSymbolCopyAsync(hc::accelerator& acc, void* dst, void* src, size_t sizeBytes, - size_t offset, unsigned kind); - - //--- - // Member functions that begin with locked_ are thread-safe accessors - these acquire / release - // the critical mutex. - LockedAccessor_StreamCrit_t lockopen_preKernelCommand(); - void lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av, bool unlockNotNeeded = 0); - - void locked_wait(bool& waited); - void locked_wait(); - - hc::accelerator_view* locked_getAv() { - LockedAccessor_StreamCrit_t crit(_criticalData); - return &(crit->_av); - }; - - void locked_streamWaitEvent(ihipEventData_t& event); - hc::completion_future locked_recordEvent(hipEvent_t event); - - ihipStreamCritical_t& criticalData() { return _criticalData; }; - - //--- - hc::hcWaitMode waitMode() const; - - // Use this if we already have the stream critical data mutex: - void wait(LockedAccessor_StreamCrit_t& crit); - - void launchModuleKernel(hc::accelerator_view av, hsa_signal_t signal, uint32_t blockDimX, - uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX, - uint32_t gridDimY, uint32_t gridDimZ, uint32_t groupSegmentSize, - uint32_t sharedMemBytes, void* kernarg, size_t kernSize, - uint64_t kernel); - - - //-- Non-racy accessors: - // These functions access fields set at initialization time and are non-racy (so do not acquire - // mutex) - const ihipDevice_t* getDevice() const; - ihipCtx_t* getCtx() const; - - // Before calling this function, stream must be resolved from "0" to the actual stream: - bool isDefaultStream() const { return _id == 0; }; - - std::vector coopMemsTracker; - - public: - //--- - // Public member vars - these are set at initialization and never change: - SeqNum_t _id; // monotonic sequence ID. 0 is the default stream. - unsigned _flags; - - - private: - // The unsigned return is hipMemcpyKind - unsigned resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem); - void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo* dstPtrInfo, - const hc::AmPointerInfo* srcPtrInfo, hc::hcCommandKind* hcCopyDir, - ihipCtx_t** copyDevice, bool* forceUnpinnedCopy); - - bool canSeeMemory(const ihipCtx_t* thisCtx, const hc::AmPointerInfo* dstInfo, - const hc::AmPointerInfo* srcInfo); - - void addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes); - - private: // Data - // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t - ihipStreamCritical_t _criticalData; - - std::mutex _hasQueueLock; - - ihipCtx_t* _ctx; // parent context that owns this stream. - - // Friends: - friend std::ostream& operator<<(std::ostream& os, const ihipStream_t& s); - friend hipError_t hipStreamQuery(hipStream_t); - - ScheduleMode _scheduleMode; -}; - - -//---- -// Internal event structure: -enum hipEventStatus_t { - hipEventStatusUnitialized = 0, // event is uninitialized, must be "Created" before use. - hipEventStatusCreated = 1, // event created, but not yet Recorded - hipEventStatusRecording = 2, // event has been recorded into a stream but not completed yet. - hipEventStatusComplete = 3, // event has been recorded - timestamps are valid. -}; - -// TODO - rename to ihip type of some kind -enum ihipEventType_t { - hipEventTypeIndependent, - hipEventTypeStartCommand, - hipEventTypeStopCommand, -}; - -#define IPC_SIGNALS_PER_EVENT 32 -typedef struct ihipIpcEventShmem_s { - std::atomic owners; - std::atomic read_index; - std::atomic write_index; - std::atomic signal[IPC_SIGNALS_PER_EVENT]; -} ihipIpcEventShmem_t; - - -struct ihipEventData_t { - ihipEventData_t() { - _state = hipEventStatusCreated; - _stream = NULL; - _timestamp = 0; - _type = hipEventTypeIndependent; - _ipc_name = ""; - _ipc_fd = 0; - _ipc_shmem = NULL; - }; - - void marker(const hc::completion_future& marker) { _marker = marker; } - hc::completion_future& marker() { return _marker; } - uint64_t timestamp() const { return _timestamp; } - ihipEventType_t type() const { return _type; } - - ihipEventType_t _type; - hipEventStatus_t _state; - hipStream_t _stream; // Stream where the event is recorded. Null stream is resolved to actual - // stream when recorded - uint64_t _timestamp; // store timestamp, may be set on host or by marker. - std::string _ipc_name; - int _ipc_fd; - ihipIpcEventShmem_t *_ipc_shmem; - private: - hc::completion_future _marker; -}; - - -//============================================================================= -// class ihipEventCriticalBase_t -template -class ihipEventCriticalBase_t : LockedBase { - public: - explicit ihipEventCriticalBase_t(const ihipEvent_t* parentEvent) : _parent(parentEvent) {} - ~ihipEventCriticalBase_t() {} - - // Keep data in structure so it can be easily copied into snapshots - // (used to reduce lock contention and preserve correct lock order) - ihipEventData_t _eventData; - - private: - const ihipEvent_t* _parent; - friend class LockedAccessor; -}; - -typedef ihipEventCriticalBase_t ihipEventCritical_t; - -typedef LockedAccessor LockedAccessor_EventCrit_t; - -// internal hip event structure. -class ihipEvent_t { - public: - explicit ihipEvent_t(unsigned flags); - void attachToCompletionFuture(const hc::completion_future* cf, hipStream_t stream, - ihipEventType_t eventType); - - // Return a copy of the critical state. The critical data is locked during the copy. - ihipEventData_t locked_copyCrit() { - LockedAccessor_EventCrit_t crit(_criticalData); - return _criticalData._eventData; - }; - - ihipEventCritical_t& criticalData() { return _criticalData; }; - - public: - unsigned _flags; - int _deviceId; - - private: - ihipEventCritical_t _criticalData; - - friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream); -}; - - -//============================================================================= -// class ihipDeviceCriticalBase_t -template -class ihipDeviceCriticalBase_t : LockedBase { - public: - explicit ihipDeviceCriticalBase_t(ihipDevice_t* parentDevice) - : _parent(parentDevice), _ctxCount(0){}; - - ~ihipDeviceCriticalBase_t() {} - - // Contexts: - void addContext(ihipCtx_t* ctx); - void removeContext(ihipCtx_t* ctx); - std::list& ctxs() { return _ctxs; }; - const std::list& const_ctxs() const { return _ctxs; }; - int getcount() { return _ctxCount; }; - friend class LockedAccessor; - - private: - ihipDevice_t* _parent; - - //--- Context Tracker: - std::list _ctxs; // contexts associated with this device across all threads. - - int _ctxCount; -}; - -typedef ihipDeviceCriticalBase_t ihipDeviceCritical_t; - -typedef LockedAccessor LockedAccessor_DeviceCrit_t; - -//---- -// Properties of the HIP device. -// Multiple contexts can point to same device. -class ihipDevice_t { - public: - ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator& acc); - ~ihipDevice_t(); - - // Accessors: - ihipCtx_t* getPrimaryCtx() const { return _primaryCtx; }; - void locked_removeContext(ihipCtx_t* c); - void locked_reset(); - ihipDeviceCritical_t& criticalData() { return _criticalData; }; - - public: - unsigned _deviceId; // device ID - - hc::accelerator _acc; - hsa_agent_t _hsaAgent; // hsa agent handle - - //! Number of compute units supported by the device: - unsigned _computeUnits; - hipDeviceProp_t _props; // saved device properties. - - // Node id reported by kfd for this device - uint32_t _driver_node_id; - - ihipCtx_t* _primaryCtx; - - int _state; // 1 if device is set otherwise 0 - - private: - hipError_t initProperties(hipDeviceProp_t* prop); - - private: - ihipDeviceCritical_t _criticalData; -}; -//============================================================================= - - -//--- -// -struct ihipExec_t { - dim3 _gridDim; - dim3 _blockDim; - size_t _sharedMem; - hipStream_t _hStream; - std::vector _arguments; -}; - -//============================================================================= -// class ihipCtxCriticalBase_t -template -class ihipCtxCriticalBase_t : LockedBase { - public: - ihipCtxCriticalBase_t(ihipCtx_t* parentCtx, unsigned deviceCnt) - : _parent(parentCtx), _peerCnt(0) { - _peerAgents = new hsa_agent_t[deviceCnt]; - }; - - ~ihipCtxCriticalBase_t() { - if (_peerAgents != nullptr) { - delete _peerAgents; - _peerAgents = nullptr; - } - _peerCnt = 0; - } - - // Streams: - void addStream(ihipStream_t* stream); - std::list& streams() { return _streams; }; - const std::list& const_streams() const { return _streams; }; - - - // Peer Accessor classes: - bool isPeerWatcher(const ihipCtx_t* peer); // returns True if peer has access to memory - // physically located on this device. - bool addPeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer); - bool removePeerWatcher(const ihipCtx_t* thisCtx, ihipCtx_t* peer); - void resetPeerWatchers(ihipCtx_t* thisDevice); - void printPeerWatchers(FILE* f) const; - - uint32_t peerCnt() const { return _peerCnt; }; - hsa_agent_t* peerAgents() const { return _peerAgents; }; - - - // TODO - move private - std::list _peers; // list of enabled peer devices. - //--- Execution stack: - std::stack _execStack; // Execution stack for this device. - - friend class LockedAccessor; - - private: - ihipCtx_t* _parent; - - //--- Stream Tracker: - std::list _streams; // streams associated with this device. - - - //--- Peer Tracker: - // These reflect the currently Enabled set of peers for this GPU: - // Enabled peers have permissions to access the memory physically allocated on this device. - // Note the peers always contain the self agent for easy interfacing with HSA APIs. - uint32_t _peerCnt; // number of enabled peers - hsa_agent_t* _peerAgents; // efficient packed array of enabled agents (to use for allocations.) - private: - void recomputePeerAgents(); -}; -// Note Mutex type Real/Fake selected based on CtxMutex -typedef ihipCtxCriticalBase_t ihipCtxCritical_t; - -// This type is used by functions that need access to the critical device structures. -typedef LockedAccessor LockedAccessor_CtxCrit_t; -//============================================================================= - - -//============================================================================= -// class ihipCtx_t: -// A HIP CTX (context) points at one of the existing devices and contains the streams, -// peer-to-peer mappings, creation flags. Multiple contexts can point to the same -// device. -// -class ihipCtx_t { - public: // Functions: - ihipCtx_t(ihipDevice_t* device, unsigned deviceCnt, - unsigned flags); // note: calls constructor for _criticalData - ~ihipCtx_t(); - - // Functions which read or write the critical data are named locked_. - // (might be better called "locking_" - // ihipCtx_t does not use recursive locks so the ihip implementation must avoid calling a - // locked_ function from within a locked_ function. External functions which call several - // locked_ functions will acquire and release the lock for each function. if this occurs in - // performance-sensitive code we may want to refactor by adding non-locked functions and - // creating a new locked_ member function to call them all. - void locked_removeStream(ihipStream_t* s); - void locked_reset(); - void locked_waitAllStreams(); - void locked_syncDefaultStream(bool waitOnSelf, bool syncHost); - - ihipCtxCritical_t& criticalData() { return _criticalData; }; - - const ihipDevice_t* getDevice() const { return _device; }; - int getDeviceNum() const { return _device->_deviceId; }; - - // TODO - review uses of getWriteableDevice(), can these be converted to getDevice() - ihipDevice_t* getWriteableDevice() const { return _device; }; - - std::string toString() const; - - public: // Data - // The NULL stream is used if no other stream is specified. - // Default stream has special synchronization properties with other streams. - ihipStream_t* _defaultStream; - - // Flags specified when the context is created: - unsigned _ctxFlags; - - private: - ihipDevice_t* _device; - - - private: // Critical data, protected with locked access: - // Members of _protected data MUST be accessed through the LockedAccessor. - // Search for LockedAccessor for examples; do not access _criticalData - // directly. - ihipCtxCritical_t _criticalData; -}; - - -//================================================================================================= -// Global variable definition: -extern unsigned g_deviceCnt; -extern hsa_agent_t g_cpu_agent; // the CPU agent. -extern hsa_agent_t* g_allAgents; // CPU agents + all the visible GPU agents. - -//================================================================================================= -// Extern functions: -extern void ihipInit(); -extern const char* ihipErrorString(hipError_t); -extern hipError_t ihipSynchronize(TlsData *tls); -extern void ihipCtxStackUpdate(); -extern hipError_t ihipDeviceSetState(TlsData *tls); - -extern ihipDevice_t* ihipGetDevice(int); -ihipCtx_t* ihipGetPrimaryCtx(unsigned deviceIndex); -hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod, - const char* name, hsa_agent_t *agent); - - -hipStream_t ihipSyncAndResolveStream(hipStream_t, bool lockAcquired = 0); -hipError_t ihipStreamSynchronize(TlsData *tls, hipStream_t stream); - -/** - * @brief Copies the memory address and size of symbol @p symbolName - * - * @param[in] symbolName - Symbol on device - * @param[out] devPtr - Pointer to a pointer to the memory referred to by the symbol - * @param[out] size - Pointer to the size of the symbol - * @return #hipSuccess, #hipErrorNotInitialized, #hipErrorNotFound, #hipErrorInvalidValue - * - */ -hipError_t ihipGetGlobalVar(hipDeviceptr_t* dev_ptr, size_t* size_ptr, const char* hostVar, - hipModule_t hmod = nullptr); - -// Stream printf functions: -inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s) { - os << "stream:"; - os << s.getDevice()->_deviceId; - ; - os << '.'; - os << s._id; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const dim3& s) { - os << '{'; - os << s.x; - os << ','; - os << s.y; - os << ','; - os << s.z; - os << '}'; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const gl_dim3& s) { - os << '{'; - os << s.x; - os << ','; - os << s.y; - os << ','; - os << s.z; - os << '}'; - return os; -} - -// Stream printf functions: -inline std::ostream& operator<<(std::ostream& os, const hipEvent_t& e) { - os << "event:" << std::hex << static_cast(e); - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) { - os << "ctx:" << static_cast(c) << ".dev:" << c->getDevice()->_deviceId; - return os; -} - - -// Helper functions that are used across src files: -namespace hip_internal { -hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, - hipStream_t stream); - -hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags, bool noSync = 0); - -hipError_t ihipHostFree(TlsData *tls, void* ptr); - -}; - -#define MAX_COOPERATIVE_GPUs 255 - -//--- -// TODO - review the context creation strategy here. Really should be: -// - first "non-device" runtime call creates the context for this thread. Allowed to call -// setDevice first. -// - hipDeviceReset destroys the primary context for device? -// - Then context is created again for next usage. -static inline ihipCtx_t* iihipGetTlsDefaultCtx(TlsData* tls) { - // Per-thread initialization of the TLS: - if ((tls->defaultCtx == nullptr) && (g_deviceCnt > 0)) { - tls->defaultCtx = ihipGetPrimaryCtx(0); - } - return tls->defaultCtx; -} - -/** - * @brief Get device function from host kernel function pointer - * Needed only for clang + HIP-HCC RT - * - * @param [in] hostFunction host kernel function pointer - * - * @returns hipFuntion_t, nullptr - */ -hipFunction_t ihipGetDeviceFunction(const void *hostFunction); - -#endif diff --git a/src/hip_intercept.cpp b/src/hip_intercept.cpp deleted file mode 100644 index 6e8b120360..0000000000 --- a/src/hip_intercept.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "hip/hip_runtime.h" -#include "hip_prof_api.h" - -// HIP API callback/activity - -api_callbacks_table_t callbacks_table; - -extern std::string& FunctionSymbol(const hipFunction_t f); -const char* hipKernelNameRef(const hipFunction_t f) { return FunctionSymbol(f).c_str(); } - -hipError_t hipRegisterApiCallback(uint32_t id, void* fun, void* arg) { - return callbacks_table.set_callback(id, reinterpret_cast(fun), arg) ? - hipSuccess : hipErrorInvalidValue; -} - -hipError_t hipRemoveApiCallback(uint32_t id) { - return callbacks_table.set_callback(id, NULL, NULL) ? hipSuccess : hipErrorInvalidValue; -} - -hipError_t hipRegisterActivityCallback(uint32_t id, void* fun, void* arg) { - return callbacks_table.set_activity(id, reinterpret_cast(fun), arg) ? - hipSuccess : hipErrorInvalidValue; -} - -hipError_t hipRemoveActivityCallback(uint32_t id) { - return callbacks_table.set_activity(id, NULL, NULL) ? hipSuccess : hipErrorInvalidValue; -} - -const char* hipApiName(uint32_t id) { - return hip_api_name(id); -} diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp deleted file mode 100644 index 82ecaea82a..0000000000 --- a/src/hip_memory.cpp +++ /dev/null @@ -1,2560 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include -#include "hsa/hsa.h" -#include "hsa/hsa_ext_amd.h" - -#include "hip/hip_runtime.h" -#include "hip_hcc_internal.h" -#include "trace_helper.h" - -#include -#include - -#if __HIP_ENABLE_DEVICE_MALLOC__ -__device__ char __hip_device_heap[__HIP_SIZE_OF_HEAP]; -__device__ uint32_t __hip_device_page_flag[__HIP_NUM_PAGES]; -#endif - -// Internal HIP APIS: -namespace hip_internal { - -namespace { - inline - const char* hsa_to_string(hsa_status_t err) noexcept - { - const char* r{}; - - if (hsa_status_string(err, &r) == HSA_STATUS_SUCCESS) return r; - - return "Unknown."; - } - - template - inline - void throwing_result_check(hsa_status_t res, const char (&file)[m], - const char (&function)[n], int line) { - if (res == HSA_STATUS_SUCCESS) return; - if (res == HSA_STATUS_INFO_BREAK) return; - - throw std::runtime_error{"Failed in file " + (file + - (", in function \"" + (function + - ("\", on line " + std::to_string(line))))) + - ", with error: " + hsa_to_string(res)}; - } - - inline - hsa_agent_t cpu_agent() { - hsa_agent_t r{}; - throwing_result_check(hsa_iterate_agents([](hsa_agent_t x, void* pr) { - hsa_device_type_t t{}; - hsa_agent_get_info(x, HSA_AGENT_INFO_DEVICE, &t); - - if (t != HSA_DEVICE_TYPE_CPU) return HSA_STATUS_SUCCESS; - - *static_cast(pr) = x; - - return HSA_STATUS_INFO_BREAK; - }, &r), __FILE__, __func__, __LINE__); - - return r; - } - - inline - hsa_device_type_t type(hsa_agent_t x) - { - hsa_device_type_t r{}; - throwing_result_check(hsa_agent_get_info(x, HSA_AGENT_INFO_DEVICE, &r), - __FILE__, __func__, __LINE__); - - return r; - } - - const auto is_large_BAR{[](){ - std::unique_ptr hsa{ - (hsa_init() == HSA_STATUS_SUCCESS) - ? reinterpret_cast(UINT64_MAX) : nullptr, - [](void* p) { if (p) hsa_shut_down(); }}; - - if (!hsa) return false; - - bool r{true}; - - throwing_result_check(hsa_iterate_agents([](hsa_agent_t x, void* pr) { - if (x.handle == cpu_agent().handle) return HSA_STATUS_SUCCESS; - - throwing_result_check( - hsa_agent_iterate_regions(x, [](hsa_region_t y, void* p) { - hsa_region_segment_t seg{}; - throwing_result_check( - hsa_region_get_info(y, HSA_REGION_INFO_SEGMENT, &seg), - __FILE__, __func__, __LINE__); - - if (seg != HSA_REGION_SEGMENT_GLOBAL) { - return HSA_STATUS_SUCCESS; - } - - uint32_t flags{}; - throwing_result_check(hsa_region_get_info( - y, HSA_REGION_INFO_GLOBAL_FLAGS, &flags), - __FILE__, __func__, __LINE__); - - if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) { - hsa_amd_memory_pool_access_t tmp{}; - throwing_result_check( - hsa_amd_agent_memory_pool_get_info( - cpu_agent(), - hsa_amd_memory_pool_t{y.handle}, - HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, - &tmp), - __FILE__, __func__, __LINE__); - - *static_cast(p) &= - tmp != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; - } - - return HSA_STATUS_SUCCESS; - }, pr), __FILE__, __func__, __LINE__); - - return HSA_STATUS_SUCCESS; - }, &r), __FILE__, __func__, __LINE__); - - return r; - }()}; - - constexpr std::uint32_t is_cpu_owned{UINT32_MAX}; - - inline - hsa_amd_pointer_info_t info(const void* p) - { - hsa_amd_pointer_info_t r{sizeof(hsa_amd_pointer_info_t)}; - throwing_result_check( - hsa_amd_pointer_info( - const_cast(p), &r, nullptr, nullptr, nullptr), - __FILE__, __func__, __LINE__); - - if (type(r.agentOwner) == HSA_DEVICE_TYPE_CPU) r.size = is_cpu_owned; - - return r; - } - - constexpr size_t staging_sz{4 * 1024 * 1024}; // 2 Pages. - constexpr size_t max_h2d_std_memcpy_sz{8 * 1024}; // 8 KiB. - constexpr size_t max_d2h_std_memcpy_sz{64}; // 1 cacheline. - - thread_local const std::unique_ptr staging_buffer{ - []() { - hsa_region_t r{}; - throwing_result_check(hsa_agent_iterate_regions( - cpu_agent(), [](hsa_region_t x, void *p) { - hsa_region_segment_t seg{}; - throwing_result_check( - hsa_region_get_info(x, HSA_REGION_INFO_SEGMENT, &seg), - __FILE__, __func__, __LINE__); - - if (seg != HSA_REGION_SEGMENT_GLOBAL) return HSA_STATUS_SUCCESS; - - uint32_t flags{}; - throwing_result_check(hsa_region_get_info( - x, HSA_REGION_INFO_GLOBAL_FLAGS, &flags), - __FILE__, __func__, __LINE__); - - if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) { - *static_cast(p) = x; - - return HSA_STATUS_INFO_BREAK; - } - - return HSA_STATUS_SUCCESS; - }, &r), __FILE__, __func__, __LINE__); - - void *tp{}; - throwing_result_check(hsa_memory_allocate(r, staging_sz, &tp), - __FILE__, __func__, __LINE__); - - return tp; - }(), - [](void *ptr) { hsa_memory_free(ptr); }}; - - thread_local hsa_signal_t copy_signal{[]() { - hsa_agent_t cpu{cpu_agent()}; - hsa_signal_t sgn{}; - throwing_result_check(hsa_signal_create(1, 1, &cpu, &sgn), - __FILE__, __func__, __LINE__); - - return sgn; - }()}; -} // Unnamed namespace. - -inline -void do_copy(void* __restrict dst, const void* __restrict src, size_t n, - hsa_agent_t da, hsa_agent_t sa) { - hsa_signal_silent_store_relaxed(copy_signal, 1); - throwing_result_check( - hsa_amd_memory_async_copy(dst, da, src, sa, n, 0, nullptr, copy_signal), - __FILE__, __func__, __LINE__); - - while (hsa_signal_wait_relaxed(copy_signal, HSA_SIGNAL_CONDITION_EQ, 0, - UINT64_MAX, HSA_WAIT_STATE_ACTIVE)); -} - -inline -void do_std_memcpy( - void* __restrict dst, const void* __restrict src, std::size_t n) { - std::memcpy(dst, src, n); - - return std::atomic_thread_fence(std::memory_order_seq_cst); -} - -inline -void d2h_copy(void* __restrict dst, const void* __restrict src, size_t n, - hsa_amd_pointer_info_t si) { - const auto di{info(dst)}; - const auto is_locked{di.type == HSA_EXT_POINTER_TYPE_LOCKED}; - - if (!is_locked && si.size == is_cpu_owned) { - return do_std_memcpy(dst, src, n); - } - if (!is_locked && is_large_BAR && n <= max_d2h_std_memcpy_sz) { - return do_std_memcpy(dst, src, n); - } - if (di.type == HSA_EXT_POINTER_TYPE_HSA) { - return do_copy(dst, src, n, si.agentOwner, si.agentOwner); - } - - if (is_locked) { - dst = static_cast(di.agentBaseAddress) + - (static_cast(dst) - - static_cast(di.hostBaseAddress)); - do_copy(dst, src, n, si.agentOwner, si.agentOwner); - } - else if (n <= staging_sz) { - do_copy(staging_buffer.get(), src, n, si.agentOwner, si.agentOwner); - std::memcpy(dst, staging_buffer.get(), n); - } - else { - std::unique_ptr lck{ - dst, [](void* p) { hsa_amd_memory_unlock(p); }}; - - throwing_result_check(hsa_amd_memory_lock(dst, n, &si.agentOwner, 1, - const_cast(&dst)), - __FILE__, __func__, __LINE__); - - do_copy(dst, src, n, si.agentOwner, si.agentOwner); - } -} - -inline -void h2d_copy(void* __restrict dst, const void* __restrict src, size_t n, - hsa_amd_pointer_info_t di) { - const auto si{info(const_cast(src))}; - const auto is_locked{si.type == HSA_EXT_POINTER_TYPE_LOCKED}; - - if (!is_locked && di.size == is_cpu_owned) { - return do_std_memcpy(dst, src, n); - } - if (!is_locked && is_large_BAR && n <= max_h2d_std_memcpy_sz) { - return do_std_memcpy(dst, src, n); - } - if (si.type == HSA_EXT_POINTER_TYPE_HSA) { - return do_copy(dst, src, n, di.agentOwner, di.agentOwner); - } - - if (is_locked) { - src = static_cast(si.agentBaseAddress) + - (static_cast(src) - - static_cast(si.hostBaseAddress)); - do_copy(dst, src, n, di.agentOwner, di.agentOwner); - } - else if (n <= staging_sz) { - std::memcpy(staging_buffer.get(), src, n); - do_copy(dst, staging_buffer.get(), n, di.agentOwner, di.agentOwner); - } - else { - std::unique_ptr lck{ - const_cast(src), [](void* p) { hsa_amd_memory_unlock(p); }}; - - throwing_result_check(hsa_amd_memory_lock(const_cast(src), n, - &di.agentOwner, 1, - const_cast(&src)), - __FILE__, __func__, __LINE__); - - do_copy(dst, src, n, di.agentOwner, di.agentOwner); - } -} - -inline -void generic_copy(void* __restrict dst, const void* __restrict src, size_t n, - hsa_amd_pointer_info_t di, hsa_amd_pointer_info_t si) { - if (di.size == is_cpu_owned && si.size == is_cpu_owned) { - return do_std_memcpy(dst, src, n); - } - if (di.size == is_cpu_owned) return d2h_copy(dst, src, n, si); - if (si.size == is_cpu_owned) return h2d_copy(dst, src, n, di); - - hsa_status_t res = hsa_amd_agents_allow_access(1u, &si.agentOwner, - nullptr, di.agentBaseAddress); - if (res == HSA_STATUS_SUCCESS){ - return do_copy(dst, src, n, di.agentOwner, si.agentOwner); - } - // If devices do not have access then fallback mechanism will be used - // copy will be slower - throwing_result_check(hsa_memory_copy(dst,src,n), __FILE__, __func__, __LINE__); -} - -inline -void memcpy_impl(void* __restrict dst, const void* __restrict src, size_t n, - hipMemcpyKind k) { - auto si{info(src)}; - auto di{info(dst)}; - - if (!is_large_BAR){ - // Pointer info takes presidence over hipMemcpyKind - // if there is mismatch b/w Memcpy kind and dst/src pointer - // E.g. dst(host pointer),src(device pointer) and hipMemcpyKind set as hipMemcpyHostToDevice - if (di.size == is_cpu_owned && si.size == is_cpu_owned) - k = hipMemcpyHostToHost; - else if (si.size == is_cpu_owned && di.size != is_cpu_owned) - k = hipMemcpyHostToDevice; - else if (di.size == is_cpu_owned && si.size != is_cpu_owned) - k = hipMemcpyDeviceToHost; - else - k = hipMemcpyDeviceToDevice; - } - switch (k) { - case hipMemcpyHostToHost: std::memcpy(dst, src, n); break; - case hipMemcpyHostToDevice: return h2d_copy(dst, src, n, di); - case hipMemcpyDeviceToHost: return d2h_copy(dst, src, n, si); - case hipMemcpyDeviceToDevice: { - hsa_status_t res = hsa_amd_agents_allow_access(1u, &si.agentOwner, - nullptr, di.agentBaseAddress); - if (res == HSA_STATUS_SUCCESS){ - return do_copy(dst, src, n, di.agentOwner, si.agentOwner); - } - - // If devices do not have access then fallback mechanism will be used - // copy will be slower - throwing_result_check(hsa_memory_copy(dst,src,n), __FILE__, __func__, __LINE__); - break; - } - default: return generic_copy(dst, src, n, di, si); - } -} - -hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, - hipMemcpyKind kind, hipStream_t stream) { - if (sizeBytes == 0) return hipSuccess; - if (!dst || !src) return hipErrorInvalidValue; - - try { - stream = ihipSyncAndResolveStream(stream); - - if (!stream) return hipErrorInvalidValue; - - stream->locked_copyAsync(dst, src, sizeBytes, kind); - } - catch (const ihipException& ex) { - return ex._code; - } - catch (const std::exception& ex) { - std::cerr << ex.what() << std::endl; - throw; - } - catch (...) { - return hipErrorUnknown; - } - - return hipSuccess; -} - -hipError_t memcpySync(void* dst, const void* src, size_t sizeBytes, - hipMemcpyKind kind, hipStream_t stream) { - if (sizeBytes == 0) return hipSuccess; - if (!dst || !src) return hipErrorInvalidValue; - - try { - stream = ihipSyncAndResolveStream(stream); - - if (!stream) return hipErrorInvalidValue; - - LockedAccessor_StreamCrit_t cs{stream->criticalData()}; - cs->_av.wait(); - - memcpy_impl(dst, src, sizeBytes, kind); - cs->_last_op_was_a_copy = true; - } - catch (const ihipException& ex) { - return ex._code; - } - catch (const std::exception& ex) { - std::cerr << ex.what() << std::endl; - throw; - } - catch (...) { - return hipErrorUnknown; - } - - return hipSuccess; -} - -// return 0 on success or -1 on error: -int sharePtr(void* ptr, ihipCtx_t* ctx, bool shareWithAll, unsigned hipFlags) { - int ret = 0; - - auto device = ctx->getWriteableDevice(); - - if (shareWithAll) { - // shareWithAll memory is not mapped to any device - hc::am_memtracker_update(ptr, -1, hipFlags); - hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt + 1, g_allAgents, NULL, ptr); - tprintf(DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); - if (s != HSA_STATUS_SUCCESS) { - ret = -1; - } - } else { -#if USE_APP_PTR_FOR_CTX - hc::am_memtracker_update(ptr, device->_deviceId, hipFlags, ctx); -#else - hc::am_memtracker_update(ptr, device->_deviceId, hipFlags); -#endif - int peerCnt = 0; - { - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - // the peerCnt always stores self so make sure the trace actually - peerCnt = crit->peerCnt(); - tprintf(DB_MEM, " allow access to %d other peer(s)\n", peerCnt - 1); - if (peerCnt > 1) { - // printf ("peer self access\n"); - - // TODOD - remove me: - for (auto iter = crit->_peers.begin(); iter != crit->_peers.end(); iter++) { - tprintf(DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), - (iter == crit->_peers.begin()) ? " (self)" : ""); - }; - - hsa_status_t s = - hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr); - if (s != HSA_STATUS_SUCCESS) { - ret = -1; - } - } - } - } - - return ret; -} - - -// Allocate a new pointer with am_alloc and share with all valid peers. -// Returns null-ptr if a memory error occurs (either allocation or sharing) -void* allocAndSharePtr(const char* msg, size_t sizeBytes, ihipCtx_t* ctx, bool shareWithAll, - unsigned amFlags, unsigned hipFlags, size_t alignment) { - void* ptr = nullptr; - - auto device = ctx->getWriteableDevice(); - -#if (__hcc_workweek__ >= 17332) - if (alignment != 0) { - ptr = hc::am_aligned_alloc(sizeBytes, device->_acc, amFlags, alignment); - } else -#endif - { - ptr = hc::am_alloc(sizeBytes, device->_acc, amFlags); - } - tprintf(DB_MEM, " alloc %s ptr:%p-%p size:%zu on dev:%d\n", msg, ptr, - static_cast(ptr) + sizeBytes, sizeBytes, device->_deviceId); - - if (HIP_INIT_ALLOC != -1) { - // TODO , dont' call HIP API directly here: - hipMemset(ptr, HIP_INIT_ALLOC, sizeBytes); - } - - if (ptr != nullptr) { - int r = sharePtr(ptr, ctx, shareWithAll, hipFlags); - if (r != 0) { - ptr = nullptr; - } - } - - return ptr; -} - -hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags, bool noSync) { - hipError_t hip_status = hipSuccess; - - if (sizeBytes == 0) { - return hipSuccess; - } - - if (HIP_SYNC_HOST_ALLOC && !noSync) { - hipDeviceSynchronize(); - } - - auto ctx = ihipGetTlsDefaultCtx(); - if ((ctx == nullptr) || (ptr == nullptr)) { - hip_status = hipErrorInvalidValue; - } else { - unsigned trueFlags = flags; - if (flags == hipHostMallocDefault) { - // HCC/ROCM provide a modern system with unified memory and should set both of these - // flags by default: - trueFlags = hipHostMallocMapped | hipHostMallocPortable; - } - - - const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | - hipHostMallocWriteCombined | hipHostMallocCoherent | - hipHostMallocNonCoherent; - - - const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent; - - if ((flags & ~supportedFlags) || ((flags & coherencyFlags) == coherencyFlags)) { - *ptr = nullptr; - // can't specify unsupported flags, can't specify both Coherent + NonCoherent - hip_status = hipErrorInvalidValue; - } else { - auto device = ctx->getWriteableDevice(); -#if (__hcc_workweek__ >= 19115) - //Avoid mapping host pinned memory to all devices by HCC - unsigned amFlags = amHostUnmapped; -#else - unsigned amFlags = 0; -#endif - if (flags & hipHostMallocCoherent) { - amFlags |= amHostCoherent; - } else if (flags & hipHostMallocNonCoherent) { - amFlags |= amHostNonCoherent; - } else { - // depends on env variables: - amFlags |= HIP_HOST_COHERENT ? amHostCoherent : amHostNonCoherent; - } - - - *ptr = hip_internal::allocAndSharePtr( - (amFlags & amHostCoherent) ? "finegrained_host" : "pinned_host", sizeBytes, ctx, - true /*shareWithAll*/, amFlags, flags, 0); - - if (sizeBytes && (*ptr == NULL)) { - hip_status = hipErrorOutOfMemory; - } - } - } - - if (HIP_SYNC_HOST_ALLOC && !noSync) { - hipDeviceSynchronize(); - } - return hip_status; -} - -hipError_t ihipHostFree(TlsData *tls, void* ptr) { - - // Synchronize to ensure all work has finished. - ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits - // for all activity to finish. - - hipError_t hipStatus = hipErrorInvalidValue; - if (ptr) { - hc::accelerator acc; -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); -#endif - am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); - if (status == AM_SUCCESS) { - if (amPointerInfo._hostPointer == ptr) { - hc::am_free(ptr); - hipStatus = hipSuccess; - } - } - } else { - // free NULL pointer succeeds and is common technique to initialize runtime - hipStatus = hipSuccess; - } - - return hipStatus; -} - - -} // end namespace hip_internal - -//------------------------------------------------------------------------------------------------- -//------------------------------------------------------------------------------------------------- -// Memory -// -// -// -// HIP uses several "app*" fields HC memory tracker to track state necessary for the HIP API. -//_appId : DeviceID. For device mem, this is device where the memory is physically allocated. -// For host or registered mem, this is the current device when the memory is allocated or -// registered. This device will have a GPUVM mapping for the host mem. -// -//_appAllocationFlags : These are flags provided by the user when allocation is performed. They are -//returned to user in hipHostGetFlags and other APIs. -// TODO - add more info here when available. -// -hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void* ptr) { - HIP_INIT_API(hipPointerGetAttributes, attributes, ptr); - - hipError_t e = hipSuccess; - if ((attributes == nullptr) || (ptr == nullptr)) { - e = hipErrorInvalidValue; - } else { - hc::accelerator acc; -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); -#endif - am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); - if (status == AM_SUCCESS) { - attributes->memoryType = - amPointerInfo._isInDeviceMem ? hipMemoryTypeDevice : hipMemoryTypeHost; - attributes->hostPointer = amPointerInfo._hostPointer; - attributes->devicePointer = amPointerInfo._devicePointer; - attributes->isManaged = 0; - if (attributes->memoryType == hipMemoryTypeHost) { - attributes->hostPointer = (void*)ptr; - } - if (attributes->memoryType == hipMemoryTypeDevice) { - attributes->devicePointer = (void*)ptr; - } - attributes->allocationFlags = amPointerInfo._appAllocationFlags; - attributes->device = amPointerInfo._appId; - - if (attributes->device < -1) { - e = hipErrorInvalidDevice; - } - } else { - attributes->memoryType = hipMemoryTypeDevice; - attributes->hostPointer = 0; - attributes->devicePointer = 0; - attributes->device = -2; - attributes->isManaged = 0; - attributes->allocationFlags = 0; - - e = hipErrorInvalidValue; - } - } - return ihipLogStatus(e); -} - - -hipError_t hipHostGetDevicePointer(void** devicePointer, void* hostPointer, unsigned flags) { - HIP_INIT_API(hipHostGetDevicePointer, devicePointer, hostPointer, flags); - - hipError_t e = hipSuccess; - - // Flags must be 0: - if ((flags != 0) || (devicePointer == nullptr) || (hostPointer == nullptr)) { - e = hipErrorInvalidValue; - } else { - hc::accelerator acc; - *devicePointer = NULL; -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); -#endif - am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, hostPointer); - if (status == AM_SUCCESS) { - *devicePointer = - static_cast(amPointerInfo._devicePointer) + - (static_cast(hostPointer) - static_cast(amPointerInfo._hostPointer)); - tprintf(DB_MEM, " host_ptr=%p returned device_pointer=%p\n", hostPointer, - *devicePointer); - } else { - e = hipErrorOutOfMemory; - } - } - return ihipLogStatus(e); -} - - -hipError_t hipMalloc(void** ptr, size_t sizeBytes) { - HIP_INIT_SPECIAL_API(hipMalloc, (TRACE_MEM), ptr, sizeBytes); - HIP_SET_DEVICE(); - hipError_t hip_status = hipSuccess; - - if (sizeBytes == 0) { - if (ptr) *ptr = NULL; - return ihipLogStatus(hipSuccess); - } - - auto ctx = ihipGetTlsDefaultCtx(); - // return NULL pointer when malloc size is 0 - if ( nullptr == ctx || nullptr == ptr) { - hip_status = hipErrorInvalidValue; - } else { - auto device = ctx->getWriteableDevice(); - *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, false /*shareWithAll*/, - 0 /*amFlags*/, 0 /*hipFlags*/, 0); - - if (sizeBytes && (*ptr == NULL)) { - hip_status = hipErrorOutOfMemory; - } - } - - - return ihipLogStatus(hip_status); -} - -hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flags) { - HIP_INIT_SPECIAL_API(hipExtMallocWithFlags, (TRACE_MEM), ptr, sizeBytes, flags); - HIP_SET_DEVICE(); - -#if (__hcc_workweek__ >= 19115) - if (sizeBytes == 0) { - if (ptr) *ptr = NULL; - return ihipLogStatus(hipSuccess); - } - - hipError_t hip_status = hipSuccess; - - auto ctx = ihipGetTlsDefaultCtx(); - if ((ctx == nullptr) || (ptr == nullptr)) { - hip_status = hipErrorInvalidValue; - } else { - unsigned amFlags = 0; - if (flags & hipDeviceMallocFinegrained) { - amFlags = amDeviceFinegrained; - } else if (flags != hipDeviceMallocDefault) { - hip_status = hipErrorInvalidValue; - return ihipLogStatus(hip_status); - } - auto device = ctx->getWriteableDevice(); - *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, false /*shareWithAll*/, - amFlags /*amFlags*/, 0 /*hipFlags*/, 0); - - if (sizeBytes && (*ptr == NULL)) { - hip_status = hipErrorOutOfMemory; - } - } -#else - hipError_t hip_status = hipErrorOutOfMemory; -#endif - - return ihipLogStatus(hip_status); -} - - -hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { - HIP_INIT_SPECIAL_API(hipHostMalloc, (TRACE_MEM), ptr, sizeBytes, flags); - HIP_SET_DEVICE(); - if (sizeBytes == 0) { - return ihipLogStatus(hipSuccess); - } - hipError_t hip_status = hipSuccess; - hip_status = hip_internal::ihipHostMalloc(tls, ptr, sizeBytes, flags); - return ihipLogStatus(hip_status); -} - -hipError_t hipMallocManaged(void** devPtr, size_t size, unsigned int flags) { - HIP_INIT_SPECIAL_API(hipMallocManaged, (TRACE_MEM), devPtr, size, flags); - HIP_SET_DEVICE(); - if (size == 0) { - return ihipLogStatus(hipSuccess); - } - hipError_t hip_status = hipSuccess; - if(flags != hipMemAttachGlobal) - hip_status = hipErrorInvalidValue; - else - hip_status = hip_internal::ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault); - return ihipLogStatus(hip_status); -} - -// Deprecated function: -hipError_t hipMallocHost(void** ptr, size_t sizeBytes) { return hipHostMalloc(ptr, sizeBytes, 0); } - -// Deprecated function: -hipError_t hipMemAllocHost(void** ptr, size_t sizeBytes) { return hipHostMalloc(ptr, sizeBytes, 0); } - -// Deprecated function: -hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags) { - return hipHostMalloc(ptr, sizeBytes, flags); -}; - -hipError_t allocImage(TlsData* tls,hsa_ext_image_geometry_t geometry, int width, int height, int depth, hsa_ext_image_channel_order_t channelOrder, hsa_ext_image_channel_type_t channelType,void ** ptr, hsa_ext_image_data_info_t &imageInfo, int array_size __dparm(0)) { - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx) { - hc::accelerator acc = ctx->getDevice()->_acc; - hsa_agent_t* agent = static_cast(acc.get_hsa_agent()); - if (!agent) - return hipErrorInvalidHandle; - size_t allocGranularity = 0; - hsa_amd_memory_pool_t* allocRegion = static_cast(acc.get_hsa_am_region()); - hsa_amd_memory_pool_get_info(*allocRegion, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &allocGranularity); - - size_t rowPitch = getElementSize(channelOrder, channelType) * alignUp(width, IMAGE_PITCH_ALIGNMENT); - if(HSA_EXT_IMAGE_GEOMETRY_2DA == geometry) - imageInfo.size = rowPitch * (height == 0 ? 1 : height) * (array_size == 0 ? 1 : array_size) ; - else - imageInfo.size = rowPitch * (height == 0 ? 1 : height) * (depth == 0 ? 1 : depth) ; - - imageInfo.alignment = IMAGE_PITCH_ALIGNMENT; - size_t alignment = imageInfo.alignment <= allocGranularity ? 0 : imageInfo.alignment; - const unsigned am_flags = 0; - *ptr = hip_internal::allocAndSharePtr("device_array", imageInfo.size, ctx, - false /*shareWithAll*/, am_flags, 0, alignment); - if (*ptr == NULL) { - return hipErrorOutOfMemory; - } - return hipSuccess; - } - else { - return hipErrorOutOfMemory; - } -} - -// width in bytes -hipError_t ihipMallocPitch(TlsData* tls, void** ptr, size_t* pitch, size_t width, size_t height, size_t depth) { - hipError_t hip_status = hipSuccess; - if(ptr==NULL || pitch == NULL){ - return hipErrorInvalidValue; - } - hsa_ext_image_data_info_t imageInfo; - if (depth == 0) - hip_status = allocImage(tls,HSA_EXT_IMAGE_GEOMETRY_2D,width,height,0,HSA_EXT_IMAGE_CHANNEL_ORDER_R, - HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32,ptr,imageInfo); - else - hip_status = allocImage(tls,HSA_EXT_IMAGE_GEOMETRY_3D,width,height,depth,HSA_EXT_IMAGE_CHANNEL_ORDER_R, - HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32,ptr,imageInfo); - - if(hip_status == hipSuccess) - *pitch = imageInfo.size/(height == 0 ? 1 : height)/(depth == 0 ? 1 : depth); - - return hip_status; -} - -// width in bytes -hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height) { - HIP_INIT_SPECIAL_API(hipMallocPitch, (TRACE_MEM), ptr, pitch, width, height); - HIP_SET_DEVICE(); - hipError_t hip_status = hipSuccess; - - if (width == 0 || height == 0) return ihipLogStatus(hipErrorUnknown); - - hip_status = ihipMallocPitch(tls, ptr, pitch, width, height, 0); - return ihipLogStatus(hip_status); -} - -hipError_t hipMemAllocPitch(hipDeviceptr_t* dptr, size_t* pitch, size_t widthInBytes, size_t height, unsigned int elementSizeBytes){ - HIP_INIT_SPECIAL_API(hipMemAllocPitch, (TRACE_MEM), dptr, pitch, widthInBytes, height,elementSizeBytes); - HIP_SET_DEVICE(); - - if (widthInBytes == 0 || height == 0) return ihipLogStatus(hipErrorInvalidValue); - - return ihipLogStatus(ihipMallocPitch(tls, dptr, pitch, widthInBytes, height, 0)); -} - -hipError_t hipMalloc3D(hipPitchedPtr* pitchedDevPtr, hipExtent extent) { - HIP_INIT_API(hipMalloc3D, pitchedDevPtr, &extent); - HIP_SET_DEVICE(); - hipError_t hip_status = hipSuccess; - - if (extent.width == 0 || extent.height == 0) return ihipLogStatus(hipErrorUnknown); - if (!pitchedDevPtr) return ihipLogStatus(hipErrorInvalidValue); - void* ptr; - size_t pitch; - - hip_status = - ihipMallocPitch(tls, &pitchedDevPtr->ptr, &pitch, extent.width, extent.height, extent.depth); - if (hip_status == hipSuccess) { - pitchedDevPtr->pitch = pitch; - pitchedDevPtr->xsize = extent.width; - pitchedDevPtr->ysize = extent.height; - } - return ihipLogStatus(hip_status); -} - -hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f) { - hipChannelFormatDesc cd; - cd.x = x; - cd.y = y; - cd.z = z; - cd.w = w; - cd.f = f; - return cd; -} - -extern void getChannelOrderAndType(const hipChannelFormatDesc& desc, - enum hipTextureReadMode readMode, - hsa_ext_image_channel_order_t* channelOrder, - hsa_ext_image_channel_type_t* channelType); - -hipError_t GetImageInfo(hsa_ext_image_geometry_t geometry,int width, int height, int depth, hipChannelFormatDesc desc, hsa_ext_image_data_info_t &imageInfo,int array_size __dparm(0)) -{ - hsa_ext_image_channel_order_t channelOrder; - hsa_ext_image_channel_type_t channelType; - getChannelOrderAndType(desc, hipReadModeElementType, &channelOrder, &channelType); - - size_t rowPitch = getElementSize(channelOrder, channelType) * alignUp(width, IMAGE_PITCH_ALIGNMENT); - if(HSA_EXT_IMAGE_GEOMETRY_2DA == geometry) - imageInfo.size = rowPitch * (height == 0 ? 1 : height) * (array_size == 0 ? 1 : array_size); - else - imageInfo.size = rowPitch * (height == 0 ? 1 : height) * (depth == 0 ? 1 : depth); - imageInfo.alignment = IMAGE_PITCH_ALIGNMENT; - return hipSuccess; -} - -hipError_t GetImageInfo(hsa_ext_image_geometry_t geometry,size_t width, size_t height, size_t depth, hsa_ext_image_channel_order_t channelOrder, hsa_ext_image_channel_type_t channelType, hsa_ext_image_data_info_t &imageInfo,size_t array_size __dparm(0)) -{ - - size_t rowPitch = getElementSize(channelOrder, channelType) * alignUp(width, IMAGE_PITCH_ALIGNMENT); - - if(HSA_EXT_IMAGE_GEOMETRY_2DA == geometry) - imageInfo.size = rowPitch * (height == 0 ? 1 : height) * (array_size == 0 ? 1 : array_size); - else - imageInfo.size = rowPitch * (height == 0 ? 1 : height) * (depth == 0 ? 1 : depth); - - imageInfo.alignment = IMAGE_PITCH_ALIGNMENT; - - return hipSuccess; -} - -hipError_t ihipArrayToImageFormat(hipArray_Format format,hsa_ext_image_channel_type_t &channelType) { - switch (format) { - case HIP_AD_FORMAT_UNSIGNED_INT8: - channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8; - break; - case HIP_AD_FORMAT_UNSIGNED_INT16: - channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16; - break; - case HIP_AD_FORMAT_UNSIGNED_INT32: - channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32; - break; - case HIP_AD_FORMAT_SIGNED_INT8: - channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8; - break; - case HIP_AD_FORMAT_SIGNED_INT16: - channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16; - break; - case HIP_AD_FORMAT_SIGNED_INT32: - channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32; - break; - case HIP_AD_FORMAT_HALF: - channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT; - break; - case HIP_AD_FORMAT_FLOAT: - channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT; - break; - default: - return hipErrorUnknown; - break; - } - return hipSuccess; -} - -hipError_t hipArrayCreate(hipArray** array, const HIP_ARRAY_DESCRIPTOR* pAllocateArray) { - HIP_INIT_SPECIAL_API(hipArrayCreate, (TRACE_MEM), array, pAllocateArray); - HIP_SET_DEVICE(); - hipError_t hip_status = hipSuccess; - if (pAllocateArray->Width > 0) { - *array = (hipArray*)malloc(sizeof(hipArray)); - array[0]->width = pAllocateArray->Width; - array[0]->height = pAllocateArray->Height; - array[0]->Format = pAllocateArray->Format; - array[0]->NumChannels = pAllocateArray->NumChannels; - array[0]->isDrv = true; - array[0]->textureType = hipTextureType2D; - void** ptr = &array[0]->data; - hsa_ext_image_channel_type_t channelType; - hsa_ext_image_channel_order_t channelOrder; - - hip_status = ihipArrayToImageFormat(pAllocateArray->Format,channelType); - if(hipSuccess != hip_status) - return ihipLogStatus(hip_status); - - if (pAllocateArray->NumChannels == 4) { - channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA; - } else if (pAllocateArray->NumChannels == 2) { - channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RG; - } else if (pAllocateArray->NumChannels == 1) { - channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_R; - } - hsa_ext_image_data_info_t imageInfo; - return ihipLogStatus(allocImage(tls,HSA_EXT_IMAGE_GEOMETRY_2D,pAllocateArray->Width, - pAllocateArray->Height,0,channelOrder,channelType,ptr,imageInfo)); - } else { - return ihipLogStatus(hipErrorInvalidValue); - } -} - -hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc, size_t width, - size_t height, unsigned int flags) { - HIP_INIT_SPECIAL_API(hipMallocArray, (TRACE_MEM), array, desc, width, height, flags); - HIP_SET_DEVICE(); - hipError_t hip_status = hipSuccess; - if (width > 0) { - *array = (hipArray*)malloc(sizeof(hipArray)); - array[0]->type = flags; - array[0]->width = width; - array[0]->height = height; - array[0]->depth = 1; - array[0]->desc = *desc; - array[0]->isDrv = false; - array[0]->textureType = hipTextureType2D; - void** ptr = &array[0]->data; - - hsa_ext_image_channel_order_t channelOrder; - hsa_ext_image_channel_type_t channelType; - getChannelOrderAndType(*desc, hipReadModeElementType, &channelOrder, &channelType); - hsa_ext_image_data_info_t imageInfo; - switch (flags) { - case hipArrayLayered: - case hipArrayCubemap: - case hipArraySurfaceLoadStore: - case hipArrayTextureGather: - assert(0); - break; - case hipArrayDefault: - default: - hip_status = allocImage(tls,HSA_EXT_IMAGE_GEOMETRY_2D,width,height,0,channelOrder,channelType,ptr,imageInfo); - break; - } - } else { - hip_status = hipErrorInvalidValue; - } - return ihipLogStatus(hip_status); -} - -hipError_t hipArray3DCreate(hipArray** array, const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray) { - HIP_INIT_SPECIAL_API(hipArray3DCreate, (TRACE_MEM), array, pAllocateArray); - hipError_t hip_status = hipSuccess; - - *array = (hipArray*)malloc(sizeof(hipArray)); - array[0]->type = pAllocateArray->Flags; - array[0]->width = pAllocateArray->Width; - array[0]->height = pAllocateArray->Height; - array[0]->depth = pAllocateArray->Depth; - array[0]->Format = pAllocateArray->Format; - array[0]->NumChannels = pAllocateArray->NumChannels; - array[0]->isDrv = true; - void** ptr = &array[0]->data; - - hsa_ext_image_channel_order_t channelOrder; - if (pAllocateArray->NumChannels == 4) { - channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA; - } else if (pAllocateArray->NumChannels == 2) { - channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RG; - } else if (pAllocateArray->NumChannels == 1) { - channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_R; - } - - hsa_ext_image_channel_type_t channelType; - hip_status = ihipArrayToImageFormat(pAllocateArray->Format,channelType); - hsa_ext_image_data_info_t imageInfo; - switch (pAllocateArray->Flags) { - case hipArrayLayered: - hip_status = allocImage(tls,HSA_EXT_IMAGE_GEOMETRY_2DA,pAllocateArray->Width,pAllocateArray->Height,0, - channelOrder,channelType,ptr,imageInfo,pAllocateArray->Depth); - array[0]->textureType = hipTextureType2DLayered; - break; - case hipArraySurfaceLoadStore: - case hipArrayTextureGather: - assert(0); - break; - case hipArrayDefault: - case hipArrayCubemap: - default: - array[0]->type = hipArrayCubemap; - hip_status = allocImage(tls,HSA_EXT_IMAGE_GEOMETRY_3D,pAllocateArray->Width,pAllocateArray->Height, - pAllocateArray->Depth,channelOrder,channelType,ptr,imageInfo); - array[0]->textureType = hipTextureType3D; - break; - } - - return ihipLogStatus(hip_status); -} - -hipError_t hipMalloc3DArray(hipArray** array, const struct hipChannelFormatDesc* desc, - struct hipExtent extent, unsigned int flags) { - - HIP_INIT_API(hipMalloc3DArray, array, desc, &extent, flags); - HIP_SET_DEVICE(); - hipError_t hip_status = hipSuccess; - - if(array==NULL ){ - return ihipLogStatus(hipErrorInvalidValue); - } - *array = (hipArray*)malloc(sizeof(hipArray)); - array[0]->type = flags; - array[0]->width = extent.width; - array[0]->height = extent.height; - array[0]->depth = extent.depth; - array[0]->desc = *desc; - array[0]->isDrv = false; - void** ptr = &array[0]->data; - hsa_ext_image_channel_order_t channelOrder; - hsa_ext_image_channel_type_t channelType; - getChannelOrderAndType(*desc, hipReadModeElementType, &channelOrder, &channelType); - hsa_ext_image_data_info_t imageInfo; - switch (flags) { - case hipArrayLayered: - hip_status = allocImage(tls,HSA_EXT_IMAGE_GEOMETRY_2DA,extent.width,extent.height,0,channelOrder,channelType,ptr,imageInfo,extent.depth); - array[0]->textureType = hipTextureType2DLayered; - break; - case hipArraySurfaceLoadStore: - case hipArrayTextureGather: - assert(0); - break; - case hipArrayDefault: - case hipArrayCubemap: - default: - array[0]->type = hipArrayCubemap; - hip_status = allocImage(tls,HSA_EXT_IMAGE_GEOMETRY_3D,extent.width,extent.height,extent.depth,channelOrder,channelType,ptr,imageInfo); - array[0]->textureType = hipTextureType3D; - break; - } - return ihipLogStatus(hip_status); -} - -hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr) { - HIP_INIT_API(hipHostGetFlags, flagsPtr, hostPtr); - - hipError_t hip_status = hipSuccess; - - hc::accelerator acc; -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); -#endif - am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, hostPtr); - if (status == AM_SUCCESS) { - *flagsPtr = amPointerInfo._appAllocationFlags; - //0 is valid flag hipHostMallocDefault, and during hipHostMalloc if unsupported flags are passed as parameter it throws error - hip_status = hipSuccess; - tprintf(DB_MEM, " %s: host ptr=%p\n", __func__, hostPtr); - } else { - hip_status = hipErrorInvalidValue; - } - return ihipLogStatus(hip_status); -} - - -// TODO - need to fix several issues here related to P2P access, host memory fallback. -hipError_t hipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags) { - HIP_INIT_API(hipHostRegister, hostPtr, sizeBytes, flags); - - hipError_t hip_status = hipSuccess; - - auto ctx = ihipGetTlsDefaultCtx(); - if (hostPtr == NULL) { - return ihipLogStatus(hipErrorInvalidValue); - } - - hc::accelerator acc; -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); -#endif - am_status_t am_status = hc::am_memtracker_getinfo(&amPointerInfo, hostPtr); - - if (am_status == AM_SUCCESS) { - hip_status = hipErrorHostMemoryAlreadyRegistered; - } else { - auto ctx = ihipGetTlsDefaultCtx(); - if (hostPtr == NULL) { - return ihipLogStatus(hipErrorInvalidValue); - } - // TODO-test : multi-gpu access to registered host memory. - if (ctx) { - if ((flags == hipHostRegisterDefault) || (flags & hipHostRegisterPortable) || - (flags & hipHostRegisterMapped) || (flags == hipExtHostRegisterCoarseGrained)) { - auto device = ctx->getWriteableDevice(); - std::vector vecAcc; - for (int i = 0; i < g_deviceCnt; i++) { - vecAcc.push_back(ihipGetDevice(i)->_acc); - } -#if (__hcc_workweek__ >= 19183) - if(flags & hipExtHostRegisterCoarseGrained) { - am_status = hc::am_memory_host_lock(device->_acc, hostPtr, sizeBytes, &vecAcc[0], - vecAcc.size()); - } else { - am_status = hc::am_memory_host_lock_with_flag(device->_acc, hostPtr, sizeBytes, &vecAcc[0], - vecAcc.size()); - } -#else - am_status = hc::am_memory_host_lock(device->_acc, hostPtr, sizeBytes, &vecAcc[0], - vecAcc.size()); -#endif - if ( am_status == AM_SUCCESS ) { - am_status = hc::am_memtracker_getinfo(&amPointerInfo, hostPtr); - - if ( am_status == AM_SUCCESS ) { - void *devPtr = amPointerInfo._devicePointer; - #if USE_APP_PTR_FOR_CTX - hc::am_memtracker_update(hostPtr, device->_deviceId, flags, ctx); - hc::am_memtracker_update(devPtr, device->_deviceId, flags, ctx); - #else - hc::am_memtracker_update(hostPtr, device->_deviceId, flags); - hc::am_memtracker_update(devPtr, device->_deviceId, flags); - #endif - tprintf(DB_MEM, " %s registered ptr=%p and allowed access to %zu peers\n", __func__, - hostPtr, vecAcc.size()); - }; - }; - if (am_status == AM_SUCCESS) { - hip_status = hipSuccess; - } else { - hip_status = hipErrorOutOfMemory; - } - } else { - hip_status = hipErrorInvalidValue; - } - } - } - return ihipLogStatus(hip_status); -} - -hipError_t hipHostUnregister(void* hostPtr) { - HIP_INIT_API(hipHostUnregister, hostPtr); - auto ctx = ihipGetTlsDefaultCtx(); - hipError_t hip_status = hipSuccess; - if (hostPtr == NULL) { - hip_status = hipErrorInvalidValue; - } else { - auto device = ctx->getWriteableDevice(); - am_status_t am_status = hc::am_memory_host_unlock(device->_acc, hostPtr); - tprintf(DB_MEM, " %s unregistered ptr=%p\n", __func__, hostPtr); - if (am_status != AM_SUCCESS) { - hip_status = hipErrorHostMemoryNotRegistered; - } - } - return ihipLogStatus(hip_status); -} - -namespace hip_impl { -hipError_t hipMemcpyToSymbol(void* dst, const void* src, size_t count, - size_t offset, hipMemcpyKind kind, - const char* symbol_name) { - HIP_INIT_SPECIAL_API(hipMemcpyToSymbol, (TRACE_MCMD), symbol_name, src, - count, offset, kind); - - tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst); - - if (count == 0) return ihipLogStatus(hipSuccess); - if (dst == nullptr) { - return ihipLogStatus(hipErrorInvalidSymbol); - } - - if (kind == hipMemcpyDeviceToHost || kind == hipMemcpyHostToHost) { - return ihipLogStatus(hipErrorInvalidMemcpyDirection); - } else if (kind == hipMemcpyDeviceToDevice) { - return ihipLogStatus(hipErrorInvalidValue); - } - - return ihipLogStatus(hip_internal::memcpySync(static_cast(dst)+offset, src, count, kind, - hipStreamNull)); -} - -hipError_t hipMemcpyFromSymbol(void* dst, const void* src, size_t count, - size_t offset, hipMemcpyKind kind, - const char* symbol_name) { - HIP_INIT_SPECIAL_API(hipMemcpyFromSymbol, (TRACE_MCMD), symbol_name, dst, - count, offset, kind); - - tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst); - - if (count == 0) return ihipLogStatus(hipSuccess); - if (src == nullptr || dst == nullptr) { - return ihipLogStatus(hipErrorInvalidSymbol); - } - - if (kind == hipMemcpyHostToDevice || kind == hipMemcpyHostToHost) { - return ihipLogStatus(hipErrorInvalidMemcpyDirection); - } else if (kind == hipMemcpyDeviceToDevice) { - return ihipLogStatus(hipErrorInvalidValue); - } - - return ihipLogStatus(hip_internal::memcpySync(dst, static_cast(src)+offset, count, kind, - hipStreamNull)); -} - - -hipError_t hipMemcpyToSymbolAsync(void* dst, const void* src, size_t count, - size_t offset, hipMemcpyKind kind, - hipStream_t stream, const char* symbol_name) { - HIP_INIT_SPECIAL_API(hipMemcpyToSymbolAsync, (TRACE_MCMD), symbol_name, src, - count, offset, kind, stream); - - tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, dst); - - if (count == 0) return ihipLogStatus(hipSuccess); - if (dst == nullptr) { - return ihipLogStatus(hipErrorInvalidSymbol); - } - - if (kind == hipMemcpyDeviceToHost || kind == hipMemcpyHostToHost) { - return ihipLogStatus(hipErrorInvalidMemcpyDirection); - } else if (kind == hipMemcpyDeviceToDevice) { - return ihipLogStatus(hipErrorInvalidValue); - } - - hipError_t e = hipSuccess; - if (stream) { - try { - hip_internal::memcpyAsync(static_cast(dst)+offset, src, count, kind, stream); - } catch (ihipException& ex) { - e = ex._code; - } - } else { - e = hipErrorInvalidValue; - } - - return ihipLogStatus(e); -} - -hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* src, size_t count, - size_t offset, hipMemcpyKind kind, - hipStream_t stream, const char* symbol_name) { - HIP_INIT_SPECIAL_API(hipMemcpyFromSymbolAsync, (TRACE_MCMD), symbol_name, - dst, count, offset, kind, stream); - - tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbol_name, src); - - if (count == 0) return ihipLogStatus(hipSuccess); - if (src == nullptr || dst == nullptr) { - return ihipLogStatus(hipErrorInvalidSymbol); - } - - if (kind == hipMemcpyHostToDevice || kind == hipMemcpyHostToHost) { - return ihipLogStatus(hipErrorInvalidMemcpyDirection); - } else if (kind == hipMemcpyDeviceToDevice) { - return ihipLogStatus(hipErrorInvalidValue); - } - - hipError_t e = hipSuccess; - stream = ihipSyncAndResolveStream(stream); - if (stream) { - try { - hip_internal::memcpyAsync(dst, static_cast(src)+offset, count, kind, stream); - } catch (ihipException& ex) { - e = ex._code; - } - } else { - e = hipErrorInvalidValue; - } - - return ihipLogStatus(e); -} -} // Namespace hip_impl. - -//--- -hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { - HIP_INIT_SPECIAL_API(hipMemcpy, (TRACE_MCMD), dst, src, sizeBytes, kind); - - return ihipLogStatus(hip_internal::memcpySync(dst, src, sizeBytes, kind, - hipStreamNull)); -} - -hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes) { - HIP_INIT_SPECIAL_API(hipMemcpyHtoD, (TRACE_MCMD), dst, src, sizeBytes); - - return ihipLogStatus(hip_internal::memcpySync(dst, src, sizeBytes, - hipMemcpyHostToDevice, - hipStreamNull)); -} - -hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes) { - HIP_INIT_SPECIAL_API(hipMemcpyDtoH, (TRACE_MCMD), dst, src, sizeBytes); - - return ihipLogStatus(hip_internal::memcpySync(dst, src, sizeBytes, - hipMemcpyDeviceToHost, - hipStreamNull)); -} - -hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes) { - HIP_INIT_SPECIAL_API(hipMemcpyDtoD, (TRACE_MCMD), dst, src, sizeBytes); - - return ihipLogStatus(hip_internal::memcpySync(dst, src, sizeBytes, - hipMemcpyDeviceToDevice, - hipStreamNull)); -} - -hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes) { - HIP_INIT_SPECIAL_API(hipMemcpyHtoH, (TRACE_MCMD), dst, src, sizeBytes); - - return ihipLogStatus(hip_internal::memcpySync(dst, src, sizeBytes, - hipMemcpyHostToHost, - hipStreamNull)); -} - -hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes, - hipMemcpyKind kind, hipStream_t stream) { - HIP_INIT_SPECIAL_API(hipMemcpyWithStream, (TRACE_MCMD), dst, src, sizeBytes, - kind, stream); - - return ihipLogStatus(hip_internal::memcpySync(dst, src, sizeBytes, kind, - stream)); -} - -hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, - hipStream_t stream) { - HIP_INIT_SPECIAL_API(hipMemcpyAsync, (TRACE_MCMD), dst, src, sizeBytes, kind, stream); - - return ihipLogStatus(hip_internal::memcpyAsync(dst, src, sizeBytes, kind, stream)); -} - -hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, void* src, size_t sizeBytes, hipStream_t stream) { - HIP_INIT_SPECIAL_API(hipMemcpyHtoDAsync, (TRACE_MCMD), dst, src, sizeBytes, stream); - - return ihipLogStatus( - hip_internal::memcpyAsync(dst, src, sizeBytes, hipMemcpyHostToDevice, stream)); -} - -hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes, - hipStream_t stream) { - HIP_INIT_SPECIAL_API(hipMemcpyDtoDAsync, (TRACE_MCMD), dst, src, sizeBytes, stream); - - return ihipLogStatus( - hip_internal::memcpyAsync(dst, src, sizeBytes, hipMemcpyDeviceToDevice, stream)); -} - -hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, hipStream_t stream) { - HIP_INIT_SPECIAL_API(hipMemcpyDtoHAsync, (TRACE_MCMD), dst, src, sizeBytes, stream); - - return ihipLogStatus( - hip_internal::memcpyAsync(dst, src, sizeBytes, hipMemcpyDeviceToHost, stream)); -} - -hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, - size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { - HIP_INIT_SPECIAL_API(hipMemcpy2DToArray, (TRACE_MCMD), dst, wOffset, hOffset, src, spitch, width, height, kind); - - hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); - - hipError_t e = hipSuccess; - - size_t byteSize; - if (dst) { - switch (dst[0].desc.f) { - case hipChannelFormatKindSigned: - byteSize = sizeof(int); - break; - case hipChannelFormatKindUnsigned: - byteSize = sizeof(unsigned int); - break; - case hipChannelFormatKindFloat: - byteSize = sizeof(float); - break; - case hipChannelFormatKindNone: - byteSize = sizeof(size_t); - break; - default: - byteSize = 0; - break; - } - } else { - return ihipLogStatus(hipErrorUnknown); - } - - if ((wOffset + width > (dst->width * byteSize)) || width > spitch) { - return ihipLogStatus(hipErrorUnknown); - } - - size_t src_w = spitch; - size_t dst_w = (dst->width) * byteSize; - - try { - for (int i = 0; i < height; ++i) { - stream->locked_copySync((unsigned char*)dst->data + i * dst_w, - (unsigned char*)src + i * src_w, width, kind); - } - } catch (ihipException& ex) { - e = ex._code; - } - - return ihipLogStatus(e); -} - -hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, - size_t count, hipMemcpyKind kind) { - HIP_INIT_SPECIAL_API(hipMemcpyToArray, (TRACE_MCMD), dst, wOffset, hOffset, src, count, kind); - - hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); - - hipError_t e = hipSuccess; - - try { - stream->locked_copySync((char*)dst->data + wOffset, src, count, kind); - } catch (ihipException& ex) { - e = ex._code; - } - - return ihipLogStatus(e); -} - -hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, size_t hOffset, - size_t count, hipMemcpyKind kind) { - HIP_INIT_SPECIAL_API(hipMemcpyFromArray, (TRACE_MCMD), dst, srcArray, wOffset, hOffset, count, kind); - - hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); - - hipError_t e = hipSuccess; - - try { - stream->locked_copySync((char*)dst, (char*)srcArray->data + wOffset, count, kind); - } catch (ihipException& ex) { - e = ex._code; - } - - return ihipLogStatus(e); -} - -hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost, size_t count) { - HIP_INIT_SPECIAL_API(hipMemcpyHtoA, (TRACE_MCMD), dstArray, dstOffset, srcHost, count); - - hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); - - hipError_t e = hipSuccess; - try { - stream->locked_copySync((char*)dstArray->data + dstOffset, srcHost, count, - hipMemcpyHostToDevice); - } catch (ihipException& ex) { - e = ex._code; - } - - return ihipLogStatus(e); -} - -hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset, size_t count) { - HIP_INIT_SPECIAL_API(hipMemcpyAtoH, (TRACE_MCMD), dst, srcArray, srcOffset, count); - - hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); - - hipError_t e = hipSuccess; - - try { - stream->locked_copySync((char*)dst, (char*)srcArray->data + srcOffset, count, - hipMemcpyDeviceToHost); - } catch (ihipException& ex) { - e = ex._code; - } - - return ihipLogStatus(e); -} - -int getByteSizeFromFormat(const hipChannelFormatDesc& desc){ - int byteSize =0; - switch (desc.f) { - case hipChannelFormatKindUnsigned: - switch (desc.x) { - case 32: - byteSize = sizeof(uint32_t); - break; - case 16: - byteSize = sizeof(uint16_t); - break; - case 8: - byteSize = sizeof(uint8_t); - break; - default: - byteSize = sizeof(uint32_t); - } - break; - case hipChannelFormatKindSigned: - switch (desc.x) { - case 32: - byteSize = sizeof(int32_t); - break; - case 16: - byteSize = sizeof(int16_t); - break; - case 8: - byteSize = sizeof(int8_t); - break; - default: - byteSize = sizeof(int32_t); - } - break; - case hipChannelFormatKindFloat: - switch (desc.x) { - case 32: - byteSize = sizeof(float); - break; - case 16: - byteSize = sizeof(_Float16); - break; - default: - byteSize = sizeof(float); - } - break; - case hipChannelFormatKindNone: - default: - break; - } - return byteSize; -} - -hipError_t ihipMemcpy3D(const struct hipMemcpy3DParms* p, hipStream_t stream, bool isAsync) { - hipError_t e = hipSuccess; - if(p) { - size_t dstByteSize, srcByteSize, copyWidth, copyHeight, copyDepth, widthInBytes, srcPitch, dstPitch, srcYsize, dstYsize; - size_t srcXoffset, srcYoffset, srcZoffset, dstXoffset, dstYoffset, dstZoffset; - size_t srcWidth, srcHeight, srcDepth, dstWidth, dstHeight, dstDepth; - - void* srcPtr, *dstPtr; - bool copyWidthUpdate= false; - copyDepth = p->extent.depth; - copyHeight = p->extent.height; - copyWidth = p->extent.width; // in bytes ? - dstXoffset = p->dstPos.x; - dstYoffset = p->dstPos.y; - dstZoffset = p->dstPos.z; - srcXoffset = p->srcPos.x; - srcYoffset = p->srcPos.y; - srcZoffset = p->srcPos.z; - if (copyWidth == 0) return hipSuccess; - if (p->dstArray != nullptr) { - if ((p->dstArray->isDrv == true) ||( p->dstPtr.ptr!= nullptr)){ - return hipErrorInvalidValue; - } - // Array destination - dstByteSize = getByteSizeFromFormat(p->dstArray->desc); - hipChannelFormatDesc desc; - desc = p->dstArray->desc; - dstPtr = p->dstArray->data; - dstWidth = p->dstArray->width; - dstHeight = p->dstArray->height; - dstDepth = p->dstArray->depth; - dstPitch = dstByteSize * alignUp(dstWidth, IMAGE_PITCH_ALIGNMENT); - if(!copyWidthUpdate) { - copyWidth = copyWidth * dstByteSize; - copyWidthUpdate = true; - } - } else { - //Non Array destination - dstPtr = p->dstPtr.ptr; - dstWidth = p->dstPtr.xsize; - dstHeight = p->dstPtr.ysize; - dstPitch = p->dstPtr.pitch; - } - - if (p->srcArray != nullptr) { - if ((p->srcArray->isDrv == true) ||( p->srcPtr.ptr!= nullptr)){ - return hipErrorInvalidValue; - } - // Array source - srcByteSize = getByteSizeFromFormat(p->srcArray->desc); - hipChannelFormatDesc desc; - desc = p->srcArray->desc; - srcPtr = p->srcArray->data; - srcWidth = p->srcArray->width; - srcHeight = p->srcArray->height; - srcDepth = p->srcArray->depth; - srcPitch = srcByteSize * alignUp(srcWidth, IMAGE_PITCH_ALIGNMENT); - if(!copyWidthUpdate) { - copyWidth = copyWidth * srcByteSize; - copyWidthUpdate = true; - } - } else { - //Non Array source - srcPtr = p->srcPtr.ptr; - srcWidth = p->srcPtr.xsize; - srcHeight = p->srcPtr.ysize; - srcPitch = p->srcPtr.pitch; - } - - stream = ihipSyncAndResolveStream(stream); - try { - if((copyWidth == dstPitch) && (copyWidth == srcPitch)&& (copyHeight == dstHeight) &&(copyHeight == srcHeight)) { - if(isAsync) - stream->locked_copyAsync((void*)dstPtr, (void*)srcPtr, copyWidth*copyHeight*copyDepth, p->kind); - else - stream->locked_copySync((void*)dstPtr, (void*)srcPtr, copyWidth*copyHeight*copyDepth, p->kind, false); - } else { - for (int i = 0; i < copyDepth; i++) { - for (int j = 0; j < copyHeight; j++) { - unsigned char* src = - (unsigned char*)srcPtr + (i + srcZoffset) * srcHeight * srcPitch + (j + srcYoffset) * srcPitch + srcXoffset; - unsigned char* dst = - (unsigned char*)dstPtr + (i + dstZoffset) * dstHeight * dstPitch + (j + dstYoffset) * dstPitch + dstXoffset; - if(isAsync) - stream->locked_copyAsync(dst, src, copyWidth, p->kind); - else - stream->locked_copySync(dst, src, copyWidth, p->kind); - } - } - } - } catch (ihipException ex) { - e = ex._code; - } - } else { - e = hipErrorInvalidValue; - } - return e; -} - -hipError_t hipMemcpy3D(const struct hipMemcpy3DParms* p) { - HIP_INIT_SPECIAL_API(hipMemcpy3D, (TRACE_MCMD), p); - hipError_t e = hipSuccess; - e = ihipMemcpy3D(p, hipStreamNull, false); - return ihipLogStatus(e); -} - -hipError_t hipMemcpy3DAsync(const struct hipMemcpy3DParms* p, hipStream_t stream) { - HIP_INIT_SPECIAL_API(hipMemcpy3DAsync, (TRACE_MCMD), p, stream); - hipError_t e = hipSuccess; - e = ihipMemcpy3D(p, stream, true); - return ihipLogStatus(e); -} - -namespace { -template -__global__ void hip_fill_n(RandomAccessIterator f, N n, T value) { - const auto grid_dim = gridDim.x * blockDim.x * items_per_lane; - const auto gidx = blockIdx.x * block_dim + threadIdx.x; - - size_t idx = gidx * items_per_lane; - while (idx + items_per_lane <= n) { - for (auto i = 0u; i != items_per_lane; ++i) { - __builtin_nontemporal_store(value, &f[idx + i]); - } - idx += grid_dim; - } - - if (gidx < n % grid_dim) { - __builtin_nontemporal_store(value, &f[n - gidx - 1]); - } -} - -template {}>::type* = nullptr> -inline const T& clamp_integer(const T& x, const T& lower, const T& upper) { - assert(!(upper < lower)); - - return std::min(upper, std::max(x, lower)); -} - -template -__global__ void hip_copy2d_n(T* dst, const T* src, size_t width, size_t height, size_t destPitch, size_t srcPitch) { - - size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - size_t idy = blockIdx.y * blockDim.y + threadIdx.y; - size_t floorWidth = (width/sizeof(T)); - T *dstPtr = (T *)((uint8_t*) dst + idy * destPitch); - T *srcPtr = (T *)((uint8_t*) src + idy * srcPitch); - if((idx < floorWidth) && (idy < height)){ - dstPtr[idx] = srcPtr[idx]; - } else if((idx < width) && (idy < height)){ - size_t bytesToCopy = width - (floorWidth * sizeof(T)); - dstPtr += floorWidth; - srcPtr += floorWidth; - __builtin_memcpy(reinterpret_cast(dstPtr), reinterpret_cast(srcPtr),bytesToCopy); - } -} -} // namespace - -//Get the allocated size -hipError_t ihipMemPtrGetInfo(void* ptr, size_t* size) { - hipError_t e = hipSuccess; - if (ptr != nullptr && size != nullptr) { - *size = 0; - hc::accelerator acc; -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); -#endif - am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); - if (status == AM_SUCCESS) { - *size = amPointerInfo._sizeBytes; - } else { - e = hipErrorInvalidValue; - } - } else { - e = hipErrorInvalidValue; - } - return e; -} - -template -void ihipMemsetKernel(hipStream_t stream, T* ptr, T val, size_t count) { - static constexpr uint32_t block_dim = 256; - static constexpr uint32_t max_write_width = 4 * sizeof(std::uint32_t); // 4 DWORDs - static constexpr uint32_t items_per_lane = max_write_width / sizeof(T); - - const uint32_t grid_dim = clamp_integer( - count / (block_dim * items_per_lane), 1, UINT32_MAX); - - hipLaunchKernelGGL(hip_fill_n, dim3(grid_dim), - dim3{block_dim}, 0u, stream, ptr, count, std::move(val)); -} - -template -void ihipMemcpy2dKernel(hipStream_t stream, T* dst, const T* src, size_t width, size_t height, size_t destPitch, size_t srcPitch) { - size_t threadsPerBlock_x = 64; - size_t threadsPerBlock_y = 4; - uint32_t grid_dim_x = clamp_integer( (width+(threadsPerBlock_x*sizeof(T)-1)) / (threadsPerBlock_x*sizeof(T)), 1, UINT32_MAX); - uint32_t grid_dim_y = clamp_integer( (height+(threadsPerBlock_y-1)) / threadsPerBlock_y, 1, UINT32_MAX); - hipLaunchKernelGGL(hip_copy2d_n, dim3(grid_dim_x,grid_dim_y), dim3(threadsPerBlock_x,threadsPerBlock_y), 0u, stream, dst, src, - width, height, destPitch, srcPitch); -} - -typedef enum ihipMemsetDataType { - ihipMemsetDataTypeChar = 0, - ihipMemsetDataTypeShort = 1, - ihipMemsetDataTypeInt = 2 -}ihipMemsetDataType; - -hipError_t ihipMemsetAsync(void* dst, int value, size_t count, hipStream_t stream, enum ihipMemsetDataType copyDataType) { - if (count == 0) return hipSuccess; - if (!dst) return hipErrorInvalidValue; - - try { - if (copyDataType == ihipMemsetDataTypeChar) { - if ((count & 0x3) == 0) { - // use a faster dword-per-workitem copy: - value = value & 0xff; - uint32_t value32 = (value << 24) | (value << 16) | (value << 8) | (value) ; - ihipMemsetKernel (stream, static_cast (dst), value32, count/sizeof(uint32_t)); - } else { - // use a slow byte-per-workitem copy: - ihipMemsetKernel (stream, static_cast (dst), value, count); - } - } else if (copyDataType == ihipMemsetDataTypeInt) { // 4 Bytes value - ihipMemsetKernel (stream, static_cast (dst), value, count); - } else if (copyDataType == ihipMemsetDataTypeShort) { - value = value & 0xffff; - ihipMemsetKernel (stream, static_cast (dst), value, count); - } - } catch (...) { - return hipErrorInvalidValue; - } - - if (HIP_API_BLOCKING) { - tprintf (DB_SYNC, "%s LAUNCH_BLOCKING wait for hipMemsetAsync.\n", ToString(stream).c_str()); - stream->locked_wait(); - } - - return hipSuccess; -} - -namespace { - template - void handleHeadTail(T* dst, std::size_t n_head, std::size_t n_body, - std::size_t n_tail, hipStream_t stream, int value) { - struct Cleaner { - static - __global__ - void clean(T* p, std::size_t nh, std::size_t nb, int x) noexcept { - p[(threadIdx.x < nh) ? threadIdx.x : (threadIdx.x - nh + nb)] = x; - } - }; - - hipLaunchKernelGGL(Cleaner::clean, 1, n_head + n_tail, 0, stream, - dst, n_head, - n_body * sizeof(std::uint32_t) / sizeof(T), value); - - } -} // Anonymous namespace. - -hipError_t ihipMemsetSync(void* dst, int value, size_t count, hipStream_t stream, ihipMemsetDataType copyDataType) { - if (count == 0) return hipSuccess; - if (!dst) return hipErrorInvalidValue; - - try { - size_t n = count; - auto aligned_dst{(copyDataType == ihipMemsetDataTypeInt) ? dst : - reinterpret_cast( - hip_impl::round_up_to_next_multiple_nonnegative( - reinterpret_cast(dst), 4ul))}; - size_t n_head{}; - size_t n_tail{}; - int original_value = value; - - switch (copyDataType) { - case ihipMemsetDataTypeChar: - value &= 0xff; - value = (value << 24) | (value << 16) | (value << 8) | value; - n_head = static_cast(aligned_dst) - - static_cast(dst); - n -= n_head; - n /= sizeof(std::uint32_t); - n_tail = count % sizeof(std::uint32_t); - break; - case ihipMemsetDataTypeShort: - value &= 0xffff; - value = (value << 16) | value; - n_head = static_cast(aligned_dst) - - static_cast(dst); - n = (count - n_head) * - sizeof(std::uint16_t) / sizeof(std::uint32_t); - n_tail = ((count - n_head) * - sizeof(std::uint16_t)) % sizeof(std::uint32_t); - break; - default: break; - } - - // queue the memset kernel for the remainder of the buffer before the HSA call below - if (aligned_dst != dst || n_tail != 0) { - switch (copyDataType) { - case ihipMemsetDataTypeChar: - handleHeadTail(static_cast(dst), n_head, n, - n_tail, stream, value & 0xff); - break; - case ihipMemsetDataTypeShort: - handleHeadTail(static_cast(dst), n_head, n, - n_tail, stream, value & 0xffff); - break; - default: break; - } - } - - // The stream must be locked from all other op insertions to guarantee - // that the following HSA call can complete before any other ops. - // Flush the stream while locked. Once the stream is empty, we can safely perform - // the out-of-band HSA call. Lastly, the stream will unlock via RAII. - if (!stream) stream = ihipSyncAndResolveStream(stream); - if (!stream) return hipErrorInvalidValue; - - LockedAccessor_StreamCrit_t crit(stream->criticalData()); - crit->_av.wait(stream->waitMode()); - const auto s = hsa_amd_memory_fill(aligned_dst, value, n); - if (s != HSA_STATUS_SUCCESS) return hipErrorInvalidValue; - } - catch (...) { - return hipErrorInvalidValue; - } - - if (HIP_API_BLOCKING) { - tprintf (DB_SYNC, "%s LAUNCH_BLOCKING wait for hipMemsetSync.\n", ToString(stream).c_str()); - stream->locked_wait(); - } - - return hipSuccess; -} - -hipError_t getLockedPointer(void *hostPtr, size_t dataLen, void **devicePtrPtr) -{ - hc::accelerator acc; - -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); -#endif - am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, hostPtr); - if (status == AM_SUCCESS) { - *devicePtrPtr = static_cast(amPointerInfo._devicePointer) + - (static_cast(hostPtr) - static_cast(amPointerInfo._hostPointer)); - return(hipSuccess); - }; - return(hipErrorHostMemoryNotRegistered); -} - -// TODO - review and optimize -hipError_t ihipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, - size_t height, hipMemcpyKind kind) { - if (height == 0 || width == 0) return hipSuccess; - if (dst == nullptr || src == nullptr || width > dpitch || width > spitch) return hipErrorInvalidValue; - - hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); - int isLockedOrD2D = 0; - void *pinnedPtr=NULL; - void *actualSrc = (void*)src; - void *actualDest = dst; - if(kind == hipMemcpyHostToDevice ) { - if(getLockedPointer((void*)src, spitch, &pinnedPtr) == hipSuccess ){ - isLockedOrD2D = 1; - actualSrc = pinnedPtr; - } - } else if(kind == hipMemcpyDeviceToHost) { - if(getLockedPointer((void*)dst, dpitch, &pinnedPtr) == hipSuccess ){ - isLockedOrD2D = 1; - actualDest = pinnedPtr; - } - } else if(kind == hipMemcpyDeviceToDevice) { - isLockedOrD2D = 1; - } - - hc::completion_future marker; - - hipError_t e = hipSuccess; - if((width == dpitch) && (width == spitch)) { - stream->locked_copySync((void*)dst, (void*)src, width*height, kind, false); - } else { - try { - if(!isLockedOrD2D) { - for (int i = 0; i < height; ++i) - stream->locked_copySync((unsigned char*)dst + i * dpitch, - (unsigned char*)src + i * spitch, width, kind); - } else { - if(!stream->locked_copy2DSync(dst, src, width, height, spitch, dpitch, kind)){ - ihipMemcpy2dKernel (stream, static_cast (dst), static_cast (src), width, height, dpitch, spitch); - stream->locked_wait(); - } - } - } catch (ihipException& ex) { - e = ex._code; - } - } - - return e; -} - -hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, - size_t height, hipMemcpyKind kind) { - HIP_INIT_SPECIAL_API(hipMemcpy2D, (TRACE_MCMD), dst, dpitch, src, spitch, width, height, kind); - hipError_t e = hipSuccess; - e = ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind); - return ihipLogStatus(e); -} - -hipError_t ihipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, - size_t height, hipMemcpyKind kind, hipStream_t stream) { - if (height == 0 || width == 0) return hipSuccess; - if (dst == nullptr || src == nullptr || width > dpitch || width > spitch) return hipErrorInvalidValue; - hipError_t e = hipSuccess; - int isLockedOrD2D = 0; - void *pinnedPtr=NULL; - void *actualSrc = (void*)src; - void *actualDest = dst; - stream = ihipSyncAndResolveStream(stream); - if(kind == hipMemcpyHostToDevice ) { - if(getLockedPointer((void*)src, spitch, &pinnedPtr) == hipSuccess ){ - isLockedOrD2D = 1; - actualSrc = pinnedPtr; - } - } else if(kind == hipMemcpyDeviceToHost) { - if(getLockedPointer((void*)dst, dpitch, &pinnedPtr) == hipSuccess ){ - isLockedOrD2D = 1; - actualDest = pinnedPtr; - } - } else if(kind == hipMemcpyDeviceToDevice) { - isLockedOrD2D = 1; - } - - if((width == dpitch) && (width == spitch)) { - hip_internal::memcpyAsync(dst, src, width*height, kind, stream); - } else { - try { - if(!isLockedOrD2D){ - for (int i = 0; i < height; ++i) - e = hip_internal::memcpyAsync((unsigned char*)dst + i * dpitch, - (unsigned char*)src + i * spitch, width, kind, stream); - } else{ - if(!stream->locked_copy2DAsync(dst, src, width, height, spitch, dpitch, kind)){ - ihipMemcpy2dKernel (stream, static_cast (dst), static_cast (src), width, height, dpitch, spitch); - } - } - } catch (ihipException& ex) { - e = ex._code; - } - } - - return e; -} - -hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, - size_t height, hipMemcpyKind kind, hipStream_t stream) { - HIP_INIT_SPECIAL_API(hipMemcpy2DAsync, (TRACE_MCMD), dst, dpitch, src, spitch, width, height, kind, stream); - hipError_t e = hipSuccess; - e = ihipMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream); - return ihipLogStatus(e); -} - -hipError_t ihip2dOffsetMemcpy(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, - size_t height, size_t srcXOffsetInBytes, size_t srcYOffset, - size_t dstXOffsetInBytes, size_t dstYOffset,hipMemcpyKind kind, - hipStream_t stream, bool isAsync) { - if (height == 0 || width == 0) return hipSuccess; - if((spitch < width + srcXOffsetInBytes) || (srcYOffset >= height)){ - return hipErrorInvalidValue; - } else if((dpitch < width + dstXOffsetInBytes) || (dstYOffset >= height)){ - return hipErrorInvalidValue; - } - src = (void*)((char*)src+ srcYOffset*spitch + srcXOffsetInBytes); - dst = (void*)((char*)dst+ dstYOffset*dpitch + dstXOffsetInBytes); - if(isAsync){ - return ihipMemcpy2DAsync(dst, dpitch, src, spitch, width, height, hipMemcpyDefault, stream); - } else{ - return ihipMemcpy2D(dst, dpitch, src, spitch, width, height, hipMemcpyDefault); - } -} - -hipError_t ihipMemcpyParam2D(const hip_Memcpy2D* pCopy, hipStream_t stream, bool isAsync) { - if (pCopy == nullptr) { - return hipErrorInvalidValue; - } - if (pCopy->Height == 0 || pCopy->WidthInBytes == 0) return hipSuccess; - void* dst; const void* src; - size_t spitch = pCopy->srcPitch; - size_t dpitch = pCopy->dstPitch; - switch(pCopy->srcMemoryType){ - case hipMemoryTypeHost: - src = pCopy->srcHost; - break; - case hipMemoryTypeArray: - src = pCopy->srcArray->data; - spitch = pCopy->WidthInBytes; - break; - case hipMemoryTypeUnified: - case hipMemoryTypeDevice: - src = pCopy->srcDevice; - break; - default: - return hipErrorInvalidValue; - } - switch(pCopy->dstMemoryType){ - case hipMemoryTypeHost: - dst = pCopy->dstHost; - break; - case hipMemoryTypeArray: - dst = pCopy->dstArray->data; - dpitch = pCopy->WidthInBytes; - break; - case hipMemoryTypeUnified: - case hipMemoryTypeDevice: - dst = pCopy->dstDevice; - break; - default: - return hipErrorInvalidValue; - } - return ihip2dOffsetMemcpy(dst, dpitch, src, spitch, pCopy->WidthInBytes, - pCopy->Height, pCopy->srcXInBytes, pCopy->srcY, - pCopy->dstXInBytes, pCopy->dstY, hipMemcpyDefault, - stream, isAsync); -} - -hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) { - HIP_INIT_SPECIAL_API(hipMemcpyParam2D, (TRACE_MCMD), pCopy); - return ihipLogStatus(ihipMemcpyParam2D(pCopy, hipStreamNull, false)); -} - -hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, hipStream_t stream) { - HIP_INIT_SPECIAL_API(hipMemcpyParam2DAsync, (TRACE_MCMD), pCopy, stream); - return ihipLogStatus(ihipMemcpyParam2D(pCopy, stream, true)); -} - -hipError_t hipMemcpy2DFromArray( void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind ){ - HIP_INIT_SPECIAL_API(hipMemcpy2DFromArray, (TRACE_MCMD), dst, dpitch, src, wOffset, hOffset, width, height, kind); - size_t byteSize; - if(src) { - switch (src->desc.f) { - case hipChannelFormatKindSigned: - byteSize = sizeof(int); - break; - case hipChannelFormatKindUnsigned: - byteSize = sizeof(unsigned int); - break; - case hipChannelFormatKindFloat: - byteSize = sizeof(float); - break; - case hipChannelFormatKindNone: - byteSize = sizeof(size_t); - break; - default: - byteSize = 0; - break; - } - } else { - return ihipLogStatus(hipErrorInvalidValue); - } - return ihipLogStatus(ihip2dOffsetMemcpy(dst, dpitch, src->data, src->width*byteSize, width, height, wOffset, hOffset, 0, 0, kind, hipStreamNull, false)); -} - -hipError_t hipMemcpy2DFromArrayAsync( void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream ){ - HIP_INIT_SPECIAL_API(hipMemcpy2DFromArrayAsync, (TRACE_MCMD), dst, dpitch, src, wOffset, hOffset, width, height, kind, stream); - size_t byteSize; - if (height == 0 || width == 0) return ihipLogStatus(hipSuccess); - if(src) { - switch (src->desc.f) { - case hipChannelFormatKindSigned: - byteSize = sizeof(int); - break; - case hipChannelFormatKindUnsigned: - byteSize = sizeof(unsigned int); - break; - case hipChannelFormatKindFloat: - byteSize = sizeof(float); - break; - case hipChannelFormatKindNone: - byteSize = sizeof(size_t); - break; - default: - byteSize = 0; - break; - } - } else { - return ihipLogStatus(hipErrorInvalidValue); - } - return ihipLogStatus(ihip2dOffsetMemcpy(dst, dpitch, src->data, src->width*byteSize, width, height, wOffset, hOffset, 0, 0, kind, stream, true)); -} - -// TODO-sync: function is async unless target is pinned host memory - then these are fully sync. -hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t stream) { - HIP_INIT_SPECIAL_API(hipMemsetAsync, (TRACE_MCMD), dst, value, sizeBytes, stream); - return ihipLogStatus(ihipMemsetAsync(dst, value, sizeBytes, stream, ihipMemsetDataTypeChar)); -} - -hipError_t hipMemsetD32Async(hipDeviceptr_t dst, int value, size_t count, hipStream_t stream) { - HIP_INIT_SPECIAL_API(hipMemsetD32Async, (TRACE_MCMD), dst, value, count, stream); - return ihipLogStatus(ihipMemsetAsync(dst, value, count, stream, ihipMemsetDataTypeInt)); -} - -hipError_t hipMemset(void* dst, int value, size_t sizeBytes) { - HIP_INIT_SPECIAL_API(hipMemset, (TRACE_MCMD), dst, value, sizeBytes); - return ihipLogStatus(ihipMemsetSync(dst, value, sizeBytes, nullptr, ihipMemsetDataTypeChar)); -} - -hipError_t ihipMemsetND(void* dst, size_t pitch, int value, size_t width, size_t height, size_t setHeight,size_t depth, - hipStream_t stream, enum ihipMemsetDataType copyDataType, bool async) { - size_t sizeBytes =0; - hipError_t hipStatus = hipSuccess; - if ((pitch == width) && (height == setHeight)) { - sizeBytes = pitch * setHeight * depth; - if(async) - return ihipMemsetAsync(dst, value, sizeBytes, stream, copyDataType); - else - return ihipMemsetSync(dst, value, sizeBytes, nullptr, copyDataType); - } else { - for(size_t i = 0; i < depth; ++i) { - for(size_t j = 0; j < setHeight; ++j) { - void* dstPtr = ((unsigned char*) dst + i * height * pitch + j * pitch); - if(async) - hipStatus = ihipMemsetAsync(dstPtr, value, width, stream, copyDataType); - else - hipStatus = ihipMemsetSync(dstPtr, value, width, nullptr, copyDataType); - if (hipStatus != hipSuccess) - return hipStatus; - } - } - } - return hipStatus; -} - -hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t height) { - HIP_INIT_SPECIAL_API(hipMemset2D, (TRACE_MCMD), dst, pitch, value, width, height); - return ihipLogStatus(ihipMemsetND(dst, pitch, value, width, height, height, 1, hipStreamNull, ihipMemsetDataTypeChar, false)); -} - -hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, size_t height, hipStream_t stream ) { - HIP_INIT_SPECIAL_API(hipMemset2DAsync, (TRACE_MCMD), dst, pitch, value, width, height, stream); - return ihipLogStatus(ihipMemsetND(dst, pitch, value, width, height, height, 1, stream, ihipMemsetDataTypeChar, true)); -} - -hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t count) { - HIP_INIT_SPECIAL_API(hipMemsetD8, (TRACE_MCMD), dst, value, count); - return ihipLogStatus(ihipMemsetSync(dst, value, count, nullptr, ihipMemsetDataTypeChar)); -} - -hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char value, size_t count , hipStream_t stream ) { - HIP_INIT_SPECIAL_API(hipMemsetD8Async, (TRACE_MCMD), dst, value, count, stream); - return ihipLogStatus(ihipMemsetAsync(dst, value, count, stream, ihipMemsetDataTypeChar)); -} - -hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t count){ - HIP_INIT_SPECIAL_API(hipMemsetD16, (TRACE_MCMD), dst, value, count); - return ihipLogStatus(ihipMemsetSync(dst, value, count, nullptr, ihipMemsetDataTypeShort)); -} - -hipError_t hipMemsetD16Async(hipDeviceptr_t dst, unsigned short value, size_t count, hipStream_t stream ){ - HIP_INIT_SPECIAL_API(hipMemsetD16Async, (TRACE_MCMD), dst, value, count, stream); - return ihipLogStatus(ihipMemsetAsync(dst, value, count, stream, ihipMemsetDataTypeShort)); -} - -hipError_t hipMemsetD32(hipDeviceptr_t dst, int value, size_t count) { - HIP_INIT_SPECIAL_API(hipMemsetD32, (TRACE_MCMD), dst, value, count); - return ihipLogStatus(ihipMemsetSync(dst, value, count, nullptr, ihipMemsetDataTypeInt)); -} - -hipError_t hipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent) { - HIP_INIT_SPECIAL_API(hipMemset3D, (TRACE_MCMD), &pitchedDevPtr, value, &extent); - return ihipLogStatus(ihipMemsetND(pitchedDevPtr.ptr, pitchedDevPtr.pitch ,value, extent.width, pitchedDevPtr.ysize, extent.height, extent.depth, hipStreamNull, ihipMemsetDataTypeChar, false)); -} - -hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent ,hipStream_t stream ) { - HIP_INIT_SPECIAL_API(hipMemset3DAsync, (TRACE_MCMD), &pitchedDevPtr, value, &extent); - return ihipLogStatus(ihipMemsetND(pitchedDevPtr.ptr,pitchedDevPtr.pitch, value, extent.width, pitchedDevPtr.ysize, extent.height, extent.depth, stream, ihipMemsetDataTypeChar, true)); -} - -hipError_t hipMemGetInfo(size_t* free, size_t* total) { - HIP_INIT_API(hipMemGetInfo, free, total); - - hipError_t e = hipSuccess; - - ihipCtx_t* ctx = ihipGetTlsDefaultCtx(); - if (ctx) { - auto device = ctx->getWriteableDevice(); - if (total) { - *total = device->_props.totalGlobalMem; - } - - if (free) { - if (!device->_driver_node_id) return ihipLogStatus(hipErrorInvalidDevice); - - std::string fileName = std::string("/sys/class/kfd/kfd/topology/nodes/") + std::to_string(device->_driver_node_id) + std::string("/mem_banks/0/used_memory"); - std::ifstream file; - file.open(fileName); - if (!file) return ihipLogStatus(hipErrorFileNotFound); - - std::string deviceSize; - size_t deviceMemSize; - - file >> deviceSize; - file.close(); - if ((deviceMemSize=strtol(deviceSize.c_str(),NULL,10))){ - *free = device->_props.totalGlobalMem - deviceMemSize; - // Deduct the amount of memory from the free memory reported from the system - if (HIP_HIDDEN_FREE_MEM) *free -= (size_t)HIP_HIDDEN_FREE_MEM * 1024 * 1024; - } else { - return ihipLogStatus(hipErrorInvalidValue); - } - } - - } else { - e = hipErrorInvalidDevice; - } - - return ihipLogStatus(e); -} - -hipError_t hipMemPtrGetInfo(void* ptr, size_t* size) { - HIP_INIT_API(hipMemPtrGetInfo, ptr, size); - - return ihipLogStatus(ihipMemPtrGetInfo(ptr, size)); -} - - -hipError_t hipFree(void* ptr) { - HIP_INIT_SPECIAL_API(hipFree, (TRACE_MEM), ptr); - - hipError_t hipStatus = hipErrorInvalidDevicePointer; - - if (ptr) { - hc::accelerator acc; -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); -#endif - am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); - if (status == AM_SUCCESS) { - /*if (amPointerInfo._hostPointer == NULL) */ //TODO: Fix it when there is proper managed memory support - { - if (HIP_SYNC_FREE) { - // Synchronize all devices, all streams - // to ensure all work has finished on all devices. - // This is disabled by default. - for (unsigned i = 0; i < g_deviceCnt; i++) { - ihipGetPrimaryCtx(i)->locked_waitAllStreams(); - } - } - else { - ihipCtx_t* ctx; - if (amPointerInfo._appId != -1) { -#if USE_APP_PTR_FOR_CTX - ctx = static_cast(amPointerInfo._appPtr); -#else - ctx = ihipGetPrimaryCtx(amPointerInfo._appId); -#endif - } else { - ctx = ihipGetTlsDefaultCtx(); - } - // Synchronize to ensure all work has finished on device owning the memory. - ctx->locked_waitAllStreams(); // ignores non-blocking streams, this waits - // for all activity to finish. - } - hc::am_free(ptr); - hipStatus = hipSuccess; - } - } - } else { - // free NULL pointer succeeds and is common technique to initialize runtime - hipStatus = hipSuccess; - } - - return ihipLogStatus(hipStatus); -} - - -hipError_t hipHostFree(void* ptr) { - HIP_INIT_SPECIAL_API(hipHostFree, (TRACE_MEM), ptr); - - hipError_t hipStatus = hipSuccess; - hipStatus = hip_internal::ihipHostFree(tls, ptr); - - return ihipLogStatus(hipStatus); -}; - - -// Deprecated: -hipError_t hipFreeHost(void* ptr) { return hipHostFree(ptr); } - -hipError_t hipFreeArray(hipArray* array) { - HIP_INIT_SPECIAL_API(hipFreeArray, (TRACE_MEM), array); - - hipError_t hipStatus = hipErrorInvalidDevicePointer; - - // Synchronize to ensure all work has finished. - ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits - // for all activity to finish. - - if (array->data) { - hc::accelerator acc; -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); -#endif - am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, array->data); - if (status == AM_SUCCESS) { - if (amPointerInfo._hostPointer == NULL) { - hc::am_free(array->data); - hipStatus = hipSuccess; - } - } - } - - return ihipLogStatus(hipStatus); -} - -hipError_t hipMemGetAddressRange(hipDeviceptr_t* pbase, size_t* psize, hipDeviceptr_t dptr) { - HIP_INIT_API(hipMemGetAddressRange, pbase, psize, dptr); - hipError_t hipStatus = hipSuccess; - hc::accelerator acc; -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); -#endif - am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, dptr); - if (status == AM_SUCCESS) { - *pbase = amPointerInfo._devicePointer; - *psize = amPointerInfo._sizeBytes; - } else - hipStatus = hipErrorInvalidDevicePointer; - return ihipLogStatus(hipStatus); -} - - -// TODO: IPC implementaiton: - -hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr) { - HIP_INIT_API(hipIpcGetMemHandle, handle, devPtr); - hipError_t hipStatus = hipSuccess; - // Get the size of allocated pointer - size_t psize = 0u; - hc::accelerator acc; - if ((handle == NULL) || (devPtr == NULL)) { - hipStatus = hipErrorInvalidHandle; - } else { -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); -#endif - am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, devPtr); - if (status == AM_SUCCESS) { - psize = (size_t)amPointerInfo._sizeBytes; - } else { - hipStatus = hipErrorInvalidHandle; - } - ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*)handle; - // Save the size of the pointer to hipIpcMemHandle - iHandle->psize = psize; - -#if USE_IPC - // Create HSA ipc memory - hsa_status_t hsa_status = - hsa_amd_ipc_memory_create(devPtr, psize, (hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle)); - if (hsa_status != HSA_STATUS_SUCCESS) hipStatus = hipErrorOutOfMemory; -#else - hipStatus = hipErrorRuntimeOther; -#endif - } - return ihipLogStatus(hipStatus); -} - -hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned int flags) { - HIP_INIT_API(hipIpcOpenMemHandle, devPtr, &handle, flags); - hipError_t hipStatus = hipSuccess; - if (devPtr == NULL) - return ihipLogStatus(hipErrorInvalidValue); - -#if USE_IPC - // Get the current device agent. - hc::accelerator acc; - hsa_agent_t* agent = static_cast(acc.get_hsa_agent()); - if (!agent) - return ihipLogStatus(hipErrorInvalidHandle); - - ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*)&handle; - // Attach ipc memory - auto ctx = ihipGetTlsDefaultCtx(); - { - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - auto device = ctx->getWriteableDevice(); - // the peerCnt always stores self so make sure the trace actually - if(hsa_amd_ipc_memory_attach( - (hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, crit->peerCnt(), - crit->peerAgents(), devPtr) != HSA_STATUS_SUCCESS) - return ihipLogStatus(hipErrorRuntimeOther); - - hc::AmPointerInfo ampi(NULL, *devPtr, *devPtr, iHandle->psize, acc, true, true); - am_status_t am_status = hc::am_memtracker_add(*devPtr,ampi); - if (am_status != AM_SUCCESS) - return ihipLogStatus(hipErrorMapFailed); - -#if USE_APP_PTR_FOR_CTX - am_status = hc::am_memtracker_update(*devPtr, device->_deviceId, 0, ctx); -#else - am_status = hc::am_memtracker_update(*devPtr, device->_deviceId, 0); -#endif - if(am_status != AM_SUCCESS) - return ihipLogStatus(hipErrorMapFailed); - } -#else - hipStatus = hipErrorRuntimeOther; -#endif - - return ihipLogStatus(hipStatus); -} - -hipError_t hipIpcCloseMemHandle(void* devPtr) { - HIP_INIT_API(hipIpcCloseMemHandle, devPtr); - hipError_t hipStatus = hipSuccess; - if (devPtr == NULL) - return ihipLogStatus(hipErrorInvalidValue); - -#if USE_IPC - if(hc::am_memtracker_remove(devPtr) != AM_SUCCESS) - return ihipLogStatus(hipErrorInvalidValue); - - if (hsa_amd_ipc_memory_detach(devPtr) != HSA_STATUS_SUCCESS) - return ihipLogStatus(hipErrorInvalidHandle); -#else - hipStatus = hipErrorRuntimeOther; -#endif - - return ihipLogStatus(hipStatus); -} - -// hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle){ -// return hipSuccess; -// } diff --git a/src/hip_module.cpp b/src/hip_module.cpp deleted file mode 100644 index 65077d389a..0000000000 --- a/src/hip_module.cpp +++ /dev/null @@ -1,1757 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "hip/hip_runtime.h" -#include "hip/hcc_detail/elfio/elfio.hpp" -#include "hip/hcc_detail/hsa_helpers.hpp" -#include "hip/hcc_detail/program_state.hpp" -#include "hip_hcc_internal.h" -#include "hip/hip_ext.h" -#include "program_state.inl" -#include "trace_helper.h" -#include "hc_am.hpp" - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "code_object_bundle.inl" -#include "hip_fatbin.h" -// TODO Use Pool APIs from HCC to get memory regions. - -using namespace ELFIO; -using namespace std; - -// For HIP implicit kernargs. -static const size_t HIP_IMPLICIT_KERNARG_SIZE = 56; -static const size_t HIP_IMPLICIT_KERNARG_ALIGNMENT = 8; - -struct amd_kernel_code_v3_t { - uint32_t group_segment_fixed_size; - uint32_t private_segment_fixed_size; - uint8_t reserved0[8]; - int64_t kernel_code_entry_byte_offset; - uint8_t reserved1[24]; - uint32_t compute_pgm_rsrc1; - uint32_t compute_pgm_rsrc2; - uint16_t kernel_code_properties; - uint8_t reserved2[6]; -}; - -// calculate MD5 checksum -inline std::string checksum(size_t size, const char *source) { - // FNV-1a hashing, 64-bit version - const uint64_t FNV_prime = 0x100000001b3; - const uint64_t FNV_basis = 0xcbf29ce484222325; - uint64_t hash = FNV_basis; - - const char *str = static_cast(source); - for (auto i = 0; i < size; ++i) { - hash ^= *str++; - hash *= FNV_prime; - } - return std::to_string(hash); -} - -inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { - assert(Align != 0u && "Align can't be 0."); - Skew %= Align; - return (Value + Align - 1 - Skew) / Align * Align + Skew; -} - - -struct ihipKernArgInfo { - vector Size; - vector Align; - vector ArgType; - vector ArgName; - uint32_t totalSize; -}; - -map kernelArguments; - -struct ihipModuleSymbol_t { - uint64_t _object{}; // The kernel object. - amd_kernel_code_t const* _header{}; - string _name; // TODO - review for performance cost. Name is just used for debug. - vector> _kernarg_layout{}; - bool _is_code_object_v3{}; -}; - -template <> -string ToString(hipFunction_t v) { - std::ostringstream ss; - ss << "0x" << std::hex << v->_object; - return ss.str(); -}; - -const std::string& FunctionSymbol(const hipFunction_t f) { return f->_name; }; - -extern hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, int device); - -#define CHECK_HSA(hsaStatus, hipStatus) \ - if (hsaStatus != HSA_STATUS_SUCCESS) { \ - return hipStatus; \ - } - -#define CHECKLOG_HSA(hsaStatus, hipStatus) \ - if (hsaStatus != HSA_STATUS_SUCCESS) { \ - return ihipLogStatus(hipStatus); \ - } - -hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t globalWorkSizeX, - uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, - uint32_t localWorkSizeX, uint32_t localWorkSizeY, - uint32_t localWorkSizeZ, size_t sharedMemBytes, - hipStream_t hStream, void** kernelParams, void** extra, - hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0, - void** impCoopParams = 0, hc::accelerator_view* coopAV = 0) { - using namespace hip_impl; - - auto ctx = ihipGetTlsDefaultCtx(); - hipError_t ret = hipSuccess; - - if (ctx == nullptr) { - ret = hipErrorInvalidDevice; - - } else { - int deviceId = ctx->getDevice()->_deviceId; - ihipDevice_t* currentDevice = ihipGetDevice(deviceId); - hsa_agent_t gpuAgent = (hsa_agent_t)currentDevice->_hsaAgent; - - std::vector kernargs{}; - if (kernelParams) { - if (extra) return hipErrorInvalidValue; - - for (auto&& x : f->_kernarg_layout) { - const auto p{static_cast(*kernelParams)}; - - kernargs.insert( - kernargs.cend(), - round_up_to_next_multiple_nonnegative( - kernargs.size(), x.second) - kernargs.size(), - '\0'); - kernargs.insert(kernargs.cend(), p, p + x.first); - - ++kernelParams; - } - } else if (extra) { - if (extra[0] == HIP_LAUNCH_PARAM_BUFFER_POINTER && - extra[2] == HIP_LAUNCH_PARAM_BUFFER_SIZE && extra[4] == HIP_LAUNCH_PARAM_END) { - auto args = (char*)extra[1]; - size_t argSize = *(size_t*)(extra[3]); - kernargs.insert(kernargs.end(), args, args+argSize); - } else { - return hipErrorNotInitialized; - } - - } - else if (f->_kernarg_layout.size() != 0) { - return hipErrorInvalidValue; - } - - // Insert 56-bytes at the end for implicit kernel arguments and fill with value zero. - size_t padSize = (~kernargs.size() + 1) & (HIP_IMPLICIT_KERNARG_ALIGNMENT - 1); - kernargs.insert(kernargs.end(), padSize + HIP_IMPLICIT_KERNARG_SIZE, 0); - - if (impCoopParams) { - const auto p{static_cast(*impCoopParams)}; - // The sixth index is for multi-grid synchronization - copy(p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT, - (kernargs.end() - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT); - } - - /* - Kernel argument preparation. - */ - grid_launch_parm lp; - lp.dynamic_group_mem_bytes = - sharedMemBytes; // TODO - this should be part of preLaunchKernel. - hStream = ihipPreLaunchKernel( - hStream, dim3(globalWorkSizeX/localWorkSizeX, globalWorkSizeY/localWorkSizeY, globalWorkSizeZ/localWorkSizeZ), - dim3(localWorkSizeX, localWorkSizeY, localWorkSizeZ), &lp, f->_name.c_str(), isStreamLocked); - - hsa_kernel_dispatch_packet_t aql; - - memset(&aql, 0, sizeof(aql)); - - // aql.completion_signal._handle = 0; - // aql.kernarg_address = 0; - - aql.workgroup_size_x = localWorkSizeX; - aql.workgroup_size_y = localWorkSizeY; - aql.workgroup_size_z = localWorkSizeZ; - aql.grid_size_x = globalWorkSizeX; - aql.grid_size_y = globalWorkSizeY; - aql.grid_size_z = globalWorkSizeZ; - if (f->_is_code_object_v3) { - const auto* header = - reinterpret_cast(f->_header); - aql.group_segment_size = - header->group_segment_fixed_size + sharedMemBytes; - aql.private_segment_size = - header->private_segment_fixed_size; - } else { - aql.group_segment_size = - f->_header->workgroup_group_segment_byte_size + sharedMemBytes; - aql.private_segment_size = - f->_header->workitem_private_segment_byte_size; - } - aql.kernel_object = f->_object; - aql.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - aql.header = - (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE); - if((flags & 0x1)== 0 ) { - //in_order - aql.header |= (1 << HSA_PACKET_HEADER_BARRIER); - } - - aql.header |= lp.launch_fence; - - hc::completion_future cf; - - if (coopAV) { - lp.av = coopAV; - } - - lp.av->dispatch_hsa_kernel(&aql, kernargs.data(), kernargs.size(), - (startEvent || stopEvent) ? &cf : nullptr -#if (__hcc_workweek__ > 17312) - , - f->_name.c_str() -#endif - ); - - - if (startEvent) { - startEvent->attachToCompletionFuture(&cf, hStream, hipEventTypeStartCommand); - } - if (stopEvent) { - stopEvent->attachToCompletionFuture(&cf, hStream, hipEventTypeStopCommand); - } - - ihipPostLaunchKernel(f->_name.c_str(), hStream, lp, isStreamLocked); - - - } - - return ret; -} - -hipError_t hipModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, uint32_t gridDimY, - uint32_t gridDimZ, uint32_t blockDimX, uint32_t blockDimY, - uint32_t blockDimZ, uint32_t sharedMemBytes, hipStream_t hStream, - void** kernelParams, void** extra) { - HIP_INIT_API(hipModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, - hStream, kernelParams, extra); - - size_t globalWorkSizeX = (size_t)gridDimX * (size_t)blockDimX; - size_t globalWorkSizeY = (size_t)gridDimY * (size_t)blockDimY; - size_t globalWorkSizeZ = (size_t)gridDimZ * (size_t)blockDimZ; - if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) - { - return hipErrorInvalidConfiguration; - } - - return ihipLogStatus(ihipModuleLaunchKernel(tls, - f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, - blockDimZ, sharedMemBytes, hStream, kernelParams, extra, nullptr, nullptr, 0)); -} - -hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, - uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, - uint32_t localWorkSizeX, uint32_t localWorkSizeY, - uint32_t localWorkSizeZ, size_t sharedMemBytes, - hipStream_t hStream, void** kernelParams, void** extra, - hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags) { - HIP_INIT_API(hipExtModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, - localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra); - - return ihipLogStatus(ihipModuleLaunchKernel(tls, - f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, - localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags)); -} - -hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, - uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, - uint32_t localWorkSizeX, uint32_t localWorkSizeY, - uint32_t localWorkSizeZ, size_t sharedMemBytes, - hipStream_t hStream, void** kernelParams, void** extra, - hipEvent_t startEvent, hipEvent_t stopEvent) { - HIP_INIT_API(hipHccModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, - localWorkSizeY, localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra); - - return ihipLogStatus(ihipModuleLaunchKernel(tls, - f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, - localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, 0)); -} - -__attribute__((visibility("default"))) -hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, - int numDevices, unsigned int flags, hip_impl::program_state& ps) { - hipError_t result; - - if ((numDevices > g_deviceCnt) || (launchParamsList == nullptr)) { - return hipErrorInvalidValue; - } - - std::vector kds(numDevices,0); - - // prepare all kernel descriptors for each device as all streams will be locked in the next loop - for (int i = 0; i < numDevices; ++i) { - const hipLaunchParams& lp = launchParamsList[i]; - if (lp.stream == nullptr) { - return hipErrorNotInitialized; - } - kds[i] = ps.kernel_descriptor(reinterpret_cast(lp.func), - hip_impl::target_agent(lp.stream)); - - if (kds[i] == nullptr) { - return hipErrorInvalidValue; - } - if (!kds[i]->_kernarg_layout.empty()) continue; - - hip_impl::kernargs_size_align kargs = ps.get_kernargs_size_align( - reinterpret_cast(lp.func)); - kds[i]->_kernarg_layout = *reinterpret_cast>*>( - kargs.getHandle()); - } - - // lock all streams before launching kernels to each device - for (int i = 0; i < numDevices; ++i) { - LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false); - #if (__hcc_workweek__ >= 19213) - streamCrit->_av.acquire_locked_hsa_queue(); - #endif - } - - GET_TLS(); - - size_t globalWorkSizeX = 0, globalWorkSizeY = 0, globalWorkSizeZ = 0; - - // launch kernels for each device - for (int i = 0; i < numDevices; ++i) { - const hipLaunchParams& lp = launchParamsList[i]; - - globalWorkSizeX = (size_t)lp.gridDim.x * (size_t)lp.blockDim.x; - globalWorkSizeY = (size_t)lp.gridDim.y * (size_t)lp.blockDim.y; - globalWorkSizeZ = (size_t)lp.gridDim.z * (size_t)lp.blockDim.z; - - if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) - { - return hipErrorInvalidConfiguration; - } - - result = ihipModuleLaunchKernel(tls, kds[i], - lp.gridDim.x * lp.blockDim.x, - lp.gridDim.y * lp.blockDim.y, - lp.gridDim.z * lp.blockDim.z, - lp.blockDim.x, lp.blockDim.y, - lp.blockDim.z, lp.sharedMem, - lp.stream, lp.args, nullptr, nullptr, nullptr, 0, - true /* stream is already locked above and will be unlocked - in the below code after launching kernels on all devices*/); - } - - // unlock all streams - for (int i = 0; i < numDevices; ++i) { - launchParamsList[i].stream->criticalData().unlock(); - #if (__hcc_workweek__ >= 19213) - launchParamsList[i].stream->criticalData()._av.release_locked_hsa_queue(); - #endif - } - - return result; -} - -__attribute__((visibility("default"))) -hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, - int numDevices, unsigned int flags) { - HIP_INIT_API(hipExtLaunchMultiKernelMultiDevice, launchParamsList, numDevices, flags); - auto& ps = hip_impl::get_program_state(); - return ihipExtLaunchMultiKernelMultiDevice(launchParamsList, numDevices, flags, ps); -} - -void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS) -{ - if (f->_is_code_object_v3) { - const auto header = reinterpret_cast(f->_header); - // GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1 - // the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1) - *usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2; - // GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1 - // the granularity for gfx9+ is 2 * max(0, ceil(sgprs_used / 16) - 1) - *usedSGPRS = ((((header->compute_pgm_rsrc1 & 0x3C0) >> 6) >> 1) + 1) << 4; - *usedLDS = header->group_segment_fixed_size; - } - else { - const auto header = f->_header; - // VGPRs granularity is 4 - *usedVGPRS = ((header->workitem_vgpr_count + 3) >> 2) << 2; - // adding 2 to take into account the 2 VCC registers & handle the granularity of 16 - *usedSGPRS = header->wavefront_sgpr_count + 2; - *usedSGPRS = ((*usedSGPRS + 15) >> 4) << 4; - *usedLDS = header->workgroup_group_segment_byte_size; - } -} - -static hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( - TlsData *tls, int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk) -{ - using namespace hip_impl; - - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx == nullptr) { - return hipErrorInvalidDevice; - } - if (numBlocks == nullptr) { - return hipErrorInvalidValue; - } - - hipDeviceProp_t prop{}; - ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId); - - if (blockSize > prop.maxThreadsPerBlock) { - *numBlocks = 0; - return hipSuccess; - } - - prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024; - - size_t usedVGPRS = 0; - size_t usedSGPRS = 0; - size_t usedLDS = 0; - getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS); - - // Due to SPI and private memory limitations, the max of wavefronts per CU in 32 - size_t wavefrontSize = prop.warpSize; - size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32); - - const size_t simdPerCU = 4; - const size_t maxWavesPerSimd = maxWavefrontsPerCU / simdPerCU; - - size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize; - - size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU); - size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd - : std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS)); - - // Calculate blocks occupancy per CU based on VGPR usage - *numBlocks = vgprs_alu_occupancy / numWavefronts; - - const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800; - size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd - : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS)); - - // Calculate blocks occupancy per CU based on SGPR usage - *numBlocks = std::min(*numBlocks, (int) (sgprs_alu_occupancy / numWavefronts)); - - size_t total_used_lds = usedLDS + dynSharedMemPerBlk; - if (total_used_lds != 0) { - // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds) - size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds; - *numBlocks = std::min(*numBlocks, (int) lds_occupancy); - } - - return hipSuccess; -} - -namespace { -// kernel for initializing GWS -// nwm1 is the total number of work groups minus 1 -__global__ void init_gws(uint nwm1) { - __ockl_gws_init(nwm1, 0); -} -} - -hipError_t ihipLaunchCooperativeKernel(const void* f, dim3 gridDim, - dim3 blockDim, void** kernelParams, unsigned int sharedMemBytes, - hipStream_t stream, hip_impl::program_state& ps) { - -#if (__hcc_workweek__ >= 20115) - hipError_t result; - - - if (f == nullptr || kernelParams == nullptr) { - return hipErrorNotInitialized; - } - - stream = ihipSyncAndResolveStream(stream); - - if (!stream->getDevice()->_props.cooperativeLaunch || - blockDim.x * blockDim.y * blockDim.z > stream->getDevice()->_props.maxThreadsPerBlock) { - return hipErrorInvalidConfiguration; - } - - size_t globalWorkSizeX = (size_t)gridDim.x * (size_t)blockDim.x; - size_t globalWorkSizeY = (size_t)gridDim.y * (size_t)blockDim.y; - size_t globalWorkSizeZ = (size_t)gridDim.z * (size_t)blockDim.z; - if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) { - return hipErrorInvalidConfiguration; - } - - // Prepare the kernel descriptor for initializing the GWS - hipFunction_t gwsKD = ps.kernel_descriptor( - reinterpret_cast(&init_gws), - hip_impl::target_agent(stream)); - - if (gwsKD == nullptr) { - return hipErrorInvalidValue; - } - hip_impl::kernargs_size_align gwsKargs = ps.get_kernargs_size_align( - reinterpret_cast(&init_gws)); - - gwsKD->_kernarg_layout = *reinterpret_cast>*>(gwsKargs.getHandle()); - - // Prepare the kernel descriptor for the main kernel - hipFunction_t kd = ps.kernel_descriptor( - reinterpret_cast(f), - hip_impl::target_agent(stream)); - if (kd == nullptr) { - return hipErrorInvalidValue; - } - hip_impl::kernargs_size_align kargs = - ps.get_kernargs_size_align( - reinterpret_cast(f)); - - kd->_kernarg_layout = *reinterpret_cast>*>(kargs.getHandle()); - - GET_TLS(); - int numBlocksPerSm = 0; - result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kd, - blockDim.x * blockDim.y * blockDim.z, sharedMemBytes); - if (result != hipSuccess) { - return hipErrorLaunchFailure; - } - int maxActiveBlocks = numBlocksPerSm * stream->getDevice()->_props.multiProcessorCount; - - //check to see if the workload fits on the GPU - if (gridDim.x * gridDim.y * gridDim.z > maxActiveBlocks) { - return hipErrorCooperativeLaunchTooLarge; - } - - void *gwsKernelParam[1]; - // calculate total number of work groups minus 1 for the main kernel - uint nwm1 = (gridDim.x * gridDim.y * gridDim.z) - 1; - gwsKernelParam[0] = &nwm1; - - hc::accelerator acc = stream->getDevice()->_acc; - // create a cooperative accelerated view for launching gws and main kernels - hc::accelerator_view coopAV = acc.create_cooperative_view(); - - LockedAccessor_StreamCrit_t streamCrit(stream->criticalData(), false); - - // the cooperative queue will wait until this stream completes its operations - hc::completion_future streamCF; - if (!streamCrit->_av.get_is_empty()) { - streamCF = streamCrit->_av.create_marker(hc::accelerator_scope); - coopAV.create_blocking_marker(streamCF, hc::accelerator_scope); - } - - streamCrit->_av.acquire_locked_hsa_queue(); - coopAV.acquire_locked_hsa_queue(); - - // launch the init_gws kernel to initialize the GWS in the dedicated cooperative queue - result = ihipModuleLaunchKernel(tls, gwsKD, 1, 1, 1, 1, 1, 1, - 0, stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr , &coopAV); - - if (result != hipSuccess) { - stream->criticalData().unlock(); - stream->criticalData()._av.release_locked_hsa_queue(); - coopAV.release_locked_hsa_queue(); - return hipErrorLaunchFailure; - } - - size_t impCoopArg = 1; - void* impCoopParams[1]; - impCoopParams[0] = &impCoopArg; - - // launch the main kernel in the cooperative queue - result = ihipModuleLaunchKernel(tls, kd, - gridDim.x * blockDim.x, - gridDim.y * blockDim.y, - gridDim.z * blockDim.z, - blockDim.x, blockDim.y, blockDim.z, - sharedMemBytes, stream, kernelParams, nullptr, nullptr, - nullptr, 0, true, impCoopParams, &coopAV); - - - coopAV.release_locked_hsa_queue(); - stream->criticalData()._av.release_locked_hsa_queue(); - - // this stream will wait until the cooperative queue completes its operations - hc::completion_future cooperativeCF; - if (!coopAV.get_is_empty()) { - cooperativeCF = coopAV.create_marker(hc::accelerator_scope); - streamCrit->_av.create_blocking_marker(cooperativeCF, hc::accelerator_scope); - } - - stream->criticalData().unlock(); - - return result; -#else - return hipErrorInvalidConfiguration; -#endif - -} - -__attribute__((visibility("default"))) -hipError_t hipLaunchCooperativeKernel(const void* func, dim3 gridDim, - dim3 blockDim, void** args, - size_t sharedMem, hipStream_t stream, - hip_impl::program_state& ps) { - - // Skipping passing in ps, because the logging function does not like it - HIP_INIT_API(hipLaunchCooperativeKernel, func, gridDim, blockDim, args, - sharedMem, stream); - - return ihipLogStatus(ihipLaunchCooperativeKernel(func, gridDim, blockDim, - args, sharedMem, stream, ps)); -} - - -hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, - int numDevices, unsigned int flags, hip_impl::program_state& ps) { - -#if (__hcc_workweek__ >= 20115) - hipError_t result; - - if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) { - return hipErrorInvalidValue; - } - - vector streams; - vector deviceIDs; - // check to see if we have valid distinct streams/devices, if cooperative multi device - // launch is supported and if grid/block dimensions are valid - for (int i = 0; i < numDevices; ++i) { - const hipLaunchParams& lp = launchParamsList[i]; - - if (lp.stream == nullptr){ - return hipErrorInvalidResourceHandle; - } - - if (find(streams.begin(), streams.end(), lp.stream) == streams.end()) { - streams.push_back(lp.stream); - } else { - return hipErrorInvalidDevice; - } - - const ihipDevice_t* currentDevice = lp.stream->getDevice(); - if (find(deviceIDs.begin(), deviceIDs.end(), currentDevice->_deviceId) == deviceIDs.end()) { - deviceIDs.push_back(currentDevice->_deviceId); - } else { - return hipErrorInvalidDevice; - } - - if (!currentDevice->_props.cooperativeMultiDeviceLaunch) { - return hipErrorInvalidConfiguration; - } - - if (lp.gridDim.x == 0 || lp.gridDim.y == 0 || lp.gridDim.z == 0 || - lp.blockDim.x == 0 || lp.blockDim.y == 0 || lp.blockDim.z == 0 || - lp.blockDim.x * lp.blockDim.y * lp.blockDim.z > currentDevice->_props.maxThreadsPerBlock){ - return hipErrorInvalidConfiguration; - } - } - - vector gwsKds; - vector kds; - - GET_TLS(); - // prepare all kernel descriptors for initializing the GWS and the main kernels per device - for (int i = 0; i < numDevices; ++i) { - const hipLaunchParams& lp = launchParamsList[i]; - - gwsKds.push_back(ps.kernel_descriptor(reinterpret_cast(&init_gws), - hip_impl::target_agent(lp.stream))); - if (gwsKds[i] == nullptr) { - return hipErrorInvalidValue; - } - hip_impl::kernargs_size_align gwsKargs = ps.get_kernargs_size_align( - reinterpret_cast(&init_gws)); - gwsKds[i]->_kernarg_layout = *reinterpret_cast>*>( - gwsKargs.getHandle()); - - - kds.push_back(ps.kernel_descriptor(reinterpret_cast(lp.func), - hip_impl::target_agent(lp.stream))); - if (kds[i] == nullptr) { - return hipErrorInvalidValue; - } - hip_impl::kernargs_size_align kargs = ps.get_kernargs_size_align( - reinterpret_cast(lp.func)); - kds[i]->_kernarg_layout = *reinterpret_cast>*>( - kargs.getHandle()); - - int numBlocksPerSm = 0; - result = ihipOccupancyMaxActiveBlocksPerMultiprocessor(tls, &numBlocksPerSm, kds[i], - lp.blockDim.x * lp.blockDim.y * lp.blockDim.z, lp.sharedMem); - if (result != hipSuccess) { - return hipErrorLaunchFailure; - } - int maxActiveBlocks = numBlocksPerSm * lp.stream->getDevice()->_props.multiProcessorCount; - - //check to see if the workload fits on the GPU - if (lp.gridDim.x * lp.gridDim.y * lp.gridDim.z > maxActiveBlocks) { - return hipErrorCooperativeLaunchTooLarge; - } - } - - vector coopAVs; - - // create cooperative accelerated views for launching gws and main kernels on each device - for (int i = 0; i < numDevices; ++i) { - hc::accelerator acc = launchParamsList[i].stream->getDevice()->_acc; - coopAVs.push_back(acc.create_cooperative_view()); - } - - mg_sync *mg_sync_ptr = 0; - vector mg_info_ptr; - - result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault, true); - if (result != hipSuccess) { - return hipErrorInvalidValue; - } - mg_sync_ptr->w0 = 0; - mg_sync_ptr->w1 = 0; - - uint all_sum = 0; - for (int i = 0; i < numDevices; ++i) { - mg_info *mg_info_temp = nullptr; - result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_temp, sizeof(mg_info), hipHostMallocDefault, true); - if (result != hipSuccess) { - hip_internal::ihipHostFree(tls, mg_sync_ptr); - for (int j = 0; j < i; ++j) { - hip_internal::ihipHostFree(tls, mg_info_ptr[j]); - } - return hipErrorInvalidValue; - } - mg_info_ptr.push_back(mg_info_temp); - // calculate the sum of sizes of all grids - const hipLaunchParams& lp = launchParamsList[i]; - all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z * - lp.gridDim.x * lp.gridDim.y * lp.gridDim.z; - } - - // lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device - for (int i = 0; i < numDevices; ++i) { - LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false); - - hc::completion_future streamCF; - if (!streamCrit->_av.get_is_empty()) { - streamCF = streamCrit->_av.create_marker(hc::accelerator_scope); - if (flags & hipCooperativeLaunchMultiDeviceNoPreSync) { - coopAVs[i].create_blocking_marker(streamCF, hc::accelerator_scope); - streamCrit->_av.acquire_locked_hsa_queue(); - coopAVs[i].acquire_locked_hsa_queue(); - } else { - for (int j = 0; j < numDevices; ++j) { - coopAVs[j].create_blocking_marker(streamCF, hc::accelerator_scope); - } - } - } - } - if ((flags & hipCooperativeLaunchMultiDeviceNoPreSync) == 0) { - for (int i = 0; i < numDevices; ++i) { - launchParamsList[i].stream->criticalData()._av.acquire_locked_hsa_queue(); - coopAVs[i].acquire_locked_hsa_queue(); - } - } - - // launch the init_gws kernel to initialize the GWS for each device - for (int i = 0; i < numDevices; ++i) { - const hipLaunchParams& lp = launchParamsList[i]; - - void *gwsKernelParam[1]; - uint nwm1 = (lp.gridDim.x * lp.gridDim.y * lp.gridDim.z) - 1; - gwsKernelParam[0] = &nwm1; - - result = ihipModuleLaunchKernel(tls, gwsKds[i], 1, 1, 1, 1, 1, 1, - 0, lp.stream, gwsKernelParam, nullptr, nullptr, nullptr, 0, true, nullptr, &coopAVs[i]); - - if (result != hipSuccess) { - for (int j = 0; j < numDevices; ++j) { - launchParamsList[j].stream->criticalData().unlock(); - launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue(); - coopAVs[i].release_locked_hsa_queue(); - } - hip_internal::ihipHostFree(tls, mg_sync_ptr); - for (int j = 0; j < numDevices; ++j) { - hip_internal::ihipHostFree(tls, mg_info_ptr[j]); - } - - return hipErrorLaunchFailure; - } - } - - void* impCoopParams[1]; - ulong prev_sum = 0; - - size_t globalWorkSizeX = 0, globalWorkSizeY = 0, globalWorkSizeZ = 0; - // launch the main kernels for each device - for (int i = 0; i < numDevices; ++i) { - const hipLaunchParams& lp = launchParamsList[i]; - - //initialize and setup the implicit kernel argument for multi-grid sync - mg_info_ptr[i]->mgs = mg_sync_ptr; - mg_info_ptr[i]->grid_id = i; - mg_info_ptr[i]->num_grids = numDevices; - mg_info_ptr[i]->all_sum = all_sum; - mg_info_ptr[i]->prev_sum = prev_sum; - prev_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z * - lp.gridDim.x * lp.gridDim.y * lp.gridDim.z; - - lp.stream->coopMemsTracker.push_back(mg_info_ptr[i]); - - impCoopParams[0] = &mg_info_ptr[i]; - - globalWorkSizeX = (size_t)lp.gridDim.x * (size_t)lp.blockDim.x; - globalWorkSizeY = (size_t)lp.gridDim.y * (size_t)lp.blockDim.y; - globalWorkSizeZ = (size_t)lp.gridDim.z * (size_t)lp.blockDim.z; - if(globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || globalWorkSizeZ > UINT32_MAX) { - return hipErrorInvalidConfiguration; - } - - result = ihipModuleLaunchKernel(tls, kds[i], - lp.gridDim.x * lp.blockDim.x, - lp.gridDim.y * lp.blockDim.y, - lp.gridDim.z * lp.blockDim.z, - lp.blockDim.x, lp.blockDim.y, - lp.blockDim.z, lp.sharedMem, - lp.stream, lp.args, nullptr, nullptr, nullptr, 0, - true, impCoopParams, &coopAVs[i]); - - if (result != hipSuccess) { - for (int j = 0; j < numDevices; ++j) { - launchParamsList[j].stream->criticalData().unlock(); - launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue(); - coopAVs[i].release_locked_hsa_queue(); - } - hip_internal::ihipHostFree(tls, mg_sync_ptr); - for (int j = 0; j < numDevices; ++j) { - hip_internal::ihipHostFree(tls, mg_info_ptr[j]); - launchParamsList[j].stream->coopMemsTracker.pop_back(); - } - - return hipErrorLaunchFailure; - } - - } - - // unlock streams and create blocking markers on them based on the workload - // on cooperative queues on each device - for (int i = 0; i < numDevices; ++i) { - coopAVs[i].release_locked_hsa_queue(); - launchParamsList[i].stream->criticalData()._av.release_locked_hsa_queue(); - } - - for (int i = 0; i < numDevices; ++i) { - hc::completion_future cooperativeCF; - if (!coopAVs[i].get_is_empty()) { - cooperativeCF = coopAVs[i].create_marker(hc::accelerator_scope); - if (flags & hipCooperativeLaunchMultiDeviceNoPostSync) { - launchParamsList[i].stream->criticalData()._av.create_blocking_marker( - cooperativeCF, hc::accelerator_scope); - launchParamsList[i].stream->criticalData().unlock(); - } else { - for (int j = 0; j < numDevices; ++j) { - launchParamsList[j].stream->criticalData()._av.create_blocking_marker( - cooperativeCF, hc::accelerator_scope); - } - } - } - } - - if ((flags & hipCooperativeLaunchMultiDeviceNoPostSync) == 0) { - for (int i = 0; i < numDevices; ++i) { - launchParamsList[i].stream->criticalData().unlock(); - } - } - - return result; -#else - return hipErrorInvalidConfiguration; -#endif -} - -__attribute__((visibility("default"))) -hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, - int numDevices, - unsigned int flags, - hip_impl::program_state& ps) { - - // Skipping passing in ps, because the logging function does not like it - HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList, - numDevices, flags); - - return ihipLogStatus(ihipLaunchCooperativeKernelMultiDevice(launchParamsList, - numDevices, - flags, ps)); -} - -namespace hip_impl { - hsa_executable_t executable_for(hipModule_t hmod) { - return hmod->executable; - } - - const char* hash_for(hipModule_t hmod) { - return hmod->hash.c_str(); - } - - hsa_agent_t this_agent() { - GET_TLS(); - auto ctx = ihipGetTlsDefaultCtx(); - - if (!ctx) throw runtime_error{"No active HIP context."}; - - auto device = ctx->getDevice(); - - if (!device) throw runtime_error{"No device available for HIP."}; - - ihipDevice_t* currentDevice = ihipGetDevice(device->_deviceId); - - if (!currentDevice) throw runtime_error{"No active device for HIP."}; - - return currentDevice->_hsaAgent; - } - - struct Agent_global { - Agent_global() : name(nullptr), address(nullptr), byte_cnt(0) {} - Agent_global(const char* name, hipDeviceptr_t address, uint32_t byte_cnt) - : name(nullptr), address(address), byte_cnt(byte_cnt) { - if (name) - this->name = strdup(name); - } - - Agent_global& operator=(Agent_global&& t) { - if (this == &t) return *this; - - if (name) free(name); - name = t.name; - address = t.address; - byte_cnt = t.byte_cnt; - - t.name = nullptr; - t.address = nullptr; - t.byte_cnt = 0; - - return *this; - } - - Agent_global(Agent_global&& t) - : name(nullptr), address(nullptr), byte_cnt(0) { - *this = std::move(t); - } - - // not needed, delete them to prevent bugs - Agent_global(const Agent_global&) = delete; - Agent_global& operator=(Agent_global& t) = delete; - - ~Agent_global() { if (name) free(name); } - - char* name; - hipDeviceptr_t address; - uint32_t byte_cnt; - }; - - template - std::pair read_global_description( - ForwardIterator f, ForwardIterator l, const char* name) { - const auto it = std::find_if(f, l, [=](const Agent_global& x) { - return strcmp(x.name, name) == 0; - }); - - return it == l ? - std::make_pair(nullptr, 0u) : std::make_pair(it->address, it->byte_cnt); - } - - std::vector read_agent_globals(hsa_agent_t agent, - hsa_executable_t executable); - class agent_globals_impl { - private: - std::pair< - std::mutex, - std::unordered_map< - std::string, std::vector>> globals_from_module; - - std::unordered_map< - hsa_agent_t, - std::pair< - std::once_flag, - std::vector>> globals_from_process; - - public: - - hipError_t read_agent_global_from_module(hipDeviceptr_t* dptr, size_t* bytes, - hipModule_t hmod, const char* name) { - // the key of the map would the hash of code object associated with the - // hipModule_t instance - std::string key(hash_for(hmod)); - - if (globals_from_module.second.count(key) == 0) { - std::lock_guard lck{globals_from_module.first}; - - if (globals_from_module.second.count(key) == 0) { - globals_from_module.second.emplace( - key, read_agent_globals(this_agent(), executable_for(hmod))); - } - } - - const auto it0 = globals_from_module.second.find(key); - if (it0 == globals_from_module.second.cend()) { - hip_throw( - std::runtime_error{"agent_globals data structure corrupted."}); - } - - std::tie(*dptr, *bytes) = read_global_description(it0->second.cbegin(), - it0->second.cend(), name); - // HACK for SWDEV-173477 - // - // For code objects with global symbols of length 0, ROCR runtime's fix - // may not be working correctly. Therefore the - // result from read_agent_globals() can't be trusted entirely. - // - // As a workaround to tame applications which depend on the existence of - // global symbols with length 0, always return hipSuccess here. - // - // This behavior shall be reverted once ROCR runtime has been fixed to - // address SWDEV-173477 and SWDEV-190701 - - //return *dptr ? hipSuccess : hipErrorNotFound; - return hipSuccess; - } - - hipError_t read_agent_global_from_process(hipDeviceptr_t* dptr, size_t* bytes, - const char* name) { - - auto agent = this_agent(); - - std::call_once(globals_from_process[agent].first, [this](hsa_agent_t aa) { - std::vector tmp0; - for (auto&& executable : hip_impl::get_program_state().impl->get_executables(aa)) { - auto tmp1 = read_agent_globals(aa, executable); - tmp0.insert(tmp0.end(), make_move_iterator(tmp1.begin()), - make_move_iterator(tmp1.end())); - } - globals_from_process[aa].second = move(move(tmp0)); - }, agent); - - const auto it = globals_from_process.find(agent); - - if (it == globals_from_process.cend()) return hipErrorNotInitialized; - - std::tie(*dptr, *bytes) = read_global_description(it->second.second.cbegin(), - it->second.second.cend(), name); - - return *dptr ? hipSuccess : hipErrorNotFound; - } - - }; - - agent_globals::agent_globals() : impl(new agent_globals_impl()) { - if (!impl) - hip_throw( - std::runtime_error{"Error when constructing agent global data structures."}); - } - agent_globals::~agent_globals() { delete impl; } - - hipError_t agent_globals::read_agent_global_from_module(hipDeviceptr_t* dptr, size_t* bytes, - hipModule_t hmod, const char* name) { - return impl->read_agent_global_from_module(dptr, bytes, hmod, name); - } - - hipError_t agent_globals::read_agent_global_from_process(hipDeviceptr_t* dptr, size_t* bytes, - const char* name) { - hipError_t result = impl->read_agent_global_from_process(dptr, bytes, name); - if(result != hipSuccess) { - // For Clang Compiler + Hcc Rt - result = ihipGetGlobalVar(dptr, bytes, name); - } - return result; - } - -} // Namespace hip_impl. - -hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, - hipModule_t hmod, const char* name) { - HIP_INIT_API(hipModuleGetGlobal, dptr, bytes, hmod, name); - if (!dptr || !bytes || !hmod) return hipErrorInvalidValue; - - if (!name) return hipErrorNotInitialized; - - return hip_impl::get_agent_globals().read_agent_global_from_module(dptr, bytes, hmod, name); -} - -namespace { -inline void track(const hip_impl::Agent_global& x, hsa_agent_t agent) { - GET_TLS(); - tprintf(DB_MEM, " add variable '%s' with ptr=%p size=%u to tracker\n", x.name, - x.address, x.byte_cnt); - - int deviceIndex =0; - for ( deviceIndex = 0; deviceIndex < g_deviceCnt; deviceIndex++) { - if(g_allAgents[deviceIndex] == agent) - break; - } - auto device = ihipGetDevice(deviceIndex - 1); - hc::AmPointerInfo ptr_info(nullptr, x.address, x.address, x.byte_cnt, device->_acc, true, - false); - hc::am_memtracker_add(x.address, ptr_info); -#if USE_APP_PTR_FOR_CTX - hc::am_memtracker_update(x.address, device->_deviceId, 0u, ihipGetTlsDefaultCtx()); -#else - hc::am_memtracker_update(x.address, device->_deviceId, 0u); -#endif - -} - -template > -inline hsa_status_t copy_agent_global_variables(hsa_executable_t, hsa_agent_t agent, - hsa_executable_symbol_t x, void* out) { - using namespace hip_impl; - - assert(out); - - hsa_symbol_kind_t t = {}; - hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &t); - - if (t == HSA_SYMBOL_KIND_VARIABLE) { - hip_impl::Agent_global tmp(name(x).c_str(), address(x), size(x)); - static_cast(out)->push_back(std::move(tmp)); - - track(static_cast(out)->back(),agent); - } - - return HSA_STATUS_SUCCESS; -} - -inline hsa_status_t remove_agent_global_variables(hsa_executable_t, hsa_agent_t agent, - hsa_executable_symbol_t x, void* unused) { - hsa_symbol_kind_t t = {}; - hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &t); - - if (t == HSA_SYMBOL_KIND_VARIABLE) { - hc::am_memtracker_remove(hip_impl::address(x)); - } - - return HSA_STATUS_SUCCESS; -} - -hsa_executable_symbol_t find_kernel_by_name(hsa_executable_t executable, const char* kname, - hsa_agent_t* agent = nullptr) { - using namespace hip_impl; - hsa_executable_symbol_t symbol = { 0 }; - hsa_agent_t thisagent = agent ? *agent : this_agent(); - hsa_status_t err = hsa_executable_get_symbol_by_name(executable, kname, &thisagent ,&symbol); - //TODO check err ? - return symbol; -} - - -string read_elf_file_as_string(const void* file) { - // Precondition: file points to an ELF image that was BITWISE loaded - // into process accessible memory, and not one loaded by - // the loader. This is because in the latter case - // alignment may differ, which will break the size - // computation. - // the image is Elf64, and matches endianness i.e. it is - // Little Endian. - if (!file) return {}; - - auto h = static_cast(file); - auto s = static_cast(file); - // This assumes the common case of SHT being the last part of the ELF. - auto sz = h->e_shoff + h->e_shentsize * h->e_shnum; - - return string{s, s + sz}; -} - -string code_object_blob_for_agent(const void* maybe_bundled_code, hsa_agent_t agent) { - using namespace hip_impl; - - if (!maybe_bundled_code) return {}; - - Bundled_code_header tmp{maybe_bundled_code}; - - if (!valid(tmp)) return {}; - - const auto agent_isa = isa(agent); - - const auto it = find_if(bundles(tmp).cbegin(), bundles(tmp).cend(), [=](const Bundled_code& x) { - return agent_isa == triple_to_hsa_isa(x.triple); - ; - }); - - if (it == bundles(tmp).cend()) return {}; - - return string{it->blob.cbegin(), it->blob.cend()}; -} -} // Unnamed namespace. - -namespace hip_impl { - vector read_agent_globals(hsa_agent_t agent, - hsa_executable_t executable) { - vector r; - - hsa_executable_iterate_agent_symbols( - executable, agent, copy_agent_global_variables, &r); - - return r; - } - void remove_agent_globals_from_tracker(hsa_agent_t agent, hsa_executable_t executable) { - hsa_executable_iterate_agent_symbols(executable, agent, remove_agent_global_variables, NULL); - } -} // Namespace hip_impl. - -hipError_t hipModuleUnload(hipModule_t hmod) { - HIP_INIT_API(hipModuleUnload, hmod); - - // TODO - improve this synchronization so it is thread-safe. - // Currently we want for all inflight activity to complete, but don't prevent another - // thread from launching new kernels before we finish this operation. - ihipSynchronize(tls); - - // deleting ihipModule_t does not remove agent globals from hc_am memtracker - hip_impl::remove_agent_globals_from_tracker(hip_impl::this_agent(), hip_impl::executable_for(hmod)); - - delete hmod; // The ihipModule_t dtor will clean everything up. - hmod = nullptr; - - return ihipLogStatus(hipSuccess); -} - -hipError_t ihipModuleGetFunction(TlsData *tls, hipFunction_t* func, hipModule_t hmod, const char* name, - hsa_agent_t *agent = nullptr) { - using namespace hip_impl; - - if (!func || !name) return hipErrorInvalidValue; - - auto ctx = ihipGetTlsDefaultCtx(); - - if (!ctx) return hipErrorInvalidContext; - - *func = new ihipModuleSymbol_t; - - if (!*func) return hipErrorInvalidValue; - - std::string name_str(name); - std::string namekd_str(name_str + ".kd"); - bool kernel_by_namekd = false; - - auto kernel = find_kernel_by_name(hmod->executable, name_str.c_str(), agent); - - if (kernel.handle == 0u) { - kernel_by_namekd = true; //Find kernel by namekd_str - kernel = find_kernel_by_name(hmod->executable, namekd_str.c_str(), agent); - } - - if (kernel.handle == 0u) return hipErrorNotFound; - - //For hipModuleLoad(), hmod->kernargs must contain an args with key - //name_str or namekd_str. - //For hipLaunchKernelGGL(), hmod->kernargs is empty, thus we need - //insert hmod->kernargs[name_str] - auto it = hmod->kernargs.find(name_str); //Look up args from the original name - if (it == hmod->kernargs.end()) { - it = hmod->kernargs.find(namekd_str); //Look up args from .kd name - } - - // TODO: refactor the whole ihipThisThat, which is a mess and yields the - // below, due to hipFunction_t being a pointer to ihipModuleSymbol_t. - - func[0][0] = *static_cast( - Kernel_descriptor{kernel_object(kernel), - kernel_by_namekd ? namekd_str : name_str, - it != hmod->kernargs.end() ? it->second : hmod->kernargs[name_str]}); - - return hipSuccess; -} - -// Get kernel for the current hsa agent. -hipError_t hipModuleGetFunction(hipFunction_t* hfunc, hipModule_t hmod, const char* name) { - HIP_INIT_API(hipModuleGetFunction, hfunc, hmod, name); - return ihipLogStatus(ihipModuleGetFunction(tls, hfunc, hmod, name)); -} - -// Get kernel for the given hsa agent. Internal use only. -hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod, - const char* name, hsa_agent_t *agent) { - HIP_INIT_API(hipModuleGetFunctionEx, hfunc, hmod, name, agent); - return ihipLogStatus(ihipModuleGetFunction(tls, hfunc, hmod, name, agent)); -} - -namespace { -const amd_kernel_code_v3_t *header_v3(const ihipModuleSymbol_t& kd) { - return reinterpret_cast(kd._header); -} - -hipFuncAttributes make_function_attributes(TlsData *tls, ihipModuleSymbol_t& kd) { - hipFuncAttributes r{}; - - hipDeviceProp_t prop{}; - hipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId); - // TODO: at the moment there is no way to query the count of registers - // available per CU, therefore we hardcode it to 64 KiRegisters. - prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024; - - if (kd._is_code_object_v3) { - r.binaryVersion = 0; // FIXME: should it be the ISA version or code - // object format version? - r.localSizeBytes = header_v3(kd)->private_segment_fixed_size; - r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size; - } else { - r.localSizeBytes = kd._header->workitem_private_segment_byte_size; - r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size; - r.binaryVersion = - kd._header->amd_machine_version_major * 10 + - kd._header->amd_machine_version_minor; - } - r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes; - - size_t usedVGPRS = 0; - size_t usedSGPRS = 0; - size_t usedLDS = 0; - getGprsLdsUsage(&kd, &usedVGPRS, &usedSGPRS, &usedLDS); - - r.numRegs = usedVGPRS; - - size_t wavefrontSize = prop.warpSize; - size_t maxWavefrontsPerBlock = prop.maxThreadsPerBlock / wavefrontSize; - size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32); - const size_t numSIMD = 4; - const size_t maxWavesPerSimd = maxWavefrontsPerCU / numSIMD; - size_t maxWaves = 0; - for (int i = 0; i < maxWavefrontsPerBlock; i++) { - size_t wavefronts = i + 1; - - if (usedVGPRS > 0) { - size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / numSIMD); - size_t vgprs_alu_occupancy = numSIMD * std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS); - - // Calculate blocks occupancy per CU based on VGPR usage - if (vgprs_alu_occupancy < wavefronts) - break; - } - - if (usedSGPRS > 0) { - const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800; - size_t sgprs_alu_occupancy = numSIMD * ((usedSGPRS == 0) ? maxWavesPerSimd - : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS)); - - // Calculate blocks occupancy per CU based on SGPR usage - if (sgprs_alu_occupancy < wavefronts) - break; - } - maxWaves = wavefronts; - } - - r.maxThreadsPerBlock = maxWaves * wavefrontSize; - r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0. - - return r; -} -} // Unnamed namespace. - -hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func) -{ - HIP_INIT_API(hipFuncGetAttributes, attr, func); - using namespace hip_impl; - - if (!attr) return ihipLogStatus(hipErrorInvalidValue); - if (!func) return ihipLogStatus(hipErrorInvalidDeviceFunction); - - auto agent = this_agent(); - auto kd = get_program_state().kernel_descriptor(reinterpret_cast(func), agent); - - if (!kd->_header) throw runtime_error{"Ill-formed Kernel_descriptor."}; - - *attr = make_function_attributes(tls, *kd); - - return ihipLogStatus(hipSuccess); -} - -hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int value) { - HIP_INIT_API(hipFuncSetCacheConfig, func, attr, value); - - // Nop, AMD does not support setting shared memory size for function. - - return ihipLogStatus(hipSuccess); -} - -hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config) { - HIP_INIT_API(hipFuncSetSharedMemConfig, func, config); - - // Nop, AMD does not support setting shared memory size for function. - - return ihipLogStatus(hipSuccess); -} - -hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunction_t hfunc) -{ - HIP_INIT_API(hipFuncGetAttribute, value, attrib, hfunc); - using namespace hip_impl; - - hipError_t retVal = hipSuccess; - if (!value) return ihipLogStatus(hipErrorInvalidValue); - hipFuncAttributes attr{}; - attr = make_function_attributes(tls, *hfunc); - switch(attrib) { - case HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: - *value = (int) attr.sharedSizeBytes; - break; - case HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: - *value = attr.maxThreadsPerBlock; - break; - case HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: - *value = (int) attr.constSizeBytes; - break; - case HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: - *value = (int) attr.localSizeBytes; - break; - case HIP_FUNC_ATTRIBUTE_NUM_REGS: - *value = attr.numRegs; - break; - case HIP_FUNC_ATTRIBUTE_PTX_VERSION: - *value = attr.ptxVersion; - break; - case HIP_FUNC_ATTRIBUTE_BINARY_VERSION: - *value = attr.binaryVersion; - break; - case HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA: - *value = attr.cacheModeCA; - break; - case HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: - *value = attr.maxDynamicSharedSizeBytes; - break; - case HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: - *value = attr.preferredShmemCarveout; - break; - default: - retVal = hipErrorInvalidValue; - } - return ihipLogStatus(retVal); -} - -hipError_t ihipModuleLoadData(TlsData *tls, hipModule_t* module, const void* image) { - using namespace hip_impl; - - if (!module) return hipErrorInvalidValue; - - *module = new ihipModule_t; - - auto ctx = ihipGetTlsDefaultCtx(); - if (!ctx) return hipErrorInvalidContext; - - // try extracting code object from image as fatbin. - char name[64] = {}; - hsa_agent_get_info(this_agent(), HSA_AGENT_INFO_NAME, name); - if (auto *code_obj = __hipExtractCodeObjectFromFatBinary(image, name)) - image = code_obj; - - hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, nullptr, - &(*module)->executable); - - auto tmp = code_object_blob_for_agent(image, this_agent()); - - auto content = tmp.empty() ? read_elf_file_as_string(image) : tmp; - - (*module)->executable = get_program_state().load_executable( - content.data(), content.size(), (*module)->executable, - this_agent()); - - program_state_impl::read_kernarg_metadata(content, (*module)->kernargs); - - // compute the hash of the code object - (*module)->hash = checksum(content.length(), content.data()); - - return (*module)->executable.handle ? hipSuccess : hipErrorUnknown; -} - -hipError_t hipModuleLoadData(hipModule_t* module, const void* image) { - HIP_INIT_API(hipModuleLoadData, module, image); - return ihipLogStatus(ihipModuleLoadData(tls,module,image)); -} - -hipError_t hipModuleLoad(hipModule_t* module, const char* fname) { - HIP_INIT_API(hipModuleLoad, module, fname); - - if (!fname) return ihipLogStatus(hipErrorInvalidValue); - - ifstream file{fname}; - - if (!file.is_open()) return ihipLogStatus(hipErrorFileNotFound); - - vector tmp{istreambuf_iterator{file}, istreambuf_iterator{}}; - - return ihipLogStatus(ihipModuleLoadData(tls, module, tmp.data())); -} - -hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image, unsigned int numOptions, - hipJitOption* options, void** optionValues) { - HIP_INIT_API(hipModuleLoadDataEx, module, image, numOptions, options, optionValues); - return ihipLogStatus(ihipModuleLoadData(tls, module, image)); -} - -hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const char* name) { - using namespace hip_impl; - - HIP_INIT_API(hipModuleGetTexRef, texRef, hmod, name); - - hipError_t ret = hipErrorNotFound; - if (!texRef) return ihipLogStatus(hipErrorInvalidValue); - - if (!hmod || !name) return ihipLogStatus(hipErrorNotInitialized); - - auto addr = get_program_state().global_addr_by_name(name); - if (addr == nullptr) return ihipLogStatus(hipErrorInvalidValue); - - *texRef = reinterpret_cast(addr); - return ihipLogStatus(hipSuccess); -} - -hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, int* gridSize, int* blockSize, - hipFunction_t f, size_t dynSharedMemPerBlk, - int blockSizeLimit) -{ - using namespace hip_impl; - - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx == nullptr) { - return hipErrorInvalidDevice; - } - - hipDeviceProp_t prop{}; - ihipGetDeviceProperties(&prop, ihipGetTlsDefaultCtx()->getDevice()->_deviceId); - - prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024; - - size_t usedVGPRS = 0; - size_t usedSGPRS = 0; - size_t usedLDS = 0; - getGprsLdsUsage(f, &usedVGPRS, &usedSGPRS, &usedLDS); - - // try different workgroup sizes to find the maximum potential occupancy - // based on the usage of VGPRs and LDS - size_t wavefrontSize = prop.warpSize; - size_t maxWavefrontsPerBlock = prop.maxThreadsPerBlock / wavefrontSize; - - // Due to SPI and private memory limitations, the max of wavefronts per CU in 32 - size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32); - - const size_t numSIMD = 4; - size_t maxActivWaves = 0; - size_t maxWavefronts = 0; - for (int i = 0; i < maxWavefrontsPerBlock; i++) { - size_t wavefrontsPerWG = i + 1; - - // workgroup per CU is 40 for WG size of 1 wavefront; otherwise it is 16 - size_t maxWorkgroupPerCU = (wavefrontsPerWG == 1) ? 40 : 16; - size_t maxWavesWGLimited = min(wavefrontsPerWG * maxWorkgroupPerCU, maxWavefrontsPerCU); - - // Compute VGPR limited wavefronts per block - size_t wavefrontsVGPRS; - if (usedVGPRS == 0) { - wavefrontsVGPRS = maxWavesWGLimited; - } - else { - // find how many VGPRs are available for each SIMD - size_t numVGPRsPerSIMD = (prop.regsPerBlock / wavefrontSize / numSIMD); - wavefrontsVGPRS = (numVGPRsPerSIMD / usedVGPRS) * numSIMD; - } - - size_t maxWavesVGPRSLimited = 0; - if (wavefrontsVGPRS > maxWavesWGLimited) { - maxWavesVGPRSLimited = maxWavesWGLimited; - } - else { - maxWavesVGPRSLimited = (wavefrontsVGPRS / wavefrontsPerWG) * wavefrontsPerWG; - } - - // Compute SGPR limited wavefronts per block - size_t wavefrontsSGPRS; - if (usedSGPRS == 0) { - wavefrontsSGPRS = maxWavesWGLimited; - } - else { - const size_t numSGPRsPerSIMD = (prop.gcnArch < 800) ? 512 : 800; - wavefrontsSGPRS = (numSGPRsPerSIMD / usedSGPRS) * numSIMD; - } - - size_t maxWavesSGPRSLimited = 0; - if (wavefrontsSGPRS > maxWavesWGLimited) { - maxWavesSGPRSLimited = maxWavesWGLimited; - } - else { - maxWavesSGPRSLimited = (wavefrontsSGPRS / wavefrontsPerWG) * wavefrontsPerWG; - } - - // Compute LDS limited wavefronts per block - size_t wavefrontsLDS; - if (usedLDS == 0) { - wavefrontsLDS = maxWorkgroupPerCU * wavefrontsPerWG; - } - else { - size_t availableSharedMemPerCU = prop.maxSharedMemoryPerMultiProcessor; - size_t workgroupPerCU = availableSharedMemPerCU / (usedLDS + dynSharedMemPerBlk); - wavefrontsLDS = min(workgroupPerCU, maxWorkgroupPerCU) * wavefrontsPerWG; - } - - size_t maxWavesLDSLimited = min(wavefrontsLDS, maxWavefrontsPerCU); - - size_t activeWavefronts = 0; - size_t tmp_min = (size_t)min(maxWavesLDSLimited, maxWavesWGLimited); - tmp_min = min(maxWavesSGPRSLimited, tmp_min); - activeWavefronts = min(maxWavesVGPRSLimited, tmp_min); - - if (maxActivWaves <= activeWavefronts) { - maxActivWaves = activeWavefronts; - maxWavefronts = wavefrontsPerWG; - } - } - - // determine the grid and block sizes for maximum potential occupancy - size_t maxThreadsCnt = prop.maxThreadsPerMultiProcessor*prop.multiProcessorCount; - if (blockSizeLimit > 0) { - maxThreadsCnt = min(maxThreadsCnt, blockSizeLimit); - } - - *blockSize = maxWavefronts * wavefrontSize; - *gridSize = min((maxThreadsCnt + *blockSize - 1) / *blockSize, prop.multiProcessorCount); - - return hipSuccess; -} - -hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, - hipFunction_t f, size_t dynSharedMemPerBlk, - int blockSizeLimit) -{ - HIP_INIT_API(hipModuleOccupancyMaxPotentialBlockSize, gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit); - - return ihipLogStatus(ihipOccupancyMaxPotentialBlockSize(tls, - gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit)); -} - -hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize, - hipFunction_t f, size_t dynSharedMemPerBlk, - int blockSizeLimit, unsigned int flags) -{ - HIP_INIT_API(hipModuleOccupancyMaxPotentialBlockSizeWithFlags, gridSize, blockSize, f, dynSharedMemPerBlk, - blockSizeLimit, flags); - if(flags != hipOccupancyDefault) return ihipLogStatus(hipErrorNotSupported); - return ihipLogStatus(ihipOccupancyMaxPotentialBlockSize(tls, - gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit)); -} - -hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( - int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk) -{ - HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk); - auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f), - hip_impl::target_agent(0)); - return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( - tls, numBlocks, F, blockSize, dynSharedMemPerBlk)); -} - -hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor( - int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk) -{ - HIP_INIT_API(hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk); - - return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( - tls, numBlocks, f, blockSize, dynSharedMemPerBlk)); -} - -hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk, - unsigned int flags) -{ - HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags); - if(flags != hipOccupancyDefault) return ihipLogStatus(hipErrorNotSupported); - auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f), - hip_impl::target_agent(0)); - return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( - tls, numBlocks, F, blockSize, dynSharedMemPerBlk)); -} - -hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk, - unsigned int flags) -{ - HIP_INIT_API(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags); - if(flags != hipOccupancyDefault) return ihipLogStatus(hipErrorNotSupported); - return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( - tls, numBlocks, f, blockSize, dynSharedMemPerBlk)); -} - -hipError_t hipLaunchKernel( - const void* func_addr, dim3 numBlocks, dim3 dimBlocks, void** args, - size_t sharedMemBytes, hipStream_t stream) -{ - HIP_INIT_API(hipLaunchKernel,func_addr,numBlocks,dimBlocks,args,sharedMemBytes,stream); - - hipFunction_t kd = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)func_addr, - hip_impl::target_agent(stream)); - - return hipModuleLaunchKernel(kd, numBlocks.x, numBlocks.y, numBlocks.z, - dimBlocks.x, dimBlocks.y, dimBlocks.z, sharedMemBytes, - stream, args, nullptr); -} - -hipError_t hipExtLaunchKernel(const void* function, dim3 numBlocks, dim3 dimBlocks, void** args, - size_t sharedMemBytes, hipStream_t stream, hipEvent_t startEvent, - hipEvent_t stopEvent, int flags) { - HIP_INIT_API(hipExtLaunchKernel,function,numBlocks,dimBlocks,args,sharedMemBytes,stream,startEvent,stopEvent,flags); - - hipFunction_t kd = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)function, - hip_impl::target_agent(stream)); - - uint32_t globalWorkSizeX = numBlocks.x * dimBlocks.x; - uint32_t globalWorkSizeY = numBlocks.y * dimBlocks.y; - uint32_t globalWorkSizeZ = numBlocks.z * dimBlocks.z; - if (globalWorkSizeX > UINT32_MAX || globalWorkSizeY > UINT32_MAX || - globalWorkSizeZ > UINT32_MAX) { - return hipErrorInvalidConfiguration; - } - - return ihipLogStatus(ihipModuleLaunchKernel( - tls, kd, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, dimBlocks.x, dimBlocks.y, - dimBlocks.z, sharedMemBytes, stream, args, nullptr, startEvent, stopEvent, flags)); -} diff --git a/src/hip_peer.cpp b/src/hip_peer.cpp deleted file mode 100644 index 8fd66a52bb..0000000000 --- a/src/hip_peer.cpp +++ /dev/null @@ -1,231 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include - -#include "hip/hip_runtime.h" -#include "hip_hcc_internal.h" -#include "trace_helper.h" - - -// Peer access functions. -// There are two flavors: -// - one where contexts are specified with hipCtx_t type. -// - one where contexts are specified with integer deviceIds, that are mapped to the primary -// context for that device. -// The implementation contains a set of internal ihip* functions which operate on contexts. Then -// the public APIs are thin wrappers which call into this internal implementations. -// TODO - actually not yet - currently the integer deviceId flavors just call the context APIs. need -// to fix. - - -hipError_t ihipDeviceCanAccessPeer(int* canAccessPeer, hipCtx_t thisCtx, hipCtx_t peerCtx) { - hipError_t err = hipSuccess; - - if(canAccessPeer == NULL) { - err = hipErrorInvalidValue; - } - else if ((thisCtx != NULL) && (peerCtx != NULL)) { - if (thisCtx == peerCtx) { - *canAccessPeer = 0; - tprintf(DB_MEM, "Can't be peer to self. (this=%s, peer=%s)\n", - thisCtx->toString().c_str(), peerCtx->toString().c_str()); - } else if (HIP_FORCE_P2P_HOST & 0x2) { - *canAccessPeer = false; - tprintf(DB_MEM, - "HIP_FORCE_P2P_HOST denies peer access this=%s peer=%s canAccessPeer=%d\n", - thisCtx->toString().c_str(), peerCtx->toString().c_str(), *canAccessPeer); - } else { - *canAccessPeer = peerCtx->getDevice()->_acc.get_is_peer(thisCtx->getDevice()->_acc); - tprintf(DB_MEM, "deviceCanAccessPeer this=%s peer=%s canAccessPeer=%d\n", - thisCtx->toString().c_str(), peerCtx->toString().c_str(), *canAccessPeer); - } - - } else { - *canAccessPeer = 0; - err = hipErrorInvalidDevice; - } - - - return err; -} - - -/** - * HCC returns 0 in *canAccessPeer ; Need to update this function when RT supports P2P - */ -//--- -hipError_t hipDeviceCanAccessPeer(int* canAccessPeer, hipCtx_t thisCtx, hipCtx_t peerCtx) { - HIP_INIT_API(NONE, canAccessPeer, thisCtx, peerCtx); - - return ihipLogStatus(ihipDeviceCanAccessPeer(canAccessPeer, thisCtx, peerCtx)); -} - - -//--- -// Disable visibility of this device into memory allocated on peer device. -// Remove this device from peer device peerlist. -hipError_t ihipDisablePeerAccess(TlsData* tls, hipCtx_t peerCtx) { - hipError_t err = hipSuccess; - - auto thisCtx = ihipGetTlsDefaultCtx(); - if ((thisCtx != NULL) && (peerCtx != NULL)) { - bool canAccessPeer = peerCtx->getDevice()->_acc.get_is_peer(thisCtx->getDevice()->_acc); - - if (!canAccessPeer) { - err = hipErrorInvalidDevice; // P2P not allowed between these devices. - } else if (thisCtx == peerCtx) { - err = hipErrorInvalidDevice; // Can't disable peer access to self. - } else { - LockedAccessor_CtxCrit_t peerCrit(peerCtx->criticalData()); - bool changed = peerCrit->removePeerWatcher(peerCtx, thisCtx); - if (changed) { - tprintf(DB_MEM, "device %s disable access to memory allocated on peer:%s\n", - thisCtx->toString().c_str(), peerCtx->toString().c_str()); - // Update the peers for all memory already saved in the tracker: - am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(), - peerCrit->peerAgents()); - } else { - err = hipErrorPeerAccessNotEnabled; // never enabled P2P access. - } - } - } else { - err = hipErrorInvalidDevice; - } - - return err; -}; - - -//--- -// Allow the current device to see all memory allocated on peerCtx. -// This should add this device to the peer-device peer list. -hipError_t ihipEnablePeerAccess(TlsData* tls, hipCtx_t peerCtx, unsigned int flags) { - hipError_t err = hipSuccess; - if (flags != 0) { - err = hipErrorInvalidValue; - } else { - auto thisCtx = ihipGetTlsDefaultCtx(); - if (thisCtx == peerCtx) { - err = hipErrorInvalidDevice; // Can't enable peer access to self. - } else if ((thisCtx != NULL) && (peerCtx != NULL)) { - - int canAccess = 0; - if ((hipSuccess != ihipDeviceCanAccessPeer(&canAccess,thisCtx,peerCtx)) || (canAccess == 0)){ - tprintf(DB_MEM, "device=%s can't access peer=%s\n",thisCtx->toString().c_str(), peerCtx->toString().c_str()); - err = hipErrorInvalidDevice; - } else { - LockedAccessor_CtxCrit_t peerCrit(peerCtx->criticalData()); - // Add thisCtx to peerCtx's access list so that new allocations on peer will be made - // visible to this device: - bool isNewPeer = peerCrit->addPeerWatcher(peerCtx, thisCtx); - if (isNewPeer) { - tprintf(DB_MEM, "device=%s can now see all memory allocated on peer=%s\n", - thisCtx->toString().c_str(), peerCtx->toString().c_str()); - am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(), - peerCrit->peerAgents()); - } else { - err = hipErrorPeerAccessAlreadyEnabled; - } - } - } else { - err = hipErrorInvalidDevice; - } - } - - return err; -} - - -//--- -hipError_t hipMemcpyPeer(void* dst, hipCtx_t dstCtx, const void* src, hipCtx_t srcCtx, - size_t sizeBytes) { - HIP_INIT_API(NONE, dst, dstCtx, src, srcCtx, sizeBytes); - - // TODO - move to ihip memory copy implementaion. - // HCC has a unified memory architecture so device specifiers are not required. - return ihipLogStatus(hipMemcpy(dst, src, sizeBytes, hipMemcpyDefault)); -}; - - -//--- -hipError_t hipMemcpyPeerAsync(void* dst, hipCtx_t dstDevice, const void* src, hipCtx_t srcDevice, - size_t sizeBytes, hipStream_t stream) { - HIP_INIT_API(NONE, dst, dstDevice, src, srcDevice, sizeBytes, stream); - - // TODO - move to ihip memory copy implementaion. - // HCC has a unified memory architecture so device specifiers are not required. - return ihipLogStatus(hip_internal::memcpyAsync(dst, src, sizeBytes, hipMemcpyDefault, stream)); -}; - - -//============================================================================= -// These are the flavors that accept integer deviceIDs. -// Implementations map these to primary contexts and call the internal functions above. -//============================================================================= - -hipError_t hipDeviceCanAccessPeer(int* canAccessPeer, int deviceId, int peerDeviceId) { - HIP_INIT_API(hipDeviceCanAccessPeer, canAccessPeer, deviceId, peerDeviceId); - return ihipLogStatus(ihipDeviceCanAccessPeer(canAccessPeer, ihipGetPrimaryCtx(deviceId), - ihipGetPrimaryCtx(peerDeviceId))); -} - - -hipError_t hipDeviceDisablePeerAccess(int peerDeviceId) { - HIP_INIT_API(hipDeviceDisablePeerAccess, peerDeviceId); - - return ihipLogStatus(ihipDisablePeerAccess(tls, ihipGetPrimaryCtx(peerDeviceId))); -} - - -hipError_t hipDeviceEnablePeerAccess(int peerDeviceId, unsigned int flags) { - HIP_INIT_API(hipDeviceEnablePeerAccess, peerDeviceId, flags); - - return ihipLogStatus(ihipEnablePeerAccess(tls, ihipGetPrimaryCtx(peerDeviceId), flags)); -} - - -hipError_t hipMemcpyPeer(void* dst, int dstDevice, const void* src, int srcDevice, - size_t sizeBytes) { - HIP_INIT_API(hipMemcpyPeer, dst, dstDevice, src, srcDevice, sizeBytes); - return ihipLogStatus(hipMemcpyPeer(dst, ihipGetPrimaryCtx(dstDevice), src, - ihipGetPrimaryCtx(srcDevice), sizeBytes)); -} - - -hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int srcDevice, - size_t sizeBytes, hipStream_t stream) { - HIP_INIT_API(hipMemcpyPeerAsync, dst, dstDevice, src, srcDevice, sizeBytes, stream); - return ihipLogStatus(hip_internal::memcpyAsync(dst, src, sizeBytes, hipMemcpyDefault, stream)); -} - -hipError_t hipCtxEnablePeerAccess(hipCtx_t peerCtx, unsigned int flags) { - HIP_INIT_API(hipCtxEnablePeerAccess, peerCtx, flags); - - return ihipLogStatus(ihipEnablePeerAccess(tls, peerCtx, flags)); -} - -hipError_t hipCtxDisablePeerAccess(hipCtx_t peerCtx) { - HIP_INIT_API(hipCtxDisablePeerAccess, peerCtx); - - return ihipLogStatus(ihipDisablePeerAccess(tls, peerCtx)); -} diff --git a/src/hip_prof_api.h b/src/hip_prof_api.h deleted file mode 100644 index 8a69746f93..0000000000 --- a/src/hip_prof_api.h +++ /dev/null @@ -1,200 +0,0 @@ -// automatically generated sources -#ifndef _HIP_PROF_API_H -#define _HIP_PROF_API_H - -#include -#include -#include - -#include "hip/hcc_detail/hip_prof_str.h" - -template -class api_callbacks_table_templ { - public: - typedef std::recursive_mutex mutex_t; - - typedef Record record_t; - typedef Fun fun_t; - typedef Act act_t; - - // HIP API callbacks table - struct hip_cb_table_entry_t { - volatile std::atomic sync; - volatile std::atomic sem; - act_t act; - void* a_arg; - fun_t fun; - void* arg; - }; - - struct hip_cb_table_t { - hip_cb_table_entry_t arr[HIP_API_ID_NUMBER]; - }; - - api_callbacks_table_templ() { - memset(&callbacks_table_, 0, sizeof(callbacks_table_)); - } - - bool set_activity(uint32_t id, act_t fun, void* arg) { - std::lock_guard lock(mutex_); - bool ret = true; - if (id < HIP_API_ID_NUMBER) { - cb_sync(id); - callbacks_table_.arr[id].act = fun; - callbacks_table_.arr[id].a_arg = arg; - cb_release(id); - } else { - ret = false; - } - return ret; - } - - bool set_callback(uint32_t id, fun_t fun, void* arg) { - std::lock_guard lock(mutex_); - bool ret = true; - if (id < HIP_API_ID_NUMBER) { - cb_sync(id); - callbacks_table_.arr[id].fun = fun; - callbacks_table_.arr[id].arg = arg; - cb_release(id); - } else { - ret = false; - } - return ret; - } - - inline hip_cb_table_entry_t& entry(const uint32_t& id) { - return callbacks_table_.arr[id]; - } - - inline void sem_sync(const uint32_t& id) { - sem_increment(id); - if (entry(id).sync.load() == true) sync_wait(id); - } - - inline void sem_release(const uint32_t& id) { - sem_decrement(id); - } - - private: - inline void cb_sync(const uint32_t& id) { - entry(id).sync.store(true); - while (entry(id).sem.load() != 0) {} - } - - inline void cb_release(const uint32_t& id) { - entry(id).sync.store(false); - } - - inline void sem_increment(const uint32_t& id) { - const uint32_t prev = entry(id).sem.fetch_add(1); - if (prev == UINT32_MAX) { - std::cerr << "sem overflow id = " << id << std::endl << std::flush; - abort(); - } - } - - inline void sem_decrement(const uint32_t& id) { - const uint32_t prev = entry(id).sem.fetch_sub(1); - if (prev == 0) { - std::cerr << "sem corrupted id = " << id << std::endl << std::flush; - abort(); - } - } - - void sync_wait(const uint32_t& id) { - sem_decrement(id); - while (entry(id).sync.load() == true) {} - sem_increment(id); - } - - mutex_t mutex_; - hip_cb_table_t callbacks_table_; -}; - - -#if USE_PROF_API -#include - -static const uint32_t HIP_DOMAIN_ID = ACTIVITY_DOMAIN_HIP_API; -typedef activity_record_t hip_api_record_t; -typedef activity_rtapi_callback_t hip_api_callback_t; -typedef activity_sync_callback_t hip_act_callback_t; - -// HIP API callbacks spawner object macro -#define HIP_CB_SPAWNER_OBJECT(CB_ID) \ - hip_api_data_t api_data{}; \ - INIT_CB_ARGS_DATA(CB_ID, api_data); \ - api_callbacks_spawner_t __api_tracer(HIP_API_ID_##CB_ID, api_data); - -typedef api_callbacks_table_templ api_callbacks_table_t; -extern api_callbacks_table_t callbacks_table; - -template -class api_callbacks_spawner_t { - public: - api_callbacks_spawner_t(const hip_api_id_t& cid, hip_api_data_t& api_data) : - api_data_(api_data), - record_({}) - { - if (cid_ >= HIP_API_ID_NUMBER) { - fprintf(stderr, "HIP %s bad id %d\n", __FUNCTION__, cid_); - abort(); - } - callbacks_table.sem_sync(cid_); - - act = entry(cid_).act; - a_arg = entry(cid_).a_arg; - fun = entry(cid_).fun; - arg = entry(cid_).arg; - - api_data_.phase = 0; - if (act != NULL) act(cid_, &record_, &api_data_, a_arg); - if (fun != NULL) fun(HIP_DOMAIN_ID, cid_, &api_data_, arg); - } - - ~api_callbacks_spawner_t() { - api_data_.phase = 1; - if (act != NULL) act(cid_, &record_, &api_data_, a_arg); - if (fun != NULL) fun(HIP_DOMAIN_ID, cid_, &api_data_, arg); - - callbacks_table.sem_release(cid_); - } - - private: - inline api_callbacks_table_t::hip_cb_table_entry_t& entry(const uint32_t& id) { - return callbacks_table.entry(id); - } - - hip_api_data_t& api_data_; - hip_api_record_t record_; - - hip_act_callback_t act; - void* a_arg; - hip_api_callback_t fun; - void* arg; -}; - -template <> -class api_callbacks_spawner_t { - public: - api_callbacks_spawner_t(const hip_api_id_t& cid, hip_api_data_t& api_data) {} -}; - -#else - -#define HIP_CB_SPAWNER_OBJECT(x) do {} while(0) - -class api_callbacks_table_t { - public: - typedef void* act_t; - typedef void* fun_t; - bool set_activity(uint32_t id, act_t fun, void* arg) { return false; } - bool set_callback(uint32_t id, fun_t fun, void* arg) { return false; } -}; - -#endif - -#endif // _HIP_PROF_API_H diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp deleted file mode 100644 index 5b56b71cd8..0000000000 --- a/src/hip_stream.cpp +++ /dev/null @@ -1,296 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include -#include -#include "hip/hip_runtime.h" -#include "hip_hcc_internal.h" -#include "trace_helper.h" - - -//------------------------------------------------------------------------------------------------- -//------------------------------------------------------------------------------------------------- -// Stream -// -#if defined(__HCC__) && (__hcc_major__ < 3) && (__hcc_minor__ < 3) -enum queue_priority -{ - priority_high = 0, - priority_normal = 0, - priority_low = 0 -}; -#else -enum queue_priority -{ - priority_high = Kalmar::priority_high, - priority_normal = Kalmar::priority_normal, - priority_low = Kalmar::priority_low -}; -#endif - -//--- -hipError_t ihipStreamCreate(TlsData *tls, hipStream_t* stream, unsigned int flags, int priority) { - ihipCtx_t* ctx = ihipGetTlsDefaultCtx(); - - hipError_t e = hipSuccess; - - if (ctx) { - if (HIP_FORCE_NULL_STREAM) { - *stream = 0; - } else if( NULL == stream ){ - e = hipErrorInvalidValue; - } else { - hc::accelerator acc = ctx->getWriteableDevice()->_acc; - - // TODO - se try-catch loop to detect memory exception? - // - // Note this is an execute_any_order queue, - // CUDA stream behavior is that all kernels submitted will automatically - // wait for prev to complete, this behaviour will be mainatined by - // hipModuleLaunchKernel. execute_any_order will help - // hipExtModuleLaunchKernel , which uses a special flag - - { - // Obtain mutex access to the device critical data, release by destructor - LockedAccessor_CtxCrit_t ctxCrit(ctx->criticalData()); - -#if defined(__HCC__) && (__hcc_major__ < 3) && (__hcc_minor__ < 3) - auto istream = new ihipStream_t(ctx, acc.create_view(), flags); -#else - auto istream = new ihipStream_t(ctx, acc.create_view(Kalmar::execute_any_order, Kalmar::queuing_mode_automatic, (Kalmar::queue_priority)priority), flags); -#endif - - ctxCrit->addStream(istream); - *stream = istream; - } - tprintf(DB_SYNC, "hipStreamCreate, %s\n", ToString(*stream).c_str()); - } - - } else { - e = hipErrorInvalidDevice; - } - - return e; -} - - -//--- -hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags) { - HIP_INIT_API(hipStreamCreateWithFlags, stream, flags); - if(flags == hipStreamDefault || flags == hipStreamNonBlocking) - return ihipLogStatus(ihipStreamCreate(tls, stream, flags, priority_normal)); - else - return ihipLogStatus(hipErrorInvalidValue); -} - -//--- -hipError_t hipStreamCreate(hipStream_t* stream) { - HIP_INIT_API(hipStreamCreate, stream); - - return ihipLogStatus(ihipStreamCreate(tls, stream, hipStreamDefault, priority_normal)); -} - -//--- -hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority) { - HIP_INIT_API(hipStreamCreateWithPriority, stream, flags, priority); - - // clamp priority to range [priority_high:priority_low] - priority = (priority < priority_high ? priority_high : (priority > priority_low ? priority_low : priority)); - return ihipLogStatus(ihipStreamCreate(tls, stream, flags, priority)); -} - -//--- -hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority) { - HIP_INIT_API(hipDeviceGetStreamPriorityRange, leastPriority, greatestPriority); - - if (leastPriority != NULL) *leastPriority = priority_low; - if (greatestPriority != NULL) *greatestPriority = priority_high; - return ihipLogStatus(hipSuccess); -} - -hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags) { - HIP_INIT_SPECIAL_API(hipStreamWaitEvent, TRACE_SYNC, stream, event, flags); - - if (!event) return ihipLogStatus(hipErrorInvalidHandle); - - auto ecd = event->locked_copyCrit(); - if (event->_flags & hipEventInterprocess) { - // this is an IPC event - if (ecd._ipc_shmem->read_index >= 0) { - // we have at least one recorded event, so proceed - stream->locked_streamWaitEvent(ecd); - } - } - else { - if ((ecd._state != hipEventStatusUnitialized) && (ecd._state != hipEventStatusCreated)) { - if (HIP_SYNC_STREAM_WAIT || (HIP_SYNC_NULL_STREAM && (stream == 0))) { - ecd.marker().wait((event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked - : hc::hcWaitModeActive); - } else { - stream = ihipSyncAndResolveStream(stream); - // This will use create_blocking_marker to wait on the specified queue. - stream->locked_streamWaitEvent(ecd); - } - } - } - - return ihipLogStatus(hipSuccess); -}; - - -//--- -hipError_t hipStreamQuery(hipStream_t stream) { - HIP_INIT_SPECIAL_API(hipStreamQuery, TRACE_QUERY, stream); - - // Use default stream if 0 specified: - if (stream == hipStreamNull) { - ihipCtx_t* device = ihipGetTlsDefaultCtx(); - stream = device->_defaultStream; - } - - bool isEmpty = 0; - - { - LockedAccessor_StreamCrit_t crit(stream->_criticalData); - isEmpty = crit->_av.get_is_empty(); - } - - hipError_t e = isEmpty ? hipSuccess : hipErrorNotReady; - - return ihipLogStatus(e); -} - - -//--- -hipError_t hipStreamSynchronize(hipStream_t stream) { - HIP_INIT_SPECIAL_API(hipStreamSynchronize, TRACE_SYNC, stream); - - return ihipLogStatus(ihipStreamSynchronize(tls, stream)); -} - - -//--- -/** - * @return #hipSuccess, #hipErrorInvalidHandle - */ -hipError_t hipStreamDestroy(hipStream_t stream) { - HIP_INIT_API(hipStreamDestroy, stream); - - hipError_t e = hipSuccess; - - //--- Drain the stream: - if (stream == NULL) { - if (!HIP_FORCE_NULL_STREAM) { - e = hipErrorInvalidHandle; - } - } else { - stream->locked_wait(); - - ihipCtx_t* ctx = stream->getCtx(); - - if (ctx) { - ctx->locked_removeStream(stream); - delete stream; - } else { - e = hipErrorInvalidHandle; - } - } - - return ihipLogStatus(e); -} - - -//--- -hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int* flags) { - HIP_INIT_API(hipStreamGetFlags, stream, flags); - - if (flags == NULL) { - return ihipLogStatus(hipErrorInvalidValue); - } else if (stream == hipStreamNull) { - return ihipLogStatus(hipErrorInvalidHandle); - } else { - *flags = stream->_flags; - return ihipLogStatus(hipSuccess); - } -} - - -//-- -hipError_t hipStreamGetPriority(hipStream_t stream, int* priority) { - HIP_INIT_API(hipStreamGetPriority, stream, priority); - - if (priority == NULL) { - return ihipLogStatus(hipErrorInvalidValue); - } else if (stream == hipStreamNull) { - return ihipLogStatus(hipErrorInvalidHandle); - } else { -#if defined(__HCC__) && (__hcc_major__ < 3) && (__hcc_minor__ < 3) - *priority = 0; -#else - LockedAccessor_StreamCrit_t crit(stream->criticalData()); - *priority = crit->_av.get_queue_priority(); -#endif - return ihipLogStatus(hipSuccess); - } -} - - -//--- -hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData, - unsigned int flags) { - HIP_INIT_API(hipStreamAddCallback, stream, callback, userData, flags); - - auto stream_original{stream}; - stream = ihipSyncAndResolveStream(stream); - - if (!stream) return hipErrorInvalidValue; - - LockedAccessor_StreamCrit_t cs{stream->criticalData()}; - - // create first marker - auto cf = cs->_av.create_marker(hc::no_scope); - // get its signal - auto signal = *reinterpret_cast(cf.get_native_handle()); - // increment its signal value - hsa_signal_add_relaxed(signal, 1); - - // create callback that can be passed to hsa_amd_signal_async_handler - // this function will call the user's callback, then sets first packet's signal to 0 to indicate completion - auto t{new std::function{[=]() { - callback(stream_original, hipSuccess, userData); - hsa_signal_store_relaxed(signal, 0); - }}}; - - // register above callback with HSA runtime to be called when first packet's signal - // is decremented from 2 to 1 by CP (or it is already at 1) - hsa_amd_signal_async_handler(signal, HSA_SIGNAL_CONDITION_EQ, 1, - [](hsa_signal_value_t x, void* p) { - (*static_cast(p))(); - delete static_cast(p); - return false; - }, t); - - // create additional marker that blocks on the first one - cs->_av.create_blocking_marker(cf, hc::no_scope); - - return ihipLogStatus(hipSuccess); -} diff --git a/src/hip_surface.cpp b/src/hip_surface.cpp deleted file mode 100644 index 9acd827f73..0000000000 --- a/src/hip_surface.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* -Copyright (c) 2018 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include - -#include - -#include "hip/hip_runtime.h" -#include "hip_hcc_internal.h" -#include "trace_helper.h" - -#include "hip_surface.h" - -static std::map surfaceHash; - -void saveSurfaceInfo(const hipSurface* pSurface, const hipResourceDesc* pResDesc) { - if (pResDesc != nullptr) { - memcpy((void*)&(pSurface->resDesc), (void*)pResDesc, sizeof(hipResourceDesc)); - } -} - -// Surface Object APIs -hipError_t hipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject, - const hipResourceDesc* pResDesc) { - HIP_INIT_API(hipCreateSurfaceObject, pSurfObject, pResDesc); - hipError_t hip_status = hipSuccess; - - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx) { - hipSurface* pSurface = (hipSurface*)malloc(sizeof(hipSurface)); - if (pSurface != nullptr) { - memset(pSurface, 0, sizeof(hipSurface)); - saveSurfaceInfo(pSurface, pResDesc); - } - - switch (pResDesc->resType) { - case hipResourceTypeArray: - pSurface->array = pResDesc->res.array.array; - break; - default: - break; - } - unsigned int* surfObj; - hipMalloc((void**)&surfObj, sizeof(hipArray)); - hipMemcpy(surfObj, (void*)pResDesc->res.array.array, sizeof(hipArray), - hipMemcpyHostToDevice); - *pSurfObject = (hipSurfaceObject_t)surfObj; - surfaceHash[*pSurfObject] = pSurface; - } - - return ihipLogStatus(hip_status); -} - -hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject) { - HIP_INIT_API(hipDestroySurfaceObject, surfaceObject); - - hipError_t hip_status = hipSuccess; - - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx) { - hipSurface* pSurface = surfaceHash[surfaceObject]; - if (pSurface != nullptr) { - free(pSurface); - surfaceHash.erase(surfaceObject); - } - } - return ihipLogStatus(hip_status); -} diff --git a/src/hip_surface.h b/src/hip_surface.h deleted file mode 100644 index 8b30c95f2b..0000000000 --- a/src/hip_surface.h +++ /dev/null @@ -1,32 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_SURFACE_H -#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_SURFACE_H - -#include -struct hipSurface { - hipArray* array; - hipResourceDesc resDesc; -}; - -#endif diff --git a/src/hip_texture.cpp b/src/hip_texture.cpp deleted file mode 100644 index 29f0465dc1..0000000000 --- a/src/hip_texture.cpp +++ /dev/null @@ -1,851 +0,0 @@ - -#include - -#include - -#include "hsa/hsa.h" -#include "hsa/hsa_ext_amd.h" - -#include "hip/hip_runtime.h" -#include "hip_hcc_internal.h" -#include "trace_helper.h" - -#include "hip_texture.h" - -static std::map textureHash; - -void saveTextureInfo(const hipTexture* pTexture, const hipResourceDesc* pResDesc, - const hipTextureDesc* pTexDesc, const hipResourceViewDesc* pResViewDesc) { - if (pResDesc != nullptr) { - memcpy((void*)&(pTexture->resDesc), (void*)pResDesc, sizeof(hipResourceDesc)); - } - - if (pTexDesc != nullptr) { - memcpy((void*)&(pTexture->texDesc), (void*)pTexDesc, sizeof(hipTextureDesc)); - } - - if (pResViewDesc != nullptr) { - memcpy((void*)&(pTexture->resViewDesc), (void*)pResViewDesc, sizeof(hipResourceViewDesc)); - } -} - -void getDrvChannelOrderAndType(const enum hipArray_Format Format, enum hipTextureReadMode readMode, unsigned int NumChannels, - hsa_ext_image_channel_order_t* channelOrder, - hsa_ext_image_channel_type_t* channelType) { - switch (Format) { - case HIP_AD_FORMAT_UNSIGNED_INT8: - *channelType = readMode == hipReadModeNormalizedFloat - ? HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8 - : HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8; - break; - case HIP_AD_FORMAT_UNSIGNED_INT16: - *channelType = readMode == hipReadModeNormalizedFloat - ? HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16 - : HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16; - break; - case HIP_AD_FORMAT_UNSIGNED_INT32: - *channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32; - break; - case HIP_AD_FORMAT_SIGNED_INT8: - *channelType = readMode == hipReadModeNormalizedFloat - ? HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8 - : HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8; - break; - case HIP_AD_FORMAT_SIGNED_INT16: - *channelType = readMode == hipReadModeNormalizedFloat - ? HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16 - : HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16; - break; - case HIP_AD_FORMAT_SIGNED_INT32: - *channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32; - break; - case HIP_AD_FORMAT_HALF: - *channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT; - break; - case HIP_AD_FORMAT_FLOAT: - *channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT; - break; - default: - break; - } - - if (NumChannels == 4) { - *channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA; - } else if (NumChannels == 2) { - *channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RG; - } else if (NumChannels == 1) { - *channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_R; - } -} -void getChannelOrderAndType(const hipChannelFormatDesc& desc, enum hipTextureReadMode readMode, - hsa_ext_image_channel_order_t* channelOrder, - hsa_ext_image_channel_type_t* channelType) { - if (desc.x != 0 && desc.y != 0 && desc.z != 0 && desc.w != 0) { - *channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA; - } else if (desc.x != 0 && desc.y != 0 && desc.z != 0 && desc.w == 0) { - *channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGB; - } else if (desc.x != 0 && desc.y != 0 && desc.z == 0 && desc.w == 0) { - *channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RG; - } else if (desc.x != 0 && desc.y == 0 && desc.z == 0 && desc.w == 0) { - *channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_R; - } else { - } - - switch (desc.f) { - case hipChannelFormatKindUnsigned: - switch (desc.x) { - case 32: - *channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32; - break; - case 16: - *channelType = readMode == hipReadModeNormalizedFloat - ? HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16 - : HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16; - break; - case 8: - *channelType = readMode == hipReadModeNormalizedFloat - ? HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8 - : HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8; - break; - default: - *channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32; - } - break; - case hipChannelFormatKindSigned: - switch (desc.x) { - case 32: - *channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32; - break; - case 16: - *channelType = readMode == hipReadModeNormalizedFloat - ? HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16 - : HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16; - break; - case 8: - *channelType = readMode == hipReadModeNormalizedFloat - ? HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8 - : HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8; - break; - default: - *channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32; - } - break; - case hipChannelFormatKindFloat: - switch (desc.x) { - case 32: - *channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT; - break; - case 16: - *channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT; - break; - case 8: - break; - default: - *channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT; - } - break; - case hipChannelFormatKindNone: - default: - break; - } -} - -void fillSamplerDescriptor(hsa_ext_sampler_descriptor_t& samplerDescriptor, - enum hipTextureAddressMode addressMode, - enum hipTextureFilterMode filterMode, int normalizedCoords) { - if (normalizedCoords) { - samplerDescriptor.coordinate_mode = HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED; - } else { - samplerDescriptor.coordinate_mode = HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED; - } - - switch (filterMode) { - case hipFilterModePoint: - samplerDescriptor.filter_mode = HSA_EXT_SAMPLER_FILTER_MODE_NEAREST; - break; - case hipFilterModeLinear: - samplerDescriptor.filter_mode = HSA_EXT_SAMPLER_FILTER_MODE_LINEAR; - break; - } - - switch (addressMode) { - case hipAddressModeWrap: - samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT; - break; - case hipAddressModeClamp: - samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; - break; - case hipAddressModeMirror: - samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT; - break; - case hipAddressModeBorder: - samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER; - break; - } -} - -bool getHipTextureObject(hipTextureObject_t* pTexObject, hsa_ext_image_t& image, - hsa_ext_sampler_t sampler) { - unsigned int* texSRD; - hipMalloc((void**)&texSRD, HIP_TEXTURE_OBJECT_SIZE_DWORD * 4); - hipMemcpy(texSRD, (void*)image.handle, HIP_IMAGE_OBJECT_SIZE_DWORD * 4, - hipMemcpyDeviceToDevice); - hipMemcpy(texSRD + HIP_SAMPLER_OBJECT_OFFSET_DWORD, (void*)sampler.handle, - HIP_SAMPLER_OBJECT_SIZE_DWORD * 4, hipMemcpyDeviceToDevice); - *pTexObject = (hipTextureObject_t)texSRD; - -#ifdef DEBUG - unsigned int* srd = (unsigned int*)malloc(HIP_TEXTURE_OBJECT_SIZE_DWORD * 4); - hipMemcpy(srd, texSRD, HIP_TEXTURE_OBJECT_SIZE_DWORD * 4, hipMemcpyDeviceToHost); - printf("New SRD: \n"); - for (int i = 0; i < HIP_TEXTURE_OBJECT_SIZE_DWORD; i++) { - printf("SRD[%d]: %x\n", i, srd[i]); - } - printf("\n"); -#endif - return true; -} - -// Texture Object APIs -hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResourceDesc* pResDesc, - const hipTextureDesc* pTexDesc, - const hipResourceViewDesc* pResViewDesc) { - HIP_INIT_API(hipCreateTextureObject, pTexObject, pResDesc, pTexDesc, pResViewDesc); - hipError_t hip_status = hipSuccess; - - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx) { - hc::accelerator acc = ctx->getDevice()->_acc; - auto device = ctx->getWriteableDevice(); - - hsa_agent_t* agent = static_cast(acc.get_hsa_agent()); - - hipTexture* pTexture = (hipTexture*)malloc(sizeof(hipTexture)); - if (pTexture != nullptr) { - memset(pTexture, 0, sizeof(hipTexture)); - saveTextureInfo(pTexture, pResDesc, pTexDesc, pResViewDesc); - } - - hsa_ext_image_descriptor_t imageDescriptor; - hsa_ext_image_channel_order_t channelOrder; - hsa_ext_image_channel_type_t channelType; - void* devPtr = nullptr; - size_t pitch = 0; - switch (pResDesc->resType) { - case hipResourceTypeArray: - devPtr = pResDesc->res.array.array->data; - imageDescriptor.width = pResDesc->res.array.array->width; - imageDescriptor.height = pResDesc->res.array.array->height; - switch (pResDesc->res.array.array->type) { - case hipArrayLayered: - imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2DA; - imageDescriptor.depth = 0; - imageDescriptor.array_size = pResDesc->res.array.array->depth; - break; - case hipArrayCubemap: - imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_3D; - imageDescriptor.depth = pResDesc->res.array.array->depth; - imageDescriptor.array_size = 0; - break; - case hipArraySurfaceLoadStore: - case hipArrayTextureGather: - case hipArrayDefault: - default: - imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D; - imageDescriptor.depth = 0; - imageDescriptor.array_size = 0; - break; - } - getChannelOrderAndType(pResDesc->res.array.array->desc, pTexDesc->readMode, - &channelOrder, &channelType); - break; - case hipResourceTypeMipmappedArray: - devPtr = pResDesc->res.mipmap.mipmap->data; - imageDescriptor.width = pResDesc->res.mipmap.mipmap->width; - imageDescriptor.height = pResDesc->res.mipmap.mipmap->height; - imageDescriptor.depth = pResDesc->res.mipmap.mipmap->depth; - imageDescriptor.array_size = 0; - imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D; - getChannelOrderAndType(pResDesc->res.mipmap.mipmap->desc, pTexDesc->readMode, - &channelOrder, &channelType); - break; - case hipResourceTypeLinear: - devPtr = pResDesc->res.linear.devPtr; - imageDescriptor.width = pResDesc->res.linear.sizeInBytes/((pResDesc->res.linear.desc.x + pResDesc->res.linear.desc.y + pResDesc->res.linear.desc.z + pResDesc->res.linear.desc.w)/8); - imageDescriptor.height = 1; - imageDescriptor.depth = 0; - imageDescriptor.array_size = 0; - imageDescriptor.geometry = - HSA_EXT_IMAGE_GEOMETRY_1D; // ? HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR - getChannelOrderAndType(pResDesc->res.linear.desc, pTexDesc->readMode, &channelOrder, - &channelType); - break; - case hipResourceTypePitch2D: - devPtr = pResDesc->res.pitch2D.devPtr; - imageDescriptor.width = pResDesc->res.pitch2D.width; - imageDescriptor.height = pResDesc->res.pitch2D.height; - imageDescriptor.depth = 0; - imageDescriptor.array_size = 0; - imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D; - pitch = pResDesc->res.pitch2D.pitchInBytes; - getChannelOrderAndType(pResDesc->res.pitch2D.desc, pTexDesc->readMode, - &channelOrder, &channelType); - break; - default: - break; - } - - imageDescriptor.format.channel_order = channelOrder; - imageDescriptor.format.channel_type = channelType; - - hsa_ext_sampler_descriptor_t samplerDescriptor; - fillSamplerDescriptor(samplerDescriptor, pTexDesc->addressMode[0], pTexDesc->filterMode, - pTexDesc->normalizedCoords); - if(hipResourceTypeLinear == pResDesc->resType) { - samplerDescriptor.filter_mode = HSA_EXT_SAMPLER_FILTER_MODE_NEAREST; - samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER; - } else if(!pTexDesc->normalizedCoords) { - samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; - } - hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW; - - if(hipResourceTypePitch2D != pResDesc->resType) - pitch = getElementSize(channelOrder, channelType) * alignUp(imageDescriptor.width, IMAGE_PITCH_ALIGNMENT); - - if (HSA_STATUS_SUCCESS != hsa_ext_image_create_with_layout( - *agent, &imageDescriptor, devPtr, permission, - HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, pitch, 0, &(pTexture->image)) || - HSA_STATUS_SUCCESS != - hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) { - free(pTexture); - return ihipLogStatus(hipErrorRuntimeOther); - } - - getHipTextureObject(pTexObject, pTexture->image, pTexture->sampler); - - textureHash[*pTexObject] = pTexture; - } - - return ihipLogStatus(hip_status); -} - -hipError_t hipDestroyTextureObject(hipTextureObject_t textureObject) { - HIP_INIT_API(hipDestroyTextureObject, textureObject); - - hipError_t hip_status = hipSuccess; - - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx) { - hc::accelerator acc = ctx->getDevice()->_acc; - auto device = ctx->getWriteableDevice(); - - hsa_agent_t* agent = static_cast(acc.get_hsa_agent()); - - hipTexture* pTexture = textureHash[textureObject]; - if (pTexture != nullptr) { - hsa_ext_image_destroy(*agent, pTexture->image); - hsa_ext_sampler_destroy(*agent, pTexture->sampler); - free(pTexture); - textureHash.erase(textureObject); - } - } - return ihipLogStatus(hip_status); -} - -hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDesc, - hipTextureObject_t textureObject) { - HIP_INIT_API(hipGetTextureObjectResourceDesc, pResDesc, textureObject); - hipError_t hip_status = hipSuccess; - - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx) { - hipTexture* pTexture = textureHash[textureObject]; - if (pTexture != nullptr && pResDesc != nullptr) { - memcpy((void*)pResDesc, (void*)&(pTexture->resDesc), sizeof(hipResourceDesc)); - } - } - return ihipLogStatus(hip_status); -} - -hipError_t hipGetTextureObjectResourceViewDesc(hipResourceViewDesc* pResViewDesc, - hipTextureObject_t textureObject) { - HIP_INIT_API(hipGetTextureObjectResourceViewDesc, pResViewDesc, textureObject); - hipError_t hip_status = hipSuccess; - - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx) { - hipTexture* pTexture = textureHash[textureObject]; - if (pTexture != nullptr && pResViewDesc != nullptr) { - memcpy((void*)pResViewDesc, (void*)&(pTexture->resViewDesc), - sizeof(hipResourceViewDesc)); - } - } - return ihipLogStatus(hip_status); -} - -hipError_t hipGetTextureObjectTextureDesc(hipTextureDesc* pTexDesc, - hipTextureObject_t textureObject) { - HIP_INIT_API(hipGetTextureObjectTextureDesc, pTexDesc, textureObject); - - hipError_t hip_status = hipSuccess; - - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx) { - hipTexture* pTexture = textureHash[textureObject]; - if (pTexture != nullptr && pTexDesc != nullptr) { - memcpy((void*)pTexDesc, (void*)&(pTexture->texDesc), sizeof(hipTextureDesc)); - } - } - return ihipLogStatus(hip_status); -} - -// Texture Reference APIs -hipError_t ihipBindTextureImpl(TlsData *tls_, int dim, enum hipTextureReadMode readMode, size_t* offset, - const void* devPtr, const struct hipChannelFormatDesc* desc, - size_t size, textureReference* tex) { - TlsData *tls = (tls_ == nullptr) ? tls_get_ptr() : tls_; - hipError_t hip_status = hipSuccess; - enum hipTextureAddressMode addressMode = tex->addressMode[0]; - enum hipTextureFilterMode filterMode = tex->filterMode; - int normalizedCoords = tex->normalized; - hipTextureObject_t& textureObject = tex->textureObject; - if(offset != nullptr) - *offset = 0; - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx) { - hc::accelerator acc = ctx->getDevice()->_acc; - auto device = ctx->getWriteableDevice(); - - hsa_agent_t* agent = static_cast(acc.get_hsa_agent()); - - hipTexture* pTexture = (hipTexture*)malloc(sizeof(hipTexture)); - if (pTexture != nullptr) { - memset(pTexture, 0, sizeof(hipTexture)); - } - - hsa_ext_image_descriptor_t imageDescriptor; - - assert(dim == hipTextureType1D); - - imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_1D; - imageDescriptor.width = size; - imageDescriptor.height = 1; - imageDescriptor.depth = 1; - imageDescriptor.array_size = 0; - - hsa_ext_image_channel_order_t channelOrder; - hsa_ext_image_channel_type_t channelType; - if (NULL == desc) { - getDrvChannelOrderAndType(tex->format, readMode, tex->numChannels, &channelOrder, &channelType); - } else { - getChannelOrderAndType(*desc, readMode, &channelOrder, &channelType); - } - imageDescriptor.format.channel_order = channelOrder; - imageDescriptor.format.channel_type = channelType; - - hsa_ext_sampler_descriptor_t samplerDescriptor; - samplerDescriptor.filter_mode = HSA_EXT_SAMPLER_FILTER_MODE_NEAREST; - samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER; - if (normalizedCoords) { - samplerDescriptor.coordinate_mode = HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED; - } else { - samplerDescriptor.coordinate_mode = HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED; - } - - hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW; - - size_t rowPitch = getElementSize(channelOrder, channelType) * alignUp(size, IMAGE_PITCH_ALIGNMENT); - - if (HSA_STATUS_SUCCESS != hsa_ext_image_create_with_layout( - *agent, &imageDescriptor, devPtr, permission, - HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, rowPitch, 0, &(pTexture->image)) || - HSA_STATUS_SUCCESS != - hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) { - free(pTexture); - return hipErrorRuntimeOther; - } - getHipTextureObject(&textureObject, pTexture->image, pTexture->sampler); - pTexture->devPtr = (void*) devPtr; - textureHash[textureObject] = pTexture; - } - - return hip_status; -} - -hipError_t hipBindTexture(size_t* offset, textureReference* tex, const void* devPtr, - const hipChannelFormatDesc* desc, size_t size) { - HIP_INIT_API(hipBindTexture, offset, tex, devPtr, desc, size); - hipError_t hip_status = hipSuccess; - // TODO: hipReadModeElementType is default. - hip_status = ihipBindTextureImpl(tls, hipTextureType1D, hipReadModeElementType, offset, devPtr, desc, - size, tex); - return ihipLogStatus(hip_status); -} - -hipError_t ihipBindTexture2DImpl(TlsData *tls, int dim, enum hipTextureReadMode readMode, size_t* offset, - const void* devPtr, const struct hipChannelFormatDesc* desc, - size_t width, size_t height, textureReference* tex, size_t pitch) { - hipError_t hip_status = hipSuccess; - enum hipTextureAddressMode addressMode = tex->addressMode[0]; - enum hipTextureFilterMode filterMode = tex->filterMode; - int normalizedCoords = tex->normalized; - hipTextureObject_t& textureObject = tex->textureObject; - if(offset != nullptr) - *offset = 0; - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx) { - hc::accelerator acc = ctx->getDevice()->_acc; - auto device = ctx->getWriteableDevice(); - - hsa_agent_t* agent = static_cast(acc.get_hsa_agent()); - - hipTexture* pTexture = (hipTexture*)malloc(sizeof(hipTexture)); - if (pTexture != nullptr) { - memset(pTexture, 0, sizeof(hipTexture)); - } - - hsa_ext_image_descriptor_t imageDescriptor; - - assert(dim == hipTextureType2D); - - imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D; - imageDescriptor.width = width; - imageDescriptor.height = height; - imageDescriptor.depth = 1; - imageDescriptor.array_size = 0; - - hsa_ext_image_channel_order_t channelOrder; - hsa_ext_image_channel_type_t channelType; - - if (NULL == desc) { - getDrvChannelOrderAndType(tex->format, readMode, tex->numChannels, &channelOrder, &channelType); - } else { - getChannelOrderAndType(*desc, readMode, &channelOrder, &channelType); - } - imageDescriptor.format.channel_order = channelOrder; - imageDescriptor.format.channel_type = channelType; - - hsa_ext_sampler_descriptor_t samplerDescriptor; - fillSamplerDescriptor(samplerDescriptor, addressMode, filterMode, normalizedCoords); - if(!normalizedCoords) { - samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; - } - hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW; - - if( 0 == pitch) - pitch = getElementSize(channelOrder, channelType) * alignUp(width, IMAGE_PITCH_ALIGNMENT); - - if (HSA_STATUS_SUCCESS != hsa_ext_image_create_with_layout( - *agent, &imageDescriptor, devPtr, permission, - HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, pitch, 0, &(pTexture->image)) || - HSA_STATUS_SUCCESS != - hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) { - free(pTexture); - return hipErrorRuntimeOther; - } - getHipTextureObject(&textureObject, pTexture->image, pTexture->sampler); - pTexture->devPtr = (void*) devPtr; - textureHash[textureObject] = pTexture; - } - - return hip_status; -} - -hipError_t hipBindTexture2D(size_t* offset, textureReference* tex, const void* devPtr, - const hipChannelFormatDesc* desc, size_t width, size_t height, - size_t pitch) { - HIP_INIT_API(hipBindTexture2D, offset, tex, devPtr, desc, width, height, pitch); - hipError_t hip_status = hipSuccess; - - //TODO: Fix when HSA accepts user defined pitch - if(pitch % 64) pitch =0; - - hip_status = ihipBindTexture2DImpl(tls, hipTextureType2D, hipReadModeElementType, offset, devPtr, - desc, width, height, tex, pitch); - return ihipLogStatus(hip_status); -} - -hipError_t ihipBindTextureToArrayImpl(TlsData *tls_, int dim, enum hipTextureReadMode readMode, - hipArray_const_t array, - const struct hipChannelFormatDesc& desc, - textureReference* tex) { - TlsData *tls = (tls_ == nullptr) ? tls_get_ptr() : tls_; - hipError_t hip_status = hipSuccess; - enum hipTextureAddressMode addressMode = tex->addressMode[0]; - enum hipTextureFilterMode filterMode = tex->filterMode; - int normalizedCoords = tex->normalized; - hipTextureObject_t& textureObject = tex->textureObject; - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx) { - hc::accelerator acc = ctx->getDevice()->_acc; - auto device = ctx->getWriteableDevice(); - - hsa_agent_t* agent = static_cast(acc.get_hsa_agent()); - - hipTexture* pTexture = (hipTexture*)malloc(sizeof(hipTexture)); - if (pTexture != nullptr) { - memset(pTexture, 0, sizeof(hipTexture)); - } - - hsa_ext_image_descriptor_t imageDescriptor; - - imageDescriptor.width = array->width; - imageDescriptor.height = array->height; - imageDescriptor.depth = array->depth; - imageDescriptor.array_size = 0; - - switch (dim) { - case hipTextureType1D: - imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_1D; - imageDescriptor.height = 1; - imageDescriptor.depth = 1; - break; - case hipTextureType2D: - imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D; - imageDescriptor.depth = 1; - break; - case hipTextureType3D: - case hipTextureTypeCubemap: - imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_3D; - break; - case hipTextureType1DLayered: - imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_1DA; - imageDescriptor.height = 1; - imageDescriptor.array_size = array->height; - break; - case hipTextureType2DLayered: - imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2DA; - imageDescriptor.depth = 1; - imageDescriptor.array_size = array->depth; - break; - case hipTextureTypeCubemapLayered: - default: - break; - } - - hsa_ext_image_channel_order_t channelOrder; - hsa_ext_image_channel_type_t channelType; - if (array->isDrv) { - getDrvChannelOrderAndType(array->Format, readMode, array->NumChannels, - &channelOrder, &channelType); - } else { - getChannelOrderAndType(desc, readMode, &channelOrder, &channelType); - } - imageDescriptor.format.channel_order = channelOrder; - imageDescriptor.format.channel_type = channelType; - - hsa_ext_sampler_descriptor_t samplerDescriptor; - fillSamplerDescriptor(samplerDescriptor, addressMode, filterMode, normalizedCoords); - if(!normalizedCoords) { - samplerDescriptor.address_mode = HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; - } - hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW; - - size_t rowPitch = getElementSize(channelOrder, channelType) * alignUp(imageDescriptor.width, IMAGE_PITCH_ALIGNMENT); - - if (HSA_STATUS_SUCCESS != hsa_ext_image_create_with_layout( - *agent, &imageDescriptor, array->data, permission, - HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, rowPitch, 0, &(pTexture->image)) || - HSA_STATUS_SUCCESS != - hsa_ext_sampler_create(*agent, &samplerDescriptor, &(pTexture->sampler))) { - return hipErrorRuntimeOther; - } - getHipTextureObject(&textureObject, pTexture->image, pTexture->sampler); - pTexture->devPtr = (void*) array; - textureHash[textureObject] = pTexture; - } - - return hip_status; -} - -hipError_t hipBindTextureToArray(textureReference* tex, hipArray_const_t array, - const hipChannelFormatDesc* desc) { - HIP_INIT_API(hipBindTextureToArray, tex, array, desc); - hipError_t hip_status = hipSuccess; - // TODO: hipReadModeElementType is default. - hip_status = - ihipBindTextureToArrayImpl(tls, array->textureType, hipReadModeElementType, array, *desc, tex); - return ihipLogStatus(hip_status); -} - -hipError_t hipBindTextureToMipmappedArray(textureReference* tex, - hipMipmappedArray_const_t mipmappedArray, - const hipChannelFormatDesc* desc) { - HIP_INIT_API(hipBindTextureToMipmappedArray, tex, mipmappedArray, desc); - hipError_t hip_status = hipSuccess; - return ihipLogStatus(hip_status); -} - -hipError_t ihipUnbindTextureImpl(const hipTextureObject_t& textureObject) { - hipError_t hip_status = hipSuccess; - TlsData* tls=tls_get_ptr(); - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx) { - hc::accelerator acc = ctx->getDevice()->_acc; - auto device = ctx->getWriteableDevice(); - - hsa_agent_t* agent = static_cast(acc.get_hsa_agent()); - hipTexture* pTexture = textureHash[textureObject]; - if (pTexture != nullptr) { - hsa_ext_image_destroy(*agent, pTexture->image); - hsa_ext_sampler_destroy(*agent, pTexture->sampler); - free(pTexture); - textureHash.erase(textureObject); - } - } - - return hip_status; -} - -hipError_t hipUnbindTexture(const textureReference* tex) { - HIP_INIT_API(hipUnbindTexture, tex); - hipError_t hip_status = hipSuccess; - hip_status = ihipUnbindTextureImpl(tex->textureObject); - return ihipLogStatus(hip_status); -} - -hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array) { - HIP_INIT_API(hipGetChannelDesc, desc, array); - hipError_t hip_status = hipSuccess; - - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx) { - *desc = array->desc; - } - return ihipLogStatus(hip_status); -} - -hipError_t hipGetTextureAlignmentOffset(size_t* offset, const textureReference* tex) { - HIP_INIT_API(hipGetTextureAlignmentOffset, offset, tex); - - hipError_t hip_status = hipSuccess; - - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx) { - if(offset != nullptr) - *offset = 0; - } - return ihipLogStatus(hip_status); -} - -hipError_t hipGetTextureReference(const textureReference** tex, const void* symbol) { - HIP_INIT_API(hipGetTextureReference, tex, symbol); - - hipError_t hip_status = hipSuccess; - - auto ctx = ihipGetTlsDefaultCtx(); - if (ctx) { - } - return ihipLogStatus(hip_status); -} - -hipError_t hipTexRefSetFormat(textureReference* tex, hipArray_Format fmt, int NumPackedComponents) { - HIP_INIT_API(hipTexRefSetFormat, tex, fmt, NumPackedComponents); - hipError_t hip_status = hipSuccess; - tex->format = fmt; - tex->numChannels = NumPackedComponents; - return ihipLogStatus(hip_status); -} - -hipError_t hipTexRefSetFlags(textureReference* tex, unsigned int flags) { - HIP_INIT_API(hipTexRefSetFlags, tex, flags); - hipError_t hip_status = hipSuccess; - if(flags == HIP_TRSF_READ_AS_INTEGER) - tex->readMode = hipReadModeElementType; - else if(flags == HIP_TRSF_NORMALIZED_COORDINATES) - tex->normalized = flags; - return ihipLogStatus(hip_status); -} - -hipError_t hipTexRefSetFilterMode(textureReference* tex, hipTextureFilterMode fm) { - HIP_INIT_API(hipTexRefSetFilterMode, tex, fm); - hipError_t hip_status = hipSuccess; - tex->filterMode = fm; - return ihipLogStatus(hip_status); -} - -hipError_t hipTexRefSetAddressMode(textureReference* tex, int dim, hipTextureAddressMode am) { - HIP_INIT_API(hipTexRefSetAddressMode, tex, dim, am); - hipError_t hip_status = hipSuccess; - tex->addressMode[dim] = am; - return ihipLogStatus(hip_status); -} - -hipError_t hipTexRefGetAddressMode(hipTextureAddressMode* am, textureReference tex, int dim) { - HIP_INIT_API(hipTexRefGetAddressMode,am, &tex, dim); - - if ((am == nullptr) || (dim >= 3)) - return ihipLogStatus(hipErrorInvalidValue); - - *am = tex.addressMode[dim]; - - return ihipLogStatus(hipSuccess); -} - -hipError_t hipTexRefSetArray(textureReference* tex, hipArray_const_t array, unsigned int flags) { - HIP_INIT_API(hipTexRefSetArray, tex, array, flags); - hipError_t hip_status = hipSuccess; - - hip_status = ihipBindTextureToArrayImpl(tls, array->textureType, tex->readMode, array, - array->desc, tex); - return ihipLogStatus(hip_status); -} - -hipError_t hipTexRefGetArray(hipArray_t* array, textureReference tex) { - HIP_INIT_API(hipTexRefGetArray, array, &tex); - - if (array == nullptr) - return ihipLogStatus(hipErrorInvalidValue); - - hipTexture* pTexture = textureHash[tex.textureObject]; - if((pTexture == nullptr) || (hipResourceTypeArray != pTexture->resDesc.resType)) - return ihipLogStatus(hipErrorInvalidImage); - - if (pTexture->devPtr == nullptr) - return ihipLogStatus(hipErrorUnknown); - - *array = reinterpret_cast(pTexture->devPtr); - - return ihipLogStatus(hipSuccess); -} - -hipError_t hipTexRefSetAddress(size_t* offset, textureReference* tex, hipDeviceptr_t devPtr, - size_t size) { - HIP_INIT_API(hipTexRefSetAddress, offset, tex, devPtr, size); - hipError_t hip_status = hipSuccess; - // TODO: hipReadModeElementType is default. - hip_status = ihipBindTextureImpl(tls, hipTextureType1D, tex->readMode, offset, devPtr, NULL, - size, tex); - return ihipLogStatus(hip_status); -} - -hipError_t hipTexRefGetAddress(hipDeviceptr_t* dev_ptr, textureReference tex) { - HIP_INIT_API(hipTexRefGetAddress,dev_ptr, &tex); - - if (dev_ptr == nullptr) - return ihipLogStatus(hipErrorInvalidValue); - - hipTexture* pTexture = textureHash[tex.textureObject]; - if (pTexture == nullptr) - return ihipLogStatus(hipErrorInvalidImage); - - if (pTexture->devPtr == nullptr) - return ihipLogStatus(hipErrorUnknown); - - *dev_ptr = reinterpret_cast(pTexture->devPtr); - return ihipLogStatus(hipSuccess); -} - -hipError_t hipTexRefSetAddress2D(textureReference* tex, const HIP_ARRAY_DESCRIPTOR* desc, - hipDeviceptr_t devPtr, size_t pitch) { - HIP_INIT_API(hipTexRefSetAddress2D, tex, desc, devPtr, pitch); - size_t offset; - hipError_t hip_status = hipSuccess; - // TODO: hipReadModeElementType is default. - //TODO: Fix when HSA accepts user defined pitch - if(pitch % 64) pitch =0; - - hip_status = ihipBindTexture2DImpl(tls, hipTextureType2D, tex->readMode, &offset, devPtr, - NULL, desc->Width, desc->Height, tex, pitch); - return ihipLogStatus(hip_status); -} diff --git a/src/hip_texture.h b/src/hip_texture.h deleted file mode 100644 index 1affc48b85..0000000000 --- a/src/hip_texture.h +++ /dev/null @@ -1,37 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#ifndef HIP_INCLUDE_HCC_DETAIL_HIP_TEXTURE_H -#define HIP_INCLUDE_HCC_DETAIL_HIP_TEXTURE_H - -#include - -struct hipTexture { - hipResourceDesc resDesc; - hipTextureDesc texDesc; - hipResourceViewDesc resViewDesc; - hsa_ext_image_t image; - hsa_ext_sampler_t sampler; - void* devPtr; -}; - -#endif diff --git a/src/hip_util.h b/src/hip_util.h deleted file mode 100644 index 8c4d19bb40..0000000000 --- a/src/hip_util.h +++ /dev/null @@ -1,38 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#ifndef HIP_INCLUDE_HCC_DETAIL_HIP_UTIL_H -#define HIP_INCLUDE_HCC_DETAIL_HIP_UTIL_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#endif diff --git a/src/hiprtc.cpp b/src/hiprtc.cpp deleted file mode 100644 index a11207f337..0000000000 --- a/src/hiprtc.cpp +++ /dev/null @@ -1,634 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "../include/hip/hiprtc.h" -#include "code_object_bundle.inl" -#include "../include/hip/hcc_detail/elfio/elfio.hpp" -#include "../include/hip/hcc_detail/program_state.hpp" - -#include "../lpl_ca/pstreams/pstream.h" - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -extern "C" const char* hiprtcGetErrorString(hiprtcResult x) -{ - switch (x) { - case HIPRTC_SUCCESS: - return "HIPRTC_SUCCESS"; - case HIPRTC_ERROR_OUT_OF_MEMORY: - return "HIPRTC_ERROR_OUT_OF_MEMORY"; - case HIPRTC_ERROR_PROGRAM_CREATION_FAILURE: - return "HIPRTC_ERROR_PROGRAM_CREATION_FAILURE"; - case HIPRTC_ERROR_INVALID_INPUT: - return "HIPRTC_ERROR_INVALID_INPUT"; - case HIPRTC_ERROR_INVALID_PROGRAM: - return "HIPRTC_ERROR_INVALID_PROGRAM"; - case HIPRTC_ERROR_INVALID_OPTION: - return "HIPRTC_ERROR_INVALID_OPTION"; - case HIPRTC_ERROR_COMPILATION: - return "HIPRTC_ERROR_COMPILATION"; - case HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE: - return "HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE"; - case HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION: - return "HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION"; - case HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION: - return "HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION"; - case HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID: - return "HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID"; - case HIPRTC_ERROR_INTERNAL_ERROR: - return "HIPRTC_ERROR_INTERNAL_ERROR"; - default: throw std::logic_error{"Invalid HIPRTC result."}; - }; -} - -namespace hip_impl { -inline bool create_directory(const std::string& path) { - mode_t mode = 0755; - int ret = mkdir(path.c_str(), mode); - if (ret == 0) return true; - return false; -} - -inline bool fileExists (const std::string& name) { - struct stat buffer; - return (stat (name.c_str(), &buffer) == 0); -} -} // namespace hip_impl - -namespace -{ - char* demangle(const char* x) - { - if (!x) return nullptr; - - int s{}; - char* tmp = abi::__cxa_demangle(x, nullptr, nullptr, &s); - - if (s != 0) return nullptr; - - return tmp; - } -} // Unnamed namespace. - -namespace -{ - struct Symbol { - std::string name; - ELFIO::Elf64_Addr value = 0; - ELFIO::Elf_Xword size = 0; - ELFIO::Elf_Half sect_idx = 0; - std::uint8_t bind = 0; - std::uint8_t type = 0; - std::uint8_t other = 0; - }; - - inline - Symbol read_symbol(const ELFIO::symbol_section_accessor& section, - unsigned int idx) { - assert(idx < section.get_symbols_num()); - - Symbol r; - section.get_symbol( - idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other); - - return r; - } -} // Unnamed namespace. - -struct _hiprtcProgram { - // DATA - STATICS - static std::vector> programs; - static std::mutex mtx; - - // DATA - std::vector> headers; - std::vector> names; - std::vector loweredNames; - std::vector elf; - std::string source; - std::string name; - std::string log; - bool compiled; - - // STATICS - static - hiprtcResult destroy(_hiprtcProgram* p) - { - using namespace std; - - lock_guard lck{mtx}; - - const auto it{find_if(programs.cbegin(), programs.cend(), - [=](const unique_ptr<_hiprtcProgram>& x) { - return x.get() == p; - })}; - - if (it == programs.cend()) return HIPRTC_ERROR_INVALID_PROGRAM; - - return HIPRTC_SUCCESS; - } - - static - std::string handleMangledName(std::string name) - { - using namespace std; - - char* demangled = demangle(name.c_str()); - name.assign(demangled == nullptr ? "" : demangled); - free(demangled); - - if (name.empty()) return name; - - if (name.find("void ") == 0) name.erase(0, strlen("void ")); - - auto dx{name.find_first_of("(<")}; - - if (dx == string::npos) return name; - - if (name[dx] == '<') { - auto cnt{1u}; - do { - ++dx; - cnt += (name[dx] == '<') ? 1 : ((name[dx] == '>') ? -1 : 0); - } while (cnt); - - name.erase(++dx); - } - else name.erase(dx); - - return name; - } - - static - _hiprtcProgram* make(std::string s, std::string n, - std::vector> h) - { - using namespace std; - - unique_ptr<_hiprtcProgram> tmp{new _hiprtcProgram{move(h), {}, {}, {}, - move(s), move(n), {}, - false}}; - - lock_guard lck{mtx}; - - programs.push_back(move(tmp)); - - return programs.back().get(); - } - - static - bool isValid(_hiprtcProgram* p) noexcept - { - return std::find_if(programs.cbegin(), programs.cend(), - [=](const std::unique_ptr<_hiprtcProgram>& x) { - return x.get() == p; - }) != programs.cend(); - } - - // MANIPULATORS - bool compile(const std::vector& args) - { - using namespace ELFIO; - using namespace redi; - using namespace std; - - ipstream compile{args.front(), args, pstreambuf::pstderr}; - - constexpr const auto tmp_size{1024u}; - char tmp[tmp_size]{}; - while (!compile.eof()) { - log.append(tmp, tmp + compile.readsome(tmp, tmp_size)); - } - - compile.close(); - - if (compile.rdbuf()->exited() && - compile.rdbuf()->status() != EXIT_SUCCESS) return false; - - elfio reader; - if (!reader.load(args.back())) return false; - - const auto it{find_if(reader.sections.begin(), reader.sections.end(), - [](const section* x) { - return (x->get_name() == ".hip_fatbin") || (x->get_name() == ".kernel"); - })}; - - if (it == reader.sections.end()) return false; - - hip_impl::Bundled_code_header h{(*it)->get_data()}; - - if (bundles(h).empty()) return false; - - elf.assign(bundles(h).back().blob.cbegin(), - bundles(h).back().blob.cend()); - - return true; - } - - bool readLoweredNames() - { - using namespace ELFIO; - using namespace hip_impl; - using namespace std; - - if (names.empty()) return true; - - istringstream blob{string{elf.cbegin(), elf.cend()}}; - - elfio reader; - - if (!reader.load(blob)) return false; - - const auto it{find_if(reader.sections.begin(), reader.sections.end(), - [](const section* x) { - return x->get_type() == SHT_SYMTAB; - })}; - - ELFIO::symbol_section_accessor symbols{reader, *it}; - - auto n{symbols.get_symbols_num()}; - - if (n < loweredNames.size()) return false; - - while (n--) { - const auto tmp{read_symbol(symbols, n)}; - - auto it{find_if(names.cbegin(), names.cend(), - [&](const pair& x) { - return x.second == tmp.name; - })}; - - if (it == names.cend()) { - const auto name{handleMangledName(tmp.name)}; - - if (name.empty()) continue; - - it = find_if(names.cbegin(), names.cend(), - [&](const pair& x) { - return x.second == name; - }); - - if (it == names.cend()) continue; - } - - loweredNames[distance(names.cbegin(), it)] = tmp.name; - } - - return true; - } - - void replaceExtension(std::string& fileName, const std::string &ext) const { - auto res = fileName.rfind('.'); - auto sloc = fileName.rfind('/'); // slash location - if (res != std::string::npos && (res > sloc || sloc == std::string::npos)) { - fileName.replace(fileName.begin() + res, fileName.end(), ext); - } else { - fileName += ext; - } - } - - // ACCESSORS - std::string writeTemporaryFiles( - const std::string& programFolder) const - { - using namespace std; - - vector> fut{headers.size()}; - transform(headers.cbegin(), headers.cend(), begin(fut), - [&](const pair& x) { - return async([&]() { - ofstream h{programFolder + '/' + x.first}; - h.write(x.second.data(), x.second.size()); - }); - }); - - auto tmp{(programFolder + '/' + name)}; - replaceExtension(tmp, ".cpp"); - ofstream{tmp}.write(source.data(), source.size()); - - return tmp; - } - - -}; -std::vector> _hiprtcProgram::programs{}; -std::mutex _hiprtcProgram::mtx{}; - -namespace -{ - inline - bool isValidProgram(const hiprtcProgram p) - { - if (!p) return false; - - std::lock_guard lck{_hiprtcProgram::mtx}; - - return _hiprtcProgram::isValid(p); - } -} // Unnamed namespace. - -extern "C" hiprtcResult hiprtcAddNameExpression(hiprtcProgram p, const char* n) -{ - if (!n) return HIPRTC_ERROR_INVALID_INPUT; - if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM; - if (p->compiled) return HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION; - - const auto id{p->names.size()}; - - p->names.emplace_back(n, n); - p->loweredNames.emplace_back(); - - if (p->names.back().second.back() == ')') { - p->names.back().second.pop_back(); - p->names.back().second.erase(0, p->names.back().second.find('(')); - } - if (p->names.back().second.front() == '&') { - p->names.back().second.erase(0, 1); - } - - const auto var{"__hiprtc_" + std::to_string(id)}; - p->source.append("\nextern \"C\" constexpr auto " + var + " = " + n + ';'); - - return HIPRTC_SUCCESS; -} - -namespace -{ - class Unique_temporary_path { - // DATA - std::string path_{}; - public: - // CREATORS - Unique_temporary_path() : path_{std::tmpnam(nullptr)} - { - while (hip_impl::fileExists(path_)) { - path_ = std::tmpnam(nullptr); - } - } - - Unique_temporary_path(const Unique_temporary_path&) = default; - Unique_temporary_path(Unique_temporary_path&&) = default; - - ~Unique_temporary_path() noexcept - { - std::string s("rm -r " + path_); - system(s.c_str()); - } - - // MANIPULATORS - Unique_temporary_path& operator=( - const Unique_temporary_path&) = default; - Unique_temporary_path& operator=(Unique_temporary_path&&) = default; - - // ACCESSORS - const std::string& path() const noexcept - { - return path_; - } - }; -} // Unnamed namespace. - -namespace -{ - const std::string& defaultTarget() - { - using namespace std; - - static string r{"gfx900"}; - static once_flag f{}; - - call_once(f, []() { - static hsa_agent_t a{}; - hsa_iterate_agents([](hsa_agent_t x, void*) { - hsa_device_type_t t{}; - hsa_agent_get_info(x, HSA_AGENT_INFO_DEVICE, &t); - - if (t != HSA_DEVICE_TYPE_GPU) return HSA_STATUS_SUCCESS; - - a = x; - - return HSA_STATUS_INFO_BREAK; - }, nullptr); - - if (!a.handle) return; - - hsa_agent_iterate_isas(a, [](hsa_isa_t x, void*){ - uint32_t n{}; - hsa_isa_get_info_alt(x, HSA_ISA_INFO_NAME_LENGTH, &n); - - if (n == 0) return HSA_STATUS_SUCCESS; - - r.resize(n); - hsa_isa_get_info_alt(x, HSA_ISA_INFO_NAME, &r[0]); - - r.erase(0, r.find("gfx")); - - return HSA_STATUS_INFO_BREAK; - }, nullptr); - }); - - return r; - } - - inline - void handleTarget(std::vector& args) - { - using namespace std; - - bool hasTarget{false}; - for (auto&& x : args) { - const auto dx{x.find("--gpu-architecture")}; - const auto dy{(dx == string::npos) ? x.find("-arch") - : string::npos}; - - if (dx == dy) continue; - - x.replace(0, x.find('=', min(dx, dy)), "--amdgpu-target"); - hasTarget = true; - - break; - } - if (!hasTarget) args.push_back("--amdgpu-target=" + defaultTarget()); - } -} // Unnamed namespace. - -extern "C" hiprtcResult hiprtcCompileProgram(hiprtcProgram p, int n, const char** o) -{ - using namespace std; - - if (n && !o) return HIPRTC_ERROR_INVALID_INPUT; - if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM; - if (p->compiled) return HIPRTC_ERROR_COMPILATION; - - static const string hipcc{ - getenv("HIP_PATH") ? (getenv("HIP_PATH") + string{"/bin/hipcc"}) - : "/opt/rocm/bin/hipcc"}; - - if (!hip_impl::fileExists(hipcc)) { - return HIPRTC_ERROR_INTERNAL_ERROR; - } - - Unique_temporary_path tmp{}; - hip_impl::create_directory(tmp.path()); - - const auto src{p->writeTemporaryFiles(tmp.path())}; - - vector args{hipcc, "-fPIC -shared"}; - if (n) args.insert(args.cend(), o, o + n); - - handleTarget(args); - - args.emplace_back(src); - args.emplace_back("-o"); - args.emplace_back(tmp.path() + '/' + "hiprtc.out"); - - if (!p->compile(args)) return HIPRTC_ERROR_INTERNAL_ERROR; - if (!p->readLoweredNames()) return HIPRTC_ERROR_INTERNAL_ERROR; - - p->compiled = true; - - return HIPRTC_SUCCESS; -} - -extern "C" hiprtcResult hiprtcCreateProgram(hiprtcProgram* p, const char* src, - const char* name, int n, const char** hdrs, - const char** incs) -{ - using namespace std; - - if (!p) return HIPRTC_ERROR_INVALID_PROGRAM; - if (n < 0) return HIPRTC_ERROR_INVALID_INPUT; - if (n && (!hdrs || !incs)) return HIPRTC_ERROR_INVALID_INPUT; - - vector> h; - for (auto i = 0; i != n; ++i) h.emplace_back(incs[i], hdrs[i]); - - *p = _hiprtcProgram::make(src, name ? name : "default_name", move(h)); - - return HIPRTC_SUCCESS; -} - -extern "C" hiprtcResult hiprtcDestroyProgram(hiprtcProgram* p) -{ - if (!p) return HIPRTC_SUCCESS; - - return _hiprtcProgram::destroy(*p); -} - -extern "C" hiprtcResult hiprtcGetLoweredName(hiprtcProgram p, const char* n, - const char** ln) -{ - using namespace std; - - if (!n || !ln) return HIPRTC_ERROR_INVALID_INPUT; - if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM; - if (!p->compiled) return HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION; - - const auto it{find_if(p->names.cbegin(), p->names.cend(), - [=](const pair& x) { - return x.first == n; - })}; - - if (it == p->names.cend()) return HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID; - - *ln = p->loweredNames[distance(p->names.cbegin(), it)].c_str(); - - return HIPRTC_SUCCESS; -} - -extern "C" hiprtcResult hiprtcGetProgramLog(hiprtcProgram p, char* l) -{ - if (!l) return HIPRTC_ERROR_INVALID_INPUT; - if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM; - if (!p->compiled) return HIPRTC_ERROR_INVALID_PROGRAM; - - l = std::copy_n(p->log.data(), p->log.size(), l); - *l = '\0'; - - return HIPRTC_SUCCESS; -} - -extern "C" hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram p, std::size_t* sz) -{ - if (!sz) return HIPRTC_ERROR_INVALID_INPUT; - if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM; - if (!p->compiled) return HIPRTC_ERROR_INVALID_PROGRAM; - - *sz = p->log.empty() ? 0 : p->log.size() + 1; - - return HIPRTC_SUCCESS; -} - -extern "C" hiprtcResult hiprtcGetCode(hiprtcProgram p, char* c) -{ - if (!c) return HIPRTC_ERROR_INVALID_INPUT; - if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM; - if (!p->compiled) return HIPRTC_ERROR_INVALID_PROGRAM; - - std::copy_n(p->elf.data(), p->elf.size(), c); - - return HIPRTC_SUCCESS; -} - -extern "C" hiprtcResult hiprtcGetCodeSize(hiprtcProgram p, std::size_t* sz) -{ - if (!sz) return HIPRTC_ERROR_INVALID_INPUT; - if (!isValidProgram(p)) return HIPRTC_ERROR_INVALID_PROGRAM; - if (!p->compiled) return HIPRTC_ERROR_INVALID_PROGRAM; - - *sz = p->elf.size(); - - return HIPRTC_SUCCESS; -} - -extern "C" hiprtcResult hiprtcVersion(int* major, int* minor) -{ - if (major == nullptr || minor == nullptr) { - return HIPRTC_ERROR_INVALID_INPUT; - } - - *major = 9; - *minor = 0; - - return HIPRTC_SUCCESS; -} diff --git a/src/macro_based_grid_launch.inl b/src/macro_based_grid_launch.inl deleted file mode 100644 index 2e804f090b..0000000000 --- a/src/macro_based_grid_launch.inl +++ /dev/null @@ -1,97 +0,0 @@ -/* -Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -// Internal header, do not percolate upwards. -#include "hip_hcc_internal.h" -#include "hc.hpp" -#include "trace_helper.h" - -#include -#include - -namespace hip_impl -{ - hc::accelerator_view lock_stream_hip_( - hipStream_t& stream, void*& locked_stream) - { // This allocated but does not take ownership of locked_stream. If it is - // not deleted elsewhere it will leak. - using L = decltype(stream->lockopen_preKernelCommand()); - - HIP_INIT(); - - stream = ihipSyncAndResolveStream(stream); - locked_stream = new L{stream->lockopen_preKernelCommand()}; - return (*static_cast(locked_stream))->_av; - } - - void print_prelaunch_trace_( - const char* kernel_name, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream) - { - if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || - HIP_PROFILE_API || - (COMPILE_HIP_DB && (HIP_TRACE_API & (1<lockopen_preKernelCommand()); - - stream->lockclose_postKernelCommand(kernel_name, acc_v); - - delete static_cast(locked_stream); - if(HIP_PROFILE_API) { - MARKER_END(); - } - } -} diff --git a/src/program_state.cpp b/src/program_state.cpp deleted file mode 100644 index 975dcda321..0000000000 --- a/src/program_state.cpp +++ /dev/null @@ -1,101 +0,0 @@ -#include "../include/hip/hcc_detail/program_state.hpp" -// contains implementation of program_state_impl -#include "program_state.inl" - -#include - -#include -#include -#include -#include - -namespace hip_impl { - - kernarg::kernarg() : impl(new kernarg_impl) { - } - - kernarg::kernarg(kernarg&& k) : impl(k.impl) { - k.impl = nullptr; - } - - kernarg::~kernarg() { - if (impl) - delete(impl); - } - - std::uint8_t* kernarg::data() { - return impl->v.data(); - } - - std::size_t kernarg::size() { - return impl->v.size(); - } - - void kernarg::reserve(std::size_t c) { - impl->v.reserve(c); - } - - void kernarg::resize(std::size_t c) { - impl->v.resize(c); - } - - std::size_t kernargs_size_align::kernargs_size_align::size(std::size_t n) const{ - return (*reinterpret_cast>*>(handle))[n].first; - } - - std::size_t kernargs_size_align::alignment(std::size_t n) const{ - return (*reinterpret_cast>*>(handle))[n].second; - } - - program_state::program_state() : impl(new program_state_impl) { - if (!impl) hip_throw(std::runtime_error { - "Unknown error when constructing program state."}); - } - - program_state::~program_state() { - delete(impl); - } - - void* program_state::global_addr_by_name(const char* name) { - const auto it = impl->get_globals().find(name); - if (it == impl->get_globals().end()) - return nullptr; - else - return it->second.first; - } - - hsa_executable_t program_state::load_executable(const char* data, - const size_t data_size, - hsa_executable_t executable, - hsa_agent_t agent) { - return impl->load_executable(data, data_size, true, executable, agent); - } - - hsa_executable_t program_state::load_executable_no_copy(const char* data, - const size_t data_size, - hsa_executable_t executable, - hsa_agent_t agent) { - return impl->load_executable(data, data_size, false, executable, agent); - } - - hipFunction_t program_state::kernel_descriptor(std::uintptr_t function_address, - hsa_agent_t agent) { - auto& kd = impl->kernel_descriptor(function_address, agent); - return kd; - } - - kernargs_size_align program_state::get_kernargs_size_align(std::uintptr_t kernel) { - kernargs_size_align t; - t.handle = reinterpret_cast(&impl->kernargs_size_align(kernel)); - return t; - } - - std::mutex executables_cache_mutex; - std::vector& executables_cache( - std::string elf, hsa_isa_t isa, hsa_agent_t agent) { - static std::unordered_map>>> cache; - return cache[elf][isa][agent]; - } -}; diff --git a/src/program_state.inl b/src/program_state.inl deleted file mode 100644 index 0314c7d4ed..0000000000 --- a/src/program_state.inl +++ /dev/null @@ -1,1001 +0,0 @@ -#include "../include/hip/hcc_detail/program_state.hpp" - -#include "code_object_bundle.inl" -#include "../include/hip/hcc_detail/hsa_helpers.hpp" - -#if !defined(__cpp_exceptions) - #define try if (true) - #define catch(...) if (false) -#endif -#include "../include/hip/hcc_detail/elfio/elfio.hpp" -#if !defined(__cpp_exceptions) - #undef try - #undef catch -#endif - -#include -#include -#include -#include -#include -#include "hc.hpp" -#include "hip_hcc_internal.h" -#include "trace_helper.h" - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace std { -template<> -struct hash { - size_t operator()(hsa_agent_t x) const { - return hash{}(x.handle); - } -}; - -template<> -struct hash { - size_t operator()(hsa_isa_t x) const { - return hash{}(x.handle); - } -}; -} // namespace std - -inline constexpr bool operator==(hsa_agent_t x, hsa_agent_t y) { - return x.handle == y.handle; -} -inline constexpr bool operator==(hsa_isa_t x, hsa_isa_t y) { - return x.handle == y.handle; -} - -namespace hip_impl { - -[[noreturn]] -void hip_throw(const std::exception&); - -std::vector all_hsa_agents(); - -extern std::mutex executables_cache_mutex; - -std::vector& executables_cache(std::string, hsa_isa_t, hsa_agent_t); - -template -inline -ELFIO::section* find_section_if(ELFIO::elfio& reader, P p) { - const auto it = std::find_if( - reader.sections.begin(), reader.sections.end(), std::move(p)); - - return it != reader.sections.end() ? *it : nullptr; -} - -struct Symbol { - std::string name; - ELFIO::Elf64_Addr value = 0; - ELFIO::Elf_Xword size = 0; - ELFIO::Elf_Half sect_idx = 0; - std::uint8_t bind = 0; - std::uint8_t type = 0; - std::uint8_t other = 0; -}; - -class Kernel_descriptor { - std::uint64_t kernel_object_{}; - amd_kernel_code_t const* header_{}; - std::string name_; - std::vector> kernarg_layout_{}; - bool is_code_object_v3_{}; -public: - Kernel_descriptor() = default; - Kernel_descriptor( - std::uint64_t kernel_object, - const std::string& name, - std::vector> kernarg_layout = {}) - : - kernel_object_{kernel_object}, - name_{name}, - kernarg_layout_{std::move(kernarg_layout)}, - is_code_object_v3_{name.find(".kd") != std::string::npos} - { - bool supported{false}; - std::uint16_t min_v{UINT16_MAX}; - auto r = hsa_system_major_extension_supported( - HSA_EXTENSION_AMD_LOADER, 1, &min_v, &supported); - - if (r != HSA_STATUS_SUCCESS || !supported) return; - - hsa_ven_amd_loader_1_01_pfn_t tbl{}; - - r = hsa_system_get_major_extension_table( - HSA_EXTENSION_AMD_LOADER, - 1, - sizeof(tbl), - reinterpret_cast(&tbl)); - - if (r != HSA_STATUS_SUCCESS) return; - if (!tbl.hsa_ven_amd_loader_query_host_address) return; - - r = tbl.hsa_ven_amd_loader_query_host_address( - reinterpret_cast(kernel_object_), - reinterpret_cast(&header_)); - - if (r != HSA_STATUS_SUCCESS) return; - } - Kernel_descriptor(const Kernel_descriptor&) = default; - Kernel_descriptor(Kernel_descriptor&&) = default; - ~Kernel_descriptor() = default; - - Kernel_descriptor& operator=(const Kernel_descriptor&) = default; - Kernel_descriptor& operator=(Kernel_descriptor&&) = default; - - operator hipFunction_t() const { // TODO: this is awful and only meant for illustration. - return reinterpret_cast(const_cast(this)); - } -}; - -class program_state_impl { - -public: - - std::pair< - std::once_flag, - std::unordered_map< - std::string, - std::unordered_map< - hsa_isa_t, - std::vector>>> code_object_blobs; - - std::pair< - std::once_flag, - std::unordered_map< - std::string, - std::pair>> symbol_addresses; - - std::unordered_map< - hsa_agent_t, - std::pair< - std::once_flag, - std::vector>> executables; - - std::unordered_map< - hsa_agent_t, - std::pair< - std::once_flag, - std::unordered_map< - std::string, - std::vector>>> kernels; - - std::pair< - std::once_flag, - std::unordered_map< - std::string, std::vector>>> kernargs; - - std::pair< - std::once_flag, - std::unordered_map> function_names; - - std::unordered_map< - hsa_agent_t, - std::pair< - std::once_flag, - std::unordered_map< - std::uintptr_t, - Kernel_descriptor>>> functions; - - std::tuple< - std::once_flag, - std::mutex, - // map from string to pair - std::unordered_map>> globals; - - using RAII_code_reader = - std::unique_ptr>; - std::pair< - std::mutex, - std::deque>> code_readers; - - program_state_impl() { - // Create placeholder for each agent for the per-agent members. - for (auto&& x : hip_impl::all_hsa_agents()) { - (void)executables[x]; - (void)kernels[x]; - (void)functions[x]; - } - } - - const std::unordered_map< - std::string, - std::unordered_map< - hsa_isa_t, - std::vector>>& get_code_object_blobs() { - - std::call_once(code_object_blobs.first, [this]() { - dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void* p) { - ELFIO::elfio tmp; - - const auto elf = (info->dlpi_addr && std::strlen(info->dlpi_name) != 0) ? - info->dlpi_name : "/proc/self/exe"; - - if (!tmp.load(elf)) return 0; - - const auto it = find_section_if(tmp, [](const ELFIO::section* x) { - return x->get_name() == ".kernel"; - }); - - if (!it) return 0; - - auto& impl = *static_cast(p); - - std::vector multi_arch_blob(it->get_data(), it->get_data() + it->get_size()); - auto blob_it = multi_arch_blob.begin(); - while (blob_it != multi_arch_blob.end()) { - Bundled_code_header tmp{blob_it, multi_arch_blob.end()}; - - if (!valid(tmp)) break; - - for (auto&& bundle : bundles(tmp)) { - if(bundle.blob.size()) - impl.code_object_blobs.second[elf][triple_to_hsa_isa(bundle.triple)].push_back(bundle.blob); - } - - blob_it += tmp.bundled_code_size; - }; - - return 0; - }, this); - }); - - return code_object_blobs.second; - } - - Symbol read_symbol(const ELFIO::symbol_section_accessor& section, - unsigned int idx) { - assert(idx < section.get_symbols_num()); - - Symbol r; - section.get_symbol( - idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other); - - return r; - } - - const std::unordered_map< - std::string, - std::pair>& get_symbol_addresses() { - - std::call_once(symbol_addresses.first, [this]() { - dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void* psi_ptr) { - - if (!psi_ptr) - return 0; - - program_state_impl* t = static_cast(psi_ptr); - - ELFIO::elfio tmp; - const auto elf = (info->dlpi_addr && std::strlen(info->dlpi_name) != 0) ? - info->dlpi_name : "/proc/self/exe"; - - if (!tmp.load(elf)) return 0; - - auto it = find_section_if(tmp, [](const ELFIO::section* x) { - return x->get_type() == SHT_SYMTAB; - }); - - if (!it) return 0; - - const ELFIO::symbol_section_accessor symtab{tmp, it}; - - for (auto i = 0u; i != symtab.get_symbols_num(); ++i) { - auto s = t->read_symbol(symtab, i); - - if (s.type != STT_OBJECT || s.sect_idx == SHN_UNDEF) continue; - - const auto addr = s.value + info->dlpi_addr; - t->symbol_addresses.second.emplace(std::move(s.name), std::make_pair(addr, s.size)); - } - - return 0; - }, this); - }); - - return symbol_addresses.second; - } - - std::unordered_map>& get_globals() { - std::call_once(std::get<0>(globals), [this]() { - std::get<2>(globals).reserve(get_symbol_addresses().size()); - }); - return std::get<2>(globals); - } - - std::mutex& get_globals_mutex() { - return std::get<1>(globals); - } - - std::vector copy_names_of_undefined_symbols( - const ELFIO::symbol_section_accessor& section) { - std::vector r; - - for (auto i = 0u; i != section.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. - auto tmp = read_symbol(section, i); - if (tmp.sect_idx != SHN_UNDEF || tmp.name.empty()) continue; - - r.push_back(std::move(tmp.name)); - } - - return r; - } - - void associate_code_object_symbols_with_host_allocation( - const ELFIO::elfio& reader, - ELFIO::section* code_object_dynsym, - hsa_agent_t agent, - hsa_executable_t executable) { - if (!code_object_dynsym) return; - - const auto undefined_symbols = copy_names_of_undefined_symbols( - ELFIO::symbol_section_accessor{reader, code_object_dynsym}); - - auto& g = get_globals(); - auto& g_mutex = get_globals_mutex(); - for (auto&& x : undefined_symbols) { - - const auto it1 = get_symbol_addresses().find(x); - if (it1 == get_symbol_addresses().cend()) { - // For a unknown symbol, initialize it with a magic poison - hsa_executable_agent_global_variable_define( - executable, agent, x.c_str(), - reinterpret_cast(0xDEADBEEFDEADBEEFull)); - continue; - } - - hsa_status_t status; - auto check_hsa_global_var_define_error = [&x](hsa_status_t s) { - if (s != HSA_STATUS_SUCCESS) { - const char* es; - hsa_status_string(s, &es); - hip_throw(std::runtime_error{ "Error when defining symbol " + x + " : " + es}); - } - }; - - auto retrieve_pinned_address_from_cache = [](decltype(g) g, decltype(x) x) { - const auto& global_addr = g.find(x); - if (global_addr != g.cend()) { - return global_addr->second.second; - } - return (void*)nullptr; - }; - - void* p = retrieve_pinned_address_from_cache(g, x); - if (p == nullptr) { - std::lock_guard lck{g_mutex}; - p = retrieve_pinned_address_from_cache(g, x); - if (p == nullptr) { - if (x == "_ZN2hc13printf_bufferE") { - // This is the printf buffer, get the pinned address from HCC - p = Kalmar::getContext()->getPrintfBufferPointerVA(); - } - else { - status = hsa_amd_memory_lock(reinterpret_cast(it1->second.first), - it1->second.second, - nullptr, // All agents. - 0, &p); - check_hsa_global_var_define_error(status); - } - // cache the global address and its pinned address - g.emplace(x, std::make_pair(reinterpret_cast(it1->second.first), p)); - } - } - status = hsa_executable_agent_global_variable_define( - executable, agent, x.c_str(), p); - check_hsa_global_var_define_error(status); - } - } - - void load_code_object_and_freeze_executable( - const char* data, - const size_t data_size, bool make_copy, - hsa_agent_t agent, hsa_executable_t executable) { - // TODO: the following sequence is inefficient, should be refactored - // into a single load of the file and subsequent ELFIO - // processing. - if (!data_size) return; - - static const auto cor_deleter = [] (hsa_code_object_reader_t* p) { - if (!p) return; - hsa_code_object_reader_destroy(*p); - delete p; - }; - - RAII_code_reader tmp{new hsa_code_object_reader_t, cor_deleter}; - - decltype(code_readers.second)::iterator it; - { - std::lock_guard lck{code_readers.first}; - - std::string file; - if (make_copy) - file = std::string(data, data_size); - - code_readers.second.emplace_back(move(file), move(tmp)); - it = std::prev(code_readers.second.end()); - - if (make_copy) - data = it->first.data(); - } - - auto check_hsa_error = [](hsa_status_t s) { - if (s != HSA_STATUS_SUCCESS) { - const char* hsa_err_msg; - hsa_status_string(s, &hsa_err_msg); - hip_throw(std::runtime_error{ - std::string("error when loading code object: ") + - hsa_err_msg}); - } - }; - - check_hsa_error(hsa_code_object_reader_create_from_memory( - data, data_size, it->second.get())); - - check_hsa_error(hsa_executable_load_agent_code_object( - executable, agent, *it->second, nullptr, nullptr)); - - check_hsa_error(hsa_executable_freeze(executable, nullptr)); - } - - - const std::vector& get_executables(hsa_agent_t agent) { - - if (executables.find(agent) == executables.cend()) { - hip_throw(std::runtime_error{"invalid agent"}); - } - - std::call_once(executables[agent].first, [this](hsa_agent_t aa) { - auto data = std::make_pair(this, &aa); - hsa_agent_iterate_isas(aa, [](hsa_isa_t x, void* d) { - auto& p = *static_cast(d); - auto& impl = *(p.first); - for (const auto code_object_it : impl.get_code_object_blobs()) { - const auto elf = code_object_it.first; - const auto code_object_blobs = code_object_it.second; - const auto it = code_object_blobs.find(x); - - if (it == code_object_blobs.cend()) continue; - - hsa_agent_t a = *static_cast(p.second); - - std::lock_guard lck{executables_cache_mutex}; - - std::vector& current_exes = - hip_impl::executables_cache(elf, x, a); - // check the cache for already loaded executables - if (current_exes.empty()) { - // executables do not yet exist for this elf+isa+agent, create and cache them - for (auto&& blob : it->second) { - hsa_executable_t tmp = {}; - - hsa_executable_create_alt( - HSA_PROFILE_FULL, - HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, - nullptr, - &tmp); - - // TODO: this is massively inefficient and only meant for - // illustration. - tmp = impl.load_executable(blob.data(), blob.size(), true, tmp, a); - - if (tmp.handle) current_exes.push_back(tmp); - } - } - // append cached executables to our agent's vector of executables - impl.executables[a].second.insert(impl.executables[a].second.end(), - current_exes.begin(), current_exes.end()); - } - return HSA_STATUS_SUCCESS; - }, &data); - }, agent); - - return executables[agent].second; - } - - hsa_executable_t load_executable(const char* data, - const size_t data_size, - bool make_copy, - hsa_executable_t executable, - hsa_agent_t agent) { - ELFIO::elfio reader; - std::string ts = std::string(data, data_size); - std::stringstream tmp{ts}; - - if (!reader.load(tmp)) return hsa_executable_t{}; - const auto code_object_dynsym = find_section_if( - reader, [](const ELFIO::section* x) { - return x->get_type() == SHT_DYNSYM; - }); - - associate_code_object_symbols_with_host_allocation(reader, - code_object_dynsym, - agent, executable); - - load_code_object_and_freeze_executable(data, data_size, make_copy, agent, executable); - - return executable; - } - - std::vector> function_names_for( - const ELFIO::elfio& reader, ELFIO::section* symtab) { - std::vector> r; - ELFIO::symbol_section_accessor symbols{reader, symtab}; - - for (auto i = 0u; i != symbols.get_symbols_num(); ++i) { - // TODO: this is boyscout code, caching the temporaries - // may be of worth. - auto tmp = read_symbol(symbols, i); - - if (tmp.type != STT_FUNC) continue; - if (tmp.type == SHN_UNDEF) continue; - if (tmp.name.empty()) continue; - - r.emplace_back(tmp.value, tmp.name); - } - - return r; - } - - const std::unordered_map& get_function_names() { - - std::call_once(function_names.first, [this]() { - dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void* p) { - ELFIO::elfio tmp; - const auto elf = (info->dlpi_addr && std::strlen(info->dlpi_name) != 0) ? - info->dlpi_name : "/proc/self/exe"; - - if (!tmp.load(elf)) return 0; - - const auto it = find_section_if(tmp, [](const ELFIO::section* x) { - return x->get_type() == SHT_SYMTAB; - }); - - if (!it) return 0; - - auto& impl = *static_cast(p); - - auto names = impl.function_names_for(tmp, it); - for (auto&& x : names) x.first += info->dlpi_addr; - - impl.function_names.second.insert( - std::make_move_iterator(names.begin()), - std::make_move_iterator(names.end())); - - return 0; - }, this); - }); - - return function_names.second; - } - - const std::unordered_map< - std::string, std::vector>& get_kernels(hsa_agent_t agent) { - - if (kernels.find(agent) == kernels.cend()) { - hip_throw(std::runtime_error{"invalid agent"}); - } - - std::call_once(kernels[agent].first, [this](hsa_agent_t aa) { - static const auto copy_kernels = []( - hsa_executable_t, hsa_agent_t a, hsa_executable_symbol_t x, void* p) { - auto& impl = *static_cast(p); - if (type(x) == HSA_SYMBOL_KIND_KERNEL) impl.kernels[a].second[hip_impl::name(x)].push_back(x); - - return HSA_STATUS_SUCCESS; - }; - - for (auto&& executable : get_executables(aa)) { - hsa_executable_iterate_agent_symbols( - executable, aa, copy_kernels, this); - } - }, agent); - - return kernels[agent].second; - } - - const std::unordered_map< - std::uintptr_t, - Kernel_descriptor>& get_functions(hsa_agent_t agent) { - - if (functions.find(agent) == functions.cend()) { - hip_throw(std::runtime_error{"invalid agent"}); - } - - std::call_once(functions[agent].first, [this](hsa_agent_t aa) { - for (auto&& function : get_function_names()) { - auto it = get_kernels(aa).find(function.second); - - if (it == get_kernels(aa).cend()) { - it = get_kernels(aa).find(function.second + ".kd"); - if (it == get_kernels(aa).cend()) - continue; - } - - for (auto&& kernel_symbol : it->second) { - functions[aa].second.emplace( - function.first, - Kernel_descriptor{kernel_object(kernel_symbol), it->first, - kernargs_size_align(function.first)}); - } - } - }, agent); - - return functions[agent].second; - } - - static - std::size_t parse_args_v2( - const std::string& metadata, - std::size_t f, - std::size_t l, - std::vector>& size_align) { - if (f == l) return f; - if (!size_align.empty()) return l; - - do { - static constexpr size_t size_sz{5}; - f = metadata.find("Size:", f) + size_sz; - - if (l <= f) return f; - - auto size = std::strtoul(&metadata[f], nullptr, 10); - - static constexpr size_t align_sz{6}; - f = metadata.find("Align:", f) + align_sz; - - char* l{}; - auto align = std::strtoul(&metadata[f], &l, 10); - - f += (l - &metadata[f]) + 1; - - size_align.emplace_back(size, align); - } while (true); - } - - static - void read_kernarg_metadata_v2( - const std::string& kernels_md, - std::size_t dx, - std::unordered_map< - std::string, - std::vector>>& kernargs) { - do { - dx = kernels_md.find("Name:", dx); - - if (dx == std::string::npos) break; - - static constexpr decltype(kernels_md.size()) name_sz{5}; - dx = kernels_md.find_first_not_of(" '", dx + name_sz); - - auto fn = - kernels_md.substr(dx, kernels_md.find_first_of("'\n", dx) - dx); - dx += fn.size(); - - auto dx1 = kernels_md.find("CodeProps", dx); - dx = kernels_md.find("Args:", dx); - - if (dx1 < dx || dx == std::string::npos) { - dx = dx1; - // create an empty kernarg laybout vector for kernels without any arg - kernargs[fn]; - continue; - } - - static constexpr decltype(kernels_md.size()) args_sz{5}; - dx = parse_args_v2(kernels_md, dx + args_sz, dx1, kernargs[fn]); - } while (true); - } - - static - std::string metadata_to_string(const amd_comgr_metadata_node_t& md) { - std::string str; - size_t size; - - if (amd_comgr_get_metadata_string(md, &size, NULL) - == AMD_COMGR_STATUS_SUCCESS) { - str.resize(size - 1); - amd_comgr_get_metadata_string(md, &size, &str[0]); - } - return str; - } - - static - void parse_args_v3( - const amd_comgr_metadata_node_t& args_md, - std::vector>& size_align) { - size_t arg_count = 0; - if (amd_comgr_get_metadata_list_size(args_md, &arg_count) - != AMD_COMGR_STATUS_SUCCESS) - return; - - for (size_t i = 0; i < arg_count; ++i) { - amd_comgr_metadata_node_t arg_md; - - if (amd_comgr_index_list_metadata(args_md, i, &arg_md) - != AMD_COMGR_STATUS_SUCCESS) - return; - - //Look up “.value_kind” to decide whether to ignore it - //See http://llvm.org/docs/AMDGPUUsage.html#code-object-v3-metadata-mattr-code-object-v3 - amd_comgr_metadata_node_t arg_value_kind_md; - if (amd_comgr_metadata_lookup(arg_md, ".value_kind", &arg_value_kind_md) - != AMD_COMGR_STATUS_SUCCESS) - return; - - std::string arg_value_kind{ metadata_to_string(arg_value_kind_md) }; - - if (amd_comgr_destroy_metadata(arg_value_kind_md) - != AMD_COMGR_STATUS_SUCCESS) - return; - - if (arg_value_kind.find("hidden_") == 0) { - if (amd_comgr_destroy_metadata(arg_md) - != AMD_COMGR_STATUS_SUCCESS) - return; - - continue; //Ignore hidden arg - } - - amd_comgr_metadata_node_t arg_size_md; - if (amd_comgr_metadata_lookup(arg_md, ".size", &arg_size_md) - != AMD_COMGR_STATUS_SUCCESS) - return; - - size_t arg_size = std::stoul(metadata_to_string(arg_size_md)); - - if (amd_comgr_destroy_metadata(arg_size_md) - != AMD_COMGR_STATUS_SUCCESS) - return; - - size_t arg_align; - - amd_comgr_metadata_node_t arg_offset_md; - if (amd_comgr_metadata_lookup(arg_md, ".offset", &arg_offset_md) - != AMD_COMGR_STATUS_SUCCESS) - return; - - size_t arg_offset = std::stoul(metadata_to_string(arg_offset_md)); - - if (amd_comgr_destroy_metadata(arg_offset_md) - != AMD_COMGR_STATUS_SUCCESS) - return; - - arg_align = 1; - while (arg_offset && (arg_offset & 1) == 0) { - arg_offset >>= 1; - arg_align <<= 1; - } - - size_align.emplace_back(arg_size, arg_align); - - if (amd_comgr_destroy_metadata(arg_md) - != AMD_COMGR_STATUS_SUCCESS) - return; - } - } - - static - void read_kernarg_metadata_v3( - const std::string& blob, - std::unordered_map< - std::string, - std::vector>>& kernargs) { - amd_comgr_data_t dataIn; - amd_comgr_status_t status; - - if (amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, &dataIn) - != AMD_COMGR_STATUS_SUCCESS) - return; - - if (amd_comgr_set_data(dataIn, blob.size(), blob.data()) - != AMD_COMGR_STATUS_SUCCESS) - return; - - amd_comgr_metadata_node_t metadata; - if (amd_comgr_get_data_metadata(dataIn, &metadata) - != AMD_COMGR_STATUS_SUCCESS) - return; - - amd_comgr_metadata_node_t kernels_md; - if (amd_comgr_metadata_lookup(metadata, "Kernels", &kernels_md) - != AMD_COMGR_STATUS_SUCCESS) { - if (amd_comgr_metadata_lookup(metadata, - "amdhsa.kernels", - &kernels_md) - != AMD_COMGR_STATUS_SUCCESS) - return; - } - - size_t kernel_count = 0; - if (amd_comgr_get_metadata_list_size(kernels_md, &kernel_count) - != AMD_COMGR_STATUS_SUCCESS) - return; - - for (size_t i = 0; i < kernel_count; i++) { - amd_comgr_metadata_node_t kernel_md; - - if (amd_comgr_index_list_metadata(kernels_md, i, &kernel_md) - != AMD_COMGR_STATUS_SUCCESS) - continue; - - amd_comgr_metadata_node_t name_md; - if (amd_comgr_metadata_lookup(kernel_md, ".name", &name_md) - != AMD_COMGR_STATUS_SUCCESS) - continue; - - std::string kernel_name_str = metadata_to_string(name_md); - - if (amd_comgr_destroy_metadata(name_md) - != AMD_COMGR_STATUS_SUCCESS) - continue; - - amd_comgr_metadata_node_t args_md; - if (amd_comgr_metadata_lookup(kernel_md, ".args", &args_md) - != AMD_COMGR_STATUS_SUCCESS) - continue; - - auto foundKernel = kernargs.find(kernel_name_str); - // parse arguments for a given kernel only once - if (foundKernel == kernargs.end()) { - parse_args_v3(args_md, kernargs[kernel_name_str]); - } - - if (amd_comgr_destroy_metadata(args_md) != AMD_COMGR_STATUS_SUCCESS - || amd_comgr_destroy_metadata(kernel_md) - != AMD_COMGR_STATUS_SUCCESS) - continue; - } - - if (amd_comgr_destroy_metadata(kernels_md) != AMD_COMGR_STATUS_SUCCESS - || amd_comgr_destroy_metadata(metadata) != AMD_COMGR_STATUS_SUCCESS) - return; - - amd_comgr_release_data(dataIn); - } - - static - void read_kernarg_metadata( - const std::string& blob, - std::unordered_map< - std::string, - std::vector>>& kernargs) - { - std::istringstream istr{blob}; - ELFIO::elfio reader; - - if (!reader.load(istr)) return; - - // TODO: this is inefficient. - auto it = find_section_if(reader, [](const ELFIO::section* x) { - return x->get_type() == SHT_NOTE; - }); - - if (!it) return; - - const ELFIO::note_section_accessor acc{reader, it}; - auto n{acc.get_notes_num()}; - while (n--) { - ELFIO::Elf_Word type{}; - std::string name{}; - void* desc{}; - ELFIO::Elf_Word desc_size{}; - - acc.get_note(n, type, name, desc, desc_size); - - if (name == "AMDGPU") { - return read_kernarg_metadata_v3(blob, kernargs); - } - if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA. - - std::string tmp{ - static_cast(desc), static_cast(desc) + desc_size}; - - auto dx = tmp.find("Kernels:"); - - if (dx == std::string::npos) continue; - - return read_kernarg_metadata_v2(tmp, dx + 8u, kernargs); // Skip "Kernels:". - } - } - - const std::unordered_map>>& get_kernargs() { - - std::call_once(kernargs.first, [this]() { - for (auto&& name_and_isa_blobs : get_code_object_blobs()) { - for (auto&& isa_blobs : name_and_isa_blobs.second) { - for (auto&& blob : isa_blobs.second) { - read_kernarg_metadata(blob, kernargs.second); - } - } - } - }); - - return kernargs.second; - } - - std::string name(std::uintptr_t function_address) - { - const auto it = get_function_names().find(function_address); - - if (it == get_function_names().cend()) { - hip_throw(std::runtime_error{ - "Invalid function passed to hipLaunchKernelGGL."}); - } - - return it->second; - } - - std::string name(hsa_agent_t agent) - { - char n[64]{}; - hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, n); - - return std::string{n}; - } - - const Kernel_descriptor& kernel_descriptor(std::uintptr_t function_address, - hsa_agent_t agent) { - - auto it0 = get_functions(agent).find(function_address); - - if (it0 != get_functions(agent).cend()) return it0->second; - - // For hip-clang compiler + Hcc RT - hipFunction_t f = ihipGetDeviceFunction((const void*)function_address); - if (f) return reinterpret_cast(*f); - - hip_throw(std::runtime_error{ - "No device code available for function: " + - std::string(name(function_address)) + - ", for agent: " + name(agent)}); - } - - const std::vector>& - kernargs_size_align(std::uintptr_t kernel) { - - auto it = get_function_names().find(kernel); - if (it == get_function_names().cend()) { - hip_throw(std::runtime_error{"Undefined __global__ function."}); - } - - auto it1 = get_kernargs().find(it->second); - if (it1 == get_kernargs().end()) { - it1 = get_kernargs().find(it->second + ".kd"); - if (it1 == get_kernargs().end()) { - hip_throw(std::runtime_error{ - "Missing metadata for __global__ function: " + it->second}); - } - } - - return it1->second; - } -}; // class program_state_impl - -struct kernarg_impl { - std::vector v; -}; - - -}; diff --git a/src/trace_helper.h b/src/trace_helper.h deleted file mode 100644 index 202a302f70..0000000000 --- a/src/trace_helper.h +++ /dev/null @@ -1,125 +0,0 @@ -/* -Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -//#pragma once - -#ifndef TRACE_HELPER_H -#define TRACE_HELPER_H - -#include -#include -#include -#include -//--- -// Helper functions to convert HIP function arguments into strings. -// Handles POD data types as well as enumerations (ie hipMemcpyKind). -// The implementation uses C++11 variadic templates and template specialization. -// The hipMemcpyKind example below is a good example that shows how to implement conversion for a -// new HSA type. - - -// Handy macro to convert an enumeration to a stringified version of same: -#define CASE_STR(x) \ - case x: \ - return #x; - - -// Building block functions: -template -inline std::string ToHexString(T v) { - std::ostringstream ss; - ss << "0x" << std::hex << v; - return ss.str(); -}; - - -//--- -// Template overloads for ToString to handle specific types - -// This is the default which works for most types: -template -inline std::string ToString(T v) { - std::ostringstream ss; - ss << v; - return ss.str(); -}; - - -// hipEvent_t specialization. TODO - maybe add an event ID for debug? -template <> -inline std::string ToString(hipEvent_t v) { - std::ostringstream ss; - ss << v; - return ss.str(); -}; -// hipIpcEventHandle_t specialization. TODO -template <> -inline std::string ToString(hipIpcEventHandle_t v) { - return std::string{}; -}; -// hipStream_t -template <> -inline std::string ToString(hipStream_t v) { - std::ostringstream ss; - if (v == NULL) { - ss << "stream:"; - } else { - ss << *v; - } - - return ss.str(); -}; - -// hipMemcpyKind specialization -template <> -inline std::string ToString(hipMemcpyKind v) { - switch (v) { - CASE_STR(hipMemcpyHostToHost); - CASE_STR(hipMemcpyHostToDevice); - CASE_STR(hipMemcpyDeviceToHost); - CASE_STR(hipMemcpyDeviceToDevice); - CASE_STR(hipMemcpyDefault); - default: - return ToHexString(v); - }; -}; - - -template <> -inline std::string ToString(hipError_t v) { - return ihipErrorString(v); -}; - - -// Catch empty arguments case -inline std::string ToString() { return (""); } - - -//--- -// C++11 variadic template - peels off first argument, converts to string, and calls itself again to -// peel the next arg. Strings are automatically separated by comma+space. -template -inline std::string ToString(T first, Args... args) { - return ToString(first) + ", " + ToString(args...); -} - -#endif